Skip to content

Commit

Permalink
Merge pull request #294 from korpling/feature/table-export
Browse files Browse the repository at this point in the history
Feature/table export
  • Loading branch information
MartinKl authored Aug 13, 2024
2 parents 8bd2560 + a7d9d2a commit 02cde83
Show file tree
Hide file tree
Showing 6 changed files with 349 additions and 2 deletions.
1 change: 1 addition & 0 deletions src/exporter/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
pub mod exmaralda;
pub mod graphml;
pub mod sequence;
pub(crate) mod table;
pub mod textgrid;
pub mod xlsx;

Expand Down
11 changes: 11 additions & 0 deletions src/exporter/snapshots/annatto__exporter__table__tests__core.snap
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
---
source: src/exporter/table.rs
expression: export.unwrap()
---
norm::lemma id_norm::lemma norm::pos id_norm::pos dipl::sentence id_dipl::sentence norm::norm id_norm::norm dipl::dipl id_dipl::dipl
I exmaralda/test_doc#a_norm_T286-T0 PRON exmaralda/test_doc#a_norm_T286-T0 1 exmaralda/test_doc#a_dipl_T286-T4 I exmaralda/test_doc#t_norm_T286-T0 I'm exmaralda/test_doc#t_dipl_T286-T1
be exmaralda/test_doc#a_norm_T0-T1 VERB exmaralda/test_doc#a_norm_T0-T1 1 exmaralda/test_doc#a_dipl_T286-T4 am exmaralda/test_doc#t_norm_T0-T1 I'm exmaralda/test_doc#t_dipl_T286-T1
in exmaralda/test_doc#a_norm_T1-T2 ADP exmaralda/test_doc#a_norm_T1-T2 1 exmaralda/test_doc#a_dipl_T286-T4 in exmaralda/test_doc#t_norm_T1-T2 in exmaralda/test_doc#t_dipl_T1-T2
New York exmaralda/test_doc#a_norm_T2-T4 ADP exmaralda/test_doc#a_norm_T2-T4 1 exmaralda/test_doc#a_dipl_T286-T4 New York exmaralda/test_doc#t_norm_T2-T4 New exmaralda/test_doc#t_dipl_T2-T3
New York exmaralda/test_doc#a_norm_T2-T4 ADP exmaralda/test_doc#a_norm_T2-T4 1 exmaralda/test_doc#a_dipl_T286-T4 New York exmaralda/test_doc#t_norm_T2-T4 York exmaralda/test_doc#t_dipl_T3-T4

Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
---
source: src/exporter/table.rs
expression: export.unwrap()
---
"norm::lemma" "id_norm::lemma" "norm::pos" "id_norm::pos" "dipl::sentence" "id_dipl::sentence" "norm::norm" "id_norm::norm" "dipl::dipl" "id_dipl::dipl"
"I" "exmaralda/test_doc#a_norm_T286-T0" "PRON" "exmaralda/test_doc#a_norm_T286-T0" "1" "exmaralda/test_doc#a_dipl_T286-T4" "I" "exmaralda/test_doc#t_norm_T286-T0" "I'm" "exmaralda/test_doc#t_dipl_T286-T1"
"be" "exmaralda/test_doc#a_norm_T0-T1" "VERB" "exmaralda/test_doc#a_norm_T0-T1" "1" "exmaralda/test_doc#a_dipl_T286-T4" "am" "exmaralda/test_doc#t_norm_T0-T1" "I'm" "exmaralda/test_doc#t_dipl_T286-T1"
"in" "exmaralda/test_doc#a_norm_T1-T2" "ADP" "exmaralda/test_doc#a_norm_T1-T2" "1" "exmaralda/test_doc#a_dipl_T286-T4" "in" "exmaralda/test_doc#t_norm_T1-T2" "in" "exmaralda/test_doc#t_dipl_T1-T2"
"New York" "exmaralda/test_doc#a_norm_T2-T4" "ADP" "exmaralda/test_doc#a_norm_T2-T4" "1" "exmaralda/test_doc#a_dipl_T286-T4" "New York" "exmaralda/test_doc#t_norm_T2-T4" "New" "exmaralda/test_doc#t_dipl_T2-T3"
"New York" "exmaralda/test_doc#a_norm_T2-T4" "ADP" "exmaralda/test_doc#a_norm_T2-T4" "1" "exmaralda/test_doc#a_dipl_T286-T4" "New York" "exmaralda/test_doc#t_norm_T2-T4" "York" "exmaralda/test_doc#t_dipl_T3-T4"

318 changes: 318 additions & 0 deletions src/exporter/table.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,318 @@
use std::{
borrow::Cow,
collections::{btree_map::Entry, BTreeMap},
path::Path,
};

use anyhow::anyhow;
use documented::{Documented, DocumentedFields};
use graphannis::{
graph::{AnnoKey, NodeID},
model::{AnnotationComponent, AnnotationComponentType},
AnnotationGraph,
};
use graphannis_core::{
dfs::CycleSafeDFS,
graph::{ANNIS_NS, NODE_NAME_KEY},
util::join_qname,
};
use itertools::Itertools;
use serde::Deserialize;
use struct_field_names_as_array::FieldNamesAsSlice;

use super::Exporter;

use crate::deserialize::deserialize_anno_key;

/// This module exports all ordered nodes and nodes connected by coverage edges of any name into a table.
///
/// One file is written per document. Each row belongs to one node of the base ordering, and each
/// exported annotation contributes two columns: one holding the annotation value and one, prefixed
/// with `id_`, holding the name of the annotated node.
#[derive(Deserialize, Documented, DocumentedFields, FieldNamesAsSlice)]
#[serde(deny_unknown_fields)]
pub struct ExportTable {
/// The provided annotation key defines which nodes within the part-of component define a document. All nodes holding said annotation
/// will be exported to a file with the name according to the annotation value. Therefore annotation values must not contain path
/// delimiters.
///
/// Example:
/// ```toml
/// [export.config]
/// doc_anno = "my_namespace::document"
/// ```
///
/// The default is `annis::doc`.
#[serde(
deserialize_with = "deserialize_anno_key",
default = "default_doc_anno"
)]
doc_anno: AnnoKey,
/// The provided character defines the column delimiter. The default value is tab.
/// The character is handed to the CSV writer as a single byte, so it should be an ASCII character.
///
/// Example:
/// ```toml
/// [export.config]
/// delimiter = ";"
/// ```
#[serde(default = "default_delimiter")]
delimiter: char,
/// The provided character will be used for quoting values. If nothing is provided, all columns will contain bare values. If a character is provided,
/// all values will be quoted.
/// The character is handed to the CSV writer as a single byte, so it should be an ASCII character.
///
/// Example:
/// ```toml
/// [export.config]
/// quote_char = "\""
/// ```
#[serde(default)]
quote_char: Option<char>,
}

impl Default for ExportTable {
fn default() -> Self {
Self {
doc_anno: default_doc_anno(),
delimiter: default_delimiter(),
quote_char: None,
}
}
}

/// Returns the annotation key used to identify document nodes when none is configured:
/// the name `doc` within the `ANNIS_NS` namespace.
fn default_doc_anno() -> AnnoKey {
    let ns = ANNIS_NS.into();
    let name = "doc".into();
    AnnoKey { name, ns }
}

/// Returns the column delimiter used when none is configured: the horizontal tab.
fn default_delimiter() -> char {
    const TAB: char = '\u{0009}';
    TAB
}

/// File extension reported by this exporter and appended to every exported document file name.
/// It is `csv` regardless of the configured delimiter.
const FILE_EXTENSION: &str = "csv";

impl Exporter for ExportTable {
    /// Exports every document of `graph` as its own table file below `output_path`.
    ///
    /// A document is found by taking each start node of the base ordering (a source node
    /// without ingoing ordering edges) and following the part-of component from it until a
    /// node carrying the configured `doc_anno` annotation is reached. The pair of document
    /// node and start node is then written via [`ExportTable::export_document`].
    ///
    /// `_step_id` and `_tx` are not used by this exporter.
    ///
    /// # Errors
    ///
    /// Fails if the base ordering or the part-of storage is missing, if traversing the
    /// part-of component fails, if a document is reached from more than one start node of
    /// the base ordering, or if writing one of the documents fails.
    fn export_corpus(
        &self,
        graph: &graphannis::AnnotationGraph,
        output_path: &std::path::Path,
        _step_id: crate::StepID,
        _tx: Option<crate::workflow::StatusSender>,
    ) -> Result<(), Box<dyn std::error::Error>> {
        let base_ordering = AnnotationComponent::new(
            AnnotationComponentType::Ordering,
            ANNIS_NS.into(),
            "".into(),
        );
        // `ok_or_else` keeps the error construction off the success path.
        let storage = graph
            .get_graphstorage(&base_ordering)
            .ok_or_else(|| anyhow!("Storage of base ordering unavailable"))?;
        let part_of_storage = graph
            .get_graphstorage(&AnnotationComponent::new(
                AnnotationComponentType::PartOf,
                ANNIS_NS.into(),
                "".into(),
            ))
            .ok_or_else(|| anyhow!("Part-of storage unavailable."))?;
        let node_annos = graph.get_node_annos();
        // Maps each document node to the first node of its base ordering chain.
        let mut doc_node_to_start = BTreeMap::new();
        // Source nodes that cannot be read are skipped; a node whose ingoing edges cannot be
        // determined is treated as a start node.
        for node in storage
            .source_nodes()
            .flatten()
            .filter(|n| !storage.has_ingoing_edges(*n).unwrap_or_default())
        {
            let dfs = CycleSafeDFS::new(
                part_of_storage.as_edgecontainer(),
                node,
                0,
                NodeID::MAX as usize,
            );
            for nxt in dfs {
                let n = nxt?.node;
                // A failed annotation lookup counts as "not a document node".
                if node_annos
                    .has_value_for_item(&n, &self.doc_anno)
                    .unwrap_or_default()
                {
                    if let Entry::Vacant(e) = doc_node_to_start.entry(n) {
                        e.insert(node);
                        // The first document node found decides the assignment of this chain.
                        break;
                    } else {
                        let doc_node_name = node_annos
                            .get_value_for_item(&n, &NODE_NAME_KEY)?
                            .unwrap_or_default();
                        return Err(anyhow!(
                            "Document {doc_node_name} has more than one start node for base ordering."
                        )
                        .into());
                    }
                }
            }
        }
        doc_node_to_start
            .into_iter()
            .try_for_each(|(doc, start)| self.export_document(graph, output_path, doc, start))?;
        Ok(())
    }

    /// Returns the file extension used for the exported table files.
    fn file_extension(&self) -> &str {
        FILE_EXTENSION
    }
}

/// The cells of a single table row, keyed by the index of the column they belong to.
type Data<'a> = BTreeMap<usize, Cow<'a, str>>;

impl ExportTable {
fn export_document(
&self,
graph: &AnnotationGraph,
corpus_path: &Path,
doc_node: NodeID,
start_node: NodeID,
) -> Result<(), anyhow::Error> {
let node_annos = graph.get_node_annos();
let doc_node_name = node_annos
.get_value_for_item(&doc_node, &self.doc_anno)?
.ok_or(anyhow!("Could not determine document node name."))?;
let ordering_storage = graph
.get_graphstorage(&AnnotationComponent::new(
AnnotationComponentType::Ordering,
ANNIS_NS.into(),
"".into(),
))
.ok_or(anyhow!("Storage of ordering component unavailable."))?;
let ordered_nodes = ordering_storage
.find_connected(start_node, 0, std::ops::Bound::Excluded(usize::MAX))
.flatten()
.collect_vec();
let mut table_data: Vec<Data> = Vec::with_capacity(ordered_nodes.len());
let coverage_components =
graph.get_all_components(Some(AnnotationComponentType::Coverage), None);
let coverage_storages = coverage_components
.iter()
.filter_map(|c| graph.get_graphstorage(c))
.collect_vec();
let mut index_map = BTreeMap::default();
for node in ordered_nodes {
let reachable_nodes = coverage_storages
.iter()
.flat_map(|s| {
s.find_connected_inverse(node, 0, std::ops::Bound::Excluded(usize::MAX))
})
.flatten();
let mut data = Data::default();
for rn in reachable_nodes {
let node_name = node_annos
.get_value_for_item(&rn, &NODE_NAME_KEY)?
.ok_or(anyhow!("Node has no name"))?;
for anno_key in node_annos.get_all_keys_for_item(&rn, None, None)? {
if anno_key.ns.as_str() != ANNIS_NS {
let qname = join_qname(anno_key.ns.as_str(), anno_key.name.as_str());
let id_name = format!("id_{qname}");
let index = if let Some(index) = index_map.get(&qname) {
*index
} else {
index_map.insert(qname.to_string(), index_map.len());
index_map.insert(id_name.to_string(), index_map.len());
index_map.len() - 2
};
let value = node_annos
.get_value_for_item(&rn, &anno_key)?
.ok_or(anyhow!("Annotation has no value"))?;
data.insert(index, value);
data.insert(index + 1, node_name.clone());
}
}
}
table_data.push(data);
}
let file_path =
Path::new(corpus_path).join(format!("{doc_node_name}.{}", self.file_extension()));
let mut writer_builder = csv::WriterBuilder::new();
writer_builder.delimiter(self.delimiter as u8);
if let Some(c) = &self.quote_char {
writer_builder.quote(*c as u8);
writer_builder.quote_style(csv::QuoteStyle::Always);
}
let mut writer = writer_builder.from_path(file_path)?;
let header = index_map
.iter()
.sorted_by(|(_, v), (_, v_)| v.cmp(v_))
.map(|(k, _)| k)
.collect_vec();
writer.write_record(header)?;
let index_bound = index_map.len();
for mut entry in table_data {
let mut row = Vec::with_capacity(index_bound);
for col_index in 0..index_bound {
row.push(entry.remove(&col_index).unwrap_or_default().to_string());
}
if !row.iter().all(String::is_empty) {
writer.write_record(&row)?;
}
}
Ok(())
}
}

#[cfg(test)]
mod tests {
    use std::path::Path;

    use graphannis::AnnotationGraph;
    use insta::assert_snapshot;

    use crate::{
        exporter::table::ExportTable,
        importer::{exmaralda::ImportEXMARaLDA, Importer},
        test_util::export_to_string,
        StepID,
    };

    /// Imports the EXMARaLDA test corpus and applies the resulting update to a newly
    /// created annotation graph. Panics if any of these steps fails.
    fn exmaralda_test_graph() -> AnnotationGraph {
        let importer = ImportEXMARaLDA {};
        let step_id = StepID {
            module_name: "test_import_exb".to_string(),
            path: None,
        };
        let import_result = importer.import_corpus(
            Path::new("tests/data/import/exmaralda/clean/import/exmaralda/"),
            step_id,
            None,
        );
        assert!(import_result.is_ok());
        let mut update = import_result.unwrap();
        let graph_result = AnnotationGraph::with_default_graphstorages(true);
        assert!(graph_result.is_ok());
        let mut graph = graph_result.unwrap();
        assert!(graph.apply_update(&mut update, |_| {}).is_ok());
        graph
    }

    /// Exports the test corpus with the default configuration (tab-delimited, unquoted).
    #[test]
    fn core() {
        let graph = exmaralda_test_graph();
        let export = export_to_string(&graph, ExportTable::default());
        assert!(export.is_ok(), "error: {:?}", export.err());
        assert_snapshot!(export.unwrap());
    }

    /// Exports the test corpus with every value wrapped in double quotes.
    #[test]
    fn quoted() {
        let graph = exmaralda_test_graph();
        let exporter = ExportTable {
            quote_char: Some('"'),
            ..Default::default()
        };
        let export = export_to_string(&graph, exporter);
        assert!(export.is_ok(), "error: {:?}", export.err());
        assert_snapshot!(export.unwrap());
    }
}
8 changes: 7 additions & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ use documented::{Documented, DocumentedFields};
use error::Result;
use exporter::{
exmaralda::ExportExmaralda, graphml::GraphMLExporter, sequence::ExportSequence,
textgrid::ExportTextGrid, xlsx::ExportXlsx, Exporter,
table::ExportTable, textgrid::ExportTextGrid, xlsx::ExportXlsx, Exporter,
};
use graphannis::AnnotationGraph;
use importer::{
Expand Down Expand Up @@ -55,6 +55,7 @@ pub enum WriteAs {
GraphML(#[serde(default)] GraphMLExporter), // the purpose of serde(default) here is that an empty `[export.config]` table can be omitted
EXMARaLDA(#[serde(default)] ExportExmaralda),
Sequence(#[serde(default)] ExportSequence),
Table(#[serde(default)] ExportTable),
TextGrid(ExportTextGrid), // do not use default, as all attributes have their individual defaults
Xlsx(#[serde(default)] ExportXlsx),
}
Expand All @@ -72,6 +73,7 @@ impl WriteAs {
WriteAs::GraphML(m) => m,
WriteAs::EXMARaLDA(m) => m,
WriteAs::Sequence(m) => m,
WriteAs::Table(m) => m,
WriteAs::TextGrid(m) => m,
WriteAs::Xlsx(m) => m,
}
Expand All @@ -84,6 +86,7 @@ impl WriteAsDiscriminants {
WriteAsDiscriminants::GraphML => GraphMLExporter::DOCS,
WriteAsDiscriminants::EXMARaLDA => ExportExmaralda::DOCS,
WriteAsDiscriminants::Sequence => ExportSequence::DOCS,
WriteAsDiscriminants::Table => ExportTable::DOCS,
WriteAsDiscriminants::TextGrid => ExportTextGrid::DOCS,
WriteAsDiscriminants::Xlsx => ExportXlsx::DOCS,
}
Expand All @@ -104,6 +107,9 @@ impl WriteAsDiscriminants {
ExportSequence::FIELD_NAMES_AS_SLICE,
ExportSequence::FIELD_DOCS,
),
WriteAsDiscriminants::Table => {
(ExportTable::FIELD_NAMES_AS_SLICE, ExportTable::FIELD_DOCS)
}
WriteAsDiscriminants::TextGrid => (
ExportTextGrid::FIELD_NAMES_AS_SLICE,
ExportTextGrid::FIELD_DOCS,
Expand Down
Loading

0 comments on commit 02cde83

Please sign in to comment.