Skip to content

Commit

Permalink
Merge pull request #297 from korpling/feature/table-export-additional…
Browse files Browse the repository at this point in the history
…-components

Feature/table export additional components
  • Loading branch information
MartinKl authored Aug 14, 2024
2 parents bf58031 + b2f63b2 commit 4700fe1
Show file tree
Hide file tree
Showing 5 changed files with 240 additions and 5 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Adds `saltxml` import format
- Adds `table` export format
- Adds `filter` graph op
- `table` export can include in- and outgoing edges

### Fixed

Expand Down
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ indicatif = "0.17"
itertools = "0.12"
lazy_static = "1.4.0"
linked-hash-map = "0.5.6"
linked_hash_set = "0.1.4"
log = "0.4"
normpath = "1.1"
ordered-float = {version = "4.1", default-features = false}
Expand Down
14 changes: 14 additions & 0 deletions src/deserialize.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,20 @@ pub fn deserialize_annotation_component_opt<'de, D: Deserializer<'de>>(
Ok(dc_opt.map(|d| d.into_inner()))
}

pub fn deserialize_annotation_component_seq<
'de,
D: Deserializer<'de>,
T: FromIterator<AnnotationComponent>,
>(
deserializer: D,
) -> Result<T, D::Error> {
let component_seq = Vec::<DeserializableComponent>::deserialize(deserializer)?;
Ok(component_seq
.into_iter()
.map(|dc| dc.into_inner())
.collect::<T>())
}

// offer a function that can deserialize an AnnoKey from String and from a map
pub fn deserialize_anno_key<'de, D: Deserializer<'de>>(
deserializer: D,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
---
source: src/exporter/table.rs
expression: export.unwrap()
---
lemma id_lemma upos id_upos xpos id_xpos Case id_Case Number id_Number in_Pointing__dep_deprel in_Pointing__dep sent_id id_sent_id text id_text Person id_Person Tense id_Tense SpaceAfter id_SpaceAfter PronType id_PronType
they valid/website_example#t5_1 PRON valid/website_example#t5_1 PRP valid/website_example#t5_1 Nom valid/website_example#t5_1 Plur valid/website_example#t5_1 nsubj valid/website_example#t5_68 1 valid/website_example#s1_1 They buy and sell books. valid/website_example#s1_1
buy valid/website_example#t5_68 VERB valid/website_example#t5_68 VBP valid/website_example#t5_68 Plur valid/website_example#t5_68 1 valid/website_example#s1_1 They buy and sell books. valid/website_example#s1_1 3 valid/website_example#t5_68 Pres valid/website_example#t5_68
and valid/website_example#t6_67 CONJ valid/website_example#t6_67 CC valid/website_example#t6_67 cc valid/website_example#t7_32 1 valid/website_example#s1_1 They buy and sell books. valid/website_example#s1_1
sell valid/website_example#t7_32 VERB valid/website_example#t7_32 VBP valid/website_example#t7_32 Plur valid/website_example#t7_32 conj valid/website_example#t5_68 1 valid/website_example#s1_1 They buy and sell books. valid/website_example#s1_1 3 valid/website_example#t7_32 Pres valid/website_example#t7_32
book valid/website_example#t8_76 NOUN valid/website_example#t8_76 NNS valid/website_example#t8_76 Plur valid/website_example#t8_76 obj valid/website_example#t5_68 1 valid/website_example#s1_1 They buy and sell books. valid/website_example#s1_1 No valid/website_example#t8_76
. valid/website_example#t9_66 PUNCT valid/website_example#t9_66 . valid/website_example#t9_66 punct valid/website_example#t5_68 1 valid/website_example#s1_1 They buy and sell books. valid/website_example#s1_1
I valid/website_example#t14_1 PRON valid/website_example#t14_1 PRP valid/website_example#t14_1 Nom valid/website_example#t14_1 Sing valid/website_example#t14_1 nsubj valid/website_example#t14_57 2 valid/website_example#s10_34 I have no clue. valid/website_example#s10_34 1 valid/website_example#t14_1
have valid/website_example#t14_57 VERB valid/website_example#t14_57 VBP valid/website_example#t14_57 Sing valid/website_example#t14_57 2 valid/website_example#s10_34 I have no clue. valid/website_example#s10_34 1 valid/website_example#t14_57 Pres valid/website_example#t14_57
no valid/website_example#t15_64 DET valid/website_example#t15_64 DT valid/website_example#t15_64 det valid/website_example#t16_38 2 valid/website_example#s10_34 I have no clue. valid/website_example#s10_34 Neg valid/website_example#t15_64
clue valid/website_example#t16_38 NOUN valid/website_example#t16_38 NN valid/website_example#t16_38 Sing valid/website_example#t16_38 obj valid/website_example#t14_57 2 valid/website_example#s10_34 I have no clue. valid/website_example#s10_34 No valid/website_example#t16_38
. valid/website_example#t17_54 PUNCT valid/website_example#t17_54 . valid/website_example#t17_54 punct valid/website_example#t14_57 2 valid/website_example#s10_34 I have no clue. valid/website_example#s10_34

212 changes: 207 additions & 5 deletions src/exporter/table.rs
Original file line number Diff line number Diff line change
@@ -1,28 +1,31 @@
use std::{
borrow::Cow,
collections::{btree_map::Entry, BTreeMap},
ops::Bound,
path::Path,
};

use anyhow::anyhow;
use anyhow::{anyhow, bail};
use documented::{Documented, DocumentedFields};
use graphannis::{
graph::{AnnoKey, NodeID},
graph::{AnnoKey, Edge, NodeID},
model::{AnnotationComponent, AnnotationComponentType},
AnnotationGraph,
};
use graphannis_core::{
annostorage::EdgeAnnotationStorage,
dfs::CycleSafeDFS,
graph::{ANNIS_NS, NODE_NAME_KEY},
util::join_qname,
};
use itertools::Itertools;
use linked_hash_set::LinkedHashSet;
use serde::Deserialize;
use struct_field_names_as_array::FieldNamesAsSlice;

use super::Exporter;

use crate::deserialize::deserialize_anno_key;
use crate::deserialize::{deserialize_anno_key, deserialize_annotation_component_seq};

/// This module exports all ordered nodes and nodes connected by coverage edges of any name into a table.
#[derive(Deserialize, Documented, DocumentedFields, FieldNamesAsSlice)]
Expand Down Expand Up @@ -63,6 +66,28 @@ pub struct ExportTable {
/// ```
#[serde(default)]
quote_char: Option<char>,
/// By listing annotation components, the ingoing edges of that component and their annotations
/// will be exported as well. Multiple ingoing edges will be separated by a ";". Each exported
/// node will be checked for ingoing edges in the respective components.
///
/// Example:
/// ```toml
/// [export.config]
/// ingoing = [{ ctype = "Pointing", layer = "", ns = "dep"}]
/// ```
#[serde(default, deserialize_with = "deserialize_annotation_component_seq")]
ingoing: Vec<AnnotationComponent>,
/// By listing annotation components, the outgoing edges of that component and their annotations
/// will be exported as well. Multiple outgoing edges will be separated by a ";". Each exported
/// node will be checked for outgoing edges in the respective components.
///
/// Example:
/// ```toml
/// [export.config]
/// outgoing = [{ ctype = "Pointing", layer = "", ns = "reference"}]
/// ```
#[serde(default, deserialize_with = "deserialize_annotation_component_seq")]
outgoing: Vec<AnnotationComponent>,
}

impl Default for ExportTable {
Expand All @@ -71,6 +96,8 @@ impl Default for ExportTable {
doc_anno: default_doc_anno(),
delimiter: default_delimiter(),
quote_char: None,
ingoing: vec![],
outgoing: vec![],
}
}
}
Expand Down Expand Up @@ -158,6 +185,8 @@ impl Exporter for ExportTable {
}

/// Cells of one exported table row: maps a column index to the value stored in that column.
type Data<'a> = BTreeMap<usize, Cow<'a, str>>;
/// Edge-related cells of one row: maps a column index to the distinct values collected for
/// that column. The values of a column are later joined with ";" into a single cell.
type EdgeData<'a> = BTreeMap<usize, LinkedHashSet<Cow<'a, str>>>; // insertion order is critical
/// One edge as seen from an exported node: the name of the node at the other end of the edge,
/// the component the edge belongs to, and the edge annotations as (qualified name, value) pairs.
type SingleEdgeData<'a> = (String, &'a AnnotationComponent, Vec<(String, String)>);

impl ExportTable {
fn export_document(
Expand Down Expand Up @@ -190,6 +219,7 @@ impl ExportTable {
.filter_map(|c| graph.get_graphstorage(c))
.collect_vec();
let mut index_map = BTreeMap::default();
let follow_edges = !self.outgoing.is_empty() || !self.ingoing.is_empty();
for node in ordered_nodes {
let reachable_nodes = coverage_storages
.iter()
Expand All @@ -198,7 +228,9 @@ impl ExportTable {
})
.flatten();
let mut data = Data::default();
let mut edge_column_data = EdgeData::default();
for rn in reachable_nodes {
// reachable nodes contains the start node
let node_name = node_annos
.get_value_for_item(&rn, &NODE_NAME_KEY)?
.ok_or(anyhow!("Node has no name"))?;
Expand All @@ -220,7 +252,52 @@ impl ExportTable {
data.insert(index + 1, node_name.clone());
}
}
if follow_edges {
let (sources, targets) = self.connected_nodes(graph, rn)?;
let mut prefixes = sources.iter().map(|_| "in").collect_vec();
prefixes.extend(targets.iter().map(|_| "out"));
for ((connected_node_name, component, mut edge_annotations), prefix) in
sources.into_iter().chain(targets.into_iter()).zip(prefixes)
{
let qualified_name = [
prefix,
component.get_type().to_string().as_str(),
component.layer.as_str(),
component.name.as_str(),
]
.join("_");
edge_annotations.extend([("".to_string(), connected_node_name)]);
for (name, value) in edge_annotations {
let qname = if name.is_empty() {
qualified_name.to_string()
} else {
[qualified_name.as_str(), name.as_str()].join("_")
};
let index = if let Some(index) = index_map.get(&qname) {
*index
} else {
index_map.insert(qname, index_map.len());
index_map.len() - 1
};
match edge_column_data.entry(index) {
Entry::Vacant(e) => {
let mut new_value = LinkedHashSet::default();
new_value.insert(Cow::Owned(value));
e.insert(new_value);
}
Entry::Occupied(mut e) => {
e.get_mut().insert(Cow::Owned(value));
}
};
}
}
}
}
data.extend(
edge_column_data.into_iter().map(|(ix, value_set)| {
(ix, Cow::Owned(value_set.iter().join(";").to_string()))
}),
);
table_data.push(data);
}
let file_path =
Expand Down Expand Up @@ -250,18 +327,111 @@ impl ExportTable {
}
Ok(())
}

fn connected_nodes(
&self,
graph: &AnnotationGraph,
node: NodeID,
) -> Result<(Vec<SingleEdgeData>, Vec<SingleEdgeData>), anyhow::Error> {
let mut sources: Vec<SingleEdgeData> = Vec::new();
let mut targets: Vec<SingleEdgeData> = Vec::new();
for component in &self.ingoing {
if let Some(storage) = graph.get_graphstorage(component) {
let sources_ingoing = storage
.find_connected_inverse(node, 1, Bound::Excluded(2))
.flatten()
.collect_vec();
for src in sources_ingoing {
if let Some(node_name) = graph
.get_node_annos()
.get_value_for_item(&src, &NODE_NAME_KEY)?
{
let edge = Edge {
source: src,
target: node,
};
let anno_storage = storage.get_anno_storage();
sources.push((
node_name.to_string(),
component,
edge_annos(anno_storage, &edge)?,
));
}
}
} else {
bail!(
"Component {}::{}::{} has no storage.",
component.get_type(),
component.layer,
component.name
);
}
}
for component in &self.outgoing {
if let Some(storage) = graph.get_graphstorage(component) {
let targets_outgoing = storage
.find_connected(node, 1, Bound::Excluded(2))
.flatten()
.collect_vec();
for tgt in targets_outgoing {
if let Some(node_name) = graph
.get_node_annos()
.get_value_for_item(&tgt, &NODE_NAME_KEY)?
{
let edge = Edge {
source: node,
target: tgt,
};
let anno_storage = storage.get_anno_storage();
targets.push((
node_name.to_string(),
component,
edge_annos(anno_storage, &edge)?,
));
}
}
} else {
bail!(
"Component {}::{}::{} has no storage.",
component.get_type(),
component.layer,
component.name
);
}
}
Ok((sources, targets))
}
}

/// Returns the annotations of `edge` as (qualified name, value) pairs.
///
/// Annotation keys in the `ANNIS_NS` namespace are left out, as are keys for which the
/// storage yields no value.
fn edge_annos(
    anno_storage: &dyn EdgeAnnotationStorage,
    edge: &Edge,
) -> Result<Vec<(String, String)>, anyhow::Error> {
    let keys = anno_storage.get_all_keys_for_item(edge, None, None)?;
    let mut pairs = Vec::new();
    for key in keys {
        if key.ns == ANNIS_NS {
            continue;
        }
        if let Some(value) = anno_storage.get_value_for_item(edge, &key)? {
            pairs.push((join_qname(&key.ns, &key.name), value.to_string()));
        }
    }
    Ok(pairs)
}

#[cfg(test)]
mod tests {
use std::path::Path;

use graphannis::AnnotationGraph;
use graphannis::{
model::{AnnotationComponent, AnnotationComponentType},
AnnotationGraph,
};
use insta::assert_snapshot;

use crate::{
exporter::table::ExportTable,
importer::{exmaralda::ImportEXMARaLDA, Importer},
importer::{conllu::ImportCoNLLU, exmaralda::ImportEXMARaLDA, Importer},
test_util::export_to_string,
StepID,
};
Expand Down Expand Up @@ -315,4 +485,36 @@ mod tests {
assert!(export.is_ok(), "error: {:?}", export.err());
assert_snapshot!(export.unwrap());
}

/// Imports the CoNLL-U test corpus, exports it as a table that includes the ingoing edges of
/// the `Pointing` component named `dep`, and compares the result with the stored snapshot.
#[test]
fn edge_features() {
    let importer = ImportCoNLLU::default();
    let import = importer.import_corpus(
        Path::new("tests/data/import/conll/valid/"),
        StepID {
            module_name: "test_import_conll".to_string(),
            path: None,
        },
        None,
    );
    assert!(import.is_ok(), "error: {:?}", import.err());
    let mut update = import.unwrap();
    let g = AnnotationGraph::with_default_graphstorages(true);
    assert!(g.is_ok(), "error: {:?}", g.err());
    let mut graph = g.unwrap();
    let applied = graph.apply_update(&mut update, |_| {});
    assert!(applied.is_ok(), "error: {:?}", applied.err());
    let export = export_to_string(
        &graph,
        ExportTable {
            ingoing: vec![AnnotationComponent::new(
                AnnotationComponentType::Pointing,
                "".into(),
                "dep".into(),
            )],
            ..Default::default()
        },
    );
    assert!(export.is_ok(), "error: {:?}", export.err());
    // Keep the expression `export.unwrap()` as is: it is recorded in the snapshot metadata.
    assert_snapshot!(export.unwrap());
}
}

0 comments on commit 4700fe1

Please sign in to comment.