From 5c10e5807898892879051acfd5608a18df580cbf Mon Sep 17 00:00:00 2001 From: Martin Klotz Date: Wed, 14 Aug 2024 13:57:34 +0200 Subject: [PATCH 1/3] added edge feature to table export --- Cargo.toml | 1 + src/deserialize.rs | 14 ++ ...exporter__table__tests__edge_features.snap | 17 ++ src/exporter/table.rs | 214 +++++++++++++++++- 4 files changed, 240 insertions(+), 6 deletions(-) create mode 100644 src/exporter/snapshots/annatto__exporter__table__tests__edge_features.snap diff --git a/Cargo.toml b/Cargo.toml index d020f9f7..872c5720 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -28,6 +28,7 @@ indicatif = "0.17" itertools = "0.12" lazy_static = "1.4.0" linked-hash-map = "0.5.6" +linked_hash_set = "0.1.4" log = "0.4" normpath = "1.1" ordered-float = {version = "4.1", default-features = false} diff --git a/src/deserialize.rs b/src/deserialize.rs index cdbd07f3..ba478fd9 100644 --- a/src/deserialize.rs +++ b/src/deserialize.rs @@ -46,6 +46,20 @@ pub fn deserialize_annotation_component_opt<'de, D: Deserializer<'de>>( Ok(dc_opt.map(|d| d.into_inner())) } +pub fn deserialize_annotation_component_seq< + 'de, + D: Deserializer<'de>, + T: FromIterator, +>( + deserializer: D, +) -> Result { + let component_seq = Vec::::deserialize(deserializer)?; + Ok(component_seq + .into_iter() + .map(|dc| dc.into_inner()) + .collect::()) +} + // offer a function that can deserialize an AnnoKey from String and from a map pub fn deserialize_anno_key<'de, D: Deserializer<'de>>( deserializer: D, diff --git a/src/exporter/snapshots/annatto__exporter__table__tests__edge_features.snap b/src/exporter/snapshots/annatto__exporter__table__tests__edge_features.snap new file mode 100644 index 00000000..776c976c --- /dev/null +++ b/src/exporter/snapshots/annatto__exporter__table__tests__edge_features.snap @@ -0,0 +1,17 @@ +--- +source: src/exporter/table.rs +expression: export.unwrap() +--- +lemma id_lemma upos id_upos xpos id_xpos Case id_Case Number id_Number in_Pointing__dep_deprel in_Pointing__dep 
sent_id id_sent_id text id_text Person id_Person Tense id_Tense SpaceAfter id_SpaceAfter PronType id_PronType +they valid/website_example#t5_1 PRON valid/website_example#t5_1 PRP valid/website_example#t5_1 Nom valid/website_example#t5_1 Plur valid/website_example#t5_1 nsubj valid/website_example#t5_68 1 valid/website_example#s1_1 They buy and sell books. valid/website_example#s1_1 +buy valid/website_example#t5_68 VERB valid/website_example#t5_68 VBP valid/website_example#t5_68 Plur valid/website_example#t5_68 1 valid/website_example#s1_1 They buy and sell books. valid/website_example#s1_1 3 valid/website_example#t5_68 Pres valid/website_example#t5_68 +and valid/website_example#t6_67 CONJ valid/website_example#t6_67 CC valid/website_example#t6_67 cc valid/website_example#t7_32 1 valid/website_example#s1_1 They buy and sell books. valid/website_example#s1_1 +sell valid/website_example#t7_32 VERB valid/website_example#t7_32 VBP valid/website_example#t7_32 Plur valid/website_example#t7_32 conj valid/website_example#t5_68 1 valid/website_example#s1_1 They buy and sell books. valid/website_example#s1_1 3 valid/website_example#t7_32 Pres valid/website_example#t7_32 +book valid/website_example#t8_76 NOUN valid/website_example#t8_76 NNS valid/website_example#t8_76 Plur valid/website_example#t8_76 obj valid/website_example#t5_68 1 valid/website_example#s1_1 They buy and sell books. valid/website_example#s1_1 No valid/website_example#t8_76 +. valid/website_example#t9_66 PUNCT valid/website_example#t9_66 . valid/website_example#t9_66 punct valid/website_example#t5_68 1 valid/website_example#s1_1 They buy and sell books. valid/website_example#s1_1 +I valid/website_example#t14_1 PRON valid/website_example#t14_1 PRP valid/website_example#t14_1 Nom valid/website_example#t14_1 Sing valid/website_example#t14_1 nsubj valid/website_example#t14_57 2 valid/website_example#s10_34 I have no clue. 
valid/website_example#s10_34 1 valid/website_example#t14_1 +have valid/website_example#t14_57 VERB valid/website_example#t14_57 VBP valid/website_example#t14_57 Sing valid/website_example#t14_57 2 valid/website_example#s10_34 I have no clue. valid/website_example#s10_34 1 valid/website_example#t14_57 Pres valid/website_example#t14_57 +no valid/website_example#t15_64 DET valid/website_example#t15_64 DT valid/website_example#t15_64 det valid/website_example#t16_38 2 valid/website_example#s10_34 I have no clue. valid/website_example#s10_34 Neg valid/website_example#t15_64 +clue valid/website_example#t16_38 NOUN valid/website_example#t16_38 NN valid/website_example#t16_38 Sing valid/website_example#t16_38 obj valid/website_example#t14_57 2 valid/website_example#s10_34 I have no clue. valid/website_example#s10_34 No valid/website_example#t16_38 +. valid/website_example#t17_54 PUNCT valid/website_example#t17_54 . valid/website_example#t17_54 punct valid/website_example#t14_57 2 valid/website_example#s10_34 I have no clue. 
valid/website_example#s10_34 + diff --git a/src/exporter/table.rs b/src/exporter/table.rs index 532eafcd..fcc5a1d7 100644 --- a/src/exporter/table.rs +++ b/src/exporter/table.rs @@ -1,28 +1,31 @@ use std::{ borrow::Cow, - collections::{btree_map::Entry, BTreeMap}, + collections::{btree_map::Entry, BTreeMap, BTreeSet}, + ops::Bound, path::Path, }; -use anyhow::anyhow; +use anyhow::{anyhow, bail}; use documented::{Documented, DocumentedFields}; use graphannis::{ - graph::{AnnoKey, NodeID}, + graph::{AnnoKey, Edge, NodeID}, model::{AnnotationComponent, AnnotationComponentType}, AnnotationGraph, }; use graphannis_core::{ + annostorage::EdgeAnnotationStorage, dfs::CycleSafeDFS, graph::{ANNIS_NS, NODE_NAME_KEY}, util::join_qname, }; use itertools::Itertools; +use linked_hash_set::LinkedHashSet; use serde::Deserialize; use struct_field_names_as_array::FieldNamesAsSlice; use super::Exporter; -use crate::deserialize::deserialize_anno_key; +use crate::deserialize::{deserialize_anno_key, deserialize_annotation_component_seq}; /// This module exports all ordered nodes and nodes connected by coverage edges of any name into a table. #[derive(Deserialize, Documented, DocumentedFields, FieldNamesAsSlice)] @@ -63,6 +66,28 @@ pub struct ExportTable { /// ``` #[serde(default)] quote_char: Option, + /// By listing annotation components, the ingoing edges of that component and their annotations + /// will be exported as well. Multiple ingoing edges will be separated by a ";". Each exported + /// node will be checked for ingoing edges in the respective components. + /// + /// Example: + /// ```toml + /// [export.config] + /// ingoing = [{ ctype = "Pointing", layer = "", ns = "dep"}] + /// ``` + #[serde(default, deserialize_with = "deserialize_annotation_component_seq")] + ingoing: Vec, + /// By listing annotation components, the outgoing edges of that component and their annotations + /// will be exported as well. Multiple outgoing edges will be separated by a ";". 
Each exported + /// node will be checked for outgoing edges in the respective components. + /// + /// Example: + /// ```toml + /// [export.config] + /// outgoing = [{ ctype = "Pointing", layer = "", ns = "reference"}] + /// ``` + #[serde(default, deserialize_with = "deserialize_annotation_component_seq")] + outgoing: Vec, } impl Default for ExportTable { @@ -71,6 +96,8 @@ impl Default for ExportTable { doc_anno: default_doc_anno(), delimiter: default_delimiter(), quote_char: None, + ingoing: vec![], + outgoing: vec![], } } } @@ -158,6 +185,8 @@ impl Exporter for ExportTable { } type Data<'a> = BTreeMap>; +type EdgeData<'a> = BTreeMap>>; // insertion order is critical +type SingleEdgeData<'a> = (String, &'a AnnotationComponent, Vec<(String, String)>); impl ExportTable { fn export_document( @@ -190,6 +219,7 @@ impl ExportTable { .filter_map(|c| graph.get_graphstorage(c)) .collect_vec(); let mut index_map = BTreeMap::default(); + let follow_edges = !self.outgoing.is_empty() || !self.ingoing.is_empty(); for node in ordered_nodes { let reachable_nodes = coverage_storages .iter() @@ -198,7 +228,9 @@ impl ExportTable { }) .flatten(); let mut data = Data::default(); + let mut edge_column_data = EdgeData::default(); for rn in reachable_nodes { + // reachable nodes contains the start node let node_name = node_annos .get_value_for_item(&rn, &NODE_NAME_KEY)? 
.ok_or(anyhow!("Node has no name"))?; @@ -220,7 +252,52 @@ impl ExportTable { data.insert(index + 1, node_name.clone()); } } + if follow_edges { + let (sources, targets) = self.connected_nodes(graph, rn)?; + let mut prefixes = sources.iter().map(|_| "in").collect_vec(); + prefixes.extend(targets.iter().map(|_| "out")); + for ((connected_node_name, component, mut edge_annotations), prefix) in + sources.into_iter().chain(targets.into_iter()).zip(prefixes) + { + let qualified_name = [ + prefix, + component.get_type().to_string().as_str(), + component.layer.as_str(), + component.name.as_str(), + ] + .join("_"); + edge_annotations.extend([("".to_string(), connected_node_name)]); + for (name, value) in edge_annotations { + let qname = if name.is_empty() { + qualified_name.to_string() + } else { + [qualified_name.as_str(), name.as_str()].join("_") + }; + let index = if let Some(index) = index_map.get(&qname) { + *index + } else { + index_map.insert(qname, index_map.len()); + index_map.len() - 1 + }; + match edge_column_data.entry(index) { + Entry::Vacant(e) => { + let mut new_value = LinkedHashSet::default(); + new_value.insert(Cow::Owned(value)); + e.insert(new_value); + } + Entry::Occupied(mut e) => { + e.get_mut().insert(Cow::Owned(value)); + } + }; + } + } + } } + data.extend( + edge_column_data.into_iter().map(|(ix, value_set)| { + (ix, Cow::Owned(value_set.iter().join(";").to_string())) + }), + ); table_data.push(data); } let file_path = @@ -250,18 +327,111 @@ impl ExportTable { } Ok(()) } + + fn connected_nodes( + &self, + graph: &AnnotationGraph, + node: NodeID, + ) -> Result<(Vec, Vec), anyhow::Error> { + let mut sources: Vec = Vec::new(); + let mut targets: Vec = Vec::new(); + for component in &self.ingoing { + if let Some(storage) = graph.get_graphstorage(component) { + let sources_ingoing = storage + .find_connected_inverse(node, 1, Bound::Excluded(2)) + .flatten() + .collect_vec(); + for src in sources_ingoing { + if let Some(node_name) = graph + 
.get_node_annos() + .get_value_for_item(&src, &NODE_NAME_KEY)? + { + let edge = Edge { + source: src, + target: node, + }; + let anno_storage = storage.get_anno_storage(); + sources.push(( + node_name.to_string(), + component, + edge_annos(anno_storage, &edge)?, + )); + } + } + } else { + bail!( + "Component {}::{}::{} has no storage.", + component.get_type(), + component.layer, + component.name + ); + } + } + for component in &self.outgoing { + if let Some(storage) = graph.get_graphstorage(component) { + let targets_outgoing = storage + .find_connected(node, 1, Bound::Excluded(2)) + .flatten() + .collect_vec(); + for tgt in targets_outgoing { + if let Some(node_name) = graph + .get_node_annos() + .get_value_for_item(&tgt, &NODE_NAME_KEY)? + { + let edge = Edge { + source: node, + target: tgt, + }; + let anno_storage = storage.get_anno_storage(); + targets.push(( + node_name.to_string(), + component, + edge_annos(anno_storage, &edge)?, + )); + } + } + } else { + bail!( + "Component {}::{}::{} has no storage.", + component.get_type(), + component.layer, + component.name + ); + } + } + Ok((sources, targets)) + } +} + +fn edge_annos( + anno_storage: &dyn EdgeAnnotationStorage, + edge: &Edge, +) -> Result, anyhow::Error> { + let mut annotations = Vec::new(); + for anno_key in anno_storage.get_all_keys_for_item(edge, None, None)? { + if anno_key.ns != ANNIS_NS { + let qname = join_qname(&anno_key.ns, &anno_key.name); + if let Some(value) = anno_storage.get_value_for_item(edge, &anno_key)? 
{ + annotations.push((qname, value.to_string())); + } + } + } + Ok(annotations) } #[cfg(test)] mod tests { use std::path::Path; - use graphannis::AnnotationGraph; + use graphannis::{ + model::{AnnotationComponent, AnnotationComponentType}, + AnnotationGraph, + }; use insta::assert_snapshot; use crate::{ exporter::table::ExportTable, - importer::{exmaralda::ImportEXMARaLDA, Importer}, + importer::{conllu::ImportCoNLLU, exmaralda::ImportEXMARaLDA, Importer}, test_util::export_to_string, StepID, }; @@ -315,4 +485,36 @@ mod tests { assert!(export.is_ok(), "error: {:?}", export.err()); assert_snapshot!(export.unwrap()); } + + #[test] + fn edge_features() { + let to_conll = ImportCoNLLU::default(); + let mprt = to_conll.import_corpus( + Path::new("tests/data/import/conll/valid/"), + StepID { + module_name: "test_import_conll".to_string(), + path: None, + }, + None, + ); + assert!(mprt.is_ok()); + let mut update_import = mprt.unwrap(); + let g = AnnotationGraph::with_default_graphstorages(true); + assert!(g.is_ok()); + let mut graph = g.unwrap(); + assert!(graph.apply_update(&mut update_import, |_| {}).is_ok()); + let export = export_to_string( + &graph, + ExportTable { + ingoing: vec![AnnotationComponent::new( + AnnotationComponentType::Pointing, + "".into(), + "dep".into(), + )], + ..Default::default() + }, + ); + assert!(export.is_ok(), "error: {:?}", export.err()); + assert_snapshot!(export.unwrap()); + } } From 14549b4e1835807b8fc3d8a6b1994e52b6ca1ff8 Mon Sep 17 00:00:00 2001 From: Martin Klotz Date: Wed, 14 Aug 2024 14:03:48 +0200 Subject: [PATCH 2/3] clippy --- src/exporter/table.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/exporter/table.rs b/src/exporter/table.rs index fcc5a1d7..b52b3b72 100644 --- a/src/exporter/table.rs +++ b/src/exporter/table.rs @@ -1,6 +1,6 @@ use std::{ borrow::Cow, - collections::{btree_map::Entry, BTreeMap, BTreeSet}, + collections::{btree_map::Entry, BTreeMap}, ops::Bound, path::Path, }; From 
b2f63b2b5412f6c5ba9f40ebe546f38afbe43c20 Mon Sep 17 00:00:00 2001 From: Martin Klotz Date: Wed, 14 Aug 2024 14:04:21 +0200 Subject: [PATCH 3/3] update --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index a7884d07..9685338a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Adds `saltxml` import format - Adds `table` export format - Adds `filter` graph op +- `table` export can include in- and outgoing edges ### Fixed