
Commit 022b2e0
Merge pull request #295 from korpling/feature/filter-nodes
Feature/filter nodes
MartinKl authored Aug 13, 2024
2 parents 02cde83 + 803a5e5 commit 022b2e0
Showing 18 changed files with 680 additions and 91 deletions.
@@ -189,7 +189,7 @@ visibility = "preloaded"
<data key="k11">norm</data>
<data key="k12">New York</data>
<data key="k13">node</data>
<data key="k17">ADP</data>
<data key="k17">PRON</data>
<data key="k19">3.33333-5.55555</data>
</node>
<edge id="e0" source="exmaralda/test_doc#t_dipl_T286-T1" target="exmaralda/test_doc#T286" label="Coverage/annis/">
@@ -75,7 +75,7 @@ expression: actual.unwrap()
<event start="T0" end="T1">PRON</event>
<event start="T1" end="T2">VERB</event>
<event start="T2" end="T3">ADP</event>
<event start="T3" end="T5">ADP</event>
<event start="T3" end="T5">PRON</event>
</tier>
<tier speaker="dipl" category="sentence" type="a" id="TIER18" display-name="dipl[sentence]">
<event start="T0" end="T5">1</event>
@@ -75,7 +75,7 @@ expression: actual.unwrap()
<event start="T0" end="T1">PRON</event>
<event start="T1" end="T2">VERB</event>
<event start="T2" end="T3">ADP</event>
<event start="T3" end="T5">ADP</event>
<event start="T3" end="T5">PRON</event>
</tier>
<tier speaker="dipl" category="sentence" type="a" id="TIER18" display-name="dipl[sentence]">
<event start="T0" end="T5">1</event>
@@ -6,6 +6,6 @@ norm::lemma id_norm::lemma norm::pos id_norm::pos dipl::sentence id_dipl::senten
I exmaralda/test_doc#a_norm_T286-T0 PRON exmaralda/test_doc#a_norm_T286-T0 1 exmaralda/test_doc#a_dipl_T286-T4 I exmaralda/test_doc#t_norm_T286-T0 I'm exmaralda/test_doc#t_dipl_T286-T1
be exmaralda/test_doc#a_norm_T0-T1 VERB exmaralda/test_doc#a_norm_T0-T1 1 exmaralda/test_doc#a_dipl_T286-T4 am exmaralda/test_doc#t_norm_T0-T1 I'm exmaralda/test_doc#t_dipl_T286-T1
in exmaralda/test_doc#a_norm_T1-T2 ADP exmaralda/test_doc#a_norm_T1-T2 1 exmaralda/test_doc#a_dipl_T286-T4 in exmaralda/test_doc#t_norm_T1-T2 in exmaralda/test_doc#t_dipl_T1-T2
- New York exmaralda/test_doc#a_norm_T2-T4 ADP exmaralda/test_doc#a_norm_T2-T4 1 exmaralda/test_doc#a_dipl_T286-T4 New York exmaralda/test_doc#t_norm_T2-T4 New exmaralda/test_doc#t_dipl_T2-T3
- New York exmaralda/test_doc#a_norm_T2-T4 ADP exmaralda/test_doc#a_norm_T2-T4 1 exmaralda/test_doc#a_dipl_T286-T4 New York exmaralda/test_doc#t_norm_T2-T4 York exmaralda/test_doc#t_dipl_T3-T4
+ New York exmaralda/test_doc#a_norm_T2-T4 PRON exmaralda/test_doc#a_norm_T2-T4 1 exmaralda/test_doc#a_dipl_T286-T4 New York exmaralda/test_doc#t_norm_T2-T4 New exmaralda/test_doc#t_dipl_T2-T3
+ New York exmaralda/test_doc#a_norm_T2-T4 PRON exmaralda/test_doc#a_norm_T2-T4 1 exmaralda/test_doc#a_dipl_T286-T4 New York exmaralda/test_doc#t_norm_T2-T4 York exmaralda/test_doc#t_dipl_T3-T4

@@ -6,6 +6,6 @@ expression: export.unwrap()
"I" "exmaralda/test_doc#a_norm_T286-T0" "PRON" "exmaralda/test_doc#a_norm_T286-T0" "1" "exmaralda/test_doc#a_dipl_T286-T4" "I" "exmaralda/test_doc#t_norm_T286-T0" "I'm" "exmaralda/test_doc#t_dipl_T286-T1"
"be" "exmaralda/test_doc#a_norm_T0-T1" "VERB" "exmaralda/test_doc#a_norm_T0-T1" "1" "exmaralda/test_doc#a_dipl_T286-T4" "am" "exmaralda/test_doc#t_norm_T0-T1" "I'm" "exmaralda/test_doc#t_dipl_T286-T1"
"in" "exmaralda/test_doc#a_norm_T1-T2" "ADP" "exmaralda/test_doc#a_norm_T1-T2" "1" "exmaralda/test_doc#a_dipl_T286-T4" "in" "exmaralda/test_doc#t_norm_T1-T2" "in" "exmaralda/test_doc#t_dipl_T1-T2"
"New York" "exmaralda/test_doc#a_norm_T2-T4" "ADP" "exmaralda/test_doc#a_norm_T2-T4" "1" "exmaralda/test_doc#a_dipl_T286-T4" "New York" "exmaralda/test_doc#t_norm_T2-T4" "New" "exmaralda/test_doc#t_dipl_T2-T3"
"New York" "exmaralda/test_doc#a_norm_T2-T4" "ADP" "exmaralda/test_doc#a_norm_T2-T4" "1" "exmaralda/test_doc#a_dipl_T286-T4" "New York" "exmaralda/test_doc#t_norm_T2-T4" "York" "exmaralda/test_doc#t_dipl_T3-T4"
"New York" "exmaralda/test_doc#a_norm_T2-T4" "PRON" "exmaralda/test_doc#a_norm_T2-T4" "1" "exmaralda/test_doc#a_dipl_T286-T4" "New York" "exmaralda/test_doc#t_norm_T2-T4" "New" "exmaralda/test_doc#t_dipl_T2-T3"
"New York" "exmaralda/test_doc#a_norm_T2-T4" "PRON" "exmaralda/test_doc#a_norm_T2-T4" "1" "exmaralda/test_doc#a_dipl_T286-T4" "New York" "exmaralda/test_doc#t_norm_T2-T4" "York" "exmaralda/test_doc#t_dipl_T3-T4"

@@ -107,6 +107,6 @@ item []:
intervals [4]:
xmin = 3.33333
xmax = 5.55555
text = "ADP"
text = "PRON"


@@ -97,7 +97,7 @@ item []:
intervals [4]:
xmin = 3.33333
xmax = 5.55555
- text = "ADP"
+ text = "PRON"
item [5]:
class = "IntervalTier"
name = "dipl::sentence"
@@ -103,6 +103,6 @@ item []:
intervals [4]:
xmin = 3.33333
xmax = 5.55555
- text = "ADP"
+ text = "PRON"


@@ -159,7 +159,7 @@ expression: r.unwrap()
<data key="k10">norm</data>
<data key="k11">New York</data>
<data key="k12">node</data>
<data key="k16">ADP</data>
<data key="k16">PRON</data>
<data key="k18">3.33333-5.55555</data>
</node>
<edge id="e0" source="import/exmaralda/test_doc#t_dipl_T286-T1" target="import/exmaralda/test_doc#T286" label="Coverage/annis/">
11 changes: 9 additions & 2 deletions src/lib.rs
@@ -32,8 +32,9 @@ use importer::{
Importer,
};
use manipulator::{
- check::Check, chunker::Chunk, collapse::Collapse, enumerate::EnumerateMatches, link::LinkNodes,
- map::MapAnnos, no_op::NoOp, re::Revise, split::SplitValues, visualize::Visualize, Manipulator,
+ check::Check, chunker::Chunk, collapse::Collapse, enumerate::EnumerateMatches,
+ filter::FilterNodes, link::LinkNodes, map::MapAnnos, no_op::NoOp, re::Revise,
+ split::SplitValues, visualize::Visualize, Manipulator,
};
use serde_derive::Deserialize;
use struct_field_names_as_array::FieldNamesAsSlice;
@@ -288,6 +289,7 @@ impl ReadFromDiscriminants {
pub enum GraphOp {
Check(Check), // no default, has a (required) path attribute
Collapse(Collapse), // no default, there is no such thing as a default component
+ Filter(FilterNodes),
Visualize(#[serde(default)] Visualize),
Enumerate(#[serde(default)] EnumerateMatches),
Link(LinkNodes), // no default, has required attributes
@@ -318,6 +320,7 @@ impl GraphOp {
GraphOp::Enumerate(m) => m,
GraphOp::Chunk(m) => m,
GraphOp::Split(m) => m,
+ GraphOp::Filter(m) => m,
}
}
}
@@ -335,6 +338,7 @@ impl GraphOpDiscriminants {
GraphOpDiscriminants::Chunk => Chunk::DOCS,
GraphOpDiscriminants::None => NoOp::DOCS,
GraphOpDiscriminants::Split => SplitValues::DOCS,
+ GraphOpDiscriminants::Filter => FilterNodes::DOCS,
}
}

@@ -360,6 +364,9 @@ impl GraphOpDiscriminants {
GraphOpDiscriminants::Split => {
(SplitValues::FIELD_NAMES_AS_SLICE, SplitValues::FIELD_DOCS)
}
+ GraphOpDiscriminants::Filter => {
+ (FilterNodes::FIELD_NAMES_AS_SLICE, FilterNodes::FIELD_DOCS)
+ }
};
for (idx, n) in field_names.iter().enumerate() {
if idx < field_docs.len() {
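With the new Filter(FilterNodes) variant registered above, a workflow file can select the operation by its action name. The sketch below shows how such a step could look, following the [[graph_op]] convention used in the module documentation of the new file further down; the query value and the explicit inverse flag are illustrative and not part of this commit:

```toml
# Hypothetical workflow step: keep only the nodes matched by the AQL query.
# Real tokens and the timeline are always preserved in default mode.
[[graph_op]]
action = "filter"

[graph_op.config]
query = "pos=/PRON/"
inverse = false
```

Setting inverse = true flips the behaviour and deletes the matching nodes instead, again sparing the coverage terminals.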
233 changes: 233 additions & 0 deletions src/manipulator/filter.rs
@@ -0,0 +1,233 @@
use std::collections::BTreeSet;

use anyhow::anyhow;
use documented::{Documented, DocumentedFields};
use graphannis::{
aql,
graph::NodeID,
model::{AnnotationComponent, AnnotationComponentType},
update::{GraphUpdate, UpdateEvent},
};
use graphannis_core::graph::{ANNIS_NS, NODE_NAME_KEY, NODE_TYPE_KEY};
use serde::Deserialize;
use struct_field_names_as_array::FieldNamesAsSlice;

use super::Manipulator;

/// This module acts as a positive filter, i. e., all nodes that do not match the query and are not real tokens
/// are deleted. In inverse mode, all matching nodes (except real tokens) get deleted. This only applies to nodes
/// that are of node type "node". Other node types will be ignored.
///
/// The following example configuration deletes all nodes that are annotated to be nouns and are not real tokens:
/// ```toml
/// [[graph_op]]
/// action = "filter"
///
/// [graph_op.config]
/// query = "pos=/NOUN/"
/// inverse = true
/// ```
#[derive(Deserialize, Documented, DocumentedFields, FieldNamesAsSlice)]
#[serde(deny_unknown_fields)]
pub struct FilterNodes {
/// The AQL query to use to identify all relevant nodes.
///
/// Example:
/// ```toml
/// [graph_op.config]
/// query = "pos=/NOUN/"
/// ```
query: String,
/// If this is set to true, all matching nodes, that are not coverage terminals ("real tokens"), are deleted. If false (default),
/// the matching nodes and all real tokens are preserved, all other nodes are deleted.
///
/// Example:
/// ```toml
/// [graph_op.config]
/// query = "pos=/NOUN/"
/// inverse = true
/// ```
#[serde(default)]
inverse: bool,
}

impl Manipulator for FilterNodes {
fn manipulate_corpus(
&self,
graph: &mut graphannis::AnnotationGraph,
_workflow_directory: &std::path::Path,
_step_id: crate::StepID,
_tx: Option<crate::workflow::StatusSender>,
) -> Result<(), Box<dyn std::error::Error>> {
let mut update = GraphUpdate::default();
let query = aql::parse(&self.query, false)?;
let mut matching_nodes = BTreeSet::default();
let node_annos = graph.get_node_annos();
// collect timeline nodes along component "Ordering/annis/" to also keep the timeline
let terminals = {
let mut v = BTreeSet::default();
if let Some(storage) = graph.get_graphstorage(&AnnotationComponent::new(
AnnotationComponentType::Ordering,
ANNIS_NS.into(),
"".into(),
)) {
let roots = storage
.source_nodes()
.flatten()
.filter(|n| !storage.has_ingoing_edges(*n).unwrap_or_default());
for root in roots {
storage
.find_connected(root, 0, std::ops::Bound::Excluded(usize::MAX))
.flatten()
.for_each(|n| {
v.insert(n);
});
}
}
v
};
aql::execute_query_on_graph(graph, &query, true, None)?
.flatten()
.for_each(|group| {
for member in group {
matching_nodes.insert(member.node);
}
});
dbg!(&matching_nodes);
dbg!(&terminals);
if self.inverse {
// delete matching nodes (without terminals aka real tokens)
for n in matching_nodes.difference(&terminals) {
if let Some(node_name) = node_annos.get_value_for_item(n, &NODE_NAME_KEY)? {
update.add_event(UpdateEvent::DeleteNode {
node_name: node_name.to_string(),
})?;
} else {
return Err(anyhow!("Node has no name. This is invalid.").into());
}
}
} else {
// delete non-matching nodes of type "node" (excluding real tokens)
let max_id = node_annos.get_largest_item()?.unwrap_or(NodeID::MAX);
for n in 0..max_id {
if let Some(node_type) = node_annos.get_value_for_item(&n, &NODE_TYPE_KEY)? {
if !matching_nodes.contains(&n)
&& !terminals.contains(&n)
&& &*node_type == "node"
{
if let Some(node_name) =
node_annos.get_value_for_item(&n, &NODE_NAME_KEY)?
{
update.add_event(UpdateEvent::DeleteNode {
node_name: node_name.to_string(),
})?;
} else {
return Err(anyhow!("Node has no name. This is invalid.").into());
}
}
}
}
}
graph.apply_update(&mut update, |_| {})?;
Ok(())
}
}

#[cfg(test)]
mod tests {
use std::{fs, path::Path};

use graphannis::AnnotationGraph;
use insta::assert_snapshot;

use crate::{
exporter::graphml::GraphMLExporter,
importer::{exmaralda::ImportEXMARaLDA, Importer},
manipulator::{filter::FilterNodes, Manipulator},
test_util::export_to_string,
StepID,
};

#[test]
fn default() {
let exmaralda = ImportEXMARaLDA {};
let mprt = exmaralda.import_corpus(
Path::new("tests/data/import/exmaralda/clean/import/exmaralda/"),
StepID {
module_name: "test_import_exb".to_string(),
path: None,
},
None,
);
assert!(mprt.is_ok());
let mut update_import = mprt.unwrap();
let g = AnnotationGraph::with_default_graphstorages(true);
assert!(g.is_ok());
let mut graph = g.unwrap();
assert!(graph.apply_update(&mut update_import, |_| {}).is_ok());
let manipulation = FilterNodes {
query: "pos=/PRON/".to_string(),
inverse: false,
};
assert!(manipulation
.manipulate_corpus(
&mut graph,
Path::new("./"),
StepID {
module_name: "test_filter".to_string(),
path: None
},
None
)
.is_ok());
let export = export_to_string(&graph, GraphMLExporter::default());
assert!(export.is_ok(), "error: {:?}", export.err());
assert_snapshot!(export.unwrap());
}

#[test]
fn inverse() {
let exmaralda = ImportEXMARaLDA {};
let mprt = exmaralda.import_corpus(
Path::new("tests/data/import/exmaralda/clean/import/exmaralda/"),
StepID {
module_name: "test_import_exb".to_string(),
path: None,
},
None,
);
assert!(mprt.is_ok());
let mut update_import = mprt.unwrap();
let g = AnnotationGraph::with_default_graphstorages(true);
assert!(g.is_ok());
let mut graph = g.unwrap();
assert!(graph.apply_update(&mut update_import, |_| {}).is_ok());
let manipulation = FilterNodes {
query: "pos=/PRON/".to_string(),
inverse: true,
};
assert!(manipulation
.manipulate_corpus(
&mut graph,
Path::new("./"),
StepID {
module_name: "test_filter".to_string(),
path: None
},
None
)
.is_ok());
let export = export_to_string(&graph, GraphMLExporter::default());
assert!(export.is_ok(), "error: {:?}", export.err());
assert_snapshot!(export.unwrap());
}

#[test]
fn deserialize() {
let toml_str =
fs::read_to_string(Path::new("./tests/data/graph_op/filter/deserialize.toml"))
.unwrap_or_default();
let filter_nodes: Result<FilterNodes, _> = toml::from_str(toml_str.as_str());
assert!(filter_nodes.is_ok(), "error: {:?}", filter_nodes.err());
}
}
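The deserialize test above reads tests/data/graph_op/filter/deserialize.toml, which is not reproduced in this diff. Since FilterNodes derives Deserialize with deny_unknown_fields and only query is required (inverse defaults to false), a config of roughly the following shape would parse; the actual committed file may differ:

```toml
# Illustrative only: minimal contents that the FilterNodes struct would accept.
query = "pos=/NOUN/"
inverse = true
```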
1 change: 1 addition & 0 deletions src/manipulator/mod.rs
@@ -3,6 +3,7 @@ pub mod check;
pub mod chunker;
pub mod collapse;
pub mod enumerate;
+ pub mod filter;
pub mod link;
pub mod map;
pub mod no_op;