Skip to content

Commit

Permalink
Document and test the repetition functionality in the map module
Browse files Browse the repository at this point in the history
  • Loading branch information
thomaskrause committed Sep 2, 2024
1 parent 3d1ccf1 commit 16f21ce
Show file tree
Hide file tree
Showing 2 changed files with 150 additions and 31 deletions.
118 changes: 118 additions & 0 deletions src/manipulator/map.rs
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,28 @@ use struct_field_names_as_array::FieldNamesAsSlice;
/// to more than one copy of the query by using arrays instead of a single
/// number. In this case, the node values are concatenated using a space as
/// separator.
///
/// You can also apply a set of rules repeatedly. By default, the rules are
/// only executed once. But you can configure
/// ```toml
/// repetition = {Fixed = {n = 3}}
///
/// [[rules]]
/// # ...
/// ```
/// at the beginning to set the fixed number of repetitions (in this case `3`).
/// An even more advanced usage is to apply the changes until none of the
/// queries in the rules matches anymore.
/// ```toml
/// repetition = "UntilUnchanged"
///
/// [[rules]]
/// # ...
/// ```
/// Make sure that the updates in the rules actually change the condition of the
/// rule, otherwise you might get an endless loop and the workflow will never
/// finish!
///
#[derive(Deserialize, Documented, DocumentedFields, FieldNamesAsSlice)]
#[serde(deny_unknown_fields)]
pub struct MapAnnos {
Expand Down Expand Up @@ -447,6 +469,8 @@ mod tests {
AnnotationGraph,
};
use graphannis_core::{annostorage::ValueSearch, graph::ANNIS_NS};

use pretty_assertions::assert_eq;
use tempfile::NamedTempFile;

use crate::{manipulator::Manipulator, test_util, util::example_generator, StepID};
Expand Down Expand Up @@ -637,6 +661,92 @@ replacements = [
assert_eq!("ellembogem|ellenbogem|ellembogen|ellenbogen", result);
}

#[test]
fn repeat_mapping_fixed() {
let config = r#"
repetition = {Fixed = {n = 3}}
[[rules]]
query = "tok"
target = 1
ns = "annis"
name = "tok"
[rules.value]
target = 1
# Only replace the last character of each token.
replacements = [
['(\w\u0304?)X*$', 'X'],
]
"#;
let mut g = tokens_with_macrons().unwrap();

let tmp = NamedTempFile::new().unwrap();

std::fs::write(tmp.path(), config).unwrap();
let mapper = MapAnnos {
rule_file: tmp.path().to_path_buf(),
};
let step_id = StepID {
module_name: "test_map".to_string(),
path: None,
};
mapper
.manipulate_corpus(&mut g, tmp.path().parent().unwrap(), step_id, None)
.unwrap();

let th = TokenHelper::new(&g).unwrap();

let tokens = th.get_ordered_token("doc", None).unwrap();
let text = th.spanned_text(&tokens).unwrap();

// The rule is applied three times, to the last 3 characters of each
// token should have been replaced.
assert_eq!("X krX wechX etX anðthaX ellēbX hX", text);
}

#[test]
fn repeat_mapping_until_unchanged() {
let config = r#"
repetition = "UntilUnchanged"
[[rules]]
query = 'tok!="X"'
target = 1
ns = "annis"
name = "tok"
[rules.value]
target = 1
replacements = [
['[^X]X*$', 'X'],
]
"#;
let mut g = tokens_with_macrons().unwrap();

let tmp = NamedTempFile::new().unwrap();

std::fs::write(tmp.path(), config).unwrap();
let mapper = MapAnnos {
rule_file: tmp.path().to_path_buf(),
};
let step_id = StepID {
module_name: "test_map".to_string(),
path: None,
};
mapper
.manipulate_corpus(&mut g, tmp.path().parent().unwrap(), step_id, None)
.unwrap();

let th = TokenHelper::new(&g).unwrap();

let tokens = th.get_ordered_token("doc", None).unwrap();
let text = th.spanned_text(&tokens).unwrap();

// The rule is applied until all characters have been replaced.
assert_eq!("X X X X X X X", text);
}

#[test]
fn test_map_spans() {
let mut updates = GraphUpdate::new();
Expand Down Expand Up @@ -809,6 +919,7 @@ value = "PROPN"
Ok(g)
}

/// Create tokens "ein kraut wechſzt etwan anðthalbē ellēbogē hoch".
fn tokens_with_macrons() -> Result<AnnotationGraph, Box<dyn std::error::Error>> {
let mut g = AnnotationGraph::with_default_graphstorages(true)?;
let mut u = GraphUpdate::default();
Expand Down Expand Up @@ -839,6 +950,13 @@ value = "PROPN"
anno_name: "tok".to_string(),
anno_value: text.to_string(),
})?;
u.add_event(UpdateEvent::AddEdge {
source_node: format!("doc#t{i}"),
target_node: "doc".to_string(),
layer: ANNIS_NS.to_string(),
component_type: AnnotationComponentType::PartOf.to_string(),
component_name: "".to_string(),
})?;
if i > 0 {
u.add_event(UpdateEvent::AddEdge {
source_node: format!("doc#t{i}"),
Expand Down
63 changes: 32 additions & 31 deletions tests/snapshots/cli__module_info.snap
Original file line number Diff line number Diff line change
Expand Up @@ -11,37 +11,38 @@ spans and merged cells can be used for spans that cover more than one token.

*Configuration*

| name | description |
|-------------|-----------------------------------------------------------------------------------------------------------|
| column_map | Maps token columns to annotation columns. If there is more than one |
| | token column, it is assumed that the corpus has multiple segmentations. |
| | In this case, it is necessary to tell the importer which annotation column belongs to which token column. |
| | |
| | Example with the two token columns "dipl" and "norm": |
| | |
| | ```toml |
| | [import.config] |
| | column_map = {"dipl" = ["sentence"], "norm" = ["pos", "lemma", "seg"]} |
| | ``` |
| | The column "sentence" must be always be aligned with the "dipl" token |
| | and "pos", "lemma" and "seg" are aligned with the "norm" token. |
| fallback | If given, the name of the token column to be used when there is no |
| | explicit mapping given in the `column_map` parameter for this annotation |
| | column. |
| | |
| | Example with two token columns "dipl" and "norm", where all annotation |
| | columns except "lemma" and "pos" are mapped to the "dipl" token column: |
| | |
| | ```toml |
| | [import.config] |
| | column_map = {"dipl" = [], "norm" = ["pos", "lemma"]} |
| | fallback = "dipl" |
| | ``` |
| datasheet | Optional value of the Excel sheet that contains the data. If not given, |
| | the first sheet is used. |
| metasheet | Optional value of the Excel sheet that contains the metadata table. If |
| | no metadata is imported. |
| token_annos | Map the given annotation columns as token annotations and not as span if possible. |
| name | description |
|---------------------|-----------------------------------------------------------------------------------------------------------|
| column_map | Maps token columns to annotation columns. If there is more than one |
| | token column, it is assumed that the corpus has multiple segmentations. |
| | In this case, it is necessary to tell the importer which annotation column belongs to which token column. |
| | |
| | Example with the two token columns "dipl" and "norm": |
| | |
| | ```toml |
| | [import.config] |
| | column_map = {"dipl" = ["sentence"], "norm" = ["pos", "lemma", "seg"]} |
| | ``` |
| | The column "sentence" must be always be aligned with the "dipl" token |
| | and "pos", "lemma" and "seg" are aligned with the "norm" token. |
| fallback | If given, the name of the token column to be used when there is no |
| | explicit mapping given in the `column_map` parameter for this annotation |
| | column. |
| | |
| | Example with two token columns "dipl" and "norm", where all annotation |
| | columns except "lemma" and "pos" are mapped to the "dipl" token column: |
| | |
| | ```toml |
| | [import.config] |
| | column_map = {"dipl" = [], "norm" = ["pos", "lemma"]} |
| | fallback = "dipl" |
| | ``` |
| datasheet | Optional value of the Excel sheet that contains the data. If not given, |
| | the first sheet is used. |
| metasheet | Optional value of the Excel sheet that contains the metadata table. If |
| | no metadata is imported. |
| metasheet_skip_rows | Skip the first given rows in the meta data sheet. |
| token_annos | Map the given annotation columns as token annotations and not as span if possible. |

# Exporters

Expand Down

0 comments on commit 16f21ce

Please sign in to comment.