Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/spreadsheet unknown columns #137

Merged
merged 4 commits into from
Sep 13, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added

- a separator for joining node values in `link` can be set with attribute `value_sep`
- spreadsheet imports can now be configured with a fallback token column for annotation names that are not mentioned in the column map; an empty string means that such annotations are mapped to the timeline directly

### Fixed

Expand Down
178 changes: 151 additions & 27 deletions src/importer/spreadsheet.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,16 +22,17 @@

pub const MODULE_NAME: &str = "import_spreadsheet";

#[derive(Default, Deserialize)]

Check warning on line 25 in src/importer/spreadsheet.rs

View workflow job for this annotation

GitHub Actions / Execute tests with code coverage

25 line is not covered with tests
#[serde(default)]
pub struct ImportSpreadsheet {
column_map: BTreeMap<String, BTreeSet<String>>,
fallback: Option<String>,
}

impl Module for ImportSpreadsheet {
fn module_name(&self) -> &str {
MODULE_NAME
}

Check warning on line 35 in src/importer/spreadsheet.rs

View workflow job for this annotation

GitHub Actions / Execute tests with code coverage

33-35 lines are not covered with tests
}

fn import_workbook(
Expand All @@ -39,20 +40,38 @@
root_path: &Path,
path: &Path,
column_map: &BTreeMap<String, BTreeSet<String>>,
fallback: &Option<String>,
tx: &Option<StatusSender>,
) -> Result<(), Box<dyn std::error::Error>> {
let doc_path = insert_corpus_nodes_from_path(update, root_path, path)?;
let book = umya_spreadsheet::reader::xlsx::read(path)?;
let sheet = book.get_sheet(&0)?;
let merged_cells = sheet.get_merge_cells();
let mut fullmap = column_map.clone();
let known_names = column_map.values().flatten().collect::<BTreeSet<&String>>();
if let Some(fallback_name) = &fallback {
if fallback_name.is_empty() {
fullmap.insert("".to_string(), BTreeSet::new());
}
}
let name_to_col_0index = {
let mut m = BTreeMap::new();
let header_row = sheet.get_collection_by_row(&1);
for cell in header_row {
let name = cell.get_cell_value().get_value().trim().to_string();
if !name.is_empty() {
m.insert(name, cell.get_coordinate().get_col_num() - 1);
m.insert(name.to_string(), cell.get_coordinate().get_col_num() - 1);
if let Some(fallback_name) = &fallback {
if !known_names.contains(&name) && !fullmap.contains_key(&name) {
if let Some(anno_names) = fullmap.get_mut(fallback_name) {
anno_names.insert(name);
} else if let Some(sender) = tx {
let message = StatusMessage::Warning(format!("`{fallback_name}` is not a valid fallback. Only empty string and keys of the column map are allowed. Column `{name}` will be ignored."));
sender.send(message)?;
}
}
}
}

Check warning on line 74 in src/importer/spreadsheet.rs

View workflow job for this annotation

GitHub Actions / Execute tests with code coverage

74 line is not covered with tests
}
m
};
Expand All @@ -68,14 +87,14 @@
let start_col = match cell_range.get_coordinate_start_col().as_ref() {
Some(c) => c,
None => {
if let Some(sender) = tx {
let message = StatusMessage::Warning(format!(
"Could not parse start column of merged cell {}",
cell_range.get_range()
));
sender.send(message)?;
}
continue;

Check warning on line 97 in src/importer/spreadsheet.rs

View workflow job for this annotation

GitHub Actions / Execute tests with code coverage

90-97 lines are not covered with tests
}
};
let col_1i = start_col.get_num();
Expand All @@ -88,7 +107,7 @@
cell_range.get_range()
));
sender.send(message)?;
}

Check warning on line 110 in src/importer/spreadsheet.rs

View workflow job for this annotation

GitHub Actions / Execute tests with code coverage

110 line is not covered with tests
start_col
}
};
Expand All @@ -104,14 +123,14 @@
let start_row = match cell_range.get_coordinate_start_row().as_ref() {
Some(r) => r,
None => {
if let Some(sender) = tx {
let message = StatusMessage::Warning(format!(
"Could not parse start row of merged cell {}",
cell_range.get_range()
));
sender.send(message)?;
}
continue;

Check warning on line 133 in src/importer/spreadsheet.rs

View workflow job for this annotation

GitHub Actions / Execute tests with code coverage

126-133 lines are not covered with tests
}
};
let start_1i = start_row.get_num();
Expand All @@ -124,7 +143,7 @@
cell_range.get_range()
));
sender.send(message)?;
}

Check warning on line 146 in src/importer/spreadsheet.rs

View workflow job for this annotation

GitHub Actions / Execute tests with code coverage

146 line is not covered with tests
start_row
}
};
Expand All @@ -140,7 +159,7 @@
cell_range.get_range()
));
sender.send(message)?;
}

Check warning on line 162 in src/importer/spreadsheet.rs

View workflow job for this annotation

GitHub Actions / Execute tests with code coverage

162 line is not covered with tests
}
m
};
Expand Down Expand Up @@ -178,8 +197,12 @@
})?;
Ok::<(), Box<dyn std::error::Error>>(())
})?;
for (tok_name, anno_names) in column_map {
let mut names = vec![tok_name];
for (tok_name, anno_names) in &fullmap {
let mut names = if tok_name.is_empty() {
vec![]
} else {
vec![tok_name]
};
names.extend(anno_names);
for name in names {
let index_opt = match name_to_col_0index.get(name) {
Expand Down Expand Up @@ -211,18 +234,22 @@
node_name: node_name.to_string(),
node_type: "node".to_string(),
})?;
update.add_event(UpdateEvent::AddNodeLabel {
node_name: node_name.to_string(),
anno_ns: ANNIS_NS.to_string(),
anno_name: "tok".to_string(),
anno_value: value.to_string(),
})?;
update.add_event(UpdateEvent::AddNodeLabel {
node_name: node_name.to_string(),
anno_ns: ANNIS_NS.to_string(),
anno_name: "layer".to_string(),
anno_value: tok_name.to_string(),
})?;
if name == tok_name {
update.add_event(UpdateEvent::AddNodeLabel {
node_name: node_name.to_string(),
anno_ns: ANNIS_NS.to_string(),
anno_name: "tok".to_string(),
anno_value: value.to_string(),
})?;
}
if !tok_name.is_empty() {
update.add_event(UpdateEvent::AddNodeLabel {
node_name: node_name.to_string(),
anno_ns: ANNIS_NS.to_string(),
anno_name: "layer".to_string(),
anno_value: tok_name.to_string(),
})?;
}
update.add_event(UpdateEvent::AddNodeLabel {
node_name: node_name.to_string(),
anno_ns: tok_name.to_string(),
Expand Down Expand Up @@ -254,10 +281,12 @@
},
)?;
}
} else {
// TODO warning
continue; // no tokenization, no mapping of dependent annotations
} else if let Some(sender) = tx {
let message =
StatusMessage::Info(format!("No column `{name}` in file {}", &doc_path));
sender.send(message)?;
continue;
}

Check warning on line 289 in src/importer/spreadsheet.rs

View workflow job for this annotation

GitHub Actions / Execute tests with code coverage

289 line is not covered with tests
}
}
Ok(())
Expand All @@ -273,7 +302,14 @@
let column_map = &self.column_map;
let all_files = get_all_files(input_path, vec!["xlsx"])?;
all_files.into_iter().try_for_each(|pb| {
import_workbook(&mut update, input_path, pb.as_path(), column_map, &tx)
import_workbook(
&mut update,
input_path,
pb.as_path(),
column_map,
&self.fallback,
&tx,
)
})?;
Ok(update)
}
Expand All @@ -291,7 +327,10 @@

use super::*;

fn run_spreadsheet_import(on_disk: bool) -> Result<(), Box<dyn std::error::Error>> {
fn run_spreadsheet_import(
on_disk: bool,
fallback: Option<String>,
) -> Result<(), Box<dyn std::error::Error>> {
let mut col_map = BTreeMap::new();
col_map.insert(
"dipl".to_string(),
Expand All @@ -301,18 +340,31 @@
);
col_map.insert(
"norm".to_string(),
vec!["pos".to_string(), "lemma".to_string()]
.into_iter()
.collect(),
{
match fallback {
None => vec!["pos".to_string(), "lemma".to_string()],
Some(_) => vec!["pos".to_string()],
}
}
.into_iter()
.collect(),
);
let importer = ImportSpreadsheet {
column_map: col_map,
fallback: fallback,
};
let path = Path::new("./tests/data/import/xlsx/clean/xlsx/");
let import = importer.import_corpus(path, None);
let mut u = import?;
let mut g = AnnotationGraph::new(on_disk)?;
g.apply_update(&mut u, |_| {})?;
let lemma_count = match &importer.fallback {
Some(v) => match &v[..] {
"norm" => 4,
_ => 0,
},
_ => 4,
};
let queries_and_results: [(&str, u64); 19] = [
("dipl", 4),
("norm", 4),
Expand All @@ -330,9 +382,9 @@
("dipl:seg _l_ dipl", 2),
("dipl:seg _r_ dipl", 2),
("norm:pos", 4),
("norm:lemma", 4),
("norm:lemma", lemma_count),
("norm:pos _=_ norm", 4),
("norm:lemma _=_ norm", 4),
("norm:lemma _=_ norm", lemma_count),
];
let corpus_name = "current";
let tmp_dir = tempdir_in(temp_dir())?;
Expand All @@ -350,7 +402,7 @@
let count = cs.count(query)?;
assert_eq!(
count, expected_result,
"Result for query `{}` does not match",

Check warning on line 405 in src/importer/spreadsheet.rs

View workflow job for this annotation

GitHub Actions / Execute tests with code coverage

405 line is not covered with tests
query_s
);
}
Expand All @@ -359,7 +411,7 @@

#[test]
fn spreadsheet_import_in_mem() {
let import = run_spreadsheet_import(false);
let import = run_spreadsheet_import(false, None);
assert!(
import.is_ok(),
"Spreadsheet import failed with error: {:?}",
Expand All @@ -369,7 +421,7 @@

#[test]
fn spreadsheet_import_on_disk() {
let import = run_spreadsheet_import(true);
let import = run_spreadsheet_import(true, None);
assert!(
import.is_ok(),
"Spreadsheet import failed with error: {:?}",
Expand All @@ -394,6 +446,7 @@
);
let importer = ImportSpreadsheet {
column_map: col_map,
fallback: None,
};
let path = Path::new("./tests/data/import/xlsx/dirty/xlsx/");
let (sender, receiver) = mpsc::channel();
Expand All @@ -419,11 +472,82 @@
);
let importer = ImportSpreadsheet {
column_map: col_map,
fallback: None,
};
let path = Path::new("./tests/data/import/xlsx/warnings/xlsx/");
let (sender, receiver) = mpsc::channel();
let import = importer.import_corpus(path, Some(sender));
assert!(import.is_ok());
assert_ne!(receiver.into_iter().count(), 0);
}

// Runs the shared import scenario with `norm` as the fallback token column,
// using an in-memory annotation graph.
#[test]
fn spreadsheet_fallback_value_in_mem() {
    // The first argument of `run_spreadsheet_import` is `on_disk`, so the
    // in-memory variant has to pass `false` (as `spreadsheet_import_in_mem` does).
    let import = run_spreadsheet_import(false, Some("norm".to_string()));
    assert!(
        import.is_ok(),
        "Spreadsheet import failed with error: {:?}",
        import.err()
    );
}

// Runs the shared import scenario with `norm` as the fallback token column,
// using a disk-based annotation graph.
#[test]
fn spreadsheet_fallback_value_on_disk() {
    // The first argument of `run_spreadsheet_import` is `on_disk`, so the
    // on-disk variant has to pass `true` (as `spreadsheet_import_on_disk` does).
    let import = run_spreadsheet_import(true, Some("norm".to_string()));
    assert!(
        import.is_ok(),
        "Spreadsheet import failed with error: {:?}",
        import.err()
    );
}

// Runs the shared import scenario with an empty fallback name, using an
// in-memory annotation graph.
#[test]
fn spreadsheet_empty_fallback_value_in_mem() {
    // The first argument of `run_spreadsheet_import` is `on_disk`, so the
    // in-memory variant has to pass `false`.
    let import = run_spreadsheet_import(false, Some("".to_string()));
    assert!(
        import.is_ok(),
        "Spreadsheet import failed with error: {:?}",
        import.err()
    );
}

// Runs the shared import scenario with an empty fallback name, using a
// disk-based annotation graph.
#[test]
fn spreadsheet_empty_fallback_value_on_disk() {
    // The first argument of `run_spreadsheet_import` is `on_disk`, so the
    // on-disk variant has to pass `true`.
    let import = run_spreadsheet_import(true, Some("".to_string()));
    assert!(
        import.is_ok(),
        "Spreadsheet import failed with error: {:?}",
        import.err()
    );
}

// Uses the fallback name `tok`, which is neither empty nor a key of the
// column map built below. The import itself must still succeed, and at least
// one status message has to arrive on the channel.
#[test]
fn spreadsheet_invalid_fallback_value() {
    // First pass: the shared scenario must not fail with this fallback.
    let shared_run = run_spreadsheet_import(false, Some("tok".to_string()));
    assert!(
        shared_run.is_ok(),
        "Spreadsheet import failed with error: {:?}",
        shared_run.err()
    );

    // Second pass: same column layout, but with a status channel attached so
    // that emitted messages can be counted.
    let mut col_map = BTreeMap::new();
    col_map.insert(
        String::from("dipl"),
        ["sentence", "seg"].iter().map(|s| s.to_string()).collect(),
    );
    col_map.insert(
        String::from("norm"),
        std::iter::once(String::from("pos")).collect(),
    );
    let importer = ImportSpreadsheet {
        column_map: col_map,
        fallback: Some(String::from("tok")),
    };
    let (sender, receiver) = mpsc::channel();
    let input_dir = Path::new("./tests/data/import/xlsx/clean/xlsx/");
    let outcome = importer.import_corpus(input_dir, Some(sender));
    assert!(outcome.is_ok());
    let message_count = receiver.into_iter().count();
    assert_ne!(message_count, 0);
}
}
Loading