diff --git a/elpis/datasets/dataset.py b/elpis/datasets/dataset.py
index 27b4c92..0323bbc 100644
--- a/elpis/datasets/dataset.py
+++ b/elpis/datasets/dataset.py
@@ -153,6 +153,14 @@ def from_dict(cls, data: Dict[str, Any]) -> Dataset:
             elan_options=elan_options,
         )
 
+    @property
+    def valid_transcriptions(self):
+        return (
+            self._transcript_files()
+            .difference(self.mismatched_files())
+            .difference(self.colliding_files())
+        )
+
     def to_batches(self) -> List[ProcessingBatch]:
         """Converts a valid dataset to a list of processing jobs, matching
         transcript and audio files.
@@ -164,7 +172,7 @@ def to_batches(self) -> List[ProcessingBatch]:
                 cleaning_options=self.cleaning_options,
                 elan_options=self.elan_options,
             )
-            for transcription_file in self._transcript_files()
+            for transcription_file in self.valid_transcriptions
         ]
 
     def to_dict(self) -> Dict[str, Any]:
diff --git a/tests/datasets/test_dataset.py b/tests/datasets/test_dataset.py
index ff78b61..c76e76d 100644
--- a/tests/datasets/test_dataset.py
+++ b/tests/datasets/test_dataset.py
@@ -47,13 +47,18 @@ def test_serialize_dataset_options():
 FILES_WITHOUT_ELAN = ["1.txt", "1.wav"]
 MISMATCHED_FILES = ["1.eaf", "1.wav", "2.wav", "3.txt"]
 COLLIDING_FILES = ["1.eaf", "1.wav", "1.txt"]
-
+MESSY_FILES = ["1.eaf", "1.wav", "2.eaf", "2.txt", "2.wav", "3.eaf", "4.wav"]
 DATASET_DICT = {
     "name": "dataset",
     "files": FILES_WITH_ELAN,
     "cleaning_options": CLEANING_OPTIONS_DICT,
 }
+MESSY_DATASET_DICT = {
+    "name": "dataset",
+    "files": MESSY_FILES,
+    "cleaning_options": CLEANING_OPTIONS_DICT,
+}
 
 DATASET_DICT_ELAN = DATASET_DICT | {"elan_options": ELAN_OPTIONS_DICT}
 
 
@@ -124,6 +129,11 @@ def test_duplicate_files():
     assert set(dataset.colliding_files()) == {Path("1.eaf"), Path("1.txt")}
 
 
+def test_valid_transcriptions():
+    dataset = Dataset.from_dict(MESSY_DATASET_DICT)
+    assert len(dataset.valid_transcriptions) == 1
+
+
 def test_dataset_batching():
     dataset = Dataset.from_dict(DATASET_DICT)
     batch = dataset.to_batches()