From 87c910559acf42f9338ace02158a920e3f432817 Mon Sep 17 00:00:00 2001 From: Sivaprasad S Date: Mon, 18 Dec 2023 19:22:17 +0530 Subject: [PATCH] Provision iterator syntax sugar in python This change adds a flexible and reusable iterator system for processing annotations in Python, along with associated tests. `.gitattributes`: Added a new file with various attributes for Git, specifying binary handling for certain file types. `bindings/python/iterators.py`: Added a new Python module (iterators.py) that defines iterators for processing annotations, sentences, and words. Includes WordIterator and SentenceIterator classes. Provides `sentences()` and `words()` as syntactic sugar. `bindings/python/tests/test_iterators.py`: Added a new test module (`test_iterators.py`) to test the functionality of the iterators in the `iterators.py` module. `slimt/CMakeLists.txt`: Modified the CMakeLists.txt file to quote the arguments passed to `target_link_libraries` and `target_include_directories`. Pull request: https://github.com/jerinphilip/slimt/pull/40 --- .gitattributes | 27 ++++++++++ bindings/python/iterators.py | 67 +++++++++++++++++++++++++ bindings/python/tests/test_encoding.py | 2 +- bindings/python/tests/test_iterators.py | 44 ++++++++++++++++ slimt/CMakeLists.txt | 11 ++-- 5 files changed, 145 insertions(+), 6 deletions(-) create mode 100644 .gitattributes create mode 100644 bindings/python/iterators.py create mode 100644 bindings/python/tests/test_iterators.py diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 00000000..ea04562a --- /dev/null +++ b/.gitattributes @@ -0,0 +1,27 @@ +* text=auto + +# Compiled Object files +*.slo binary +*.lo binary +*.o binary +*.obj binary + +# Precompiled Headers +*.gch binary +*.pch binary + +# Compiled Dynamic libraries +*.so binary +*.dylib binary +*.dll binary + +# Compiled Static libraries +*.lai binary +*.la binary +*.a binary +*.lib binary + +# Executables +*.exe binary +*.out binary +*.app binary diff --git 
a/bindings/python/iterators.py b/bindings/python/iterators.py new file mode 100644 index 00000000..679a7cd4 --- /dev/null +++ b/bindings/python/iterators.py @@ -0,0 +1,67 @@ +class WordIterator: + def __init__(self, annotation, sentence_id=None): + self._annotation = annotation + + if sentence_id == None: + self._sentence_id = 0 + self._max_sentence_id = self._annotation.sentence_count() + else: + self._sentence_id = sentence_id + self._max_sentence_id = sentence_id + 1 + + self._word_id = -1 + + def __iter__(self): + self._word_id = -1 + return self + + def __next__(self): + if self._annotation.sentence_count() == 0: + raise StopIteration + + self._word_id += 1 + if self._word_id >= self._annotation.word_count(self._sentence_id): + self._sentence_id += 1 + if self._sentence_id >= self._max_sentence_id: + raise StopIteration + self._word_id = 0 + return self + + def surface(self): + range = self.range() + return self._annotation.text[range.begin:range.end] + + def range(self): + return self._annotation.word_as_range(self._sentence_id, self._word_id) + + def id(self): + return (self._sentence_id, self._word_id) + +class SentenceIterator: + def __init__(self, annotation): + self._annotation = annotation + self._sentence_id = -1 + + def __iter__(self): + self._sentence_id = -1 + return self + + def __next__(self): + self._sentence_id += 1 + if self._sentence_id >= self._annotation.sentence_count(): + raise StopIteration + return self + + def words(self): + return WordIterator(self._annotation, self._sentence_id) + + def __repr__(self): + range = self._annotation.sentence_as_range(self._sentence_id) + sentence = self._annotation.text[range.begin:range.end] + return f'{sentence}' + +def sentences(annotation): + return SentenceIterator(annotation) + +def words(annotation, sentence_id=None): + return WordIterator(annotation, sentence_id) diff --git a/bindings/python/tests/test_encoding.py b/bindings/python/tests/test_encoding.py index a367cb66..f1606091 100644 --- 
a/bindings/python/tests/test_encoding.py +++ b/bindings/python/tests/test_encoding.py @@ -4,7 +4,7 @@ from collections import namedtuple -def test_basic(service, models): +def test_encoding(service, models): Pair = namedtuple("Pair", ["byte", "utf8"]) source = "no sé 😀 😃 😄 😁 😆 ⛄ 🤔" model = models[1] diff --git a/bindings/python/tests/test_iterators.py b/bindings/python/tests/test_iterators.py new file mode 100644 index 00000000..5e58448e --- /dev/null +++ b/bindings/python/tests/test_iterators.py @@ -0,0 +1,44 @@ +# type: ignore +from slimt import iterators + +def test_iterators(service, models): + source = "Hi, How are you? Its been a long time.\nCan you help me out with some things?" + model = models[1] + response = service.translate(model, [source], html=False)[0] + + target = response.target + text = target.text + + sentences = iterators.sentences(target) + words = iterators.words(target) + + sentence_count = target.sentence_count() + for sentence_idx, word_iter in zip(range(sentence_count), sentences): + word_count = target.word_count(sentence_idx) + for word_idx, word in zip(range(word_count), word_iter.words()): + + expected_range = target.word_as_range(sentence_idx, word_idx) + expected_word = text[expected_range.begin:expected_range.end] + + # For Sentence Iterator and Word Iterator + # Range + reconstructed = word.range() + + assert expected_range.begin == reconstructed.begin + assert expected_range.end == reconstructed.end + + # Word + reconstructed = word.surface() + assert expected_word == reconstructed + + # For Global Word Iterator + word_global = next(words) + + # Range + reconstructed = word_global.range() + assert expected_range.begin == reconstructed.begin + assert expected_range.end == reconstructed.end + + # Word + reconstructed = word_global.surface() + assert expected_word == reconstructed diff --git a/slimt/CMakeLists.txt b/slimt/CMakeLists.txt index 4711444a..8bf507d4 100644 --- a/slimt/CMakeLists.txt +++ b/slimt/CMakeLists.txt @@ -91,15 
+91,16 @@ foreach(SLIMT_LIB IN LISTS SLIMT_LIBRARIES) target_link_libraries( ${SLIMT_LIB} PUBLIC ${SLIMT_PUBLIC_LIBS} - INTERFACE $ - PRIVATE $) + INTERFACE "$" + PRIVATE "$") target_include_directories( ${SLIMT_LIB} PUBLIC - $ - $ - $) + "$" + "$" + "$" + ) target_link_options(${SLIMT_LIB} PUBLIC ${SLIMT_LINK_OPTIONS}) target_compile_options(${SLIMT_LIB} PRIVATE ${SLIMT_COMPILE_OPTIONS})