From ef60531ebd00336707beda7ae60e1379febfa306 Mon Sep 17 00:00:00 2001 From: Sivaprasad Date: Mon, 11 Dec 2023 15:39:14 +0530 Subject: [PATCH 1/7] Provision iterator syntax sugar in python --- .gitattributes | 27 +++++++++++++++++ bindings/python/iterators/__init__.py | 2 ++ bindings/python/iterators/sentences.py | 24 +++++++++++++++ bindings/python/iterators/words.py | 38 +++++++++++++++++++++++ bindings/python/tests/test_encoding.py | 2 +- bindings/python/tests/test_iterators.py | 40 +++++++++++++++++++++++++ setup.py | 2 +- slimt/CMakeLists.txt | 10 +++---- 8 files changed, 138 insertions(+), 7 deletions(-) create mode 100644 .gitattributes create mode 100644 bindings/python/iterators/__init__.py create mode 100644 bindings/python/iterators/sentences.py create mode 100644 bindings/python/iterators/words.py create mode 100644 bindings/python/tests/test_iterators.py diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 00000000..58309376 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,27 @@ +* text=auto + +# Compiled Object files +*.slo binary +*.lo binary +*.o binary +*.obj binary + +# Precompiled Headers +*.gch binary +*.pch binary + +# Compiled Dynamic libraries +*.so binary +*.dylib binary +*.dll binary + +# Compiled Static libraries +*.lai binary +*.la binary +*.a binary +*.lib binary + +# Executables +*.exe binary +*.out binary +*.app binary \ No newline at end of file diff --git a/bindings/python/iterators/__init__.py b/bindings/python/iterators/__init__.py new file mode 100644 index 00000000..08fb471b --- /dev/null +++ b/bindings/python/iterators/__init__.py @@ -0,0 +1,2 @@ +from .sentences import SentenceIterator as sentences +from .words import WordIterator as words \ No newline at end of file diff --git a/bindings/python/iterators/sentences.py b/bindings/python/iterators/sentences.py new file mode 100644 index 00000000..2c613caa --- /dev/null +++ b/bindings/python/iterators/sentences.py @@ -0,0 +1,24 @@ +from .words import WordIterator + +class SentenceIterator: + def __init__(self, annotation): + self._annotation = annotation + self._sentence_id = -1 + + def __iter__(self): + self._sentence_id = -1 + return self + + def __next__(self): + self._sentence_id += 1 + if self._sentence_id >= self._annotation.sentence_count(): + raise StopIteration + return self + + def words(self): + return WordIterator(self._annotation, self._sentence_id) + + def __repr__(self): + range = self._annotation.sentence_as_range(self._sentence_id) + sentence = self._annotation.text[range.begin:range.end] + return f'{sentence}' diff --git a/bindings/python/iterators/words.py b/bindings/python/iterators/words.py new file mode 100644 index 00000000..b3701781 --- /dev/null +++ b/bindings/python/iterators/words.py @@ -0,0 +1,38 @@ +class WordIterator: + def __init__(self, annotation, sentence_id=None): + self._annotation = annotation + + if sentence_id == None: + self._sentence_id = 0 + self._max_sentence_id = self._annotation.sentence_count() + else: + self._sentence_id = sentence_id + self._max_sentence_id = sentence_id + 1 + + self._word_id = -1 + + def __iter__(self): + self._word_id = -1 + return self + + def __next__(self): + if self._annotation.sentence_count() == 0: + raise StopIteration + + self._word_id += 1 + if self._word_id >= self._annotation.word_count(self._sentence_id): + self._sentence_id += 1 + if self._sentence_id >= self._max_sentence_id: + raise StopIteration + self._word_id = 0 + return self + + def surface(self): + range = self.range() + return self._annotation.text[range.begin:range.end] + + def range(self): + return self._annotation.word_as_range(self._sentence_id, self._word_id) + + def id(self): + return (self._sentence_id, self._word_id) \ No newline at end of file diff --git a/bindings/python/tests/test_encoding.py b/bindings/python/tests/test_encoding.py index a367cb66..e45cec97 100644 --- a/bindings/python/tests/test_encoding.py +++ b/bindings/python/tests/test_encoding.py @@ -46,4 +46,4 @@ def test_basic(service, models): byte_range = byte.word_as_range(sentence_idx, word_idx) utf8_to_byte_range = utf8_to_byte.word_as_range(sentence_idx, word_idx) assert byte_range.begin == utf8_to_byte_range.begin - assert byte_range.end == utf8_to_byte_range.end + assert byte_range.end == utf8_to_byte_range.end \ No newline at end of file diff --git a/bindings/python/tests/test_iterators.py b/bindings/python/tests/test_iterators.py new file mode 100644 index 00000000..26559fb4 --- /dev/null +++ b/bindings/python/tests/test_iterators.py @@ -0,0 +1,40 @@ +# type: ignore +from slimt import iterators + +def test_iterators(service, models): + source = "Hi, How are you? Its been a long time.\nCan you help me out with some things?" + model = models[1] + response = service.translate(model, [source], html=False)[0] + + target = response.target + text = target.text + + sen_iter_tgt = iterators.sentences(target) + word_iter_global = iterators.words(target) + + sentence_count = target.sentence_count() + for sentence_idx, word_iter in zip(range(sentence_count), sen_iter_tgt): + word_count = target.word_count(sentence_idx) + for word_idx, word in zip(range(word_count), word_iter.words()): + + expected_text_range = target.word_as_range(sentence_idx, word_idx) + reconstructed_text_range = word.range() + + # For Sentence Iterator and Word Iterator + assert expected_text_range.begin == reconstructed_text_range.begin + assert expected_text_range.end == reconstructed_text_range.end + + expected_word = text[expected_text_range.begin:expected_text_range.end] + reconstructed_word = word.surface() + + assert expected_word == reconstructed_word + + word_global = next(word_iter_global) + + reconstructed_text_range_glob = word_global.range() + reconstructed_word_glob = word_global.surface() + + # For Global Word Iterator + assert expected_text_range.begin == reconstructed_text_range_glob.begin + assert expected_text_range.end == reconstructed_text_range_glob.end + assert expected_word == reconstructed_word_glob \ No newline at end of file diff --git a/setup.py b/setup.py index 100a0d8e..522ac2bd 100644 --- a/setup.py +++ b/setup.py @@ -211,7 +211,7 @@ def run(self): extras_require={"test": ["pytest>=6.0"]}, license_files=("LICENSE",), python_requires=">=3.6", - packages=["slimt", "slimt.tests"], + packages=["slimt", "slimt.tests", "slimt.iterators"], package_dir={"slimt": "bindings/python"}, install_requires=["pyyaml>=5.1", "appdirs"], entry_points={ diff --git a/slimt/CMakeLists.txt b/slimt/CMakeLists.txt index 4711444a..e838032a 100644 --- a/slimt/CMakeLists.txt +++ b/slimt/CMakeLists.txt @@ -91,15 +91,15 @@ foreach(SLIMT_LIB IN LISTS SLIMT_LIBRARIES) target_link_libraries( ${SLIMT_LIB} PUBLIC ${SLIMT_PUBLIC_LIBS} - INTERFACE $ - PRIVATE $) + INTERFACE "$" + PRIVATE "$") target_include_directories( ${SLIMT_LIB} PUBLIC - $ - $ - $) + "$" + "$" + "$") target_link_options(${SLIMT_LIB} PUBLIC ${SLIMT_LINK_OPTIONS}) target_compile_options(${SLIMT_LIB} PRIVATE ${SLIMT_COMPILE_OPTIONS}) From c67f889e25662ede066f8650b211069e533eac55 Mon Sep 17 00:00:00 2001 From: Sivaprasad Date: Sat, 16 Dec 2023 22:08:35 +0530 Subject: [PATCH 2/7] Minor module restructure --- .../{iterators/words.py => iterators.py} | 31 ++++++++++++++++++- bindings/python/iterators/__init__.py | 2 -- bindings/python/iterators/sentences.py | 24 -------------- setup.py | 2 +- slimt/CMakeLists.txt | 3 +- 5 files changed, 33 insertions(+), 29 deletions(-) rename bindings/python/{iterators/words.py => iterators.py} (54%) delete mode 100644 bindings/python/iterators/__init__.py delete mode 100644 bindings/python/iterators/sentences.py diff --git a/bindings/python/iterators/words.py b/bindings/python/iterators.py similarity index 54% rename from bindings/python/iterators/words.py rename to bindings/python/iterators.py index b3701781..6f68d48e 100644 --- a/bindings/python/iterators/words.py +++ b/bindings/python/iterators.py @@ -1,3 +1,9 @@ +def sentences(annotation): + return SentenceIterator(annotation) + +def words(annotation, sentence_id=None): + return WordIterator(annotation, sentence_id) + class WordIterator: def __init__(self, annotation, sentence_id=None): self._annotation = annotation @@ -35,4 +41,27 @@ def range(self): return self._annotation.word_as_range(self._sentence_id, self._word_id) def id(self): - return (self._sentence_id, self._word_id) \ No newline at end of file + return (self._sentence_id, self._word_id) + +class SentenceIterator: + def __init__(self, annotation): + self._annotation = annotation + self._sentence_id = -1 + + def __iter__(self): + self._sentence_id = -1 + return self + + def __next__(self): + self._sentence_id += 1 + if self._sentence_id >= self._annotation.sentence_count(): + raise StopIteration + return self + + def words(self): + return WordIterator(self._annotation, self._sentence_id) + + def __repr__(self): + range = self._annotation.sentence_as_range(self._sentence_id) + sentence = self._annotation.text[range.begin:range.end] + return f'{sentence}' \ No newline at end of file diff --git a/bindings/python/iterators/__init__.py b/bindings/python/iterators/__init__.py deleted file mode 100644 index 08fb471b..00000000 --- a/bindings/python/iterators/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .sentences import SentenceIterator as sentences -from .words import WordIterator as words \ No newline at end of file diff --git a/bindings/python/iterators/sentences.py b/bindings/python/iterators/sentences.py deleted file mode 100644 index 2c613caa..00000000 --- a/bindings/python/iterators/sentences.py +++ /dev/null @@ -1,24 +0,0 @@ -from .words import WordIterator - -class SentenceIterator: - def __init__(self, annotation): - self._annotation = annotation - self._sentence_id = -1 - - def __iter__(self): - self._sentence_id = -1 - return self - - def __next__(self): - self._sentence_id += 1 - if self._sentence_id >= self._annotation.sentence_count(): - raise StopIteration - return self - - def words(self): - return WordIterator(self._annotation, self._sentence_id) - - def __repr__(self): - range = self._annotation.sentence_as_range(self._sentence_id) - sentence = self._annotation.text[range.begin:range.end] - return f'{sentence}' diff --git a/setup.py b/setup.py index 522ac2bd..100a0d8e 100644 --- a/setup.py +++ b/setup.py @@ -211,7 +211,7 @@ def run(self): extras_require={"test": ["pytest>=6.0"]}, license_files=("LICENSE",), python_requires=">=3.6", - packages=["slimt", "slimt.tests", "slimt.iterators"], + packages=["slimt", "slimt.tests"], package_dir={"slimt": "bindings/python"}, install_requires=["pyyaml>=5.1", "appdirs"], entry_points={ diff --git a/slimt/CMakeLists.txt b/slimt/CMakeLists.txt index e838032a..8bf507d4 100644 --- a/slimt/CMakeLists.txt +++ b/slimt/CMakeLists.txt @@ -99,7 +99,8 @@ foreach(SLIMT_LIB IN LISTS SLIMT_LIBRARIES) PUBLIC "$" "$" - "$") + "$" + ) target_link_options(${SLIMT_LIB} PUBLIC ${SLIMT_LINK_OPTIONS}) target_compile_options(${SLIMT_LIB} PRIVATE ${SLIMT_COMPILE_OPTIONS}) From 988fc391d77328efad580bf25372478053857e9b Mon Sep 17 00:00:00 2001 From: Sivaprasad Date: Sat, 16 Dec 2023 22:41:31 +0530 Subject: [PATCH 3/7] Method name correction in test_encoding --- bindings/python/tests/test_encoding.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bindings/python/tests/test_encoding.py b/bindings/python/tests/test_encoding.py index e45cec97..a367cb66 100644 --- a/bindings/python/tests/test_encoding.py +++ b/bindings/python/tests/test_encoding.py @@ -46,4 +46,4 @@ def test_basic(service, models): byte_range = byte.word_as_range(sentence_idx, word_idx) utf8_to_byte_range = utf8_to_byte.word_as_range(sentence_idx, word_idx) assert byte_range.begin == utf8_to_byte_range.begin - assert byte_range.end == utf8_to_byte_range.end \ No newline at end of file + assert byte_range.end == utf8_to_byte_range.end From 0dd82dbe7cd9de96705e5e4a25d1bf23829e3c60 Mon Sep 17 00:00:00 2001 From: Sivaprasad Date: Sat, 16 Dec 2023 23:36:37 +0530 Subject: [PATCH 4/7] Rearrange the factory methods in iterators --- bindings/python/iterators.py | 14 +++++++------- bindings/python/tests/test_encoding.py | 2 +- bindings/python/tests/test_iterators.py | 3 ++- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/bindings/python/iterators.py b/bindings/python/iterators.py index 6f68d48e..679a7cd4 100644 --- a/bindings/python/iterators.py +++ b/bindings/python/iterators.py @@ -1,9 +1,3 @@ -def sentences(annotation): - return SentenceIterator(annotation) - -def words(annotation, sentence_id=None): - return WordIterator(annotation, sentence_id) - class WordIterator: def __init__(self, annotation, sentence_id=None): self._annotation = annotation @@ -64,4 +58,10 @@ def words(self): def __repr__(self): range = self._annotation.sentence_as_range(self._sentence_id) sentence = self._annotation.text[range.begin:range.end] - return f'{sentence}' \ No newline at end of file + return f'{sentence}' + +def sentences(annotation): + return SentenceIterator(annotation) + +def words(annotation, sentence_id=None): + return WordIterator(annotation, sentence_id) diff --git a/bindings/python/tests/test_encoding.py b/bindings/python/tests/test_encoding.py index a367cb66..f1606091 100644 --- a/bindings/python/tests/test_encoding.py +++ b/bindings/python/tests/test_encoding.py @@ -4,7 +4,7 @@ from collections import namedtuple -def test_basic(service, models): +def test_encoding(service, models): Pair = namedtuple("Pair", ["byte", "utf8"]) source = "no sé 😀 😃 😄 😁 😆 ⛄ 🤔" model = models[1] diff --git a/bindings/python/tests/test_iterators.py b/bindings/python/tests/test_iterators.py index 26559fb4..0f7c2c23 100644 --- a/bindings/python/tests/test_iterators.py +++ b/bindings/python/tests/test_iterators.py @@ -37,4 +37,5 @@ def test_iterators(service, models): # For Global Word Iterator assert expected_text_range.begin == reconstructed_text_range_glob.begin assert expected_text_range.end == reconstructed_text_range_glob.end - assert expected_word == reconstructed_word_glob \ No newline at end of file + assert expected_word == reconstructed_word_glob + \ No newline at end of file From cdf23d960c4bb98c91f1274f46e3dc4b6707e155 Mon Sep 17 00:00:00 2001 From: Sivaprasad Date: Sat, 16 Dec 2023 23:47:00 +0530 Subject: [PATCH 5/7] Fix line endings --- bindings/python/tests/test_iterators.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bindings/python/tests/test_iterators.py b/bindings/python/tests/test_iterators.py index 0f7c2c23..f97803a7 100644 --- a/bindings/python/tests/test_iterators.py +++ b/bindings/python/tests/test_iterators.py @@ -38,4 +38,3 @@ def test_iterators(service, models): assert expected_text_range.begin == reconstructed_text_range_glob.begin assert expected_text_range.end == reconstructed_text_range_glob.end assert expected_word == reconstructed_word_glob - \ No newline at end of file From 0ad475540cdbd9671c131484f1fc77ecfdbf10eb Mon Sep 17 00:00:00 2001 From: Sivaprasad Date: Sat, 16 Dec 2023 23:59:01 +0530 Subject: [PATCH 6/7] Add final line ending to .gitattributes --- .gitattributes | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitattributes b/.gitattributes index 58309376..ea04562a 100644 --- a/.gitattributes +++ b/.gitattributes @@ -24,4 +24,4 @@ # Executables *.exe binary *.out binary -*.app binary \ No newline at end of file +*.app binary From afcbf99ee3883c0a813671ecf7920bb077becb42 Mon Sep 17 00:00:00 2001 From: Sivaprasad Date: Mon, 18 Dec 2023 18:41:47 +0530 Subject: [PATCH 7/7] Clean up variable names for better readability --- bindings/python/tests/test_iterators.py | 40 ++++++++++++++----------- 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/bindings/python/tests/test_iterators.py b/bindings/python/tests/test_iterators.py index f97803a7..5e58448e 100644 --- a/bindings/python/tests/test_iterators.py +++ b/bindings/python/tests/test_iterators.py @@ -9,32 +9,36 @@ def test_iterators(service, models): target = response.target text = target.text - sen_iter_tgt = iterators.sentences(target) - word_iter_global = iterators.words(target) + sentences = iterators.sentences(target) + words = iterators.words(target) sentence_count = target.sentence_count() - for sentence_idx, word_iter in zip(range(sentence_count), sen_iter_tgt): + for sentence_idx, word_iter in zip(range(sentence_count), sentences): word_count = target.word_count(sentence_idx) for word_idx, word in zip(range(word_count), word_iter.words()): - expected_text_range = target.word_as_range(sentence_idx, word_idx) - reconstructed_text_range = word.range() + expected_range = target.word_as_range(sentence_idx, word_idx) + expected_word = text[expected_range.begin:expected_range.end] # For Sentence Iterator and Word Iterator - assert expected_text_range.begin == reconstructed_text_range.begin - assert expected_text_range.end == reconstructed_text_range.end + # Range + reconstructed = word.range() - expected_word = text[expected_text_range.begin:expected_text_range.end] - reconstructed_word = word.surface() + assert expected_range.begin == reconstructed.begin + assert expected_range.end == reconstructed.end - assert expected_word == reconstructed_word - - word_global = next(word_iter_global) - - reconstructed_text_range_glob = word_global.range() - reconstructed_word_glob = word_global.surface() + # Word + reconstructed = word.surface() + assert expected_word == reconstructed # For Global Word Iterator - assert expected_text_range.begin == reconstructed_text_range_glob.begin - assert expected_text_range.end == reconstructed_text_range_glob.end - assert expected_word == reconstructed_word_glob + word_global = next(words) + + # Range + reconstructed = word_global.range() + assert expected_range.begin == reconstructed.begin + assert expected_range.end == reconstructed.end + + # Word + reconstructed = word_global.surface() + assert expected_word == reconstructed