Provide iterator syntactic sugar in Python

These changes add a flexible and reusable iterator system for processing annotations in Python, along with associated tests. `.gitattributes`: Added a new file with various Git attributes, specifying binary handling for certain file types. `bindings/python/iterators.py`: Added a new Python module that defines iterators for processing annotations, sentences, and words. It includes the `WordIterator` and `SentenceIterator` classes and provides `sentences()` and `words()` as syntactic sugar. `bindings/python/tests/test_iterators.py`: Added a new test module to test the functionality of the iterators in the `iterators.py` module. `slimt/CMakeLists.txt`: Modified the target link libraries and include directories settings, quoting the generator expressions. Pull request: #40
jerinphilip · Dec 18, 2023 · 87c9105 · 87c9105
1 parent 8255316
commit 87c9105
Show file tree

Hide file tree

Showing 5 changed files with 145 additions and 6 deletions.
diff --git a/.gitattributes b/.gitattributes
@@ -0,0 +1,27 @@
+* text=auto
+
+# Compiled Object files
+*.slo   binary
+*.lo    binary
+*.o     binary
+*.obj   binary
+
+# Precompiled Headers
+*.gch   binary
+*.pch   binary
+
+# Compiled Dynamic libraries
+*.so    binary
+*.dylib binary
+*.dll   binary
+
+# Compiled Static libraries
+*.lai   binary
+*.la    binary
+*.a     binary
+*.lib   binary
+
+# Executables
+*.exe   binary
+*.out   binary
+*.app   binary
diff --git a/bindings/python/iterators.py b/bindings/python/iterators.py
@@ -0,0 +1,67 @@
+class WordIterator:
+  def __init__(self, annotation, sentence_id=None):
+    self._annotation = annotation
+
+    if sentence_id == None:
+      self._sentence_id = 0
+      self._max_sentence_id = self._annotation.sentence_count()
+    else:
+      self._sentence_id = sentence_id
+      self._max_sentence_id = sentence_id + 1
+
+    self._word_id = -1
+
+  def __iter__(self):
+    self._word_id = -1
+    return self
+
+  def __next__(self):
+    if self._annotation.sentence_count() == 0:
+      raise StopIteration
+
+    self._word_id += 1
+    if self._word_id >= self._annotation.word_count(self._sentence_id):
+      self._sentence_id += 1
+      if self._sentence_id >= self._max_sentence_id:
+        raise StopIteration
+      self._word_id = 0
+    return self
+
+  def surface(self):
+    range = self.range()
+    return self._annotation.text[range.begin:range.end]
+
+  def range(self):
+    return self._annotation.word_as_range(self._sentence_id, self._word_id)
+
+  def id(self):
+    return (self._sentence_id, self._word_id)
+
+class SentenceIterator:
+  def __init__(self, annotation):
+    self._annotation = annotation
+    self._sentence_id = -1
+
+  def __iter__(self):
+    self._sentence_id = -1
+    return self
+
+  def __next__(self):
+    self._sentence_id += 1
+    if self._sentence_id >= self._annotation.sentence_count():
+      raise StopIteration
+    return self
+
+  def words(self):
+    return WordIterator(self._annotation, self._sentence_id)
+
+  def __repr__(self):
+    range = self._annotation.sentence_as_range(self._sentence_id)
+    sentence = self._annotation.text[range.begin:range.end]
+    return f'{sentence}'
+
+def sentences(annotation):
+  return SentenceIterator(annotation)
+
+def words(annotation, sentence_id=None):
+  return WordIterator(annotation, sentence_id)
diff --git a/bindings/python/tests/test_encoding.py b/bindings/python/tests/test_encoding.py
@@ -4,7 +4,7 @@
 from collections import namedtuple
 
 
-def test_basic(service, models):
+def test_encoding(service, models):
     Pair = namedtuple("Pair", ["byte", "utf8"])
     source = "no sé 😀 😃 😄 😁 😆 ⛄ 🤔"
     model = models[1]

diff --git a/bindings/python/tests/test_iterators.py b/bindings/python/tests/test_iterators.py
@@ -0,0 +1,44 @@
+# type: ignore
+from slimt import iterators
+
+def test_iterators(service, models):
+    source = "Hi, How are you? Its been a long time.\nCan you help me out with some things?"
+    model = models[1]
+    response = service.translate(model, [source], html=False)[0]
+
+    target = response.target
+    text = target.text
+
+    sentences = iterators.sentences(target)
+    words = iterators.words(target)
+
+    sentence_count = target.sentence_count()
+    for sentence_idx, word_iter in zip(range(sentence_count), sentences):
+        word_count = target.word_count(sentence_idx)
+        for word_idx, word in zip(range(word_count), word_iter.words()):
+
+            expected_range = target.word_as_range(sentence_idx, word_idx)  
+            expected_word = text[expected_range.begin:expected_range.end]
+
+            # For Sentence Iterator and Word Iterator
+            # Range
+            reconstructed = word.range()
+
+            assert expected_range.begin == reconstructed.begin
+            assert expected_range.end == reconstructed.end
+
+            # Word
+            reconstructed = word.surface()
+            assert expected_word == reconstructed
+
+            # For Global Word Iterator
+            word_global = next(words)
+
+            # Range
+            reconstructed = word_global.range()
+            assert expected_range.begin == reconstructed.begin
+            assert expected_range.end == reconstructed.end
+
+            # Word
+            reconstructed = word_global.surface()
+            assert expected_word == reconstructed
diff --git a/slimt/CMakeLists.txt b/slimt/CMakeLists.txt
@@ -91,15 +91,16 @@ foreach(SLIMT_LIB IN LISTS SLIMT_LIBRARIES)
   target_link_libraries(
     ${SLIMT_LIB}
     PUBLIC ${SLIMT_PUBLIC_LIBS}
-    INTERFACE $<BUILD_INTERFACE:${SLIMT_INTERFACE_LIBS}>
-    PRIVATE $<BUILD_INTERFACE:${SLIMT_PRIVATE_LIBS}>)
+    INTERFACE "$<BUILD_INTERFACE:${SLIMT_INTERFACE_LIBS}>"
+    PRIVATE "$<BUILD_INTERFACE:${SLIMT_PRIVATE_LIBS}>")
 
   target_include_directories(
     ${SLIMT_LIB}
     PUBLIC
-      $<BUILD_INTERFACE:${CMAKE_SOURCE_DIR}>
-      $<BUILD_INTERFACE:${CMAKE_BINARY_DIR}>
-      $<INSTALL_INTERFACE:${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR}>)
+      "$<BUILD_INTERFACE:${CMAKE_SOURCE_DIR}>"
+      "$<BUILD_INTERFACE:${CMAKE_BINARY_DIR}>"
+      "$<INSTALL_INTERFACE:${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR}>"
+  )
 
   target_link_options(${SLIMT_LIB} PUBLIC ${SLIMT_LINK_OPTIONS})
   target_compile_options(${SLIMT_LIB} PRIVATE ${SLIMT_COMPILE_OPTIONS})