From 87c910559acf42f9338ace02158a920e3f432817 Mon Sep 17 00:00:00 2001
From: Sivaprasad S <sivaprasad2000@outlook.com>
Date: Mon, 18 Dec 2023 19:22:17 +0530
Subject: [PATCH] Provision iterator syntax sugar in python

This change adds a flexible and reusable iterator system for processing
annotations in Python, along with associated tests.

`.gitattributes`:
Added a new file with various attributes for Git, specifying binary
handling for certain file types.

`bindings/python/iterators.py`:
Added a new Python module (iterators.py) that defines iterators for
processing annotations, sentences, and words.
    Includes WordIterator and SentenceIterator classes.
    Provides `sentences()` and `words()` for syntax sugar.

`bindings/python/tests/test_iterators.py`:
Added a new test module (`test_iterators.py`) to test the functionality
of the iterators in the `iterators.py` module.

`slimt/CMakeLists.txt`:
Quoted the generator expressions passed to target_link_libraries and
target_include_directories.

Pull request: https://github.com/jerinphilip/slimt/pull/40
---
 .gitattributes                          | 27 ++++++++++
 bindings/python/iterators.py            | 67 +++++++++++++++++++++++++
 bindings/python/tests/test_encoding.py  |  2 +-
 bindings/python/tests/test_iterators.py | 44 ++++++++++++++++
 slimt/CMakeLists.txt                    | 11 ++--
 5 files changed, 145 insertions(+), 6 deletions(-)
 create mode 100644 .gitattributes
 create mode 100644 bindings/python/iterators.py
 create mode 100644 bindings/python/tests/test_iterators.py

diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 00000000..ea04562a
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,27 @@
+* text=auto
+
+# Compiled Object files
+*.slo   binary
+*.lo    binary
+*.o     binary
+*.obj   binary
+
+# Precompiled Headers
+*.gch   binary
+*.pch   binary
+
+# Compiled Dynamic libraries
+*.so    binary
+*.dylib binary
+*.dll   binary
+
+# Compiled Static libraries
+*.lai   binary
+*.la    binary
+*.a     binary
+*.lib   binary
+
+# Executables
+*.exe   binary
+*.out   binary
+*.app   binary
diff --git a/bindings/python/iterators.py b/bindings/python/iterators.py
new file mode 100644
index 00000000..679a7cd4
--- /dev/null
+++ b/bindings/python/iterators.py
@@ -0,0 +1,67 @@
+class WordIterator:
+  """Cursor over the words of one sentence, or of every sentence when sentence_id is None."""
+  def __init__(self, annotation, sentence_id=None):
+    self._annotation = annotation
+    if sentence_id is None:
+      self._first_sentence_id = 0
+      self._max_sentence_id = self._annotation.sentence_count()
+    else:
+      self._first_sentence_id = sentence_id
+      self._max_sentence_id = sentence_id + 1
+    self._sentence_id = self._first_sentence_id
+    self._word_id = -1
+
+  def __iter__(self):
+    self._sentence_id = self._first_sentence_id  # rewind fully so the cursor can be re-iterated
+    self._word_id = -1
+    return self
+
+  def __next__(self):  # advances the cursor and returns self, not a fresh object
+    # Clamp the end so an empty annotation or a finished cursor never queries a bad sentence id.
+    end = min(self._max_sentence_id, self._annotation.sentence_count())
+    self._word_id += 1
+    while self._sentence_id < end:
+      if self._word_id < self._annotation.word_count(self._sentence_id):
+        return self
+      self._sentence_id += 1  # sentence exhausted (or empty): try the next one
+      self._word_id = 0
+    raise StopIteration
+
+  def surface(self):  # text of the current word
+    span = self.range()  # not named `range`, which would shadow the builtin
+    return self._annotation.text[span.begin:span.end]
+
+  def range(self):  # begin/end range of the current word within the annotation text
+    return self._annotation.word_as_range(self._sentence_id, self._word_id)
+
+  def id(self):  # (sentence_id, word_id) of the current word
+    return (self._sentence_id, self._word_id)
+
+class SentenceIterator:
+  """Cursor over the sentences of an annotation; iteration yields the cursor itself."""
+  def __init__(self, annotation):
+    self._annotation = annotation
+    self._sentence_id = -1  # positioned before the first sentence until next() is called
+
+  def __iter__(self):
+    self._sentence_id = -1
+    return self
+
+  def __next__(self):
+    self._sentence_id += 1
+    if self._sentence_id < self._annotation.sentence_count():
+      return self
+    raise StopIteration
+
+  def words(self):  # word cursor restricted to the current sentence
+    return WordIterator(self._annotation, self._sentence_id)
+
+  def __repr__(self):  # the current sentence's text
+    span = self._annotation.sentence_as_range(self._sentence_id)
+    return f'{self._annotation.text[span.begin:span.end]}'
+  
+def sentences(annotation):  # sugar: returns a SentenceIterator over `annotation`
+  return SentenceIterator(annotation)
+
+def words(annotation, sentence_id=None):  # sugar: WordIterator over one sentence, or all sentences when sentence_id is None
+  return WordIterator(annotation, sentence_id)
diff --git a/bindings/python/tests/test_encoding.py b/bindings/python/tests/test_encoding.py
index a367cb66..f1606091 100644
--- a/bindings/python/tests/test_encoding.py
+++ b/bindings/python/tests/test_encoding.py
@@ -4,7 +4,7 @@
 from collections import namedtuple
 
 
-def test_basic(service, models):
+def test_encoding(service, models):
     Pair = namedtuple("Pair", ["byte", "utf8"])
     source = "no sé 😀 😃 😄 😁 😆 ⛄ 🤔"
     model = models[1]
diff --git a/bindings/python/tests/test_iterators.py b/bindings/python/tests/test_iterators.py
new file mode 100644
index 00000000..5e58448e
--- /dev/null
+++ b/bindings/python/tests/test_iterators.py
@@ -0,0 +1,44 @@
+# type: ignore
+from slimt import iterators
+
+def test_iterators(service, models):
+    """Check both word iterators against the annotation's index-based accessors.
+
+    `service` and `models` are supplied by the caller (presumably pytest fixtures).
+    """
+    source = "Hi, How are you? Its been a long time.\nCan you help me out with some things?"
+    model = models[1]
+    response = service.translate(model, [source], html=False)[0]
+
+    target = response.target
+    text = target.text
+
+    sentence_cursor = iterators.sentences(target)
+    all_words = iterators.words(target)  # single cursor spanning every sentence
+
+    sentence_count = target.sentence_count()
+    for sentence_idx, sentence in zip(range(sentence_count), sentence_cursor):
+        word_count = target.word_count(sentence_idx)
+        for word_idx, word in zip(range(word_count), sentence.words()):
+            # Ground truth taken straight from the annotation by index.
+            expected_range = target.word_as_range(sentence_idx, word_idx)
+            expected_word = text[expected_range.begin:expected_range.end]
+
+            # Word reached through the sentence iterator.
+            scoped_range = word.range()
+            assert expected_range.begin == scoped_range.begin
+            assert expected_range.end == scoped_range.end
+
+            scoped_surface = word.surface()
+            assert expected_word == scoped_surface
+
+            # Same position reached through the global word iterator,
+            # which is advanced by hand in lockstep with the inner loop.
+            word_global = next(all_words)
+
+            global_range = word_global.range()
+            assert expected_range.begin == global_range.begin
+            assert expected_range.end == global_range.end
+
+            global_surface = word_global.surface()
+            assert expected_word == global_surface
diff --git a/slimt/CMakeLists.txt b/slimt/CMakeLists.txt
index 4711444a..8bf507d4 100644
--- a/slimt/CMakeLists.txt
+++ b/slimt/CMakeLists.txt
@@ -91,15 +91,16 @@ foreach(SLIMT_LIB IN LISTS SLIMT_LIBRARIES)
   target_link_libraries(
     ${SLIMT_LIB}
     PUBLIC ${SLIMT_PUBLIC_LIBS}
-    INTERFACE $<BUILD_INTERFACE:${SLIMT_INTERFACE_LIBS}>
-    PRIVATE $<BUILD_INTERFACE:${SLIMT_PRIVATE_LIBS}>)
+    INTERFACE "$<BUILD_INTERFACE:${SLIMT_INTERFACE_LIBS}>" # quoted so a ;-separated list stays inside one argument
+    PRIVATE "$<BUILD_INTERFACE:${SLIMT_PRIVATE_LIBS}>")
 
   target_include_directories(
     ${SLIMT_LIB}
     PUBLIC
-      $<BUILD_INTERFACE:${CMAKE_SOURCE_DIR}>
-      $<BUILD_INTERFACE:${CMAKE_BINARY_DIR}>
-      $<INSTALL_INTERFACE:${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR}>)
+      "$<BUILD_INTERFACE:${CMAKE_SOURCE_DIR}>"
+      "$<BUILD_INTERFACE:${CMAKE_BINARY_DIR}>"
+      "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>" # relative path: resolved against the install prefix, keeps the package relocatable
+  )
 
   target_link_options(${SLIMT_LIB} PUBLIC ${SLIMT_LINK_OPTIONS})
   target_compile_options(${SLIMT_LIB} PRIVATE ${SLIMT_COMPILE_OPTIONS})