-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Provision iterator syntax sugar in python
This change adds a flexible and reusable iterator system for processing annotations in Python, along with associated tests. `.gitattributes`: Added a new file with various attributes for Git, specifying binary handling for certain file types. `bindings/python/iterators.py`: Added a new Python module (iterators.py) that defines iterators for processing annotations, sentences, and words. It includes the WordIterator and SentenceIterator classes and provides `sentences()` and `words()` as syntactic sugar. `bindings/python/tests/test_iterators.py`: Added a new test module (`test_iterators.py`) to test the functionality of the iterators in the `iterators.py` module. `slimt/CMakeLists.txt`: Modified the CMakeLists.txt file to include changes related to target link libraries and include directories. Pull request: #40
- Loading branch information
1 parent
8255316
commit 87c9105
Showing
5 changed files
with
145 additions
and
6 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
* text=auto | ||
|
||
# Compiled Object files | ||
*.slo binary | ||
*.lo binary | ||
*.o binary | ||
*.obj binary | ||
|
||
# Precompiled Headers | ||
*.gch binary | ||
*.pch binary | ||
|
||
# Compiled Dynamic libraries | ||
*.so binary | ||
*.dylib binary | ||
*.dll binary | ||
|
||
# Compiled Static libraries | ||
*.lai binary | ||
*.la binary | ||
*.a binary | ||
*.lib binary | ||
|
||
# Executables | ||
*.exe binary | ||
*.out binary | ||
*.app binary |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
class WordIterator:
    """Iterate over the words of an annotation, sentence by sentence.

    The iterator yields *itself* on every step; the word it currently
    points at is read through ``surface()``, ``range()`` and ``id()``.

    The annotation object is expected to provide ``sentence_count()``,
    ``word_count(sentence_id)``, ``word_as_range(sentence_id, word_id)``
    (returning an object with ``begin``/``end``) and a sliceable ``text``.

    Args:
        annotation: The annotation whose words are visited.
        sentence_id: When given, only the words of that sentence are
            visited; when ``None`` every sentence is visited in order.
    """

    def __init__(self, annotation, sentence_id=None):
        self._annotation = annotation

        if sentence_id is None:
            self._first_sentence_id = 0
            self._max_sentence_id = self._annotation.sentence_count()
        else:
            self._first_sentence_id = sentence_id
            self._max_sentence_id = sentence_id + 1

        self._sentence_id = self._first_sentence_id
        self._word_id = -1

    def __iter__(self):
        # Rewind completely (sentence *and* word) so the same object can be
        # looped over more than once.
        self._sentence_id = self._first_sentence_id
        self._word_id = -1
        return self

    def __next__(self):
        """Advance to the next word and return ``self``.

        Raises:
            StopIteration: When no further word exists in the visited
                sentence range. The cursor is left on the last word that
                was reached, so it never points past the end.
        """
        if self._annotation.sentence_count() == 0:
            raise StopIteration

        sentence_id = self._sentence_id
        word_id = self._word_id + 1
        # Skip over sentences that have no (remaining) words instead of
        # assuming the next sentence always has a word at index 0.
        while sentence_id < self._max_sentence_id:
            if word_id < self._annotation.word_count(sentence_id):
                self._sentence_id = sentence_id
                self._word_id = word_id
                return self
            sentence_id += 1
            word_id = 0

        raise StopIteration

    def surface(self):
        """Return the text of the current word."""
        span = self.range()
        return self._annotation.text[span.begin:span.end]

    def range(self):
        """Return the range object of the current word."""
        return self._annotation.word_as_range(self._sentence_id, self._word_id)

    def id(self):
        """Return the current position as ``(sentence_id, word_id)``."""
        return (self._sentence_id, self._word_id)
|
||
class SentenceIterator:
    """Iterate over the sentences of an annotation.

    The iterator yields *itself* on every step; ``repr()`` gives the text
    of the current sentence and ``words()`` iterates over its words.

    The annotation object is expected to provide ``sentence_count()``,
    ``sentence_as_range(sentence_id)`` (returning an object with
    ``begin``/``end``) and a sliceable ``text``.

    Args:
        annotation: The annotation whose sentences are visited.
    """

    def __init__(self, annotation):
        self._annotation = annotation
        self._sentence_id = -1  # -1 means "no sentence reached yet"

    def __iter__(self):
        # Rewind so the same object can be looped over more than once.
        self._sentence_id = -1
        return self

    def __next__(self):
        """Advance to the next sentence and return ``self``.

        Raises:
            StopIteration: When there is no further sentence. The cursor
                stays on the last sentence reached instead of moving past
                the end.
        """
        upcoming = self._sentence_id + 1
        if upcoming >= self._annotation.sentence_count():
            raise StopIteration
        self._sentence_id = upcoming
        return self

    def words(self):
        """Return an iterator over the words of the current sentence."""
        return WordIterator(self._annotation, self._sentence_id)

    def __repr__(self):
        # Before the first step there is no current sentence; do not query
        # the annotation with the -1 placeholder index.
        if self._sentence_id < 0:
            return f'<{type(self).__name__}: no current sentence>'
        span = self._annotation.sentence_as_range(self._sentence_id)
        sentence = self._annotation.text[span.begin:span.end]
        return f'{sentence}'
|
||
def sentences(annotation):
    """Return a SentenceIterator over the given annotation."""
    sentence_iterator = SentenceIterator(annotation)
    return sentence_iterator
|
||
def words(annotation, sentence_id=None):
    """Return a WordIterator over the given annotation.

    ``sentence_id`` is forwarded unchanged to WordIterator.
    """
    word_iterator = WordIterator(annotation, sentence_id)
    return word_iterator
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
# type: ignore | ||
from slimt import iterators | ||
|
||
def test_iterators(service, models):
    """Compare iterator output with index-based access on a translation.

    Each word is checked twice: once through the per-sentence word
    iterator and once through the global word iterator, which is advanced
    in lockstep.
    """
    source = "Hi, How are you? Its been a long time.\nCan you help me out with some things?"
    model = models[1]
    response = service.translate(model, [source], html=False)[0]

    target = response.target
    text = target.text

    def check(candidate, expected_range, expected_word):
        # Range first, then surface text, for the word under the cursor.
        actual_range = candidate.range()
        assert expected_range.begin == actual_range.begin
        assert expected_range.end == actual_range.end

        actual_word = candidate.surface()
        assert expected_word == actual_word

    sentence_iter = iterators.sentences(target)
    global_words = iterators.words(target)

    total_sentences = target.sentence_count()
    for s_idx, sentence in zip(range(total_sentences), sentence_iter):
        total_words = target.word_count(s_idx)
        for w_idx, local_word in zip(range(total_words), sentence.words()):
            expected_range = target.word_as_range(s_idx, w_idx)
            expected_word = text[expected_range.begin:expected_range.end]

            # Word reached through the sentence iterator.
            check(local_word, expected_range, expected_word)

            # Same word reached through the global word iterator.
            check(next(global_words), expected_range, expected_word)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters