diff --git a/docs/package.md b/docs/package.md
index 90a51257..b4bf2c12 100644
--- a/docs/package.md
+++ b/docs/package.md
@@ -32,10 +32,10 @@ Basic usage for the language-aware tokenizer:
 from g2p import make_tokenizer
 tokenizer = make_tokenizer("dan")
 for token in tokenizer.tokenize_text("Åh, hvordan har du det, Åbenrå?"):
-    if token["is_word"]:
-        word = token["text"]
+    if token.is_word:
+        word = token.text
     else:
-        interword_punctuation_and_spaces = token["text"]
+        interword_punctuation_and_spaces = token.text
 ```
 
 Note that selecting the tokenizer language is important to make sure punctuation-like letters are handled correctly. For example `:` and `'` are punctuation in English but they will be part of the word tokens in Kanien'kéha (moh):
diff --git a/g2p/__init__.py b/g2p/__init__.py
index d4513d50..e60a2e13 100644
--- a/g2p/__init__.py
+++ b/g2p/__init__.py
@@ -16,10 +16,10 @@
     from g2p import make_tokenizer
     tokenizer = make_tokenizer(lang)
     for token in tokenizer.tokenize_text(input_text):
-        if token["is_word"]:
-            word = token["text"]
+        if token.is_word:
+            word = token.text
         else:
-            interword_punctuation_and_spaces = token["text"]
+            interword_punctuation_and_spaces = token.text
 
     from g2p import get_arpabet_langs
     LANGS, LANG_NAMES = get_arpabet_langs()
@@ -29,7 +29,7 @@
 from typing import Dict, Optional, Tuple, Union
 
 from g2p.exceptions import InvalidLanguageCode, NoPath
-from g2p.shared_types import BaseTokenizer, BaseTransducer
+from g2p.shared_types import BaseTokenizer, BaseTransducer, Token
 
 if sys.version_info < (3, 7):  # pragma: no cover
     sys.exit(
@@ -47,7 +47,7 @@ def make_g2p(  # noqa: C901
     *,
     tokenize: bool = True,
     custom_tokenizer: Optional[BaseTokenizer] = None,
-):
+) -> BaseTransducer:
     """Make a g2p Transducer for mapping text from in_lang to out_lang via the
     shortest path between them.
 
@@ -132,13 +132,13 @@ def make_g2p(  # noqa: C901
     return transducer
 
 
-def tokenize_and_map(tokenizer, transducer, input: str):
+def tokenize_and_map(tokenizer: BaseTokenizer, transducer: BaseTransducer, input: str):
     result = ""
     for token in tokenizer.tokenize_text(input):
-        if token["is_word"]:
-            result += transducer(token["text"]).output_string
+        if token.is_word:
+            result += transducer(token.text).output_string
         else:
-            result += token["text"]
+            result += token.text
     return result
 
 
@@ -213,7 +213,7 @@ def get_arpabet_langs():
     return _langs_cache, _lang_names_cache
 
 
-def make_tokenizer(in_lang=None, out_lang=None, tok_path=None):
+def make_tokenizer(in_lang=None, out_lang=None, tok_path=None) -> BaseTokenizer:
     """Make the tokenizer for input in language in_lang
 
     Logic used when only in_lang is provided:
@@ -234,3 +234,18 @@ def make_tokenizer(in_lang=None, out_lang=None, tok_path=None):
     from g2p.mappings.tokenizer import make_tokenizer as _make_tokenizer
 
     return _make_tokenizer(in_lang, out_lang, tok_path)
+
+
+# Declare what's actually part of g2p's programmatic API.
+# Please don't import anything else from g2p directly.
+__all__ = [
+    "BaseTokenizer",
+    "BaseTransducer",
+    "InvalidLanguageCode",
+    "NoPath",
+    "Token",
+    "get_arpabet_langs",
+    "make_g2p",
+    "make_tokenizer",
+    "tokenize_and_map",
+]
diff --git a/g2p/api_v2.py b/g2p/api_v2.py
index 9ed4551e..680a7f8e 100644
--- a/g2p/api_v2.py
+++ b/g2p/api_v2.py
@@ -300,7 +300,7 @@ def convert_one_writing_or_phonetic_system_to_another(  # noqa: C901
             tokenizer = g2p.make_tokenizer(in_lang)
             tokens = tokenizer.tokenize_text(request.text)
         else:
-            tokens = [{"text": request.text, "is_word": True}]
+            tokens = [g2p.Token(request.text, is_word=True)]
     except NoPath:
         raise HTTPException(
             status_code=400, detail=f"No path from {in_lang} to {out_lang}"
@@ -314,8 +314,8 @@ def convert_one_writing_or_phonetic_system_to_another(  # noqa: C901
     segments: List[Segment] = []
     for token in tokens:
         conversions: List[Conversion] = []
-        if not token["is_word"]:  # non-word, has no in_lang/out_lang
-            tg = TransductionGraph(token["text"])
+        if not token.is_word:  # non-word, has no in_lang/out_lang
+            tg = TransductionGraph(token.text)
             conv = Conversion(substring_alignments=tg.substring_alignments())
             if request.indices:
                 conv.alignments = tg.alignments()
@@ -323,7 +323,7 @@ def convert_one_writing_or_phonetic_system_to_another(  # noqa: C901
                 conv.output_nodes = list(tg.output_string)
             conversions.append(conv)
         else:
-            tg = transducer(token["text"])
+            tg = transducer(token.text)
             if request.compose_from:
                 composed_tiers: List[TransductionGraph] = []
                 for tr, tier in zip(transducer.transducers, tg.tiers):
diff --git a/g2p/mappings/tokenizer.py b/g2p/mappings/tokenizer.py
index d799241b..12b6dd2f 100644
--- a/g2p/mappings/tokenizer.py
+++ b/g2p/mappings/tokenizer.py
@@ -11,10 +11,10 @@
 
 from g2p.exceptions import MappingMissing
 from g2p.log import LOGGER
-from g2p.mappings import Mapping
+from g2p.mappings import Mapping, utils
 from g2p.mappings.langs import LANGS_NETWORK
-from g2p.mappings.utils import get_unicode_category, is_ipa, merge_if_same_label
-from g2p.shared_types import BaseTokenizer
+from g2p.mappings.utils import is_ipa
+from g2p.shared_types import BaseTokenizer, Token
 
 
 class Tokenizer(BaseTokenizer):
@@ -42,23 +42,18 @@ def is_word_character(self, c):
         if self.delim and c == self.delim:
             return True
         assert len(c) <= 1
-        if get_unicode_category(c) in ["letter", "number", "diacritic"]:
+        if utils.get_unicode_category(c) in ["letter", "number", "diacritic"]:
             return True
         return False
 
-    def tokenize_text(self, text):
+    def tokenize_text(self, text: str) -> List[Token]:
         matches = self.tokenize_aux(text)
-        units = [{"text": m, "is_word": self.is_word_character(m)} for m in matches]
+        units = [Token(m, self.is_word_character(m)) for m in matches]
         if self.dot_is_letter:
             for i, unit in enumerate(units):
-                if (
-                    unit["text"] == "."
-                    and i + 1 < len(units)
-                    and units[i + 1]["is_word"]
-                ):
-                    unit["is_word"] = True
-        units = merge_if_same_label(units, "text", "is_word")
-        return units
+                if unit.text == "." and i + 1 < len(units) and units[i + 1].is_word:
+                    unit.is_word = True
+        return utils.merge_same_type_tokens(units)
 
 
 class SpecializedTokenizer(Tokenizer):
@@ -98,6 +93,51 @@ def tokenize_aux(self, text):
         return self.regex.findall(text)
 
 
+class LexiconTokenizer(Tokenizer):
+    """Lexicon-based tokenizer will consider any entry in the lexicon a token,
+    even if it contains punctuation characters. For text not in the lexicon,
+    falls back to the default tokenization.
+    """
+
+    def __init__(self, mapping: Mapping):
+        super().__init__()
+        self.mapping = mapping
+        self.lang = mapping.language_name
+
+    def _recursive_helper(self, tokens: list, output_tokens: list):
+        """Emit the longest prefix found in the lexicon, if any, as a token.
+        If None, emit the first unit as a token.
+        Recursively process the rest of the units.
+        """
+        if not tokens:
+            return
+        if len(tokens) == 1:
+            output_tokens.append(tokens[0])
+            return
+        for i in range(len(tokens), 0, -1):
+            candidate = "".join([u.text for u in tokens[:i]])
+            if utils.find_alignment(self.mapping.alignments, candidate.lower()):
+                output_tokens.append(Token(candidate, True))
+                return self._recursive_helper(tokens[i:], output_tokens)
+        # No prefix found, emit the first unit as a token
+        output_tokens.append(tokens[0])
+        self._recursive_helper(tokens[1:], output_tokens)
+
+    def tokenize_text(self, text: str) -> List[Token]:
+        blocks = re.split(r"(\s+)", text)
+        output_tokens = []
+        for i, block in enumerate(blocks):
+            if i % 2 == 1 and block:
+                output_tokens.append(Token(block, False))
+            else:
+                default_tokens = super().tokenize_text(block)
+                # Split non-word tokens into smaller parts for lexicon lookup
+                candidate_tokens = utils.split_non_word_tokens(default_tokens)
+                self._recursive_helper(candidate_tokens, output_tokens)
+
+        return utils.merge_non_word_tokens(output_tokens)
+
+
 class MultiHopTokenizer(SpecializedTokenizer):
     def __init__(self, mappings: List[Mapping]):
         self.delim = ""
@@ -202,7 +242,10 @@ def make_tokenizer(  # noqa C901
             # Build a one-hop tokenizer
             try:
                 mapping = Mapping.find_mapping(in_lang=in_lang, out_lang=out_lang)
-                self.tokenizers[tokenizer_key] = SpecializedTokenizer(mapping)
+                if mapping.type == utils.MAPPING_TYPE.lexicon:
+                    self.tokenizers[tokenizer_key] = LexiconTokenizer(mapping)
+                else:
+                    self.tokenizers[tokenizer_key] = SpecializedTokenizer(mapping)
             except MappingMissing:
                 self.tokenizers[tokenizer_key] = self.tokenizers[None]
                 LOGGER.warning(
diff --git a/g2p/mappings/utils.py b/g2p/mappings/utils.py
index fcd3e294..7b248dc1 100644
--- a/g2p/mappings/utils.py
+++ b/g2p/mappings/utils.py
@@ -10,7 +10,6 @@
 import unicodedata as ud
 from bisect import bisect_left
 from collections import defaultdict
-from copy import deepcopy
 from enum import Enum
 from pathlib import Path
 from typing import (
@@ -43,6 +42,7 @@
 from g2p import exceptions
 from g2p.log import LOGGER
 from g2p.mappings import langs
+from g2p.shared_types import Token
 
 GEN_DIR = os.path.join(os.path.dirname(langs.__file__), "generated")
 GEN_CONFIG = os.path.join(GEN_DIR, "config-g2p.yaml")
@@ -151,7 +151,7 @@ def normalize(inp: str, norm_form: Union[str, None]):
     if norm_form is None or norm_form == "none":
         return unicode_escape(inp)
     if norm_form not in ["NFC", "NFD", "NFKC", "NFKD"]:
-        raise exceptions.InvalidNormalization(normalize)
+        raise exceptions.InvalidNormalization(norm_form)
     # Sadly mypy doesn't do narrowing to literals properly
     norm_form = cast(Literal["NFC", "NFD", "NFKC", "NFKD"], norm_form)
     normalized = ud.normalize(norm_form, unicode_escape(inp))
@@ -178,8 +178,8 @@ def compose_indices(
     """Compose indices1 + indices2 into direct arcs from the inputs of indices1
     to the outputs of indices 2.
 
-    E.g., [(0,1), (1,4)] composed with [(0,0), (1,2), (1,3), (4,2)] is
-    [(0,2), (0,3), (1,2)]
+    >>> compose_indices([(0,1), (1,4)], [(0,0), (1,2), (1,3), (4,2)])
+    [(0, 2), (0, 3), (1, 2)]
     """
     # for O(1) lookup of arcs leaving indices2
     indices2_as_dict = defaultdict(dict)  # type: ignore
@@ -239,7 +239,7 @@ def normalize_with_indices(
         return normalize_to_NFD_with_indices(inp, norm_form)
     if norm_form in ("none", None):
         return inp, [(i, i) for i in range(len(inp))]
-    raise exceptions.InvalidNormalization(normalize)
+    raise exceptions.InvalidNormalization(norm_form)
 
 
 def unicode_escape(text):
@@ -596,22 +596,76 @@ def ignore_aliases(self, *_args):
         return True
 
 
-def merge_if_same_label(lst_of_dicts, text_key, label_key):
-    results = []
-    current_item = None
-    for dct in lst_of_dicts:
-        if label_key not in dct:
-            dct[label_key] = None
-        if not current_item:
-            current_item = deepcopy(dct)
-        elif dct[label_key] == current_item[label_key]:
-            current_item[text_key] += dct[text_key]
+def merge_same_type_tokens(tokens: List[Token]) -> List[Token]:
+    """Merge tokens that have the same type. Destroys tokens in the process.
+
+    >>> merge_same_type_tokens([Token("test", True), Token("b", True), Token(":", False), Token(",", False)])
+    [Token(text='testb', is_word=True), Token(text=':,', is_word=False)]
+    >>> merge_same_type_tokens([])
+    []
+    """
+    if not tokens:
+        return []
+    merged_tokens = [tokens[0]]
+    for token in tokens[1:]:
+        if token.is_word == merged_tokens[-1].is_word:
+            merged_tokens[-1].text += token.text
+        else:
+            merged_tokens.append(token)
+    return merged_tokens
+
+
+def split_non_word_tokens(tokens: List[Token]) -> List[Token]:
+    """Split non-word units into characters. Reuses the word tokens.
+
+    Generates a maximum of 5 units per non-word token: if the input token is
+    more than 5 non-word characters, the output will be the first two
+    individually, the middle as a block, and the last two individually, because
+    lexicon-based tokenization does not need more granularity than that.
+    This prevents degenerate input like a large number of consecutive punctuation
+    marks from taking quadratic time in lexicon-based tokenization.
+
+    >>> split_non_word_tokens([Token("test", True), Token(":,- ", False), Token("", False)])
+    [Token(text='test', is_word=True), Token(text=':', is_word=False), Token(text=',', is_word=False), Token(text='-', is_word=False), Token(text=' ', is_word=False)]
+    >>> split_non_word_tokens([])
+    []
+    >>> split_non_word_tokens([Token(".,.,.,.", False)])
+    [Token(text='.', is_word=False), Token(text=',', is_word=False), Token(text='.,.', is_word=False), Token(text=',', is_word=False), Token(text='.', is_word=False)]
+    """
+    new_tokens = []
+    for token in tokens:
+        if not token.is_word:
+            text = token.text
+            if len(text) > 5:
+                new_tokens.append(Token(text[0], False))
+                new_tokens.append(Token(text[1], False))
+                new_tokens.append(Token(text[2:-2], False))
+                new_tokens.append(Token(text[-2], False))
+                new_tokens.append(Token(text[-1], False))
+            else:
+                new_tokens.extend([Token(char, False) for char in text])
+        else:
+            new_tokens.append(token)
+    return new_tokens
+
+
+def merge_non_word_tokens(tokens: List[Token]) -> List[Token]:
+    """Merge consecutive non-word units into a single token. Destroys tokens in the process.
+
+    >>> merge_non_word_tokens([Token("test", True), Token(":", False), Token(",", False)])
+    [Token(text='test', is_word=True), Token(text=':,', is_word=False)]
+    >>> merge_non_word_tokens([])
+    []
+    """
+    if not tokens:
+        return tokens
+    merged_tokens = [tokens[0]]
+    for token in tokens[1:]:
+        if not token.is_word and not merged_tokens[-1].is_word:
+            merged_tokens[-1].text += token.text
         else:
-            results.append(current_item)
-            current_item = deepcopy(dct)
-    if current_item:
-        results.append(current_item)
-    return results
+            merged_tokens.append(token)
+    return merged_tokens
 
 
 CATEGORIES = {
diff --git a/g2p/shared_types.py b/g2p/shared_types.py
index 2472d0fb..1296c35f 100644
--- a/g2p/shared_types.py
+++ b/g2p/shared_types.py
@@ -4,6 +4,43 @@
 """
 
 from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import List
+
+from typing_extensions import deprecated
+
+
+@dataclass
+class Token:
+    """A token from the g2p tokenizer."""
+
+    text: str
+    is_word: bool
+
+    @deprecated(
+        "Accessing g2p Token objects as dicts is deprecated since g2p 2.2.0. "
+        "Please use the 'text' and 'is_word' attributes instead.",
+    )
+    def __getitem__(self, key):
+        """For backward compatibility only, allow access as if it were a dict."""
+        if key == "text":
+            return self.text
+        if key == "is_word":
+            return self.is_word
+        raise KeyError(key)
+
+    @deprecated(
+        "Accessing g2p Token objects as dicts is deprecated since g2p 2.2.0. "
+        "Please use the 'text' and 'is_word' attributes instead.",
+    )
+    def __setitem__(self, key, value):
+        """For backward compatibility only, allow setting values as if it were a dict."""
+        if key == "text":
+            self.text = value
+        elif key == "is_word":
+            self.is_word = value
+        else:
+            raise KeyError(key)
 
 
 class BaseTransducer(ABC):
@@ -13,6 +50,21 @@ class BaseTransducer(ABC):
     def __call__(self, to_convert: str):
         """Transduce to_convert."""
 
+    @property
+    @abstractmethod
+    def transducers(self):
+        """A list of BaseTransducer objects for each tier in the transducer."""
+
+    @property
+    @abstractmethod
+    def in_lang(self) -> str:
+        """The input language code of the transducer."""
+
+    @property
+    @abstractmethod
+    def out_lang(self) -> str:
+        """The output language code of the transducer."""
+
 
 class BaseTransductionGraph(ABC):
@@ -27,5 +79,5 @@ class BaseTokenizer(ABC):
     """Base class to typecheck tokenizers without having to import them."""
 
     @abstractmethod
-    def tokenize_text(self, text):
+    def tokenize_text(self, text: str) -> List[Token]:
         """Tokenize text."""
diff --git a/g2p/tests/test_tokenizer.py b/g2p/tests/test_tokenizer.py
index d51f764c..b15b8611 100755
--- a/g2p/tests/test_tokenizer.py
+++ b/g2p/tests/test_tokenizer.py
@@ -17,32 +17,50 @@ def test_tokenize_fra(self):
         tokenizer = tok.make_tokenizer("fra")
         tokens = tokenizer.tokenize_text(input)
         self.assertEqual(len(tokens), 8)
-        self.assertTrue(tokens[0]["is_word"])
-        self.assertEqual(tokens[0]["text"], "ceci")
-        self.assertFalse(tokens[1]["is_word"])
-        self.assertEqual(tokens[1]["text"], " ")
-        self.assertTrue(tokens[2]["is_word"])
-        self.assertEqual(tokens[2]["text"], "était")
-        self.assertFalse(tokens[3]["is_word"])
-        self.assertEqual(tokens[3]["text"], " '")
-        self.assertTrue(tokens[4]["is_word"])
-        self.assertEqual(tokens[4]["text"], "un")
-        self.assertFalse(tokens[5]["is_word"])
-        self.assertEqual(tokens[5]["text"], "' ")
-        self.assertTrue(tokens[6]["is_word"])
-        self.assertEqual(tokens[6]["text"], "test")
-        self.assertFalse(tokens[7]["is_word"])
-        self.assertEqual(tokens[7]["text"], ".")
+        self.assertTrue(tokens[0].is_word)
+        self.assertEqual(tokens[0].text, "ceci")
+        self.assertFalse(tokens[1].is_word)
+        self.assertEqual(tokens[1].text, " ")
+        self.assertTrue(tokens[2].is_word)
+        self.assertEqual(tokens[2].text, "était")
+        self.assertFalse(tokens[3].is_word)
+        self.assertEqual(tokens[3].text, " '")
+        self.assertTrue(tokens[4].is_word)
+        self.assertEqual(tokens[4].text, "un")
+        self.assertFalse(tokens[5].is_word)
+        self.assertEqual(tokens[5].text, "' ")
+        self.assertTrue(tokens[6].is_word)
+        self.assertEqual(tokens[6].text, "test")
+        self.assertFalse(tokens[7].is_word)
+        self.assertEqual(tokens[7].text, ".")
 
     def test_tokenize_eng(self):
         input = "This is éçà test."
         tokenizer = tok.make_tokenizer("eng")
         tokens = tokenizer.tokenize_text(input)
         self.assertEqual(len(tokens), 8)
-        self.assertTrue(tokens[0]["is_word"])
-        self.assertEqual(tokens[0]["text"], "This")
-        self.assertFalse(tokens[1]["is_word"])
-        self.assertEqual(tokens[1]["text"], " ")
+        self.assertTrue(tokens[0].is_word)
+        self.assertEqual(tokens[0].text, "This")
+        self.assertFalse(tokens[1].is_word)
+        self.assertEqual(tokens[1].text, " ")
+
+    def test_lexicon_tokenizer(self):
+        tokenizer = tok.make_tokenizer("eng")
+        tests = [
+            ("It's", ["It's"]),
+            ("'cause", ["'cause"]),
+            ('"\'cause"', ['"', "'cause", '"']),
+            ("aardvark's", ["aardvark", "'s"]),
+            ("'aardvark's'", ["'", "aardvark", "'s", "'"]),
+            ("ten a.m.", ["ten", " ", "a.m."]),
+            ('ten "a.m.,!"', ["ten", ' "', "a.m.", ',!"']),
+            ("all-out war", ["all-out", " ", "war"]),  # all-out is in the lexicon
+            ("all-in: nonsense", ["all", "-", "in", ": ", "nonsense"]),  # all-in is not
+        ]
+        for input_text, expected_tokens in tests:
+            with self.subTest(input_text=input_text):
+                tokens = tokenizer.tokenize_text(input_text)
+                self.assertEqual([x.text for x in tokens], expected_tokens)
 
     def test_tokenize_win(self):
         """win is easy to tokenize because win -> win-ipa exists and has ' in its inventory"""
@@ -52,8 +70,8 @@ def test_tokenize_win(self):
         tokenizer = tok.make_tokenizer("win")
         tokens = tokenizer.tokenize_text(input)
         self.assertEqual(len(tokens), 1)
-        self.assertTrue(tokens[0]["is_word"])
-        self.assertEqual(tokens[0]["text"], "p'ōį̄ą")
+        self.assertTrue(tokens[0].is_word)
+        self.assertEqual(tokens[0].text, "p'ōį̄ą")
 
     def test_tokenize_tce(self):
         """tce is hard to tokenize correctly because we have tce -> tce-equiv -> tce-ipa, and ' is
@@ -71,14 +89,14 @@ def test_tokenize_tce(self):
         tokenizer = tok.make_tokenizer("tce")
         tokens = tokenizer.tokenize_text(input)
         self.assertEqual(len(tokens), 1)
-        self.assertTrue(tokens[0]["is_word"])
-        self.assertEqual(tokens[0]["text"], "ts'nj")
+        self.assertTrue(tokens[0].is_word)
+        self.assertEqual(tokens[0].text, "ts'nj")
 
     def test_tokenize_tce_equiv(self):
         input = "ts'e ts`e ts‘e ts’"
         self.assertEqual(len(tok.make_tokenizer("fra").tokenize_text(input)), 14)
         # tce_tokens = tok.make_tokenizer("tce").tokenize_text(input)
-        # LOGGER.warning([x["text"] for x in tce_tokens])
+        # LOGGER.warning([x.text for x in tce_tokens])
         self.assertEqual(len(tok.make_tokenizer("tce").tokenize_text(input)), 7)
 
     def test_tokenizer_identity_tce(self):
diff --git a/g2p/tests/test_utils.py b/g2p/tests/test_utils.py
index 5d9b003a..44c8ce2d 100755
--- a/g2p/tests/test_utils.py
+++ b/g2p/tests/test_utils.py
@@ -4,9 +4,11 @@
 """
 
 import doctest
+import io
 import os
 import re
 from collections import defaultdict
+from contextlib import redirect_stderr
 from pathlib import Path
 from unittest import TestCase, main
 
@@ -14,9 +16,9 @@
 from pep440 import is_canonical
 
 import g2p
+import g2p.exceptions
 from g2p import get_arpabet_langs
 from g2p._version import VERSION, version_tuple
-from g2p.exceptions import IncorrectFileType, RecursionError
 from g2p.log import LOGGER
 from g2p.mappings import Mapping, utils
 from g2p.mappings.utils import RULE_ORDERING_ENUM, Rule
@@ -60,7 +62,7 @@ def test_abb_expand(self):
         )  # shouldn't allow self-referential abbreviations
         expanded_plain = utils.expand_abbreviations("test", test_dict)
         expanded_bad_plain = utils.expand_abbreviations("test", bad_dict)
-        with self.assertRaises(RecursionError):
+        with self.assertRaises(g2p.exceptions.RecursionError):
             utils.expand_abbreviations("HIGH_VOWELS", bad_dict)
         expanded_non_recursive = utils.expand_abbreviations("HIGH_VOWELS", test_dict)
         expanded_recursive = utils.expand_abbreviations("VOWELS", test_dict)
@@ -156,7 +158,7 @@ def test_escape_special(self):
         )
 
     def test_load_abbs(self):
-        with self.assertRaises(IncorrectFileType):
+        with self.assertRaises(g2p.exceptions.IncorrectFileType):
             utils.load_abbreviations_from_file(
                 os.path.join(PUBLIC_DIR, "mappings", "abbreviations.json")
             )
@@ -212,6 +214,10 @@ def test_generated_mapping(self):
             test_config_added.display_name, "test custom to test-out custom"
         )
 
+    def test_bad_normalization(self):
+        with self.assertRaises(g2p.exceptions.InvalidNormalization):
+            utils.normalize_with_indices("test", "bad")
+
     def test_normalize_to_NFD_with_indices(self):
         # Usefull site to get combining character code points:
         # http://www.alanwood.net/unicode/combining_diacritical_marks.html
@@ -323,6 +329,34 @@ def test_scm_pretend_version_is_up_to_date(self):
             # This is fine, it's only used in development
             pass
 
+    def test_token_class(self):
+        from g2p.shared_types import Token
+
+        t1 = Token("test", True)
+        t2 = Token(":", False)
+
+        f = io.StringIO()
+        with redirect_stderr(f):
+            # Current usage and deprecated usage
+            for t in t1, t2:
+                self.assertEqual(t.text, t["text"])
+                self.assertEqual(t.is_word, t["is_word"])
+            # new way to set
+            t1.text = "test2"
+            t1.is_word = False
+            self.assertEqual(t1.text, "test2")
+            self.assertEqual(t1.is_word, False)
+            # deprecated way to set
+            t1["text"] = "test3"
+            t1["is_word"] = True
+            self.assertEqual(t1.text, "test3")
+            self.assertEqual(t1.is_word, True)
+
+            with self.assertRaises(KeyError):
+                t1["bad_key"] = "test"
+            with self.assertRaises(KeyError):
+                _ = t2["bad_key"]
+
 
 if __name__ == "__main__":
     main()
diff --git a/g2p/transducer/__init__.py b/g2p/transducer/__init__.py
index e02835a9..8a167660 100644
--- a/g2p/transducer/__init__.py
+++ b/g2p/transducer/__init__.py
@@ -1219,11 +1219,11 @@ def __call__(self, to_convert: str):
         tg.clear_debugger()  # clear the meaningless initial debugger
 
         for token in self._tokenizer.tokenize_text(to_convert):
-            if token["is_word"]:
-                word_tg = self._transducer(token["text"])
+            if token.is_word:
+                word_tg = self._transducer(token.text)
                 tg += word_tg
             else:
-                non_word_tg = TransductionGraph(token["text"])
+                non_word_tg = TransductionGraph(token.text)
                 tg += non_word_tg
         return tg
 
@@ -1256,8 +1256,8 @@ def check(self, tg: TransductionGraph, shallow=False, display_warnings=False):
         # by step. I don't like this solution, but I don't see how to get around it.
         result = True
         for token in self._tokenizer.tokenize_text(tg.input_string):
-            if token["is_word"] and not self._transducer.check(
-                self._transducer(token["text"]),
+            if token.is_word and not self._transducer.check(
+                self._transducer(token.text),
                 shallow,
                 display_warnings=display_warnings,
             ):
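
A minimal usage sketch of the attribute-based `Token` API introduced by this patch (not the patch itself; it assumes this branch of g2p and its bundled `eng` lexicon mapping are installed, and borrows sample strings from the tests above):

```python
from g2p import make_tokenizer

# Tokens are now dataclasses with .text and .is_word attributes.
tokenizer = make_tokenizer("eng")
for token in tokenizer.tokenize_text("ten a.m., give or take"):
    if token.is_word:
        print("word:", token.text)
    else:
        print("separator:", repr(token.text))

# Dict-style access still works through the deprecated __getitem__/__setitem__
# shims, so older calling code keeps running while it migrates.
first = tokenizer.tokenize_text("hello")[0]
assert first["text"] == first.text
assert first["is_word"] == first.is_word
```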
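And a sketch of how the names declared in `__all__` fit together end to end; the `eng` to `eng-arpabet` path and the lexicon coverage of the sample words are assumptions based on g2p's usual ARPABET support and the tests above, not something this patch adds:

```python
from g2p import make_g2p, make_tokenizer, tokenize_and_map

# A plain (non-tokenizing) transducer pairs with tokenize_and_map(), which
# converts word tokens and passes punctuation and spaces through unchanged.
transducer = make_g2p("eng", "eng-arpabet", tokenize=False)
tokenizer = make_tokenizer("eng")  # lexicon-aware: keeps "all-out" and "a.m." whole
print(tokenize_and_map(tokenizer, transducer, "all-out war at ten a.m."))
```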