Merge pull request #400 from roedoejet/dev.ej/compact-lexicon
Dev.ej/compact lexicon
joanise authored Sep 16, 2024
2 parents b315a6c + 291708d commit 53c78f1
Showing 3 changed files with 114 additions and 10 deletions.
Binary file modified g2p/mappings/langs/langs.json.gz
61 changes: 53 additions & 8 deletions g2p/mappings/utils.py
@@ -13,7 +13,18 @@
from copy import deepcopy
from enum import Enum
from pathlib import Path
from typing import Any, Dict, List, Optional, Pattern, Tuple, TypeVar, Union, cast
from typing import (
    Any,
    Dict,
    List,
    Optional,
    Pattern,
    Sequence,
    Tuple,
    TypeVar,
    Union,
    cast,
)

import regex as re
import yaml
@@ -495,16 +506,50 @@ def get_alignment_sequence(alignment: str, delimiter="") -> List[Tuple[int, str]
# The joiner between key and value must be 0 so that it sorts before all
# characters and thus won't break bisect_left()
_JOINER = "\0"
# For compacting a group of lexicon entries into one string.
# This just has to be something that does not occur in the lexicon data
_BLOCK_JOINER = "\1"


def find_alignment(alignments: List[str], word: str) -> List[Tuple[int, str]]:
"""Given a sorted list of (word, alignment), find word and return its parsed alignment."""
"""Given a sorted list of (word, alignment), find word and return its parsed alignment.
Algorithm: double bisect over blocks and then entries within blocks.
"""
    i = bisect_left(alignments, word)
    if i != len(alignments):
        k, v = alignments[i].split(_JOINER, maxsplit=1)
        if k == word:
            return get_alignment_sequence(v)
    return []
    if i != len(alignments) and alignments[i].startswith(word + _JOINER):
        # Looking for the first entry of a block bisects to the correct block
        alignment_entry, _, _ = alignments[i].partition(_BLOCK_JOINER)
    elif i > 0:
        # Looking for the remaining entries of a block bisects one block too far:
        # bisect again within the previous block
        alignment_block = alignments[i - 1].split(_BLOCK_JOINER)
        j = bisect_left(alignment_block, word)
        if j != len(alignment_block):
            alignment_entry = alignment_block[j]
        else:
            return []  # word not found: would have been between this and next block
    else:
        return []  # word not found: would have been before the first block

    k, _, v = alignment_entry.partition(_JOINER)
    if k == word:
        return get_alignment_sequence(v)  # word found
    else:
        return []  # word not found: key in bisected location does not match word


def compact_alignments(alignments: Sequence[str]) -> List[str]:
    """Memory footprint optimization: compact the list of alignments into blocks.
    Each Python string has a significant overhead: grouping them into blocks of 16
    saves 15MB of RAM for the cmudict English lexicon, at no significant speed cost.
    """
    _BLOCK_SIZE = 16
    return [
        _BLOCK_JOINER.join(alignments[i : i + _BLOCK_SIZE])
        for i in range(0, len(alignments), _BLOCK_SIZE)
    ]


def load_alignments_from_file(path, delimiter="") -> List[str]:
@@ -526,7 +571,7 @@ def load_alignments_from_file(path, delimiter="") -> List[str]:
                continue
            word = get_alignment_input_string(spam)
            alignments.append(word + _JOINER + spam)
    return sorted(alignments)
    return compact_alignments(sorted(alignments))


def is_ipa(lang: str) -> bool:
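Editor's sketch (not part of the commit): how the compacted storage and the double bisect fit together, on a toy three-entry lexicon with a block size of 2 instead of 16 so the blocks stay visible. The alignment values ("A1", "A2", "A3") are placeholders, and lookup() returns the raw value instead of parsing it with get_alignment_sequence().

    from bisect import bisect_left

    _JOINER = "\0"        # sorts before every printable character
    _BLOCK_JOINER = "\1"  # assumed never to occur in the lexicon data

    entries = sorted(
        word + _JOINER + alignment
        for word, alignment in [("ab", "A1"), ("ad", "A2"), ("ae", "A3")]
    )
    BLOCK_SIZE = 2  # the committed code uses 16
    blocks = [
        _BLOCK_JOINER.join(entries[i : i + BLOCK_SIZE])
        for i in range(0, len(entries), BLOCK_SIZE)
    ]
    # blocks == ["ab\x00A1\x01ad\x00A2", "ae\x00A3"]

    def lookup(blocks, word):
        i = bisect_left(blocks, word)
        if i != len(blocks) and blocks[i].startswith(word + _JOINER):
            # word is the first entry of block i
            entry, _, _ = blocks[i].partition(_BLOCK_JOINER)
        elif i > 0:
            # otherwise word, if present at all, is inside the previous block
            inner = blocks[i - 1].split(_BLOCK_JOINER)
            j = bisect_left(inner, word)
            if j == len(inner):
                return None  # word would sort after the last entry of its block
            entry = inner[j]
        else:
            return None  # word would sort before the first block
        key, _, value = entry.partition(_JOINER)
        return value if key == word else None

    assert lookup(blocks, "ad") == "A2"  # entry in the middle of a block
    assert lookup(blocks, "ae") == "A3"  # entry that starts a block
    assert lookup(blocks, "ac") is None  # word that falls between two entries

The saving compact_alignments() targets is per-object overhead: each CPython str carries a fixed header (sys.getsizeof("") is 49 bytes for a compact ASCII string), so fusing 16 entries into one block string pays that header once instead of 16 times.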
63 changes: 61 additions & 2 deletions g2p/tests/test_lexicon_transducer.py
@@ -200,8 +200,7 @@ def test_eng_lexicon(self):
        )

    def test_eng_transducer(self):
        """Test the cached eng to eng-ipa lexicon from make_g2p
        ."""
        """Test the cached eng to eng-ipa lexicon from make_g2p."""
        transducer = make_g2p("eng", "eng-arpabet")
        tg = transducer("hello")
        self.assertEqual(tg.output_string, "HH AH L OW ")
@@ -211,6 +210,66 @@ def test_eng_transducer(self):
transducer("hello my friend").output_string, "HH AH L OW M AY F R EH N D "
)

    def test_eng_lexicon_corner_cases(self):
        """White-box testing for compact storage of lexicon mappings."""
        test_cases = (
            ("'bout", "baʊt"),  # first entry in eng->eng-ipa
            ("'cause", "kʌz"),  # second entry
            ("'course", "kɔɹs"),  # third
            ("'tis", "tɪz"),  # 15th entry
            ("'twas", "twʌz"),  # 16th entry
            ("a", "ʌ"),  # 17th entry
            ("buttering", "bʌtɜ˞ɪŋ"),  # 15998th, which is -2 mod 16
            ("buttermilk", "bʌtɜ˞mɪlk"),  # 15999th, -1 mod 16
            ("buttermore", "bʌtɜ˞mɔɹ"),  # 16000th, 0 mod 16
            ("butters", "bʌtɜ˞z"),  # 16001st, 1 mod 16
            ("butterscotch", "bʌtɜ˞skɑtʃ"),
            ("butterworth", "bʌtɜ˞wɜ˞θ"),
            ("buttery", "bʌtɜ˞i"),
            ("butthead", "bʌthɛd"),
            ("butting", "bʌtɪŋ"),
            ("buttitta", "butitʌ"),
            ("buttke", "bʌtki"),
            ("buttler", "bʌtlɜ˞"),
            ("buttner", "bʌtnɜ˞"),
            ("buttock", "bʌtʌk"),
            ("buttocks", "bʌtʌks"),
            ("button", "bʌtʌn"),
            ("buttoned", "bʌtʌnd"),
            ("buttonhole", "bʌtʌnhoʊl"),
            ("buttonholed", "bʌtʌnhoʊld"),
            ("buttonholes", "bʌtʌnhoʊlz"),
            ("buttons", "bʌtʌnz"),  # 16018th
            ("zwieg", "zwiɡ"),  # last block of the lexicon
            ("zwilling", "zwɪlɪŋ"),
            ("zwolinski", "zvʌlɪnski"),
            ("zycad", "zɪkæd"),
            ("zych", "zaɪtʃ"),
            ("zycher", "zɪkɜ˞"),
            ("zydeco", "zaɪdʌkoʊ"),
            ("zygmunt", "zɪɡmʌnt"),
            ("zygote", "zaɪɡoʊt"),
            ("zyla", "zɪlʌ"),
            ("zylka", "zɪlkʌ"),
            ("zylstra", "zɪlstɹʌ"),
            ("zyman", "zaɪmʌn"),
            ("zynda", "zɪndʌ"),
            ("zysk", "zaɪsk"),
            ("zyskowski", "zɪskɔfski"),
            ("zyuganov", "zjuɡɑnɑv"),
            ("zyuganov's", "zjuɡɑnɑvz"),
            ("zywicki", "zɪwɪki"),
        )

        transducer = make_g2p("eng", "eng-ipa", tokenize=False)
        for word, expected in test_cases:
            tg = transducer(word)
            self.assertEqual(tg.output_string, expected)
            before = word[:-1] + chr(ord(word[-1]) - 1) + "z"
            self.assertEqual(
                transducer(before).output_string, "", f"word={word} before={before}"
            )
            after = word[:-1] + chr(ord(word[-1]) + 1) + "z"
            self.assertEqual(
                transducer(after).output_string, "", f"word={word} after={after}"
            )


if __name__ == "__main__":
    main()
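Editor's note (not part of the commit): the before/after probes in test_eng_lexicon_corner_cases perturb the last character of each test word to manufacture near-miss neighbours that sort just before and just after it, so every block-boundary bisection is exercised with words absent from the lexicon. A minimal check of the ordering trick:

    word = "button"
    before = word[:-1] + chr(ord(word[-1]) - 1) + "z"  # "buttomz" sorts before "button"
    after = word[:-1] + chr(ord(word[-1]) + 1) + "z"  # "buttooz" sorts after "button"
    assert before < word < after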
