Merge pull request #400 from roedoejet/dev.ej/compact-lexicon
Dev.ej/compact lexicon
joanise authored Sep 16, 2024
2 parents b315a6c + 291708d commit 53c78f1
Showing 3 changed files with 114 additions and 10 deletions.
Binary file modified g2p/mappings/langs/langs.json.gz
61 changes: 53 additions & 8 deletions g2p/mappings/utils.py
@@ -13,7 +13,18 @@
from copy import deepcopy
from enum import Enum
from pathlib import Path
from typing import Any, Dict, List, Optional, Pattern, Tuple, TypeVar, Union, cast
from typing import (
    Any,
    Dict,
    List,
    Optional,
    Pattern,
    Sequence,
    Tuple,
    TypeVar,
    Union,
    cast,
)

import regex as re
import yaml
@@ -495,16 +506,50 @@ def get_alignment_sequence(alignment: str, delimiter="") -> List[Tuple[int, str]
# The joiner between key and value must be 0 so that it sorts before all
# characters and thus won't break bisect_left()
_JOINER = "\0"
# For compacting a group of lexicon entries into one string.
# This just has to be something that does not occur in the lexicon data
_BLOCK_JOINER = "\1"


def find_alignment(alignments: List[str], word: str) -> List[Tuple[int, str]]:
"""Given a sorted list of (word, alignment), find word and return its parsed alignment."""
"""Given a sorted list of (word, alignment), find word and return its parsed alignment.
Algorithm: double bisect over blocks and then entries within blocks.
"""
    i = bisect_left(alignments, word)
    if i != len(alignments):
        k, v = alignments[i].split(_JOINER, maxsplit=1)
        if k == word:
            return get_alignment_sequence(v)
    return []
    if i != len(alignments) and alignments[i].startswith(word + _JOINER):
        # Looking for the first entry of a block bisects to the correct block
        alignment_entry, _, _ = alignments[i].partition(_BLOCK_JOINER)
    elif i > 0:
        # Looking for the remaining entries of a block bisects one block too far:
        # bisect again within the previous block
        alignment_block = alignments[i - 1].split(_BLOCK_JOINER)
        j = bisect_left(alignment_block, word)
        if j != len(alignment_block):
            alignment_entry = alignment_block[j]
        else:
            return []  # word not found: would have been between this and next block
    else:
        return []  # word not found: would have been before the first block

    k, _, v = alignment_entry.partition(_JOINER)
    if k == word:
        return get_alignment_sequence(v)  # word found
    else:
        return []  # word not found: key in bisected location does not match word


def compact_alignments(alignments: Sequence[str]) -> List[str]:
    """Memory footprint optimization: compact the list of alignments into blocks.
    Each Python string has a significant overhead: grouping them into blocks of 16
    saves 15MB of RAM for the cmudict English lexicon, at no significant speed cost.
    """
    _BLOCK_SIZE = 16
    return [
        _BLOCK_JOINER.join(alignments[i : i + _BLOCK_SIZE])
        for i in range(0, len(alignments), _BLOCK_SIZE)
    ]


def load_alignments_from_file(path, delimiter="") -> List[str]:
@@ -526,7 +571,7 @@ def load_alignments_from_file(path, delimiter="") -> List[str]:
                continue
            word = get_alignment_input_string(spam)
            alignments.append(word + _JOINER + spam)
    return sorted(alignments)
    return compact_alignments(sorted(alignments))


def is_ipa(lang: str) -> bool:
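Editor's sketch (not part of the commit): how the compacted storage and the double bisect fit together, on a toy three-entry lexicon with a block size of 2 instead of 16 so the blocks stay visible. The alignment values ("A1", "A2", "A3") are placeholders, and lookup() returns the raw value instead of parsing it with get_alignment_sequence().

    from bisect import bisect_left

    _JOINER = "\0"        # sorts before every printable character
    _BLOCK_JOINER = "\1"  # assumed never to occur in the lexicon data

    entries = sorted(
        word + _JOINER + alignment
        for word, alignment in [("ab", "A1"), ("ad", "A2"), ("ae", "A3")]
    )
    BLOCK_SIZE = 2  # the committed code uses 16
    blocks = [
        _BLOCK_JOINER.join(entries[i : i + BLOCK_SIZE])
        for i in range(0, len(entries), BLOCK_SIZE)
    ]
    # blocks == ["ab\x00A1\x01ad\x00A2", "ae\x00A3"]

    def lookup(blocks, word):
        i = bisect_left(blocks, word)
        if i != len(blocks) and blocks[i].startswith(word + _JOINER):
            # word is the first entry of block i
            entry, _, _ = blocks[i].partition(_BLOCK_JOINER)
        elif i > 0:
            # otherwise word, if present at all, is inside the previous block
            inner = blocks[i - 1].split(_BLOCK_JOINER)
            j = bisect_left(inner, word)
            if j == len(inner):
                return None  # word would sort after the last entry of its block
            entry = inner[j]
        else:
            return None  # word would sort before the first block
        key, _, value = entry.partition(_JOINER)
        return value if key == word else None

    assert lookup(blocks, "ad") == "A2"  # entry in the middle of a block
    assert lookup(blocks, "ae") == "A3"  # entry that starts a block
    assert lookup(blocks, "ac") is None  # word that falls between two entries

The saving compact_alignments() targets is per-object overhead: each CPython str carries a fixed header (sys.getsizeof("") is 49 bytes for a compact ASCII string), so fusing 16 entries into one block string pays that header once instead of 16 times.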
63 changes: 61 additions & 2 deletions g2p/tests/test_lexicon_transducer.py
@@ -200,8 +200,7 @@ def test_eng_lexicon(self):
        )

    def test_eng_transducer(self):
        """Test the cached eng to eng-ipa lexicon from make_g2p
        ."""
        """Test the cached eng to eng-ipa lexicon from make_g2p."""
        transducer = make_g2p("eng", "eng-arpabet")
        tg = transducer("hello")
        self.assertEqual(tg.output_string, "HH AH L OW ")
@@ -211,6 +210,66 @@ def test_eng_transducer(self):
transducer("hello my friend").output_string, "HH AH L OW M AY F R EH N D "
)

    def test_eng_lexicon_corner_cases(self):
        """White-box testing for compact storage of lexicon mappings."""
        test_cases = (
            ("'bout", "baʊt"),  # first entry in eng->eng-ipa
            ("'cause", "kʌz"),  # second entry
            ("'course", "kɔɹs"),  # third
            ("'tis", "tɪz"),  # 15th entry
            ("'twas", "twʌz"),  # 16th entry
            ("a", "ʌ"),  # 17th entry
            ("buttering", "bʌtɜ˞ɪŋ"),  # 15998th, which is -2 mod 16
            ("buttermilk", "bʌtɜ˞mɪlk"),  # 15999th, -1 mod 16
            ("buttermore", "bʌtɜ˞mɔɹ"),  # 16000th, 0 mod 16
            ("butters", "bʌtɜ˞z"),  # 16001st, 1 mod 16
            ("butterscotch", "bʌtɜ˞skɑtʃ"),
            ("butterworth", "bʌtɜ˞wɜ˞θ"),
            ("buttery", "bʌtɜ˞i"),
            ("butthead", "bʌthɛd"),
            ("butting", "bʌtɪŋ"),
            ("buttitta", "butitʌ"),
            ("buttke", "bʌtki"),
            ("buttler", "bʌtlɜ˞"),
            ("buttner", "bʌtnɜ˞"),
            ("buttock", "bʌtʌk"),
            ("buttocks", "bʌtʌks"),
            ("button", "bʌtʌn"),
            ("buttoned", "bʌtʌnd"),
            ("buttonhole", "bʌtʌnhoʊl"),
            ("buttonholed", "bʌtʌnhoʊld"),
            ("buttonholes", "bʌtʌnhoʊlz"),
            ("buttons", "bʌtʌnz"),  # 16018th
            ("zwieg", "zwiɡ"),  # last block of the lexicon
            ("zwilling", "zwɪlɪŋ"),
            ("zwolinski", "zvʌlɪnski"),
            ("zycad", "zɪkæd"),
            ("zych", "zaɪtʃ"),
            ("zycher", "zɪkɜ˞"),
            ("zydeco", "zaɪdʌkoʊ"),
            ("zygmunt", "zɪɡmʌnt"),
            ("zygote", "zaɪɡoʊt"),
            ("zyla", "zɪlʌ"),
            ("zylka", "zɪlkʌ"),
            ("zylstra", "zɪlstɹʌ"),
            ("zyman", "zaɪmʌn"),
            ("zynda", "zɪndʌ"),
            ("zysk", "zaɪsk"),
            ("zyskowski", "zɪskɔfski"),
            ("zyuganov", "zjuɡɑnɑv"),
            ("zyuganov's", "zjuɡɑnɑvz"),
            ("zywicki", "zɪwɪki"),
        )

        transducer = make_g2p("eng", "eng-ipa", tokenize=False)
        for word, expected in test_cases:
            tg = transducer(word)
            self.assertEqual(tg.output_string, expected)
            before = word[:-1] + chr(ord(word[-1]) - 1) + "z"
            self.assertEqual(
                transducer(before).output_string, "", f"word={word} before={before}"
            )
            after = word[:-1] + chr(ord(word[-1]) + 1) + "z"
            self.assertEqual(
                transducer(after).output_string, "", f"word={word} after={after}"
            )


if __name__ == "__main__":
    main()
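Editor's note (not part of the commit): the before/after probes in test_eng_lexicon_corner_cases perturb the last character of each test word to manufacture near-miss neighbours that sort just before and just after it, so every block-boundary bisection is exercised with words absent from the lexicon. A minimal check of the ordering trick:

    word = "button"
    before = word[:-1] + chr(ord(word[-1]) - 1) + "z"  # "buttomz" sorts before "button"
    after = word[:-1] + chr(ord(word[-1]) + 1) + "z"  # "buttooz" sorts after "button"
    assert before < word < after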
