Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support unambiguous detection of language if only prefixes are supplied #106

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 22 additions & 3 deletions src/mnemonic/mnemonic.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,15 +90,34 @@ def normalize_string(txt: AnyStr) -> str:

@classmethod
def detect_language(cls, code: str) -> str:
    """Detect the language of a mnemonic, accepting abbreviated word prefixes.

    Each whitespace-separated token of ``code`` is treated as a word or as
    the prefix of a word. Some tokens are ambiguous between languages
    because they are a complete word in one wordlist and a prefix in
    another:

        english: abandon ... about
        french:  abandon ... aboutir

    Resolution happens in two steps:

    1. Keep only the languages in which every token is a prefix of at
       least one word. If exactly one language remains, return it.
    2. Otherwise, look for tokens that are an exact word in exactly one
       of the remaining languages. If those tokens all point to the same
       single language, return it.

    Args:
        code: The mnemonic (or a fragment of it) to examine.

    Returns:
        The name of the detected language.

    Raises:
        ConfigurationError: If some token matches no language, or if the
            language remains ambiguous after both steps.
    """
    code = cls.normalize_string(code)
    possible = {cls(lang) for lang in cls.list_languages()}
    # De-duplicate while preserving input order: iterating a plain set of
    # strings would make the word reported in the error below depend on
    # string-hash randomisation.
    words = list(dict.fromkeys(code.split()))
    for word in words:
        # Possible languages have candidate(s) starting with the word/prefix.
        possible = {
            p for p in possible if any(c.startswith(word) for c in p.wordlist)
        }
        if not possible:
            raise ConfigurationError(f"Language unrecognized for {word!r}")
    # Every word has been checked against every surviving language here.
    if len(possible) == 1:
        return possible.pop().language
    # Multiple languages match: a prefix in many, but an exact match in
    # exactly one language determines the language.
    complete = set()
    for word in words:
        exact = {p for p in possible if word in p.wordlist}
        if len(exact) == 1:
            complete.update(exact)
    if len(complete) == 1:
        return complete.pop().language
    # Sort the names so the message does not depend on set iteration order.
    raise ConfigurationError(
        f"Language ambiguous between {', '.join(sorted(p.language for p in possible))}"
    )
Expand Down
16 changes: 16 additions & 0 deletions tests/test_mnemonic.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,10 @@ def test_failed_checksum(self) -> None:
def test_detection(self) -> None:
self.assertEqual("english", Mnemonic.detect_language("security"))

self.assertEqual( "english", Mnemonic.detect_language( "fruit wave dwarf" )) # ambiguous up to wave
self.assertEqual( "english", Mnemonic.detect_language( "fru wago dw" )) # ambiguous french/english up to dwarf prefix
self.assertEqual( "french", Mnemonic.detect_language( "fru wago dur enje" )) # ambiguous french/english up to enjeu prefix

with self.assertRaises(Exception):
Mnemonic.detect_language(
"jaguar xxxxxxx"
Expand All @@ -67,8 +71,20 @@ def test_detection(self) -> None:
"jaguar jaguar"
) # Ambiguous after examining all words

# Allowing word prefixes in language detection presents ambiguity issues. Require exactly
# one language that matches all prefixes, or one language matching some word(s) exactly.
self.assertEqual("english", Mnemonic.detect_language("jaguar security"))
self.assertEqual("french", Mnemonic.detect_language("jaguar aboyer"))
self.assertEqual("english", Mnemonic.detect_language("abandon about"))
self.assertEqual("french", Mnemonic.detect_language("abandon aboutir"))
self.assertEqual("french", Mnemonic.detect_language("fav financer"))
self.assertEqual("czech", Mnemonic.detect_language("fav finance"))
with self.assertRaises(Exception):
Mnemonic.detect_language("favor finan")
self.assertEqual("czech", Mnemonic.detect_language("flanel"))
self.assertEqual("portuguese", Mnemonic.detect_language("flanela"))
with self.assertRaises(Exception):
Mnemonic.detect_language("flane")

def test_utf8_nfkd(self) -> None:
# The same sentence in various UTF-8 forms
Expand Down
Loading