diff --git a/engine/hunspell_table.py b/engine/hunspell_table.py index 1e4b4eb2..78507365 100644 --- a/engine/hunspell_table.py +++ b/engine/hunspell_table.py @@ -7721,7 +7721,8 @@ def _process_key_event(self, key: itb_util.KeyEvent) -> bool: if (not self.is_empty() and not self._typed_compose_sequence and self._is_restricted_engine()): - (committed, committed_index, preedit) = self._transliterators[ + (committed, committed_index, preedit, + _cursor_pos, _status, _candidates) = self._transliterators[ self.get_current_imes()[0]].transliterate_parts( self._typed_string, ascii_digits=self._ascii_digits) if self._debug_level > 1: diff --git a/engine/m17n_translit.py b/engine/m17n_translit.py index e8b669e9..8bbcab06 100644 --- a/engine/m17n_translit.py +++ b/engine/m17n_translit.py @@ -917,6 +917,7 @@ def convert_digits_to_ascii(text: str) -> str: return text.translate(DIGIT_TRANS_TABLE) class Transliterator: + # pylint: disable=line-too-long '''A class for transliterators using libm17n If initializing the transliterator fails, for example because a @@ -950,23 +951,23 @@ class Transliterator: 'नमस्ते' >>> trans.transliterate_parts(list('n')) - ('', 0, 'न्') + ('', 0, 'न्', 2, 'क', []) >>> trans.transliterate_parts(list('n ')) - ('न ', 2, '') + ('न ', 2, '', 0, 'क', []) >>> trans.transliterate_parts(list('na')) - ('', 0, 'न') + ('', 0, 'न', 1, 'क', []) >>> trans.transliterate_parts(list('nam')) - ('न', 2, 'म्') + ('न', 2, 'म्', 2, 'क', []) >>> trans.transliterate_parts(list('nama')) - ('न', 2, 'म') + ('न', 2, 'म', 1, 'क', []) >>> trans.transliterate_parts(list('namas')) - ('नम', 4, 'स्') + ('नम', 4, 'स्', 2, 'क', []) >>> trans.transliterate_parts(list('namast')) - ('नम', 4, 'स्त्') + ('नम', 4, 'स्त्', 4, 'क', []) >>> trans.transliterate_parts(list('namaste')) - ('नम', 4, 'स्ते') + ('नम', 4, 'स्ते', 4, 'क', []) >>> trans.transliterate_parts(list('namaste ')) - ('नमस्ते ', 8, '') + ('नमस्ते ', 8, '', 0, 'क', []) >>> trans.transliterate(list('. 
')) '। ' @@ -1015,30 +1016,47 @@ class Transliterator: >>> trans = Transliterator('t-latn-post') >>> trans.transliterate_parts(list('u')) - ('', 0, 'u') + ('', 0, 'u', 1, 'Latin-post', []) >>> trans.transliterate_parts(list('u"')) - ('', 0, 'ü') + ('', 0, 'ü', 1, 'Latin-post', []) >>> trans.transliterate_parts(list('u""')) - ('u"', 3, '') + ('u"', 3, '', 0, 'Latin-post', []) >>> trans.transliterate_parts(list('u"u')) - ('ü', 2, 'u') + ('ü', 2, 'u', 1, 'Latin-post', []) >>> trans.transliterate_parts(list('üu"u')) - ('üü', 3, 'u') + ('üü', 3, 'u', 1, 'Latin-post', []) >>> trans = Transliterator('t-rfc1345') >>> trans.transliterate_parts(list('&')) - ('', 0, '&') + ('', 0, '&', 1, 'RFC1345', []) >>> trans.transliterate_parts(list('&C')) - ('', 0, '&C') + ('', 0, '&C', 2, 'RFC1345', []) >>> trans.transliterate_parts(list('&Co')) - ('©', 3, '') + ('©', 3, '', 0, 'RFC1345', []) >>> trans.transliterate_parts(list('&f')) - ('', 0, '&f') + ('', 0, '&f', 2, 'RFC1345', []) >>> trans.transliterate_parts(list('&ff')) - ('', 0, 'ff') + ('', 0, 'ff', 1, 'RFC1345', []) >>> trans.transliterate_parts(list('&ffi')) - ('ffi', 4, '') + ('ffi', 4, '', 0, 'RFC1345', []) + + >>> trans = Transliterator('t-lsymbol') + >>> trans.transliterate_parts(list('/:)')) + ('', 0, '☺️', 2, 'lsymbol', ['☺️', '😃', '😅', '😆', '😉', '😇', '😂', '😏', '😛', '😜', '😝', '😋', '😉', '💏', '💋', '😍', '😘', '😚', '😽', '😻']) + >>> trans.transliterate_parts(list('a')) + ('a', 1, '', 0, 'lsymbol', []) + >>> trans.transliterate_parts(list('a/')) + ('a', 1, '/', 1, 'lsymbol', []) + >>> trans.transliterate_parts(list('a/:')) + ('a', 1, '/:', 2, 'lsymbol', []) + >>> trans.transliterate_parts(list('a/:(')) + ('a', 1, '😢', 1, 'lsymbol', ['😢', '😩', '😡', '😭', '😪', '🙈', '🙊', '🙉']) + >>> trans.transliterate_parts(list('a/:(b')) + ('a😢b', 5, '', 0, 'lsymbol', []) + + For a test transliterating parts using 'ja-anthy' see 'tests/test_m17n_translit.py'. 
''' + # pylint: enable=line-too-long def __init__(self, ime: str) -> None: '''Initialize the input method to use for the transliteration @@ -1074,7 +1092,7 @@ def __init__(self, ime: str) -> None: def transliterate_parts( self, msymbol_list: Iterable[str], - ascii_digits: bool = False) -> Tuple[str, int, str]: + ascii_digits: bool = False) -> Tuple[str, int, str, int, str, List[str]]: '''Transliterate a list of Msymbol names :param msymbol_list: A list of strings which are interpreted @@ -1084,16 +1102,40 @@ def transliterate_parts( Msymbols is just joined to a single string. :param ascii_digits: If true, convert language specific digits to ASCII digits - :return: The transliteration in two parts: (committed, preedit) + :return: The transliteration in several parts: + + (committed, committed_index, preedit, cursor_pos, status, candidates) + + committed: str The part of the transliteration which cannot be + changed anymore by adding more input, could be + committed already if desired. + committed_index: int The index up to which the msymbol_list input + was “used up” to create the “committed” text. + preedit: str The transliteration of the remaining input, + may still change by adding more input. + cursor_pos: int The cursor position in the preedit. + Counted in codepoints, not glyphs. + Usually this is at the end of the preedit + but an input method may move the cursor + within the preedit! + (I think only ja-anthy.mim actually uses this) + status: str May change for some input methods to + indicate a state. + For example in case of ja-anthy.mim, + this is 'aあ' before Henkan and changes + to '漢' in Henkan mode. + candidates: List[str] May contain a list of candidates if the + input method can produce multiple candidates. 
''' if not isinstance(msymbol_list, list): raise ValueError('Argument of transliterate() must be a list.') if self._dummy: - return (''.join(msymbol_list), 0, '') + return (''.join(msymbol_list), 0, '', 0, '', []) libm17n__minput_reset_ic(self._ic) # type: ignore committed = '' committed_index = 0 preedit = '' + candidates: List[str] = [] for index, symbol in enumerate(msymbol_list): if len(symbol) == 1 and not itb_util.is_ascii(symbol): symbol = IBus.keyval_name(IBus.unicode_to_keyval(symbol)) @@ -1124,6 +1166,42 @@ def transliterate_parts( except Exception as error: # pylint: disable=broad-except # This should never happen: raise ValueError('Problem accessing preedit') from error + plist = self._ic.contents.candidate_list + while bool(plist): # NULL pointers have a False boolean value + key = libm17n__mplist_key(plist) # type: ignore + if not bool(key): + break + key_name = libm17n__msymbol_name(key.contents) # type: ignore + if key_name == b'mtext': + characters = mtext_to_string( + ctypes.cast(libm17n__mplist_value(plist), # type: ignore + ctypes.POINTER(libm17n__MText))) + candidates += list(characters) + elif key_name == b'plist': + candidate_plist = ctypes.cast( + libm17n__mplist_value(plist), # type: ignore + ctypes.POINTER(libm17n__MPlist)) + while True: + candidate_plist_key = libm17n__mplist_key( # type: ignore + candidate_plist) + if not bool(candidate_plist_key): + break + candidate_plist_key_name = libm17n__msymbol_name( # type: ignore + candidate_plist_key.contents) + if candidate_plist_key_name != b'mtext': + break + candidate = mtext_to_string( + ctypes.cast( + libm17n__mplist_value(candidate_plist), # type: ignore + ctypes.POINTER(libm17n__MText))) + candidates.append(candidate) + candidate_plist = libm17n__mplist_next( # type: ignore + candidate_plist) + else: + break + plist = libm17n__mplist_next(plist) # type: ignore + cursor_pos = self._ic.contents.cursor_pos + status = mtext_to_string(self._ic.contents.status) # From the m17n-lib 
documentation: # # The minput_reset_ic () function resets input context $IC by @@ -1151,10 +1229,18 @@ def transliterate_parts( if committed and not preedit: committed_index = len(msymbol_list) if not ascii_digits: - return (committed, committed_index, preedit) + return (committed, + committed_index, + preedit, + cursor_pos, + status, + candidates) return (convert_digits_to_ascii(committed), committed_index, - convert_digits_to_ascii(preedit)) + convert_digits_to_ascii(preedit), + cursor_pos, + status, + candidates) def transliterate(self, msymbol_list: Iterable[str], ascii_digits: bool = False) -> str: '''Transliterate a list of Msymbol names @@ -1168,7 +1254,12 @@ def transliterate(self, msymbol_list: Iterable[str], ascii_digits: bool = False) to ASCII digits :return: The transliteration in one string ''' - (committed, _committed_index, preedit) = self.transliterate_parts( + (committed, + _committed_index, + preedit, + _cursor_pos, + _status, + _candidates) = self.transliterate_parts( msymbol_list, ascii_digits) return committed + preedit diff --git a/tests/test_m17n_translit.py b/tests/test_m17n_translit.py index 73274afa..616aae0c 100755 --- a/tests/test_m17n_translit.py +++ b/tests/test_m17n_translit.py @@ -457,50 +457,150 @@ def test_hi_itrans(self) -> None: def test_hi_itrans_parts(self) -> None: trans = self.get_transliterator_or_skip('hi-itrans') - self.assertEqual(trans.transliterate_parts(list('n')), ('', 0, 'न्')) - self.assertEqual(trans.transliterate_parts(['n', 'S-C-Return']), ('न्', 2, '')) - self.assertEqual(trans.transliterate_parts(['n', 'S-C-Return', ' ']), ('न् ', 3, '')) - self.assertEqual(trans.transliterate_parts(list('n ')), ('न ', 2, '')) - self.assertEqual(trans.transliterate_parts(list('na')), ('', 0, 'न')) - self.assertEqual(trans.transliterate_parts(list('nam')), ('न', 2, 'म्')) - self.assertEqual(trans.transliterate_parts(list('nama')), ('न', 2, 'म')) - self.assertEqual(trans.transliterate_parts(list('namas')), ('नम', 4, 'स्')) - 
self.assertEqual(trans.transliterate_parts(list('namast')), ('नम', 4, 'स्त्')) - self.assertEqual(trans.transliterate_parts(list('namaste')), ('नम', 4, 'स्ते')) - self.assertEqual(trans.transliterate_parts(list('namaste ')), ('नमस्ते ', 8, '')) + self.assertEqual(trans.transliterate_parts(list('n')), ('', 0, 'न्', 2, 'क', [])) + self.assertEqual(trans.transliterate_parts(['n', 'S-C-Return']), ('न्', 2, '', 0, 'क', [])) + self.assertEqual(trans.transliterate_parts(['n', 'S-C-Return', ' ']), ('न् ', 3, '', 0, 'क', [])) + self.assertEqual(trans.transliterate_parts(list('n ')), ('न ', 2, '', 0, 'क', [])) + self.assertEqual(trans.transliterate_parts(list('na')), ('', 0, 'न', 1, 'क', [])) + self.assertEqual(trans.transliterate_parts(list('nam')), ('न', 2, 'म्', 2, 'क', [])) + self.assertEqual(trans.transliterate_parts(list('nama')), ('न', 2, 'म', 1, 'क', [])) + self.assertEqual(trans.transliterate_parts(list('namas')), ('नम', 4, 'स्', 2, 'क', [])) + self.assertEqual(trans.transliterate_parts(list('namast')), ('नम', 4, 'स्त्', 4, 'क', [])) + self.assertEqual(trans.transliterate_parts(list('namaste')), ('नम', 4, 'स्ते', 4, 'क', [])) + self.assertEqual(trans.transliterate_parts(list('namaste ')), ('नमस्ते ', 8, '', 0, 'क', [])) def test_t_latn_post_parts(self) -> None: trans = self.get_transliterator_or_skip('t-latn-post') - self.assertEqual(trans.transliterate_parts(list('u')), ('', 0, 'u')) - self.assertEqual(trans.transliterate_parts(list('u"')), ('', 0, 'ü')) - self.assertEqual(trans.transliterate_parts(list('u""')), ('u"', 3, '')) - self.assertEqual(trans.transliterate_parts(list('u"u')), ('ü', 2, 'u')) - self.assertEqual(trans.transliterate_parts(list('üu"u')), ('üü', 3, 'u')) + self.assertEqual(trans.transliterate_parts(list('u')), ('', 0, 'u', 1, 'Latin-post', [])) + self.assertEqual(trans.transliterate_parts(list('u"')), ('', 0, 'ü', 1, 'Latin-post', [])) + self.assertEqual(trans.transliterate_parts(list('u""')), ('u"', 3, '', 0, 'Latin-post', [])) + 
self.assertEqual(trans.transliterate_parts(list('u"u')), ('ü', 2, 'u', 1, 'Latin-post', [])) + self.assertEqual(trans.transliterate_parts(list('üu"u')), ('üü', 3, 'u', 1, 'Latin-post', [])) def test_t_rfc1345_parts(self) -> None: trans = self.get_transliterator_or_skip('t-rfc1345') - self.assertEqual(trans.transliterate_parts(list('&')), ('', 0, '&')) - self.assertEqual(trans.transliterate_parts(list('&C')), ('', 0, '&C')) - self.assertEqual(trans.transliterate_parts(list('&Co')), ('©', 3, '')) - self.assertEqual(trans.transliterate_parts(list('&f')), ('', 0, '&f')) - self.assertEqual(trans.transliterate_parts(list('&ff')), ('', 0, 'ff')) - self.assertEqual(trans.transliterate_parts(list('&ffi')), ('ffi', 4, '')) - self.assertEqual(trans.transliterate_parts(list('☺&ffi中')), ('☺ffi中', 6, '')) + self.assertEqual(trans.transliterate_parts(list('&')), ('', 0, '&', 1, 'RFC1345', [])) + self.assertEqual(trans.transliterate_parts(list('&C')), ('', 0, '&C', 2, 'RFC1345', [])) + self.assertEqual(trans.transliterate_parts(list('&Co')), ('©', 3, '', 0, 'RFC1345', [])) + self.assertEqual(trans.transliterate_parts(list('&f')), ('', 0, '&f', 2, 'RFC1345', [])) + self.assertEqual(trans.transliterate_parts(list('&ff')), ('', 0, 'ff', 1, 'RFC1345', [])) + self.assertEqual(trans.transliterate_parts(list('&ffi')), ('ffi', 4, '', 0, 'RFC1345', [])) + self.assertEqual(trans.transliterate_parts(list('☺&ffi中')), ('☺ffi中', 6, '', 0, 'RFC1345', [])) + + @unittest.skipUnless( + M17N_DB_VERSION >= (1, 8, 8), + 'Skipping because m17n-db is too old') + def test_t_lsymbol_parts(self) -> None: + trans = self.get_transliterator_or_skip('t-lsymbol') + self.assertEqual( + trans.transliterate_parts(list('/:)')), + ('', 0, '☺️', 2, 'lsymbol', ['☺️', '😃', '😅', '😆', '😉', '😇', '😂', '😏', '😛', '😜', '😝', '😋', '😉', '💏', '💋', '😍', '😘', '😚', '😽', '😻'])) + self.assertEqual( + trans.transliterate_parts(list('a')), + ('a', 1, '', 0, 'lsymbol', [])) + self.assertEqual( + trans.transliterate_parts(list('a/')), + 
('a', 1, '/', 1, 'lsymbol', [])) + self.assertEqual( + trans.transliterate_parts(list('a/:')), + ('a', 1, '/:', 2, 'lsymbol', [])) + self.assertEqual( + trans.transliterate_parts(list('a/:(')), + ('a', 1, '😢', 1, 'lsymbol', ['😢', '😩', '😡', '😭', '😪', '🙈', '🙊', '🙉'])) + self.assertEqual( + trans.transliterate_parts(list('a/:(b')), + ('a😢b', 5, '', 0, 'lsymbol', [])) + + def test_ja_anthy_parts(self) -> None: + trans = self.get_transliterator_or_skip('ja-anthy') + if trans.transliterate(list('a ')).startswith('あ'): + self.skipTest( + 'Henkan doesn’t work. ' + 'Apparently some libraries necessary for ' + 'ja-anthy to work correctly are not installed.') + self.assertEqual( + trans.transliterate_parts(list('あ')), + ('あ', 1, '', 0, 'aあ', [])) + self.assertEqual( + trans.transliterate_parts(list('亜')), + ('亜', 1, '', 0, 'aあ', [])) + self.assertEqual( + trans.transliterate_parts(list('😇')), + ('😇', 1, '', 0, 'aあ', [])) + self.assertEqual( + trans.transliterate_parts(list('a')), + ('', 0, 'あ', 1, 'aあ', [])) + (committed, committed_index, preedit, cursor_pos, status, candidates) = trans.transliterate_parts( + list('a ')) + self.assertEqual(committed, '') + self.assertEqual(committed_index, 0) + self.assertEqual(len(preedit), 1) + self.assertEqual(cursor_pos, 1) + self.assertEqual(status, '漢') + self.assertTrue(len(candidates) > 5) + self.assertTrue('娃' in candidates) + self.assertTrue('亜' in candidates) + self.assertTrue('阿' in candidates) + self.assertTrue('あ' in candidates) + self.assertTrue('ア' in candidates) + self.assertEqual( + trans.transliterate_parts(list('kisha')), + ('', 0, 'きしゃ', 3, 'aあ', [])) + (committed, committed_index, preedit, cursor_pos, status, candidates) = trans.transliterate_parts( + list('kisha ')) + self.assertEqual(committed, '') + self.assertEqual(committed_index, 0) + self.assertEqual(len(preedit), 2) + self.assertEqual(cursor_pos, 2) + self.assertEqual(status, '漢') + self.assertTrue(len(candidates) > 5) + self.assertTrue('記者' in candidates) + 
self.assertTrue('帰社' in candidates) + self.assertTrue('汽車' in candidates) + self.assertTrue('貴社' in candidates) + self.assertTrue('きしゃ' in candidates) + self.assertTrue('キシャ' in candidates) + (committed, committed_index, preedit, cursor_pos, status, candidates) = trans.transliterate_parts( + list('akisha ')) + self.assertEqual(committed, '') + self.assertEqual(committed_index, 0) + self.assertEqual(len(preedit), 3) + self.assertEqual(cursor_pos, 1) + self.assertEqual(status, '漢') + self.assertTrue(len(candidates) > 5) + self.assertTrue('娃' in candidates) + self.assertTrue('亜' in candidates) + self.assertTrue('阿' in candidates) + self.assertTrue('あ' in candidates) + self.assertTrue('ア' in candidates) + (committed, committed_index, preedit, cursor_pos, status, candidates) = trans.transliterate_parts( + list('akisha ') + ['Right']) # 'Right' moves the Henkan segment right + self.assertEqual(committed, '') + self.assertEqual(committed_index, 0) + self.assertEqual(len(preedit), 3) + self.assertEqual(cursor_pos, 3) + self.assertEqual(status, '漢') + self.assertTrue('記者' in candidates) + self.assertTrue('帰社' in candidates) + self.assertTrue('汽車' in candidates) + self.assertTrue('貴社' in candidates) + self.assertTrue('きしゃ' in candidates) + self.assertTrue('キシャ' in candidates) @unittest.skipUnless( M17N_DB_VERSION >= (1, 8, 8), 'Skipping because m17n-db is too old') def test_t_math_latex_parts(self) -> None: trans = self.get_transliterator_or_skip('t-math-latex') - self.assertEqual(trans.transliterate_parts(list('\\')), ('', 0, '\\')) - self.assertEqual(trans.transliterate_parts(list('\\i')), ('', 0, '\\i')) - self.assertEqual(trans.transliterate_parts(list('\\in')), ('', 0, '\\∈')) - self.assertEqual(trans.transliterate_parts(list('\\int')), ('', 0, '\\∫')) - self.assertEqual(trans.transliterate_parts(list('\\inter')), ('', 0, '\\inter')) - self.assertEqual(trans.transliterate_parts(list('\\inters')), ('', 0, '∩')) - self.assertEqual(trans.transliterate_parts(list('\\inters 
')), ('∩ ', 8, '')) - self.assertEqual(trans.transliterate_parts(list('\\inters☺')), ('∩☺', 8, '')) - self.assertEqual(trans.transliterate_parts(list('☺\\int')), ('☺', 1, '\\∫')) + self.assertEqual(trans.transliterate_parts(list('\\')), ('', 0, '\\', 1, 'Math: latex', [])) + self.assertEqual(trans.transliterate_parts(list('\\i')), ('', 0, '\\i', 2, 'Math: latex', [])) + self.assertEqual(trans.transliterate_parts(list('\\in')), ('', 0, '\\∈', 2, 'Math: latex', [])) + self.assertEqual(trans.transliterate_parts(list('\\int')), ('', 0, '\\∫', 2, 'Math: latex', [])) + self.assertEqual(trans.transliterate_parts(list('\\inter')), ('', 0, '\\inter', 6, 'Math: latex', [])) + self.assertEqual(trans.transliterate_parts(list('\\inters')), ('', 0, '∩', 1, 'Math: latex', [])) + self.assertEqual(trans.transliterate_parts(list('\\inters ')), ('∩ ', 8, '', 0, 'Math: latex', [])) + self.assertEqual(trans.transliterate_parts(list('\\inters☺')), ('∩☺', 8, '', 0, 'Math: latex', [])) + self.assertEqual(trans.transliterate_parts(list('☺\\int')), ('☺', 1, '\\∫', 2, 'Math: latex', [])) def test_unicode(self) -> None: trans = self.get_transliterator_or_skip('t-unicode')