Skip to content

Commit

Permalink
[pt] Add tests for freaky modifier diacritics
Browse files Browse the repository at this point in the history
  • Loading branch information
p-goulart committed Oct 16, 2023
1 parent e89fc9f commit 2eaa2ab
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -3886,9 +3886,9 @@
<disambig action="ignore_spelling"/>
</rule>

<rule id="CURRENCT_SPELLING_IGNORE">
<rule id="CURRENCY_SPELLING_IGNORE">
<pattern>
<token regexp="yes">\p{Lu}*&currency_symbols;\d+(\.\d+)?(,\d{2})?</token>
<token regexp="yes">\p{Lu}*&currency_symbols;(\d+(\.\d+)?(,\d{2})?)?</token>
</pattern>
<disambig action="ignore_spelling"/>
</rule>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,13 +70,9 @@ public void testBrazilPortugueseSpelling() throws Exception {

assertNoErrors("Covid-19, COVID-19, covid-19.", lt, rule);

// 'ja' not corrected to 'já'!
assertSingleError("eu ja fiz isso.", lt, rule, new String[]{"já"});
assertSingleError("eu so", lt, rule, new String[]{"só"});
assertSingleError("é so", lt, rule, new String[]{"só"});

// corrected to bizarre 'autoconheci emen'
assertSingleErrorAndPos("- Encontre no autoconheciemen", lt, rule, new String[]{"autoconhecimento"}, 14, 29);
assertSingleErrorAndPos("Sr. Kato nos ensina inglês", lt, rule, new String[]{"Fato"}, 4, 8);
}

Expand All @@ -95,6 +91,7 @@ public void testBrazilPortugueseSpellingDoesNotCheckCurrencyValues() throws Exce
assertNoErrors("R$45,00", br_lt, br_rule);
assertNoErrors("US$1.000,00", br_lt, br_rule);
assertNoErrors("€99,99", br_lt, br_rule);
assertNoErrors("US$", br_lt, br_rule);
}

@Test
Expand All @@ -117,6 +114,22 @@ public void testBrazilPortugueseSpellingDoesNotCheckXForVezes() throws Exception
assertNoErrors("5x", br_lt, br_rule);
}

/**
 * Checks how the pt-BR speller treats the two Unicode spellings of "Não".
 *
 * <p>The precomposed and decomposed forms render identically, so the code points are written
 * as Unicode escapes. That way the difference between the two inputs is visible in the source
 * and cannot be silently erased by an editor or tool that normalises the file.
 *
 * @throws Exception if the underlying rule check fails
 */
@Test
public void testBrazilPortugueseSpellingFailsWithModifierDiacritic() throws Exception {
  // Precomposed form: 'ã' as the single code point U+00E3.
  assertNoErrors("N\u00e3o", br_lt, br_rule);
  // Decomposed form: plain 'a' followed by U+0303 COMBINING TILDE. The speller flags it and
  // suggests the precomposed word; per the original note this is acceptable because LT
  // converts these compound chars to the proper ones.
  assertSingleError("Na\u0303o", br_lt, br_rule, new String[]{"N\u00e3o"});
}

/**
 * Collects pt-BR spelling cases that the original author labelled as Morfologik "weirdness".
 *
 * @throws Exception if the underlying rule check fails
 */
@Test
public void testBrazilPortugueseSpellingMorfologikWeirdness() throws Exception {
  // Original note: 'ja' not corrected to 'já'! The expectation passed here is the accented form.
  String[] accentedSuggestion = {"já"};
  assertSingleError("eu ja fiz isso.", br_lt, br_rule, accentedSuggestion);

  // Original note: corrected to bizarre 'autoconheci emen'. The expectation passed here is the
  // full word, together with the offsets 14 and 29 (presumably the error span - helper not
  // visible in this view).
  String[] fullWordSuggestion = {"autoconhecimento"};
  int expectedFrom = 14;
  int expectedTo = 29;
  assertSingleErrorAndPos("- Encontre no autoconheciemen", br_lt, br_rule,
    fullWordSuggestion, expectedFrom, expectedTo);
}

@Test
public void testEuropeanPortugueseSpelling() throws Exception {
MorfologikPortugueseSpellerRule rule = new MorfologikPortugueseSpellerRule(TestTools.getMessages("pt"),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -126,4 +126,10 @@ public void testTokeniseNumberAbbreviation() {
public void testDoNotTokeniseEmoji() {
testTokenise("☺☺☺Só", new String[]{"☺☺☺Só"});
}

/**
 * Checks that a word written with a combining diacritic is kept as a single token.
 *
 * <p>The input is written with a Unicode escape because the decomposed spelling renders exactly
 * like the usual precomposed a-tilde; the escape keeps the intended code points visible and
 * safe from tools that normalise the source file.
 */
@Test
public void testDoNotTokeniseModifierDiacritics() {
  // 'a' followed by U+0303 COMBINING TILDE; normally the precomposed a-tilde (U+00E3) is used.
  String decomposedWord = "Na\u0303o";
  testTokenise(decomposedWord, new String[]{decomposedWord});
}
}

0 comments on commit 2eaa2ab

Please sign in to comment.