Skip to content

Commit

Permalink
[pt] Add more tokeniser and speller tests
Browse files Browse the repository at this point in the history
  • Loading branch information
p-goulart committed Oct 16, 2023
1 parent 2eaa2ab commit 9f46b6e
Show file tree
Hide file tree
Showing 3 changed files with 57 additions and 48 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,8 @@ public class PortugueseWordTokenizer extends WordTokenizer {
private static final Pattern NEARBY_HYPHENS_PATTERN = Pattern.compile("([\\p{L}])-([\\p{L}])-([\\p{L}])", Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
private static final String NEARBY_HYPHENS_REPL = "$1" + HYPHEN_SUBST + "$2" + HYPHEN_SUBST + "$3";

// Delimiter set passed to the StringTokenizer in tokenize(String): the characters returned by
// getTokenizingCharacters() plus the rare punctuation marks ⌈, ⌋ and ″.
// NOTE(review): named like a constant but declared as a non-static instance field.
private final String PT_TOKENISING_CHARS = getTokenizingCharacters() + "⌈⌋″";

/** Creates the tokenizer and assigns a new {@link PortugueseTagger} to {@code tagger}. */
public PortugueseWordTokenizer() {
tagger = new PortugueseTagger();
}
Expand Down Expand Up @@ -133,7 +135,7 @@ public List<String> tokenize(String text) {
}

List<String> tokenList = new ArrayList<>();
StringTokenizer st = new StringTokenizer(text, getTokenizingCharacters(), true);
StringTokenizer st = new StringTokenizer(text, PT_TOKENISING_CHARS, true);
while (st.hasMoreElements()) {
String token = st.nextToken();
token = token.replace(DECIMAL_COMMA_SUBST, ',');
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,19 +12,21 @@
import static org.junit.Assert.assertEquals;

public class MorfologikPortugueseSpellerRuleTest {
private final MorfologikPortugueseSpellerRule br_rule = getBRSpellerRule();
private final JLanguageTool br_lt = getBRLanguageTool();
private final MorfologikPortugueseSpellerRule ruleBR = getSpellerRule("BR");
private final JLanguageTool ltBR = getLT("BR");
private final MorfologikPortugueseSpellerRule rulePT = getSpellerRule("PT");
private final JLanguageTool ltPT = getLT("PT");

// Empty on purpose: the instance field initialisers call a helper that is declared to throw
// IOException, and Java only allows that if the constructor declares the checked exception.
public MorfologikPortugueseSpellerRuleTest() throws IOException {
}

private MorfologikPortugueseSpellerRule getBRSpellerRule() throws IOException {
private MorfologikPortugueseSpellerRule getSpellerRule(String countryCode) throws IOException {
return new MorfologikPortugueseSpellerRule(TestTools.getMessages("pt"),
Languages.getLanguageForShortCode("pt-BR"), null, null);
Languages.getLanguageForShortCode("pt-" + countryCode), null, null);
}

private JLanguageTool getBRLanguageTool() {
return new JLanguageTool(Languages.getLanguageForShortCode("pt-BR"));
private JLanguageTool getLT(String countryCode) {
return new JLanguageTool(Languages.getLanguageForShortCode("pt-" + countryCode));
}

private void assertErrorLength(String sentence, int length, JLanguageTool lt,
Expand Down Expand Up @@ -58,106 +60,105 @@ private void assertSingleError(String sentence, JLanguageTool lt,

@Test
public void testBrazilPortugueseSpelling() throws Exception {
JLanguageTool lt = br_lt;
MorfologikPortugueseSpellerRule rule = br_rule;
assertSingleError("ShintaroW.", ltBR, ruleBR, new String[]{});
assertSingleError("SHINTAROW.", ltBR, ruleBR, new String[]{});
assertSingleError("Shintaro Wada", ltBR, ruleBR, new String[]{"Shuntar"});

assertSingleError("ShintaroW.", lt, rule, new String[]{});
assertSingleError("SHINTAROW.", lt, rule, new String[]{});
assertSingleError("Shintaro Wada", lt, rule, new String[]{"Shuntar"});
assertNoErrors("A família.", ltBR, ruleBR);
assertSingleError("A familia.", ltBR, ruleBR, new String[]{"família", "Família", "famílias", "familiar"});

assertNoErrors("A família.", lt, rule);
assertSingleError("A familia.", lt, rule, new String[]{"família", "Família", "famílias", "familiar"});
assertNoErrors("Covid-19, COVID-19, covid-19.", ltBR, ruleBR);

assertNoErrors("Covid-19, COVID-19, covid-19.", lt, rule);
assertSingleError("eu so", ltBR, ruleBR, new String[]{"só"});
assertSingleError("é so", ltBR, ruleBR, new String[]{"só"});

assertSingleError("eu so", lt, rule, new String[]{"só"});
assertSingleError("é so", lt, rule, new String[]{"só"});

assertSingleErrorAndPos("Sr. Kato nos ensina inglês", lt, rule, new String[]{"Fato"}, 4, 8);
assertSingleErrorAndPos("Sr. Kato nos ensina inglês", ltBR, ruleBR, new String[]{"Fato"}, 4, 8);
}

@Test
public void testBrazilPortugueseSpellingDoesNotCheckHashtags() throws Exception {
assertNoErrors("#CantadaBoBem", br_lt, br_rule);
assertNoErrors("#CantadaBoBem", ltBR, ruleBR);
}

@Test
public void testBrazilPortugueseSpellingDoesNotCheckUserMentions() throws Exception {
assertNoErrors("@nomeDoUsuario", br_lt, br_rule);
assertNoErrors("@nomeDoUsuario", ltBR, ruleBR);
}

@Test
public void testBrazilPortugueseSpellingDoesNotCheckCurrencyValues() throws Exception {
assertNoErrors("R$45,00", br_lt, br_rule);
assertNoErrors("US$1.000,00", br_lt, br_rule);
assertNoErrors("€99,99", br_lt, br_rule);
assertNoErrors("US$", br_lt, br_rule);
assertNoErrors("R$45,00", ltBR, ruleBR);
assertNoErrors("US$1.000,00", ltBR, ruleBR);
assertNoErrors("€99,99", ltBR, ruleBR);
assertNoErrors("US$", ltBR, ruleBR);
}

@Test
public void testBrazilPortugueseSpellingDoesNotCheckNumberAbbreviations() throws Exception {
assertNoErrors("Nº666", br_lt, br_rule); // superscript 'o'
assertNoErrors("N°42189", br_lt, br_rule); // degree symbol, we'll do this in XML rules
assertNoErrors("Nº 420", br_lt, br_rule);
assertNoErrors("N.º69", br_lt, br_rule);
assertNoErrors("N.º 80085", br_lt, br_rule);
assertNoErrors("Nº666", ltBR, ruleBR); // superscript 'o'
assertNoErrors("N°42189", ltBR, ruleBR); // degree symbol, we'll do this in XML rules
assertNoErrors("Nº 420", ltBR, ruleBR);
assertNoErrors("N.º69", ltBR, ruleBR);
assertNoErrors("N.º 80085", ltBR, ruleBR);
}

@Test
public void testBrazilPortugueseSpellingSplitsEmoji() throws Exception {
assertSingleError("☺☺☺Só", br_lt, br_rule, new String[]{"☺☺☺ Só"});
assertSingleError("☺☺☺Só", ltBR, ruleBR, new String[]{"☺☺☺ Só"});
}

@Test
public void testBrazilPortugueseSpellingDoesNotCheckXForVezes() throws Exception {
assertNoErrors("10X", br_lt, br_rule);
assertNoErrors("5x", br_lt, br_rule);
assertNoErrors("10X", ltBR, ruleBR);
assertNoErrors("5x", ltBR, ruleBR);
}

@Test
public void testBrazilPortugueseSpellingFailsWithModifierDiacritic() throws Exception {
assertNoErrors("Não", br_lt, br_rule); // proper 'ã' char
assertNoErrors("Não", ltBR, ruleBR); // proper 'ã' char
// this is acceptable because LT converts these compound chars to the proper ones
assertSingleError("Não", br_lt, br_rule, new String[]{"Não"}); // modifier tilde
assertSingleError("Não", ltBR, ruleBR, new String[]{"Não"}); // modifier tilde
}

/**
 * Text wrapped directly in rare punctuation (left ceiling / right floor brackets, double prime)
 * is checked with the pt-BR rule and is expected to yield no errors.
 */
@Test
public void testBrazilPortugueseSpellingWorksWithRarePunctuation() throws Exception {
  final String bracketedWord = "⌈Herói⌋";
  final String doublePrimePhrase = "″Santo Antônio do Manga″";

  assertNoErrors(bracketedWord, ltBR, ruleBR);
  assertNoErrors(doublePrimePhrase, ltBR, ruleBR);
}

@Test
public void testBrazilPortugueseSpellingMorfologikWeirdness() throws Exception {
// 'ja' not corrected to 'já'!
assertSingleError("eu ja fiz isso.", br_lt, br_rule, new String[]{"já"});
assertSingleError("eu ja fiz isso.", ltBR, ruleBR, new String[]{"já"});
// corrected to bizarre 'autoconheci emen'
assertSingleErrorAndPos("- Encontre no autoconheciemen", br_lt, br_rule,
assertSingleErrorAndPos("- Encontre no autoconheciemen", ltBR, ruleBR,
new String[]{"autoconhecimento"}, 14, 29);
}

@Test
public void testEuropeanPortugueseSpelling() throws Exception {
MorfologikPortugueseSpellerRule rule = new MorfologikPortugueseSpellerRule(TestTools.getMessages("pt"),
Languages.getLanguageForShortCode("pt-PT"), null, null);
JLanguageTool lt = new JLanguageTool(Languages.getLanguageForShortCode("pt-PT"));

assertEquals(0, rule.match(lt.getAnalyzedSentence("A família.")).length);
RuleMatch[] matches = rule.match(lt.getAnalyzedSentence("A familia."));
assertEquals(0, rulePT.match(ltPT.getAnalyzedSentence("A família.")).length);
RuleMatch[] matches = rulePT.match(ltPT.getAnalyzedSentence("A familia."));
assertEquals(1, matches.length);
assertEquals("família", matches[0].getSuggestedReplacements().get(0));
assertEquals("famílias", matches[0].getSuggestedReplacements().get(1));
assertEquals("familiar", matches[0].getSuggestedReplacements().get(2));

assertEquals(0, rule.match(lt.getAnalyzedSentence("Covid-19, COVID-19, covid-19.")).length);
assertEquals(0, rulePT.match(ltPT.getAnalyzedSentence("Covid-19, COVID-19, covid-19.")).length);

matches = rule.match(lt.getAnalyzedSentence("eu ja fiz isso."));
matches = rulePT.match(ltPT.getAnalyzedSentence("eu ja fiz isso."));
assertEquals(1, matches.length);
assertEquals("já", matches[0].getSuggestedReplacements().get(0));

matches = rule.match(lt.getAnalyzedSentence("eu so"));
matches = rulePT.match(ltPT.getAnalyzedSentence("eu so"));
assertEquals(1, matches.length);
assertEquals("só", matches[0].getSuggestedReplacements().get(0));

matches = rule.match(lt.getAnalyzedSentence("é so"));
matches = rulePT.match(ltPT.getAnalyzedSentence("é so"));
assertEquals(1, matches.length);
assertEquals("só", matches[0].getSuggestedReplacements().get(0));

matches = rule.match(lt.getAnalyzedSentence("- Encontre no autoconheciemen"));
matches = rulePT.match(ltPT.getAnalyzedSentence("- Encontre no autoconheciemen"));
assertEquals(1, matches.length);
assertEquals("autoconhecimento", matches[0].getSuggestedReplacements().get(0));
assertEquals(14, matches[0].getFromPos());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -132,4 +132,10 @@ public void testDoNotTokeniseModifierDiacritics() {
// the tilde here is a unicode modifier char; normally, the unicode a-tilde (ã) is used
testTokenise("Não", new String[]{"Não"});
}

/**
 * The rare punctuation marks ⌈, ⌋ and ″ are expected to come out as tokens of their own,
 * alongside the words and spaces they enclose.
 */
@Test
public void testTokeniseRarePunctuation() {
  final String[] bracketedTokens = {"⌈", "Herói", "⌋"};
  testTokenise("⌈Herói⌋", bracketedTokens);

  final String[] doublePrimeTokens = {"″", "Santo", " ", "Antônio", " ", "do", " ", "Manga", "″"};
  testTokenise("″Santo Antônio do Manga″", doublePrimeTokens);
}
}

0 comments on commit 9f46b6e

Please sign in to comment.