From 9f46b6e59ab6445caaf2f499e989876708fac6bf Mon Sep 17 00:00:00 2001 From: p-goulart Date: Thu, 12 Oct 2023 14:04:44 +0200 Subject: [PATCH] [pt] Add more tokeniser and speller tests --- .../pt/PortugueseWordTokenizer.java | 4 +- .../MorfologikPortugueseSpellerRuleTest.java | 95 ++++++++++--------- .../pt/PortugueseWordTokenizerTest.java | 6 ++ 3 files changed, 57 insertions(+), 48 deletions(-) diff --git a/languagetool-language-modules/pt/src/main/java/org/languagetool/tokenizers/pt/PortugueseWordTokenizer.java b/languagetool-language-modules/pt/src/main/java/org/languagetool/tokenizers/pt/PortugueseWordTokenizer.java index e1777ade04f9..f9758023faa9 100644 --- a/languagetool-language-modules/pt/src/main/java/org/languagetool/tokenizers/pt/PortugueseWordTokenizer.java +++ b/languagetool-language-modules/pt/src/main/java/org/languagetool/tokenizers/pt/PortugueseWordTokenizer.java @@ -89,6 +89,8 @@ public class PortugueseWordTokenizer extends WordTokenizer { private static final Pattern NEARBY_HYPHENS_PATTERN = Pattern.compile("([\\p{L}])-([\\p{L}])-([\\p{L}])", Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE); private static final String NEARBY_HYPHENS_REPL = "$1" + HYPHEN_SUBST + "$2" + HYPHEN_SUBST + "$3"; + private final String PT_TOKENISING_CHARS = getTokenizingCharacters() + "⌈⌋″"; + public PortugueseWordTokenizer() { tagger = new PortugueseTagger(); } @@ -133,7 +135,7 @@ public List tokenize(String text) { } List tokenList = new ArrayList<>(); - StringTokenizer st = new StringTokenizer(text, getTokenizingCharacters(), true); + StringTokenizer st = new StringTokenizer(text, PT_TOKENISING_CHARS, true); while (st.hasMoreElements()) { String token = st.nextToken(); token = token.replace(DECIMAL_COMMA_SUBST, ','); diff --git a/languagetool-language-modules/pt/src/test/java/org/languagetool/rules/pt/MorfologikPortugueseSpellerRuleTest.java b/languagetool-language-modules/pt/src/test/java/org/languagetool/rules/pt/MorfologikPortugueseSpellerRuleTest.java index 37f3e6b926a6..ca0bda438622 100644 --- a/languagetool-language-modules/pt/src/test/java/org/languagetool/rules/pt/MorfologikPortugueseSpellerRuleTest.java +++ b/languagetool-language-modules/pt/src/test/java/org/languagetool/rules/pt/MorfologikPortugueseSpellerRuleTest.java @@ -12,19 +12,21 @@ import static org.junit.Assert.assertEquals; public class MorfologikPortugueseSpellerRuleTest { - private final MorfologikPortugueseSpellerRule br_rule = getBRSpellerRule(); - private final JLanguageTool br_lt = getBRLanguageTool(); + private final MorfologikPortugueseSpellerRule ruleBR = getSpellerRule("BR"); + private final JLanguageTool ltBR = getLT("BR"); + private final MorfologikPortugueseSpellerRule rulePT = getSpellerRule("PT"); + private final JLanguageTool ltPT = getLT("PT"); public MorfologikPortugueseSpellerRuleTest() throws IOException { } - private MorfologikPortugueseSpellerRule getBRSpellerRule() throws IOException { + private MorfologikPortugueseSpellerRule getSpellerRule(String countryCode) throws IOException { return new MorfologikPortugueseSpellerRule(TestTools.getMessages("pt"), - Languages.getLanguageForShortCode("pt-BR"), null, null); + Languages.getLanguageForShortCode("pt-" + countryCode), null, null); } - private JLanguageTool getBRLanguageTool() { - return new JLanguageTool(Languages.getLanguageForShortCode("pt-BR")); + private JLanguageTool getLT(String countryCode) { + return new JLanguageTool(Languages.getLanguageForShortCode("pt-" + countryCode)); } private void assertErrorLength(String sentence, int length, JLanguageTool lt, @@ -58,106 +60,105 @@ private void assertSingleError(String sentence, JLanguageTool lt, @Test public void testBrazilPortugueseSpelling() throws Exception { - JLanguageTool lt = br_lt; - MorfologikPortugueseSpellerRule rule = br_rule; + assertSingleError("ShintaroW.", ltBR, ruleBR, new String[]{}); + assertSingleError("SHINTAROW.", ltBR, ruleBR, new String[]{}); + assertSingleError("Shintaro Wada", ltBR, ruleBR, new String[]{"Shuntar"}); - assertSingleError("ShintaroW.", lt, rule, new String[]{}); - assertSingleError("SHINTAROW.", lt, rule, new String[]{}); - assertSingleError("Shintaro Wada", lt, rule, new String[]{"Shuntar"}); + assertNoErrors("A família.", ltBR, ruleBR); + assertSingleError("A familia.", ltBR, ruleBR, new String[]{"família", "Família", "famílias", "familiar"}); - assertNoErrors("A família.", lt, rule); - assertSingleError("A familia.", lt, rule, new String[]{"família", "Família", "famílias", "familiar"}); + assertNoErrors("Covid-19, COVID-19, covid-19.", ltBR, ruleBR); - assertNoErrors("Covid-19, COVID-19, covid-19.", lt, rule); + assertSingleError("eu so", ltBR, ruleBR, new String[]{"só"}); + assertSingleError("é so", ltBR, ruleBR, new String[]{"só"}); - assertSingleError("eu so", lt, rule, new String[]{"só"}); - assertSingleError("é so", lt, rule, new String[]{"só"}); - - assertSingleErrorAndPos("Sr. Kato nos ensina inglês", lt, rule, new String[]{"Fato"}, 4, 8); + assertSingleErrorAndPos("Sr. Kato nos ensina inglês", ltBR, ruleBR, new String[]{"Fato"}, 4, 8); } @Test public void testBrazilPortugueseSpellingDoesNotCheckHashtags() throws Exception { - assertNoErrors("#CantadaBoBem", br_lt, br_rule); + assertNoErrors("#CantadaBoBem", ltBR, ruleBR); } @Test public void testBrazilPortugueseSpellingDoesNotCheckUserMentions() throws Exception { - assertNoErrors("@nomeDoUsuario", br_lt, br_rule); + assertNoErrors("@nomeDoUsuario", ltBR, ruleBR); } @Test public void testBrazilPortugueseSpellingDoesNotCheckCurrencyValues() throws Exception { - assertNoErrors("R$45,00", br_lt, br_rule); - assertNoErrors("US$1.000,00", br_lt, br_rule); - assertNoErrors("€99,99", br_lt, br_rule); - assertNoErrors("US$", br_lt, br_rule); + assertNoErrors("R$45,00", ltBR, ruleBR); + assertNoErrors("US$1.000,00", ltBR, ruleBR); + assertNoErrors("€99,99", ltBR, ruleBR); + assertNoErrors("US$", ltBR, ruleBR); } @Test public void testBrazilPortugueseSpellingDoesNotCheckNumberAbbreviations() throws Exception { - assertNoErrors("Nº666", br_lt, br_rule); // superscript 'o' - assertNoErrors("N°42189", br_lt, br_rule); // degree symbol, we'll do this in XML rules - assertNoErrors("Nº 420", br_lt, br_rule); - assertNoErrors("N.º69", br_lt, br_rule); - assertNoErrors("N.º 80085", br_lt, br_rule); + assertNoErrors("Nº666", ltBR, ruleBR); // superscript 'o' + assertNoErrors("N°42189", ltBR, ruleBR); // degree symbol, we'll do this in XML rules + assertNoErrors("Nº 420", ltBR, ruleBR); + assertNoErrors("N.º69", ltBR, ruleBR); + assertNoErrors("N.º 80085", ltBR, ruleBR); } @Test public void testBrazilPortugueseSpellingSplitsEmoji() throws Exception { - assertSingleError("☺☺☺Só", br_lt, br_rule, new String[]{"☺☺☺ Só"}); + assertSingleError("☺☺☺Só", ltBR, ruleBR, new String[]{"☺☺☺ Só"}); } @Test public void testBrazilPortugueseSpellingDoesNotCheckXForVezes() throws Exception { - assertNoErrors("10X", br_lt, br_rule); - assertNoErrors("5x", br_lt, br_rule); + assertNoErrors("10X", ltBR, ruleBR); + assertNoErrors("5x", ltBR, ruleBR); } @Test public void testBrazilPortugueseSpellingFailsWithModifierDiacritic() throws Exception { - assertNoErrors("Não", br_lt, br_rule); // proper 'ã' char + assertNoErrors("Não", ltBR, ruleBR); // proper 'ã' char // this is acceptable because LT converts these compound chars to the proper ones - assertSingleError("Não", br_lt, br_rule, new String[]{"Não"}); // modifier tilde + assertSingleError("Não", ltBR, ruleBR, new String[]{"Não"}); // modifier tilde + } + + @Test + public void testBrazilPortugueseSpellingWorksWithRarePunctuation() throws Exception { + assertNoErrors("⌈Herói⌋", ltBR, ruleBR); + assertNoErrors("″Santo Antônio do Manga″", ltBR, ruleBR); } @Test public void testBrazilPortugueseSpellingMorfologikWeirdness() throws Exception { // 'ja' not corrected to 'já'! - assertSingleError("eu ja fiz isso.", br_lt, br_rule, new String[]{"já"}); + assertSingleError("eu ja fiz isso.", ltBR, ruleBR, new String[]{"já"}); // corrected to bizarre 'autoconheci emen' - assertSingleErrorAndPos("- Encontre no autoconheciemen", br_lt, br_rule, + assertSingleErrorAndPos("- Encontre no autoconheciemen", ltBR, ruleBR, new String[]{"autoconhecimento"}, 14, 29); } @Test public void testEuropeanPortugueseSpelling() throws Exception { - MorfologikPortugueseSpellerRule rule = new MorfologikPortugueseSpellerRule(TestTools.getMessages("pt"), - Languages.getLanguageForShortCode("pt-PT"), null, null); - JLanguageTool lt = new JLanguageTool(Languages.getLanguageForShortCode("pt-PT")); - - assertEquals(0, rule.match(lt.getAnalyzedSentence("A família.")).length); - RuleMatch[] matches = rule.match(lt.getAnalyzedSentence("A familia.")); + assertEquals(0, rulePT.match(ltPT.getAnalyzedSentence("A família.")).length); + RuleMatch[] matches = rulePT.match(ltPT.getAnalyzedSentence("A familia.")); assertEquals(1, matches.length); assertEquals("família", matches[0].getSuggestedReplacements().get(0)); assertEquals("famílias", matches[0].getSuggestedReplacements().get(1)); assertEquals("familiar", matches[0].getSuggestedReplacements().get(2)); - assertEquals(0, rule.match(lt.getAnalyzedSentence("Covid-19, COVID-19, covid-19.")).length); + assertEquals(0, rulePT.match(ltPT.getAnalyzedSentence("Covid-19, COVID-19, covid-19.")).length); - matches = rule.match(lt.getAnalyzedSentence("eu ja fiz isso.")); + matches = rulePT.match(ltPT.getAnalyzedSentence("eu ja fiz isso.")); assertEquals(1, matches.length); assertEquals("já", matches[0].getSuggestedReplacements().get(0)); - matches = rule.match(lt.getAnalyzedSentence("eu so")); + matches = rulePT.match(ltPT.getAnalyzedSentence("eu so")); assertEquals(1, matches.length); assertEquals("só", matches[0].getSuggestedReplacements().get(0)); - matches = rule.match(lt.getAnalyzedSentence("é so")); + matches = rulePT.match(ltPT.getAnalyzedSentence("é so")); assertEquals(1, matches.length); assertEquals("só", matches[0].getSuggestedReplacements().get(0)); - matches = rule.match(lt.getAnalyzedSentence("- Encontre no autoconheciemen")); + matches = rulePT.match(ltPT.getAnalyzedSentence("- Encontre no autoconheciemen")); assertEquals(1, matches.length); assertEquals("autoconhecimento", matches[0].getSuggestedReplacements().get(0)); assertEquals(14, matches[0].getFromPos()); diff --git a/languagetool-language-modules/pt/src/test/java/org/languagetool/tokenizers/pt/PortugueseWordTokenizerTest.java b/languagetool-language-modules/pt/src/test/java/org/languagetool/tokenizers/pt/PortugueseWordTokenizerTest.java index 87e02ff36e13..179b436b344c 100644 --- a/languagetool-language-modules/pt/src/test/java/org/languagetool/tokenizers/pt/PortugueseWordTokenizerTest.java +++ b/languagetool-language-modules/pt/src/test/java/org/languagetool/tokenizers/pt/PortugueseWordTokenizerTest.java @@ -132,4 +132,10 @@ public void testDoNotTokeniseModifierDiacritics() { // the tilde here is a unicode modifier char; normally, the unicode a-tilde (ã) is used testTokenise("Não", new String[]{"Não"}); } + + @Test + public void testTokeniseRarePunctuation() { + testTokenise("⌈Herói⌋", new String[]{"⌈", "Herói", "⌋"}); + testTokenise("″Santo Antônio do Manga″", new String[]{"″", "Santo", " ", "Antônio", " ", "do", " ", "Manga", "″"}); + } }