[pt] Migrate spelling dictionaries to Morfologik format (#9424)
* new rule ID is `MORFOLOGIK_RULE_PT`, with sub-IDs for each variant
* adjust rule priorities
* add spelling tests
* use the same chars as everyone else for the PT tokeniser, and add tests
* tokenise currency expressions and ignore spelling of currencies
* stop spellchecking Nº666 abbreviations
* make AO and MZ variants use the PT-45 dict
* add abbreviation and do_not_suggest logic to Morfologik
* add dialect alternations in the Morfologik speller
* disable the trema rule in the grammar and update the diaeresis Morfologik speller message
* update pt/br replacement txts
* add grammar rules for elided verb forms (and speller tests)
* remove Hunspell data

---------

Co-authored-by: p-goulart <[email protected]>
jaumeortola and p-goulart authored Dec 6, 2023
1 parent 8685e19 commit 612dc21
Showing 45 changed files with 44,592 additions and 505,626 deletions.
@@ -23,10 +23,12 @@
import java.util.Collections;
import java.util.List;
import java.util.StringTokenizer;
import java.util.regex.MatchResult;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang3.StringUtils;
import org.languagetool.rules.ml.MLServerProto;
import org.languagetool.tools.StringTools;

/**
@@ -44,6 +46,20 @@ public class WordTokenizer implements Tokenizer {
private static final Pattern DOMAIN_CHARS = Pattern.compile("[a-zA-Z0-9][a-zA-Z0-9-]+");
private static final Pattern NO_PROTOCOL_URL = Pattern.compile("([a-zA-Z0-9][a-zA-Z0-9-]+\\.)?([a-zA-Z0-9][a-zA-Z0-9-]+)\\.([a-zA-Z0-9][a-zA-Z0-9-]+)/.*");
private static final Pattern E_MAIL = Pattern.compile("(?<!:)@?\\b[a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@((\\[[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\])|(([a-zA-Z\\-0-9]+\\.)+[a-zA-Z]{2,}))\\b");
// For now, to prevent very aggressive tokenisation, we're limiting this to symbols that are coterminous with or
// end in a *special* currency glyph, like "$" or "US$", respectively.
//
// Not contemplated:
// - currency symbols made up only of regular alphabetic glyphs, e.g. "Bs", "zł";
// - official, ASCII-only, three-letter currency symbols, e.g. "USD", "EUR";
// - glyphs from right-to-left writing scripts.
private static final Pattern CURRENCY_SYMBOLS = Pattern.compile("[A-Z]*[฿₿₵¢₡$₫֏€ƒ₲₴₭₾₺₼₦₱£៛₽₹₪৳₸₮₩¥¤]");
// Really loose, but will only be used in conjunction with CURRENCY_SYMBOLS above, and we actually want to catch
// potentially incorrect number formats, so that we tokenise them properly and are able to correct them more easily.
private static final Pattern CURRENCY_VALUE = Pattern.compile("\\d+(?:[.,]\\d+)*");
private static final Pattern CURRENCY_EXPRESSION = Pattern.compile(String.format("(?:(%s)(%s)|(%s)(%s))",
CURRENCY_SYMBOLS, CURRENCY_VALUE, CURRENCY_VALUE, CURRENCY_SYMBOLS));


/*
* Possibly problematic characters for tokenization:
@@ -244,4 +260,25 @@ private boolean urlEndsAt(int i, List<String> l, String urlQuote) {
return false;
}

public boolean isCurrencyExpression(String token) {
return CURRENCY_EXPRESSION.matcher(token).matches();
}

public List<String> splitCurrencyExpression(String token) {
List<String> newList = new ArrayList<>();
Matcher matcher = CURRENCY_EXPRESSION.matcher(token);
while (matcher.find()) {
if (matcher.group(1) != null && matcher.group(2) != null) {
newList.add(matcher.group(1));
newList.add(matcher.group(2));
} else if (matcher.group(3) != null && matcher.group(4) != null) {
newList.add(matcher.group(3));
newList.add(matcher.group(4));
}
}
if (newList.size() == 0) {
newList.add(token);
}
return newList;
}
}
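The two helpers added above are public instance methods on WordTokenizer, so callers can recognise and pre-split currency tokens before spell checking. Below is a minimal usage sketch, not part of the commit: it assumes the class keeps its public no-arg constructor in org.languagetool.tokenizers, and the demo class name is made up; expected outputs mirror the new unit tests.

import java.util.List;
import org.languagetool.tokenizers.WordTokenizer;

public class CurrencyTokenizationDemo {
  public static void main(String[] args) {
    WordTokenizer tokenizer = new WordTokenizer();

    // Symbol-first and value-first expressions are both recognised...
    System.out.println(tokenizer.isCurrencyExpression("US$45"));     // expected: true
    System.out.println(tokenizer.isCurrencyExpression("5,000€"));    // expected: true

    // ...and split into a symbol part and a value part.
    List<String> parts = tokenizer.splitCurrencyExpression("R$1.999.99");
    System.out.println(parts);                                       // expected: [R$, 1.999.99]

    // Tokens that don't match the pattern come back unchanged, as a single-element list.
    System.out.println(tokenizer.splitCurrencyExpression("foobar")); // expected: [foobar]
  }
}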
@@ -126,6 +126,29 @@ public void testIncompleteUrlTokenize() {
assertEquals("foo| |http://|?| |bar", tokenize("foo http://? bar"));
}

@Test
public void testCheckCurrencyExpression() {
assertTrue(wordTokenizer.isCurrencyExpression("US$45"));
assertTrue(wordTokenizer.isCurrencyExpression("5,000€"));
assertTrue(wordTokenizer.isCurrencyExpression("£1.50"));
assertTrue(wordTokenizer.isCurrencyExpression("R$1.999.99"));
assertFalse(wordTokenizer.isCurrencyExpression("US$"));
assertFalse(wordTokenizer.isCurrencyExpression("X€"));
assertFalse(wordTokenizer.isCurrencyExpression(".50£"));
assertFalse(wordTokenizer.isCurrencyExpression("5R$5"));
}

@Test
public void testSplitCurrencyExpression() {
assertArrayEquals(wordTokenizer.splitCurrencyExpression("US$45").toArray(), new String[]{"US$", "45"});
assertArrayEquals(wordTokenizer.splitCurrencyExpression("5,000€").toArray(), new String[]{"5,000", "€"});
assertArrayEquals(wordTokenizer.splitCurrencyExpression("£1.50").toArray(), new String[]{"£", "1.50"});
assertArrayEquals(wordTokenizer.splitCurrencyExpression("R$1.999.99").toArray(), new String[]{"R$", "1.999.99"});
// not currency expr
assertArrayEquals(wordTokenizer.splitCurrencyExpression("US$X").toArray(), new String[]{"US$X"});
assertArrayEquals(wordTokenizer.splitCurrencyExpression("foobar").toArray(), new String[]{"foobar"});
}

private String tokenize(String text) {
List<String> tokens = wordTokenizer.tokenize(text);
return String.join("|", tokens);
12 changes: 3 additions & 9 deletions languagetool-language-modules/pt/pom.xml
@@ -25,25 +25,19 @@

<developers>
<developer>
<name>Tiago F. Santos</name>
<roles>
<role>Maintainer</role>
</roles>
</developer>
<developer>
<name>Marco A.G. Pinto</name>
<name>Daniel Naber</name>
<roles>
<role>Maintainer</role>
</roles>
</developer>
<developer>
<name>Daniel Naber</name>
<name>Pedro Goulart</name>
<roles>
<role>Maintainer</role>
</roles>
</developer>
<developer>
<name>Marcin Miłkowski</name>
<name>Marco A.G. Pinto</name>
<roles>
<role>Maintainer</role>
</roles>
@@ -57,10 +57,4 @@ public String[] getCountries() {
return new String[]{"BR"};
}

@Nullable
@Override
protected SpellingCheckRule createDefaultSpellingRule(ResourceBundle messages) {
return new HunspellRule(messages, this, null, null);
}

}
@@ -66,12 +66,6 @@ protected int getPriorityForId(String id) {
return super.getPriorityForId(id);
}

@Nullable
@Override
protected SpellingCheckRule createDefaultSpellingRule(ResourceBundle messages) throws IOException {
return new HunspellRule(messages, this, null, null);
}

@Override
public String getOpeningDoubleQuote() {
return "«";
@@ -24,7 +24,7 @@
import org.languagetool.languagemodel.LanguageModel;
import org.languagetool.rules.*;
import org.languagetool.rules.pt.*;
import org.languagetool.rules.spelling.hunspell.HunspellRule;
import org.languagetool.rules.spelling.SpellingCheckRule;
import org.languagetool.synthesis.Synthesizer;
import org.languagetool.synthesis.pt.PortugueseSynthesizer;
import org.languagetool.tagging.Tagger;
@@ -102,6 +102,12 @@ public SentenceTokenizer createDefaultSentenceTokenizer() {
return new SRXSentenceTokenizer(this);
}

@Nullable
@Override
protected SpellingCheckRule createDefaultSpellingRule(ResourceBundle messages) throws IOException {
return new MorfologikPortugueseSpellerRule(messages, this, null, null);
}

@Override
public List<Rule> getRelevantRules(ResourceBundle messages, UserConfig userConfig, Language motherTongue, List<Language> altLanguages) throws IOException {
return Arrays.asList(
@@ -111,7 +117,7 @@ public List<Rule> getRelevantRules(ResourceBundle messages, UserConfig userConfig, Language motherTongue, List<Language> altLanguages) throws IOException {
new GenericUnpairedBracketsRule(messages,
Arrays.asList("[", "(", "{", "\"", "“" /*, "«", "'", "‘" */),
Arrays.asList("]", ")", "}", "\"", "”" /*, "»", "'", "’" */)),
new HunspellRule(messages, this, userConfig, altLanguages),
new MorfologikPortugueseSpellerRule(messages, this, userConfig, altLanguages),
new LongSentenceRule(messages, userConfig, 50),
new LongParagraphRule(messages, this, userConfig),
new UppercaseSentenceStartRule(messages, this,
@@ -209,71 +215,78 @@ public boolean isAdvancedTypographyEnabled() {

@Override
protected int getPriorityForId(String id) {
if (id.startsWith("MORFOLOGIK_RULE")) {
return -50;
}

switch (id) {
case "FRAGMENT_TWO_ARTICLES": return 50;
case "DEGREE_MINUTES_SECONDS": return 30;
case "INTERJECTIONS_PUNTUATION": return 20;
case "CONFUSION_POR": return 10;
case "PARONYM_POLITICA_523": return 10;
case "PARONYM_PRONUNCIA_262": return 10;
case "PARONYM_CRITICA_397": return 10;
case "PARONYM_INICIO_169": return 10;
case "LP_PARONYMS": return 10;
case "PARONYM_MUSICO_499_bis": return 10;
case "NA_NÃO": return 10;
case "VERB_COMMA_CONJUNCTION": return 10; // greater than PORTUGUESE_WORD_REPEAT_RULE
case "HOMOPHONE_AS_CARD": return 5;
case "TODOS_FOLLOWED_BY_NOUN_PLURAL": return 3;
case "TODOS_FOLLOWED_BY_NOUN_SINGULAR": return 2;
case "AUSENCIA_VIRGULA": return 1;
case "EMAIL": return 1;
case "UNPAIRED_BRACKETS": return -5;
case "PROFANITY": return -6;
case "PT_BARBARISMS_REPLACE": return -10;
case "BARBARISMS_PT_PT_V2": return -10;
case "PT_PT_SIMPLE_REPLACE": return -11;
case "PT_REDUNDANCY_REPLACE": return -12;
case "PT_WORDINESS_REPLACE": return -13;
case "PT_CLICHE_REPLACE": return -17;
case "INTERNET_ABBREVIATIONS": return -24;
case "CHILDISH_LANGUAGE": return -25;
case "ARCHAISMS": return -26;
case "INFORMALITIES": return -27;
case "PUFFERY": return -30;
case "BIASED_OPINION_WORDS": return -31;
case "WEAK_WORDS": return -32;
case "PT_AGREEMENT_REPLACE": return -35;
case "CONTA_TO": return -44;
case "PT_DIACRITICS_REPLACE": return -45; // prefer over spell checker
case "DIACRITICS": return -45;
case "PT_COMPOUNDS_POST_REFORM": return -45;
case "AUX_VERBO": return -45;
case "HUNSPELL_RULE": return -50;
case "CRASE_CONFUSION": return -54;
case "NAO_MILITARES": return -54;
case "NA_QUELE": return -54;
case "NOTAS_FICAIS": return -54;
case "GENERAL_VERB_AGREEMENT_ERRORS": return -55;
case "GENERAL_NUMBER_AGREEMENT_ERRORS": return -56;
case "GENERAL_GENDER_NUMBER_AGREEMENT_ERRORS": return -56;
case "FINAL_STOPS": return -75;
case "EU_NÓS_REMOVAL": return -90;
case "COLOCAÇÃO_ADVÉRBIO": return -90;
case "FAZER_USO_DE-USAR-RECORRER": return -90;
case "T-V_DISTINCTION": return -100;
case "T-V_DISTINCTION_ALL": return -101;
case "REPEATED_WORDS": return -210;
case "REPEATED_WORDS_3X": return -211;
case "PT_WIKIPEDIA_COMMON_ERRORS":return -500;
case "FILLER_WORDS_PT": return -990;
case LongSentenceRule.RULE_ID: return -997;
case LongParagraphRule.RULE_ID: return -998;
case "READABILITY_RULE_SIMPLE_PT": return -1100;
case "READABILITY_RULE_DIFFICULT_PT": return -1101;
case "CACOPHONY": return -1500;
case "UNKNOWN_WORD": return -2000;
case "NO_VERB": return -2100;
case "FRAGMENT_TWO_ARTICLES": return 50;
case "DEGREE_MINUTES_SECONDS": return 30;
case "INTERJECTIONS_PUNTUATION": return 20;
case "CONFUSION_POR": return 10;
case "PARONYM_POLITICA_523": return 10;
case "PARONYM_PRONUNCIA_262": return 10;
case "PARONYM_CRITICA_397": return 10;
case "PARONYM_INICIO_169": return 10;
case "LP_PARONYMS": return 10;
case "PARONYM_MUSICO_499_bis": return 10;
case "NA_NÃO": return 10;
case "VERB_COMMA_CONJUNCTION": return 10; // greater than PORTUGUESE_WORD_REPEAT_RULE
case "HOMOPHONE_AS_CARD": return 5;
case "TODOS_FOLLOWED_BY_NOUN_PLURAL": return 3;
case "TODOS_FOLLOWED_BY_NOUN_SINGULAR": return 2;
case "AUSENCIA_VIRGULA": return 1;
case "EMAIL": return 1;
case "UNPAIRED_BRACKETS": return -5;
case "PROFANITY": return -6;
case "PT_BARBARISMS_REPLACE": return -10;
case "BARBARISMS_PT_PT_V2": return -10;
case "PT_PT_SIMPLE_REPLACE": return -11; // for pt-PT, not lower than speller, not sure why
case "PT_REDUNDANCY_REPLACE": return -12;
case "PT_WORDINESS_REPLACE": return -13;
case "PT_CLICHE_REPLACE": return -17;
case "INTERNET_ABBREVIATIONS": return -24;
case "CHILDISH_LANGUAGE": return -25;
case "ARCHAISMS": return -26;
case "INFORMALITIES": return -27;
case "PUFFERY": return -30;
case "BIASED_OPINION_WORDS": return -31;
case "WEAK_WORDS": return -32;
case "PT_AGREEMENT_REPLACE": return -35;
case "CONTA_TO": return -44;
case "PT_DIACRITICS_REPLACE": return -45; // prefer over spell checker
case "DIACRITICS": return -45;
case "PT_COMPOUNDS_POST_REFORM": return -45;
case "AUX_VERBO": return -45; // HIGHER THAN SPELLER
// MORFOLOGIK SPELLER FITS HERE AT -50 --------------------- // SPELLER (-50)
case "PRETERITO_PERFEITO": return -51; // LOWER THAN SPELLER
case "PT_BR_SIMPLE_REPLACE": return -51;
case "CRASE_CONFUSION": return -54;
case "NAO_MILITARES": return -54;
case "NA_QUELE": return -54;
case "NOTAS_FICAIS": return -54;
case "GENERAL_VERB_AGREEMENT_ERRORS": return -55;
case "GENERAL_NUMBER_AGREEMENT_ERRORS": return -56;
case "GENERAL_GENDER_NUMBER_AGREEMENT_ERRORS": return -56;
case "FINAL_STOPS": return -75;
case "EU_NÓS_REMOVAL": return -90;
case "COLOCAÇÃO_ADVÉRBIO": return -90;
case "FAZER_USO_DE-USAR-RECORRER": return -90;
case "T-V_DISTINCTION": return -100;
case "T-V_DISTINCTION_ALL": return -101;
case "REPEATED_WORDS": return -210;
case "REPEATED_WORDS_3X": return -211;
case "PT_WIKIPEDIA_COMMON_ERRORS": return -500;
case "FILLER_WORDS_PT": return -990;
case LongSentenceRule.RULE_ID: return -997;
case LongParagraphRule.RULE_ID: return -998;
case "READABILITY_RULE_SIMPLE_PT": return -1100;
case "READABILITY_RULE_DIFFICULT_PT": return -1101;
case "CACOPHONY": return -1500;
case "UNKNOWN_WORD": return -2000;
case "NO_VERB": return -2100;
}

if (id.startsWith("AI_PT_HYDRA_LEO")) { // prefer more specific rules (also speller)
if (id.startsWith("AI_PT_HYDRA_LEO_MISSING_COMMA")) {
return -51; // prefer comma style rules.
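With createDefaultSpellingRule now returning MorfologikPortugueseSpellerRule and the MORFOLOGIK_RULE prefix pinned at priority -50, the Morfologik speller becomes the default for every Portuguese variant. A minimal check sketch, not from the commit: the sample sentence and misspelling are invented, and the exact rule sub-ID reported at runtime is variant-specific (the commit only guarantees it starts with MORFOLOGIK_RULE_PT).

import java.util.List;
import org.languagetool.JLanguageTool;
import org.languagetool.Languages;
import org.languagetool.rules.RuleMatch;

public class PortugueseSpellerDemo {
  public static void main(String[] args) throws Exception {
    // pt-BR now resolves to a language whose default speller is the Morfologik rule.
    JLanguageTool lt = new JLanguageTool(Languages.getLanguageForShortCode("pt-BR"));

    // "palavrra" is a deliberate misspelling; we expect a match whose rule ID
    // starts with MORFOLOGIK_RULE_PT (exact sub-ID depends on the variant).
    List<RuleMatch> matches = lt.check("Escrevi uma palavrra errada.");
    for (RuleMatch match : matches) {
      System.out.println(match.getRule().getId() + " -> " + match.getSuggestedReplacements());
    }
  }
}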