[pt] Add abbreviation and do_not_suggest logic to morfologik

languagetool-org · Nov 10, 2023 · 2315fb2 · 2315fb2
1 parent 3e3399f
commit 2315fb2
Show file tree

Hide file tree

Showing 5 changed files with 2,148 additions and 16 deletions.
diff --git a/...e-modules/pt/src/main/java/org/languagetool/rules/pt/MorfologikPortugueseSpellerRule.java b/...e-modules/pt/src/main/java/org/languagetool/rules/pt/MorfologikPortugueseSpellerRule.java
@@ -1,23 +1,64 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2012 Marcin Miłkowski (http://www.languagetool.org)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
+ * USA
+ */
 package org.languagetool.rules.pt;
 
 import org.languagetool.JLanguageTool;
 import org.languagetool.Language;
 import org.languagetool.UserConfig;
+import org.languagetool.rules.SuggestedReplacement;
+import org.languagetool.rules.spelling.SpellingCheckRule;
 import org.languagetool.rules.spelling.morfologik.MorfologikSpellerRule;
 
 import java.io.IOException;
-import java.util.List;
-import java.util.Objects;
-import java.util.ResourceBundle;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.util.*;
+import java.util.stream.Collectors;
+
+import static org.languagetool.JLanguageTool.getDataBroker;
 
 public class MorfologikPortugueseSpellerRule extends MorfologikSpellerRule {
 
-  private String dictFilename;
-  private Language language;
+  private final String dictFilepath;
+  // Path, in pt/resources, where the list of words to be removed from the suggestion list is to be found.
+  private static final String doNotSuggestWordsFilepath = "/pt/do_not_suggest.txt";
+  // Set of words that we do not want to add to the suggestions, despite being correctly spelt. Mostly profanity.
+  private static final Set<String> doNotSuggestWords = getDoNotSuggestWords();
+  // Path, in pt/resources, where a list of abbreviations is found. These are simple abbreviations of the shape \w+\.
+  private static final String abbreviationFilepath = "/pt/abbreviations.txt";
+  private static final Set<String> abbreviations = getAbbreviations();
 
   @Override
   public String getFileName() {
-    return dictFilename;
+    return dictFilepath;
+  }
+
+  public static Set<String> getDoNotSuggestWords() {
+    return getWordSetFromResources(doNotSuggestWordsFilepath);
+  }
+
+  public static Set<String> getAbbreviations() {
+    return getWordSetFromResources(abbreviationFilepath);
+  }
+
+  public static Set<String> getWordSetFromResources(String filepath) {
+    return new HashSet<String>(getDataBroker().getFromResourceDirAsLines(filepath));
   }
 
   @Override
@@ -33,15 +74,55 @@ public MorfologikPortugueseSpellerRule(ResourceBundle messages, Language languag
     // the tagger tags pt-PT and pt-BR words all the same, as it should, but they're still incorrect if they belong
     // to the wrong dialect, commenting this out
     // this.setIgnoreTaggedWords();
-    if (language.getShortCodeWithCountryAndVariant().equals("pt")) {
-      language = language.getDefaultLanguageVariant();
+    Language spellerLanguage = language;
+    if (spellerLanguage.getShortCodeWithCountryAndVariant().equals("pt")) {
+      spellerLanguage = spellerLanguage.getDefaultLanguageVariant();
     }
-    this.language = language;
-    if (Objects.equals(language.getShortCodeWithCountryAndVariant(), "pt-BR")) {
-      this.dictFilename = "/pt/spelling/pt-BR" + JLanguageTool.DICTIONARY_FILENAME_EXTENSION;
-    } else {
-      // TODO: work out how to detect the orthographic agreement; user option?
-      this.dictFilename = "/pt/spelling/pt-PT-90" + JLanguageTool.DICTIONARY_FILENAME_EXTENSION;
+    this.dictFilepath = "/pt/spelling/" + getDictFilename(spellerLanguage) + JLanguageTool.DICTIONARY_FILENAME_EXTENSION;
+  }
+
+  @Override
+  protected List<SuggestedReplacement> filterNoSuggestWords(List<SuggestedReplacement> suggestedReplacements) {
+    return suggestedReplacements.stream().filter(
+      suggestedReplacement -> !doNotSuggestWords.contains(
+        suggestedReplacement.getReplacement().toLowerCase()
+      )).collect(Collectors.toList());
+  }
+
+  @Override
+  protected List<SuggestedReplacement> getAdditionalTopSuggestions(List<SuggestedReplacement> suggestions, String word)
+    throws IOException {
+    List<String> suggestionsList = suggestions.stream().map(SuggestedReplacement::getReplacement)
+      .collect(Collectors.toList());
+    return SuggestedReplacement.convert(getAdditionalTopSuggestionsString(suggestionsList, word));
+  }
+
+  private List<String> getAdditionalTopSuggestionsString(List<String> suggestions, String word) throws IOException {
+    if (isAbbreviation(word)) {
+      return Collections.singletonList(word + ".");
+    }
+    return Collections.emptyList();
+  }
+
+  // Check if the word we're checking is in our list of abbreviations.
+  protected boolean isAbbreviation(String word) {
+    // regular case (since we do have some abbreviations with weird casing) as well as downcased
+    return abbreviations.contains(word + ".") || abbreviations.contains(word.toLowerCase() + ".");
+  }
+
+  private static String getDictFilename(Language spellerLanguage) {
+    String dictFilename = "pt-BR";  // default dict is pt-BR with 1990 spelling
+    String fullLanguageCode = spellerLanguage.getShortCodeWithCountryAndVariant();
+    switch (fullLanguageCode) {
+      case "pt-BR":               dictFilename = "pt-BR";    break;
+      case "pt-PT":               dictFilename = "pt-PT-90"; break;
+      case "pt-AO": case "pt-MZ": dictFilename = "pt-PT-45"; break;
     }
+    return dictFilename;
+  }
+
+  @Override
+  public List<String> getAdditionalSpellingFileNames() {
+    return Arrays.asList(SpellingCheckRule.GLOBAL_SPELLING_FILE, "/pt/multiwords.txt");
   }
 }