diff --git a/lib/dic_chunk.py b/lib/dic_chunk.py index b8f7616..330a3bc 100644 --- a/lib/dic_chunk.py +++ b/lib/dic_chunk.py @@ -6,6 +6,7 @@ from lib.constants import LATIN_1_ENCODING from lib.logger import LOGGER from lib.shell_command import ShellCommand +from lib.variant import Variant class DicChunk: @@ -13,11 +14,13 @@ class DicChunk: Attributes: filepath (str): the path to the chunk + name (str): the name of the chunk (e.g. chunk0) compounds (bool): whether this is a file containing compounds or not; if True, this chunk will *not* be tokenised; """ - def __init__(self, filepath: str, compounds: bool = False): + def __init__(self, filepath: str, name: str, compounds: bool = False): self.filepath = filepath + self.name = name self.compounds = compounds def __str__(self) -> str: @@ -32,20 +35,27 @@ def rm(self) -> None: shutil.rmtree(self.filepath) @classmethod - def from_hunspell_dic(cls, dic_path: str, chunk_size: int, target_dir: str, sample_size: int) -> List: + def from_hunspell_dic(cls, variant: Variant, chunk_size: int, target_dir: str, sample_size: int, + compounds: bool = False) -> List: """Splits a dictionary file into smaller files (chunks) of a given number of lines. 
Args: - dic_path (str): the path to the Hunspell .dic file + variant (Variant): the variant for which we want to unmunch the .dic file chunk_size (int): the number of lines per chunk target_dir (str): the directory where the chunks will be saved sample_size (int): the number of lines to read from the dictionary file; if 0 or negative, read all lines + compounds (bool): whether this is a file containing compounds or not Returns: A list of DicChunk objects, each representing a chunk of the dictionary file """ + if compounds: + tmp_dir = path.join(target_dir, 'compounds') + dic_path = variant.compounds() + else: + tmp_dir = target_dir + dic_path = variant.dic() LOGGER.debug(f"Splitting dictionary file \"{dic_path}\" into chunks...") - compounds = (True if 'compounds' in dic_path else False) with open(dic_path, 'r', encoding=LATIN_1_ENCODING) as dic_file: lines = dic_file.readlines()[1:] # Skip the first line lines = [line for line in lines if not line.startswith("#")] # Filter out comment lines @@ -55,17 +65,14 @@ def from_hunspell_dic(cls, dic_path: str, chunk_size: int, target_dir: str, samp str_chunks: List[List[str]] = [lines[i:i + chunk_size] for i in range(0, total_lines, chunk_size)] chunks: List[cls] = [] for index, chunk in enumerate(str_chunks): - if compounds: - tmp_dir = path.join(target_dir, 'compounds') - else: - tmp_dir = target_dir - filename = path.basename(dic_path).replace('.dic', f'_chunk{index}.dic') + chunk_name = f"{variant.underscored}_chunk{index}" + filename = chunk_name + ".dic" chunk_path = path.join(tmp_dir, filename) with open(chunk_path, 'w', encoding=LATIN_1_ENCODING) as chunk_file: # Prepend the count of lines in this chunk and then write all lines chunk_file.write(f"{len(chunk)}\n") chunk_file.writelines(chunk) - chunks.append(cls(chunk_path, compounds)) + chunks.append(cls(chunk_path, chunk_name, compounds)) LOGGER.debug(f"Split into {len(chunks)} chunks.") return chunks @@ -79,7 +86,8 @@ def unmunch(self, aff_path: str, delete_tmp: 
bool = False) -> NamedTemporaryFile Returns: the temp file containing the unmunched dictionary """ - unmunched_tmp = NamedTemporaryFile(delete=delete_tmp, mode='wb') + unmunched_tmp = NamedTemporaryFile(delete=delete_tmp, mode='wb', + prefix=f"{self.name}_unmunched_") LOGGER.debug(f"Unmunching {self} into {unmunched_tmp.name} ...") cmd_unmunch = f"unmunch {self.filepath} {aff_path}" unmunch_result = ShellCommand(cmd_unmunch).run() diff --git a/lib/languagetool_utils.py b/lib/languagetool_utils.py index b64b38e..6420d70 100644 --- a/lib/languagetool_utils.py +++ b/lib/languagetool_utils.py @@ -1,3 +1,4 @@ +import re from tempfile import NamedTemporaryFile from typing import List @@ -33,7 +34,9 @@ def tokenise(self, unmunched_file: NamedTemporaryFile) -> NamedTemporaryFile: a NamedTemporaryFile with the result of tokenisation written to it; note this is a UTF-8-encoded file; it is not at this stage that we move from latin-1 encoding to UTF-8. """ - tokenised_tmp = NamedTemporaryFile(delete=self.delete_tmp, mode='w') + chunk_pattern = re.compile("[a-z]{2}_[A-Z]{2}(?:_[a-zA-Z0-9]+)?_chunk\\d+") + prefix = chunk_pattern.findall(unmunched_file.name.split('/')[-1])[0] + "_tokenised_" + tokenised_tmp = NamedTemporaryFile(delete=self.delete_tmp, mode='w', prefix=prefix) LOGGER.debug(f"Tokenising {unmunched_file.name} into {tokenised_tmp.name} ...") tokenise_cmd = ( f"java -cp {LT_JAR_PATH}:" diff --git a/lib/shell_command.py b/lib/shell_command.py index e3243e9..278a79c 100644 --- a/lib/shell_command.py +++ b/lib/shell_command.py @@ -16,10 +16,11 @@ def __init__(self, return_code: int, stderr: AnyStr = None): class ShellCommand: """A class for executing Java commands.""" - def __init__(self, command_str: str, env: dict = None): + def __init__(self, command_str: str, env: dict = None, cwd: str = '.'): self.command_str = command_str self.split_cmd = shlex.split(self.command_str) self.env: dict = {**os.environ} + self.cwd = cwd if env is not None: self.env.update(env) @@ 
def compile_lt_dev():
    """Build with maven in the languagetool-dev directory."""
    LOGGER.info("Compiling LT dev...")
    wd = path.join(LT_DIR, "languagetool-dev")
    ShellCommand("mvn clean compile assembly:single", cwd=wd).run()


def compile_lt():
    """Build and install LanguageTool with maven from LT_DIR, skipping the tests.

    Unlike compile_lt_dev(), this runs in LT_DIR itself (the parent of the languagetool-dev directory).
    """
    LOGGER.info("Compiling LT...")
    ShellCommand("mvn clean install -DskipTests", cwd=LT_DIR).run()


def install_dictionaries(custom_version: Optional[str]):
    """Install our dictionaries to the local ~/.m2.

    Runs ``mvn clean install`` with JAVA_RESULTS_DIR as the working directory.

    Args:
        custom_version: the value to pass to maven as the PT_DICT_VERSION environment variable; if None, no
            override is passed and the value inherited from the process environment (if any) applies.
    """
    # Imported locally because this module only pulls `path` from `os` at file level.
    from os import environ

    LOGGER.info("Installing dictionaries...")
    # `env` only holds *overrides*; ShellCommand merges it on top of a copy of os.environ.
    env: dict = {}
    if custom_version is not None:
        LOGGER.info(f"Installing custom version \"{custom_version}\"")
        env['PT_DICT_VERSION'] = custom_version
    else:
        # The overrides dict is empty in this branch, so the inherited version has to be looked up in the
        # process environment; indexing `env` here would always raise KeyError.
        inherited_version = environ.get('PT_DICT_VERSION', '<unset>')
        LOGGER.info(f"Installing environment-defined version \"{inherited_version}\"")
    ShellCommand("mvn clean install", env=env, cwd=JAVA_RESULTS_DIR).run()
if FORCE_COMPILE: + compile_lt() compile_lt_dev() tasks = [] processed_files: dict[str: List[NamedTemporaryFile]] = {} @@ -92,8 +96,8 @@ def main(): # and then split them based on the dialectal and pre/post agreement alternation files for variant in DIC_VARIANTS: processed_files[variant] = [] - dic_chunks: List[DicChunk] = DicChunk.from_hunspell_dic(variant.dic(), CHUNK_SIZE, TMP_DIR, SAMPLE_SIZE) - dic_chunks.extend(DicChunk.from_hunspell_dic(variant.compounds(), CHUNK_SIZE, TMP_DIR, SAMPLE_SIZE)) + dic_chunks: List[DicChunk] = DicChunk.from_hunspell_dic(variant, CHUNK_SIZE, TMP_DIR, SAMPLE_SIZE) + dic_chunks.extend(DicChunk.from_hunspell_dic(variant, CHUNK_SIZE, TMP_DIR, SAMPLE_SIZE, compounds=True)) for chunk in dic_chunks: tasks.append((variant, chunk)) LOGGER.info("Starting unmunching and tokenisation process...") diff --git a/scripts/build_tagger_dicts.py b/scripts/build_tagger_dicts.py index c6dc9d2..e58d4ba 100644 --- a/scripts/build_tagger_dicts.py +++ b/scripts/build_tagger_dicts.py @@ -3,6 +3,7 @@ """ import argparse import os +from datetime import datetime from lib.languagetool_utils import LanguageToolUtils from lib.logger import LOGGER @@ -10,7 +11,7 @@ SORTED_POS_DICT_FILEPATH, POS_DICT_DIFF_FILEPATH, OLD_POS_DICT_FILEPATH, REPO_DIR, TAGGER_DICT_DIR, LT_RESULTS_DIR) from lib.shell_command import ShellCommand -from lib.utils import compile_lt_dev, install_dictionaries +from lib.utils import compile_lt_dev, install_dictionaries, pretty_time_delta from lib.variant import Variant @@ -59,6 +60,8 @@ def run_shell_script() -> None: def main(): + start_time = datetime.now() + LOGGER.debug(f"Started at {start_time.strftime('%r')}") if FORCE_COMPILE: compile_lt_dev() run_shell_script() @@ -67,6 +70,9 @@ def main(): lt.build_synth_binary() if FORCE_INSTALL: install_dictionaries(custom_version=CUSTOM_INSTALL_VERSION) + end_time = datetime.now() + LOGGER.debug(f"Finished at {end_time.strftime('%r')}. 
" + f"Total time elapsed: {pretty_time_delta(end_time - start_time)}.") if __name__ == "__main__": diff --git a/scripts/validate_aff.py b/scripts/validate_aff.py new file mode 100644 index 0000000..f5bf35f --- /dev/null +++ b/scripts/validate_aff.py @@ -0,0 +1,53 @@ +"""This was mostly made by chatgpt but of course i had to fix it because AI is stoopid.""" +import sys +import re + + +def validate_hunspell_aff(file_content): + lines = file_content.split('\n') + valid = True + errors = [] + + i = 0 + while i < len(lines): + line = lines[i].strip() + if line.startswith("SFX") or line.startswith("PFX"): + parts = line.split() + if len(parts) >= 4 and parts[2] == 'Y': + rule_count = int(parts[3]) + rule_type = parts[0] + rule_name = parts[1] + rule_lines = 0 + rule_start_line = i + i += 1 + same_block_pattern = re.compile(f"{rule_type}\\s+{rule_name}") + while i < len(lines) and same_block_pattern.search(lines[i]): + if not lines[i].strip().startswith("#"): + rule_lines += 1 + i += 1 + + if rule_lines != rule_count: + valid = False + errors.append(f"Rule {rule_type} {rule_name} at line {rule_start_line + 1}: " + f"Expected {rule_count} rules, found {rule_lines}") + continue + i += 1 + + return valid, errors + + +def validate_hunspell_aff_file(filepath): + try: + with open(filepath, 'r', encoding='latin-1') as file: + file_content = file.read() + except FileNotFoundError: + return False, ["File not found."] + except UnicodeDecodeError: + return False, ["File encoding issue. Ensure the file is in LATIN-1 encoding."] + except Exception as e: + return False, [str(e)] + return validate_hunspell_aff(file_content) + + +if __name__ == '__main__': + print(validate_hunspell_aff_file(sys.argv[1]))