-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Extract more methods from the scripts
- Loading branch information
Showing
5 changed files
with
225 additions
and
191 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
import os
import shutil
from os import path
from tempfile import NamedTemporaryFile
from typing import List

from lib.constants import LATIN_1_ENCODING
from lib.logger import LOGGER
from lib.shell_command import ShellCommand
|
||
|
||
class DicChunk:
    """A single chunk of a Hunspell dictionary file.

    Attributes:
        filepath (str): the path to the chunk file on disk
        compounds (bool): whether this is a file containing compounds or not; if True, this chunk will *not* be
            tokenised
    """
    def __init__(self, filepath: str, compounds: bool = False):
        self.filepath = filepath
        self.compounds = compounds

    def __str__(self) -> str:
        # Prefix compound chunks with their subdirectory so they are
        # distinguishable from same-named regular chunks in log output.
        basename = path.basename(self.filepath)
        if self.compounds:
            return path.join('compounds', basename)
        return basename

    def rm(self) -> None:
        """Remove the chunk file.

        Bug fix: a chunk is a single regular file, not a directory tree, so use
        os.remove(); the previous shutil.rmtree() call raises NotADirectoryError
        when given a file path.
        """
        LOGGER.debug(f"Removing {self} ...")
        os.remove(self.filepath)

    @classmethod
    def from_hunspell_dic(cls, dic_path: str, chunk_size: int, target_dir: str,
                          sample_size: int) -> List['DicChunk']:
        """Splits a dictionary file into smaller files (chunks) of a given number of lines.

        Args:
            dic_path (str): the path to the Hunspell .dic file
            chunk_size (int): the number of lines per chunk
            target_dir (str): the directory where the chunks will be saved
            sample_size (int): the number of lines to read from the dictionary file; if 0 or negative, read all lines
        Returns:
            A list of DicChunk objects, each representing a chunk of the dictionary file
        """
        # Compound dictionaries live under a 'compounds' directory; flag their
        # chunks so they are skipped by tokenisation later in the pipeline.
        compounds = 'compounds' in dic_path
        with open(dic_path, 'r', encoding=LATIN_1_ENCODING) as dic_file:
            lines = dic_file.readlines()[1:]  # Skip the first line (Hunspell entry count)
        lines = [line for line in lines if not line.startswith("#")]  # Filter out comment lines
        if sample_size > 0:
            lines = lines[:sample_size]
        total_lines = len(lines)
        str_chunks: List[List[str]] = [lines[i:i + chunk_size] for i in range(0, total_lines, chunk_size)]
        chunks: List['DicChunk'] = []
        for index, chunk in enumerate(str_chunks):
            tmp_dir = path.join(target_dir, 'compounds') if compounds else target_dir
            filename = path.basename(dic_path).replace('.dic', f'_chunk{index}.dic')
            chunk_path = path.join(tmp_dir, filename)
            with open(chunk_path, 'w', encoding=LATIN_1_ENCODING) as chunk_file:
                # Prepend the count of lines, as Hunspell .dic files start with
                # the number of entries, then write all lines.
                chunk_file.write(f"{len(chunk)}\n")
                chunk_file.writelines(chunk)
            chunks.append(cls(chunk_path, compounds))
        return chunks

    def unmunch(self, aff_path: str, delete_tmp: bool = False) -> NamedTemporaryFile:
        """Create all forms from Hunspell dictionaries.

        Args:
            aff_path: the path to the .aff file
            delete_tmp: whether to delete the temporary file (and this chunk) after use
        Returns:
            the temp file containing the unmunched dictionary
        """
        # delete=delete_tmp: when cleanup is requested, the temp file vanishes
        # as soon as the caller closes it.
        unmunched_tmp = NamedTemporaryFile(delete=delete_tmp, mode='wb')
        LOGGER.debug(f"Unmunching {self} into {unmunched_tmp.name} ...")
        cmd_unmunch = f"unmunch {self.filepath} {aff_path}"
        unmunch_result = ShellCommand(cmd_unmunch).run()
        unmunched_tmp.write(unmunch_result)
        unmunched_tmp.flush()  # ensure bytes hit disk before anyone reads unmunched_tmp.name
        if delete_tmp:
            self.rm()
        return unmunched_tmp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,108 @@ | ||
from tempfile import NamedTemporaryFile | ||
from typing import List | ||
|
||
from lib.constants import LATIN_1_ENCODING, LT_VER, LT_JAR_PATH, LT_DIR, RESULT_POS_DICT_FILEPATH | ||
from lib.logger import LOGGER | ||
from lib.shell_command import ShellCommand | ||
from lib.variant import Variant | ||
|
||
|
||
class LanguageToolUtils: | ||
def __init__(self, variant: Variant, delete_tmp: bool = False): | ||
self.variant = variant | ||
self.delete_tmp = delete_tmp | ||
|
||
def tokenise(self, unmunched_file: NamedTemporaryFile) -> NamedTemporaryFile: | ||
"""Tokenise each line of an unmunched file, write it to another temp file and return it. | ||
The written data looks weird, since the output of the LT word tokeniser inserts newlines between tokens. | ||
Original line after unmunch: | ||
"far-se-á" | ||
Lines after tokenisation: | ||
"far" | ||
"" | ||
"se" | ||
"" | ||
"á" | ||
This may look iffy, but later in the process we will sort and dedupe these files, so don't panic. | ||
Args: | ||
unmunched_file: the NamedTemporaryFile object for the unmunched file we'll be tokenising | ||
Returns: | ||
a NamedTemporaryFile with the result of tokenisation written to it; note this is a UTF-8-encoded file; it is | ||
not at this stage that we move from latin-1 encoding to UTF-8. | ||
""" | ||
tokenised_tmp = NamedTemporaryFile(delete=self.delete_tmp, mode='w') | ||
LOGGER.debug(f"Tokenising {unmunched_file.name} into {tokenised_tmp.name} ...") | ||
tokenise_cmd = ( | ||
f"java -cp {LT_JAR_PATH}:" | ||
f"{LT_DIR}/languagetool-dev/target/languagetool-dev-{LT_VER}-jar-with-dependencies.jar " | ||
f"org.languagetool.dev.archive.WordTokenizer {self.variant.lang}" | ||
) | ||
with open(unmunched_file.name, 'r', encoding=LATIN_1_ENCODING) as u: | ||
unmunched_str = u.read() | ||
unmunched_file.close() | ||
tokenisation_result = ShellCommand(tokenise_cmd).run_with_input(unmunched_str) | ||
tokenised_tmp.write(tokenisation_result) | ||
tokenised_tmp.flush() | ||
return tokenised_tmp | ||
|
||
def build_spelling_binary(self, tokenised_temps: List[NamedTemporaryFile]) -> None: | ||
"""Merge many unmunched and tokenised files into *one* plaintext file and used that to build a Morfologik | ||
SPELLING dictionary. | ||
The files must be merged and converted into UTF-8 before we can do anything with them. Once we have a single | ||
'master' temp file per variant, we can pass that file as an input parameter to the Java tool that builds | ||
spelling dictionaries. | ||
If the shell command is successful, we will have a new output file saved to the appropriate result directory. | ||
This will be a binary file ready to be released and used by Morfologik. | ||
Returns: | ||
None | ||
""" | ||
LOGGER.info(f"Building binary for {self.variant}...") | ||
megatemp = NamedTemporaryFile(delete=self.delete_tmp, mode='w', | ||
encoding='utf-8') # Open the file with UTF-8 encoding | ||
lines = set() | ||
for tmp in tokenised_temps: | ||
with open(tmp.name, 'r', encoding='utf-8') as t: | ||
lines.update(t.read().split("\n")) | ||
megatemp.write("\n".join(sorted(lines))) | ||
LOGGER.debug(f"Found {len(lines)} unique unmunched and tokenised forms for {self.variant}.") | ||
cmd_build = ( | ||
f"java -cp {LT_JAR_PATH} " | ||
f"org.languagetool.tools.SpellDictionaryBuilder " | ||
f"-i {megatemp.name} " | ||
f"-info {self.variant.info('source')} " | ||
f"-freq {self.variant.freq()} " | ||
f"-o {self.variant.dict()}" | ||
) | ||
ShellCommand(cmd_build).run() | ||
LOGGER.info(f"Done compiling {self.variant} dictionary!") | ||
self.variant.copy_spell_info() | ||
megatemp.close() | ||
|
||
def build_pos_binary(self) -> None: | ||
cmd_build = ( | ||
f"java -cp {LT_JAR_PATH} " | ||
f"org.languagetool.tools.POSDictionaryBuilder " | ||
f"-i {RESULT_POS_DICT_FILEPATH} " | ||
f"-info {self.variant.pos_info_java_input_path()} " | ||
f"-o {self.variant.pos_dict_java_output_path()}" | ||
) | ||
ShellCommand(cmd_build).run() | ||
self.variant.copy_pos_info() | ||
|
||
def build_synth_binary(self) -> None: | ||
cmd_build = ( | ||
f"java -cp {LT_JAR_PATH} " | ||
f"org.languagetool.tools.SynthDictionaryBuilder " | ||
f"-i {RESULT_POS_DICT_FILEPATH} " | ||
f"-info {self.variant.synth_info_java_input_path()} " | ||
f"-o {self.variant.synth_dict_java_output_path()}" | ||
) | ||
ShellCommand(cmd_build).run() | ||
self.variant.copy_synth_info() | ||
self.variant.rename_synth_tag_files() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.