#142 Centralize data formatting into a single file

scribe-org · Nov 24, 2024 · bf489b7 · bf489b7
1 parent 2808ae0
commit bf489b7
Show file tree

Hide file tree

Showing 54 changed files with 234 additions and 2,165 deletions.
diff --git a/docs/source/scribe_data/wikidata/format_data.rst b/docs/source/scribe_data/wikidata/format_data.rst
@@ -0,0 +1,8 @@
+format_data.py
+==============
+
+`View code on Github <https://github.com/scribe-org/Scribe-Data/tree/main/src/scribe_data/wikidata/format_data.py>`_
+
+.. automodule:: scribe_data.wikidata.format_data
+    :members:
+    :private-members:
diff --git a/src/scribe_data/cli/cli_utils.py b/src/scribe_data/cli/cli_utils.py
@@ -143,7 +143,9 @@ def validate_single_item(item, valid_options, item_type):
             item : str
                 The item to validate.
+
             valid_options : list
                 A list of valid options against which the item will be validated.
+
             item_type : str
                 A description of the item type (e.g., "language", "data-type") used in error messages.
 
@@ -183,6 +185,7 @@ def validate_single_item(item, valid_options, item_type):
 
     if language is not None and isinstance(language, list):
         for lang in language:
+            lang = lang.split(" ")[0]
             error = validate_single_item(lang, language_to_qid.keys(), "language")
 
             if error:

diff --git a/src/scribe_data/cli/get.py b/src/scribe_data/cli/get.py
@@ -94,23 +94,23 @@ def get_data(
             "tsv": DEFAULT_TSV_EXPORT_DIR,
         }.get(output_type, DEFAULT_JSON_EXPORT_DIR)
 
-    languages = [language] if language else None
     data_types = [data_type] if data_type else None
 
     subprocess_result = False
 
     # MARK: Get All
     if all:
         if language:
-            print(f"Updating all data types for language for {language.capitalize()}")
+            language_or_sub_language = language.split(" ")[0]
+            print(f"Updating all data types for language for {language.title()}")
             query_data(
-                languages=[language],
+                languages=[language_or_sub_language],
                 data_type=None,
                 output_dir=output_dir,
                 overwrite=overwrite,
             )
             print(
-                f"Query completed for all data types with specified language for {language.capitalize()}."
+                f"Query completed for all data types with specified language for {language.title()}."
             )
 
         elif data_type:
@@ -145,12 +145,13 @@ def get_data(
     # MARK: Query Data
 
     elif language or data_type:
+        language_or_sub_language = language.split(" ")[0]
         data_type = data_type[0] if isinstance(data_type, list) else data_type
         print(
-            f"Updating data for language(s): {language.capitalize()}; data type(s): {data_type.capitalize()}"
+            f"Updating data for language(s): {language.title()}; data type(s): {data_type.capitalize()}"
         )
         query_data(
-            languages=languages,
+            languages=[language_or_sub_language],
             data_type=data_types,
             output_dir=output_dir,
             overwrite=overwrite,

diff --git a/src/scribe_data/cli/list.py b/src/scribe_data/cli/list.py
@@ -58,7 +58,7 @@ def list_languages() -> None:
 
     for lang in languages:
         print(
-            f"{lang.capitalize():<{language_col_width}} {get_language_iso(lang):<{iso_col_width}} {get_language_qid(lang):<{qid_col_width}}"
+            f"{lang.title():<{language_col_width}} {get_language_iso(lang):<{iso_col_width}} {get_language_qid(lang):<{qid_col_width}}"
         )
 
     print()

diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py
@@ -23,6 +23,7 @@
 
 import ast
 import json
+import os
 import re
 from importlib import resources
 from pathlib import Path
@@ -198,7 +199,7 @@ def get_language_qid(language: str) -> str:
     """
     return _find(
         source_key="language",
-        source_value=language,
+        source_value=language.split(" ")[0],
         target_key="qid",
         error_msg=f"{language.capitalize()} is currently not a supported language for QID conversion.",
     )
@@ -221,7 +222,7 @@ def get_language_iso(language: str) -> str:
 
     return _find(
         source_key="language",
-        source_value=language,
+        source_value=language.split(" ")[0],
         target_key="iso",
         error_msg=f"{language.capitalize()} is currently not a supported language for ISO conversion.",
     )
@@ -258,15 +259,15 @@ def get_language_from_iso(iso: str) -> str:
 
 
 def load_queried_data(
-    file_path: str, language: str, data_type: str
+    dir_path: str, language: str, data_type: str
 ) -> tuple[Any, bool, str]:
     """
     Loads queried data from a JSON file for a specific language and data type.
 
     Parameters
     ----------
-        file_path : str
-            The path to the file containing the queried data.
+        dir_path : str
+            The path to the directory containing the queried data.
 
         language : str
             The language for which the data is being loaded.
@@ -279,14 +280,48 @@ def load_queried_data(
         tuple(Any, str)
             A tuple containing the loaded data and the path to the data file.
     """
-    data_path = Path(file_path) / language.lower() / f"{data_type}.json"
+    data_path = (
+        Path(dir_path) / language.lower().replace(" ", "_") / f"{data_type}.json"
+    )
 
     with open(data_path, encoding="utf-8") as f:
         return json.load(f), data_path
 
 
+def remove_queried_data(dir_path: str, language: str, data_type: str) -> None:
+    """
+    Removes queried data for a specific language and data type as a new formatted file has been generated.
+
+    Parameters
+    ----------
+        dir_path : str
+            The path to the directory containing the queried data.
+
+        language : str
+            The language for which the data is being removed.
+
+        data_type : str
+            The type of data being removed (e.g. 'nouns', 'verbs').
+
+    Returns
+    -------
+        None : The file is deleted.
+    """
+    data_path = (
+        Path(dir_path)
+        / language.lower().replace(" ", "_")
+        / f"{data_type}_queried.json"
+    )
+
+    try:
+        os.remove(data_path)
+
+    except OSError:
+        pass
+
+
 def export_formatted_data(
-    file_path: str,
+    dir_path: str,
     formatted_data: dict,
     language: str,
     data_type: str,
@@ -297,8 +332,8 @@ def export_formatted_data(
 
     Parameters
     ----------
-        file_path : str
-            The path to the file containing the queried data.
+        dir_path : str
+            The path to the directory containing the queried data.
 
         formatted_data : dict
             The data to be exported.
@@ -314,7 +349,9 @@ def export_formatted_data(
         None
     """
     export_path = (
-        Path(file_path) / language.lower() / f"{data_type.replace('-', '_')}.json"
+        Path(dir_path)
+        / language.lower().replace(" ", "_")
+        / f"{data_type.replace('-', '_')}.json"
     )
 
     with open(export_path, "w", encoding="utf-8") as file:
@@ -554,7 +591,7 @@ def order_annotations(annotation: str) -> str:
 def format_sublanguage_name(lang, language_metadata=_languages):
     """
     Formats the name of a sub-language by appending its main language
-    in the format 'MAIN_LANG/SUB_LANG'. If the language is not a sub-language,
+    in the format 'SUB_LANG MAIN_LANG'. If the language is not a sub-language,
     the original language name is returned as-is.
 
     Parameters
@@ -568,7 +605,7 @@ def format_sublanguage_name(lang, language_metadata=_languages):
     Returns
     -------
         str
-            The formatted language name if it's a sub-language (e.g., 'Norwegian/Nynorsk').
+            The formatted language name if it's a sub-language (e.g., 'Nynorsk Norwegian').
             Otherwise the original name.
 
     Raises
@@ -578,7 +615,7 @@ def format_sublanguage_name(lang, language_metadata=_languages):
     Example
     -------
         > format_sublanguage_name("nynorsk", language_metadata)
-        'Norwegian/Nynorsk'
+        'Nynorsk Norwegian'
 
         > format_sublanguage_name("english", language_metadata)
         'English'
@@ -589,12 +626,13 @@ def format_sublanguage_name(lang, language_metadata=_languages):
             return lang
 
         # Check if the main language has sub-languages.
+        lang = lang.split(" ")[0]
         if "sub_languages" in lang_data:
             # Check if the provided language is a sub-language.
             for sub_lang in lang_data["sub_languages"]:
                 if lang == sub_lang:
-                    # Return the formatted name MAIN_LANG/SUB_LANG.
-                    return f"{main_lang}/{sub_lang}"
+                    # Return the formatted name SUB_LANG MAIN_LANG.
+                    return f"{sub_lang} {main_lang}"
 
     # Raise ValueError if no match is found.
     raise ValueError(f"{lang.capitalize()} is not a valid language or sub-language.")
@@ -611,7 +649,9 @@ def list_all_languages(language_metadata=_languages):
         # Check if there are sub-languages.
         if "sub_languages" in lang_data:
             # Add the sub-languages to current_languages.
-            current_languages.extend(lang_data["sub_languages"].keys())
+            current_languages.extend(
+                [f"{sub} {lang_key}" for sub in lang_data["sub_languages"].keys()]
+            )
         else:
             # If no sub-languages, add the main language.
             current_languages.append(lang_key)

diff --git a/src/scribe_data/wikidata/format_data.py b/src/scribe_data/wikidata/format_data.py
@@ -0,0 +1,94 @@
+"""
+Formats the data queried from Wikidata for a given language and data type.
+
+.. raw:: html
+    <!--
+    * Copyright (C) 2024 Scribe
+    *
+    * This program is free software: you can redistribute it and/or modify
+    * it under the terms of the GNU General Public License as published by
+    * the Free Software Foundation, either version 3 of the License, or
+    * (at your option) any later version.
+    *
+    * This program is distributed in the hope that it will be useful,
+    * but WITHOUT ANY WARRANTY; without even the implied warranty of
+    * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    * GNU General Public License for more details.
+    *
+    * You should have received a copy of the GNU General Public License
+    * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+    -->
+"""
+
+import argparse
+import collections
+
+from scribe_data.utils import (
+    export_formatted_data,
+    load_queried_data,
+    remove_queried_data,
+)
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--dir-path")
+parser.add_argument("--language")
+parser.add_argument("--data_type")
+args = parser.parse_args()
+
+
+def format_data(
+    dir_path: str = args.dir_path,
+    language: str = args.language,
+    data_type: str = args.data_type,
+):
+    """
+    Formats data that has been queried from the Wikidata Query Service.
+
+    Parameters
+    ----------
+        dir_path : str
+            The output directory path for results.
+
+        language : str
+            The language for which the data is being loaded.
+
+        data_type : str
+            The type of data being loaded (e.g. 'nouns', 'verbs').
+
+    Returns
+    -------
+        A saved and formatted data file for the given language and data type.
+    """
+    data_list, data_path = load_queried_data(
+        dir_path=dir_path, language=language, data_type=data_type
+    )
+
+    data_formatted = {}
+
+    for data_vals in data_list:
+        lexeme_id = data_vals["lexemeID"]
+
+        if lexeme_id not in data_formatted:
+            data_formatted[lexeme_id] = {}
+
+        # Reverse to make sure that we're getting the same order as the query.
+        query_identifiers = list(reversed(data_vals.keys()))
+        query_identifiers.remove("lexemeID")
+
+        for k in query_identifiers:
+            data_formatted[lexeme_id][k] = data_vals[k]
+
+    data_formatted = collections.OrderedDict(sorted(data_formatted.items()))
+
+    export_formatted_data(
+        dir_path=dir_path,
+        formatted_data=data_formatted,
+        language=language,
+        data_type=data_type,
+    )
+
+    remove_queried_data(dir_path=dir_path, language=language, data_type=data_type)
+
+
+if __name__ == "__main__":
+    format_data()
diff --git a/src/scribe_data/wikidata/language_data_extraction/__init__.py b/src/scribe_data/wikidata/language_data_extraction/__init__.py
diff --git a/src/scribe_data/wikidata/language_data_extraction/english/__init__.py b/src/scribe_data/wikidata/language_data_extraction/english/__init__.py
diff --git a/src/scribe_data/wikidata/language_data_extraction/english/nouns/__init__.py b/src/scribe_data/wikidata/language_data_extraction/english/nouns/__init__.py
-Original file line number
+Diff line change
@@ Expand Up / @@ -58,7 +58,7 @@ def list_languages() -> None: @@
         for lang in languages:
             print(
-                f"{lang.capitalize():<{language_col_width}} {get_language_iso(lang):<{iso_col_width}} {get_language_qid(lang):<{qid_col_width}}"
+                f"{lang.title():<{language_col_width}} {get_language_iso(lang):<{iso_col_width}} {get_language_qid(lang):<{qid_col_width}}"
             )
         print()
@@ Expand Down @@