Skip to content

Commit

Permalink
#142 Centralize data formatting into a single file
Browse files Browse the repository at this point in the history
  • Loading branch information
andrewtavis committed Nov 24, 2024
1 parent 2808ae0 commit bf489b7
Show file tree
Hide file tree
Showing 54 changed files with 234 additions and 2,165 deletions.
8 changes: 8 additions & 0 deletions docs/source/scribe_data/wikidata/format_data.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
format_data.py
==============

`View code on Github <https://github.com/scribe-org/Scribe-Data/tree/main/src/scribe_data/wikidata/format_data.py>`_

.. automodule:: scribe_data.wikidata.format_data
:members:
:private-members:
3 changes: 3 additions & 0 deletions src/scribe_data/cli/cli_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,9 @@ def validate_single_item(item, valid_options, item_type):
item : str
The item to validate.
valid_options : list
A list of valid options against which the item will be validated.
item_type : str
A description of the item type (e.g., "language", "data-type") used in error messages.
Expand Down Expand Up @@ -183,6 +185,7 @@ def validate_single_item(item, valid_options, item_type):

if language is not None and isinstance(language, list):
for lang in language:
lang = lang.split(" ")[0]
error = validate_single_item(lang, language_to_qid.keys(), "language")

if error:
Expand Down
13 changes: 7 additions & 6 deletions src/scribe_data/cli/get.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,23 +94,23 @@ def get_data(
"tsv": DEFAULT_TSV_EXPORT_DIR,
}.get(output_type, DEFAULT_JSON_EXPORT_DIR)

languages = [language] if language else None
data_types = [data_type] if data_type else None

subprocess_result = False

# MARK: Get All
if all:
if language:
print(f"Updating all data types for language for {language.capitalize()}")
language_or_sub_language = language.split(" ")[0]
print(f"Updating all data types for language for {language.title()}")
query_data(
languages=[language],
languages=[language_or_sub_language],
data_type=None,
output_dir=output_dir,
overwrite=overwrite,
)
print(
f"Query completed for all data types with specified language for {language.capitalize()}."
f"Query completed for all data types with specified language for {language.title()}."
)

elif data_type:
Expand Down Expand Up @@ -145,12 +145,13 @@ def get_data(
# MARK: Query Data

elif language or data_type:
language_or_sub_language = language.split(" ")[0]
data_type = data_type[0] if isinstance(data_type, list) else data_type
print(
f"Updating data for language(s): {language.capitalize()}; data type(s): {data_type.capitalize()}"
f"Updating data for language(s): {language.title()}; data type(s): {data_type.capitalize()}"
)
query_data(
languages=languages,
languages=[language_or_sub_language],
data_type=data_types,
output_dir=output_dir,
overwrite=overwrite,
Expand Down
2 changes: 1 addition & 1 deletion src/scribe_data/cli/list.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def list_languages() -> None:

for lang in languages:
print(
f"{lang.capitalize():<{language_col_width}} {get_language_iso(lang):<{iso_col_width}} {get_language_qid(lang):<{qid_col_width}}"
f"{lang.title():<{language_col_width}} {get_language_iso(lang):<{iso_col_width}} {get_language_qid(lang):<{qid_col_width}}"
)

print()
Expand Down
72 changes: 56 additions & 16 deletions src/scribe_data/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@

import ast
import json
import os
import re
from importlib import resources
from pathlib import Path
Expand Down Expand Up @@ -198,7 +199,7 @@ def get_language_qid(language: str) -> str:
"""
return _find(
source_key="language",
source_value=language,
source_value=language.split(" ")[0],
target_key="qid",
error_msg=f"{language.capitalize()} is currently not a supported language for QID conversion.",
)
Expand All @@ -221,7 +222,7 @@ def get_language_iso(language: str) -> str:

return _find(
source_key="language",
source_value=language,
source_value=language.split(" ")[0],
target_key="iso",
error_msg=f"{language.capitalize()} is currently not a supported language for ISO conversion.",
)
Expand Down Expand Up @@ -258,15 +259,15 @@ def get_language_from_iso(iso: str) -> str:


def load_queried_data(
file_path: str, language: str, data_type: str
dir_path: str, language: str, data_type: str
) -> tuple[Any, bool, str]:
"""
Loads queried data from a JSON file for a specific language and data type.
Parameters
----------
file_path : str
The path to the file containing the queried data.
dir_path : str
The path to the directory containing the queried data.
language : str
The language for which the data is being loaded.
Expand All @@ -279,14 +280,48 @@ def load_queried_data(
tuple(Any, str)
A tuple containing the loaded data and the path to the data file.
"""
data_path = Path(file_path) / language.lower() / f"{data_type}.json"
data_path = (
Path(dir_path) / language.lower().replace(" ", "_") / f"{data_type}.json"
)

with open(data_path, encoding="utf-8") as f:
return json.load(f), data_path


def remove_queried_data(dir_path: str, language: str, data_type: str) -> None:
    """
    Deletes the raw queried data file for a language and data type now that a formatted file has been generated.

    Parameters
    ----------
    dir_path : str
        The path to the directory containing the queried data.

    language : str
        The language whose queried data file should be removed.

    data_type : str
        The type of data whose queried file should be removed (e.g. 'nouns', 'verbs').

    Returns
    -------
    None : The file is deleted if possible; OS-level errors are ignored.
    """
    # Language directories are lower case with spaces replaced by underscores.
    language_dir = language.lower().replace(" ", "_")
    queried_file = Path(dir_path).joinpath(language_dir, f"{data_type}_queried.json")

    # Best-effort cleanup: a missing or undeletable file is silently skipped.
    try:
        queried_file.unlink()

    except OSError:
        pass


def export_formatted_data(
file_path: str,
dir_path: str,
formatted_data: dict,
language: str,
data_type: str,
Expand All @@ -297,8 +332,8 @@ def export_formatted_data(
Parameters
----------
file_path : str
The path to the file containing the queried data.
dir_path : str
The path to the directory containing the queried data.
formatted_data : dict
The data to be exported.
Expand All @@ -314,7 +349,9 @@ def export_formatted_data(
None
"""
export_path = (
Path(file_path) / language.lower() / f"{data_type.replace('-', '_')}.json"
Path(dir_path)
/ language.lower().replace(" ", "_")
/ f"{data_type.replace('-', '_')}.json"
)

with open(export_path, "w", encoding="utf-8") as file:
Expand Down Expand Up @@ -554,7 +591,7 @@ def order_annotations(annotation: str) -> str:
def format_sublanguage_name(lang, language_metadata=_languages):
"""
Formats the name of a sub-language by appending its main language
in the format 'MAIN_LANG/SUB_LANG'. If the language is not a sub-language,
in the format 'SUB_LANG MAIN_LANG'. If the language is not a sub-language,
the original language name is returned as-is.
Parameters
Expand All @@ -568,7 +605,7 @@ def format_sublanguage_name(lang, language_metadata=_languages):
Returns
-------
str
The formatted language name if it's a sub-language (e.g., 'Norwegian/Nynorsk').
The formatted language name if it's a sub-language (e.g., 'Nynorsk Norwegian').
Otherwise the original name.
Raises
Expand All @@ -578,7 +615,7 @@ def format_sublanguage_name(lang, language_metadata=_languages):
Example
-------
> format_sublanguage_name("nynorsk", language_metadata)
'Norwegian/Nynorsk'
'Nynorsk Norwegian'
> format_sublanguage_name("english", language_metadata)
'English'
Expand All @@ -589,12 +626,13 @@ def format_sublanguage_name(lang, language_metadata=_languages):
return lang

# Check if the main language has sub-languages.
lang = lang.split(" ")[0]
if "sub_languages" in lang_data:
# Check if the provided language is a sub-language.
for sub_lang in lang_data["sub_languages"]:
if lang == sub_lang:
# Return the formatted name MAIN_LANG/SUB_LANG.
return f"{main_lang}/{sub_lang}"
# Return the formatted name SUB_LANG MAIN_LANG.
return f"{sub_lang} {main_lang}"

# Raise ValueError if no match is found.
raise ValueError(f"{lang.capitalize()} is not a valid language or sub-language.")
Expand All @@ -611,7 +649,9 @@ def list_all_languages(language_metadata=_languages):
# Check if there are sub-languages.
if "sub_languages" in lang_data:
# Add the sub-languages to current_languages.
current_languages.extend(lang_data["sub_languages"].keys())
current_languages.extend(
[f"{sub} {lang_key}" for sub in lang_data["sub_languages"].keys()]
)
else:
# If no sub-languages, add the main language.
current_languages.append(lang_key)
Expand Down
94 changes: 94 additions & 0 deletions src/scribe_data/wikidata/format_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
"""
Formats the data queried from Wikidata for a given language and data type.
.. raw:: html
<!--
* Copyright (C) 2024 Scribe
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
-->
"""

import argparse
import collections

from scribe_data.utils import (
export_formatted_data,
load_queried_data,
remove_queried_data,
)

# Command line options used as the default arguments of format_data.
# --data-type is accepted alongside the original --data_type spelling so the
# flag style matches --dir-path without breaking existing callers.
parser = argparse.ArgumentParser()
parser.add_argument("--dir-path")
parser.add_argument("--language")
parser.add_argument("--data-type", "--data_type", dest="data_type")

# This runs at import time, so unknown arguments must be tolerated: when the
# module is imported by another program (documentation tooling, test runners),
# sys.argv belongs to that program and strict parsing would exit the process.
args, _ = parser.parse_known_args()


def format_data(
    dir_path: str = args.dir_path,
    language: str = args.language,
    data_type: str = args.data_type,
):
    """
    Formats data that has been queried from the Wikidata Query Service.

    Parameters
    ----------
    dir_path : str
        The output directory path for results.

    language : str
        The language for which the data is being loaded.

    data_type : str
        The type of data being loaded (e.g. 'nouns', 'verbs').

    Returns
    -------
    None
        A formatted data file is exported for the given language and data type,
        after which remove_queried_data is called for the same arguments.
    """
    # Only the loaded records are needed; the returned path is not used here.
    data_list, _ = load_queried_data(
        dir_path=dir_path, language=language, data_type=data_type
    )

    data_formatted = {}

    for data_vals in data_list:
        # Records sharing a lexeme ID are merged into a single entry.
        lexeme_forms = data_formatted.setdefault(data_vals["lexemeID"], {})

        # Reverse to make sure that we're getting the same order as the query.
        for k in reversed(data_vals.keys()):
            if k != "lexemeID":
                lexeme_forms[k] = data_vals[k]

    # Sort by lexeme ID so that the exported file has a stable ordering.
    data_formatted = collections.OrderedDict(sorted(data_formatted.items()))

    export_formatted_data(
        dir_path=dir_path,
        formatted_data=data_formatted,
        language=language,
        data_type=data_type,
    )

    remove_queried_data(dir_path=dir_path, language=language, data_type=data_type)


# When executed as a script, format using the values parsed from the command line
# (they are the default arguments of format_data).
if __name__ == "__main__":
    format_data()
Empty file.
Empty file.
Empty file.
Loading

0 comments on commit bf489b7

Please sign in to comment.