Skip to content

Commit

Permalink
Merge pull request #51 from monarch-initiative/add-merge-update-feature
Browse files Browse the repository at this point in the history
Add a feature to update translations during merge
souzadevinicius authored Oct 21, 2024
2 parents d1ed9c6 + 219b633 commit 5048936
Showing 5 changed files with 540 additions and 467 deletions.
959 changes: 500 additions & 459 deletions poetry.lock

Large diffs are not rendered by default.

38 changes: 31 additions & 7 deletions src/babelon/cli.py
Original file line number Diff line number Diff line change
@@ -15,10 +15,10 @@
from babelon.translate import prepare_translation_for_ontology, translate_profile
from babelon.translation_profile import statistics_translation_profile
from babelon.utils import (
assemble_xliff_file,
drop_unknown_columns_babelon,
sort_babelon,
generate_translation_units,
assemble_xliff_file,
sort_babelon,
)

info_log = logging.getLogger()
@@ -281,8 +281,15 @@ def statistics_translation_profile_command(
@multiple_inputs_argument
@drop_unknown_column_option
@sort_table_option
@click.option(
"--update-translations",
type=bool,
default=False,
help="""If true, duplicate translations for the same term and property are merged.
"Translated values provided by a later babelon file are considered newer and will take precedence.""",
)
@output_option
def merge(inputs, sort_tables, drop_unknown_columns, output):
def merge(inputs, sort_tables, drop_unknown_columns, update_translations, output):
"""Merge multiple babelon TSV files into one.
Example:
@@ -294,7 +301,22 @@ def merge(inputs, sort_tables, drop_unknown_columns, output):
# Loop through the rest of the input files and concatenate each DataFrame
for input_file in inputs[1:]:
df_temp = pd.read_csv(input_file, sep="\t")
df = pd.concat([df, df_temp], axis=0, ignore_index=True)
if update_translations:
merge_keys = ["source_language", "translation_language", "subject_id", "predicate_id"]

# Create a temporary key for merging
df["temp_key"] = df[merge_keys].apply(lambda x: "_".join(x.astype(str)), axis=1)
df_temp["temp_key"] = df_temp[merge_keys].apply(
lambda x: "_".join(x.astype(str)), axis=1
)

# Remove rows from df that exist in df_temp (based on the merge keys)
df = df[~df["temp_key"].isin(df_temp["temp_key"])]

df = pd.concat([df, df_temp], axis=0, ignore_index=True)
df = df.drop("temp_key", axis=1)
else:
df = pd.concat([df, df_temp], axis=0, ignore_index=True)

if sort_tables:
df = sort_babelon(df)
@@ -321,13 +343,15 @@ def prepare_ontology_for_crowdin(oak_adapter, top_level_term, output):
"""Merge multiple babelon TSV files into one.
Example:
babelon prepare-ontology-for-crowdin --oak-adapter simpleobo:hp-base.obo --top-level-term HP:0000001 -o hp-translation.xliff
babelon prepare-ontology-for-crowdin
--oak-adapter simpleobo:hp-base.obo
--top-level-term HP:0000001 -o hp-translation.xliff
""" # noqa: DAR101
adapter = get_adapter(oak_adapter)
translation_units = list()

for top_level_term in top_level_term:
for term in adapter.descendants(top_level_term, predicates=[IS_A]):
for t in top_level_term:
for term in adapter.descendants(t, predicates=[IS_A]):
label = adapter.label(term)
definition = adapter.definition(term)
alias_map = adapter.entity_alias_map(term)
2 changes: 1 addition & 1 deletion src/babelon/utils.py
Original file line number Diff line number Diff line change
@@ -7,8 +7,8 @@
from pathlib import Path
from string import punctuation
from typing import TextIO, Union
from xml.etree.ElementTree import Element, SubElement, tostring, fromstring
from xml.dom import minidom
from xml.etree.ElementTree import Element, SubElement, fromstring, tostring

import pandas as pd
import validators
4 changes: 4 additions & 0 deletions tests/data/translations/hp-cs-small.babelon.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
source_language translation_language subject_id predicate_id source_value translation_value translation_status
en cs HP:0032768 rdfs:label Focal aware autonomic seizure with pupillary dilation/constriction Fokální autonomní záchvat s dilatací/konstrikcí zornic bez poruchy vědomí CANDIDATE
en cs HP:0032768 IAO:0000115 A focal autonomic seizure with pupillary dilation / constriction characterized by retained awareness throughout the seizure Fokální autonomní záchvat s dilatací/konstrikcí zornic charakterizovaný zachovaným vědomím po celou dobu záchvatu. CANDIDATE
en cs HP:0000001 rdfs:label All Všo OFFICIAL
4 changes: 4 additions & 0 deletions tests/data/translations/hp-cs-update.babelon.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
source_language translation_language subject_id predicate_id source_value translation_value translation_status
en cs HP:0032768 rdfs:label Focal aware autonomic seizure with pupillary dilation/constriction Fokální autonomní záchvat s dilatací/konstrikcí zornic bez poruchy vědomí CANDIDATE
en cs HP:0032768 IAO:0000115 A focal autonomic seizure with pupillary dilation / constriction characterized by retained awareness throughout the seizure Fokální autonomní záchvat s dilatací/konstrikcí zornic charakterizovaný zachovaným vědomím po celou dobu záchvatu. CANDIDATE
en cs HP:0000001 rdfs:label All Vše OFFICIAL

0 comments on commit 5048936

Please sign in to comment.