From 7668989a4032bef7efa2f259e90dce9637992e7b Mon Sep 17 00:00:00 2001 From: Nico Matentzoglu Date: Fri, 1 Mar 2024 21:34:42 +0200 Subject: [PATCH] Make equivalence test less strict for translations --- src/babelon/translate.py | 27 ++++++++++++++++++++++++++- tests/test_translate.py | 20 +++++++++++++++----- 2 files changed, 41 insertions(+), 6 deletions(-) diff --git a/src/babelon/translate.py b/src/babelon/translate.py index ed92685..45f4b60 100644 --- a/src/babelon/translate.py +++ b/src/babelon/translate.py @@ -2,6 +2,8 @@ import logging import os +import re +import string from typing import Dict, List import llm @@ -222,8 +224,11 @@ def prepare_translation_for_ontology( f"{predicate_id} value for {subject_id} is ambiguous," f"picking first one ({term_metadata[predicate_id]})." ) - if ontology_value != source_value: + if not _is_equivalent_string(ontology_value, source_value): + # If the translated string and the ontology literal are not equivalent, change status: translation_value = row["translation_value"] + # Set the ontology value as the source value, so that the translation profiles are consistent + # With what is in the ontology df_augmented.at[index, "source_value"] = ontology_value new_translation_status = ( "CANDIDATE" if translation_value != "NOT_TRANSLATED" else "NOT_TRANSLATED" @@ -235,6 +240,10 @@ def prepare_translation_for_ontology( f"but {ontology_value} in the ontology." ) output_source_changed_data.append(row) + else: + # Because `_is_equivalent_string` is a bit forgiving, we still want to replace the source value, + # so that the translation profiles are consistent + df_augmented.at[index, "source_value"] = ontology_value else: logging.warning( f"{predicate_id} value for {subject_id} does not exist in ontology. 
" @@ -285,6 +294,22 @@ def prepare_translation_for_ontology( return df_augmented, df_output_source_changed, df_output_not_translated +def _is_equivalent_string(string1, string2): + """Compare two strings after they are whitespace, punctuation and case normalised.""" + + def _normalize(s): + # Remove punctuation + s = s.translate(str.maketrans("", "", string.punctuation)) + # Normalize whitespace and convert to lowercase + return re.sub(r"\s+", " ", s).strip().lower() + + normalized_string1 = _normalize(string1) + normalized_string2 = _normalize(string2) + + # Compare the normalized strings + return normalized_string1 == normalized_string2 + + def _get_metadata_for_term(ontology, term): term_metadata = ontology.entity_metadata_map(term) term_label = ontology.label(term) diff --git a/tests/test_translate.py b/tests/test_translate.py index 568fd2b..1e51048 100644 --- a/tests/test_translate.py +++ b/tests/test_translate.py @@ -4,10 +4,14 @@ import unittest from dotenv import load_dotenv -from oaklib.implementations.pronto.pronto_implementation import ProntoImplementation -from oaklib.resource import OntologyResource +from oaklib import get_adapter -from babelon.translate import OpenAITranslator, prepare_translation_for_ontology, translate_profile +from babelon.translate import ( + OpenAITranslator, + _is_equivalent_string, + prepare_translation_for_ontology, + translate_profile, +) from tests.constants import _create_simple_example_for_testing from tests.test_data import data_dir as test_data_dir from tests.test_data import env_file @@ -40,8 +44,7 @@ def test_translate_profile(self): def test_prepare_translation_for_ontology(self): """Test the update method for babelon profiles.""" test_file = f"{test_data_dir}/hp-testsubset.obo" - resource = OntologyResource(slug=test_file, local=True) - ontology = ProntoImplementation(resource) + ontology = get_adapter(f"pronto:{test_file}") terms = ["HP:0001707"] fields = ["rdfs:label"] df_babelon = 
def test_equivalent_string(self):
    """Test if _is_equivalent_string() catches important cases."""
    # Each case is (string1, string2, expected result). The negative case
    # guards against an implementation that considers everything equivalent.
    cases = [
        ("Hello, my.", "hello my", True),
        ("  Heart   murmur ", "heart murmur", True),
        ("Heart murmur", "Heart failure", False),
    ]
    for string1, string2, expected in cases:
        with self.subTest(string1=string1, string2=string2):
            self.assertEqual(expected, _is_equivalent_string(string1, string2))