From 7668989a4032bef7efa2f259e90dce9637992e7b Mon Sep 17 00:00:00 2001
From: Nico Matentzoglu <nicolas.matentzoglu@gmail.com>
Date: Fri, 1 Mar 2024 21:34:42 +0200
Subject: [PATCH] Make equivalence test less strict for translations

---
 src/babelon/translate.py | 27 ++++++++++++++++++++++++++-
 tests/test_translate.py  | 20 +++++++++++++++-----
 2 files changed, 41 insertions(+), 6 deletions(-)

diff --git a/src/babelon/translate.py b/src/babelon/translate.py
index ed92685..45f4b60 100644
--- a/src/babelon/translate.py
+++ b/src/babelon/translate.py
@@ -2,6 +2,8 @@
 
 import logging
 import os
+import re
+import string
 from typing import Dict, List
 
 import llm
@@ -222,8 +224,11 @@ def prepare_translation_for_ontology(
                     f"{predicate_id} value for {subject_id} is ambiguous,"
                     f"picking first one ({term_metadata[predicate_id]})."
                 )
-            if ontology_value != source_value:
+            if not _is_equivalent_string(ontology_value, source_value):
+                # If the source string and the ontology literal are not equivalent, change status:
                 translation_value = row["translation_value"]
+                # Set the ontology value as the source value, so that the translation profiles are consistent
+                # with what is in the ontology.
                 df_augmented.at[index, "source_value"] = ontology_value
                 new_translation_status = (
                     "CANDIDATE" if translation_value != "NOT_TRANSLATED" else "NOT_TRANSLATED"
@@ -235,6 +240,10 @@ def prepare_translation_for_ontology(
                     f"but {ontology_value} in the ontology."
                 )
                 output_source_changed_data.append(row)
+            else:
+                # Because `_is_equivalent_string` is a bit forgiving, we still want to replace the source value,
+                # so that the translation profiles are consistent
+                df_augmented.at[index, "source_value"] = ontology_value
         else:
             logging.warning(
                 f"{predicate_id} value for {subject_id} does not exist in ontology. "
@@ -285,6 +294,22 @@ def prepare_translation_for_ontology(
     return df_augmented, df_output_source_changed, df_output_not_translated
 
 
+def _is_equivalent_string(string1, string2):
+    """Return True if both values match after whitespace, punctuation and case normalisation."""
+
+    def _normalize(s):
+        # Non-string values (e.g. None or a pandas NaN) cannot be normalised; compare them as-is
+        if not isinstance(s, str):
+            return s
+        # Remove ASCII punctuation
+        s = s.translate(str.maketrans("", "", string.punctuation))
+        # Normalize whitespace and convert to lowercase
+        return re.sub(r"\s+", " ", s).strip().lower()
+
+    # Non-strings fall back to plain equality, so NaN never equals anything (as with the old `!=`)
+    return _normalize(string1) == _normalize(string2)
+
+
 def _get_metadata_for_term(ontology, term):
     term_metadata = ontology.entity_metadata_map(term)
     term_label = ontology.label(term)
diff --git a/tests/test_translate.py b/tests/test_translate.py
index 568fd2b..1e51048 100644
--- a/tests/test_translate.py
+++ b/tests/test_translate.py
@@ -4,10 +4,14 @@
 import unittest
 
 from dotenv import load_dotenv
-from oaklib.implementations.pronto.pronto_implementation import ProntoImplementation
-from oaklib.resource import OntologyResource
+from oaklib import get_adapter
 
-from babelon.translate import OpenAITranslator, prepare_translation_for_ontology, translate_profile
+from babelon.translate import (
+    OpenAITranslator,
+    _is_equivalent_string,
+    prepare_translation_for_ontology,
+    translate_profile,
+)
 from tests.constants import _create_simple_example_for_testing
 from tests.test_data import data_dir as test_data_dir
 from tests.test_data import env_file
@@ -40,8 +44,7 @@ def test_translate_profile(self):
     def test_prepare_translation_for_ontology(self):
         """Test the update method for babelon profiles."""
         test_file = f"{test_data_dir}/hp-testsubset.obo"
-        resource = OntologyResource(slug=test_file, local=True)
-        ontology = ProntoImplementation(resource)
+        ontology = get_adapter(f"pronto:{test_file}")
         terms = ["HP:0001707"]
         fields = ["rdfs:label"]
         df_babelon = _create_simple_example_for_testing()
@@ -59,3 +62,10 @@ def test_prepare_translation_for_ontology(self):
             ["HP:0001945", "HP:0001297", "HP:0001707"],
             df_output_not_translated["subject_id"].tolist(),
         )
+
+    def test_equivalent_string(self):
+        """Test that _is_equivalent_string() ignores only case, punctuation and extra whitespace."""
+        self.assertTrue(_is_equivalent_string("Hello, my.", "hello  my"))
+        self.assertTrue(_is_equivalent_string("Abnormal  heart\tmorphology", "abnormal heart morphology"))
+        # Genuinely different wording must still be reported as not equivalent
+        self.assertFalse(_is_equivalent_string("Hello, my.", "hello me"))