Skip to content

Commit

Permalink
test: Add unit tests
Browse files — browse the repository at this point in the history
  • Loading branch information
roquelopez committed Nov 13, 2024
1 parent 8030887 commit 33f0dca
Showing 1 changed file with 74 additions and 33 deletions.
107 changes: 74 additions & 33 deletions tests/test_value_matching.py
Original file line number Diff line number Diff line change
@@ -1,56 +1,97 @@
import unittest
import pandas as pd
from bdikit.value_matching.polyfuzz import (
TFIDFValueMatcher,
EditDistanceValueMatcher,
FastTextValueMatcher,
EmbeddingValueMatcher,
)

from bdikit.value_matching.gene import Gene

def test_textual_transformation_matching():
    """Check the string-similarity matchers on misspelled / decorated fruit names.

    The same scenario is run through ``TFIDFValueMatcher`` and
    ``EditDistanceValueMatcher`` with a shared threshold. Three of the four
    source values have a close textual counterpart among the targets;
    ``"dragon-fruits"`` has none and must be left unmatched.

    Each returned match is read positionally as ``(source, target, score)``.
    """
    threshold = 0.5
    for value_matcher in [
        TFIDFValueMatcher(threshold=threshold),
        EditDistanceValueMatcher(threshold=threshold),
    ]:
        # Attached to every assertion so a failure inside the loop says
        # which matcher produced it.
        matcher_name = type(value_matcher).__name__

        # given
        source_values = ["Red Apple", "Banana", "Oorange", "dragon-fruits"]
        target_values = ["apple", "banana", "orange", "kiwi"]

        # when
        matches = value_matcher.match(source_values, target_values)

        # then
        assert len(matches) == 3, matcher_name

        mapped_matches = {match[0]: (match[1], match[2]) for match in matches}
        assert "dragon-fruits" not in mapped_matches, matcher_name
        assert mapped_matches["Red Apple"][0] == "apple", matcher_name
        assert mapped_matches["Banana"][0] == "banana", matcher_name
        assert mapped_matches["Oorange"][0] == "orange", matcher_name

        scores = [match[2] for match in matches]
        assert all(score > threshold for score in scores), matcher_name

def test_semantic_matching():
    """Check the embedding-based matchers on synonyms with no shared spelling.

    ``FastTextValueMatcher`` (threshold 0.4) and ``EmbeddingValueMatcher``
    (threshold 0.6) are each expected to pair ``"Computer"`` with ``"PC"``
    and ``"Display"`` with ``"Monitor"``, and to leave ``"Pencil"`` unmatched.

    Each returned match is read positionally as ``(source, target, score)``.
    """
    # given
    source_values = ["Computer", "Display", "Pencil"]
    target_values = ["PC", "Monitor", "Football field"]

    for matcher_class, threshold in [
        (FastTextValueMatcher, 0.4),
        (EmbeddingValueMatcher, 0.6),
    ]:
        # Instantiate inside the loop so the second matcher is only built
        # after the first one has passed, as in the sequential version.
        value_matcher = matcher_class(threshold=threshold)
        matcher_name = matcher_class.__name__

        # when
        matches = value_matcher.match(source_values, target_values)

        # then
        assert len(matches) == 2, matcher_name

        mapped_matches = {match[0]: (match[1], match[2]) for match in matches}
        assert "Pencil" not in mapped_matches, matcher_name
        assert mapped_matches["Computer"][0] == "PC", matcher_name
        assert mapped_matches["Display"][0] == "Monitor", matcher_name

        scores = [match[2] for match in matches]
        assert all(score > threshold for score in scores), matcher_name


def test_gene_matching():
    """Check that ``Gene`` pairs the mouse gene IDs with the expected human IDs.

    Two of the three source IDs are expected to be matched; the third source
    ID and the malformed target ``"ENSG000000000000X"`` must not take part in
    any match.

    Each returned match is read positionally as ``(source, target, score)``.
    """
    # given
    gene_matcher = Gene()
    source_values = ["ENSMUSG00000064341", "ENSMUSG00000064345", "ENSMUSG00000064351"]
    target_values = ["ENSG00000198763", "ENSG00000198888", "ENSG000000000000X"]

    # when
    matches = gene_matcher.match(source_values, target_values)

    # then
    assert len(matches) == 2

    mapped_matches = {match[0]: (match[1], match[2]) for match in matches}
    matched_targets = {target for target, _score in mapped_matches.values()}

    # mapped_matches is keyed by *source* values, so the unmatched source is
    # checked against its keys and the bogus target against the matched
    # targets; checking the target against the keys could never fail.
    assert "ENSMUSG00000064351" not in mapped_matches
    assert "ENSG000000000000X" not in matched_targets

    assert mapped_matches["ENSMUSG00000064341"][0] == "ENSG00000198888"
    assert mapped_matches["ENSMUSG00000064345"][0] == "ENSG00000198763"

0 comments on commit 33f0dca

Please sign in to comment.