diff --git a/test/test.py b/test/test.py index cc80238..084606a 100644 --- a/test/test.py +++ b/test/test.py @@ -183,3 +183,61 @@ def test_dl(self): self.assertEqual(dam_lev('bca', 'ab'), 2) self.assertEqual(dam_lev('ab', 'bdca'), 3) self.assertEqual(dam_lev('bdca', 'ab'), 3) + + +class TestClevWithUnicode(unittest.TestCase): + + def setUp(self): + self.iw = np.ones(10001, dtype=np.float64) + self.dw = np.ones(10001, dtype=np.float64) + self.sw = np.ones((10001, 10001), dtype=np.float64) + self.tw = np.ones((10001, 10001), dtype=np.float64) + self.iw[ord("á")] = 2.0 + self.dw[ord("á")] = 2.0 + self.iw[ord("ő")] = 9.0 + self.dw[ord("ő")] = 9.0 + self.iw[ord("Ұ")] = 10.0 + self.dw[ord("Ұ")] = 10.0 + + + def _lev(self, x, y): + return lev(x, y, self.iw, self.dw, self.sw) + + def _osa(self, x, y): + return osa(x, y, self.iw, self.dw, self.sw, self.tw) + + def _dl(self, x, y): + return dam_lev(x, y, self.iw, self.dw, self.sw, self.tw) + + def test_lev(self): + try: + self.assertEqual(self._lev('átívelődök', 'átívelődök'), 0.0) + self.assertEqual(self._lev('', 'átívelődök'), 19.0) + self.assertEqual(self._lev('átívelődök', ''), 19.0) + self.assertEqual(self._lev('', ''), 0.0) + self.assertEqual(self._lev('átívelődök', 'átívelőd'), 2.0) + self.assertEqual(self._lev('', 'ҰǴʚΏ¤☣✐'), 16.0) + except UnicodeEncodeError: + self.fail("Could not handle special characters") + + def test_osa(self): + try: + self.assertEqual(self._osa('átívelődök', 'átívelődök'), 0.0) + self.assertEqual(self._osa('', 'átívelődök'), 19.0) + self.assertEqual(self._osa('átívelődök', ''), 19.0) + self.assertEqual(self._osa('', ''), 0.0) + self.assertEqual(self._osa('átívelődök', 'átívelőd'), 2.0) + self.assertEqual(self._osa('', 'ҰǴʚΏ¤☣✐'), 16.0) + except UnicodeEncodeError: + self.fail("Could not handle special characters") + + def test_dl(self): + try: + self.assertEqual(self._dl('átívelődök', 'átívelődök'), 0.0) + self.assertEqual(self._dl('', 'átívelődök'), 19.0) + self.assertEqual(self._dl('átívelődök', ''), 19.0) + self.assertEqual(self._dl('', ''), 0.0) + self.assertEqual(self._dl('átívelődök', 'átívelőd'), 2.0) + self.assertEqual(self._dl('', 'ҰǴʚΏ¤☣✐'), 16.0) + except UnicodeEncodeError: + self.fail("Could not handle special characters") diff --git a/weighted_levenshtein/clev.pxd b/weighted_levenshtein/clev.pxd index 9ebf499..9c19f29 100644 --- a/weighted_levenshtein/clev.pxd +++ b/weighted_levenshtein/clev.pxd @@ -3,7 +3,39 @@ from libc.float cimport DBL_MAX as DTYPE_MAX ctypedef double DTYPE_t cdef enum: - ALPHABET_SIZE = 128 + ALPHABET_SIZE = 512 + + +cdef DTYPE_t c_damerau_levenshtein_unicode( + unsigned int* word_m, + Py_ssize_t len1, + unsigned int* word_n, + Py_ssize_t len2, + DTYPE_t[::1] insert_costs, + DTYPE_t[::1] delete_costs, + DTYPE_t[:,::1] substitute_costs, + DTYPE_t[:,::1] transpose_costs) nogil + + +cdef DTYPE_t c_optimal_string_alignment_unicode( + unsigned int* word_m, + Py_ssize_t len1, + unsigned int* word_n, + Py_ssize_t len2, + DTYPE_t[::1] insert_costs, + DTYPE_t[::1] delete_costs, + DTYPE_t[:,::1] substitute_costs, + DTYPE_t[:,::1] transpose_costs) nogil + + +cdef DTYPE_t c_levenshtein_unicode( + unsigned int* word_m, + Py_ssize_t len1, + unsigned int* word_n, + Py_ssize_t len2, + DTYPE_t[::1] insert_costs, + DTYPE_t[::1] delete_costs, + DTYPE_t[:,::1] substitute_costs) nogil cdef DTYPE_t c_damerau_levenshtein( @@ -36,4 +68,3 @@ cdef DTYPE_t c_levenshtein( DTYPE_t[::1] insert_costs, DTYPE_t[::1] delete_costs, DTYPE_t[:,::1] substitute_costs) nogil - diff --git a/weighted_levenshtein/clev.pyx b/weighted_levenshtein/clev.pyx index 9df89cd..fba8129 100644 --- a/weighted_levenshtein/clev.pyx +++ b/weighted_levenshtein/clev.pyx @@ -1,10 +1,10 @@ #!python -# cython: language_level=3, boundscheck=False, wraparound=False, embedsignature=True, linetrace=True, c_string_type=str, c_string_encoding=ascii -# distutils: define_macros=CYTHON_TRACE_NOGIL=1 +# cython: language_level=3, boundscheck=False, wraparound=False, embedsignature=True, linetrace=True, c_string_type=unicode, c_string_encoding=utf8 from libc.stdlib cimport malloc, free from cython.view cimport array as cvarray from .clev cimport DTYPE_t, DTYPE_MAX, ALPHABET_SIZE +import numpy as np cyarr = cvarray(shape=(ALPHABET_SIZE,), itemsize=sizeof(double), format="d") @@ -132,11 +132,20 @@ cdef inline DTYPE_t row_insert_range_cost( # End Array2D +cdef unsigned int* convert_string_to_int_array(unsigned char* str, Py_ssize_t size): + cdef unsigned int* intarr = malloc(size * sizeof(unsigned int)) + for i, c in enumerate(str): + intarr[i] = ord(c) + return intarr -cdef inline unsigned char str_1_get(unsigned char* s, Py_ssize_t i) nogil: +cdef void copy_str_to_int_arr(unsigned char* str, Py_ssize_t len, unsigned int* int_arr) nogil: + for i in range(len): + int_arr[i] = str[i] + +cdef inline unsigned int int_array_1_get(unsigned int* s, Py_ssize_t i) nogil: """ - Takes an index of a 1-indexed string - and returns that character + Takes an index of a 1-indexed int array + and returns that number """ return s[i - 1] @@ -179,24 +188,33 @@ def damerau_levenshtein( if transpose_costs is None: transpose_costs = unit_matrix - s1 = str(str1).encode() - s2 = str(str2).encode() + s1 = str1.encode('utf-8').decode('utf-8') + len1 = len(s1) + intarr1 = convert_string_to_int_array(str1, len1) + + s2 = str2.encode('utf-8').decode('utf-8') + len2 = len(s2) + intarr2 = convert_string_to_int_array(str2, len2) - return c_damerau_levenshtein( - s1, len(s1), - s2, len(s2), - insert_costs, - delete_costs, - substitute_costs, - transpose_costs - ) + cdef DTYPE_t result = c_damerau_levenshtein_unicode( + intarr1, len1, + intarr2, len2, + insert_costs, + delete_costs, + substitute_costs, + transpose_costs + ) + + free(intarr1) + free(intarr2) + return result dam_lev = damerau_levenshtein -cdef DTYPE_t c_damerau_levenshtein( - unsigned char* str1, Py_ssize_t len1, - unsigned char* str2, Py_ssize_t len2, +cdef DTYPE_t c_damerau_levenshtein_unicode( + unsigned int* str1, Py_ssize_t len1, + unsigned int* str2, Py_ssize_t len2, DTYPE_t[::1] insert_costs, DTYPE_t[::1] delete_costs, DTYPE_t[:,::1] substitute_costs, @@ -208,12 +226,16 @@ cdef DTYPE_t c_damerau_levenshtein( Py_ssize_t[ALPHABET_SIZE] da Py_ssize_t i, j - unsigned char char_i, char_j + unsigned int char_i, char_j DTYPE_t cost, ret_val Py_ssize_t db, k, l - Array2D d + DTYPE_t substitute_cost + DTYPE_t insert_cost + DTYPE_t delete_cost + DTYPE_t transpose_cost + Array2D_init(&d, len1 + 2, len2 + 2) # initialize 'da' to all 0 @@ -230,21 +252,20 @@ cdef DTYPE_t c_damerau_levenshtein( # fill row 0 and column 0 with insertion and deletion costs Array2D_n1_at(d, 0, 0)[0] = 0 for i in range(1, len1 + 1): - char_i = str_1_get(str1, i) + char_i = int_array_1_get(str1, i) cost = delete_costs[char_i] Array2D_n1_at(d, i, 0)[0] = Array2D_n1_get(d, i - 1, 0) + cost for j in range(1, len2 + 1): - char_j = str_1_get(str2, j) + char_j = int_array_1_get(str2, j) cost = insert_costs[char_j] Array2D_n1_at(d, 0, j)[0] = Array2D_n1_get(d, 0, j - 1) + cost # fill DP array for i in range(1, len1 + 1): - char_i = str_1_get(str1, i) - + char_i = int_array_1_get(str1, i) db = 0 for j in range(1, len2 + 1): - char_j = str_1_get(str2, j) + char_j = int_array_1_get(str2, j) k = da[char_j] l = db @@ -254,14 +275,25 @@ cdef DTYPE_t c_damerau_levenshtein( else: cost = substitute_costs[char_i, char_j] + substitute_cost = Array2D_n1_get(d, i - 1, j - 1) + cost + insert_cost = Array2D_n1_get(d, i, j - 1) + insert_costs[char_j] + delete_cost = Array2D_n1_get(d, i - 1, j) + delete_costs[char_i] + if k <= 0: + # char_j hasn't been seen yet, so nothing to swap + transpose_cost = DTYPE_MAX + else: + # char_j has been seen, swap with char_i + transpose_cost = \ + Array2D_n1_get(d, k - 1, l - 1) + \ + col_delete_range_cost(d, k + 1, i - 1) + \ + transpose_costs[char_j, char_i] + \ + row_insert_range_cost(d, l + 1, j - 1) + Array2D_n1_at(d, i, j)[0] = min( - Array2D_n1_get(d, i - 1, j - 1) + cost, # equal/substitute - Array2D_n1_get(d, i, j - 1) + insert_costs[char_j], # insert - Array2D_n1_get(d, i - 1, j) + delete_costs[char_i], # delete - Array2D_n1_get(d, k - 1, l - 1) + # transpose - col_delete_range_cost(d, k + 1, i - 1) + # delete chars in between - transpose_costs[str_1_get(str1, k), str_1_get(str1, i)] + # transpose chars - row_insert_range_cost(d, l + 1, j - 1) # insert chars in between + substitute_cost, + insert_cost, + delete_cost, + transpose_cost ) da[char_i] = i @@ -307,24 +339,33 @@ def optimal_string_alignment( if transpose_costs is None: transpose_costs = unit_matrix - s1 = str(str1).encode() - s2 = str(str2).encode() + s1 = str1.encode('utf-8').decode('utf-8') + len1 = len(s1) + intarr1 = convert_string_to_int_array(str1, len1) - return c_optimal_string_alignment( - s1, len(s1), - s2, len(s2), - insert_costs, - delete_costs, - substitute_costs, - transpose_costs - ) + s2 = str2.encode('utf-8').decode('utf-8') + len2 = len(s2) + intarr2 = convert_string_to_int_array(str2, len2) + + cdef DTYPE_t result = c_optimal_string_alignment_unicode( + intarr1, len1, + intarr2, len2, + insert_costs, + delete_costs, + substitute_costs, + transpose_costs + ) + + free(intarr1) + free(intarr2) + return result osa = optimal_string_alignment -cdef DTYPE_t c_optimal_string_alignment( - unsigned char* str1, Py_ssize_t len1, - unsigned char* str2, Py_ssize_t len2, +cdef DTYPE_t c_optimal_string_alignment_unicode( + unsigned int* str1, Py_ssize_t len1, + unsigned int* str2, Py_ssize_t len2, DTYPE_t[::1] insert_costs, DTYPE_t[::1] delete_costs, DTYPE_t[:,::1] substitute_costs, @@ -334,7 +375,7 @@ cdef DTYPE_t c_optimal_string_alignment( """ cdef: Py_ssize_t i, j - unsigned char char_i, char_j, prev_char_i, prev_char_j + unsigned int char_i, char_j, prev_char_i, prev_char_j DTYPE_t ret_val Array2D d @@ -343,17 +384,17 @@ cdef DTYPE_t c_optimal_string_alignment( # fill row 0 and column 0 with insertion and deletion costs Array2D_0_at(d, 0, 0)[0] = 0 for i in range(1, len1 + 1): - char_i = str_1_get(str1, i) + char_i = int_array_1_get(str1, i) Array2D_0_at(d, i, 0)[0] = Array2D_0_get(d, i - 1, 0) + delete_costs[char_i] for j in range(1, len2 + 1): - char_j = str_1_get(str2, j) + char_j = int_array_1_get(str2, j) Array2D_0_at(d, 0, j)[0] = Array2D_0_get(d, 0, j - 1) + insert_costs[char_j] # fill DP array for i in range(1, len1 + 1): - char_i = str_1_get(str1, i) + char_i = int_array_1_get(str1, i) for j in range(1, len2 + 1): - char_j = str_1_get(str2, j) + char_j = int_array_1_get(str2, j) if char_i == char_j: # match Array2D_0_at(d, i, j)[0] = Array2D_0_get(d, i - 1, j - 1) else: @@ -364,8 +405,8 @@ cdef DTYPE_t c_optimal_string_alignment( ) if i > 1 and j > 1: - prev_char_i = str_1_get(str1, i - 1) - prev_char_j = str_1_get(str2, j - 1) + prev_char_i = int_array_1_get(str1, i - 1) + prev_char_j = int_array_1_get(str2, j - 1) if char_i == prev_char_j and prev_char_i == char_j: # transpose Array2D_0_at(d, i, j)[0] = min( Array2D_0_get(d, i, j), @@ -408,23 +449,33 @@ def levenshtein( if substitute_costs is None: substitute_costs = unit_matrix - s1 = str(str1).encode() - s2 = str(str2).encode() + s1 = str1.encode('utf-8').decode('utf-8') + len1 = len(s1) + intarr1 = convert_string_to_int_array(str1, len1) + + s2 = str2.encode('utf-8').decode('utf-8') + len2 = len(s2) + intarr2 = convert_string_to_int_array(str2, len2) + + cdef DTYPE_t result = c_levenshtein_unicode( + intarr1, len1, + intarr2, len2, + insert_costs, + delete_costs, + substitute_costs + ) + + free(intarr1) + free(intarr2) + return result - return c_levenshtein( - s1, len(s1), - s2, len(s2), - insert_costs, - delete_costs, - substitute_costs - ) lev = levenshtein -cdef DTYPE_t c_levenshtein( - unsigned char* str1, Py_ssize_t len1, - unsigned char* str2, Py_ssize_t len2, +cdef DTYPE_t c_levenshtein_unicode( + unsigned int* str1, Py_ssize_t len1, + unsigned int* str2, Py_ssize_t len2, DTYPE_t[::1] insert_costs, DTYPE_t[::1] delete_costs, DTYPE_t[:,::1] substitute_costs) nogil: @@ -433,7 +484,7 @@ cdef DTYPE_t c_levenshtein( """ cdef: Py_ssize_t i, j - unsigned char char_i, char_j + unsigned int char_i, char_j DTYPE_t ret_val Array2D d @@ -441,16 +492,16 @@ cdef DTYPE_t c_levenshtein( Array2D_0_at(d, 0, 0)[0] = 0 for i in range(1, len1 + 1): - char_i = str_1_get(str1, i) + char_i = int_array_1_get(str1, i) Array2D_0_at(d, i, 0)[0] = Array2D_0_get(d, i - 1, 0) + delete_costs[char_i] for j in range(1, len2 + 1): - char_j = str_1_get(str2, j) + char_j = int_array_1_get(str2, j) Array2D_0_at(d, 0, j)[0] = Array2D_0_get(d, 0, j - 1) + insert_costs[char_j] for i in range(1, len1 + 1): - char_i = str_1_get(str1, i) + char_i = int_array_1_get(str1, i) for j in range(1, len2 + 1): - char_j = str_1_get(str2, j) + char_j = int_array_1_get(str2, j) if char_i == char_j: # match Array2D_0_at(d, i, j)[0] = Array2D_0_get(d, i - 1, j - 1) else: @@ -463,3 +514,65 @@ cdef DTYPE_t c_levenshtein( ret_val = Array2D_0_get(d, len1, len2) Array2D_del(d) return ret_val + +# Legacy code + +cdef DTYPE_t c_damerau_levenshtein( + unsigned char* str1, Py_ssize_t len1, + unsigned char* str2, Py_ssize_t len2, + DTYPE_t[::1] insert_costs, + DTYPE_t[::1] delete_costs, + DTYPE_t[:,::1] substitute_costs, + DTYPE_t[:,::1] transpose_costs) nogil: + + cdef unsigned int* int_arr1 = malloc(len1 * sizeof(unsigned int)) + copy_str_to_int_arr(str1, len1, int_arr1) + cdef unsigned int* int_arr2 = malloc(len2 * sizeof(unsigned int)) + copy_str_to_int_arr(str2, len2, int_arr2) + + cdef DTYPE_t result = c_damerau_levenshtein_unicode(int_arr1, len1, int_arr2, len2, + insert_costs, delete_costs, substitute_costs, transpose_costs) + + free(int_arr1) + free(int_arr2) + return result + + +cdef DTYPE_t c_optimal_string_alignment( + unsigned char* str1, Py_ssize_t len1, + unsigned char* str2, Py_ssize_t len2, + DTYPE_t[::1] insert_costs, + DTYPE_t[::1] delete_costs, + DTYPE_t[:,::1] substitute_costs, + DTYPE_t[:,::1] transpose_costs) nogil: + + cdef unsigned int* int_arr1 = malloc(len1 * sizeof(unsigned int)) + copy_str_to_int_arr(str1, len1, int_arr1) + cdef unsigned int* int_arr2 = malloc(len2 * sizeof(unsigned int)) + copy_str_to_int_arr(str2, len2, int_arr2) + + cdef DTYPE_t result = c_optimal_string_alignment_unicode(int_arr1, len1, int_arr2, len2, + insert_costs, delete_costs, substitute_costs, transpose_costs) + + free(int_arr1) + free(int_arr2) + return result + +cdef DTYPE_t c_levenshtein( + unsigned char* str1, Py_ssize_t len1, + unsigned char* str2, Py_ssize_t len2, + DTYPE_t[::1] insert_costs, + DTYPE_t[::1] delete_costs, + DTYPE_t[:,::1] substitute_costs) nogil: + + cdef unsigned int* int_arr1 = malloc(len1 * sizeof(unsigned int)) + copy_str_to_int_arr(str1, len1, int_arr1) + cdef unsigned int* int_arr2 = malloc(len2 * sizeof(unsigned int)) + copy_str_to_int_arr(str2, len2, int_arr2) + + cdef DTYPE_t result = c_levenshtein_unicode(int_arr1, len1, int_arr2, len2, + insert_costs, delete_costs, substitute_costs) + + free(int_arr1) + free(int_arr2) + return result