Add LCS Based algorithm that finds similar strings. #50

Open · wants to merge 3 commits into base: main

Changes from all commits
Cargo.toml (1 addition, 1 deletion)

@@ -4,7 +4,7 @@ version = "0.10.0"
authors = ["Danny Guo <[email protected]>"]
description = """
Implementations of string similarity metrics. Includes Hamming, Levenshtein,
-OSA, Damerau-Levenshtein, Jaro, Jaro-Winkler, and Sørensen-Dice.
+OSA, Damerau-Levenshtein, Jaro, Jaro-Winkler, Sørensen-Dice, and an LCS-based algorithm.
"""
license = "MIT"
readme = "README.md"
README.md (5 additions, 1 deletion)

@@ -12,6 +12,7 @@
- [Damerau-Levenshtein] - distance & normalized
- [Jaro and Jaro-Winkler] - this implementation of Jaro-Winkler does not limit the common prefix length
- [Sørensen-Dice]
- [LCS based algorithm] - computes the LCS length with a variant that runs in O(n * m) time and O(min(n, m)) memory

The normalized versions return values between `0.0` and `1.0`, where `1.0` means
an exact match.
@@ -39,7 +40,7 @@ extern crate strsim;

use strsim::{hamming, levenshtein, normalized_levenshtein, osa_distance,
             damerau_levenshtein, normalized_damerau_levenshtein, jaro,
-             jaro_winkler, sorensen_dice};
+             jaro_winkler, sorensen_dice, lcs_normalized};

fn main() {
    match hamming("hamming", "hammers") {
@@ -66,6 +67,8 @@ fn main() {

    assert_eq!(sorensen_dice("web applications", "applications of the web"),
               0.7878787878787878);

    assert!(lcs_normalized("foobar", "ofobar") > 0.8);
}
```

@@ -99,4 +102,5 @@ Benchmarks require a Nightly toolchain. Run `$ cargo +nightly bench`.
[Hamming]:http://en.wikipedia.org/wiki/Hamming_distance
[Optimal string alignment]:https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Optimal_string_alignment_distance
[Sørensen-Dice]:http://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient
[LCS based algorithm]:https://en.wikipedia.org/wiki/Longest_common_subsequence_problem
[Docker]:https://docs.docker.com/engine/installation/
benches/benches.rs (9 additions, 0 deletions)

@@ -97,4 +97,13 @@ mod benches {
            strsim::sorensen_dice(&a, &b);
        })
    }

    #[bench]
    fn bench_lcs_normalized(bencher: &mut Bencher) {
        let a = "Philosopher Friedrich Nietzsche";
        let b = "Philosopher Jean-Paul Sartre";
        bencher.iter(|| {
            strsim::lcs_normalized(&a, &b);
        })
    }
}
src/lib.rs (111 additions, 0 deletions)

@@ -464,6 +464,50 @@ pub fn sorensen_dice(a: &str, b: &str) -> f64 {
    (2 * intersection_size) as f64 / (a.len() + b.len() - 2) as f64
}

/// Uses the LCS algorithm to find the longest common subsequence
/// and then divides its length by the length of the longest string.
/// ```
/// use strsim::lcs_normalized;
///
/// assert_eq!(1.0, lcs_normalized("", ""));
/// assert_eq!(0.0, lcs_normalized("", "umbrella"));
/// assert_eq!(0.8, lcs_normalized("night", "fight"));
/// assert_eq!(1.0, lcs_normalized("ferris", "ferris"));
/// ```
pub fn lcs_normalized(left: impl AsRef<str>, right: impl AsRef<str>) -> f64 {
Member:

1. This should follow the interface of the other functions in the library, which accept &str.
2. Both the normalized and non-normalized versions should be public.
3. Possibly name it lcs_seq instead of lcs, since "LCS" can mean both longest common subsequence and longest common substring, which are different metrics with the same abbreviation.
4. We would probably want a generic version of the algorithm, similar to the other metrics.
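To make points 1-3 concrete, here is a rough sketch of that shape (editor's illustration, not code from the PR): `&str` parameters, both functions public, and the `lcs_seq` naming; the body is a plain full-table DP rather than the PR's two-row variant.

```rust
use std::cmp::max;

/// Raw longest-common-subsequence length, public alongside the normalized form.
/// Iterates over chars, matching the rest of the library.
pub fn lcs_seq(a: &str, b: &str) -> usize {
    let a_chars: Vec<char> = a.chars().collect();
    let b_chars: Vec<char> = b.chars().collect();
    let mut dp = vec![vec![0usize; b_chars.len() + 1]; a_chars.len() + 1];
    for (i, &ac) in a_chars.iter().enumerate() {
        for (j, &bc) in b_chars.iter().enumerate() {
            dp[i + 1][j + 1] = if ac == bc {
                dp[i][j] + 1
            } else {
                max(dp[i][j + 1], dp[i + 1][j])
            };
        }
    }
    dp[a_chars.len()][b_chars.len()]
}

/// Normalized variant: LCS length divided by the longer string's char count.
pub fn lcs_seq_normalized(a: &str, b: &str) -> f64 {
    let longest = max(a.chars().count(), b.chars().count());
    if longest == 0 {
        1.0 // two empty strings are a perfect match
    } else {
        lcs_seq(a, b) as f64 / longest as f64
    }
}
```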

    let (len1, len2) = (left.as_ref().len(), right.as_ref().len());
Member:

Using len here is incorrect, since we operate on chars and not on bytes. It would need to use .chars().count().
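For illustration (editor's note): `.len()` counts UTF-8 bytes, while `.chars()` yields Unicode scalar values, so the two disagree on non-ASCII input such as the "löwenbräu" string used in the tests below.

```rust
fn main() {
    let s = "löwenbräu";
    assert_eq!(s.len(), 11);          // UTF-8 bytes: 'ö' and 'ä' take two bytes each
    assert_eq!(s.chars().count(), 9); // chars, which is what the DP iterates over
}
```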

    let lcs_len = lcs_length(left.as_ref(), right.as_ref());
    let size = max(len1, len2);
    // Empty strings should match
    if size == 0 { 1.0 } else { lcs_len as f64 / size as f64 }
}

#[inline]
fn get_shorter_longer_strings(left: impl AsRef<str>, right: impl AsRef<str>) -> (String, String) {
    if left.as_ref().len() < right.as_ref().len() {
        (left.as_ref().to_string(), right.as_ref().to_string())
    } else {
        (right.as_ref().to_string(), left.as_ref().to_string())
    }
}

#[inline]
fn lcs_length(left: impl AsRef<str>, right: impl AsRef<str>) -> usize {
    let (left, right) = get_shorter_longer_strings(left, right);
Member:

This leads to an extra allocation of two Strings. You should be able to switch them without this. Then again, I am not sure we even want to swap them, since we have a large focus on binary size.

People who care about performance and not so much about binary size should use https://docs.rs/rapidfuzz/latest/rapidfuzz/distance/lcs_seq/index.html, which is significantly faster.
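A possible shape for the allocation-free swap (editor's sketch; the helper name is hypothetical, and it compares char counts per the comment above): reorder the borrowed `&str`s instead of building owned `String`s.

```rust
fn shorter_longer<'a>(left: &'a str, right: &'a str) -> (&'a str, &'a str) {
    // Reorder the borrows; nothing is copied or allocated.
    if left.chars().count() < right.chars().count() {
        (left, right)
    } else {
        (right, left)
    }
}
```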

    let mut table = vec![vec![0 as usize; left.len() + 1]; 2];
Member:

This should use the char count as well.
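Presumably something like this (editor's sketch of the suggested fix, wrapped in a hypothetical helper so it compiles on its own):

```rust
fn make_rows(left: &str) -> Vec<Vec<usize>> {
    // Size each row by the char count, not the byte count.
    vec![vec![0usize; left.chars().count() + 1]; 2]
}
```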

    for rletter in right.chars() {
        for (col, lletter) in left.chars().enumerate() {
            if rletter == lletter {
                table[1][col + 1] = 1 + table[0][col];
            } else {
                table[1][col + 1] = max(table[0][col + 1], table[1][col]);
            }
Member (on lines +500 to +504):

In Rust I would probably use something like:

Suggested change:

            table[1][col + 1] = if rletter == lletter {
                1 + table[0][col]
            } else {
                max(table[0][col + 1], table[1][col])
            };

instead.

        }
        table[0] = table.pop().unwrap();
        table.push(vec![0 as usize; left.len() + 1]);
Member (on lines +506 to +507):

This means we reallocate on each iteration. Instead you should simply swap the rows: https://github.com/dguo/strsim-rs/blob/1d92c1d51c6118cd95d7417a6dcbd25abb9c36c0/src/lib.rs#L331

Then again, I feel like this should be possible using a single vector, as long as table[0][col] is stored in the previous iteration before overwriting it. Similar to https://github.com/dguo/strsim-rs/blob/1d92c1d51c6118cd95d7417a6dcbd25abb9c36c0/src/lib.rs#L255-L257
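A sketch of the single-vector idea described above (editor's illustration, not the PR's code): keep one row and stash the old `table[0][col]` in a local before overwriting the cell above it.

```rust
use std::cmp::max;

// Single-row LCS length; assumes the caller passes the shorter string first.
fn lcs_length_single_row(shorter: &str, longer: &str) -> usize {
    let width = shorter.chars().count();
    let mut row = vec![0usize; width + 1];
    for lletter in longer.chars() {
        let mut diag = 0; // dp[i-1][j-1], saved before the cell above is overwritten
        for (col, sletter) in shorter.chars().enumerate() {
            let above = row[col + 1]; // dp[i-1][j]
            row[col + 1] = if sletter == lletter {
                diag + 1
            } else {
                max(above, row[col])
            };
            diag = above;
        }
    }
    row[width]
}
```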

    }
    *table[0].last().unwrap()
}

#[cfg(test)]
mod tests {
@@ -989,4 +1033,71 @@ mod tests {
            sorensen_dice("this has one extra word", "this has one word")
        );
    }

    #[test]
    fn lcs_normalized_diff_unequal_length() {
        assert!(lcs_normalized("damerau", "aderuaxyz") < 0.5);
    }

    #[test]
    fn lcs_normalized_diff_unequal_length_reversed() {
        assert!(lcs_normalized("aderuaxyz", "damerau") < 0.5);
    }

    #[test]
    fn lcs_normalized_diff_comedians() {
        assert!(lcs_normalized("Stewart", "Colbert") < 0.5);
    }

    #[test]
    fn lcs_normalized_many_transpositions() {
        assert!(lcs_normalized("abcdefghijkl", "bacedfgihjlk") < 0.7);
    }

    #[test]
    fn lcs_normalized_diff_longer() {
        let a = "The quick brown fox jumped over the angry dog.";
        let b = "Lorem ipsum dolor sit amet, dicta latine an eam.";
        assert!(lcs_normalized(a, b) < 0.4);
    }

    #[test]
    fn lcs_normalized_beginning_transposition() {
        assert!(lcs_normalized("foobar", "ofobar") > 0.8);
    }

    #[test]
    fn lcs_normalized_end_transposition() {
        assert!(lcs_normalized("specter", "spectre") > 0.8);
    }

    #[test]
    fn lcs_normalized_unrestricted_edit() {
        assert!(lcs_normalized("a cat", "an abct") > 0.5);
    }

    #[test]
    fn lcs_normalized_diff_short() {
        assert!(lcs_normalized("levenshtein", "löwenbräu") < 0.3);
    }

    #[test]
    fn lcs_normalized_for_empty_strings() {
        assert!(lcs_normalized("", "") > 0.99);
    }

    #[test]
    fn lcs_normalized_first_empty() {
        assert!(lcs_normalized("", "flower") < 0.01);
    }

    #[test]
    fn lcs_normalized_second_empty() {
        assert!(lcs_normalized("tree", "") < 0.01);
    }

    #[test]
    fn lcs_normalized_identical_strings() {
        assert!(lcs_normalized("sunglasses", "sunglasses") > 0.99);
    }
}