From 33d369cf58ff3e0854c2961d1f1528063c8cdc08 Mon Sep 17 00:00:00 2001 From: Ben Dean-Kawamura Date: Wed, 18 Dec 2024 11:39:48 -0500 Subject: [PATCH] Bug 1931373 - Add FTS matching data Added extra data to `Suggestion::Fakespot` to capture how the FTS match was made. The plan is to use this as a facet for our metrics to help us consider how to tune the matching logic (i.e. maybe we should not use stemming, maybe we should reqiure that terms are close together). Added Suggest CLI flag to print out the FTS match info. --- components/suggest/src/db.rs | 74 +++++++++++++--- components/suggest/src/query.rs | 117 +++++++++++++++++++------ components/suggest/src/store.rs | 85 +++++++++++++++--- components/suggest/src/suggestion.rs | 29 ++++++ components/suggest/src/testing/data.rs | 8 +- examples/suggest-cli/src/main.rs | 16 +++- 6 files changed, 273 insertions(+), 56 deletions(-) diff --git a/components/suggest/src/db.rs b/components/suggest/src/db.rs index dcde6f7c85..a483822e58 100644 --- a/components/suggest/src/db.rs +++ b/components/suggest/src/db.rs @@ -28,7 +28,9 @@ use crate::{ DownloadedPocketSuggestion, DownloadedWikipediaSuggestion, Record, SuggestRecordId, }, schema::{clear_database, SuggestConnectionInitializer}, - suggestion::{cook_raw_suggestion_url, AmpSuggestionType, Suggestion}, + suggestion::{ + cook_raw_suggestion_url, AmpSuggestionType, FtsMatchInfo, FtsTermDistance, Suggestion, + }, util::full_keyword, weather::WeatherCache, Result, SuggestionQuery, @@ -724,9 +726,11 @@ impl<'a> SuggestDao<'a> { /// Fetches fakespot suggestions pub fn fetch_fakespot_suggestions(&self, query: &SuggestionQuery) -> Result> { + let fts_query = query.fts_query(); self.conn.query_rows_and_then_cached( r#" SELECT + s.id, s.title, s.url, s.score, @@ -754,30 +758,76 @@ impl<'a> SuggestDao<'a> { ORDER BY s.score DESC "#, - (&query.fts_query(),), + (&fts_query.sqlite_match,), |row| { + let suggestion_id: usize = row.get(0)?; + let title: String = row.get(1)?; let score = fakespot::FakespotScore::new( &query.keyword, - row.get(9)?, row.get(10)?, - row.get(2)?, + row.get(11)?, + row.get(3)?, ) .as_suggest_score(); + let match_info = FtsMatchInfo { + prefix: fts_query.prefix_match + && !self.fakespot_suggestion_matches_query( + suggestion_id, + fts_query.sqlite_match_without_prefix_match(), + None, + )?, + stemming: fts_query.match_required_stemming(&title), + term_distance: self + .fakespot_term_distance(suggestion_id, &fts_query.sqlite_match)?, + }; + Ok(Suggestion::Fakespot { - title: row.get(0)?, - url: row.get(1)?, + title, + url: row.get(2)?, score, - fakespot_grade: row.get(3)?, - product_id: row.get(4)?, - rating: row.get(5)?, - total_reviews: row.get(6)?, - icon: row.get(7)?, - icon_mimetype: row.get(8)?, + fakespot_grade: row.get(4)?, + product_id: row.get(5)?, + rating: row.get(6)?, + total_reviews: row.get(7)?, + icon: row.get(8)?, + icon_mimetype: row.get(9)?, + match_info, }) }, ) } + fn fakespot_term_distance( + &self, + suggestion_id: usize, + match_phrase: &str, + ) -> Result { + if self.fakespot_suggestion_matches_query(suggestion_id, match_phrase, Some(3))? { + Ok(FtsTermDistance::Near) + } else if self.fakespot_suggestion_matches_query(suggestion_id, match_phrase, Some(5))? { + Ok(FtsTermDistance::Medium) + } else { + Ok(FtsTermDistance::Far) + } + } + + fn fakespot_suggestion_matches_query( + &self, + suggestion_id: usize, + match_phrase: &str, + max_distance: Option, + ) -> Result { + let match_phrase = match max_distance { + Some(max_distance) => &format!("NEAR({match_phrase}, {max_distance})"), + None => match_phrase, + }; + + Ok(self.conn.exists( + "SELECT 1 FROM fakespot_fts WHERE rowid = ? AND fakespot_fts MATCH ?", + (suggestion_id, match_phrase), + )?) + } + /// Fetches exposure suggestions pub fn fetch_exposure_suggestions(&self, query: &SuggestionQuery) -> Result> { // A single exposure suggestion can be spread across multiple remote diff --git a/components/suggest/src/query.rs b/components/suggest/src/query.rs index 2fc72a6e5a..fc55e53df8 100644 --- a/components/suggest/src/query.rs +++ b/components/suggest/src/query.rs @@ -143,31 +143,37 @@ impl SuggestionQuery { } } - // Other Functionality - - /// Parse the `keyword` field into a set of keywords. - /// - /// This is used when passing the keywords into an FTS search. It: - /// - Strips out any `():^*"` chars. These are typically used for advanced searches, which - /// we don't support and it would be weird to only support for FTS searches, which - /// currently means Fakespot searches. - /// - Splits on whitespace to get a list of individual keywords - /// - pub(crate) fn parse_keywords(&self) -> Vec<&str> { - self.keyword - .split([' ', '(', ')', ':', '^', '*', '"']) - .filter(|s| !s.is_empty()) - .collect() + /// Create an FTS query term for our keyword(s) + pub(crate) fn fts_query(&self) -> FtsQuery<'_> { + FtsQuery::new(&self.keyword) } +} - /// Create an FTS query term for our keyword(s) - pub(crate) fn fts_query(&self) -> String { - let keywords = self.parse_keywords(); +pub struct FtsQuery<'a> { + pub sqlite_match: String, + pub prefix_match: bool, + keywords: Vec<&'a str>, +} + +impl<'a> FtsQuery<'a> { + fn new(keyword: &'a str) -> Self { + // Parse the `keyword` field into a set of keywords. + // + // This is used when passing the keywords into an FTS search. It: + // - Strips out any `():^*"` chars. These are typically used for advanced searches, which + // we don't support and it would be weird to only support for FTS searches, which + // currently means Fakespot searches. + // - splits on whitespace to get a list of individual keywords + let keywords = Self::split_terms(keyword); if keywords.is_empty() { - return String::from(r#""""#); + return Self { + keywords, + sqlite_match: String::from(r#""""#), + prefix_match: false, + }; } // Quote each term from `query` and join them together - let mut fts_query = keywords + let mut sqlite_match = keywords .iter() .map(|keyword| format!(r#""{keyword}""#)) .collect::>() @@ -175,11 +181,50 @@ impl SuggestionQuery { // If the input is > 3 characters, and there's no whitespace at the end. // We want to append a `*` char to the end to do a prefix match on it. let total_chars = keywords.iter().fold(0, |count, s| count + s.len()); - let query_ends_in_whitespace = self.keyword.ends_with(' '); - if (total_chars > 3) && !query_ends_in_whitespace { - fts_query.push('*'); + let query_ends_in_whitespace = keyword.ends_with(' '); + let prefix_match = (total_chars > 3) && !query_ends_in_whitespace; + if prefix_match { + sqlite_match.push('*'); + } + Self { + keywords, + prefix_match, + sqlite_match, } - fts_query + } + + pub fn sqlite_match_without_prefix_match(&self) -> &str { + self.sqlite_match + .strip_suffix('*') + .unwrap_or(&self.sqlite_match) + } + + /// Try to figure out if a FTS match required stemming + /// + /// To test this, we have to try to mimic the SQLite FTS logic. This code doesn't do it + /// perfectly, but it should return the correct result most of the time. + pub fn match_required_stemming(&self, title: &str) -> bool { + let title = title.to_lowercase(); + let split_title = Self::split_terms(&title); + + !self.keywords.iter().enumerate().all(|(i, keyword)| { + split_title.iter().any(|title_word| { + let last_keyword = i == self.keywords.len() - 1; + + if last_keyword && self.prefix_match { + title_word.starts_with(keyword) + } else { + title_word == keyword + } + }) + }) + } + + fn split_terms(phrase: &str) -> Vec<&str> { + phrase + .split([' ', '(', ')', ':', '^', '*', '"', ',']) + .filter(|s| !s.is_empty()) + .collect() } } @@ -189,7 +234,7 @@ mod test { fn check_parse_keywords(input: &str, expected: Vec<&str>) { let query = SuggestionQuery::all_providers(input); - assert_eq!(query.parse_keywords(), expected); + assert_eq!(query.fts_query().keywords, expected); } #[test] @@ -207,7 +252,7 @@ mod test { fn check_fts_query(input: &str, expected: &str) { let query = SuggestionQuery::all_providers(input); - assert_eq!(query.fts_query(), expected); + assert_eq!(query.fts_query().sqlite_match, expected); } #[test] @@ -234,4 +279,24 @@ mod test { check_fts_query(" ", r#""""#); check_fts_query("()", r#""""#); } + + #[test] + fn test_fts_query_match_required_stemming() { + // These don't require stemming, since each keyword matches a term in the title + assert!(!FtsQuery::new("running shoes").match_required_stemming("running shoes")); + assert!( + !FtsQuery::new("running shoes").match_required_stemming("new balance running shoes") + ); + // Case changes shouldn't matter + assert!(!FtsQuery::new("running shoes").match_required_stemming("Running Shoes")); + // This doesn't require stemming, since `:` is not part of the word + assert!(!FtsQuery::new("running shoes").match_required_stemming("Running: Shoes")); + // This requires the keywords to be stemmed in order to match + assert!(FtsQuery::new("run shoes").match_required_stemming("running shoes")); + // This didn't require stemming, since the last keyword was a prefix match + assert!(!FtsQuery::new("running sh").match_required_stemming("running shoes")); + // This does require stemming (we know it wasn't a prefix match since there's not enough + // characters). + assert!(FtsQuery::new("run").match_required_stemming("running shoes")); + } } diff --git a/components/suggest/src/store.rs b/components/suggest/src/store.rs index c61531908b..b73b29fc82 100644 --- a/components/suggest/src/store.rs +++ b/components/suggest/src/store.rs @@ -912,7 +912,11 @@ pub(crate) mod tests { use std::sync::atomic::{AtomicUsize, Ordering}; - use crate::{testing::*, SuggestionProvider}; + use crate::{ + suggestion::{FtsMatchInfo, FtsTermDistance}, + testing::*, + SuggestionProvider, + }; /// In-memory Suggest store for testing pub(crate) struct TestStore { @@ -2258,30 +2262,58 @@ pub(crate) mod tests { store.ingest(SuggestIngestionConstraints::all_providers()); assert_eq!( store.fetch_suggestions(SuggestionQuery::fakespot("globe")), - vec![snowglobe_suggestion().with_fakespot_product_type_bonus(0.5)], + vec![snowglobe_suggestion(FtsMatchInfo { + prefix: false, + stemming: false, + term_distance: FtsTermDistance::Near, + },) + .with_fakespot_product_type_bonus(0.5)], ); assert_eq!( store.fetch_suggestions(SuggestionQuery::fakespot("simpsons")), - vec![simpsons_suggestion()], + vec![simpsons_suggestion(FtsMatchInfo { + prefix: false, + stemming: false, + term_distance: FtsTermDistance::Near, + },)], ); // The snowglobe suggestion should come before the simpsons one, since `snow` is a partial // match on the product_type field. assert_eq!( store.fetch_suggestions(SuggestionQuery::fakespot("snow")), vec![ - snowglobe_suggestion().with_fakespot_product_type_bonus(0.5), - simpsons_suggestion(), + snowglobe_suggestion(FtsMatchInfo { + prefix: false, + stemming: false, + term_distance: FtsTermDistance::Near, + },) + .with_fakespot_product_type_bonus(0.5), + simpsons_suggestion(FtsMatchInfo { + prefix: false, + stemming: false, + term_distance: FtsTermDistance::Near, + },), ], ); // Test FTS by using a query where the keywords are separated in the source text assert_eq!( store.fetch_suggestions(SuggestionQuery::fakespot("simpsons snow")), - vec![simpsons_suggestion()], + vec![simpsons_suggestion(FtsMatchInfo { + prefix: false, + stemming: false, + term_distance: FtsTermDistance::Near, + },)], ); // Special characters should be stripped out assert_eq!( store.fetch_suggestions(SuggestionQuery::fakespot("simpsons + snow")), - vec![simpsons_suggestion()], + vec![simpsons_suggestion(FtsMatchInfo { + prefix: false, + // This is correctly incorrectly counted as stemming, since nothing matches the `+` + // character. TODO: fix this be improving the tokenizer in `FtsQuery`. + stemming: true, + term_distance: FtsTermDistance::Near, + },)], ); Ok(()) @@ -2309,8 +2341,18 @@ pub(crate) mod tests { assert_eq!( store.fetch_suggestions(SuggestionQuery::fakespot("snow")), vec![ - simpsons_suggestion().with_fakespot_keyword_bonus(), - snowglobe_suggestion().with_fakespot_product_type_bonus(0.5), + simpsons_suggestion(FtsMatchInfo { + prefix: false, + stemming: false, + term_distance: FtsTermDistance::Near, + },) + .with_fakespot_keyword_bonus(), + snowglobe_suggestion(FtsMatchInfo { + prefix: false, + stemming: false, + term_distance: FtsTermDistance::Near, + },) + .with_fakespot_product_type_bonus(0.5), ], ); Ok(()) @@ -2332,15 +2374,27 @@ pub(crate) mod tests { store.ingest(SuggestIngestionConstraints::all_providers()); assert_eq!( store.fetch_suggestions(SuggestionQuery::fakespot("simp")), - vec![simpsons_suggestion()], + vec![simpsons_suggestion(FtsMatchInfo { + prefix: true, + stemming: false, + term_distance: FtsTermDistance::Near, + },)], ); assert_eq!( store.fetch_suggestions(SuggestionQuery::fakespot("simps")), - vec![simpsons_suggestion()], + vec![simpsons_suggestion(FtsMatchInfo { + prefix: true, + stemming: false, + term_distance: FtsTermDistance::Near, + },)], ); assert_eq!( store.fetch_suggestions(SuggestionQuery::fakespot("simpson")), - vec![simpsons_suggestion()], + vec![simpsons_suggestion(FtsMatchInfo { + prefix: false, + stemming: false, + term_distance: FtsTermDistance::Near, + },)], ); Ok(()) @@ -2416,7 +2470,12 @@ pub(crate) mod tests { store.ingest(SuggestIngestionConstraints::all_providers()); assert_eq!( store.fetch_suggestions(SuggestionQuery::fakespot("globe")), - vec![snowglobe_suggestion().with_fakespot_product_type_bonus(0.5)], + vec![snowglobe_suggestion(FtsMatchInfo { + prefix: false, + stemming: false, + term_distance: FtsTermDistance::Near, + },) + .with_fakespot_product_type_bonus(0.5)], ); assert_eq!( store.fetch_suggestions(SuggestionQuery::amp("lo")), diff --git a/components/suggest/src/suggestion.rs b/components/suggest/src/suggestion.rs index eabb96de39..79f39e9659 100644 --- a/components/suggest/src/suggestion.rs +++ b/components/suggest/src/suggestion.rs @@ -96,6 +96,7 @@ pub enum Suggestion { icon: Option>, icon_mimetype: Option, score: f64, + match_info: FtsMatchInfo, }, Exposure { suggestion_type: String, @@ -103,6 +104,27 @@ pub enum Suggestion { }, } +/// Additional data about how an FTS match was made(https://bugzilla.mozilla.org/show_bug.cgi?id=1931373) +#[derive(Debug, Clone, PartialEq, uniffi::Record)] +pub struct FtsMatchInfo { + /// Was this a prefix match (`water b` matched against `water bottle`) + pub prefix: bool, + /// Did the match require stemming? (`run shows` matched against `running shoes`) + pub stemming: bool, + /// How far apart were the terms product title + pub term_distance: FtsTermDistance, +} + +#[derive(Debug, Clone, PartialEq, uniffi::Enum)] +pub enum FtsTermDistance { + // All terms in a 3-term chunk + Near, + // All terms in a 5-term chunk + Medium, + // No 5-term chunk that contains all the terms + Far, +} + impl PartialOrd for Suggestion { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) @@ -185,6 +207,13 @@ impl Suggestion { Self::Wikipedia { .. } => DEFAULT_SUGGESTION_SCORE, } } + + pub fn fts_match_info(&self) -> Option<&FtsMatchInfo> { + match self { + Self::Fakespot { match_info, .. } => Some(match_info), + _ => None, + } + } } #[cfg(test)] diff --git a/components/suggest/src/testing/data.rs b/components/suggest/src/testing/data.rs index a91cf7146e..e30954c0e4 100644 --- a/components/suggest/src/testing/data.rs +++ b/components/suggest/src/testing/data.rs @@ -4,7 +4,7 @@ //! Test data that we use in many tests -use crate::{testing::MockIcon, Suggestion}; +use crate::{suggestion::FtsMatchInfo, testing::MockIcon, Suggestion}; use serde_json::json; use serde_json::Value as JsonValue; @@ -466,7 +466,7 @@ pub fn snowglobe_fakespot() -> JsonValue { }) } -pub fn snowglobe_suggestion() -> Suggestion { +pub fn snowglobe_suggestion(match_info: FtsMatchInfo) -> Suggestion { Suggestion::Fakespot { fakespot_grade: "B".into(), product_id: "amazon-ABC".into(), @@ -477,6 +477,7 @@ pub fn snowglobe_suggestion() -> Suggestion { score: 0.3 + 0.00008, icon: Some("fakespot-icon-amazon-data".as_bytes().to_vec()), icon_mimetype: Some("image/png".into()), + match_info, } } @@ -496,7 +497,7 @@ pub fn simpsons_fakespot() -> JsonValue { }) } -pub fn simpsons_suggestion() -> Suggestion { +pub fn simpsons_suggestion(match_info: FtsMatchInfo) -> Suggestion { Suggestion::Fakespot { fakespot_grade: "A".into(), product_id: "vendorwithouticon-XYZ".into(), @@ -507,6 +508,7 @@ pub fn simpsons_suggestion() -> Suggestion { score: 0.3 + 0.00009, icon: None, icon_mimetype: None, + match_info, } } diff --git a/examples/suggest-cli/src/main.rs b/examples/suggest-cli/src/main.rs index b9cc7f2fc0..b635999aa0 100644 --- a/examples/suggest-cli/src/main.rs +++ b/examples/suggest-cli/src/main.rs @@ -50,6 +50,8 @@ enum Commands { }, /// Query against ingested data Query { + #[arg(long, action)] + fts_match_info: bool, #[clap(long, short)] provider: Option, /// Input to search @@ -103,7 +105,11 @@ fn main() -> Result<()> { reingest, providers, } => ingest(&store, reingest, providers, cli.verbose), - Commands::Query { provider, input } => query(&store, provider, input, cli.verbose), + Commands::Query { + provider, + input, + fts_match_info, + } => query(&store, provider, input, fts_match_info, cli.verbose), }; Ok(()) } @@ -178,6 +184,7 @@ fn query( store: &SuggestStore, provider: Option, input: String, + fts_match_info: bool, verbose: bool, ) { let query = SuggestionQuery { @@ -203,7 +210,12 @@ fn query( } else { "no icon" }; - println!("{title} ({url}) ({icon})"); + println!("* {title} ({url}) ({icon})"); + if fts_match_info { + if let Some(match_info) = suggestion.fts_match_info() { + println!(" {match_info:?}") + } + } } } if verbose {