Skip to content

Commit

Permalink
Bug 1931373 - Add FTS matching data
Browse files Browse the repository at this point in the history
Added extra data to `Suggestion::Fakespot` to capture how the FTS match
was made.  The plan is to use this as a facet for our metrics to help us
consider how to tune the matching logic (i.e. maybe we should not use
stemming, maybe we should reqiure that terms are close together).

Added Suggest CLI flag to print out the FTS match info.
  • Loading branch information
bendk committed Dec 18, 2024
1 parent f111809 commit 474496c
Show file tree
Hide file tree
Showing 6 changed files with 256 additions and 56 deletions.
61 changes: 49 additions & 12 deletions components/suggest/src/db.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,9 @@ use crate::{
DownloadedPocketSuggestion, DownloadedWikipediaSuggestion, Record, SuggestRecordId,
},
schema::{clear_database, SuggestConnectionInitializer},
suggestion::{cook_raw_suggestion_url, AmpSuggestionType, Suggestion},
suggestion::{
cook_raw_suggestion_url, AmpSuggestionType, FtsMatchInfo, FtsTermDistance, Suggestion,
},
util::full_keyword,
weather::WeatherCache,
Result, SuggestionQuery,
Expand Down Expand Up @@ -724,9 +726,11 @@ impl<'a> SuggestDao<'a> {

/// Fetches fakespot suggestions
pub fn fetch_fakespot_suggestions(&self, query: &SuggestionQuery) -> Result<Vec<Suggestion>> {
let fts_query = query.fts_query();
self.conn.query_rows_and_then_cached(
r#"
SELECT
s.id,
s.title,
s.url,
s.score,
Expand Down Expand Up @@ -754,30 +758,63 @@ impl<'a> SuggestDao<'a> {
ORDER BY
s.score DESC
"#,
(&query.fts_query(),),
(&fts_query.sqlite_match,),
|row| {
let suggestion_id: usize = row.get(0)?;
let title: String = row.get(1)?;
let score = fakespot::FakespotScore::new(
&query.keyword,
row.get(9)?,
row.get(10)?,
row.get(2)?,
row.get(11)?,
row.get(3)?,
)
.as_suggest_score();
let match_info = FtsMatchInfo {
prefix: fts_query.prefix_match && !self.fakespot_suggestion_matches_query(
suggestion_id, fts_query.sqlite_match_without_prefix_match(), None)?,
stemming: fts_query.match_required_stemming(&title),
term_distance: self.fakespot_term_distance(suggestion_id, &fts_query.sqlite_match)?,
};

Ok(Suggestion::Fakespot {
title: row.get(0)?,
url: row.get(1)?,
title,
url: row.get(2)?,
score,
fakespot_grade: row.get(3)?,
product_id: row.get(4)?,
rating: row.get(5)?,
total_reviews: row.get(6)?,
icon: row.get(7)?,
icon_mimetype: row.get(8)?,
fakespot_grade: row.get(4)?,
product_id: row.get(5)?,
rating: row.get(6)?,
total_reviews: row.get(7)?,
icon: row.get(8)?,
icon_mimetype: row.get(9)?,
match_info,
})
},
)
}

fn fakespot_term_distance(&self, suggestion_id: usize, match_phrase: &str) -> Result<FtsTermDistance> {
if self.fakespot_suggestion_matches_query(suggestion_id, match_phrase, Some(3))? {
Ok(FtsTermDistance::Near)
} else if self.fakespot_suggestion_matches_query(suggestion_id, match_phrase, Some(5))? {
Ok(FtsTermDistance::Medium)
} else {
Ok(FtsTermDistance::Far)
}

}

fn fakespot_suggestion_matches_query(&self, suggestion_id: usize, match_phrase: &str, max_distance: Option<usize>) -> Result<bool> {
let match_phrase = match max_distance {
Some(max_distance) => &format!("NEAR({match_phrase}, {max_distance})"),
None => match_phrase,
};

Ok(self.conn.exists(
"SELECT 1 FROM fakespot_fts WHERE rowid = ? AND fakespot_fts MATCH ?",
(suggestion_id, match_phrase)
)?)
}

/// Fetches exposure suggestions
pub fn fetch_exposure_suggestions(&self, query: &SuggestionQuery) -> Result<Vec<Suggestion>> {
// A single exposure suggestion can be spread across multiple remote
Expand Down
113 changes: 87 additions & 26 deletions components/suggest/src/query.rs
Original file line number Diff line number Diff line change
Expand Up @@ -143,43 +143,84 @@ impl SuggestionQuery {
}
}

// Other Functionality

/// Parse the `keyword` field into a set of keywords.
///
/// This is used when passing the keywords into an FTS search. It:
/// - Strips out any `():^*"` chars. These are typically used for advanced searches, which
/// we don't support and it would be weird to only support for FTS searches, which
/// currently means Fakespot searches.
/// - Splits on whitespace to get a list of individual keywords
///
pub(crate) fn parse_keywords(&self) -> Vec<&str> {
self.keyword
.split([' ', '(', ')', ':', '^', '*', '"'])
.filter(|s| !s.is_empty())
.collect()
/// Create an FTS query term for our keyword(s)
pub(crate) fn fts_query(&self) -> FtsQuery<'_> {
FtsQuery::new(&self.keyword)
}
}

/// Create an FTS query term for our keyword(s)
pub(crate) fn fts_query(&self) -> String {
let keywords = self.parse_keywords();
pub struct FtsQuery<'a> {
pub sqlite_match: String,
pub prefix_match: bool,
keywords: Vec<&'a str>,
}

impl<'a> FtsQuery<'a> {
fn new(keyword: &'a str) -> Self {
// Parse the `keyword` field into a set of keywords.
//
// This is used when passing the keywords into an FTS search. It:
// - Strips out any `():^*"` chars. These are typically used for advanced searches, which
// we don't support and it would be weird to only support for FTS searches, which
// currently means Fakespot searches.
// - splits on whitespace to get a list of individual keywords
let keywords = Self::split_terms(keyword);
if keywords.is_empty() {
return String::from(r#""""#);
return Self {
keywords,
sqlite_match: String::from(r#""""#),
prefix_match: false,
};
}
// Quote each term from `query` and join them together
let mut fts_query = keywords
let mut sqlite_match = keywords
.iter()
.map(|keyword| format!(r#""{keyword}""#))
.collect::<Vec<_>>()
.join(" ");
// If the input is > 3 characters, and there's no whitespace at the end.
// We want to append a `*` char to the end to do a prefix match on it.
let total_chars = keywords.iter().fold(0, |count, s| count + s.len());
let query_ends_in_whitespace = self.keyword.ends_with(' ');
if (total_chars > 3) && !query_ends_in_whitespace {
fts_query.push('*');
let query_ends_in_whitespace = keyword.ends_with(' ');
let prefix_match = (total_chars > 3) && !query_ends_in_whitespace;
if prefix_match {
sqlite_match.push('*');
}
Self {
keywords,
prefix_match,
sqlite_match,
}
fts_query
}

pub fn sqlite_match_without_prefix_match(&self) -> &str {
self.sqlite_match
.strip_suffix('*')
.unwrap_or(&self.sqlite_match)
}

pub fn match_required_stemming(&self, title: &str) -> bool {
let title = title.to_lowercase();
let split_title = Self::split_terms(&title);

!self.keywords.iter().enumerate().all(|(i, keyword)| {
split_title.iter().any(|title_word| {
let last_keyword = i == self.keywords.len() - 1;
let value = if last_keyword && self.prefix_match {
title_word.starts_with(keyword)
} else {
title_word == keyword
};
value
})
})
}

fn split_terms(phrase: &str) -> Vec<&str> {
phrase
.split([' ', '(', ')', ':', '^', '*', '"'])
.filter(|s| !s.is_empty())
.collect()
}
}

Expand All @@ -189,7 +230,7 @@ mod test {

fn check_parse_keywords(input: &str, expected: Vec<&str>) {
let query = SuggestionQuery::all_providers(input);
assert_eq!(query.parse_keywords(), expected);
assert_eq!(query.fts_query().keywords, expected);
}

#[test]
Expand All @@ -207,7 +248,7 @@ mod test {

fn check_fts_query(input: &str, expected: &str) {
let query = SuggestionQuery::all_providers(input);
assert_eq!(query.fts_query(), expected);
assert_eq!(query.fts_query().sqlite_match, expected);
}

#[test]
Expand All @@ -234,4 +275,24 @@ mod test {
check_fts_query(" ", r#""""#);
check_fts_query("()", r#""""#);
}

#[test]
fn test_fts_query_match_required_stemming() {
// These don't require stemming, since each keyword matches a term in the title
assert!(!FtsQuery::new("running shoes").match_required_stemming("running shoes"));
assert!(
!FtsQuery::new("running shoes").match_required_stemming("new balance running shoes")
);
// Case changes shouldn't matter
assert!(!FtsQuery::new("running shoes").match_required_stemming("Running Shoes"));
// This doesn't require stemming, since `:` is not part of the word
assert!(!FtsQuery::new("running shoes").match_required_stemming("Running: Shoes"));
// This requires the keywords to be stemmed in order to match
assert!(FtsQuery::new("run shoes").match_required_stemming("running shoes"));
// This didn't require stemming, since the last keyword was a prefix match
assert!(!FtsQuery::new("running sh").match_required_stemming("running shoes"));
// This does require stemming (we know it wasn't a prefix match since there's not enough
// characters).
assert!(FtsQuery::new("run").match_required_stemming("running shoes"));
}
}
85 changes: 72 additions & 13 deletions components/suggest/src/store.rs
Original file line number Diff line number Diff line change
Expand Up @@ -912,7 +912,11 @@ pub(crate) mod tests {

use std::sync::atomic::{AtomicUsize, Ordering};

use crate::{testing::*, SuggestionProvider};
use crate::{
suggestion::{FtsMatchInfo, FtsTermDistance},
testing::*,
SuggestionProvider,
};

/// In-memory Suggest store for testing
pub(crate) struct TestStore {
Expand Down Expand Up @@ -2258,30 +2262,58 @@ pub(crate) mod tests {
store.ingest(SuggestIngestionConstraints::all_providers());
assert_eq!(
store.fetch_suggestions(SuggestionQuery::fakespot("globe")),
vec![snowglobe_suggestion().with_fakespot_product_type_bonus(0.5)],
vec![snowglobe_suggestion(FtsMatchInfo {
prefix: false,
stemming: false,
term_distance: FtsTermDistance::Near,
},)
.with_fakespot_product_type_bonus(0.5)],
);
assert_eq!(
store.fetch_suggestions(SuggestionQuery::fakespot("simpsons")),
vec![simpsons_suggestion()],
vec![simpsons_suggestion(FtsMatchInfo {
prefix: false,
stemming: false,
term_distance: FtsTermDistance::Near,
},)],
);
// The snowglobe suggestion should come before the simpsons one, since `snow` is a partial
// match on the product_type field.
assert_eq!(
store.fetch_suggestions(SuggestionQuery::fakespot("snow")),
vec![
snowglobe_suggestion().with_fakespot_product_type_bonus(0.5),
simpsons_suggestion(),
snowglobe_suggestion(FtsMatchInfo {
prefix: false,
stemming: false,
term_distance: FtsTermDistance::Near,
},)
.with_fakespot_product_type_bonus(0.5),
simpsons_suggestion(FtsMatchInfo {
prefix: false,
stemming: false,
term_distance: FtsTermDistance::Near,
},),
],
);
// Test FTS by using a query where the keywords are separated in the source text
assert_eq!(
store.fetch_suggestions(SuggestionQuery::fakespot("simpsons snow")),
vec![simpsons_suggestion()],
vec![simpsons_suggestion(FtsMatchInfo {
prefix: false,
stemming: false,
term_distance: FtsTermDistance::Near,
},)],
);
// Special characters should be stripped out
assert_eq!(
store.fetch_suggestions(SuggestionQuery::fakespot("simpsons + snow")),
vec![simpsons_suggestion()],
vec![simpsons_suggestion(FtsMatchInfo {
prefix: false,
// This is correctly incorrectly counted as stemming, since nothing matches the `+`
// character. TODO: fix this be improving the tokenizer in `FtsQuery`.
stemming: true,
term_distance: FtsTermDistance::Near,
},)],
);

Ok(())
Expand Down Expand Up @@ -2309,8 +2341,18 @@ pub(crate) mod tests {
assert_eq!(
store.fetch_suggestions(SuggestionQuery::fakespot("snow")),
vec![
simpsons_suggestion().with_fakespot_keyword_bonus(),
snowglobe_suggestion().with_fakespot_product_type_bonus(0.5),
simpsons_suggestion(FtsMatchInfo {
prefix: false,
stemming: false,
term_distance: FtsTermDistance::Near,
},)
.with_fakespot_keyword_bonus(),
snowglobe_suggestion(FtsMatchInfo {
prefix: false,
stemming: false,
term_distance: FtsTermDistance::Near,
},)
.with_fakespot_product_type_bonus(0.5),
],
);
Ok(())
Expand All @@ -2332,15 +2374,27 @@ pub(crate) mod tests {
store.ingest(SuggestIngestionConstraints::all_providers());
assert_eq!(
store.fetch_suggestions(SuggestionQuery::fakespot("simp")),
vec![simpsons_suggestion()],
vec![simpsons_suggestion(FtsMatchInfo {
prefix: true,
stemming: false,
term_distance: FtsTermDistance::Near,
},)],
);
assert_eq!(
store.fetch_suggestions(SuggestionQuery::fakespot("simps")),
vec![simpsons_suggestion()],
vec![simpsons_suggestion(FtsMatchInfo {
prefix: true,
stemming: false,
term_distance: FtsTermDistance::Near,
},)],
);
assert_eq!(
store.fetch_suggestions(SuggestionQuery::fakespot("simpson")),
vec![simpsons_suggestion()],
vec![simpsons_suggestion(FtsMatchInfo {
prefix: false,
stemming: false,
term_distance: FtsTermDistance::Near,
},)],
);

Ok(())
Expand Down Expand Up @@ -2416,7 +2470,12 @@ pub(crate) mod tests {
store.ingest(SuggestIngestionConstraints::all_providers());
assert_eq!(
store.fetch_suggestions(SuggestionQuery::fakespot("globe")),
vec![snowglobe_suggestion().with_fakespot_product_type_bonus(0.5)],
vec![snowglobe_suggestion(FtsMatchInfo {
prefix: false,
stemming: false,
term_distance: FtsTermDistance::Near,
},)
.with_fakespot_product_type_bonus(0.5)],
);
assert_eq!(
store.fetch_suggestions(SuggestionQuery::amp("lo")),
Expand Down
Loading

0 comments on commit 474496c

Please sign in to comment.