Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bug 1931373 - Add FTS matching data #6531

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 63 additions & 9 deletions components/suggest/src/db.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,9 @@ use crate::{
DownloadedPocketSuggestion, DownloadedWikipediaSuggestion, Record, SuggestRecordId,
},
schema::{clear_database, SuggestConnectionInitializer},
suggestion::{cook_raw_suggestion_url, AmpSuggestionType, Suggestion},
suggestion::{
cook_raw_suggestion_url, AmpSuggestionType, FtsMatchInfo, FtsTermDistance, Suggestion,
},
util::full_keyword,
weather::WeatherCache,
Result, SuggestionQuery,
Expand Down Expand Up @@ -724,7 +726,40 @@ impl<'a> SuggestDao<'a> {

/// Fetches fakespot suggestions
pub fn fetch_fakespot_suggestions(&self, query: &SuggestionQuery) -> Result<Vec<Suggestion>> {
self.conn.query_rows_and_then_cached(
let fts_query = query.fts_query();
let mut params: Vec<&dyn ToSql> = vec![];

let was_prefix_match_sql = if fts_query.is_prefix_query {
// If the query was a prefix match query then test if the query without the prefix
// match would have also matched. If not, then we should report it as a prefix match
// in the match info
params.push(&fts_query.match_arg_without_prefix_match);
"NOT EXISTS (SELECT 1 FROM fakespot_fts WHERE fakespot_fts MATCH ?)"
} else {
// If not, then it definitely wasn't a prefix match
"FALSE"
};

// Allocate storage outside the if statement for the NEAR match expressions.
let near1;
let near2;
let (near_1_match_sql, near_3_match_sql) = if fts_query.had_multiple_terms() {
// If then the original query had multiple terms then test there still would be a match
// with various NEAR clauses.
near1 = format!("NEAR({}, 1)", fts_query.match_arg);
near2 = format!("NEAR({}, 3)", fts_query.match_arg);
params.push(&near1);
params.push(&near2);
(
"EXISTS (SELECT 1 FROM fakespot_fts WHERE fakespot_fts MATCH ?)",
"EXISTS (SELECT 1 FROM fakespot_fts WHERE fakespot_fts MATCH ?)",
)
} else {
// There was only 1 term, then it it would be a match regardless of NEAR
("TRUE", "TRUE")
};

let sql = format!(
r#"
SELECT
s.title,
Expand All @@ -737,7 +772,10 @@ impl<'a> SuggestDao<'a> {
i.data,
i.mimetype,
f.keywords,
f.product_type
f.product_type,
{was_prefix_match_sql},
{near_1_match_sql},
{near_3_match_sql}
FROM
suggestions s
JOIN
Expand All @@ -753,18 +791,34 @@ impl<'a> SuggestDao<'a> {
fakespot_fts MATCH ?
ORDER BY
s.score DESC
"#,
(&query.fts_query(),),
|row| {
"#
);
params.push(&fts_query.match_arg);

self.conn
.query_rows_and_then_cached(&sql, params.as_slice(), |row| {
let title: String = row.get(0)?;
let score = fakespot::FakespotScore::new(
&query.keyword,
row.get(9)?,
row.get(10)?,
row.get(2)?,
)
.as_suggest_score();
let match_info = FtsMatchInfo {
prefix: row.get(11)?,
stemming: fts_query.match_required_stemming(&title),
term_distance: if row.get(12)? {
FtsTermDistance::Near
} else if row.get(13)? {
FtsTermDistance::Medium
} else {
FtsTermDistance::Far
},
};

Ok(Suggestion::Fakespot {
title: row.get(0)?,
title,
url: row.get(1)?,
score,
fakespot_grade: row.get(3)?,
Expand All @@ -773,9 +827,9 @@ impl<'a> SuggestDao<'a> {
total_reviews: row.get(6)?,
icon: row.get(7)?,
icon_mimetype: row.get(8)?,
match_info,
})
},
)
})
}

/// Fetches exposure suggestions
Expand Down
118 changes: 92 additions & 26 deletions components/suggest/src/query.rs
Original file line number Diff line number Diff line change
Expand Up @@ -143,43 +143,89 @@ impl SuggestionQuery {
}
}

// Other Functionality

/// Parse the `keyword` field into a set of keywords.
///
/// This is used when passing the keywords into an FTS search. It:
/// - Strips out any `():^*"` chars. These are typically used for advanced searches, which
/// we don't support and it would be weird to only support for FTS searches, which
/// currently means Fakespot searches.
/// - Splits on whitespace to get a list of individual keywords
///
pub(crate) fn parse_keywords(&self) -> Vec<&str> {
self.keyword
.split([' ', '(', ')', ':', '^', '*', '"'])
.filter(|s| !s.is_empty())
.collect()
/// Create an FTS query term for our keyword(s)
pub(crate) fn fts_query(&self) -> FtsQuery<'_> {
FtsQuery::new(&self.keyword)
}
}

/// Create an FTS query term for our keyword(s)
pub(crate) fn fts_query(&self) -> String {
let keywords = self.parse_keywords();
pub struct FtsQuery<'a> {
pub match_arg: String,
pub match_arg_without_prefix_match: String,
pub is_prefix_query: bool,
keyword_terms: Vec<&'a str>,
}

impl<'a> FtsQuery<'a> {
fn new(keyword: &'a str) -> Self {
// Parse the `keyword` field into a set of keywords.
//
// This is used when passing the keywords into an FTS search. It:
// - Strips out any `():^*"` chars. These are typically used for advanced searches, which
// we don't support and it would be weird to only support for FTS searches.
// - splits on whitespace to get a list of individual keywords
let keywords = Self::split_terms(keyword);
if keywords.is_empty() {
return String::from(r#""""#);
return Self {
keyword_terms: keywords,
match_arg: String::from(r#""""#),
match_arg_without_prefix_match: String::from(r#""""#),
is_prefix_query: false,
};
}
// Quote each term from `query` and join them together
let mut fts_query = keywords
let mut sqlite_match = keywords
.iter()
.map(|keyword| format!(r#""{keyword}""#))
.collect::<Vec<_>>()
.join(" ");
// If the input is > 3 characters, and there's no whitespace at the end.
// We want to append a `*` char to the end to do a prefix match on it.
let total_chars = keywords.iter().fold(0, |count, s| count + s.len());
let query_ends_in_whitespace = self.keyword.ends_with(' ');
if (total_chars > 3) && !query_ends_in_whitespace {
fts_query.push('*');
let query_ends_in_whitespace = keyword.ends_with(' ');
let prefix_match = (total_chars > 3) && !query_ends_in_whitespace;
let sqlite_match_without_prefix_match = sqlite_match.clone();
if prefix_match {
sqlite_match.push('*');
}
fts_query
Self {
keyword_terms: keywords,
is_prefix_query: prefix_match,
match_arg: sqlite_match,
match_arg_without_prefix_match: sqlite_match_without_prefix_match,
}
}

/// Try to figure out if a FTS match required stemming
///
/// To test this, we have to try to mimic the SQLite FTS logic. This code doesn't do it
/// perfectly, but it should return the correct result most of the time.
pub fn match_required_stemming(&self, title: &str) -> bool {
let title = title.to_lowercase();
let split_title = Self::split_terms(&title);

!self.keyword_terms.iter().enumerate().all(|(i, keyword)| {
split_title.iter().any(|title_word| {
let last_keyword = i == self.keyword_terms.len() - 1;

if last_keyword && self.is_prefix_query {
title_word.starts_with(keyword)
} else {
title_word == keyword
}
})
})
}

fn split_terms(phrase: &str) -> Vec<&str> {
phrase
.split([' ', '(', ')', ':', '^', '*', '"', ','])
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added the , char in the list of things to split on. This way the comma in Trail Running, isn't included in the search terms. The FTS tokenizer ignores that, but it was messing up the stemming check logic.

.filter(|s| !s.is_empty())
.collect()
}

pub fn had_multiple_terms(&self) -> bool {
self.keyword_terms.len() > 1
}
}

Expand All @@ -189,7 +235,7 @@ mod test {

fn check_parse_keywords(input: &str, expected: Vec<&str>) {
let query = SuggestionQuery::all_providers(input);
assert_eq!(query.parse_keywords(), expected);
assert_eq!(query.fts_query().keyword_terms, expected);
}

#[test]
Expand All @@ -207,7 +253,7 @@ mod test {

fn check_fts_query(input: &str, expected: &str) {
let query = SuggestionQuery::all_providers(input);
assert_eq!(query.fts_query(), expected);
assert_eq!(query.fts_query().match_arg, expected);
}

#[test]
Expand All @@ -234,4 +280,24 @@ mod test {
check_fts_query(" ", r#""""#);
check_fts_query("()", r#""""#);
}

#[test]
fn test_fts_query_match_required_stemming() {
// These don't require stemming, since each keyword matches a term in the title
assert!(!FtsQuery::new("running shoes").match_required_stemming("running shoes"));
assert!(
!FtsQuery::new("running shoes").match_required_stemming("new balance running shoes")
);
// Case changes shouldn't matter
assert!(!FtsQuery::new("running shoes").match_required_stemming("Running Shoes"));
// This doesn't require stemming, since `:` is not part of the word
assert!(!FtsQuery::new("running shoes").match_required_stemming("Running: Shoes"));
// This requires the keywords to be stemmed in order to match
assert!(FtsQuery::new("run shoes").match_required_stemming("running shoes"));
// This didn't require stemming, since the last keyword was a prefix match
assert!(!FtsQuery::new("running sh").match_required_stemming("running shoes"));
// This does require stemming (we know it wasn't a prefix match since there's not enough
// characters).
assert!(FtsQuery::new("run").match_required_stemming("running shoes"));
}
}
Loading