Skip to content

Commit

Permalink
AMP FTS experiment logic
Browse files Browse the repository at this point in the history
This sets it up so we always ingest the FTS data and use the
`SuggestionProviderConstraints` passed to the query to determine how to
perform the query.  This seems like the simplest approach and it doesn't
increase ingestion time that much.  The benchmarks on my machine went
from 339.29 ms to 465.60 ms.
  • Loading branch information
bendk committed Jan 16, 2025
1 parent d7049d2 commit f63adf8
Show file tree
Hide file tree
Showing 8 changed files with 493 additions and 31 deletions.
216 changes: 193 additions & 23 deletions components/suggest/src/db.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ use crate::{
fakespot,
geoname::GeonameCache,
pocket::{split_keyword, KeywordConfidence},
provider::SuggestionProvider,
provider::{AmpMatchingStrategy, SuggestionProvider},
query::FtsQuery,
rs::{
DownloadedAmoSuggestion, DownloadedAmpSuggestion, DownloadedAmpWikipediaSuggestion,
Expand Down Expand Up @@ -321,35 +321,67 @@ impl<'a> SuggestDao<'a> {
&self,
query: &SuggestionQuery,
suggestion_type: AmpSuggestionType,
) -> Result<Vec<Suggestion>> {
let strategy = query
.provider_constraints
.as_ref()
.and_then(|c| c.amp_alternative_matching.as_ref());
match strategy {
None => self.fetch_amp_suggestions_using_keywords(query, suggestion_type, true),
Some(AmpMatchingStrategy::NoKeywordExpansion) => {
self.fetch_amp_suggestions_using_keywords(query, suggestion_type, false)
}
Some(AmpMatchingStrategy::FtsAgainstFullKeywords) => {
self.fetch_amp_suggestions_using_fts(query, suggestion_type, "full_keywords")
}
Some(AmpMatchingStrategy::FtsAgainstTitle) => {
self.fetch_amp_suggestions_using_fts(query, suggestion_type, "title")
}
}
}

pub fn fetch_amp_suggestions_using_keywords(
&self,
query: &SuggestionQuery,
suggestion_type: AmpSuggestionType,
allow_keyword_expansion: bool,
) -> Result<Vec<Suggestion>> {
let keyword_lowercased = &query.keyword.to_lowercase();
let provider = match suggestion_type {
AmpSuggestionType::Mobile => SuggestionProvider::AmpMobile,
AmpSuggestionType::Desktop => SuggestionProvider::Amp,
};
let where_extra = if allow_keyword_expansion {
""
} else {
"AND INSTR(CONCAT(fk.full_keyword, ' '), k.keyword) != 0"
};
let suggestions = self.conn.query_rows_and_then_cached(
r#"
SELECT
s.id,
k.rank,
s.title,
s.url,
s.provider,
s.score,
fk.full_keyword
FROM
suggestions s
JOIN
keywords k
ON k.suggestion_id = s.id
LEFT JOIN
full_keywords fk
ON k.full_keyword_id = fk.id
WHERE
s.provider = :provider
AND k.keyword = :keyword
AND NOT EXISTS (SELECT 1 FROM dismissed_suggestions WHERE url=s.url)
"#,
&format!(
r#"
SELECT
s.id,
k.rank,
s.title,
s.url,
s.provider,
s.score,
fk.full_keyword
FROM
suggestions s
JOIN
keywords k
ON k.suggestion_id = s.id
LEFT JOIN
full_keywords fk
ON k.full_keyword_id = fk.id
WHERE
s.provider = :provider
AND k.keyword = :keyword
{where_extra}
AND NOT EXISTS (SELECT 1 FROM dismissed_suggestions WHERE url=s.url)
"#
),
named_params! {
":keyword": keyword_lowercased,
":provider": provider
Expand Down Expand Up @@ -427,6 +459,96 @@ impl<'a> SuggestDao<'a> {
Ok(suggestions)
}

pub fn fetch_amp_suggestions_using_fts(
&self,
query: &SuggestionQuery,
suggestion_type: AmpSuggestionType,
fts_column: &str,
) -> Result<Vec<Suggestion>> {
let fts_query = query.fts_query();
let match_arg = &fts_query.match_arg;
let provider = match suggestion_type {
AmpSuggestionType::Mobile => SuggestionProvider::AmpMobile,
AmpSuggestionType::Desktop => SuggestionProvider::Amp,
};
let suggestions = self.conn.query_rows_and_then_cached(
&format!(
r#"
SELECT
s.id,
s.title,
s.url,
s.provider,
s.score
FROM
suggestions s
JOIN
amp_fts fts
ON fts.rowid = s.id
WHERE
s.provider = :provider
AND amp_fts match '{fts_column}: {match_arg}'
AND NOT EXISTS (SELECT 1 FROM dismissed_suggestions WHERE url=s.url)
ORDER BY rank
LIMIT 1
"#
),
named_params! {
":provider": provider
},
|row| -> Result<Suggestion> {
let suggestion_id: i64 = row.get("id")?;
let title = row.get("title")?;
let raw_url: String = row.get("url")?;
let score: f64 = row.get("score")?;

self.conn.query_row_and_then(
r#"
SELECT
amp.advertiser,
amp.block_id,
amp.iab_category,
amp.impression_url,
amp.click_url,
i.data AS icon,
i.mimetype AS icon_mimetype
FROM
amp_custom_details amp
LEFT JOIN
icons i ON amp.icon_id = i.id
WHERE
amp.suggestion_id = :suggestion_id
"#,
named_params! {
":suggestion_id": suggestion_id
},
|row| {
let cooked_url = cook_raw_suggestion_url(&raw_url);
let raw_click_url = row.get::<_, String>("click_url")?;
let cooked_click_url = cook_raw_suggestion_url(&raw_click_url);

Ok(Suggestion::Amp {
block_id: row.get("block_id")?,
advertiser: row.get("advertiser")?,
iab_category: row.get("iab_category")?,
title,
url: cooked_url,
raw_url,
full_keyword: query.keyword.clone(),
icon: row.get("icon")?,
icon_mimetype: row.get("icon_mimetype")?,
impression_url: row.get("impression_url")?,
click_url: cooked_click_url,
raw_click_url,
score,
})
},
)
},
)?;
Ok(suggestions)
}

/// Fetches Suggestions of type Wikipedia provider that match the given query
pub fn fetch_wikipedia_suggestions(&self, query: &SuggestionQuery) -> Result<Vec<Suggestion>> {
let keyword_lowercased = &query.keyword.to_lowercase();
Expand Down Expand Up @@ -907,6 +1029,21 @@ impl<'a> SuggestDao<'a> {
)?)
}

pub fn is_amp_fts_data_ingested(&self, record_id: &SuggestRecordId) -> Result<bool> {
Ok(self.conn.exists(
r#"
SELECT 1
FROM suggestions s
JOIN amp_fts fts
ON fts.rowid = s.id
WHERE s.record_id = :record_id
"#,
named_params! {
":record_id": record_id.as_str(),
},
)?)
}

/// Inserts all suggestions from a downloaded AMO attachment into
/// the database.
pub fn insert_amo_suggestions(
Expand Down Expand Up @@ -947,13 +1084,15 @@ impl<'a> SuggestDao<'a> {
&mut self,
record_id: &SuggestRecordId,
suggestions: &[DownloadedAmpWikipediaSuggestion],
enable_fts: bool,
) -> Result<()> {
// Prepare statements outside of the loop. This results in a large performance
// improvement on a fresh ingest, since there are so many rows.
let mut suggestion_insert = SuggestionInsertStatement::new(self.conn)?;
let mut amp_insert = AmpInsertStatement::new(self.conn)?;
let mut wiki_insert = WikipediaInsertStatement::new(self.conn)?;
let mut keyword_insert = KeywordInsertStatement::new(self.conn)?;
let mut fts_insert = AmpFtsInsertStatement::new(self.conn)?;
for suggestion in suggestions {
self.scope.err_if_interrupted()?;
let common_details = suggestion.common_details();
Expand All @@ -974,6 +1113,13 @@ impl<'a> SuggestDao<'a> {
wiki_insert.execute(suggestion_id, wikipedia)?;
}
}
if enable_fts {
fts_insert.execute(
suggestion_id,
&common_details.full_keywords_joined(),
&common_details.title,
)?;
}
let mut full_keyword_inserter = FullKeywordInserter::new(self.conn, suggestion_id);
for keyword in common_details.keywords() {
let full_keyword_id = match (suggestion, keyword.full_keyword) {
Expand Down Expand Up @@ -1764,6 +1910,30 @@ impl<'conn> KeywordMetricsInsertStatement<'conn> {
}
}

pub(crate) struct AmpFtsInsertStatement<'conn>(rusqlite::Statement<'conn>);

impl<'conn> AmpFtsInsertStatement<'conn> {
pub(crate) fn new(conn: &'conn Connection) -> Result<Self> {
Ok(Self(conn.prepare(
"INSERT INTO amp_fts(rowid, full_keywords, title)
VALUES(?, ?, ?)
",
)?))
}

pub(crate) fn execute(
&mut self,
suggestion_id: i64,
full_keywords: &str,
title: &str,
) -> Result<()> {
self.0
.execute((suggestion_id, full_keywords, title))
.with_context("amp fts insert")?;
Ok(())
}
}

fn provider_config_meta_key(provider: SuggestionProvider) -> String {
format!("{}{}", PROVIDER_CONFIG_META_KEY_PREFIX, provider as u8)
}
2 changes: 1 addition & 1 deletion components/suggest/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ pub use config::{SuggestGlobalConfig, SuggestProviderConfig};
pub use error::{Error, SuggestApiError};
pub use geoname::{Geoname, GeonameMatch, GeonameType};
pub use metrics::{LabeledTimingSample, SuggestIngestionMetrics};
pub use provider::{SuggestionProvider, SuggestionProviderConstraints};
pub use provider::{AmpMatchingStrategy, SuggestionProvider, SuggestionProviderConstraints};
pub use query::{QueryWithMetricsResult, SuggestionQuery};
pub use store::{InterruptKind, SuggestIngestionConstraints, SuggestStore, SuggestStoreBuilder};
pub use suggestion::{raw_suggestion_url_matches, Suggestion};
Expand Down
22 changes: 22 additions & 0 deletions components/suggest/src/provider.rs
Original file line number Diff line number Diff line change
Expand Up @@ -130,4 +130,26 @@ pub struct SuggestionProviderConstraints {
/// settings record(s).
#[uniffi(default = None)]
pub exposure_suggestion_types: Option<Vec<String>>,
/// Which strategy should we use for the AMP queries?
/// Use None for the default strategy.
#[uniffi(default = None)]
pub amp_alternative_matching: Option<AmpMatchingStrategy>,
}

#[derive(Clone, Debug, uniffi::Enum)]
pub enum AmpMatchingStrategy {
/// Disable keywords added via keyword expansion.
/// This eliminates keywords that for terms related to the "real" keywords, for example
/// misspellings like "underarmor" instead of "under armor"'.
NoKeywordExpansion,
/// Use FTS matching against the full keywords, joined together.
FtsAgainstFullKeywords,
/// Use FTS matching against the title field
FtsAgainstTitle,
}

impl AmpMatchingStrategy {
pub fn uses_fts(&self) -> bool {
matches!(self, Self::FtsAgainstFullKeywords | Self::FtsAgainstTitle)
}
}
1 change: 1 addition & 0 deletions components/suggest/src/query.rs
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,7 @@ impl SuggestionQuery {
exposure_suggestion_types: Some(
suggestion_types.iter().map(|s| s.to_string()).collect(),
),
..SuggestionProviderConstraints::default()
}),
..Self::default()
}
Expand Down
19 changes: 18 additions & 1 deletion components/suggest/src/rs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
//! the new suggestion in their results, and return `Suggestion::T` variants
//! as needed.
use std::fmt;
use std::{collections::HashSet, fmt};

use remote_settings::{Attachment, RemoteSettingsRecord};
use serde::{Deserialize, Deserializer};
Expand Down Expand Up @@ -436,6 +436,23 @@ impl DownloadedSuggestionCommonDetails {
},
)
}

/// Get the full keywords as a single string
pub fn full_keywords_joined(&self) -> String {
let parts: HashSet<_> = self
.full_keywords
.iter()
.flat_map(|(s, _)| s.split_whitespace())
.collect();
let mut result = String::new();
for (i, part) in parts.into_iter().enumerate() {
if i != 0 {
result.push(' ');
}
result.push_str(part);
}
result
}
}

#[derive(Debug, PartialEq, Eq)]
Expand Down
Loading

0 comments on commit f63adf8

Please sign in to comment.