From fca9a92e08e1c18ca885a12cb9f999a2d399cd9a Mon Sep 17 00:00:00 2001 From: Martin Magnus Date: Fri, 3 Jan 2025 11:23:52 +0100 Subject: [PATCH] Add log-only mode for metrics based bad token detection (#3203) # Description The metrics based bad token detection has not been tested with real data yet so actively filtering out tokens based on it is pretty risky. If the logic is too naive or simply has a bug we might filter out a bunch of tokens which are actually good which could reduce the throughput of the system drastically. # Changes To increase the confidence in the system this PR introduces a log only mode for the metrics based bad token detection. In that mode no tokens will be filtered out based on the metrics but we already get logs indicating this tokens would get filtered out by it. When we start running this mode in prod we should hopefully see 2 things: 1 good tokens don't get filtered out 2 tokens which currently cause alerts like `Driver Run Error Rate Too High` should be flagged by the logic `2` would then indicate that the new logic would reduce the unactionable alerts if the feature gets enabled fully. I only added logs when we update the metrics and a token "turns bad". This should keep the noise in the logs low while still providing all the necessary information. ## How to test Deploying this to prod aids in testing the overall feature --- .../domain/competition/bad_tokens/metrics.rs | 28 +++++++++++++++---- crates/driver/src/infra/api/mod.rs | 20 ++++++------- crates/driver/src/infra/config/file/load.rs | 1 + crates/driver/src/infra/config/file/mod.rs | 12 ++++++++ crates/driver/src/infra/solver/mod.rs | 1 + 5 files changed, 46 insertions(+), 16 deletions(-) diff --git a/crates/driver/src/domain/competition/bad_tokens/metrics.rs b/crates/driver/src/domain/competition/bad_tokens/metrics.rs index 570b32fee7..50917c6d53 100644 --- a/crates/driver/src/domain/competition/bad_tokens/metrics.rs +++ b/crates/driver/src/domain/competition/bad_tokens/metrics.rs @@ -10,11 +10,12 @@ struct TokenStatistics { pub struct DetectorBuilder(Arc>); impl DetectorBuilder { - pub fn build(self, failure_ratio: f64, required_measurements: u32) -> Detector { + pub fn build(self, failure_ratio: f64, required_measurements: u32, log_only: bool) -> Detector { Detector { failure_ratio, required_measurements, counter: self.0, + log_only, } } } @@ -29,15 +30,20 @@ pub struct Detector { failure_ratio: f64, required_measurements: u32, counter: Arc>, + log_only: bool, } impl Detector { pub fn get_quality(&self, token: ð::TokenAddress) -> Option { let measurements = self.counter.get(token)?; - let is_unsupported = measurements.attempts >= self.required_measurements - && (measurements.fails as f64 / measurements.attempts as f64) >= self.failure_ratio; + let is_unsupported = self.is_unsupported(&measurements); + (!self.log_only && is_unsupported).then_some(Quality::Unsupported) + } - is_unsupported.then_some(Quality::Unsupported) + fn is_unsupported(&self, measurements: &TokenStatistics) -> bool { + let token_failure_ratio = measurements.fails as f64 / measurements.attempts as f64; + measurements.attempts >= self.required_measurements + && token_failure_ratio >= self.failure_ratio } /// Updates the tokens that participated in settlements by @@ -48,11 +54,13 @@ impl Detector { token_pairs: &[(eth::TokenAddress, eth::TokenAddress)], failure: bool, ) { + let mut unsupported_tokens = vec![]; token_pairs .iter() .flat_map(|(token_a, token_b)| [token_a, token_b]) .for_each(|token| { - self.counter + let measurement = self + .counter .entry(*token) .and_modify(|counter| { counter.attempts += 1; @@ -62,6 +70,16 @@ impl Detector { attempts: 1, fails: u32::from(failure), }); + if self.is_unsupported(&measurement) { + unsupported_tokens.push(token); + } }); + + if !unsupported_tokens.is_empty() { + tracing::debug!( + tokens = ?unsupported_tokens, + "mark tokens as unsupported" + ); + } } } diff --git a/crates/driver/src/infra/api/mod.rs b/crates/driver/src/infra/api/mod.rs index e4b760822c..3047c796a6 100644 --- a/crates/driver/src/infra/api/mod.rs +++ b/crates/driver/src/infra/api/mod.rs @@ -73,21 +73,19 @@ impl Api { let router = routes::reveal(router); let router = routes::settle(router); + let bad_token_config = solver.bad_token_detection(); let mut bad_tokens = - bad_tokens::Detector::new(solver.bad_token_detection().tokens_supported.clone()); - if solver.bad_token_detection().enable_simulation_strategy { + bad_tokens::Detector::new(bad_token_config.tokens_supported.clone()); + if bad_token_config.enable_simulation_strategy { bad_tokens.with_simulation_detector(self.bad_token_detector.clone()); } - if solver.bad_token_detection().enable_metrics_strategy { - bad_tokens.with_metrics_detector( - metrics_bad_token_detector_builder.clone().build( - solver.bad_token_detection().metrics_strategy_failure_ratio, - solver - .bad_token_detection() - .metrics_strategy_required_measurements, - ), - ); + if bad_token_config.enable_metrics_strategy { + bad_tokens.with_metrics_detector(metrics_bad_token_detector_builder.clone().build( + bad_token_config.metrics_strategy_failure_ratio, + bad_token_config.metrics_strategy_required_measurements, + bad_token_config.metrics_strategy_log_only, + )); } let router = router.with_state(State(Arc::new(Inner { diff --git a/crates/driver/src/infra/config/file/load.rs b/crates/driver/src/infra/config/file/load.rs index 1497b4bdc9..9c19aad70a 100644 --- a/crates/driver/src/infra/config/file/load.rs +++ b/crates/driver/src/infra/config/file/load.rs @@ -123,6 +123,7 @@ pub async fn load(chain: Chain, path: &Path) -> infra::Config { metrics_strategy_required_measurements: config .bad_token_detection .metrics_strategy_required_measurements, + metrics_strategy_log_only: config.bad_token_detection.metrics_strategy_log_only, }, settle_queue_size: config.settle_queue_size, } diff --git a/crates/driver/src/infra/config/file/mod.rs b/crates/driver/src/infra/config/file/mod.rs index 4f38b43e0f..5677e60be7 100644 --- a/crates/driver/src/infra/config/file/mod.rs +++ b/crates/driver/src/infra/config/file/mod.rs @@ -708,6 +708,14 @@ pub struct BadTokenDetectionConfig { rename = "metrics-bad-token-detection-required-measurements" )] pub metrics_strategy_required_measurements: u32, + + /// Controls whether the metrics based detection strategy should only log + /// unsupported tokens or actually filter them out. + #[serde( + default = "default_metrics_bad_token_detector_log_only", + rename = "metrics-bad-token-detection-log-only" + )] + pub metrics_strategy_log_only: bool, } impl Default for BadTokenDetectionConfig { @@ -730,3 +738,7 @@ fn default_metrics_bad_token_detector_required_measurements() -> u32 { fn default_settle_queue_size() -> usize { 2 } + +fn default_metrics_bad_token_detector_log_only() -> bool { + true +} diff --git a/crates/driver/src/infra/solver/mod.rs b/crates/driver/src/infra/solver/mod.rs index 37aa96ef26..cc0802a5a2 100644 --- a/crates/driver/src/infra/solver/mod.rs +++ b/crates/driver/src/infra/solver/mod.rs @@ -316,4 +316,5 @@ pub struct BadTokenDetection { pub enable_metrics_strategy: bool, pub metrics_strategy_failure_ratio: f64, pub metrics_strategy_required_measurements: u32, + pub metrics_strategy_log_only: bool, }