From 9d54bec5ee97b647886720019105dbed6b0e1212 Mon Sep 17 00:00:00 2001 From: Brent Pedersen Date: Fri, 18 Aug 2023 10:53:39 +0200 Subject: [PATCH] moving average and base-quality --- src/bin/commands/trimmer.rs | 8 ++++ src/bin/main.rs | 4 +- src/lib/base_quality.rs | 73 +++++++++++++++++++++++++++++++++++++ src/lib/mod.rs | 2 + src/lib/moving_average.rs | 53 +++++++++++++++++++++++++++ 5 files changed, 138 insertions(+), 2 deletions(-) create mode 100644 src/lib/base_quality.rs create mode 100644 src/lib/moving_average.rs diff --git a/src/bin/commands/trimmer.rs b/src/bin/commands/trimmer.rs index 9a65e94..893c74f 100644 --- a/src/bin/commands/trimmer.rs +++ b/src/bin/commands/trimmer.rs @@ -19,6 +19,14 @@ pub(crate) struct TrimmerOpts { #[clap(long, short = 't', default_value = "5")] threads: usize, + /// Minimum base-quality to keep a base when trimming tails. + #[clap(long, short = 'q', default_value = "20")] + trim_tail_quality: u8, + + /// Window size for moving average when trimming tails. + #[clap(long, short = 'w', default_value = "20")] + trim_tail_window: u8, + /// Level of compression to use to compress outputs. #[clap(long, short = 'c', default_value = "5")] compression_level: usize, diff --git a/src/bin/main.rs b/src/bin/main.rs index 1e78df1..c42202b 100644 --- a/src/bin/main.rs +++ b/src/bin/main.rs @@ -5,7 +5,7 @@ pub mod commands; use anyhow::Result; use clap::Parser; use commands::command::Command; -use commands::{demux::Demux, trimmer::Trimmer}; +use commands::{demux::Demux, trimmer::TrimmerOpts}; use enum_dispatch::enum_dispatch; use env_logger::Env; @@ -23,7 +23,7 @@ struct Args { #[command(version)] enum Subcommand { Demux(Demux), - Trimmer(Trimmer), + Trimmer(TrimmerOpts), } fn main() -> Result<()> { diff --git a/src/lib/base_quality.rs b/src/lib/base_quality.rs new file mode 100644 index 0000000..79a6db4 --- /dev/null +++ b/src/lib/base_quality.rs @@ -0,0 +1,73 @@ +use super::moving_average::MovingAverage; + +use std::ops::Range; + +pub(crate) fn find_oscillating_quals(bqs: &[u8]) -> Range { + return 0..0; +} + +pub(crate) enum Tail { + Left, + Right, + Both, +} + +/// Uses a moving average to return a range of high quality bases. +/// If all bases are high-quality, the range is the full read. +pub(crate) fn find_high_quality_bases( + bqs: &[u8], + min_quality: u8, + window: u8, + tail: Tail, +) -> Range { + let mut left = 0; + let mut right = bqs.len(); + if matches!(tail, Tail::Left | Tail::Both) { + let mut ma = MovingAverage::::new(window as usize); + for &bq in bqs { + let mean = ma.push(bq); + if mean >= min_quality as f64 { + break; + } + left += 1; + } + } + if matches!(tail, Tail::Right | Tail::Both) { + let mut ma = MovingAverage::::new(window as usize); + for &bq in bqs.iter().rev() { + let mean = ma.push(bq); + if mean >= min_quality as f64 { + break; + } + right -= 1; + } + } + left..right +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_find_hq_all() { + let bqs = b"IIIIIIII"; + let range = find_high_quality_bases(bqs, 'I' as u8, 3, Tail::Both); + assert_eq!(range, 0..bqs.len()); + } + + #[test] + fn test_find_hq_ends() { + let bqs = b"EIIIIIIE"; + let range = find_high_quality_bases(bqs, 'I' as u8, 1, Tail::Both); + assert_eq!(range, 1..bqs.len() - 1); + + let bqs = b"EIIIIIIE"; + let range = find_high_quality_bases(bqs, 'I' as u8, 1, Tail::Left); + assert_eq!(range, 1..bqs.len()); + + let bqs = b"EIIIIIIE"; + let range = find_high_quality_bases(bqs, 'I' as u8, 1, Tail::Right); + assert_eq!(range, 0..bqs.len() - 1); + } +} diff --git a/src/lib/mod.rs b/src/lib/mod.rs index 022a1e7..e8f2627 100644 --- a/src/lib/mod.rs +++ b/src/lib/mod.rs @@ -1,4 +1,6 @@ pub mod barcode_matching; +pub mod base_quality; +pub mod moving_average; pub mod pair_overlap; pub mod samples; diff --git a/src/lib/moving_average.rs b/src/lib/moving_average.rs new file mode 100644 index 0000000..654f4d1 --- /dev/null +++ b/src/lib/moving_average.rs @@ -0,0 +1,53 @@ +/// A simple moving average calculator. +/// Only requires that T is convertable to f64. +/// Uses space of window * size_of(T) bytes. +pub(crate) struct MovingAverage { + window: usize, + values: Vec, + sum: f64, + idx: usize, + count: usize, +} + +impl> MovingAverage { + /// create a new moving average calculator with a window of `window` values. + pub fn new(window: usize) -> Self { + Self { window, values: vec![T::default(); window], sum: 0.0, idx: 0, count: 0 } + } + + /// push a new value into the moving average calculator and get the new mean. + pub fn push(&mut self, value: T) -> f64 { + let old_value = self.values[self.idx]; + self.values[self.idx] = value; + self.sum = self.sum + value.into() - old_value.into(); + self.idx = (self.idx + 1) % self.window; + self.count += 1; + self.mean() + } + + /// get the current mean. + #[inline] + pub fn mean(&self) -> f64 { + self.sum / (self.count.min(self.window) as f64) + } +} + +// write some tests for the calculator +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_moving_average() { + let window_size = 3; + let mut ma = MovingAverage::new(window_size); + // NOTE the first value is always the mean + // we use min of values added and window size to calculate the mean + assert_eq!(ma.push(1), 1 as f64 / 1 as f64); + assert_eq!(ma.push(2), (1 + 2) as f64 / 2 as f64); + assert_eq!(ma.push(3), (1 + 2 + 3) as f64 / window_size as f64); + assert_eq!(ma.push(4), (2 + 3 + 4) as f64 / window_size as f64); + assert_eq!(ma.push(5), (3 + 4 + 5) as f64 / window_size as f64); + assert_eq!(ma.push(6), (4 + 5 + 6) as f64 / window_size as f64); + } +}