Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: support protein sequences #36

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 35 additions & 11 deletions src/lib/matcher.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ use std::ops::Range;
use crate::color::{color_background, color_head};
use crate::color::{COLOR_BACKGROUND, COLOR_BASES, COLOR_QUALS};
use crate::reverse_complement;
use crate::AMINO_ACIDS;
use crate::DNA_BASES;
use anyhow::{bail, Context, Result};
use bstr::ByteSlice;
Expand Down Expand Up @@ -120,9 +121,18 @@ fn bases_colored(
}

/// Validates that a given FIXED pattern contains only valid DNA bases (ACGTN)
pub fn validate_fixed_pattern(pattern: &str) -> Result<()> {
pub fn validate_fixed_pattern(pattern: &str, protein: bool) -> Result<()> {
for (index, base) in pattern.chars().enumerate() {
if !DNA_BASES.contains(&(base as u8)) {
if protein {
if !AMINO_ACIDS.contains(&(base as u8)) {
bail!(
"Fixed pattern must contain only amino acids: {} .. [{}] .. {}",
&pattern[0..index],
&pattern[index..=index],
&pattern[index + 1..],
)
}
} else if !DNA_BASES.contains(&(base as u8)) {
bail!(
"Fixed pattern must contain only DNA bases: {} .. [{}] .. {}",
&pattern[0..index],
Expand Down Expand Up @@ -609,17 +619,31 @@ pub mod tests {
// Tests validate_fixed_pattern()
// ############################################################################################

#[test]
fn test_validate_fixed_pattern_is_ok() {
let pattern = "AGTGTGATG";
let result = validate_fixed_pattern(&pattern);
#[rstest]
#[case("AGTGTGATG", false)]
#[case("QRNQRNQRN", true)]
fn test_validate_fixed_pattern_is_ok(#[case] pattern: &str, #[case] protein: bool) {
let result = validate_fixed_pattern(&pattern, protein);
assert!(result.is_ok())
}
#[test]
fn test_validate_fixed_pattern_error() {
let pattern = "AXGTGTGATG";
let msg = String::from("Fixed pattern must contain only DNA bases: A .. [X] .. GTGTGATG");
let result = validate_fixed_pattern(&pattern);

#[rstest]
#[case(
"AXGTGTGATG",
false,
"Fixed pattern must contain only DNA bases: A .. [X] .. GTGTGATG"
)]
#[case(
"QRNQRNZQRN",
true,
"Fixed pattern must contain only amino acids: QRNQRN .. [Z] .. QRN"
)]
fn test_validate_fixed_pattern_error(
#[case] pattern: &str,
#[case] protein: bool,
#[case] msg: &str,
) {
let result = validate_fixed_pattern(&pattern, protein);
let inner = result.unwrap_err().to_string();
assert_eq!(inner, msg);
}
Expand Down
1 change: 1 addition & 0 deletions src/lib/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ use lazy_static::lazy_static;
use std::{borrow::Borrow, path::Path};

pub const DNA_BASES: [u8; 5] = *b"ACGTN";
pub const AMINO_ACIDS: [u8; 20] = *b"ARNDCEQGHILKMFPSTWYV";
pub const IUPAC_BASES: [u8; 15] = *b"AGCTYRWSKMDVHBN";
pub const IUPAC_BASES_COMPLEMENT: [u8; 15] = *b"TCGARYWSMKHBDVN";

Expand Down
89 changes: 84 additions & 5 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,10 @@ struct Opts {
#[structopt(long)]
progress: bool,

/// The input files contain protein sequence (amino acids), not DNA sequence.
#[structopt(long)]
protein: bool,

/// The first argument is the pattern to match, with the remaining arguments containing the
/// files to match. If `-e` is given, then all the arguments are files to match.
/// Use standard input if either no files are given or `-` is given.
Expand Down Expand Up @@ -366,13 +370,17 @@ fn fqgrep_from_opts(opts: &Opts) -> Result<usize> {
);
}

if opts.protein && opts.reverse_complement {
bail!("--reverse-complement many not be used with --protein")
}

// Validate the fixed string pattern, if fixed-strings are specified
if opts.fixed_strings {
if let Some(pattern) = &pattern {
validate_fixed_pattern(pattern)?;
validate_fixed_pattern(pattern, opts.protein)?;
} else if !opts.regexp.is_empty() {
for pattern in &opts.regexp {
validate_fixed_pattern(pattern)?;
validate_fixed_pattern(pattern, opts.protein)?;
}
} else {
bail!("A pattern must be given as a positional argument or with -e/--regexp")
Expand Down Expand Up @@ -672,6 +680,7 @@ pub mod tests {
paired: false,
reverse_complement: false,
progress: true,
protein: false,
args: fq_path.to_vec(),
output: output,
};
Expand Down Expand Up @@ -797,6 +806,43 @@ pub mod tests {
assert_eq!(sequence_match, expected_bool);
}

// ############################################################################################
//Tests match with protein (not DNA!)
// ############################################################################################
#[rstest]
// fixed strings
#[case(vec!["A"], vec!["AAAA", "ATAT", "TATA", "AAAT", "TTTA"])] // unpaired: fixed string with multiple matches
#[case(vec!["Q"], vec!["QFPQFP"])] // unpaired: fixed string with one match
#[case(vec!["M"], vec![])] // unpaired: fixed string with no matches
// regex
#[case(vec!["^Q"], vec!["QFPQFP"])] // unpaired: regex with one match
#[case(vec!["^Q", "^F"], vec!["QFPQFP"])] // unpaired: regex set with two matches
#[case(vec!["^Q", "^A"], vec!["AAAA", "ATAT", "AAAT", "QFPQFP"])] // unpaired: regex set with two matches
#[case(vec!["^M", "^K"], vec![])] // unpaired: regex set with no matches
fn test_protein_ok(#[case] pattern: Vec<&str>, #[case] expected_seq: Vec<&str>) {
let dir = TempDir::new().unwrap();
let seqs = vec![vec![
"AAAA", "TTTT", "ATAT", "TATA", "AAAT", "TTTA", "QFPQFP",
]];
let out_path = dir.path().join(String::from("output.fq"));
let result_path = &out_path.clone();
let pattern = pattern.iter().map(|&s| s.to_owned()).collect::<Vec<_>>();
let mut opts = build_opts(
&dir,
&seqs,
&pattern,
true,
Some(out_path),
String::from(".fq"),
);

opts.protein = true;
let _result = fqgrep_from_opts(&opts);
let return_sequences = slurp_output(result_path.to_path_buf());

assert_eq!(return_sequences, expected_seq);
}

// ############################################################################################
// Tests two fastqs for 'TGGATTCAGACTT' which is only found once in the reverse complement
// Tests inverse_match
Expand Down Expand Up @@ -832,20 +878,51 @@ pub mod tests {
assert_eq!(result.unwrap(), expected);
}

//
// ############################################################################################
// Tests that an error is returned when protein and reverse_complement are both present
// ############################################################################################
#[rstest]
#[should_panic(
expected = "called `Result::unwrap()` on an `Err` value: --reverse-complement many not be used with --protein"
)]
#[case()]
fn text_fails_with_protein_and_reverse_complement() {
let dir = TempDir::new().unwrap();
let seqs = vec![vec!["GGGG", "TTTT"], vec!["AAAA", "CCCC"]];
let pattern = vec![String::from("^G")];

let mut opts = build_opts(&dir, &seqs, &pattern, true, None, String::from(".fq"));

opts.protein = true;
opts.reverse_complement = true;
let _result = fqgrep_from_opts(&opts);
_result.unwrap();
}

// ############################################################################################
// Tests that an error is returned when fixed_strings is true and regex is present
// Tests that an error is returned when fixed_strings is true for DNA and regex is present
// ############################################################################################
#[rstest]
#[should_panic(
expected = "called `Result::unwrap()` on an `Err` value: Fixed pattern must contain only DNA bases: .. [^] .. G"
)]
#[case(true, vec![String::from("^G")])] // panic with single regex
#[case(true, false, vec![String::from("^G")])] // panic with single regex
#[should_panic(
expected = "called `Result::unwrap()` on an `Err` value: Fixed pattern must contain only DNA bases: .. [^] .. A"
)]
#[case(true, vec![String::from("^A"),String::from("AA")])] // panic with combination of regex and fixed string
#[case(true, false, vec![String::from("^A"),String::from("AA")])] // panic with combination of regex and fixed string
#[should_panic(
expected = "called `Result::unwrap()` on an `Err` value: Fixed pattern must contain only amino acids: .. [^] .. Q"
)]
#[case(true, true, vec![String::from("^Q")])] // panic with single regex
#[should_panic(
expected = "called `Result::unwrap()` on an `Err` value: Fixed pattern must contain only amino acids: .. [^] .. Q"
)]
#[case(true, true, vec![String::from("^Q"),String::from("QQ")])] // panic with combination of regex and fixed string
fn test_regexp_from_fixed_string_fails_with_regex(
#[case] fixed_strings: bool,
#[case] protein: bool,
#[case] pattern: Vec<String>,
) {
let dir = TempDir::new().unwrap();
Expand All @@ -854,9 +931,11 @@ pub mod tests {
let mut opts = build_opts(&dir, &seqs, &pattern, true, None, String::from(".fq"));

opts.fixed_strings = fixed_strings;
opts.protein = protein;
let _result = fqgrep_from_opts(&opts);
_result.unwrap();
}

// ############################################################################################
// Tests error is returned from main when three records are defined as paired
// ############################################################################################
Expand Down
Loading