From 16db1dd9eb252d1bdb0eae3af421c8047941ad51 Mon Sep 17 00:00:00 2001 From: mholt Date: Fri, 26 Jan 2024 10:31:19 -0800 Subject: [PATCH] changes for IUPAC fix and WFAGraph boundaries --- CHANGELOG.md | 5 ++++ Cargo.lock | 2 +- Cargo.toml | 2 +- LICENSE-THIRDPARTY.json | 2 +- src/data_types/variants.rs | 2 +- src/phaser.rs | 59 +++++++++++++++++++++++++++++++++----- src/wfa_graph.rs | 30 +++++++++++++++++++ 7 files changed, 91 insertions(+), 11 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index da06f3c..078bdfe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,8 @@ +# v1.2.1 +## Fixed +- Fixed [a rare issue](https://github.com/PacificBiosciences/pbbioconda/issues/640) where reference alleles with stripped IUPAC codes were throwing errors due to reference mismatch +- Fixed an issue where variants preceding a GraphWFA region were not ignored, potentially leading to aberrant graph structure + # v1.2.0 ## Changes - Added an option (`--csi-index`) to output .csi index files instead of .tbi/.bai diff --git a/Cargo.lock b/Cargo.lock index 886acee..4b51140 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -523,7 +523,7 @@ dependencies = [ [[package]] name = "hiphase" -version = "1.2.0" +version = "1.2.1" dependencies = [ "bio", "bit-vec", diff --git a/Cargo.toml b/Cargo.toml index 64a5fcb..383a808 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "hiphase" -version = "1.2.0" +version = "1.2.1" authors = ["J. Matthew Holt "] description = "A tool for jointly phasing small, structural, and tandem repeat variants for PacBio sequencing data" edition = "2021" diff --git a/LICENSE-THIRDPARTY.json b/LICENSE-THIRDPARTY.json index 84965f6..a7f7d8a 100644 --- a/LICENSE-THIRDPARTY.json +++ b/LICENSE-THIRDPARTY.json @@ -496,7 +496,7 @@ }, { "name": "hiphase", - "version": "1.2.0", + "version": "1.2.1", "authors": "J. Matthew Holt ", "repository": null, "license": null, diff --git a/src/data_types/variants.rs b/src/data_types/variants.rs index 0f3be59..d540453 100644 --- a/src/data_types/variants.rs +++ b/src/data_types/variants.rs @@ -42,7 +42,7 @@ pub enum Zygosity { /// A variant definition structure. /// It currently assumes that chromosome is fixed and that the variant is a SNP. -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Eq, PartialEq)] pub struct Variant { /// The vcf index from the input datasets vcf_index: usize, diff --git a/src/phaser.rs b/src/phaser.rs index 2b4e0c2..e94aa90 100644 --- a/src/phaser.rs +++ b/src/phaser.rs @@ -239,13 +239,26 @@ fn load_variant_calls( // that users do crazy things, so we need to convert it to a full Error let ref_sequence = reference_genome.get_slice(region.get_chrom(), position as usize, ref_postfix_start); if all_alleles[0] != ref_sequence { - bail!( - "Reference mismatch error: variant at {}:{} has REF allele = \"{}\", but reference genome has \"{}\".", - region.get_chrom(), position+1, - // we don't want to panic in the middle of this, so use a safe unwrapper with default - std::str::from_utf8(all_alleles[0]).unwrap_or("utf8 decode error"), - std::str::from_utf8(ref_sequence).unwrap_or("utf8 decode error") - ); + // check there are IUPAC in the reference + let iupac_filter_ref_sequence: Vec = ref_sequence.iter() + .map(|&c| { + match c { + b'A' | b'C' | b'G' | b'T' => c, + _ => b'N' + } + }).collect(); + if all_alleles[0] != iupac_filter_ref_sequence { + bail!( + "Reference mismatch error: variant at {}:{} has REF allele = \"{}\", but reference genome has \"{}\".", + region.get_chrom(), position+1, + // we don't want to panic in the middle of this, so use a safe unwrapper with default + std::str::from_utf8(all_alleles[0]).unwrap_or("utf8 decode error"), + std::str::from_utf8(ref_sequence).unwrap_or("utf8 decode error") + ); + } else { + // this is a case where a non-ACGT was replaced with N, so allow it + // debug!("IUPAC match"); + } } // check if this variant is too close to the previous variant @@ -800,4 +813,36 @@ mod tests { assert_eq!(haplotag_result.get("r4").unwrap(), &(3, 1)); // spans two blocks and is inexact, but closer to hap 1 starting with block 3 assert_eq!(haplotag_result.get("r2").unwrap(), &(3, 1)); // equal by alleles, but qual on allele 1 is higher } + + // tests that having a IUPAC code modified to "N" is okay + #[test] + fn test_iupac_conversion() { + // create a single variant block for simplicity + let mut phase_block = PhaseBlock::new(0, "chr1".to_string(), 0, 20, "sample_name".to_string(), 1); + phase_block.add_locus_variant("chr1", 4, 0); + + // load it up + let vcf_path = PathBuf::from("./test_data/iupac_test/small_variants.vcf.gz"); + let ref_fn = PathBuf::from("./test_data/iupac_test/test_reference.fa"); + let reference_genome = ReferenceGenome::from_fasta(&ref_fn).unwrap(); + let reference_buffer = 2; + let is_hom_allowed = false; + let (het_variants, hom_variants) = load_variant_calls( + &phase_block, + &[vcf_path], + &reference_genome, + reference_buffer, + is_hom_allowed + ).unwrap(); + + // we should just have the one variant in the block with the translated IUPAC code + let mut expected_variant = Variant::new_indel(0, 4, 4, b"ANGT".to_vec(), b"AT".to_vec(), 0, 1); + expected_variant.add_reference_prefix(b"GT"); + expected_variant.add_reference_postfix(b"AC"); + + assert_eq!(het_variants, vec![ + expected_variant + ]); + assert!(hom_variants.is_empty()); + } } \ No newline at end of file diff --git a/src/wfa_graph.rs b/src/wfa_graph.rs index 513f0bf..20b065c 100644 --- a/src/wfa_graph.rs +++ b/src/wfa_graph.rs @@ -132,6 +132,11 @@ impl WFAGraph { // look at where this variant is let variant_pos: usize = variant.position() as usize; let ref_len: usize = variant.get_ref_len(); + if variant_pos < ref_start { + // this variant starts before our reference block, so ignore it + trace!("Ignoring variant starting at {} before ref_start {}", variant_pos, ref_start); + continue; + } if variant_pos + ref_len > ref_end { // this variant end after our reference block, so ignore it trace!("Ignoring variant ending at {} after ref_end {}", variant_pos+ref_len, ref_end); @@ -1062,6 +1067,31 @@ mod tests { // After here are mostly edge case bug tests //////////////////////////////////////////////////////////////////////////////// + // tests when a variant starts before the provided reference, we should ignore it basically + #[test] + fn test_variant_before_start() { + // the first 10 bases are ignored here "NNNNNNNNNA" + let reference = "NNNNNNNNNAACGTA".as_bytes(); + let ref_start: usize = 10; + // the first one should be ignored, the second one included + let variants = vec![ + // vcf_index, position, allele0, allele1, index_allele0, index_allele1 + Variant::new_snv(0, (ref_start-1) as i64, "A".as_bytes().to_vec(), "T".as_bytes().to_vec(), 0, 1), + Variant::new_snv(0, ref_start as i64, "A".as_bytes().to_vec(), "T".as_bytes().to_vec(), 0, 1) + ]; + + let (graph, node_to_alleles): (WFAGraph, NodeAlleleMap) = + WFAGraph::from_reference_variants(&reference, &variants, ref_start, reference.len()).unwrap(); + + // check the alignments first + assert_eq!(graph.get_num_nodes(), 4); + + // now check our lookup tables + assert_eq!(*node_to_alleles.get(&0).unwrap_or(&vec![]), vec![]); // first node is empty + assert_eq!(*node_to_alleles.get(&1).unwrap_or(&vec![]), vec![(1, 1)]); // second is the alternate for the SECOND variant (first variant is ignored) + assert_eq!(*node_to_alleles.get(&2).unwrap_or(&vec![]), vec![(1, 0)]); // third is the reference for the SECOND variant + assert_eq!(*node_to_alleles.get(&3).unwrap_or(&vec![]), vec![]); // last is just more reference + } // tests when a variant goes past the provided reference, we should ignore it basically #[test]