Skip to content

Commit

Permalink
Merge pull request #23 from PacificBiosciences/iupac_fix
Browse files Browse the repository at this point in the history
changes for IUPAC fix and WFAGraph boundaries
  • Loading branch information
holtjma authored Jan 26, 2024
2 parents 28ee5d5 + 16db1dd commit dff3c47
Show file tree
Hide file tree
Showing 7 changed files with 91 additions and 11 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
# v1.2.1
## Fixed
- Fixed [a rare issue](https://github.com/PacificBiosciences/pbbioconda/issues/640) where reference alleles with stripped IUPAC codes were throwing errors due to reference mismatch
- Fixed an issue where variants preceding a GraphWFA region were not ignored, potentially leading to aberrant graph structure

# v1.2.0
## Changes
- Added an option (`--csi-index`) to output .csi index files instead of .tbi/.bai
Expand Down
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "hiphase"
version = "1.2.0"
version = "1.2.1"
authors = ["J. Matthew Holt <[email protected]>"]
description = "A tool for jointly phasing small, structural, and tandem repeat variants for PacBio sequencing data"
edition = "2021"
Expand Down
2 changes: 1 addition & 1 deletion LICENSE-THIRDPARTY.json
Original file line number Diff line number Diff line change
Expand Up @@ -496,7 +496,7 @@
},
{
"name": "hiphase",
"version": "1.2.0",
"version": "1.2.1",
"authors": "J. Matthew Holt <[email protected]>",
"repository": null,
"license": null,
Expand Down
2 changes: 1 addition & 1 deletion src/data_types/variants.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ pub enum Zygosity {

/// A variant definition structure.
/// It currently assumes that chromosome is fixed and that the variant is a SNP.
#[derive(Clone, Debug)]
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Variant {
/// The vcf index from the input datasets
vcf_index: usize,
Expand Down
59 changes: 52 additions & 7 deletions src/phaser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -239,13 +239,26 @@ fn load_variant_calls(
// that users do crazy things, so we need to convert it to a full Error
let ref_sequence = reference_genome.get_slice(region.get_chrom(), position as usize, ref_postfix_start);
if all_alleles[0] != ref_sequence {
bail!(
"Reference mismatch error: variant at {}:{} has REF allele = \"{}\", but reference genome has \"{}\".",
region.get_chrom(), position+1,
// we don't want to panic in the middle of this, so use a safe unwrapper with default
std::str::from_utf8(all_alleles[0]).unwrap_or("utf8 decode error"),
std::str::from_utf8(ref_sequence).unwrap_or("utf8 decode error")
);
// check there are IUPAC in the reference
let iupac_filter_ref_sequence: Vec<u8> = ref_sequence.iter()
.map(|&c| {
match c {
b'A' | b'C' | b'G' | b'T' => c,
_ => b'N'
}
}).collect();
if all_alleles[0] != iupac_filter_ref_sequence {
bail!(
"Reference mismatch error: variant at {}:{} has REF allele = \"{}\", but reference genome has \"{}\".",
region.get_chrom(), position+1,
// we don't want to panic in the middle of this, so use a safe unwrapper with default
std::str::from_utf8(all_alleles[0]).unwrap_or("utf8 decode error"),
std::str::from_utf8(ref_sequence).unwrap_or("utf8 decode error")
);
} else {
// this is a case where a non-ACGT was replaced with N, so allow it
// debug!("IUPAC match");
}
}

// check if this variant is too close to the previous variant
Expand Down Expand Up @@ -800,4 +813,36 @@ mod tests {
assert_eq!(haplotag_result.get("r4").unwrap(), &(3, 1)); // spans two blocks and is inexact, but closer to hap 1 starting with block 3
assert_eq!(haplotag_result.get("r2").unwrap(), &(3, 1)); // equal by alleles, but qual on allele 1 is higher
}

// tests that having a IUPAC code modified to "N" is okay
#[test]
fn test_iupac_conversion() {
// create a single variant block for simplicity
let mut phase_block = PhaseBlock::new(0, "chr1".to_string(), 0, 20, "sample_name".to_string(), 1);
phase_block.add_locus_variant("chr1", 4, 0);

// load it up
let vcf_path = PathBuf::from("./test_data/iupac_test/small_variants.vcf.gz");
let ref_fn = PathBuf::from("./test_data/iupac_test/test_reference.fa");
let reference_genome = ReferenceGenome::from_fasta(&ref_fn).unwrap();
let reference_buffer = 2;
let is_hom_allowed = false;
let (het_variants, hom_variants) = load_variant_calls(
&phase_block,
&[vcf_path],
&reference_genome,
reference_buffer,
is_hom_allowed
).unwrap();

// we should just have the one variant in the block with the translated IUPAC code
let mut expected_variant = Variant::new_indel(0, 4, 4, b"ANGT".to_vec(), b"AT".to_vec(), 0, 1);
expected_variant.add_reference_prefix(b"GT");
expected_variant.add_reference_postfix(b"AC");

assert_eq!(het_variants, vec![
expected_variant
]);
assert!(hom_variants.is_empty());
}
}
30 changes: 30 additions & 0 deletions src/wfa_graph.rs
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,11 @@ impl WFAGraph {
// look at where this variant is
let variant_pos: usize = variant.position() as usize;
let ref_len: usize = variant.get_ref_len();
if variant_pos < ref_start {
// this variant starts before our reference block, so ignore it
trace!("Ignoring variant starting at {} before ref_start {}", variant_pos, ref_start);
continue;
}
if variant_pos + ref_len > ref_end {
// this variant end after our reference block, so ignore it
trace!("Ignoring variant ending at {} after ref_end {}", variant_pos+ref_len, ref_end);
Expand Down Expand Up @@ -1062,6 +1067,31 @@ mod tests {
// After here are mostly edge case bug tests
////////////////////////////////////////////////////////////////////////////////

// tests when a variant starts before the provided reference, we should ignore it basically
#[test]
fn test_variant_before_start() {
// the first 10 bases are ignored here "NNNNNNNNNA"
let reference = "NNNNNNNNNAACGTA".as_bytes();
let ref_start: usize = 10;
// the first one should be ignored, the second one included
let variants = vec![
// vcf_index, position, allele0, allele1, index_allele0, index_allele1
Variant::new_snv(0, (ref_start-1) as i64, "A".as_bytes().to_vec(), "T".as_bytes().to_vec(), 0, 1),
Variant::new_snv(0, ref_start as i64, "A".as_bytes().to_vec(), "T".as_bytes().to_vec(), 0, 1)
];

let (graph, node_to_alleles): (WFAGraph, NodeAlleleMap) =
WFAGraph::from_reference_variants(&reference, &variants, ref_start, reference.len()).unwrap();

// check the alignments first
assert_eq!(graph.get_num_nodes(), 4);

// now check our lookup tables
assert_eq!(*node_to_alleles.get(&0).unwrap_or(&vec![]), vec![]); // first node is empty
assert_eq!(*node_to_alleles.get(&1).unwrap_or(&vec![]), vec![(1, 1)]); // second is the alternate for the SECOND variant (first variant is ignored)
assert_eq!(*node_to_alleles.get(&2).unwrap_or(&vec![]), vec![(1, 0)]); // third is the reference for the SECOND variant
assert_eq!(*node_to_alleles.get(&3).unwrap_or(&vec![]), vec![]); // last is just more reference
}

// tests when a variant goes past the provided reference, we should ignore it basically
#[test]
Expand Down

0 comments on commit dff3c47

Please sign in to comment.