From 16db1dd9eb252d1bdb0eae3af421c8047941ad51 Mon Sep 17 00:00:00 2001
From: mholt <mholt@pacificbiosciences.com>
Date: Fri, 26 Jan 2024 10:31:19 -0800
Subject: [PATCH] changes for IUPAC fix and WFAGraph boundaries

---
 CHANGELOG.md               |  5 ++++
 Cargo.lock                 |  2 +-
 Cargo.toml                 |  2 +-
 LICENSE-THIRDPARTY.json    |  2 +-
 src/data_types/variants.rs |  2 +-
 src/phaser.rs              | 59 +++++++++++++++++++++++++++++++++-----
 src/wfa_graph.rs           | 30 +++++++++++++++++++
 7 files changed, 91 insertions(+), 11 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index da06f3c..078bdfe 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,8 @@
+# v1.2.1
+## Fixed
+- Fixed [a rare issue](https://github.com/PacificBiosciences/pbbioconda/issues/640) where reference alleles with stripped IUPAC codes were throwing errors due to reference mismatch
+- Fixed an issue where variants preceding a GraphWFA region were not ignored, potentially leading to aberrant graph structure
+
 # v1.2.0
 ## Changes
 - Added an option (`--csi-index`) to output .csi index files instead of .tbi/.bai
diff --git a/Cargo.lock b/Cargo.lock
index 886acee..4b51140 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -523,7 +523,7 @@ dependencies = [
 
 [[package]]
 name = "hiphase"
-version = "1.2.0"
+version = "1.2.1"
 dependencies = [
  "bio",
  "bit-vec",
diff --git a/Cargo.toml b/Cargo.toml
index 64a5fcb..383a808 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "hiphase"
-version = "1.2.0"
+version = "1.2.1"
 authors = ["J. Matthew Holt <mholt@pacificbiosciences.com>"]
 description = "A tool for jointly phasing small, structural, and tandem repeat variants for PacBio sequencing data"
 edition = "2021"
diff --git a/LICENSE-THIRDPARTY.json b/LICENSE-THIRDPARTY.json
index 84965f6..a7f7d8a 100644
--- a/LICENSE-THIRDPARTY.json
+++ b/LICENSE-THIRDPARTY.json
@@ -496,7 +496,7 @@
   },
   {
     "name": "hiphase",
-    "version": "1.2.0",
+    "version": "1.2.1",
     "authors": "J. Matthew Holt <mholt@pacificbiosciences.com>",
     "repository": null,
     "license": null,
diff --git a/src/data_types/variants.rs b/src/data_types/variants.rs
index 0f3be59..d540453 100644
--- a/src/data_types/variants.rs
+++ b/src/data_types/variants.rs
@@ -42,7 +42,7 @@ pub enum Zygosity {
 
 /// A variant definition structure.
 /// It currently assumes that chromosome is fixed and that the variant is a SNP.
-#[derive(Clone, Debug)]
+#[derive(Clone, Debug, Eq, PartialEq)]
 pub struct Variant {
     /// The vcf index from the input datasets
     vcf_index: usize,
diff --git a/src/phaser.rs b/src/phaser.rs
index 2b4e0c2..e94aa90 100644
--- a/src/phaser.rs
+++ b/src/phaser.rs
@@ -239,13 +239,26 @@ fn load_variant_calls(
                     // that users do crazy things, so we need to convert it to a full Error
                     let ref_sequence = reference_genome.get_slice(region.get_chrom(), position as usize, ref_postfix_start);
                     if all_alleles[0] != ref_sequence {
-                        bail!(
-                            "Reference mismatch error: variant at {}:{} has REF allele = \"{}\", but reference genome has \"{}\".",
-                            region.get_chrom(), position+1, 
-                            // we don't want to panic in the middle of this, so use a safe unwrapper with default
-                            std::str::from_utf8(all_alleles[0]).unwrap_or("utf8 decode error"), 
-                            std::str::from_utf8(ref_sequence).unwrap_or("utf8 decode error")
-                        );
+                        // check there are IUPAC in the reference 
+                        let iupac_filter_ref_sequence: Vec<u8> = ref_sequence.iter()
+                            .map(|&c| {
+                                match c {
+                                    b'A' | b'C' | b'G' | b'T' => c,
+                                    _ => b'N'
+                                }
+                            }).collect();
+                        if all_alleles[0] != iupac_filter_ref_sequence {
+                            bail!(
+                                "Reference mismatch error: variant at {}:{} has REF allele = \"{}\", but reference genome has \"{}\".",
+                                region.get_chrom(), position+1, 
+                                // we don't want to panic in the middle of this, so use a safe unwrapper with default
+                                std::str::from_utf8(all_alleles[0]).unwrap_or("utf8 decode error"), 
+                                std::str::from_utf8(ref_sequence).unwrap_or("utf8 decode error")
+                            );
+                        } else {
+                            // this is a case where a non-ACGT was replaced with N, so allow it
+                            // debug!("IUPAC match");
+                        }
                     }
 
                     // check if this variant is too close to the previous variant
@@ -800,4 +813,36 @@ mod tests {
         assert_eq!(haplotag_result.get("r4").unwrap(), &(3, 1)); // spans two blocks and is inexact, but closer to hap 1 starting with block 3
         assert_eq!(haplotag_result.get("r2").unwrap(), &(3, 1)); // equal by alleles, but qual on allele 1 is higher
     }
+
+    // tests that having a IUPAC code modified to "N" is okay
+    #[test]
+    fn test_iupac_conversion() {
+        // create a single variant block for simplicity
+        let mut phase_block = PhaseBlock::new(0, "chr1".to_string(), 0, 20, "sample_name".to_string(), 1);
+        phase_block.add_locus_variant("chr1", 4, 0);
+
+        // load it up
+        let vcf_path = PathBuf::from("./test_data/iupac_test/small_variants.vcf.gz");
+        let ref_fn = PathBuf::from("./test_data/iupac_test/test_reference.fa");
+        let reference_genome = ReferenceGenome::from_fasta(&ref_fn).unwrap();
+        let reference_buffer = 2;
+        let is_hom_allowed = false;
+        let (het_variants, hom_variants) = load_variant_calls(
+            &phase_block,
+            &[vcf_path],
+            &reference_genome,
+            reference_buffer,
+            is_hom_allowed
+        ).unwrap();
+
+        // we should just have the one variant in the block with the translated IUPAC code
+        let mut expected_variant = Variant::new_indel(0, 4, 4, b"ANGT".to_vec(), b"AT".to_vec(), 0, 1);
+        expected_variant.add_reference_prefix(b"GT");
+        expected_variant.add_reference_postfix(b"AC");
+        
+        assert_eq!(het_variants, vec![
+            expected_variant
+        ]);
+        assert!(hom_variants.is_empty());
+    }
 }
\ No newline at end of file
diff --git a/src/wfa_graph.rs b/src/wfa_graph.rs
index 513f0bf..20b065c 100644
--- a/src/wfa_graph.rs
+++ b/src/wfa_graph.rs
@@ -132,6 +132,11 @@ impl WFAGraph {
             // look at where this variant is
             let variant_pos: usize = variant.position() as usize;
             let ref_len: usize = variant.get_ref_len();
+            if variant_pos < ref_start {
+                // this variant starts before our reference block, so ignore it
+                trace!("Ignoring variant starting at {} before ref_start {}", variant_pos, ref_start);
+                continue;
+            }
             if variant_pos + ref_len > ref_end {
                 // this variant end after our reference block, so ignore it
                 trace!("Ignoring variant ending at {} after ref_end {}", variant_pos+ref_len, ref_end);
@@ -1062,6 +1067,31 @@ mod tests {
     // After here are mostly edge case bug tests
     ////////////////////////////////////////////////////////////////////////////////
     
+    // tests when a variant starts before the provided reference, we should ignore it basically
+    #[test]
+    fn test_variant_before_start() {
+        // the first 10 bases are ignored here "NNNNNNNNNA"
+        let reference = "NNNNNNNNNAACGTA".as_bytes();
+        let ref_start: usize = 10;
+        // the first one should be ignored, the second one included
+        let variants = vec![
+            // vcf_index, position, allele0, allele1, index_allele0, index_allele1
+            Variant::new_snv(0, (ref_start-1) as i64, "A".as_bytes().to_vec(), "T".as_bytes().to_vec(), 0, 1),
+            Variant::new_snv(0, ref_start as i64, "A".as_bytes().to_vec(), "T".as_bytes().to_vec(), 0, 1)
+        ];
+
+        let (graph, node_to_alleles): (WFAGraph, NodeAlleleMap) = 
+            WFAGraph::from_reference_variants(&reference, &variants, ref_start, reference.len()).unwrap();
+        
+        // check the alignments first
+        assert_eq!(graph.get_num_nodes(), 4);
+
+        // now check our lookup tables
+        assert_eq!(*node_to_alleles.get(&0).unwrap_or(&vec![]), vec![]); // first node is empty
+        assert_eq!(*node_to_alleles.get(&1).unwrap_or(&vec![]), vec![(1, 1)]); // second is the alternate for the SECOND variant (first variant is ignored)
+        assert_eq!(*node_to_alleles.get(&2).unwrap_or(&vec![]), vec![(1, 0)]); // third is the reference for the SECOND variant
+        assert_eq!(*node_to_alleles.get(&3).unwrap_or(&vec![]), vec![]); // last is just more reference
+    }
 
     // tests when a variant goes past the provided reference, we should ignore it basically
     #[test]