pdimens · pdimens · Nov 6, 2024 · Nov 6, 2024 · Nov 6, 2024 · Nov 6, 2024
diff --git a/.github/filters.yml b/.github/filters.yml
@@ -57,6 +57,7 @@ bwa: &bwa
   - 'harpy/bin/count_bx.py'
   - 'harpy/bin/assign_mi.py'
   - 'harpy/bin/make_windows.py'
+  - 'harpy/bin/depth_windows.py'
   - 'test/fastq/**'
 ema: &ema
   - *common
@@ -68,6 +69,7 @@ ema: &ema
   - 'harpy/bin/bx_stats.py'
   - 'harpy/bin/count_bx.py'
   - 'harpy/bin/make_windows.py'
+  - 'harpy/bin/depth_windows.py'
   - 'test/fastq/**'
 strobealign: &strobealign
   - *common
@@ -80,6 +82,7 @@ strobealign: &strobealign
   - 'harpy/bin/bx_stats.py'
   - 'harpy/bin/count_bx.py'
   - 'harpy/bin/make_windows.py'
+  - 'harpy/bin/depth_windows.py'
   - 'test/fastq/**'
 mpileup: &mpileup
   - *common

diff --git a/harpy/_cli_types_generic.py b/harpy/_cli_types_generic.py
@@ -64,7 +64,7 @@ def convert(self, value, param, ctx):
             "vcf": ["vcf", "bcf", "vcf.gz"],
             "gff": [".gff",".gff3"]
         }
-        if self.filetype not in filedict.keys():
+        if self.filetype not in filedict:
             self.fail(f"Extension validation for {self.filetype} is not yet implemented. This error should only appear during development; if you are a user and seeing this, please post an issue on GitHub: https://github.com/pdimens/harpy/issues/new?assignees=&labels=bug&projects=&template=bug_report.yml")
         if not os.path.exists(value):
             self.fail(f"{value} does not exist. Please check the spelling and try again.", param, ctx)
@@ -113,5 +113,4 @@ def convert(self, value, param, ctx):
         elif not os.access(f"{value}/config.yaml", os.R_OK):
             self.fail(f"{value}/config.yaml does not have read access. Please check the file permissions and try again.", param, ctx)
         return value
-
-### program-specific extra-params types
+
diff --git a/harpy/bin/assign_mi.py b/harpy/bin/assign_mi.py
@@ -62,10 +62,9 @@ def write_invalidbx(bam, alnrecord):
     at the end and writes it to the output
     bam file. Keeps existing MI tag if present.
     '''
-    # will keeping an existing MI tag if present
-    # may create incorrect molecule associations by chance
+    # will not keep an existing MI tag if present
     # also remove DI because it's not necessary
-    tags = [j for j in alnrecord.get_tags() if j[0] not in ['DI', 'BX']]
+    tags = [j for j in alnrecord.get_tags() if j[0] not in ['MI', 'DI', 'BX']]
     _bx = alnrecord.get_tag("BX")
     # if hyphen is present, it's been deconvolved and shouldn't have been
     # and rm the hyphen part
@@ -84,11 +83,7 @@ def write_missingbx(bam, alnrecord):
     at the end and writes it to the output
     bam file. Removes existing MI tag, if exists.
     '''
-    # get all the tags except MI b/c it's being replaced (if exists)
-    # this won't write a new MI, but keeping an existing one
-    # may create incorrect molecule associations by chance
-    # also remove DI because it's not necessary
-    # removes BX... just in case. It's not supposed to be there to begin with
+     # removes MI and DI tags, writes new BX tag
     tags = [j for j in alnrecord.get_tags() if j[0] not in ['MI', 'DI', 'BX']]
     tags.append(("BX", "A00C00B00D00"))
     alnrecord.set_tags(tags)

diff --git a/harpy/bin/deconvolve_alignments.py b/harpy/bin/deconvolve_alignments.py
@@ -0,0 +1,204 @@
+#! /usr/bin/env python
+"""deconvolve BX-tagged barcodes and assign molecular identifier (MI) tags to alignments based on distance and barcode"""
+import re
+import os
+import sys
+import argparse
+import pysam
+
+parser = argparse.ArgumentParser(
+    prog = 'deconvolve_alignments.py',
+    description =
+    """
+    Deconvolve BX-tagged barcodes and assign an MI:i: (Molecular Identifier)
+    tag to each barcoded record based on a molecular distance cutoff. Unmapped records
+    are discarded in the output. Records without a BX:Z: tag or
+    with an invalid barcode (00 as one of its segments) are presevered
+    but are not assigned an MI:i tag. Input file MUST BE COORDINATE SORTED.
+    """,
+    usage = "deconvolve_alignments.py -c cutoff -o output.bam input.bam",
+    exit_on_error = False
+    )
+
+parser.add_argument('-c','--cutoff', type=int, default = 100000, help = "Distance in base pairs at which alignments with the same barcode should be considered different molecules (default: 100000)")
+parser.add_argument('-o', '--output', help = "Output bam file")
+parser.add_argument('input', help = "Input coordinate-sorted bam/sam file")
+
+if len(sys.argv) == 1:
+    parser.print_help(sys.stderr)
+    sys.exit(1)
+
+args = parser.parse_args()
+
+def write_validbx(bam, alnrecord, bx_tag, mol_id):
+    '''
+    bam: the output bam
+    alnrecord: the pysam alignment record
+    mol_id: the "mol_id" entry of a barcode dictionary
+    Formats an alignment record to include the MI tag
+    and the BX at the end and writes it to the output
+    bam file. Replaces existing MI tag, if exists.
+    '''
+    # get all the tags except MI b/c it's being replaced (if exists)
+    # will manually parse BX, so omit that too
+    # also remove DI because it's not necessary
+    tags = [j for j in alnrecord.get_tags() if j[0] not in ['MI', 'DI', 'BX']]
+    tags.append(("MI", mol_id))
+    tags.append(("BX", bx_tag))
+    alnrecord.set_tags(tags)
+    # write record to output file
+    bam.write(alnrecord)
+
+def write_invalidbx(bam, alnrecord):
+    '''
+    bam: the output bam
+    alnrecord: the pysam alignment record
+    Formats an alignment record to include the BX 
+    at the end and writes it to the output
+    bam file. Keeps existing MI tag if present.
+    '''
+    # will not keep an existing MI tag if present
+    # also remove DI because it's not necessary
+    tags = [j for j in alnrecord.get_tags() if j[0] not in ['MI', 'DI', 'BX']]
+    _bx = alnrecord.get_tag("BX")
+    # if hyphen is present, it's been deconvolved and shouldn't have been
+    # and rm the hyphen part
+    bx_clean = _bx.split("-")[0]
+    tags.append(("BX", bx_clean))
+    # update the record's tags
+    alnrecord.set_tags(tags)
+    # write record to output file
+    bam.write(alnrecord)
+
+def write_missingbx(bam, alnrecord):
+    '''
+    bam: the output bam
+    alnrecord: the pysam alignment record
+    Formats an alignment record to include invalid BX 
+    at the end and writes it to the output
+    bam file. Removes existing MI tag, if exists.
+    '''
+    # removes MI and DI tags, writes new BX tag
+    tags = [j for j in alnrecord.get_tags() if j[0] not in ['MI', 'DI', 'BX']]
+    tags.append(("BX", "A00C00B00D00"))
+    alnrecord.set_tags(tags)
+    # write record to output file
+    bam.write(alnrecord)
+
+bam_input = args.input
+# initialize the dict
+d = {}
+# LAST_CONTIG keeps track of the last contig so we can
+# clear the dict when it's a new contig/chromosome
+LAST_CONTIG = False
+# MI is the name of the current molecule, starting a 1 (0+1)
+MI = 0
+
+if not os.path.exists(bam_input):
+    print(f"Error: {bam_input} not found", file = sys.stderr)
+    sys.exit(1)
+
+if bam_input.lower().endswith(".bam"):
+    if not os.path.exists(bam_input + ".bai"):
+        pysam.index(bam_input)
+
+# iniitalize input/output files
+alnfile = pysam.AlignmentFile(bam_input)
+outfile = pysam.AlignmentFile(args.output, "wb", template = alnfile)
+
+for record in alnfile.fetch():
+    chrm = record.reference_name
+    bp   = record.query_alignment_length
+    # check if the current chromosome is different from the previous one
+    # if so, empty the dict (a consideration for RAM usage)
+    if LAST_CONTIG is not False and chrm != LAST_CONTIG:
+        d = {}
+    if record.is_unmapped:
+        # skip, don't output
+        LAST_CONTIG = chrm
+        continue
+
+    try:
+        bx = record.get_tag("BX")
+        # do a regex search to find X00 pattern in the BX
+        if re.search("[ABCD]0{2,4}", bx):
+            # if found, invalid
+            write_invalidbx(outfile, record)
+            LAST_CONTIG = chrm
+            continue
+    except KeyError:
+        # There is no bx tag
+        write_missingbx(outfile, record)
+        LAST_CONTIG = chrm
+        continue
+
+    aln = record.get_blocks()
+    if not aln:
+        # unaligned, skip and don't output
+        LAST_CONTIG = chrm
+        continue
+
+    # logic to accommodate split records
+    # start position of first alignment
+    pos_start = aln[0][0]
+    # end position of last alignment
+    pos_end   = aln[-1][1]
+
+    # create bx entry if it's not present
+    if bx not in d:
+        # increment MI b/c it's a new molecule
+        MI += 1
+        d[bx] = {
+            "lastpos" : pos_end,
+            "current_suffix": 0,
+            "mol_id": MI
+        }
+        # write and move on
+        write_validbx(outfile, record, bx, MI)
+        LAST_CONTIG = chrm
+        continue
+
+    # store the original barcode as `orig` b/c we might need to suffix it
+    orig = bx
+    # if there is a suffix, append it to the barcode name
+    if d[orig]["current_suffix"] > 0:
+        bx = orig + "-" + str(d[orig]["current_suffix"])
+
+    # distance from last alignment = current aln start - previous aln end
+    dist = pos_start - d[bx]["lastpos"]
+    # if the distance between alignments is > cutoff, it's a different molecule
+    # so we'll +1 the suffix of the original barcode and relabel this one as
+    # BX + suffix. Since it's a new entry, we initialize it and move on
+    if dist > args.cutoff:
+        # increment MI b/c it's a new molecule
+        MI += 1
+        # increment original barcode's suffix
+        d[orig]["current_suffix"] += 1
+        bx = orig + "-" + str(d[orig]["current_suffix"])
+        # add new entry for new suffixed barcode with unique MI
+        d[bx] = {
+            "lastpos" : pos_end,
+            "current_suffix": 0,
+            "mol_id": MI
+        }
+        # write and move on
+        write_validbx(outfile, record, bx, MI)
+        LAST_CONTIG = chrm
+        continue
+
+    if record.is_reverse or (record.is_forward and not record.is_paired):
+        # set the last position to be the end of current alignment
+        d[bx]["lastpos"] = pos_end
+
+    # if it hasn't moved on by now, it's a record for an
+    # existing barcode/molecule. Write the record.
+    write_validbx(outfile, record, bx, d[bx]["mol_id"])
+
+    # update the chromosome tracker
+    LAST_CONTIG = chrm
+
+alnfile.close()
+outfile.close()
+
+# index the output file
+pysam.index(args.output)
diff --git a/harpy/bin/depth_windows.py b/harpy/bin/depth_windows.py
@@ -23,7 +23,9 @@
     # the contig has changed, make the END POSITION the last POSITION, print output
     if LAST_CONTIG and contig != LAST_CONTIG:
         WINSIZE = (POSITION + 1) - START
-        print(f"{LAST_CONTIG}\t{POSITION}\t{_SUM / WINSIZE}", file = sys.stdout)
+        if WINSIZE > 0:
+            depth = _SUM / WINSIZE
+            print(f"{LAST_CONTIG}\t{POSITION}\t{depth}", file = sys.stdout)
         # reset the window START/END and sum
         _SUM = 0
         START = 1
@@ -33,7 +35,8 @@
     _SUM += int(line[2])
 
     if POSITION == END:
-        print(f"{contig}\t{END}\t{_SUM / args.windowsize}", file = sys.stdout)
+        depth = _SUM / args.windowsize
+        print(f"{contig}\t{END}\t{depth}", file = sys.stdout)
         # reset the window START/END and sum
         _SUM = 0
         START = END + 1