Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

bugfix - divide-by-zero error #161

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .github/filters.yml
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ bwa: &bwa
- 'harpy/bin/count_bx.py'
- 'harpy/bin/assign_mi.py'
- 'harpy/bin/make_windows.py'
- 'harpy/bin/depth_windows.py'
- 'test/fastq/**'
ema: &ema
- *common
Expand All @@ -68,6 +69,7 @@ ema: &ema
- 'harpy/bin/bx_stats.py'
- 'harpy/bin/count_bx.py'
- 'harpy/bin/make_windows.py'
- 'harpy/bin/depth_windows.py'
- 'test/fastq/**'
strobealign: &strobealign
- *common
Expand All @@ -80,6 +82,7 @@ strobealign: &strobealign
- 'harpy/bin/bx_stats.py'
- 'harpy/bin/count_bx.py'
- 'harpy/bin/make_windows.py'
- 'harpy/bin/depth_windows.py'
- 'test/fastq/**'
mpileup: &mpileup
- *common
Expand Down
5 changes: 2 additions & 3 deletions harpy/_cli_types_generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def convert(self, value, param, ctx):
"vcf": ["vcf", "bcf", "vcf.gz"],
"gff": [".gff",".gff3"]
}
if self.filetype not in filedict.keys():
if self.filetype not in filedict:
self.fail(f"Extension validation for {self.filetype} is not yet implemented. This error should only appear during development; if you are a user and seeing this, please post an issue on GitHub: https://github.com/pdimens/harpy/issues/new?assignees=&labels=bug&projects=&template=bug_report.yml")
if not os.path.exists(value):
self.fail(f"{value} does not exist. Please check the spelling and try again.", param, ctx)
Expand Down Expand Up @@ -113,5 +113,4 @@ def convert(self, value, param, ctx):
elif not os.access(f"{value}/config.yaml", os.R_OK):
self.fail(f"{value}/config.yaml does not have read access. Please check the file permissions and try again.", param, ctx)
return value

### program-specific extra-params types

11 changes: 3 additions & 8 deletions harpy/bin/assign_mi.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,10 +62,9 @@ def write_invalidbx(bam, alnrecord):
at the end and writes it to the output
bam file. Keeps existing MI tag if present.
'''
# will keeping an existing MI tag if present
# may create incorrect molecule associations by chance
# will not keep an existing MI tag if present
# also remove DI because it's not necessary
tags = [j for j in alnrecord.get_tags() if j[0] not in ['DI', 'BX']]
tags = [j for j in alnrecord.get_tags() if j[0] not in ['MI', 'DI', 'BX']]
_bx = alnrecord.get_tag("BX")
# if hyphen is present, it's been deconvolved and shouldn't have been
# and rm the hyphen part
Expand All @@ -84,11 +83,7 @@ def write_missingbx(bam, alnrecord):
at the end and writes it to the output
bam file. Removes existing MI tag, if exists.
'''
# get all the tags except MI b/c it's being replaced (if exists)
# this won't write a new MI, but keeping an existing one
# may create incorrect molecule associations by chance
# also remove DI because it's not necessary
# removes BX... just in case. It's not supposed to be there to begin with
# removes MI and DI tags, writes new BX tag
tags = [j for j in alnrecord.get_tags() if j[0] not in ['MI', 'DI', 'BX']]
tags.append(("BX", "A00C00B00D00"))
alnrecord.set_tags(tags)
Expand Down
204 changes: 204 additions & 0 deletions harpy/bin/deconvolve_alignments.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,204 @@
#! /usr/bin/env python
"""deconvolve BX-tagged barcodes and assign molecular identifier (MI) tags to alignments based on distance and barcode"""
import re
import os
import sys
import argparse
import pysam

parser = argparse.ArgumentParser(
prog = 'deconvolve_alignments.py',
description =
"""
Deconvolve BX-tagged barcodes and assign an MI:i: (Molecular Identifier)
tag to each barcoded record based on a molecular distance cutoff. Unmapped records
are discarded in the output. Records without a BX:Z: tag or
with an invalid barcode (00 as one of its segments) are presevered
but are not assigned an MI:i tag. Input file MUST BE COORDINATE SORTED.
""",
usage = "deconvolve_alignments.py -c cutoff -o output.bam input.bam",
exit_on_error = False
)

parser.add_argument('-c','--cutoff', type=int, default = 100000, help = "Distance in base pairs at which alignments with the same barcode should be considered different molecules (default: 100000)")
parser.add_argument('-o', '--output', help = "Output bam file")
parser.add_argument('input', help = "Input coordinate-sorted bam/sam file")

if len(sys.argv) == 1:
parser.print_help(sys.stderr)
sys.exit(1)

args = parser.parse_args()

def write_validbx(bam, alnrecord, bx_tag, mol_id):
'''
bam: the output bam
alnrecord: the pysam alignment record
mol_id: the "mol_id" entry of a barcode dictionary
Formats an alignment record to include the MI tag
and the BX at the end and writes it to the output
bam file. Replaces existing MI tag, if exists.
'''
# get all the tags except MI b/c it's being replaced (if exists)
# will manually parse BX, so omit that too
# also remove DI because it's not necessary
tags = [j for j in alnrecord.get_tags() if j[0] not in ['MI', 'DI', 'BX']]
tags.append(("MI", mol_id))
tags.append(("BX", bx_tag))
alnrecord.set_tags(tags)
# write record to output file
bam.write(alnrecord)

def write_invalidbx(bam, alnrecord):
'''
bam: the output bam
alnrecord: the pysam alignment record
Formats an alignment record to include the BX
at the end and writes it to the output
bam file. Keeps existing MI tag if present.
'''
# will not keep an existing MI tag if present
# also remove DI because it's not necessary
tags = [j for j in alnrecord.get_tags() if j[0] not in ['MI', 'DI', 'BX']]
_bx = alnrecord.get_tag("BX")
# if hyphen is present, it's been deconvolved and shouldn't have been
# and rm the hyphen part
bx_clean = _bx.split("-")[0]
tags.append(("BX", bx_clean))
# update the record's tags
alnrecord.set_tags(tags)
# write record to output file
bam.write(alnrecord)

def write_missingbx(bam, alnrecord):
'''
bam: the output bam
alnrecord: the pysam alignment record
Formats an alignment record to include invalid BX
at the end and writes it to the output
bam file. Removes existing MI tag, if exists.
'''
# removes MI and DI tags, writes new BX tag
tags = [j for j in alnrecord.get_tags() if j[0] not in ['MI', 'DI', 'BX']]
tags.append(("BX", "A00C00B00D00"))
alnrecord.set_tags(tags)
# write record to output file
bam.write(alnrecord)

bam_input = args.input
# initialize the dict
d = {}
# LAST_CONTIG keeps track of the last contig so we can
# clear the dict when it's a new contig/chromosome
LAST_CONTIG = False
# MI is the name of the current molecule, starting a 1 (0+1)
MI = 0

if not os.path.exists(bam_input):
print(f"Error: {bam_input} not found", file = sys.stderr)
sys.exit(1)

if bam_input.lower().endswith(".bam"):
if not os.path.exists(bam_input + ".bai"):
pysam.index(bam_input)

# iniitalize input/output files
alnfile = pysam.AlignmentFile(bam_input)
outfile = pysam.AlignmentFile(args.output, "wb", template = alnfile)

for record in alnfile.fetch():
chrm = record.reference_name
bp = record.query_alignment_length
# check if the current chromosome is different from the previous one
# if so, empty the dict (a consideration for RAM usage)
if LAST_CONTIG is not False and chrm != LAST_CONTIG:
d = {}
if record.is_unmapped:
# skip, don't output
LAST_CONTIG = chrm
continue

try:
bx = record.get_tag("BX")
# do a regex search to find X00 pattern in the BX
if re.search("[ABCD]0{2,4}", bx):
# if found, invalid
write_invalidbx(outfile, record)
LAST_CONTIG = chrm
continue
except KeyError:
# There is no bx tag
write_missingbx(outfile, record)
LAST_CONTIG = chrm
continue

aln = record.get_blocks()
if not aln:
# unaligned, skip and don't output
LAST_CONTIG = chrm
continue

# logic to accommodate split records
# start position of first alignment
pos_start = aln[0][0]
# end position of last alignment
pos_end = aln[-1][1]

# create bx entry if it's not present
if bx not in d:
# increment MI b/c it's a new molecule
MI += 1
d[bx] = {
"lastpos" : pos_end,
"current_suffix": 0,
"mol_id": MI
}
# write and move on
write_validbx(outfile, record, bx, MI)
LAST_CONTIG = chrm
continue

# store the original barcode as `orig` b/c we might need to suffix it
orig = bx
# if there is a suffix, append it to the barcode name
if d[orig]["current_suffix"] > 0:
bx = orig + "-" + str(d[orig]["current_suffix"])

# distance from last alignment = current aln start - previous aln end
dist = pos_start - d[bx]["lastpos"]
# if the distance between alignments is > cutoff, it's a different molecule
# so we'll +1 the suffix of the original barcode and relabel this one as
# BX + suffix. Since it's a new entry, we initialize it and move on
if dist > args.cutoff:
# increment MI b/c it's a new molecule
MI += 1
# increment original barcode's suffix
d[orig]["current_suffix"] += 1
bx = orig + "-" + str(d[orig]["current_suffix"])
# add new entry for new suffixed barcode with unique MI
d[bx] = {
"lastpos" : pos_end,
"current_suffix": 0,
"mol_id": MI
}
# write and move on
write_validbx(outfile, record, bx, MI)
LAST_CONTIG = chrm
continue

if record.is_reverse or (record.is_forward and not record.is_paired):
# set the last position to be the end of current alignment
pdimens marked this conversation as resolved.
Show resolved Hide resolved
d[bx]["lastpos"] = pos_end

# if it hasn't moved on by now, it's a record for an
# existing barcode/molecule. Write the record.
write_validbx(outfile, record, bx, d[bx]["mol_id"])

# update the chromosome tracker
LAST_CONTIG = chrm

alnfile.close()
outfile.close()

# index the output file
pysam.index(args.output)
7 changes: 5 additions & 2 deletions harpy/bin/depth_windows.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,9 @@
# the contig has changed, make the END POSITION the last POSITION, print output
if LAST_CONTIG and contig != LAST_CONTIG:
WINSIZE = (POSITION + 1) - START
print(f"{LAST_CONTIG}\t{POSITION}\t{_SUM / WINSIZE}", file = sys.stdout)
if WINSIZE > 0:
depth = _SUM / WINSIZE
print(f"{LAST_CONTIG}\t{POSITION}\t{depth}", file = sys.stdout)
# reset the window START/END and sum
_SUM = 0
START = 1
Expand All @@ -33,7 +35,8 @@
_SUM += int(line[2])

if POSITION == END:
print(f"{contig}\t{END}\t{_SUM / args.windowsize}", file = sys.stdout)
depth = _SUM / args.windowsize
print(f"{contig}\t{END}\t{depth}", file = sys.stdout)
pdimens marked this conversation as resolved.
Show resolved Hide resolved
# reset the window START/END and sum
_SUM = 0
START = END + 1
Expand Down
Loading