Skip to content

Commit

Permalink
Improve guesswork by using softclipped reads
Browse files Browse the repository at this point in the history
  • Loading branch information
nriddiford committed Feb 19, 2018
1 parent a1c05c0 commit 78ba626
Show file tree
Hide file tree
Showing 4 changed files with 90 additions and 52 deletions.
62 changes: 32 additions & 30 deletions svSupport/find_reads.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import pysam
import os, re
import os, re, sys
from merge_bams import merge_bams

"""
Expand Down Expand Up @@ -128,25 +128,27 @@ def bp1_reads(self):
elif read.reference_start < self.bp1 and read_end_pos > self.bp1:
self.print_and_write_bp1(read, mate, bp1_opposing_reads, opposing_reads, 'spanning', 'opposing', read_end_pos, mate_end_pos)

else:
print("Reads must be classified")
sys.exit()

support_count = len(set(supporting_reads))
oppose_count = len(set(opposing_reads))

# pysam.index(support_out)
# pysam.index(oppose_out)

return(supporting_reads, support_count, support_out, opposing_reads, oppose_count, oppose_out)


def print_and_write_bp1(self, read, mate, read_bam, read_list, evidence, for_against, read_end_pos, mate_end_pos):
if self.debug:
print("* bp1 %s %s read : %s %s [rs:e: %s-%s, ms:e: %s-%s]") % (evidence, for_against, read.query_name, read.seq, read.reference_start, read_end_pos, mate.reference_start, mate_end_pos)
read_bam.write(read)
if evidence == 'mate_pair' or evidence == 'disc_read':
read_bam.write(mate)
read_list.append(read.query_name)
if read.query_name not in read_list:
read_bam.write(read)
if evidence == 'mate_pair' or evidence == 'disc_read':
read_bam.write(mate)
read_list.append(read.query_name)


def bp2_reads(self):
def bp2_reads(self, bp1_supporting_reads, bp1_opposing_reads):
samfile = pysam.Samfile(self.bam_in, "rb")
bp2_start, bp2_end = self.set_window('bp2')
supporting_reads = []
Expand Down Expand Up @@ -174,65 +176,65 @@ def bp2_reads(self):
mate_start = mate.reference_start +1
if read_start >= self.bp2 and mate_end_pos <= self.bp1:
# if not read.is_proper_pair and read.is_reverse and mate_end_pos < self.bp1:
self.print_and_write_bp2(read, mate, bp2_supporting_reads, supporting_reads, 'disc_read', 'supporting', read_end_pos, mate_end_pos)
self.print_and_write_bp2(read, mate, bp1_supporting_reads, bp1_opposing_reads, bp2_supporting_reads, supporting_reads, 'disc_read', 'supporting', read_end_pos, mate_end_pos)
elif read_start == self.bp2 and re.findall(r'(\d+)[S|H]', read.cigarstring):
self.print_and_write_bp2(read, mate, bp2_supporting_reads, supporting_reads, 'clipped_read', 'supporting', read_end_pos, mate_end_pos)
self.print_and_write_bp2(read, mate, bp1_supporting_reads, bp1_opposing_reads, bp2_supporting_reads, supporting_reads, 'clipped_read', 'supporting', read_end_pos, mate_end_pos)

# opposing
elif read.qname not in supporting_reads:
if (read_end_pos < self.bp2 and mate_start > self.bp2) or (read_start > self.bp2 and mate_end_pos < self.bp2 and mate_end_pos > self.bp1):
# if (read_end_pos < self.bp1 and mate.reference_start > self.bp1 and mate.reference_start < self.bp2) or (read.reference_start > self.bp1 and mate_end_pos < self.bp1):
self.print_and_write_bp2(read, mate, bp2_opposing_reads, opposing_reads, 'mate_pair', 'opposing', read_end_pos, mate_end_pos)
self.print_and_write_bp2(read, mate, bp1_supporting_reads, bp1_opposing_reads, bp2_opposing_reads, opposing_reads, 'mate_pair', 'opposing', read_end_pos, mate_end_pos)
# elif read.reference_start < self.bp1 and read_end_pos > self.bp1:
elif read_start < self.bp2 and read_end_pos > self.bp2:
self.print_and_write_bp2(read, mate, bp2_opposing_reads, opposing_reads, 'spanning', 'opposing', read_end_pos, mate_end_pos)
self.print_and_write_bp2(read, mate, bp1_supporting_reads, bp1_opposing_reads, bp2_opposing_reads, opposing_reads, 'spanning', 'opposing', read_end_pos, mate_end_pos)

# type I inversion - ('F_bp1', 'F_bp2')
elif self.bp1_class == 'F_bp1' and self.bp2_class == 'F_bp2':
# supporting
if not read.is_proper_pair and read_end_pos +1 <= self.bp2 and mate_end_pos <= self.bp1:
self.print_and_write_bp2(read, mate, bp2_supporting_reads, supporting_reads, 'disc_read', 'supporting', read_end_pos, mate_end_pos)
self.print_and_write_bp2(read, mate, bp1_supporting_reads, bp1_opposing_reads, bp2_supporting_reads, supporting_reads, 'disc_read', 'supporting', read_end_pos, mate_end_pos)

elif read_end_pos == self.bp2 and re.findall(r'(\d+)[S|H]', read.cigarstring):
self.print_and_write_bp2(read, mate, bp2_supporting_reads, supporting_reads, 'clipped_read', 'supporting', read_end_pos, mate_end_pos)
self.print_and_write_bp2(read, mate, bp1_supporting_reads, bp1_opposing_reads, bp2_supporting_reads, supporting_reads, 'clipped_read', 'supporting', read_end_pos, mate_end_pos)

# opposing
elif read.qname not in supporting_reads:
if (read_end_pos < self.bp1 and mate.reference_start > self.bp1 and mate.reference_start < self.bp2) or (read.reference_start > self.bp1 and mate_end_pos < self.bp1):
self.print_and_write_bp2(read, mate, bp2_opposing_reads, opposing_reads, 'mate_pair', 'opposing', read_end_pos, mate_end_pos)
self.print_and_write_bp2(read, mate, bp1_supporting_reads, bp1_opposing_reads, bp2_opposing_reads, opposing_reads, 'mate_pair', 'opposing', read_end_pos, mate_end_pos)
elif read.reference_start < self.bp1 and read_end_pos > self.bp1:
self.print_and_write_bp2(read, mate, bp2_opposing_reads, opposing_reads, 'spanning', 'opposing', read_end_pos, mate_end_pos)
self.print_and_write_bp2(read, mate, bp1_supporting_reads, bp1_opposing_reads, bp2_opposing_reads, opposing_reads, 'spanning', 'opposing', read_end_pos, mate_end_pos)

# type II inversion - ('bp1_R', 'bp2_R')
elif self.bp1_class == 'bp1_R' and self.bp2_class == 'bp2_R':
# supporting
if not read.is_proper_pair and read.reference_start +1 >= self.bp2 and mate.reference_start >= self.bp1 and mate_end_pos < self.bp2:
self.print_and_write_bp2(read, mate, bp2_supporting_reads, supporting_reads, 'disc_read', 'supporting', read_end_pos, mate_end_pos)
self.print_and_write_bp2(read, mate, bp1_supporting_reads, bp1_opposing_reads, bp2_supporting_reads, supporting_reads, 'disc_read', 'supporting', read_end_pos, mate_end_pos)

elif read.reference_start == self.bp2 and re.findall(r'(\d+)[S|H]', read.cigarstring):
self.print_and_write_bp2(read, mate, bp2_supporting_reads, supporting_reads, 'clipped_read', 'supporting', read_end_pos, mate_end_pos)
self.print_and_write_bp2(read, mate, bp1_supporting_reads, bp1_opposing_reads, bp2_supporting_reads, supporting_reads, 'clipped_read', 'supporting', read_end_pos, mate_end_pos)

# opposing
elif read.qname not in supporting_reads:
if ( read_end_pos < self.bp2 and mate.reference_start > self.bp2 ) or (read.reference_start > self.bp2 and mate_end_pos < self.bp2):
self.print_and_write_bp2(read, mate, bp2_opposing_reads, opposing_reads, 'mate_pair', 'opposing', read_end_pos, mate_end_pos)
self.print_and_write_bp2(read, mate, bp1_supporting_reads, bp1_opposing_reads, bp2_opposing_reads, opposing_reads, 'mate_pair', 'opposing', read_end_pos, mate_end_pos)
elif read.reference_start < self.bp2 and read_end_pos > self.bp2:
self.print_and_write_bp2(read, mate, bp2_opposing_reads, opposing_reads, 'spanning', 'opposing', read_end_pos, mate_end_pos)
self.print_and_write_bp2(read, mate, bp1_supporting_reads, bp1_opposing_reads, bp2_opposing_reads, opposing_reads, 'spanning', 'opposing', read_end_pos, mate_end_pos)

else:
print("Reads must be classified")
sys.exit()
support_count = len(set(supporting_reads))
oppose_count = len(set(opposing_reads))

# pysam.index(support_out)
# pysam.index(oppose_out)

return(supporting_reads, support_count, support_out, opposing_reads, oppose_count, oppose_out)


def print_and_write_bp2(self, read, mate, read_bam, read_list, evidence, for_against, read_end_pos, mate_end_pos):
def print_and_write_bp2(self, read, mate, bp1_supporting_reads, bp1_opposing_reads, read_bam, read_list, evidence, for_against, read_end_pos, mate_end_pos):
if self.debug:
print("* bp2 %s %s read : %s %s [rs:e: %s-%s, ms:e: %s-%s]") % (evidence, for_against, read.query_name, read.seq, read.reference_start +1, read_end_pos, mate.reference_start +1, mate_end_pos)
read_bam.write(read)
if evidence == 'mate_pair' or evidence == 'disc_read':
read_bam.write(mate)

read_list.append(read.query_name)
if read.query_name not in bp1_supporting_reads and read.query_name not in bp1_opposing_reads:
read_bam.write(read)
if evidence == 'mate_pair' or evidence == 'disc_read':
read_bam.write(mate)
read_list.append(read.query_name)
6 changes: 3 additions & 3 deletions svSupport/merge_bams.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,19 +6,19 @@ def merge_bams(out_file, bams):
for bam_file in bams:
try:
name = os.path.splitext(bam_file)[0]
print(name)
sorted_bam = name + ".s" + ".bam"
pysam.sort("-o", sorted_bam, bam_file)
os.remove(bam_file)
pysam.index(sorted_bam)
s_bams.append(sorted_bam)
except:
print("Can't index %s" % bam_file)
print("Can't sort %s" % bam_file)

in_files = ', '.join(s_bams)
print(in_files)
print("Merging bam files %s into '%s'") % (in_files, out_file)
merge_parameters = ['-f', out_file] + s_bams
pysam.merge(*merge_parameters)

#Remove individual bp files
for bp_file in s_bams:
try:
Expand Down
67 changes: 51 additions & 16 deletions svSupport/svSupport.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,34 +83,52 @@ def print_options(bam_in, ratio_file, chrom, bp1, bp2, slop, find_bps, debug, te


def F_bp(read, bp):
if not read.is_proper_pair and (not read.is_reverse and read.reference_start < bp):
if not read.is_proper_pair and (not read.is_reverse and read.reference_start + read.reference_length <= bp):
return True

def bp_R(read, bp):
if not read.is_proper_pair and (read.is_reverse and read.reference_start + read.reference_length > bp):
if not read.is_proper_pair and (read.is_reverse and read.reference_start >= bp):
return True

def bp_F(read, bp):
if not read.is_proper_pair and (not read.is_reverse and read.reference_start + read.reference_length > bp):
if not read.is_proper_pair and (not read.is_reverse and read.reference_start >= bp):
return True

def guess_type(bamFile, chrom, bp, bp_number, out_dir, debug):
samfile = pysam.Samfile(bamFile, "rb")
start = bp - 300
stop = bp + 300
start = bp - 500
stop = bp + 500

sv_reads = defaultdict(int)

print(bp, bp_number)
count = 0
out_file = os.path.join(out_dir, bp_number + "_classifying_reads" + ".bam")

with pysam.AlignmentFile(out_file, "wb", template=samfile) as bpReads:

for read in samfile.fetch(chrom, start, stop):
read_end_pos = read.reference_start + read.reference_length
mate_end_pos = read.next_reference_start + read.reference_length

if bp_number == 'bp1':
if read.reference_start +1 == bp and re.findall(r'(\d+)[S|H]', read.cigarstring):
if re.findall(r'.*?M(\d+)[S|H]', read.cigarstring):
# print("Read clipped to right: %s") % (read.cigarstring)
if bp_number == 'bp1':
sv_reads['F_bp1'] += 1
bpReads.write(read)
else:
sv_reads['F_bp2'] += 1
bpReads.write(read)
elif re.findall(r'(\d+)[S|H].*?M', read.cigarstring):
# print("Read clipped to left: %s") % (read.cigarstring)
if bp_number == 'bp2':

sv_reads['bp2_R'] += 1
bpReads.write(read)
else:
sv_reads['bp1_R'] += 1
bpReads.write(read)

elif bp_number == 'bp1':
if F_bp(read, bp):
sv_reads['F_bp1'] += 1
bpReads.write(read)
Expand All @@ -125,9 +143,12 @@ def guess_type(bamFile, chrom, bp, bp_number, out_dir, debug):
elif F_bp(read, bp):
sv_reads['F_bp2'] += 1
bpReads.write(read)
elif bp_F(read, bp):
sv_reads['bp2_F'] += 1
bpReads.write(read)

pysam.index(out_file)

pysam.index(out_file)
sv_reads['NA'] = 0
maxValKey = max(sv_reads, key=sv_reads.get)

Expand Down Expand Up @@ -178,12 +199,12 @@ def hone_bps(bam_in, chrom, bp, bp_class):

if bp_class == 'F_bp1' or bp_class == 'F_bp2':
if read_end_pos == i:
count+=1
count += 1
bp_reads[i] = count

elif bp_class == 'bp2_R' or bp_class == 'bp1_R':
if read.reference_start +1 == i:
count+=1
count += 1
bp_reads[i] = count


Expand Down Expand Up @@ -281,9 +302,9 @@ def get_args():
help="File to write parsed values to " )

parser.add_option("-g", \
"--guess_type", \
dest="guess_type",
action="store",
"--guess", \
dest="guess",
action="store_true",
help="Guess type of SV for read searching" )

parser.set_defaults(slop=500, out_dir='../out', purity=1, variants_out='variants_out.txt')
Expand All @@ -307,6 +328,7 @@ def worker(options):
test = options.test
ratio_file = options.ratio_file
variants_out = options.variants_out
guess = options.guess

chrom, bp1, bp2 = re.split(':|-', region)
bp1 = int(bp1)
Expand All @@ -323,7 +345,8 @@ def worker(options):
else:
print("python svSupport.py -i %s -l %s:%s-%s -s %s -p %s -f %s -o %s -v %s") % (bam_in, chrom, bp1, bp2, slop, purity, find_bps, out_dir, variants_out)

if guess_type:
if guess:
print(guess)
bp1_reads, bp1_best_guess = guess_type(bam_in, chrom, bp1, 'bp1', out_dir, debug)
bp1_best_guess = max(bp1_reads, key=bp1_reads.get)
bp2_reads, bp2_best_guess = guess_type(bam_in, chrom, bp2, 'bp2', out_dir, debug)
Expand Down Expand Up @@ -362,9 +385,21 @@ def worker(options):

make_dirs(bam_in, out_dir)

# # Would be much quicker if we could just isolate regions surrounding bps and merge into bam_in
# bp1_bam = pysam.Samfile(bam_in, "rb")
# bp1_bam = bp1_bam.fetch(chrom, bp1-1000, bp1+1000)
#
# bp2_bam = pysam.Samfile(bam_in, "rb")
# bp2_bam = bp2_bam.fetch(chrom, bp2-1000, bp2+1000)
#
# merge_bams('bp_regions.bam', [bp1_bam, bp2_bam])
#
# bam_in = 'bp_regions.bam'


reads = FindReads(bam_in, chrom, bp1, bp2, slop, out_dir, debug, bp1_best_guess, bp2_best_guess)
bp1_supporting_reads, bp1_support_count, bp1_support_bam, bp1_opposing_reads, bp1_oppose_count, bp1_oppose_bam = reads.bp1_reads()
bp2_supporting_reads, bp2_support_count, bp2_support_bam, bp2_opposing_reads, bp2_oppose_count, bp2_oppose_bam = reads.bp2_reads()
bp2_supporting_reads, bp2_support_count, bp2_support_bam, bp2_opposing_reads, bp2_oppose_count, bp2_oppose_bam = reads.bp2_reads(bp1_supporting_reads, bp1_opposing_reads)

support_out = os.path.join(out_dir, "sv_support" + ".bam")
oppose_out = os.path.join(out_dir, "sv_oppose" + ".bam")
Expand Down
7 changes: 4 additions & 3 deletions svSupport/test_deletions.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@

reads = FindReads(bam_in, chrom, bp1, bp2, slop, out_dir, debug, bp1_best_guess, bp2_best_guess)

# @pytest.mark.parametrize("bam_in,chrom,bp1,bp2", [
# ('../data/A373R11.tagged.filt.SC.RG.bam', '2L', 2250461, 2251184),
# ('../data/A373R11.tagged.filt.SC.RG.bam', '2L', 2131064, 2131463),
# ])
class Breakpoint_reads(unittest.TestCase):
"""Test reads returned in support of breakpoints"""
bp1_supporting_reads, bp1_support_count, bp1_support_bam, bp1_opposing_reads, bp1_oppose_count, bp1_oppose_bam = reads.bp1_reads()
Expand All @@ -25,19 +29,16 @@ def test_bp1_read_count(self):
"""Are the correct number of bp1 supporting reads reported?"""
self.assertTrue(self.bp1_support_count == 4)


def test_bp1_read_names(self):
"""Are the correct bp1 supporting reads reported?"""
bp1_true_support = ['HWI-D00405:129:C6KNAANXX:4:1309:3990:94686', 'HWI-D00405:129:C6KNAANXX:4:1209:9222:18319',
'HWI-D00405:129:C6KNAANXX:4:2304:19694:29523', 'HWI-D00405:129:C6KNAANXX:4:1314:2618:18304']
self.assertTrue(sorted(self.bp1_supporting_reads) == sorted(bp1_true_support))


def test_bp2_read_count(self):
"""Are the correct number of bp2 supporting reads reported?"""
self.assertTrue(self.bp2_support_count == 5)


def test_bp2_read_names(self):
"""Are the correct bp2 supporting reads reported?"""
bp2_true_support = ['HWI-D00405:129:C6KNAANXX:4:1314:2618:18304', 'HWI-D00405:129:C6KNAANXX:4:2209:8835:88441',
Expand Down

0 comments on commit 78ba626

Please sign in to comment.