Improve guesswork by using softclipped reads

nriddiford · Feb 19, 2018 · 78ba626 · 78ba626
1 parent a1c05c0
commit 78ba626
Show file tree

Hide file tree

Showing 4 changed files with 90 additions and 52 deletions.
diff --git a/svSupport/find_reads.py b/svSupport/find_reads.py
@@ -1,5 +1,5 @@
 import pysam
-import os, re
+import os, re, sys
 from merge_bams import merge_bams
 
 """
@@ -128,25 +128,27 @@ def bp1_reads(self):
                         elif read.reference_start < self.bp1 and read_end_pos > self.bp1:
                             self.print_and_write_bp1(read, mate, bp1_opposing_reads, opposing_reads, 'spanning', 'opposing', read_end_pos, mate_end_pos)
 
+                else:
+                    print("Reads must be classified")
+                    sys.exit()
+
         support_count = len(set(supporting_reads))
         oppose_count = len(set(opposing_reads))
 
-        # pysam.index(support_out)
-        # pysam.index(oppose_out)
-
         return(supporting_reads, support_count, support_out, opposing_reads, oppose_count, oppose_out)
 
 
     def print_and_write_bp1(self, read, mate, read_bam, read_list, evidence, for_against, read_end_pos, mate_end_pos):
         if self.debug:
             print("* bp1 %s %s read    : %s %s [rs:e: %s-%s, ms:e: %s-%s]") % (evidence, for_against, read.query_name, read.seq, read.reference_start, read_end_pos, mate.reference_start, mate_end_pos)
-        read_bam.write(read)
-        if evidence == 'mate_pair' or evidence == 'disc_read':
-            read_bam.write(mate)
-        read_list.append(read.query_name)
+        if read.query_name not in read_list:
+            read_bam.write(read)
+            if evidence == 'mate_pair' or evidence == 'disc_read':
+                read_bam.write(mate)
+            read_list.append(read.query_name)
 
 
-    def bp2_reads(self):
+    def bp2_reads(self, bp1_supporting_reads, bp1_opposing_reads):
         samfile = pysam.Samfile(self.bam_in, "rb")
         bp2_start, bp2_end = self.set_window('bp2')
         supporting_reads = []
@@ -174,65 +176,65 @@ def bp2_reads(self):
                     mate_start = mate.reference_start +1
                     if read_start >= self.bp2 and mate_end_pos <= self.bp1:
                     # if not read.is_proper_pair and read.is_reverse and mate_end_pos < self.bp1:
-                        self.print_and_write_bp2(read, mate, bp2_supporting_reads, supporting_reads, 'disc_read', 'supporting', read_end_pos, mate_end_pos)
+                        self.print_and_write_bp2(read, mate, bp1_supporting_reads, bp1_opposing_reads, bp2_supporting_reads, supporting_reads, 'disc_read', 'supporting', read_end_pos, mate_end_pos)
                     elif read_start == self.bp2 and re.findall(r'(\d+)[S|H]', read.cigarstring):
-                        self.print_and_write_bp2(read, mate, bp2_supporting_reads, supporting_reads, 'clipped_read', 'supporting', read_end_pos, mate_end_pos)
+                        self.print_and_write_bp2(read, mate, bp1_supporting_reads, bp1_opposing_reads, bp2_supporting_reads, supporting_reads, 'clipped_read', 'supporting', read_end_pos, mate_end_pos)
 
                     # opposing
                     elif read.qname not in supporting_reads:
                         if (read_end_pos < self.bp2 and mate_start > self.bp2) or (read_start > self.bp2 and mate_end_pos < self.bp2 and mate_end_pos > self.bp1):
                         # if (read_end_pos < self.bp1 and mate.reference_start > self.bp1 and mate.reference_start < self.bp2) or (read.reference_start > self.bp1 and mate_end_pos < self.bp1):
-                            self.print_and_write_bp2(read, mate, bp2_opposing_reads, opposing_reads, 'mate_pair', 'opposing', read_end_pos, mate_end_pos)
+                            self.print_and_write_bp2(read, mate, bp1_supporting_reads, bp1_opposing_reads, bp2_opposing_reads, opposing_reads, 'mate_pair', 'opposing', read_end_pos, mate_end_pos)
                         # elif read.reference_start < self.bp1 and read_end_pos > self.bp1:
                         elif read_start < self.bp2 and read_end_pos > self.bp2:
-                            self.print_and_write_bp2(read, mate, bp2_opposing_reads, opposing_reads, 'spanning', 'opposing', read_end_pos, mate_end_pos)
+                            self.print_and_write_bp2(read, mate, bp1_supporting_reads, bp1_opposing_reads, bp2_opposing_reads, opposing_reads, 'spanning', 'opposing', read_end_pos, mate_end_pos)
 
                 # type I inversion - ('F_bp1', 'F_bp2')
                 elif self.bp1_class == 'F_bp1' and self.bp2_class == 'F_bp2':
                     # supporting
                     if not read.is_proper_pair and read_end_pos +1 <= self.bp2 and mate_end_pos <= self.bp1:
-                        self.print_and_write_bp2(read, mate, bp2_supporting_reads, supporting_reads, 'disc_read', 'supporting', read_end_pos, mate_end_pos)
+                        self.print_and_write_bp2(read, mate, bp1_supporting_reads, bp1_opposing_reads, bp2_supporting_reads, supporting_reads, 'disc_read', 'supporting', read_end_pos, mate_end_pos)
 
                     elif read_end_pos == self.bp2 and re.findall(r'(\d+)[S|H]', read.cigarstring):
-                        self.print_and_write_bp2(read, mate, bp2_supporting_reads, supporting_reads, 'clipped_read', 'supporting', read_end_pos, mate_end_pos)
+                        self.print_and_write_bp2(read, mate, bp1_supporting_reads, bp1_opposing_reads, bp2_supporting_reads, supporting_reads, 'clipped_read', 'supporting', read_end_pos, mate_end_pos)
 
                     # opposing
                     elif read.qname not in supporting_reads:
                         if (read_end_pos < self.bp1 and mate.reference_start > self.bp1 and mate.reference_start < self.bp2) or (read.reference_start > self.bp1 and mate_end_pos < self.bp1):
-                            self.print_and_write_bp2(read, mate, bp2_opposing_reads, opposing_reads, 'mate_pair', 'opposing', read_end_pos, mate_end_pos)
+                            self.print_and_write_bp2(read, mate, bp1_supporting_reads, bp1_opposing_reads, bp2_opposing_reads, opposing_reads, 'mate_pair', 'opposing', read_end_pos, mate_end_pos)
                         elif read.reference_start < self.bp1 and read_end_pos > self.bp1:
-                            self.print_and_write_bp2(read, mate, bp2_opposing_reads, opposing_reads, 'spanning', 'opposing', read_end_pos, mate_end_pos)
+                            self.print_and_write_bp2(read, mate, bp1_supporting_reads, bp1_opposing_reads, bp2_opposing_reads, opposing_reads, 'spanning', 'opposing', read_end_pos, mate_end_pos)
 
                 # type II inversion - ('bp1_R', 'bp2_R')
                 elif self.bp1_class == 'bp1_R' and self.bp2_class == 'bp2_R':
                     # supporting
                     if not read.is_proper_pair and read.reference_start +1 >= self.bp2 and mate.reference_start >= self.bp1 and mate_end_pos < self.bp2:
-                        self.print_and_write_bp2(read, mate, bp2_supporting_reads, supporting_reads, 'disc_read', 'supporting', read_end_pos, mate_end_pos)
+                        self.print_and_write_bp2(read, mate, bp1_supporting_reads, bp1_opposing_reads, bp2_supporting_reads, supporting_reads, 'disc_read', 'supporting', read_end_pos, mate_end_pos)
 
                     elif read.reference_start == self.bp2 and re.findall(r'(\d+)[S|H]', read.cigarstring):
-                        self.print_and_write_bp2(read, mate, bp2_supporting_reads, supporting_reads, 'clipped_read', 'supporting', read_end_pos, mate_end_pos)
+                        self.print_and_write_bp2(read, mate, bp1_supporting_reads, bp1_opposing_reads, bp2_supporting_reads, supporting_reads, 'clipped_read', 'supporting', read_end_pos, mate_end_pos)
 
                     # opposing
                     elif read.qname not in supporting_reads:
                         if ( read_end_pos < self.bp2 and mate.reference_start > self.bp2 ) or (read.reference_start > self.bp2 and mate_end_pos < self.bp2):
-                            self.print_and_write_bp2(read, mate, bp2_opposing_reads, opposing_reads, 'mate_pair', 'opposing', read_end_pos, mate_end_pos)
+                            self.print_and_write_bp2(read, mate, bp1_supporting_reads, bp1_opposing_reads, bp2_opposing_reads, opposing_reads, 'mate_pair', 'opposing', read_end_pos, mate_end_pos)
                         elif read.reference_start < self.bp2 and read_end_pos > self.bp2:
-                            self.print_and_write_bp2(read, mate, bp2_opposing_reads, opposing_reads, 'spanning', 'opposing', read_end_pos, mate_end_pos)
+                            self.print_and_write_bp2(read, mate, bp1_supporting_reads, bp1_opposing_reads, bp2_opposing_reads, opposing_reads, 'spanning', 'opposing', read_end_pos, mate_end_pos)
 
+                else:
+                    print("Reads must be classified")
+                    sys.exit()
         support_count = len(set(supporting_reads))
         oppose_count = len(set(opposing_reads))
 
-        # pysam.index(support_out)
-        # pysam.index(oppose_out)
-
         return(supporting_reads, support_count, support_out, opposing_reads, oppose_count, oppose_out)
 
 
-    def print_and_write_bp2(self, read, mate, read_bam, read_list, evidence, for_against, read_end_pos, mate_end_pos):
+    def print_and_write_bp2(self, read, mate, bp1_supporting_reads, bp1_opposing_reads, read_bam, read_list, evidence, for_against, read_end_pos, mate_end_pos):
         if self.debug:
             print("* bp2 %s %s read    : %s %s [rs:e: %s-%s, ms:e: %s-%s]") % (evidence, for_against, read.query_name, read.seq, read.reference_start +1, read_end_pos, mate.reference_start +1, mate_end_pos)
-        read_bam.write(read)
-        if evidence == 'mate_pair' or evidence == 'disc_read':
-            read_bam.write(mate)
-
-        read_list.append(read.query_name)
+        if read.query_name not in bp1_supporting_reads and read.query_name not in bp1_opposing_reads:
+            read_bam.write(read)
+            if evidence == 'mate_pair' or evidence == 'disc_read':
+                read_bam.write(mate)
+            read_list.append(read.query_name)
diff --git a/svSupport/merge_bams.py b/svSupport/merge_bams.py
@@ -6,19 +6,19 @@ def merge_bams(out_file, bams):
     for bam_file in bams:
         try:
             name = os.path.splitext(bam_file)[0]
-            print(name)
             sorted_bam = name + ".s" + ".bam"
             pysam.sort("-o", sorted_bam, bam_file)
+            os.remove(bam_file)
             pysam.index(sorted_bam)
             s_bams.append(sorted_bam)
         except:
-            print("Can't index %s" % bam_file)
+            print("Can't sort %s" % bam_file)
 
     in_files = ', '.join(s_bams)
-    print(in_files)
     print("Merging bam files %s into '%s'") % (in_files, out_file)
     merge_parameters = ['-f', out_file] + s_bams
     pysam.merge(*merge_parameters)
+
     #Remove individual bp files
     for bp_file in s_bams:
         try:

diff --git a/svSupport/svSupport.py b/svSupport/svSupport.py
@@ -83,34 +83,52 @@ def print_options(bam_in, ratio_file, chrom, bp1, bp2, slop, find_bps, debug, te
 
 
 def F_bp(read, bp):
-    if not read.is_proper_pair and (not read.is_reverse and read.reference_start < bp):
+    if not read.is_proper_pair and (not read.is_reverse and read.reference_start + read.reference_length <= bp):
         return True
 
 def bp_R(read, bp):
-    if not read.is_proper_pair and (read.is_reverse and read.reference_start + read.reference_length > bp):
+    if not read.is_proper_pair and (read.is_reverse and read.reference_start >= bp):
         return True
 
 def bp_F(read, bp):
-    if not read.is_proper_pair and (not read.is_reverse and read.reference_start + read.reference_length > bp):
+    if not read.is_proper_pair and (not read.is_reverse and read.reference_start >= bp):
         return True
 
 def guess_type(bamFile, chrom, bp, bp_number, out_dir, debug):
     samfile = pysam.Samfile(bamFile, "rb")
-    start = bp - 300
-    stop = bp + 300
+    start = bp - 500
+    stop = bp + 500
 
     sv_reads = defaultdict(int)
 
+    print(bp, bp_number)
     count = 0
     out_file = os.path.join(out_dir, bp_number + "_classifying_reads" + ".bam")
 
     with pysam.AlignmentFile(out_file, "wb", template=samfile) as bpReads:
 
         for read in samfile.fetch(chrom, start, stop):
-            read_end_pos = read.reference_start + read.reference_length
-            mate_end_pos = read.next_reference_start + read.reference_length
 
-            if bp_number == 'bp1':
+            if read.reference_start +1 == bp and re.findall(r'(\d+)[S|H]', read.cigarstring):
+                if re.findall(r'.*?M(\d+)[S|H]', read.cigarstring):
+                    # print("Read clipped to right: %s") % (read.cigarstring)
+                    if bp_number == 'bp1':
+                        sv_reads['F_bp1'] += 1
+                        bpReads.write(read)
+                    else:
+                        sv_reads['F_bp2'] += 1
+                        bpReads.write(read)
+                elif re.findall(r'(\d+)[S|H].*?M', read.cigarstring):
+                    # print("Read clipped to left: %s") % (read.cigarstring)
+                    if bp_number == 'bp2':
+
+                        sv_reads['bp2_R'] += 1
+                        bpReads.write(read)
+                    else:
+                        sv_reads['bp1_R'] += 1
+                        bpReads.write(read)
+
+            elif bp_number == 'bp1':
                 if F_bp(read, bp):
                     sv_reads['F_bp1'] += 1
                     bpReads.write(read)
@@ -125,9 +143,12 @@ def guess_type(bamFile, chrom, bp, bp_number, out_dir, debug):
                 elif F_bp(read, bp):
                     sv_reads['F_bp2'] += 1
                     bpReads.write(read)
+                elif bp_F(read, bp):
+                    sv_reads['bp2_F'] += 1
+                    bpReads.write(read)
 
-    pysam.index(out_file)
 
+    pysam.index(out_file)
     sv_reads['NA'] = 0
     maxValKey = max(sv_reads, key=sv_reads.get)
 
@@ -178,12 +199,12 @@ def hone_bps(bam_in, chrom, bp, bp_class):
 
             if bp_class == 'F_bp1' or bp_class == 'F_bp2':
                 if read_end_pos == i:
-                    count+=1
+                    count += 1
                     bp_reads[i] = count
 
             elif bp_class == 'bp2_R' or bp_class == 'bp1_R':
                 if read.reference_start +1 == i:
-                    count+=1
+                    count += 1
                     bp_reads[i] = count
 
 
@@ -281,9 +302,9 @@ def get_args():
                     help="File to write parsed values to " )
 
     parser.add_option("-g", \
-                    "--guess_type", \
-                    dest="guess_type",
-                    action="store",
+                    "--guess", \
+                    dest="guess",
+                    action="store_true",
                     help="Guess type of SV for read searching" )
 
     parser.set_defaults(slop=500, out_dir='../out', purity=1, variants_out='variants_out.txt')
@@ -307,6 +328,7 @@ def worker(options):
     test     = options.test
     ratio_file = options.ratio_file
     variants_out = options.variants_out
+    guess = options.guess
 
     chrom, bp1, bp2 = re.split(':|-', region)
     bp1 = int(bp1)
@@ -323,7 +345,8 @@ def worker(options):
         else:
             print("python svSupport.py -i %s -l %s:%s-%s -s %s -p %s -f %s -o %s -v %s") % (bam_in, chrom, bp1, bp2, slop, purity, find_bps, out_dir, variants_out)
 
-    if guess_type:
+    if guess:
+        print(guess)
         bp1_reads, bp1_best_guess = guess_type(bam_in, chrom, bp1, 'bp1', out_dir, debug)
         bp1_best_guess = max(bp1_reads, key=bp1_reads.get)
         bp2_reads, bp2_best_guess = guess_type(bam_in, chrom, bp2, 'bp2', out_dir, debug)
@@ -362,9 +385,21 @@ def worker(options):
 
         make_dirs(bam_in, out_dir)
 
+        # # Would be much quicker if we could just isolate regions surrounding bps and merge into bam_in
+        # bp1_bam = pysam.Samfile(bam_in, "rb")
+        # bp1_bam = bp1_bam.fetch(chrom, bp1-1000, bp1+1000)
+        #
+        # bp2_bam = pysam.Samfile(bam_in, "rb")
+        # bp2_bam = bp2_bam.fetch(chrom, bp2-1000, bp2+1000)
+        #
+        # merge_bams('bp_regions.bam', [bp1_bam, bp2_bam])
+        #
+        # bam_in = 'bp_regions.bam'
+
+
         reads = FindReads(bam_in, chrom, bp1, bp2, slop, out_dir, debug, bp1_best_guess, bp2_best_guess)
         bp1_supporting_reads, bp1_support_count, bp1_support_bam, bp1_opposing_reads, bp1_oppose_count, bp1_oppose_bam = reads.bp1_reads()
-        bp2_supporting_reads, bp2_support_count, bp2_support_bam, bp2_opposing_reads, bp2_oppose_count, bp2_oppose_bam = reads.bp2_reads()
+        bp2_supporting_reads, bp2_support_count, bp2_support_bam, bp2_opposing_reads, bp2_oppose_count, bp2_oppose_bam = reads.bp2_reads(bp1_supporting_reads, bp1_opposing_reads)
 
         support_out = os.path.join(out_dir, "sv_support" + ".bam")
         oppose_out = os.path.join(out_dir, "sv_oppose" + ".bam")

diff --git a/svSupport/test_deletions.py b/svSupport/test_deletions.py
@@ -16,6 +16,10 @@
 
 reads = FindReads(bam_in, chrom, bp1, bp2, slop, out_dir, debug, bp1_best_guess, bp2_best_guess)
 
+# @pytest.mark.parametrize("bam_in,chrom,bp1,bp2", [
+#     ('../data/A373R11.tagged.filt.SC.RG.bam', '2L', 2250461, 2251184),
+#     ('../data/A373R11.tagged.filt.SC.RG.bam', '2L', 2131064, 2131463),
+# ])
 class Breakpoint_reads(unittest.TestCase):
     """Test reads returned in support of breakpoints"""
     bp1_supporting_reads, bp1_support_count, bp1_support_bam, bp1_opposing_reads, bp1_oppose_count, bp1_oppose_bam = reads.bp1_reads()
@@ -25,19 +29,16 @@ def test_bp1_read_count(self):
         """Are the correct number of bp1 supporting reads reported?"""
         self.assertTrue(self.bp1_support_count == 4)
 
-
     def test_bp1_read_names(self):
         """Are the correct bp1 supporting reads reported?"""
         bp1_true_support = ['HWI-D00405:129:C6KNAANXX:4:1309:3990:94686', 'HWI-D00405:129:C6KNAANXX:4:1209:9222:18319',
                             'HWI-D00405:129:C6KNAANXX:4:2304:19694:29523', 'HWI-D00405:129:C6KNAANXX:4:1314:2618:18304']
         self.assertTrue(sorted(self.bp1_supporting_reads) == sorted(bp1_true_support))
 
-
     def test_bp2_read_count(self):
         """Are the correct number of bp2 supporting reads reported?"""
         self.assertTrue(self.bp2_support_count == 5)
 
-
     def test_bp2_read_names(self):
         """Are the correct bp2 supporting reads reported?"""
         bp2_true_support = ['HWI-D00405:129:C6KNAANXX:4:1314:2618:18304', 'HWI-D00405:129:C6KNAANXX:4:2209:8835:88441',