From 936c3cd7cabfe8d9525eae21ebf145367be8bea9 Mon Sep 17 00:00:00 2001 From: J35P312 Date: Mon, 19 Dec 2022 17:48:33 +0100 Subject: [PATCH] modified: setup.py modified: tiddit/__main__.py modified: tiddit/tiddit_cluster.pyx modified: tiddit/tiddit_contig_analysis.pyx modified: tiddit/tiddit_signal.pyx modified: tiddit/tiddit_variant.pyx --- setup.py | 2 +- tiddit/__main__.py | 2 +- tiddit/tiddit_cluster.pyx | 27 ++++++++---- tiddit/tiddit_contig_analysis.pyx | 3 +- tiddit/tiddit_signal.pyx | 45 +++++++++++++++++--- tiddit/tiddit_variant.pyx | 69 ++++++++++++++++++++++++++----- 6 files changed, 121 insertions(+), 27 deletions(-) diff --git a/setup.py b/setup.py index e6c1219..7653dbe 100644 --- a/setup.py +++ b/setup.py @@ -20,7 +20,7 @@ setup( name = 'tiddit', - version = '3.3.2', + version = '3.4.0', url = "https://github.com/SciLifeLab/TIDDIT", author = "Jesper Eisfeldt", diff --git a/tiddit/__main__.py b/tiddit/__main__.py index 5bfdf82..e1d398e 100644 --- a/tiddit/__main__.py +++ b/tiddit/__main__.py @@ -17,7 +17,7 @@ import tiddit.tiddit_contig_analysis as tiddit_contig_analysis def main(): - version="3.3.2" + version="3.4.0" parser = argparse.ArgumentParser("""tiddit-{}""".format(version),add_help=False) parser.add_argument("--sv" , help="call structural variation", required=False, action="store_true") parser.add_argument("--cov" , help="generate a coverage bed file", required=False, action="store_true") diff --git a/tiddit/tiddit_cluster.pyx b/tiddit/tiddit_cluster.pyx index 91a50d8..fe2e9ab 100644 --- a/tiddit/tiddit_cluster.pyx +++ b/tiddit/tiddit_cluster.pyx @@ -70,7 +70,7 @@ def main(prefix,chromosomes,contig_length,samples,is_mp,epsilon,m,max_ins_len,mi if int(posB) > contig_length[chrB]: posA=contig_length[chrB] - discordants[chrA][chrB].append([content[0],sample,"D",posA,content[5],posB,content[8],i]) + discordants[chrA][chrB].append([content[0],sample,"D",posA,content[5],posB,content[8],i,int(content[3]),int(content[4]),int(content[6]),int(content[7])]) positions[chrA][chrB].append([int(posA),int(posB),i]) i+=1 @@ -99,7 +99,7 @@ def main(prefix,chromosomes,contig_length,samples,is_mp,epsilon,m,max_ins_len,mi if int(posB) > contig_length[chrB]: posB=contig_length[chrB] - discordants[chrA][chrB].append([content[0],sample,"S",posA,content[4],posB,content[6],i]) + discordants[chrA][chrB].append([content[0],sample,"S",posA,content[4],posB,content[6],i,int(content[7]),int(content[8]),int(content[9]),int(content[10])]) positions[chrA][chrB].append([int(posA),int(posB),i]) i+=1 @@ -132,7 +132,7 @@ def main(prefix,chromosomes,contig_length,samples,is_mp,epsilon,m,max_ins_len,mi if int(posB) > contig_length[chrB]: posB=contig_length[chrB] - discordants[chrA][chrB].append([content[0],sample,"A",posA,content[4],posB,content[6],i]) + discordants[chrA][chrB].append([content[0],sample,"A",posA,content[4],posB,content[6],i,int(content[7]),int(content[8]),int(content[9]),int(content[10])]) positions[chrA][chrB].append([int(posA),int(posB),i]) contigs.add(i) i+=1 @@ -197,6 +197,9 @@ def main(prefix,chromosomes,contig_length,samples,is_mp,epsilon,m,max_ins_len,mi candidates[chrA][chrB][candidate]["positions_A"]["orientation_contigs"]=[] candidates[chrA][chrB][candidate]["positions_A"]["orientation_splits"]=[] candidates[chrA][chrB][candidate]["positions_A"]["orientation_discordants"]=[] + candidates[chrA][chrB][candidate]["positions_A"]["start"]=[] + candidates[chrA][chrB][candidate]["positions_A"]["end"]=[] + candidates[chrA][chrB][candidate]["start_A"]=0 candidates[chrA][chrB][candidate]["end_A"]=0 @@ -208,6 +211,8 @@ def main(prefix,chromosomes,contig_length,samples,is_mp,epsilon,m,max_ins_len,mi candidates[chrA][chrB][candidate]["positions_B"]["orientation_contigs"]=[] candidates[chrA][chrB][candidate]["positions_B"]["orientation_splits"]=[] candidates[chrA][chrB][candidate]["positions_B"]["orientation_discordants"]=[] + candidates[chrA][chrB][candidate]["positions_B"]["start"]=[] + candidates[chrA][chrB][candidate]["positions_B"]["end"]=[] candidates[chrA][chrB][candidate]["start_B"]=0 candidates[chrA][chrB][candidate]["end_B"]=0 @@ -218,7 +223,13 @@ def main(prefix,chromosomes,contig_length,samples,is_mp,epsilon,m,max_ins_len,mi candidates[chrA][chrB][candidate]["sample_contigs"][discordants[chrA][chrB][i][1]]=set([]) candidates[chrA][chrB][candidate]["samples"].add(discordants[chrA][chrB][i][1]) - + + candidates[chrA][chrB][candidate]["positions_A"]["start"].append(discordants[chrA][chrB][i][8]) + candidates[chrA][chrB][candidate]["positions_A"]["end"].append(discordants[chrA][chrB][i][9]) + + candidates[chrA][chrB][candidate]["positions_B"]["start"].append(discordants[chrA][chrB][i][10]) + candidates[chrA][chrB][candidate]["positions_B"]["end"].append(discordants[chrA][chrB][i][11]) + if discordants[chrA][chrB][i][2] == "D": candidates[chrA][chrB][candidate]["discordants"].add(discordants[chrA][chrB][i][0]) candidates[chrA][chrB][candidate]["positions_A"]["discordants"].append(int(discordants[chrA][chrB][i][3])) @@ -321,11 +332,11 @@ def main(prefix,chromosomes,contig_length,samples,is_mp,epsilon,m,max_ins_len,mi candidates[chrA][chrB][candidate]["posA"] candidates[chrA][chrB][candidate]["posB"] - candidates[chrA][chrB][candidate]["startB"]=min(candidates[chrA][chrB][candidate]["positions_B"]["contigs"]+candidates[chrA][chrB][candidate]["positions_B"]["splits"]+candidates[chrA][chrB][candidate]["positions_B"]["discordants"]) - candidates[chrA][chrB][candidate]["endB"]=max(candidates[chrA][chrB][candidate]["positions_B"]["contigs"]+candidates[chrA][chrB][candidate]["positions_B"]["splits"]+candidates[chrA][chrB][candidate]["positions_B"]["discordants"]) + candidates[chrA][chrB][candidate]["startB"]=min(candidates[chrA][chrB][candidate]["positions_B"]["start"]) + candidates[chrA][chrB][candidate]["endB"]=max(candidates[chrA][chrB][candidate]["positions_B"]["end"]) - candidates[chrA][chrB][candidate]["startA"]=min(candidates[chrA][chrB][candidate]["positions_A"]["contigs"]+candidates[chrA][chrB][candidate]["positions_A"]["splits"]+candidates[chrA][chrB][candidate]["positions_A"]["discordants"]) - candidates[chrA][chrB][candidate]["endA"]=max(candidates[chrA][chrB][candidate]["positions_A"]["contigs"]+candidates[chrA][chrB][candidate]["positions_A"]["splits"]+candidates[chrA][chrB][candidate]["positions_A"]["discordants"]) + candidates[chrA][chrB][candidate]["startA"]=min(candidates[chrA][chrB][candidate]["positions_A"]["start"]) + candidates[chrA][chrB][candidate]["endA"]=max(candidates[chrA][chrB][candidate]["positions_A"]["end"]) return(candidates) diff --git a/tiddit/tiddit_contig_analysis.pyx b/tiddit/tiddit_contig_analysis.pyx index f9304e2..35c2904 100644 --- a/tiddit/tiddit_contig_analysis.pyx +++ b/tiddit/tiddit_contig_analysis.pyx @@ -63,7 +63,8 @@ def read_contigs(aligned_contigs,prefix,sample_id,min_size): current_bp=read.reference_start for i in range(0,len(read.cigartuples)-1): if read.cigartuples[i][0] == 2 and read.cigartuples[i][1] > min_size: - split_contigs[read.reference_name][read.reference_name]["{}_d_{}".format(read.query_name,i)]=[current_bp,read.is_reverse,current_bp+read.cigartuples[i][1],read.is_reverse] + + split_contigs[read.reference_name][read.reference_name]["{}_d_{}".format(read.query_name,i)]=[current_bp,read.is_reverse,current_bp+read.cigartuples[i][1],read.is_reverse,read.reference_start,current_bp,current_bp+read.cigartuples[i][1],read.reference_end] current_bp+=read.cigartuples[i][1] f=open("{}_tiddit/contigs_{}.tab".format(prefix,sample_id),"w") diff --git a/tiddit/tiddit_signal.pyx b/tiddit/tiddit_signal.pyx index 472eeca..eb30a78 100644 --- a/tiddit/tiddit_signal.pyx +++ b/tiddit/tiddit_signal.pyx @@ -66,18 +66,40 @@ def SA_analysis(read,min_q,tag,reference_name): cdef long read_query_alignment_end=read.query_alignment_end if (read.query_alignment_start ) < (read.query_length - read_query_alignment_end): - split_pos=read.reference_end+1 + if read.is_reverse: + + + split_pos=read.reference_start+1 + else: + split_pos=read.reference_end+1 else: - split_pos=read.reference_start+1 + if read.is_reverse: + split_pos=read.reference_end+1 + else: + split_pos=read.reference_start+1 supplementry_alignment=find_SA_query_range(SA_data) SA_chr=SA_data[0] + startA=read.reference_start+1 + endA=read.reference_end+1 + + startB=supplementry_alignment.reference_start + endB=supplementry_alignment.reference_end + if (supplementry_alignment.query_alignment_start ) < (supplementry_alignment.query_length - read_query_alignment_end): - SA_split_pos=supplementry_alignment.reference_end + if SA_data[2] == "-": + + SA_split_pos=supplementry_alignment.reference_start + else: + SA_split_pos=supplementry_alignment.reference_end else: - SA_split_pos=supplementry_alignment.reference_start + if SA_data[2] == "-": + SA_split_pos=supplementry_alignment.reference_end + + else: + SA_split_pos=supplementry_alignment.reference_start if SA_chr < reference_name: @@ -87,6 +109,12 @@ def SA_analysis(read,min_q,tag,reference_name): split_pos=SA_split_pos SA_split_pos=tmp + startB=read.reference_start+1 + endB=read.reference_end+1 + startA=supplementry_alignment.reference_start + endA=supplementry_alignment.reference_end + + else: chrA=reference_name chrB=SA_chr @@ -97,11 +125,16 @@ def SA_analysis(read,min_q,tag,reference_name): split_pos=SA_split_pos SA_split_pos=tmp + startB=read.reference_start+1 + endB=read.reference_end+1 + startA=supplementry_alignment.reference_start + endA=supplementry_alignment.reference_end + split=[] if "-" == SA_data[2]: - split=[chrA,chrB,read.query_name,split_pos,read.is_reverse,SA_split_pos, True] + split=[chrA,chrB,read.query_name,split_pos,read.is_reverse,SA_split_pos, True,startA,endA,startB,endB] else: - split=[chrA,chrB,read.query_name,split_pos,read.is_reverse,SA_split_pos,False] + split=[chrA,chrB,read.query_name,split_pos,read.is_reverse,SA_split_pos,False,startA,endA,startB,endB] #splits[chrA][chrB][read.query_name]+=[split_pos,read.is_reverse,SA_split_pos,False] return(split) diff --git a/tiddit/tiddit_variant.pyx b/tiddit/tiddit_variant.pyx index a6f576c..79939dd 100644 --- a/tiddit/tiddit_variant.pyx +++ b/tiddit/tiddit_variant.pyx @@ -6,6 +6,37 @@ from joblib import Parallel, delayed #from pysam.libcalignmentfile cimport AlignmentFile, AlignedSegment from pysam import AlignmentFile, AlignedSegment + +def scoring(scoring_dict,percentiles): + score=[0] + if scoring_dict["n_contigs"]: + score.append(50) + + if scoring_dict["n_discordants"]: + score.append(0) + for p in percentiles["FA"]: + if scoring_dict["n_discordants"]/(scoring_dict["refFA"]+scoring_dict["n_discordants"]) >= p: + score[-1]+=5 + + score.append(0) + for p in percentiles["FB"]: + if scoring_dict["n_discordants"]/(scoring_dict["refFB"]+scoring_dict["n_discordants"]) >= p: + score[-1]+=5 + + + if scoring_dict["n_splits"]: + score.append(0) + for p in percentiles["RA"]: + if scoring_dict["n_splits"]/(scoring_dict["refRA"]+scoring_dict["n_splits"]) >= p: + score[-1]+=5 + + score.append(0) + for p in percentiles["RB"]: + if scoring_dict["n_splits"]/(scoring_dict["refRB"]+scoring_dict["n_splits"]) >= p: + score[-1]+=5 + + return(max(score)) + def get_region(samfile,str chr,int start,int end,int bp,int min_q,int max_ins, contig_number): cdef int low_q=0 @@ -67,10 +98,10 @@ def get_region(samfile,str chr,int start,int end,int bp,int min_q,int max_ins, c r_start=read_reference_start r_end=read_reference_end - if read_reference_start < bp-10 and r_end > bp: + if read_reference_start < bp-20 and r_end > bp+20: crossing_r+=1 - mate_bp_read= (read.next_reference_start < bp and r_end > bp) + mate_bp_read= (read.next_reference_start < bp-50 and r_end > bp+50) discordant= ( abs(read.isize) > max_ins or read_next_reference_name != read_reference_name ) if mate_bp_read and not discordant: @@ -281,14 +312,16 @@ def define_variant(str chrA, str bam_file_name,dict sv_clusters,args,dict librar #configure filters for CNV based on Read depth for sample in samples: + + covA=sample_data[sample]["covA"] + covM=sample_data[sample]["covM"] + covB=sample_data[sample]["covB"] + if "DEL" in svtype: #homozygout del based on coverage if cn == 0: filt="PASS" - covA=sample_data[sample]["covA"] - covM=sample_data[sample]["covM"] - covB=sample_data[sample]["covB"] #normal coverage on the flanking regions, abnormal inbetween if covA > covM*(cn+0.9) and covB > covM*(cn+0.9): @@ -297,8 +330,7 @@ def define_variant(str chrA, str bam_file_name,dict sv_clusters,args,dict librar #too few reads, but clear DR signal elif "DUP" in svtype and filt == "BelowExpectedLinks": filt="PASS" - - + scoring_dict={"n_contigs":n_contigs, "n_discordants":n_discordants,"n_splits":n_splits,"covA":covA,"covM":covM,"covB":covB,"refRA":sample_data[sample]["refRA"],"refRB":sample_data[sample]["refRB"],"refFA":sample_data[sample]["refFA"],"refFB":sample_data[sample]["refFB"]} if svtype != "BND": info=["SVTYPE={}".format(svtype),"SVLEN={}".format(posB-posA),"END={}".format(posB)] @@ -363,7 +395,7 @@ def define_variant(str chrA, str bam_file_name,dict sv_clusters,args,dict librar GT="0/1" variant.append( "{}:{}:{},{},{}:{}:{}:{},{}:{},{}:{},{}".format(GT,cn,sample_data[sample]["covA"],sample_data[sample]["covM"],sample_data[sample]["covB"],n_discordants,n_splits,sample_data[sample]["QA"],sample_data[sample]["QB"],sample_data[sample]["refRA"],sample_data[sample]["refRB"],sample_data[sample]["refFA"],sample_data[sample]["refFB"]) ) - variants.append([chrA,posA,variant]) + variants.append([chrA,posA,variant,scoring_dict]) else: info=["SVTYPE=BND".format(svtype)] inverted=False @@ -439,7 +471,7 @@ def define_variant(str chrA, str bam_file_name,dict sv_clusters,args,dict librar variant.append( "{}:{}:{},{},{}:{}:{}:{},{}:{},{}:{},{}".format(GT,cn,sample_data[sample]["covA"],sample_data[sample]["covM"],sample_data[sample]["covB"],n_discordants,n_splits,sample_data[sample]["QA"],sample_data[sample]["QB"],sample_data[sample]["refRA"],sample_data[sample]["refRB"],sample_data[sample]["refFA"],sample_data[sample]["refFB"]) ) - variants.append([chrA,posA,variant]) + variants.append([chrA,posA,variant,scoring_dict]) variant=[chrB,str(posB),"SV_{}_2".format(var_n),"N",alt_str_b,".",filt,info,format_col] @@ -472,7 +504,7 @@ def define_variant(str chrA, str bam_file_name,dict sv_clusters,args,dict librar variant.append( "{}:{}:{},{},{}:{}:{}:{},{}:{},{}:{},{}".format(GT,cn,sample_data[sample]["covA"],sample_data[sample]["covM"],sample_data[sample]["covB"],n_discordants,n_splits,sample_data[sample]["QA"],sample_data[sample]["QB"],sample_data[sample]["refRA"],sample_data[sample]["refRB"],sample_data[sample]["refFA"],sample_data[sample]["refFB"]) ) - variants.append([chrB,posB,variant]) + variants.append([chrB,posB,variant, scoring_dict ]) samfile.close() return(variants) @@ -498,8 +530,25 @@ def main(str bam_file_name,dict sv_clusters,args,dict library,int min_mapq,sampl variants_list=Parallel(n_jobs=args.threads)( delayed(define_variant)(chrA,bam_file_name,sv_clusters,args,library,min_mapq,samples,coverage_data,contig_number,max_ins_len,contig_seqs) for chrA in sv_clusters) + ratios={"fragments_A":[],"fragments_B":[],"reads_A":[],"reads_B":[]} for v in variants_list: for variant in v: + if variant[3]["n_discordants"]: + ratios["fragments_A"].append(variant[3]["n_discordants"]/(variant[3]["refFA"]+variant[3]["n_discordants"]) ) + ratios["fragments_B"].append(variant[3]["n_discordants"]/(variant[3]["refFB"]+variant[3]["n_discordants"]) ) + + if variant[3]["n_splits"]: + ratios["reads_A"].append(variant[3]["n_splits"]/(variant[3]["refRA"]+variant[3]["n_splits"]) ) + ratios["reads_B"].append(variant[3]["n_splits"]/(variant[3]["refRB"]+variant[3]["n_splits"]) ) + + + p=[1,5,10,20,30,40,50,60,70,75,80,85,90,95,97.5,99] + percentiles={"FA":numpy.percentile(ratios["fragments_A"],p),"FB":numpy.percentile(ratios["fragments_B"],p),"RA":numpy.percentile(ratios["reads_A"],p),"RB":numpy.percentile(ratios["reads_B"],p)} + + for v in variants_list: + for variant in v: + score=scoring(variant[3],percentiles) + variant[2][5]=str(score) variants[ variant[0] ].append( [ variant[1],variant[2] ] ) return(variants)