-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathvcf2gvf.py
executable file
·68 lines (58 loc) · 2.35 KB
/
vcf2gvf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
#!/usr/bin/env python
"""
Usage vcf2gvf.py <vcf_file input> <gff_file output>
"""
import sys,re
##simplify the variant descriptor for GVF
def get_vartype(type_str):
type_str=type_str.upper()
for V in ['COMPLEX', 'MNP', 'DEL', 'INDEL','INS','SNP']:
if V in type_str:
return V
break
in_vcf_file = open(sys.argv[1], 'r')
out_gff_file = open(sys.argv[2], 'w')
out_gff_file.write("#gff-version 3\n") ####gvf-version 1.06
##only deal with tidy source lines, samtools is special needs
source_pat = re.compile('##source=(\S+)', re.IGNORECASE)
while True:
line=in_vcf_file.readline()
if line[0]!='#':
break
elif line[2:10]=='samtools':
source_type='SAMTOOLS'
break
else:
m=source_pat.match(line)
if m:
source_type=m.group(1).upper()
break
##now read the data
##samtools only distinguishes SNPs and indels
##This is much complicated in highly heterozygous systems
##see http://www.sequenceontology.org/miso/current_release/term/SO:0001059
##GVF column 3 requires a type from this ontology
###vcf type may be snp, mnp, ins, del, or complex.
##for current purposes map multiple types to most complex state
##ie SNP,SNP -> SNP; SNP,MNP-> MNP; SNP,complex
try:
for line in in_vcf_file:
if line[0] != '#':
var=line.split() ##see if var[7] has indel at start. If so add TYPE=INDEL for samtools case
info_field=var[7].split(';')
if info_field[0]=='INDEL':
info_field[0]="TYPE=INDEL"
info_dict=dict(X.split('=') for X in info_field)
##if no TYPE key, then add TYPE=SNP for samtools case
if not info_dict.has_key('TYPE'):
info_dict['TYPE']='SNP';
var_type=get_vartype(info_dict['TYPE'])
ID=":".join([var[0],source_type,var_type,var[1]])
start=int(var[1])
length=len(var[3]) ## using reference length in this case
attributes=";".join(['ID='+ID,'Reference_seq='+var[3],'Variant_seq='+var[4]])
output_line=[var[0], source_type, var_type, str(start), str(start+length-1) ,'.','.','.',attributes,"\n"]
out_gff_file.write("\t".join([str(X) for X in output_line]))
finally:
in_vcf_file.close()
out_gff_file.close()