Changes to parser.py and model.py to allow for nonstandard VCF file headers #326

Open · wants to merge 4 commits into master
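As a rough illustration of what "nonstandard VCF file headers" means here (the file name, sample name, and extra column below are hypothetical, not taken from the PR): the header row carries extra tab-separated columns after FORMAT that are not sample genotypes, which stock PyVCF would misread as sample names.

#CHROM  POS    ID  REF  ALT  QUAL  FILTER  INFO   FORMAT  SAMPLE1  EXTRA_NOTES
1       12345  .   A    G    50    PASS    DP=10  GT:DP   0/1:10   manually reviewed

Reading such a file should look the same as with stock PyVCF; with this branch the reader decides per data row which trailing columns actually hold genotype data:

import vcf

reader = vcf.Reader(open('example.vcf'))  # 'example.vcf' is a placeholder path
for record in reader:
    print(record.CHROM, record.POS, record.samples)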
58 changes: 40 additions & 18 deletions vcf/model.py
@@ -1,4 +1,5 @@
from abc import ABCMeta, abstractmethod
import ast
import collections
import sys
import re
@@ -172,18 +173,28 @@ class _Record(object):
Neither the upstream nor downstream flanking bases are
included in the region.
"""
def __init__(self, CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT,
sample_indexes, samples=None):
self.CHROM = CHROM
#: the one-based coordinate of the first nucleotide in ``REF``
self.POS = POS
self.ID = ID
self.REF = REF
self.ALT = ALT
self.QUAL = QUAL
self.FILTER = FILTER
self.INFO = INFO
self.FORMAT = FORMAT
def __init__(self, ROWDICT,
#CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT,
sample_indexes, samples=None):

for varname, value in ROWDICT.items():
# "." pretty always means no value (NOne)
if value == ".":
value = None

else:
try:
# Convert to int or float where possible
# literal_eval raises if the value is not a valid Python literal
value = ast.literal_eval(value)
except Exception as e:
# Any error means value is a string, so just leave it
# Silence this error, or print it for debugging
pass
#print("ERROR occurred converting value '{}' with AST!({})".format(value, e))

setattr(self, varname, value)  # equivalent to: self.<varname> = value

#: zero-based, half-open start coordinate of ``REF``
self.start = self.POS - 1
#: zero-based, half-open end coordinate of ``REF``
@@ -270,8 +281,20 @@ def __iter__(self):
return iter(self.samples)

def __str__(self):
return "Record(CHROM=%(CHROM)s, POS=%(POS)s, REF=%(REF)s, ALT=%(ALT)s)" % self.__dict__

# return "Record(CHROM=%(CHROM)s, POS=%(POS)s, REF=%(REF)s, ALT=%(ALT)s)" % self.__dict__
return ''.join(["Record(",
"CHROM=%(CHROM)s, ",
"POS=%(POS)s, ",
"ID=%(ID)s, ",
"REF=%(REF)s, ",
"ALT=%(ALT)s), "
"QUAL=%(QUAL)s), "
"FILTER=%(FILTER)s), "
"INFO=%(INFO)s), "
"FORMAT=%(FORMAT)s)"
]) % self.__dict__

# CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT
def add_format(self, fmt):
self.FORMAT = self.FORMAT + ':' + fmt

@@ -362,7 +385,7 @@ def heterozygosity(self):
If there are i alleles with frequency p_i, H=1-sum_i(p_i^2)
"""
allele_freqs = [1-sum(self.aaf)] + self.aaf
return 1 - sum(map(lambda x: x**2, allele_freqs))
return 1 - sum([x**2 for x in allele_freqs])

def get_hom_refs(self):
""" The list of hom ref genotypes"""
@@ -558,9 +581,8 @@ def is_filtered(self):
return True


class _AltRecord(object):
class _AltRecord(object, metaclass=ABCMeta):
'''An alternative allele record: either replacement string, SV placeholder, or breakend'''
__metaclass__ = ABCMeta

def __init__(self, type, **kwargs):
super(_AltRecord, self).__init__(**kwargs)
@@ -596,7 +618,7 @@ def __len__(self):
return len(self.sequence)

def __eq__(self, other):
if isinstance(other, basestring):
if isinstance(other, str):
return self.sequence == other
elif not isinstance(other, self.__class__):
return False
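A standalone sketch of the type-coercion rule the new _Record.__init__ applies to each raw column value (the helper name _coerce is mine, not part of the patch; the patch itself catches any exception rather than just the two below):

import ast

def _coerce(value):
    # "." means missing; values that parse as Python literals (e.g. "42",
    # "0.5") become numbers; everything else is kept as a string.
    if value == ".":
        return None
    try:
        return ast.literal_eval(value)
    except (ValueError, SyntaxError):
        return value

# _coerce("42") -> 42, _coerce("0.5") -> 0.5
# _coerce(".") -> None, _coerce("PASS") -> 'PASS'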
186 changes: 133 additions & 53 deletions vcf/parser.py
@@ -1,3 +1,4 @@
import ast
import codecs
import collections
import csv
@@ -22,8 +23,8 @@
except ImportError:
cparse = None

from model import _Call, _Record, make_calldata_tuple
from model import _Substitution, _Breakend, _SingleBreakend, _SV
from .model import _Call, _Record, make_calldata_tuple
from .model import _Substitution, _Breakend, _SingleBreakend, _SV


# Metadata parsers/constants
@@ -315,6 +316,7 @@ def _parse_metainfo(self):
parser = _vcf_metadata_parser()

line = next(self.reader)

while line.startswith('##'):
self._header_lines.append(line)

@@ -350,9 +352,14 @@ def _parse_metainfo(self):
line = next(self.reader)

fields = self._row_pattern.split(line[1:])
self._column_headers = fields[:9]
self.samples = fields[9:]
self._sample_indexes = dict([(x,i) for (i,x) in enumerate(self.samples)])
# self._column_headers = fields[:9]
self._column_headers = fields
self._non_standard_column_headers = (
set(self._column_headers) -
set(["CHROM", "POS", "ID", "REF", "ALT","QUAL", "FILTER",
"INFO", "FORMAT"]))
# self.samples = fields[9:] #removed in favor of running in __next__
# self._sample_indexes = dict([(x,i) for (i,x) in enumerate(self.samples)])

def _map(self, func, iterable, bad=['.', '']):
'''``map``, but make bad values None.'''
@@ -447,6 +454,10 @@ def _parse_sample_format(self, samp_fmt):
return samp_fmt

def _parse_samples(self, samples, samp_fmt, site):
print("-----------------------------")#333
print("samples =", samples) #3333
print("samp_fmt =", samp_fmt)#3333
print("site =", site) #333
'''Parse a sample entry according to the format specified in the FORMAT
column.

@@ -459,6 +470,7 @@ def _parse_samples(self, samples, samp_fmt, site):
self._format_cache[samp_fmt] = self._parse_sample_format(samp_fmt)
samp_fmt = self._format_cache[samp_fmt]

print("cparse=", cparse)#333
if cparse:
return cparse.parse_samples(
self.samples, samples, samp_fmt, samp_fmt._types, samp_fmt._nums, site)
@@ -467,13 +479,16 @@ def _parse_samples(self, samples, samp_fmt, site):
_map = self._map

nfields = len(samp_fmt._fields)

for name, sample in itertools.izip(self.samples, samples):


print("self.samples", self.samples) #3333
print("samples =", samples)#3333
for name, sample in zip(self.samples, samples):
print("Zipped: ", name, sample) #333
# parse the data for this sample
sampdat = [None] * nfields

for i, vals in enumerate(sample.split(':')):
print("sample split = ", i, ":", vals)#333

# short circuit the most common
if samp_fmt._fields[i] == 'GT':
@@ -548,47 +563,112 @@ def _parse_alt(self, str):
else:
return _Substitution(str)

def next(self):
def __next__(self):
'''Return the next record in the file.'''
line = next(self.reader)
row = self._row_pattern.split(line.rstrip())
chrom = row[0]
row = self._row_pattern.split(line.rstrip())
rowdict = collections.OrderedDict((zip(self._column_headers, row)))

if self._prepend_chr:
chrom = 'chr' + chrom
pos = int(row[1])

if row[2] != '.':
ID = row[2]
else:
ID = None

ref = row[3]
alt = self._map(self._parse_alt, row[4].split(','))

try:
qual = int(row[5])
except ValueError:
try:
qual = float(row[5])
except ValueError:
qual = None

filt = self._parse_filter(row[6])
info = self._parse_info(row[7])
rowdict["CHROM"] = 'chr' + rowdict["CHROM"]
#===============================================================================
# chrom = rowdict["CHROM"]
# if self._prepend_chr:
# chrom = 'chr' + chrom
# # chrom = row[0]
#===============================================================================

#===============================================================================
# pos = int(rowdict["POS"])
# # pos = int(row[1])
#===============================================================================

rowdict["ID"] = rowdict["ID"] if rowdict["ID"] != '.' else None
#=======================================================================
# if row[2] != '.':
# ID = row[2]
# else:
# ID = None
#=======================================================================

#=======================================================================
# ref = rowdict["REF"]
# alt = self._map(self._parse_alt, rowdict["ALT"].split(','))
#=======================================================================
rowdict["ALT"] = self._map(self._parse_alt, rowdict["ALT"].split(','))

try:
rowdict["QUAL"] = ast.literal_eval(rowdict["QUAL"])
except: # Any error means qual = None
rowdict["QUAL"] = None
#=======================================================================
# try:
# qual = int(row[5])
# except ValueError:
# try:
# qual = float(row[5])
# except ValueError:
# qual = None
#=======================================================================

rowdict["FILTER"] = self._parse_filter(rowdict["FILTER"])
rowdict["INFO"] = self._parse_info(rowdict["INFO"])
#=======================================================================
# filt = self._parse_filter(rowdict["FILTER"])
# info = self._parse_info(rowdict["INFO"])
#=======================================================================

try:
fmt = row[8]
except IndexError:
fmt = None
else:
if fmt == '.':
fmt = None

record = _Record(chrom, pos, ID, ref, alt, qual, filt,
info, fmt, self._sample_indexes)

if fmt is not None:
samples = self._parse_samples(row[9:], fmt, record)
if rowdict["FORMAT"] == '.':
rowdict["FORMAT"] = None
except KeyError:
rowdict["FORMAT"] = None
#=======================================================================
# try:
# fmt = row[8]
# except IndexError:
# fmt = None
# else:
# if fmt == '.':
# fmt = None
#=======================================================================

#=======================================================================
# record = _Record(chrom, pos, ID, ref, alt, qual, filt,
# info, fmt, self._sample_indexes)
#=======================================================================

# Genotype fields can be followed by other data columns,
# so we have to detect that
if rowdict["FORMAT"] is not None:
# We run this code here, because we have to wait for the first line
# of actual data.
# It will use the number of colons found in the FORMAT field to
# create a regex pattern. It will match that regex pattern against
# the data in the remaining non-standard columns and, if they appear
# to actually be sample data, add them to self.samples.
# self.samples remains set to None in init.
if self.samples is None:
self.samples = []
self._num_format_colons = rowdict["FORMAT"].count(":")
self._samples_pattern = "^"
for i in range(0,self._num_format_colons+1, 1):
self._samples_pattern += r"[0-9.,/-]{1,}:"
self._samples_pattern = self._samples_pattern.rstrip(":") + "$"
for header in self._column_headers:
if re.match(self._samples_pattern, str(rowdict[header])):
self.samples.append(header)
self._sample_indexes = dict([(x,i) for (i,x) in enumerate(self.samples)])
record = _Record(rowdict, self._sample_indexes)

# Generate an "items" list (of sample data) based on the columns
# identified to be containing sample data. This is passed to
# _parse_samples
items = []
for col in self.samples:
items.append(rowdict[col])

samples = self._parse_samples(items, rowdict["FORMAT"], record)
record.samples = samples

return record
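A standalone restatement of the sample-column detection rule added above (the helper name and example values are illustrative, not part of the patch): the FORMAT string's colon count fixes how many colon-separated, numeric-looking fields a genuine sample column must contain, and any non-standard column matching that shape is treated as a sample.

import re

def _looks_like_sample(value, fmt):
    # One "[0-9.,/-]{1,}" group per FORMAT field, joined by colons and
    # anchored at both ends - the same pattern built in __next__ above.
    n_fields = fmt.count(':') + 1
    pattern = '^' + ':'.join([r'[0-9.,/-]{1,}'] * n_fields) + '$'
    return re.match(pattern, str(value)) is not None

# _looks_like_sample('0/1:12:0.5', 'GT:DP:AF')        -> True
# _looks_like_sample('manually reviewed', 'GT:DP:AF') -> False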
@@ -641,7 +721,7 @@ class Writer(object):
"""VCF Writer. On Windows Python 2, open stream with 'wb'."""

# Reverse keys and values in header field count dictionary
counts = dict((v,k) for k,v in field_counts.iteritems())
counts = dict((v,k) for k,v in field_counts.items())

def __init__(self, stream, template, lineterminator="\n"):
self.writer = csv.writer(stream, delimiter="\t",
@@ -654,30 +734,30 @@ def __init__(self, stream, template, lineterminator="\n"):
# get a maximum key).
self.info_order = collections.defaultdict(
lambda: len(template.infos),
dict(zip(template.infos.iterkeys(), itertools.count())))
dict(list(zip(iter(template.infos.keys()), itertools.count()))))

two = '##{key}=<ID={0},Description="{1}">\n'
four = '##{key}=<ID={0},Number={num},Type={2},Description="{3}">\n'
_num = self._fix_field_count
for (key, vals) in template.metadata.iteritems():
for (key, vals) in template.metadata.items():
if key in SINGULAR_METADATA:
vals = [vals]
for val in vals:
if isinstance(val, dict):
values = ','.join('{0}={1}'.format(key, value)
for key, value in val.items())
for key, value in list(val.items()))
stream.write('##{0}=<{1}>\n'.format(key, values))
else:
stream.write('##{0}={1}\n'.format(key, val))
for line in template.infos.itervalues():
for line in template.infos.values():
stream.write(four.format(key="INFO", *line, num=_num(line.num)))
for line in template.formats.itervalues():
for line in template.formats.values():
stream.write(four.format(key="FORMAT", *line, num=_num(line.num)))
for line in template.filters.itervalues():
for line in template.filters.values():
stream.write(two.format(key="FILTER", *line))
for line in template.alts.itervalues():
for line in template.alts.values():
stream.write(two.format(key="ALT", *line))
for line in template.contigs.itervalues():
for line in template.contigs.values():
if line.length:
stream.write('##contig=<ID={0},length={1}>\n'.format(*line))
else: