Changes to parser.py and model.py to allow for nonstandard VCF file headers #326

Open · wants to merge 4 commits into master
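As a rough illustration of what "nonstandard VCF file headers" means here (the file name, sample name, and extra column below are hypothetical, not taken from the PR): the header row carries extra tab-separated columns after FORMAT that are not sample genotypes, which stock PyVCF would misread as sample names.

#CHROM  POS    ID  REF  ALT  QUAL  FILTER  INFO   FORMAT  SAMPLE1  EXTRA_NOTES
1       12345  .   A    G    50    PASS    DP=10  GT:DP   0/1:10   manually reviewed

Reading such a file should look the same as with stock PyVCF; with this branch the reader decides per data row which trailing columns actually hold genotype data:

import vcf

reader = vcf.Reader(open('example.vcf'))  # 'example.vcf' is a placeholder path
for record in reader:
    print(record.CHROM, record.POS, record.samples)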
58 changes: 40 additions & 18 deletions vcf/model.py
@@ -1,4 +1,5 @@
from abc import ABCMeta, abstractmethod
import ast
import collections
import sys
import re
@@ -172,18 +173,28 @@ class _Record(object):
Neither the upstream nor downstream flanking bases are
included in the region.
"""
def __init__(self, CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT,
sample_indexes, samples=None):
self.CHROM = CHROM
#: the one-based coordinate of the first nucleotide in ``REF``
self.POS = POS
self.ID = ID
self.REF = REF
self.ALT = ALT
self.QUAL = QUAL
self.FILTER = FILTER
self.INFO = INFO
self.FORMAT = FORMAT
def __init__(self, ROWDICT,
#CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT,
sample_indexes, samples=None):

for varname, value in ROWDICT.items():
# "." pretty always means no value (NOne)
if value == ".":
value = None

else:
try:
# Convert to int or float where possible
# literal_eval raises if the value is not a valid Python literal
value = ast.literal_eval(value)
except Exception as e:
# Any error means value is a string, so just leave it
# Silence this error, or print it for debugging
pass
#print("ERROR occurred converting value '{}' with AST!({})".format(value, e))

setattr(self, varname, value)  # equivalent to: self.<varname> = value

#: zero-based, half-open start coordinate of ``REF``
self.start = self.POS - 1
#: zero-based, half-open end coordinate of ``REF``
@@ -270,8 +281,20 @@ def __iter__(self):
return iter(self.samples)

def __str__(self):
return "Record(CHROM=%(CHROM)s, POS=%(POS)s, REF=%(REF)s, ALT=%(ALT)s)" % self.__dict__

# return "Record(CHROM=%(CHROM)s, POS=%(POS)s, REF=%(REF)s, ALT=%(ALT)s)" % self.__dict__
return ''.join(["Record(",
"CHROM=%(CHROM)s, ",
"POS=%(POS)s, ",
"ID=%(ID)s, ",
"REF=%(REF)s, ",
"ALT=%(ALT)s), "
"QUAL=%(QUAL)s), "
"FILTER=%(FILTER)s), "
"INFO=%(INFO)s), "
"FORMAT=%(FORMAT)s)"
]) % self.__dict__

# CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT
def add_format(self, fmt):
self.FORMAT = self.FORMAT + ':' + fmt

@@ -362,7 +385,7 @@ def heterozygosity(self):
If there are i alleles with frequency p_i, H=1-sum_i(p_i^2)
"""
allele_freqs = [1-sum(self.aaf)] + self.aaf
return 1 - sum(map(lambda x: x**2, allele_freqs))
return 1 - sum([x**2 for x in allele_freqs])

def get_hom_refs(self):
""" The list of hom ref genotypes"""
@@ -558,9 +581,8 @@ def is_filtered(self):
return True


class _AltRecord(object):
class _AltRecord(object, metaclass=ABCMeta):
'''An alternative allele record: either replacement string, SV placeholder, or breakend'''
__metaclass__ = ABCMeta

def __init__(self, type, **kwargs):
super(_AltRecord, self).__init__(**kwargs)
@@ -596,7 +618,7 @@ def __len__(self):
return len(self.sequence)

def __eq__(self, other):
if isinstance(other, basestring):
if isinstance(other, str):
return self.sequence == other
elif not isinstance(other, self.__class__):
return False
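A standalone sketch of the type-coercion rule the new _Record.__init__ applies to each raw column value (the helper name _coerce is mine, not part of the patch; the patch itself catches any exception rather than just the two below):

import ast

def _coerce(value):
    # "." means missing; values that parse as Python literals (e.g. "42",
    # "0.5") become numbers; everything else is kept as a string.
    if value == ".":
        return None
    try:
        return ast.literal_eval(value)
    except (ValueError, SyntaxError):
        return value

# _coerce("42") -> 42, _coerce("0.5") -> 0.5
# _coerce(".") -> None, _coerce("PASS") -> 'PASS'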
186 changes: 133 additions & 53 deletions vcf/parser.py
@@ -1,3 +1,4 @@
import ast
import codecs
import collections
import csv
@@ -22,8 +23,8 @@
except ImportError:
cparse = None

from model import _Call, _Record, make_calldata_tuple
from model import _Substitution, _Breakend, _SingleBreakend, _SV
from .model import _Call, _Record, make_calldata_tuple
from .model import _Substitution, _Breakend, _SingleBreakend, _SV


# Metadata parsers/constants
@@ -315,6 +316,7 @@ def _parse_metainfo(self):
parser = _vcf_metadata_parser()

line = next(self.reader)

while line.startswith('##'):
self._header_lines.append(line)

@@ -350,9 +352,14 @@ def _parse_metainfo(self):
line = next(self.reader)

fields = self._row_pattern.split(line[1:])
self._column_headers = fields[:9]
self.samples = fields[9:]
self._sample_indexes = dict([(x,i) for (i,x) in enumerate(self.samples)])
# self._column_headers = fields[:9]
self._column_headers = fields
self._non_standard_column_headers = (
set(self._column_headers) -
set(["CHROM", "POS", "ID", "REF", "ALT","QUAL", "FILTER",
"INFO", "FORMAT"]))
# self.samples = fields[9:] #removed in favor of running in __next__
# self._sample_indexes = dict([(x,i) for (i,x) in enumerate(self.samples)])

def _map(self, func, iterable, bad=['.', '']):
'''``map``, but make bad values None.'''
@@ -447,6 +454,10 @@ def _parse_sample_format(self, samp_fmt):
return samp_fmt

def _parse_samples(self, samples, samp_fmt, site):
print("-----------------------------")#333
print("samples =", samples) #3333
print("samp_fmt =", samp_fmt)#3333
print("site =", site) #333
'''Parse a sample entry according to the format specified in the FORMAT
column.

@@ -459,6 +470,7 @@ def _parse_samples(self, samples, samp_fmt, site):
self._format_cache[samp_fmt] = self._parse_sample_format(samp_fmt)
samp_fmt = self._format_cache[samp_fmt]

print("cparse=", cparse)#333
if cparse:
return cparse.parse_samples(
self.samples, samples, samp_fmt, samp_fmt._types, samp_fmt._nums, site)
@@ -467,13 +479,16 @@ def _parse_samples(self, samples, samp_fmt, site):
_map = self._map

nfields = len(samp_fmt._fields)

for name, sample in itertools.izip(self.samples, samples):


print("self.samples", self.samples) #3333
print("samples =", samples)#3333
for name, sample in zip(self.samples, samples):
print("Zipped: ", name, sample) #333
# parse the data for this sample
sampdat = [None] * nfields

for i, vals in enumerate(sample.split(':')):
print("sample split = ", i, ":", vals)#333

# short circuit the most common
if samp_fmt._fields[i] == 'GT':
@@ -548,47 +563,112 @@ def _parse_alt(self, str):
else:
return _Substitution(str)

def next(self):
def __next__(self):
'''Return the next record in the file.'''
line = next(self.reader)
row = self._row_pattern.split(line.rstrip())
chrom = row[0]
row = self._row_pattern.split(line.rstrip())
rowdict = collections.OrderedDict((zip(self._column_headers, row)))

if self._prepend_chr:
chrom = 'chr' + chrom
pos = int(row[1])

if row[2] != '.':
ID = row[2]
else:
ID = None

ref = row[3]
alt = self._map(self._parse_alt, row[4].split(','))

try:
qual = int(row[5])
except ValueError:
try:
qual = float(row[5])
except ValueError:
qual = None

filt = self._parse_filter(row[6])
info = self._parse_info(row[7])
rowdict["CHROM"] = 'chr' + rowdict["CHROM"]
#===============================================================================
# chrom = rowdict["CHROM"]
# if self._prepend_chr:
# chrom = 'chr' + chrom
# # chrom = row[0]
#===============================================================================

#===============================================================================
# pos = int(rowdict["POS"])
# # pos = int(row[1])
#===============================================================================

rowdict["ID"] = rowdict["ID"] if rowdict["ID"] != '.' else None
#=======================================================================
# if row[2] != '.':
# ID = row[2]
# else:
# ID = None
#=======================================================================

#=======================================================================
# ref = rowdict["REF"]
# alt = self._map(self._parse_alt, rowdict["ALT"].split(','))
#=======================================================================
rowdict["ALT"] = self._map(self._parse_alt, rowdict["ALT"].split(','))

try:
rowdict["QUAL"] = ast.literal_eval(rowdict["QUAL"])
except: # Any error means qual = None
rowdict["QUAL"] = None
#=======================================================================
# try:
# qual = int(row[5])
# except ValueError:
# try:
# qual = float(row[5])
# except ValueError:
# qual = None
#=======================================================================

rowdict["FILTER"] = self._parse_filter(rowdict["FILTER"])
rowdict["INFO"] = self._parse_info(rowdict["INFO"])
#=======================================================================
# filt = self._parse_filter(rowdict["FILTER"])
# info = self._parse_info(rowdict["INFO"])
#=======================================================================

try:
fmt = row[8]
except IndexError:
fmt = None
else:
if fmt == '.':
fmt = None

record = _Record(chrom, pos, ID, ref, alt, qual, filt,
info, fmt, self._sample_indexes)

if fmt is not None:
samples = self._parse_samples(row[9:], fmt, record)
if rowdict["FORMAT"] == '.':
rowdict["FORMAT"] = None
except KeyError:
rowdict["FORMAT"] = None
#=======================================================================
# try:
# fmt = row[8]
# except IndexError:
# fmt = None
# else:
# if fmt == '.':
# fmt = None
#=======================================================================

#=======================================================================
# record = _Record(chrom, pos, ID, ref, alt, qual, filt,
# info, fmt, self._sample_indexes)
#=======================================================================

# Genotype fields can be followed by other data columns,
# so we have to detect that
if rowdict["FORMAT"] is not None:
# We run this code here, because we have to wait for the first line
# of actual data.
# It will use the number of colons found in the FORMAT field to
# create a regex pattern. It will match that regex pattern against
# the data in the remaining non-standard columns and, if they appear
# to actually be sample data, add them to self.samples.
# self.samples remains set to None in init.
if self.samples is None:
self.samples = []
self._num_format_colons = rowdict["FORMAT"].count(":")
self._samples_pattern = "^"
for i in range(0,self._num_format_colons+1, 1):
self._samples_pattern += r"[0-9.,/-]{1,}:"
self._samples_pattern = self._samples_pattern.rstrip(":") + "$"
for header in self._column_headers:
if re.match(self._samples_pattern, str(rowdict[header])):
self.samples.append(header)
self._sample_indexes = dict([(x,i) for (i,x) in enumerate(self.samples)])
record = _Record(rowdict, self._sample_indexes)

# Generate an "items" list (of sample data) based on the columns
# identified to be containing sample data. This is passed to
# _parse_samples
items = []
for col in self.samples:
items.append(rowdict[col])

samples = self._parse_samples(items, rowdict["FORMAT"], record)
record.samples = samples

return record
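A standalone restatement of the sample-column detection rule added above (the helper name and example values are illustrative, not part of the patch): the FORMAT string's colon count fixes how many colon-separated, numeric-looking fields a genuine sample column must contain, and any non-standard column matching that shape is treated as a sample.

import re

def _looks_like_sample(value, fmt):
    # One "[0-9.,/-]{1,}" group per FORMAT field, joined by colons and
    # anchored at both ends - the same pattern built in __next__ above.
    n_fields = fmt.count(':') + 1
    pattern = '^' + ':'.join([r'[0-9.,/-]{1,}'] * n_fields) + '$'
    return re.match(pattern, str(value)) is not None

# _looks_like_sample('0/1:12:0.5', 'GT:DP:AF')        -> True
# _looks_like_sample('manually reviewed', 'GT:DP:AF') -> False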
@@ -641,7 +721,7 @@ class Writer(object):
"""VCF Writer. On Windows Python 2, open stream with 'wb'."""

# Reverse keys and values in header field count dictionary
counts = dict((v,k) for k,v in field_counts.iteritems())
counts = dict((v,k) for k,v in field_counts.items())

def __init__(self, stream, template, lineterminator="\n"):
self.writer = csv.writer(stream, delimiter="\t",
@@ -654,30 +734,30 @@ def __init__(self, stream, template, lineterminator="\n"):
# get a maximum key).
self.info_order = collections.defaultdict(
lambda: len(template.infos),
dict(zip(template.infos.iterkeys(), itertools.count())))
dict(list(zip(iter(template.infos.keys()), itertools.count()))))

two = '##{key}=<ID={0},Description="{1}">\n'
four = '##{key}=<ID={0},Number={num},Type={2},Description="{3}">\n'
_num = self._fix_field_count
for (key, vals) in template.metadata.iteritems():
for (key, vals) in template.metadata.items():
if key in SINGULAR_METADATA:
vals = [vals]
for val in vals:
if isinstance(val, dict):
values = ','.join('{0}={1}'.format(key, value)
for key, value in val.items())
for key, value in list(val.items()))
stream.write('##{0}=<{1}>\n'.format(key, values))
else:
stream.write('##{0}={1}\n'.format(key, val))
for line in template.infos.itervalues():
for line in template.infos.values():
stream.write(four.format(key="INFO", *line, num=_num(line.num)))
for line in template.formats.itervalues():
for line in template.formats.values():
stream.write(four.format(key="FORMAT", *line, num=_num(line.num)))
for line in template.filters.itervalues():
for line in template.filters.values():
stream.write(two.format(key="FILTER", *line))
for line in template.alts.itervalues():
for line in template.alts.values():
stream.write(two.format(key="ALT", *line))
for line in template.contigs.itervalues():
for line in template.contigs.values():
if line.length:
stream.write('##contig=<ID={0},length={1}>\n'.format(*line))
else: