Skip to content

Commit

Permalink
Merge pull request #205 from qiyunzhu/coord
Browse files Browse the repository at this point in the history
Improved handling of range coordinates
  • Loading branch information
qiyunzhu authored Sep 28, 2024
2 parents 58ddd0c + 635ae3c commit ddea40a
Show file tree
Hide file tree
Showing 12 changed files with 536 additions and 429 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
# Change Log

## Version 0.1.6-dev

### Changed
- Changed default output subject coverage (`--outcov`) coordinates into BED-like (0-based, exclusive end). The output can be directly parsed by programs like bedtools. Also added support for GFF-like and other custom formats, as controlled by parameter `--cov-fmt` ([#204](https://github.com/qiyunzhu/woltka/pull/204) and [#205](https://github.com/qiyunzhu/woltka/pull/205)).


## Version 0.1.6 (2/22/2024)

### Changed
Expand Down
14 changes: 13 additions & 1 deletion doc/cli.md
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ Option | Description
`--scale` | Scale counts by this factor. Accepts "k", "M" suffixes.
`--digits` | Round counts to this number of digits after the decimal point.

### Output files
### Output table

Option | Description
--- | ---
Expand All @@ -90,9 +90,21 @@ Option | Description
`--name-as-id` | Replace feature IDs with names. Otherwise append names to table as a metadata column.
`--add-rank` | Append feature ranks to table as a metadata column.
`--add-lineage` | Append lineage strings to table as a metadata column.

### Output mapping

Option | Description
--- | ---
`--outmap`, `-u` | Write read-to-feature maps to this directory.
`--zipmap` | Compress read-to-feature maps using this algorithm. Options: `none`, `gz` (default), `bz2`, `xz`.

### Output coverage

Option | Description
--- | ---
`--outcov` | Write subject coverage maps to this directory.
`--cov-fmt` | Format of subject coverage coordinates. Options: `bed` (BED-like, 0-based, exclusive end, equivalent to `0e`) (default), `gff` (GFF-like, 1-based, inclusive end, equivalent to `1i`), `0e`, `1e`, `0i`, `1i`.

### Performance

Option | Description
Expand Down
111 changes: 26 additions & 85 deletions woltka/align.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,11 @@ def plain_mapper(fh, fmt=None, excl=None, n=1000):
list of set of str
Subject(s) queue.
See Also
--------
.range.range_mapper
.ordinal.ordinal_mapper
Notes
-----
The design of this function aims to couple with the extremely large size of
Expand Down Expand Up @@ -111,74 +116,6 @@ def plain_mapper(fh, fmt=None, excl=None, n=1000):
yield qryque, subque


def range_mapper(fh, fmt=None, excl=None, n=1000):
    """Read an alignment file and yield maps of query to subject(s) and their
    ranges.

    Parameters
    ----------
    fh : file handle
        Alignment file to parse.
    fmt : str, optional
        Alignment file format.
    excl : set, optional
        Subjects to exclude.
    n : int, optional
        Number of unique queries per chunk.

    Yields
    ------
    list of str
        Query queue.
    list of dict of list of int
        Subject-to-ranges queue.

    Notes
    -----
    Same as `plain_mapper`, except that it also returns subject ranges.
    Ranges are stored as a one-dimensional, interleaved list of start1, end1,
    start2, end2, start3, end3...

    See Also
    --------
    plain_mapper
    .coverage.merge_ranges
    """
    it = iter_align(fh, fmt, excl, True)
    while True:
        filled, full = 0, False
        qry_chunk = [None] * n
        sub_chunk = [None] * n
        for query, records in it:

            # collect subject -> interleaved starts and ends for this query
            locs = {}
            for subject, _, _, start, end in records:

                # only keep records with positive start and end coordinates
                if not (start and end):
                    continue

                # ranges hitting the same subject share one flat list
                locs.setdefault(subject, []).extend((start, end))

            # queue the query only if it produced at least one range
            if not locs:
                continue
            qry_chunk[filled] = query
            sub_chunk[filled] = locs
            filled += 1
            if filled == n:
                full = True
                break

        if full:
            # chunk is exactly full; yield it and keep consuming the iterator
            yield qry_chunk, sub_chunk
        else:
            # iterator exhausted; yield the partial chunk (if any) and stop
            if filled:
                yield qry_chunk[:filled], sub_chunk[:filled]
            break


def iter_align(fh, fmt=None, excl=None, extr=None):
"""Generate an iterator of alignment file content.
Expand Down Expand Up @@ -340,6 +277,8 @@ def parse_sam_file(fh):
QNAME, FLAG, RNAME, POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL,
TAGS
POS is 1-based.
.. _Wikipedia:
https://en.wikipedia.org/wiki/SAM_(file_format)
.. _SAM format specification:
Expand Down Expand Up @@ -441,7 +380,7 @@ def parse_sam_file_ex(fh):
mate = int(flag) >> 6 & 3

# leftmost mapping position
pos = int(pos)
pos = int(pos) - 1

# parse CIGAR string
length, offset = cigar_to_lens(cigar)
Expand All @@ -457,7 +396,7 @@ def parse_sam_file_ex(fh):
this, pool = qname, ([], [], [])

# append
pool[mate].append((rname, None, length, pos, pos + offset - 1))
pool[mate].append((rname, None, length, pos, pos + offset))

# final yield
if pool[0]:
Expand Down Expand Up @@ -587,19 +526,19 @@ def parse_sam_file_ex_ft(fh, excl):
keep = rname not in excl
if keep:
pool = ([], [], [])
pos = int(pos)
pos = int(pos) - 1
length, offset = cigar_to_lens(cigar)
pool[int(flag) >> 6 & 3].append((
rname, None, length, pos, pos + offset - 1))
rname, None, length, pos, pos + offset))

elif keep:
if rname in excl:
keep = False
else:
pos = int(pos)
pos = int(pos) - 1
length, offset = cigar_to_lens(cigar)
pool[int(flag) >> 6 & 3].append((
rname, None, length, pos, pos + offset - 1))
rname, None, length, pos, pos + offset))

if pool[0]:
yield this, pool[0]
Expand Down Expand Up @@ -718,7 +657,7 @@ def parse_sam_file_pd(fh, n=65536):
# # this is slow, because of function all
# chunk['length'], offset = zip(*chunk['cigar'].apply(
# cigar_to_lens))
# chunk['right'] = chunk['pos'] + offset - 1
# chunk['right'] = chunk['pos'] + offset
# # this is slow, because of function all
# # chunk['qname'] = chunk[['qname', 'flag']].apply(
# # qname_by_flag, axis=1)
Expand Down Expand Up @@ -886,6 +825,8 @@ def parse_b6o_file(fh):
qseqid sseqid pident length mismatch gapopen qstart qend sstart send
evalue bitscore
Coordinates are 1-based, inclusive.
.. _BLAST manual:
https://www.ncbi.nlm.nih.gov/books/NBK279684/
"""
Expand Down Expand Up @@ -946,7 +887,7 @@ def parse_b6o_file_ex(fh):
except IndexError:
continue
sstart, send = sorted((int(x[8]), int(x[9])))
this, pool = qseqid, [(sseqid, score, length, sstart, send)]
this, pool = qseqid, [(sseqid, score, length, sstart - 1, send)]
break

# body
Expand All @@ -958,10 +899,10 @@ def parse_b6o_file_ex(fh):
continue
sstart, send = sorted((int(x[8]), int(x[9])))
if qseqid == this:
pool.append((sseqid, score, length, sstart, send))
pool.append((sseqid, score, length, sstart - 1, send))
else:
yield this, pool
this, pool = qseqid, [(sseqid, score, length, sstart, send)]
this, pool = qseqid, [(sseqid, score, length, sstart - 1, send)]

# foot
if this is not None:
Expand Down Expand Up @@ -1065,7 +1006,7 @@ def parse_b6o_file_ex_ft(fh, excl):
keep = False
else:
sstart, send = sorted((int(x[8]), int(x[9])))
pool.append((sseqid, score, length, sstart, send))
pool.append((sseqid, score, length, sstart - 1, send))
break

# body
Expand All @@ -1082,12 +1023,12 @@ def parse_b6o_file_ex_ft(fh, excl):
this = qseqid
keep = sseqid not in excl
if keep:
pool = [(sseqid, score, length, sstart, send)]
pool = [(sseqid, score, length, sstart - 1, send)]
elif keep:
if sseqid in excl:
keep = False
else:
pool.append((sseqid, score, length, sstart, send))
pool.append((sseqid, score, length, sstart - 1, send))

# foot
if this is not None and keep:
Expand Down Expand Up @@ -1177,7 +1118,7 @@ def parse_paf_file_ex(fh):
for line in fh:
x = line.split('\t')
try:
record = (x[5], int(x[11]), int(x[10]), int(x[7]) + 1, int(x[8]))
record = (x[5], int(x[11]), int(x[10]), int(x[7]), int(x[8]))
except (IndexError, ValueError):
continue
this, pool = x[0], [record]
Expand All @@ -1187,7 +1128,7 @@ def parse_paf_file_ex(fh):
for line in fh:
x = line.split('\t')
try:
record = (x[5], int(x[11]), int(x[10]), int(x[7]) + 1, int(x[8]))
record = (x[5], int(x[11]), int(x[10]), int(x[7]), int(x[8]))
except (IndexError, ValueError):
continue
if x[0] == this:
Expand Down Expand Up @@ -1290,7 +1231,7 @@ def parse_paf_file_ex_ft(fh, excl):
for line in fh:
x = line.split('\t')
try:
record = (x[5], int(x[11]), int(x[10]), int(x[7]) + 1, int(x[8]))
record = (x[5], int(x[11]), int(x[10]), int(x[7]), int(x[8]))
except (IndexError, ValueError):
continue
this = x[0]
Expand All @@ -1304,7 +1245,7 @@ def parse_paf_file_ex_ft(fh, excl):
for line in fh:
x = line.split('\t')
try:
record = (x[5], int(x[11]), int(x[10]), int(x[7]) + 1, int(x[8]))
record = (x[5], int(x[11]), int(x[10]), int(x[7]), int(x[8]))
except (IndexError, ValueError):
continue

Expand Down
5 changes: 5 additions & 0 deletions woltka/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,11 @@ def cli():
@click.option(
'--outcov', 'outcov_dir', type=click.Path(dir_okay=True),
help='Write subject coverage maps to this directory.')
@click.option(
'--cov-fmt', 'outcov_fmt', default='bed', type=click.Choice(
['bed', 'gff', '0e', '1e', '0i', '1i'], case_sensitive=False),
help=('Format of subject coverage coordinates. Default is BED-like '
'(0-based, exclusive end).'))
# performance
@click.option(
'--chunk', type=click.INT, default=None,
Expand Down
Loading

0 comments on commit ddea40a

Please sign in to comment.