Skip to content

Commit

Permalink
Merge pull request #205 from qiyunzhu/coord
Browse files Browse the repository at this point in the history
Improved handling of range coordinates
  • Loading branch information
qiyunzhu authored Sep 28, 2024
2 parents 58ddd0c + 635ae3c commit ddea40a
Show file tree
Hide file tree
Showing 12 changed files with 536 additions and 429 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
# Change Log

## Version 0.1.6-dev

### Changed
- Changed default output subject coverage (`--outcov`) coordinates into BED-like (0-based, exclusive end). The output can be directly parsed by programs like bedtools. Also added support for GFF-like and other custom formats, as controlled by parameter `--cov-fmt` ([#204](https://github.com/qiyunzhu/woltka/pull/204) and [#205](https://github.com/qiyunzhu/woltka/pull/205)).


## Version 0.1.6 (2/22/2024)

### Changed
Expand Down
14 changes: 13 additions & 1 deletion doc/cli.md
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ Option | Description
`--scale` | Scale counts by this factor. Accepts "k", "M" suffixes.
`--digits` | Round counts to this number of digits after the decimal point.

### Output files
### Output table

Option | Description
--- | ---
Expand All @@ -90,9 +90,21 @@ Option | Description
`--name-as-id` | Replace feature IDs with names. Otherwise append names to table as a metadata column.
`--add-rank` | Append feature ranks to table as a metadata column.
`--add-lineage` | Append lineage strings to table as a metadata column.

### Output mapping

Option | Description
--- | ---
`--outmap`, `-u` | Write read-to-feature maps to this directory.
`--zipmap` | Compress read-to-feature maps using this algorithm. Options: `none`, `gz` (default), `bz2`, `xz`.

### Output coverage

Option | Description
--- | ---
`--outcov` | Write subject coverage maps to this directory.
`--cov-fmt` | Format of subject coverage coordinates. Options: `bed` (BED-like, 0-based, exclusive end, equivalent to `0e`) (default), `gff` (GFF-like, 1-based, inclusive end, equivalent to `1i`), `0e`, `1e`, `0i`, `1i`.

### Performance

Option | Description
Expand Down
111 changes: 26 additions & 85 deletions woltka/align.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,11 @@ def plain_mapper(fh, fmt=None, excl=None, n=1000):
list of set of str
Subject(s) queue.
See Also
--------
.range.range_mapper
.ordinal.ordinal_mapper
Notes
-----
The design of this function aims to couple with the extremely large size of
Expand Down Expand Up @@ -111,74 +116,6 @@ def plain_mapper(fh, fmt=None, excl=None, n=1000):
yield qryque, subque


def range_mapper(fh, fmt=None, excl=None, n=1000):
    """Read an alignment file and yield maps of query to subject(s) and their
    ranges.

    Parameters
    ----------
    fh : file handle
        Alignment file to parse.
    fmt : str, optional
        Alignment file format.
    excl : set, optional
        Subjects to exclude.
    n : int, optional
        Number of unique queries per chunk.

    Yields
    ------
    list of str
        Query queue.
    list of dict of list of int
        Subject-to-ranges queue.

    Notes
    -----
    Same as `plain_mapper`, except that it also returns subject ranges.
    Ranges are stored as a one-dimensional, interleaved list of start1, end1,
    start2, end2, start3, end3...

    See Also
    --------
    plain_mapper
    .coverage.merge_ranges
    """
    it = iter_align(fh, fmt, excl, True)
    while True:
        filled, full = 0, False
        qry_chunk = [None] * n
        sub_chunk = [None] * n
        for query, records in it:

            # collect subject -> interleaved starts and ends for this query
            locs = {}
            for subject, _, _, start, end in records:

                # only keep records with positive start and end coordinates
                if not (start and end):
                    continue

                # ranges hitting the same subject share one flat list
                locs.setdefault(subject, []).extend((start, end))

            # queue the query only if it produced at least one range
            if not locs:
                continue
            qry_chunk[filled] = query
            sub_chunk[filled] = locs
            filled += 1
            if filled == n:
                full = True
                break

        if full:
            # chunk is exactly full; yield it and keep consuming the iterator
            yield qry_chunk, sub_chunk
        else:
            # iterator exhausted; yield the partial chunk (if any) and stop
            if filled:
                yield qry_chunk[:filled], sub_chunk[:filled]
            break


def iter_align(fh, fmt=None, excl=None, extr=None):
"""Generate an iterator of alignment file content.
Expand Down Expand Up @@ -340,6 +277,8 @@ def parse_sam_file(fh):
QNAME, FLAG, RNAME, POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL,
TAGS
POS is 1-based.
.. _Wikipedia:
https://en.wikipedia.org/wiki/SAM_(file_format)
.. _SAM format specification:
Expand Down Expand Up @@ -441,7 +380,7 @@ def parse_sam_file_ex(fh):
mate = int(flag) >> 6 & 3

# leftmost mapping position
pos = int(pos)
pos = int(pos) - 1

# parse CIGAR string
length, offset = cigar_to_lens(cigar)
Expand All @@ -457,7 +396,7 @@ def parse_sam_file_ex(fh):
this, pool = qname, ([], [], [])

# append
pool[mate].append((rname, None, length, pos, pos + offset - 1))
pool[mate].append((rname, None, length, pos, pos + offset))

# final yield
if pool[0]:
Expand Down Expand Up @@ -587,19 +526,19 @@ def parse_sam_file_ex_ft(fh, excl):
keep = rname not in excl
if keep:
pool = ([], [], [])
pos = int(pos)
pos = int(pos) - 1
length, offset = cigar_to_lens(cigar)
pool[int(flag) >> 6 & 3].append((
rname, None, length, pos, pos + offset - 1))
rname, None, length, pos, pos + offset))

elif keep:
if rname in excl:
keep = False
else:
pos = int(pos)
pos = int(pos) - 1
length, offset = cigar_to_lens(cigar)
pool[int(flag) >> 6 & 3].append((
rname, None, length, pos, pos + offset - 1))
rname, None, length, pos, pos + offset))

if pool[0]:
yield this, pool[0]
Expand Down Expand Up @@ -718,7 +657,7 @@ def parse_sam_file_pd(fh, n=65536):
# # this is slow, because of function all
# chunk['length'], offset = zip(*chunk['cigar'].apply(
# cigar_to_lens))
# chunk['right'] = chunk['pos'] + offset - 1
# chunk['right'] = chunk['pos'] + offset
# # this is slow, because of function all
# # chunk['qname'] = chunk[['qname', 'flag']].apply(
# # qname_by_flag, axis=1)
Expand Down Expand Up @@ -886,6 +825,8 @@ def parse_b6o_file(fh):
qseqid sseqid pident length mismatch gapopen qstart qend sstart send
evalue bitscore
Coordinates are 1-based, inclusive.
.. _BLAST manual:
https://www.ncbi.nlm.nih.gov/books/NBK279684/
"""
Expand Down Expand Up @@ -946,7 +887,7 @@ def parse_b6o_file_ex(fh):
except IndexError:
continue
sstart, send = sorted((int(x[8]), int(x[9])))
this, pool = qseqid, [(sseqid, score, length, sstart, send)]
this, pool = qseqid, [(sseqid, score, length, sstart - 1, send)]
break

# body
Expand All @@ -958,10 +899,10 @@ def parse_b6o_file_ex(fh):
continue
sstart, send = sorted((int(x[8]), int(x[9])))
if qseqid == this:
pool.append((sseqid, score, length, sstart, send))
pool.append((sseqid, score, length, sstart - 1, send))
else:
yield this, pool
this, pool = qseqid, [(sseqid, score, length, sstart, send)]
this, pool = qseqid, [(sseqid, score, length, sstart - 1, send)]

# foot
if this is not None:
Expand Down Expand Up @@ -1065,7 +1006,7 @@ def parse_b6o_file_ex_ft(fh, excl):
keep = False
else:
sstart, send = sorted((int(x[8]), int(x[9])))
pool.append((sseqid, score, length, sstart, send))
pool.append((sseqid, score, length, sstart - 1, send))
break

# body
Expand All @@ -1082,12 +1023,12 @@ def parse_b6o_file_ex_ft(fh, excl):
this = qseqid
keep = sseqid not in excl
if keep:
pool = [(sseqid, score, length, sstart, send)]
pool = [(sseqid, score, length, sstart - 1, send)]
elif keep:
if sseqid in excl:
keep = False
else:
pool.append((sseqid, score, length, sstart, send))
pool.append((sseqid, score, length, sstart - 1, send))

# foot
if this is not None and keep:
Expand Down Expand Up @@ -1177,7 +1118,7 @@ def parse_paf_file_ex(fh):
for line in fh:
x = line.split('\t')
try:
record = (x[5], int(x[11]), int(x[10]), int(x[7]) + 1, int(x[8]))
record = (x[5], int(x[11]), int(x[10]), int(x[7]), int(x[8]))
except (IndexError, ValueError):
continue
this, pool = x[0], [record]
Expand All @@ -1187,7 +1128,7 @@ def parse_paf_file_ex(fh):
for line in fh:
x = line.split('\t')
try:
record = (x[5], int(x[11]), int(x[10]), int(x[7]) + 1, int(x[8]))
record = (x[5], int(x[11]), int(x[10]), int(x[7]), int(x[8]))
except (IndexError, ValueError):
continue
if x[0] == this:
Expand Down Expand Up @@ -1290,7 +1231,7 @@ def parse_paf_file_ex_ft(fh, excl):
for line in fh:
x = line.split('\t')
try:
record = (x[5], int(x[11]), int(x[10]), int(x[7]) + 1, int(x[8]))
record = (x[5], int(x[11]), int(x[10]), int(x[7]), int(x[8]))
except (IndexError, ValueError):
continue
this = x[0]
Expand All @@ -1304,7 +1245,7 @@ def parse_paf_file_ex_ft(fh, excl):
for line in fh:
x = line.split('\t')
try:
record = (x[5], int(x[11]), int(x[10]), int(x[7]) + 1, int(x[8]))
record = (x[5], int(x[11]), int(x[10]), int(x[7]), int(x[8]))
except (IndexError, ValueError):
continue

Expand Down
5 changes: 5 additions & 0 deletions woltka/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,11 @@ def cli():
@click.option(
'--outcov', 'outcov_dir', type=click.Path(dir_okay=True),
help='Write subject coverage maps to this directory.')
@click.option(
'--cov-fmt', 'outcov_fmt', default='bed', type=click.Choice(
['bed', 'gff', '0e', '1e', '0i', '1i'], case_sensitive=False),
help=('Format of subject coverage coordinates. Default is BED-like '
'(0-based, exclusive end).'))
# performance
@click.option(
'--chunk', type=click.INT, default=None,
Expand Down
Loading

0 comments on commit ddea40a

Please sign in to comment.