Cw1814 fastqparse
nrhorner committed Jun 7, 2023
1 parent 7d36682 commit cfcb322
Showing 4 changed files with 24 additions and 61 deletions.
1 change: 1 addition & 0 deletions conda/meta.yaml
@@ -29,6 +29,7 @@ requirements:
     - six
     - pandas
     - numpy
+    - pysam
 
 test:
   commands:
22 changes: 6 additions & 16 deletions pychopper/scripts/pychopper.py
@@ -358,10 +358,6 @@ def main():
     sys.stderr.write("Using kit: {}\n".format(args.k))
     sys.stderr.write("Configurations to consider: \"{}\"\n".format(CONFIG))
 
-    in_fh = sys.stdin
-    if args.input_fastx != '-':
-        in_fh = _opener(args.input_fastx, "r")
-
     out_fh = sys.stdout
     if args.output_fastx != '-':
         out_fh = open(args.output_fastx, "w")
@@ -437,24 +433,18 @@ def backend(x, pool, q=None, mb=None):
         sys.stderr.write(
             "Total fastq records in input file: {}\n".format(nr_records))
         read_sample = list(
-            seu.readfq(_opener(args.input_fastx, "r"), sample=target_prop,
-                       min_qual=args.Q))
+            seu.readfq(args.input_fastx, sample=target_prop, min_qual=args.Q))
         sys.stderr.write(
             "Tuning the cutoff parameter (q) on {} sampled reads ({:.1f}%) passing quality filters (Q >= {}).\n".format(
                 len(read_sample), target_prop * 100.0, args.Q))
         sys.stderr.write("Optimizing over {} cutoff values.\n".format(args.L))
         for qv in tqdm.tqdm(cutoffs):
             clsLen = 0
             cls = 0
-            with concurrent.futures.ProcessPoolExecutor(
-                    max_workers=args.t) as executor:
+            with concurrent.futures.ProcessPoolExecutor(max_workers=args.t) as executor:
                 for batch in utils.batch(read_sample, int((len(read_sample)))):
-                    for read, (segments, hits, usable_len) in backend(batch,
-                                                                      executor,
-                                                                      qv,
-                                                                      max(1000,
-                                                                          int((
-                                                                              len(read_sample)) / args.t))):
+                    for read, (segments, hits, usable_len) \
+                            in backend(batch, executor, qv, max(1000, int((len(read_sample)) / args.t))):
                         flt = list([x.Len for x in segments if x.Len > 0])
                         if len(flt) == 1:
                             clsLen += sum(flt)
@@ -494,7 +484,7 @@ def backend(x, pool, q=None, mb=None):
         with concurrent.futures.ProcessPoolExecutor(
                 max_workers=args.t) as executor:
             for batch in utils.batch(
-                    seu.readfq(in_fh, min_qual=args.Q, rfq_sup=rfq_sup), args.B):
+                    seu.readfq(args.input_fastx, min_qual=args.Q, rfq_sup=rfq_sup), args.B):
                 for read, (segments, hits, usable_len) in backend(batch, executor,
                                                                   q=args.q,
                                                                   mb=min_batch_size):
@@ -546,7 +536,7 @@ def backend(x, pool, q=None, mb=None):
     if args.S is not None:
         stdf.to_csv(args.S, sep="\t", index=False)
 
-    for fh in (in_fh, out_fh, u_fh, l_fh, w_fh, a_fh, d_fh):
+    for fh in (out_fh, u_fh, l_fh, w_fh, a_fh, d_fh):
         if fh is None:
             continue
         fh.flush()
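Note on the call-site changes above: seu.readfq previously took an open file handle (hence the in_fh/_opener plumbing this commit deletes) and now takes the FASTQ path itself, leaving the opening to pysam. A minimal sketch of driving the new signature, assuming pychopper's utils.batch helper as used in the diff; the file name reads.fastq, the quality cutoff and the batch size are illustrative, not from the commit:

    from pychopper import seq_utils as seu
    from pychopper import utils

    # Optional bookkeeping dict, as in the diff: "total"/"pass" counters plus
    # an output FASTQ collecting reads that fail the quality filter.
    rfq_sup = {"total": 0, "pass": 0, "out_fq": "failed_reads.fastq"}

    # readfq now opens the path itself (via pysam.FastxFile); no handle to manage.
    for batch in utils.batch(seu.readfq("reads.fastq", min_qual=7.0, rfq_sup=rfq_sup), 100):
        for read in batch:  # read is a Seq tuple: Id, Name, Seq, Qual, Umi
            print(read.Id, len(read.Seq))

    print("passed {} of {} reads".format(rfq_sup["pass"], rfq_sup["total"]))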
61 changes: 16 additions & 45 deletions pychopper/seq_utils.py
@@ -2,6 +2,7 @@

 from six.moves import reduce
 from numpy.random import random
+from pysam import FastxFile
 from math import log
 import sys
 from pychopper.common_structures import Seq
@@ -49,7 +50,7 @@ def reverse_complement(seq):
     return reduce(lambda x, y: x + y, map(base_complement, seq[::-1]))
 
 
-def readfq(fp, sample=None, min_qual=None, rfq_sup={}):  # this is a generator function
+def readfq(fastq, sample=None, min_qual=0, rfq_sup={}):  # this is a generator function
     """
     Below function taken from https://github.com/lh3/readfq/blob/master/readfq.py
     Much faster parsing of large files compared to Biopython.
@@ -58,53 +59,23 @@ def readfq(fastq, sample=None, min_qual=0, rfq_sup={}):
     tsup = "total" in rfq_sup
     if sup:
         fh = open(rfq_sup["out_fq"], "w")
-    last = None  # this is a buffer keeping the last unprocessed line
-    while True:  # mimic closure; is it a bad idea?
-        if not last:  # the first record or a record following a fastq
-            for l in fp:  # search for the start of the next record
-                if l[0] in '>@':  # fasta/q header line
-                    last = l[:-1]  # save this line
-                    break
-        if not last:
-            break
-        name, seqs, last = last[1:], [], None
-        for l in fp:  # read the sequence
-            if l[0] in '@+>':
-                last = l[:-1]
-                break
-            seqs.append(l[:-1])
-        if not last or last[0] != '+':  # this is a fasta record
-            if sample is None or (random() < sample):
-                if tsup:
-                    rfq_sup["total"] += 1
-                yield Seq(name.split(" ", 1)[0], name, ''.join(seqs), None, None)  # yield a fasta record
-            if not last:
-                break
-        else:  # this is a fastq record
-            seq, leng, seqs = ''.join(seqs), 0, []
-            for l in fp:  # read the quality
-                seqs.append(l[:-1])
-                leng += len(l) - 1
-                if leng >= len(seq):  # have read enough quality
-                    last = None
-                    if sample is None or (random() < sample):
-                        quals = "".join(seqs)
-                        oseq = Seq(Id=name.split(" ", 1)[0], Name=name, Seq=seq, Qual=quals, Umi=None)
-                        if tsup:
-                            rfq_sup["total"] += 1
-                        if not (min_qual is not None and min_qual > 0 and mean_qual(quals) < min_qual):
-                            if tsup:
-                                rfq_sup["pass"] += 1
-                            yield oseq
-                        else:
-                            if sup:
-                                writefq(oseq, fh)
-                    break
-            if last:  # reach EOF before reading enough quality
-                if sample is None or (random() < sample):
-                    yield Seq(name.split(" ", 1)[0], name, seq, None)  # yield a fasta record instead
-                break
-
+
+    with FastxFile(fastq) as fqin:
+        for fx in fqin:
+            if sample is None or (random() < sample):
+                if tsup:
+                    rfq_sup["total"] += 1
+                qual_array = fx.get_quality_array()
+                if qual_array is None or \
+                        not (sum(qual_array) / len(qual_array)) < min_qual:
+                    if tsup:
+                        rfq_sup["pass"] += 1
+                    yield Seq(
+                        Id=fx.name, Name=f"{fx.name} {fx.comment}",
+                        Seq=fx.sequence, Qual=fx.quality, Umi=None)
+                else:
+                    if sup:
+                        fh.write(str(fx))
     if sup:
         fh.flush()
         fh.close()
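For reference, the replacement builds on pysam's FastxFile: iterating it yields records exposing name, comment, sequence and quality; get_quality_array() returns the per-base Phred scores as integers, or None for FASTA records without qualities (which is why the "qual_array is None" branch above lets FASTA reads through unfiltered); and str(record) renders the record back to FASTQ text. A self-contained sketch of the same mean-quality filter, with an illustrative input path and cutoff:

    from pysam import FastxFile

    MIN_QUAL = 7.0  # illustrative cutoff, standing in for readfq's min_qual

    with FastxFile("sample.fastq") as fqin, open("failed.fastq", "w") as failed:
        for fx in fqin:
            quals = fx.get_quality_array()  # per-base Phred scores; None for FASTA
            # The diff's "not mean < min_qual" is equivalent to "mean >= min_qual";
            # records without qualities are always kept.
            if quals is None or sum(quals) / len(quals) >= MIN_QUAL:
                print(fx.name, len(fx.sequence))
            else:
                failed.write(str(fx) + "\n")  # str(fx) has no trailing newline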
1 change: 1 addition & 0 deletions requirements.txt
@@ -6,3 +6,4 @@ tqdm==4.26.0
 pandas
 numpy
 pytest
+pysam
