From cfcb32223561affbfa90f210002a6e0261f62914 Mon Sep 17 00:00:00 2001 From: Neil Horner Date: Wed, 7 Jun 2023 11:31:15 +0000 Subject: [PATCH] Cw1814 fastqparse --- conda/meta.yaml | 1 + pychopper/scripts/pychopper.py | 22 ++++-------- pychopper/seq_utils.py | 61 +++++++++------------------------- requirements.txt | 1 + 4 files changed, 24 insertions(+), 61 deletions(-) diff --git a/conda/meta.yaml b/conda/meta.yaml index fccc2d5..51feb5a 100644 --- a/conda/meta.yaml +++ b/conda/meta.yaml @@ -29,6 +29,7 @@ requirements: - six - pandas - numpy + - pysam test: commands: diff --git a/pychopper/scripts/pychopper.py b/pychopper/scripts/pychopper.py index 34d5f97..2e587c8 100755 --- a/pychopper/scripts/pychopper.py +++ b/pychopper/scripts/pychopper.py @@ -358,10 +358,6 @@ def main(): sys.stderr.write("Using kit: {}\n".format(args.k)) sys.stderr.write("Configurations to consider: \"{}\"\n".format(CONFIG)) - in_fh = sys.stdin - if args.input_fastx != '-': - in_fh = _opener(args.input_fastx, "r") - out_fh = sys.stdout if args.output_fastx != '-': out_fh = open(args.output_fastx, "w") @@ -437,8 +433,7 @@ def backend(x, pool, q=None, mb=None): sys.stderr.write( "Total fastq records in input file: {}\n".format(nr_records)) read_sample = list( - seu.readfq(_opener(args.input_fastx, "r"), sample=target_prop, - min_qual=args.Q)) + seu.readfq(args.input_fastx, sample=target_prop, min_qual=args.Q)) sys.stderr.write( "Tuning the cutoff parameter (q) on {} sampled reads ({:.1f}%) passing quality filters (Q >= {}).\n".format( len(read_sample), target_prop * 100.0, args.Q)) @@ -446,15 +441,10 @@ def backend(x, pool, q=None, mb=None): for qv in tqdm.tqdm(cutoffs): clsLen = 0 cls = 0 - with concurrent.futures.ProcessPoolExecutor( - max_workers=args.t) as executor: + with concurrent.futures.ProcessPoolExecutor(max_workers=args.t) as executor: for batch in utils.batch(read_sample, int((len(read_sample)))): - for read, (segments, hits, usable_len) in backend(batch, - executor, - qv, - max(1000, - int(( - len(read_sample)) / args.t))): + for read, (segments, hits, usable_len) \ + in backend(batch, executor, qv, max(1000, int((len(read_sample)) / args.t))): flt = list([x.Len for x in segments if x.Len > 0]) if len(flt) == 1: clsLen += sum(flt) @@ -494,7 +484,7 @@ def backend(x, pool, q=None, mb=None): with concurrent.futures.ProcessPoolExecutor( max_workers=args.t) as executor: for batch in utils.batch( - seu.readfq(in_fh, min_qual=args.Q, rfq_sup=rfq_sup), args.B): + seu.readfq(args.input_fastx, min_qual=args.Q, rfq_sup=rfq_sup), args.B): for read, (segments, hits, usable_len) in backend(batch, executor, q=args.q, mb=min_batch_size): @@ -546,7 +536,7 @@ def backend(x, pool, q=None, mb=None): if args.S is not None: stdf.to_csv(args.S, sep="\t", index=False) - for fh in (in_fh, out_fh, u_fh, l_fh, w_fh, a_fh, d_fh): + for fh in (out_fh, u_fh, l_fh, w_fh, a_fh, d_fh): if fh is None: continue fh.flush() diff --git a/pychopper/seq_utils.py b/pychopper/seq_utils.py index 5bfed0b..8760038 100644 --- a/pychopper/seq_utils.py +++ b/pychopper/seq_utils.py @@ -2,6 +2,7 @@ from six.moves import reduce from numpy.random import random +from pysam import FastxFile from math import log import sys from pychopper.common_structures import Seq @@ -49,7 +50,7 @@ def reverse_complement(seq): return reduce(lambda x, y: x + y, map(base_complement, seq[::-1])) -def readfq(fp, sample=None, min_qual=None, rfq_sup={}): # this is a generator function +def readfq(fastq, sample=None, min_qual=0, rfq_sup={}): # this is a generator function """ Below function taken from https://github.com/lh3/readfq/blob/master/readfq.py Much faster parsing of large files compared to Biopyhton. @@ -58,53 +59,23 @@ def readfq(fp, sample=None, min_qual=None, rfq_sup={}): # this is a generator f tsup = "total" in rfq_sup if sup: fh = open(rfq_sup["out_fq"], "w") - last = None # this is a buffer keeping the last unprocessed line - while True: # mimic closure; is it a bad idea? - if not last: # the first record or a record following a fastq - for l in fp: # search for the start of the next record - if l[0] in '>@': # fasta/q header line - last = l[:-1] # save this line - break - if not last: - break - name, seqs, last = last[1:], [], None - for l in fp: # read the sequence - if l[0] in '@+>': - last = l[:-1] - break - seqs.append(l[:-1]) - if not last or last[0] != '+': # this is a fasta record + + with FastxFile(fastq) as fqin: + for fx in fqin: if sample is None or (random() < sample): if tsup: rfq_sup["total"] += 1 - yield Seq(name.split(" ", 1)[0], name, ''.join(seqs), None, None) # yield a fasta record - if not last: - break - else: # this is a fastq record - seq, leng, seqs = ''.join(seqs), 0, [] - for l in fp: # read the quality - seqs.append(l[:-1]) - leng += len(l) - 1 - if leng >= len(seq): # have read enough quality - last = None - if sample is None or (random() < sample): - quals = "".join(seqs) - oseq = Seq(Id=name.split(" ", 1)[0], Name=name, Seq=seq, Qual=quals, Umi=None) - if tsup: - rfq_sup["total"] += 1 - if not (min_qual is not None and min_qual > 0 and mean_qual(quals) < min_qual): - if tsup: - rfq_sup["pass"] += 1 - yield oseq - else: - if sup: - writefq(oseq, fh) - break - if last: # reach EOF before reading enough quality - if sample is None or (random() < sample): - yield Seq(name.split(" ", 1)[0], name, seq, None) # yield a fasta record instead - break - + qual_array = fx.get_quality_array() + if qual_array is None or \ + not (sum(qual_array) / len(qual_array)) < min_qual: + if tsup: + rfq_sup["pass"] += 1 + yield Seq( + Id=fx.name, Name=f"{fx.name} {fx.comment}", + Seq=fx.sequence, Qual=fx.quality, Umi=None) + else: + if sup: + fh.write(str(fx)) if sup: fh.flush() fh.close() diff --git a/requirements.txt b/requirements.txt index 8ddc707..b72bf4f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,3 +6,4 @@ tqdm==4.26.0 pandas numpy pytest +pysam \ No newline at end of file