From cfcb32223561affbfa90f210002a6e0261f62914 Mon Sep 17 00:00:00 2001
From: Neil Horner <neil.horner@nanoporetech.com>
Date: Wed, 7 Jun 2023 11:31:15 +0000
Subject: [PATCH] Cw1814 fastqparse

---
 conda/meta.yaml                |  1 +
 pychopper/scripts/pychopper.py | 22 ++++--------
 pychopper/seq_utils.py         | 61 +++++++++-------------------------
 requirements.txt               |  1 +
 4 files changed, 24 insertions(+), 61 deletions(-)

diff --git a/conda/meta.yaml b/conda/meta.yaml
index fccc2d5..51feb5a 100644
--- a/conda/meta.yaml
+++ b/conda/meta.yaml
@@ -29,6 +29,7 @@ requirements:
     - six
     - pandas
     - numpy
+    - pysam
 
 test:
   commands:
diff --git a/pychopper/scripts/pychopper.py b/pychopper/scripts/pychopper.py
index 34d5f97..2e587c8 100755
--- a/pychopper/scripts/pychopper.py
+++ b/pychopper/scripts/pychopper.py
@@ -358,10 +358,6 @@ def main():
     sys.stderr.write("Using kit: {}\n".format(args.k))
     sys.stderr.write("Configurations to consider: \"{}\"\n".format(CONFIG))
 
-    in_fh = sys.stdin
-    if args.input_fastx != '-':
-        in_fh = _opener(args.input_fastx, "r")
-
     out_fh = sys.stdout
     if args.output_fastx != '-':
         out_fh = open(args.output_fastx, "w")
@@ -437,8 +433,7 @@ def backend(x, pool, q=None, mb=None):
         sys.stderr.write(
             "Total fastq records in input file: {}\n".format(nr_records))
         read_sample = list(
-            seu.readfq(_opener(args.input_fastx, "r"), sample=target_prop,
-                       min_qual=args.Q))
+            seu.readfq(args.input_fastx, sample=target_prop, min_qual=args.Q))
         sys.stderr.write(
             "Tuning the cutoff parameter (q) on {} sampled reads ({:.1f}%) passing quality filters (Q >= {}).\n".format(
                 len(read_sample), target_prop * 100.0, args.Q))
@@ -446,15 +441,10 @@ def backend(x, pool, q=None, mb=None):
         for qv in tqdm.tqdm(cutoffs):
             clsLen = 0
             cls = 0
-            with concurrent.futures.ProcessPoolExecutor(
-                    max_workers=args.t) as executor:
+            with concurrent.futures.ProcessPoolExecutor(max_workers=args.t) as executor:
                 for batch in utils.batch(read_sample, int((len(read_sample)))):
-                    for read, (segments, hits, usable_len) in backend(batch,
-                                                                      executor,
-                                                                      qv,
-                                                                      max(1000,
-                                                                          int((
-                                                                              len(read_sample)) / args.t))):
+                    for read, (segments, hits, usable_len) \
+                            in backend(batch, executor, qv, max(1000, int((len(read_sample)) / args.t))):
                         flt = list([x.Len for x in segments if x.Len > 0])
                         if len(flt) == 1:
                             clsLen += sum(flt)
@@ -494,7 +484,7 @@ def backend(x, pool, q=None, mb=None):
     with concurrent.futures.ProcessPoolExecutor(
             max_workers=args.t) as executor:
         for batch in utils.batch(
-                seu.readfq(in_fh, min_qual=args.Q, rfq_sup=rfq_sup), args.B):
+                seu.readfq(args.input_fastx, min_qual=args.Q, rfq_sup=rfq_sup), args.B):
             for read, (segments, hits, usable_len) in backend(batch, executor,
                                                               q=args.q,
                                                               mb=min_batch_size):
@@ -546,7 +536,7 @@ def backend(x, pool, q=None, mb=None):
     if args.S is not None:
         stdf.to_csv(args.S, sep="\t", index=False)
 
-    for fh in (in_fh, out_fh, u_fh, l_fh, w_fh, a_fh, d_fh):
+    for fh in (out_fh, u_fh, l_fh, w_fh, a_fh, d_fh):
         if fh is None:
             continue
         fh.flush()
diff --git a/pychopper/seq_utils.py b/pychopper/seq_utils.py
index 5bfed0b..8760038 100644
--- a/pychopper/seq_utils.py
+++ b/pychopper/seq_utils.py
@@ -2,6 +2,7 @@
 
 from six.moves import reduce
 from numpy.random import random
+from pysam import FastxFile
 from math import log
 import sys
 from pychopper.common_structures import Seq
@@ -49,7 +50,7 @@ def reverse_complement(seq):
     return reduce(lambda x, y: x + y, map(base_complement, seq[::-1]))
 
 
-def readfq(fp, sample=None, min_qual=None, rfq_sup={}):  # this is a generator function
+def readfq(fastq, sample=None, min_qual=0, rfq_sup={}):  # this is a generator function
     """
     Below function taken from https://github.com/lh3/readfq/blob/master/readfq.py
     Much faster parsing of large files compared to Biopyhton.
@@ -58,53 +59,23 @@ def readfq(fp, sample=None, min_qual=None, rfq_sup={}):  # this is a generator f
     tsup = "total" in rfq_sup
     if sup:
         fh = open(rfq_sup["out_fq"], "w")
-    last = None  # this is a buffer keeping the last unprocessed line
-    while True:  # mimic closure; is it a bad idea?
-        if not last:  # the first record or a record following a fastq
-            for l in fp:  # search for the start of the next record
-                if l[0] in '>@':  # fasta/q header line
-                    last = l[:-1]  # save this line
-                    break
-        if not last:
-            break
-        name, seqs, last = last[1:], [], None
-        for l in fp:  # read the sequence
-            if l[0] in '@+>':
-                last = l[:-1]
-                break
-            seqs.append(l[:-1])
-        if not last or last[0] != '+':  # this is a fasta record
+
+    with FastxFile(fastq) as fqin:
+        for fx in fqin:
             if sample is None or (random() < sample):
                 if tsup:
                     rfq_sup["total"] += 1
-                yield Seq(name.split(" ", 1)[0], name, ''.join(seqs), None, None)  # yield a fasta record
-            if not last:
-                break
-        else:  # this is a fastq record
-            seq, leng, seqs = ''.join(seqs), 0, []
-            for l in fp:  # read the quality
-                seqs.append(l[:-1])
-                leng += len(l) - 1
-                if leng >= len(seq):  # have read enough quality
-                    last = None
-                    if sample is None or (random() < sample):
-                        quals = "".join(seqs)
-                        oseq = Seq(Id=name.split(" ", 1)[0], Name=name, Seq=seq, Qual=quals, Umi=None)
-                        if tsup:
-                            rfq_sup["total"] += 1
-                        if not (min_qual is not None and min_qual > 0 and mean_qual(quals) < min_qual):
-                            if tsup:
-                                rfq_sup["pass"] += 1
-                            yield oseq
-                        else:
-                            if sup:
-                                writefq(oseq, fh)
-                    break
-            if last:  # reach EOF before reading enough quality
-                if sample is None or (random() < sample):
-                    yield Seq(name.split(" ", 1)[0], name, seq, None)  # yield a fasta record instead
-                break
-
+                qual_array = fx.get_quality_array()
+                if qual_array is None or \
+                        not (sum(qual_array) / len(qual_array)) < min_qual:
+                    if tsup:
+                        rfq_sup["pass"] += 1
+                    yield Seq(
+                        Id=fx.name, Name=f"{fx.name} {fx.comment}",
+                        Seq=fx.sequence, Qual=fx.quality, Umi=None)
+                else:
+                    if sup:
+                        fh.write(str(fx))
     if sup:
         fh.flush()
         fh.close()
diff --git a/requirements.txt b/requirements.txt
index 8ddc707..b72bf4f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,3 +6,4 @@ tqdm==4.26.0
 pandas
 numpy
 pytest
+pysam
\ No newline at end of file