diff --git a/patches/0001-feat-add-HN-tag-to-bwa-aln.patch b/patches/0001-feat-add-HN-tag-to-bwa-aln.patch index 1575c83..ed1136d 100644 --- a/patches/0001-feat-add-HN-tag-to-bwa-aln.patch +++ b/patches/0001-feat-add-HN-tag-to-bwa-aln.patch @@ -1,7 +1,7 @@ -From 7d36ae830fc7391cfa69ffcdf239755f01317c1c Mon Sep 17 00:00:00 2001 +From 0cae163e3364e463324c98bc2d8f88495033dc31 Mon Sep 17 00:00:00 2001 From: Nils Homer Date: Thu, 16 Jan 2025 22:35:13 -0800 -Subject: [PATCH 1/2] feat: add HN tag to bwa aln +Subject: [PATCH 1/3] feat: add HN tag to bwa aln --- bwase.c | 17 +++++++++-------- diff --git a/patches/0002-feat-add-an-option-to-output-MD-in-the-XA-tag.patch b/patches/0002-feat-add-an-option-to-output-MD-in-the-XA-tag.patch index bb055f3..8354f8a 100644 --- a/patches/0002-feat-add-an-option-to-output-MD-in-the-XA-tag.patch +++ b/patches/0002-feat-add-an-option-to-output-MD-in-the-XA-tag.patch @@ -1,7 +1,7 @@ -From 028b2a7f2129a6eb8038079c539a54526bb08422 Mon Sep 17 00:00:00 2001 +From 9a9e459d581fc9cae5c4a73fb65ad191425630ec Mon Sep 17 00:00:00 2001 From: Nils Homer Date: Fri, 17 Jan 2025 00:29:08 -0800 -Subject: [PATCH 2/2] feat: add an option to output MD in the XA tag +Subject: [PATCH 2/3] feat: add an option to output MD in the XA tag --- bwape.c | 12 +++++++----- diff --git a/patches/0003-feat-expose-various-internal-bwamem-methods-and-stru.patch b/patches/0003-feat-expose-various-internal-bwamem-methods-and-stru.patch new file mode 100644 index 0000000..417c0ff --- /dev/null +++ b/patches/0003-feat-expose-various-internal-bwamem-methods-and-stru.patch @@ -0,0 +1,134 @@ +From d1aec9f3dc7e8a28f1154eb14d94f857215a299e Mon Sep 17 00:00:00 2001 +From: Nils Homer +Date: Fri, 17 Jan 2025 21:33:12 -0700 +Subject: [PATCH 3/3] feat: expose various internal bwamem methods and structs + +--- + bwamem.c | 23 +++-------------------- + bwamem.h | 27 +++++++++++++++++++++++++++ + bwamem_extra.c | 2 +- + 3 files changed, 31 insertions(+), 21 deletions(-) + +diff --git a/bwamem.c b/bwamem.c +index 03e2a05..5ca5a8d 100644 +--- a/bwamem.c ++++ b/bwamem.c +@@ -116,11 +116,7 @@ mem_opt_t *mem_opt_init() + #define intv_lt(a, b) ((a).info < (b).info) + KSORT_INIT(mem_intv, bwtintv_t, intv_lt) + +-typedef struct { +- bwtintv_v mem, mem1, *tmpv[2]; +-} smem_aux_t; +- +-static smem_aux_t *smem_aux_init() ++smem_aux_t *smem_aux_init() + { + smem_aux_t *a; + a = calloc(1, sizeof(smem_aux_t)); +@@ -129,7 +125,7 @@ static smem_aux_t *smem_aux_init() + return a; + } + +-static void smem_aux_destroy(smem_aux_t *a) ++void smem_aux_destroy(smem_aux_t *a) + { + free(a->tmpv[0]->a); free(a->tmpv[0]); + free(a->tmpv[1]->a); free(a->tmpv[1]); +@@ -514,7 +510,6 @@ int mem_sort_dedup_patch(const mem_opt_t *opt, const bntseq_t *bns, const uint8_ + return m; + } + +-typedef kvec_t(int) int_v; + + static void mem_mark_primary_se_core(const mem_opt_t *opt, int n, mem_alnreg_t *a, int_v *z) + { // similar to the loop in mem_chain_flt() +@@ -1188,19 +1183,7 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t * + return a; + } + +-typedef struct { +- const mem_opt_t *opt; +- const bwt_t *bwt; +- const bntseq_t *bns; +- const uint8_t *pac; +- const mem_pestat_t *pes; +- smem_aux_t **aux; +- bseq1_t *seqs; +- mem_alnreg_v *regs; +- int64_t n_processed; +-} worker_t; +- +-static void worker1(void *data, int i, int tid) ++void worker1(void *data, int i, int tid) + { + worker_t *w = (worker_t*)data; + if (!(w->opt->flag&MEM_F_PE)) { +diff --git a/bwamem.h b/bwamem.h +index 0a0e3bb..52a9c53 100644 +--- a/bwamem.h ++++ b/bwamem.h +@@ -30,6 +30,7 @@ + #include "bwt.h" + #include "bntseq.h" + #include "bwa.h" ++#include "kvec.h" + + #define MEM_MAPQ_COEF 30.0 + #define MEM_MAPQ_MAX 60 +@@ -123,6 +124,24 @@ typedef struct { // This struct is only used for the convenience of API. + int score, sub, alt_sc; + } mem_aln_t; + ++typedef struct { ++ bwtintv_v mem, mem1, *tmpv[2]; ++} smem_aux_t; ++ ++typedef struct { ++ const mem_opt_t *opt; ++ const bwt_t *bwt; ++ const bntseq_t *bns; ++ const uint8_t *pac; ++ const mem_pestat_t *pes; ++ smem_aux_t **aux; ++ bseq1_t *seqs; ++ mem_alnreg_v *regs; ++ int64_t n_processed; ++} worker_t; ++ ++typedef kvec_t(int) int_v; ++ + #ifdef __cplusplus + extern "C" { + #endif +@@ -136,6 +155,14 @@ extern "C" { + mem_opt_t *mem_opt_init(void); + void mem_fill_scmat(int a, int b, int8_t mat[25]); + ++ int mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id); ++ void mem_reorder_primary5(int T, mem_alnreg_v *a); ++ ++ smem_aux_t *smem_aux_init(); ++ void smem_aux_destroy(smem_aux_t *a); ++ ++ void worker1(void *data, int i, int tid); ++ + /** + * Align a batch of sequences and generate the alignments in the SAM format + * +diff --git a/bwamem_extra.c b/bwamem_extra.c +index c47b93f..f50cec7 100644 +--- a/bwamem_extra.c ++++ b/bwamem_extra.c +@@ -102,7 +102,7 @@ const bwtintv_v *smem_next(smem_i *itr) + mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, const char *seq_) + { // the difference from mem_align1_core() is that this routine: 1) calls mem_mark_primary_se(); 2) does not modify the input sequence + extern mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq, void *buf); +- extern void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id); ++ extern int mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id); + mem_alnreg_v ar; + char *seq; + seq = malloc(l_seq); +-- +2.39.5 (Apple Git-154) + diff --git a/pybwa/libbwaaln_utils.c b/pybwa/libbwaaln_utils.c index 79a5535..1e54572 100644 --- a/pybwa/libbwaaln_utils.c +++ b/pybwa/libbwaaln_utils.c @@ -5,6 +5,9 @@ #include "bwase.h" #include "libbwaaln_utils.h" +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif void bwa_cal_pac_pos_with_bwt(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, int max_mm, float fnr, bwt_t *bwt) { diff --git a/pybwa/libbwamem.pxd b/pybwa/libbwamem.pxd index cd7e72d..b50c291 100644 --- a/pybwa/libbwamem.pxd +++ b/pybwa/libbwamem.pxd @@ -3,6 +3,19 @@ from libc.stdint cimport uint8_t, int64_t, int32_t, uint64_t, int8_t, uint32_t from libc.stdio cimport FILE +cdef extern from "bwa.h": + ctypedef struct bseq1_t: + int l_seq, id + char *name, *comment, *seq, *qual, *sam + +cdef extern from "libbwamem_utils.h": + ctypedef struct mem_alns_t: + size_t n, m + mem_aln_t*a + mem_alns_t * mem_process_seqs_alt(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, + const uint8_t *pac, int64_t n_processed, int n, bseq1_t *seqs, + const mem_pestat_t *pes0) + cdef extern from "limits.h": cdef int INT_MAX @@ -10,7 +23,6 @@ cdef extern from "bwt.h": ctypedef struct bwt_t: int sa_intv - cdef extern from "bntseq.h": ctypedef struct bntann1_t: int64_t offset @@ -29,7 +41,6 @@ cdef extern from "kstring.h": size_t l, m char *s - cdef extern from "bwamem.h": int MEM_F_PE int MEM_F_NOPAIRING @@ -112,5 +123,3 @@ cdef extern void add_cigar(const mem_opt_t *opt, mem_aln_t *p, kstring_t *str, i # from bwamem_extra.c cdef extern char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, mem_alnreg_v *a, int l_query, const char *query); - - diff --git a/pybwa/libbwamem.pyi b/pybwa/libbwamem.pyi index ede77d0..08b832f 100644 --- a/pybwa/libbwamem.pyi +++ b/pybwa/libbwamem.pyi @@ -50,6 +50,7 @@ class BwaMemOptions: gap_open_penalty: int | tuple[int, int] | None = None, gap_extension_penalty: int | tuple[int, int] | None = None, clipping_penalty: int | tuple[int, int] | None = None, + threads: int | None = None, ) -> None: ... _finalized: bool _ignore_alt: bool @@ -180,6 +181,10 @@ class BwaMemOptions: def clipping_penalty(self) -> int | tuple[int, int]: ... @clipping_penalty.setter def clipping_penalty(self, value: int | tuple[int, int]) -> None: ... + @property + def threads(self) -> int: ... + @threads.setter + def threads(self, value: int) -> None: ... def finalize(self, copy: bool = False) -> BwaMemOptions: ... class BwaMem: diff --git a/pybwa/libbwamem.pyx b/pybwa/libbwamem.pyx index 23e65fa..30e72fe 100644 --- a/pybwa/libbwamem.pyx +++ b/pybwa/libbwamem.pyx @@ -1,12 +1,15 @@ # cython: language_level=3 from pathlib import Path from typing import List + from fgpyo.sequence import reverse_complement -from libc.string cimport memset, memcpy +from libc.string cimport memcpy from libc.stdlib cimport calloc, free import enum from pybwa.libbwaindex cimport BwaIndex from pysam import FastxRecord, AlignedSegment, qualitystring_to_array +from libc.string cimport strncpy +from pybwa.libbwaindex cimport force_bytes __all__ = [ @@ -110,7 +113,8 @@ cdef class BwaMemOptions: xa_drop_ratio: float | None = None, gap_open_penalty: int | tuple[int, int] | None = None, gap_extension_penalty: int | tuple[int, int] | None = None, - clipping_penalty: int | tuple[int, int] | None = None) -> None: + clipping_penalty: int | tuple[int, int] | None = None, + threads: int | None = None) -> None: self._finalized = False self._ignore_alt = False self._mode = None @@ -176,16 +180,16 @@ cdef class BwaMemOptions: self.gap_extension_penalty = gap_extension_penalty if clipping_penalty is not None: self.clipping_penalty = clipping_penalty + if threads is not None: + self.threads = threads cdef _copy_options(self, dst: BwaMemOptions, src: BwaMemOptions): memcpy(dst._options, src._options, sizeof(mem_opt_t)) memcpy(dst._options0, src._options0, sizeof(mem_opt_t)) - def __cinit__(self, opt: BwaMemOptions | None = None): + def __cinit__(self): self._options = mem_opt_init() self._options0 = mem_opt_init() - if opt is not None: - self._copy_options(self, opt) def __dealloc__(self): free(self._options) @@ -659,6 +663,15 @@ cdef class BwaMemOptions: self._options.pen_clip5 = five_prime self._options.pen_clip3 = three_prime + @property + def threads(self) -> int: + """:code:`bwa mem -t `""" + return self._options.n_threads + + @threads.setter + def threads(self, value: int) -> None: + self._options.n_threads = value + cdef class BwaMem: """The class to align reads with :code:`bwa mem`.""" @@ -712,14 +725,26 @@ cdef class BwaMem: def __to_str(_bytes: bytes) -> str: return _bytes.decode('utf-8') - cdef _copy_seq(self, q: FastxRecord, kstring_t *seq): - seq_len = len(q.sequence) - seq.s = calloc(sizeof(char), seq_len + 1) - seq.l = seq_len - seq.m = seq_len + 1 + cdef _copy_seq(self, q: FastxRecord, bseq1_t *s): + + # name + s.name = calloc(sizeof(char), len(q.name) + 1) + strncpy(s.name, force_bytes(q.name), len(q.name)) + s.name[len(q.name)] = b'\0' + + # comment + # NB: bwa mem supports appending the comment to the SAM tags verbatim! We do not. + s.comment = NULL + + # sequence + s.l_seq = len(q.sequence) + s.seq = calloc(sizeof(char), s.l_seq + 1) for i, base in enumerate(q.sequence): - seq.s[i] = nst_nt4_table[ord(base)] - seq.s[seq_len] = b'\0' + s.seq[i] = nst_nt4_table[ord(base)] + s.seq[s.l_seq] = b'\0' + + # qualities (always ignore) + s.qual = NULL def _unmapped(self, query: FastxRecord) -> AlignedSegment: # make a default, unmapped, empty record @@ -763,8 +788,8 @@ cdef class BwaMem: cdef _calign(self, opt: BwaMemOptions, queries: List[FastxRecord]): # TODO: ignore_alt # TODO: refactor to make this more readable - cdef kstring_t* seqs - cdef kstring_t* seq + cdef bseq1_t* seqs + cdef bseq1_t* seq cdef char* s_char cdef kstring_t* kstr cdef mem_alnreg_v mem_alnregs @@ -775,47 +800,33 @@ cdef class BwaMem: cdef mem_aln_t mem_aln cdef char *md cdef mem_opt_t *mem_opt + cdef mem_alns_t *mem_alns_vec + cdef mem_alns_t *mem_alns recs_to_return: List[List[AlignedSegment]] = [] - # copy FastqProxy into bwa_seq_t + # copy FastxRecord into bwa_seq_t num_seqs = len(queries) mem_opt = opt.mem_opt() + seqs = calloc(sizeof(bseq1_t), num_seqs) + for i in range(num_seqs): + self._copy_seq(queries[i], &seqs[i]) + + # process the sequences (ignores the paired end stats) + mem_alns_vec = mem_process_seqs_alt(mem_opt, self._index.bwt(), self._index.bns(), self._index.pac(), + 0, num_seqs, seqs, NULL) - seqs = calloc(sizeof(kstring_t), num_seqs) for i in range(num_seqs): seq = &seqs[i] query = queries[i] - self._copy_seq(query, seq) - mem_alnregs = mem_align1(mem_opt, self._index.bwt(), self._index.bns(), self._index.pac(), seq.l, seq.s) - if opt.query_coord_as_primary: - mem_reorder_primary5(opt.minimum_score, &mem_alnregs) - # mimic mem_reg2sam from bwamem.c - XA = NULL - keep_all = opt.output_all_for_fragments - if not keep_all: - XA = mem_gen_alt(mem_opt, self._index.bns(), self._index.pac(), &mem_alnregs, seq.l, seq.s) + mem_alns = &mem_alns_vec[i] mapped_recs = [] - for j in range(mem_alnregs.n): - mem_alnreg = &mem_alnregs.a[j] - if mem_alnreg.score < opt.minimum_score: - continue - if mem_alnreg.secondary >= 0 and (mem_alnreg.is_alt > 0 or not keep_all): - continue - if 0 <= mem_alnreg.secondary < INT_MAX and mem_alnreg.score < mem_alnregs.a[mem_alnreg.secondary].score * opt.xa_drop_ratio: - continue - mem_aln = mem_reg2aln(mem_opt, self._index.bns(), self._index.pac(), seq.l, seq.s, mem_alnreg) - mem_aln.XA = XA[j] if XA != NULL else NULL - if mem_alnreg.secondary >= 0: - mem_aln.sub = -1 # don't output sub-optimal score - if len(mapped_recs) > 0 and mem_alnreg.secondary < 0: # if supplementary - mem_aln.flag |= 0x10000 if opt.short_split_as_secondary else 0x800 - if not opt.keep_mapq_for_supplementary and len(mapped_recs) > 0 and mem_alnreg.is_alt == 0 and mem_aln.mapq > mapped_recs[0].mapping_quality: - mem_aln.mapq = mapped_recs[0].mapping_quality # lower - + for j in range(mem_alns.n): rec = self._unmapped(query=query) + mem_aln = mem_alns.a[j] + # set the flags mem_aln.flag |= 0x4 if mem_aln.rid < 0 else 0 mem_aln.flag |= 0x10 if mem_aln.is_rev > 0 else 0 @@ -888,21 +899,24 @@ cdef class BwaMem: mapped_recs.append(rec) + for j in range(mem_alns.n): + mem_aln = mem_alns.a[j] free(mem_aln.cigar) + free(mem_aln.XA) if len(mapped_recs) == 0: recs_to_return.append([self._unmapped(query=query)]) else: self._add_sa_tag(mapped_recs) recs_to_return.append(mapped_recs) - - if XA != NULL: - for j in range(len(mapped_recs)): - free(XA[j]) - free(XA) free(mem_alnregs.a) for i in range(num_seqs): - free(seqs[i].s) + free(seqs[i].name) + free(seqs[i].comment) + free(seqs[i].seq) + free(seqs[i].qual) + free(mem_alns_vec[i].a) + free(mem_alns_vec) free(seqs) return recs_to_return diff --git a/pybwa/libbwamem_utils.c b/pybwa/libbwamem_utils.c new file mode 100644 index 0000000..b844394 --- /dev/null +++ b/pybwa/libbwamem_utils.c @@ -0,0 +1,120 @@ +#include "bntseq.h" +#include "bwt.h" +#include "kstring.h" +#include "kvec.h" +#include "bwamem.h" +#include "libbwamem_utils.h" +#include "utils.h" + +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + +// Port of worker_t that adds the alns field. +typedef struct { + const mem_opt_t *opt; + const bwt_t *bwt; + const bntseq_t *bns; + const uint8_t *pac; + const mem_pestat_t *pes; + smem_aux_t **aux; + bseq1_t *seqs; + mem_alnreg_v *regs; + int64_t n_processed; + mem_alns_t* alns; +} worker_alt_t; + +// Port of mem_reg2sam from "bwamem.c", where we do not print the SAM output, but instead return the vector of alignments +mem_alns_t mem_reg2sam_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m) +{ + extern char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, mem_alnreg_v *a, int l_query, const char *query); + kstring_t str; + mem_alns_t aa; + int k, l; + char **XA = 0; + + if (!(opt->flag & MEM_F_ALL)) + XA = mem_gen_alt(opt, bns, pac, a, s->l_seq, s->seq); + kv_init(aa); + str.l = str.m = 0; str.s = 0; + for (k = l = 0; k < a->n; ++k) { + mem_alnreg_t *p = &a->a[k]; + mem_aln_t *q; + if (p->score < opt->T) continue; + if (p->secondary >= 0 && (p->is_alt || !(opt->flag&MEM_F_ALL))) continue; + if (p->secondary >= 0 && p->secondary < INT_MAX && p->score < a->a[p->secondary].score * opt->drop_ratio) continue; + q = kv_pushp(mem_aln_t, aa); + *q = mem_reg2aln(opt, bns, pac, s->l_seq, s->seq, p); + assert(q->rid >= 0); // this should not happen with the new code + q->XA = XA? XA[k] : 0; + q->flag |= extra_flag; // flag secondary + if (p->secondary >= 0) q->sub = -1; // don't output sub-optimal score + if (l && p->secondary < 0) // if supplementary + q->flag |= (opt->flag&MEM_F_NO_MULTI)? 0x10000 : 0x800; + if (!(opt->flag & MEM_F_KEEP_SUPP_MAPQ) && l && !p->is_alt && q->mapq > aa.a[0].mapq) + q->mapq = aa.a[0].mapq; // lower mapq for supplementary mappings, unless -5 or -q is applied + ++l; + } + if (aa.n == 0) { // no alignments good enough; then create an unaligned record + mem_aln_t *q; + q = kv_pushp(mem_aln_t, aa); + *q = mem_reg2aln(opt, bns, pac, s->l_seq, s->seq, 0); + q->flag |= extra_flag; + } + return aa; +} + +// Port of worker2 from "bwamem.c" so that we can return (and not print) the alignment +static void worker2_alt(void *data, int i, int tid) +{ + extern int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2]); + extern void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a); + worker_alt_t *w = (worker_alt_t*)data; + // NB: paired end not supported + if (bwa_verbose >= 4) printf("=====> Finalizing read '%s' <=====\n", w->seqs[i].name); + mem_mark_primary_se(w->opt, w->regs[i].n, w->regs[i].a, w->n_processed + i); + if (w->opt->flag & MEM_F_PRIMARY5) mem_reorder_primary5(w->opt->T, &w->regs[i]); + w->alns[i] = mem_reg2sam_alt(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0); + free(w->regs[i].a); +} + +// Port of "mem_process_seqs" from "bwamem.c" so that we can align multi-threaded and return the vector of alignments +// (one vector of alignments per input sequence +mem_alns_t* mem_process_seqs_alt(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, + int64_t n_processed, int n, bseq1_t *seqs, const mem_pestat_t *pes0) +{ + extern void kt_for(int n_threads, void (*func)(void*,int,int), void *data, int n); + worker_t w; + worker_alt_t w_alt; + mem_pestat_t pes[4]; + double ctime, rtime; + int i; + + ctime = cputime(); rtime = realtime(); + w.regs = malloc(n * sizeof(mem_alnreg_v)); + w.opt = opt; w.bwt = bwt; w.bns = bns; w.pac = pac; + w.seqs = seqs; w.n_processed = n_processed; + w.pes = &pes[0]; + w.aux = malloc(opt->n_threads * sizeof(smem_aux_t)); + for (i = 0; i < opt->n_threads; ++i) + w.aux[i] = smem_aux_init(); + kt_for(opt->n_threads, worker1, &w, (opt->flag&MEM_F_PE)? n>>1 : n); // find mapping positions + for (i = 0; i < opt->n_threads; ++i) + smem_aux_destroy(w.aux[i]); + free(w.aux); + // Paired-end not supported +// if (opt->flag&MEM_F_PE) { // infer insert sizes if not provided +// if (pes0) memcpy(pes, pes0, 4 * sizeof(mem_pestat_t)); // if pes0 != NULL, set the insert-size distribution as pes0 +// else mem_pestat(opt, bns->l_pac, n, w.regs, pes); // otherwise, infer the insert size distribution from data +// } + w_alt.alns = malloc(n * sizeof(mem_alns_t)); + // copy worker_t to worker_alt_t + w_alt.opt = w.opt; w_alt.bwt = bwt; w_alt.bns = w.bns; w_alt.pac = w.pac; + w_alt.pes = w.pes; w_alt.aux = w.aux; w_alt.seqs = w.seqs; w_alt.regs = w.regs; w_alt.n_processed = w.n_processed; + kt_for(opt->n_threads, worker2_alt, &w_alt, (opt->flag&MEM_F_PE)? n>>1 : n); // generate alignment + free(w.regs); + if (bwa_verbose >= 3) + fprintf(stderr, "[M::%s] Processed %d reads in %.3f CPU sec, %.3f real sec\n", __func__, n, cputime() - ctime, realtime() - rtime); + + return w_alt.alns; +} \ No newline at end of file diff --git a/pybwa/libbwamem_utils.h b/pybwa/libbwamem_utils.h new file mode 100644 index 0000000..30e095d --- /dev/null +++ b/pybwa/libbwamem_utils.h @@ -0,0 +1,9 @@ +#include "bntseq.h" +#include "bwt.h" +#include "bwamem.h" +#include "kstring.h" +#include "kvec.h" + +typedef kvec_t(mem_aln_t) mem_alns_t; + +mem_alns_t* mem_process_seqs_alt(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int64_t n_processed, int n, bseq1_t *seqs, const mem_pestat_t *pes0); \ No newline at end of file diff --git a/tests/test_bwapy.py b/tests/test_bwapy.py index 8e11971..ae0b3da 100644 --- a/tests/test_bwapy.py +++ b/tests/test_bwapy.py @@ -160,3 +160,29 @@ def test_bwamem(ref_fasta: Path, fastx_record: FastxRecord) -> None: assert rec.is_reverse assert rec.cigarstring == "80M" # TODO: test multi-mapping, reverse strand, etc + + +def test_bwamem_threading(ref_fasta: Path, fastx_record: FastxRecord) -> None: + opt = BwaMemOptions(threads=2) + bwa = BwaMem(prefix=ref_fasta) + + revcomp_seq = None if not fastx_record.sequence else reverse_complement(fastx_record.sequence) + revcomp_record = FastxRecord(name="revcomp", sequence=revcomp_seq) + + queries = [fastx_record if i % 2 == 0 else revcomp_record for i in range(100)] + list_of_recs = bwa.align(opt=opt, queries=queries) + assert len(list_of_recs) == len(queries) + for i, recs in enumerate(list_of_recs): + assert len(recs) == 1 + rec = recs[0] + if i % 2 == 0: + assert rec.query_name == "test" + assert rec.is_forward + else: + assert rec.query_name == "revcomp" + assert rec.is_reverse + assert not rec.is_paired + assert not rec.is_read1 + assert not rec.is_read2 + assert rec.reference_start == 80 + assert rec.cigarstring == "80M"