From d53df429d6bed3df618bab0b9ad1f371732dda2b Mon Sep 17 00:00:00 2001 From: Nils Homer Date: Wed, 18 Dec 2024 02:07:57 -0700 Subject: [PATCH] wip --- bwapy/libbwaaln.pyx | 1 - bwapy/libbwamem.pxd | 39 ++++++- bwapy/libbwamem.pyi | 16 ++- bwapy/libbwamem.pyx | 248 ++++++++++++++++++++++++++++++++------------ 4 files changed, 233 insertions(+), 71 deletions(-) diff --git a/bwapy/libbwaaln.pyx b/bwapy/libbwaaln.pyx index dc36b95..623ab31 100644 --- a/bwapy/libbwaaln.pyx +++ b/bwapy/libbwaaln.pyx @@ -124,7 +124,6 @@ cdef class BwaAln: """The class to align reads with `bwa aln`.""" cdef BwaIndex _index - cdef unsigned char* _pacseq def __init__(self, prefix: str | Path | None = None, index: BwaIndex | None = None): if prefix is not None: diff --git a/bwapy/libbwamem.pxd b/bwapy/libbwamem.pxd index 1cee476..6661d9a 100644 --- a/bwapy/libbwamem.pxd +++ b/bwapy/libbwamem.pxd @@ -1,8 +1,11 @@ # cython: language_level=3 -from libc.stdint cimport uint8_t, int64_t, int32_t, uint64_t, int8_t +from libc.stdint cimport uint8_t, int64_t, int32_t, uint64_t, int8_t, uint32_t from libc.stdio cimport FILE +cdef extern from "limits.h": + cdef int INT_MAX + cdef extern from "bwt.h": ctypedef struct bwt_t: int sa_intv @@ -19,6 +22,13 @@ cdef extern from "bntseq.h": bntann1_t *anns FILE * fp_pac + unsigned char nst_nt4_table[256] + +cdef extern from "kstring.h": + ctypedef struct kstring_t: + size_t l, m + char *s + cdef extern from "bwamem.h": int MEM_F_PE @@ -70,9 +80,32 @@ cdef extern from "bwamem.h": int max_matesw # perform maximally max_matesw rounds of mate-SW for each end int max_XA_hits, max_XA_hits_alt # if there are max_hits or fewer, output them all int8_t mat[25] # scoring matrix mat[0] == 0 if unset + ctypedef struct mem_alnreg_t: + int score # best local SW score + int secondary # index of the parent hit shadowing the current hit; <0 if primary + int n_comp # number of sub-alignments chained together + int is_alt ctypedef struct mem_alnreg_v: - pass + size_t n, m + mem_alnreg_t *a + + ctypedef struct mem_aln_t: + int flag # extra flag + uint32_t is_rev # is_rev: whether on the reverse strand; + uint32_t is_alt + uint32_t mapq # mapq: mapping quality; + uint32_t NM # NM: edit distance + char *XA + int score, sub, alt_sc; + mem_opt_t *mem_opt_init() mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, const char *seq) - void bwa_fill_scmat(int a, int b, int8_t mat[25]) \ No newline at end of file + void bwa_fill_scmat(int a, int b, int8_t mat[25]) + mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_seq, const char *seq, const mem_alnreg_t *ar) + +# from bwamem.c +cdef extern void mem_reorder_primary5(int T, mem_alnreg_v *a) + +# from bwamem_extra.c +cdef extern char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, mem_alnreg_v *a, int l_query, const char *query); diff --git a/bwapy/libbwamem.pyi b/bwapy/libbwamem.pyi index a59d156..b54c7b0 100644 --- a/bwapy/libbwamem.pyi +++ b/bwapy/libbwamem.pyi @@ -1,12 +1,24 @@ +from pathlib import Path +from typing import List +from pysam import AlignedSegment +from pysam import FastxRecord +from bwapy.libbwaindex import BwaIndex class BwaMemOptions: _ignore_alt: bool def __init__(self) -> None: ... - class BwMemOptionsBuilder: _options: BwaMemOptions + _options0: BwaMemOptions def __init__(self, options: BwaMemOptions | None = None) -> None: ... - def build(self) -> BwaMemOptions: ... \ No newline at end of file + def build(self) -> BwaMemOptions: ... + +class BwaMem: + _index: BwaIndex + def __init__(self, prefix: str | Path | None = None, index: BwaIndex | None = None) -> None: ... + def align( + self, opt: BwaMemOptions, queries: List[FastxRecord] + ) -> List[List[AlignedSegment]]: ... diff --git a/bwapy/libbwamem.pyx b/bwapy/libbwamem.pyx index cdc3c5e..9de50ee 100644 --- a/bwapy/libbwamem.pyx +++ b/bwapy/libbwamem.pyx @@ -1,8 +1,13 @@ # cython: language_level=3 from ctypes import memset +from pathlib import Path +from typing import List -from libc.stdlib cimport calloc, free +from libc.stdlib cimport calloc, free, malloc import enum +from bwapy.libbwaindex cimport force_bytes +from bwapy.libbwaindex cimport BwaIndex +from pysam import FastxRecord, AlignedSegment cdef class BwaMemOptions: """The container for options for [`BwaMem`][bwapy.BwaMem]. @@ -35,71 +40,74 @@ class BwaMemMode(enum.Enum): cdef class BwaMemOptionsBuilder: """Builder for options for [`BwaAln`][bwapy.BwaAln]""" _options: BwaMemOptions - _options0: BwaMemOptions # attributes are set to `1` when options are set in _options + cdef mem_opt_t _options0 # attributes are set to `1` when options are set in _options _mode: BwaMemMode | None def __init__(self, options: BwaMemOptions | None = None) -> None: self._options: BwaMemOptions = BwaMemOptions() if options is None else options self._options0: BwaMemOptions = BwaMemOptions() self._mode = None - memset(self._options0._delegate, 0, sizeof(mem_opt_t)) + self.__cinit__() + + cdef _cinit(self): + memset(self._options0, 0, sizeof(mem_opt_t)) def build(self) -> BwaMemOptions: if self._mode is None: # matching score is changed so scale the rest of the penalties/scores - if self._options0._delegate.a == 1: - if self._options0._delegate.b != 1: + if self._options0.a == 1: + if self._options0.b != 1: self._options._delegate.b *= self._options._delegate.a - if self._options0._delegate.T != 1: + if self._options0.T != 1: self._options._delegate.T *= self._options._delegate.a - if self._options0._delegate.o_del != 1: + if self._options0.o_del != 1: self._options._delegate.o_del *= self._options._delegate.a - if self._options0._delegate.e_del != 1: + if self._options0.e_del != 1: self._options._delegate.e_del *= self._options._delegate.a - if self._options0._delegate.o_ins != 1: + if self._options0.o_ins != 1: self._options._delegate.o_ins *= self._options._delegate.a - if self._options0._delegate.e_ins != 1: + if self._options0.e_ins != 1: self._options._delegate.e_ins *= self._options._delegate.a - if self._options0._delegate.zdrop != 1: + if self._options0.zdrop != 1: self._options._delegate.zdrop *= self._options._delegate.a - if self._options0._delegate.pen_clip5 != 1: + if self._options0.pen_clip5 != 1: self._options._delegate.pen_clip5 *= self._options._delegate.a - if self._options0._delegate.pen_clip3 != 1: + if self._options0.pen_clip3 != 1: self._options._delegate.pen_clip3 *= self._options._delegate.a - if self._options0._delegate.pen_unpaired != 1: + if self._options0.pen_unpaired != 1: self._options._delegate.pen_unpaired *= self._options._delegate.a elif self._mode == BwaMemMode.INTRACTG: - if self._options0._delegate.o_del != 1: + if self._options0.o_del != 1: self._options._delegate.o_del = 16 - if self._options0._delegate.o_ins != 1: + if self._options0.o_ins != 1: self._options._delegate.o_ins = 16 - if self._options0._delegate.b != 1: + if self._options0.b != 1: self._options._delegate.b = 9 - if self._options0._delegate.pen_clip5 != 1: + if self._options0.pen_clip5 != 1: self._options._delegate.pen_clip5 = 5 - if self._options0._delegate.pen_clip3 != 1: + if self._options0.pen_clip3 != 1: self._options._delegate.pen_clip3 = 5 else: - if self._options0._delegate.o_del != 1: + if self._options0.o_del != 1: self._options._delegate.o_del = 1 - if self._options0._delegate.o_ins != 1: + if self._options0.o_ins != 1: self._options._delegate.o_ins = 1 - if self._options0._delegate.e_del != 1: + if self._options0.e_del != 1: self._options._delegate.e_del = 1 - if self._options0._delegate.e_ins != 1: + if self._options0.e_ins != 1: self._options._delegate.e_ins = 1 - if self._options0._delegate.b != 1: + if self._options0.b != 1: self._options._delegate.b = 1 - if self._options0._delegate.split_factor == 0.0: + if self._options0.split_factor == 0.0: self._options._delegate.split_factor = 10.0 - if self._options0._delegate.pen_clip5 != 1: + if self._options0.pen_clip5 != 1: self._options._delegate.pen_clip5 = 0 - if self._options0._delegate.pen_clip3 != 1: + if self._options0.pen_clip3 != 1: self._options._delegate.pen_clip3 = 0 # ONT2D vs PACBIO options - if self._options0._delegate.min_chain_weight != 1: + if self._options0.min_chain_weight != 1: self._options._delegate.min_chain_weight = 20 if self._mode == BwaMemMode.ONT2D else 40 - if self._options0._delegate.min_seed_len != 1: + if self._options0.min_seed_len != 1: self._options._delegate.min_seed_len = 14 if self._mode == BwaMemMode.ONT2D else 17 bwa_fill_scmat( @@ -111,7 +119,7 @@ cdef class BwaMemOptionsBuilder: cdef min_seed_len(self, value: int): """bwa mem -k """ self._options._delegate.min_seed_len = value - self._options0._delegate.min_seed_len = 1 + self._options0.min_seed_len = 1 return self # FIXME: convert to an enum enum @@ -123,48 +131,36 @@ cdef class BwaMemOptionsBuilder: cdef band_width(self, value: int): """bwa mem -w """ self._options._delegate.w = value - self._options0._delegate.w = 1 - return self - - cdef band_width(self, value: int): - """bwa mem -w """ - self._options._delegate.w = value - self._options0._delegate.w = 1 + self._options0.w = 1 return self cdef match_score(self, value: int): """bwa mem -A """ self._options._delegate.a = value - self._options0._delegate.a = 1 + self._options0.a = 1 return self cdef mismatch_penalty(self, value: int): """bwa mem -A """ self._options._delegate.b = value - self._options0._delegate.b = 1 + self._options0.b = 1 return self cdef minimum_score(self, value: int): """bwa mem -T """ self._options._delegate.T = value - self._options0._delegate.T = 1 + self._options0.T = 1 return self cdef unpaired_penalty(self, value: int): """bwa mem -U """ self._options._delegate.pen_unpaired = value - self._options0._delegate.pen_unpaired = 1 - return self - - cdef unpaired_penalty(self, value: int): - """bwa mem -U """ - self._options._delegate.pen_unpaired = value - self._options0._delegate.pen_unpaired = 1 + self._options0.pen_unpaired = 1 return self cdef num_threads(self, value: int): """bwa mem -t """ - self._options._delegate.num_threads = value if value > 1 else 1 + self._options._delegate.n_threads = value if value > 1 else 1 return self def _set_flag(self, value: bool, flag: int): @@ -213,54 +209,54 @@ cdef class BwaMemOptionsBuilder: cdef max_occurrences(self, value: int): """bwa mem -c """ self._options._delegate.max_occ = value - self._options0._delegate.max_occ = 1 + self._options0.max_occ = 1 return self cdef off_diagonal_x_dropoff(self, value: int): """bwa mem -d """ self._options._delegate.XA_drop_ratio = value - self._options0._delegate.XA_drop_ratio = 1 + self._options0.XA_drop_ratio = 1 return self cdef ignore_alternate_contigs(self, value: bool): """bwa mem -j""" - self._options._delegate._ignore_alt = value + self._options._ignore_alt = value return self cdef internal_seed_split_factor(self, value: float): """bwa mem -r """ self._options._delegate.split_factor = value - self._options0._delegate.split_factor = 1 + self._options0.split_factor = 1 return self cdef drop_chain_fraction(self, value: float): """bwa mem -D """ self._options._delegate.drop_ratio = value - self._options0._delegate.drop_ratio = 1 + self._options0.drop_ratio = 1 return self cdef max_mate_rescue_rounds(self, value: int): """bwa mem -m """ self._options._delegate.max_matesw = value - self._options0._delegate.max_matesw = 1 + self._options0.max_matesw = 1 return self cdef min_seeded_bases_in_chain(self, value: int): """bwa mem -W """ self._options._delegate.min_chain_weight = value - self._options0._delegate.min_chain_weight = 1 + self._options0.min_chain_weight = 1 return self cdef seed_occurrence_in_3rd_round(self, value: int): """bwa mem -y """ self._options._delegate.max_mem_intv = value - self._options0._delegate.max_mem_intv = 1 + self._options0.max_mem_intv = 1 return self cdef xa_max_hits(self, value: int, alt_value: int | None): """bwa mem -h >""" - self._options0._delegate.max_XA_hits = 1 - self._options0._delegate.max_XA_hits_alt = 1 + self._options0.max_XA_hits = 1 + self._options0.max_XA_hits_alt = 1 self._options._delegate.max_XA_hits = value self._options._delegate.max_XA_hits_alt = value if alt_value is None else alt_value @@ -271,24 +267,24 @@ cdef class BwaMemOptionsBuilder: cdef gap_open_penalty(self, deletions: int, insertions: int): """bwa mem -O >""" - self._options0._delegate.o_del = 1 - self._options0._delegate.o_ins = 1 + self._options0.o_del = 1 + self._options0.o_ins = 1 self._options._delegate.o_del = deletions self._options._delegate.o_ins = insertions return self cdef gap_extension_penalty(self, deletions: int, insertions: int): """bwa mem -E >""" - self._options0._delegate.e_del = 1 - self._options0._delegate.e_ins = 1 + self._options0.e_del = 1 + self._options0.e_ins = 1 self._options._delegate.e_del = deletions self._options._delegate.e_ins = insertions return self cdef clipping_penalty(self, five_prime: int, three_prime: int): """bwa mem -L >""" - self._options0._delegate.pen_clip5 = 1 - self._options0._delegate.pen_clip3 = 1 + self._options0.pen_clip5 = 1 + self._options0.pen_clip3 = 1 self._options._delegate.pen_clip5 = five_prime self._options._delegate.pen_clip3 = three_prime @@ -302,4 +298,126 @@ cdef class BwaMemOptionsBuilder: raise NotImplementedError -# \ No newline at end of file +cdef class BwaMem: + """The class to align reads with `bwa mem`.""" + + cdef BwaIndex _index + + def __init__(self, prefix: str | Path | None = None, index: BwaIndex | None = None): + if prefix is not None: + assert Path(prefix).exists() + self._index = BwaIndex(prefix=prefix) + elif index is not None: + self._index = index + else: + raise Exception("Either prefix or index must be given") + + # TODO: support paired end + def align(self, opt: BwaMemOptions, queries: List[FastxRecord]) -> List[List[AlignedSegment]]: + """Align one or more queries with `bwa aln`. + + Args: + opt: the alignment options + queries: the queries to align + + Returns: + one alignment per query + """ + return self._calign(opt, queries) + + cdef _copy_seq(self, q: FastxRecord, kstring_t *seq): + seq_len = len(q.sequence) + seq.s = calloc(sizeof(char), seq_len + 1) + seq.l = seq_len + seq.m = seq_len + 1 + for i, base in enumerate(q.sequence): + seq.s[i] = nst_nt4_table[ord(base)] + seq.s[seq_len] = b'\0' + + def _unmapped(self, query: FastxRecord) -> AlignedSegment: + # make a default, unmapped, empty record + rec = AlignedSegment(header=self._index.header) + rec.query_name = query.name + rec.reference_id = -1 + rec.reference_start = -1 + rec.mapping_quality = 0 + rec.is_paired = False + rec.is_read1 = True + rec.is_read2 = False + rec.is_qcfail = False + rec.is_duplicate = False + rec.is_secondary = False + rec.is_supplementary = False + rec.is_unmapped = True + rec.query_sequence = query.sequence + rec.query_qualities = query.quality + return rec + + cdef _calign(self, opt: BwaMemOptions, queries: List[FastxRecord]): + # TODO: ignore_alt + cdef kstring_t* seqs + cdef kstring_t* seq + cdef char* s_char + cdef kstring_t* kstr + cdef mem_alnreg_v mem_alnregs + cdef int take_all + cdef size_t j + cdef char **XA + cdef mem_alnreg_t *mem_alnreg + cdef mem_aln_t mem_aln + + recs_to_return: List[List[AlignedSegment]] = [] + + # copy FastqProxy into bwa_seq_t + num_seqs = len(queries) + seqs = calloc(sizeof(kstring_t), num_seqs) + for i in range(num_seqs): + seq = &seqs[i] + self._copy_seq(queries[i], seq) + mem_alnregs = mem_align1(opt._delegate, self._index.bwt(), self._index.bns(), self._index.pac(), seq.l, seq.s) + if opt._delegate.flag & MEM_F_PRIMARY5: + mem_reorder_primary5(opt._delegate.T, &mem_alnregs) + + # mimic mem_reg2sam from bwamem.c + recs = [] + XA = NULL + keep_all = opt._delegate.flag & MEM_F_ALL != 0 + if not keep_all: + XA = mem_gen_alt(opt._delegate, self._index.bns(), self._index.pac(), &mem_alnregs, seq.l, seq.s) + num_mem_aln = 0 + for j in range(mem_alnregs.n): + mem_alnreg = &mem_alnregs.a[j] + if mem_alnreg.score < opt._delegate.T: + continue + if mem_alnreg.secondary >= 0 and (mem_alnreg.is_alt or not keep_all): + continue + if mem_alnreg.secondary >= 0 and mem_alnreg.secondary < INT_MAX and mem_alnreg.score < mem_alnregs.a[mem_alnreg.secondary].score * opt._delegate.drop_ratio: + continue + mem_aln = mem_reg2aln(opt._delegate, self._index.bns(), self._index.pac(), seq.l, seq.s, mem_alnreg) + mem_aln.XA = XA[j] if not keep_all else NULL + if mem_alnreg.secondary >= 0: + mem_aln.sub = 1 # don't output sub-optimal score + if num_mem_aln > 0 and mem_alnreg.secondary < 0: # if supplementary + mem_aln.flag |= 0x10000 if opt._delegate.flag & MEM_F_NO_MULTI else 0x800 + if (opt._delegate.flag & MEM_F_KEEP_SUPP_MAPQ) and num_mem_aln > 0 and mem_alnreg.is_alt > 0 and mem_aln.mapq > mem_alnregs.a[0].mapq: + mem_aln.mapq = mem_alnregs.a[0].mapq # lower + num_mem_aln += 1 + # create a AlignedSegment record here + rec = self._unmapped(query=queries[i]) + # TODO: see mem_aln2sam + # TODO: create recs + recs.append(rec) + if num_mem_aln == 0: # unmapped + recs.append(self._unmapped(query=queries[i])) + if not keep_all: + free(XA) + recs_to_return.append(recs) + + # TODO: free stuff (see mem_reg2sam) + # TODO: free cigars? + for i in range(num_seqs): + free(seqs[i].s) + free(seqs) + + # TODO: how do we handle retval + return recs_to_return \ No newline at end of file