diff --git a/build.py b/build.py index 1525482..7011c44 100644 --- a/build.py +++ b/build.py @@ -1,5 +1,8 @@ import multiprocessing +import os import platform +import subprocess +from contextlib import contextmanager from pathlib import Path from typing import List @@ -7,6 +10,36 @@ from Cython.Distutils.build_ext import new_build_ext as cython_build_ext from setuptools import Extension, Distribution +@contextmanager +def changedir(path): + save_dir = os.getcwd() + os.chdir(path) + try: + yield + finally: + os.chdir(save_dir) + +@contextmanager +def with_patches(): + patches = sorted([ + os.path.abspath(patch) + for patch in Path("patches").iterdir() + if patch.is_file() and patch.suffix == ".patch" + ]) + with changedir("bwa"): + for patch in patches: + retcode = subprocess.call(f"git apply {patch}", shell=True) + if retcode != 0: + raise RuntimeError(f"Failed to apply patch {patch}") + try: + yield + finally: + commands = ["git submodule deinit -f .", "git submodule update --init"] + for command in commands: + retcode = subprocess.call(command, shell=True) + if retcode != 0: + raise RuntimeError(f"Failed to reset submodules: {command}") + SOURCE_DIR = Path("pybwa") BUILD_DIR = Path("cython_build") compile_args = [] @@ -110,46 +143,48 @@ def cythonize_helper(extension_modules: List[Extension]) -> List[Extension]: def build(): - # Collect and cythonize all files - extension_modules = cythonize_helper([ - libbwaindex_module, - libbwaaln_module, - libbwamem_module - ]) - - # Use Setuptools to collect files - distribution = Distribution({ - "name": "pybwa", - 'version': '0.0.1', - 'description': 'Python bindings for BWA', - 'long_description': __doc__, - 'long_description_content_type': 'text/x-rst', - 'author': 'Nils Homer', - 'author_email': 'nils@fulcrumgenomics.com', - 'license': 'MIT', - 'platforms': ['POSIX', 'UNIX', 'MacOS'], - 'classifiers': [_f for _f in CLASSIFIERS.split('\n') if _f], - 'url': 'https://github.com/fulcrumgenomics/pybwa', - 'packages': ['pybwa', 'pybwa.include.bwa'], - 'package_dir': {'pybwa': 'pybwa', 'pybwa.include.bwa': 'bwa'}, - 'package_data': {'': ['*.pxd', '*.h', '*.c', 'py.typed', '*.pyi'], }, - "ext_modules": extension_modules, - "cmdclass": { - "build_ext": cython_build_ext, - }, - }) - - # Grab the build_ext command and copy all files back to source dir. - # Done so Poetry grabs the files during the next step in its build. - build_ext_cmd = distribution.get_command_obj("build_ext") - build_ext_cmd.ensure_finalized() - # Set the value to 1 for "inplace", with the goal to build extensions - # in build directory, and then copy all files back to the source dir - # (under the hood, "copy_extensions_to_source" will be called after - # building the extensions). This is done so Poetry grabs the files - # during the next step in its build. - build_ext_cmd.inplace = 1 - build_ext_cmd.run() + # apply patches to bwa, then revert them after + with with_patches(): + # Collect and cythonize all files + extension_modules = cythonize_helper([ + libbwaindex_module, + libbwaaln_module, + libbwamem_module + ]) + + # Use Setuptools to collect files + distribution = Distribution({ + "name": "pybwa", + 'version': '0.0.1', + 'description': 'Python bindings for BWA', + 'long_description': __doc__, + 'long_description_content_type': 'text/x-rst', + 'author': 'Nils Homer', + 'author_email': 'nils@fulcrumgenomics.com', + 'license': 'MIT', + 'platforms': ['POSIX', 'UNIX', 'MacOS'], + 'classifiers': [_f for _f in CLASSIFIERS.split('\n') if _f], + 'url': 'https://github.com/fulcrumgenomics/pybwa', + 'packages': ['pybwa', 'pybwa.include.bwa'], + 'package_dir': {'pybwa': 'pybwa', 'pybwa.include.bwa': 'bwa'}, + 'package_data': {'': ['*.pxd', '*.h', '*.c', 'py.typed', '*.pyi'], }, + "ext_modules": extension_modules, + "cmdclass": { + "build_ext": cython_build_ext, + }, + }) + + # Grab the build_ext command and copy all files back to source dir. + # Done so Poetry grabs the files during the next step in its build. + build_ext_cmd = distribution.get_command_obj("build_ext") + build_ext_cmd.ensure_finalized() + # Set the value to 1 for "inplace", with the goal to build extensions + # in build directory, and then copy all files back to the source dir + # (under the hood, "copy_extensions_to_source" will be called after + # building the extensions). This is done so Poetry grabs the files + # during the next step in its build. + build_ext_cmd.inplace = 1 + build_ext_cmd.run() if __name__ == "__main__": diff --git a/docs/api.rst b/docs/api.rst index e0ad9cf..69df70b 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -99,6 +99,14 @@ As a result, the mapping quality may also differ slightly. This is due to an implementation detail in which the order of random numbers used is different between this wrapper and command-line. +Finally, the following additions have been made to :code:`bwa aln/samse`: + +#. The standard SAM tag :code:`HN` is added. This is useful if we find too many hits + (:attr:`~pybwa.BwaAlnOptions.max_hits`) and therefore no hits are reported in the :code:`XA` tag, we can still + know how many were found. +#. The :py:attr:`~pybwa.BwaAlnOptions.with_md` option will add the standard SAM tag :code:`MD` to the :code:`XA` tag, + otherwise :code:`.` will be used. This provides additional information on the quality of alternative alignments. + === API === diff --git a/patches/0001-feat-add-HN-tag-to-bwa-aln.patch b/patches/0001-feat-add-HN-tag-to-bwa-aln.patch new file mode 100644 index 0000000..1575c83 --- /dev/null +++ b/patches/0001-feat-add-HN-tag-to-bwa-aln.patch @@ -0,0 +1,76 @@ +From 7d36ae830fc7391cfa69ffcdf239755f01317c1c Mon Sep 17 00:00:00 2001 +From: Nils Homer +Date: Thu, 16 Jan 2025 22:35:13 -0800 +Subject: [PATCH 1/2] feat: add HN tag to bwa aln + +--- + bwase.c | 17 +++++++++-------- + bwtaln.h | 1 + + 2 files changed, 10 insertions(+), 8 deletions(-) + +diff --git a/bwase.c b/bwase.c +index 18e8671..eb43c02 100644 +--- a/bwase.c ++++ b/bwase.c +@@ -21,7 +21,7 @@ int g_log_n[256]; + + void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_main, int n_multi) + { +- int i, cnt, best; ++ int i, k, cnt, best; + if (n_aln == 0) { + s->type = BWA_TYPE_NO_MATCH; + s->c1 = s->c2 = 0; +@@ -47,14 +47,14 @@ void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_ma + s->type = s->c1 > 1? BWA_TYPE_REPEAT : BWA_TYPE_UNIQUE; + } + ++ for (k = s->n_occ = 0; k < n_aln; ++k) { ++ const bwt_aln1_t *q = aln + k; ++ s->n_occ += q->l - q->k + 1; ++ } + if (n_multi) { +- int k, rest, n_occ, z = 0; +- for (k = n_occ = 0; k < n_aln; ++k) { +- const bwt_aln1_t *q = aln + k; +- n_occ += q->l - q->k + 1; +- } ++ int rest, z = 0; + if (s->multi) free(s->multi); +- if (n_occ > n_multi + 1) { // if there are too many hits, generate none of them ++ if (s->n_occ > n_multi + 1) { // if there are too many hits, generate none of them + s->multi = 0; s->n_multi = 0; + return; + } +@@ -62,7 +62,7 @@ void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_ma + * here. In principle, due to the requirement above, we can + * simply output all hits, but the following samples "rest" + * number of random hits. */ +- rest = n_occ > n_multi + 1? n_multi + 1 : n_occ; // find one additional for ->sa ++ rest = s->n_occ > n_multi + 1? n_multi + 1 : s->n_occ; // find one additional for ->sa + s->multi = calloc(rest, sizeof(bwt_multi1_t)); + for (k = 0; k < n_aln; ++k) { + const bwt_aln1_t *q = aln + k; +@@ -477,6 +477,7 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in + } + } + } ++ err_printf("\tHN:i:%d", p->n_occ); + err_putchar('\n'); + } else { // this read has no match + //ubyte_t *s = p->strand? p->rseq : p->seq; +diff --git a/bwtaln.h b/bwtaln.h +index 4616ff5..71ea627 100644 +--- a/bwtaln.h ++++ b/bwtaln.h +@@ -76,6 +76,7 @@ typedef struct { + // multiple hits + int n_multi; + bwt_multi1_t *multi; ++ int n_occ; // total # of hits found, not just those reported in XA, output in HN + // alignment information + bwtint_t sa, pos; + uint64_t c1:28, c2:28, seQ:8; // number of top1 and top2 hits; single-end mapQ +-- +2.39.5 (Apple Git-154) + diff --git a/patches/0002-feat-add-an-option-to-output-MD-in-the-XA-tag.patch b/patches/0002-feat-add-an-option-to-output-MD-in-the-XA-tag.patch new file mode 100644 index 0000000..bb055f3 --- /dev/null +++ b/patches/0002-feat-add-an-option-to-output-MD-in-the-XA-tag.patch @@ -0,0 +1,198 @@ +From 028b2a7f2129a6eb8038079c539a54526bb08422 Mon Sep 17 00:00:00 2001 +From: Nils Homer +Date: Fri, 17 Jan 2025 00:29:08 -0800 +Subject: [PATCH 2/2] feat: add an option to output MD in the XA tag + +--- + bwape.c | 12 +++++++----- + bwase.c | 31 +++++++++++++++++++++---------- + bwase.h | 2 +- + bwtaln.h | 1 + + 4 files changed, 30 insertions(+), 16 deletions(-) + +diff --git a/bwape.c b/bwape.c +index fa4f7f7..986cab8 100644 +--- a/bwape.c ++++ b/bwape.c +@@ -621,7 +621,7 @@ ubyte_t *bwa_paired_sw(const bntseq_t *bns, const ubyte_t *_pacseq, int n_seqs, + return pacseq; + } + +-void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const fn_fa[2], pe_opt_t *popt, const char *rg_line) ++void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const fn_fa[2], pe_opt_t *popt, const char *rg_line, int with_md) + { + extern bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa); + int i, j, n_seqs; +@@ -692,7 +692,7 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f + + fprintf(stderr, "[bwa_sai2sam_pe_core] refine gapped alignments... "); + for (j = 0; j < 2; ++j) +- bwa_refine_gapped(bns, n_seqs, seqs[j], pacseq); ++ bwa_refine_gapped(bns, n_seqs, seqs[j], pacseq, with_md); + fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); + if (pac == 0) free(pacseq); + +@@ -732,12 +732,12 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f + + int bwa_sai2sam_pe(int argc, char *argv[]) + { +- int c; ++ int c, with_md = 0; + pe_opt_t *popt; + char *prefix, *rg_line = 0; + + popt = bwa_init_pe_opt(); +- while ((c = getopt(argc, argv, "a:o:sPn:N:c:f:Ar:")) >= 0) { ++ while ((c = getopt(argc, argv, "a:o:sPn:N:c:f:Ar:d")) >= 0) { + switch (c) { + case 'r': + if ((rg_line = bwa_set_rg(optarg)) == 0) return 1; +@@ -751,6 +751,7 @@ int bwa_sai2sam_pe(int argc, char *argv[]) + case 'c': popt->ap_prior = atof(optarg); break; + case 'f': xreopen(optarg, "w", stdout); break; + case 'A': popt->force_isize = 1; break; ++ case 'd': with_md = 1; break; + default: return 1; + } + } +@@ -768,6 +769,7 @@ int bwa_sai2sam_pe(int argc, char *argv[]) + fprintf(stderr, " -P preload index into memory (for base-space reads only)\n"); + fprintf(stderr, " -s disable Smith-Waterman for the unmapped mate\n"); + fprintf(stderr, " -A disable insert size estimate (force -s)\n\n"); ++ fprintf(stderr, " -d output the MD to each alignment in the XA tag, otherwise use \".\"\n\n"); + fprintf(stderr, "Notes: 1. For SOLiD reads, corresponds R3 reads and to F3.\n"); + fprintf(stderr, " 2. For reads shorter than 30bp, applying a smaller -o is recommended to\n"); + fprintf(stderr, " to get a sensible speed at the cost of pairing accuracy.\n"); +@@ -778,7 +780,7 @@ int bwa_sai2sam_pe(int argc, char *argv[]) + fprintf(stderr, "[%s] fail to locate the index\n", __func__); + return 1; + } +- bwa_sai2sam_pe_core(prefix, argv + optind + 1, argv + optind+3, popt, rg_line); ++ bwa_sai2sam_pe_core(prefix, argv + optind + 1, argv + optind+3, popt, rg_line, with_md); + free(prefix); free(popt); + return 0; + } +diff --git a/bwase.c b/bwase.c +index eb43c02..12fe47e 100644 +--- a/bwase.c ++++ b/bwase.c +@@ -284,11 +284,11 @@ void bwa_correct_trimmed(bwa_seq_t *s) + s->len = s->full_len; + } + +-void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq) ++void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq, int with_md) + { + ubyte_t *pacseq; + int i, j, k; +- kstring_t *str; ++ kstring_t *str = (kstring_t*)calloc(1, sizeof(kstring_t)); + + if (!_pacseq) { + pacseq = (ubyte_t*)calloc(bns->l_pac/4+1, 1); +@@ -305,7 +305,18 @@ void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t + q->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, q->strand? s->rseq : s->seq, q->ref_shift, &q->pos, &n_cigar); + q->n_cigar = n_cigar; + if (q->cigar) s->multi[k++] = *q; +- } else s->multi[k++] = *q; ++ } else { ++ s->multi[k++] = *q; ++ if (with_md) { // create the cigar, needed for bwa_cal_md1 below ++ q->n_cigar = 1; ++ q->cigar = calloc(q->n_cigar, sizeof(uint32_t)); ++ q->cigar[0] = __cigar_create(FROM_M, s->len); ++ } ++ } ++ if (with_md) { ++ int nm; ++ q->md = bwa_cal_md1(q->n_cigar, q->cigar, s->len, q->pos, q->strand? s->rseq : s->seq, bns->l_pac, pacseq, str, &nm); ++ } + } + s->n_multi = k; // this squeezes out gapped alignments which failed the CIGAR generation + if (s->type == BWA_TYPE_NO_MATCH || s->type == BWA_TYPE_MATESW || s->n_gapo == 0) continue; +@@ -313,7 +324,6 @@ void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t + if (s->cigar == 0) s->type = BWA_TYPE_NO_MATCH; + } + // generate MD tag +- str = (kstring_t*)calloc(1, sizeof(kstring_t)); + for (i = 0; i != n_seqs; ++i) { + bwa_seq_t *s = seqs + i; + if (s->type != BWA_TYPE_NO_MATCH) { +@@ -505,7 +515,7 @@ void bwase_initialize() + for (i = 1; i != 256; ++i) g_log_n[i] = (int)(4.343 * log(i) + 0.5); + } + +-void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_fa, int n_occ, const char *rg_line) ++void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_fa, int n_occ, const char *rg_line, int with_md) + { + extern bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa); + int i, n_seqs, m_aln; +@@ -558,7 +568,7 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f + fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); + + fprintf(stderr, "[bwa_aln_core] refine gapped alignments... "); +- bwa_refine_gapped(bns, n_seqs, seqs, 0); ++ bwa_refine_gapped(bns, n_seqs, seqs, 0, with_md); + fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); + + fprintf(stderr, "[bwa_aln_core] print alignments... "); +@@ -579,11 +589,12 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f + + int bwa_sai2sam_se(int argc, char *argv[]) + { +- int c, n_occ = 3; ++ int c, n_occ = 3, with_md = 0; + char *prefix, *rg_line = 0; +- while ((c = getopt(argc, argv, "hn:f:r:")) >= 0) { ++ while ((c = getopt(argc, argv, "hdn:f:r:")) >= 0) { + switch (c) { + case 'h': break; ++ case 'd': with_md = 1; break; + case 'r': + if ((rg_line = bwa_set_rg(optarg)) == 0) return 1; + break; +@@ -594,14 +605,14 @@ int bwa_sai2sam_se(int argc, char *argv[]) + } + + if (optind + 3 > argc) { +- fprintf(stderr, "Usage: bwa samse [-n max_occ] [-f out.sam] [-r RG_line] \n"); ++ fprintf(stderr, "Usage: bwa samse [-n max_occ] [-d] [-f out.sam] [-r RG_line] \n"); + return 1; + } + if ((prefix = bwa_idx_infer_prefix(argv[optind])) == 0) { + fprintf(stderr, "[%s] fail to locate the index\n", __func__); + return 1; + } +- bwa_sai2sam_se_core(prefix, argv[optind+1], argv[optind+2], n_occ, rg_line); ++ bwa_sai2sam_se_core(prefix, argv[optind+1], argv[optind+2], n_occ, rg_line, with_md); + free(prefix); + return 0; + } +diff --git a/bwase.h b/bwase.h +index 26a9f68..c6f72f4 100644 +--- a/bwase.h ++++ b/bwase.h +@@ -14,7 +14,7 @@ extern "C" { + // Calculate the approximate position of the sequence from the specified bwt with loaded suffix array. + void bwa_cal_pac_pos_core(const bntseq_t *bns, const bwt_t* bwt, bwa_seq_t* seq, const int max_mm, const float fnr); + // Refine the approximate position of the sequence to an actual placement for the sequence. +- void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq); ++ void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq, int with_md); + // Backfill certain alignment properties mainly centering around number of matches. + void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s); + // Calculate the end position of a read given a certain sequence. +diff --git a/bwtaln.h b/bwtaln.h +index 71ea627..ec8103a 100644 +--- a/bwtaln.h ++++ b/bwtaln.h +@@ -61,6 +61,7 @@ typedef struct { + int ref_shift; + bwtint_t pos; + bwa_cigar_t *cigar; ++ char *md; + } bwt_multi1_t; + + typedef struct { +-- +2.39.5 (Apple Git-154) + diff --git a/pybwa/libbwaaln.pxd b/pybwa/libbwaaln.pxd index 6e32a3f..2ae40ce 100644 --- a/pybwa/libbwaaln.pxd +++ b/pybwa/libbwaaln.pxd @@ -80,7 +80,7 @@ cdef extern void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s cdef extern from "bwase.h": int64_t pos_end(const bwa_seq_t *p) - void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, unsigned char *_pacseq) + void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, unsigned char *_pacseq, int with_md) char *bwa_cal_md1(int n_cigar, uint16_t *cigar, int len, uint64_t pos, unsigned char *seq, uint64_t l_pac, unsigned char *pacseq, kstring_t *str, int *_nm) void bwase_initialize() diff --git a/pybwa/libbwaaln.pyi b/pybwa/libbwaaln.pyi index 84bfbe7..4635da9 100644 --- a/pybwa/libbwaaln.pyi +++ b/pybwa/libbwaaln.pyi @@ -37,6 +37,7 @@ class BwaAlnOptions: stop_at_max_best_hits: int # -R max_hits: int # bwa samse -n log_scaled_gap_penalty: bool = True # -L + with_md: bool = True # bwa samse -d class BwaAln: def __init__(self, prefix: str | Path | None = None, index: BwaIndex | None = None) -> None: ... diff --git a/pybwa/libbwaaln.pyx b/pybwa/libbwaaln.pyx index 4321bec..8ac88bc 100644 --- a/pybwa/libbwaaln.pyx +++ b/pybwa/libbwaaln.pyx @@ -34,10 +34,12 @@ cdef class BwaAlnOptions: stop_at_max_best_hits (int | None): :code:`-R ` max_hits (int | None): :code:`bwa samse -n ` log_scaled_gap_penalty (in | None): :code:`-L` + with_md (bool): output the MD to each alignment in the XA tag, otherwise use :code:`"."` """ cdef gap_opt_t * _delegate _max_hits: int + _with_md: bool def __init__(self, max_mismatches: int | None = None, @@ -52,7 +54,8 @@ cdef class BwaAlnOptions: gap_extension_penalty: int | None = None, stop_at_max_best_hits: int | None = None, max_hits: int | None = 3, - log_scaled_gap_penalty: bool | None = None + log_scaled_gap_penalty: bool | None = None, + with_md: bool | None = False ): if max_mismatches is not None: self.max_mismatches = max_mismatches @@ -78,6 +81,8 @@ cdef class BwaAlnOptions: self.max_hits = max_hits if log_scaled_gap_penalty is not None: self.log_scaled_gap_penalty = 1 if log_scaled_gap_penalty else 0 + if with_md is not None: + self.with_md = with_md def __cinit__(self): self._delegate = gap_init_opt() @@ -185,6 +190,15 @@ cdef class BwaAlnOptions: else: self._delegate.mode &= ~BWA_MODE_LOGGAP + property with_md: + """:code:`bwa samse -d + + Output the MD to each alignment in the XA tag, otherwise use :code:`"."`. + """ + def __get__(self) -> bool: + return self._with_md + def __set__(self, value: bool): + self._with_md = value cdef class BwaAln: """The class to align reads with :code:`bwa aln`.""" @@ -396,7 +410,7 @@ cdef class BwaAln: bwa_cal_pac_pos_with_bwt(self._index.bns(), num_seqs, seqs, gap_opt.max_diff, gap_opt.fnr, self._index.bwt()) # refine gapped alignment - bwa_refine_gapped(self._index.bns(), num_seqs, seqs, self._index.pac()) + bwa_refine_gapped(self._index.bns(), num_seqs, seqs, self._index.pac(), opt.with_md) # create the AlignedSegment from FastxRecord and bwa_seq_t. recs = [