Skip to content

Commit

Permalink
feat: output the HN tag and optionally add the MD to the XA tag
Browse files Browse the repository at this point in the history
This requires us to patch bwa, which we store in the patches
sub-directory.  They are applied before we build, and reverted after
the build completes (regardless of success).  They can be removed
when the bwa PRs are merged and submodule updated.

See: lh3/bwa#438
See: lh3/bwa#439
  • Loading branch information
nh13 committed Jan 17, 2025
1 parent 61c488a commit 76c8c27
Show file tree
Hide file tree
Showing 7 changed files with 375 additions and 43 deletions.
115 changes: 75 additions & 40 deletions build.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,45 @@
import multiprocessing
import os
import platform
import subprocess
from contextlib import contextmanager
from pathlib import Path
from typing import List

from Cython.Build import cythonize
from Cython.Distutils.build_ext import new_build_ext as cython_build_ext
from setuptools import Extension, Distribution

@contextmanager
def changedir(path):
save_dir = os.getcwd()
os.chdir(path)
try:
yield
finally:
os.chdir(save_dir)

@contextmanager
def with_patches():
patches = sorted([
os.path.abspath(patch)
for patch in Path("patches").iterdir()
if patch.is_file() and patch.suffix == ".patch"
])
with changedir("bwa"):
for patch in patches:
retcode = subprocess.call(f"git apply {patch}", shell=True)
if retcode != 0:
raise RuntimeError(f"Failed to apply patch {patch}")
try:
yield
finally:
commands = ["git submodule deinit -f .", "git submodule update --init"]
for command in commands:
retcode = subprocess.call(command, shell=True)
if retcode != 0:
raise RuntimeError(f"Failed to reset submodules: {command}")

SOURCE_DIR = Path("pybwa")
BUILD_DIR = Path("cython_build")
compile_args = []
Expand Down Expand Up @@ -110,46 +143,48 @@ def cythonize_helper(extension_modules: List[Extension]) -> List[Extension]:


def build():
# Collect and cythonize all files
extension_modules = cythonize_helper([
libbwaindex_module,
libbwaaln_module,
libbwamem_module
])

# Use Setuptools to collect files
distribution = Distribution({
"name": "pybwa",
'version': '0.0.1',
'description': 'Python bindings for BWA',
'long_description': __doc__,
'long_description_content_type': 'text/x-rst',
'author': 'Nils Homer',
'author_email': '[email protected]',
'license': 'MIT',
'platforms': ['POSIX', 'UNIX', 'MacOS'],
'classifiers': [_f for _f in CLASSIFIERS.split('\n') if _f],
'url': 'https://github.com/fulcrumgenomics/pybwa',
'packages': ['pybwa', 'pybwa.include.bwa'],
'package_dir': {'pybwa': 'pybwa', 'pybwa.include.bwa': 'bwa'},
'package_data': {'': ['*.pxd', '*.h', '*.c', 'py.typed', '*.pyi'], },
"ext_modules": extension_modules,
"cmdclass": {
"build_ext": cython_build_ext,
},
})

# Grab the build_ext command and copy all files back to source dir.
# Done so Poetry grabs the files during the next step in its build.
build_ext_cmd = distribution.get_command_obj("build_ext")
build_ext_cmd.ensure_finalized()
# Set the value to 1 for "inplace", with the goal to build extensions
# in build directory, and then copy all files back to the source dir
# (under the hood, "copy_extensions_to_source" will be called after
# building the extensions). This is done so Poetry grabs the files
# during the next step in its build.
build_ext_cmd.inplace = 1
build_ext_cmd.run()
# apply patches to bwa, then revert them after
with with_patches():
# Collect and cythonize all files
extension_modules = cythonize_helper([
libbwaindex_module,
libbwaaln_module,
libbwamem_module
])

# Use Setuptools to collect files
distribution = Distribution({
"name": "pybwa",
'version': '0.0.1',
'description': 'Python bindings for BWA',
'long_description': __doc__,
'long_description_content_type': 'text/x-rst',
'author': 'Nils Homer',
'author_email': '[email protected]',
'license': 'MIT',
'platforms': ['POSIX', 'UNIX', 'MacOS'],
'classifiers': [_f for _f in CLASSIFIERS.split('\n') if _f],
'url': 'https://github.com/fulcrumgenomics/pybwa',
'packages': ['pybwa', 'pybwa.include.bwa'],
'package_dir': {'pybwa': 'pybwa', 'pybwa.include.bwa': 'bwa'},
'package_data': {'': ['*.pxd', '*.h', '*.c', 'py.typed', '*.pyi'], },
"ext_modules": extension_modules,
"cmdclass": {
"build_ext": cython_build_ext,
},
})

# Grab the build_ext command and copy all files back to source dir.
# Done so Poetry grabs the files during the next step in its build.
build_ext_cmd = distribution.get_command_obj("build_ext")
build_ext_cmd.ensure_finalized()
# Set the value to 1 for "inplace", with the goal to build extensions
# in build directory, and then copy all files back to the source dir
# (under the hood, "copy_extensions_to_source" will be called after
# building the extensions). This is done so Poetry grabs the files
# during the next step in its build.
build_ext_cmd.inplace = 1
build_ext_cmd.run()


if __name__ == "__main__":
Expand Down
8 changes: 8 additions & 0 deletions docs/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,14 @@ As a result, the mapping quality may also differ slightly.
This is due to an implementation detail in which the order of random numbers used is different between this wrapper
and command-line.

Finally, the following additions have been made to :code:`bwa aln/samse`:

#. The standard SAM tag :code:`HN` is added. This is useful if we find too many hits
(:attr:`~pybwa.BwaAlnOptions.max_hits`) and therefore no hits are reported in the :code:`XA` tag, we can still
know how many were found.
#. The :py:attr:`~pybwa.BwaAlnOptions.with_md` option will add the standard SAM tag :code:`MD` to the :code:`XA` tag,
otherwise :code:`.` will be used. This provides additional information on the quality of alternative alignments.

===
API
===
Expand Down
76 changes: 76 additions & 0 deletions patches/0001-feat-add-HN-tag-to-bwa-aln.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
From 7d36ae830fc7391cfa69ffcdf239755f01317c1c Mon Sep 17 00:00:00 2001
From: Nils Homer <[email protected]>
Date: Thu, 16 Jan 2025 22:35:13 -0800
Subject: [PATCH 1/2] feat: add HN tag to bwa aln

---
bwase.c | 17 +++++++++--------
bwtaln.h | 1 +
2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/bwase.c b/bwase.c
index 18e8671..eb43c02 100644
--- a/bwase.c
+++ b/bwase.c
@@ -21,7 +21,7 @@ int g_log_n[256];

void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_main, int n_multi)
{
- int i, cnt, best;
+ int i, k, cnt, best;
if (n_aln == 0) {
s->type = BWA_TYPE_NO_MATCH;
s->c1 = s->c2 = 0;
@@ -47,14 +47,14 @@ void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_ma
s->type = s->c1 > 1? BWA_TYPE_REPEAT : BWA_TYPE_UNIQUE;
}

+ for (k = s->n_occ = 0; k < n_aln; ++k) {
+ const bwt_aln1_t *q = aln + k;
+ s->n_occ += q->l - q->k + 1;
+ }
if (n_multi) {
- int k, rest, n_occ, z = 0;
- for (k = n_occ = 0; k < n_aln; ++k) {
- const bwt_aln1_t *q = aln + k;
- n_occ += q->l - q->k + 1;
- }
+ int rest, z = 0;
if (s->multi) free(s->multi);
- if (n_occ > n_multi + 1) { // if there are too many hits, generate none of them
+ if (s->n_occ > n_multi + 1) { // if there are too many hits, generate none of them
s->multi = 0; s->n_multi = 0;
return;
}
@@ -62,7 +62,7 @@ void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_ma
* here. In principle, due to the requirement above, we can
* simply output all hits, but the following samples "rest"
* number of random hits. */
- rest = n_occ > n_multi + 1? n_multi + 1 : n_occ; // find one additional for ->sa
+ rest = s->n_occ > n_multi + 1? n_multi + 1 : s->n_occ; // find one additional for ->sa
s->multi = calloc(rest, sizeof(bwt_multi1_t));
for (k = 0; k < n_aln; ++k) {
const bwt_aln1_t *q = aln + k;
@@ -477,6 +477,7 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in
}
}
}
+ err_printf("\tHN:i:%d", p->n_occ);
err_putchar('\n');
} else { // this read has no match
//ubyte_t *s = p->strand? p->rseq : p->seq;
diff --git a/bwtaln.h b/bwtaln.h
index 4616ff5..71ea627 100644
--- a/bwtaln.h
+++ b/bwtaln.h
@@ -76,6 +76,7 @@ typedef struct {
// multiple hits
int n_multi;
bwt_multi1_t *multi;
+ int n_occ; // total # of hits found, not just those reported in XA, output in HN
// alignment information
bwtint_t sa, pos;
uint64_t c1:28, c2:28, seQ:8; // number of top1 and top2 hits; single-end mapQ
--
2.39.5 (Apple Git-154)

Loading

0 comments on commit 76c8c27

Please sign in to comment.