Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: output the HN tag and optionally add the MD to the XA tag #25

Merged
merged 1 commit into from
Jan 17, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
115 changes: 75 additions & 40 deletions build.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,45 @@
import multiprocessing
import os
import platform
import subprocess
from contextlib import contextmanager
from pathlib import Path
from typing import List

from Cython.Build import cythonize
from Cython.Distutils.build_ext import new_build_ext as cython_build_ext
from setuptools import Extension, Distribution

@contextmanager
def changedir(path):
save_dir = os.getcwd()
os.chdir(path)
try:
yield
finally:
os.chdir(save_dir)

@contextmanager
def with_patches():
patches = sorted([
os.path.abspath(patch)
for patch in Path("patches").iterdir()
if patch.is_file() and patch.suffix == ".patch"
])
with changedir("bwa"):
for patch in patches:
retcode = subprocess.call(f"git apply {patch}", shell=True)
if retcode != 0:
raise RuntimeError(f"Failed to apply patch {patch}")
try:
yield
finally:
commands = ["git submodule deinit -f .", "git submodule update --init"]
for command in commands:
retcode = subprocess.call(command, shell=True)
if retcode != 0:
raise RuntimeError(f"Failed to reset submodules: {command}")

SOURCE_DIR = Path("pybwa")
BUILD_DIR = Path("cython_build")
compile_args = []
Expand Down Expand Up @@ -110,46 +143,48 @@ def cythonize_helper(extension_modules: List[Extension]) -> List[Extension]:


def build():
# Collect and cythonize all files
extension_modules = cythonize_helper([
libbwaindex_module,
libbwaaln_module,
libbwamem_module
])

# Use Setuptools to collect files
distribution = Distribution({
"name": "pybwa",
'version': '0.0.1',
'description': 'Python bindings for BWA',
'long_description': __doc__,
'long_description_content_type': 'text/x-rst',
'author': 'Nils Homer',
'author_email': '[email protected]',
'license': 'MIT',
'platforms': ['POSIX', 'UNIX', 'MacOS'],
'classifiers': [_f for _f in CLASSIFIERS.split('\n') if _f],
'url': 'https://github.com/fulcrumgenomics/pybwa',
'packages': ['pybwa', 'pybwa.include.bwa'],
'package_dir': {'pybwa': 'pybwa', 'pybwa.include.bwa': 'bwa'},
'package_data': {'': ['*.pxd', '*.h', '*.c', 'py.typed', '*.pyi'], },
"ext_modules": extension_modules,
"cmdclass": {
"build_ext": cython_build_ext,
},
})

# Grab the build_ext command and copy all files back to source dir.
# Done so Poetry grabs the files during the next step in its build.
build_ext_cmd = distribution.get_command_obj("build_ext")
build_ext_cmd.ensure_finalized()
# Set the value to 1 for "inplace", with the goal to build extensions
# in build directory, and then copy all files back to the source dir
# (under the hood, "copy_extensions_to_source" will be called after
# building the extensions). This is done so Poetry grabs the files
# during the next step in its build.
build_ext_cmd.inplace = 1
build_ext_cmd.run()
# apply patches to bwa, then revert them after
with with_patches():
# Collect and cythonize all files
extension_modules = cythonize_helper([
libbwaindex_module,
libbwaaln_module,
libbwamem_module
])

# Use Setuptools to collect files
distribution = Distribution({
"name": "pybwa",
'version': '0.0.1',
'description': 'Python bindings for BWA',
'long_description': __doc__,
'long_description_content_type': 'text/x-rst',
'author': 'Nils Homer',
'author_email': '[email protected]',
'license': 'MIT',
'platforms': ['POSIX', 'UNIX', 'MacOS'],
'classifiers': [_f for _f in CLASSIFIERS.split('\n') if _f],
'url': 'https://github.com/fulcrumgenomics/pybwa',
'packages': ['pybwa', 'pybwa.include.bwa'],
'package_dir': {'pybwa': 'pybwa', 'pybwa.include.bwa': 'bwa'},
'package_data': {'': ['*.pxd', '*.h', '*.c', 'py.typed', '*.pyi'], },
"ext_modules": extension_modules,
"cmdclass": {
"build_ext": cython_build_ext,
},
})

# Grab the build_ext command and copy all files back to source dir.
# Done so Poetry grabs the files during the next step in its build.
build_ext_cmd = distribution.get_command_obj("build_ext")
build_ext_cmd.ensure_finalized()
# Set the value to 1 for "inplace", with the goal to build extensions
# in build directory, and then copy all files back to the source dir
# (under the hood, "copy_extensions_to_source" will be called after
# building the extensions). This is done so Poetry grabs the files
# during the next step in its build.
build_ext_cmd.inplace = 1
build_ext_cmd.run()


if __name__ == "__main__":
Expand Down
8 changes: 8 additions & 0 deletions docs/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,14 @@ As a result, the mapping quality may also differ slightly.
This is due to an implementation detail in which the order of random numbers used is different between this wrapper
and command-line.

Finally, the following additions have been made to :code:`bwa aln/samse`:

#. The standard SAM tag :code:`HN` is added. This is useful if we find too many hits
(:attr:`~pybwa.BwaAlnOptions.max_hits`) and therefore no hits are reported in the :code:`XA` tag, we can still
know how many were found.
#. The :py:attr:`~pybwa.BwaAlnOptions.with_md` option will add the standard SAM tag :code:`MD` to the :code:`XA` tag,
otherwise :code:`.` will be used. This provides additional information on the quality of alternative alignments.

===
API
===
Expand Down
76 changes: 76 additions & 0 deletions patches/0001-feat-add-HN-tag-to-bwa-aln.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
From 7d36ae830fc7391cfa69ffcdf239755f01317c1c Mon Sep 17 00:00:00 2001
From: Nils Homer <[email protected]>
Date: Thu, 16 Jan 2025 22:35:13 -0800
Subject: [PATCH 1/2] feat: add HN tag to bwa aln

---
bwase.c | 17 +++++++++--------
bwtaln.h | 1 +
2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/bwase.c b/bwase.c
index 18e8671..eb43c02 100644
--- a/bwase.c
+++ b/bwase.c
@@ -21,7 +21,7 @@ int g_log_n[256];

void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_main, int n_multi)
{
- int i, cnt, best;
+ int i, k, cnt, best;
if (n_aln == 0) {
s->type = BWA_TYPE_NO_MATCH;
s->c1 = s->c2 = 0;
@@ -47,14 +47,14 @@ void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_ma
s->type = s->c1 > 1? BWA_TYPE_REPEAT : BWA_TYPE_UNIQUE;
}

+ for (k = s->n_occ = 0; k < n_aln; ++k) {
+ const bwt_aln1_t *q = aln + k;
+ s->n_occ += q->l - q->k + 1;
+ }
if (n_multi) {
- int k, rest, n_occ, z = 0;
- for (k = n_occ = 0; k < n_aln; ++k) {
- const bwt_aln1_t *q = aln + k;
- n_occ += q->l - q->k + 1;
- }
+ int rest, z = 0;
if (s->multi) free(s->multi);
- if (n_occ > n_multi + 1) { // if there are too many hits, generate none of them
+ if (s->n_occ > n_multi + 1) { // if there are too many hits, generate none of them
s->multi = 0; s->n_multi = 0;
return;
}
@@ -62,7 +62,7 @@ void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_ma
* here. In principle, due to the requirement above, we can
* simply output all hits, but the following samples "rest"
* number of random hits. */
- rest = n_occ > n_multi + 1? n_multi + 1 : n_occ; // find one additional for ->sa
+ rest = s->n_occ > n_multi + 1? n_multi + 1 : s->n_occ; // find one additional for ->sa
s->multi = calloc(rest, sizeof(bwt_multi1_t));
for (k = 0; k < n_aln; ++k) {
const bwt_aln1_t *q = aln + k;
@@ -477,6 +477,7 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in
}
}
}
+ err_printf("\tHN:i:%d", p->n_occ);
err_putchar('\n');
} else { // this read has no match
//ubyte_t *s = p->strand? p->rseq : p->seq;
diff --git a/bwtaln.h b/bwtaln.h
index 4616ff5..71ea627 100644
--- a/bwtaln.h
+++ b/bwtaln.h
@@ -76,6 +76,7 @@ typedef struct {
// multiple hits
int n_multi;
bwt_multi1_t *multi;
+ int n_occ; // total # of hits found, not just those reported in XA, output in HN
// alignment information
bwtint_t sa, pos;
uint64_t c1:28, c2:28, seQ:8; // number of top1 and top2 hits; single-end mapQ
--
2.39.5 (Apple Git-154)

Loading
Loading