-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: output the HN tag and optionally add the MD to the XA tag
This requires us to patch bwa, which we store in the patches sub-directory. They are applied before we build, and reverted after the build completes (regardless of success). They can be removed when the bwa PRs are merged and submodule updated. See: lh3/bwa#438 See: lh3/bwa#439
- Loading branch information
Showing
7 changed files
with
375 additions
and
43 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,12 +1,45 @@ | ||
import multiprocessing | ||
import os | ||
import platform | ||
import subprocess | ||
from contextlib import contextmanager | ||
from pathlib import Path | ||
from typing import List | ||
|
||
from Cython.Build import cythonize | ||
from Cython.Distutils.build_ext import new_build_ext as cython_build_ext | ||
from setuptools import Extension, Distribution | ||
|
||
@contextmanager | ||
def changedir(path): | ||
save_dir = os.getcwd() | ||
os.chdir(path) | ||
try: | ||
yield | ||
finally: | ||
os.chdir(save_dir) | ||
|
||
@contextmanager | ||
def with_patches(): | ||
patches = sorted([ | ||
os.path.abspath(patch) | ||
for patch in Path("patches").iterdir() | ||
if patch.is_file() and patch.suffix == ".patch" | ||
]) | ||
with changedir("bwa"): | ||
for patch in patches: | ||
retcode = subprocess.call(f"git apply {patch}", shell=True) | ||
if retcode != 0: | ||
raise RuntimeError(f"Failed to apply patch {patch}") | ||
try: | ||
yield | ||
finally: | ||
commands = ["git submodule deinit -f .", "git submodule update --init"] | ||
for command in commands: | ||
retcode = subprocess.call(command, shell=True) | ||
if retcode != 0: | ||
raise RuntimeError(f"Failed to reset submodules: {command}") | ||
|
||
SOURCE_DIR = Path("pybwa") | ||
BUILD_DIR = Path("cython_build") | ||
compile_args = [] | ||
|
@@ -110,46 +143,48 @@ def cythonize_helper(extension_modules: List[Extension]) -> List[Extension]: | |
|
||
|
||
def build(): | ||
# Collect and cythonize all files | ||
extension_modules = cythonize_helper([ | ||
libbwaindex_module, | ||
libbwaaln_module, | ||
libbwamem_module | ||
]) | ||
|
||
# Use Setuptools to collect files | ||
distribution = Distribution({ | ||
"name": "pybwa", | ||
'version': '0.0.1', | ||
'description': 'Python bindings for BWA', | ||
'long_description': __doc__, | ||
'long_description_content_type': 'text/x-rst', | ||
'author': 'Nils Homer', | ||
'author_email': '[email protected]', | ||
'license': 'MIT', | ||
'platforms': ['POSIX', 'UNIX', 'MacOS'], | ||
'classifiers': [_f for _f in CLASSIFIERS.split('\n') if _f], | ||
'url': 'https://github.com/fulcrumgenomics/pybwa', | ||
'packages': ['pybwa', 'pybwa.include.bwa'], | ||
'package_dir': {'pybwa': 'pybwa', 'pybwa.include.bwa': 'bwa'}, | ||
'package_data': {'': ['*.pxd', '*.h', '*.c', 'py.typed', '*.pyi'], }, | ||
"ext_modules": extension_modules, | ||
"cmdclass": { | ||
"build_ext": cython_build_ext, | ||
}, | ||
}) | ||
|
||
# Grab the build_ext command and copy all files back to source dir. | ||
# Done so Poetry grabs the files during the next step in its build. | ||
build_ext_cmd = distribution.get_command_obj("build_ext") | ||
build_ext_cmd.ensure_finalized() | ||
# Set the value to 1 for "inplace", with the goal to build extensions | ||
# in build directory, and then copy all files back to the source dir | ||
# (under the hood, "copy_extensions_to_source" will be called after | ||
# building the extensions). This is done so Poetry grabs the files | ||
# during the next step in its build. | ||
build_ext_cmd.inplace = 1 | ||
build_ext_cmd.run() | ||
# apply patches to bwa, then revert them after | ||
with with_patches(): | ||
# Collect and cythonize all files | ||
extension_modules = cythonize_helper([ | ||
libbwaindex_module, | ||
libbwaaln_module, | ||
libbwamem_module | ||
]) | ||
|
||
# Use Setuptools to collect files | ||
distribution = Distribution({ | ||
"name": "pybwa", | ||
'version': '0.0.1', | ||
'description': 'Python bindings for BWA', | ||
'long_description': __doc__, | ||
'long_description_content_type': 'text/x-rst', | ||
'author': 'Nils Homer', | ||
'author_email': '[email protected]', | ||
'license': 'MIT', | ||
'platforms': ['POSIX', 'UNIX', 'MacOS'], | ||
'classifiers': [_f for _f in CLASSIFIERS.split('\n') if _f], | ||
'url': 'https://github.com/fulcrumgenomics/pybwa', | ||
'packages': ['pybwa', 'pybwa.include.bwa'], | ||
'package_dir': {'pybwa': 'pybwa', 'pybwa.include.bwa': 'bwa'}, | ||
'package_data': {'': ['*.pxd', '*.h', '*.c', 'py.typed', '*.pyi'], }, | ||
"ext_modules": extension_modules, | ||
"cmdclass": { | ||
"build_ext": cython_build_ext, | ||
}, | ||
}) | ||
|
||
# Grab the build_ext command and copy all files back to source dir. | ||
# Done so Poetry grabs the files during the next step in its build. | ||
build_ext_cmd = distribution.get_command_obj("build_ext") | ||
build_ext_cmd.ensure_finalized() | ||
# Set the value to 1 for "inplace", with the goal to build extensions | ||
# in build directory, and then copy all files back to the source dir | ||
# (under the hood, "copy_extensions_to_source" will be called after | ||
# building the extensions). This is done so Poetry grabs the files | ||
# during the next step in its build. | ||
build_ext_cmd.inplace = 1 | ||
build_ext_cmd.run() | ||
|
||
|
||
if __name__ == "__main__": | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
From 7d36ae830fc7391cfa69ffcdf239755f01317c1c Mon Sep 17 00:00:00 2001 | ||
From: Nils Homer <[email protected]> | ||
Date: Thu, 16 Jan 2025 22:35:13 -0800 | ||
Subject: [PATCH 1/2] feat: add HN tag to bwa aln | ||
|
||
--- | ||
bwase.c | 17 +++++++++-------- | ||
bwtaln.h | 1 + | ||
2 files changed, 10 insertions(+), 8 deletions(-) | ||
|
||
diff --git a/bwase.c b/bwase.c | ||
index 18e8671..eb43c02 100644 | ||
--- a/bwase.c | ||
+++ b/bwase.c | ||
@@ -21,7 +21,7 @@ int g_log_n[256]; | ||
|
||
void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_main, int n_multi) | ||
{ | ||
- int i, cnt, best; | ||
+ int i, k, cnt, best; | ||
if (n_aln == 0) { | ||
s->type = BWA_TYPE_NO_MATCH; | ||
s->c1 = s->c2 = 0; | ||
@@ -47,14 +47,14 @@ void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_ma | ||
s->type = s->c1 > 1? BWA_TYPE_REPEAT : BWA_TYPE_UNIQUE; | ||
} | ||
|
||
+ for (k = s->n_occ = 0; k < n_aln; ++k) { | ||
+ const bwt_aln1_t *q = aln + k; | ||
+ s->n_occ += q->l - q->k + 1; | ||
+ } | ||
if (n_multi) { | ||
- int k, rest, n_occ, z = 0; | ||
- for (k = n_occ = 0; k < n_aln; ++k) { | ||
- const bwt_aln1_t *q = aln + k; | ||
- n_occ += q->l - q->k + 1; | ||
- } | ||
+ int rest, z = 0; | ||
if (s->multi) free(s->multi); | ||
- if (n_occ > n_multi + 1) { // if there are too many hits, generate none of them | ||
+ if (s->n_occ > n_multi + 1) { // if there are too many hits, generate none of them | ||
s->multi = 0; s->n_multi = 0; | ||
return; | ||
} | ||
@@ -62,7 +62,7 @@ void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_ma | ||
* here. In principle, due to the requirement above, we can | ||
* simply output all hits, but the following samples "rest" | ||
* number of random hits. */ | ||
- rest = n_occ > n_multi + 1? n_multi + 1 : n_occ; // find one additional for ->sa | ||
+ rest = s->n_occ > n_multi + 1? n_multi + 1 : s->n_occ; // find one additional for ->sa | ||
s->multi = calloc(rest, sizeof(bwt_multi1_t)); | ||
for (k = 0; k < n_aln; ++k) { | ||
const bwt_aln1_t *q = aln + k; | ||
@@ -477,6 +477,7 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in | ||
} | ||
} | ||
} | ||
+ err_printf("\tHN:i:%d", p->n_occ); | ||
err_putchar('\n'); | ||
} else { // this read has no match | ||
//ubyte_t *s = p->strand? p->rseq : p->seq; | ||
diff --git a/bwtaln.h b/bwtaln.h | ||
index 4616ff5..71ea627 100644 | ||
--- a/bwtaln.h | ||
+++ b/bwtaln.h | ||
@@ -76,6 +76,7 @@ typedef struct { | ||
// multiple hits | ||
int n_multi; | ||
bwt_multi1_t *multi; | ||
+ int n_occ; // total # of hits found, not just those reported in XA, output in HN | ||
// alignment information | ||
bwtint_t sa, pos; | ||
uint64_t c1:28, c2:28, seQ:8; // number of top1 and top2 hits; single-end mapQ | ||
-- | ||
2.39.5 (Apple Git-154) | ||
|
Oops, something went wrong.