Skip to content

Commit

Permalink
feat: initial implementation of python bindings for bwa aln
Browse files Browse the repository at this point in the history
  • Loading branch information
nh13 committed Dec 16, 2024
1 parent b4c505d commit 2700943
Show file tree
Hide file tree
Showing 28 changed files with 60,774 additions and 0 deletions.
1 change: 1 addition & 0 deletions .github/CODEOWNERS
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
* @nh13
90 changes: 90 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
# Continuous integration: format check, lint, type-check, test, and build the
# docs on every push, across the supported Python versions.
name: CI

on: push
env:
  POETRY_VERSION: 1.8

jobs:
  testing:
    runs-on: ubuntu-24.04
    strategy:
      matrix:
        PYTHON_VERSION: ["3.9", "3.10", "3.11", "3.12"]
    steps:
      - uses: actions/checkout@v4
      # build.py compiles the C sources found under ./bwa, so check out the
      # bwa repository next to the project sources.
      - uses: actions/checkout@v4
        with:
          repository: "lh3/bwa"
          path: "bwa"
      - name: Set up Python ${{matrix.PYTHON_VERSION}}
        uses: actions/setup-python@v5
        with:
          python-version: ${{matrix.PYTHON_VERSION}}

      - name: Get full Python version
        id: full-python-version
        shell: bash
        # Step outputs are written to $GITHUB_OUTPUT; the older `::set-output`
        # workflow command is deprecated.
        run: |
          echo "version=$(python -c "import sys; print('-'.join(str(v) for v in sys.version_info))")" >> "$GITHUB_OUTPUT"

      - name: Install poetry
        shell: bash
        run: |
          python -m pip install --upgrade pip
          pip install poetry==${{env.POETRY_VERSION}}
      - name: Configure poetry
        shell: bash
        run: poetry config virtualenvs.in-project true

      # Cache the in-project virtualenv, keyed on OS, exact Python version and
      # the lock file contents.
      - name: Set up cache
        uses: actions/cache@v4
        id: cache
        with:
          path: .venv
          key: venv-${{ runner.os }}-${{ steps.full-python-version.outputs.version }}-${{ hashFiles('**/poetry.lock') }}

      # Throw the restored virtualenv away if pip cannot even start in it.
      - name: Ensure cache is healthy
        if: steps.cache.outputs.cache-hit == 'true'
        shell: bash
        run: poetry run pip --version >/dev/null 2>&1 || rm -rf .venv

      - name: Check that the lock file is up to date
        shell: bash
        run: |
          poetry lock --check
      - name: Install deps
        shell: bash
        run: |
          poetry install
      - name: Style checking
        shell: bash
        run: |
          poetry run ruff format --check bwapy tests
      - name: Run lint
        shell: bash
        run: |
          poetry run ruff check bwapy tests
      - name: Run mypy
        shell: bash
        run: |
          poetry run mypy bwapy tests --config=pyproject.toml
      - name: Run pytest
        shell: bash
        run: |
          poetry run python -m pytest --cov=bwapy --cov-report=xml --cov-branch
      - name: Run docs
        shell: bash
        run: |
          set -euo pipefail
          poetry run mkdocs build --strict
      # NOTE(review): the pinned action reference was unreadable here; the
      # major tag is used instead -- confirm the intended version pin.
      - name: Upload code coverage
        uses: codecov/codecov-action@v4
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
20 changes: 20 additions & 0 deletions .github/workflows/readthedocs.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Runs the Read the Docs "preview" action when a pull request is opened.
name: readthedocs/actions
on:
  pull_request_target:
    types:
      - opened
    # Execute this action only on PRs that touch
    # documentation files.
    paths:
      - "docs/**"

# The job is granted write access to pull requests.
permissions:
  pull-requests: write

jobs:
  documentation-links:
    runs-on: ubuntu-24.04
    steps:
      - uses: readthedocs/actions/preview@v1
        with:
          # Read the Docs project the preview refers to.
          project-slug: "bwapy"
12 changes: 12 additions & 0 deletions .readthedocs.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Read the Docs build configuration (configuration file format version 2).
version: 2
build:
  os: ubuntu-22.04
  tools:
    python: "3.11"
  jobs:
    # After the default install step, install the project with Poetry into the
    # virtualenv that Read the Docs provides instead of letting Poetry create
    # its own.
    post_install:
      - pip install poetry==1.8.3
      - poetry config virtualenvs.create false
      - VIRTUAL_ENV=$READTHEDOCS_VIRTUALENV_PATH poetry install
# The documentation is built with MkDocs from this configuration file.
mkdocs:
  configuration: mkdocs.yml
112 changes: 112 additions & 0 deletions build.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
from setuptools import Extension, Distribution
from typing import List

from Cython.Build import cythonize
from Cython.Distutils.build_ext import new_build_ext as cython_build_ext
import multiprocessing
from pathlib import Path

# Package directory that holds the Cython sources, and the scratch directory
# that receives the C files Cython generates.
SOURCE_DIR = Path("bwapy")
BUILD_DIR = Path("cython_build")

# Compiler and linker settings handed to the extension definition.
compile_args = []
link_args = []
include_dirs = ["bwa"]
libraries = ["m", "z", "pthread"]
library_dirs = ["bwa"]
extra_objects = []  # pre-built objects such as bwa/*.o could be listed here

# Gather every header and C source below the bwa checkout and the package,
# leaving out the C files named example.c and main.c.
h_files = []
c_files = []
for root_dir in ("bwa", "bwapy"):
    h_files += [str(header) for header in Path(root_dir).rglob("*.h")]
    c_files += [
        str(source)
        for source in Path(root_dir).rglob("*.c")
        if source.name not in ("example.c", "main.c")
    ]

# Definition of the single compiled extension, ``bwapy.libbwapy``: the Cython
# source plus every collected C file, with the collected headers registered as
# dependencies.
extension_module = Extension(
    name="bwapy.libbwapy",
    language="c",
    sources=["bwapy/libbwapy.pyx", *c_files],
    depends=h_files,
    include_dirs=include_dirs,
    library_dirs=library_dirs,
    libraries=libraries,
    extra_objects=extra_objects,
    extra_compile_args=compile_args,
    extra_link_args=link_args,
)


def cythonize_helper(extension_modules: List[Extension]) -> List[Extension]:
    """Run Cython over the given extension modules.

    Args:
        extension_modules: the extension definitions to cythonize.

    Returns:
        Whatever ``cythonize`` returns for the given modules.
    """
    cythonize_options = {
        # Keep generated .c files out of the source tree.
        "build_dir": BUILD_DIR,
        # No .html annotation output; it would contain the source.
        "annotate": False,
        # Build in parallel.
        "nthreads": multiprocessing.cpu_count() * 2,
        # Use Python 3 semantics (the default from Cython 3 onwards).
        "compiler_directives": {"language_level": "3"},
        # Always rebuild, even when the inputs are untouched.
        "force": True,
    }
    return cythonize(module_list=extension_modules, **cythonize_options)

# Trove classifiers, one per line. The text starts and ends with a newline, so
# consumers that split on "\n" must drop the empty entries.
CLASSIFIERS = (
    "\n"
    "Development Status :: 4 - Beta\n"
    "Intended Audience :: Science/Research\n"
    "Intended Audience :: Developers\n"
    "License :: OSI Approved\n"
    "Programming Language :: Python\n"
    "Topic :: Software Development\n"
    "Topic :: Scientific/Engineering\n"
    "Operating System :: POSIX\n"
    "Operating System :: Unix\n"
    "Operating System :: MacOS\n"
)


def build():
    """Cythonize and compile the ``bwapy.libbwapy`` extension in place.

    The extension is built through a setuptools ``Distribution`` created just
    for this purpose, and the compiled files are copied back into the source
    directory so that Poetry grabs them during the next step of its build.
    """
    # Collect and cythonize all files
    extension_modules = cythonize_helper([extension_module])

    # Use Setuptools to collect files
    distribution = Distribution({
        "name": "bwapy",
        'version': '0.0.1',  # FIXME
        'description': 'Todo',  # FIXME
        'long_description': 'FIXME',
        'long_description_content_type': 'text/x-rst',
        'author': 'Nils Homer',
        'author_email': '[email protected]',
        'license': 'MIT',
        'platforms': ['POSIX', 'UNIX', 'MacOS'],
        # CLASSIFIERS starts and ends with a newline; drop the empty entries.
        'classifiers': [line for line in CLASSIFIERS.split('\n') if line],
        'url': 'https://github.com/fulcrumgenomics/bwapy',
        'packages': ['bwapy'],
        # The setuptools keyword is ``package_dir`` (singular); the previous
        # ``package_dirs`` spelling is not a recognised distribution option.
        'package_dir': {'bwapy': 'bwapy'},
        "ext_modules": extension_modules,
        "cmdclass": {
            "build_ext": cython_build_ext,
        },
    })

    build_ext_cmd = distribution.get_command_obj("build_ext")
    build_ext_cmd.ensure_finalized()
    # Set the value to 1 for "inplace", with the goal to build extensions
    # in build directory, and then copy all files back to the source dir
    # (under the hood, "copy_extensions_to_source" will be called after
    # building the extensions). This is done so Poetry grabs the files
    # during the next step in its build.
    build_ext_cmd.inplace = 1
    build_ext_cmd.run()


# Run the build when this file is executed as a script rather than imported.
if __name__ == "__main__":
    build()
9 changes: 9 additions & 0 deletions bwapy.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Conda environment definition for the project.
name: bwapy
# Channels the packages below are resolved from.
channels:
  - defaults
  - conda-forge
  - bioconda
# Exact versions are pinned.
dependencies:
  - python=3.11
  - cython=3.0.11
  - pysam=0.22.1
4 changes: 4 additions & 0 deletions bwapy/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
"""Top-level ``bwapy`` package; re-exports the compiled ``libbwapy`` module's public names."""

import bwapy.libbwapy as libbwapy
from bwapy.libbwapy import *  # noqa: F403

# ``__all__`` has to be a sequence of names, not the module object itself,
# otherwise ``from bwapy import *`` fails. Mirror what the star import above
# exposes: the extension's own ``__all__`` when it defines one, else every
# name that does not start with an underscore.
__all__ = list(
    getattr(
        libbwapy,
        "__all__",
        [name for name in vars(libbwapy) if not name.startswith("_")],
    )
)
119 changes: 119 additions & 0 deletions bwapy/libbwapy.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
# cython: language_level=3

from libc.stdint cimport uint8_t, uint64_t, uint16_t, uint32_t, int64_t, int32_t
from libc.stdio cimport FILE

# C function declared in libbwapy_utils.h (presumably this project's own
# helper header -- confirm), made callable from Cython.
cdef extern from "libbwapy_utils.h":
    void bwa_cal_pac_pos_with_bwt(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, int max_mm,
                                  float fnr, bwt_t *bwt)

# File seek/read helpers declared in utils.h.
# NOTE(review): the err_ prefix suggests these handle I/O errors themselves
# rather than returning them to the caller -- confirm against the header.
cdef extern from "utils.h":
    int err_fseek(FILE *stream, long offset, int whence)
    size_t err_fread_noeof(void *ptr, size_t size, size_t nmemb, FILE *stream)

# A 256-entry lookup table and a function taken from bntseq.h.
cdef extern from "bntseq.h":
    # Indexed by an unsigned char value (256 entries).
    unsigned char nst_nt4_table[256]
    int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id)

# Function taken from bwa.h; maps a caller-supplied hint to an index prefix
# string (going by its name -- confirm ownership of the returned buffer).
cdef extern from "bwa.h":
    char * bwa_idx_infer_prefix(const char * hint)

# The bwt_t structure and the functions that restore and destroy it, as
# declared in bwt.h.
cdef extern from "bwt.h":
    # Only the field accessed from Cython is declared here.
    ctypedef struct bwt_t:
        int sa_intv

    bwt_t *bwt_restore_bwt(const char *fn)
    void bwt_restore_sa(const char *fn, bwt_t *bwt)
    void bwt_destroy(bwt_t *bwt)

# Constants, CIGAR accessors, the gap_opt_t options structure and related
# functions taken from bwtaln.h.
cdef extern from "bwtaln.h":
    int BWA_TYPE_NO_MATCH
    int BWA_MODE_LOGGAP
    int BWA_MODE_GAPE

    # Extract the operation and the length from a packed 16-bit CIGAR element.
    int __cigar_op(uint16_t __cigar)
    int __cigar_len(uint16_t __cigar)

    # Each field is declared exactly once (``trim_qual`` used to be listed
    # twice).
    # NOTE(review): the fields from ``sam`` onwards do not look like part of
    # gap_opt_t in upstream lh3/bwa -- confirm that the header being compiled
    # against defines them.
    ctypedef struct gap_opt_t:
        int s_mm
        int s_gapo
        int s_gape
        int mode  # bit 24-31 are the barcode length
        int indel_end_skip
        int max_del_occ
        int max_entries
        float fnr
        int max_diff
        int max_gapo
        int max_gape
        int max_seed_diff
        int seed_len
        int n_threads
        int max_top2
        int trim_qual
        int sam
        char *rg_line
        int n_occ
        int interactive_mode
        int with_md

    gap_opt_t *gap_init_opt()
    void gap_print_opt(const gap_opt_t *opt)

    void seq_reverse(int len, unsigned char *seq, int is_comp)

    # Opaque here: no fields are accessed from Cython.
    ctypedef struct bwt_aln1_t:
        pass

# Structures and functions taken from bntseq.h. Only the struct fields listed
# here are visible to Cython code.
cdef extern from "bntseq.h":
    # One annotation entry, as referenced from bntseq_t.anns.
    ctypedef struct bntann1_t:
        int64_t offset
        int32_t len
        char *name

    ctypedef struct bntseq_t:
        int64_t l_pac
        bntann1_t *anns
        FILE * fp_pac

    # Restore from / release a bntseq_t; bns_restore takes a path prefix.
    bntseq_t * bns_restore(const char * prefix)
    void bns_destroy(bntseq_t *bns)

# String structure taken from kseq.h; only its character buffer is declared.
cdef extern from "kseq.h":
    ctypedef struct kstring_t:
        char *s

# Functions taken from bwase.h that operate on bwa_seq_t records.
cdef extern from "bwase.h":
    void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_main, int n_multi)
    int64_t pos_end(const bwa_seq_t *p)
    void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, unsigned char *_pacseq)
    # Returns a C string and writes an int through ``_nm``.
    # NOTE(review): presumably the MD tag and the NM value -- confirm.
    char *bwa_cal_md1(int n_cigar, uint16_t *cigar, int len, uint64_t pos, unsigned char *seq, uint64_t l_pac, unsigned char *pacseq, kstring_t *str, int *_nm)
    void bwase_initialize()

# The bwa_seq_t read record and the functions that operate on arrays of such
# records, taken from bwtaln.h. Only the fields listed here are visible to
# Cython code.
cdef extern from "bwtaln.h":
    ctypedef struct bwa_seq_t:
        char *name
        uint8_t *seq
        uint8_t *rseq
        uint8_t *qual
        uint32_t len
        uint32_t strand
        uint32_t type
        int mapQ
        int clip_len
        bwt_aln1_t *aln
        int n_aln
        # 64-bit, matching the ``uint64_t pos`` that bwa_cal_md1 takes; a
        # 16-bit declaration makes Cython truncate any position above 65535
        # when the field is read.
        uint64_t pos
        uint16_t *cigar
        int n_cigar
        int tid
        uint32_t full_len
        uint32_t nm
        char *md

    void bwa_free_read_seq(int n_seqs, bwa_seq_t *seqs)

    void bwa_cal_sa_reg_gap(int tid, bwt_t *const bwt, int n_seqs, bwa_seq_t *seqs, const gap_opt_t *opt)
Loading

0 comments on commit 2700943

Please sign in to comment.