Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: remove pysam and bwa warmup hack #65

Merged
merged 5 commits into from
Oct 4, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 18 additions & 21 deletions prymer/offtarget/bwa.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
>>> bwa.map_all(queries=[query])
[BwaResult(query=Query(id='NA', bases='AAAAAA'), hit_count=3968, hits=[])]
>>> bwa.close()
True
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.


```
""" # noqa: E501
Expand All @@ -45,10 +46,10 @@
from typing import cast

import pysam
from fgpyo import sam
from fgpyo import sequence
from fgpyo.sam import Cigar
from pysam import AlignedSegment
from pysam import AlignmentHeader

from prymer.api import coordmath
from prymer.util.executable_runner import ExecutableRunner
Expand Down Expand Up @@ -201,6 +202,7 @@ class BwaAlnInteractive(ExecutableRunner):
reverse_complement: reverse complement each query sequence before alignment.
include_alt_hits: if True include hits to references with names ending in _alt, otherwise
do not include them.
header: the SAM alignment header.
clintval marked this conversation as resolved.
Show resolved Hide resolved
"""

def __init__(
Expand Down Expand Up @@ -284,20 +286,14 @@ def __init__(

super().__init__(command=command)

# HACK ALERT
# This is a hack. By trial and error, pysam.AlignmentFile() will block reading unless
# there's at least a few records due to htslib wanting to read a few records for format
# auto-detection. Lame. So a hundred queries are sent to the aligner to align enable the
# htslib auto-detection to complete, and for us to be able to read using pysam.
num_warmup: int = 100
for i in range(num_warmup):
query = Query(id=f"ignoreme:{i}", bases="A" * 100)
fastq_str = query.to_fastq(reverse_complement=self.reverse_complement)
self._subprocess.stdin.write(fastq_str)
self.__signal_bwa() # forces the input to be sent to the underlying process.
self._reader = sam.reader(path=self._subprocess.stdout, file_type=sam.SamFileType.SAM)
for _ in range(num_warmup):
next(self._reader)
header = []
for line in self._subprocess.stdout:
if line.startswith("@"):
header.append(line)
if line.startswith("@PG"):
break
clintval marked this conversation as resolved.
Show resolved Hide resolved

self.header = AlignmentHeader.from_text("".join(header))

def __signal_bwa(self) -> None:
"""Signals BWA to process the queries"""
Expand Down Expand Up @@ -334,13 +330,18 @@ def map_all(self, queries: list[Query]) -> list[BwaResult]:
for query in queries:
fastq_str = query.to_fastq(reverse_complement=self.reverse_complement)
self._subprocess.stdin.write(fastq_str)
self.__signal_bwa() # forces the input to be sent to the underlying process.

# Force the input to be sent to the underlying process.
self.__signal_bwa()

# Read back the results
results: list[BwaResult] = []
for query in queries:
# get the next alignment and convert to a result
results.append(self._to_result(query=query, rec=next(self._reader)))
line: str = next(self._subprocess.stdout).strip()
clintval marked this conversation as resolved.
Show resolved Hide resolved
assert not line.startswith("@"), f"SAM record must not start with '@'! {line}"
alignment = AlignedSegment.fromstring(line, self.header)
results.append(self._to_result(query=query, rec=alignment))

return results

Expand Down Expand Up @@ -412,7 +413,3 @@ def to_hits(self, rec: AlignedSegment) -> list[BwaHit]:
hits = [hit for hit in hits if not hit.refname.endswith("_alt")]

return hits

def close(self) -> None:
self._reader.close()
super().close()
Comment on lines -416 to -418
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Although we no longer need to override this method, it should have conformed to the same shape as the superclass:

def close(self) -> bool:
    self._reader.close()
    return super().close()

18 changes: 18 additions & 0 deletions tests/offtarget/test_bwa.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
from dataclasses import replace
from pathlib import Path
from tempfile import NamedTemporaryFile
from typing import TypeAlias
from typing import cast

import pytest
from fgpyo.sam import Cigar
Expand All @@ -12,6 +14,9 @@
from prymer.offtarget.bwa import BwaHit
from prymer.offtarget.bwa import Query

SamHeaderType: TypeAlias = dict[str, dict[str, str] | list[dict[str, str]]]
"""A type alias for a SAM sequence header dictionary."""


@pytest.mark.parametrize("bases", [None, ""])
def test_query_no_bases(bases: str | None) -> None:
Expand Down Expand Up @@ -68,6 +73,19 @@ def test_hit_build_rc() -> None:
assert hit_r.negative is True


def test_header_is_properly_constructed(ref_fasta: Path) -> None:
"""Tests that bwa will return a properly constructed header."""
with BwaAlnInteractive(ref=ref_fasta, max_hits=1) as bwa:
header: SamHeaderType = bwa.header.to_dict()
assert set(header.keys()) == {"HD", "SQ", "PG"}
assert header["HD"] == {"GO": "query", "SO": "unsorted", "VN": "1.5"}
assert header["SQ"] == [{"LN": 10001, "SN": "chr1"}]
assert len(header["PG"]) == 1
program_group: dict[str, str] = cast(list[dict[str, str]], header["PG"])[0]
program_group["ID"] = "bwa"
program_group["PN"] = "bwa"
clintval marked this conversation as resolved.
Show resolved Hide resolved


def test_map_one_uniquely_mapped(ref_fasta: Path) -> None:
"""Tests that bwa maps one hit when a query uniquely maps."""
query = Query(bases="TCTACTAAAAATACAAAAAATTAGCTGGGCATGATGGCATGCACCTGTAATCCCGCTACT", id="NA")
Expand Down
Loading