From 2d719493a2cf4898aa6ac92e45248630cd2c1fb4 Mon Sep 17 00:00:00 2001 From: clintval Date: Tue, 25 Jun 2024 16:38:25 -0700 Subject: [PATCH 01/17] Allow the OverlapDetector to be generic over input feature types --- pybedlite/overlap_detector.py | 171 +++++++++++++++++++---- pybedlite/tests/test_overlap_detector.py | 71 +++++++++- 2 files changed, 211 insertions(+), 31 deletions(-) diff --git a/pybedlite/overlap_detector.py b/pybedlite/overlap_detector.py index 61d7ee9..25e549f 100644 --- a/pybedlite/overlap_detector.py +++ b/pybedlite/overlap_detector.py @@ -41,12 +41,17 @@ import itertools from pathlib import Path from typing import Dict +from typing import Generic +from typing import Hashable from typing import Iterable from typing import Iterator from typing import List from typing import Optional +from typing import Protocol from typing import Set from typing import Type +from typing import TypeVar +from typing import Union import attr @@ -56,6 +61,52 @@ from pybedlite.bed_source import BedSource +class _Span(Protocol): + """A span with a start and an end. 0-based open-ended.""" + + @property + def start(self) -> int: + """A 0-based start position.""" + + @property + def end(self) -> int: + """A 0-based open-ended position.""" + + +class _GenomicSpanWithChrom(_Span, Hashable, Protocol): + """A genomic feature where reference sequence is accessed with `chrom`.""" + + @property + def chrom(self) -> str: + """A reference sequence name.""" + + +class _GenomicSpanWithContig(_Span, Hashable, Protocol): + """A genomic feature where reference sequence is accessed with `contig`.""" + + @property + def contig(self) -> str: + """A reference sequence name.""" + + +class _GenomicSpanWithRefName(_Span, Hashable, Protocol): + """A genomic feature where reference sequence is accessed with `refname`.""" + + @property + def refname(self) -> str: + """A reference sequence name.""" + + +GenomicSpan = TypeVar( + "GenomicSpan", + bound=Union[_GenomicSpanWithChrom, _GenomicSpanWithContig, _GenomicSpanWithRefName], +) +""" +A genomic feature where the reference sequence name is accessed with any of the 3 most common +property names ("chrom", "contig", "refname"). +""" + + @attr.s(frozen=True, auto_attribs=True) class Interval: """A region mapping to the genome that is 0-based and open-ended @@ -126,36 +177,94 @@ def from_bedrecord(cls: Type["Interval"], record: BedRecord) -> "Interval": ) -class OverlapDetector(Iterable[Interval]): +_GenericGenomicSpan = TypeVar( + "_GenericGenomicSpan", + bound=Union[_GenomicSpanWithChrom, _GenomicSpanWithContig, _GenomicSpanWithRefName], +) +""" +A generic genomic feature where the reference sequence name is accessed with any of the 3 most +common property names ("chrom", "contig", "refname"). This type variable is used for describing the +generic type contained within the :class:`~pybedlite.overlap_detector.OverlapDetector`. +""" + + +class OverlapDetector(Generic[_GenericGenomicSpan], Iterable[_GenericGenomicSpan]): """Detects and returns overlaps between a set of genomic regions and another genomic region. - Since :class:`~pybedlite.overlap_detector.Interval` objects are used both to populate the - overlap detector and to query it, the coordinate system in use is also 0-based open-ended. + The overlap detector may contain any interval-like Python objects that have the following + properties: + + * `chrom` or `contig` or `refname`: The reference sequence name + * `start`: A 0-based start position + * `end`: A 0-based exclusive end position + + Interval-like Python objects may also contain strandedness information which will be used + for sorting them in :func:`~pybedlite.overlap_detector.OverlapDetector.get_overlaps` using + either of the following properties if they are present: + + * `negative (bool)`: Whether or not the feature is negative stranded or not + * `strand (BedStrand)`: The BED strand of the feature + * `strand (str)`: The strand of the feature (`"-"` for negative) The same interval may be added multiple times, but only a single instance will be returned - when querying for overlaps. Intervals with the same coordinates but different names are - treated as different intervals. + when querying for overlaps. This detector is the most efficient when all intervals are added ahead of time. """ - def __init__(self) -> None: + def __init__(self, intervals: Optional[Iterable[_GenericGenomicSpan]] = None) -> None: # A mapping from the contig/chromosome name to the associated interval tree self._refname_to_tree: Dict[str, cr.cgranges] = {} # type: ignore self._refname_to_indexed: Dict[str, bool] = {} - self._refname_to_intervals: Dict[str, List[Interval]] = {} + self._refname_to_intervals: Dict[str, List[_GenericGenomicSpan]] = {} + if intervals is not None: + self.add_all(intervals) - def __iter__(self) -> Iterator[Interval]: + def __iter__(self) -> Iterator[_GenericGenomicSpan]: """Iterates over the intervals in the overlap detector.""" return itertools.chain(*self._refname_to_intervals.values()) - def add(self, interval: Interval) -> None: + @staticmethod + def _reference_sequence_name(interval: GenomicSpan) -> str: + """Return the reference name of a given interval.""" + if isinstance(interval, Interval) or hasattr(interval, "refname"): + return interval.refname + elif isinstance(interval, BedRecord) or hasattr(interval, "chrom"): + return interval.chrom + elif hasattr(interval, "contig"): + return interval.contig + else: + raise ValueError( + f"Genomic span is missing a reference sequence name property: {interval}" + ) + + @staticmethod + def _is_negative(interval: GenomicSpan) -> bool: + """Determine if this is a negative stranded interval or not.""" + return ( + (hasattr(interval, "negative") and interval.negative) + or ( + hasattr(interval, "strand") + and isinstance(interval.strand, BedStrand) + and interval.strand is BedStrand.Negative + ) + or ( + hasattr(interval, "strand") + and isinstance(interval.strand, str) + and interval.strand == "-" + ) + ) + + def add(self, interval: _GenericGenomicSpan) -> None: """Adds an interval to this detector. Args: interval: the interval to add to this detector """ - refname = interval.refname + if not isinstance(interval, Hashable): + raise ValueError(f"Interval feature is not hashable but should be: {interval}") + + refname = self._reference_sequence_name(interval) if refname not in self._refname_to_tree: self._refname_to_tree[refname] = cr.cgranges() # type: ignore self._refname_to_indexed[refname] = False @@ -168,13 +277,13 @@ def add(self, interval: Interval) -> None: # Add the interval to the tree tree = self._refname_to_tree[refname] - tree.add(interval.refname, interval.start, interval.end, interval_idx) + tree.add(refname, interval.start, interval.end, interval_idx) # Flag this tree as needing to be indexed after adding a new interval, but defer # indexing self._refname_to_indexed[refname] = False - def add_all(self, intervals: Iterable[Interval]) -> None: + def add_all(self, intervals: Iterable[_GenericGenomicSpan]) -> None: """Adds one or more intervals to this detector. Args: @@ -183,7 +292,7 @@ def add_all(self, intervals: Iterable[Interval]) -> None: for interval in intervals: self.add(interval) - def overlaps_any(self, interval: Interval) -> bool: + def overlaps_any(self, interval: GenomicSpan) -> bool: """Determines whether the given interval overlaps any interval in this detector. Args: @@ -193,20 +302,21 @@ def overlaps_any(self, interval: Interval) -> bool: True if and only if the given interval overlaps with any interval in this detector. """ - tree = self._refname_to_tree.get(interval.refname) + refname = self._reference_sequence_name(interval) + tree = self._refname_to_tree.get(refname) if tree is None: return False else: - if not self._refname_to_indexed[interval.refname]: + if not self._refname_to_indexed[refname]: tree.index() try: - next(iter(tree.overlap(interval.refname, interval.start, interval.end))) + next(iter(tree.overlap(refname, interval.start, interval.end))) except StopIteration: return False else: return True - def get_overlaps(self, interval: Interval) -> List[Interval]: + def get_overlaps(self, interval: GenomicSpan) -> List[_GenericGenomicSpan]: """Returns any intervals in this detector that overlap the given interval. Args: @@ -216,23 +326,30 @@ def get_overlaps(self, interval: Interval) -> List[Interval]: The list of intervals in this detector that overlap the given interval, or the empty list if no overlaps exist. The intervals will be return in ascending genomic order. """ - tree = self._refname_to_tree.get(interval.refname) + refname = self._reference_sequence_name(interval) + tree = self._refname_to_tree.get(refname) if tree is None: return [] else: - if not self._refname_to_indexed[interval.refname]: + if not self._refname_to_indexed[refname]: tree.index() - ref_intervals: List[Interval] = self._refname_to_intervals[interval.refname] + ref_intervals: List[_GenericGenomicSpan] = self._refname_to_intervals[refname] # NB: only return unique instances of intervals - intervals: Set[Interval] = { + intervals: Set[_GenericGenomicSpan] = { ref_intervals[index] - for _, _, index in tree.overlap(interval.refname, interval.start, interval.end) + for _, _, index in tree.overlap(refname, interval.start, interval.end) } return sorted( - intervals, key=lambda intv: (intv.start, intv.end, intv.negative, intv.name) + intervals, + key=lambda intv: ( + intv.start, + intv.end, + self._is_negative(intv), + self._reference_sequence_name(intv), + ), ) - def get_enclosing_intervals(self, interval: Interval) -> List[Interval]: + def get_enclosing_intervals(self, interval: GenomicSpan) -> List[_GenericGenomicSpan]: """Returns the set of intervals in this detector that wholly enclose the query interval. i.e. query.start >= target.start and query.end <= target.end. @@ -245,7 +362,7 @@ def get_enclosing_intervals(self, interval: Interval) -> List[Interval]: results = self.get_overlaps(interval) return [i for i in results if interval.start >= i.start and interval.end <= i.end] - def get_enclosed(self, interval: Interval) -> List[Interval]: + def get_enclosed(self, interval: GenomicSpan) -> List[_GenericGenomicSpan]: """Returns the set of intervals in this detector that are enclosed by the query interval. I.e. target.start >= query.start and target.end <= query.end. @@ -267,9 +384,9 @@ def from_bed(cls, path: Path) -> "OverlapDetector": Returns: An overlap detector for the regions in the BED file. """ - detector = OverlapDetector() + detector: OverlapDetector[BedRecord] = OverlapDetector() for region in BedSource(path): - detector.add(Interval.from_bedrecord(region)) + detector.add(region) return detector diff --git a/pybedlite/tests/test_overlap_detector.py b/pybedlite/tests/test_overlap_detector.py index dae34a9..2fcc526 100644 --- a/pybedlite/tests/test_overlap_detector.py +++ b/pybedlite/tests/test_overlap_detector.py @@ -1,6 +1,11 @@ """Tests for :py:mod:`~pybedlite.overlap_detector`""" +from dataclasses import dataclass from typing import List +from typing import Union + +import pytest +from typing_extensions import TypeAlias from pybedlite.bed_record import BedRecord from pybedlite.bed_record import BedStrand @@ -9,7 +14,7 @@ def run_test(targets: List[Interval], query: Interval, results: List[Interval]) -> None: - detector = OverlapDetector() + detector: OverlapDetector[Interval] = OverlapDetector() # Use add_all() to covert itself and add() detector.add_all(intervals=targets) # Test overlaps_any() @@ -113,7 +118,7 @@ def test_get_enclosing_intervals() -> None: d = Interval("1", 15, 19) e = Interval("1", 16, 20) - detector = OverlapDetector() + detector: OverlapDetector[Interval] = OverlapDetector() detector.add_all([a, b, c, d, e]) assert detector.get_enclosing_intervals(Interval("1", 10, 100)) == [a] @@ -128,7 +133,7 @@ def test_get_enclosed() -> None: c = Interval("1", 18, 19) d = Interval("1", 50, 99) - detector = OverlapDetector() + detector: OverlapDetector[Interval] = OverlapDetector() detector.add_all([a, b, c, d]) assert detector.get_enclosed(Interval("1", 1, 250)) == [a, b, c, d] @@ -145,7 +150,7 @@ def test_iterable() -> None: d = Interval("1", 15, 19) e = Interval("1", 16, 20) - detector = OverlapDetector() + detector: OverlapDetector[Interval] = OverlapDetector() detector.add_all([a]) assert list(detector) == [a] detector.add_all([a, b, c, d, e]) @@ -188,3 +193,61 @@ def test_construction_from_interval(bed_records: List[BedRecord]) -> None: assert new_record.strand is BedStrand.Positive else: assert new_record.strand is record.strand + + +def test_arbitrary_interval_types() -> None: + """ + Test that an overlap detector can receive different interval-like objects and query them too. + """ + + @dataclass(eq=True, frozen=True) + class ChromFeature: + chrom: str + start: int + end: int + + @dataclass(eq=True, frozen=True) + class ContigFeature: + contig: str + start: int + end: int + + @dataclass(eq=True, frozen=True) + class RefnameFeature: + refname: str + start: int + end: int + + # Create minimal features of all supported structural types + chrom_feature = ChromFeature(chrom="chr1", start=0, end=30) + contig_feature = ContigFeature(contig="chr1", start=10, end=40) + refname_feature = RefnameFeature(refname="chr1", start=20, end=50) + + # Setup an overlap detector to hold all the features we care about + AllKinds: TypeAlias = Union[ChromFeature, ContigFeature, RefnameFeature] + features: List[AllKinds] = [chrom_feature, contig_feature, refname_feature] + detector: OverlapDetector[AllKinds] = OverlapDetector(features) + + # Query the overlap detector with yet another type + assert detector.get_overlaps(Interval("chr1", 0, 1)) == [chrom_feature] + assert detector.get_overlaps(Interval("chr1", 25, 26)) == [ + chrom_feature, + contig_feature, + refname_feature, + ] + assert detector.get_overlaps(Interval("chr1", 45, 46)) == [refname_feature] + + +def test_the_overlap_detector_wont_accept_a_non_hashable_feature() -> None: + """ + Test that an overlap detector will not accept a non-hashable feature. + """ + + @dataclass # A dataclass missing both `eq` and `frozen` does not implement __hash__. + class ChromFeature: + chrom: str + start: int + end: int + + with pytest.raises(ValueError): + OverlapDetector([ChromFeature(chrom="chr1", start=0, end=30)]) From 13b7834c09db7c7980f35e82fed742d4655b0e92 Mon Sep 17 00:00:00 2001 From: Nils Homer Date: Thu, 1 Aug 2024 06:17:18 -0700 Subject: [PATCH 02/17] Keep OverlapDetector generic but normalize the getters for BedRecord and Interval (#36) --- pybedlite/bed_record.py | 11 +- pybedlite/overlap_detector.py | 145 +++++++++-------------- pybedlite/tests/test_overlap_detector.py | 124 +++++++++++++++---- 3 files changed, 167 insertions(+), 113 deletions(-) diff --git a/pybedlite/bed_record.py b/pybedlite/bed_record.py index 517852f..ee4f1dc 100644 --- a/pybedlite/bed_record.py +++ b/pybedlite/bed_record.py @@ -27,7 +27,6 @@ if TYPE_CHECKING: from pybedlite.overlap_detector import Interval - """Maximum BED fields that can be present in a well formed BED file written to specification""" MAX_BED_FIELDS: int = 12 @@ -183,6 +182,16 @@ def bed_fields(self) -> List[str]: ), ] + @property + def refname(self) -> str: + """A reference sequence name.""" + return self.chrom + + @property + def negative(self) -> bool: + """True if the interval is on the negative strand, False otherwise""" + return self.strand is BedStrand.Negative + def as_bed_line(self, number_of_output_fields: Optional[int] = None) -> str: """ Converts a BED record to a tab delimited BED line equivalent, including up to the number of diff --git a/pybedlite/overlap_detector.py b/pybedlite/overlap_detector.py index 25e549f..1e584f6 100644 --- a/pybedlite/overlap_detector.py +++ b/pybedlite/overlap_detector.py @@ -2,10 +2,29 @@ Utility Classes for Querying Overlaps with Genomic Regions ---------------------------------------------------------- + +The :class:`~pybedlite.overlap_detector.OverlapDetector` class detects and returns overlaps between +a set of genomic regions and another genomic region. The overlap detector may contain any +interval-like Python objects that have the following properties: + + * `refname` (str): The reference sequence name + * `start` (int): A 0-based start position + * `end` (int): A 0-based exclusive end position + +This is encapsulated in the :class:`~pybedlite.overlap_detector.GenomicSpan` protocol. + +Interval-like Python objects may also contain strandedness information which will be used +for sorting them in :func:`~pybedlite.overlap_detector.OverlapDetector.get_overlaps` using +the following property if it is present, otherwise assumed to be positive stranded: + + * `negative (bool)`: Whether the feature is negative stranded or not + +This is encapsulated in the :class:`~pybedlite.overlap_detector.StrandedGenomicSpan` protocol. + Examples of Detecting Overlaps ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. code-block:: python +. code-block:: python >>> from pybedlite.overlap_detector import Interval, OverlapDetector >>> detector = OverlapDetector() @@ -61,8 +80,15 @@ from pybedlite.bed_source import BedSource -class _Span(Protocol): - """A span with a start and an end. 0-based open-ended.""" +class GenomicSpan(Protocol): + """ + A genomic span which has protected methods that must be implemented by all subclasses to give + a zero-based open-ended genomic span. + """ + + @property + def refname(self) -> str: + """A reference sequence name.""" @property def start(self) -> int: @@ -73,38 +99,10 @@ def end(self) -> int: """A 0-based open-ended position.""" -class _GenomicSpanWithChrom(_Span, Hashable, Protocol): - """A genomic feature where reference sequence is accessed with `chrom`.""" - +class StrandedGenomicSpan(GenomicSpan, Protocol): @property - def chrom(self) -> str: - """A reference sequence name.""" - - -class _GenomicSpanWithContig(_Span, Hashable, Protocol): - """A genomic feature where reference sequence is accessed with `contig`.""" - - @property - def contig(self) -> str: - """A reference sequence name.""" - - -class _GenomicSpanWithRefName(_Span, Hashable, Protocol): - """A genomic feature where reference sequence is accessed with `refname`.""" - - @property - def refname(self) -> str: - """A reference sequence name.""" - - -GenomicSpan = TypeVar( - "GenomicSpan", - bound=Union[_GenomicSpanWithChrom, _GenomicSpanWithContig, _GenomicSpanWithRefName], -) -""" -A genomic feature where the reference sequence name is accessed with any of the 3 most common -property names ("chrom", "contig", "refname"). -""" + def negative(self) -> bool: + """True if the interval is on the negative strand, False otherwise""" @attr.s(frozen=True, auto_attribs=True) @@ -177,23 +175,18 @@ def from_bedrecord(cls: Type["Interval"], record: BedRecord) -> "Interval": ) -_GenericGenomicSpan = TypeVar( - "_GenericGenomicSpan", - bound=Union[_GenomicSpanWithChrom, _GenomicSpanWithContig, _GenomicSpanWithRefName], -) +GenericGenomicSpan = TypeVar("GenericGenomicSpan", bound=Union[GenomicSpan, StrandedGenomicSpan]) """ -A generic genomic feature where the reference sequence name is accessed with any of the 3 most -common property names ("chrom", "contig", "refname"). This type variable is used for describing the +A generic genomic feature. This type variable is used for describing the generic type contained within the :class:`~pybedlite.overlap_detector.OverlapDetector`. """ -class OverlapDetector(Generic[_GenericGenomicSpan], Iterable[_GenericGenomicSpan]): +class OverlapDetector(Generic[GenericGenomicSpan], Iterable[GenericGenomicSpan]): """Detects and returns overlaps between a set of genomic regions and another genomic region. The overlap detector may contain any interval-like Python objects that have the following properties: - * `chrom` or `contig` or `refname`: The reference sequence name * `start`: A 0-based start position * `end`: A 0-based exclusive end position @@ -201,7 +194,6 @@ class OverlapDetector(Generic[_GenericGenomicSpan], Iterable[_GenericGenomicSpan Interval-like Python objects may also contain strandedness information which will be used for sorting them in :func:`~pybedlite.overlap_detector.OverlapDetector.get_overlaps` using either of the following properties if they are present: - * `negative (bool)`: Whether or not the feature is negative stranded or not * `strand (BedStrand)`: The BED strand of the feature * `strand (str)`: The strand of the feature (`"-"` for negative) @@ -212,50 +204,19 @@ class OverlapDetector(Generic[_GenericGenomicSpan], Iterable[_GenericGenomicSpan This detector is the most efficient when all intervals are added ahead of time. """ - def __init__(self, intervals: Optional[Iterable[_GenericGenomicSpan]] = None) -> None: + def __init__(self, intervals: Optional[Iterable[GenericGenomicSpan]] = None) -> None: # A mapping from the contig/chromosome name to the associated interval tree self._refname_to_tree: Dict[str, cr.cgranges] = {} # type: ignore self._refname_to_indexed: Dict[str, bool] = {} - self._refname_to_intervals: Dict[str, List[_GenericGenomicSpan]] = {} + self._refname_to_intervals: Dict[str, List[GenericGenomicSpan]] = {} if intervals is not None: self.add_all(intervals) - def __iter__(self) -> Iterator[_GenericGenomicSpan]: + def __iter__(self) -> Iterator[GenericGenomicSpan]: """Iterates over the intervals in the overlap detector.""" return itertools.chain(*self._refname_to_intervals.values()) - @staticmethod - def _reference_sequence_name(interval: GenomicSpan) -> str: - """Return the reference name of a given interval.""" - if isinstance(interval, Interval) or hasattr(interval, "refname"): - return interval.refname - elif isinstance(interval, BedRecord) or hasattr(interval, "chrom"): - return interval.chrom - elif hasattr(interval, "contig"): - return interval.contig - else: - raise ValueError( - f"Genomic span is missing a reference sequence name property: {interval}" - ) - - @staticmethod - def _is_negative(interval: GenomicSpan) -> bool: - """Determine if this is a negative stranded interval or not.""" - return ( - (hasattr(interval, "negative") and interval.negative) - or ( - hasattr(interval, "strand") - and isinstance(interval.strand, BedStrand) - and interval.strand is BedStrand.Negative - ) - or ( - hasattr(interval, "strand") - and isinstance(interval.strand, str) - and interval.strand == "-" - ) - ) - - def add(self, interval: _GenericGenomicSpan) -> None: + def add(self, interval: GenericGenomicSpan) -> None: """Adds an interval to this detector. Args: @@ -264,7 +225,7 @@ def add(self, interval: _GenericGenomicSpan) -> None: if not isinstance(interval, Hashable): raise ValueError(f"Interval feature is not hashable but should be: {interval}") - refname = self._reference_sequence_name(interval) + refname = interval.refname if refname not in self._refname_to_tree: self._refname_to_tree[refname] = cr.cgranges() # type: ignore self._refname_to_indexed[refname] = False @@ -283,7 +244,7 @@ def add(self, interval: _GenericGenomicSpan) -> None: # indexing self._refname_to_indexed[refname] = False - def add_all(self, intervals: Iterable[_GenericGenomicSpan]) -> None: + def add_all(self, intervals: Iterable[GenericGenomicSpan]) -> None: """Adds one or more intervals to this detector. Args: @@ -302,7 +263,7 @@ def overlaps_any(self, interval: GenomicSpan) -> bool: True if and only if the given interval overlaps with any interval in this detector. """ - refname = self._reference_sequence_name(interval) + refname = interval.refname tree = self._refname_to_tree.get(refname) if tree is None: return False @@ -316,7 +277,7 @@ def overlaps_any(self, interval: GenomicSpan) -> bool: else: return True - def get_overlaps(self, interval: GenomicSpan) -> List[_GenericGenomicSpan]: + def get_overlaps(self, interval: GenomicSpan) -> List[GenericGenomicSpan]: """Returns any intervals in this detector that overlap the given interval. Args: @@ -326,16 +287,16 @@ def get_overlaps(self, interval: GenomicSpan) -> List[_GenericGenomicSpan]: The list of intervals in this detector that overlap the given interval, or the empty list if no overlaps exist. The intervals will be return in ascending genomic order. """ - refname = self._reference_sequence_name(interval) + refname = interval.refname tree = self._refname_to_tree.get(refname) if tree is None: return [] else: if not self._refname_to_indexed[refname]: tree.index() - ref_intervals: List[_GenericGenomicSpan] = self._refname_to_intervals[refname] + ref_intervals: List[GenericGenomicSpan] = self._refname_to_intervals[refname] # NB: only return unique instances of intervals - intervals: Set[_GenericGenomicSpan] = { + intervals: Set[GenericGenomicSpan] = { ref_intervals[index] for _, _, index in tree.overlap(refname, interval.start, interval.end) } @@ -344,14 +305,18 @@ def get_overlaps(self, interval: GenomicSpan) -> List[_GenericGenomicSpan]: key=lambda intv: ( intv.start, intv.end, - self._is_negative(intv), - self._reference_sequence_name(intv), + self._negative(intv), + intv.refname, ), ) - def get_enclosing_intervals(self, interval: GenomicSpan) -> List[_GenericGenomicSpan]: + @staticmethod + def _negative(interval: GenomicSpan) -> bool: + return getattr(interval, "negative", False) + + def get_enclosing_intervals(self, interval: GenomicSpan) -> List[GenericGenomicSpan]: """Returns the set of intervals in this detector that wholly enclose the query interval. - i.e. query.start >= target.start and query.end <= target.end. + i.e. `query.start >= target.start` and `query.end <= target.end`. Args: interval: the query interval @@ -362,7 +327,7 @@ def get_enclosing_intervals(self, interval: GenomicSpan) -> List[_GenericGenomic results = self.get_overlaps(interval) return [i for i in results if interval.start >= i.start and interval.end <= i.end] - def get_enclosed(self, interval: GenomicSpan) -> List[_GenericGenomicSpan]: + def get_enclosed(self, interval: GenomicSpan) -> List[GenericGenomicSpan]: """Returns the set of intervals in this detector that are enclosed by the query interval. I.e. target.start >= query.start and target.end <= query.end. diff --git a/pybedlite/tests/test_overlap_detector.py b/pybedlite/tests/test_overlap_detector.py index 2fcc526..38e3bd5 100644 --- a/pybedlite/tests/test_overlap_detector.py +++ b/pybedlite/tests/test_overlap_detector.py @@ -201,41 +201,111 @@ def test_arbitrary_interval_types() -> None: """ @dataclass(eq=True, frozen=True) - class ChromFeature: - chrom: str + class ZeroBasedOpenEndedProtocol: + refname: str start: int end: int + @property + def negative(self) -> bool: + return False + @dataclass(eq=True, frozen=True) - class ContigFeature: + class OneBasedProtocol: contig: str - start: int + one_based_start: int + end: int + + @property + def refname(self) -> str: + return self.contig + + @property + def start(self) -> int: + """A 0-based start position.""" + return self.one_based_start - 1 + + @property + def negative(self) -> bool: + """True if the interval is on the negative strand, False otherwise""" + return False + + @dataclass(eq=True, frozen=True) + class ZeroBasedUnstranded: + refname: str + zero_based_start: int end: int + @property + def start(self) -> int: + """A 0-based start position.""" + return self.zero_based_start + @dataclass(eq=True, frozen=True) - class RefnameFeature: + class ZeroBasedStranded: refname: str - start: int + zero_based_start: int end: int + negative: bool - # Create minimal features of all supported structural types - chrom_feature = ChromFeature(chrom="chr1", start=0, end=30) - contig_feature = ContigFeature(contig="chr1", start=10, end=40) - refname_feature = RefnameFeature(refname="chr1", start=20, end=50) + @property + def start(self) -> int: + """A 0-based start position.""" + return self.zero_based_start - # Setup an overlap detector to hold all the features we care about - AllKinds: TypeAlias = Union[ChromFeature, ContigFeature, RefnameFeature] - features: List[AllKinds] = [chrom_feature, contig_feature, refname_feature] + # Create minimal features of all supported structural types + zero_based_protocol = ZeroBasedOpenEndedProtocol(refname="chr1", start=1, end=50) + one_based_protocol = OneBasedProtocol(contig="chr1", one_based_start=10, end=60) + zero_based_unstranded = ZeroBasedUnstranded(refname="chr1", zero_based_start=20, end=70) + zero_based_stranded = ZeroBasedStranded( + refname="chr1", zero_based_start=30, end=80, negative=True + ) + # Set up an overlap detector to hold all the features we care about + AllKinds: TypeAlias = Union[ + ZeroBasedOpenEndedProtocol, OneBasedProtocol, ZeroBasedUnstranded, ZeroBasedStranded + ] + features: List[AllKinds] = [ + zero_based_protocol, + one_based_protocol, + zero_based_unstranded, + zero_based_stranded, + ] detector: OverlapDetector[AllKinds] = OverlapDetector(features) + assert OverlapDetector._negative(zero_based_protocol) is False + assert OverlapDetector._negative(one_based_protocol) is False + assert OverlapDetector._negative(zero_based_unstranded) is False + assert OverlapDetector._negative(zero_based_stranded) is True + # Query the overlap detector with yet another type - assert detector.get_overlaps(Interval("chr1", 0, 1)) == [chrom_feature] - assert detector.get_overlaps(Interval("chr1", 25, 26)) == [ - chrom_feature, - contig_feature, - refname_feature, + assert detector.get_overlaps(Interval("chr1", 0, 1)) == [] + assert detector.get_overlaps(Interval("chr1", 0, 9)) == [zero_based_protocol] + assert detector.get_overlaps(Interval("chr1", 11, 12)) == [ + zero_based_protocol, + one_based_protocol, + ] + assert detector.get_overlaps(Interval("chr1", 21, 27)) == [ + zero_based_protocol, + one_based_protocol, + zero_based_unstranded, + ] + assert detector.get_overlaps(Interval("chr1", 32, 35)) == [ + zero_based_protocol, + one_based_protocol, + zero_based_unstranded, + zero_based_stranded, ] - assert detector.get_overlaps(Interval("chr1", 45, 46)) == [refname_feature] + assert detector.get_overlaps(Interval("chr1", 54, 55)) == [ + one_based_protocol, + zero_based_unstranded, + zero_based_stranded, + ] + assert detector.get_overlaps(Interval("chr1", 61, 62)) == [ + zero_based_unstranded, + zero_based_stranded, + ] + assert detector.get_overlaps(Interval("chr1", 78, 79)) == [zero_based_stranded] + assert detector.get_overlaps(Interval("chr1", 80, 81)) == [] def test_the_overlap_detector_wont_accept_a_non_hashable_feature() -> None: @@ -245,9 +315,19 @@ def test_the_overlap_detector_wont_accept_a_non_hashable_feature() -> None: @dataclass # A dataclass missing both `eq` and `frozen` does not implement __hash__. class ChromFeature: - chrom: str - start: int + refname: str + zero_based_start: int end: int + @property + def start(self) -> int: + """A 0-based start position.""" + return self.zero_based_start + + @property + def negative(self) -> bool: + """True if the interval is on the negative strand, False otherwise""" + return False + with pytest.raises(ValueError): - OverlapDetector([ChromFeature(chrom="chr1", start=0, end=30)]) + OverlapDetector([ChromFeature(refname="chr1", zero_based_start=0, end=30)]) From 04382f213519405cbe665fd6a0b12d759cfc212a Mon Sep 17 00:00:00 2001 From: clintval Date: Thu, 1 Aug 2024 09:21:55 -0400 Subject: [PATCH 03/17] chore: fixup stale docs --- pybedlite/bed_record.py | 2 +- pybedlite/overlap_detector.py | 22 +++++++++++++--------- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/pybedlite/bed_record.py b/pybedlite/bed_record.py index ee4f1dc..c64fa2e 100644 --- a/pybedlite/bed_record.py +++ b/pybedlite/bed_record.py @@ -184,7 +184,7 @@ def bed_fields(self) -> List[str]: @property def refname(self) -> str: - """A reference sequence name.""" + """The reference name of the interval described by the record.""" return self.chrom @property diff --git a/pybedlite/overlap_detector.py b/pybedlite/overlap_detector.py index 1e584f6..2b57589 100644 --- a/pybedlite/overlap_detector.py +++ b/pybedlite/overlap_detector.py @@ -80,10 +80,9 @@ from pybedlite.bed_source import BedSource -class GenomicSpan(Protocol): +class GenomicSpan(Hashable, Protocol): """ - A genomic span which has protected methods that must be implemented by all subclasses to give - a zero-based open-ended genomic span. + A structural type for a span on a reference sequence with zero-based open-ended coordinates. """ @property @@ -100,6 +99,11 @@ def end(self) -> int: class StrandedGenomicSpan(GenomicSpan, Protocol): + """ + A structural type for a stranded span on a reference sequence with zero-based open-ended + coordinates. + """ + @property def negative(self) -> bool: """True if the interval is on the negative strand, False otherwise""" @@ -177,8 +181,8 @@ def from_bedrecord(cls: Type["Interval"], record: BedRecord) -> "Interval": GenericGenomicSpan = TypeVar("GenericGenomicSpan", bound=Union[GenomicSpan, StrandedGenomicSpan]) """ -A generic genomic feature. This type variable is used for describing the -generic type contained within the :class:`~pybedlite.overlap_detector.OverlapDetector`. +A generic genomic span. This type variable is used for describing the generic type contained within +the :class:`~pybedlite.overlap_detector.OverlapDetector`. """ @@ -187,16 +191,16 @@ class OverlapDetector(Generic[GenericGenomicSpan], Iterable[GenericGenomicSpan]) The overlap detector may contain any interval-like Python objects that have the following properties: - * `chrom` or `contig` or `refname`: The reference sequence name + + * `refname`: The reference sequence name * `start`: A 0-based start position * `end`: A 0-based exclusive end position Interval-like Python objects may also contain strandedness information which will be used for sorting them in :func:`~pybedlite.overlap_detector.OverlapDetector.get_overlaps` using - either of the following properties if they are present: + the following property if it is present: + * `negative (bool)`: Whether or not the feature is negative stranded or not - * `strand (BedStrand)`: The BED strand of the feature - * `strand (str)`: The strand of the feature (`"-"` for negative) The same interval may be added multiple times, but only a single instance will be returned when querying for overlaps. From e536b9b5b6fda33fb9d588b71c05672a28de5eba Mon Sep 17 00:00:00 2001 From: clintval Date: Thu, 1 Aug 2024 09:29:04 -0400 Subject: [PATCH 04/17] chore: small fixups --- .gitignore | 4 +++- .gitmodules | 1 + pybedlite/overlap_detector.py | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 7e54d62..f2813da 100644 --- a/.gitignore +++ b/.gitignore @@ -129,4 +129,6 @@ dmypy.json .pyre/ # VSCode configurations -.vscode/ \ No newline at end of file +.vscode/ + +.DS_Store diff --git a/.gitmodules b/.gitmodules index 0a7daea..179f373 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,4 @@ [submodule "cgranges"] path = cgranges url = https://github.com/lh3/cgranges + ignore = dirty diff --git a/pybedlite/overlap_detector.py b/pybedlite/overlap_detector.py index 2b57589..03c76a7 100644 --- a/pybedlite/overlap_detector.py +++ b/pybedlite/overlap_detector.py @@ -24,7 +24,7 @@ Examples of Detecting Overlaps ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -. code-block:: python +.. code-block:: python >>> from pybedlite.overlap_detector import Interval, OverlapDetector >>> detector = OverlapDetector() From 01b09e2f74d6b7924bccb304428f48d5a9a39ddb Mon Sep 17 00:00:00 2001 From: clintval Date: Thu, 1 Aug 2024 09:35:28 -0400 Subject: [PATCH 05/17] feat: refine type of from_bed() --- pybedlite/overlap_detector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pybedlite/overlap_detector.py b/pybedlite/overlap_detector.py index 60fab1c..650657c 100644 --- a/pybedlite/overlap_detector.py +++ b/pybedlite/overlap_detector.py @@ -398,7 +398,7 @@ def get_enclosed(self, interval: GenomicSpan) -> List[GenericGenomicSpan]: return [i for i in results if i.start >= interval.start and i.end <= interval.end] @classmethod - def from_bed(cls, path: Path) -> "OverlapDetector": + def from_bed(cls, path: Path) -> "OverlapDetector[BedRecord]": """Builds a :class:`~pybedlite.overlap_detector.OverlapDetector` from a BED file. Args: path: the path to the BED file From d922a35383e1f676094ffcda1680c49aaa8feb62 Mon Sep 17 00:00:00 2001 From: clintval Date: Thu, 1 Aug 2024 09:48:21 -0400 Subject: [PATCH 06/17] chore: remove extra variable --- pybedlite/overlap_detector.py | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/pybedlite/overlap_detector.py b/pybedlite/overlap_detector.py index 650657c..cbf8937 100644 --- a/pybedlite/overlap_detector.py +++ b/pybedlite/overlap_detector.py @@ -281,24 +281,23 @@ def add(self, interval: GenericGenomicSpan) -> None: if not isinstance(interval, Hashable): raise ValueError(f"Interval feature is not hashable but should be: {interval}") - refname = interval.refname - if refname not in self._refname_to_tree: - self._refname_to_tree[refname] = cr.cgranges() # type: ignore - self._refname_to_indexed[refname] = False - self._refname_to_intervals[refname] = [] + if interval.refname not in self._refname_to_tree: + self._refname_to_tree[interval.refname] = cr.cgranges() # type: ignore + self._refname_to_indexed[interval.refname] = False + self._refname_to_intervals[interval.refname] = [] # Append the interval to the list of intervals for this tree, keeping the index # of where it was inserted - interval_idx: int = len(self._refname_to_intervals[refname]) - self._refname_to_intervals[refname].append(interval) + interval_idx: int = len(self._refname_to_intervals[interval.refname]) + self._refname_to_intervals[interval.refname].append(interval) # Add the interval to the tree - tree = self._refname_to_tree[refname] - tree.add(refname, interval.start, interval.end, interval_idx) + tree = self._refname_to_tree[interval.refname] + tree.add(interval.refname, interval.start, interval.end, interval_idx) # Flag this tree as needing to be indexed after adding a new interval, but defer # indexing - self._refname_to_indexed[refname] = False + self._refname_to_indexed[interval.refname] = False def add_all(self, intervals: Iterable[GenericGenomicSpan]) -> None: """Adds one or more intervals to this detector. @@ -319,15 +318,14 @@ def overlaps_any(self, interval: GenomicSpan) -> bool: True if and only if the given interval overlaps with any interval in this detector. """ - refname = interval.refname - tree = self._refname_to_tree.get(refname) + tree = self._refname_to_tree.get(interval.refname) if tree is None: return False else: - if not self._refname_to_indexed[refname]: + if not self._refname_to_indexed[interval.refname]: tree.index() try: - next(iter(tree.overlap(refname, interval.start, interval.end))) + next(iter(tree.overlap(interval.refname, interval.start, interval.end))) except StopIteration: return False else: @@ -343,18 +341,17 @@ def get_overlaps(self, interval: GenomicSpan) -> List[GenericGenomicSpan]: The list of intervals in this detector that overlap the given interval, or the empty list if no overlaps exist. The intervals will be return in ascending genomic order. """ - refname = interval.refname - tree = self._refname_to_tree.get(refname) + tree = self._refname_to_tree.get(interval.refname) if tree is None: return [] else: - if not self._refname_to_indexed[refname]: + if not self._refname_to_indexed[interval.refname]: tree.index() - ref_intervals: List[GenericGenomicSpan] = self._refname_to_intervals[refname] + ref_intervals: List[GenericGenomicSpan] = self._refname_to_intervals[interval.refname] # NB: only return unique instances of intervals intervals: Set[GenericGenomicSpan] = { ref_intervals[index] - for _, _, index in tree.overlap(refname, interval.start, interval.end) + for _, _, index in tree.overlap(interval.refname, interval.start, interval.end) } return sorted( intervals, From 2d9625caa3b48be2faf08fc07c938560fe56779b Mon Sep 17 00:00:00 2001 From: clintval Date: Thu, 1 Aug 2024 11:15:22 -0400 Subject: [PATCH 07/17] docs: improve documentation throughout --- pybedlite/overlap_detector.py | 63 +++++++++++++++++++++++++---------- 1 file changed, 46 insertions(+), 17 deletions(-) diff --git a/pybedlite/overlap_detector.py b/pybedlite/overlap_detector.py index cbf8937..4248f0c 100644 --- a/pybedlite/overlap_detector.py +++ b/pybedlite/overlap_detector.py @@ -17,7 +17,8 @@ for sorting them in :func:`~pybedlite.overlap_detector.OverlapDetector.get_overlaps` using the following property if it is present, otherwise assumed to be positive stranded: - * `negative (bool)`: Whether the feature is negative stranded or not + * `negative (bool)`: True if the span is negatively stranded, False if the interval is + unstranded or positively stranded This is encapsulated in the :class:`~pybedlite.overlap_detector.StrandedGenomicSpan` protocol. @@ -106,7 +107,10 @@ class StrandedGenomicSpan(GenomicSpan, Protocol): @property def negative(self) -> bool: - """True if the interval is on the negative strand, False otherwise""" + """ + True if the span is negatively stranded, False if the interval is unstranded or + positively stranded. + """ @attr.s(frozen=True, auto_attribs=True) @@ -117,7 +121,8 @@ class Interval: refname (str): the refname (or chromosome) start (int): the 0-based start position end (int): the 0-based end position (exclusive) - negative (bool): true if the interval is on the negative strand, false otherwise + negative (bool): true if the span is negatively stranded, False if the interval is + unstranded or positively stranded name (Optional[str]): an optional name assigned to the interval """ @@ -188,7 +193,8 @@ def from_ucsc( Construct an `Interval` from a UCSC "position"-formatted string. The "Position" format (referring to the "1-start, fully-closed" system as coordinates are - "positioned" in the browser) + "positioned" in the browser): + * Written as: chr1:127140001-127140001 * The location may optionally be followed by a parenthetically enclosed strand, e.g. chr1:127140001-127140001(+). @@ -197,6 +203,7 @@ def from_ucsc( end coordinates. * When in this format, the assumption is that the coordinate is **1-start, fully-closed.** + https://genome-blog.gi.ucsc.edu/blog/2016/12/12/the-ucsc-genome-browser-coordinate-counting-systems/ # noqa: E501 Note that when the string does not have a specified strand, the `Interval`'s negative @@ -249,8 +256,9 @@ class OverlapDetector(Generic[GenericGenomicSpan], Iterable[GenericGenomicSpan]) * `end`: A 0-based exclusive end position Interval-like Python objects may also contain strandedness information which will be used - for sorting them in :func:`~pybedlite.overlap_detector.OverlapDetector.get_overlaps` using - the following property if it is present: + for sorting the return of :func:`~pybedlite.overlap_detector.OverlapDetector.get_overlaps`. This + additional sort key will only be used if the following property on the interval-like object is + present: * `negative (bool)`: Whether or not the feature is negative stranded or not @@ -339,7 +347,13 @@ def get_overlaps(self, interval: GenomicSpan) -> List[GenericGenomicSpan]: Returns: The list of intervals in this detector that overlap the given interval, or the empty - list if no overlaps exist. The intervals will be return in ascending genomic order. + list if no overlaps exist. The intervals will be returned sorted using the following + sort keys: + + * The interval's start + * The interval's end + * The interval's strand (if defined) + * The interval's reference sequence name """ tree = self._refname_to_tree.get(interval.refname) if tree is None: @@ -365,17 +379,27 @@ def get_overlaps(self, interval: GenomicSpan) -> List[GenericGenomicSpan]: @staticmethod def _negative(interval: GenomicSpan) -> bool: + """ + True if the span is negatively stranded, False if the interval is unstranded or positively + stranded. + """ return getattr(interval, "negative", False) def get_enclosing_intervals(self, interval: GenomicSpan) -> List[GenericGenomicSpan]: """Returns the set of intervals in this detector that wholly enclose the query interval. i.e. `query.start >= target.start` and `query.end <= target.end`. - Args: - interval: the query interval - Returns: - The list of intervals in this detector that enclose the query interval. - The intervals will be returned in ascending genomic order. + Args: + interval: the query interval + + Returns: + The list of intervals in this detector that enclose the query interval. The intervals + will be returned sorted using the following sort keys: + + * The interval's start + * The interval's end + * The interval's strand (if defined) + * The interval's reference sequence name """ results = self.get_overlaps(interval) return [i for i in results if interval.start >= i.start and interval.end <= i.end] @@ -384,12 +408,17 @@ def get_enclosed(self, interval: GenomicSpan) -> List[GenericGenomicSpan]: """Returns the set of intervals in this detector that are enclosed by the query interval. I.e. target.start >= query.start and target.end <= query.end. - Args: - interval: the query interval + Args: + interval: the query interval + + Returns: + The list of intervals in this detector that are enclosed within the query interval. + The intervals will be returned sorted using the following sort keys: - Returns: - The list of intervals in this detector that are enclosed within the query interval. - The intervals will be return in ascending genomic order. + * The interval's start + * The interval's end + * The interval's strand (if defined) + * The interval's reference sequence name """ results = self.get_overlaps(interval) return [i for i in results if i.start >= interval.start and i.end <= interval.end] From 74f7d031f48036fd2aa2646b17d4f375669e36fe Mon Sep 17 00:00:00 2001 From: clintval Date: Thu, 1 Aug 2024 11:15:58 -0400 Subject: [PATCH 08/17] docs: improve documentation throughout --- pybedlite/overlap_detector.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pybedlite/overlap_detector.py b/pybedlite/overlap_detector.py index 4248f0c..d664422 100644 --- a/pybedlite/overlap_detector.py +++ b/pybedlite/overlap_detector.py @@ -17,7 +17,7 @@ for sorting them in :func:`~pybedlite.overlap_detector.OverlapDetector.get_overlaps` using the following property if it is present, otherwise assumed to be positive stranded: - * `negative (bool)`: True if the span is negatively stranded, False if the interval is + * `negative (bool)`: True if the interval is negatively stranded, False if the interval is unstranded or positively stranded This is encapsulated in the :class:`~pybedlite.overlap_detector.StrandedGenomicSpan` protocol. @@ -108,7 +108,7 @@ class StrandedGenomicSpan(GenomicSpan, Protocol): @property def negative(self) -> bool: """ - True if the span is negatively stranded, False if the interval is unstranded or + True if the interval is negatively stranded, False if the interval is unstranded or positively stranded. """ @@ -121,7 +121,7 @@ class Interval: refname (str): the refname (or chromosome) start (int): the 0-based start position end (int): the 0-based end position (exclusive) - negative (bool): true if the span is negatively stranded, False if the interval is + negative (bool): true if the interval is negatively stranded, False if the interval is unstranded or positively stranded name (Optional[str]): an optional name assigned to the interval """ @@ -380,8 +380,8 @@ def get_overlaps(self, interval: GenomicSpan) -> List[GenericGenomicSpan]: @staticmethod def _negative(interval: GenomicSpan) -> bool: """ - True if the span is negatively stranded, False if the interval is unstranded or positively - stranded. + True if the interval is negatively stranded, False if the interval is unstranded or + positively stranded. """ return getattr(interval, "negative", False) From df6fab187636448d9dd7d709af5d0002ebda4601 Mon Sep 17 00:00:00 2001 From: clintval Date: Fri, 2 Aug 2024 06:06:55 -0400 Subject: [PATCH 09/17] chore: suit comments from review --- pybedlite/overlap_detector.py | 55 +++++++++++------------- pybedlite/tests/test_overlap_detector.py | 6 ++- 2 files changed, 30 insertions(+), 31 deletions(-) diff --git a/pybedlite/overlap_detector.py b/pybedlite/overlap_detector.py index d664422..fd77c54 100644 --- a/pybedlite/overlap_detector.py +++ b/pybedlite/overlap_detector.py @@ -4,14 +4,14 @@ The :class:`~pybedlite.overlap_detector.OverlapDetector` class detects and returns overlaps between -a set of genomic regions and another genomic region. The overlap detector may contain any +a set of regions and another region on a reference. The overlap detector may contain any interval-like Python objects that have the following properties: * `refname` (str): The reference sequence name * `start` (int): A 0-based start position * `end` (int): A 0-based exclusive end position -This is encapsulated in the :class:`~pybedlite.overlap_detector.GenomicSpan` protocol. +This is encapsulated in the :class:`~pybedlite.overlap_detector.Span` protocol. Interval-like Python objects may also contain strandedness information which will be used for sorting them in :func:`~pybedlite.overlap_detector.OverlapDetector.get_overlaps` using @@ -20,7 +20,7 @@ * `negative (bool)`: True if the interval is negatively stranded, False if the interval is unstranded or positively stranded -This is encapsulated in the :class:`~pybedlite.overlap_detector.StrandedGenomicSpan` protocol. +This is encapsulated in the :class:`~pybedlite.overlap_detector.StrandedSpan` protocol. Examples of Detecting Overlaps ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -52,10 +52,10 @@ The module contains the following public classes: - - :class:`~pybedlite.overlap_detector.Interval` -- Represents a region mapping to the genome - that is 0-based and open-ended + - :class:`~pybedlite.overlap_detector.Interval` -- Represents a region mapping to a reference + sequence that is 0-based and open-ended - :class:`~pybedlite.overlap_detector.OverlapDetector` -- Detects and returns overlaps between - a set of genomic regions and another genomic region + a set of regions and another region on a reference """ import itertools @@ -81,7 +81,7 @@ from pybedlite.bed_source import BedSource -class GenomicSpan(Hashable, Protocol): +class Span(Hashable, Protocol): """ A structural type for a span on a reference sequence with zero-based open-ended coordinates. """ @@ -99,7 +99,7 @@ def end(self) -> int: """A 0-based open-ended position.""" -class StrandedGenomicSpan(GenomicSpan, Protocol): +class StrandedSpan(Span, Protocol): """ A structural type for a stranded span on a reference sequence with zero-based open-ended coordinates. @@ -115,7 +115,7 @@ def negative(self) -> bool: @attr.s(frozen=True, auto_attribs=True) class Interval: - """A region mapping to the genome that is 0-based and open-ended + """A region mapping to a reference sequence that is 0-based and open-ended Attributes: refname (str): the refname (or chromosome) @@ -238,15 +238,15 @@ def from_ucsc( ) from exception -GenericGenomicSpan = TypeVar("GenericGenomicSpan", bound=Union[GenomicSpan, StrandedGenomicSpan]) +SpanType = TypeVar("SpanType", bound=Union[Span, StrandedSpan]) """ -A generic genomic span. This type variable is used for describing the generic type contained within -the :class:`~pybedlite.overlap_detector.OverlapDetector`. +A generic reference sequence span. This type variable is used for describing the generic type +contained within the :class:`~pybedlite.overlap_detector.OverlapDetector`. """ -class OverlapDetector(Generic[GenericGenomicSpan], Iterable[GenericGenomicSpan]): - """Detects and returns overlaps between a set of genomic regions and another genomic region. +class OverlapDetector(Generic[SpanType], Iterable[SpanType]): + """Detects and returns overlaps between a set of regions and another region on a reference. The overlap detector may contain any interval-like Python objects that have the following properties: @@ -268,27 +268,24 @@ class OverlapDetector(Generic[GenericGenomicSpan], Iterable[GenericGenomicSpan]) This detector is the most efficient when all intervals are added ahead of time. """ - def __init__(self, intervals: Optional[Iterable[GenericGenomicSpan]] = None) -> None: + def __init__(self, intervals: Optional[Iterable[SpanType]] = None) -> None: # A mapping from the contig/chromosome name to the associated interval tree self._refname_to_tree: Dict[str, cr.cgranges] = {} # type: ignore self._refname_to_indexed: Dict[str, bool] = {} - self._refname_to_intervals: Dict[str, List[GenericGenomicSpan]] = {} + self._refname_to_intervals: Dict[str, List[SpanType]] = {} if intervals is not None: self.add_all(intervals) - def __iter__(self) -> Iterator[GenericGenomicSpan]: + def __iter__(self) -> Iterator[SpanType]: """Iterates over the intervals in the overlap detector.""" return itertools.chain(*self._refname_to_intervals.values()) - def add(self, interval: GenericGenomicSpan) -> None: + def add(self, interval: SpanType) -> None: """Adds an interval to this detector. Args: interval: the interval to add to this detector """ - if not isinstance(interval, Hashable): - raise ValueError(f"Interval feature is not hashable but should be: {interval}") - if interval.refname not in self._refname_to_tree: self._refname_to_tree[interval.refname] = cr.cgranges() # type: ignore self._refname_to_indexed[interval.refname] = False @@ -307,7 +304,7 @@ def add(self, interval: GenericGenomicSpan) -> None: # indexing self._refname_to_indexed[interval.refname] = False - def add_all(self, intervals: Iterable[GenericGenomicSpan]) -> None: + def add_all(self, intervals: Iterable[SpanType]) -> None: """Adds one or more intervals to this detector. Args: @@ -316,7 +313,7 @@ def add_all(self, intervals: Iterable[GenericGenomicSpan]) -> None: for interval in intervals: self.add(interval) - def overlaps_any(self, interval: GenomicSpan) -> bool: + def overlaps_any(self, interval: Span) -> bool: """Determines whether the given interval overlaps any interval in this detector. Args: @@ -339,7 +336,7 @@ def overlaps_any(self, interval: GenomicSpan) -> bool: else: return True - def get_overlaps(self, interval: GenomicSpan) -> List[GenericGenomicSpan]: + def get_overlaps(self, interval: Span) -> List[SpanType]: """Returns any intervals in this detector that overlap the given interval. Args: @@ -361,9 +358,9 @@ def get_overlaps(self, interval: GenomicSpan) -> List[GenericGenomicSpan]: else: if not self._refname_to_indexed[interval.refname]: tree.index() - ref_intervals: List[GenericGenomicSpan] = self._refname_to_intervals[interval.refname] + ref_intervals: List[SpanType] = self._refname_to_intervals[interval.refname] # NB: only return unique instances of intervals - intervals: Set[GenericGenomicSpan] = { + intervals: Set[SpanType] = { ref_intervals[index] for _, _, index in tree.overlap(interval.refname, interval.start, interval.end) } @@ -378,14 +375,14 @@ def get_overlaps(self, interval: GenomicSpan) -> List[GenericGenomicSpan]: ) @staticmethod - def _negative(interval: GenomicSpan) -> bool: + def _negative(interval: Span) -> bool: """ True if the interval is negatively stranded, False if the interval is unstranded or positively stranded. """ return getattr(interval, "negative", False) - def get_enclosing_intervals(self, interval: GenomicSpan) -> List[GenericGenomicSpan]: + def get_enclosing_intervals(self, interval: Span) -> List[SpanType]: """Returns the set of intervals in this detector that wholly enclose the query interval. i.e. `query.start >= target.start` and `query.end <= target.end`. @@ -404,7 +401,7 @@ def get_enclosing_intervals(self, interval: GenomicSpan) -> List[GenericGenomicS results = self.get_overlaps(interval) return [i for i in results if interval.start >= i.start and interval.end <= i.end] - def get_enclosed(self, interval: GenomicSpan) -> List[GenericGenomicSpan]: + def get_enclosed(self, interval: Span) -> List[SpanType]: """Returns the set of intervals in this detector that are enclosed by the query interval. I.e. target.start >= query.start and target.end <= query.end. diff --git a/pybedlite/tests/test_overlap_detector.py b/pybedlite/tests/test_overlap_detector.py index bf56fc5..78de500 100644 --- a/pybedlite/tests/test_overlap_detector.py +++ b/pybedlite/tests/test_overlap_detector.py @@ -358,5 +358,7 @@ def negative(self) -> bool: """True if the interval is on the negative strand, False otherwise""" return False - with pytest.raises(ValueError): - OverlapDetector([ChromFeature(refname="chr1", zero_based_start=0, end=30)]) + with pytest.raises(TypeError): + OverlapDetector([ChromFeature(refname="chr1", zero_based_start=0, end=30)]).get_overlaps( + Interval("chr1", 0, 30) + ) From 34a63a6dcc31300cb4ff24884e067ace607cd88d Mon Sep 17 00:00:00 2001 From: clintval Date: Fri, 2 Aug 2024 06:11:28 -0400 Subject: [PATCH 10/17] docs: improve negative docstring --- pybedlite/bed_record.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pybedlite/bed_record.py b/pybedlite/bed_record.py index c64fa2e..cc1dd26 100644 --- a/pybedlite/bed_record.py +++ b/pybedlite/bed_record.py @@ -189,7 +189,10 @@ def refname(self) -> str: @property def negative(self) -> bool: - """True if the interval is on the negative strand, False otherwise""" + """ + True if the interval is negatively stranded, False if the interval is unstranded or + positively stranded. + """ return self.strand is BedStrand.Negative def as_bed_line(self, number_of_output_fields: Optional[int] = None) -> str: From 295461f3c03acae38aa4d94ce1799cde37e471cc Mon Sep 17 00:00:00 2001 From: clintval Date: Fri, 2 Aug 2024 06:14:05 -0400 Subject: [PATCH 11/17] chore: add unit tests for new properties --- pybedlite/tests/test_pybedlite.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pybedlite/tests/test_pybedlite.py b/pybedlite/tests/test_pybedlite.py index c6f9c51..96774f8 100644 --- a/pybedlite/tests/test_pybedlite.py +++ b/pybedlite/tests/test_pybedlite.py @@ -212,3 +212,13 @@ def test_bedstrand_opposite() -> None: assert BedStrand.Positive.opposite is BedStrand.Negative assert BedStrand.Negative.opposite is BedStrand.Positive + +def test_bedrecord_refname() -> None: + """Test that the alternate property for reference sequence name is correct.""" + assert BedRecord("chr1", 1, 2).refname == "chr1" + +def test_bedrecord_negative() -> None: + """Test that the negative property is set correctly.""" + assert not BedRecord("chr1", 1, 2, strand=None).negative + assert not BedRecord("chr1", 1, 2, strand=BedStrand.Positive).negative + assert BedRecord("chr1", 1, 2, strand=BedStrand.Negative).negative From 55d083de5765475ac5440172fe8aec39620d49d6 Mon Sep 17 00:00:00 2001 From: clintval Date: Fri, 2 Aug 2024 06:14:52 -0400 Subject: [PATCH 12/17] chore: add unit tests for new properties --- pybedlite/tests/test_pybedlite.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pybedlite/tests/test_pybedlite.py b/pybedlite/tests/test_pybedlite.py index 96774f8..789cad5 100644 --- a/pybedlite/tests/test_pybedlite.py +++ b/pybedlite/tests/test_pybedlite.py @@ -213,12 +213,14 @@ def test_bedstrand_opposite() -> None: assert BedStrand.Positive.opposite is BedStrand.Negative assert BedStrand.Negative.opposite is BedStrand.Positive + def test_bedrecord_refname() -> None: """Test that the alternate property for reference sequence name is correct.""" - assert BedRecord("chr1", 1, 2).refname == "chr1" + assert BedRecord(chrom="chr1", start=1, end=2).refname == "chr1" + def test_bedrecord_negative() -> None: """Test that the negative property is set correctly.""" - assert not BedRecord("chr1", 1, 2, strand=None).negative - assert not BedRecord("chr1", 1, 2, strand=BedStrand.Positive).negative - assert BedRecord("chr1", 1, 2, strand=BedStrand.Negative).negative + assert not BedRecord(chrom="chr1", start=1, end=2, strand=None).negative + assert not BedRecord(chrom="chr1", start=1, end=2, strand=BedStrand.Positive).negative + assert BedRecord(chrom="chr1", start=1, end=2, strand=BedStrand.Negative).negative From 19e85ec95c3dc06d3b8defb89aaedaffbb48ccaa Mon Sep 17 00:00:00 2001 From: clintval Date: Fri, 2 Aug 2024 06:16:47 -0400 Subject: [PATCH 13/17] chore: fixup docs slightly --- pybedlite/overlap_detector.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/pybedlite/overlap_detector.py b/pybedlite/overlap_detector.py index fd77c54..7b85b8a 100644 --- a/pybedlite/overlap_detector.py +++ b/pybedlite/overlap_detector.py @@ -2,25 +2,24 @@ Utility Classes for Querying Overlaps with Genomic Regions ---------------------------------------------------------- - The :class:`~pybedlite.overlap_detector.OverlapDetector` class detects and returns overlaps between -a set of regions and another region on a reference. The overlap detector may contain any -interval-like Python objects that have the following properties: +a set of regions and another region on a reference. The overlap detector may contain a collection +of interval-like Python objects that have the following properties: * `refname` (str): The reference sequence name * `start` (int): A 0-based start position * `end` (int): A 0-based exclusive end position -This is encapsulated in the :class:`~pybedlite.overlap_detector.Span` protocol. +This contract is described by the :class:`~pybedlite.overlap_detector.Span` protocol. Interval-like Python objects may also contain strandedness information which will be used for sorting them in :func:`~pybedlite.overlap_detector.OverlapDetector.get_overlaps` using -the following property if it is present, otherwise assumed to be positive stranded: +the following property if it is present: * `negative (bool)`: True if the interval is negatively stranded, False if the interval is unstranded or positively stranded -This is encapsulated in the :class:`~pybedlite.overlap_detector.StrandedSpan` protocol. +This contract is described by the :class:`~pybedlite.overlap_detector.StrandedSpan` protocol. Examples of Detecting Overlaps ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From 351b5637099544797ab4009e9918e8ddcb6f8e14 Mon Sep 17 00:00:00 2001 From: clintval Date: Fri, 2 Aug 2024 06:20:46 -0400 Subject: [PATCH 14/17] chore: normalize docstrings throughout --- pybedlite/overlap_detector.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/pybedlite/overlap_detector.py b/pybedlite/overlap_detector.py index 7b85b8a..e5fbff7 100644 --- a/pybedlite/overlap_detector.py +++ b/pybedlite/overlap_detector.py @@ -8,7 +8,7 @@ * `refname` (str): The reference sequence name * `start` (int): A 0-based start position - * `end` (int): A 0-based exclusive end position + * `end` (int): A 0-based half-open end position This contract is described by the :class:`~pybedlite.overlap_detector.Span` protocol. @@ -95,7 +95,7 @@ def start(self) -> int: @property def end(self) -> int: - """A 0-based open-ended position.""" + """A 0-based open-ended end position.""" class StrandedSpan(Span, Protocol): @@ -119,7 +119,7 @@ class Interval: Attributes: refname (str): the refname (or chromosome) start (int): the 0-based start position - end (int): the 0-based end position (exclusive) + end (int): the 0-based half-open end position negative (bool): true if the interval is negatively stranded, False if the interval is unstranded or positively stranded name (Optional[str]): an optional name assigned to the interval @@ -247,19 +247,19 @@ def from_ucsc( class OverlapDetector(Generic[SpanType], Iterable[SpanType]): """Detects and returns overlaps between a set of regions and another region on a reference. - The overlap detector may contain any interval-like Python objects that have the following - properties: + The overlap detector may contain a collection of interval-like Python objects that have the + following properties: - * `refname`: The reference sequence name - * `start`: A 0-based start position - * `end`: A 0-based exclusive end position + * `refname` (str): The reference sequence name + * `start` (int): A 0-based start position + * `end` (int): A 0-based half-open end position Interval-like Python objects may also contain strandedness information which will be used - for sorting the return of :func:`~pybedlite.overlap_detector.OverlapDetector.get_overlaps`. This - additional sort key will only be used if the following property on the interval-like object is - present: + for sorting them in :func:`~pybedlite.overlap_detector.OverlapDetector.get_overlaps` using + the following property if it is present: - * `negative (bool)`: Whether or not the feature is negative stranded or not + * `negative (bool)`: True if the interval is negatively stranded, False if the interval is + unstranded or positively stranded The same interval may be added multiple times, but only a single instance will be returned when querying for overlaps. From 8556eb8924af71c83384789e39185e86644f39fc Mon Sep 17 00:00:00 2001 From: clintval Date: Fri, 2 Aug 2024 06:24:25 -0400 Subject: [PATCH 15/17] docs: clarify sort order --- pybedlite/overlap_detector.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/pybedlite/overlap_detector.py b/pybedlite/overlap_detector.py index e5fbff7..66a50f7 100644 --- a/pybedlite/overlap_detector.py +++ b/pybedlite/overlap_detector.py @@ -346,10 +346,10 @@ def get_overlaps(self, interval: Span) -> List[SpanType]: list if no overlaps exist. The intervals will be returned sorted using the following sort keys: - * The interval's start - * The interval's end - * The interval's strand (if defined) - * The interval's reference sequence name + * The interval's start (ascending) + * The interval's end (ascending) + * The interval's strand, positive or negative (assumed to be positive if undefined) + * The interval's reference sequence name (lexicographically) """ tree = self._refname_to_tree.get(interval.refname) if tree is None: @@ -392,10 +392,10 @@ def get_enclosing_intervals(self, interval: Span) -> List[SpanType]: The list of intervals in this detector that enclose the query interval. The intervals will be returned sorted using the following sort keys: - * The interval's start - * The interval's end - * The interval's strand (if defined) - * The interval's reference sequence name + * The interval's start (ascending) + * The interval's end (ascending) + * The interval's strand, positive or negative (assumed to be positive if undefined) + * The interval's reference sequence name (lexicographically) """ results = self.get_overlaps(interval) return [i for i in results if interval.start >= i.start and interval.end <= i.end] @@ -411,10 +411,10 @@ def get_enclosed(self, interval: Span) -> List[SpanType]: The list of intervals in this detector that are enclosed within the query interval. The intervals will be returned sorted using the following sort keys: - * The interval's start - * The interval's end - * The interval's strand (if defined) - * The interval's reference sequence name + * The interval's start (ascending) + * The interval's end (ascending) + * The interval's strand, positive or negative (assumed to be positive if undefined) + * The interval's reference sequence name (lexicographically) """ results = self.get_overlaps(interval) return [i for i in results if i.start >= interval.start and i.end <= interval.end] From ee6ecb6f5c62eef7de0c6c127933a78c09ef281d Mon Sep 17 00:00:00 2001 From: clintval Date: Fri, 2 Aug 2024 06:41:42 -0400 Subject: [PATCH 16/17] chore: add MyPy-specific type test --- pybedlite/tests/test_overlap_detector.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pybedlite/tests/test_overlap_detector.py b/pybedlite/tests/test_overlap_detector.py index 78de500..d6d7c31 100644 --- a/pybedlite/tests/test_overlap_detector.py +++ b/pybedlite/tests/test_overlap_detector.py @@ -224,6 +224,16 @@ def test_construction_from_ucsc_other_contigs(contig: str) -> None: assert Interval.from_ucsc(f"{contig}:101-200") == Interval(contig, 100, 200) +def test_that_overlap_detector_allows_generic_parameterization() -> None: + """ + Test that the overlap detector allows for generic parameterization. + """ + records = [BedRecord(chrom="chr1", start=1, end=2), BedRecord(chrom="chr1", start=4, end=5)] + detector: OverlapDetector[BedRecord] = OverlapDetector(records) + overlaps: List[BedRecord] = detector.get_overlaps(Interval("chr1", 1, 2)) + assert overlaps == [BedRecord(chrom="chr1", start=1, end=2)] + + def test_arbitrary_interval_types() -> None: """ Test that an overlap detector can receive different interval-like objects and query them too. From eeda85e7b123941e3a4b5de089c05e1e47601865 Mon Sep 17 00:00:00 2001 From: clintval Date: Fri, 2 Aug 2024 06:53:11 -0400 Subject: [PATCH 17/17] chore: round out code coverage --- pybedlite/tests/test_overlap_detector.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/pybedlite/tests/test_overlap_detector.py b/pybedlite/tests/test_overlap_detector.py index d6d7c31..4208fe2 100644 --- a/pybedlite/tests/test_overlap_detector.py +++ b/pybedlite/tests/test_overlap_detector.py @@ -1,12 +1,14 @@ """Tests for :py:mod:`~pybedlite.overlap_detector`""" from dataclasses import dataclass +from pathlib import Path from typing import List from typing import Union import pytest from typing_extensions import TypeAlias +import pybedlite as pybed from pybedlite.bed_record import BedRecord from pybedlite.bed_record import BedStrand from pybedlite.overlap_detector import Interval @@ -363,12 +365,22 @@ def start(self) -> int: """A 0-based start position.""" return self.zero_based_start - @property - def negative(self) -> bool: - """True if the interval is on the negative strand, False otherwise""" - return False - with pytest.raises(TypeError): OverlapDetector([ChromFeature(refname="chr1", zero_based_start=0, end=30)]).get_overlaps( Interval("chr1", 0, 30) ) + + +def test_the_overlap_detector_can_be_built_from_a_bed_file(tmp_path: Path) -> None: + """ + Test that the overlap detector can be built from a BED file. + """ + records = [BedRecord(chrom="chr1", start=1, end=2), BedRecord(chrom="chr1", start=4, end=5)] + + bed = tmp_path / "test.bed" + with pybed.writer(bed, num_fields=3) as writer: + writer.write_all(records) + + detector: OverlapDetector[BedRecord] = OverlapDetector.from_bed(bed) + overlaps: List[BedRecord] = detector.get_overlaps(Interval("chr1", 1, 2)) + assert overlaps == [BedRecord(chrom="chr1", start=1, end=2)]