From cf99c00c836030ef648008e394ac1365df803424 Mon Sep 17 00:00:00 2001 From: Nils Homer Date: Wed, 31 Jul 2024 14:08:39 -0700 Subject: [PATCH] fixes --- pybedlite/bed_record.py | 2 +- pybedlite/overlap_detector.py | 37 +++++++++++++++++++++++------------ 2 files changed, 26 insertions(+), 13 deletions(-) diff --git a/pybedlite/bed_record.py b/pybedlite/bed_record.py index 37d90c3..ee4f1dc 100644 --- a/pybedlite/bed_record.py +++ b/pybedlite/bed_record.py @@ -190,7 +190,7 @@ def refname(self) -> str: @property def negative(self) -> bool: """True if the interval is on the negative strand, False otherwise""" - return self.strand is not None and self.strand == BedStrand.Positive + return self.strand is BedStrand.Negative def as_bed_line(self, number_of_output_fields: Optional[int] = None) -> str: """ diff --git a/pybedlite/overlap_detector.py b/pybedlite/overlap_detector.py index c79ead1..1e584f6 100644 --- a/pybedlite/overlap_detector.py +++ b/pybedlite/overlap_detector.py @@ -175,35 +175,48 @@ def from_bedrecord(cls: Type["Interval"], record: BedRecord) -> "Interval": ) -GenericGenomicsSpan = TypeVar("GenericGenomicsSpan", bound=Union[GenomicSpan, StrandedGenomicSpan]) +GenericGenomicSpan = TypeVar("GenericGenomicSpan", bound=Union[GenomicSpan, StrandedGenomicSpan]) """ A generic genomic feature. This type variable is used for describing the generic type contained within the :class:`~pybedlite.overlap_detector.OverlapDetector`. """ -class OverlapDetector(Generic[GenericGenomicsSpan], Iterable[GenericGenomicsSpan]): +class OverlapDetector(Generic[GenericGenomicSpan], Iterable[GenericGenomicSpan]): """Detects and returns overlaps between a set of genomic regions and another genomic region. 
+ The overlap detector may contain any interval-like Python objects that have the following + properties: + * `chrom` or `contig` or `refname`: The reference sequence name + * `start`: A 0-based start position + * `end`: A 0-based exclusive end position + + Interval-like Python objects may also contain strandedness information which will be used + for sorting them in :func:`~pybedlite.overlap_detector.OverlapDetector.get_overlaps` using + any of the following properties if they are present: + * `negative (bool)`: Whether or not the feature is negative stranded + * `strand (BedStrand)`: The BED strand of the feature + * `strand (str)`: The strand of the feature (`"-"` for negative) + The same interval may be added multiple times, but only a single instance will be returned when querying for overlaps. This detector is the most efficient when all intervals are added ahead of time. """ - def __init__(self, intervals: Optional[Iterable[GenericGenomicsSpan]] = None) -> None: + def __init__(self, intervals: Optional[Iterable[GenericGenomicSpan]] = None) -> None: # A mapping from the contig/chromosome name to the associated interval tree self._refname_to_tree: Dict[str, cr.cgranges] = {} # type: ignore self._refname_to_indexed: Dict[str, bool] = {} - self._refname_to_intervals: Dict[str, List[GenericGenomicsSpan]] = {} + self._refname_to_intervals: Dict[str, List[GenericGenomicSpan]] = {} if intervals is not None: self.add_all(intervals) - def __iter__(self) -> Iterator[GenericGenomicsSpan]: + def __iter__(self) -> Iterator[GenericGenomicSpan]: """Iterates over the intervals in the overlap detector.""" return itertools.chain(*self._refname_to_intervals.values()) - def add(self, interval: GenericGenomicsSpan) -> None: + def add(self, interval: GenericGenomicSpan) -> None: """Adds an interval to this detector. 
Args: @@ -231,7 +244,7 @@ def add(self, interval: GenericGenomicsSpan) -> None: # indexing self._refname_to_indexed[refname] = False - def add_all(self, intervals: Iterable[GenericGenomicsSpan]) -> None: + def add_all(self, intervals: Iterable[GenericGenomicSpan]) -> None: """Adds one or more intervals to this detector. Args: @@ -264,7 +277,7 @@ def overlaps_any(self, interval: GenomicSpan) -> bool: else: return True - def get_overlaps(self, interval: GenomicSpan) -> List[GenericGenomicsSpan]: + def get_overlaps(self, interval: GenomicSpan) -> List[GenericGenomicSpan]: """Returns any intervals in this detector that overlap the given interval. Args: @@ -281,9 +294,9 @@ def get_overlaps(self, interval: GenomicSpan) -> List[GenericGenomicsSpan]: else: if not self._refname_to_indexed[refname]: tree.index() - ref_intervals: List[GenericGenomicsSpan] = self._refname_to_intervals[refname] + ref_intervals: List[GenericGenomicSpan] = self._refname_to_intervals[refname] # NB: only return unique instances of intervals - intervals: Set[GenericGenomicsSpan] = { + intervals: Set[GenericGenomicSpan] = { ref_intervals[index] for _, _, index in tree.overlap(refname, interval.start, interval.end) } @@ -301,7 +314,7 @@ def get_overlaps(self, interval: GenomicSpan) -> List[GenericGenomicsSpan]: def _negative(interval: GenomicSpan) -> bool: return getattr(interval, "negative", False) - def get_enclosing_intervals(self, interval: GenomicSpan) -> List[GenericGenomicsSpan]: + def get_enclosing_intervals(self, interval: GenomicSpan) -> List[GenericGenomicSpan]: """Returns the set of intervals in this detector that wholly enclose the query interval. i.e. `query.start >= target.start` and `query.end <= target.end`. 
@@ -314,7 +327,7 @@ def get_enclosing_intervals(self, interval: GenomicSpan) -> List[GenericGenomics results = self.get_overlaps(interval) return [i for i in results if interval.start >= i.start and interval.end <= i.end] - def get_enclosed(self, interval: GenomicSpan) -> List[GenericGenomicsSpan]: + def get_enclosed(self, interval: GenomicSpan) -> List[GenericGenomicSpan]: """Returns the set of intervals in this detector that are enclosed by the query interval. I.e. target.start >= query.start and target.end <= query.end.