diff --git a/pybedlite/overlap_detector.py b/pybedlite/overlap_detector.py
index 61d7ee9..ab08225 100644
--- a/pybedlite/overlap_detector.py
+++ b/pybedlite/overlap_detector.py
@@ -125,6 +125,65 @@ def from_bedrecord(cls: Type["Interval"], record: BedRecord) -> "Interval":
             name=record.name,
         )
 
+    @classmethod
+    def from_ucsc(
+        cls: Type["Interval"],
+        string: str,
+        name: Optional[str] = None,
+    ) -> "Interval":
+        """
+        Construct an `Interval` from a UCSC "position"-formatted string.
+
+        The "Position" format (referring to the "1-start, fully-closed" system as coordinates are
+        "positioned" in the browser)
+        * Written as: chr1:127140001-127140001
+        * The location may optionally be followed by a parenthetically enclosed strand, e.g.
+          chr1:127140001-127140001(+).
+        * No spaces.
+        * Includes punctuation: a colon after the chromosome, and a dash between the start and
+          end coordinates.
+        * When in this format, the assumption is that the coordinate is **1-start,
+          fully-closed.**
+        https://genome-blog.gi.ucsc.edu/blog/2016/12/12/the-ucsc-genome-browser-coordinate-counting-systems/  # noqa: E501
+
+        Note that when the string does not have a specified strand, the `Interval`'s negative
+        attribute is set to `False`. This mimics the behavior of `OverlapDetector.from_bed()` when
+        reading a record that does not have a specified strand.
+
+        Args:
+            string: The UCSC "position"-formatted string.
+            name: An optional name for the interval.
+
+        Returns:
+            An `Interval` corresponding to the same region specified in the string.
+            Note that the `Interval` is **zero-based half-open**.
+
+        Raises:
+            ValueError: If the string is not a valid UCSC position-formatted string, including
+                when a parenthesized strand is present but is neither `+` nor `-`.
+        """
+        try:
+            if string.endswith(")"):
+                # Remove exactly one closing parenthesis; `rstrip(")")` would strip a whole run
+                # of them and let malformed input such as "chr1:1-2(+))" through.
+                interval, strand = string[:-1].rsplit("(", 1)
+                if strand not in ("+", "-"):
+                    raise ValueError(f"Invalid strand: {strand!r}")
+            else:
+                interval, strand = string, "+"
+
+            # Split on the last colon because contig names may contain colons themselves.
+            contig, span = interval.rsplit(":", 1)
+            start, end = span.split("-")
+
+            # Shift the one-based inclusive start to zero-based; the end needs no adjustment.
+            return cls(contig, int(start) - 1, int(end), negative=(strand == "-"), name=name)
+
+        except Exception as exception:
+            raise ValueError(
+                f"Not a valid UCSC position-formatted string: {string}"
+            ) from exception
+
 
 class OverlapDetector(Iterable[Interval]):
     """Detects and returns overlaps between a set of genomic regions and another genomic region.
diff --git a/pybedlite/tests/test_overlap_detector.py b/pybedlite/tests/test_overlap_detector.py
index dae34a9..ce27b1e 100644
--- a/pybedlite/tests/test_overlap_detector.py
+++ b/pybedlite/tests/test_overlap_detector.py
@@ -2,6 +2,8 @@
 
 from typing import List
 
+import pytest
+
 from pybedlite.bed_record import BedRecord
 from pybedlite.bed_record import BedStrand
 from pybedlite.overlap_detector import Interval
@@ -188,3 +190,44 @@ def test_construction_from_interval(bed_records: List[BedRecord]) -> None:
             assert new_record.strand is BedStrand.Positive
         else:
             assert new_record.strand is record.strand
+
+
+def test_construction_from_ucsc() -> None:
+    """
+    `Interval.from_ucsc()` should convert a UCSC position-formatted string to an `Interval`.
+
+    The position-formatted string should be one-based fully-closed, and the `Interval` should be
+    zero-based half-open.
+    """
+    assert Interval.from_ucsc("chr1:101-200") == Interval("chr1", 100, 200)
+
+
+@pytest.mark.parametrize("strand", ["+", "-"])
+def test_construction_from_ucsc_with_strand(strand: str) -> None:
+    """
+    `Interval.from_ucsc()` should correctly parse UCSC position-formatted strings with strands.
+    """
+    expected_interval = Interval("chr1", 100, 200, negative=(strand == "-"))
+    assert Interval.from_ucsc(f"chr1:101-200({strand})") == expected_interval
+
+
+@pytest.mark.parametrize(
+    "contig", ["chrUn_JTFH01001499v1_decoy", "HLA-DRB1*15:01:01:02", "chr10_GL383545v1_alt"]
+)
+def test_construction_from_ucsc_other_contigs(contig: str) -> None:
+    """
+    `Interval.from_ucsc()` should accommodate non-human, decoy, custom, and other contig names.
+    """
+    assert Interval.from_ucsc(f"{contig}:101-200") == Interval(contig, 100, 200)
+
+
+@pytest.mark.parametrize(
+    "string",
+    ["", "chr1", "chr1:101", "chr1:start-end", "chr1:101-200(.)", "chr1:101-200(+))"],
+)
+def test_construction_from_ucsc_invalid_string(string: str) -> None:
+    """
+    `Interval.from_ucsc()` should raise a `ValueError` for malformed position-formatted strings.
+    """
+    with pytest.raises(ValueError):
+        Interval.from_ucsc(string)