From 66d2a0047d57b3e8f13b024b3fa092012faf1d2c Mon Sep 17 00:00:00 2001
From: Matt Stone
Date: Thu, 4 Apr 2024 04:13:06 -0400
Subject: [PATCH] feat: add Interval constructor from UCSC formatted string

---
 pybedlite/overlap_detector.py            | 60 ++++++++++++++++++++++++
 pybedlite/tests/test_overlap_detector.py | 36 ++++++++++++++
 2 files changed, 96 insertions(+)

diff --git a/pybedlite/overlap_detector.py b/pybedlite/overlap_detector.py
index 281bf77..075a942 100644
--- a/pybedlite/overlap_detector.py
+++ b/pybedlite/overlap_detector.py
@@ -39,6 +39,7 @@
 """
 
 import itertools
+import re
 from pathlib import Path
 from typing import Dict
 from typing import Iterable
@@ -125,6 +126,65 @@ def from_bedrecord(cls: Type["Interval"], record: BedRecord) -> "Interval":
             name=record.name,
         )
 
+    @classmethod
+    def from_ucsc_position(
+        cls: Type["Interval"],
+        position: str,
+        # Quoted: a bare PEP 604 union would be evaluated here and fail on Python < 3.10.
+        name: "str | None" = None,
+    ) -> "Interval":
+        """
+        Construct an `Interval` from a UCSC "position"-formatted string.
+
+        The "Position" format (referring to the "1-start, fully-closed" system as coordinates are
+        "positioned" in the browser)
+        * Written as: chr1:127140001-127140001
+        * No spaces.
+        * Includes punctuation: a colon after the chromosome, and a dash between the start and
+          end coordinates.
+        * When in this format, the assumption is that the coordinate is **1-start,
+          fully-closed.**
+        https://genome-blog.gi.ucsc.edu/blog/2016/12/12/the-ucsc-genome-browser-coordinate-counting-systems/ # noqa: E501
+
+        Only reference names of the form `chr<N>`, `chrX`, `chrY`, `chrM` or `chrMT` are accepted,
+        optionally followed by an `_<id>_alt` suffix.
+
+        Args:
+            position: The UCSC "position"-formatted string.
+            name: An optional name for the interval.
+
+        Returns:
+            An `Interval` corresponding to the same region specified in the string.
+            Note that the `Interval` is **zero-based open-ended**.
+
+        Raises:
+            ValueError: If the string is not a valid UCSC position-formatted string, or if its
+                coordinates do not satisfy `1 <= start <= end`.
+        """
+
+        position_re = re.compile(
+            r"(?P<refname>chr(?:\d+|X|Y|M|MT)(?:_[A-Za-z0-9]+_alt)?):(?P<start>\d+)-(?P<end>\d+)"
+        )
+
+        # `fullmatch` (unlike `match` with a trailing `$`) also rejects a trailing newline.
+        match = position_re.fullmatch(position)
+        if match is None:
+            raise ValueError(f"Not a valid UCSC position-formatted string: {position}")
+
+        one_based_start = int(match.group("start"))
+        one_based_end = int(match.group("end"))
+        if one_based_start < 1 or one_based_end < one_based_start:
+            raise ValueError(f"UCSC position must satisfy 1 <= start <= end: {position}")
+
+        # 1-based fully-closed -> 0-based half-open: only the start coordinate shifts.
+        return cls(
+            refname=match.group("refname"),
+            start=one_based_start - 1,
+            end=one_based_end,
+            negative=False,
+            name=name,
+        )
+
 
 class OverlapDetector(Iterable[Interval]):
     """Detects and returns overlaps between a set of genomic regions and another genomic region.
diff --git a/pybedlite/tests/test_overlap_detector.py b/pybedlite/tests/test_overlap_detector.py
index e8e9fd5..22f908e 100644
--- a/pybedlite/tests/test_overlap_detector.py
+++ b/pybedlite/tests/test_overlap_detector.py
@@ -188,3 +188,39 @@ def test_construction_from_interval(bed_records: List[BedRecord]) -> None:
             assert new_record.strand is BedStrand.Positive
         else:
             assert new_record.strand is record.strand
+
+
+def test_construction_from_ucsc_position() -> None:
+    """
+    Test that we can convert a UCSC position to an Interval.
+    """
+
+    assert Interval.from_ucsc_position("chr1:101-200") == Interval("chr1", 100, 200)
+    assert Interval.from_ucsc_position("chr10_GL383545v1_alt:101-200") == Interval(
+        "chr10_GL383545v1_alt", 100, 200
+    )
+
+    # A single-base position and the optional name are both carried through.
+    named = Interval.from_ucsc_position("chrX:1-1", name="foo")
+    assert named == Interval("chrX", 0, 1, name="foo")
+
+
+def test_construction_from_ucsc_position_raises_on_invalid_string() -> None:
+    """
+    Test that malformed or out-of-range UCSC positions are rejected.
+    """
+
+    import pytest
+
+    invalid_positions = [
+        "chr1",  # no coordinates
+        "chr1:101",  # no end coordinate
+        "chr1: 101-200",  # embedded space
+        "1:101-200",  # missing the "chr" prefix
+        "chr1:101-200\n",  # trailing newline
+        "chr1:0-100",  # start is not 1-based
+        "chr1:200-101",  # end precedes start
+    ]
+    for position in invalid_positions:
+        with pytest.raises(ValueError):
+            Interval.from_ucsc_position(position)