Skip to content

Commit

Permalink
feat: add Interval constructor from UCSC formatted string
Browse files Browse the repository at this point in the history
  • Loading branch information
msto committed Apr 4, 2024
1 parent 674567c commit 66d2a00
Show file tree
Hide file tree
Showing 2 changed files with 55 additions and 0 deletions.
44 changes: 44 additions & 0 deletions pybedlite/overlap_detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
"""

import itertools
import re
from pathlib import Path
from typing import Dict
from typing import Iterable
Expand Down Expand Up @@ -125,6 +126,49 @@ def from_bedrecord(cls: Type["Interval"], record: BedRecord) -> "Interval":
name=record.name,
)

@classmethod
def from_ucsc_position(
cls: Type["Interval"],
position: str,
name: str | None = None,
) -> "Interval":
"""
Construct an `Interval` from a UCSC "position"-formatted string.
The "Position" format (referring to the "1-start, fully-closed" system as coordinates are
"positioned" in the browser)
* Written as: chr1:127140001-127140001
* No spaces.
* Includes punctuation: a colon after the chromosome, and a dash between the start and
end coordinates.
* When in this format, the assumption is that the coordinate is **1-start,
fully-closed.**
https://genome-blog.gi.ucsc.edu/blog/2016/12/12/the-ucsc-genome-browser-coordinate-counting-systems/ # noqa: E501
Args:
position: The UCSC "position"-formatted string.
name: An optional name for the interval.
Returns:
An `Interval` corresponding to the same region specified in the string.
Note that the `Interval` is **zero-based open-ended**.
Raises:
ValueError: If the string is not a valid UCSC position-formatted string.
"""

position_re = re.compile(r"^(chr(\d+|X|Y|M|MT)(?:_[A-Za-z0-9]+_alt)?):(\d+)-(\d+)$")

match = position_re.match(position)
if match is None:
raise ValueError(f"Not a valid UCSC position-formatted string: {position}")

refname = match.group(1)
start = int(match.group(3)) - 1
end = int(match.group(4))

return cls(refname=refname, start=start, end=end, negative=False, name=name)


class OverlapDetector(Iterable[Interval]):
"""Detects and returns overlaps between a set of genomic regions and another genomic region.
Expand Down
11 changes: 11 additions & 0 deletions pybedlite/tests/test_overlap_detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,3 +188,14 @@ def test_construction_from_interval(bed_records: List[BedRecord]) -> None:
assert new_record.strand is BedStrand.Positive
else:
assert new_record.strand is record.strand


def test_construction_from_ucsc_position() -> None:
    """
    Test that UCSC position strings are converted to the expected zero-based Intervals.
    """

    # (UCSC position string, refname, zero-based start, end) for each case.
    cases = [
        ("chr1:101-200", "chr1", 100, 200),
        ("chr10_GL383545v1_alt:101-200", "chr10_GL383545v1_alt", 100, 200),
    ]

    for position, refname, start, end in cases:
        expected = Interval(refname, start, end)
        assert Interval.from_ucsc_position(position) == expected

0 comments on commit 66d2a00

Please sign in to comment.