From c5e80e29597849c76a5907f107d89b4eb8468816 Mon Sep 17 00:00:00 2001 From: Matt Stone Date: Thu, 4 Apr 2024 04:23:50 -0400 Subject: [PATCH] feat: permit strand --- pybedlite/overlap_detector.py | 19 ++++++++++++++++++- pybedlite/tests/test_overlap_detector.py | 10 +++++++++- 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/pybedlite/overlap_detector.py b/pybedlite/overlap_detector.py index 075a942..0bce4ce 100644 --- a/pybedlite/overlap_detector.py +++ b/pybedlite/overlap_detector.py @@ -138,6 +138,8 @@ def from_ucsc_position( The "Position" format (referring to the "1-start, fully-closed" system as coordinates are "positioned" in the browser) * Written as: chr1:127140001-127140001 + * The location may optionally be followed by a parenthetically enclosed strand, e.g. + chr1:127140001-127140001(+). * No spaces. * Includes punctuation: a colon after the chromosome, and a dash between the start and end coordinates. @@ -145,6 +147,10 @@ def from_ucsc_position( fully-closed.** https://genome-blog.gi.ucsc.edu/blog/2016/12/12/the-ucsc-genome-browser-coordinate-counting-systems/ # noqa: E501 + Note that when the string does not have a specified strand, the `Interval`'s negative + attribute is set to False. This mimics the behavior of `OverlapDetector.from_bed()` when + reading a record that does not have a specified strand. + Args: position: The UCSC "position"-formatted string. name: An optional name for the interval. @@ -157,6 +163,17 @@ def from_ucsc_position( ValueError: If the string is not a valid UCSC position-formatted string. """ + # First, check to see if the strand is specified, and remove it from the string. + strand_re = re.compile(r".*\((\+|-)\)$") + strand_match = strand_re.match(position) + + if strand_match is not None: + negative = strand_match.group(1) == "-" + position = position[:-3] + else: + negative = False + + # Then parse the location position_re = re.compile(r"^(chr(\d+|X|Y|M|MT)(?:_[A-Za-z0-9]+_alt)?):(\d+)-(\d+)$") match = position_re.match(position) @@ -167,7 +184,7 @@ def from_ucsc_position( start = int(match.group(3)) - 1 end = int(match.group(4)) - return cls(refname=refname, start=start, end=end, negative=False, name=name) + return cls(refname=refname, start=start, end=end, negative=negative, name=name) class OverlapDetector(Iterable[Interval]): diff --git a/pybedlite/tests/test_overlap_detector.py b/pybedlite/tests/test_overlap_detector.py index 22f908e..f9ecfa7 100644 --- a/pybedlite/tests/test_overlap_detector.py +++ b/pybedlite/tests/test_overlap_detector.py @@ -198,4 +198,12 @@ def test_construction_from_ucsc_position() -> None: assert Interval.from_ucsc_position("chr1:101-200") == Interval("chr1", 100, 200) assert Interval.from_ucsc_position("chr10_GL383545v1_alt:101-200") == Interval( "chr10_GL383545v1_alt", 100, 200 - ) # noqa: E501 + ) + + # Check strand + assert Interval.from_ucsc_position("chr1:101-200(+)") == Interval( + "chr1", 100, 200, negative=False + ) + assert Interval.from_ucsc_position("chr1:101-200(-)") == Interval( + "chr1", 100, 200, negative=True + )