Skip to content

Commit

Permalink
feat: permit strand
Browse files Browse the repository at this point in the history
  • Loading branch information
msto committed Apr 4, 2024
1 parent 66d2a00 commit c5e80e2
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 2 deletions.
19 changes: 18 additions & 1 deletion pybedlite/overlap_detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,13 +138,19 @@ def from_ucsc_position(
The "Position" format (referring to the "1-start, fully-closed" system as coordinates are
"positioned" in the browser)
* Written as: chr1:127140001-127140001
* The location may optionally be followed by a strand enclosed in parentheses, either (+) or
(-), e.g. chr1:127140001-127140001(+).
* No spaces.
* Includes punctuation: a colon after the chromosome, and a dash between the start and
end coordinates.
* When in this format, the assumption is that the coordinate is **1-start,
fully-closed.**
https://genome-blog.gi.ucsc.edu/blog/2016/12/12/the-ucsc-genome-browser-coordinate-counting-systems/ # noqa: E501
Note that when the string does not specify a strand, the `Interval`'s `negative` attribute
is set to `False`. This mimics the behavior of `OverlapDetector.from_bed()` when reading a
record that does not specify a strand.
Args:
position: The UCSC "position"-formatted string.
name: An optional name for the interval.
Expand All @@ -157,6 +163,17 @@ def from_ucsc_position(
ValueError: If the string is not a valid UCSC position-formatted string.
"""

# First, check to see if the strand is specified, and remove it from the string.
strand_re = re.compile(r".*\((\+|-)\)$")
strand_match = strand_re.match(position)

if strand_match is not None:
negative = strand_match.group(1) == "-"
position = position[:-3]
else:
negative = False

# Then parse the location
position_re = re.compile(r"^(chr(\d+|X|Y|M|MT)(?:_[A-Za-z0-9]+_alt)?):(\d+)-(\d+)$")

match = position_re.match(position)
Expand All @@ -167,7 +184,7 @@ def from_ucsc_position(
start = int(match.group(3)) - 1
end = int(match.group(4))

return cls(refname=refname, start=start, end=end, negative=False, name=name)
return cls(refname=refname, start=start, end=end, negative=negative, name=name)


class OverlapDetector(Iterable[Interval]):
Expand Down
10 changes: 9 additions & 1 deletion pybedlite/tests/test_overlap_detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,4 +198,12 @@ def test_construction_from_ucsc_position() -> None:
assert Interval.from_ucsc_position("chr1:101-200") == Interval("chr1", 100, 200)
assert Interval.from_ucsc_position("chr10_GL383545v1_alt:101-200") == Interval(
"chr10_GL383545v1_alt", 100, 200
) # noqa: E501
)

# Check strand
assert Interval.from_ucsc_position("chr1:101-200(+)") == Interval(
"chr1", 100, 200, negative=False
)
assert Interval.from_ucsc_position("chr1:101-200(-)") == Interval(
"chr1", 100, 200, negative=True
)

0 comments on commit c5e80e2

Please sign in to comment.