From f1448f4ab5dc5acc0e90c9b4880f0f282937d9e5 Mon Sep 17 00:00:00 2001 From: Jeremy Arbesfeld <50678786+jarbesfeld@users.noreply.github.com> Date: Tue, 29 Oct 2024 09:38:19 -0400 Subject: [PATCH] fix!: Expect user to supply valid, case-sensitive HGNC symbol (#375) closes #374 --- .../mappers/exon_genomic_coords.py | 24 +++++++++---------- tests/mappers/test_exon_genomic_coords.py | 12 +++------- 2 files changed, 14 insertions(+), 22 deletions(-) diff --git a/src/cool_seq_tool/mappers/exon_genomic_coords.py b/src/cool_seq_tool/mappers/exon_genomic_coords.py index 788d31c..a03e604 100644 --- a/src/cool_seq_tool/mappers/exon_genomic_coords.py +++ b/src/cool_seq_tool/mappers/exon_genomic_coords.py @@ -87,7 +87,9 @@ class GenomicTxSeg(BaseModelForbidExtra): """Model for representing a boundary for a transcript segment.""" seg: TxSegment | None = Field(None, description="Transcript segment.") - gene: StrictStr | None = Field(None, description="HGNC gene symbol.") + gene: StrictStr | None = Field( + None, description="Valid, case-sensitive HGNC gene symbol." + ) genomic_ac: StrictStr | None = Field(None, description="RefSeq genomic accession.") tx_ac: StrictStr | None = Field(None, description="RefSeq transcript accession.") errors: list[StrictStr] = Field([], description="Error messages.") @@ -139,7 +141,9 @@ def check_errors(cls, values: dict) -> dict: # noqa: N805 class GenomicTxSegService(BaseModelForbidExtra): """Service model for genomic and transcript data.""" - gene: StrictStr | None = Field(None, description="HGNC gene symbol.") + gene: StrictStr | None = Field( + None, description="Valid, case-sensitive HGNC gene symbol." + ) genomic_ac: StrictStr | None = Field(None, description="RefSeq genomic accession.") tx_ac: StrictStr | None = Field(None, description="RefSeq transcript accession.") seg_start: TxSegment | None = Field(None, description="Start transcript segment.") @@ -292,7 +296,7 @@ async def tx_segment_to_genomic( ('NC_000001.11', 154192135, 154170399) :param transcript: RefSeq transcript accession - :param gene: HGNC gene symbol + :param gene: Valid, case-sensitive HGNC gene symbol :param exon_start: Starting transcript exon number (1-based). If not provided, must provide ``exon_end`` :param exon_start_offset: Starting exon offset @@ -335,9 +339,6 @@ async def tx_segment_to_genomic( if errors: return _return_service_errors(errors) - if gene: - gene = gene.upper() - # Get aligned genomic data (hgnc gene, alt_ac, alt_start_i, alt_end_i, strand) # for exon(s) ( @@ -455,7 +456,7 @@ async def genomic_to_tx_segment( following the breakpoint for the 3' end. For the negative strand, adjacent is defined as the exon following the breakpoint for the 5' end and the exon preceding the breakpoint for the 3' end. - :param gene: gene name. Ideally, HGNC symbol. Must be given if no ``transcript`` + :param gene: A valid, case-sensitive HGNC symbol. Must be given if no ``transcript`` value is provided. :param coordinate_type: Coordinate type for ``seg_start_genomic`` and ``seg_end_genomic`` @@ -473,9 +474,6 @@ async def genomic_to_tx_segment( if errors: return _return_service_errors(errors) - if gene is not None: - gene = gene.upper() - params = {} if seg_start_genomic: @@ -630,7 +628,7 @@ async def _get_genomic_aln_coords( must provide ``tx_exon_end`` :param tx_exon_end: Transcript's exon end coordinates. If not provided, must provide ``tx_exon_start`` - :param gene: HGNC gene symbol + :param gene: A valid, case-sensitive HGNC gene symbol :return: Tuple containing aligned genomic data for start and end exon and warnings if found """ @@ -755,7 +753,7 @@ async def _genomic_to_tx_segment( :param transcript: The transcript to use. If this is not given, we will try the following transcripts: MANE Select, MANE Clinical Plus, Longest Remaining Compatible Transcript - :param gene: HGNC gene symbol + :param gene: Valid, case-sensitive HGNC gene symbol :param get_nearest_transcript_junction: If ``True``, this will return the adjacent exon if the position specified by``seg_start_genomic`` or ``seg_end_genomic`` does not occur on an exon. For the positive strand, adjacent @@ -1062,7 +1060,7 @@ async def _get_tx_seg_genomic_metadata( :param genomic_ac: Genomic RefSeq accession :param genomic_pos: Genomic position where the transcript segment occurs :param is_seg_start: Whether or not ``genomic_pos`` represents the start position. - :param gene: HGNC gene symbol + :param gene: Valid, case-sensitive HGNC gene symbol :param tx_ac: Transcript RefSeq accession. If not provided, will use MANE transcript :return: Transcript segment data and associated genomic metadata diff --git a/tests/mappers/test_exon_genomic_coords.py b/tests/mappers/test_exon_genomic_coords.py index 44938f9..07ff3ff 100644 --- a/tests/mappers/test_exon_genomic_coords.py +++ b/tests/mappers/test_exon_genomic_coords.py @@ -1162,7 +1162,7 @@ async def test_wee1(test_egc_mapper, wee1_exon2_exon11, mane_wee1_exon2_exon11): "seg_start_genomic": 9597639, "seg_end_genomic": 9609996, "transcript": "NM_003390.3", - "gene": "wee1", + "gene": "WEE1", } g_to_t_resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) genomic_tx_seg_service_checks(g_to_t_resp, wee1_exon2_exon11) @@ -1177,7 +1177,7 @@ async def test_wee1(test_egc_mapper, wee1_exon2_exon11, mane_wee1_exon2_exon11): "genomic_ac": "NC_000011.9", "seg_start_genomic": 9597639, # GRCh38 coords: 9576092 "seg_end_genomic": 9609996, # GRCh38 coords: 9588449 - "gene": "wee1", + "gene": "WEE1", } g_to_t_resp = await test_egc_mapper.genomic_to_tx_segment(**inputs) genomic_tx_seg_service_checks(g_to_t_resp, mane_wee1_exon2_exon11) @@ -1216,12 +1216,6 @@ async def test_transcript_to_genomic( expected.seg_end.genomic_location.start = 154170399 genomic_tx_seg_service_checks(resp, expected) - resp = await test_egc_mapper.tx_segment_to_genomic( - exon_start=None, exon_end=8, gene="tpm3", transcript="NM_152263.3" - ) - expected.seg_end.genomic_location.start = 154170399 - genomic_tx_seg_service_checks(resp, expected) - expected = tpm3_exon1_exon8.model_copy(deep=True) resp = await test_egc_mapper.tx_segment_to_genomic( exon_start=1, exon_end=8, exon_end_offset=-5, transcript="NM_152263.3" @@ -1371,7 +1365,7 @@ async def test_invalid(test_egc_mapper): gene="dummy gene", ) genomic_tx_seg_service_checks(resp, is_valid=False) - assert resp.errors == ["Expected gene, DUMMY GENE, but found TPM3"] + assert resp.errors == ["Expected gene, dummy gene, but found TPM3"] # Invalid accession resp = await test_egc_mapper.genomic_to_tx_segment(