Skip to content

Commit

Permalink
Merge pull request #45 from bgyori/uniprot_resource
Browse files: browse the repository at this point in the history
Update UniProt client due to new UniProt API
  • Loading branch information
bgyori authored Aug 29, 2023
2 parents fffb634 + 2bb0981 commit 98e7663
Show file tree
Hide file tree
Showing 7 changed files with 81 additions and 39 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,10 @@ jobs:
key: ${{ runner.os }}-pip-${{ hashFiles('**/setup.py') }}
restore-keys: |
${{ runner.os }}-pip-
- name: Set up Python 3.6
- name: Set up Python 3.8
uses: actions/setup-python@v2
with:
python-version: 3.6
python-version: 3.8
- name: Install dependencies
run: |
python -m pip install --upgrade pip
Expand Down
2 changes: 1 addition & 1 deletion protmapper/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = '0.0.27'
__version__ = '0.0.28'
import os
import logging

Expand Down
2 changes: 1 addition & 1 deletion protmapper/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -319,7 +319,7 @@ def map_to_human_ref(self, prot_id, prot_ns, residue, position,
position)
error_code = None
except HTTPError as ex:
if ex.response.status_code == 404:
if ex.response.status_code in {400, 404}:
error_code = 'UNIPROT_HTTP_NOT_FOUND'
else:
error_code = 'UNIPROT_HTTP_OTHER'
Expand Down
70 changes: 50 additions & 20 deletions protmapper/resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,39 +72,68 @@ def download_uniprot_entries(out_file, cached=True):
if cached:
_download_from_s3('uniprot_entries.tsv.gz', out_file)
return
base_columns = ['id', 'genes(PREFERRED)', 'entry%20name',
'database(RGD)', 'database(MGI)', 'length', 'reviewed',
'organism-id', 'database(GeneID)']
processed_columns = ['genes', 'protein%20names']
feature_types = ['SIGNAL', 'CHAIN', 'PROPEPTIDE', 'PEPTIDE', 'TRANSIT']
columns = base_columns + processed_columns + \
['feature(%s)' % feat for feat in feature_types]
base_columns = [
'accession', # 'id',
'gene_primary', # 'genes(PREFERRED)',
'id', # 'entry%20name',
'xref_rgd', # 'database(RGD)',
'xref_mgi', # 'database(MGI)',
'length', # 'length',
'reviewed', # 'reviewed',
'organism_id', # 'organism-id',
'xref_geneid', # 'database(GeneID)'
]
processed_columns = [
'gene_names', # 'genes',
'protein_name', # 'protein%20names'
]

feature_types = {
'ft_signal': 'SIGNAL', # 'SIGNAL',
'ft_chain': 'CHAIN', # 'CHAIN',
'ft_propep': 'PROPEPTIDE', # 'PROPEPTIDE',
'ft_peptide': 'PEPTIDE', # 'PEPTIDE',
'ft_transit': 'TRANSIT', # 'TRANSIT',
}
feature_columns = list(feature_types)
columns = base_columns + processed_columns + feature_columns
columns_str = ','.join(columns)
logger.info('Downloading UniProt entries')
url = 'https://legacy.uniprot.org/uniprot/?' + \
'sort=id&desc=no&compress=no&query=reviewed:yes&' + \
'format=tab&columns=' + columns_str
url = 'https://rest.uniprot.org/uniprotkb/stream?' \
'format=tsv&' \
'query=reviewed:true&' \
'compressed=true&' \
'sort=accession asc&' \
'fields=' + columns_str
#url = 'http://www.uniprot.org/uniprot/?' + \
# 'sort=id&desc=no&compress=no&query=reviewed:yes&' + \
# 'format=tab&columns=' + columns_str
logger.info('Downloading %s' % url)
res = requests.get(url)
if res.status_code != 200:
logger.info('Failed to download "%s"' % url)
reviewed_entries = res.content

url = 'https://legacy.uniprot.org/uniprot/?' + \
'sort=id&desc=no&compress=no&query=reviewed:no&fil=organism:' + \
'%22Homo%20sapiens%20(Human)%20[9606]%22&' + \
'format=tab&columns=' + columns_str
reviewed_entries = gzip.decompress(res.content).decode('utf-8')

url = 'https://rest.uniprot.org/uniprotkb/stream?' \
'format=tsv&' \
'query=organism_id:9606 AND (reviewed:false)&' \
'compressed=true&' \
'sort=accession asc&' \
'fields=' + columns_str

#url = 'http://www.uniprot.org/uniprot/?' + \
# 'sort=id&desc=no&compress=no&query=reviewed:no&fil=organism:' + \
# '%22Homo%20sapiens%20(Human)%20[9606]%22&' + \
# 'format=tab&columns=' + columns_str
logger.info('Downloading %s' % url)
res = requests.get(url)
if res.status_code != 200:
logger.info('Failed to download "%s"' % url)
unreviewed_human_entries = res.content
unreviewed_human_entries = gzip.decompress(res.content).decode('utf-8')

if not((reviewed_entries is not None) and
(unreviewed_human_entries is not None)):
return
unreviewed_human_entries = unreviewed_human_entries.decode('utf-8')
reviewed_entries = reviewed_entries.decode('utf-8')
lines = reviewed_entries.strip('\n').split('\n')
lines += unreviewed_human_entries.strip('\n').split('\n')[1:]

Expand All @@ -114,7 +143,7 @@ def download_uniprot_entries(out_file, cached=True):
if line_idx == 0:
continue
new_line = process_uniprot_line(line, base_columns, processed_columns,
feature_types)
list(feature_types.values()))
new_lines.append(new_line)

# Join all lines into a single string
Expand All @@ -127,6 +156,7 @@ def download_uniprot_entries(out_file, cached=True):
def process_uniprot_line(line, base_columns, processed_columns,
feature_types):
terms = line.split('\t')
terms[4] = terms[4].replace('MGI:', '')

# At this point, we need to clean up the gene names.
# If there are multiple gene names, take the first one
Expand Down
10 changes: 5 additions & 5 deletions protmapper/tests/test_protmapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,11 +164,11 @@ def test_map_mouse_site():

def test_map_rat_site():
sm = ProtMapper()
ms = sm.map_to_human_ref('NPHS1', 'hgnc', 'Y', '1204')
assert ms == MappedSite(up_id='O60500', error_code=None, valid=False,
orig_res='Y', orig_pos='1204', mapped_id='O60500',
mapped_res='Y', mapped_pos='1193',
description='INFERRED_RAT_SITE', gene_name='NPHS1')
ms = sm.map_to_human_ref('ELK1', 'hgnc', 'S', '159')
assert ms == MappedSite(up_id='P19419', error_code=None, valid=False,
orig_res='S', orig_pos='159', mapped_id='P19419',
mapped_res='S', mapped_pos='160',
description='INFERRED_RAT_SITE', gene_name='ELK1')


def test_map_methionine_cleavage():
Expand Down
9 changes: 5 additions & 4 deletions protmapper/tests/test_uniprot_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@ def test_query_protein_deprecated():

@attr('webservice')
def test_get_family_members():
members = uniprot_client.get_family_members('RAF')
members = uniprot_client.get_family_members(
'protein kinase superfamily TKL Ser/Thr protein kinase family RAF subfamily')
assert 'ARAF' in members
assert 'BRAF' in members
assert 'RAF1' in members
Expand Down Expand Up @@ -84,8 +85,8 @@ def test_get_gene_name_no_gene_name():


def test_get_gene_name_multiple_gene_names():
gene_name = uniprot_client.get_gene_name('Q5VWM5')
assert gene_name == 'PRAMEF9'
gene_name = uniprot_client.get_gene_name('P69905')
assert gene_name == 'HBA1'


def test_is_human():
Expand Down Expand Up @@ -262,7 +263,7 @@ def test_features():
for chain in chains:
assert chain.type == 'CHAIN'
if chain.name == 'BH3-interacting domain death agonist p15':
assert chain.begin == 62, chain
assert chain.begin == 61, chain
assert chain.end == 195
assert chain.id == 'PRO_0000223233'

Expand Down
23 changes: 17 additions & 6 deletions protmapper/uniprot_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@
logger = logging.getLogger(__name__)


uniprot_url = 'https://legacy.uniprot.org/uniprot/'
uniprot_url = 'https://uniprot.org/uniprot/'
rest_api_url = 'https://rest.uniprot.org/uniprotkb/'
stream_api_url = rest_api_url + 'stream'

xml_ns = {'up': 'http://uniprot.org/uniprot'}

Expand Down Expand Up @@ -46,6 +48,8 @@ def query_protein(protein_id: str) -> Union[ElementTree.ElementTree, None]:
# the response for the primary ID P40417.
ret = requests.get(url)
et = ElementTree.fromstring(ret.content)
if et.tag == 'errorInfo':
return None
return et
except Exception as e:
return None
Expand Down Expand Up @@ -155,11 +159,18 @@ def get_family_members(family_name, human_only=True):
gene_names : list
The HGNC gene symbols corresponding to the given family.
"""
data = {'query': 'family:%s' % family_name,
'format': 'list'}
query_parts = [
'family:"%s"' % family_name,
'reviewed:true'
]
if human_only:
data['fil'] = 'organism:human'
res = requests.get(uniprot_url, params=data)
query_parts.append('model_organism:9606')

query_str = ' AND '.join([f'({q})' for q in query_parts])

data = {'query': query_str,
'format': 'list'}
res = requests.get(stream_api_url, params=data)
if not res.status_code == 200 or not res.text:
return None
# res.text gets us the Unicode
Expand Down Expand Up @@ -362,7 +373,7 @@ def get_sequence(protein_id):
protein_id = _reattach_isoform(base, iso)
seq = um.uniprot_sequences.get(protein_id)
if seq is None:
url = uniprot_url + '%s.fasta' % protein_id
url = rest_api_url + '%s.fasta' % protein_id
res = requests.get(url)
res.raise_for_status()
# res.text is Unicode
Expand Down

0 comments on commit 98e7663

Please sign in to comment.