Skip to content

Commit

Permalink
WormBase is now integrated into AllianceGenome
Browse files Browse the repository at this point in the history
  • Loading branch information
jal347 committed Jul 24, 2024
1 parent dd05f39 commit 53b752b
Show file tree
Hide file tree
Showing 5 changed files with 101 additions and 90 deletions.
2 changes: 1 addition & 1 deletion src/config_web.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@
"ratmap:": r"RATMAP:",
"rgd:": r"RGD:",
"flybase:": r"FLYBASE:",
"wormbase:": r"WormBase:",
# "wormbase:": r"WormBase:", # deprecated for alliancegenome
"tair:": r"TAIR:",
"zfin:": r"ZFIN:",
"sgd:": r"SGD:",
Expand Down
112 changes: 51 additions & 61 deletions src/hub/dataload/sources/entrez/gene_upload.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from .parser import GeneInfoParser
from .parser import get_geneid_d
import biothings.hub.dataload.uploader as uploader
from biothings.utils.common import dump2gridfs

from .parser import GeneInfoParser, get_geneid_d


class EntrezGeneUploader(uploader.MergerSourceUploader):

Expand All @@ -22,8 +22,16 @@ def get_geneid_d(self, *args, **kwargs):
def post_update_data(self, *args, **kwargs):
self.logger.info('Uploading "geneid_d" to GridFS...')
geneid_d = self.get_geneid_d(load_cache=False, save_cache=False)
dump2gridfs(geneid_d, self.name + '__geneid_d.pyobj', self.db)
for field in ["MGI", "HGNC", "RGD", "TAIR", "WormBase", "ZFIN", "SGD", "FLYBASE"]:
dump2gridfs(geneid_d, self.name + "__geneid_d.pyobj", self.db)
for field in [
"MGI",
"HGNC",
"RGD",
"TAIR",
"ZFIN",
"SGD",
"FLYBASE",
]:
self.logger.info("Indexing '%s'" % field)
self.collection.create_index(field, background=True)

Expand All @@ -33,105 +41,87 @@ def get_mapping(klass):
"entrezgene": {
"type": "keyword",
"normalizer": "keyword_lowercase_normalizer",
"copy_to": "all"
},
"taxid": {
"type": "integer"
"copy_to": "all",
},
"taxid": {"type": "integer"},
"alias": {
"type": "keyword",
"normalizer": "keyword_lowercase_normalizer",
"copy_to": "all"
},
"name": {
"type": "text",
"copy_to": "all"
},
"other_names": {
"type": "text",
"copy_to": "all"
"copy_to": "all",
},
"name": {"type": "text", "copy_to": "all"},
"other_names": {"type": "text", "copy_to": "all"},
"symbol": {
"type": "keyword",
"normalizer": "keyword_lowercase_normalizer",
"copy_to": "all"
"copy_to": "all",
},
"locus_tag": {
"type": "keyword",
"normalizer": "keyword_lowercase_normalizer",
"copy_to": "all"
"copy_to": "all",
},

# do not index map_location and type_of_gene
"map_location": {
"index": False,
"type": "text"
},
"map_location": {"index": False, "type": "text"},
"type_of_gene": {
"normalizer": "keyword_lowercase_normalizer",
"type": "keyword"
},
"AnimalQTLdb": {
"index": False,
"type": "text"
},
"Vega": {
"index": False,
"type": "text"
"type": "keyword",
},

"AnimalQTLdb": {"index": False, "type": "text"},
"Vega": {"index": False, "type": "text"},
# convert index_name to lower-case, and excluded from "_all"
"HGNC": {
"type": "keyword", # 1771
"normalizer": "keyword_lowercase_normalizer"
"type": "keyword", # 1771
"normalizer": "keyword_lowercase_normalizer",
},
"HPRD": {
"type": "keyword", # 00310
"normalizer": "keyword_lowercase_normalizer"
"type": "keyword", # 00310
"normalizer": "keyword_lowercase_normalizer",
},
"MIM": {
"type": "keyword", # 116953
"normalizer": "keyword_lowercase_normalizer"
"type": "keyword", # 116953
"normalizer": "keyword_lowercase_normalizer",
},
"MGI": {
"type": "keyword", # MGI:104772
"normalizer": "keyword_lowercase_normalizer"
},
"RATMAP": {
"type": "keyword",
"normalizer": "keyword_lowercase_normalizer"
"type": "keyword", # MGI:104772
"normalizer": "keyword_lowercase_normalizer",
},
"RATMAP": {"type": "keyword", "normalizer": "keyword_lowercase_normalizer"},
"RGD": {
"type": "keyword", # 70486
"normalizer": "keyword_lowercase_normalizer"
"type": "keyword", # 70486
"normalizer": "keyword_lowercase_normalizer",
},
"FLYBASE": {
"type": "keyword", # FBgn0004107
"normalizer": "keyword_lowercase_normalizer"
"type": "keyword", # FBgn0004107
"normalizer": "keyword_lowercase_normalizer",
},
"WormBase": {
"type": "keyword", # WBGene00000871
"normalizer": "keyword_lowercase_normalizer"
# "WormBase": {
# "type": "keyword", # WBGene00000871
# "normalizer": "keyword_lowercase_normalizer"
# },
"AllianceGenome": {
"type": "keyword", # WBGene00019362
"normalizer": "keyword_lowercase_normalizer",
},
"TAIR": {
"type": "keyword", # AT3G48750
"normalizer": "keyword_lowercase_normalizer"
"type": "keyword", # AT3G48750
"normalizer": "keyword_lowercase_normalizer",
},
"ZFIN": {
"type": "keyword", # ZDB-GENE-040426-2741
"normalizer": "keyword_lowercase_normalizer"
"type": "keyword", # ZDB-GENE-040426-2741
"normalizer": "keyword_lowercase_normalizer",
},
"SGD": {
"type": "keyword", # S000003566
"normalizer": "keyword_lowercase_normalizer"
"type": "keyword", # S000003566
"normalizer": "keyword_lowercase_normalizer",
},
"Xenbase": {
"type": "keyword",
"normalizer": "keyword_lowercase_normalizer"
"normalizer": "keyword_lowercase_normalizer",
},
"miRBase": {
"type": "keyword",
"normalizer": "keyword_lowercase_normalizer"
"normalizer": "keyword_lowercase_normalizer",
},
}
return mapping
73 changes: 46 additions & 27 deletions src/hub/datatransform/keylookup.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,51 +3,70 @@

graph_mygene = nx.DiGraph()

for field in ["entrez","ensembl","uniprot","mgi","hgnc","rgd","tair","wormbase","zfin","sgd","flybase"]:
for field in [
"entrez",
"ensembl",
"uniprot",
"mgi",
"hgnc",
"rgd",
"tair",
"zfin",
"sgd",
"flybase",
]:
graph_mygene.add_node(field)


graph_mygene.add_edge('swissprot', 'entrez',
object=MongoDBEdge('uniprot',
lookup='uniprot.Swiss-Prot',
field='_id'))
graph_mygene.add_edge(
"swissprot",
"entrez",
object=MongoDBEdge("uniprot", lookup="uniprot.Swiss-Prot", field="_id"),
)

graph_mygene.add_edge('swissprot', 'swissprot',
object=MongoDBEdge('uniprot',
lookup='uniprot.Swiss-Prot',
field='uniprot.Swiss-Prot'))
graph_mygene.add_edge(
"swissprot",
"swissprot",
object=MongoDBEdge(
"uniprot", lookup="uniprot.Swiss-Prot", field="uniprot.Swiss-Prot"
),
)

graph_mygene.add_edge('trembl', 'entrez',
object=MongoDBEdge('uniprot',
lookup='uniprot.TrEMBL',
field='_id'))
graph_mygene.add_edge(
"trembl",
"entrez",
object=MongoDBEdge("uniprot", lookup="uniprot.TrEMBL", field="_id"),
)

# TODO: GeneID already contains entrez _id but we need to keep
# the interface until keylookup is able to take from values
# from docs
graph_mygene.add_edge('entrez', 'entrez',
object=MongoDBEdge('entrez_gene',
lookup='_id',
field='_id'))
graph_mygene.add_edge(
"entrez", "entrez", object=MongoDBEdge("entrez_gene", lookup="_id", field="_id")
)

for field in ["MGI","HGNC","RGD","TAIR","WormBase","ZFIN","SGD","FLYBASE"]:
graph_mygene.add_edge(field.lower(), 'entrez',
object=MongoDBEdge('entrez_gene',
lookup=field,
field='_id'))
for field in ["MGI", "HGNC", "RGD", "TAIR", "ZFIN", "SGD", "FLYBASE"]:
graph_mygene.add_edge(
field.lower(),
"entrez",
object=MongoDBEdge("entrez_gene", lookup=field, field="_id"),
)


# TODO: conversions from ensembl to entrez should be added but mappings are currently
# computed and stored in mongo as files, not collections, so they can't be queried

import logging
import pprint

# Emit the lookup graph's edges at DEBUG level rather than printing
# unconditionally to stdout at import time.  Printing the DiGraph object
# itself only shows its default repr, so the (source, target) edge pairs
# are formatted instead; formatting is skipped unless DEBUG is enabled.
if logging.getLogger(__name__).isEnabledFor(logging.DEBUG):
    logging.getLogger(__name__).debug(
        "graph_mygene edges:\n%s",
        pprint.pformat(sorted(graph_mygene.edges())),
    )


class MyGeneKeyLookup(DataTransformMDB):

def __init__(self, input_types, skip_on_failure=False):
super(MyGeneKeyLookup, self).__init__(graph_mygene,
input_types,
output_types=["entrez","ensembl"],
skip_on_failure=skip_on_failure)

super(MyGeneKeyLookup, self).__init__(
graph_mygene,
input_types,
output_types=["entrez", "ensembl"],
skip_on_failure=skip_on_failure,
)
2 changes: 1 addition & 1 deletion src/tests/data_tests/test_1_annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ def test_104(self):
def test_105(self):
# this is not nematode, "taxid": 31234
res = self.request("gene/171911").json()
assert "WormBase" in res
assert "AllianceGenome" in res

def test_106(self):
# fission yeast
Expand Down
2 changes: 2 additions & 0 deletions src/tests/data_tests/test_4_input.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import pytest
from biothings.tests.web import BiothingsDataTest


Expand Down Expand Up @@ -44,6 +45,7 @@ def test_411_case_sensitivity(self):
# case-insensitive sources
self.query(q="mirbase:MI0017267")

@pytest.mark.skip("Deprecated now integrated into AllianceGenome")
def test_412_case_sensitivity(self):
self.query(q="wormbase:WBGene00019362", species=6239)

Expand Down

0 comments on commit 53b752b

Please sign in to comment.