Skip to content

Commit

Permalink
increased weight for symbol in api query search and added tests
Browse files Browse the repository at this point in the history
  • Loading branch information
jal347 committed Aug 2, 2024
1 parent 3116a7b commit d91df91
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 69 deletions.
5 changes: 5 additions & 0 deletions src/tests/data_tests/test_2_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,11 @@ def test_260_order(self):
assert (10116, 367901) in genes # rat
assert (695, 9606) not in genes # this field should not be sorted

def test_261_order(self):
    """Check that the gene whose symbol is CTNNA2 is the first human hit."""
    # Only the top-ranked hit matters here, so pull it out once.
    top_hit = self.request("query?q=CTNNA2&species=human").json()["hits"][0]
    assert top_hit["_id"] == "1496"
    assert top_hit["symbol"] == "CTNNA2"


class TestQueryPOST(BiothingsDataTest):
host = "mygene.info"
Expand Down
111 changes: 42 additions & 69 deletions src/web/pipeline/legacy.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,21 +8,17 @@
def dismax(q):

_query = {

"tie_breaker": 0,
"boost": 1,
"queries": [
{
"function_score": {
"query": {
"match": {
"symbol": {
"query": q,
"analyzer": "whitespace_lowercase"
}
"symbol": {"query": q, "analyzer": "whitespace_lowercase"}
},
},
"weight": 5
"weight": 8,
}
},
{
Expand All @@ -32,8 +28,7 @@ def dismax(q):
# kinase 2" appears first
"match_phrase": {"name": q},
},
"weight": 4

"weight": 4,
}
},
{
Expand All @@ -43,24 +38,21 @@ def dismax(q):
"name": {
"query": q,
"operator": "and",
"analyzer": "whitespace_lowercase"
"analyzer": "whitespace_lowercase",
}
},
},
"weight": 3
"weight": 3,
}
},
{
"function_score": {
"query": {
"match": {
"unigene": {
"query": q,
"analyzer": "string_lowercase"
}
"unigene": {"query": q, "analyzer": "string_lowercase"}
}
},
"weight": 1.1
"weight": 1.1,
}
},
{
Expand All @@ -69,65 +61,53 @@ def dismax(q):
"multi_match": {
"query": q,
"fields": [
'refseq.rna',
'refseq.protein',
'accession.rna',
'accession.protein'
"refseq.rna",
"refseq.protein",
"accession.rna",
"accession.protein",
],
"operator": "or"
"operator": "or",
}
},
"weight": 1.1
"weight": 1.1,
}
},
{
"function_score": {
"query": {
"match": {
"go": {
"query": q,
"analyzer": "string_lowercase"
}
}
"match": {"go": {"query": q, "analyzer": "string_lowercase"}}
},
"weight": 1.1
"weight": 1.1,
}
},
{
"function_score": {
"query": {
"query_string": {
"query": q,
"default_operator": "AND"
},
"query_string": {"query": q, "default_operator": "AND"},
},
"weight": 1
"weight": 1,
}
}
]
},
],
}

if is_int(q):
_query['queries'] = [
_query["queries"] = [
{
"function_score": {
"query": {
"term": {"entrezgene": int(q)},
},
"weight": 8
"weight": 8,
}
}
]

return {
"query": {
"dis_max": _query
}
}
return {"query": {"dis_max": _query}}


def wildcard(q):
'''q should contains either * or ?, but not the first character.'''
"""q should contains either * or ?, but not the first character."""

_query = {
"tie_breaker": 0,
Expand Down Expand Up @@ -166,45 +146,41 @@ def wildcard(q):
}
}
},
]
],
}

return {
"query": {
"dis_max": _query
}
}
return {"query": {"dis_max": _query}}


def safe_genome_pos(s):
    """Convert a genomic position given as an int or a string to an int.

    Thousands separators (commas) in string input are removed before
    conversion:

        safe_genome_pos(1000) = 1000
        safe_genome_pos('1000') = 1000
        safe_genome_pos('10,000') = 10000

    Raises:
        ValueError: if ``s`` is neither an int nor a str, or if the string
            (after removing commas) is not a valid integer literal.
    """
    if isinstance(s, int):
        return s
    if isinstance(s, str):
        # int() itself raises ValueError for non-numeric text such as "abc".
        return int(s.replace(",", ""))
    raise ValueError('invalid type "%s" for "safe_genome_pos"' % type(s))


def interval(chrom, gstart, gend, assembly=None):
'''By default if assembly is None, the lastest assembly is used.
for some species (e.g. human) we support multiple assemblies,
exact assembly is passed as well.
'''
"""By default if assembly is None, the lastest assembly is used.
for some species (e.g. human) we support multiple assemblies,
exact assembly is passed as well.
"""
gstart = safe_genome_pos(gstart)
gend = safe_genome_pos(gend)
if chrom.lower().startswith('chr'):
if chrom.lower().startswith("chr"):
chrom = chrom[3:]

genomic_pos_field = "genomic_pos"
if assembly:
if assembly == 'hg19':
if assembly == "hg19":
genomic_pos_field = "genomic_pos_hg19"
if assembly == 'mm9':
if assembly == "mm9":
genomic_pos_field = "genomic_pos_mm9"

_query = {
Expand All @@ -213,15 +189,12 @@ def interval(chrom, gstart, gend, assembly=None):
"query": {
"bool": {
"must": [
{"term": {
genomic_pos_field + ".chr": chrom.lower()}},
{"range": {
genomic_pos_field + ".start": {"lte": gend}}},
{"range": {
genomic_pos_field + ".end": {"gte": gstart}}}
{"term": {genomic_pos_field + ".chr": chrom.lower()}},
{"range": {genomic_pos_field + ".start": {"lte": gend}}},
{"range": {genomic_pos_field + ".end": {"gte": gstart}}},
]
}
}
},
}
}
return dict(query=_query)

0 comments on commit d91df91

Please sign in to comment.