From 5411a64a68324ff5c4bc6898ec765df299870626 Mon Sep 17 00:00:00 2001 From: carlosribas Date: Mon, 18 Nov 2019 15:45:34 +0000 Subject: [PATCH 1/8] Add new field to check when a sequence should be searched --- sequence_search/db/models.py | 78 ++++++++++++++++++------------------ 1 file changed, 40 insertions(+), 38 deletions(-) diff --git a/sequence_search/db/models.py b/sequence_search/db/models.py index b43ece0a..d68cecf4 100644 --- a/sequence_search/db/models.py +++ b/sequence_search/db/models.py @@ -74,53 +74,54 @@ class CONSUMER_STATUS_CHOICES(object): # TODO: consistent naming for tables: either 'jobs' and 'consumers' or 'job' and 'consumer' """State of a consumer instance""" Consumer = sa.Table('consumer', metadata, - sa.Column('ip', sa.String(20), primary_key=True), - sa.Column('status', sa.String(255)), # choices=CONSUMER_STATUS_CHOICES, default='available' - sa.Column('job_chunk_id', sa.ForeignKey('job_chunks.id')), - sa.Column('port', sa.String(10))) + sa.Column('ip', sa.String(20), primary_key=True), + sa.Column('status', sa.String(255)), # choices=CONSUMER_STATUS_CHOICES, default='available' + sa.Column('job_chunk_id', sa.ForeignKey('job_chunks.id')), + sa.Column('port', sa.String(10))) """A search job that is divided into multiple job chunks per database""" Job = sa.Table('jobs', metadata, - sa.Column('id', sa.String(36), primary_key=True), - sa.Column('query', sa.Text), - sa.Column('description', sa.Text, nullable=True), - sa.Column('ordering', sa.Text, nullable=True), - sa.Column('submitted', sa.DateTime), - sa.Column('finished', sa.DateTime, nullable=True), - sa.Column('status', sa.String(255))) # choices=JOB_STATUS_CHOICES, default='started' + sa.Column('id', sa.String(36), primary_key=True), + sa.Column('query', sa.Text), + sa.Column('description', sa.Text, nullable=True), + sa.Column('ordering', sa.Text, nullable=True), + sa.Column('submitted', sa.DateTime), + sa.Column('finished', sa.DateTime, nullable=True), + sa.Column('result_in_db', 
sa.Boolean), + sa.Column('status', sa.String(255))) # choices=JOB_STATUS_CHOICES """Part of the search job, run against a specific database and assigned to a specific consumer""" JobChunk = sa.Table('job_chunks', metadata, - sa.Column('id', sa.Integer, primary_key=True), - sa.Column('job_id', sa.String(36), sa.ForeignKey('jobs.id')), - sa.Column('database', sa.String(255)), - sa.Column('submitted', sa.DateTime, nullable=True), - sa.Column('finished', sa.DateTime, nullable=True), - sa.Column('consumer', sa.ForeignKey('consumer.ip'), nullable=True), - sa.Column('status', sa.String(255))) # choices=JOB_CHUNK_STATUS_CHOICES, default='started' + sa.Column('id', sa.Integer, primary_key=True), + sa.Column('job_id', sa.String(36), sa.ForeignKey('jobs.id')), + sa.Column('database', sa.String(255)), + sa.Column('submitted', sa.DateTime, nullable=True), + sa.Column('finished', sa.DateTime, nullable=True), + sa.Column('consumer', sa.ForeignKey('consumer.ip'), nullable=True), + sa.Column('status', sa.String(255))) # choices=JOB_CHUNK_STATUS_CHOICES, default='started' """Result of a specific JobChunk""" JobChunkResult = sa.Table('job_chunk_results', metadata, - sa.Column('id', sa.Integer, primary_key=True), - sa.Column('job_chunk_id', None, sa.ForeignKey('job_chunks.id')), - sa.Column('rnacentral_id', sa.String(255)), - sa.Column('description', sa.Text, nullable=True), - sa.Column('score', sa.Float), - sa.Column('bias', sa.Float), - sa.Column('e_value', sa.Float), - sa.Column('target_length', sa.Integer), - sa.Column('alignment', sa.Text), - sa.Column('alignment_length', sa.Integer), - sa.Column('gap_count', sa.Integer), - sa.Column('match_count', sa.Integer), - sa.Column('nts_count1', sa.Integer), - sa.Column('nts_count2', sa.Integer), - sa.Column('identity', sa.Float), - sa.Column('query_coverage', sa.Float), - sa.Column('target_coverage', sa.Float), - sa.Column('gaps', sa.Float), - sa.Column('query_length', sa.Integer), - sa.Column('result_id', sa.Integer)) + sa.Column('id', 
sa.Integer, primary_key=True), + sa.Column('job_chunk_id', None, sa.ForeignKey('job_chunks.id')), + sa.Column('rnacentral_id', sa.String(255)), + sa.Column('description', sa.Text, nullable=True), + sa.Column('score', sa.Float), + sa.Column('bias', sa.Float), + sa.Column('e_value', sa.Float), + sa.Column('target_length', sa.Integer), + sa.Column('alignment', sa.Text), + sa.Column('alignment_length', sa.Integer), + sa.Column('gap_count', sa.Integer), + sa.Column('match_count', sa.Integer), + sa.Column('nts_count1', sa.Integer), + sa.Column('nts_count2', sa.Integer), + sa.Column('identity', sa.Float), + sa.Column('query_coverage', sa.Float), + sa.Column('target_coverage', sa.Float), + sa.Column('gaps', sa.Float), + sa.Column('query_length', sa.Integer), + sa.Column('result_id', sa.Integer)) # Migrations @@ -160,6 +161,7 @@ async def migrate(ENVIRONMENT): ordering TEXT, submitted TIMESTAMP, finished TIMESTAMP, + result_in_db BOOLEAN, status VARCHAR(255)) ''') From 9cb357ad71329579c42221ae40886d160f1d9758 Mon Sep 17 00:00:00 2001 From: carlosribas Date: Mon, 18 Nov 2019 15:51:50 +0000 Subject: [PATCH 2/8] Search or retrieve data from database --- sequence_search/db/jobs.py | 55 ++++++++---- sequence_search/producer/views/submit_job.py | 88 +++++++++++--------- 2 files changed, 84 insertions(+), 59 deletions(-) diff --git a/sequence_search/db/jobs.py b/sequence_search/db/jobs.py index bcdaa92c..1c3e0ccf 100644 --- a/sequence_search/db/jobs.py +++ b/sequence_search/db/jobs.py @@ -29,6 +29,28 @@ def __str__(self): return "Job '%s' not found" % self.job_id +async def sequence_exists(engine, query): + """ + Check if this query has already been searched + :param engine: params to connect to the db + :param query: the sequence that the user wants to search + :return: job_id if this query is in the db, otherwise returns none + """ + try: + async with engine.acquire() as connection: + try: + sql_query = sa.select([Job.c.id]).select_from(Job).where( + (Job.c.query == query) & 
Job.c.result_in_db + ) + async for row in connection.execute(sql_query): + return row[0] if row else None + except Exception as e: + raise SQLError("Failed to check if query exists for query = %s" % query) from e + except psycopg2.Error as e: + raise DatabaseConnectionError("Failed to open connection to the database in sequence_exists() for " + "sequence with query = %s" % query) from e + + async def get_job(engine, job_id): try: async with engine.acquire() as connection: @@ -81,8 +103,8 @@ async def save_job(engine, query, description): raise SQLError("Failed to save job for query = %s, " "description = %s to the database" % (query, description)) from e except psycopg2.Error as e: - raise DatabaseConnectionError("Failed to open connection to the database in " - "save_job() for job with job_id = %s" % job_id) from e + raise DatabaseConnectionError("Failed to open connection to the database in save_job() for job with " + "job_id = %s" % job_id) from e async def set_job_status(engine, job_id, status): @@ -96,11 +118,12 @@ async def set_job_status(engine, job_id, status): try: async with engine.acquire() as connection: try: - query = sa.text('''UPDATE jobs SET status = :status, finished = :finished WHERE id = :job_id''') - await connection.execute(query, job_id=job_id, status=status, finished=finished) + query = sa.text('''UPDATE jobs SET status = :status, finished = :finished, + result_in_db = :result_in_db WHERE id = :job_id''') + await connection.execute(query, job_id=job_id, status=status, finished=finished, result_in_db=True) except Exception as e: - raise SQLError("Failed to save job to the database about failed job, " - "job_id = %s, status = %s" % (job_id, status)) from e + raise SQLError("Failed to save job to the database about failed job, job_id = %s, " + "status = %s" % (job_id, status)) from e except psycopg2.Error as e: raise DatabaseConnectionError("Failed to open connection to the database in set_job_status() " "for job with job_id = %s" % job_id) from 
e @@ -125,18 +148,14 @@ async def get_jobs_statuses(engine): jobs_dict = {} async for row in connection.execute(query): if row.job_id not in jobs_dict: - jobs_dict[row.job_id] = { - 'id': row.job_id, - 'status': row.job_status, - 'submitted': str(row.submitted), - 'chunks': [ - { - 'database': row.database, - 'status': row.status, - 'consumer': row.consumer - } - ] - } + jobs_dict[row.job_id] = { + 'id': row.job_id, + 'status': row.job_status, + 'submitted': str(row.submitted), + 'chunks': [ + {'database': row.database, 'status': row.status, 'consumer': row.consumer} + ] + } else: jobs_dict[row.job_id]['chunks'].append({ 'database': row.database, diff --git a/sequence_search/producer/views/submit_job.py b/sequence_search/producer/views/submit_job.py index 7e3028cf..8a009bbc 100644 --- a/sequence_search/producer/views/submit_job.py +++ b/sequence_search/producer/views/submit_job.py @@ -18,7 +18,7 @@ from sequence_search.db.models import JOB_CHUNK_STATUS_CHOICES from ...db.consumers import delegate_job_chunk_to_consumer, find_available_consumers -from ...db.jobs import save_job +from ...db.jobs import save_job, sequence_exists from ...db.job_chunks import save_job_chunk, set_job_chunk_status from ...consumer.rnacentral_databases import producer_validator, producer_to_consumers_databases @@ -105,43 +105,49 @@ async def submit_job(request): except (KeyError, TypeError, ValueError) as e: raise web.HTTPBadRequest(text=str(e)) from e - # save metadata about this job and job_chunks to the database - job_id = await save_job(request.app['engine'], data['query'], data['description']) - - databases = producer_to_consumers_databases(data['databases']) - for database in databases: - # save job_chunk with "created" status. This prevents the check_chunks_and_consumers function, - # which runs every 5 seconds, from executing the same job_chunk again. - await save_job_chunk(request.app['engine'], job_id, database) - - # TODO: what if Job was saved and JobChunk was not? 
Need transactions? - - consumers = await find_available_consumers(request.app['engine']) - - # if there are consumers available, delegate job_chunk to consumer - for index in range(min(len(consumers), len(databases))): - try: - await delegate_job_chunk_to_consumer( - engine=request.app['engine'], - consumer_ip=consumers[index].ip, - consumer_port=consumers[index].port, - job_id=job_id, - database=databases[index], - query=data['query'] - ) - except Exception as e: - return web.HTTPBadGateway(text=str(e)) - - # change job_chunk status to pending if no consumer is available - for index in range(len(consumers), len(databases)): - try: - await set_job_chunk_status( - request.app['engine'], - job_id, - databases[index], - status=JOB_CHUNK_STATUS_CHOICES.pending - ) - except Exception as e: - return web.HTTPBadGateway(text=str(e)) - - return web.json_response({"job_id": job_id}, status=201) + # perform the search or get the data from the database? + job_id = await sequence_exists(request.app['engine'], data['query']) + + if job_id: + return web.json_response({"job_id": job_id}, status=201) + else: + # save metadata about this job and job_chunks to the database + job_id = await save_job(request.app['engine'], data['query'], data['description']) + + databases = producer_to_consumers_databases(data['databases']) + for database in databases: + # save job_chunk with "created" status. This prevents the check_chunks_and_consumers function, + # which runs every 5 seconds, from executing the same job_chunk again. + await save_job_chunk(request.app['engine'], job_id, database) + + # TODO: what if Job was saved and JobChunk was not? Need transactions? 
+ + consumers = await find_available_consumers(request.app['engine']) + + # if there are consumers available, delegate job_chunk to consumer + for index in range(min(len(consumers), len(databases))): + try: + await delegate_job_chunk_to_consumer( + engine=request.app['engine'], + consumer_ip=consumers[index].ip, + consumer_port=consumers[index].port, + job_id=job_id, + database=databases[index], + query=data['query'] + ) + except Exception as e: + return web.HTTPBadGateway(text=str(e)) + + # change job_chunk status to pending if no consumer is available + for index in range(len(consumers), len(databases)): + try: + await set_job_chunk_status( + request.app['engine'], + job_id, + databases[index], + status=JOB_CHUNK_STATUS_CHOICES.pending + ) + except Exception as e: + return web.HTTPBadGateway(text=str(e)) + + return web.json_response({"job_id": job_id}, status=201) From e5ed9429c3742d97a9e353982b0d0a412b01ad6b Mon Sep 17 00:00:00 2001 From: carlosribas Date: Mon, 18 Nov 2019 15:55:49 +0000 Subject: [PATCH 3/8] Ignoring hosts* --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index c15bda1c..a388271a 100644 --- a/.gitignore +++ b/.gitignore @@ -20,7 +20,7 @@ __pycache__/ *.pyo *.retry -ansible/hosts +ansible/hosts* ansible/producer_latest.dump node_modules/ ENV/ From f34882710a2ab3df40d722b932342e82ade19f9b Mon Sep 17 00:00:00 2001 From: carlosribas Date: Mon, 18 Nov 2019 16:17:12 +0000 Subject: [PATCH 4/8] Ignoring my tests --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index a388271a..c57bd019 100644 --- a/.gitignore +++ b/.gitignore @@ -29,3 +29,4 @@ sequence_search/consumer/results sequence_search/consumer/queries sequence_search/consumer/databases sequence_search/monitor/.conf-file +sequence_search/monitor/test* From 7d32e3abddab7c557f72728d373422423603924b Mon Sep 17 00:00:00 2001 From: carlosribas Date: Thu, 21 Nov 2019 17:22:18 +0000 Subject: [PATCH 5/8] 
Playbook to use after a new release --- ansible/postgres_new_release.yml | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 ansible/postgres_new_release.yml diff --git a/ansible/postgres_new_release.yml b/ansible/postgres_new_release.yml new file mode 100644 index 00000000..80eedd42 --- /dev/null +++ b/ansible/postgres_new_release.yml @@ -0,0 +1,11 @@ +--- +# Playbook to update the jobs table and allow users to search for a sequence that already has results in the database +- hosts: postgres + remote_user: centos + become: true + become_method: sudo + vars: + ansible_ssh_private_key_file: "../terraform/sequence_search_rsa" + tasks: + - name: Update jobs to allow users to search for any sequence + command: psql -U docker -c "UPDATE jobs SET result_in_db = NOT result_in_db WHERE result_in_db = TRUE" producer From 9b1a849d406cd50fe13c1215c9e7535bfb18df48 Mon Sep 17 00:00:00 2001 From: carlosribas Date: Mon, 25 Nov 2019 10:16:18 +0000 Subject: [PATCH 6/8] Docstring describing save_job_chunk function --- sequence_search/db/job_chunks.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/sequence_search/db/job_chunks.py b/sequence_search/db/job_chunks.py index 30021440..6d97a355 100644 --- a/sequence_search/db/job_chunks.py +++ b/sequence_search/db/job_chunks.py @@ -60,6 +60,14 @@ async def get_job_chunk_from_job_and_database(engine, job_id, database): async def save_job_chunk(engine, job_id, database): + """ + Jobs are divided into chunks and each chunk searches for sequences in a piece of the database. + Here we are saving a job chunk for a specific fasta file. 
+ :param engine: params to connect to the db + :param job_id: id of the job + :param database: fasta file with RNAcentral data + :return: id of the job chunk + """ try: async with engine.acquire() as connection: try: From e5f2c40843830e0f6937ba47f516cedc937e5b9d Mon Sep 17 00:00:00 2001 From: carlosribas Date: Mon, 25 Nov 2019 13:50:31 +0000 Subject: [PATCH 7/8] PEP8 fixes --- sequence_search/consumer/views/submit_job.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sequence_search/consumer/views/submit_job.py b/sequence_search/consumer/views/submit_job.py index 5b32bd35..86f21ee8 100644 --- a/sequence_search/consumer/views/submit_job.py +++ b/sequence_search/consumer/views/submit_job.py @@ -24,7 +24,7 @@ from ..rnacentral_databases import query_file_path, result_file_path, consumer_validator from ..settings import MAX_RUN_TIME from ...db import DatabaseConnectionError, SQLError -from ...db.models import CONSUMER_STATUS_CHOICES, JOB_STATUS_CHOICES, JOB_CHUNK_STATUS_CHOICES +from ...db.models import CONSUMER_STATUS_CHOICES, JOB_CHUNK_STATUS_CHOICES from ...db.job_chunk_results import set_job_chunk_results from ...db.job_chunks import get_consumer_ip_from_job_chunk, get_job_chunk_from_job_and_database, \ set_job_chunk_status, set_job_chunk_consumer @@ -152,7 +152,7 @@ async def submit_job(request): job_id = data['job_id'] sequence = data['sequence'] database = data['database'] - consumer_ip = get_ip(request.app) # 'host.docker.internal' + consumer_ip = get_ip(request.app) # 'host.docker.internal' # if request was successful, save the consumer state and job_chunk state to the database try: From 26e2340dfbf4136b3cd9face0cafa03be8fc2500 Mon Sep 17 00:00:00 2001 From: carlosribas Date: Mon, 25 Nov 2019 16:02:45 +0000 Subject: [PATCH 8/8] Updating the script to prevent retrieving database results --- sequence_search/monitor/monitor.sh | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git 
a/sequence_search/monitor/monitor.sh b/sequence_search/monitor/monitor.sh index d2c97948..f8681fa7 100755 --- a/sequence_search/monitor/monitor.sh +++ b/sequence_search/monitor/monitor.sh @@ -1,4 +1,5 @@ -# Script to regularly run a specific search and check the results +# Script to regularly run a search and check the results. +# Each job searches a different sequence with a different size to avoid retrieving database results. # -o allexport enables all following variable definitions to be exported. # +o allexport disables this feature. @@ -7,7 +8,11 @@ source $PWD/.conf-file set +o allexport CONTENT_TYPE="Content-Type: application/json" -DATABASE_AND_QUERY="{\"databases\": [\"mirbase\"], \"query\": \">sequence-search-test\nCUGUACUAUCUACUGUCUCUC\"}" +SIZE=$(( $RANDOM % 5 + 20 )) +NEW_SEQUENCE=$(cat /dev/urandom | env LC_CTYPE=C tr -dc ACGTU | head -c $SIZE) +DATABASES_LIST=("flybase" "gencode" "mirbase" "pdbe" "snopy" "srpdb") +DATABASE=${DATABASES_LIST[RANDOM%${#DATABASES_LIST[@]}]} +DATABASE_AND_QUERY="{\"databases\": [\"$DATABASE\"], \"query\": \">sequence-search-test\n$NEW_SEQUENCE\"}" # Run search on the correct target (test or default). 
# Ansible adds the correct floating_ip to the .conf-file @@ -25,11 +30,11 @@ if curl -s --head --request GET $HOST | grep "200 OK" > /dev/null; then PAUSE="0" if [ "$STATUS" == "started" ] || [ "$STATUS" == "pending" ] then - while [ $PAUSE -lt 30 ] + while [ $PAUSE -lt 300 ] do - sleep 60 + sleep 10 STATUS=$(curl -s ${HOST}/api/job-status/$JOB_ID | jq -r '.chunks | .[] | .status') - if [ "$STATUS" == "success" ] || [ "$STATUS" == "error" ] || [ "$STATUS" == "timeout" ] + if [ "$STATUS" == "success" ] || [ "$STATUS" = "partial_success" ] || [ "$STATUS" == "error" ] || [ "$STATUS" == "timeout" ] then break fi @@ -38,13 +43,12 @@ if curl -s --head --request GET $HOST | grep "200 OK" > /dev/null; then fi # Facets search - # Expected response: "hitCount = 23 and textSearchError = false" FACETS_SEARCH=$(curl -s ${HOST}/api/facets-search/$JOB_ID | jq '[.hitCount,.textSearchError]') HIT_COUNT=$(echo ${FACETS_SEARCH} | jq '.[0]') SEARCH_ERROR=$(echo ${FACETS_SEARCH} | jq '.[1]') # Send message in case of unexpected result - if [ "$STATUS" != "success" ] || [ "$HIT_COUNT" != "23" ] || [ "$SEARCH_ERROR" != "false" ] + if [ "$STATUS" != "success" ] || { [ "$HIT_COUNT" != "0" ] && [ "$SEARCH_ERROR" != "false" ]; } || { [ "$HIT_COUNT" == "0" ] && [ "$SEARCH_ERROR" != "true" ]; } # grouped: && and || have equal precedence in shell then text="Ops! There is something wrong with the RNAcentral sequence search. Please check the links below: \n \n @@ -52,7 +56,7 @@ if curl -s --head --request GET $HOST | grep "200 OK" > /dev/null; then The job status is ${STATUS} (the expected status is success). \n \n To check the results see: ${HOST}/api/facets-search/$JOB_ID \n - The job results are ${HIT_COUNT} and ${SEARCH_ERROR} (the expected results are 23 and false)." + The job results are ${HIT_COUNT} and ${SEARCH_ERROR}." escapedText=$(echo ${text} | sed 's/"/\"/g' | sed "s/'/\'/g" ) json="{\"text\": \"$escapedText\"}" curl -s -d "payload=$json" $WEBHOOK_URL