From 815042a56a3d6aa9603e4f37800e7271769dc125 Mon Sep 17 00:00:00 2001
From: Phil Budne
Date: Sun, 6 Mar 2022 17:37:32 -0500
Subject: [PATCH 01/18] Make no_dedup_sentences the default for
 extract-and-vector (can be overridden via the MC_NO_DEDUP_SENTENCES env var)

apps/common/src/python/mediawords/util/config/__init__.py: add env_bool function

apps/extract-and-vector/bin/extract_and_vector_worker.py: honor MC_NO_DEDUP_SENTENCES
---
 .../src/python/mediawords/util/config/__init__.py | 10 ++++++++++
 .../bin/extract_and_vector_worker.py              |  5 ++++-
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/apps/common/src/python/mediawords/util/config/__init__.py b/apps/common/src/python/mediawords/util/config/__init__.py
index 08f12feb8e..53819ff3e5 100644
--- a/apps/common/src/python/mediawords/util/config/__init__.py
+++ b/apps/common/src/python/mediawords/util/config/__init__.py
@@ -46,6 +46,16 @@ def env_value(name: str, required: bool = True, allow_empty_string: bool = False
     return value

+def env_bool(name: str, default: bool = False) -> bool:
+    """
+    Retrieve a boolean from an environment variable; the value should be 0 or 1.
+
+    :param name: Environment variable name.
+    :param default: Default value to use if the variable is not set.
+    """
+
+    value = os.environ.get(name, default)
+    return bool(int(value))

 def file_with_env_value(name: str, allow_empty_string: bool = False, encoded_with_base64: bool = False) -> str:
     """

diff --git a/apps/extract-and-vector/bin/extract_and_vector_worker.py b/apps/extract-and-vector/bin/extract_and_vector_worker.py
index 0738c6e200..7a21a67864 100755
--- a/apps/extract-and-vector/bin/extract_and_vector_worker.py
+++ b/apps/extract-and-vector/bin/extract_and_vector_worker.py
@@ -4,6 +4,7 @@

 from mediawords.db import connect_to_db
 from mediawords.job import JobBroker
+from mediawords.util.config import env_bool
 from mediawords.util.log import create_logger
 from mediawords.util.perl import decode_object_from_bytes_if_needed
 from extract_and_vector.dbi.stories.extractor_arguments import PyExtractorArguments
@@ -69,8 +70,10 @@ def run_extract_and_vector(stories_id: int, use_cache: bool = False, use_existin

     log.info("Extracting story {}...".format(stories_id))

+    no_dedup_sentences = env_bool('MC_NO_DEDUP_SENTENCES', True)
     try:
-        extractor_args = PyExtractorArguments(use_cache=use_cache, use_existing=use_existing)
+        extractor_args = PyExtractorArguments(use_cache=use_cache, use_existing=use_existing,
+                                              no_dedup_sentences=no_dedup_sentences)

         extract_and_process_story(db=db, story=story, extractor_args=extractor_args)

     except Exception as ex:

From c77dd185a959740930f6c4c4048c6ea9d3b62335 Mon Sep 17 00:00:00 2001
From: Phil Budne
Date: Fri, 18 Mar 2022 18:29:41 -0400
Subject: [PATCH 02/18] common/Dockerfile: skip jieba.cache creation; it
 creates an empty, root-owned file
---
 apps/common/Dockerfile | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/apps/common/Dockerfile b/apps/common/Dockerfile
index 08d99c3ef5..7fbb0b8c4a 100644
--- a/apps/common/Dockerfile
+++ b/apps/common/Dockerfile
@@ -139,11 +139,13 @@ RUN \
     done

 # Prebuild Jieba dictionary cache
-COPY bin/build_jieba_dict_cache.py /
-RUN \
-    /build_jieba_dict_cache.py && \
-    rm /build_jieba_dict_cache.py && \
-    true
+# PLB 2022-03-18: was creating an empty, root-owned file
+
+#COPY bin/build_jieba_dict_cache.py /
+#RUN \
+#    /build_jieba_dict_cache.py && \
+#    rm /build_jieba_dict_cache.py && \
+#    true

 # Symlink Log::Log4perl configuration to where it's going to be found
 RUN ln -s /opt/mediacloud/src/common/perl/log4perl.conf /etc/log4perl.conf

From f37b7f92729106d2afc5161e1d822c5da25b7be8 Mon Sep 17 00:00:00 2001
From: Phil Budne
Date: Fri, 18 Mar 2022 18:31:45 -0400
Subject: [PATCH 03/18] solr-base/Dockerfile: try cloning mediacloud config as
 mediacloud64
---
 apps/solr-base/Dockerfile | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/apps/solr-base/Dockerfile b/apps/solr-base/Dockerfile
index 0ff5015f9c..6c76d33d8e 100644
--- a/apps/solr-base/Dockerfile
+++ b/apps/solr-base/Dockerfile
@@ -19,5 +19,18 @@ RUN \
 RUN mkdir -p /usr/src/
 COPY src/solr/ /usr/src/solr/

+# Try to create 64-bit enabled mediacloud64 collection by cloning config
+# NOTE: collections/mediacloud/conf/solrconfig.xml uses
+#   ${mediacloud.luceneMatchVersion} ${mediacloud.solr_webapp_dir} ${mediacloud.solr_dist_dir}
+#   which reference JVM properties set in solr-shard/bin/solr-shard.sh
+# ALSO: core.properties has "instanceDir=/var/lib/solr/mediacloud" (dir does not exist?!)
+#   will be rewritten to .../mediacloud64 (also does not exist)
+RUN \
+    mkdir -p /usr/src/solr/collections/mediacloud64 && \
+    cp -rp /usr/src/solr/collections/mediacloud/* /usr/src/solr/collections/mediacloud64/ && \
+    sed -i.32 's/mediacloud/mediacloud64/' /usr/src/solr/collections/mediacloud64/core.properties && \
+    sed -i.32 '/

From c77dd185a959740930f6c4c4048c6ea9d3b62335 Mon Sep 17 00:00:00 2001
From: Phil Budne
Date: Tue, 22 Mar 2022 21:30:34 -0400
Subject: [PATCH 04/18] common/Dockerfile: re-enable jieba.cache creation &
 chown it
---
 apps/common/Dockerfile | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/apps/common/Dockerfile b/apps/common/Dockerfile
index 7fbb0b8c4a..b7a575490c 100644
--- a/apps/common/Dockerfile
+++ b/apps/common/Dockerfile
@@ -139,13 +139,13 @@ RUN \
     done

 # Prebuild Jieba dictionary cache
-# PLB 2022-03-18: was creating an empty, root-owned file
-
-#COPY bin/build_jieba_dict_cache.py /
-#RUN \
-#    /build_jieba_dict_cache.py && \
-#    rm /build_jieba_dict_cache.py && \
-#    true
+COPY bin/build_jieba_dict_cache.py /
+RUN \
+    /build_jieba_dict_cache.py && \
+    rm /build_jieba_dict_cache.py && \
+    chown mediacloud:mediacloud /var/tmp/jieba.cache && \
+    ls -l /var/tmp/jieba.cache && \
+    true

 # Symlink Log::Log4perl configuration to where it's going to be found
 RUN ln -s /opt/mediacloud/src/common/perl/log4perl.conf /etc/log4perl.conf

From 18682966e56f2caf24e09d07d50479a65bb61d35 Mon Sep 17 00:00:00 2001
From: Phil Budne
Date: Wed, 23 Mar 2022 16:54:51 -0400
Subject: [PATCH 05/18] apps/common/src/python/mediawords/solr/request.py:
 add/use SOLR_COLLECTION

Set to "mediacloud2" (solr alias!)
---
 apps/common/src/python/mediawords/solr/request.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/apps/common/src/python/mediawords/solr/request.py b/apps/common/src/python/mediawords/solr/request.py
index 5694c0e0da..4e79984c6a 100644
--- a/apps/common/src/python/mediawords/solr/request.py
+++ b/apps/common/src/python/mediawords/solr/request.py
@@ -24,6 +24,8 @@
 __QUERY_HTTP_TIMEOUT = 15 * 60
 """Timeout of a single HTTP query."""

+# Testing alias!!
+SOLR_COLLECTION = 'mediacloud2'

 class _AbstractSolrRequestException(Exception, metaclass=abc.ABCMeta):
     """Abstract .solr.request exception."""
@@ -59,7 +61,7 @@ def __wait_for_solr_to_start(config: Optional[CommonConfig]) -> None:
     """Wait for Solr to start and collections to become available, if needed."""

     # search for an empty or rare term here because searching for *:* sometimes causes a timeout for some reason
-    sample_select_url = f"{config.solr_url()}/mediacloud/select?q=BOGUSQUERYTHATRETURNSNOTHINGNADA&rows=1&wt=json"
+    sample_select_url = f"{config.solr_url()}/{SOLR_COLLECTION}/select?q=BOGUSQUERYTHATRETURNSNOTHINGNADA&rows=1&wt=json"

     connected = False
@@ -191,7 +193,7 @@ def solr_request(path: str,
     if not params:
         params = {}

-    abs_uri = furl(f"{solr_url}/mediacloud/{path}")
+    abs_uri = furl(f"{solr_url}/{SOLR_COLLECTION}/{path}")
     abs_uri = abs_uri.set(params)
     abs_url = str(abs_uri)

From 4ab37292d295722f64040b778cf20eef9106fa69 Mon Sep 17 00:00:00 2001
From: Phil Budne
Date: Wed, 23 Mar 2022 17:50:13 -0400
Subject: [PATCH 06/18] apps/import-solr-data/src/perl/MediaWords/Solr/Dump.pm:
 speedups for processing backlog

Raise FETCH_BLOCK_SIZE from 100 to 200: amortizes Citus connection
startup time by fetching larger blocks of text.

Honor the skip_update_snapshot option (defaults to true): skip setting
snapshots.searchable=true
---
 apps/import-solr-data/src/perl/MediaWords/Solr/Dump.pm | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/apps/import-solr-data/src/perl/MediaWords/Solr/Dump.pm b/apps/import-solr-data/src/perl/MediaWords/Solr/Dump.pm
index a260cb109c..223dea7fcd 100644
--- a/apps/import-solr-data/src/perl/MediaWords/Solr/Dump.pm
+++ b/apps/import-solr-data/src/perl/MediaWords/Solr/Dump.pm
@@ -55,7 +55,7 @@ Readonly my @SOLR_FIELDS => qw/stories_id media_id publish_date publish_day publ
     text title language processed_stories_id tags_id_stories timespans_id/;

 # how many sentences to fetch at a time from the postgres query
-Readonly my $FETCH_BLOCK_SIZE => 100;
+Readonly my $FETCH_BLOCK_SIZE => 200;

 # default time sleep when there are less than MIN_STORIES_TO_PROCESS:
 Readonly my $DEFAULT_THROTTLE => 60;
@@ -601,6 +601,7 @@ Options:
 * throttle -- sleep this number of seconds between each block of stories (default 60)
 * full -- shortcut for: update=false, empty_queue=true, throttle=1; assume and optimize for static queue
 * skip_logging -- skip logging the import into the solr_import_stories or solr_imports tables (default=false)
+* skip_update_snapshot -- skip setting snapshots.searchable=true (default=true)

 The import will run in blocks of "max_queued_stories" at a time. The function will keep trying to find stories to import.
If there are less than @@ -627,6 +628,7 @@ sub import_data($;$) my $empty_queue = $options->{ empty_queue } // 0; my $throttle = $options->{ throttle } // $DEFAULT_THROTTLE; my $skip_logging = $options->{ skip_logging } // 0; + my $skip_update_snapshot = $options->{ skip_update_snapshot } // 1; my $daemon = $options->{ daemon } // 0; $_last_max_queue_stories_id = 0; @@ -669,7 +671,7 @@ sub import_data($;$) _save_import_log( $db, $stories_ids ); } - if ( !$skip_logging ) + if ( !$skip_logging && !$skip_update_snapshot ) { _update_snapshot_solr_status( $db ); } From 791248a0edc00154759bef6fb6773d0667ea9918 Mon Sep 17 00:00:00 2001 From: Phil Budne Date: Wed, 23 Mar 2022 18:14:15 -0400 Subject: [PATCH 07/18] add apps/solr-base/src/solr/aliases.json with "mediacloud2" solr alias --- apps/solr-base/src/solr/aliases.json | 1 + 1 file changed, 1 insertion(+) create mode 100644 apps/solr-base/src/solr/aliases.json diff --git a/apps/solr-base/src/solr/aliases.json b/apps/solr-base/src/solr/aliases.json new file mode 100644 index 0000000000..c25d98d5cb --- /dev/null +++ b/apps/solr-base/src/solr/aliases.json @@ -0,0 +1 @@ +{"collection":{"mediacloud2":"mediacloud64,mediacloud"}} From c4fb3d600a7d53118806e0bbdbfb0b936a3d9426 Mon Sep 17 00:00:00 2001 From: Phil Budne Date: Thu, 24 Mar 2022 22:18:10 -0400 Subject: [PATCH 08/18] apps/common/src/requirements.txt: force MarkupSafe==2.0.1 (Jinja2 2.11.3 can't cope with the new MarkupSafe 2.1.1) --- apps/common/src/requirements.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/apps/common/src/requirements.txt b/apps/common/src/requirements.txt index 3bb17a43d9..6b8237199f 100644 --- a/apps/common/src/requirements.txt +++ b/apps/common/src/requirements.txt @@ -43,6 +43,10 @@ furl==2.1.0 # Chinese language tokenizer, stemmer, etc. jieba==0.42.1 +# For Jinja2 2.11.3, which requests MarkupSafe>=0.23 and is now +# getting version 2.1.1, which removed a deprecated function. 
+MarkupSafe==2.0.1
+
 # Parsing email templates
 Jinja2==2.11.3

From 5845a61c4ca3f879a6213a8240d3358c9d9faf6f Mon Sep 17 00:00:00 2001
From: Phil Budne
Date: Sun, 27 Mar 2022 17:35:06 -0400
Subject: [PATCH 09/18] solr-zookeeper: preload aliases.json into zookeeper
---
 apps/solr-zookeeper/bin/init_solr_config.sh | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/apps/solr-zookeeper/bin/init_solr_config.sh b/apps/solr-zookeeper/bin/init_solr_config.sh
index 752a435389..965dc25e3a 100755
--- a/apps/solr-zookeeper/bin/init_solr_config.sh
+++ b/apps/solr-zookeeper/bin/init_solr_config.sh
@@ -41,5 +41,12 @@ for collection_path in /usr/src/solr/collections/*; do
     fi
 done

+ALIASES=/usr/src/solr/aliases.json
+if [ -f $ALIASES ]; then
+    /opt/solr/server/scripts/cloud-scripts/zkcli.sh \
+        -zkhost 127.0.0.1:2181 \
+        -cmd putfile /aliases.json $ALIASES
+fi
+
 # Stop after initial configuration
 pkill java

From 40391ed953f633643ede7eb6a5b38bb3b63e24e6 Mon Sep 17 00:00:00 2001
From: Phil Budne
Date: Wed, 27 Apr 2022 12:19:40 -0400
Subject: [PATCH 10/18] apps/postgresql-server/bin/apply_migrations.sh:
 increase PGCTL_START_TIMEOUT to 3hrs
---
 apps/postgresql-server/bin/apply_migrations.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/apps/postgresql-server/bin/apply_migrations.sh b/apps/postgresql-server/bin/apply_migrations.sh
index bcc2d702e0..77267db3a7 100755
--- a/apps/postgresql-server/bin/apply_migrations.sh
+++ b/apps/postgresql-server/bin/apply_migrations.sh
@@ -14,7 +14,8 @@ MIGRATIONS_DIR="/opt/postgresql-server/pgmigrate/migrations"
 TEMP_PORT=12345

 # In case the database is in recovery, wait for up to 1 hour for it to complete
-PGCTL_START_TIMEOUT=3600
+# PLB: increased to three hours
+PGCTL_START_TIMEOUT=10800

 if [ ! -d "${MIGRATIONS_DIR}" ]; then
     echo "Migrations directory ${MIGRATIONS_DIR} does not exist."

From f3951163cc58b146ef336e2e0bf05d3d417e4e60 Mon Sep 17 00:00:00 2001
From: Phil Budne
Date: Fri, 6 May 2022 20:34:08 -0400
Subject: [PATCH 11/18] postgresql-pgbouncer/conf/pgbouncer.ini: PG server
 running on postgresql EC2 server w/o docker
---
 apps/postgresql-pgbouncer/conf/pgbouncer.ini | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/apps/postgresql-pgbouncer/conf/pgbouncer.ini b/apps/postgresql-pgbouncer/conf/pgbouncer.ini
index eb3f28662c..fd88573c47 100644
--- a/apps/postgresql-pgbouncer/conf/pgbouncer.ini
+++ b/apps/postgresql-pgbouncer/conf/pgbouncer.ini
@@ -1,5 +1,6 @@
 [databases]
-* = host=postgresql-server port=5432 user=mediacloud
+; PhilB 5/6/22: PG server running on postgresql EC2 server w/o docker
+* = host=postgresql. port=5432 user=mediacloud

 [pgbouncer]

From 76be84476b420bac15a37e34373a580d57863327 Mon Sep 17 00:00:00 2001
From: Phil Budne
Date: Sat, 7 May 2022 00:15:50 -0400
Subject: [PATCH 12/18] pgbouncer.ini: use postgresql server IP
---
 apps/postgresql-pgbouncer/conf/pgbouncer.ini | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/apps/postgresql-pgbouncer/conf/pgbouncer.ini b/apps/postgresql-pgbouncer/conf/pgbouncer.ini
index fd88573c47..f7a14b215d 100644
--- a/apps/postgresql-pgbouncer/conf/pgbouncer.ini
+++ b/apps/postgresql-pgbouncer/conf/pgbouncer.ini
@@ -1,6 +1,6 @@
 [databases]
 ; PhilB 5/6/22: PG server running on postgresql EC2 server w/o docker
-* = host=postgresql.
port=5432 user=mediacloud +* = host=172.30.0.58 port=5432 user=mediacloud [pgbouncer] From bf745541b996e0f7571e1b04c5c857958aed8aa6 Mon Sep 17 00:00:00 2001 From: Phil Budne Date: Wed, 18 May 2022 11:58:56 -0400 Subject: [PATCH 13/18] apps/webapp-api/src/perl/MediaWords/Controller/Api/V2/Timespans.pm Removed extra AND to try to fix #833 as suggested by Rahul. --- .../src/perl/MediaWords/Controller/Api/V2/Topics/Timespans.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/webapp-api/src/perl/MediaWords/Controller/Api/V2/Topics/Timespans.pm b/apps/webapp-api/src/perl/MediaWords/Controller/Api/V2/Topics/Timespans.pm index 3fbb4e236e..cb87851cc7 100644 --- a/apps/webapp-api/src/perl/MediaWords/Controller/Api/V2/Topics/Timespans.pm +++ b/apps/webapp-api/src/perl/MediaWords/Controller/Api/V2/Topics/Timespans.pm @@ -88,7 +88,7 @@ SQL snapshots_id FROM timespans AS t where - topics_id = ? AND + topics_id = ? $snapshot_clause $focus_clause $timespan_clause From 6654fbb39fef6873ae5008dc432c9d1a4b75fa09 Mon Sep 17 00:00:00 2001 From: Xavier Frankline Date: Mon, 7 Nov 2022 17:35:29 +0300 Subject: [PATCH 14/18] fix solr query on multiple collections --- .../src/python/mediawords/solr/request.py | 102 ++++++++++++++---- 1 file changed, 84 insertions(+), 18 deletions(-) diff --git a/apps/common/src/python/mediawords/solr/request.py b/apps/common/src/python/mediawords/solr/request.py index 4e79984c6a..589ef39e4d 100644 --- a/apps/common/src/python/mediawords/solr/request.py +++ b/apps/common/src/python/mediawords/solr/request.py @@ -4,6 +4,7 @@ import abc import time +import json from typing import Union, Optional from urllib.parse import urlencode @@ -26,6 +27,8 @@ # Testing alias!! SOLR_COLLECTION = 'mediacloud2' +MEDIACLOUD_32 = 'mediacloud' +MEDIACLOUD_64 = 'mediacloud64' class _AbstractSolrRequestException(Exception, metaclass=abc.ABCMeta): """Abstract .solr.request exception.""" @@ -154,6 +157,54 @@ def __solr_error_message_from_response(response: Response) -> str: return error_message +def merge_responses(dict1: dict,dict2: dict): + """ + Merge solr responses from each of the collections to one + + :param dict1: Response from mediacloud32 collection. + :param dict2: Response from mediacloud64 collection. 
+ + """ + new_response = {} + responseHeader = dict1["responseHeader"] + + new_response.update(responseHeader) + + #response + response = dict1["response"] + num_found = response["numFound"] + dict2["response"]["numFound"] + start = response["start"] + dict2["response"]["start"] + if response["docs"] is not None: + docs = response["docs"].extend(dict2["response"]["docs"]) + else: + docs = [] + + res_data = {"numFound":num_found,"start":start,"docs":docs} + response = {"response": res_data} + + new_response.update(response) + + #facets + facets = dict1["facets"] + count = facets["count"] + dict2["facets"]["count"] + if "categories" in facets: + if facets["categories"]["buckets"] is not None: + if "categories" in dict2["facets"]: + buckets = facets["categories"]["buckets"].extend(dict2["facets"]["categories"]["buckets"]) + else: + buckets = facets["categories"]["buckets"] + else: + buckets = [] + categories = {"buckets":buckets} + else: + categories = {} + + facets_data = {"count":count,"categories":categories} + facets = {"facets":facets_data} + new_response.update(facets) + + return new_response + def solr_request(path: str, params: SolrParams = None, content: Union[str, SolrParams] = None, @@ -193,10 +244,8 @@ def solr_request(path: str, if not params: params = {} - abs_uri = furl(f"{solr_url}/{SOLR_COLLECTION}/{path}") - abs_uri = abs_uri.set(params) - abs_url = str(abs_uri) - + collections = [MEDIACLOUD_32, MEDIACLOUD_64] + ua = UserAgent() ua.set_timeout(__QUERY_HTTP_TIMEOUT) ua.set_max_size(None) @@ -221,21 +270,38 @@ def solr_request(path: str, content_encoded = content.encode('utf-8', errors='replace') - request = Request(method='POST', url=abs_url) - request.set_header(name='Content-Type', value=content_type) - request.set_header(name='Content-Length', value=str(len(content_encoded))) - request.set_content(content_encoded) - + results = [] + for collection in collections: + abs_uri = furl(f"{solr_url}/{collection}/{path}") + abs_uri = abs_uri.set(params) + abs_url = str(abs_uri) + request = Request(method='POST', url=abs_url) + request.set_header(name='Content-Type', value=content_type) + request.set_header(name='Content-Length', value=str(len(content_encoded))) + request.set_content(content_encoded) + results.append(request) + else: - request = Request(method='GET', url=abs_url) + log.debug(f"Sending Solr request: {request}") + + responses = [] + if len(results) > 1: + for r in results: + response = ua.request(r) + if response.is_success(): + responses.append(response.decoded_content()) + else: + error_message = __solr_error_message_from_response(response=response) + raise McSolrRequestQueryErrorException(f"Error fetching Solr response: {error_message}") + + response = merge_responses(json.loads(responses[0]),json.loads(responses[1])) + return json.dumps(response) - log.debug(f"Sending Solr request: {request}") - - response = ua.request(request) - - if not response.is_success(): - error_message = __solr_error_message_from_response(response=response) - raise McSolrRequestQueryErrorException(f"Error fetching Solr response: {error_message}") + else: + response = ua.request(request) + if not response.is_success(): + error_message = __solr_error_message_from_response(response=response) + raise McSolrRequestQueryErrorException(f"Error fetching Solr response: {error_message}") - return response.decoded_content() + return response.decoded_content() From e6bc74d210c3ae6ba8099866ee215f926717079c Mon Sep 17 00:00:00 2001 From: Xavier Frankline Date: Thu, 10 Nov 2022 15:51:26 +0300 Subject: 
[PATCH 15/18] refactor merge solr function --- .../src/python/mediawords/solr/request.py | 67 ++++++++++--------- 1 file changed, 35 insertions(+), 32 deletions(-) diff --git a/apps/common/src/python/mediawords/solr/request.py b/apps/common/src/python/mediawords/solr/request.py index 589ef39e4d..4c288e483d 100644 --- a/apps/common/src/python/mediawords/solr/request.py +++ b/apps/common/src/python/mediawords/solr/request.py @@ -157,7 +157,7 @@ def __solr_error_message_from_response(response: Response) -> str: return error_message -def merge_responses(dict1: dict,dict2: dict): +def merge_responses(mc_32_bit_collection: dict,mc_64_bit_collection: dict): """ Merge solr responses from each of the collections to one @@ -166,45 +166,48 @@ def merge_responses(dict1: dict,dict2: dict): """ new_response = {} - responseHeader = dict1["responseHeader"] - new_response.update(responseHeader) + new_response.update(mc_32_bit_collection.get("responseHeader", {})) - #response - response = dict1["response"] - num_found = response["numFound"] + dict2["response"]["numFound"] - start = response["start"] + dict2["response"]["start"] - if response["docs"] is not None: - docs = response["docs"].extend(dict2["response"]["docs"]) - else: - docs = [] + mc_32_bit_response = mc_32_bit_collection.get("response", {}) + mc_64_bit_response = mc_64_bit_collection.get("response", {}) - res_data = {"numFound":num_found,"start":start,"docs":docs} - response = {"response": res_data} + num_found = mc_32_bit_response.get("numFound", 0) + mc_64_bit_response.get("numFound", 0) + start_index = mc_32_bit_response.get("start", 0) + mc_64_bit_response.get("start", 0) - new_response.update(response) + docs = [] - #facets - facets = dict1["facets"] - count = facets["count"] + dict2["facets"]["count"] - if "categories" in facets: - if facets["categories"]["buckets"] is not None: - if "categories" in dict2["facets"]: - buckets = facets["categories"]["buckets"].extend(dict2["facets"]["categories"]["buckets"]) - else: - buckets = facets["categories"]["buckets"] - else: - buckets = [] - categories = {"buckets":buckets} - else: - categories = {} - - facets_data = {"count":count,"categories":categories} - facets = {"facets":facets_data} - new_response.update(facets) + docs.extend(mc_32_bit_response.get("docs", [])).extend(mc_64_bit_response.get("docs", [])) + + new_response.update({ + "response": { + "numFound": num_found, + "start": start_index, + "docs": docs, + } + }) + + # facets + mc_32_bit_facets = mc_32_bit_response.get("facets", {}) + mc_64_bit_facets = mc_64_bit_response.get("facets", {}) + + count = mc_32_bit_facets.get("count", 0) + mc_64_bit_facets.get("count", 0) + + categories = {} + + if "categories" in mc_32_bit_facets: + if mc_32_bit_facets.get("categories", {}).get("buckets", []): + categories.update({"buckets": mc_32_bit_facets.get('categories', {}).get('buckets', []).extend(mc_64_bit_facets.get('categories', {}).get('buckets', []))}) + new_response.update({ + "facets": { + "count": count, + "categories": categories, + } + }) return new_response + def solr_request(path: str, params: SolrParams = None, content: Union[str, SolrParams] = None, From bc3c626392c50ff47aaca258caeefad274fe2e6c Mon Sep 17 00:00:00 2001 From: Xavier Frankline Date: Thu, 9 Feb 2023 09:41:51 +0300 Subject: [PATCH 16/18] fix solr response schema --- .../src/python/mediawords/solr/request.py | 50 ++++++++++++------- 1 file changed, 33 insertions(+), 17 deletions(-) diff --git a/apps/common/src/python/mediawords/solr/request.py 
b/apps/common/src/python/mediawords/solr/request.py
index 4c288e483d..ce492042c4 100644
--- a/apps/common/src/python/mediawords/solr/request.py
+++ b/apps/common/src/python/mediawords/solr/request.py
@@ -177,7 +177,8 @@ def merge_responses(mc_32_bit_collection: dict,mc_64_bit_collection: dict):

     docs = []

-    docs.extend(mc_32_bit_response.get("docs", [])).extend(mc_64_bit_response.get("docs", []))
+    docs.extend(mc_32_bit_response.get("docs", []))
+    docs.extend(mc_64_bit_response.get("docs", []))

     new_response.update({
         "response": {
@@ -188,22 +189,37 @@ def merge_responses(mc_32_bit_collection: dict,mc_64_bit_collection: dict):
     })

     # facets
-    mc_32_bit_facets = mc_32_bit_response.get("facets", {})
-    mc_64_bit_facets = mc_64_bit_response.get("facets", {})
-
-    count = mc_32_bit_facets.get("count", 0) + mc_64_bit_facets.get("count", 0)
-
-    categories = {}
-
-    if "categories" in mc_32_bit_facets:
-        if mc_32_bit_facets.get("categories", {}).get("buckets", []):
-            categories.update({"buckets": mc_32_bit_facets.get('categories', {}).get('buckets', []).extend(mc_64_bit_facets.get('categories', {}).get('buckets', []))})
-    new_response.update({
-        "facets": {
-            "count": count,
-            "categories": categories,
-        }
-    })
+    if "facets" in mc_32_bit_collection:
+        mc_32_bit_facets = mc_32_bit_response.get("facets", {})
+        mc_64_bit_facets = mc_64_bit_response.get("facets", {})
+
+        count = mc_32_bit_facets.get("count", 0) + mc_64_bit_facets.get("count", 0)
+        x = mc_32_bit_facets.get("x", 0) + mc_64_bit_facets.get("x", 0)
+
+        categories = {}
+
+        if "categories" in mc_32_bit_facets:
+            buckets = []
+            mc_32_buckets = mc_32_bit_facets.get("categories", {}).get("buckets", [])
+            mc_64_buckets = mc_64_bit_facets.get("categories", {}).get("buckets", [])
+            buckets.extend(mc_32_buckets)
+            buckets.extend(mc_64_buckets)
+
+            categories.update({"buckets":buckets})
+
+            new_response.update({
+                "facets": {
+                    "count": count,
+                    "categories": categories
+                }
+            })
+        else:
+            new_response.update({
+                "facets": {
+                    "count": count,
+                    "x": x
+                }
+            })

From e8bb5a8f998237ae26910009a034e7da2f374def Mon Sep 17 00:00:00 2001
From: Xavier Frankline
Date: Tue, 7 Mar 2023 15:40:16 +0300
Subject: [PATCH 17/18] update solr merge function
---
 apps/common/src/python/mediawords/solr/request.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/apps/common/src/python/mediawords/solr/request.py b/apps/common/src/python/mediawords/solr/request.py
index ce492042c4..2f8fa1b7e9 100644
--- a/apps/common/src/python/mediawords/solr/request.py
+++ b/apps/common/src/python/mediawords/solr/request.py
@@ -189,7 +189,7 @@ def merge_responses(mc_32_bit_collection: dict,mc_64_bit_collection: dict):
     })

     # facets
-    if "facets" in mc_32_bit_collection:
+    if "facets" in mc_32_bit_collection or "facets" in mc_64_bit_collection:
         mc_32_bit_facets = mc_32_bit_response.get("facets", {})
         mc_64_bit_facets = mc_64_bit_response.get("facets", {})

@@ -198,7 +198,7 @@ def merge_responses(mc_32_bit_collection: dict,mc_64_bit_collection: dict):

     categories = {}

-    if "categories" in mc_32_bit_facets:
+    if "categories" in mc_32_bit_facets or "categories" in mc_64_bit_facets:
         buckets = []
         mc_32_buckets = mc_32_bit_facets.get("categories", {}).get("buckets", [])
         mc_64_buckets = mc_64_bit_facets.get("categories", {}).get("buckets", [])

From cb168c9c99180899ccccfc89f2efcd43e997fc21 Mon Sep 17 00:00:00 2001
From: Xavier Frankline
Date: Tue, 21 Mar 2023 17:32:01 +0300
Subject: [PATCH 18/18] fix duplicate date values in merged solr buckets
---
apps/common/src/python/mediawords/solr/request.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/apps/common/src/python/mediawords/solr/request.py b/apps/common/src/python/mediawords/solr/request.py index 2f8fa1b7e9..1036c32aaf 100644 --- a/apps/common/src/python/mediawords/solr/request.py +++ b/apps/common/src/python/mediawords/solr/request.py @@ -202,9 +202,17 @@ def merge_responses(mc_32_bit_collection: dict,mc_64_bit_collection: dict): buckets = [] mc_32_buckets = mc_32_bit_facets.get("categories", {}).get("buckets", []) mc_64_buckets = mc_64_bit_facets.get("categories", {}).get("buckets", []) - buckets.extend(mc_32_buckets) - buckets.extend(mc_64_buckets) - + merged = {} + for item in mc_32_buckets + mc_64_buckets: + val = item['val'] + if val in merged: + merged[val]['count'] += item['count'] + merged[val]['x'] += item['x'] + else: + merged[val] = item.copy() + + merged = list(merged.values()) + buckets.extend(merged) categories.update({"buckets":buckets}) new_response.update({
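
For reference, the two-collection merge that patches 14-18 converge on reduces to the standalone sketch below. It is illustrative, not code from the repository: the dict layout and sample data are assumptions, bucket entries are assumed to carry "val", "count" and "x" keys as in patch 18, and facets are read from the top level of each parsed response (which is where the patched membership test looks).

#!/usr/bin/env python3
# Standalone sketch of the mediacloud/mediacloud64 response merge
# (plain dicts, illustrative data; the in-tree version lives in
# apps/common/src/python/mediawords/solr/request.py).


def merge_buckets(buckets_32: list, buckets_64: list) -> list:
    # Collapse buckets sharing a "val" (e.g. the same publish date seen
    # in both collections), summing counts -- the dedup patch 18 adds.
    merged = {}
    for item in buckets_32 + buckets_64:
        val = item["val"]
        if val in merged:
            merged[val]["count"] += item["count"]
            merged[val]["x"] += item["x"]
        else:
            merged[val] = item.copy()
    return list(merged.values())


def merge_responses(r32: dict, r64: dict) -> dict:
    # Keep one response header, sum the result counts, concatenate docs.
    new_response = dict(r32.get("responseHeader", {}))
    resp32 = r32.get("response", {})
    resp64 = r64.get("response", {})
    new_response["response"] = {
        "numFound": resp32.get("numFound", 0) + resp64.get("numFound", 0),
        "start": resp32.get("start", 0) + resp64.get("start", 0),
        "docs": resp32.get("docs", []) + resp64.get("docs", []),
    }

    if "facets" in r32 or "facets" in r64:
        f32 = r32.get("facets", {})
        f64 = r64.get("facets", {})
        new_response["facets"] = {
            "count": f32.get("count", 0) + f64.get("count", 0),
            "categories": {
                "buckets": merge_buckets(
                    f32.get("categories", {}).get("buckets", []),
                    f64.get("categories", {}).get("buckets", []),
                )
            },
        }
    return new_response


if __name__ == "__main__":
    r32 = {"responseHeader": {"status": 0},
           "response": {"numFound": 2, "start": 0,
                        "docs": [{"stories_id": 1}, {"stories_id": 2}]},
           "facets": {"count": 2, "categories": {"buckets": [
               {"val": "2022-01-01", "count": 2, "x": 2}]}}}
    r64 = {"responseHeader": {"status": 0},
           "response": {"numFound": 1, "start": 0,
                        "docs": [{"stories_id": 3}]},
           "facets": {"count": 1, "categories": {"buckets": [
               {"val": "2022-01-01", "count": 1, "x": 1}]}}}
    merged = merge_responses(r32, r64)
    assert merged["response"]["numFound"] == 3
    assert len(merged["facets"]["categories"]["buckets"]) == 1
    print(merged)

Running it prints a single merged response: numFound and facet counts are summed, docs are concatenated, and buckets sharing a "val" collapse into one entry with summed counts, which is the duplicate-date fix of patch 18.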
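
The env_bool contract added in patch 01 can likewise be exercised on its own. A minimal sketch, assuming only standard os.environ semantics; the MC_NO_DEDUP_SENTENCES values below are illustrative:

import os

# env_bool as added in patch 01: unset -> default, set -> int-parsed.
def env_bool(name: str, default: bool = False) -> bool:
    value = os.environ.get(name, default)
    return bool(int(value))

os.environ["MC_NO_DEDUP_SENTENCES"] = "0"
assert env_bool("MC_NO_DEDUP_SENTENCES", True) is False  # explicit opt-out
del os.environ["MC_NO_DEDUP_SENTENCES"]
assert env_bool("MC_NO_DEDUP_SENTENCES", True) is True   # worker's default

An unset variable falls back to the default (the extract-and-vector worker passes True), and a set value must parse as an integer, so "0" and "1" are the supported forms; anything else raises ValueError.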