From 4ef5c41890ac766afd5c2c95f420a5cdb126edc5 Mon Sep 17 00:00:00 2001 From: Scott Collins Date: Mon, 4 Nov 2024 12:49:53 -0600 Subject: [PATCH 01/17] Added some Hysds-related entries to .gitignore --- .gitignore | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.gitignore b/.gitignore index 1d5d4528d..ce018c28c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,11 @@ +###################################################################### +# HySDS +###################################################################### + +celeryconfig.py +_alt_error.txt +_alt_traceback.txt + ###################################################################### # Terraform ###################################################################### From 5434a92144a291b1f52e40ae0edb8640ea457388 Mon Sep 17 00:00:00 2001 From: Scott Collins Date: Mon, 4 Nov 2024 12:54:07 -0600 Subject: [PATCH 02/17] Added data_subscriber/gcov package with basic catalog and query class definitions for NISAR GCOV products --- data_subscriber/gcov/__init__.py | 0 data_subscriber/gcov/gcov_catalog.py | 13 +++++++++++++ data_subscriber/gcov/gcov_query.py | 12 ++++++++++++ tests/unit/data_subscriber/test_catalog.py | 18 ++++++++++++++++++ 4 files changed, 43 insertions(+) create mode 100644 data_subscriber/gcov/__init__.py create mode 100644 data_subscriber/gcov/gcov_catalog.py create mode 100644 data_subscriber/gcov/gcov_query.py diff --git a/data_subscriber/gcov/__init__.py b/data_subscriber/gcov/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/data_subscriber/gcov/gcov_catalog.py b/data_subscriber/gcov/gcov_catalog.py new file mode 100644 index 000000000..688672333 --- /dev/null +++ b/data_subscriber/gcov/gcov_catalog.py @@ -0,0 +1,13 @@ + +from data_subscriber.catalog import ProductCatalog + +class NisarGcovProductCatalog(ProductCatalog): + """Cataloging class for downloaded NISAR GCOV Products.""" + NAME = "nisar_gcov_catalog" + ES_INDEX_PATTERNS = "nisar_gcov_catalog*" + + def process_query_result(self, query_result): + return [result['_source'] for result in (query_result or [])] + + def granule_and_revision(self, es_id): + return es_id.split('-r')[0], es_id.split('-r')[1] diff --git a/data_subscriber/gcov/gcov_query.py b/data_subscriber/gcov/gcov_query.py new file mode 100644 index 000000000..3b8d4eb54 --- /dev/null +++ b/data_subscriber/gcov/gcov_query.py @@ -0,0 +1,12 @@ +#!/usr/bin/env python3 + +import logging + +from data_subscriber.query import CmrQuery + +logger = logging.getLogger(__name__) + +class NisarGcovCmrQuery(CmrQuery): + + def __init__(self, args, token, es_conn, cmr, job_id, settings): + super().__init__(args, token, es_conn, cmr, job_id, settings) diff --git a/tests/unit/data_subscriber/test_catalog.py b/tests/unit/data_subscriber/test_catalog.py index 1de21ea68..3631cb3a4 100644 --- a/tests/unit/data_subscriber/test_catalog.py +++ b/tests/unit/data_subscriber/test_catalog.py @@ -5,6 +5,7 @@ from data_subscriber.cslc.cslc_catalog import CSLCProductCatalog from data_subscriber.cslc.cslc_catalog import CSLCStaticProductCatalog +from data_subscriber.gcov.gcov_catalog import NisarGcovProductCatalog from data_subscriber.hls.hls_catalog import HLSProductCatalog from data_subscriber.hls.hls_catalog import HLSSpatialProductCatalog from data_subscriber.rtc.rtc_catalog import RTCProductCatalog @@ -432,3 +433,20 @@ def test_cslc_static_product_catalog(): assert granule == "OPERA_L2_CSLC-S1-STATIC_T042-088897-IW1_20140403_S1A_v1.0.h5" assert revision == "50" + +def 
test_nisar_gcov_product_catalog():
+    """Tests for functionality specific to the NisarGcovProductCatalog class"""
+    nisar_gcov_product_catalog = NisarGcovProductCatalog()
+
+    assert nisar_gcov_product_catalog.NAME == "nisar_gcov_catalog"
+    assert nisar_gcov_product_catalog.ES_INDEX_PATTERNS == "nisar_gcov_catalog*"
+
+    assert isinstance(nisar_gcov_product_catalog.generate_es_index_name(), str)
+    assert nisar_gcov_product_catalog.generate_es_index_name().startswith("nisar_gcov_catalog-")
+
+    granule, revision = nisar_gcov_product_catalog.granule_and_revision(
+        es_id="NISAR_L2_PR_GCOV_001_001_A_000_2000_SHNA_A_20240609T045403_20240609T045413_T00777_M_F_J_777.h5-r86"
+    )
+
+    assert granule == "NISAR_L2_PR_GCOV_001_001_A_000_2000_SHNA_A_20240609T045403_20240609T045413_T00777_M_F_J_777.h5"
+    assert revision == "86"

From 8c13da7419a96189959de7c0a24761c6a2b1307b Mon Sep 17 00:00:00 2001
From: Scott Collins
Date: Mon, 4 Nov 2024 12:55:20 -0600
Subject: [PATCH 03/17] Added entries for NISAR GCOV to enumerations in
 data_subscriber/cmr.py

---
 data_subscriber/cmr.py | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/data_subscriber/cmr.py b/data_subscriber/cmr.py
index d538b20cf..2b638876d 100644
--- a/data_subscriber/cmr.py
+++ b/data_subscriber/cmr.py
@@ -1,18 +1,17 @@
 #!/usr/bin/env python3
 
 import logging
+import netrc
 import re
+from collections import namedtuple
 from datetime import datetime, timedelta
 from enum import Enum
 from typing import Iterable
-from collections import namedtuple
-import netrc
 
 import dateutil.parser
-from more_itertools import first_true
-
 from data_subscriber.aws_token import supply_token
 from data_subscriber.rtc import mgrs_bursts_collection_db_client as mbc_client
+from more_itertools import first_true
 from rtc_utils import rtc_granule_regex
 from tools.ops.cmr_audit import cmr_client
 from tools.ops.cmr_audit.cmr_client import cmr_requests_get, async_cmr_posts
@@ -30,6 +29,7 @@ class Collection(str, Enum):
     RTC_S1_V1 = "OPERA_L2_RTC-S1_V1"
     CSLC_S1_V1 = "OPERA_L2_CSLC-S1_V1"
     CSLC_S1_STATIC_V1 = "OPERA_L2_CSLC-S1-STATIC_V1"
+    NISAR_GCOV_BETA_V1 = "NISAR_L2_GCOV_BETA_V1"
 
 class Endpoint(str, Enum):
     OPS = "OPS"
@@ -42,6 +42,7 @@ class Provider(str, Enum):
     ASF_RTC = "ASF-RTC"
     ASF_CSLC = "ASF-CSLC"
     ASF_CSLC_STATIC = "ASF-CSLC-STATIC"
+    ASF_NISAR_GCOV = "ASF-NISAR-GCOV"
 
 class ProductType(str, Enum):
     HLS = "HLS"
@@ -49,6 +50,7 @@ class ProductType(str, Enum):
     SLC = "SLC"
     RTC = "RTC"
     CSLC = "CSLC"
     CSLC_STATIC = "CSLC_STATIC"
+    NISAR_GCOV = "NISAR_GCOV"
 
 CMR_TIME_FORMAT = "%Y-%m-%dT%H:%M:%SZ"
@@ -59,7 +61,8 @@ class ProductType(str, Enum):
     Collection.S1B_SLC: Provider.ASF.value,
     Collection.RTC_S1_V1: Provider.ASF.value,
     Collection.CSLC_S1_V1: Provider.ASF.value,
-    Collection.CSLC_S1_STATIC_V1: Provider.ASF.value
+    Collection.CSLC_S1_STATIC_V1: Provider.ASF.value,
+    Collection.NISAR_GCOV_BETA_V1: Provider.ASF.value
 }
 
 COLLECTION_TO_PROVIDER_TYPE_MAP = {
@@ -69,7 +72,8 @@ class ProductType(str, Enum):
     Collection.S1B_SLC: Provider.ASF.value,
     Collection.RTC_S1_V1: Provider.ASF_RTC.value,
     Collection.CSLC_S1_V1: Provider.ASF_CSLC.value,
-    Collection.CSLC_S1_STATIC_V1: Provider.ASF_CSLC_STATIC.value
+    Collection.CSLC_S1_STATIC_V1: Provider.ASF_CSLC_STATIC.value,
+    Collection.NISAR_GCOV_BETA_V1: Provider.ASF_NISAR_GCOV.value
 }
 
 COLLECTION_TO_PRODUCT_TYPE_MAP = {
@@ -79,7 +83,8 @@ class ProductType(str, Enum):
     Collection.S1B_SLC: ProductType.SLC.value,
     Collection.RTC_S1_V1: ProductType.RTC.value,
     Collection.CSLC_S1_V1: ProductType.CSLC.value,
-    
Collection.CSLC_S1_STATIC_V1: ProductType.CSLC_STATIC.value + Collection.CSLC_S1_STATIC_V1: ProductType.CSLC_STATIC.value, + Collection.NISAR_GCOV_BETA_V1: ProductType.NISAR_GCOV.value } COLLECTION_TO_EXTENSIONS_FILTER_MAP = { @@ -90,6 +95,7 @@ class ProductType(str, Enum): Collection.RTC_S1_V1: ["tif", "h5"], Collection.CSLC_S1_V1: ["h5"], Collection.CSLC_S1_STATIC_V1: ["h5"], + Collection.NISAR_GCOV_BETA_V1: ["h5"], "DEFAULT": ["tif", "h5"] } From 65a9727e792898c1f8972000a66ba2552be06998 Mon Sep 17 00:00:00 2001 From: Scott Collins Date: Mon, 4 Nov 2024 13:11:23 -0600 Subject: [PATCH 04/17] Refactored functions in query.py to utilize attributes assigned at object creation, rather than copies passed in as arguments --- data_subscriber/asf_cslc_download.py | 2 +- data_subscriber/cslc/cslc_query.py | 22 +++--- data_subscriber/daac_data_subscriber.py | 6 +- data_subscriber/query.py | 61 ++++++++--------- data_subscriber/rtc/rtc_query.py | 86 ++++++++++++------------ tests/data_subscriber/test_cslc_query.py | 12 ++-- tests/scenarios/cslc_query_test.py | 2 +- 7 files changed, 92 insertions(+), 99 deletions(-) diff --git a/data_subscriber/asf_cslc_download.py b/data_subscriber/asf_cslc_download.py index c8c2a3ff8..389225918 100644 --- a/data_subscriber/asf_cslc_download.py +++ b/data_subscriber/asf_cslc_download.py @@ -341,7 +341,7 @@ def query_cslc_static_files_for_cslc_batch(self, cslc_files, args, token, job_id cmr_query = CslcStaticCmrQuery(cslc_query_args, token, es_conn, cmr, job_id, settings) - result = cmr_query.run_query(cslc_query_args, token, es_conn, cmr, job_id, settings) + result = cmr_query.run_query() return result["download_granules"] diff --git a/data_subscriber/cslc/cslc_query.py b/data_subscriber/cslc/cslc_query.py index 983f6c5b4..495436994 100644 --- a/data_subscriber/cslc/cslc_query.py +++ b/data_subscriber/cslc/cslc_query.py @@ -15,7 +15,7 @@ from data_subscriber.url import cslc_unique_id from data_subscriber.cslc.cslc_catalog import KCSLCProductCatalog -K_MULT_FACTOR = 2 #TODO: This should be a setting in probably settings.yaml. +K_MULT_FACTOR = 2 # TODO: This should be a setting in probably settings.yaml. 
EARLIEST_POSSIBLE_CSLC_DATE = "2016-01-01T00:00:00Z" logger = logging.getLogger(__name__) @@ -368,12 +368,12 @@ def query_cmr_by_frame_and_dates(self, frame_id: int, args, token, cmr, setting return new_granules - def query_cmr(self, args, token, cmr, settings, timerange: DateTimeRange, now: datetime): + def query_cmr(self, timerange: DateTimeRange, now: datetime): # If we are in historical mode, we will query one frame worth at a time if self.proc_mode == "historical": frame_id = int(self.args.frame_id) - all_granules = self.query_cmr_by_frame_and_dates(frame_id, args, token, cmr, settings, now, timerange) + all_granules = self.query_cmr_by_frame_and_dates(frame_id, self.args, self.token, self.cmr, self.settings, now, timerange) # Get rid of any granules that aren't in the historical database sensing_datetime_days_index frame_id = int(self.args.frame_id) @@ -385,28 +385,28 @@ def query_cmr(self, args, token, cmr, settings, timerange: DateTimeRange, now: d # native_id search takes precedence over date range if both are specified elif self.proc_mode == "reprocessing": - if args.native_id is not None: - all_granules = self.query_cmr_by_native_id(args, token, cmr, settings, now, args.native_id) + if self.args.native_id is not None: + all_granules = self.query_cmr_by_native_id(self.args, self.token, self.cmr, self.settings, now, self.args.native_id) # Reprocessing by date range is a two-step process: # 1) Query CMR for all CSLC files in the date range specified # and create list of granules with unique frame_id-acquisition_cycle pairs # 2) Process each granule as if they were passed in as frame_ids and date ranges - elif args.start_date is not None and args.end_date is not None: + elif self.args.start_date is not None and self.args.end_date is not None: unique_frames_dates = set() # First get all CSLC files in the range specified and create a unique set of frame_ids that we need to query for. # Note the subtle difference between when the frame_id is specified and when it's not. if self.args.frame_id is not None: frame_id = int(self.args.frame_id) - granules = self.query_cmr_by_frame_and_dates(frame_id, args, token, cmr, settings, now, timerange) + granules = self.query_cmr_by_frame_and_dates(frame_id, self.args, self.token, self.cmr, self.settings, now, timerange) for granule in granules: _, _, acquisition_cycles, _ = parse_cslc_native_id(granule["granule_id"], self.burst_to_frames, self.disp_burst_map_hist) for _, acq_cycle in acquisition_cycles.items(): unique_frames_dates.add(f"{frame_id}-{acq_cycle}") else: - granules = query_cmr_cslc_blackout_polarization(args, token, cmr, settings, timerange, now, False, self.blackout_dates_obj, False, None) + granules = query_cmr_cslc_blackout_polarization(self.args, self.token, self.cmr, self.settings, timerange, now, False, self.blackout_dates_obj, False, None) for granule in granules: _, _, acquisition_cycles, _ = parse_cslc_native_id(granule["granule_id"], self.burst_to_frames, self.disp_burst_map_hist) for frame_id, acq_cycle in acquisition_cycles.items(): @@ -417,7 +417,7 @@ def query_cmr(self, args, token, cmr, settings, timerange: DateTimeRange, now: d # We could perform two queries so create a unique set of granules. 
for frame_id_acq in unique_frames_dates: frame_id, acquisition_cycle = frame_id_acq.split("-") - new_granules = self.query_cmr_by_frame_and_acq_cycle(int(frame_id), int(acquisition_cycle), args, token, cmr, settings, now) + new_granules = self.query_cmr_by_frame_and_acq_cycle(int(frame_id), int(acquisition_cycle), self.args, self.token, self.cmr, self.settings, now) all_granules.extend(new_granules) else: @@ -426,9 +426,9 @@ def query_cmr(self, args, token, cmr, settings, timerange: DateTimeRange, now: d else: # Forward processing if self.args.frame_id is not None: frame_id = int(self.args.frame_id) - all_granules = self.query_cmr_by_frame_and_dates(frame_id, args, token, cmr, settings, now, timerange) + all_granules = self.query_cmr_by_frame_and_dates(frame_id, self.args, self.token, self.cmr, self.settings, now, timerange) else: - all_granules = query_cmr_cslc_blackout_polarization(args, token, cmr, settings, timerange, now, False, self.blackout_dates_obj, False, None) + all_granules = query_cmr_cslc_blackout_polarization(self.args, self.token, self.cmr, self.settings, timerange, now, False, self.blackout_dates_obj, False, None) return all_granules diff --git a/data_subscriber/daac_data_subscriber.py b/data_subscriber/daac_data_subscriber.py index 072c2d557..8bd84f397 100644 --- a/data_subscriber/daac_data_subscriber.py +++ b/data_subscriber/daac_data_subscriber.py @@ -1,4 +1,5 @@ #!/usr/bin/env python3 + import argparse import asyncio import concurrent.futures @@ -124,17 +125,16 @@ def run_query(args: argparse.Namespace, token: str, es_conn: ProductCatalog, cmr cmr_query = CslcCmrQuery(args, token, es_conn, cmr, job_id, settings) elif product_type == ProductType.CSLC_STATIC: cmr_query = CslcStaticCmrQuery(args, token, es_conn, cmr, job_id, settings) - # RTC is a special case in that it needs to run asynchronously elif product_type == ProductType.RTC: cmr_query = RtcCmrQuery(args, token, es_conn, cmr, job_id, settings) - result = asyncio.run(cmr_query.run_query(args, token, es_conn, cmr, job_id, settings)) + result = asyncio.run(cmr_query.run_query()) return result else: raise ValueError(f'Unknown collection type "{args.collection}" provided') - return cmr_query.run_query(args, token, es_conn, cmr, job_id, settings) + return cmr_query.run_query() def run_download(args, token, es_conn, netloc, username, password, cmr, job_id): provider = (COLLECTION_TO_PROVIDER_TYPE_MAP[args.collection] diff --git a/data_subscriber/query.py b/data_subscriber/query.py index 19c79c0aa..3213c1aa0 100644 --- a/data_subscriber/query.py +++ b/data_subscriber/query.py @@ -9,20 +9,17 @@ import dateutil.parser from more_itertools import chunked -from data_subscriber.catalog import ProductCatalog from data_subscriber.cmr import (async_query_cmr, ProductType, DateTimeRange, COLLECTION_TO_PRODUCT_TYPE_MAP, COLLECTION_TO_PROVIDER_TYPE_MAP) +from data_subscriber.cslc.cslc_dependency import CSLCDependency +from data_subscriber.cslc_utils import split_download_batch_id, save_blocked_download_job from data_subscriber.geojson_utils import (localize_include_exclude, - filter_granules_by_regions, - download_from_s3) + filter_granules_by_regions) from data_subscriber.rtc.rtc_download_job_submitter import submit_rtc_download_job_submissions_tasks -from data_subscriber.cslc_utils import split_download_batch_id, save_blocked_download_job -from data_subscriber.cslc.cslc_dependency import CSLCDependency from data_subscriber.url import form_batch_id, _slc_url_to_chunk_id from hysds_commons.job_utils import submit_mozart_job -from 
util.conf_util import SettingsConf logger = logging.getLogger(__name__) @@ -41,20 +38,20 @@ def __init__(self, args, token, es_conn, cmr, job_id, settings): def validate_args(self): pass - def run_query(self, args, token, es_conn: ProductCatalog, cmr, job_id, settings): + def run_query(self): query_dt = datetime.now() now = datetime.utcnow() - query_timerange: DateTimeRange = get_query_timerange(args, now) + query_timerange: DateTimeRange = get_query_timerange(self.args, now) logger.info("CMR query STARTED") - granules = self.query_cmr(args, token, cmr, settings, query_timerange, now) + granules = self.query_cmr(query_timerange, now) logger.info("CMR query FINISHED") # Get rid of duplicate granules. This happens often for CSLC and TODO: probably RTC granules = self.eliminate_duplicate_granules(granules) - if args.smoke_run: - logger.info(f"{args.smoke_run=}. Restricting to 1 granule(s).") + if self.args.smoke_run: + logger.info(f"{self.args.smoke_run=}. Restricting to 1 granule(s).") granules = granules[:1] # If processing mode is historical, apply the include/exclude-region filtering @@ -62,8 +59,8 @@ def run_query(self, args, token, es_conn: ProductCatalog, cmr, job_id, settings) logging.info(f"Processing mode is historical so applying include and exclude regions...") # Fetch all necessary geojson files from S3 - localize_include_exclude(args) - granules[:] = filter_granules_by_regions(granules, args.include_regions, args.exclude_regions) + localize_include_exclude(self.args) + granules[:] = filter_granules_by_regions(granules, self.args.include_regions, self.args.exclude_regions) # TODO: This function only applies to CSLC, merge w RTC at some point # Generally this function returns the same granules as input but for CSLC (and RTC if also refactored), @@ -80,38 +77,35 @@ def run_query(self, args, token, es_conn: ProductCatalog, cmr, job_id, settings) #TODO: This function only applies to RTC, merge w CSLC at some point batch_id_to_products_map = self.refresh_index() - if args.subparser_name == "full": + if self.args.subparser_name == "full": logger.info( - f"{args.subparser_name=}. Skipping download job submission. Download will be performed directly.") + f"{self.args.subparser_name=}. Skipping download job submission. Download will be performed directly.") - if COLLECTION_TO_PRODUCT_TYPE_MAP[args.collection] == ProductType.RTC: - args.provider = COLLECTION_TO_PROVIDER_TYPE_MAP[args.collection] - args.batch_ids = self.affected_mgrs_set_id_acquisition_ts_cycle_indexes - elif COLLECTION_TO_PRODUCT_TYPE_MAP[args.collection] == ProductType.CSLC: - args.provider = COLLECTION_TO_PROVIDER_TYPE_MAP[args.collection] - args.chunk_size = args.k - args.batch_ids = list(set(granule["download_batch_id"] for granule in download_granules)) + if COLLECTION_TO_PRODUCT_TYPE_MAP[self.args.collection] == ProductType.RTC: + self.args.provider = COLLECTION_TO_PROVIDER_TYPE_MAP[self.args.collection] + self.args.batch_ids = self.affected_mgrs_set_id_acquisition_ts_cycle_indexes + elif COLLECTION_TO_PRODUCT_TYPE_MAP[self.args.collection] == ProductType.CSLC: + self.args.provider = COLLECTION_TO_PROVIDER_TYPE_MAP[self.args.collection] + self.args.chunk_size = self.args.k + self.args.batch_ids = list(set(granule["download_batch_id"] for granule in download_granules)) return {"download_granules": download_granules} - if args.no_schedule_download: - logger.info(f"{args.no_schedule_download=}. 
Forcefully skipping download job submission.") + if self.args.no_schedule_download: + logger.info(f"{self.args.no_schedule_download=}. Forcefully skipping download job submission.") return {"download_granules": download_granules} - if not args.chunk_size: - logger.info(f"{args.chunk_size=}. Insufficient chunk size. Skipping download job submission.") + if not self.args.chunk_size: + logger.info(f"{self.args.chunk_size=}. Insufficient chunk size. Skipping download job submission.") return - if COLLECTION_TO_PRODUCT_TYPE_MAP[args.collection] == ProductType.RTC: - job_submission_tasks = submit_rtc_download_job_submissions_tasks(batch_id_to_products_map.keys(), args, settings) + if COLLECTION_TO_PRODUCT_TYPE_MAP[self.args.collection] == ProductType.RTC: + job_submission_tasks = submit_rtc_download_job_submissions_tasks(batch_id_to_products_map.keys(), self.args, self.settings) results = asyncio.gather(*job_submission_tasks, return_exceptions=True) else: - #for g in download_granules: - # print(g["download_batch_id"]) job_submission_tasks = self.download_job_submission_handler(download_granules, query_timerange) results = job_submission_tasks - logger.info(f"{len(results)=}") logger.debug(f"{results=}") @@ -127,8 +121,8 @@ def run_query(self, args, token, es_conn: ProductCatalog, cmr, job_id, settings) "download_granules": download_granules } - def query_cmr(self, args, token, cmr, settings, timerange, now: datetime): - granules = asyncio.run(async_query_cmr(args, token, cmr, settings, timerange, now)) + def query_cmr(self, timerange, now: datetime): + granules = asyncio.run(async_query_cmr(self.args, self.token, self.cmr, self.settings, timerange, now)) return granules def eliminate_duplicate_granules(self, granules): @@ -218,7 +212,6 @@ def download_job_submission_handler(self, granules, query_timerange): else: raise ValueError(f"Can't use {self.args.collection=} to select grouping function.") - #print("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&", granule["download_batch_id"]) for filter_url in granule.get("filtered_urls"): if product_type == ProductType.CSLC: batch_id_to_urls_map[granule["download_batch_id"]].add(filter_url) diff --git a/data_subscriber/rtc/rtc_query.py b/data_subscriber/rtc/rtc_query.py index 3f800e139..54ee9a38b 100644 --- a/data_subscriber/rtc/rtc_query.py +++ b/data_subscriber/rtc/rtc_query.py @@ -1,11 +1,12 @@ + import asyncio import itertools -from itertools import chain import logging import re from collections import namedtuple, defaultdict from datetime import datetime, timedelta from functools import partial +from itertools import chain from pathlib import Path import dateutil.parser @@ -15,7 +16,6 @@ from data_subscriber.geojson_utils import localize_include_exclude, filter_granules_by_regions from data_subscriber.query import CmrQuery from data_subscriber.rtc import mgrs_bursts_collection_db_client as mbc_client, evaluator -from data_subscriber.rtc.rtc_catalog import RTCProductCatalog from data_subscriber.rtc.rtc_download_job_submitter import submit_rtc_download_job_submissions_tasks from data_subscriber.url import determine_acquisition_cycle from geo.geo_util import does_bbox_intersect_region @@ -38,33 +38,33 @@ class RtcCmrQuery(CmrQuery): def __init__(self, args, token, es_conn, cmr, job_id, settings): super().__init__(args, token, es_conn, cmr, job_id, settings) - async def run_query(self, args, token, es_conn: RTCProductCatalog, cmr, job_id, settings): + async def run_query(self): query_dt = datetime.now() now = datetime.utcnow() - query_timerange: 
DateTimeRange = get_query_timerange(args, now) + query_timerange: DateTimeRange = get_query_timerange(self.args, now) logger.info("CMR query STARTED") - granules = await async_query_cmr(args, token, cmr, settings, query_timerange, now) + granules = await async_query_cmr(self.args, self.token, self.cmr, self.settings, query_timerange, now) logger.info("CMR query FINISHED") # If processing mode is historical, apply include/exclude-region filtering - if args.proc_mode == "historical": + if self.args.proc_mode == "historical": logger.info(f"Processing mode is historical so applying include and exclude regions...") # Fetch all necessary geojson files from S3 - localize_include_exclude(args) - granules[:] = filter_granules_by_regions(granules, args.include_regions, args.exclude_regions) + localize_include_exclude(self.args) + granules[:] = filter_granules_by_regions(granules, self.args.include_regions, self.args.exclude_regions) logger.info("catalogue-ing STARTED") affected_mgrs_set_id_acquisition_ts_cycle_indexes = set() - granules[:] = filter_granules_rtc(granules, args) + granules[:] = filter_granules_rtc(granules, self.args) logger.info(f"Filtered to {len(granules)} granules") mgrs = mbc_client.cached_load_mgrs_burst_db(filter_land=True) - if args.native_id: - match_native_id = re.match(rtc_granule_regex, args.native_id) + if self.args.native_id: + match_native_id = re.match(rtc_granule_regex, self.args.native_id) burst_id = mbc_client.product_burst_id_to_mapping_burst_id(match_native_id.group("burst_id")) native_id_mgrs_burst_set_ids = mbc_client.burst_id_to_mgrs_set_ids(mgrs, mbc_client.product_burst_id_to_mapping_burst_id(burst_id)) @@ -78,7 +78,7 @@ async def run_query(self, args, token, es_conn: RTCProductCatalog, cmr, job_id, additional_fields = {} additional_fields["revision_id"] = revision_id - additional_fields["processing_mode"] = args.proc_mode + additional_fields["processing_mode"] = self.args.proc_mode additional_fields["instrument"] = "S1A" if "S1A" in granule_id else "S1B" @@ -95,15 +95,15 @@ async def run_query(self, args, token, es_conn: RTCProductCatalog, cmr, job_id, mgrs_set_id_acquisition_ts_cycle_indexes = update_additional_fields_mgrs_set_id_acquisition_ts_cycle_indexes( acquisition_cycle, additional_fields, mgrs_burst_set_ids) - if args.native_id: # native-id supplied. don't affect adjacent burst sets, tossing out irrelevant burst sets + if self.args.native_id: # native-id supplied. 
don't affect adjacent burst sets, tossing out irrelevant burst sets matching_native_id_mgrs_burst_set_ids = list(set(native_id_mgrs_burst_set_ids) & set(mgrs_burst_set_ids)) update_affected_mgrs_set_ids(acquisition_cycle, affected_mgrs_set_id_acquisition_ts_cycle_indexes, matching_native_id_mgrs_burst_set_ids) else: update_affected_mgrs_set_ids(acquisition_cycle, affected_mgrs_set_id_acquisition_ts_cycle_indexes, mgrs_burst_set_ids) - es_conn.update_granule_index( + self.es_conn.update_granule_index( granule=granule, - job_id=job_id, + job_id=self.job_id, query_dt=query_dt, mgrs_set_id_acquisition_ts_cycle_indexes=mgrs_set_id_acquisition_ts_cycle_indexes, **additional_fields @@ -114,22 +114,22 @@ async def run_query(self, args, token, es_conn: RTCProductCatalog, cmr, job_id, succeeded = [] failed = [] logger.info("performing index refresh") - es_conn.refresh() + self.es_conn.refresh() logger.info("performed index refresh") logger.info("evaluating available burst sets") logger.info(f"{affected_mgrs_set_id_acquisition_ts_cycle_indexes=}") - if args.native_id: # limit query to the 1 or 2 affected sets in backlog + if self.args.native_id: # limit query to the 1 or 2 affected sets in backlog logger.info("Supplied native-id. Limiting evaluation") - min_num_bursts = args.coverage_target_num + min_num_bursts = self.args.coverage_target_num if not min_num_bursts: - min_num_bursts = settings["DSWX_S1_MINIMUM_NUMBER_OF_BURSTS_REQUIRED"] - coverage_target = args.coverage_target + min_num_bursts = self.settings["DSWX_S1_MINIMUM_NUMBER_OF_BURSTS_REQUIRED"] + coverage_target = self.args.coverage_target if coverage_target is None: - coverage_target = settings["DSWX_S1_COVERAGE_TARGET"] - grace_mins = args.grace_mins + coverage_target = self.settings["DSWX_S1_COVERAGE_TARGET"] + grace_mins = self.args.grace_mins if grace_mins is None: - grace_mins = settings["DSWX_S1_COLLECTION_GRACE_PERIOD_MINUTES"] + grace_mins = self.settings["DSWX_S1_COLLECTION_GRACE_PERIOD_MINUTES"] evaluator_results = evaluator.main( coverage_target=coverage_target, required_min_age_minutes_for_partial_burstsets=grace_mins, @@ -138,15 +138,15 @@ async def run_query(self, args, token, es_conn: RTCProductCatalog, cmr, job_id, ) else: # evaluate ALL sets in backlog logger.info("Performing full evaluation") - min_num_bursts = args.coverage_target_num + min_num_bursts = self.args.coverage_target_num if not min_num_bursts: - min_num_bursts = settings["DSWX_S1_MINIMUM_NUMBER_OF_BURSTS_REQUIRED"] - coverage_target = args.coverage_target + min_num_bursts = self.settings["DSWX_S1_MINIMUM_NUMBER_OF_BURSTS_REQUIRED"] + coverage_target = self.args.coverage_target if coverage_target is None: - coverage_target = settings["DSWX_S1_COVERAGE_TARGET"] - grace_mins = args.grace_mins + coverage_target = self.settings["DSWX_S1_COVERAGE_TARGET"] + grace_mins = self.args.grace_mins if grace_mins is None: - grace_mins = settings["DSWX_S1_COLLECTION_GRACE_PERIOD_MINUTES"] + grace_mins = self.settings["DSWX_S1_COLLECTION_GRACE_PERIOD_MINUTES"] evaluator_results = evaluator.main( coverage_target=coverage_target, required_min_age_minutes_for_partial_burstsets=grace_mins, @@ -172,27 +172,27 @@ async def run_query(self, args, token, es_conn: RTCProductCatalog, cmr, job_id, # doc needs to be associated with the batch. 
so filter the other doc that isn't part of this batch if product_doc["mgrs_set_id_acquisition_ts_cycle_index"] == batch_id: batch_id_to_products_map[batch_id][product_doc["id"]].append(product_doc) - if args.smoke_run: - logger.info(f"{args.smoke_run=}. Filtering to single batch") + if self.args.smoke_run: + logger.info(f"{self.args.smoke_run=}. Filtering to single batch") batch_id_to_products_map = dict(sorted(batch_id_to_products_map.items())[:1]) logger.info(f"num_batches={len(batch_id_to_products_map)}") - if args.subparser_name == "full": - logger.info(f"{args.subparser_name=}. Skipping download job submission. Download will be performed directly.") - args.provider = COLLECTION_TO_PROVIDER_TYPE_MAP[args.collection] - args.batch_ids = affected_mgrs_set_id_acquisition_ts_cycle_indexes + if self.args.subparser_name == "full": + logger.info(f"{self.args.subparser_name=}. Skipping download job submission. Download will be performed directly.") + self.args.provider = COLLECTION_TO_PROVIDER_TYPE_MAP[self.args.collection] + self.args.batch_ids = affected_mgrs_set_id_acquisition_ts_cycle_indexes return - if args.no_schedule_download: - logger.info(f"{args.no_schedule_download=}. Forcefully skipping download job submission.") + if self.args.no_schedule_download: + logger.info(f"{self.args.no_schedule_download=}. Forcefully skipping download job submission.") return - if not args.chunk_size: - logger.info(f"{args.chunk_size=}. Insufficient chunk size. Skipping download job submission.") + if not self.args.chunk_size: + logger.info(f"{self.args.chunk_size=}. Insufficient chunk size. Skipping download job submission.") return results = [] logger.info(f"Submitting batches for RTC download job: {list(batch_id_to_products_map)}") for batch_id, products_map in batch_id_to_products_map.items(): - job_submission_tasks = submit_rtc_download_job_submissions_tasks({batch_id: products_map}, args, settings) + job_submission_tasks = submit_rtc_download_job_submissions_tasks({batch_id: products_map}, self.args, self.settings) results_batch = await asyncio.gather(*job_submission_tasks, return_exceptions=True) results.extend(results_batch) @@ -204,11 +204,11 @@ async def run_query(self, args, token, es_conn: RTCProductCatalog, cmr, job_id, # use doc obj to pass params to elasticsearch client product["download_job_ids"] = first(suceeded_batch) - if args.dry_run: - logger.info(f"{args.dry_run=}. Skipping marking jobs as downloaded. Producing mock job ID") + if self.args.dry_run: + logger.info(f"{self.args.dry_run=}. Skipping marking jobs as downloaded. 
Producing mock job ID") pass else: - es_conn.mark_products_as_download_job_submitted({batch_id: batch_id_to_products_map[batch_id]}) + self.es_conn.mark_products_as_download_job_submitted({batch_id: batch_id_to_products_map[batch_id]}) succeeded.extend(suceeded_batch) failed.extend(failed_batch) diff --git a/tests/data_subscriber/test_cslc_query.py b/tests/data_subscriber/test_cslc_query.py index 6e38ec5bf..17da27124 100644 --- a/tests/data_subscriber/test_cslc_query.py +++ b/tests/data_subscriber/test_cslc_query.py @@ -10,7 +10,7 @@ from datetime import datetime from data_subscriber.cmr import DateTimeRange -forward_arguments = ["query", "-c", "OPERA_L2_CSLC-S1_V1", "--processing-mode=forward", "--start-date=2021-01-24T23:00:00Z",\ +forward_arguments = ["query", "-c", "OPERA_L2_CSLC-S1_V1", "--processing-mode=forward", "--start-date=2021-01-24T23:00:00Z", "--end-date=2021-01-25T00:00:00Z", "--grace-mins=60", "--k=4", "--m=4"] BURST_MAP = Path(__file__).parent / "opera-disp-s1-consistent-burst-ids-2024-10-14-2016-07-01_to_2024-09-04.json" @@ -39,7 +39,7 @@ def test_reprocessing_by_native_id(caplog): reproc_args = create_parser().parse_args(reprocessing_arguments) c_query = cslc_query.CslcCmrQuery(reproc_args, None, None, None, None, {"DEFAULT_DISP_S1_QUERY_GRACE_PERIOD_MINUTES": 60},BURST_MAP) - c_query.query_cmr(reproc_args, None, None, None, None, datetime.utcnow()) + c_query.query_cmr(None, datetime.utcnow()) assert ("native_id='OPERA_L2_CSLC-S1_T027-056778-IW1_20231008T133102Z_20231009T204457Z_S1A_VV_v1.0' is not found in the DISP-S1 Burst ID Database JSON. Nothing to process" in caplog.text) @@ -47,15 +47,15 @@ def test_reprocessing_by_native_id(caplog): def test_historical_query(caplog): ''' Tests historical query commands and high-level processing when specifying a frame range''' hist_arguments = ["query", "-c", "OPERA_L2_CSLC-S1_V1", "--processing-mode=historical", "--k=4", "--m=4", - "--start-date=2022-07-18T13:02:00Z", "--end-date=2022-07-19T13:01:53Z", "--frame-id=44044åå"] + "--start-date=2022-07-18T13:02:00Z", "--end-date=2022-07-19T13:01:53Z", "--frame-id=44044"] args = create_parser().parse_args(hist_arguments) query_timerange = DateTimeRange(args.start_date, args.end_date) c_query = cslc_query.CslcCmrQuery(args, None, None, None, None, {"DEFAULT_DISP_S1_QUERY_GRACE_PERIOD_MINUTES": 60}, BURST_MAP) # TODO: figure out how to test the query_cmr method - #await c_query.query_cmr(args, None, None, None, query_timerange, datetime.utcnow())å - #assert ("native_id='OPERA_L2_CSLC-S1_T027-056778-IW1_20231008T133102Z_20231009T204457Z_S1A_VV_v1.0å' is not found in the DISP-S1 Burst ID Database JSON. Nothing to process" + #await c_query.query_cmr(query_timerange, datetime.utcnow()) + #assert ("native_id='OPERA_L2_CSLC-S1_T027-056778-IW1_20231008T133102Z_20231009T204457Z_S1A_VV_v1.0' is not found in the DISP-S1 Burst ID Database JSON. 
Nothing to process" # in caplog.text) @pytest.mark.skip @@ -68,7 +68,7 @@ def test_reprocessing_by_dates(): query_timerange = DateTimeRange(reproc_args.start_date, reproc_args.end_date) c_query = cslc_query.CslcCmrQuery(reproc_args, None, None, None, None, {"DEFAULT_DISP_S1_QUERY_GRACE_PERIOD_MINUTES": 60},BURST_MAP) - cr = c_query.query_cmr(reproc_args, None, None, None, query_timerange, datetime.utcnow()) + cr = c_query.query_cmr(query_timerange, datetime.utcnow()) args = cr.cr_frame.f_locals["args"] assert args.collection == 'OPERA_L2_CSLC-S1_V1' assert args.start_date == '2021-01-24T23:00:00Z' diff --git a/tests/scenarios/cslc_query_test.py b/tests/scenarios/cslc_query_test.py index 9afd8d389..5dfa5ae95 100644 --- a/tests/scenarios/cslc_query_test.py +++ b/tests/scenarios/cslc_query_test.py @@ -190,7 +190,7 @@ def query_and_validate(current_args, test_range, blackout_dates, validation_data print("Querying with args: " + " ".join(current_args)) args = create_parser().parse_args(current_args) c_query = cslc_query.CslcCmrQuery(args, token, es_conn, cmr, "job_id", settings,None, blackout_dates) - q_result = c_query.run_query(args, token, es_conn, cmr, "job_id", settings) + q_result = c_query.run_query() q_result = q_result["download_granules"] # Main granules q_result.extend(c_query.k_retrieved_granules) # k granules print("+++++++++++++++++++++++++++++++++++++++++++++++") From 9c27a1743059e2db1521df49a0f81b4e740cb7cd Mon Sep 17 00:00:00 2001 From: Scott Collins Date: Mon, 4 Nov 2024 13:13:36 -0600 Subject: [PATCH 05/17] Removed silent argument from query.get_query_timerange() Also removed unused process_frame_burst_db() function --- data_subscriber/query.py | 27 ++++++++++----------------- data_subscriber/survey.py | 2 +- 2 files changed, 11 insertions(+), 18 deletions(-) diff --git a/data_subscriber/query.py b/data_subscriber/query.py index 3213c1aa0..602b1eb99 100644 --- a/data_subscriber/query.py +++ b/data_subscriber/query.py @@ -1,3 +1,5 @@ + +import argparse import asyncio import hashlib import logging @@ -422,27 +424,18 @@ def update_url_index( for filename, filename_urls in filename_to_urls_map.items(): es_conn.process_url(filename_urls, granule, job_id, query_dt, temporal_extent_beginning_dt, revision_date_dt, *args, **kwargs) -def get_query_timerange(args, now: datetime, silent=False): +def get_query_timerange(args: argparse.Namespace, now: datetime): now_minus_minutes_dt = ( - now - timedelta(minutes=args.minutes)) if not args.native_id else dateutil.parser.isoparse( - "1900-01-01T00:00:00Z") + now - timedelta(minutes=args.minutes) + if not args.native_id + else dateutil.parser.isoparse("1900-01-01T00:00:00Z") + ) start_date = args.start_date if args.start_date else now_minus_minutes_dt.strftime("%Y-%m-%dT%H:%M:%SZ") end_date = args.end_date if args.end_date else now.strftime("%Y-%m-%dT%H:%M:%SZ") query_timerange = DateTimeRange(start_date, end_date) - if not silent: - logger.info(f"{query_timerange=}") - return query_timerange -def process_frame_burst_db(): - settings = SettingsConf().cfg - bucket = settings["GEOJSON_BUCKET"] - - try: - for geojson in geojsons: - key = geojson.strip() + ".geojson" - # output_filepath = os.path.join(working_dir, key) - download_from_s3(bucket, key, key) - except Exception as e: - raise Exception("Exception while fetching geojson file: %s. 
" % key + str(e)) \ No newline at end of file + logger.debug(f"{query_timerange=}") + + return query_timerange diff --git a/data_subscriber/survey.py b/data_subscriber/survey.py index a991a53fa..89427f6b3 100644 --- a/data_subscriber/survey.py +++ b/data_subscriber/survey.py @@ -59,7 +59,7 @@ def run_survey(args, token, cmr, settings): args.start_date = start_str args.end_date = end_str - query_timerange: DateTimeRange = get_query_timerange(args, now, silent=True) + query_timerange: DateTimeRange = get_query_timerange(args, now) granules = _query_cmr_backoff(args, token, cmr, settings, query_timerange, now, disp_burst_map, silent=True) From 45bfab9a24b86667c0fda16fa6513df335322ec0 Mon Sep 17 00:00:00 2001 From: Scott Collins Date: Mon, 4 Nov 2024 13:15:39 -0600 Subject: [PATCH 06/17] Fixed issue where the --verbose option had no actual affect on what was logged by daac_data_subscriber.py --- data_subscriber/daac_data_subscriber.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/data_subscriber/daac_data_subscriber.py b/data_subscriber/daac_data_subscriber.py index 8bd84f397..b79e059f4 100644 --- a/data_subscriber/daac_data_subscriber.py +++ b/data_subscriber/daac_data_subscriber.py @@ -47,17 +47,21 @@ from util.exec_util import exec_wrapper from util.job_util import supply_job_id, is_running_outside_verdi_worker_context -logging.basicConfig(level="INFO") logger = logging.getLogger(__name__) - +logging.basicConfig(level="INFO") @exec_wrapper def main(): - configure_logger() run(sys.argv) -def configure_logger(): +def configure_logger(verbose=False): + global logger + + log_level = "DEBUG" if verbose else "INFO" + logger.setLevel(log_level) + logger.info("Log level set to %s", log_level) + logger_hysds_commons = logging.getLogger("hysds_commons") logger_hysds_commons.addFilter(NoJobUtilsFilter()) @@ -70,12 +74,13 @@ def configure_logger(): def run(argv: list[str]): - logger.info(f"{argv=}") parser = create_parser() args = parser.parse_args(argv[1:]) validate_args(args) + configure_logger(args.verbose) + es_conn = supply_es_conn(args) if args.file: @@ -83,10 +88,10 @@ def run(argv: list[str]): update_url_index(es_conn, f.readlines(), None, None, None) exit(0) - logger.info(f"{args=}") + logger.debug(f"daac_data_subscriber.py invoked with {args=}") job_id = supply_job_id() - logger.info(f"{job_id=}") + logger.debug(f"Using {job_id=}") settings = SettingsConf().cfg cmr, token, username, password, edl = get_cmr_token(args.endpoint, settings) @@ -107,7 +112,7 @@ def run(argv: list[str]): else: results["download"] = run_download(args, token, es_conn, netloc, username, password, cmr, job_id) - logger.info(f"{len(results)=}") + logger.debug(f"{len(results)=}") logger.debug(f"{results=}") logger.info("END") From fe064886ca04e1708da477b93bd96e0cf1c2704e Mon Sep 17 00:00:00 2001 From: Scott Collins Date: Mon, 4 Nov 2024 13:16:10 -0600 Subject: [PATCH 07/17] Added support for querying of NISAR GCOV products to daac_data_subscriber.py --- data_subscriber/daac_data_subscriber.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/data_subscriber/daac_data_subscriber.py b/data_subscriber/daac_data_subscriber.py index b79e059f4..f99f1370f 100644 --- a/data_subscriber/daac_data_subscriber.py +++ b/data_subscriber/daac_data_subscriber.py @@ -29,6 +29,8 @@ from data_subscriber.cslc.cslc_catalog import CSLCProductCatalog, CSLCStaticProductCatalog from data_subscriber.cslc.cslc_query import CslcCmrQuery from 
data_subscriber.cslc.cslc_static_query import CslcStaticCmrQuery +from data_subscriber.gcov.gcov_catalog import NisarGcovProductCatalog +from data_subscriber.gcov.gcov_query import NisarGcovCmrQuery from data_subscriber.hls.hls_catalog import HLSProductCatalog from data_subscriber.hls.hls_query import HlsCmrQuery from data_subscriber.lpdaac_download import DaacDownloadLpdaac @@ -135,7 +137,8 @@ def run_query(args: argparse.Namespace, token: str, es_conn: ProductCatalog, cmr cmr_query = RtcCmrQuery(args, token, es_conn, cmr, job_id, settings) result = asyncio.run(cmr_query.run_query()) return result - + elif product_type == ProductType.NISAR_GCOV: + cmr_query = NisarGcovCmrQuery(args, token, es_conn, cmr, job_id, settings) else: raise ValueError(f'Unknown collection type "{args.collection}" provided') @@ -305,15 +308,17 @@ def supply_es_conn(args): else args.provider) if provider == Provider.LPCLOUD: - es_conn = HLSProductCatalog(logging.getLogger(__name__)) + es_conn = HLSProductCatalog(logger) elif provider in (Provider.ASF, Provider.ASF_SLC): - es_conn = SLCProductCatalog(logging.getLogger(__name__)) + es_conn = SLCProductCatalog(logger) elif provider == Provider.ASF_RTC: - es_conn = RTCProductCatalog(logging.getLogger(__name__)) + es_conn = RTCProductCatalog(logger) elif provider == Provider.ASF_CSLC: - es_conn = CSLCProductCatalog(logging.getLogger(__name__)) + es_conn = CSLCProductCatalog(logger) elif provider == Provider.ASF_CSLC_STATIC: - es_conn = CSLCStaticProductCatalog(logging.getLogger(__name__)) + es_conn = CSLCStaticProductCatalog(logger) + elif provider == Provider.ASF_NISAR_GCOV: + es_conn = NisarGcovProductCatalog(logger) else: raise ValueError(f'Unsupported provider "{provider}"') From 37e85bf8f7f4cbbced05243ac58b9a889f0f8340 Mon Sep 17 00:00:00 2001 From: Scott Collins Date: Mon, 4 Nov 2024 15:19:18 -0600 Subject: [PATCH 08/17] Refactored use of logging throughout the data_subscriber package to utilize a singleton logger instance This allows logging by the data subscriber to configured within a single location and standardizes the logging behavior across the package --- commons/logger.py | 43 ++++++- data_subscriber/asf_cslc_download.py | 62 +++++----- data_subscriber/asf_download.py | 2 - data_subscriber/asf_rtc_download.py | 32 ++--- data_subscriber/asf_slc_download.py | 22 ++-- data_subscriber/aws_token.py | 6 +- data_subscriber/cmr.py | 6 +- data_subscriber/cslc/cslc_blackout.py | 23 ++-- data_subscriber/cslc/cslc_dependency.py | 27 +++-- data_subscriber/cslc/cslc_query.py | 51 ++++---- data_subscriber/cslc/cslc_static_query.py | 5 - data_subscriber/cslc_utils.py | 27 +++-- data_subscriber/daac_data_subscriber.py | 29 +---- data_subscriber/download.py | 30 ++--- data_subscriber/gcov/gcov_query.py | 4 - data_subscriber/geojson_utils.py | 28 +++-- data_subscriber/hls/hls_query.py | 4 +- data_subscriber/ionosphere_download.py | 11 +- data_subscriber/lpdaac_download.py | 43 ++++--- data_subscriber/query.py | 49 ++++---- .../rtc/mgrs_bursts_collection_db_client.py | 8 +- .../rtc/rtc_download_job_submitter.py | 5 +- data_subscriber/rtc/rtc_job_submitter.py | 8 +- data_subscriber/rtc/rtc_query.py | 114 ++++++++---------- data_subscriber/slc/slc_query.py | 5 +- data_subscriber/submit_pending_jobs.py | 32 ++--- data_subscriber/survey.py | 24 ++-- data_subscriber/url.py | 16 ++- tools/ops/cmr_audit/cmr_client.py | 33 ++--- util/job_util.py | 5 +- 30 files changed, 373 insertions(+), 381 deletions(-) diff --git a/commons/logger.py b/commons/logger.py index 089f77350..e7af3e989 
100644 --- a/commons/logger.py +++ b/commons/logger.py @@ -2,6 +2,8 @@ from enum import Enum +import boto3 + # set logger and custom filter to handle being run from sciflo log_format = "[%(asctime)s: %(levelname)s/%(funcName)s] %(message)s" logging.basicConfig(format=log_format, level=logging.INFO) @@ -43,12 +45,39 @@ def filter(self, record): record.id = "--" return True - logger = logging.getLogger("opera_pcm") -logger.setLevel(logging.INFO) logger.addFilter(LogFilter()) +logger_initialized = False +def get_logger(verbose=False, quiet=False): + global logger_initialized + + if not logger_initialized: + + if verbose: + log_level = LogLevels.DEBUG.value + elif quiet: + log_level = LogLevels.WARNING.value + else: + log_level = LogLevels.INFO.value + + if verbose: + log_format = '[%(asctime)s: %(levelname)s/%(module)s:%(funcName)s:%(lineno)d] %(message)s' + else: + log_format = "[%(asctime)s: %(levelname)s/%(funcName)s] %(message)s" + + logging.basicConfig(level=log_level, format=log_format, force=True) + + logger.addFilter(NoLogUtilsFilter()) + + logger_initialized = True + logger.info("Initial logging configuration complete") + logger.info("Log level set to %s", log_level) + + return logger + + class NoLogUtilsFilter(logging.Filter): """Filters out large JSON output of HySDS internals. Apply to any logger (typically __main__) or its @@ -91,3 +120,13 @@ def filter(self, record): and "/containers/_doc/" not in record.getMessage() \ and "/_search?" not in record.getMessage() \ and "/_update" not in record.getMessage() + + +def configure_library_loggers(): + logger_hysds_commons = logging.getLogger("hysds_commons") + logger_hysds_commons.addFilter(NoJobUtilsFilter()) + + logger_elasticsearch = logging.getLogger("elasticsearch") + logger_elasticsearch.addFilter(NoBaseFilter()) + + boto3.set_stream_logger(name='botocore.credentials', level=logging.ERROR) diff --git a/data_subscriber/asf_cslc_download.py b/data_subscriber/asf_cslc_download.py index 389225918..2a8289632 100644 --- a/data_subscriber/asf_cslc_download.py +++ b/data_subscriber/asf_cslc_download.py @@ -1,5 +1,4 @@ import copy -import logging import os import urllib.parse from datetime import datetime, timezone @@ -24,7 +23,6 @@ from data_subscriber.cslc.cslc_dependency import CSLCDependency from data_subscriber.cslc.cslc_blackout import DispS1BlackoutDates, localize_disp_blackout_dates -logger = logging.getLogger(__name__) _C_CSLC_ES_INDEX_PATTERNS = "grq_1_l2_cslc_s1_compressed*" @@ -41,7 +39,7 @@ def run_download(self, args, token, es_conn, netloc, username, password, cmr, jo settings = SettingsConf().cfg product_id = "_".join([batch_id for batch_id in args.batch_ids]) - logger.info(f"{product_id=}") + self.logger.info(f"{product_id=}") cslc_s3paths = [] c_cslc_s3paths = [] cslc_static_s3paths = [] @@ -61,7 +59,7 @@ def run_download(self, args, token, es_conn, netloc, username, password, cmr, jo new_args = copy.deepcopy(args) for batch_id in args.batch_ids: - logger.info(f"Downloading CSLC files for batch {batch_id}") + self.logger.info(f"Downloading CSLC files for batch {batch_id}") # Determine the highest acquisition cycle index here for later use in retrieving m compressed CSLCs _, acq_cycle_index = split_download_batch_id(batch_id) @@ -75,7 +73,7 @@ def run_download(self, args, token, es_conn, netloc, username, password, cmr, jo cslc_products_to_filepaths: dict[str, set[Path]] = super().run_download( new_args, token, es_conn, netloc, username, password, cmr, job_id, rm_downloads_dir=False ) - logger.info(f"Uploading CSLC input 
files to S3") + self.logger.info(f"Uploading CSLC input files to S3") cslc_files_to_upload = [fp for fp_set in cslc_products_to_filepaths.values() for fp in fp_set] cslc_s3paths.extend(concurrent_s3_client_try_upload_file(bucket=settings["DATASET_BUCKET"], key_prefix=f"tmp/disp_s1/{batch_id}", @@ -88,7 +86,7 @@ def run_download(self, args, token, es_conn, netloc, username, password, cmr, jo # For s3 we can use the files directly so simply copy over the paths else: # s3 or auto - logger.info("Skipping download CSLC bursts and instead using ASF S3 paths for direct SCIFLO PGE ingestion") + self.logger.info("Skipping download CSLC bursts and instead using ASF S3 paths for direct SCIFLO PGE ingestion") downloads = self.get_downloads(args, es_conn) cslc_s3paths = [download["s3_url"] for download in downloads] if len(cslc_s3paths) == 0: @@ -104,9 +102,9 @@ def run_download(self, args, token, es_conn, netloc, username, password, cmr, jo try: head_object = boto3.client("s3").head_object(Bucket=bucket, Key=key) - logger.info(f"Adding CSLC file: {p}") + self.logger.info(f"Adding CSLC file: {p}") except Exception as e: - logger.error("Failed when accessing the S3 object:" + p) + self.logger.error("Failed when accessing the S3 object:" + p) raise e file_size = int(head_object["ContentLength"]) @@ -125,27 +123,27 @@ def run_download(self, args, token, es_conn, netloc, username, password, cmr, jo to_mark_downloaded.append((unique_id, file_size)) burst_id_set.add(burst_id) - logger.info(f"Querying CSLC-S1 Static Layer products for {batch_id}") + self.logger.info(f"Querying CSLC-S1 Static Layer products for {batch_id}") cslc_static_granules = self.query_cslc_static_files_for_cslc_batch( cslc_files_to_upload, args, token, job_id, settings ) # Download the files from ASF only if the transfer protocol is HTTPS if args.transfer_protocol == "https": - logger.info(f"Downloading CSLC Static Layer products for {batch_id}") + self.logger.info(f"Downloading CSLC Static Layer products for {batch_id}") cslc_static_products_to_filepaths: dict[str, set[Path]] = self.download_cslc_static_files_for_cslc_batch( cslc_static_granules, args, token, netloc, username, password, job_id ) - logger.info("Uploading CSLC Static input files to S3") + self.logger.info("Uploading CSLC Static input files to S3") cslc_static_files_to_upload = [fp for fp_set in cslc_static_products_to_filepaths.values() for fp in fp_set] cslc_static_s3paths.extend(concurrent_s3_client_try_upload_file(bucket=settings["DATASET_BUCKET"], key_prefix=f"tmp/disp_s1/{batch_id}", files=cslc_static_files_to_upload)) # For s3 we can use the files directly so simply copy over the paths else: # s3 or auto - logger.info("Skipping download CSLC static files and instead using ASF S3 paths for direct SCIFLO PGE ingestion") + self.logger.info("Skipping download CSLC static files and instead using ASF S3 paths for direct SCIFLO PGE ingestion") cslc_static_products_to_filepaths = {} # Dummy when trying to delete files later in this function @@ -159,11 +157,11 @@ def run_download(self, args, token, es_conn, netloc, username, password, cmr, jo # Download all Ionosphere files corresponding to the dates covered by the input CSLC set # We always download ionosphere files, there is no direct S3 ingestion option - logger.info(f"Downloading Ionosphere files for {batch_id}") + self.logger.info(f"Downloading Ionosphere files for {batch_id}") ionosphere_paths = self.download_ionosphere_files_for_cslc_batch(cslc_files_to_upload, self.downloads_dir) - logger.info(f"Uploading Ionosphere 
files to S3") + self.logger.info(f"Uploading Ionosphere files to S3") # TODO: since all ionosphere files now go to the same S3 location, # it should be possible to do a lookup before redownloading a file ionosphere_s3paths.extend(concurrent_s3_client_try_upload_file(bucket=settings["DATASET_BUCKET"], @@ -172,7 +170,7 @@ def run_download(self, args, token, es_conn, netloc, username, password, cmr, jo # Delete the files from the file system after uploading to S3 if rm_downloads_dir: - logger.info("Removing downloaded files from local filesystem") + self.logger.info("Removing downloaded files from local filesystem") for fp_set in cslc_products_to_filepaths.values(): for fp in fp_set: os.remove(fp) @@ -186,7 +184,7 @@ def run_download(self, args, token, es_conn, netloc, username, password, cmr, jo # Determine M Compressed CSLCs by querying compressed cslc GRQ ES --------------> k, m = es_conn.get_k_and_m(args.batch_ids[0]) - logger.info(f"{k=}, {m=}") + self.logger.info(f"{k=}, {m=}") cslc_dependency = CSLCDependency(k, m, self.disp_burst_map, args, token, cmr, settings, self.blackout_dates_obj) @@ -197,14 +195,14 @@ def run_download(self, args, token, es_conn, netloc, username, password, cmr, jo for ccslc in ccslcs: cslc_path = ccslc["_source"]["metadata"]["product_s3_paths"] c_cslc_s3paths.extend(cslc_path) - logger.info(f"Adding {cslc_path} to c_cslc_s3paths") + self.logger.info(f"Adding {cslc_path} to c_cslc_s3paths") # Now acquire the Ionosphere files for the reference dates of the Compressed CSLC products - logger.info(f"Downloading Ionosphere files for Compressed CSLCs") + self.logger.info(f"Downloading Ionosphere files for Compressed CSLCs") ionosphere_paths = self.download_ionosphere_files_for_cslc_batch(c_cslc_s3paths, self.downloads_dir) - logger.info(f"Uploading Ionosphere files for Compressed CSLCs to S3") + self.logger.info(f"Uploading Ionosphere files for Compressed CSLCs to S3") ionosphere_s3paths.extend(concurrent_s3_client_try_upload_file(bucket=settings["DATASET_BUCKET"], key_prefix=f"tmp/disp_s1/ionosphere", files=ionosphere_paths)) @@ -218,12 +216,12 @@ def run_download(self, args, token, es_conn, netloc, username, password, cmr, jo print(f'{bounding_box=}') # Now submit DISP-S1 SCIFLO job - logger.info(f"Submitting DISP-S1 SCIFLO job") + self.logger.info(f"Submitting DISP-S1 SCIFLO job") save_compressed_cslc = False if cslc_dependency.determine_k_cycle(None, latest_acq_cycle_index, frame_id) == 0: save_compressed_cslc = True - logger.info(f"{save_compressed_cslc=}") + self.logger.info(f"{save_compressed_cslc=}") product = { "_id": product_id, @@ -284,7 +282,7 @@ def get_downloads(self, args, es_conn): Granules are stored in either cslc_catalog or k_cslc_catalog index. We assume that the latest batch_id (defined as the one with the greatest acq_cycle_index) is stored in cslc_catalog. 
The rest are stored in k_cslc_catalog.''' - k_es_conn = KCSLCProductCatalog(logging.getLogger(__name__)) + k_es_conn = KCSLCProductCatalog(self.logger) # Sort the batch_ids by acq_cycle_index batch_ids = sorted(args.batch_ids, key = lambda batch_id: split_download_batch_id(batch_id)[1]) @@ -293,25 +291,25 @@ def get_downloads(self, args, es_conn): # Historical mode stores all granules in normal cslc_catalog if "proc_mode" in args and args.proc_mode == "historical": - logger.info("Downloading cslc files for historical mode") + self.logger.info("Downloading cslc files for historical mode") for batch_id in batch_ids: downloads = es_conn.get_download_granule_revision(batch_id) - logger.info(f"Got {len(downloads)=} cslc downloads for {batch_id=}") + self.logger.info(f"Got {len(downloads)=} cslc downloads for {batch_id=}") assert len(downloads) > 0, f"No downloads found for batch_id={batch_id}!" all_downloads.extend(downloads) # Forward and reprocessing modes store all granules in k_cslc_catalog else: - logger.info("Downloading cslc files for forward/reprocessing mode") + self.logger.info("Downloading cslc files for forward/reprocessing mode") downloads = es_conn.get_download_granule_revision(batch_ids[-1]) - logger.info(f"Got {len(downloads)=} cslc granules downloads for batch_id={batch_ids[-1]}") + self.logger.info(f"Got {len(downloads)=} cslc granules downloads for batch_id={batch_ids[-1]}") assert len(downloads) > 0, f"No downloads found for batch_id={batch_ids[-1]}!" all_downloads.extend(downloads) # Download K-CSLC granules for batch_id in batch_ids[:-1]: downloads = k_es_conn.get_download_granule_revision(batch_id) - logger.info(f"Got {len(downloads)=} k cslc downloads for {batch_id=}") + self.logger.info(f"Got {len(downloads)=} k cslc downloads for {batch_id=}") assert len(downloads) > 0, f"No downloads found for batch_id={batch_id}!" 
all_downloads.extend(downloads) @@ -337,7 +335,7 @@ def query_cslc_static_files_for_cslc_batch(self, cslc_files, args, token, job_id cslc_query_args.max_revision = 1000 cslc_query_args.proc_mode = "forward" # CSLC static query does not care about # this, but a value must be provided - es_conn = CSLCStaticProductCatalog(logging.getLogger(__name__)) + es_conn = CSLCStaticProductCatalog(self.logger) cmr_query = CslcStaticCmrQuery(cslc_query_args, token, es_conn, cmr, job_id, settings) @@ -351,7 +349,7 @@ def download_cslc_static_files_for_cslc_batch(self, cslc_static_granules, args, session = SessionWithHeaderRedirection(username, password, netloc) downloads = [] - es_conn = CSLCStaticProductCatalog(logging.getLogger(__name__)) + es_conn = CSLCStaticProductCatalog(self.logger) for cslc_static_granule in cslc_static_granules: download_dict = { @@ -384,13 +382,13 @@ def download_ionosphere_files_for_cslc_batch(self, cslc_files, download_dir): downloaded_ionosphere_files = set() for cslc_file in cslc_files: - logger.info(f'Downloading Ionosphere file for CSLC granule {cslc_file}') + self.logger.info(f'Downloading Ionosphere file for CSLC granule {cslc_file}') cslc_file_tokens = cslc_file.split('_') acq_datetime = cslc_file_tokens[4] acq_date = acq_datetime.split('T')[0] - logger.debug(f'{acq_date=}') + self.logger.debug(f'{acq_date=}') if acq_date not in downloaded_ionosphere_dates: ionosphere_filepath = ionosphere_download.download_ionosphere_correction_file( @@ -400,7 +398,7 @@ def download_ionosphere_files_for_cslc_batch(self, cslc_files, download_dir): downloaded_ionosphere_dates.add(acq_date) downloaded_ionosphere_files.add(ionosphere_filepath) else: - logger.info(f'Already downloaded Ionosphere file for date {acq_date}, skipping...') + self.logger.info(f'Already downloaded Ionosphere file for date {acq_date}, skipping...') return downloaded_ionosphere_files diff --git a/data_subscriber/asf_download.py b/data_subscriber/asf_download.py index a6e3daf5f..569c86ca1 100644 --- a/data_subscriber/asf_download.py +++ b/data_subscriber/asf_download.py @@ -4,8 +4,6 @@ from datetime import datetime from pathlib import PurePath, Path -import backoff -import requests import requests.utils from data_subscriber.download import DaacDownload diff --git a/data_subscriber/asf_rtc_download.py b/data_subscriber/asf_rtc_download.py index d73a1be6c..767d06154 100644 --- a/data_subscriber/asf_rtc_download.py +++ b/data_subscriber/asf_rtc_download.py @@ -1,17 +1,14 @@ import concurrent.futures -import logging import os from collections import defaultdict from pathlib import PurePath, Path import requests.utils -from data_subscriber.download import DaacDownload from data_subscriber.catalog import ProductCatalog +from data_subscriber.download import DaacDownload from data_subscriber.url import _to_urls, _to_https_urls, _rtc_url_to_chunk_id -logger = logging.getLogger(__name__) - class AsfDaacRtcDownload(DaacDownload): @@ -19,19 +16,12 @@ def __init__(self, provider): super().__init__(provider) self.daac_s3_cred_settings_key = "RTC_DOWNLOAD" - def perform_download( - self, - session: requests.Session, - es_conn: ProductCatalog, - downloads: list[dict], - args, - token, - job_id - ): - logger.info(f"downloading {len(downloads)} documents") + def perform_download(self, session: requests.Session, es_conn: ProductCatalog, + downloads: list[dict], args, token, job_id): + self.logger.info(f"downloading {len(downloads)} documents") if args.dry_run: - logger.info(f"{args.dry_run=}. 
Skipping download.") + self.logger.info(f"{args.dry_run=}. Skipping download.") downloads = [] product_to_product_filepaths_map = defaultdict(set) @@ -51,14 +41,14 @@ def perform_download( download_id_to_downloads_map[download_id]["filesize"] += filesize for download in downloads: - logger.info(f"Marking as downloaded. {download['id']=}") + self.logger.info(f"Marking as downloaded. {download['id']=}") es_conn.mark_product_as_downloaded(download['id'], job_id, download_id_to_downloads_map[download["id"]]["filesize"]) - logger.info(f"downloaded {len(product_to_product_filepaths_map)} products") + self.logger.info(f"downloaded {len(product_to_product_filepaths_map)} products") return product_to_product_filepaths_map def perform_download_single(self, download, token, args, download_counter, num_downloads): - logger.info(f"Downloading {download_counter} of {num_downloads} downloads") + self.logger.info(f"Downloading {download_counter} of {num_downloads} downloads") if args.transfer_protocol == "https": product_urls = _to_https_urls(download) @@ -73,7 +63,7 @@ def perform_download_single(self, download, token, args, download_counter, num_d product_urls = [product_urls] for product_url in product_urls: - logger.info(f"Processing {product_url=}") + self.logger.info(f"Processing {product_url=}") product_id = _rtc_url_to_chunk_id(product_url, str(download['revision_id'])) product_download_dir = self.downloads_dir / product_id product_download_dir.mkdir(exist_ok=True) @@ -88,13 +78,13 @@ def perform_download_single(self, download, token, args, download_counter, num_d product_filepath = self.download_asf_product( product_url, token, product_download_dir ) - logger.info(f"{product_filepath=}") + self.logger.info(f"{product_filepath=}") list_product_id_product_filepath.append((product_id, product_filepath, download["id"], os.path.getsize(product_filepath))) return list_product_id_product_filepath def download_asf_product(self, product_url, token: str, target_dirpath: Path): - logger.info(f"Requesting from {product_url}") + self.logger.info(f"Requesting from {product_url}") asf_response = self._handle_url_redirect(product_url, token) asf_response.raise_for_status() diff --git a/data_subscriber/asf_slc_download.py b/data_subscriber/asf_slc_download.py index f77cb8104..ffe7415e2 100644 --- a/data_subscriber/asf_slc_download.py +++ b/data_subscriber/asf_slc_download.py @@ -1,7 +1,5 @@ import json -import logging import netrc -import os from datetime import datetime, timedelta from pathlib import PurePath, Path @@ -15,12 +13,10 @@ T_ORBIT, ORBIT_PAD) -logger = logging.getLogger(__name__) - class AsfDaacSlcDownload(DaacDownloadAsf): def download_orbit_file(self, dataset_dir, product_filepath, additional_metadata): - logger.info("Downloading associated orbit file") + self.logger.info("Downloading associated orbit file") # Get the PCM username/password for authentication to Copernicus Dataspace username, _, password = netrc.netrc().authenticators('dataspace.copernicus.eu') @@ -33,7 +29,7 @@ def download_orbit_file(self, dataset_dir, product_filepath, additional_metadata sensing_stop_range = safe_stop_datetime + timedelta(seconds=ORBIT_PAD) try: - logger.info(f"Querying for Precise Ephemeris Orbit (POEORB) file") + self.logger.info(f"Querying for Precise Ephemeris Orbit (POEORB) file") stage_orbit_file_args = stage_orbit_file.get_parser().parse_args( [ @@ -49,7 +45,7 @@ def download_orbit_file(self, dataset_dir, product_filepath, additional_metadata stage_orbit_file.main(stage_orbit_file_args) except 
(NoQueryResultsException, NoSuitableOrbitFileException): try: - logger.warning("POEORB file could not be found, querying for Restituted Orbit (ROEORB) file") + self.logger.warning("POEORB file could not be found, querying for Restituted Orbit (ROEORB) file") stage_orbit_file_args = stage_orbit_file.get_parser().parse_args( [ f"--output-directory={str(dataset_dir)}", @@ -63,9 +59,9 @@ def download_orbit_file(self, dataset_dir, product_filepath, additional_metadata ) stage_orbit_file.main(stage_orbit_file_args) except (NoQueryResultsException, NoSuitableOrbitFileException): - logger.warning("Single RESORB file could not be found, querying for consecutive RESORB files") + self.logger.warning("Single RESORB file could not be found, querying for consecutive RESORB files") - logger.info("Querying for RESORB with range [sensing_start - 1 min, sensing_end + 1 min]") + self.logger.info("Querying for RESORB with range [sensing_start - 1 min, sensing_end + 1 min]") sensing_start_range = safe_start_datetime - timedelta(seconds=ORBIT_PAD) sensing_stop_range = safe_stop_datetime + timedelta(seconds=ORBIT_PAD) @@ -82,7 +78,7 @@ def download_orbit_file(self, dataset_dir, product_filepath, additional_metadata ) stage_orbit_file.main(stage_orbit_file_args) - logger.info("Querying for RESORB with range [sensing_start – T_orb – 1 min, sensing_start – T_orb + 1 min]") + self.logger.info("Querying for RESORB with range [sensing_start – T_orb – 1 min, sensing_start – T_orb + 1 min]") sensing_start_range = safe_start_datetime - timedelta(seconds=T_ORBIT + ORBIT_PAD) sensing_stop_range = safe_start_datetime - timedelta(seconds=T_ORBIT - ORBIT_PAD) @@ -103,7 +99,7 @@ def download_orbit_file(self, dataset_dir, product_filepath, additional_metadata del username del password - logger.info("Added orbit file(s) to dataset") + self.logger.info("Added orbit file(s) to dataset") def download_ionosphere_file(self, dataset_dir, product_filepath): try: @@ -121,11 +117,11 @@ def download_ionosphere_file(self, dataset_dir, product_filepath): ) self.update_pending_dataset_metadata_with_ionosphere_metadata(dataset_dir, ionosphere_metadata) except IonosphereFileNotFoundException: - logger.warning("Ionosphere file not found remotely. Allowing job to continue.") + self.logger.warning("Ionosphere file not found remotely. 
Allowing job to continue.") pass def update_pending_dataset_metadata_with_ionosphere_metadata(self, dataset_dir: PurePath, ionosphere_metadata: dict): - logger.info("Updating dataset's met.json with ionosphere metadata") + self.logger.info("Updating dataset's met.json with ionosphere metadata") with Path(dataset_dir / f"{dataset_dir.name}.met.json").open("r") as fp: met_json: dict = json.load(fp) diff --git a/data_subscriber/aws_token.py b/data_subscriber/aws_token.py index 75dd5fe64..945f5ad79 100644 --- a/data_subscriber/aws_token.py +++ b/data_subscriber/aws_token.py @@ -1,10 +1,9 @@ -import logging from datetime import datetime import requests from requests.auth import HTTPBasicAuth -logger = logging.getLogger(__name__) +from commons.logger import get_logger def supply_token(edl: str, username: str, password: str) -> str: @@ -61,7 +60,10 @@ def _create_token(edl: str, username: str, password: str) -> str: def _delete_token(edl: str, username: str, password: str, token: str) -> None: + logger = get_logger() + url = f"https://{edl}/api/users/revoke_token" + try: resp = requests.post(url, auth=HTTPBasicAuth(username, password), params={"token": token}) diff --git a/data_subscriber/cmr.py b/data_subscriber/cmr.py index 2b638876d..419f37c87 100644 --- a/data_subscriber/cmr.py +++ b/data_subscriber/cmr.py @@ -9,6 +9,7 @@ from typing import Iterable import dateutil.parser +from commons.logger import get_logger from data_subscriber.aws_token import supply_token from data_subscriber.rtc import mgrs_bursts_collection_db_client as mbc_client from more_itertools import first_true @@ -16,8 +17,8 @@ from tools.ops.cmr_audit import cmr_client from tools.ops.cmr_audit.cmr_client import cmr_requests_get, async_cmr_posts -logger = logging.getLogger(__name__) -MAX_CHARS_PER_LINE = 250000 #This is the maximum number of characters per line you can display in cloudwatch logs +MAX_CHARS_PER_LINE = 250000 +"""The maximum number of characters per line you can display in cloudwatch logs""" DateTimeRange = namedtuple("DateTimeRange", ["start_date", "end_date"]) @@ -109,6 +110,7 @@ def get_cmr_token(endpoint, settings): return cmr, token, username, password, edl async def async_query_cmr(args, token, cmr, settings, timerange, now: datetime, silent=False) -> list: + logger = get_logger() request_url = f"https://{cmr}/search/granules.umm_json" bounding_box = args.bbox diff --git a/data_subscriber/cslc/cslc_blackout.py b/data_subscriber/cslc/cslc_blackout.py index b326bb1b3..3d503ffd0 100644 --- a/data_subscriber/cslc/cslc_blackout.py +++ b/data_subscriber/cslc/cslc_blackout.py @@ -1,25 +1,30 @@ + +import asyncio import json -from copy import deepcopy from collections import defaultdict -import dateutil -import logging +from copy import deepcopy from functools import cache -import asyncio +import dateutil + +from commons.logger import get_logger +from data_subscriber.cmr import async_query_cmr, CMR_TIME_FORMAT +from data_subscriber.cslc_utils import localize_anc_json, sensing_time_day_index, parse_cslc_native_id, \ + parse_cslc_file_name, download_batch_id_forward_reproc from data_subscriber.url import cslc_unique_id -from data_subscriber.cmr import async_query_cmr, CMR_TIME_FORMAT, DateTimeRange -from data_subscriber.cslc_utils import localize_anc_json, sensing_time_day_index, parse_cslc_native_id, parse_cslc_file_name, download_batch_id_forward_reproc DEFAULT_DISP_BLACKOUT_DATE_NAME = 'opera-disp-s1-blackout-dates.json' -logger = logging.getLogger(__name__) @cache def localize_disp_blackout_dates(): + logger = 
get_logger() + try: file = localize_anc_json("DISP_S1_BLACKOUT_DATES_S3PATH") except: - logger.warning(f"Could not download DISP-S1 blackout dates file from settings.yaml field DISP_S1_BLACKOUT_DATES_S3PATH from S3. Attempting to use local copy named {DEFAULT_DISP_BLACKOUT_DATE_NAME}.") + logger.warning(f"Could not download DISP-S1 blackout dates file from settings.yaml field DISP_S1_BLACKOUT_DATES_S3PATH from S3. " + f"Attempting to use local copy named {DEFAULT_DISP_BLACKOUT_DATE_NAME}.") file = DEFAULT_DISP_BLACKOUT_DATE_NAME return process_disp_blackout_dates(file) @@ -110,7 +115,7 @@ def extend_additional_records(self, granules, proc_mode, no_duplicate=False, for def _filter_cslc_blackout_polarization(granules, proc_mode, blackout_dates_obj, no_duplicate, force_frame_id, vv_only = True): '''Filter for CSLC granules and filter for blackout dates and polarization''' - + logger = get_logger() filtered_granules = [] # Get rid of any bursts that aren't in the disp-s1 consistent database. Need to do this before the extending records diff --git a/data_subscriber/cslc/cslc_dependency.py b/data_subscriber/cslc/cslc_dependency.py index 1ce1d2e37..de245fa47 100644 --- a/data_subscriber/cslc/cslc_dependency.py +++ b/data_subscriber/cslc/cslc_dependency.py @@ -1,17 +1,19 @@ -from copy import deepcopy from collections import defaultdict +from copy import deepcopy from datetime import datetime, timedelta + import dateutil -import logging +from commons.logger import get_logger from data_subscriber.cmr import CMR_TIME_FORMAT, DateTimeRange -from data_subscriber.cslc_utils import parse_cslc_file_name, determine_acquisition_cycle_cslc, build_cslc_native_ids, build_ccslc_m_index, _C_CSLC_ES_INDEX_PATTERNS from data_subscriber.cslc.cslc_blackout import query_cmr_cslc_blackout_polarization +from data_subscriber.cslc_utils import parse_cslc_file_name, determine_acquisition_cycle_cslc, build_cslc_native_ids, \ + build_ccslc_m_index, _C_CSLC_ES_INDEX_PATTERNS -logger = logging.getLogger(__name__) class CSLCDependency: def __init__(self, k: int, m: int, frame_to_bursts, args, token, cmr, settings, blackout_dates_obj, VV_only = True): + self.logger = get_logger() self.k = k self.m = m self.frame_to_bursts = frame_to_bursts @@ -44,7 +46,7 @@ def get_prev_day_indices(self, day_index: int, frame_number: int): query_timerange = DateTimeRange(start_date.strftime(CMR_TIME_FORMAT), end_date.strftime(CMR_TIME_FORMAT)) acq_index_to_bursts, _ = self.get_k_granules_from_cmr(query_timerange, frame_number, silent = True) all_prev_indices = frame.sensing_datetime_days_index + sorted(list(acq_index_to_bursts.keys())) - logger.debug(f"All previous day indices: {all_prev_indices}") + self.logger.debug(f"All previous day indices: {all_prev_indices}") return all_prev_indices def get_k_granules_from_cmr(self, query_timerange, frame_number: int, silent = False): '''Return two dictionaries that satisfy the burst pattern for the frame_number within the time range: @@ -91,9 +93,9 @@ def k_granules_grouping(self, frame_number, granules: list): # Get rid of the day indices that don't match the burst pattern for g_day_index in list(acq_index_to_bursts.keys()): if not acq_index_to_bursts[g_day_index].issuperset(frame.burst_ids): - logger.info( + self.logger.info( f"Removing day index {g_day_index} from k-cycle determination because it doesn't suffice the burst pattern") - logger.info(f"{acq_index_to_bursts[g_day_index]}") + self.logger.info(f"{acq_index_to_bursts[g_day_index]}") del acq_index_to_bursts[g_day_index] del 
acq_index_to_granules[g_day_index] @@ -132,8 +134,8 @@ def determine_k_cycle(self, acquisition_dts: datetime, day_index: int, frame_num acq_index_to_bursts, _ = self.get_k_granules_from_cmr(query_timerange, frame_number, silent) # The k-index is then the complete index number (historical + post historical) mod k - logger.info(f"{len(acq_index_to_bursts.keys())} day indices since historical that match the burst pattern: {acq_index_to_bursts.keys()}") - logger.info(f"{len(frame.sensing_datetime_days_index)} day indices already in historical database.") + self.logger.info(f"{len(acq_index_to_bursts.keys())} day indices since historical that match the burst pattern: {acq_index_to_bursts.keys()}") + self.logger.info(f"{len(frame.sensing_datetime_days_index)} day indices already in historical database.") index_number = len(frame.sensing_datetime_days_index) + len(acq_index_to_bursts.keys()) + 1 return index_number % self.k @@ -170,13 +172,14 @@ def get_dependent_compressed_cslcs(self, frame_id, day_index, eu): ]}}}) if len(ccslc) == 0: - logger.info("Compressed CSLCs for ccslc_m_index: %s was not found in GRQ ES", ccslc_m_index) + self.logger.info("Compressed CSLCs for ccslc_m_index: %s was not found in GRQ ES", ccslc_m_index) return False ccslcs.append(ccslc[0]) # There should only be one - logger.info("All Compresseed CSLSs for frame %s at day index %s found in GRQ ES", frame_id, day_index) - logger.debug(ccslcs) + self.logger.info("All Compresseed CSLSs for frame %s at day index %s found in GRQ ES", frame_id, day_index) + self.logger.debug(ccslcs) + return ccslcs def get_dependent_ccslc_index(prev_day_indices, mm, k, burst_id): '''last_m_index: The index of the last M compressed CSLC, index into prev_day_indices diff --git a/data_subscriber/cslc/cslc_query.py b/data_subscriber/cslc/cslc_query.py index 495436994..37d93bd75 100644 --- a/data_subscriber/cslc/cslc_query.py +++ b/data_subscriber/cslc/cslc_query.py @@ -18,7 +18,6 @@ K_MULT_FACTOR = 2 # TODO: This should be a setting in probably settings.yaml. 
EARLIEST_POSSIBLE_CSLC_DATE = "2016-01-01T00:00:00Z" -logger = logging.getLogger(__name__) class CslcCmrQuery(CmrQuery): @@ -124,13 +123,13 @@ def determine_download_granules(self, granules): if self.args.k > 1: k_granules = self.retrieve_k_granules(ready_granules, self.args, self.args.k - 1, True, silent=True) self.catalog_granules(k_granules, datetime.now(), self.k_es_conn) - logger.info(f"Length of K-granules: {len(k_granules)=}") + self.logger.info(f"Length of K-granules: {len(k_granules)=}") for k_g in k_granules: self.download_batch_ids[k_g["download_batch_id"]].add(batch_id) self.k_batch_ids[batch_id].add(k_g["download_batch_id"]) self.k_retrieved_granules.extend(k_granules) # This is used for scenario testing else: - logger.info(f"Skipping download for {batch_id} because only {len(download_batch)} of {max_bursts} granules are present") + self.logger.info(f"Skipping download for {batch_id} because only {len(download_batch)} of {max_bursts} granules are present") return reproc_granules @@ -145,8 +144,8 @@ def determine_download_granules(self, granules): self.refresh_index() unsubmitted = self.es_conn.get_unsubmitted_granules() - logger.info(f"{len(granules)=}") - logger.info(f"{len(unsubmitted)=}") + self.logger.info(f"{len(granules)=}") + self.logger.info(f"{len(unsubmitted)=}") # Group all granules by download_batch_id # If the same download_batch_id is in both granules and unsubmitted, we will use the one in granules because it's newer @@ -155,9 +154,9 @@ def determine_download_granules(self, granules): for granule in granules: by_download_batch_id[granule["download_batch_id"]][granule["unique_id"]] = granule - logger.info("Received the following cslc granules from CMR") + self.logger.info("Received the following cslc granules from CMR") for batch_id, download_batch in by_download_batch_id.items(): - logger.info(f"{batch_id=} {len(download_batch)=}") + self.logger.info(f"{batch_id=} {len(download_batch)=}") # THIS RULE ALSO NO LONGER APPLIES. 
Without Rule 2 there is no Rule 3 # Rule 3: If granules have been downloaded already but with less than 100% and we have new granules for that batch, download all granules for that batch @@ -176,15 +175,15 @@ def determine_download_granules(self, granules): self.download_batch_ids[batch_id].add(batch_id)''' for granule in unsubmitted: - logger.info(f"Merging in unsubmitted granule {granule['unique_id']}: {granule['granule_id']} for triggering consideration") + self.logger.info(f"Merging in unsubmitted granule {granule['unique_id']}: {granule['granule_id']} for triggering consideration") download_batch = by_download_batch_id[granule["download_batch_id"]] if granule["unique_id"] not in download_batch: download_batch[granule["unique_id"]] = granule # Print them all here so that it's nicely all in one place - logger.info("After merging unsubmitted granules with the new ones returned from CMR") + self.logger.info("After merging unsubmitted granules with the new ones returned from CMR") for batch_id, download_batch in by_download_batch_id.items(): - logger.info(f"{batch_id=} {len(download_batch)=}") + self.logger.info(f"{batch_id=} {len(download_batch)=}") # Combine unsubmitted and new granules and determine which granules meet the criteria for download # Rule 1: If all granules for a given download_batch_id are present, download all granules for that batch @@ -195,11 +194,11 @@ def determine_download_granules(self, granules): new_downloads = False if len(download_batch) == max_bursts: # Rule 1 - logger.info(f"Download all granules for {batch_id} because all {max_bursts} granules are present") + self.logger.info(f"Download all granules for {batch_id} because all {max_bursts} granules are present") self.download_batch_ids[batch_id].add(batch_id) # This batch needs to be submitted as part of the download job for sure new_downloads = True else: - logger.info(f"Skipping download for {batch_id} because only {len(download_batch)} of {max_bursts} granules are present") + self.logger.info(f"Skipping download for {batch_id} because only {len(download_batch)} of {max_bursts} granules are present") '''As per email from Heresh at ADT on 7-25-2024, we will not use rule 2. We will always only process full-frames Keeping this code around in case we change our mind on that. 
else: @@ -228,11 +227,11 @@ def determine_download_granules(self, granules): # Retrieve K- granules and M- compressed CSLCs for this batch if self.args.k > 1: - logger.info("Retrieving K frames worth of data from CMR") + self.logger.info("Retrieving K frames worth of data from CMR") k_granules = self.retrieve_k_granules(list(download_batch.values()), self.args, self.args.k-1) self.catalog_granules(k_granules, current_time, self.k_es_conn) self.k_retrieved_granules.extend(k_granules) # This is used for scenario testing - logger.info(f"Length of K-granules: {len(k_granules)=}") + self.logger.info(f"Length of K-granules: {len(k_granules)=}") #print(f"{granules=}") # All the k batches need to be submitted as part of the download job for this batch @@ -242,11 +241,11 @@ def determine_download_granules(self, granules): self.k_batch_ids[batch_id].add(k_g["download_batch_id"]) if (len(download_batch) > max_bursts): - logger.error(f"{len(download_batch)=} {max_bursts=}") - logger.error(f"{download_batch=}") + self.logger.error(f"{len(download_batch)=} {max_bursts=}") + self.logger.error(f"{download_batch=}") raise AssertionError("Something seriously went wrong matching up CSLC input granules!") - logger.info(f"{len(download_granules)=}") + self.logger.info(f"{len(download_granules)=}") return download_granules @@ -286,7 +285,7 @@ def retrieve_k_granules(self, downloads, args, k_minus_one, VV_only = True, sile if end_date_object < datetime.strptime(EARLIEST_POSSIBLE_CSLC_DATE, CMR_TIME_FORMAT): raise AssertionError(f"We are searching earlier than {EARLIEST_POSSIBLE_CSLC_DATE}. There is no more data here. {end_date_object=}") - logger.info(f"Retrieving K-1 granules {start_date=} {end_date=} for {frame_id=}") + self.logger.info(f"Retrieving K-1 granules {start_date=} {end_date=} for {frame_id=}") # Step 1 of 2: This will return dict of acquisition_cycle -> set of granules for only onse that match the burst pattern cslc_dependency = CSLCDependency( @@ -304,7 +303,7 @@ def retrieve_k_granules(self, downloads, args, k_minus_one, VV_only = True, sile granules = granules_map[acq_day_index] k_granules.extend(granules) k_satified += 1 - logger.info(f" {frame_id=} {acq_day_index=} satsifies. {k_satified=} {k_minus_one=}") + self.logger.info(f" {frame_id=} {acq_day_index=} satsifies. {k_satified=} {k_minus_one=}") if k_satified == k_minus_one: break @@ -322,7 +321,7 @@ def query_cmr_by_native_id (self, args, token, cmr, settings, now: datetime, nat burst_id, acquisition_time, _, frame_ids = parse_cslc_native_id(native_id, self.burst_to_frames, self.disp_burst_map_hist) if len(frame_ids) == 0: - logger.warning(f"{native_id=} is not found in the DISP-S1 Burst ID Database JSON. Nothing to process") + self.logger.warning(f"{native_id=} is not found in the DISP-S1 Burst ID Database JSON. Nothing to process") return [] # TODO: Also check if we should be processing this native_id for this acquisition cycle. 
@@ -332,14 +331,14 @@ def query_cmr_by_native_id (self, args, token, cmr, settings, now: datetime, nat start_date = (acquisition_time - timedelta(minutes=15)).strftime(CMR_TIME_FORMAT) end_date = (acquisition_time + timedelta(minutes=15)).strftime(CMR_TIME_FORMAT) timerange = DateTimeRange(start_date, end_date) - logger.info(f"Querying CMR for all CSLC files that belong to the frame {frame_id}, derived from the native_id {native_id}") + self.logger.info(f"Querying CMR for all CSLC files that belong to the frame {frame_id}, derived from the native_id {native_id}") return self.query_cmr_by_frame_and_dates(frame_id, local_args, token, cmr, settings, now, timerange, False) def query_cmr_by_frame_and_acq_cycle(self, frame_id: int, acq_cycle: int, args, token, cmr, settings, now: datetime, silent=False): '''Query CMR for specific date range for a specific frame_id and acquisition cycle. Need to always use temporal queries''' - logger.info(f"Querying CMR for all CSLC files that belong to the frame {frame_id} and acquisition cycle {acq_cycle}") + self.logger.info(f"Querying CMR for all CSLC files that belong to the frame {frame_id} and acquisition cycle {acq_cycle}") new_args = copy.deepcopy(args) new_args.use_temporal = True @@ -411,7 +410,7 @@ def query_cmr(self, timerange: DateTimeRange, now: datetime): _, _, acquisition_cycles, _ = parse_cslc_native_id(granule["granule_id"], self.burst_to_frames, self.disp_burst_map_hist) for frame_id, acq_cycle in acquisition_cycles.items(): unique_frames_dates.add(f"{frame_id}-{acq_cycle}") - logger.info(f"Added the follwing frame_id-acq_cycle pairs reprocessing mode: {list(unique_frames_dates)}") + self.logger.info(f"Added the follwing frame_id-acq_cycle pairs reprocessing mode: {list(unique_frames_dates)}") all_granules = [] # We could perform two queries so create a unique set of granules. @@ -480,13 +479,13 @@ def get_download_chunks(self, batch_id_to_urls_map): if self.proc_mode == "historical": if (len(chunk_map[frame_id]) != self.args.k): - logger.error([chunk for chunk, data in chunk_map[frame_id]]) + self.logger.error([chunk for chunk, data in chunk_map[frame_id]]) err_str = f"Number of download batches {len(chunk_map[frame_id])} for frame {frame_id} does not equal K {self.args.k}." 
raise AssertionError(err_str) return chunk_map.values() def refresh_index(self): - logger.info("performing index refresh") + self.logger.info("performing index refresh") self.es_conn.refresh() - logger.info("performed index refresh") + self.logger.info("performed index refresh") diff --git a/data_subscriber/cslc/cslc_static_query.py b/data_subscriber/cslc/cslc_static_query.py index 6019ce69c..c607ac8a8 100644 --- a/data_subscriber/cslc/cslc_static_query.py +++ b/data_subscriber/cslc/cslc_static_query.py @@ -1,11 +1,6 @@ -#!/usr/bin/env python3 - -import logging from data_subscriber.query import CmrQuery -logger = logging.getLogger(__name__) - class CslcStaticCmrQuery(CmrQuery): def __init__(self, args, token, es_conn, cmr, job_id, settings): diff --git a/data_subscriber/cslc_utils.py b/data_subscriber/cslc_utils.py index a45ba3f33..08927379c 100644 --- a/data_subscriber/cslc_utils.py +++ b/data_subscriber/cslc_utils.py @@ -2,13 +2,14 @@ import re from collections import defaultdict from datetime import datetime +from functools import cache from urllib.parse import urlparse -import dateutil + import boto3 -import logging -from functools import cache +import dateutil import elasticsearch +from commons.logger import get_logger from util import datasets_json_util from util.conf_util import SettingsConf @@ -18,15 +19,14 @@ PENDING_TYPE_CSLC_DOWNLOAD = "cslc_download" _C_CSLC_ES_INDEX_PATTERNS = "grq_1_l2_cslc_s1_compressed*" -logger = logging.getLogger(__name__) class _HistBursts(object): def __init__(self): self.frame_number = None - self.burst_ids = set() # Burst ids as strings in a set - self.sensing_datetimes = [] # Sensing datetimes as datetime object, sorted - self.sensing_seconds_since_first = [] # Sensing time in seconds since the first sensing time - self.sensing_datetime_days_index = [] # Sensing time in days since the first sensing time, rounded to the nearest day + self.burst_ids = set() # Burst ids as strings in a set + self.sensing_datetimes = [] # Sensing datetimes as datetime object, sorted + self.sensing_seconds_since_first = [] # Sensing time in seconds since the first sensing time + self.sensing_datetime_days_index = [] # Sensing time in days since the first sensing time, rounded to the nearest day def localize_anc_json(settings_field): '''Copy down a file from S3 whose path is defined in settings.yaml by settings_field''' @@ -42,20 +42,26 @@ def localize_anc_json(settings_field): @cache def localize_disp_frame_burst_hist(): + logger = get_logger() + try: file = localize_anc_json("DISP_S1_BURST_DB_S3PATH") except: - logger.warning(f"Could not download DISP-S1 burst database json from settings.yaml field DISP_S1_BURST_DB_S3PATH from S3. Attempting to use local copy named {DEFAULT_DISP_FRAME_BURST_DB_NAME}.") + logger.warning(f"Could not download DISP-S1 burst database json from settings.yaml field DISP_S1_BURST_DB_S3PATH from S3. " + f"Attempting to use local copy named {DEFAULT_DISP_FRAME_BURST_DB_NAME}.") file = DEFAULT_DISP_FRAME_BURST_DB_NAME return process_disp_frame_burst_hist(file) @cache def localize_frame_geo_json(): + logger = get_logger() + try: file = localize_anc_json("DISP_S1_FRAME_GEO_SIMPLE") except: - logger.warning(f"Could not download DISP-S1 frame geo simple json {DEFAULT_FRAME_GEO_SIMPLE_JSON_NAME} from S3. Attempting to use local copy named {DEFAULT_FRAME_GEO_SIMPLE_JSON_NAME}.") + logger.warning(f"Could not download DISP-S1 frame geo simple json {DEFAULT_FRAME_GEO_SIMPLE_JSON_NAME} from S3. 
" + f"Attempting to use local copy named {DEFAULT_FRAME_GEO_SIMPLE_JSON_NAME}.") file = DEFAULT_FRAME_GEO_SIMPLE_JSON_NAME return process_frame_geo_json(file) @@ -115,6 +121,7 @@ def calculate_historical_progress(frame_states: dict, end_date, frame_to_bursts) @cache def process_disp_frame_burst_hist(file): '''Process the disp frame burst map json file intended and return 3 dictionaries''' + logger = get_logger() try: j = json.load(open(file))["data"] diff --git a/data_subscriber/daac_data_subscriber.py b/data_subscriber/daac_data_subscriber.py index f99f1370f..6be52bc14 100644 --- a/data_subscriber/daac_data_subscriber.py +++ b/data_subscriber/daac_data_subscriber.py @@ -3,7 +3,6 @@ import argparse import asyncio import concurrent.futures -import logging import os import re import sys @@ -13,11 +12,10 @@ from pathlib import Path from urllib.parse import urlparse -import boto3 from more_itertools import first from smart_open import open -from commons.logger import NoJobUtilsFilter, NoBaseFilter, NoLogUtilsFilter +from commons.logger import configure_library_loggers, get_logger from data_subscriber.asf_cslc_download import AsfDaacCslcDownload from data_subscriber.asf_rtc_download import AsfDaacRtcDownload from data_subscriber.asf_slc_download import AsfDaacSlcDownload @@ -49,39 +47,20 @@ from util.exec_util import exec_wrapper from util.job_util import supply_job_id, is_running_outside_verdi_worker_context -logger = logging.getLogger(__name__) -logging.basicConfig(level="INFO") @exec_wrapper def main(): run(sys.argv) -def configure_logger(verbose=False): - global logger - - log_level = "DEBUG" if verbose else "INFO" - logger.setLevel(log_level) - logger.info("Log level set to %s", log_level) - - logger_hysds_commons = logging.getLogger("hysds_commons") - logger_hysds_commons.addFilter(NoJobUtilsFilter()) - - logger_elasticsearch = logging.getLogger("elasticsearch") - logger_elasticsearch.addFilter(NoBaseFilter()) - - boto3.set_stream_logger(name='botocore.credentials', level=logging.ERROR) - - logger.addFilter(NoLogUtilsFilter()) - - def run(argv: list[str]): parser = create_parser() args = parser.parse_args(argv[1:]) validate_args(args) - configure_logger(args.verbose) + logger = get_logger(args.verbose) + configure_library_loggers() es_conn = supply_es_conn(args) @@ -165,6 +144,7 @@ def run_download(args, token, es_conn, netloc, username, password, cmr, job_id): def run_rtc_download(args, token, es_conn, netloc, username, password, cmr, job_id): + logger = get_logger() provider = args.provider # "ASF-RTC" settings = SettingsConf().cfg @@ -303,6 +283,7 @@ def multithread_gather(job_submission_tasks): def supply_es_conn(args): + logger = get_logger() provider = (COLLECTION_TO_PROVIDER_TYPE_MAP[args.collection] if hasattr(args, "collection") else args.provider) diff --git a/data_subscriber/download.py b/data_subscriber/download.py index 632b1cddd..c80d579c6 100644 --- a/data_subscriber/download.py +++ b/data_subscriber/download.py @@ -13,11 +13,12 @@ from cachetools.func import ttl_cache import extractor.extract +from commons.logger import get_logger from data_subscriber.cmr import Provider, CMR_TIME_FORMAT from data_subscriber.query import DateTimeRange from data_subscriber.url import _to_batch_id, _to_orbit_number -from util.conf_util import SettingsConf from tools.stage_orbit_file import fatal_code +from util.conf_util import SettingsConf logger = logging.getLogger(__name__) @@ -51,11 +52,12 @@ def rebuild_auth(self, prepared_request, response): class DaacDownload: def __init__(self, 
provider): + self.logger = get_logger() self.provider = provider self.daac_s3_cred_settings_key = None self.cfg = SettingsConf().cfg # has metadata extractor config - logger.info("Creating directories to process products") + self.logger.info("Creating directories to process products") # house all file downloads self.downloads_dir = Path("downloads") @@ -67,11 +69,11 @@ def run_download(self, args, token, es_conn, netloc, username, password, cmr, downloads = self.get_downloads(args, es_conn) if not downloads: - logger.info(f"No undownloaded files found in index.") + self.logger.info(f"No undownloaded files found in index.") return product_to_product_filepaths_map if args.dry_run: - logger.info(f"{args.dry_run=}. Skipping downloads.") + self.logger.info(f"{args.dry_run=}. Skipping downloads.") return product_to_product_filepaths_map session = SessionWithHeaderRedirection(username, password, netloc) @@ -81,7 +83,7 @@ def run_download(self, args, token, es_conn, netloc, username, password, cmr, ) if rm_downloads_dir: - logger.info(f"Removing directory tree. {self.downloads_dir}") + self.logger.info(f"Removing directory tree. {self.downloads_dir}") shutil.rmtree(self.downloads_dir) return product_to_product_filepaths_map @@ -92,7 +94,7 @@ def get_downloads(self, args, es_conn): # simply find entries for that one granule if args.batch_ids and len(args.batch_ids) == 1: one_granule = args.batch_ids[0] - logger.info(f"Downloading files for the granule {one_granule}") + self.logger.info(f"Downloading files for the granule {one_granule}") downloads = es_conn.get_download_granule_revision(one_granule) else: download_timerange = self.get_download_timerange(args) @@ -101,18 +103,18 @@ def get_downloads(self, args, es_conn): dateutil.parser.isoparse(download_timerange.end_date), args.use_temporal ) - logger.info(f"{len(list(all_pending_downloads))=}") + self.logger.info(f"{len(list(all_pending_downloads))=}") downloads = all_pending_downloads if args.batch_ids: - logger.info(f"Filtering pending downloads by {args.batch_ids=}") + self.logger.info(f"Filtering pending downloads by {args.batch_ids=}") id_func = (_to_batch_id if self.provider in (Provider.LPCLOUD, Provider.ASF_RTC, Provider.ASF_CSLC) else _to_orbit_number) downloads = list(filter(lambda d: id_func(d) in args.batch_ids, all_pending_downloads)) - logger.info(f"{len(downloads)=}") - logger.debug(f"{downloads=}") + self.logger.info(f"{len(downloads)=}") + self.logger.debug(f"{downloads=}") return downloads @@ -123,7 +125,7 @@ def get_download_timerange(self, args): start_date = args.start_date if args.start_date else "1900-01-01T00:00:00Z" end_date = args.end_date if args.end_date else datetime.utcnow().strftime(CMR_TIME_FORMAT) download_timerange = DateTimeRange(start_date, end_date) - logger.info(f"{download_timerange=}") + self.logger.info(f"{download_timerange=}") return download_timerange def extract_one_to_one(self, product: Path, settings_cfg: dict, working_dir: Path, extra_metadata=None, name_postscript='') -> PurePath: @@ -135,7 +137,7 @@ def extract_one_to_one(self, product: Path, settings_cfg: dict, working_dir: Pat """ # create dataset dir for product # (this also extracts the metadata to *.met.json file) - logger.info("Creating dataset directory") + self.logger.info("Creating dataset directory") dataset_dir = extractor.extract.extract( product=str(product), product_types=settings_cfg["PRODUCT_TYPES"], @@ -143,14 +145,14 @@ def extract_one_to_one(self, product: Path, settings_cfg: dict, working_dir: Pat extra_met=extra_metadata, 
name_postscript=name_postscript ) - logger.info(f"{dataset_dir=}") + self.logger.info(f"{dataset_dir=}") return PurePath(dataset_dir) def download_product_using_s3(self, url, token, target_dirpath: Path, args) -> Path: if self.cfg["USE_DAAC_S3_CREDENTIALS"] is True: aws_creds = self.get_aws_creds(token) - logger.debug(f"{self.get_aws_creds.cache_info()=}") + self.logger.debug(f"{self.get_aws_creds.cache_info()=}") s3 = boto3.Session(aws_access_key_id=aws_creds['accessKeyId'], aws_secret_access_key=aws_creds['secretAccessKey'], aws_session_token=aws_creds['sessionToken'], diff --git a/data_subscriber/gcov/gcov_query.py b/data_subscriber/gcov/gcov_query.py index 3b8d4eb54..c783fa0c7 100644 --- a/data_subscriber/gcov/gcov_query.py +++ b/data_subscriber/gcov/gcov_query.py @@ -1,10 +1,6 @@ -#!/usr/bin/env python3 - -import logging from data_subscriber.query import CmrQuery -logger = logging.getLogger(__name__) class NisarGcovCmrQuery(CmrQuery): diff --git a/data_subscriber/geojson_utils.py b/data_subscriber/geojson_utils.py index 4c01aca26..41c329b06 100644 --- a/data_subscriber/geojson_utils.py +++ b/data_subscriber/geojson_utils.py @@ -1,10 +1,12 @@ -import boto3 -import logging import os +import boto3 + +from commons.logger import get_logger from geo.geo_util import does_bbox_intersect_region from util.conf_util import SettingsConf + def localize_include_exclude(args): geojsons = [] @@ -17,6 +19,7 @@ def localize_include_exclude(args): localize_geojsons(geojsons) def localize_geojsons(geojsons): + logger = get_logger() settings = SettingsConf().cfg bucket = settings["GEOJSON_BUCKET"] @@ -27,8 +30,9 @@ def localize_geojsons(geojsons): try: download_from_s3(bucket, key, key) except Exception as e: - logging.warning("Exception while fetching geojson files: %s from S3 bucket %s. \ -Will try to fetch the same file from current working directory - this should only be done in testing scenarios. " % (key, bucket) + str(e)) + logger.warning("Exception while fetching geojson files: %s from S3 bucket %s. " + "Will try to fetch the same file from current working directory - " + "this should only be done in testing scenarios." % (key, bucket) + str(e)) # See if the file exists in the current working directory if not os.path.exists(key): @@ -45,6 +49,7 @@ def does_granule_intersect_regions(granule, intersect_regions): def filter_granules_by_regions(granules, include_regions, exclude_regions): '''Filters granules based on include and exclude regions lists''' + logger = get_logger() filtered = [] for granule in granules: @@ -53,17 +58,20 @@ def filter_granules_by_regions(granules, include_regions, exclude_regions): if include_regions is not None: (result, region) = does_granule_intersect_regions(granule, include_regions) if result is False: - logging.info( - f"The following granule does not intersect with any include regions. Skipping processing %s" - % granule.get("granule_id")) + logger.info( + "The following granule does not intersect with any include regions. " + "Skipping processing %s" % granule.get("granule_id") + ) continue # Skip this granule if it's in the exclude list if exclude_regions is not None: (result, region) = does_granule_intersect_regions(granule, exclude_regions) if result is True: - logging.info(f"The following granule intersects with the exclude region %s. Skipping processing %s" - % (region, granule.get("granule_id"))) + logger.info( + "The following granule intersects with the exclude region %s. 
" + "Skipping processing %s" % (region, granule.get("granule_id")) + ) continue # If both filters don't apply, add this granule to the list @@ -76,4 +84,4 @@ def download_from_s3(bucket, file, path): try: s3.Object(bucket, file).download_file(path) except Exception as e: - raise Exception("Exception while fetching file: %s from S3 bucket %s. " % (file, bucket) + str(e)) + raise Exception("Exception while fetching file: %s from S3 bucket %s." % (file, bucket) + str(e)) diff --git a/data_subscriber/hls/hls_query.py b/data_subscriber/hls/hls_query.py index 2e1476034..1a5a3b401 100644 --- a/data_subscriber/hls/hls_query.py +++ b/data_subscriber/hls/hls_query.py @@ -1,6 +1,4 @@ -import logging - from data_subscriber.query import CmrQuery from data_subscriber.hls.hls_catalog import HLSSpatialProductCatalog @@ -8,5 +6,5 @@ class HlsCmrQuery(CmrQuery): """Class used to query the Common Metadata Repository (CMR) for Harmonized Landsat and Sentinel-1 (HLS) products.""" def update_granule_index(self, granule): - spatial_catalog_conn = HLSSpatialProductCatalog(logging.getLogger(__name__)) + spatial_catalog_conn = HLSSpatialProductCatalog(self.logger) spatial_catalog_conn.process_granule(granule) diff --git a/data_subscriber/ionosphere_download.py b/data_subscriber/ionosphere_download.py index 8038928a1..18fc0b0b5 100644 --- a/data_subscriber/ionosphere_download.py +++ b/data_subscriber/ionosphere_download.py @@ -12,11 +12,11 @@ import boto3 import dateutil.parser import dateutil.parser -from hysds_commons.job_utils import submit_mozart_job from more_itertools import chunked, partition from mypy_boto3_s3 import S3Client -from commons.logger import NoJobUtilsFilter, NoBaseFilter +from commons.logger import configure_library_loggers +from hysds_commons.job_utils import submit_mozart_job from tools import stage_ionosphere_file from tools.stage_ionosphere_file import IonosphereFileNotFoundException from util import grq_client as grq_client, job_util @@ -29,12 +29,7 @@ @exec_wrapper def main(): - logger_hysds_commons = logging.getLogger("hysds_commons") - logger_hysds_commons.addFilter(NoJobUtilsFilter()) - - logger_elasticsearch = logging.getLogger("elasticsearch") - logger_elasticsearch.addFilter(NoBaseFilter()) - + configure_library_loggers() asyncio.run(run(sys.argv)) diff --git a/data_subscriber/lpdaac_download.py b/data_subscriber/lpdaac_download.py index f93cf5d79..e43404d52 100644 --- a/data_subscriber/lpdaac_download.py +++ b/data_subscriber/lpdaac_download.py @@ -1,19 +1,18 @@ import itertools -import logging import json -import extractor.extract -from pathlib import PurePath, Path -from collections import defaultdict +import logging import shutil -import requests -import requests.utils +from collections import defaultdict from datetime import datetime -from product2dataset import product2dataset -from data_subscriber.url import _to_batch_id, _to_orbit_number, _has_url, _to_urls, _to_https_urls, form_batch_id +from pathlib import PurePath, Path + +import requests.utils +import extractor.extract from data_subscriber.download import DaacDownload +from data_subscriber.url import _to_urls, _to_https_urls, form_batch_id +from product2dataset import product2dataset -logger = logging.getLogger(__name__) class HLSDownload: def __init__(self): @@ -56,7 +55,7 @@ def perform_download(self, # One HLSDownload object contains multiple es_id and url pairs for key, downloads in download_map.items(): - logger.info(f"Processing {key=}") + self.logger.info(f"Processing {key=}") granule_download_dir = 
self.downloads_dir / key granule_download_dir.mkdir(exist_ok=True) @@ -67,7 +66,7 @@ def perform_download(self, for download in downloads.es_ids_urls: product_url = download[1] if args.dry_run: - logger.info(f"{args.dry_run=}. Skipping download.") + self.logger.info(f"{args.dry_run=}. Skipping download.") break product_filepath = self.download_product(product_url, session, token, args, granule_download_dir) products.append(product_filepath) @@ -76,14 +75,14 @@ def perform_download(self, # Mark as downloaded es_id = download[0] es_conn.mark_product_as_downloaded(es_id, job_id) - logger.info(f"{products=}") + self.logger.info(f"{products=}") - logger.info(f"{len(product_urls_downloaded)=}, {product_urls_downloaded=}") + self.logger.info(f"{len(product_urls_downloaded)=}, {product_urls_downloaded=}") self.extract_many_to_one(products, key, self.cfg) - logger.info(f"Removing directory {granule_download_dir}") + self.logger.info(f"Removing directory {granule_download_dir}") shutil.rmtree(granule_download_dir) @@ -139,7 +138,7 @@ def extract_many_to_one(self, products: list[Path], group_dataset_id, settings_c self.extract_one_to_one(product, settings_cfg, working_dir=product_extracts_dir) for product in products ] - logger.info(f"{dataset_dirs=}") + self.logger.info(f"{dataset_dirs=}") # generate merge metadata from single-product datasets shared_met_entries_dict = {} # this is updated, when merging, with metadata common to multiple input files @@ -148,32 +147,32 @@ def extract_many_to_one(self, products: list[Path], group_dataset_id, settings_c str(product_extracts_dir.resolve()), extra_met=shared_met_entries_dict # copy some common metadata from each product. ) - logger.debug(f"{merged_met_dict=}") + self.logger.debug(f"{merged_met_dict=}") - logger.info("Creating target dataset directory") + self.logger.info("Creating target dataset directory") target_dataset_dir = Path(group_dataset_id) target_dataset_dir.mkdir(exist_ok=True) for product in products: shutil.copy(product, target_dataset_dir.resolve()) - logger.info("Copied input products to dataset directory") + self.logger.info("Copied input products to dataset directory") # group_dataset_id coming in is the ES _id which contains the revision-id from CMR as # the last .# So we split that out #TODO: Make this a function in url granule_id = group_dataset_id.split('-')[0] - logger.info("update merged *.met.json with additional, top-level metadata") + self.logger.info("update merged *.met.json with additional, top-level metadata") merged_met_dict.update(shared_met_entries_dict) merged_met_dict["FileSize"] = total_product_file_sizes merged_met_dict["FileName"] = granule_id merged_met_dict["id"] = granule_id - logger.debug(f"{merged_met_dict=}") + self.logger.debug(f"{merged_met_dict=}") # write out merged *.met.json merged_met_json_filepath = target_dataset_dir.resolve() / f"{group_dataset_id}.met.json" with open(merged_met_json_filepath, mode="w") as output_file: json.dump(merged_met_dict, output_file) - logger.info(f"Wrote {merged_met_json_filepath=!s}") + self.logger.info(f"Wrote {merged_met_json_filepath=!s}") # write out basic *.dataset.json file (version + created_timestamp) dataset_json_dict = extractor.extract.create_dataset_json( @@ -193,7 +192,7 @@ def extract_many_to_one(self, products: list[Path], group_dataset_id, settings_c granule_dataset_json_filepath = target_dataset_dir.resolve() / f"{group_dataset_id}.dataset.json" with open(granule_dataset_json_filepath, mode="w") as output_file: json.dump(dataset_json_dict, output_file) - 
logger.info(f"Wrote {granule_dataset_json_filepath=!s}") + self.logger.info(f"Wrote {granule_dataset_json_filepath=!s}") shutil.rmtree(extracts_dir) diff --git a/data_subscriber/query.py b/data_subscriber/query.py index 602b1eb99..80c96137d 100644 --- a/data_subscriber/query.py +++ b/data_subscriber/query.py @@ -2,7 +2,6 @@ import argparse import asyncio import hashlib -import logging import uuid from collections import defaultdict from datetime import datetime, timedelta @@ -11,6 +10,7 @@ import dateutil.parser from more_itertools import chunked +from commons.logger import get_logger from data_subscriber.cmr import (async_query_cmr, ProductType, DateTimeRange, COLLECTION_TO_PRODUCT_TYPE_MAP, @@ -23,10 +23,10 @@ from data_subscriber.url import form_batch_id, _slc_url_to_chunk_id from hysds_commons.job_utils import submit_mozart_job -logger = logging.getLogger(__name__) class CmrQuery: def __init__(self, args, token, es_conn, cmr, job_id, settings): + self.logger = get_logger() self.args = args self.token = token self.es_conn = es_conn @@ -45,20 +45,20 @@ def run_query(self): now = datetime.utcnow() query_timerange: DateTimeRange = get_query_timerange(self.args, now) - logger.info("CMR query STARTED") + self.logger.info("CMR query STARTED") granules = self.query_cmr(query_timerange, now) - logger.info("CMR query FINISHED") + self.logger.info("CMR query FINISHED") # Get rid of duplicate granules. This happens often for CSLC and TODO: probably RTC granules = self.eliminate_duplicate_granules(granules) if self.args.smoke_run: - logger.info(f"{self.args.smoke_run=}. Restricting to 1 granule(s).") + self.logger.info(f"{self.args.smoke_run=}. Restricting to 1 granule(s).") granules = granules[:1] # If processing mode is historical, apply the include/exclude-region filtering if self.proc_mode == "historical": - logging.info(f"Processing mode is historical so applying include and exclude regions...") + self.logger.info(f"Processing mode is historical so applying include and exclude regions...") # Fetch all necessary geojson files from S3 localize_include_exclude(self.args) @@ -72,16 +72,17 @@ def run_query(self): '''TODO: Optional. For CSLC query jobs, make sure that we got all the bursts here according to database json. Otherwise, fail this job''' - logger.info("catalogue-ing STARTED") + self.logger.info("catalogue-ing STARTED") self.catalog_granules(granules, query_dt) - logger.info("catalogue-ing FINISHED") + self.logger.info("catalogue-ing FINISHED") #TODO: This function only applies to RTC, merge w CSLC at some point batch_id_to_products_map = self.refresh_index() if self.args.subparser_name == "full": - logger.info( - f"{self.args.subparser_name=}. Skipping download job submission. Download will be performed directly.") + self.logger.info( + f"{self.args.subparser_name=}. Skipping download job submission. Download will be performed directly." + ) if COLLECTION_TO_PRODUCT_TYPE_MAP[self.args.collection] == ProductType.RTC: self.args.provider = COLLECTION_TO_PROVIDER_TYPE_MAP[self.args.collection] @@ -94,11 +95,11 @@ def run_query(self): return {"download_granules": download_granules} if self.args.no_schedule_download: - logger.info(f"{self.args.no_schedule_download=}. Forcefully skipping download job submission.") + self.logger.info(f"{self.args.no_schedule_download=}. Forcefully skipping download job submission.") return {"download_granules": download_granules} if not self.args.chunk_size: - logger.info(f"{self.args.chunk_size=}. Insufficient chunk size. 
Skipping download job submission.") + self.logger.info(f"{self.args.chunk_size=}. Insufficient chunk size. Skipping download job submission.") return if COLLECTION_TO_PRODUCT_TYPE_MAP[self.args.collection] == ProductType.RTC: @@ -108,14 +109,14 @@ def run_query(self): job_submission_tasks = self.download_job_submission_handler(download_granules, query_timerange) results = job_submission_tasks - logger.info(f"{len(results)=}") - logger.debug(f"{results=}") + self.logger.info(f"{len(results)=}") + self.logger.debug(f"{results=}") succeeded = [job_id for job_id in results if isinstance(job_id, str)] failed = [e for e in results if isinstance(e, Exception)] - logger.info(f"{succeeded=}") - logger.info(f"{failed=}") + self.logger.info(f"{succeeded=}") + self.logger.info(f"{failed=}") return { "success": succeeded, @@ -220,7 +221,7 @@ def download_job_submission_handler(self, granules, query_timerange): else: batch_id_to_urls_map[url_grouping_func(granule_id, revision_id)].add(filter_url) - logger.debug(f"{batch_id_to_urls_map=}") + self.logger.debug(f"{batch_id_to_urls_map=}") job_submission_tasks = self.submit_download_job_submissions_tasks(batch_id_to_urls_map, query_timerange) @@ -232,7 +233,7 @@ def get_download_chunks(self, batch_id_to_urls_map): def submit_download_job_submissions_tasks(self, batch_id_to_urls_map, query_timerange): job_submission_tasks = [] - logger.info(f"{self.args.chunk_size=}") + self.logger.info(f"{self.args.chunk_size=}") if COLLECTION_TO_PRODUCT_TYPE_MAP[self.args.collection] == ProductType.CSLC: # Note that self.disp_burst_map_hist and self.blackout_dates_obj are created in the child class @@ -257,9 +258,9 @@ def submit_download_job_submissions_tasks(self, batch_id_to_urls_map, query_time payload_hash = hashlib.md5(granule_to_hash.encode()).hexdigest() - logger.info(f"{chunk_batch_ids=}") - logger.info(f"{payload_hash=}") - logger.debug(f"{chunk_urls=}") + self.logger.info(f"{chunk_batch_ids=}") + self.logger.info(f"{payload_hash=}") + self.logger.debug(f"{chunk_urls=}") params = self.create_download_job_params(query_timerange, chunk_batch_ids) @@ -273,7 +274,7 @@ def submit_download_job_submissions_tasks(self, batch_id_to_urls_map, query_time # and wait for the next query to come in. 
Any acquisition index will work because all batches # require the same compressed cslcs if not cslc_dependency.compressed_cslc_satisfied(frame_id, acq_indices[0], self.es_conn.es_util): - logger.info(f"Not all compressed CSLCs are satisfied so this download job is blocked until they are satisfied") + self.logger.info(f"Not all compressed CSLCs are satisfied so this download job is blocked until they are satisfied") save_blocked_download_job(self.es_conn.es_util, self.settings["RELEASE_VERSION"], product_type, params, self.args.job_queue, job_name, frame_id, acq_indices[0], self.args.k, self.args.m, chunk_batch_ids) @@ -358,7 +359,7 @@ def create_download_job_params(self, query_timerange, chunk_batch_ids): "from": "value" } ] - logger.info(f"{download_job_params=}") + self.logger.info(f"{download_job_params=}") return download_job_params @@ -425,6 +426,8 @@ def update_url_index( es_conn.process_url(filename_urls, granule, job_id, query_dt, temporal_extent_beginning_dt, revision_date_dt, *args, **kwargs) def get_query_timerange(args: argparse.Namespace, now: datetime): + logger = get_logger() + now_minus_minutes_dt = ( now - timedelta(minutes=args.minutes) if not args.native_id diff --git a/data_subscriber/rtc/mgrs_bursts_collection_db_client.py b/data_subscriber/rtc/mgrs_bursts_collection_db_client.py index 7d75c31ef..270cc4690 100644 --- a/data_subscriber/rtc/mgrs_bursts_collection_db_client.py +++ b/data_subscriber/rtc/mgrs_bursts_collection_db_client.py @@ -1,5 +1,5 @@ + import ast -import logging import os import re from collections import defaultdict @@ -12,10 +12,9 @@ from mypy_boto3_s3 import S3Client from pyproj import Transformer +from commons.logger import get_logger from util.conf_util import SettingsConf -logger = logging.getLogger(__name__) - def tree(): """ @@ -34,12 +33,14 @@ def dicts(t): @cache def cached_load_mgrs_burst_db(filter_land=True): """see :func:`~data_subscriber.rtc.mgrs_bursts_collection_db_client.load_mgrs_burst_db`""" + logger = get_logger() logger.info(f"Cache loading MGRS burst database. {filter_land=}") return load_mgrs_burst_db(filter_land) def load_mgrs_burst_db(filter_land=True): """see :func:`~data_subscriber.rtc.mgrs_bursts_collection_db_client.load_mgrs_burst_db_raw`""" + logger = get_logger() logger.info(f"Loading MGRS burst database. {filter_land=}") vector_gdf = load_mgrs_burst_db_raw(filter_land) @@ -55,6 +56,7 @@ def load_mgrs_burst_db(filter_land=True): def load_mgrs_burst_db_raw(filter_land=True) -> GeoDataFrame: """Loads the MGRS Tile Collection Database. 
On AWS environments, this will localize from a known S3 location.""" + logger = get_logger() mtc_local_filepath = Path(os.environ.get("MGRS_TILE_COLLECTION_DB_FILEPATH", "~/Downloads/MGRS_tile_collection_v0.3.sqlite")).expanduser() if mtc_local_filepath.exists(): diff --git a/data_subscriber/rtc/rtc_download_job_submitter.py b/data_subscriber/rtc/rtc_download_job_submitter.py index 62715ad04..aed3a1c26 100644 --- a/data_subscriber/rtc/rtc_download_job_submitter.py +++ b/data_subscriber/rtc/rtc_download_job_submitter.py @@ -1,19 +1,18 @@ import asyncio import itertools import json -import logging from functools import partial from typing import Optional +from commons.logger import get_logger from util.job_submitter import try_submit_mozart_job -logger = logging.getLogger(__name__) - flatten = itertools.chain.from_iterable async def example(batch_id_to_urls_map, args): """Sample usage function.""" + logger = get_logger() job_submission_tasks = submit_rtc_download_job_submissions_tasks(batch_id_to_urls_map, args) results = await asyncio.gather(*job_submission_tasks, return_exceptions=True) logger.info(f"{len(results)=}") diff --git a/data_subscriber/rtc/rtc_job_submitter.py b/data_subscriber/rtc/rtc_job_submitter.py index 4b125c6fd..6dcb279c5 100644 --- a/data_subscriber/rtc/rtc_job_submitter.py +++ b/data_subscriber/rtc/rtc_job_submitter.py @@ -1,19 +1,19 @@ + import asyncio -from datetime import datetime, timezone -import logging import os +from datetime import datetime, timezone from functools import partial from pathlib import PurePath from typing import Optional +from commons.logger import get_logger from data_subscriber.rtc import mgrs_bursts_collection_db_client as mbc_client from util.job_submitter import try_submit_mozart_job -logger = logging.getLogger(__name__) - async def example(batch_id_to_urls_map, args): """Sample usage function.""" + logger = get_logger() job_submission_tasks = submit_dswx_s1_job_submissions_tasks(batch_id_to_urls_map, args) results = await asyncio.gather(*job_submission_tasks, return_exceptions=True) logger.info(f"{len(results)=}") diff --git a/data_subscriber/rtc/rtc_query.py b/data_subscriber/rtc/rtc_query.py index 54ee9a38b..a50eb6251 100644 --- a/data_subscriber/rtc/rtc_query.py +++ b/data_subscriber/rtc/rtc_query.py @@ -1,7 +1,6 @@ import asyncio import itertools -import logging import re from collections import namedtuple, defaultdict from datetime import datetime, timedelta @@ -14,15 +13,13 @@ from data_subscriber.cmr import async_query_cmr, COLLECTION_TO_PROVIDER_TYPE_MAP from data_subscriber.geojson_utils import localize_include_exclude, filter_granules_by_regions -from data_subscriber.query import CmrQuery +from data_subscriber.query import CmrQuery, get_query_timerange from data_subscriber.rtc import mgrs_bursts_collection_db_client as mbc_client, evaluator from data_subscriber.rtc.rtc_download_job_submitter import submit_rtc_download_job_submissions_tasks from data_subscriber.url import determine_acquisition_cycle from geo.geo_util import does_bbox_intersect_region from rtc_utils import rtc_granule_regex -logger = logging.getLogger(__name__) - DateTimeRange = namedtuple("DateTimeRange", ["start_date", "end_date"]) # required constants @@ -44,23 +41,23 @@ async def run_query(self): now = datetime.utcnow() query_timerange: DateTimeRange = get_query_timerange(self.args, now) - logger.info("CMR query STARTED") + self.logger.info("CMR query STARTED") granules = await async_query_cmr(self.args, self.token, self.cmr, self.settings, query_timerange, now) 
- logger.info("CMR query FINISHED") + self.logger.info("CMR query FINISHED") - # If processing mode is historical, apply include/exclude-region filtering + # If processing mode is historical, apply the include/exclude-region filtering if self.args.proc_mode == "historical": - logger.info(f"Processing mode is historical so applying include and exclude regions...") + self.logger.info(f"Processing mode is historical so applying include and exclude regions...") # Fetch all necessary geojson files from S3 localize_include_exclude(self.args) granules[:] = filter_granules_by_regions(granules, self.args.include_regions, self.args.exclude_regions) - logger.info("catalogue-ing STARTED") + self.logger.info("catalogue-ing STARTED") affected_mgrs_set_id_acquisition_ts_cycle_indexes = set() - granules[:] = filter_granules_rtc(granules, self.args) - logger.info(f"Filtered to {len(granules)} granules") + granules[:] = self.filter_granules_rtc(granules) + self.logger.info(f"Filtered to {len(granules)} granules") mgrs = mbc_client.cached_load_mgrs_burst_db(filter_land=True) if self.args.native_id: @@ -71,7 +68,7 @@ async def run_query(self): num_granules = len(granules) for i, granule in enumerate(granules): - logger.debug(f"Processing granule {i+1} of {num_granules}") + self.logger.debug(f"Processing granule {i+1} of {num_granules}") granule_id = granule.get("granule_id") revision_id = granule.get("revision_id") @@ -109,18 +106,18 @@ async def run_query(self): **additional_fields ) - logger.info("catalogue-ing FINISHED") + self.logger.info("catalogue-ing FINISHED") succeeded = [] failed = [] - logger.info("performing index refresh") + self.logger.info("performing index refresh") self.es_conn.refresh() - logger.info("performed index refresh") + self.logger.info("performed index refresh") - logger.info("evaluating available burst sets") - logger.info(f"{affected_mgrs_set_id_acquisition_ts_cycle_indexes=}") + self.logger.info("evaluating available burst sets") + self.logger.info(f"{affected_mgrs_set_id_acquisition_ts_cycle_indexes=}") if self.args.native_id: # limit query to the 1 or 2 affected sets in backlog - logger.info("Supplied native-id. Limiting evaluation") + self.logger.info("Supplied native-id. Limiting evaluation") min_num_bursts = self.args.coverage_target_num if not min_num_bursts: min_num_bursts = self.settings["DSWX_S1_MINIMUM_NUMBER_OF_BURSTS_REQUIRED"] @@ -137,7 +134,7 @@ async def run_query(self): min_num_bursts=min_num_bursts ) else: # evaluate ALL sets in backlog - logger.info("Performing full evaluation") + self.logger.info("Performing full evaluation") min_num_bursts = self.args.coverage_target_num if not min_num_bursts: min_num_bursts = self.settings["DSWX_S1_MINIMUM_NUMBER_OF_BURSTS_REQUIRED"] @@ -173,24 +170,24 @@ async def run_query(self): if product_doc["mgrs_set_id_acquisition_ts_cycle_index"] == batch_id: batch_id_to_products_map[batch_id][product_doc["id"]].append(product_doc) if self.args.smoke_run: - logger.info(f"{self.args.smoke_run=}. Filtering to single batch") + self.logger.info(f"{self.args.smoke_run=}. Filtering to single batch") batch_id_to_products_map = dict(sorted(batch_id_to_products_map.items())[:1]) - logger.info(f"num_batches={len(batch_id_to_products_map)}") + self.logger.info(f"num_batches={len(batch_id_to_products_map)}") if self.args.subparser_name == "full": - logger.info(f"{self.args.subparser_name=}. Skipping download job submission. Download will be performed directly.") + self.logger.info(f"{self.args.subparser_name=}. Skipping download job submission. 
Download will be performed directly.") self.args.provider = COLLECTION_TO_PROVIDER_TYPE_MAP[self.args.collection] self.args.batch_ids = affected_mgrs_set_id_acquisition_ts_cycle_indexes return if self.args.no_schedule_download: - logger.info(f"{self.args.no_schedule_download=}. Forcefully skipping download job submission.") + self.logger.info(f"{self.args.no_schedule_download=}. Forcefully skipping download job submission.") return if not self.args.chunk_size: - logger.info(f"{self.args.chunk_size=}. Insufficient chunk size. Skipping download job submission.") + self.logger.info(f"{self.args.chunk_size=}. Insufficient chunk size. Skipping download job submission.") return results = [] - logger.info(f"Submitting batches for RTC download job: {list(batch_id_to_products_map)}") + self.logger.info(f"Submitting batches for RTC download job: {list(batch_id_to_products_map)}") for batch_id, products_map in batch_id_to_products_map.items(): job_submission_tasks = submit_rtc_download_job_submissions_tasks({batch_id: products_map}, self.args, self.settings) results_batch = await asyncio.gather(*job_submission_tasks, return_exceptions=True) @@ -205,7 +202,7 @@ async def run_query(self): product["download_job_ids"] = first(suceeded_batch) if self.args.dry_run: - logger.info(f"{self.args.dry_run=}. Skipping marking jobs as downloaded. Producing mock job ID") + self.logger.info(f"{self.args.dry_run=}. Skipping marking jobs as downloaded. Producing mock job ID") pass else: self.es_conn.mark_products_as_download_job_submitted({batch_id: batch_id_to_products_map[batch_id]}) @@ -213,19 +210,40 @@ async def run_query(self): succeeded.extend(suceeded_batch) failed.extend(failed_batch) - logger.info(f"{len(results)=}") - logger.info(f"{results=}") + self.logger.info(f"{len(results)=}") + self.logger.info(f"{results=}") - logger.info(f"{len(succeeded)=}") - logger.info(f"{succeeded=}") - logger.info(f"{len(failed)=}") - logger.info(f"{failed=}") + self.logger.info(f"{len(succeeded)=}") + self.logger.info(f"{succeeded=}") + self.logger.info(f"{len(failed)=}") + self.logger.info(f"{failed=}") return { "success": succeeded, "fail": failed } + def filter_granules_rtc(self, granules): + self.logger.info("Applying land/water filter on CMR granules") + + filtered_granules = [] + for granule in granules: + granule_id = granule.get("granule_id") + + match_product_id = re.match(rtc_granule_regex, granule_id) + burst_id = match_product_id.group("burst_id") + + mgrs = mbc_client.cached_load_mgrs_burst_db(filter_land=True) + mgrs_sets = mbc_client.burst_id_to_mgrs_set_ids(mgrs, + mbc_client.product_burst_id_to_mapping_burst_id(burst_id)) + if not mgrs_sets: + self.logger.debug(f"{burst_id=} not associated with land or land/water data. 
skipping.") + continue + + filtered_granules.append(granule) + + return filtered_granules + def update_affected_mgrs_set_ids(acquisition_cycle, affected_mgrs_set_id_acquisition_ts_cycle_indexes, mgrs_burst_set_ids): # construct filters for evaluation # ati = Acquisition Time Index @@ -253,18 +271,6 @@ def update_additional_fields_mgrs_set_id_acquisition_ts_cycle_indexes(acquisitio return mgrs_set_id_acquisition_ts_cycle_indexes -def get_query_timerange(args, now: datetime, silent=False): - now_minus_minutes_dt = (now - timedelta(minutes=args.minutes)) if not args.native_id else dateutil.parser.isoparse("1900-01-01T00:00:00Z") - - start_date = args.start_date if args.start_date else now_minus_minutes_dt.strftime("%Y-%m-%dT%H:%M:%SZ") - end_date = args.end_date if args.end_date else now.strftime("%Y-%m-%dT%H:%M:%SZ") - - query_timerange = DateTimeRange(start_date, end_date) - if not silent: - logger.info(f"{query_timerange=}") - return query_timerange - - def update_url_index( es_conn, urls: list[str], @@ -298,23 +304,3 @@ def does_granule_intersect_regions(granule, intersect_regions): return True, region return False, None - - -def filter_granules_rtc(granules, args): - logger.info("Applying land/water filter on CMR granules") - - filtered_granules = [] - for granule in granules: - granule_id = granule.get("granule_id") - - match_product_id = re.match(rtc_granule_regex, granule_id) - burst_id = match_product_id.group("burst_id") - - mgrs = mbc_client.cached_load_mgrs_burst_db(filter_land=True) - mgrs_sets = mbc_client.burst_id_to_mgrs_set_ids(mgrs, mbc_client.product_burst_id_to_mapping_burst_id(burst_id)) - if not mgrs_sets: - logger.debug(f"{burst_id=} not associated with land or land/water data. skipping.") - continue - - filtered_granules.append(granule) - return filtered_granules diff --git a/data_subscriber/slc/slc_query.py b/data_subscriber/slc/slc_query.py index 366222362..c60606042 100644 --- a/data_subscriber/slc/slc_query.py +++ b/data_subscriber/slc/slc_query.py @@ -1,12 +1,9 @@ -import logging - from data_subscriber.geojson_utils import localize_geojsons from data_subscriber.query import CmrQuery from data_subscriber.slc.slc_catalog import SLCSpatialProductCatalog from geo.geo_util import _NORTH_AMERICA, does_bbox_intersect_north_america -logger = logging.getLogger(__name__) class SlcCmrQuery(CmrQuery): @@ -17,7 +14,7 @@ def __init__(self, args, token, es_conn, cmr, job_id, settings): localize_geojsons([_NORTH_AMERICA]) def update_granule_index(self, granule): - spatial_catalog_conn = SLCSpatialProductCatalog(logger) + spatial_catalog_conn = SLCSpatialProductCatalog(self.logger) spatial_catalog_conn.process_granule(granule) def prepare_additional_fields(self, granule, args, granule_id): diff --git a/data_subscriber/submit_pending_jobs.py b/data_subscriber/submit_pending_jobs.py index 691cc4c0f..f56135a5f 100644 --- a/data_subscriber/submit_pending_jobs.py +++ b/data_subscriber/submit_pending_jobs.py @@ -3,22 +3,21 @@ '''Goes through the list of pending jobs and submits them to the job queue after checking if they are ready to be submitted''' -import boto3 import logging import sys -from commons.logger import NoJobUtilsFilter, NoBaseFilter, NoLogUtilsFilter -from util.conf_util import SettingsConf -from data_subscriber.cmr import get_cmr_token -from data_subscriber.parser import create_parser -from data_subscriber.query import submit_download_job +from cslc_utils import (get_pending_download_jobs, + localize_disp_frame_burst_hist, + mark_pending_download_job_submitted) +from 
commons.logger import configure_library_loggers from data_subscriber import es_conn_util -from cslc_utils import (get_pending_download_jobs, localize_disp_frame_burst_hist, mark_pending_download_job_submitted) -from data_subscriber.cslc.cslc_dependency import CSLCDependency +from data_subscriber.cmr import get_cmr_token from data_subscriber.cslc.cslc_blackout import DispS1BlackoutDates, localize_disp_blackout_dates from data_subscriber.cslc.cslc_catalog import CSLCProductCatalog - - +from data_subscriber.cslc.cslc_dependency import CSLCDependency +from data_subscriber.parser import create_parser +from data_subscriber.query import submit_download_job +from util.conf_util import SettingsConf from util.exec_util import exec_wrapper logging.basicConfig(level="INFO") @@ -27,20 +26,9 @@ @exec_wrapper def main(): - configure_logger() + configure_library_loggers() run(sys.argv) -def configure_logger(): - logger_hysds_commons = logging.getLogger("hysds_commons") - logger_hysds_commons.addFilter(NoJobUtilsFilter()) - - logger_elasticsearch = logging.getLogger("elasticsearch") - logger_elasticsearch.addFilter(NoBaseFilter()) - - boto3.set_stream_logger(name='botocore.credentials', level=logging.ERROR) - - logger.addFilter(NoLogUtilsFilter()) - def run(argv: list[str]): logger.info(f"{argv=}") diff --git a/data_subscriber/survey.py b/data_subscriber/survey.py index 89427f6b3..c8f144a8a 100644 --- a/data_subscriber/survey.py +++ b/data_subscriber/survey.py @@ -1,12 +1,12 @@ import asyncio -import logging from datetime import datetime, timedelta import backoff +from commons.logger import get_logger +from data_subscriber import cslc_utils from data_subscriber.cmr import async_query_cmr, CMR_TIME_FORMAT from data_subscriber.query import get_query_timerange, DateTimeRange -from data_subscriber import cslc_utils _date_format_str = CMR_TIME_FORMAT _date_format_str_cmr = _date_format_str[:-1] + ".%fZ" @@ -27,7 +27,7 @@ def _query_cmr_backoff(args, token, cmr, settings, query_timerange, now, disp_bu def run_survey(args, token, cmr, settings): - + logger = get_logger() start_dt = datetime.strptime(args.start_date, _date_format_str) end_dt = datetime.strptime(args.end_date, _date_format_str) @@ -45,7 +45,7 @@ def run_survey(args, token, cmr, settings): disp_burst_map = None if args.frame_id is not None: - logging.info("Querying for DISP-S1 frame_id only: " + str(args.frame_id)) + logger.info("Querying for DISP-S1 frame_id only: " + str(args.frame_id)) disp_burst_map, _, _ = cslc_utils.localize_disp_frame_burst_hist() while start_dt < end_dt: @@ -73,10 +73,10 @@ def run_survey(args, token, cmr, settings): g_td_dt = datetime.strptime(g_td, _date_format_str) update_temporal_delta = g_rd_dt - g_td_dt update_temporal_delta_hrs = update_temporal_delta.total_seconds() / 3600 - logging.debug(f"{g_id}, {g_rd}, {g_td}, delta: {update_temporal_delta_hrs} hrs") + logger.debug(f"{g_id}, {g_rd}, {g_td}, delta: {update_temporal_delta_hrs} hrs") if (g_id in all_granules): (og_rd, og_td, _, _) = all_granules[g_id] - logging.warning(f"{g_id} had already been found {og_rd=} {og_td=}") + logger.warning(f"{g_id} had already been found {og_rd=} {og_td=}") else: raw_csv.write(g_id+", "+g_rd+", "+g_td+", "+"%10.2f" % update_temporal_delta_hrs+", "+r_id+"\n") all_granules[g_id] = (g_rd, g_td, update_temporal_delta_hrs, r_id) @@ -92,29 +92,27 @@ def run_survey(args, token, cmr, settings): out_csv.write(str(count)) out_csv.write('\n') - logging.info(f"{start_str},{end_str},{str(count)}") + 
logger.info(f"{start_str},{end_str},{str(count)}") start_dt = start_dt + step_time total_g_str = "Total granules found: " + str(total_granules) print(f"{len(all_granules)=}") - logging.info(total_g_str) + logger.info(total_g_str) out_csv.write(total_g_str) out_csv.close() raw_csv.close() - logging.info(f"Output CSV written out to files: {args.out_csv}, {args.out_csv}.raw.csv" ) + logger.info(f"Output CSV written out to files: {args.out_csv}, {args.out_csv}.raw.csv" ) hist_title = f"Histogram of Revision vs Temporal Time for all granules" - logging.info(hist_title) + logger.info(hist_title) import matplotlib.pyplot as plt _ = plt.hist(all_deltas, bins=50) - #print(hist_e) - #print(hist_v) plt.title(hist_title) - logging.info("Saving histogram figure as " + args.out_csv+".svg") + logger.info("Saving histogram figure as " + args.out_csv + ".svg") plt.savefig(args.out_csv+".svg", format="svg", dpi=1200) plt.show() diff --git a/data_subscriber/url.py b/data_subscriber/url.py index 513bfe7c0..826621c48 100644 --- a/data_subscriber/url.py +++ b/data_subscriber/url.py @@ -1,9 +1,11 @@ -import logging import re +from datetime import timedelta from pathlib import Path from typing import Any + import dateutil -from datetime import datetime, timedelta + +from commons.logger import get_logger _EPOCH_S1A = "20140101T000000Z" @@ -56,28 +58,31 @@ def _to_tile_id(dl_doc: dict[str, Any]): def _has_url(dl_dict: dict[str, Any]): + logger = get_logger() result = _has_s3_url(dl_dict) or _has_https_url(dl_dict) if not result: - logging.error(f"Couldn't find any URL in {dl_dict=}") + logger.error(f"Couldn't find any URL in {dl_dict=}") return result def _has_https_url(dl_dict: dict[str, Any]): + logger = get_logger() result = dl_dict.get("https_url") if not result: - logging.warning(f"Couldn't find any HTTPS URL in {dl_dict=}") + logger.warning(f"Couldn't find any HTTPS URL in {dl_dict=}") return result def _has_s3_url(dl_dict: dict[str, Any]): + logger = get_logger() result = dl_dict.get("s3_url") if not result: - logging.warning(f"Couldn't find any S3 URL in {dl_dict=}") + logger.warning(f"Couldn't find any S3 URL in {dl_dict=}") return result @@ -116,7 +121,6 @@ def determine_acquisition_cycle(burst_id, acquisition_dts, granule_id, epoch = N burst_identification_number / MAX_BURST_IDENTIFICATION_NUMBER)) ) / ACQUISITION_CYCLE_DURATION_SECS - #print(acquisition_index) acquisition_cycle = round(acquisition_index) assert acquisition_cycle >= 0, f"Acquisition cycle is negative: {acquisition_cycle=}" diff --git a/tools/ops/cmr_audit/cmr_client.py b/tools/ops/cmr_audit/cmr_client.py index e8c33f283..1369ea148 100644 --- a/tools/ops/cmr_audit/cmr_client.py +++ b/tools/ops/cmr_audit/cmr_client.py @@ -1,7 +1,7 @@ + import asyncio import contextlib import itertools -import logging import math import os from math import ceil @@ -12,35 +12,35 @@ import requests from requests.exceptions import HTTPError -logger = logging.getLogger(__name__) +from commons.logger import get_logger async def async_cmr_posts(url, request_bodies: list): - """Given a list of request bodies, performs CMR queries asynchronously, returning the response JSONs.""" - logger.info("Querying CMR") - + """Given a list of request bodies, performs CMR queries asynchronously, returning the response JSONs.""" async with aiohttp.ClientSession() as session: tasks = [] sem = asyncio.Semaphore(1) - # async with asyncio.Semaphore(1) as sem: + for request_body in request_bodies: tasks.append(async_cmr_post(url, request_body, session, sem)) responses = await 
asyncio.gather(*tasks) - logger.info("Queried CMR") return list(itertools.chain.from_iterable(responses)) async def async_cmr_post(url, data: str, session: aiohttp.ClientSession, sem: Optional[asyncio.Semaphore]): """Issues a request asynchronously. If a semaphore is provided, it will use it as a context manager.""" + logger = get_logger() + sem = sem if sem is not None else contextlib.nullcontext() + async with sem: page_size = 2000 # default is 10, max is 2000 data += f"&page_size={page_size}" logger.debug(f"async_cmr_post({url=}..., {len(data)=:,}, {data[-250:]=}") max_pages = 1 # cap the number of pages (requests) to scroll through results. - # after first response, update with the smaller of the forced max and the number of hits + # after first response, update with the smallest of the forced max and the number of hits current_page = 1 headers = { @@ -48,7 +48,7 @@ async def async_cmr_post(url, data: str, session: aiohttp.ClientSession, sem: Op 'Client-Id': f'nasa.jpl.opera.sds.pcm.data_subscriber.{os.environ["USER"]}' } - logger.info("Issuing request. This may take a while depending on search page size and number of pages/results") + logger.info("Issuing request. This may take a while depending on search page size and number of pages/results.") response_jsons = [] while current_page <= max_pages: @@ -57,13 +57,16 @@ async def async_cmr_post(url, data: str, session: aiohttp.ClientSession, sem: Op response_jsons.append(response_json) if current_page == 1: - logger.info(f'CMR number of granules (cmr-query): {response_json["hits"]=:,}') + logger.debug(f'CMR number of granules (cmr-query): {response_json["hits"]=:,}') max_pages = math.ceil(response_json["hits"]/page_size) - logger.info(f"Updating max pages to {max_pages=}") - logger.info(f'CMR query (cmr-query-page {current_page} of {ceil(response_json["hits"]/page_size)}): {len(response_json["items"])=:,}') + logger.debug("Updating max pages to %d", max_pages) + + logger.debug(f'CMR query (cmr-query-page {current_page} of {ceil(response_json["hits"]/page_size)}): ' + f'{len(response_json["items"])=:,}') cmr_search_after = response.headers.get("CMR-Search-After") logger.debug(f"{cmr_search_after=}") + if cmr_search_after: headers.update({"CMR-Search-After": response.headers["CMR-Search-After"]}) @@ -74,9 +77,9 @@ async def async_cmr_post(url, data: str, session: aiohttp.ClientSession, sem: Op current_page += 1 if not current_page <= max_pages: logger.warning( - f"Reached max pages limit. {max_pages=} " - "Not all search results exhausted. " - "Adjust limit or time ranges to process all hits, then re-run this script." + "Reached max pages limit (%d). Not all search results exhausted. " + "Adjust limit or time ranges to process all hits, then re-run this script.", + max_pages ) return response_jsons diff --git a/util/job_util.py b/util/job_util.py index 37655ee7b..ff6764651 100644 --- a/util/job_util.py +++ b/util/job_util.py @@ -12,14 +12,13 @@ def is_running_outside_verdi_worker_context(): def supply_job_id(): if is_running_outside_verdi_worker_context(): - logger.debug("Running outside of job context. Generating random job ID") + logger.info("Running outside of job context. 
Generating random job ID") job_id = str(uuid.uuid4()) else: with open("_job.json", "r+") as job: - logger.debug("job_path: {}".format(job)) + logger.debug("job_path: %s", str(job)) local_job_json = json.load(job) logger.debug(f"{local_job_json=!s}") job_id = local_job_json["job_info"]["job_payload"]["payload_task_id"] - logger.info(f"{job_id=}") return job_id From cf63d77706dca9b2aa073996873debd656ab9c55 Mon Sep 17 00:00:00 2001 From: Scott Collins Date: Tue, 5 Nov 2024 09:40:23 -0600 Subject: [PATCH 09/17] Added support for a quiet logging mode to daac_data_subscriber.py --- data_subscriber/daac_data_subscriber.py | 2 +- data_subscriber/parser.py | 18 ++++++++++++------ 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/data_subscriber/daac_data_subscriber.py b/data_subscriber/daac_data_subscriber.py index 6be52bc14..a512163d2 100644 --- a/data_subscriber/daac_data_subscriber.py +++ b/data_subscriber/daac_data_subscriber.py @@ -59,7 +59,7 @@ def run(argv: list[str]): validate_args(args) - logger = get_logger(args.verbose) + logger = get_logger(args.verbose, args.quiet) configure_library_loggers() es_conn = supply_es_conn(args) diff --git a/data_subscriber/parser.py b/data_subscriber/parser.py index 18f472b3f..0e7260420 100644 --- a/data_subscriber/parser.py +++ b/data_subscriber/parser.py @@ -15,7 +15,13 @@ def create_parser(): verbose = {"positionals": ["-v", "--verbose"], "kwargs": {"dest": "verbose", "action": "store_true", - "help": "Verbose mode."}} + "help": "Verbose mode, enables debug level logging. If provided, " + "takes precedence over the --quiet option."}} + + quiet ={"positionals": ["-q", "--quiet"], + "kwargs": {"dest": "quiet", + "action": "store_true", + "help": "Quiet mode, only warning and error level messages will be logged."}} file = {"positionals": ["-f", "--file"], "kwargs": {"dest": "file", @@ -211,12 +217,12 @@ def create_parser(): "help": "The protocol used for retrieving data, " "HTTPS or S3 or AUTO."}} - parser_arg_list = [verbose, file] + parser_arg_list = [verbose, quiet, file] _add_arguments(parser, parser_arg_list) survey_parser = subparsers.add_parser("survey", formatter_class=argparse.ArgumentDefaultsHelpFormatter) - survey_parser_arg_list = [verbose, endpoint, provider, collection, + survey_parser_arg_list = [verbose, quiet, endpoint, provider, collection, start_date, end_date, bbox, minutes, max_revision, smoke_run, native_id, frame_id, use_temporal, temporal_start_date, step_hours, out_csv] @@ -224,7 +230,7 @@ def create_parser(): full_parser = subparsers.add_parser("full", formatter_class=argparse.ArgumentDefaultsHelpFormatter) - full_parser_arg_list = [verbose, endpoint, collection, start_date, end_date, + full_parser_arg_list = [verbose, quiet, endpoint, collection, start_date, end_date, bbox, minutes, k, m, grace_mins, dry_run, smoke_run, no_schedule_download, release_version, job_queue, chunk_size, max_revision, @@ -236,7 +242,7 @@ def create_parser(): query_parser = subparsers.add_parser("query", formatter_class=argparse.ArgumentDefaultsHelpFormatter) - query_parser_arg_list = [verbose, endpoint, collection, start_date, end_date, + query_parser_arg_list = [verbose, quiet, endpoint, collection, start_date, end_date, bbox, minutes, k, m, grace_mins, dry_run, smoke_run, no_schedule_download, release_version, job_queue, chunk_size, max_revision, @@ -248,7 +254,7 @@ def create_parser(): download_parser = subparsers.add_parser("download", formatter_class=argparse.ArgumentDefaultsHelpFormatter) - download_parser_arg_list = [verbose, file, 
endpoint, dry_run, smoke_run, provider, + download_parser_arg_list = [verbose, quiet, file, endpoint, dry_run, smoke_run, provider, batch_ids, start_date, end_date, use_temporal, proc_mode, temporal_start_date, transfer_protocol, release_version] _add_arguments(download_parser, download_parser_arg_list) From 5591877dd4f404d6314e18f810e1e6d225f885f1 Mon Sep 17 00:00:00 2001 From: Scott Collins Date: Tue, 5 Nov 2024 12:24:34 -0600 Subject: [PATCH 10/17] Removed unused deprecated CMR request functions --- data_subscriber/cmr.py | 7 +-- .../test_daac_data_subscriber.py | 59 ------------------- tools/ops/cmr_audit/cmr_client.py | 49 --------------- 3 files changed, 1 insertion(+), 114 deletions(-) diff --git a/data_subscriber/cmr.py b/data_subscriber/cmr.py index 419f37c87..59003e656 100644 --- a/data_subscriber/cmr.py +++ b/data_subscriber/cmr.py @@ -15,7 +15,7 @@ from more_itertools import first_true from rtc_utils import rtc_granule_regex from tools.ops.cmr_audit import cmr_client -from tools.ops.cmr_audit.cmr_client import cmr_requests_get, async_cmr_posts +from tools.ops.cmr_audit.cmr_client import async_cmr_posts MAX_CHARS_PER_LINE = 250000 """The maximum number of characters per line you can display in cloudwatch logs""" @@ -254,11 +254,6 @@ async def _async_request_search_cmr_granules(args, request_url, paramss: Iterabl return response_jsons_to_cmr_granules(args, response_jsons) -def _request_search_cmr_granules(args, request_url, params): - response_jsons = cmr_requests_get(args, request_url, params) - return response_jsons_to_cmr_granules(args, response_jsons) - - def response_jsons_to_cmr_granules(args, response_jsons): items = [item for response_json in response_jsons diff --git a/tests/data_subscriber/test_daac_data_subscriber.py b/tests/data_subscriber/test_daac_data_subscriber.py index 7e0f202ac..b6b9e764a 100644 --- a/tests/data_subscriber/test_daac_data_subscriber.py +++ b/tests/data_subscriber/test_daac_data_subscriber.py @@ -594,65 +594,6 @@ def patch_subscriber(monkeypatch): daac_data_subscriber.supply_token.__name__, mock_token ) - monkeypatch.setattr( - cmr, - cmr._request_search_cmr_granules.__name__, - MagicMock(return_value=( - [ - { - "granule_id": "dummy_granule_id", - "filtered_urls": [ - "https://example.com/T00000.B02.tif", - ], - "related_urls": [ - "https://example.com/T00000.B02.tif", - ], - "identifier": "S2A_dummy", - "temporal_extent_beginning_datetime": datetime.now().isoformat(), - "revision_date": datetime.now().isoformat(), - }, - { - "granule_id": "dummy_granule_id_2", - "filtered_urls": [ - "https://example.com/T00001.B02.tif", - "https://example.com/T00001.B03.tif", - ], - "related_urls": [ - "https://example.com/T00001.B02.tif", - "https://example.com/T00001.B03.tif", - ], - "identifier": "S2A_dummy", - "temporal_extent_beginning_datetime": datetime.now().isoformat(), - "revision_date": datetime.now().isoformat(), - }, - { - "granule_id": "dummy_granule_id_3", - "filtered_urls": [ - "https://example.com/T00002.B02.tif", - ], - "related_urls": [ - "https://example.com/T00002.B02.tif", - ], - "identifier": "S2A_dummy", - "temporal_extent_beginning_datetime": datetime.now().isoformat(), - "revision_date": datetime.now().isoformat(), - }, - { - "granule_id": "dummy_granule_id_4", - "filtered_urls": [ - "https://example.com/T00003.B01.tif", - ], - "related_urls": [ - "https://example.com/T00003.B01.tif", - ], - "identifier": "S2A_dummy", - "temporal_extent_beginning_datetime": datetime.now().isoformat(), - "revision_date": datetime.now().isoformat(), - } - 
], - False # search_after - )) - ) monkeypatch.setattr( daac_data_subscriber, daac_data_subscriber.update_url_index.__name__, diff --git a/tools/ops/cmr_audit/cmr_client.py b/tools/ops/cmr_audit/cmr_client.py index 1369ea148..e2c091f65 100644 --- a/tools/ops/cmr_audit/cmr_client.py +++ b/tools/ops/cmr_audit/cmr_client.py @@ -108,55 +108,6 @@ async def fetch_post_url(session: aiohttp.ClientSession, url, data: str, headers return await session.post(url, data=data, headers=headers, raise_for_status=True) -def cmr_requests_get(args, request_url, params): - """ - DEPRECATED. Issues a CMR request using GET. - Newer code should use cmr_client.async_cmr_post. - """ - page_size = 2000 # default is 10, max is 2000 - params["page_size"] = page_size - - logger.debug(f"_request_search_concurrent({request_url=}, {params=}") - - response_jsons = [] - max_pages = 1 # cap the number of pages (requests) to scroll through results. - # update after first response - - current_page = 1 - headers = { - 'Client-Id': f'nasa.jpl.opera.sds.pcm.data_subscriber.{os.environ["USER"]}' - } - while current_page <= max_pages: - response = try_request_get(request_url, params, headers, raise_for_status=True) - response_json = response.json() - response_jsons.append(response_json) - - if current_page == 1: - logger.info(f'CMR number of granules (cmr-query): {response_json["hits"]=:,}') - max_pages = math.ceil(response_json["hits"] / page_size) - logger.info(f"Updating max pages to {max_pages=}") - logger.info(f'CMR number of granules (cmr-query-page {current_page} of {max_pages}): {len(response_json["items"])=:,}') - - cmr_search_after = response.headers.get("CMR-Search-After") - logger.debug(f"{cmr_search_after=}") - if cmr_search_after: - headers.update({"CMR-Search-After": response.headers["CMR-Search-After"]}) - - if len(response_json["items"]) < page_size: - logger.info("Reached end of CMR search results. Ending query.") - break - - current_page += 1 - if not current_page <= max_pages: - logger.warning( - f"Reached max pages limit. {max_pages=}" - "Not all search results exhausted. " - "Adjust limit or time ranges to process all hits, then re-run this script." - ) - - return response_jsons - - def giveup_cmr_requests(e): """giveup function for use with @backoff decorator when issuing CMR requests using blocking `requests` functions.""" if isinstance(e, HTTPError): From 5a96425ffa96241ddfbb8111e5fed9846ace32da Mon Sep 17 00:00:00 2001 From: Scott Collins Date: Tue, 5 Nov 2024 12:25:04 -0600 Subject: [PATCH 11/17] Logging improvements for data_subscriber/query.py --- data_subscriber/query.py | 50 ++++++++++++++++++++-------------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/data_subscriber/query.py b/data_subscriber/query.py index 80c96137d..75f28a3de 100644 --- a/data_subscriber/query.py +++ b/data_subscriber/query.py @@ -45,9 +45,11 @@ def run_query(self): now = datetime.utcnow() query_timerange: DateTimeRange = get_query_timerange(self.args, now) - self.logger.info("CMR query STARTED") + self.logger.info("CMR Query STARTED") + granules = self.query_cmr(query_timerange, now) - self.logger.info("CMR query FINISHED") + + self.logger.info("CMR Query FINISHED") # Get rid of duplicate granules. This happens often for CSLC and TODO: probably RTC granules = self.eliminate_duplicate_granules(granules) @@ -72,17 +74,17 @@ def run_query(self): '''TODO: Optional. For CSLC query jobs, make sure that we got all the bursts here according to database json. 
Otherwise, fail this job''' - self.logger.info("catalogue-ing STARTED") + self.logger.info("Granule Cataloguing STARTED") + self.catalog_granules(granules, query_dt) - self.logger.info("catalogue-ing FINISHED") - #TODO: This function only applies to RTC, merge w CSLC at some point + self.logger.info("Granule Cataloguing FINISHED") + + # TODO: This function only applies to RTC, merge w CSLC at some point batch_id_to_products_map = self.refresh_index() if self.args.subparser_name == "full": - self.logger.info( - f"{self.args.subparser_name=}. Skipping download job submission. Download will be performed directly." - ) + self.logger.info("Skipping download job submission. Download will be performed directly.") if COLLECTION_TO_PRODUCT_TYPE_MAP[self.args.collection] == ProductType.RTC: self.args.provider = COLLECTION_TO_PROVIDER_TYPE_MAP[self.args.collection] @@ -95,12 +97,12 @@ def run_query(self): return {"download_granules": download_granules} if self.args.no_schedule_download: - self.logger.info(f"{self.args.no_schedule_download=}. Forcefully skipping download job submission.") + self.logger.info("Forcefully skipping download job submission.") return {"download_granules": download_granules} if not self.args.chunk_size: - self.logger.info(f"{self.args.chunk_size=}. Insufficient chunk size. Skipping download job submission.") - return + self.logger.info("Insufficient chunk size (%s). Skipping download job submission.", str(self.args.chunk_size)) + return {"download_granules": download_granules} if COLLECTION_TO_PRODUCT_TYPE_MAP[self.args.collection] == ProductType.RTC: job_submission_tasks = submit_rtc_download_job_submissions_tasks(batch_id_to_products_map.keys(), self.args, self.settings) @@ -109,14 +111,13 @@ def run_query(self): job_submission_tasks = self.download_job_submission_handler(download_granules, query_timerange) results = job_submission_tasks - self.logger.info(f"{len(results)=}") - self.logger.debug(f"{results=}") - succeeded = [job_id for job_id in results if isinstance(job_id, str)] failed = [e for e in results if isinstance(e, Exception)] - self.logger.info(f"{succeeded=}") - self.logger.info(f"{failed=}") + self.logger.debug(f"{results=}") + self.logger.debug(f"{succeeded=}") + self.logger.debug(f"{failed=}") + self.logger.debug(f"{download_granules=}") return { "success": succeeded, @@ -233,12 +234,12 @@ def get_download_chunks(self, batch_id_to_urls_map): def submit_download_job_submissions_tasks(self, batch_id_to_urls_map, query_timerange): job_submission_tasks = [] - self.logger.info(f"{self.args.chunk_size=}") - if COLLECTION_TO_PRODUCT_TYPE_MAP[self.args.collection] == ProductType.CSLC: # Note that self.disp_burst_map_hist and self.blackout_dates_obj are created in the child class - cslc_dependency = CSLCDependency(self.args.k, self.args.m, self.disp_burst_map_hist, self.args, self.token, - self.cmr, self.settings, self.blackout_dates_obj) + cslc_dependency = CSLCDependency( + self.args.k, self.args.m, self.disp_burst_map_hist, self.args, + self.token, self.cmr, self.settings, self.blackout_dates_obj + ) for batch_chunk in self.get_download_chunks(batch_id_to_urls_map): chunk_batch_ids = [] @@ -258,8 +259,8 @@ def submit_download_job_submissions_tasks(self, batch_id_to_urls_map, query_time payload_hash = hashlib.md5(granule_to_hash.encode()).hexdigest() - self.logger.info(f"{chunk_batch_ids=}") - self.logger.info(f"{payload_hash=}") + self.logger.debug(f"{chunk_batch_ids=}") + self.logger.debug(f"{payload_hash=}") self.logger.debug(f"{chunk_urls=}") params = 
self.create_download_job_params(query_timerange, chunk_batch_ids) @@ -274,7 +275,7 @@ def submit_download_job_submissions_tasks(self, batch_id_to_urls_map, query_time # and wait for the next query to come in. Any acquisition index will work because all batches # require the same compressed cslcs if not cslc_dependency.compressed_cslc_satisfied(frame_id, acq_indices[0], self.es_conn.es_util): - self.logger.info(f"Not all compressed CSLCs are satisfied so this download job is blocked until they are satisfied") + self.logger.info(f"Not all compressed CSLCs are satisfied so this download job is blocked until they are satisfied.") save_blocked_download_job(self.es_conn.es_util, self.settings["RELEASE_VERSION"], product_type, params, self.args.job_queue, job_name, frame_id, acq_indices[0], self.args.k, self.args.m, chunk_batch_ids) @@ -359,7 +360,7 @@ def create_download_job_params(self, query_timerange, chunk_batch_ids): "from": "value" } ] - self.logger.info(f"{download_job_params=}") + self.logger.debug(f"{download_job_params=}") return download_job_params @@ -381,7 +382,6 @@ def submit_download_job(*, release_version=None, product_type: str, params: list def _submit_mozart_job_minimal(*, hysdsio: dict, job_queue: str, provider_str: str, job_name = None, payload_hash = None) -> str: - if not job_name: job_name = f"job-WF-{provider_str}_download" From 3dee3ca925ea04694847bf458dfda8abf016832b Mon Sep 17 00:00:00 2001 From: Scott Collins Date: Tue, 5 Nov 2024 12:43:14 -0600 Subject: [PATCH 12/17] Removed usage and references to the silent argument used to control logging --- data_subscriber/cmr.py | 64 +++++++++++++++---------- data_subscriber/cslc/cslc_blackout.py | 4 +- data_subscriber/cslc/cslc_dependency.py | 11 +++-- data_subscriber/cslc/cslc_query.py | 18 +++---- data_subscriber/survey.py | 6 +-- tools/disp_s1_burst_db_tool.py | 6 +-- tools/update_disp_s1_burst_db.py | 19 ++++---- 7 files changed, 72 insertions(+), 56 deletions(-) diff --git a/data_subscriber/cmr.py b/data_subscriber/cmr.py index 59003e656..21dc07b77 100644 --- a/data_subscriber/cmr.py +++ b/data_subscriber/cmr.py @@ -1,6 +1,5 @@ #!/usr/bin/env python3 -import logging import netrc import re from collections import namedtuple @@ -109,7 +108,7 @@ def get_cmr_token(endpoint, settings): return cmr, token, username, password, edl -async def async_query_cmr(args, token, cmr, settings, timerange, now: datetime, silent=False) -> list: +async def async_query_cmr(args, token, cmr, settings, timerange, now: datetime) -> list: logger = get_logger() request_url = f"https://{cmr}/search/granules.umm_json" bounding_box = args.bbox @@ -168,8 +167,7 @@ async def async_query_cmr(args, token, cmr, settings, timerange, now: datetime, temporal_range = _get_temporal_range(timerange_start_date, timerange_end_date, now_date) force_temporal = True - if not silent: - logger.info(f"Time Range: {temporal_range} use_temporal: {args.use_temporal}") + logger.debug("Time Range: %s, use_temporal: %s", temporal_range, args.use_temporal) if args.use_temporal or force_temporal is True: params["temporal"] = temporal_range @@ -178,29 +176,39 @@ async def async_query_cmr(args, token, cmr, settings, timerange, now: datetime, # if a temporal start-date is provided, set temporal if args.temporal_start_date: - if not silent: - logger.info(f"{args.temporal_start_date=}") + logger.debug("Using args.temporal_start_date=%s", args.temporal_start_date) params["temporal"] = dateutil.parser.isoparse(args.temporal_start_date).strftime(CMR_TIME_FORMAT) - if not silent: - 
logger.info(f"Querying CMR. {request_url=} {params=}") + logger.info(f"Querying CMR.") + logger.debug("request_url=%s", request_url) + logger.debug("params=%s", params) product_granules = await _async_request_search_cmr_granules(args, request_url, [params]) search_results_count = len(product_granules) - if not silent: - logger.info(f"QUERY RESULTS: Found {search_results_count} granules") - products_per_line = 1000 # Default but this would never be used because we calculate dynamically below. Just here incase code moves around and we want a reasonable default - if search_results_count > 0: - # Print out all the query results but limit the number of characters per line - one_logout = f'{(product_granules[0]["granule_id"], "revision " + str(product_granules[0]["revision_id"]))}' - chars_per_line = len(one_logout) + 6 # 6 is a fudge factor - products_per_line = MAX_CHARS_PER_LINE // chars_per_line - for i in range(0, search_results_count, products_per_line): - end_range = i + products_per_line - if end_range > search_results_count: - end_range = search_results_count - logger.info(f'QUERY RESULTS {i+1} to {end_range} of {search_results_count}: {[(granule["granule_id"], "revision " + str(granule["revision_id"])) for granule in product_granules[i:end_range]]}') + logger.info(f"CMR Query Complete. Found %d granule(s)", search_results_count) + + # Default but this would never be used because we calculate dynamically below. + # Just here incase code moves around and we want a reasonable default + products_per_line = 1000 + + if search_results_count > 0: + # Print out all the query results but limit the number of characters per line + one_logout = f'{(product_granules[0]["granule_id"], "revision " + str(product_granules[0]["revision_id"]))}' + chars_per_line = len(one_logout) + 6 # 6 is a fudge factor + products_per_line = MAX_CHARS_PER_LINE // chars_per_line + + for i in range(0, search_results_count, products_per_line): + end_range = i + products_per_line + if end_range > search_results_count: + end_range = search_results_count + + logger.info('QUERY RESULTS %d to %d of %d: ', i + 1, end_range, search_results_count) + + for granule in product_granules[i:end_range]: + logger.info( + f'{(granule["granule_id"], "revision " + str(granule["revision_id"]))}' + ) # Filter out granules with revision-id greater than max allowed least_revised_granules = [] @@ -218,19 +226,25 @@ async def async_query_cmr(args, token, cmr, settings, timerange, now: datetime, product_granules = least_revised_granules if len(product_granules) != search_results_count: - logger.info(f"Filtered to {len(product_granules)} granules after least revision check") + logger.info("Filtered to %d granules after least revision check", len(product_granules)) if args.collection in settings["SHORTNAME_FILTERS"]: product_granules = [granule for granule in product_granules if _match_identifier(settings, args, granule)] if len(product_granules) != search_results_count: - logger.info(f"Filtered to {len(product_granules)} total granules after shortname filter check") + logger.info(f"Filtered to %d total granules after shortname filter check", len(product_granules)) for i in range(0, len(product_granules), products_per_line): end_range = i + products_per_line if end_range > len(product_granules): end_range = len(product_granules) - logger.info(f'FILTERED RESULTS {i+1} to {end_range} of {len(product_granules)}: {[(granule["granule_id"], "revision " + str(granule["revision_id"])) for granule in product_granules[i:end_range]]}') + + 
logger.info(f'FILTERED RESULTS %d to %d of %d: ', i + 1, end_range, len(product_granules)) + + for granule in product_granules[i:end_range]: + logger.info( + f'{(granule["granule_id"], "revision " + str(granule["revision_id"]))}' + ) for granule in product_granules: granule["filtered_urls"] = _filter_granules(granule, args) @@ -242,7 +256,7 @@ async def async_query_cmr(args, token, cmr, settings, timerange, now: datetime, return product_granules -def _get_temporal_range(start: str, end: str, now: str): +def _get_temporal_range(start: str, end: str, now: str) -> str: start = start if start is not False else "1900-01-01T00:00:00Z" end = end if end is not False else now diff --git a/data_subscriber/cslc/cslc_blackout.py b/data_subscriber/cslc/cslc_blackout.py index 3d503ffd0..335b2f6b4 100644 --- a/data_subscriber/cslc/cslc_blackout.py +++ b/data_subscriber/cslc/cslc_blackout.py @@ -148,9 +148,9 @@ def _filter_cslc_blackout_polarization(granules, proc_mode, blackout_dates_obj, return filtered_granules -def query_cmr_cslc_blackout_polarization(args, token, cmr, settings, query_timerange, now, silent, blackout_dates_obj, +def query_cmr_cslc_blackout_polarization(args, token, cmr, settings, query_timerange, now, blackout_dates_obj, no_duplicate, force_frame_id, vv_only = True): '''Query CMR for CSLC granules and filter for blackout dates and polarization''' - granules = asyncio.run(async_query_cmr(args, token, cmr, settings, query_timerange, now, silent)) + granules = asyncio.run(async_query_cmr(args, token, cmr, settings, query_timerange, now)) return _filter_cslc_blackout_polarization(granules, args.proc_mode, blackout_dates_obj, no_duplicate, force_frame_id, vv_only) \ No newline at end of file diff --git a/data_subscriber/cslc/cslc_dependency.py b/data_subscriber/cslc/cslc_dependency.py index de245fa47..5b3ce2373 100644 --- a/data_subscriber/cslc/cslc_dependency.py +++ b/data_subscriber/cslc/cslc_dependency.py @@ -44,11 +44,12 @@ def get_prev_day_indices(self, day_index: int, frame_number: int): days_delta = day_index - frame.sensing_datetime_days_index[-1] end_date = start_date + timedelta(days=days_delta - 1) # We don't want the current day index in this query_timerange = DateTimeRange(start_date.strftime(CMR_TIME_FORMAT), end_date.strftime(CMR_TIME_FORMAT)) - acq_index_to_bursts, _ = self.get_k_granules_from_cmr(query_timerange, frame_number, silent = True) + acq_index_to_bursts, _ = self.get_k_granules_from_cmr(query_timerange, frame_number) all_prev_indices = frame.sensing_datetime_days_index + sorted(list(acq_index_to_bursts.keys())) self.logger.debug(f"All previous day indices: {all_prev_indices}") return all_prev_indices - def get_k_granules_from_cmr(self, query_timerange, frame_number: int, silent = False): + + def get_k_granules_from_cmr(self, query_timerange, frame_number: int): '''Return two dictionaries that satisfy the burst pattern for the frame_number within the time range: 1. acq_index_to_bursts: day index to set of burst ids 2. 
acq_index_to_granules: day index to list of granules that match the burst @@ -61,7 +62,7 @@ def get_k_granules_from_cmr(self, query_timerange, frame_number: int, silent = F args.use_temporal = True granules = query_cmr_cslc_blackout_polarization( - args, self.token, self.cmr, self.settings, query_timerange, datetime.utcnow(), silent, self.blackout_dates_obj, True, frame_number, self.VV_only) + args, self.token, self.cmr, self.settings, query_timerange, datetime.utcnow(), self.blackout_dates_obj, True, frame_number, self.VV_only) return self.k_granules_grouping(frame_number, granules) @@ -101,7 +102,7 @@ def k_granules_grouping(self, frame_number, granules: list): return acq_index_to_bursts, acq_index_to_granules - def determine_k_cycle(self, acquisition_dts: datetime, day_index: int, frame_number: int, silent = False): + def determine_k_cycle(self, acquisition_dts: datetime, day_index: int, frame_number: int): '''Return where in the k-cycle this acquisition falls for the frame_number Must specify either acquisition_dts or day_index. Returns integer between 0 and k-1 where 0 means that it's at the start of the cycle @@ -131,7 +132,7 @@ def determine_k_cycle(self, acquisition_dts: datetime, day_index: int, frame_num end_date = acquisition_dts query_timerange = DateTimeRange(start_date.strftime(CMR_TIME_FORMAT), end_date.strftime(CMR_TIME_FORMAT)) - acq_index_to_bursts, _ = self.get_k_granules_from_cmr(query_timerange, frame_number, silent) + acq_index_to_bursts, _ = self.get_k_granules_from_cmr(query_timerange, frame_number) # The k-index is then the complete index number (historical + post historical) mod k self.logger.info(f"{len(acq_index_to_bursts.keys())} day indices since historical that match the burst pattern: {acq_index_to_bursts.keys()}") diff --git a/data_subscriber/cslc/cslc_query.py b/data_subscriber/cslc/cslc_query.py index 37d93bd75..a42b64bed 100644 --- a/data_subscriber/cslc/cslc_query.py +++ b/data_subscriber/cslc/cslc_query.py @@ -121,7 +121,7 @@ def determine_download_granules(self, granules): reproc_granules.extend(ready_granules) if self.args.k > 1: - k_granules = self.retrieve_k_granules(ready_granules, self.args, self.args.k - 1, True, silent=True) + k_granules = self.retrieve_k_granules(ready_granules, self.args, self.args.k - 1, True) self.catalog_granules(k_granules, datetime.now(), self.k_es_conn) self.logger.info(f"Length of K-granules: {len(k_granules)=}") for k_g in k_granules: @@ -249,7 +249,7 @@ def determine_download_granules(self, granules): return download_granules - def retrieve_k_granules(self, downloads, args, k_minus_one, VV_only = True, silent=False): + def retrieve_k_granules(self, downloads, args, k_minus_one, VV_only = True): '''# Go back as many 12-day windows as needed to find k- granules that have at least the same bursts as the current frame Return all the granules that satisfy that''' k_granules = [] @@ -290,7 +290,7 @@ def retrieve_k_granules(self, downloads, args, k_minus_one, VV_only = True, sile # Step 1 of 2: This will return dict of acquisition_cycle -> set of granules for only onse that match the burst pattern cslc_dependency = CSLCDependency( args.k, args.m, self.disp_burst_map_hist, args, self.token, self.cmr, self.settings, self.blackout_dates_obj, VV_only) - _, granules_map = cslc_dependency.get_k_granules_from_cmr(query_timerange, frame_id, silent=silent) + _, granules_map = cslc_dependency.get_k_granules_from_cmr(query_timerange, frame_id) # Step 2 of 2 ...Sort that by acquisition_cycle in decreasing order and then pick the first 
k-1 frames acq_day_indices = sorted(granules_map.keys(), reverse=True) @@ -333,9 +333,9 @@ def query_cmr_by_native_id (self, args, token, cmr, settings, now: datetime, nat timerange = DateTimeRange(start_date, end_date) self.logger.info(f"Querying CMR for all CSLC files that belong to the frame {frame_id}, derived from the native_id {native_id}") - return self.query_cmr_by_frame_and_dates(frame_id, local_args, token, cmr, settings, now, timerange, False) + return self.query_cmr_by_frame_and_dates(frame_id, local_args, token, cmr, settings, now, timerange) - def query_cmr_by_frame_and_acq_cycle(self, frame_id: int, acq_cycle: int, args, token, cmr, settings, now: datetime, silent=False): + def query_cmr_by_frame_and_acq_cycle(self, frame_id: int, acq_cycle: int, args, token, cmr, settings, now: datetime): '''Query CMR for specific date range for a specific frame_id and acquisition cycle. Need to always use temporal queries''' self.logger.info(f"Querying CMR for all CSLC files that belong to the frame {frame_id} and acquisition cycle {acq_cycle}") @@ -349,9 +349,9 @@ def query_cmr_by_frame_and_acq_cycle(self, frame_id: int, acq_cycle: int, args, end_date = (sensing_datetime + timedelta(minutes=15)).strftime(CMR_TIME_FORMAT) timerange = DateTimeRange(start_date, end_date) - return self.query_cmr_by_frame_and_dates(frame_id, new_args, token, cmr, settings, now, timerange, silent) + return self.query_cmr_by_frame_and_dates(frame_id, new_args, token, cmr, settings, now, timerange) - def query_cmr_by_frame_and_dates(self, frame_id: int, args, token, cmr, settings, now: datetime, timerange: DateTimeRange, silent=False): + def query_cmr_by_frame_and_dates(self, frame_id: int, args, token, cmr, settings, now: datetime, timerange: DateTimeRange): '''Query CMR for specific date range for a specific frame_id''' if frame_id not in self.disp_burst_map_hist: @@ -363,7 +363,7 @@ def query_cmr_by_frame_and_dates(self, frame_id: int, args, token, cmr, setting if count == 0: return [] new_args.native_id = native_id - new_granules = query_cmr_cslc_blackout_polarization(new_args, token, cmr, settings, timerange, now, silent, self.blackout_dates_obj, no_duplicate=True, force_frame_id=frame_id) + new_granules = query_cmr_cslc_blackout_polarization(new_args, token, cmr, settings, timerange, now, self.blackout_dates_obj, no_duplicate=True, force_frame_id=frame_id) return new_granules @@ -427,7 +427,7 @@ def query_cmr(self, timerange: DateTimeRange, now: datetime): frame_id = int(self.args.frame_id) all_granules = self.query_cmr_by_frame_and_dates(frame_id, self.args, self.token, self.cmr, self.settings, now, timerange) else: - all_granules = query_cmr_cslc_blackout_polarization(self.args, self.token, self.cmr, self.settings, timerange, now, False, self.blackout_dates_obj, False, None) + all_granules = query_cmr_cslc_blackout_polarization(self.args, self.token, self.cmr, self.settings, timerange, now, self.blackout_dates_obj, False, None) return all_granules diff --git a/data_subscriber/survey.py b/data_subscriber/survey.py index c8f144a8a..d4c87ab03 100644 --- a/data_subscriber/survey.py +++ b/data_subscriber/survey.py @@ -13,7 +13,7 @@ @backoff.on_exception(backoff.expo, Exception, max_value=13) -def _query_cmr_backoff(args, token, cmr, settings, query_timerange, now, disp_burst_map = None, silent=True): +def _query_cmr_backoff(args, token, cmr, settings, query_timerange, now, disp_burst_map = None): # If disp_burst_map is not None, that means we are only querying for a specific frame_id # Restrict CMR query by 
the burst pattern that make up the DISP-S1 frame @@ -22,7 +22,7 @@ def _query_cmr_backoff(args, token, cmr, settings, query_timerange, now, disp_bu args.native_id = native_id print(args.native_id) - result = asyncio.run(async_query_cmr(args, token, cmr, settings, query_timerange, now, silent)) + result = asyncio.run(async_query_cmr(args, token, cmr, settings, query_timerange, now)) return result @@ -61,7 +61,7 @@ def run_survey(args, token, cmr, settings): query_timerange: DateTimeRange = get_query_timerange(args, now) - granules = _query_cmr_backoff(args, token, cmr, settings, query_timerange, now, disp_burst_map, silent=True) + granules = _query_cmr_backoff(args, token, cmr, settings, query_timerange, now, disp_burst_map) count = 0 for granule in granules: diff --git a/tools/disp_s1_burst_db_tool.py b/tools/disp_s1_burst_db_tool.py index 05ce3b17d..a6078634e 100755 --- a/tools/disp_s1_burst_db_tool.py +++ b/tools/disp_s1_burst_db_tool.py @@ -80,7 +80,7 @@ blackout_dates_obj = DispS1BlackoutDates(localize_disp_blackout_dates(), disp_burst_map, burst_to_frames) blackout_dates_file = None -def get_k_cycle(acquisition_dts, frame_id, disp_burst_map, k, verbose): +def get_k_cycle(acquisition_dts, frame_id, disp_burst_map, k): subs_args = create_parser().parse_args(["query", "-c", "OPERA_L2_CSLC-S1_V1", "--processing-mode=forward"]) @@ -88,7 +88,7 @@ def get_k_cycle(acquisition_dts, frame_id, disp_burst_map, k, verbose): cmr, token, username, password, edl = get_cmr_token(subs_args.endpoint, settings) cslc_dependency = CSLCDependency(k, None, disp_burst_map, subs_args, token, cmr, settings, blackout_dates_obj) # we don't care about m here - k_cycle: int = cslc_dependency.determine_k_cycle(acquisition_dts, None, frame_id, silent = not verbose) + k_cycle: int = cslc_dependency.determine_k_cycle(acquisition_dts, None, frame_id) return k_cycle @@ -207,7 +207,7 @@ def validate_frame(frame_id, all_granules = None, detect_unexpected_cycles = Fal if args.k: k = int(args.k) - k_cycle = get_k_cycle(acquisition_dts, frame_ids[0], disp_burst_map, k, args.verbose) + k_cycle = get_k_cycle(acquisition_dts, frame_ids[0], disp_burst_map, k) if (k_cycle >= 0): print(f"K-cycle: {k_cycle} out of {k}") else: diff --git a/tools/update_disp_s1_burst_db.py b/tools/update_disp_s1_burst_db.py index 8991c6822..e418ec996 100644 --- a/tools/update_disp_s1_burst_db.py +++ b/tools/update_disp_s1_burst_db.py @@ -1,26 +1,27 @@ #!/usr/bin/env python3 -import logging +import argparse import json +import logging from collections import defaultdict -from datetime import datetime, timedelta -import argparse +from datetime import datetime + import backoff -from util.conf_util import SettingsConf +from data_subscriber import cslc_utils from data_subscriber.cmr import get_cmr_token +from data_subscriber.cslc.cslc_query import CslcCmrQuery from data_subscriber.parser import create_parser from data_subscriber.query import DateTimeRange -from data_subscriber.cslc.cslc_query import CslcCmrQuery -from data_subscriber import cslc_utils +from util.conf_util import SettingsConf ''' Tool to update the DISP S1 burst database sensing_time_list with latest data from CMR. 
Writes out the new file with .mod added to the end of the file name''' @backoff.on_exception(backoff.expo, Exception, max_tries=15) -def query_cmr_by_frame_and_dates_backoff(cslc_query, subs_args, token, cmr, settings, now, timerange, silent): +def query_cmr_by_frame_and_dates_backoff(cslc_query, subs_args, token, cmr, settings, now, timerange): frame_id = int(subs_args.frame_id) - return cslc_query.query_cmr_by_frame_and_dates(frame_id, subs_args, token, cmr, settings, now, timerange, silent) + return cslc_query.query_cmr_by_frame_and_dates(frame_id, subs_args, token, cmr, settings, now, timerange) logging.basicConfig(level="INFO") logger = logging.getLogger(__name__) @@ -63,7 +64,7 @@ def query_cmr_by_frame_and_dates_backoff(cslc_query, subs_args, token, cmr, sett new_sensing_time_list = [] logger.info(f"Updating {frame=}") subs_args.frame_id = frame - all_granules = query_cmr_by_frame_and_dates_backoff(cslc_cmr_query, subs_args, token, cmr, settings, now, timerange, silent=True) + all_granules = query_cmr_by_frame_and_dates_backoff(cslc_cmr_query, subs_args, token, cmr, settings, now, timerange) # Group them by acquisition cycle acq_cycles = defaultdict(set) From cff3a04624bfc5d52244f107c542ad15970c7920 Mon Sep 17 00:00:00 2001 From: Scott Collins Date: Thu, 7 Nov 2024 13:42:29 -0600 Subject: [PATCH 13/17] Logging improvements for the data_subscriber.cslc package --- data_subscriber/cslc/cslc_blackout.py | 20 ++++--- data_subscriber/cslc/cslc_dependency.py | 6 +- data_subscriber/cslc/cslc_query.py | 74 ++++++++++++++----------- 3 files changed, 57 insertions(+), 43 deletions(-) diff --git a/data_subscriber/cslc/cslc_blackout.py b/data_subscriber/cslc/cslc_blackout.py index 335b2f6b4..d9e7b2eaf 100644 --- a/data_subscriber/cslc/cslc_blackout.py +++ b/data_subscriber/cslc/cslc_blackout.py @@ -9,8 +9,11 @@ from commons.logger import get_logger from data_subscriber.cmr import async_query_cmr, CMR_TIME_FORMAT -from data_subscriber.cslc_utils import localize_anc_json, sensing_time_day_index, parse_cslc_native_id, \ - parse_cslc_file_name, download_batch_id_forward_reproc +from data_subscriber.cslc_utils import (localize_anc_json, + sensing_time_day_index, + parse_cslc_native_id, + parse_cslc_file_name, + download_batch_id_forward_reproc) from data_subscriber.url import cslc_unique_id DEFAULT_DISP_BLACKOUT_DATE_NAME = 'opera-disp-s1-blackout-dates.json' @@ -23,8 +26,9 @@ def localize_disp_blackout_dates(): try: file = localize_anc_json("DISP_S1_BLACKOUT_DATES_S3PATH") except: - logger.warning(f"Could not download DISP-S1 blackout dates file from settings.yaml field DISP_S1_BLACKOUT_DATES_S3PATH from S3. " - f"Attempting to use local copy named {DEFAULT_DISP_BLACKOUT_DATE_NAME}.") + logger.warning("Could not download DISP-S1 blackout dates file from settings.yaml " + "field DISP_S1_BLACKOUT_DATES_S3PATH from S3. 
" + "Attempting to use local copy named %s.", DEFAULT_DISP_BLACKOUT_DATE_NAME) file = DEFAULT_DISP_BLACKOUT_DATE_NAME return process_disp_blackout_dates(file) @@ -123,7 +127,8 @@ def _filter_cslc_blackout_polarization(granules, proc_mode, blackout_dates_obj, for granule in granules: burst_id, acquisition_dts = parse_cslc_file_name(granule['granule_id']) if burst_id not in blackout_dates_obj.burst_to_frames.keys() or len(blackout_dates_obj.burst_to_frames[burst_id]) == 0: - logger.info(f"Skipping granule {granule['granule_id']} because {burst_id=} not in the historical database") + logger.info("Skipping granule %s because burst_id=%s not in the historical database", + granule['granule_id'], burst_id) else: relevant_granules.append(granule) @@ -132,7 +137,7 @@ def _filter_cslc_blackout_polarization(granules, proc_mode, blackout_dates_obj, for granule in relevant_granules: if vv_only and "_VV_" not in granule["granule_id"]: - logger.info(f"Skipping granule {granule['granule_id']} because it doesn't have VV polarization") + logger.info(f"Skipping granule %s because it doesn't have VV polarization", granule['granule_id']) continue frame_id = granule["frame_id"] @@ -141,7 +146,8 @@ def _filter_cslc_blackout_polarization(granules, proc_mode, blackout_dates_obj, if is_black_out: blackout_start = dates[0].strftime(CMR_TIME_FORMAT) blackout_end = dates[1].strftime(CMR_TIME_FORMAT) - logger.info(f"Skipping granule {granule['granule_id']} because {frame_id=} falls on a blackout date {blackout_start=} {blackout_end=}") + logger.info(f"Skipping granule %s because frame_id=%s falls on a blackout date blackout_start=%s blackout_end=%s", + granule['granule_id'], frame_id, blackout_start, blackout_end) continue filtered_granules.append(granule) diff --git a/data_subscriber/cslc/cslc_dependency.py b/data_subscriber/cslc/cslc_dependency.py index 5b3ce2373..8926a5f73 100644 --- a/data_subscriber/cslc/cslc_dependency.py +++ b/data_subscriber/cslc/cslc_dependency.py @@ -141,10 +141,8 @@ def determine_k_cycle(self, acquisition_dts: datetime, day_index: int, frame_num return index_number % self.k def compressed_cslc_satisfied(self, frame_id, day_index, eu): + return self.get_dependent_compressed_cslcs(frame_id, day_index, eu) - if self.get_dependent_compressed_cslcs(frame_id, day_index, eu) == False: - return False - return True def get_dependent_compressed_cslcs(self, frame_id, day_index, eu): ''' Search for all previous M compressed CSLCs @@ -156,7 +154,7 @@ def get_dependent_compressed_cslcs(self, frame_id, day_index, eu): ccslcs = [] - #special case for early sensing time series + # special case for early sensing time series m = self.m if len(prev_day_indices) < self.k * (self.m-1): m = (len(prev_day_indices) // self.k ) + 1 diff --git a/data_subscriber/cslc/cslc_query.py b/data_subscriber/cslc/cslc_query.py index a42b64bed..b36b4ee68 100644 --- a/data_subscriber/cslc/cslc_query.py +++ b/data_subscriber/cslc/cslc_query.py @@ -5,15 +5,22 @@ from collections import defaultdict from datetime import datetime, timedelta +from commons.logger import get_logger from data_subscriber.cmr import CMR_TIME_FORMAT -from data_subscriber.cslc_utils import (localize_disp_frame_burst_hist, build_cslc_native_ids, parse_cslc_native_id, - process_disp_frame_burst_hist, download_batch_id_forward_reproc, split_download_batch_id, - parse_cslc_file_name) +from data_subscriber.cslc.cslc_blackout import (DispS1BlackoutDates, + process_disp_blackout_dates, + localize_disp_blackout_dates, + query_cmr_cslc_blackout_polarization) +from 
data_subscriber.cslc.cslc_catalog import KCSLCProductCatalog from data_subscriber.cslc.cslc_dependency import CSLCDependency -from data_subscriber.cslc.cslc_blackout import DispS1BlackoutDates, process_disp_blackout_dates, localize_disp_blackout_dates, query_cmr_cslc_blackout_polarization +from data_subscriber.cslc_utils import (localize_disp_frame_burst_hist, + build_cslc_native_ids, + parse_cslc_native_id, + process_disp_frame_burst_hist, + download_batch_id_forward_reproc, + split_download_batch_id) from data_subscriber.query import CmrQuery, DateTimeRange from data_subscriber.url import cslc_unique_id -from data_subscriber.cslc.cslc_catalog import KCSLCProductCatalog K_MULT_FACTOR = 2 # TODO: This should be a setting in probably settings.yaml. EARLIEST_POSSIBLE_CSLC_DATE = "2016-01-01T00:00:00Z" @@ -47,7 +54,7 @@ def __init__(self, args, token, es_conn, cmr, job_id, settings, disp_frame_burs self.k_batch_ids = defaultdict(set) # We store this within this class object and use it when we catalog all granules self.k_retrieved_granules = [] - self.k_es_conn = KCSLCProductCatalog(logging.getLogger(__name__)) + self.k_es_conn = KCSLCProductCatalog(get_logger()) def validate_args(self): @@ -123,18 +130,18 @@ def determine_download_granules(self, granules): if self.args.k > 1: k_granules = self.retrieve_k_granules(ready_granules, self.args, self.args.k - 1, True) self.catalog_granules(k_granules, datetime.now(), self.k_es_conn) - self.logger.info(f"Length of K-granules: {len(k_granules)=}") + self.logger.info("Length of K-granules: %d", len(k_granules)) for k_g in k_granules: self.download_batch_ids[k_g["download_batch_id"]].add(batch_id) self.k_batch_ids[batch_id].add(k_g["download_batch_id"]) self.k_retrieved_granules.extend(k_granules) # This is used for scenario testing else: - self.logger.info(f"Skipping download for {batch_id} because only {len(download_batch)} of {max_bursts} granules are present") + self.logger.info(f"Skipping download for %s because only %d of %d granules are present", + batch_id, len(download_batch), max_bursts) return reproc_granules # From this point on is forward processing which is the most complex - current_time = datetime.now() # This list is what is ultimately returned by this function @@ -144,8 +151,8 @@ def determine_download_granules(self, granules): self.refresh_index() unsubmitted = self.es_conn.get_unsubmitted_granules() - self.logger.info(f"{len(granules)=}") - self.logger.info(f"{len(unsubmitted)=}") + self.logger.debug("len(granules)=%d", len(granules)) + self.logger.debug("len(unsubmitted)=%d", len(unsubmitted)) # Group all granules by download_batch_id # If the same download_batch_id is in both granules and unsubmitted, we will use the one in granules because it's newer @@ -154,9 +161,9 @@ def determine_download_granules(self, granules): for granule in granules: by_download_batch_id[granule["download_batch_id"]][granule["unique_id"]] = granule - self.logger.info("Received the following cslc granules from CMR") + self.logger.info("Received the following CSLC granules from CMR: ") for batch_id, download_batch in by_download_batch_id.items(): - self.logger.info(f"{batch_id=} {len(download_batch)=}") + self.logger.info(f"batch_id=%s len(download_batch)=%d", batch_id, len(download_batch)) # THIS RULE ALSO NO LONGER APPLIES. 
Without Rule 2 there is no Rule 3 # Rule 3: If granules have been downloaded already but with less than 100% and we have new granules for that batch, download all granules for that batch @@ -175,7 +182,7 @@ def determine_download_granules(self, granules): self.download_batch_ids[batch_id].add(batch_id)''' for granule in unsubmitted: - self.logger.info(f"Merging in unsubmitted granule {granule['unique_id']}: {granule['granule_id']} for triggering consideration") + self.logger.info(f"Merging in unsubmitted granule %s for triggering consideration", granule['unique_id']) download_batch = by_download_batch_id[granule["download_batch_id"]] if granule["unique_id"] not in download_batch: download_batch[granule["unique_id"]] = granule @@ -183,7 +190,7 @@ def determine_download_granules(self, granules): # Print them all here so that it's nicely all in one place self.logger.info("After merging unsubmitted granules with the new ones returned from CMR") for batch_id, download_batch in by_download_batch_id.items(): - self.logger.info(f"{batch_id=} {len(download_batch)=}") + self.logger.info(f"batch_id=%s len(download_batch)=%d", batch_id, len(download_batch)) # Combine unsubmitted and new granules and determine which granules meet the criteria for download # Rule 1: If all granules for a given download_batch_id are present, download all granules for that batch @@ -194,11 +201,12 @@ def determine_download_granules(self, granules): new_downloads = False if len(download_batch) == max_bursts: # Rule 1 - self.logger.info(f"Download all granules for {batch_id} because all {max_bursts} granules are present") + self.logger.info(f"Download all granules for %s because all %d granules are present", batch_id, max_bursts) self.download_batch_ids[batch_id].add(batch_id) # This batch needs to be submitted as part of the download job for sure new_downloads = True else: - self.logger.info(f"Skipping download for {batch_id} because only {len(download_batch)} of {max_bursts} granules are present") + self.logger.info(f"Skipping download for %s because only %d of %d granules are present", + batch_id, len(download_batch), max_bursts) '''As per email from Heresh at ADT on 7-25-2024, we will not use rule 2. We will always only process full-frames Keeping this code around in case we change our mind on that. 
else: @@ -219,7 +227,6 @@ def determine_download_granules(self, granules): #print(batch_id, download_batch)''' if new_downloads: - # And also add these downloads into the return download list for download in download_batch.values(): download_granules.append(download) @@ -231,7 +238,7 @@ def determine_download_granules(self, granules): k_granules = self.retrieve_k_granules(list(download_batch.values()), self.args, self.args.k-1) self.catalog_granules(k_granules, current_time, self.k_es_conn) self.k_retrieved_granules.extend(k_granules) # This is used for scenario testing - self.logger.info(f"Length of K-granules: {len(k_granules)=}") + self.logger.info(f"Length of K-granules: %d", len(k_granules)) #print(f"{granules=}") # All the k batches need to be submitted as part of the download job for this batch @@ -240,12 +247,12 @@ def determine_download_granules(self, granules): self.download_batch_ids[k_g["download_batch_id"]].add(batch_id) self.k_batch_ids[batch_id].add(k_g["download_batch_id"]) - if (len(download_batch) > max_bursts): + if len(download_batch) > max_bursts: self.logger.error(f"{len(download_batch)=} {max_bursts=}") self.logger.error(f"{download_batch=}") raise AssertionError("Something seriously went wrong matching up CSLC input granules!") - self.logger.info(f"{len(download_granules)=}") + self.logger.debug("len(download_granules)=%d", len(download_granules)) return download_granules @@ -285,7 +292,8 @@ def retrieve_k_granules(self, downloads, args, k_minus_one, VV_only = True): if end_date_object < datetime.strptime(EARLIEST_POSSIBLE_CSLC_DATE, CMR_TIME_FORMAT): raise AssertionError(f"We are searching earlier than {EARLIEST_POSSIBLE_CSLC_DATE}. There is no more data here. {end_date_object=}") - self.logger.info(f"Retrieving K-1 granules {start_date=} {end_date=} for {frame_id=}") + self.logger.info("Retrieving K-1 granules start_date=%s end_date=%s for frame_id=%d", + start_date, end_date, frame_id) # Step 1 of 2: This will return dict of acquisition_cycle -> set of granules for only onse that match the burst pattern cslc_dependency = CSLCDependency( @@ -303,7 +311,7 @@ def retrieve_k_granules(self, downloads, args, k_minus_one, VV_only = True): granules = granules_map[acq_day_index] k_granules.extend(granules) k_satified += 1 - self.logger.info(f" {frame_id=} {acq_day_index=} satsifies. {k_satified=} {k_minus_one=}") + self.logger.info(f"{frame_id=} {acq_day_index=} satsifies. {k_satified=} {k_minus_one=}") if k_satified == k_minus_one: break @@ -321,7 +329,7 @@ def query_cmr_by_native_id (self, args, token, cmr, settings, now: datetime, nat burst_id, acquisition_time, _, frame_ids = parse_cslc_native_id(native_id, self.burst_to_frames, self.disp_burst_map_hist) if len(frame_ids) == 0: - self.logger.warning(f"{native_id=} is not found in the DISP-S1 Burst ID Database JSON. Nothing to process") + self.logger.warning("native_id=%s is not found in the DISP-S1 Burst ID Database JSON. Nothing to process", native_id) return [] # TODO: Also check if we should be processing this native_id for this acquisition cycle. 
@@ -331,14 +339,16 @@ def query_cmr_by_native_id (self, args, token, cmr, settings, now: datetime, nat start_date = (acquisition_time - timedelta(minutes=15)).strftime(CMR_TIME_FORMAT) end_date = (acquisition_time + timedelta(minutes=15)).strftime(CMR_TIME_FORMAT) timerange = DateTimeRange(start_date, end_date) - self.logger.info(f"Querying CMR for all CSLC files that belong to the frame {frame_id}, derived from the native_id {native_id}") + self.logger.info("Querying CMR for all CSLC files that belong to the frame %d, derived from the native_id %s", + frame_id, native_id) return self.query_cmr_by_frame_and_dates(frame_id, local_args, token, cmr, settings, now, timerange) def query_cmr_by_frame_and_acq_cycle(self, frame_id: int, acq_cycle: int, args, token, cmr, settings, now: datetime): '''Query CMR for specific date range for a specific frame_id and acquisition cycle. Need to always use temporal queries''' - self.logger.info(f"Querying CMR for all CSLC files that belong to the frame {frame_id} and acquisition cycle {acq_cycle}") + self.logger.info(f"Querying CMR for all CSLC files that belong to the frame %d and acquisition cycle %d", + frame_id, acq_cycle) new_args = copy.deepcopy(args) new_args.use_temporal = True @@ -355,8 +365,8 @@ def query_cmr_by_frame_and_dates(self, frame_id: int, args, token, cmr, setting '''Query CMR for specific date range for a specific frame_id''' if frame_id not in self.disp_burst_map_hist: - raise Exception(f"Frame number {frame_id} not found in the historical database. \ - OPERA does not process this frame for DISP-S1.") + raise Exception(f"Frame number {frame_id} not found in the historical database. " + f"OPERA does not process this frame for DISP-S1.") new_args = copy.deepcopy(args) count, native_id = build_cslc_native_ids(frame_id, self.disp_burst_map_hist) @@ -410,7 +420,7 @@ def query_cmr(self, timerange: DateTimeRange, now: datetime): _, _, acquisition_cycles, _ = parse_cslc_native_id(granule["granule_id"], self.burst_to_frames, self.disp_burst_map_hist) for frame_id, acq_cycle in acquisition_cycles.items(): unique_frames_dates.add(f"{frame_id}-{acq_cycle}") - self.logger.info(f"Added the follwing frame_id-acq_cycle pairs reprocessing mode: {list(unique_frames_dates)}") + self.logger.info(f"Added the follwing frame_id-acq_cycle pairs reprocessing mode: %s", str(list(unique_frames_dates))) all_granules = [] # We could perform two queries so create a unique set of granules. @@ -478,7 +488,7 @@ def get_download_chunks(self, batch_id_to_urls_map): batch_chunk) # We don't actually care about the URLs, we only care about the batch_id if self.proc_mode == "historical": - if (len(chunk_map[frame_id]) != self.args.k): + if len(chunk_map[frame_id]) != self.args.k: self.logger.error([chunk for chunk, data in chunk_map[frame_id]]) err_str = f"Number of download batches {len(chunk_map[frame_id])} for frame {frame_id} does not equal K {self.args.k}." 
raise AssertionError(err_str) @@ -486,6 +496,6 @@ def get_download_chunks(self, batch_id_to_urls_map): return chunk_map.values() def refresh_index(self): - self.logger.info("performing index refresh") + self.logger.info("Performing index refresh") self.es_conn.refresh() - self.logger.info("performed index refresh") + self.logger.info("Performed index refresh") From ae992de0590b9b9da6d45a2c59aa1d0a247ad172 Mon Sep 17 00:00:00 2001 From: Scott Collins Date: Thu, 7 Nov 2024 13:42:45 -0600 Subject: [PATCH 14/17] Logging improvements for the data_subscriber.rtc package --- .../rtc/mgrs_bursts_collection_db_client.py | 11 +-- data_subscriber/rtc/rtc_catalog.py | 6 +- data_subscriber/rtc/rtc_query.py | 72 +++++++++++-------- geo/geo_util.py | 23 +++--- 4 files changed, 63 insertions(+), 49 deletions(-) diff --git a/data_subscriber/rtc/mgrs_bursts_collection_db_client.py b/data_subscriber/rtc/mgrs_bursts_collection_db_client.py index 270cc4690..7b16e02dd 100644 --- a/data_subscriber/rtc/mgrs_bursts_collection_db_client.py +++ b/data_subscriber/rtc/mgrs_bursts_collection_db_client.py @@ -34,14 +34,15 @@ def dicts(t): def cached_load_mgrs_burst_db(filter_land=True): """see :func:`~data_subscriber.rtc.mgrs_bursts_collection_db_client.load_mgrs_burst_db`""" logger = get_logger() - logger.info(f"Cache loading MGRS burst database. {filter_land=}") + logger.info(f"Cache loading MGRS burst database.") + logger.debug(f"{filter_land=}") return load_mgrs_burst_db(filter_land) def load_mgrs_burst_db(filter_land=True): """see :func:`~data_subscriber.rtc.mgrs_bursts_collection_db_client.load_mgrs_burst_db_raw`""" logger = get_logger() - logger.info(f"Loading MGRS burst database. {filter_land=}") + logger.info(f"Initial load of MGRS burst database from disk.") vector_gdf = load_mgrs_burst_db_raw(filter_land) @@ -70,12 +71,14 @@ def load_mgrs_burst_db_raw(filter_land=True) -> GeoDataFrame: mtc_download_filepath = Path(Path(mgrs_tile_collection_db_s3path).name) s3_client.download_file(Bucket=match_s3path.group("bucket_name"), Key=match_s3path.group("object_key"), Filename=str(mtc_download_filepath)) vector_gdf = gpd.read_file(mtc_download_filepath, crs="EPSG:4326") # , bbox=(-230, 0, -10, 90)) # bbox=(-180, -90, 180, 90) # global + # na_gdf = gpd.read_file(Path("geo/north_america_opera.geojson"), crs="EPSG:4326") # vector_gdf = vector_gdf.overlay(na_gdf, how="intersection") - logger.info(f"{len(vector_gdf)=}") + logger.debug(f"pre water/land filter: {len(vector_gdf)=}") + if filter_land: vector_gdf = vector_gdf[vector_gdf["land_ocean_flag"].isin(["water/land", "land"])] # filter out water (water == no relevant data) - logger.info(f"{len(vector_gdf)=}") + logger.debug(f"post water/land filter: {len(vector_gdf)=}") return vector_gdf diff --git a/data_subscriber/rtc/rtc_catalog.py b/data_subscriber/rtc/rtc_catalog.py index fc24ac3e4..4df8a6443 100644 --- a/data_subscriber/rtc/rtc_catalog.py +++ b/data_subscriber/rtc/rtc_catalog.py @@ -55,7 +55,7 @@ def filter_catalog_by_sets(self, mgrs_set_id_acquisition_ts_cycle_indexes): body["query"]["bool"]["must"].append({"match": {"mgrs_set_id": mgrs_set_id_acquisition_ts_cycle_idx.split("$")[0]}}) es_docs = self.es_util.query(body=body, index=self.ES_INDEX_PATTERNS) - self.logger.info(f"Found {len(es_docs)=}") + self.logger.debug("Found %d", len(es_docs)) return self.process_query_result(es_docs) def mark_products_as_download_job_submitted(self, batch_id_to_products_map: dict): @@ -95,7 +95,7 @@ def mark_products_as_download_job_submitted(self, batch_id_to_products_map: dict 
self.logger.info(f"Marking {set(batch_id_to_products_map.keys())} products as download job-submitted, in bulk") elasticsearch.helpers.bulk(self.es_util.es, operations) - self.logger.info("Performing index refresh") + self.logger.debug("Performing index refresh") self.refresh() def raw_create_doc_id_to_index_cache(self, docs): @@ -152,7 +152,7 @@ def mark_products_as_job_submitted(self, batch_id_to_products_map: dict): self.logger.info(f"Marking {set(batch_id_to_products_map.keys())} products as job-submitted, in bulk") elasticsearch.helpers.bulk(self.es_util.es, operations) - self.logger.info("Performing index refresh") + self.logger.debug("Performing index refresh") self.refresh() def create_doc_id_to_index_cache(self, docs): diff --git a/data_subscriber/rtc/rtc_query.py b/data_subscriber/rtc/rtc_query.py index a50eb6251..a73fccdd7 100644 --- a/data_subscriber/rtc/rtc_query.py +++ b/data_subscriber/rtc/rtc_query.py @@ -36,28 +36,30 @@ def __init__(self, args, token, es_conn, cmr, job_id, settings): super().__init__(args, token, es_conn, cmr, job_id, settings) async def run_query(self): - query_dt = datetime.now() now = datetime.utcnow() query_timerange: DateTimeRange = get_query_timerange(self.args, now) - self.logger.info("CMR query STARTED") + self.logger.info("CMR Query STARTED") + granules = await async_query_cmr(self.args, self.token, self.cmr, self.settings, query_timerange, now) - self.logger.info("CMR query FINISHED") + + self.logger.info("CMR Query FINISHED") # If processing mode is historical, apply the include/exclude-region filtering if self.args.proc_mode == "historical": - self.logger.info(f"Processing mode is historical so applying include and exclude regions...") + self.logger.info("Processing mode is historical, so applying include and exclude regions...") # Fetch all necessary geojson files from S3 localize_include_exclude(self.args) granules[:] = filter_granules_by_regions(granules, self.args.include_regions, self.args.exclude_regions) - self.logger.info("catalogue-ing STARTED") + self.logger.info("Granule Cataloguing STARTED") affected_mgrs_set_id_acquisition_ts_cycle_indexes = set() granules[:] = self.filter_granules_rtc(granules) - self.logger.info(f"Filtered to {len(granules)} granules") + + self.logger.info("Filtered to %d granules", len(granules)) mgrs = mbc_client.cached_load_mgrs_burst_db(filter_land=True) if self.args.native_id: @@ -68,7 +70,7 @@ async def run_query(self): num_granules = len(granules) for i, granule in enumerate(granules): - self.logger.debug(f"Processing granule {i+1} of {num_granules}") + self.logger.debug("Processing granule %d of %d", i + 1, num_granules) granule_id = granule.get("granule_id") revision_id = granule.get("revision_id") @@ -80,8 +82,8 @@ async def run_query(self): additional_fields["instrument"] = "S1A" if "S1A" in granule_id else "S1B" match_product_id = re.match(rtc_granule_regex, granule_id) - acquisition_dts = match_product_id.group("acquisition_ts") #e.g. 20210705T183117Z - burst_id = match_product_id.group("burst_id") # e.g. T074-157286-IW3 + acquisition_dts = match_product_id.group("acquisition_ts") # e.g. 20210705T183117Z + burst_id = match_product_id.group("burst_id") # e.g. T074-157286-IW3 # Returns up to two mgrs_set_ids. e.g. 
MS_74_76 mgrs_burst_set_ids = mbc_client.burst_id_to_mgrs_set_ids(mgrs, mbc_client.product_burst_id_to_mapping_burst_id(burst_id)) @@ -106,18 +108,23 @@ async def run_query(self): **additional_fields ) - self.logger.info("catalogue-ing FINISHED") + self.logger.info("Granule Cataloguing FINISHED") succeeded = [] failed = [] - self.logger.info("performing index refresh") + + self.logger.info("Performing index refresh") + self.es_conn.refresh() - self.logger.info("performed index refresh") - self.logger.info("evaluating available burst sets") - self.logger.info(f"{affected_mgrs_set_id_acquisition_ts_cycle_indexes=}") + self.logger.info("Performed index refresh") + + self.logger.info("Evaluating available burst sets") + self.logger.debug("affected_mgrs_set_id_acquisition_ts_cycle_indexes=%s", str(affected_mgrs_set_id_acquisition_ts_cycle_indexes)) + if self.args.native_id: # limit query to the 1 or 2 affected sets in backlog - self.logger.info("Supplied native-id. Limiting evaluation") + self.logger.info("Supplied native-id, limiting evaluation") + min_num_bursts = self.args.coverage_target_num if not min_num_bursts: min_num_bursts = self.settings["DSWX_S1_MINIMUM_NUMBER_OF_BURSTS_REQUIRED"] @@ -135,6 +142,7 @@ async def run_query(self): ) else: # evaluate ALL sets in backlog self.logger.info("Performing full evaluation") + min_num_bursts = self.args.coverage_target_num if not min_num_bursts: min_num_bursts = self.settings["DSWX_S1_MINIMUM_NUMBER_OF_BURSTS_REQUIRED"] @@ -169,25 +177,30 @@ async def run_query(self): # doc needs to be associated with the batch. so filter the other doc that isn't part of this batch if product_doc["mgrs_set_id_acquisition_ts_cycle_index"] == batch_id: batch_id_to_products_map[batch_id][product_doc["id"]].append(product_doc) + if self.args.smoke_run: self.logger.info(f"{self.args.smoke_run=}. Filtering to single batch") batch_id_to_products_map = dict(sorted(batch_id_to_products_map.items())[:1]) - self.logger.info(f"num_batches={len(batch_id_to_products_map)}") + + self.logger.debug(f"num_batches=%d", len(batch_id_to_products_map)) if self.args.subparser_name == "full": - self.logger.info(f"{self.args.subparser_name=}. Skipping download job submission. Download will be performed directly.") + self.logger.info("Skipping download job submission. Download will be performed directly.") self.args.provider = COLLECTION_TO_PROVIDER_TYPE_MAP[self.args.collection] self.args.batch_ids = affected_mgrs_set_id_acquisition_ts_cycle_indexes - return + return {"success": succeeded, "fail": failed} + if self.args.no_schedule_download: - self.logger.info(f"{self.args.no_schedule_download=}. Forcefully skipping download job submission.") - return + self.logger.info("Forcefully skipping download job submission.") + return {"success": succeeded, "fail": failed} + if not self.args.chunk_size: - self.logger.info(f"{self.args.chunk_size=}. Insufficient chunk size. Skipping download job submission.") - return + self.logger.info(f"Insufficient chunk size (%s). 
Skipping download job submission.", str(self.args.chunk_size)) + return {"success": succeeded, "fail": failed} results = [] self.logger.info(f"Submitting batches for RTC download job: {list(batch_id_to_products_map)}") + for batch_id, products_map in batch_id_to_products_map.items(): job_submission_tasks = submit_rtc_download_job_submissions_tasks({batch_id: products_map}, self.args, self.settings) results_batch = await asyncio.gather(*job_submission_tasks, return_exceptions=True) @@ -195,6 +208,7 @@ async def run_query(self): suceeded_batch = [job_id for _, job_id in results_batch if isinstance(job_id, str)] failed_batch = [e for _, e in results_batch if isinstance(e, Exception)] + if suceeded_batch: for product_id, products in batch_id_to_products_map[batch_id].items(): for product in products: @@ -202,7 +216,7 @@ async def run_query(self): product["download_job_ids"] = first(suceeded_batch) if self.args.dry_run: - self.logger.info(f"{self.args.dry_run=}. Skipping marking jobs as downloaded. Producing mock job ID") + self.logger.info("dry_run=%s, Skipping marking jobs as downloaded. Producing mock job ID.", str(self.args.dry_run)) pass else: self.es_conn.mark_products_as_download_job_submitted({batch_id: batch_id_to_products_map[batch_id]}) @@ -210,13 +224,9 @@ async def run_query(self): succeeded.extend(suceeded_batch) failed.extend(failed_batch) - self.logger.info(f"{len(results)=}") - self.logger.info(f"{results=}") - - self.logger.info(f"{len(succeeded)=}") - self.logger.info(f"{succeeded=}") - self.logger.info(f"{len(failed)=}") - self.logger.info(f"{failed=}") + self.logger.debug(f"{results=}") + self.logger.debug(f"{succeeded=}") + self.logger.debug(f"{failed=}") return { "success": succeeded, @@ -237,7 +247,7 @@ def filter_granules_rtc(self, granules): mgrs_sets = mbc_client.burst_id_to_mgrs_set_ids(mgrs, mbc_client.product_burst_id_to_mapping_burst_id(burst_id)) if not mgrs_sets: - self.logger.debug(f"{burst_id=} not associated with land or land/water data. skipping.") + self.logger.debug("burst_id=%s not associated with land or land/water data. Skipping.", burst_id) continue filtered_granules.append(granule) diff --git a/geo/geo_util.py b/geo/geo_util.py index 22c3058e4..0d251ea8e 100644 --- a/geo/geo_util.py +++ b/geo/geo_util.py @@ -1,12 +1,11 @@ + import json -import logging from functools import cache -from pathlib import Path from typing import TypedDict from osgeo import ogr -logger = logging.getLogger(__name__) +from commons.logger import get_logger _NORTH_AMERICA = "north_america_opera" @@ -48,10 +47,11 @@ def does_bbox_intersect_region(bbox: list[Coordinate], region) -> bool: :param bbox: a list of coordinate dicts. `bbox["lat"]` refers to the latitude component of the coordinate. `bbox["lon"]` refers to the longitudinal component of the coordinate. - region: string name of the geojson file without the extension - :return: True if the given coordinates intersect with North America (OPERA). Otherwise False. + :param region: string name of the geojson file without the extension + :return: True if the given coordinates intersect with North America (OPERA). Otherwise, False. 
""" - logger.info(f"{bbox=}") + logger = get_logger() + logger.debug(f"{bbox=}") bbox_ring = ogr.Geometry(ogr.wkbLinearRing) for coordinate in bbox: @@ -62,19 +62,20 @@ def does_bbox_intersect_region(bbox: list[Coordinate], region) -> bool: na_geom = _load_region_opera_geometry_collection(region) is_bbox_in_region = na_geom.Intersects(bbox_poly) - logger.info(f"{is_bbox_in_region=}") + logger.debug(f"{is_bbox_in_region=}") return is_bbox_in_region @cache def _load_region_opera_geometry_collection(region) -> ogr.Geometry: + logger = get_logger() region_opera_geojson = _cached_load_region_opera_geojson(region) na_geoms = ogr.Geometry(ogr.wkbGeometryCollection) for feature in region_opera_geojson["features"]: na_geoms.AddGeometry(ogr.CreateGeometryFromJson(json.dumps(feature["geometry"]))) - logger.info("Loaded geojson as osgeo GeometryCollection") + logger.info("Loaded region %s.geojson as osgeo GeometryCollection", region) return na_geoms @@ -82,8 +83,8 @@ def _load_region_opera_geometry_collection(region) -> ogr.Geometry: def _cached_load_region_opera_geojson(region) -> dict: """Loads a RFC7946 GeoJSON file.""" geojson = region + '.geojson' - fp = open(geojson) - geojson_obj: dict = json.load(fp) - logger.info("Loaded " + geojson) + with open(geojson, 'r') as fp: + geojson_obj: dict = json.load(fp) + return geojson_obj From eead8462875accf6c964f4f461a8c4ba2cfcdc23 Mon Sep 17 00:00:00 2001 From: Scott Collins Date: Wed, 13 Nov 2024 09:16:03 -0600 Subject: [PATCH 15/17] Restored original implementation of cslc_dependency.compressed_cslc_satisfied() --- data_subscriber/cslc/cslc_dependency.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/data_subscriber/cslc/cslc_dependency.py b/data_subscriber/cslc/cslc_dependency.py index 8926a5f73..898edb2ea 100644 --- a/data_subscriber/cslc/cslc_dependency.py +++ b/data_subscriber/cslc/cslc_dependency.py @@ -141,7 +141,9 @@ def determine_k_cycle(self, acquisition_dts: datetime, day_index: int, frame_num return index_number % self.k def compressed_cslc_satisfied(self, frame_id, day_index, eu): - return self.get_dependent_compressed_cslcs(frame_id, day_index, eu) + if self.get_dependent_compressed_cslcs(frame_id, day_index, eu) == False: + return False + return True def get_dependent_compressed_cslcs(self, frame_id, day_index, eu): From 4e448e10850b96c4a8ab13d29f1a99f1362194ce Mon Sep 17 00:00:00 2001 From: Scott Collins Date: Wed, 13 Nov 2024 09:16:42 -0600 Subject: [PATCH 16/17] Relocated process_frame_burst_db() to data_subscriber/geojson_utils.py --- data_subscriber/geojson_utils.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/data_subscriber/geojson_utils.py b/data_subscriber/geojson_utils.py index 41c329b06..b6291c417 100644 --- a/data_subscriber/geojson_utils.py +++ b/data_subscriber/geojson_utils.py @@ -85,3 +85,16 @@ def download_from_s3(bucket, file, path): s3.Object(bucket, file).download_file(path) except Exception as e: raise Exception("Exception while fetching file: %s from S3 bucket %s." % (file, bucket) + str(e)) + + +def process_frame_burst_db(geojsons : list[str]): + settings = SettingsConf().cfg + bucket = settings["GEOJSON_BUCKET"] + + try: + for geojson in geojsons: + key = geojson.strip() + ".geojson" + # output_filepath = os.path.join(working_dir, key) + download_from_s3(bucket, key, key) + except Exception as e: + raise Exception("Exception while fetching geojson file: %s. 
" % key + str(e)) From 555b2774b91099894e15890d0cd47ac7dd2d03ff Mon Sep 17 00:00:00 2001 From: Scott Collins Date: Wed, 13 Nov 2024 09:18:37 -0600 Subject: [PATCH 17/17] Added a verbose flag to cmr.async_query_cmr() to control output of granule information from a query --- data_subscriber/cmr.py | 31 +++++++++++++------------ data_subscriber/cslc/cslc_blackout.py | 4 ++-- data_subscriber/cslc/cslc_dependency.py | 10 ++++---- data_subscriber/cslc/cslc_query.py | 19 +++++++-------- data_subscriber/survey.py | 4 ++-- tools/disp_s1_burst_db_tool.py | 6 ++--- tools/update_disp_s1_burst_db.py | 6 ++--- 7 files changed, 40 insertions(+), 40 deletions(-) diff --git a/data_subscriber/cmr.py b/data_subscriber/cmr.py index 21dc07b77..852cd75a6 100644 --- a/data_subscriber/cmr.py +++ b/data_subscriber/cmr.py @@ -108,7 +108,7 @@ def get_cmr_token(endpoint, settings): return cmr, token, username, password, edl -async def async_query_cmr(args, token, cmr, settings, timerange, now: datetime) -> list: +async def async_query_cmr(args, token, cmr, settings, timerange, now: datetime, verbose=True) -> list: logger = get_logger() request_url = f"https://{cmr}/search/granules.umm_json" bounding_box = args.bbox @@ -192,23 +192,24 @@ async def async_query_cmr(args, token, cmr, settings, timerange, now: datetime) # Just here incase code moves around and we want a reasonable default products_per_line = 1000 - if search_results_count > 0: - # Print out all the query results but limit the number of characters per line - one_logout = f'{(product_granules[0]["granule_id"], "revision " + str(product_granules[0]["revision_id"]))}' - chars_per_line = len(one_logout) + 6 # 6 is a fudge factor - products_per_line = MAX_CHARS_PER_LINE // chars_per_line + if verbose: + if search_results_count > 0: + # Print out all the query results but limit the number of characters per line + one_logout = f'{(product_granules[0]["granule_id"], "revision " + str(product_granules[0]["revision_id"]))}' + chars_per_line = len(one_logout) + 6 # 6 is a fudge factor + products_per_line = MAX_CHARS_PER_LINE // chars_per_line - for i in range(0, search_results_count, products_per_line): - end_range = i + products_per_line - if end_range > search_results_count: - end_range = search_results_count + for i in range(0, search_results_count, products_per_line): + end_range = i + products_per_line + if end_range > search_results_count: + end_range = search_results_count - logger.info('QUERY RESULTS %d to %d of %d: ', i + 1, end_range, search_results_count) + logger.info('QUERY RESULTS %d to %d of %d: ', i + 1, end_range, search_results_count) - for granule in product_granules[i:end_range]: - logger.info( - f'{(granule["granule_id"], "revision " + str(granule["revision_id"]))}' - ) + for granule in product_granules[i:end_range]: + logger.info( + f'{(granule["granule_id"], "revision " + str(granule["revision_id"]))}' + ) # Filter out granules with revision-id greater than max allowed least_revised_granules = [] diff --git a/data_subscriber/cslc/cslc_blackout.py b/data_subscriber/cslc/cslc_blackout.py index d9e7b2eaf..ad1398501 100644 --- a/data_subscriber/cslc/cslc_blackout.py +++ b/data_subscriber/cslc/cslc_blackout.py @@ -154,9 +154,9 @@ def _filter_cslc_blackout_polarization(granules, proc_mode, blackout_dates_obj, return filtered_granules -def query_cmr_cslc_blackout_polarization(args, token, cmr, settings, query_timerange, now, blackout_dates_obj, +def query_cmr_cslc_blackout_polarization(args, token, cmr, settings, query_timerange, now, verbose, 
blackout_dates_obj, no_duplicate, force_frame_id, vv_only = True): '''Query CMR for CSLC granules and filter for blackout dates and polarization''' - granules = asyncio.run(async_query_cmr(args, token, cmr, settings, query_timerange, now)) + granules = asyncio.run(async_query_cmr(args, token, cmr, settings, query_timerange, now, verbose)) return _filter_cslc_blackout_polarization(granules, args.proc_mode, blackout_dates_obj, no_duplicate, force_frame_id, vv_only) \ No newline at end of file diff --git a/data_subscriber/cslc/cslc_dependency.py b/data_subscriber/cslc/cslc_dependency.py index 898edb2ea..45bcb5021 100644 --- a/data_subscriber/cslc/cslc_dependency.py +++ b/data_subscriber/cslc/cslc_dependency.py @@ -44,12 +44,12 @@ def get_prev_day_indices(self, day_index: int, frame_number: int): days_delta = day_index - frame.sensing_datetime_days_index[-1] end_date = start_date + timedelta(days=days_delta - 1) # We don't want the current day index in this query_timerange = DateTimeRange(start_date.strftime(CMR_TIME_FORMAT), end_date.strftime(CMR_TIME_FORMAT)) - acq_index_to_bursts, _ = self.get_k_granules_from_cmr(query_timerange, frame_number) + acq_index_to_bursts, _ = self.get_k_granules_from_cmr(query_timerange, frame_number, verbose = False) all_prev_indices = frame.sensing_datetime_days_index + sorted(list(acq_index_to_bursts.keys())) self.logger.debug(f"All previous day indices: {all_prev_indices}") return all_prev_indices - def get_k_granules_from_cmr(self, query_timerange, frame_number: int): + def get_k_granules_from_cmr(self, query_timerange, frame_number: int, verbose = True): '''Return two dictionaries that satisfy the burst pattern for the frame_number within the time range: 1. acq_index_to_bursts: day index to set of burst ids 2. acq_index_to_granules: day index to list of granules that match the burst @@ -62,7 +62,7 @@ def get_k_granules_from_cmr(self, query_timerange, frame_number: int): args.use_temporal = True granules = query_cmr_cslc_blackout_polarization( - args, self.token, self.cmr, self.settings, query_timerange, datetime.utcnow(), self.blackout_dates_obj, True, frame_number, self.VV_only) + args, self.token, self.cmr, self.settings, query_timerange, datetime.utcnow(), verbose, self.blackout_dates_obj, True, frame_number, self.VV_only) return self.k_granules_grouping(frame_number, granules) @@ -102,7 +102,7 @@ def k_granules_grouping(self, frame_number, granules: list): return acq_index_to_bursts, acq_index_to_granules - def determine_k_cycle(self, acquisition_dts: datetime, day_index: int, frame_number: int): + def determine_k_cycle(self, acquisition_dts: datetime, day_index: int, frame_number: int, verbose = True): '''Return where in the k-cycle this acquisition falls for the frame_number Must specify either acquisition_dts or day_index. 
Returns integer between 0 and k-1 where 0 means that it's at the start of the cycle @@ -132,7 +132,7 @@ def determine_k_cycle(self, acquisition_dts: datetime, day_index: int, frame_num end_date = acquisition_dts query_timerange = DateTimeRange(start_date.strftime(CMR_TIME_FORMAT), end_date.strftime(CMR_TIME_FORMAT)) - acq_index_to_bursts, _ = self.get_k_granules_from_cmr(query_timerange, frame_number) + acq_index_to_bursts, _ = self.get_k_granules_from_cmr(query_timerange, frame_number, verbose) # The k-index is then the complete index number (historical + post historical) mod k self.logger.info(f"{len(acq_index_to_bursts.keys())} day indices since historical that match the burst pattern: {acq_index_to_bursts.keys()}") diff --git a/data_subscriber/cslc/cslc_query.py b/data_subscriber/cslc/cslc_query.py index b36b4ee68..824997b0d 100644 --- a/data_subscriber/cslc/cslc_query.py +++ b/data_subscriber/cslc/cslc_query.py @@ -1,7 +1,6 @@ #!/usr/bin/env python3 import copy -import logging from collections import defaultdict from datetime import datetime, timedelta @@ -128,7 +127,7 @@ def determine_download_granules(self, granules): reproc_granules.extend(ready_granules) if self.args.k > 1: - k_granules = self.retrieve_k_granules(ready_granules, self.args, self.args.k - 1, True) + k_granules = self.retrieve_k_granules(ready_granules, self.args, self.args.k - 1, True, verbose = False) self.catalog_granules(k_granules, datetime.now(), self.k_es_conn) self.logger.info("Length of K-granules: %d", len(k_granules)) for k_g in k_granules: @@ -256,7 +255,7 @@ def determine_download_granules(self, granules): return download_granules - def retrieve_k_granules(self, downloads, args, k_minus_one, VV_only = True): + def retrieve_k_granules(self, downloads, args, k_minus_one, VV_only = True, verbose = True): '''# Go back as many 12-day windows as needed to find k- granules that have at least the same bursts as the current frame Return all the granules that satisfy that''' k_granules = [] @@ -298,7 +297,7 @@ def retrieve_k_granules(self, downloads, args, k_minus_one, VV_only = True): # Step 1 of 2: This will return dict of acquisition_cycle -> set of granules for only onse that match the burst pattern cslc_dependency = CSLCDependency( args.k, args.m, self.disp_burst_map_hist, args, self.token, self.cmr, self.settings, self.blackout_dates_obj, VV_only) - _, granules_map = cslc_dependency.get_k_granules_from_cmr(query_timerange, frame_id) + _, granules_map = cslc_dependency.get_k_granules_from_cmr(query_timerange, frame_id, verbose=verbose) # Step 2 of 2 ...Sort that by acquisition_cycle in decreasing order and then pick the first k-1 frames acq_day_indices = sorted(granules_map.keys(), reverse=True) @@ -344,7 +343,7 @@ def query_cmr_by_native_id (self, args, token, cmr, settings, now: datetime, nat return self.query_cmr_by_frame_and_dates(frame_id, local_args, token, cmr, settings, now, timerange) - def query_cmr_by_frame_and_acq_cycle(self, frame_id: int, acq_cycle: int, args, token, cmr, settings, now: datetime): + def query_cmr_by_frame_and_acq_cycle(self, frame_id: int, acq_cycle: int, args, token, cmr, settings, now: datetime, verbose = True): '''Query CMR for specific date range for a specific frame_id and acquisition cycle. 
Need to always use temporal queries''' self.logger.info(f"Querying CMR for all CSLC files that belong to the frame %d and acquisition cycle %d", @@ -359,9 +358,9 @@ def query_cmr_by_frame_and_acq_cycle(self, frame_id: int, acq_cycle: int, args, end_date = (sensing_datetime + timedelta(minutes=15)).strftime(CMR_TIME_FORMAT) timerange = DateTimeRange(start_date, end_date) - return self.query_cmr_by_frame_and_dates(frame_id, new_args, token, cmr, settings, now, timerange) + return self.query_cmr_by_frame_and_dates(frame_id, new_args, token, cmr, settings, now, timerange, verbose) - def query_cmr_by_frame_and_dates(self, frame_id: int, args, token, cmr, settings, now: datetime, timerange: DateTimeRange): + def query_cmr_by_frame_and_dates(self, frame_id: int, args, token, cmr, settings, now: datetime, timerange: DateTimeRange, verbose = True): '''Query CMR for specific date range for a specific frame_id''' if frame_id not in self.disp_burst_map_hist: @@ -373,7 +372,7 @@ def query_cmr_by_frame_and_dates(self, frame_id: int, args, token, cmr, setting if count == 0: return [] new_args.native_id = native_id - new_granules = query_cmr_cslc_blackout_polarization(new_args, token, cmr, settings, timerange, now, self.blackout_dates_obj, no_duplicate=True, force_frame_id=frame_id) + new_granules = query_cmr_cslc_blackout_polarization(new_args, token, cmr, settings, timerange, now, verbose, self.blackout_dates_obj, no_duplicate=True, force_frame_id=frame_id) return new_granules @@ -415,7 +414,7 @@ def query_cmr(self, timerange: DateTimeRange, now: datetime): unique_frames_dates.add(f"{frame_id}-{acq_cycle}") else: - granules = query_cmr_cslc_blackout_polarization(self.args, self.token, self.cmr, self.settings, timerange, now, False, self.blackout_dates_obj, False, None) + granules = query_cmr_cslc_blackout_polarization(self.args, self.token, self.cmr, self.settings, timerange, now, True, self.blackout_dates_obj, False, None) for granule in granules: _, _, acquisition_cycles, _ = parse_cslc_native_id(granule["granule_id"], self.burst_to_frames, self.disp_burst_map_hist) for frame_id, acq_cycle in acquisition_cycles.items(): @@ -437,7 +436,7 @@ def query_cmr(self, timerange: DateTimeRange, now: datetime): frame_id = int(self.args.frame_id) all_granules = self.query_cmr_by_frame_and_dates(frame_id, self.args, self.token, self.cmr, self.settings, now, timerange) else: - all_granules = query_cmr_cslc_blackout_polarization(self.args, self.token, self.cmr, self.settings, timerange, now, self.blackout_dates_obj, False, None) + all_granules = query_cmr_cslc_blackout_polarization(self.args, self.token, self.cmr, self.settings, timerange, now, True, self.blackout_dates_obj, False, None) return all_granules diff --git a/data_subscriber/survey.py b/data_subscriber/survey.py index d4c87ab03..56d6047f6 100644 --- a/data_subscriber/survey.py +++ b/data_subscriber/survey.py @@ -13,7 +13,7 @@ @backoff.on_exception(backoff.expo, Exception, max_value=13) -def _query_cmr_backoff(args, token, cmr, settings, query_timerange, now, disp_burst_map = None): +def _query_cmr_backoff(args, token, cmr, settings, query_timerange, now, disp_burst_map = None, verbose=False): # If disp_burst_map is not None, that means we are only querying for a specific frame_id # Restrict CMR query by the burst pattern that make up the DISP-S1 frame @@ -22,7 +22,7 @@ def _query_cmr_backoff(args, token, cmr, settings, query_timerange, now, disp_bu args.native_id = native_id print(args.native_id) - result = asyncio.run(async_query_cmr(args, token, cmr, 
settings, query_timerange, now)) + result = asyncio.run(async_query_cmr(args, token, cmr, settings, query_timerange, now, verbose)) return result diff --git a/tools/disp_s1_burst_db_tool.py b/tools/disp_s1_burst_db_tool.py index a6078634e..141ae5863 100755 --- a/tools/disp_s1_burst_db_tool.py +++ b/tools/disp_s1_burst_db_tool.py @@ -80,7 +80,7 @@ blackout_dates_obj = DispS1BlackoutDates(localize_disp_blackout_dates(), disp_burst_map, burst_to_frames) blackout_dates_file = None -def get_k_cycle(acquisition_dts, frame_id, disp_burst_map, k): +def get_k_cycle(acquisition_dts, frame_id, disp_burst_map, k, verbose): subs_args = create_parser().parse_args(["query", "-c", "OPERA_L2_CSLC-S1_V1", "--processing-mode=forward"]) @@ -88,7 +88,7 @@ def get_k_cycle(acquisition_dts, frame_id, disp_burst_map, k): cmr, token, username, password, edl = get_cmr_token(subs_args.endpoint, settings) cslc_dependency = CSLCDependency(k, None, disp_burst_map, subs_args, token, cmr, settings, blackout_dates_obj) # we don't care about m here - k_cycle: int = cslc_dependency.determine_k_cycle(acquisition_dts, None, frame_id) + k_cycle: int = cslc_dependency.determine_k_cycle(acquisition_dts, None, frame_id, verbose) return k_cycle @@ -207,7 +207,7 @@ def validate_frame(frame_id, all_granules = None, detect_unexpected_cycles = Fal if args.k: k = int(args.k) - k_cycle = get_k_cycle(acquisition_dts, frame_ids[0], disp_burst_map, k) + k_cycle = get_k_cycle(acquisition_dts, frame_ids[0], disp_burst_map, k, args.verbose) if (k_cycle >= 0): print(f"K-cycle: {k_cycle} out of {k}") else: diff --git a/tools/update_disp_s1_burst_db.py b/tools/update_disp_s1_burst_db.py index e418ec996..f1597571b 100644 --- a/tools/update_disp_s1_burst_db.py +++ b/tools/update_disp_s1_burst_db.py @@ -19,9 +19,9 @@ Writes out the new file with .mod added to the end of the file name''' @backoff.on_exception(backoff.expo, Exception, max_tries=15) -def query_cmr_by_frame_and_dates_backoff(cslc_query, subs_args, token, cmr, settings, now, timerange): +def query_cmr_by_frame_and_dates_backoff(cslc_query, subs_args, token, cmr, settings, now, timerange, verbose=True): frame_id = int(subs_args.frame_id) - return cslc_query.query_cmr_by_frame_and_dates(frame_id, subs_args, token, cmr, settings, now, timerange) + return cslc_query.query_cmr_by_frame_and_dates(frame_id, subs_args, token, cmr, settings, now, timerange, verbose) logging.basicConfig(level="INFO") logger = logging.getLogger(__name__) @@ -64,7 +64,7 @@ def query_cmr_by_frame_and_dates_backoff(cslc_query, subs_args, token, cmr, sett new_sensing_time_list = [] logger.info(f"Updating {frame=}") subs_args.frame_id = frame - all_granules = query_cmr_by_frame_and_dates_backoff(cslc_cmr_query, subs_args, token, cmr, settings, now, timerange) + all_granules = query_cmr_by_frame_and_dates_backoff(cslc_cmr_query, subs_args, token, cmr, settings, now, timerange, verbose=False) # Group them by acquisition cycle acq_cycles = defaultdict(set)
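
For reference, a minimal, self-contained sketch of the verbose-gated, chunked "QUERY RESULTS" logging that this final patch adds around the CMR query results. It is illustrative only: the standalone function name log_query_results, the MAX_CHARS_PER_LINE value of 250, and the sample granule list are assumptions and not part of the repository; the chunking arithmetic mirrors the diff above.

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

MAX_CHARS_PER_LINE = 250  # assumed budget; the real constant lives in data_subscriber/cmr.py

def log_query_results(product_granules, verbose=True):
    """Chunked 'QUERY RESULTS' logging, emitted only when verbose is True."""
    search_results_count = len(product_granules)
    if not verbose or search_results_count == 0:
        return

    # Size one log line from a sample entry, then emit the results in chunks
    one_logout = f'{(product_granules[0]["granule_id"], "revision " + str(product_granules[0]["revision_id"]))}'
    chars_per_line = len(one_logout) + 6  # fudge factor, as in the patch
    products_per_line = max(1, MAX_CHARS_PER_LINE // chars_per_line)

    for i in range(0, search_results_count, products_per_line):
        end_range = min(i + products_per_line, search_results_count)
        logger.info('QUERY RESULTS %d to %d of %d: ', i + 1, end_range, search_results_count)
        for granule in product_granules[i:end_range]:
            logger.info(f'{(granule["granule_id"], "revision " + str(granule["revision_id"]))}')

# Interactive queries keep the full listing; bulk callers stay quiet
granules = [{"granule_id": "OPERA_L2_CSLC-S1_EXAMPLE-GRANULE", "revision_id": 1}]
log_query_results(granules, verbose=True)
log_query_results(granules, verbose=False)

Bulk callers such as the survey path and update_disp_s1_burst_db.py pass verbose=False so repeated per-frame queries do not flood the logs, while one-off interactive queries keep the full granule listing.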
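
The logging changes in the cslc and rtc patches above follow a single pattern: replace eager f-string interpolation with lazy %-style arguments, and demote purely diagnostic counters from INFO to DEBUG. A minimal before/after sketch of that pattern, using hypothetical values for the batch id and burst counts:

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Hypothetical values standing in for a real download batch
batch_id, present, required = "example_batch_id", 18, 27

# Before: the f-string is rendered even when the record would be filtered out
logger.info(f"Skipping download for {batch_id} because only {present} of {required} granules are present")

# After: arguments are passed lazily and formatted only when the record is emitted;
# purely diagnostic counters move down to DEBUG
logger.info("Skipping download for %s because only %d of %d granules are present",
            batch_id, present, required)
logger.debug("len(download_granules)=%d", present)

Deferring the formatting means the message string is only built when the level is actually enabled, which matters most for the per-granule messages emitted inside the CMR result loops.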