From 56fe59e8a2ea5a9e9326ef785ca0f4dd06a88963 Mon Sep 17 00:00:00 2001
From: Greg Lindahl
Date: Sat, 31 Aug 2024 18:45:26 +0000
Subject: [PATCH 1/3] fixed intervals

---
 cdx_toolkit/myrequests.py | 34 +++++++++++++++++++++-------------
 1 file changed, 21 insertions(+), 13 deletions(-)

diff --git a/cdx_toolkit/myrequests.py b/cdx_toolkit/myrequests.py
index 9127fbc..ad70cc0 100644
--- a/cdx_toolkit/myrequests.py
+++ b/cdx_toolkit/myrequests.py
@@ -7,13 +7,15 @@
 
 LOGGER = logging.getLogger(__name__)
 
-
 previously_seen_hostnames = {
     'commoncrawl.s3.amazonaws.com',
     'data.commoncrawl.org',
     'web.archive.org',
 }
 
+next_fetch = time.time()
+minimum_interval = 3.0  # seconds
+
 
 def dns_fatal(url):
     '''We have a dns error, should we fail immediately or not?'''
@@ -23,6 +25,13 @@ def dns_fatal(url):
 
 
 def myrequests_get(url, params=None, headers=None, cdx=False, allow404=False):
+    t = time.time()
+    global next_fetch
+    if t < next_fetch:
+        time.sleep(next_fetch - t)
+    # next_fetch is also updated at the bottom
+    next_fetch = next_fetch + minimum_interval
+
     if params:
         if 'from_ts' in params:
             params['from'] = params['from_ts']
@@ -38,8 +47,8 @@ def myrequests_get(url, params=None, headers=None, cdx=False, allow404=False):
         headers['User-Agent'] = 'pypi_cdx_toolkit/'+__version__
 
     retry = True
-    retry_sec = 1
-    retry_max_sec = 30
+    retry_sec = 2 * minimum_interval
+    retry_max_sec = 60
     retries = 0
     connect_errors = 0
     while retry:
@@ -62,14 +71,10 @@ def myrequests_get(url, params=None, headers=None, cdx=False, allow404=False):
                 # I have never seen IA or CC send 429 or 509, but just in case...
                 # 429 is also a slow down, IA started sending them mid-2023
                 retries += 1
-                if retries > 5:
-                    LOGGER.warning('retrying after 1s for %d', resp.status_code)
-                    if resp.text:
-                        LOGGER.warning('response body is %s', resp.text)
-                else:
-                    LOGGER.info('retrying after 1s for %d', resp.status_code)
-                    if resp.text:
-                        LOGGER.info('response body is %s', resp.text)
+                level = 30 if retries > 5 else 20  # 30=warning 20=info
+                LOGGER.log(level, 'retrying after %.2fs for %d', retry_sec, resp.status_code)
+                if resp.text:
+                    LOGGER.log(level, 'response body is %s', resp.text)
                 time.sleep(retry_sec)
                 retry_sec = min(retry_sec*2, retry_max_sec)
                 continue
@@ -93,8 +98,8 @@ def myrequests_get(url, params=None, headers=None, cdx=False, allow404=False):
                 raise ValueError(string)
             if connect_errors > 10:
                 LOGGER.warning(string)
-            LOGGER.info('retrying after 1s for '+str(e))
-            time.sleep(retry_sec)
+            LOGGER.info('retrying after {:.2f}s for '.format(retry_max_sec)+str(e))
+            time.sleep(retry_max_sec)  # notice the extra-long sleep
             retry_sec = min(retry_sec*2, retry_max_sec)
         except requests.exceptions.RequestException as e:  # pragma: no cover
             LOGGER.warning('something unexpected happened, giving up after %s', str(e))
@@ -104,4 +109,7 @@ def myrequests_get(url, params=None, headers=None, cdx=False, allow404=False):
     if hostname not in previously_seen_hostnames:
         previously_seen_hostnames.add(hostname)
 
+    # in case we had a lot of retries, etc
+    next_fetch = time.time() + minimum_interval
+
     return resp

From 938faaec1ad4e3ddfaf2a493c7c5cea3ddbb0109 Mon Sep 17 00:00:00 2001
From: Greg Lindahl
Date: Sun, 1 Sep 2024 18:14:59 +0000
Subject: [PATCH 2/3] tweak retries

---
 cdx_toolkit/myrequests.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/cdx_toolkit/myrequests.py b/cdx_toolkit/myrequests.py
index ad70cc0..b0b4561 100644
--- a/cdx_toolkit/myrequests.py
+++ b/cdx_toolkit/myrequests.py
@@ -13,10 +13,6 @@
     'web.archive.org',
 }
 
-next_fetch = time.time()
-minimum_interval = 3.0  # seconds
-
-
 def dns_fatal(url):
     '''We have a dns error, should we fail immediately or not?'''
     hostname = urlparse(url).hostname
@@ -23,7 +19,11 @@ def dns_fatal(url):
     if hostname not in previously_seen_hostnames:
         return True
 
 
+next_fetch = time.time()
+minimum_interval = 3.0  # seconds
+
+
 def myrequests_get(url, params=None, headers=None, cdx=False, allow404=False):
     t = time.time()
     global next_fetch

From bd7ccc188ff3c037602c1fcab3300c743f51dcca Mon Sep 17 00:00:00 2001
From: Greg Lindahl
Date: Sun, 1 Sep 2024 20:25:06 +0000
Subject: [PATCH 3/3] new retry system based on rate limits

---
 cdx_toolkit/myrequests.py | 52 +++++++++++++++++++++++++++++++++------
 1 file changed, 44 insertions(+), 8 deletions(-)

diff --git a/cdx_toolkit/myrequests.py b/cdx_toolkit/myrequests.py
index b0b4561..94fa726 100644
--- a/cdx_toolkit/myrequests.py
+++ b/cdx_toolkit/myrequests.py
@@ -13,24 +13,60 @@
     'web.archive.org',
 }
 
-def dns_fatal(url):
+
+def dns_fatal(hostname):
     '''We have a dns error, should we fail immediately or not?'''
-    hostname = urlparse(url).hostname
     if hostname not in previously_seen_hostnames:
         return True
 
 
-next_fetch = time.time()
-minimum_interval = 3.0  # seconds
+retry_info = {
+    'default': {
+        'next_fetch': 0,
+        'minimum_interval': 3.0,
+    },
+    'index.commoncrawl.org': {
+        'next_fetch': 0,
+        'minimum_interval': 3.0,
+    },
+    'data.commoncrawl.org': {
+        'next_fetch': 0,
+        'minimum_interval': 3.0,
+    },
+    'web.archive.org': {
+        'next_fetch': 0,
+        'minimum_interval': 6.0,
+    },
+}
+
+
+def get_retries(hostname):
+    if hostname not in retry_info:
+        retry_info[hostname] = retry_info['default'].copy()
+        LOGGER.debug('initializing retry info for new host '+hostname)
+    entry = retry_info[hostname]
+    if not entry['next_fetch']:
+        entry['next_fetch'] = time.time()
+    return entry['next_fetch'], entry['minimum_interval']
+
+
+def update_next_fetch(hostname, next_fetch):
+    retry_info[hostname]['next_fetch'] = next_fetch
 
 
 def myrequests_get(url, params=None, headers=None, cdx=False, allow404=False):
     t = time.time()
-    global next_fetch
+
+    hostname = urlparse(url).hostname
+    next_fetch, minimum_interval = get_retries(hostname)
+
     if t < next_fetch:
-        time.sleep(next_fetch - t)
+        dt = next_fetch - t
+        if dt > 3.1:
+            LOGGER.debug('sleeping for {:.3f}s before next fetch'.format(dt))
+        time.sleep(dt)
     # next_fetch is also updated at the bottom
-    next_fetch = next_fetch + minimum_interval
+    update_next_fetch(hostname, next_fetch + minimum_interval)
 
     if params:
         if 'from_ts' in params:
@@ -110,6 +146,6 @@ def myrequests_get(url, params=None, headers=None, cdx=False, allow404=False):
         previously_seen_hostnames.add(hostname)
 
     # in case we had a lot of retries, etc
-    next_fetch = time.time() + minimum_interval
+    update_next_fetch(hostname, time.time() + minimum_interval)
 
     return resp
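
Taken together, the series replaces one global sleep with a per-host rate limiter:
get_retries() returns the host's next allowed fetch time and minimum interval, the
caller sleeps out any remainder, and update_next_fetch() reserves the following
slot. The sketch below exercises that pattern on its own: get_retries() and
update_next_fetch() are copied from PATCH 3/3, while throttled_fetch(), its trimmed
host table, and the example URL are illustrative assumptions, not code from the
patches.

    import time
    from urllib.parse import urlparse

    # per-host state: when the next fetch may start, and the spacing between fetches
    # (trimmed to two entries for the sketch; the patch ships four)
    retry_info = {
        'default': {'next_fetch': 0, 'minimum_interval': 3.0},
        'web.archive.org': {'next_fetch': 0, 'minimum_interval': 6.0},
    }

    def get_retries(hostname):
        # unknown hosts inherit a copy of the default entry
        if hostname not in retry_info:
            retry_info[hostname] = retry_info['default'].copy()
        entry = retry_info[hostname]
        if not entry['next_fetch']:
            entry['next_fetch'] = time.time()
        return entry['next_fetch'], entry['minimum_interval']

    def update_next_fetch(hostname, next_fetch):
        retry_info[hostname]['next_fetch'] = next_fetch

    def throttled_fetch(url):  # hypothetical wrapper, not part of the patches
        hostname = urlparse(url).hostname
        next_fetch, minimum_interval = get_retries(hostname)
        now = time.time()
        if now < next_fetch:
            time.sleep(next_fetch - now)  # wait until this host's next slot
        # reserve the slot after this one before doing the real work
        update_next_fetch(hostname, next_fetch + minimum_interval)
        # ... perform the actual HTTP GET here ...

    for _ in range(3):
        throttled_fetch('https://web.archive.org/')  # runs spaced ~6s apart

Note the design choice the last hunk of PATCH 3/3 reflects: after a request that
may have burned minutes in retries, next_fetch is re-anchored to the current time
rather than to the stale reservation, so the limiter never schedules fetches in
the past.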