From 3d8da6923e5992e429bd9021635046cea56028e0 Mon Sep 17 00:00:00 2001
From: bra-fsn
Date: Wed, 4 Sep 2024 23:07:26 +0200
Subject: [PATCH] Retry locking errors

Wrap the runner.create calls in retry_locked, which tries the call up to
three times when pulumi raises ConcurrentUpdateError, sleeping a random
1-5 seconds between attempts, and re-raises the error once the attempts
are used up.
---
Review notes (this section is ignored by git am):
- retry_locked now re-raises ConcurrentUpdateError after the last attempt;
  previously it fell out of the loop and returned None, so callers took a
  still-locked stack for a successful create.
- Leading whitespace of the start_inspect context lines was reconstructed;
  check it against inspector/lib.py or apply with --ignore-whitespace.
- The post-image blob id on the index line was not recomputed.

 inspector/lib.py | 49 +++++++++++++++++++++++++++++++------------------
 1 file changed, 31 insertions(+), 18 deletions(-)

diff --git a/inspector/lib.py b/inspector/lib.py
index 9d89fef..b011fc7 100644
--- a/inspector/lib.py
+++ b/inspector/lib.py
@@ -494,6 +494,28 @@ def remove_matches(regexes, input_string):
     return input_string
 
 
+def retry_locked(func, *args, **kwargs):
+    """Call a pulumi function, retrying with random backoff on lock conflicts.
+
+    func is called with *args/**kwargs up to three times. Only pulumi's
+    ConcurrentUpdateError triggers a retry (after a random 1-5 second sleep);
+    any other exception propagates at once. If the last attempt also hits the
+    lock, its ConcurrentUpdateError is re-raised instead of returning None.
+    """
+    import pulumi
+
+    attempts = 3
+    for i in range(attempts):
+        try:
+            return func(*args, **kwargs)
+        except pulumi.automation.errors.ConcurrentUpdateError:
+            logging.exception(f"ConcurrentUpdateError, retry #{i}")
+            if i == attempts - 1:
+                # out of retries: a still-locked stack must not look like a success
+                raise
+            time.sleep(random.randint(1, 5))
+
+
 def start_inspect(executor, lock, data_dir, vendor, server, tasks, srv_data, regions, zones):
     from sc_runner.resources import default
     from sc_runner import runner
@@ -547,12 +569,9 @@ def start_inspect(executor, lock, data_dir, vendor, server, tasks, srv_data, reg
             error_msgs = []
             stack_opts = dict(on_output=logging.info, on_event=lambda event: pulumi_event_filter(event, error_msgs))
             try:
-                runner.create(
-                    vendor,
-                    {},
-                    resource_opts | dict(instance_opts=instance_opts, user_data=b64_user_data, disk_size=16),
-                    stack_opts=stack_opts,
-                )
+                retry_locked(runner.create, vendor, {},
+                             resource_opts | dict(instance_opts=instance_opts, user_data=b64_user_data, disk_size=16),
+                             stack_opts=stack_opts)
                 # empty it if create succeeded, just in case
                 error_msgs = []
                 break
@@ -581,12 +600,9 @@ def start_inspect(executor, lock, data_dir, vendor, server, tasks, srv_data, reg
             for _ in range(2):
                 # try normal images first, then gen1 if we get Hypervisor Generation '2' error
                 try:
-                    runner.create(
-                        vendor,
-                        {},
-                        resource_opts | dict(user_data=b64_user_data, image_sku=image_sku),
-                        stack_opts=stack_opts,
-                    )
+                    retry_locked(runner.create, vendor, {},
+                                 resource_opts | dict(user_data=b64_user_data, image_sku=image_sku),
+                                 stack_opts=stack_opts)
                     # empty it if create succeeded, just in case
                     error_msgs = []
                     done = True
@@ -644,12 +660,9 @@ def start_inspect(executor, lock, data_dir, vendor, server, tasks, srv_data, reg
             error_msgs = []
             stack_opts = dict(on_output=logging.info, on_event=lambda event: pulumi_event_filter(event, error_msgs))
             try:
-                runner.create(
-                    vendor,
-                    {},
-                    resource_opts | dict(instance_opts=instance_opts),
-                    stack_opts=stack_opts,
-                )
+                retry_locked(runner.create, vendor, {},
+                             resource_opts | dict(instance_opts=instance_opts),
+                             stack_opts=stack_opts)
                 # empty it if create succeeded, just in case
                 error_msgs = []
                 break