Skip to content

Commit

Permalink
Make AnsibleTower checkout/execute actions more resilient
Browse files Browse the repository at this point in the history
We've been seeing some issues with service interruptions in AAP under
high load. While the jobs do complete successfully, awxkit bails when
encountering the connection issue.
With this change, we simply enter a retry loop when monitoring job
status.
  • Loading branch information
JacobCallahan committed Jan 13, 2025
1 parent d0b1192 commit c0a1920
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 2 deletions.
16 changes: 15 additions & 1 deletion broker/providers/ansible_tower.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import click
from dynaconf import Validator
from logzero import logger
from requests.exceptions import ConnectionError

from broker import exceptions
from broker.helpers import eval_filter, find_origin, yaml
Expand Down Expand Up @@ -41,6 +42,19 @@ def convert_pseudonamespaces(attr_dict):
return out_dict


def resilient_job_wait(job, timeout=None, retry_delay=5, max_retries=None):
    """Wait for a job to complete, retrying when the connection drops.

    Calls ``job.wait_until_completed`` and, if it raises the module-level
    ``ConnectionError`` (imported from ``requests.exceptions``), logs the
    failure, pauses, and waits again. Any other exception propagates
    unchanged.

    Args:
        job: Object exposing ``wait_until_completed(timeout=...)``.
        timeout: Value forwarded to ``wait_until_completed`` on every attempt.
            A falsy value (``None`` or ``0``) falls back to
            ``settings.ANSIBLETOWER.workflow_timeout``.
        retry_delay: Seconds to sleep between attempts so a struggling
            service is not hit in a tight loop. A falsy or non-positive
            value disables the pause.
        max_retries: Maximum number of retries after the first attempt.
            ``None`` (the default) retries indefinitely.

    Raises:
        ConnectionError: Re-raised once ``max_retries`` retries have been
            used up (only when ``max_retries`` is not ``None``).
    """
    # Local import keeps this block self-contained without touching the
    # module's import section.
    import time

    timeout = timeout or settings.ANSIBLETOWER.workflow_timeout
    failures = 0
    while True:
        try:
            job.wait_until_completed(timeout=timeout)
        except ConnectionError as err:
            failures += 1
            logger.error(f"Error occurred while waiting for job: {err}")
            if max_retries is not None and failures > max_retries:
                # Retry budget exhausted: surface the error instead of
                # looping forever.
                raise
            logger.info("Retrying job wait...")
            if retry_delay and retry_delay > 0:
                time.sleep(retry_delay)
        else:
            return


class JobExecutionError(exceptions.ProviderError):
"""Raised when a job execution fails."""

Expand Down Expand Up @@ -605,7 +619,7 @@ def execute(self, **kwargs): # noqa: PLR0912,PLR0915 - Possible TODO refactor
job_ui_url = url_parser.urljoin(self.url, f"/#/{subject}s/{job_number}")
helpers.emit(api_url=job_api_url, ui_url=job_ui_url)
logger.info(f"Waiting for job: \nAPI: {job_api_url}\nUI: {job_ui_url}")
job.wait_until_completed(timeout=settings.ANSIBLETOWER.workflow_timeout)
resilient_job_wait(job)
if job.status != "successful":
message_data = {
f"{subject.capitalize()} Status": job.status,
Expand Down
1 change: 0 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,6 @@ select = [
"T100", # Trace found: {name} used
"T20", # flake8-print
"TRY004", # Prefer TypeError exception for invalid type
"TRY302", # Remove exception handler; error is immediately re-raised
"PLR0911", # Too many return statements ({returns} > {max_returns})
"PLR0912", # Too many branches ({branches} > {max_branches})
"PLR0915", # Too many statements ({statements} > {max_statements})
Expand Down

0 comments on commit c0a1920

Please sign in to comment.