Skip to content

Commit

Permalink
add more error reasons to reset instead of fail (#399)
Browse files Browse the repository at this point in the history
* add more error reasons to reset, with comments

* fix flake8
  • Loading branch information
dsschult authored Oct 27, 2024
1 parent 1ce5330 commit c166b2e
Showing 1 changed file with 14 additions and 2 deletions.
16 changes: 14 additions & 2 deletions iceprod/server/plugins/condor.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,25 +79,37 @@ def from_condor_status(num):


# Substrings of a condor-reported reason (e.g. a HoldReason) for which a
# job should be reset and retried instead of being marked as failed.
#
# NOTE(review): every entry is kept lowercase because the reason text is
# presumably lowercased before the substring comparison (the default reason
# passed on removal is 'Job has failed', paired here with 'job has failed').
# A mixed-case entry would never match under that scheme - confirm against
# the code that consumes this list.
RESET_CONDOR_REASONS = [
    # condor file transfer plugin failed
    '_condor_stdout: (errno 2) no such file',
    'transfer input files failure',
    'transfer output files failure',
    # resource limits
    'cpu consumption limit exceeded',
    'memory limit exceeded',
    'cgroup memory limit',
    'local storage limit on worker node exceeded',
    'execution time limit exceeded',
    # general retries
    'exceeded max iceprod queue time',
    'job has failed',
    'python-initiated action (by user ice3simusr)',
]


# Substrings of a job's stderr output that indicate a (hopefully) transient
# problem, for which the job should be reset and retried instead of failed.
#
# NOTE(review): every entry is kept lowercase because the stderr text is
# presumably lowercased before the substring comparison; a mixed-case entry
# would never match under that scheme - confirm against the code that
# consumes this list.
RESET_STDERR_REASONS = [
    # glidein died
    'sigterm',
    'killed',
    # hopefully transient errors
    'bus error (core dumped)',
    'segmentation fault (core dumped)',
    'operation timed out',
    'connection timed out',
    # GPU errors
    'opencl error: could not set up context',
    # CVMFS errors
    'python: command not found',
    'cannot read file data: stale file handle',
]


Expand Down Expand Up @@ -704,7 +716,7 @@ async def wait(self, timeout):
if new_status is not None and job.status != new_status:
job.status = new_status
if new_status == JobStatus.FAILED:
self.submitter.remove(job_id, reason=event.get('HoldReason', None))
self.submitter.remove(job_id, reason=event.get('HoldReason', 'Job has failed'))
else:
await self.job_update(job)
except Exception:
Expand Down Expand Up @@ -806,7 +818,7 @@ async def check(self):
if job_id not in old_jobs or job.status != old_jobs[job_id].status:
if job.status == JobStatus.FAILED:
extra = job.extra if job.extra else {}
reason = extra.get('HoldReason', None)
reason = extra.get('HoldReason', 'Job has failed')
logger.info("job %s %s.%s removed from cross-check: %r", job_id, job.dataset_id, job.task_id, reason)
self.submitter.remove(job_id, reason=reason)

Expand Down

0 comments on commit c166b2e

Please sign in to comment.