Skip to content

Commit

Permalink
separate out condor and stderr reset reasons
Browse files — browse the repository at this point in the history
  • Loading branch information
dsschult committed Oct 19, 2024
1 parent 3023887 commit 9c3a4f3
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 10 deletions.
21 changes: 13 additions & 8 deletions iceprod/server/plugins/condor.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,18 +77,23 @@ def from_condor_status(num):
}


RESET_REASONS = [
RESET_CONDOR_REASONS = [
'_condor_stdout: (errno 2) No such file',
'sigterm',
'killed',
'transfer input files failure',
'transfer output files failure',
'cpu consumption limit exceeded',
'memory limit exceeded',
'cgroup memory limit',
'local storage limit on worker node exceeded',
'execution time limit exceeded',
]


# Substrings looked for (case-insensitively) in the contents of a job's
# stderr file; a match causes the task to be reset.  This list is only
# consulted when the condor-supplied reason did not already trigger a reset.
RESET_STDERR_REASONS = [
    'sigterm',
    'killed',
    'operation timed out',
]
# 'memory limit exceeded',
# 'local storage limit on worker node exceeded',
# 'execution time limit exceeded',


def parse_usage(usage: str) -> int:
Expand Down Expand Up @@ -741,14 +746,14 @@ async def finish(self, job_id: CondorJobId, success: bool = True, resources: dic
if reason:
stats['error_summary'] = reason
# check condor error for reset reason
for text in RESET_REASONS:
for text in RESET_CONDOR_REASONS:
if text.lower() in reason.lower():
future = self.task_reset(job, stats=stats, reason=reason)
break
if future is None and stderr and stderr.is_file():
# check stderr for reset reason
reason = stderr.open().read()
for text in RESET_REASONS:
for text in RESET_STDERR_REASONS:
if text.lower() in reason.lower():
future = self.task_reset(job, stats=stats, reason=reason)
break
Expand Down
4 changes: 2 additions & 2 deletions tests/server/plugins/condor_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -825,7 +825,7 @@ async def test_reset_task(schedd, i3prod_path, set_time):
g.task_reset = AsyncMock()
g.task_failure = AsyncMock()

await g.finish(jobid, success=False, reason=iceprod.server.plugins.condor.RESET_REASONS[0])
await g.finish(jobid, success=False, reason=iceprod.server.plugins.condor.RESET_CONDOR_REASONS[0])

assert g.task_success.call_count == 0
assert g.task_reset.call_count == 1
Expand All @@ -838,7 +838,7 @@ async def test_reset_task(schedd, i3prod_path, set_time):
g.task_reset = AsyncMock()
g.task_failure = AsyncMock()

(p / 'condor.err').open('w').write(iceprod.server.plugins.condor.RESET_REASONS[-1])
(p / 'condor.err').open('w').write(iceprod.server.plugins.condor.RESET_STDERR_REASONS[0])

await g.finish(jobid, success=False)

Expand Down

0 comments on commit 9c3a4f3

Please sign in to comment.