Skip to content

Commit

Permalink
Logging
Browse files: browse the repository at this point in the history
  • Loading branch information
bra-fsn committed May 30, 2024
1 parent 036eb56 commit 4eafa6e
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 1 deletion.
6 changes: 5 additions & 1 deletion inspector/inspector.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,14 +87,17 @@ def start(ctx, exclude, start_only):
server = srv.api_reference
gpu_count = srv.gpu_count
if (vendor, server) in exclude:
logging.info(f"Excluding {vendor}/{server}")
continue
if start_only and (vendor, server) not in start_only:
logging.info(f"Excluding {vendor}/{server} as --start-only {start_only} is given")
continue
data_dir = os.path.join(ctx.parent.params["repo_path"], "data", vendor, server)
tasks = list(filter(lambda task: lib.should_start(task, data_dir, gpu_count), lib.get_tasks(vendor)))
if not tasks:
logging.info(f"No tasks for {vendor}/{server}")
continue
- print("start", vendor, server)
+ logging.info(f"Starting {vendor}/{server}")
for task in tasks:
meta = lib.Meta(start=datetime.now(), task_hash=lib.task_hash(task))
lib.write_meta(meta, os.path.join(data_dir, task.name, lib.META_NAME))
Expand Down Expand Up @@ -173,6 +176,7 @@ def parse(ctx):
def inspect(ctx, vendor, instance, gpu_count, threads):
"""Run inspection on this machine."""
if os.environ.get("GITHUB_TOKEN"):
logging.info("Updating the git repo")
# we must clone the repo before writing anything to it
repo.get_repo()
data_dir = os.path.join(ctx.parent.params["repo_path"], "data", vendor, instance)
Expand Down
13 changes: 13 additions & 0 deletions inspector/lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import hashlib
import inspect
import json
import logging
import math
import os
import repo
Expand Down Expand Up @@ -118,19 +119,24 @@ def should_start(task: Task, data_dir: str | os.PathLike, gpu_count: int) -> boo
thash = task_hash(task)
if task.gpu and not gpu_count:
# skip tasks which require GPUs on a server which doesn't have one
logging.info(f"Skipping task {task.name} because it requires GPU, but gpu_count is {gpu_count}")
return False
if meta.start:
if (datetime.now() - meta.start) >= FAIL_IF_NO_OUTPUT and (meta.end is None or meta.exit_code is None):
raise RuntimeError(f"{task.name} was started at {meta.start}, but didn't produce output!")
if (datetime.now() - meta.start) >= FAIL_ON_ERROR and meta.exit_code != 0:
raise RuntimeError(f"{task.name} was last started at {meta.start} and failed!")
if (datetime.now() - meta.start) < WAIT_BETWEEN_TASKS:
logging.info(f"Skipping task {task.name}: {WAIT_BETWEEN_TASKS} has not yet passed since last run")
return False
if meta.end and task.rerun and (datetime.now() - meta.end) >= task.rerun and meta.exit_code == 0:
# if rerun is set and there's a successful run, run the task again if rerun time interval has passed
logging.info(f"Task {task.name} should be started: {task.rerun} has passed since last run")
return True
if meta.task_hash != thash:
logging.info(f"Task {task.name} should run as its task hash has changed: {meta.task_hash} -> {thash}")
return True
logging.info(f"Skipping task {task.name}")
return False


Expand All @@ -139,12 +145,18 @@ def should_run(task: Task, data_dir: str | os.PathLike, gpu_count: int) -> bool:
meta = load_task_meta(task, data_dir)
thash = task_hash(task)
if task.gpu and not gpu_count:
logging.info(f"Skipping task {task.name} because it requires GPU, but gpu_count is {gpu_count}")
# skip tasks which require GPUs on a server which doesn't have one
return False
if meta.end and task.rerun and (datetime.now() - meta.end) >= task.rerun and meta.exit_code == 0:
return True
if meta.exit_code != 0 or meta.task_hash != thash:
if meta.exit_code != 0:
logging.info(f"Task {task.name} should run as last run has exit code: {meta.exit_code}")
if meta.task_hash != thash:
logging.info(f"Task {task.name} should run as its task hash has changed: {meta.task_hash} -> {thash}")
return True
logging.info(f"Skipping task {task.name}, {meta.end}, {meta.exit_code}")
return False


Expand Down Expand Up @@ -275,6 +287,7 @@ def run_tasks(vendor, data_dir: str | os.PathLike, gpu_count: int = 0, nthreads:
for task in taskgroups[taskgroup]:
if not should_run(task, data_dir, gpu_count):
continue
logging.info(f"Starting {task.name}")
q.put(task)
if not task.parallel:
q.join()
Expand Down

0 comments on commit 4eafa6e

Please sign in to comment.