allow replay of failed benches, addressing comment in foundation-mode…
fabianlim committed Jun 1, 2024
1 parent f666d5e commit 77bc92b
Showing 2 changed files with 31 additions and 9 deletions.
26 changes: 23 additions & 3 deletions scripts/benchmarks/benchmark.py
@@ -1,5 +1,6 @@
# Standard
from itertools import product
from time import sleep
from typing import Any, Callable, Dict, List, Tuple, Union
import argparse
import json
@@ -88,6 +89,7 @@
HF_ARG_SKIP_MEMORY_METRIC = "--skip_memory_metrics"
RESULT_FIELD_ALLOCATED_GPU_MEM = "mem_torch_mem_alloc_in_bytes"
RESULT_FIELD_PEAK_ALLOCATED_GPU_MEM = "mem_peak_torch_mem_alloc_in_bytes"
ERROR_MESSAGES = "error_messages"


def extract_gpu_memory_metrics(output_metrics) -> Tuple[float]:
@@ -357,6 +359,17 @@ def __init__(
self.results_filename = os.path.join(self.save_dir, FILE_RESULTS)
self.gpu_log_filename = os.path.join(self.save_dir, FILE_MEM)

@property
def is_completed(self):
if not os.path.exists(self.results_filename):
return False
# otherwise open it and check for errors
with open(self.results_filename) as f:
results = json.load(f)

# return complete only if no errors
return ERROR_MESSAGES not in results

def run(
self,
run_cmd: str,
@@ -552,7 +565,7 @@ def write_result(self):
**self.get_experiment_final_metrics(),
}
else:
other_results = {"error_messages": maybe_error_messages}
other_results = {ERROR_MESSAGES: maybe_error_messages}

# combine the final thing
save_result = {**save_result, **other_results}
@@ -781,6 +794,14 @@ def main(args):
log_memory_in_trainer=args.log_memory_hf,
)
):
# store pointer to file for future result retrieval
experiment_stats[experiment.tag] = experiment.results_filename

if experiment.is_completed:
# if completed, don't proceed
sleep(0.1) # sleep a bit to allow the tqdm to update
continue

if experiment.num_gpus > 1:
prefix = COMMAND_ACCELERATE.format(
accelerate_config_path=args.accelerate_config,
@@ -806,10 +827,9 @@ def main(args):
log_nvidia_smi=args.log_nvidia_smi,
)

# write results and store pointers to files
# write results
experiment.write_result()
experiment.write_shell_command()
experiment_stats[experiment.tag] = experiment.results_filename

# 4. Consolidates the experiment results into a summary
for tag, path in experiment_stats.items():
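The replay behaviour added to benchmark.py can be condensed into a small standalone sketch. This is an illustrative reconstruction from the hunks above only, not the repository's actual code: the is_completed helper mirrors the new property, while the file paths and the replay loop are hypothetical stand-ins.

import json
import os

ERROR_MESSAGES = "error_messages"  # key written to a results file when a bench fails

def is_completed(results_filename: str) -> bool:
    # Mirrors the new Experiment.is_completed property: a bench counts as done
    # only if its results file exists and recorded no error messages.
    if not os.path.exists(results_filename):
        return False
    with open(results_filename) as f:
        results = json.load(f)
    return ERROR_MESSAGES not in results

# Hypothetical replay pass: skip benches that already succeeded, re-run the rest.
for results_file in ["results/exp_0.json", "results/exp_1.json"]:  # placeholder paths
    if is_completed(results_file):
        continue  # finished cleanly on an earlier run
    print(f"would re-run the experiment behind {results_file}")  # stand-in for Experiment.run()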
14 changes: 8 additions & 6 deletions scripts/run_benchmarks.sh
@@ -58,10 +58,10 @@ if [ -n "$RESULT_DIR" ]; then
echo "Results dir $RESULT_DIR is not empty, but NO_OVERWRITE=true"
echo "If intending to overwrite please delete the folder manually"
echo "or do not set NO_OVERWRITE"
exit 1
else
echo "Deleting $RESULT_DIR"
rm -rf $RESULT_DIR
fi
echo "Deleting $RESULT_DIR"
rm -rf $RESULT_DIR
fi

# tag on the directories
@@ -98,9 +98,11 @@ elif [ "$MEMORY_LOGGING" = "all" ]; then
fi

# dump out the environment
echo "Creating $RESULT_DIR"
mkdir -p $RESULT_DIR
pip freeze > $PIP_REQUIREMENTS_FILE
if [ ! "$NO_OVERWRITE" = "true" ]; then
echo "Creating $RESULT_DIR"
mkdir -p $RESULT_DIR
pip freeze > $PIP_REQUIREMENTS_FILE
fi

# run the bench
python $WORKING_DIR/benchmark.py \
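Taken together with the benchmark.py changes, the shell edits turn NO_OVERWRITE=true into a replay mode: the existing results directory is kept and the environment dump is skipped, so benchmark.py can detect which experiments already completed. A hedged usage sketch follows; the invocation is hypothetical and assumes run_benchmarks.sh is called from the repository root with whatever arguments it normally takes.

# First pass: a fresh run creates $RESULT_DIR, dumps pip freeze, and writes one
# results file per experiment (failed ones carry an "error_messages" key).
bash scripts/run_benchmarks.sh

# Replay pass: with NO_OVERWRITE=true the existing $RESULT_DIR is left in place
# and the environment dump is skipped, so only previously failed experiments are
# re-run.
NO_OVERWRITE=true bash scripts/run_benchmarks.sh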
