From 11ea68f166ef68105bc427a71d026870da249b47 Mon Sep 17 00:00:00 2001
From: Priya Kasimbeg
Date: Thu, 19 Sep 2024 16:42:27 +0000
Subject: [PATCH] add safety flag to enforce explicitly enabling step budgets

---
 scoring/run_workloads.py | 82 +++++++++++-----------------------------
 1 file changed, 23 insertions(+), 59 deletions(-)

diff --git a/scoring/run_workloads.py b/scoring/run_workloads.py
index da702388f..0d708990f 100644
--- a/scoring/run_workloads.py
+++ b/scoring/run_workloads.py
@@ -17,8 +17,6 @@
 from absl import app
 from absl import flags
 from absl import logging
-import datetime
-import subprocess
 
 from algorithmic_efficiency import random_utils as prng
 from algorithmic_efficiency.workloads.workloads import get_base_workload_name
@@ -30,7 +28,7 @@
                     'URL to docker image')
 flags.DEFINE_integer('run_percentage',
                      100,
-                     'Percentage of max num steps to run for.')
+                     'Percentage of max num steps to run for. Requires --enable_step_percentage=true to take effect.')
 flags.DEFINE_string('experiment_name',
                     'my_experiment',
                     'Name of top sub directory in experiment dir.')
@@ -85,21 +83,14 @@
     'If your algorithm has a smaller per step time than our baselines '
     'you may want to increase the number of steps per workload.')
 flags.DEFINE_string(
-    'workloads',
+    'workload',
     None,
-    'String representing a comma separated list of workload names.'
     'If not None, only run this workload, else run all workloads in workload_metadata_path.'
 )
-flags.DEFINE_string(
-    'additional_requirements_path',
-    None,
-    'Path to requirements.txt if any.'
-)
-flags.DEFINE_integer(
-    'max_steps',
-    None,
-    'Maximum number of steps to run. If run_fraction results in greater number of steps '
-    'than the max_steps, the run will be cut to max_steps.'
+flags.DEFINE_boolean(
+    'enable_step_percentage',
+    False,
+    'If true, cap each workload at run_percentage of its max_steps. By default the step budget is ignored and runs are bounded only by the time budget.'
 )
 
 FLAGS = flags.FLAGS
@@ -119,34 +110,15 @@ def container_running():
   else:
     return True
 
-def kill_containers():
-  docker_client = docker.from_env()
-  containers = docker_client.containers.list()
-  for container in containers:
-    container.kill()
-
-def gpu_is_active():
-  output = subprocess.check_output(['nvidia-smi', '--query-gpu=utilization.gpu', '--format=csv,noheader,nounits'])
-  return any(int(x) > 0 for x in output.decode().splitlines())
-
 def wait_until_container_not_running(sleep_interval=5 * 60):
-  # check gpu util
-  # if the gpu has not been utilized for 30 minutes kill the
-  gpu_last_active = datetime.datetime.now().timestamp()
-
   while container_running():
-    # check if gpus have been inactive > 45 min and if so terminate container
-    if gpu_is_active():
-      gpu_last_active = datetime.datetime.now().timestamp()
-    if (datetime.datetime.now().timestamp() - gpu_last_active) > 45 * 60:
-      kill_containers("Killing container: GPUs have been inactive > 45 minutes...")
     time.sleep(sleep_interval)
   return
 
+
 def main(_):
   framework = FLAGS.framework
-  run_fraction = FLAGS.run_percentage / 100.
   experiment_name = FLAGS.experiment_name
   docker_image_url = FLAGS.docker_image_url
   submission_path = FLAGS.submission_path
@@ -164,13 +136,7 @@ def main(_):
     study_end_index = FLAGS.study_end_index
   else:
     study_end_index = num_studies - 1
-
-  additional_requirements_path_flag = ''
-  if FLAGS.additional_requirements_path:
-    additional_requirements_path_flag = f'--additional_requirements_path {FLAGS.additional_requirements_path} '
-
   submission_id = FLAGS.submission_id
-
   rng_seed = FLAGS.seed
 
   if not rng_seed:
@@ -182,21 +148,17 @@ def main(_):
   with open(FLAGS.workload_metadata_path) as f:
     workload_metadata = json.load(f)
 
-  # Get list of all possible workloads
   workloads = [w for w in workload_metadata.keys()]
 
-  # Read heldout workloads
+  # Read held-out workloads
   if FLAGS.held_out_workloads_config_path:
     held_out_workloads = read_held_out_workloads(
        FLAGS.held_out_workloads_config_path)
     workloads = workloads + held_out_workloads
 
-  # Filter workloads if explicit workloads specified
-  if FLAGS.workloads is not None:
-    workloads = list(filter(lambda x: x in FLAGS.workloads.split(','), workloads))
-    if len(workloads) != len(FLAGS.workloads.split(',')):
-      unmatched_workloads = set(FLAGS.workloads.split(',')) - set(workloads)
-      raise ValueError(f'Invalid workload name {unmatched_workloads}')
+  # Filter for single workload
+  if FLAGS.workload and (FLAGS.workload in workloads):
+    workloads = [FLAGS.workload]
 
   rng_subkeys = prng.split(rng_key, num_studies)
 
@@ -216,17 +178,20 @@ def main(_):
          "sudo sh -c 'echo 3 > /proc/sys/vm/drop_caches'")  # clear caches
       print('=' * 100)
       dataset = workload_metadata[base_workload_name]['dataset']
-      if FLAGS.max_steps is None:
-        max_steps = int(workload_metadata[base_workload_name]['max_steps'] *
-                        run_fraction)
-      else:
-        max_steps = FLAGS.max_steps
+
+      max_steps_flag = ''
+      if FLAGS.enable_step_percentage:
+        run_fraction = FLAGS.run_percentage / 100.
+        max_steps = int(workload_metadata[base_workload_name]['max_steps'] *
+                        run_fraction)
+        max_steps_flag = f'-m {max_steps}'
+
       mount_repo_flag = ''
       if FLAGS.local:
-        mount_repo_flag = '-v /home/kasimbeg/algorithmic-efficiency:/algorithmic-efficiency '
-      command = ('docker run -t -d -v /home/kasimbeg/data/:/data/ '
-                 '-v /home/kasimbeg/experiment_runs/:/experiment_runs '
-                 '-v /home/kasimbeg/experiment_runs/logs:/logs '
+        mount_repo_flag = '-v $HOME/algorithmic-efficiency:/algorithmic-efficiency '
+      command = ('docker run -t -d -v $HOME/data/:/data/ '
+                 '-v $HOME/experiment_runs/:/experiment_runs '
+                 '-v $HOME/experiment_runs/logs:/logs '
                 f'{mount_repo_flag}'
                 '--gpus all --ipc=host '
                 f'{docker_image_url} '
@@ -235,10 +200,9 @@ def main(_):
                 f'-s {submission_path} '
                 f'-w {workload} '
                 f'-e {study_dir} '
-                f'-m {max_steps} '
+                f'{max_steps_flag} '
                 f'--num_tuning_trials {num_tuning_trials} '
                 f'--rng_seed {run_seed} '
-                f'{additional_requirements_path_flag}'
                '-c false '
                '-o true '
                '-i true ')
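
For clarity, the behavior change can be read off in isolation: the per-workload step cap is now opt-in. The sketch below restates the gating logic this patch adds inside main(); build_max_steps_flag is a hypothetical helper name used only for illustration, not a function defined in run_workloads.py.

# Minimal sketch of the opt-in step budget introduced by this patch
# (illustrative helper only; not part of run_workloads.py).
def build_max_steps_flag(base_workload_metadata, enable_step_percentage, run_percentage):
  """Return the '-m <steps>' docker argument, or '' when step budgets are disabled."""
  if not enable_step_percentage:
    # Default after this patch: no step cap; the run is bounded only by the time budget.
    return ''
  run_fraction = run_percentage / 100.
  max_steps = int(base_workload_metadata['max_steps'] * run_fraction)
  return f'-m {max_steps}'

# Example: build_max_steps_flag({'max_steps': 10000}, True, 50) returns '-m 5000'.

In other words, --run_percentage is only honored when --enable_step_percentage is also passed; otherwise the -m argument is omitted entirely and the container runs until the workload's time budget is reached.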