From ce4fc77460c6a2d7641b5f5a3b0c6ef7600cb7e5 Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Wed, 16 Oct 2024 00:49:07 +0000 Subject: [PATCH] remove duplicate run_workloads script --- utils/run_workloads.py | 210 ---------------------- utils/target_setting_workload_config.json | 195 -------------------- 2 files changed, 405 deletions(-) delete mode 100644 utils/run_workloads.py delete mode 100644 utils/target_setting_workload_config.json diff --git a/utils/run_workloads.py b/utils/run_workloads.py deleted file mode 100644 index 92eb4d9b0..000000000 --- a/utils/run_workloads.py +++ /dev/null @@ -1,210 +0,0 @@ -""" -Example Usage: -python run_workloads.py \ ---workload_config_path workload_config.json \ ---framework jax \ ---experiment_name my_first_experiment \ ---docker_image_url us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_dev \ ---run_percentage 10 \ ---workload_config_path workload_config.json \ ---dry_run -""" - -import json -import os -import struct -import time - -from absl import app -from absl import flags -from absl import logging - -import docker - -flags.DEFINE_string( - 'docker_image_url', - 'us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_dev', - 'URL to docker image') -flags.DEFINE_integer('run_percentage', - 100, - 'Percentage of max num steps to run for.') -flags.DEFINE_string('experiment_name', - 'my_experiment', - 'Name of top sub directory in experiment dir.') -flags.DEFINE_boolean('rsync_data', - True, - 'Whether or not to transfer the data from GCP w rsync.') -flags.DEFINE_boolean('local', False, 'Mount local algorithmic-efficiency repo.') -flags.DEFINE_string('framework', 'jax', 'Can be either PyTorch or JAX.') -flags.DEFINE_boolean( - 'dry_run', - False, - 'Whether or not to actually run the docker containers. ' - 'If False, simply print the docker run commands. ') -flags.DEFINE_integer('num_studies', 1, 'Number of studies to run') -flags.DEFINE_integer('study_start_index', None, 'Start index for studies.') -flags.DEFINE_integer('study_end_index', None, 'End index for studies.') -flags.DEFINE_integer('num_tuning_trials', 1, 'Number of tuning trials.') -flags.DEFINE_integer('hparam_start_index', - None, - 'Start index for tuning trials.') -flags.DEFINE_integer('hparam_end_index', None, 'End index for tuning trials.') -flags.DEFINE_integer('seed', None, 'Random seed for evaluating a submission.') -flags.DEFINE_integer('submission_id', - 0, - 'Submission ID to generate study and hparam seeds.') -flags.DEFINE_string( - 'workload_config_path', - 'workload_confing.json', - 'Path to config containing dataset and maximum number of steps per workload.' - 'The default values of these are set to the full budgets as determined ' - 'via the target-setting procedure. ' - 'Note that training will be interrupted at either the set maximum number ' - 'of steps or the fixed workload maximum run time, whichever comes first. ' - 'If your algorithm has a smaller per step time than our baselines ' - 'you may want to increase the number of steps per workload.') - -flags.DEFINE_integer( - 'max_steps' - None, - 'Maximum number of steps to run. If the run_percentage results into a larger' - 'number of steps, the maximum number of steps will be run.' -) - -FLAGS = flags.FLAGS - - -def read_workloads(filename): - with open(filename, "r") as f: - held_out_workloads = json.load(f) - return held_out_workloads - - -def container_running(): - docker_client = docker.from_env() - containers = docker_client.containers.list() - if len(containers) == 0: - return False - else: - return True - - -def wait_until_container_not_running(sleep_interval=5 * 60): - while container_running(): - time.sleep(sleep_interval) - return - - -def main(_): - # What Docker image to run the container with - docker_image_url = FLAGS.docker_image_url - - # Framework - framework = FLAGS.framework - - # - run_fraction = FLAGS.run_percentage / 100. - experiment_name = FLAGS.experiment_name - - # Get study and trial interval arguments - num_studies = FLAGS.num_studies - study_start_index = FLAGS.study_start_index if FLAGS.study_start_index else 0 - study_end_index = FLAGS.study_end_index if FLAGS.study_end_index else num_studies - 1 - - # Get trial arguments - num_tuning_trials = FLAGS.num_tuning_trials - hparam_start_index_flag = '' - hparam_end_index_flag = '' - if FLAGS.hparam_start_index: - hparam_start_index_flag = f'--hparam_start_index {FLAGS.hparam_start_index} ' - if FLAGS.hparam_end_index: - hparam_end_index_flag = f'--hparam_end_index {FLAGS.hparam_end_index} ' - - # Generate rng keys from submission_id and seed - submission_id = FLAGS.submission_id - rng_seed = FLAGS.seed - - if not rng_seed: - rng_seed = struct.unpack('I', os.urandom(4))[0] - - logging.info('Using RNG seed %d', rng_seed) - - # Read workload specifications to run - with open(FLAGS.workload_config_path) as f: - workload_config = json.load(f) - workloads = [w for w in workload_config.keys()] - - for study_index in range(study_start_index, study_end_index + 1): - print('-' * 100) - print('*' * 40, f'Starting study {study_index + 1}/{num_studies}', '*' * 40) - print('-' * 100) - study_dir = os.path.join(experiment_name, f'study_{study_index}') - - for workload in workloads: - # For each runnable workload check if there are any containers running - wait_until_container_not_running() - - # Clear caches - os.system("sudo sh -c 'echo 3 > /proc/sys/vm/drop_caches'") - print('=' * 100) - - # Get workload dataset, max step, algorithm path and tuning search space - dataset = workload_config[workload]['dataset'] - if FLAGS.max_steps is None: - max_steps = int(workload_config[workload]['max_steps'] * run_fraction) - else: - max_steps = FLAGS.max_steps - submission_path = workload_config[workload]['submission_path'] - tuning_search_space = workload_config[workload]['tuning_search_space'] - - # Optionally, define flag to mount local algorithmic-efficiency repo - mount_repo_flag = '' - if FLAGS.local: - mount_repo_flag = '-v $HOME/algorithmic-efficiency:/algorithmic-efficiency ' - - command = ('docker run -t -d -v $HOME/data/:/data/ ' - '-v $HOME/experiment_runs/:/experiment_runs ' - '-v $HOME/experiment_runs/logs:/logs ' - f'{mount_repo_flag}' - '--gpus all --ipc=host ' - f'{docker_image_url} ' - f'-d {dataset} ' - f'-f {framework} ' - f'-s {submission_path} ' - f'-w {workload} ' - f'-t {tuning_search_space} ' - f'-e {study_dir} ' - f'-m {max_steps} ' - f'--num_tuning_trials {num_tuning_trials} ' - f'{hparam_start_index_flag} ' - f'{hparam_end_index_flag} ' - f'--rng_seed {rng_seed} ' - '-c false ' - '-o true ' - '-i true ') - if not FLAGS.dry_run: - print('Running docker container command') - print('Container ID: ') - return_code = os.system(command) - else: - return_code = 0 - if return_code == 0: - print( - f'SUCCESS: container for {framework} {workload} launched successfully' - ) - print(f'Command: {command}') - print(f'Results will be logged to {experiment_name}') - else: - print( - f'Failed: container for {framework} {workload} failed with exit code {return_code}.' - ) - print(f'Command: {command}') - wait_until_container_not_running() - os.system( - "sudo sh -c 'echo 3 > /proc/sys/vm/drop_caches'") # clear caches - - print('=' * 100) - - -if __name__ == '__main__': - app.run(main) diff --git a/utils/target_setting_workload_config.json b/utils/target_setting_workload_config.json deleted file mode 100644 index a8c050422..000000000 --- a/utils/target_setting_workload_config.json +++ /dev/null @@ -1,195 +0,0 @@ -{ - "imagenet_resnet": { - "max_steps": 186666, - "dataset": "imagenet", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_adamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/imagenet_resnet/tuning_search_space.json" - }, - "imagenet_resnet_gelu": { - "max_steps": 186666, - "dataset": "imagenet", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_momentum.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/imagenet_resnet_gelu/tuning_search_space.json" - }, - "imagenet_resnet_large_bn_init": { - "max_steps": 186666, - "dataset": "imagenet", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_momentum.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/imagenet_resnet_large_bn_init/tuning_search_space.json" - }, - "imagenet_resnet_silu": { - "max_steps": 186666, - "dataset": "imagenet", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/imagenet_resnet_silu/tuning_search_space.json" - }, - "imagenet_vit": { - "max_steps": 186666, - "dataset": "imagenet", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_adamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/imagenet_vit/tuning_search_space.json" - }, - "imagenet_vit_glu": { - "max_steps": 186666, - "dataset": "imagenet", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/imagenet_vit_glu/tuning_search_space.json" - }, - "imagenet_vit_map": { - "max_steps": 186666, - "dataset": "imagenet", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/imagenet_vit_map/tuning_search_space.json" - }, - "imagenet_vit_post_ln": { - "max_steps": 186666, - "dataset": "imagenet", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/imagenet_vit_post_ln/tuning_search_space.json" - }, - "fastmri": { - "max_steps": 36189, - "dataset": "fastmri", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nesterov.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/fastmri/tuning_search_space.json" - }, - "fastmri_layernorm": { - "max_steps": 36189, - "dataset": "fastmri", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/fastmri_layernorm/tuning_search_space.json" - }, - "fastmri_model_size": { - "max_steps": 36189, - "dataset": "fastmri", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/fastmri_model_size/tuning_search_space.json" - }, - "fastmri_tanh": { - "max_steps": 36189, - "dataset": "fastmri", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/fastmri_tanh/tuning_search_space.json" - }, - "ogbg": { - "max_steps": 80000, - "dataset": "ogbg", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nesterov.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/ogbg/tuning_search_space.json" - }, - "ogbg_gelu": { - "max_steps": 80000, - "dataset": "ogbg", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/ogbg_gelu/tuning_search_space.json" - }, - "ogbg_model_size": { - "max_steps": 80000, - "dataset": "ogbg", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/ogbg_model_size/tuning_search_space.json" - }, - "ogbg_silu": { - "max_steps": 80000, - "dataset": "ogbg", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/ogbg_silu/tuning_search_space.json" - }, - "wmt": { - "max_steps": 133333, - "dataset": "wmt", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/wmt/tuning_search_space.json" - }, - "wmt_attention_temp": { - "max_steps": 133333, - "dataset": "wmt", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/wmt_attention_temp/tuning_search_space.json" - }, - "wmt_glu_tanh": { - "max_steps": 133333, - "dataset": "wmt", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/wmt_glu_tanh/tuning_search_space.json" - }, - "wmt_post_ln": { - "max_steps": 133333, - "dataset": "wmt", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_adamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/wmt_post_ln/tuning_search_space.json" - }, - "librispeech_deepspeech": { - "max_steps": 48000, - "dataset": "librispeech", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/librispeech_deepspeech/tuning_search_space.json" - }, - "librispeech_deepspeech_no_resnet": { - "max_steps": 48000, - "dataset": "librispeech", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/librispeech_deepspeech_no_resnet/tuning_search_space.json" - }, - "librispeech_deepspeech_norm_and_spec_aug": { - "max_steps": 48000, - "dataset": "librispeech", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/librispeech_deepspeech_norm_and_spec_aug/tuning_search_space.json" - }, - "librispeech_deepspeech_tanh": { - "max_steps": 48000, - "dataset": "librispeech", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/librispeech_deepspeech_tanh/tuning_search_space.json" - }, - "criteo1tb": { - "max_steps": 10666, - "dataset": "criteo1tb", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/criteo1tb/tuning_search_space.json" - }, - "criteo1tb_embed_init": { - "max_steps": 10666, - "dataset": "criteo1tb", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/criteo1tb_embed_init/tuning_search_space.json" - }, - "criteo1tb_layernorm": { - "max_steps": 10666, - "dataset": "criteo1tb", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/criteo1tb_layernorm/tuning_search_space.json" - }, - "criteo1tb_resnet": { - "max_steps": 10666, - "dataset": "criteo1tb", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/criteo1tb_resnet/tuning_search_space.json" - }, - "librispeech_conformer": { - "max_steps": 80000, - "dataset": "librispeech", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_adamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/librispeech_conformer/tuning_search_space.json" - }, - "librispeech_conformer_attention_temperature": { - "max_steps": 80000, - "dataset": "librispeech", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_adamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/librispeech_conformer_attention_temperature/tuning_search_space.json" - }, - "librispeech_conformer_gelu": { - "max_steps": 80000, - "dataset": "librispeech", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/librispeech_conformer_gelu/tuning_search_space.json" - }, - "librispeech_conformer_layernorm": { - "max_steps": 80000, - "dataset": "librispeech", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/librispeech_conformer_layernorm/tuning_search_space.json" - } - -} \ No newline at end of file