Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Prize qualification logs #657

Merged
merged 5 commits into from
Feb 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
9 changes: 9 additions & 0 deletions GETTING_STARTED.md
Original file line number Diff line number Diff line change
Expand Up @@ -381,4 +381,13 @@ python score_submissions.py --submission_directory <directory_with_submissions>

We provide the scores and performance profiles for the [paper baseline algorithms](/reference_algorithms/paper_baselines/) in the "Baseline Results" section in [Benchmarking Neural Network Training Algorithms](https://arxiv.org/abs/2306.07179).

## Package Submission for Self-Reporting
To prepare your submission for self-reporting, run:

```
python3 package_logs.py --experiment_dir <experiment_dir> --destination_dir <destination_dir>
```

The destination directory will contain the logs, packaged into the studies and trials required for self-reporting.

**Good Luck!**
2 changes: 2 additions & 0 deletions prize_qualification_baselines/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

This directory contains the baseline(s) that submissions must beat to qualify for prizes, see the [Scoring Section](/COMPETITION_RULES.md#scoring) of the competition rules. For each ruleset there are 2 baselines (`*_target_setting.py` and `*_full_budget.py`). A submission must beat both baselines to be eligible for prizes.

The experiment logs with training metrics are in `prize_qualification_baselines/logs`.

## Externally Tuned Ruleset

### JAX
Expand Down

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
{
"logtostderr": false,
"alsologtostderr": false,
"log_dir": "",
"v": 0,
"verbosity": 0,
"logger_levels": {},
"stderrthreshold": "fatal",
"showprefixforinfo": true,
"run_with_pdb": false,
"pdb_post_mortem": false,
"pdb": false,
"run_with_profiling": false,
"profile_file": null,
"use_cprofile_for_profiling": true,
"only_check_args": false,
"op_conversion_fallback_to_while_loop": true,
"runtime_oom_exit": true,
"hbm_oom_exit": true,
"delta_threshold": 0.5,
"tt_check_filter": false,
"tt_single_core_summaries": false,
"test_srcdir": "",
"test_tmpdir": "/tmp/absl_testing",
"test_random_seed": 301,
"test_randomize_ordering_seed": "",
"xml_output_file": "",
"submission_path": "prize_qualification_baselines/external_tuning/jax_nadamw_full_budget.py",
"workload": "criteo1tb",
"tuning_ruleset": "external",
"tuning_search_space": "prize_qualification_baselines/external_tuning/tuning_search_space.json",
"num_tuning_trials": 5,
"data_dir": "/data/criteo1tb",
"imagenet_v2_data_dir": "~/data",
"librispeech_tokenizer_vocab_path": "",
"framework": "jax",
"torch_compile": true,
"experiment_dir": "/experiment_runs",
"experiment_name": "prize_qualification/study_0",
"save_intermediate_checkpoints": true,
"resume_last_run": null,
"append_timestamp": false,
"use_wandb": false,
"profile": false,
"max_global_steps": 10666,
"overwrite": true,
"save_checkpoints": false,
"hparam_start_index": null,
"hparam_end_index": null,
"rng_seed": 2735018057,
"set_pytorch_max_split_size": false,
"?": false,
"help": false,
"helpshort": false,
"helpfull": false,
"helpxml": false,
"chex_n_cpu_devices": 1,
"chex_assert_multiple_cpu_devices": false,
"chex_skip_pmap_variant_if_single_device": true
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
{
"dropout_rate": 0.0,
"label_smoothing": 0.1,
"learning_rate": 0.001308209823469072,
"one_minus_beta1": 0.02686663061,
"beta2": 0.9981232922116359,
"weight_decay": 0.16375311233774334,
"warmup_factor": 0.1
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
{
"workload.embed_dim": 128,
"workload.eval_batch_size": 524288,
"workload.eval_period_time_sec": 1200,
"workload.max_allowed_runtime_sec": 7703,
"workload.num_dense_features": 13,
"workload.num_eval_train_examples": 83361792,
"workload.num_test_examples": 95000000,
"workload.num_train_examples": 4195197692,
"workload.num_validation_examples": 83274637,
"workload.step_hint": 10666,
"workload.target_metric_name": "loss",
"workload.test_target_value": 0.126041,
"workload.train_mean": 0.126041,
"workload.train_stddev": 0.126041,
"workload.use_layer_norm": false,
"workload.use_resnet": false,
"workload.validation_target_value": 0.123735,
"workload.vocab_size": 4194304,
"cpu.util.avg_percent_since_last": 2.2,
"cpu.freq.current": 2000.144,
"mem.total": 253568368640,
"mem.available": 241405480960,
"mem.used": 7452110848,
"mem.percent_used": 4.8,
"mem.read_bytes_since_boot": 289844948763648,
"mem.write_bytes_since_boot": 310064559104,
"net.bytes_sent_since_boot": 24537,
"net.bytes_recv_since_boot": 484862,
"gpu.count": 8,
"gpu.0.compute.util": 0.68,
"gpu.0.mem.util": 0.75921630859375,
"gpu.0.mem.total": 16384.0,
"gpu.0.mem.used": 12439.0,
"gpu.0.mem.free": 3711.0,
"gpu.0.temp.current": 37.0,
"gpu.1.compute.util": 0.0,
"gpu.1.mem.util": 0.75836181640625,
"gpu.1.mem.total": 16384.0,
"gpu.1.mem.used": 12425.0,
"gpu.1.mem.free": 3725.0,
"gpu.1.temp.current": 40.0,
"gpu.2.compute.util": 0.0,
"gpu.2.mem.util": 0.75836181640625,
"gpu.2.mem.total": 16384.0,
"gpu.2.mem.used": 12425.0,
"gpu.2.mem.free": 3725.0,
"gpu.2.temp.current": 39.0,
"gpu.3.compute.util": 0.0,
"gpu.3.mem.util": 0.75836181640625,
"gpu.3.mem.total": 16384.0,
"gpu.3.mem.used": 12425.0,
"gpu.3.mem.free": 3725.0,
"gpu.3.temp.current": 40.0,
"gpu.4.compute.util": 0.0,
"gpu.4.mem.util": 0.75836181640625,
"gpu.4.mem.total": 16384.0,
"gpu.4.mem.used": 12425.0,
"gpu.4.mem.free": 3725.0,
"gpu.4.temp.current": 39.0,
"gpu.5.compute.util": 0.0,
"gpu.5.mem.util": 0.75836181640625,
"gpu.5.mem.total": 16384.0,
"gpu.5.mem.used": 12425.0,
"gpu.5.mem.free": 3725.0,
"gpu.5.temp.current": 41.0,
"gpu.6.compute.util": 0.0,
"gpu.6.mem.util": 0.75836181640625,
"gpu.6.mem.total": 16384.0,
"gpu.6.mem.used": 12425.0,
"gpu.6.mem.free": 3725.0,
"gpu.6.temp.current": 40.0,
"gpu.7.compute.util": 0.31,
"gpu.7.mem.util": 0.75836181640625,
"gpu.7.mem.total": 16384.0,
"gpu.7.mem.used": 12425.0,
"gpu.7.mem.free": 3725.0,
"gpu.7.temp.current": 41.0,
"gpu.avg.compute.util": 0.12375,
"gpu.avg.mem.util": 0.7584686279296875,
"gpu.avg.mem.total": 16384.0,
"gpu.avg.mem.used": 12426.75,
"gpu.avg.mem.free": 3723.25,
"gpu.avg.temp.current": 39.625,
"os_platform": "Linux-4.19.0-26-cloud-amd64-x86_64-with-glibc2.29",
"python_version": "3.8.10",
"python_compiler": "GCC 9.4.0",
"cpu_model_name": "Intel(R) Xeon(R) CPU @ 2.00GHz",
"cpu_count": 64,
"gpu_model_name": "Tesla V100-SXM2-16GB",
"gpu_count": 8,
"gpu_driver": "535.104.05",
"rng_seed": 2735018057
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
{
"logtostderr": false,
"alsologtostderr": false,
"log_dir": "",
"v": 0,
"verbosity": 0,
"logger_levels": {},
"stderrthreshold": "fatal",
"showprefixforinfo": true,
"run_with_pdb": false,
"pdb_post_mortem": false,
"pdb": false,
"run_with_profiling": false,
"profile_file": null,
"use_cprofile_for_profiling": true,
"only_check_args": false,
"op_conversion_fallback_to_while_loop": true,
"runtime_oom_exit": true,
"hbm_oom_exit": true,
"delta_threshold": 0.5,
"tt_check_filter": false,
"tt_single_core_summaries": false,
"test_srcdir": "",
"test_tmpdir": "/tmp/absl_testing",
"test_random_seed": 301,
"test_randomize_ordering_seed": "",
"xml_output_file": "",
"submission_path": "prize_qualification_baselines/external_tuning/jax_nadamw_full_budget.py",
"workload": "criteo1tb",
"tuning_ruleset": "external",
"tuning_search_space": "prize_qualification_baselines/external_tuning/tuning_search_space.json",
"num_tuning_trials": 5,
"data_dir": "/data/criteo1tb",
"imagenet_v2_data_dir": "~/data",
"librispeech_tokenizer_vocab_path": "",
"framework": "jax",
"torch_compile": true,
"experiment_dir": "/experiment_runs",
"experiment_name": "prize_qualification/study_0",
"save_intermediate_checkpoints": true,
"resume_last_run": null,
"append_timestamp": false,
"use_wandb": false,
"profile": false,
"max_global_steps": 10666,
"overwrite": true,
"save_checkpoints": false,
"hparam_start_index": null,
"hparam_end_index": null,
"rng_seed": 2735018057,
"set_pytorch_max_split_size": false,
"?": false,
"help": false,
"helpshort": false,
"helpfull": false,
"helpxml": false,
"chex_n_cpu_devices": 1,
"chex_assert_multiple_cpu_devices": false,
"chex_skip_pmap_variant_if_single_device": true
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
{
"dropout_rate": 0.0,
"label_smoothing": 0.2,
"learning_rate": 0.0008445074561975979,
"one_minus_beta1": 0.11042418465,
"beta2": 0.9978504782314613,
"weight_decay": 0.08135402759553023,
"warmup_factor": 0.05
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
{
"workload.embed_dim": 128,
"workload.eval_batch_size": 524288,
"workload.eval_period_time_sec": 1200,
"workload.max_allowed_runtime_sec": 7703,
"workload.num_dense_features": 13,
"workload.num_eval_train_examples": 83361792,
"workload.num_test_examples": 95000000,
"workload.num_train_examples": 4195197692,
"workload.num_validation_examples": 83274637,
"workload.step_hint": 10666,
"workload.target_metric_name": "loss",
"workload.test_target_value": 0.126041,
"workload.train_mean": 0.126041,
"workload.train_stddev": 0.126041,
"workload.use_layer_norm": false,
"workload.use_resnet": false,
"workload.validation_target_value": 0.123735,
"workload.vocab_size": 4194304,
"cpu.util.avg_percent_since_last": 7.4,
"cpu.freq.current": 2000.144,
"mem.total": 253568368640,
"mem.available": 127282569216,
"mem.used": 120970350592,
"mem.percent_used": 49.8,
"mem.read_bytes_since_boot": 290720142218240,
"mem.write_bytes_since_boot": 312102560768,
"net.bytes_sent_since_boot": 698385,
"net.bytes_recv_since_boot": 1159270,
"gpu.count": 8,
"gpu.0.compute.util": 0.0,
"gpu.0.mem.util": 0.79022216796875,
"gpu.0.mem.total": 16384.0,
"gpu.0.mem.used": 12947.0,
"gpu.0.mem.free": 3203.0,
"gpu.0.temp.current": 38.0,
"gpu.1.compute.util": 0.0,
"gpu.1.mem.util": 0.78741455078125,
"gpu.1.mem.total": 16384.0,
"gpu.1.mem.used": 12901.0,
"gpu.1.mem.free": 3249.0,
"gpu.1.temp.current": 40.0,
"gpu.2.compute.util": 0.0,
"gpu.2.mem.util": 0.79107666015625,
"gpu.2.mem.total": 16384.0,
"gpu.2.mem.used": 12961.0,
"gpu.2.mem.free": 3189.0,
"gpu.2.temp.current": 40.0,
"gpu.3.compute.util": 0.0,
"gpu.3.mem.util": 0.79107666015625,
"gpu.3.mem.total": 16384.0,
"gpu.3.mem.used": 12961.0,
"gpu.3.mem.free": 3189.0,
"gpu.3.temp.current": 41.0,
"gpu.4.compute.util": 0.0,
"gpu.4.mem.util": 0.79119873046875,
"gpu.4.mem.total": 16384.0,
"gpu.4.mem.used": 12963.0,
"gpu.4.mem.free": 3187.0,
"gpu.4.temp.current": 39.0,
"gpu.5.compute.util": 0.0,
"gpu.5.mem.util": 0.78741455078125,
"gpu.5.mem.total": 16384.0,
"gpu.5.mem.used": 12901.0,
"gpu.5.mem.free": 3249.0,
"gpu.5.temp.current": 42.0,
"gpu.6.compute.util": 0.0,
"gpu.6.mem.util": 0.79107666015625,
"gpu.6.mem.total": 16384.0,
"gpu.6.mem.used": 12961.0,
"gpu.6.mem.free": 3189.0,
"gpu.6.temp.current": 40.0,
"gpu.7.compute.util": 0.0,
"gpu.7.mem.util": 0.79107666015625,
"gpu.7.mem.total": 16384.0,
"gpu.7.mem.used": 12961.0,
"gpu.7.mem.free": 3189.0,
"gpu.7.temp.current": 41.0,
"gpu.avg.compute.util": 0.0,
"gpu.avg.mem.util": 0.790069580078125,
"gpu.avg.mem.total": 16384.0,
"gpu.avg.mem.used": 12944.5,
"gpu.avg.mem.free": 3205.5,
"gpu.avg.temp.current": 40.125,
"os_platform": "Linux-4.19.0-26-cloud-amd64-x86_64-with-glibc2.29",
"python_version": "3.8.10",
"python_compiler": "GCC 9.4.0",
"cpu_model_name": "Intel(R) Xeon(R) CPU @ 2.00GHz",
"cpu_count": 64,
"gpu_model_name": "Tesla V100-SXM2-16GB",
"gpu_count": 8,
"gpu_driver": "535.104.05",
"rng_seed": 2735018057
}
Loading
Loading