diff --git a/config/examples/system.yaml b/config/examples/system.yaml index 7b84c48d1..0f3f13ea6 100644 --- a/config/examples/system.yaml +++ b/config/examples/system.yaml @@ -26,3 +26,33 @@ system: ip: 192.168.11.13 main: false user: username + + + + +multirun: + runs: + # Force batch size to populate the sizing model + - name: "bs{sizer.batch_size}" + matrix: + sizer.auto: 1 + sizer.batch_size: [1, 2, 4, 8, 16, 32, 64, 128] + sizer.save: "scaling.yaml" + + # Matrix run + - name: "c{sizer.capacity}_m{sizer.multiple}_w{cpu.n_workers}" + matrix: + cpu.auto: 1 + cpu.n_workers: [2, 4, 8, 16, 32] + sizer.auto: 1 + sizer.capacity: [4Go, 8Go, 16Go, 32Go, 64Go, All] + sizer.multiple: 8 + sizer.save: "scaling.yaml" + + # Auto run + - name: "auto" + matrix: + cpu.auto: 1 + sizer.auto: 1 + sizer.multiple: 8 + sizer.save: "scaling.yaml" diff --git a/milabench/_version.py b/milabench/_version.py index b07d8b5e8..3973a23a5 100644 --- a/milabench/_version.py +++ b/milabench/_version.py @@ -1,5 +1,5 @@ """This file is generated, do not modify""" -__tag__ = "v1.0.0_RC1-13-gde92a7e" -__commit__ = "de92a7ea9dea1da24e8105e4566d5e6daef8464c" -__date__ = "2024-10-03 15:48:10 +0000" +__tag__ = "v1.0.0_RC1-17-gbaf5304" +__commit__ = "baf53044e78d0989600359e9496e9aae682bf640" +__date__ = "2024-10-10 16:12:31 +0000" diff --git a/milabench/cli/run.py b/milabench/cli/run.py index f5e75b702..ae67abed0 100644 --- a/milabench/cli/run.py +++ b/milabench/cli/run.py @@ -23,6 +23,7 @@ from ..report import make_report from ..sizer import MemoryUsageExtractor from ..summary import make_summary +from ..system import multirun, apply_system # fmt: off @@ -72,12 +73,7 @@ def _fetch_arch(mp): return None -@tooled -def cli_run(args=None): - """Run the benchmarks.""" - if args is None: - args = arguments() - +def run(mp, args, name): layers = validation_names(args.validations) dash_class = { @@ -85,13 +81,7 @@ def cli_run(args=None): "long": LongDashFormatter, "no": None, }.get(args.dash, None) - - mp = 
def combine_args(args, kwargs):
    """Yield every combination of the matrix ``args`` merged into ``kwargs``.

    Each key of ``args`` maps either to an iterable of candidate values or to
    a single scalar.  Strings and bytes are treated as scalars, so a value
    such as ``"scaling.yaml"`` contributes one combination instead of one per
    character.

    Args:
        args: mapping of option name to candidate values (or a scalar).
            It is not modified.
        kwargs: dict that receives the selected values.  The same dict
            object is updated in place and yielded for every combination, so
            callers that keep results must copy them.

    Yields:
        ``kwargs`` with one value chosen for every key of ``args``.
    """
    if len(args) == 0:
        yield kwargs
        return

    # Shallow copy: popitem() must not consume the caller's matrix, otherwise
    # the configuration can only be expanded once.
    remaining = dict(args)
    key, values = remaining.popitem()

    if isinstance(values, (str, bytes)):
        # Strings are iterable but are meant as a single value here.
        choices = [values]
    else:
        try:
            choices = iter(values)
        except TypeError:
            # Non-iterable scalar (int, float, bool, None, ...).
            choices = [values]

    # Only the iterability probe above is guarded: wrapping this loop in a
    # broad except would swallow GeneratorExit when the consumer stops early.
    for value in choices:
        kwargs[key] = value
        yield from combine_args(remaining, kwargs)
def multirun():
    """Yield ``(run_name, overrides)`` for every run of the multirun config.

    The configuration is read from ``multirun_global``.  When it is unset or
    empty a single ``(None, {})`` pair is yielded so callers perform exactly
    one run without overrides.

    Each entry of ``runs`` provides a ``name`` template and an optional
    ``matrix`` of dotted option names.  The matrix is expanded with
    ``combine_args`` and the name is rendered with ``str.format``; dotted
    keys are reachable as attributes (``"bs{sizer.batch_size}"``) and
    ``{time}`` is the current Unix timestamp in seconds.

    Yields:
        tuple of the rendered run name (or ``None``) and a dict of
        dotted-key overrides for that run.
    """
    import time
    from types import SimpleNamespace

    config = multirun_global.get()

    if not config:
        yield None, dict()
        # Without this return the generator resumes below and calls .get()
        # on a config that may be None.
        return

    runs = config.get("runs", dict())

    from .config import combine_args

    def unflatten(flat):
        """Turn ``{"a.b": v}`` into ``{"a": SimpleNamespace(b=v)}``."""
        root = SimpleNamespace()
        for dotted, value in flat.items():
            *parents, leaf = dotted.split(".")
            node = root
            for frag in parents:
                child = getattr(node, frag, None)
                if not isinstance(child, SimpleNamespace):
                    child = SimpleNamespace()
                    setattr(node, frag, child)
                node = child
            setattr(node, leaf, value)
        # Top level must be a dict so it can be splatted into str.format.
        return dict(vars(root))

    for run_matrix in runs:
        template_name = run_matrix["name"]
        arguments = run_matrix.get("matrix") or {}

        # combine_args pops keys from its input; hand it a copy so the global
        # configuration survives and multirun() can be iterated again.
        for run in combine_args(deepcopy(arguments), dict()):
            # combine_args reuses one dict for all combinations; copy it so a
            # consumer that collects the results sees distinct configurations.
            overrides = dict(run)

            ctx = unflatten(overrides)
            ctx['time'] = int(time.time())
            run_name = template_name.format(**ctx)

            yield run_name, overrides


@contextmanager
def apply_system(config: dict):
    """Temporarily apply dotted-key overrides to the current system config.

    Every ``"a.b.c": value`` entry of ``config`` is written to
    ``system["options"]["a"]["b"]["c"]`` on the dict held by
    ``system_global``.  On exit, including when the body raises, a deep copy
    taken before the overrides were applied is stored back into
    ``system_global``.

    Args:
        config: mapping of dotted option names to override values.
    """
    previous = system_global.get()
    snapshot = deepcopy(previous)

    system = previous
    if system is None:
        # No system loaded yet: publish an empty one so the overrides are
        # visible for the duration of the context.
        system = {}
        system_global.set(system)

    try:
        for dotted, value in config.items():
            frags = dotted.split(".")

            lookup = system.setdefault("options", {})
            for frag in frags[:-1]:
                lookup = lookup.setdefault(frag, {})
            lookup[frags[-1]] = value

        yield
    finally:
        # Restore even if the run failed, so later runs start from a clean
        # configuration.
        system_global.set(snapshot)
def test_system_matrix():
    """Count the runs produced by the example system file's multirun section."""
    with enable_offline(True):
        build_system_config(official_config("examples/system"))

        count = 0
        for count, (name, conf) in enumerate(multirun(), start=1):
            print(name, conf)

        assert count == 39


def test_apply_system_matrix():
    """Each override of a run must be readable through ``option`` while applied."""
    with enable_offline(True):
        build_system_config(official_config("examples/system"))

        for _, overrides in multirun():
            with apply_system(overrides):
                # Inside the context the system config must reflect the overrides
                for key in overrides:
                    assert option(key, lambda x: x) == overrides[key]


if __name__ == "__main__":
    test_apply_system_matrix()