From 491505f0c72248824a26926df05e22f4c764ecbe Mon Sep 17 00:00:00 2001
From: Setepenre
Date: Thu, 21 Nov 2024 13:15:47 -0500
Subject: [PATCH] Multirun system (#308)

* ROCm changes

* Update ping

* -

* Cleanup the rocm script

* use rocm branch

* -

* New multirun system

* multinode tweaks

* make sure system config is applied before running

* Update matrix run

* Tweaks

---------

Co-authored-by: Your Name
---
 .../llm/recipes/full_finetune_distributed.py |  1 -
 config/base.yaml                             |  2 +-
 config/examples/system.yaml                  | 30 +++++++++
 config/scaling.yaml                          |  2 +
 milabench/_version.py                        |  7 ++-
 milabench/alt_async.py                       |  2 +
 milabench/cli/run.py                         | 42 +++++++++----
 milabench/commands/__init__.py               |  8 ++-
 milabench/compare.py                         |  2 +
 milabench/config.py                          | 10 ++-
 milabench/remote.py                          |  4 +-
 milabench/sizer.py                           | 25 +++++---
 milabench/system.py                          | 63 ++++++++++++++++++-
 scripts/article/run_rocm.sh                  | 25 +++++++-
 tests/test_system_matrix.py                  | 40 ++++++++++++
 15 files changed, 226 insertions(+), 37 deletions(-)
 create mode 100644 tests/test_system_matrix.py

diff --git a/benchmarks/llm/recipes/full_finetune_distributed.py b/benchmarks/llm/recipes/full_finetune_distributed.py
index 19556ec71..f8d58e2f4 100755
--- a/benchmarks/llm/recipes/full_finetune_distributed.py
+++ b/benchmarks/llm/recipes/full_finetune_distributed.py
@@ -100,7 +100,6 @@ class FullFinetuneRecipeDistributed(FTRecipeInterface):
     """

     def __init__(self, cfg: DictConfig) -> None:
-        import os
         self._device = acc.fetch_device(int(os.getenv("LOCAL_RANK", "0")))
         self._dtype = utils.get_dtype(cfg.dtype, device=self._device)

diff --git a/config/base.yaml b/config/base.yaml
index b9b104d75..38dfc4d38 100644
--- a/config/base.yaml
+++ b/config/base.yaml
@@ -208,7 +208,7 @@ resnet50-noio:
   inherits: _torchvision

   voir:
     options:
-      stop: 1000
+      stop: 500
       interval: "1s"

   tags:
diff --git a/config/examples/system.yaml b/config/examples/system.yaml
index 7b84c48d1..78cf39571 100644
--- a/config/examples/system.yaml
+++ b/config/examples/system.yaml
@@ -26,3 +26,33 @@ system:
       ip: 192.168.11.13
       main: false
       user: username
+
+
+
+
+multirun:
+  runs:
+    # Force batch size to populate the sizing model
+    - name: "bs{sizer.batch_size}"
+      matrix:
+        sizer.auto: 1
+        sizer.batch_size: [1, 2, 4, 8, 16, 32, 64, 128]
+        sizer.save: ["scaling.yaml"]
+
+    # Matrix run
+    - name: "c{sizer.capacity}_m{sizer.multiple}_w{cpu.n_workers}"
+      matrix:
+        cpu.auto: 1
+        cpu.n_workers: [2, 4, 8, 16, 32]
+        sizer.auto: 1
+        sizer.capacity: [4Go, 8Go, 16Go, 32Go, 64Go, All]
+        sizer.multiple: 8
+        sizer.save: ["scaling.yaml"]
+
+    # Auto run
+    - name: "auto"
+      matrix:
+        cpu.auto: 1
+        sizer.auto: 1
+        sizer.multiple: 8
+        sizer.save: ["scaling.yaml"]
diff --git a/config/scaling.yaml b/config/scaling.yaml
index d9d3dbf9e..00a37bd8e 100644
--- a/config/scaling.yaml
+++ b/config/scaling.yaml
@@ -286,7 +286,9 @@ lightning-gpus:
     112: 16776.25 MiB
     128: 15858 MiB
     240: 28942.25 MiB
+    256: 77822 MiB
     504: 54100.25 MiB
+    616: 93571 MiB
     624: 65386.25 MiB
   optimized: 16
 llama: {}
diff --git a/milabench/_version.py b/milabench/_version.py
index b07d8b5e8..6f09fef11 100644
--- a/milabench/_version.py
+++ b/milabench/_version.py
@@ -1,5 +1,6 @@
 """This file is generated, do not modify"""

-__tag__ = "v1.0.0_RC1-13-gde92a7e"
-__commit__ = "de92a7ea9dea1da24e8105e4566d5e6daef8464c"
-__date__ = "2024-10-03 15:48:10 +0000"
+__tag__ = "v1.0.0_RC1-18-g784b38e"
+__commit__ = "784b38e77b90116047e3de893c22c2f7d3225179"
+__date__ = "2024-10-18 15:58:46 +0000"
+
diff --git a/milabench/alt_async.py b/milabench/alt_async.py
index 8608196d3..6fc9f64c8 100644
--- a/milabench/alt_async.py
+++ b/milabench/alt_async.py
@@ -190,6 +190,8 @@ def run(argv, setsid=None, process_accumulator=None, info={}, **kwargs):
             destroy(*mx.processes)

     yield entry
+    # mx.close()
+

 def proceed(coro):
     loop = FeedbackEventLoop()
diff --git a/milabench/cli/run.py b/milabench/cli/run.py
index f5e75b702..f04427af1 100644
--- a/milabench/cli/run.py
+++ b/milabench/cli/run.py
@@ -23,6 +23,7 @@
 from ..report import make_report
 from ..sizer import MemoryUsageExtractor
 from ..summary import make_summary
+from ..system import multirun, apply_system, SizerOptions, option


 # fmt: off
@@ -72,12 +73,7 @@ def _fetch_arch(mp):
         return None


-@tooled
-def cli_run(args=None):
-    """Run the benchmarks."""
-    if args is None:
-        args = arguments()
-
+def run(mp, args, name):
     layers = validation_names(args.validations)

     dash_class = {
@@ -85,13 +81,7 @@
         "long": LongDashFormatter,
         "no": None,
     }.get(args.dash, None)
-
-    mp = get_multipack(run_name=args.run_name)
-    arch = _fetch_arch(mp)
-
-    # Initialize the backend here so we can retrieve GPU stats
-    init_arch(arch)
-
+
     success = run_with_loggers(
         mp.do_run(repeat=args.repeat),
         loggers=[
@@ -136,3 +126,29 @@
     )

     return success
+
+
+@tooled
+def cli_run(args=None):
+    """Run the benchmarks."""
+    if args is None:
+        args = arguments()
+
+    # Load the configuration and system
+    mp = get_multipack(run_name=args.run_name)
+    arch = _fetch_arch(mp)
+
+    # Initialize the backend here so we can retrieve GPU stats
+    init_arch(arch)
+
+    success = 0
+    for name, conf in multirun():
+        run_name = name or args.run_name
+
+        # Note that this function overrides the system config
+        mp = get_multipack(run_name=run_name)
+
+        with apply_system(conf):
+            success += run(mp, args, run_name)
+
+    return success
diff --git a/milabench/commands/__init__.py b/milabench/commands/__init__.py
index e97ac4e58..4a8f1e90a 100644
--- a/milabench/commands/__init__.py
+++ b/milabench/commands/__init__.py
@@ -451,6 +451,11 @@ def _find_node_config(self) -> Dict:
         return {}

     def is_local(self):
+        local = self._is_local()
+        print("is_local", self.host, local)
+        return local
+
+    def _is_local(self):
         localnode = self.pack.config["system"]["self"]

         if localnode is not None:
@@ -581,7 +586,7 @@ def node_address(node):
     """Favour Hostname as it is the most consistent name across machines"""
     host = node.get("hostname")
     ip = node.get("ip")
-    return host or ip
+    return ip or host


 class ForeachNode(ListCommand):
@@ -637,6 +642,7 @@ def executors(self):
             **self.options
         )

+        print(rank, node, node_address(node))
         worker = SSHCommand(
             host=node_address(node),
             user=node["user"],
diff --git a/milabench/compare.py b/milabench/compare.py
index d4d6299ee..32f95c64c 100644
--- a/milabench/compare.py
+++ b/milabench/compare.py
@@ -26,6 +26,7 @@ def fetch_runs(folder, filter):
     runs = []
     ignored = 0
+
     for run in os.listdir(folder):
         if run.startswith("install") or run.startswith("prepare"):
             continue
@@ -43,6 +44,7 @@ def fetch_runs(folder, filter):
             date = retrieve_datetime_from_name(date)
         else:
             name = run
+            date = None

         if date is None:
             date = datetime.fromtimestamp(os.path.getmtime(pth))
diff --git a/milabench/config.py b/milabench/config.py
index 039a85cc4..9a2d519c9 100644
--- a/milabench/config.py
+++ b/milabench/config.py
@@ -100,11 +100,15 @@ def combine_args(args, kwargs):
         yield kwargs
     else:
         key, values = args.popitem()
-        for value in values:
-            kwargs[key] = value
+
+        try:
+            for value in values:
+                kwargs[key] = value
+                yield from combine_args(deepcopy(args), kwargs)
+        except:
+            kwargs[key] = values
             yield from combine_args(deepcopy(args), kwargs)

-
 def expand_matrix(name, bench_config):
     if "matrix" not in bench_config:
         return [(name, bench_config)]
diff --git a/milabench/remote.py b/milabench/remote.py
index cbe9696b2..c92166fdd 100644
--- a/milabench/remote.py
+++ b/milabench/remote.py
@@ -124,7 +124,6 @@ def milabench_remote_setup_plan(pack, setup_for="worker") -> SequenceCommand:
     nodes = pack.config["system"]["nodes"]

     copy = []
-    node_packs = []

     copy_source = copy_folder(pack, INSTALL_FOLDER, setup_for)

@@ -132,7 +131,8 @@ def milabench_remote_setup_plan(pack, setup_for="worker") -> SequenceCommand:
     for i, node in enumerate(nodes):
         if should_run_for(node, setup_for):
-            install.append(pip_install_milabench(node_packs[i], node, INSTALL_FOLDER))
+            node_pack = worker_pack(pack, node)
+            install.append(pip_install_milabench(node_pack, node, INSTALL_FOLDER))

     return SequenceCommand(
         copy_source,
diff --git a/milabench/sizer.py b/milabench/sizer.py
index 75002edb3..00d6d2b67 100644
--- a/milabench/sizer.py
+++ b/milabench/sizer.py
@@ -53,8 +53,7 @@ def to_octet(value: str) -> float:
 class Sizer:
     """Automatically scale the batch size to match GPU spec"""

-    def __init__(self, options=SizerOptions(), scaling_config=None):
-        self.options = options
+    def __init__(self, scaling_config=None):
         self.path = scaling_config

         if scaling_config is None:
@@ -62,6 +61,10 @@ def __init__(self, options=SizerOptions(), scaling_config=None):
         with open(scaling_config, "r") as sconf:
             self.scaling_config = yaml.safe_load(sconf)

+    @property
+    def options(self):
+        return SizerOptions()
+
     def benchscaling(self, benchmark):
         # key
@@ -165,6 +168,10 @@ def find_batch_size(self, benchmark, event):
         return -1

     def argv(self, benchmark, capacity, argv):
+        newargv = self._argv(benchmark, capacity, argv)
+        return newargv
+
+    def _argv(self, benchmark, capacity, argv):
         """Find the batch size and override it with a new value"""

         config = self.benchscaling(benchmark)
@@ -214,11 +221,12 @@

 def batch_sizer() -> Sizer:
-    sizer = sizer_global.get()
-    if sizer is None:
-        sizer_global.set(Sizer())
-        return batch_sizer()
-    return sizer
+    return Sizer()
+    # sizer = sizer_global.get()
+    # if sizer is None:
+    #     sizer_global.set(Sizer())
+    #     return batch_sizer()
+    # return sizer


 def get_batch_size(config, start_event):
@@ -242,8 +250,9 @@ class MemoryUsageExtractor(ValidationLayer):
     """Extract max memory usage per benchmark to populate the memory model"""

     def __init__(self):
+
+        self.filepath = option("sizer.save", str, None)
         sizer = batch_sizer()
-        self.filepath = sizer.options.save
         self.memory = deepcopy(sizer.scaling_config)
         self.scaling = None
         self.benchname = None
diff --git a/milabench/system.py b/milabench/system.py
index 2d5a6ca8e..9aa499750 100644
--- a/milabench/system.py
+++ b/milabench/system.py
@@ -1,4 +1,5 @@
 import contextvars
+from copy import deepcopy
 import ipaddress
 import os
 import socket
@@ -15,7 +16,7 @@
 from .merge import merge

 system_global = contextvars.ContextVar("system", default=None)
-
+multirun_global = contextvars.ContextVar("multirun", default=None)

 def get_gpu_capacity(strict=False):
     try:
@@ -79,6 +80,60 @@ def as_environment_variable(name):
     return "MILABENCH_" + "_".join(map(str.upper, frags))


+def multirun():
+    multirun = multirun_global.get()
+
+    if multirun is None or len(multirun) == 0:
+        yield None, dict()
+        return
+
+    runs = multirun.get("runs", dict())
+
+    from .config import combine_args
+    import time
+    from types import SimpleNamespace
+
+    def unflatten(dct):
+        result = {}
+        for k, v in dct.items():
+            l = result
+            frags = k.split(".")
+            for frag in frags[:-1]:
+                l = l.setdefault(frag, SimpleNamespace())
+            setattr(l, frags[-1], v)
+
+        return result
+
+    for run_matrix in runs:
+        arguments = run_matrix["matrix"]
+
+        for run in combine_args(arguments, dict()):
+            template_name = run_matrix["name"]
+
+            ctx = unflatten(run)
+            ctx['time'] = int(time.time())
+            run_name = template_name.format(**ctx)
+
+            yield run_name, run
+
+
+@contextmanager
+def apply_system(config: dict):
+    system = system_global.get()
+    old = deepcopy(system)
+
+    for k, v in config.items():
+        frags = k.split(".")
+
+        lookup = system.setdefault("options", {})
+        for f in frags[:-1]:
+            lookup = lookup.setdefault(f, {})
+        lookup[frags[-1]] = v
+
+
+    yield
+    system_global.set(old)
+
+
 def option(name, etype, default=None):
     options = dict()
     system = system_global.get()
@@ -401,11 +456,12 @@ def gethostname(host):
 def resolve_hostname(ip):
     try:
         hostname, _, iplist = socket.gethostbyaddr(ip)
-
         for ip in iplist:
             if is_loopback(ip):
                 return hostname, True

+        # FIXME
+        return socket.gethostname(), hostname.startswith(socket.gethostname())
         return hostname, hostname == socket.gethostname()

@@ -465,6 +521,9 @@
     config = merge(defaults, config)

     system = config.get("system", {})
+    multirun = config.get("multirun", {})
+
+    multirun_global.set(multirun)
     system_global.set(system)

     # capacity is only required if batch resizer is enabled
diff --git a/scripts/article/run_rocm.sh b/scripts/article/run_rocm.sh
index fbb9da830..0fc2bf16d 100644
--- a/scripts/article/run_rocm.sh
+++ b/scripts/article/run_rocm.sh
@@ -11,6 +11,7 @@ export ROCM_PATH="/opt/rocm"
 export MILABENCH_BASE="$MILABENCH_WORDIR/results"
 export MILABENCH_VENV="$MILABENCH_WORDIR/env"
 export BENCHMARK_VENV="$MILABENCH_WORDIR/results/venv/torch"
+export MILABENCH_SIZER_SAVE="$MILABENCH_WORDIR/scaling.yaml"

 if [ -z "${MILABENCH_SOURCE}" ]; then
     export MILABENCH_CONFIG="$MILABENCH_WORDIR/milabench/config/standard.yaml"
@@ -24,6 +25,17 @@
 export TORCH_ROCM_ARCH_LIST="$GPU"
 export ROCM_TARGETS="$GPU"
 export PYTORCH_ROCM_ARCH="$GPU"
+if [ -z "${MILABENCH_SOURCE}" ]; then
+    export MILABENCH_CONFIG="$MILABENCH_WORDIR/milabench/config/standard.yaml"
+else
+    export MILABENCH_CONFIG="$MILABENCH_SOURCE/config/standard.yaml"
+fi
+
+
+export GPU="$(/opt/rocm/lib/llvm/bin/amdgpu-arch | head -n 1)"
+export TORCH_ROCM_ARCH_LIST="$GPU"
+export ROCM_TARGETS="$GPU"
+export PYTORCH_ROCM_ARCH="$GPU"

 ARGS="$@"
@@ -75,7 +87,6 @@ install_prepare() {
     # https://github.com/ROCm/jax/releases/tag/rocm-jaxlib-v0.4.30
     pip install https://github.com/ROCm/jax/releases/download/rocm-jaxlib-v0.4.30/jaxlib-0.4.30+rocm611-cp310-cp310-manylinux2014_x86_64.whl
-    pip install https://github.com/ROCm/jax/archive/refs/tags/rocm-jaxlib-v0.4.30.tar.gz

     pip uninstall torch_cluster torch_scatter torch_sparse -y
     FORCE_ONLY_CUDA=1 pip install -U -v --use-pep517 --no-build-isolation git+https://github.com/rusty1s/pytorch_cluster.git
@@ -111,12 +123,19 @@
 else
     . $MILABENCH_WORDIR/env/bin/activate
 fi

+(
+    . $BENCHMARK_VENV/bin/activate
+    pip install xformers --index-url https://download.pytorch.org/whl/rocm6.1
+)
+
+# milabench install $ARGS --system $MILABENCH_WORDIR/system.yaml

-milabench prepare $ARGS
+# milabench prepare $ARGS --system $MILABENCH_WORDIR/system.yaml
+

 #
 # Run the benchmarks
-milabench run $ARGS
+milabench run $ARGS --system $MILABENCH_WORDIR/system.yaml
+

 #
 # Display report
diff --git a/tests/test_system_matrix.py b/tests/test_system_matrix.py
new file mode 100644
index 000000000..ed5378815
--- /dev/null
+++ b/tests/test_system_matrix.py
@@ -0,0 +1,40 @@
+
+
+
+
+from milabench.system import multirun, build_system_config, enable_offline, option, apply_system, SizerOptions
+
+from milabench.testing import official_config
+
+
+def test_system_matrix():
+    with enable_offline(True):
+        sys = build_system_config(official_config("examples/system"))
+
+        n = 0
+        for name, conf in multirun():
+            print(name, conf)
+            n += 1
+
+        assert n == 39
+
+
+def test_apply_system_matrix():
+    with enable_offline(True):
+        sys = build_system_config(official_config("examples/system"))
+
+        for name, conf in multirun():
+            with apply_system(conf):
+
+                # Apply system worked and changed the config
+                for k, v in conf.items():
+                    assert option(k, lambda x: x) == v
+
+                assert SizerOptions().save == option("sizer.save", lambda x: x)
+
+
+
+if __name__ == "__main__":
+    test_apply_system_matrix()
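
For reference, below is a minimal standalone sketch of how one `multirun.runs` entry from config/examples/system.yaml expands into named run configurations. It mirrors the combine_args (milabench/config.py) and unflatten (milabench/system.py) logic added by this patch, but it is an illustration only, not part of the patch: the explicit isinstance() check stands in for the patch's bare try/except, and the matrix values are abbreviated.

    # Standalone sketch (not part of the patch): expanding a multirun matrix.
    import time
    from copy import deepcopy
    from types import SimpleNamespace


    def combine_args(args, kwargs):
        # Yield one flat {option: value} dict per combination of list-valued
        # options; scalar options are copied through unchanged.
        if len(args) == 0:
            yield dict(kwargs)
        else:
            args = dict(args)
            key, values = args.popitem()
            if isinstance(values, list):
                for value in values:
                    kwargs[key] = value
                    yield from combine_args(deepcopy(args), kwargs)
            else:
                kwargs[key] = values
                yield from combine_args(deepcopy(args), kwargs)


    def unflatten(dct):
        # Turn "sizer.batch_size" into ctx["sizer"].batch_size so that
        # "bs{sizer.batch_size}".format(**ctx) can resolve the dotted name.
        result = {}
        for key, value in dct.items():
            node = result
            frags = key.split(".")
            for frag in frags[:-1]:
                node = node.setdefault(frag, SimpleNamespace())
            setattr(node, frags[-1], value)
        return result


    run_matrix = {
        "name": "bs{sizer.batch_size}",
        "matrix": {"sizer.auto": 1, "sizer.batch_size": [1, 2, 4]},
    }

    for conf in combine_args(dict(run_matrix["matrix"]), dict()):
        ctx = unflatten(conf)
        ctx["time"] = int(time.time())
        print(run_matrix["name"].format(**ctx), conf)
    # bs1 {'sizer.batch_size': 1, 'sizer.auto': 1}
    # bs2 {'sizer.batch_size': 2, 'sizer.auto': 1}
    # bs4 {'sizer.batch_size': 4, 'sizer.auto': 1}

Each yielded dict is what cli_run passes to apply_system(), which writes the dotted keys under system["options"]; option() then reads them back from there while that run executes.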