From 491505f0c72248824a26926df05e22f4c764ecbe Mon Sep 17 00:00:00 2001
From: Setepenre
Date: Thu, 21 Nov 2024 13:15:47 -0500
Subject: [PATCH] Multirun system (#308)

* ROCm changes

* Update ping

* -

* Cleanup the rocm script

* use rocm branch

* -

* New multirun system

* multinode tweaks

* make sure system config is applied before running

* Update matrix run

* Tweaks

---------

Co-authored-by: Your Name
---
 .../llm/recipes/full_finetune_distributed.py |  1 -
 config/base.yaml                             |  2 +-
 config/examples/system.yaml                  | 30 +++++++++
 config/scaling.yaml                          |  2 +
 milabench/_version.py                        |  7 ++-
 milabench/alt_async.py                       |  2 +
 milabench/cli/run.py                         | 42 +++++++++----
 milabench/commands/__init__.py               |  8 ++-
 milabench/compare.py                         |  2 +
 milabench/config.py                          | 10 ++-
 milabench/remote.py                          |  4 +-
 milabench/sizer.py                           | 25 +++++---
 milabench/system.py                          | 63 ++++++++++++++++++-
 scripts/article/run_rocm.sh                  | 25 +++++++-
 tests/test_system_matrix.py                  | 40 ++++++++++++
 15 files changed, 226 insertions(+), 37 deletions(-)
 create mode 100644 tests/test_system_matrix.py

diff --git a/benchmarks/llm/recipes/full_finetune_distributed.py b/benchmarks/llm/recipes/full_finetune_distributed.py
index 19556ec71..f8d58e2f4 100755
--- a/benchmarks/llm/recipes/full_finetune_distributed.py
+++ b/benchmarks/llm/recipes/full_finetune_distributed.py
@@ -100,7 +100,6 @@ class FullFinetuneRecipeDistributed(FTRecipeInterface):
     """

     def __init__(self, cfg: DictConfig) -> None:
-        import os
         self._device = acc.fetch_device(int(os.getenv("LOCAL_RANK", "0")))
         self._dtype = utils.get_dtype(cfg.dtype, device=self._device)

diff --git a/config/base.yaml b/config/base.yaml
index b9b104d75..38dfc4d38 100644
--- a/config/base.yaml
+++ b/config/base.yaml
@@ -208,7 +208,7 @@ resnet50-noio:
   inherits: _torchvision

   voir:
     options:
-      stop: 1000
+      stop: 500
       interval: "1s"

   tags:
diff --git a/config/examples/system.yaml b/config/examples/system.yaml
index 7b84c48d1..78cf39571 100644
--- a/config/examples/system.yaml
+++ b/config/examples/system.yaml
@@ -26,3 +26,33 @@ system:
       ip: 192.168.11.13
       main: false
       user: username
+
+
+
+
+multirun:
+  runs:
+    # Force batch size to populate the sizing model
+    - name: "bs{sizer.batch_size}"
+      matrix:
+        sizer.auto: 1
+        sizer.batch_size: [1, 2, 4, 8, 16, 32, 64, 128]
+        sizer.save: ["scaling.yaml"]
+
+    # Matrix run
+    - name: "c{sizer.capacity}_m{sizer.multiple}_w{cpu.n_workers}"
+      matrix:
+        cpu.auto: 1
+        cpu.n_workers: [2, 4, 8, 16, 32]
+        sizer.auto: 1
+        sizer.capacity: [4Go, 8Go, 16Go, 32Go, 64Go, All]
+        sizer.multiple: 8
+        sizer.save: ["scaling.yaml"]
+
+    # Auto run
+    - name: "auto"
+      matrix:
+        cpu.auto: 1
+        sizer.auto: 1
+        sizer.multiple: 8
+        sizer.save: ["scaling.yaml"]
diff --git a/config/scaling.yaml b/config/scaling.yaml
index d9d3dbf9e..00a37bd8e 100644
--- a/config/scaling.yaml
+++ b/config/scaling.yaml
@@ -286,7 +286,9 @@ lightning-gpus:
     112: 16776.25 MiB
     128: 15858 MiB
     240: 28942.25 MiB
+    256: 77822 MiB
     504: 54100.25 MiB
+    616: 93571 MiB
     624: 65386.25 MiB
   optimized: 16
 llama: {}
diff --git a/milabench/_version.py b/milabench/_version.py
index b07d8b5e8..6f09fef11 100644
--- a/milabench/_version.py
+++ b/milabench/_version.py
@@ -1,5 +1,6 @@
 """This file is generated, do not modify"""

-__tag__ = "v1.0.0_RC1-13-gde92a7e"
-__commit__ = "de92a7ea9dea1da24e8105e4566d5e6daef8464c"
-__date__ = "2024-10-03 15:48:10 +0000"
+__tag__ = "v1.0.0_RC1-18-g784b38e"
+__commit__ = "784b38e77b90116047e3de893c22c2f7d3225179"
+__date__ = "2024-10-18 15:58:46 +0000"
+
diff --git a/milabench/alt_async.py b/milabench/alt_async.py
index 8608196d3..6fc9f64c8 100644
--- a/milabench/alt_async.py
+++ b/milabench/alt_async.py
@@ -190,6 +190,8 @@ def run(argv, setsid=None, process_accumulator=None, info={}, **kwargs):
             destroy(*mx.processes)

     yield entry
+    # mx.close()
+

 def proceed(coro):
     loop = FeedbackEventLoop()
diff --git a/milabench/cli/run.py b/milabench/cli/run.py
index f5e75b702..f04427af1 100644
--- a/milabench/cli/run.py
+++ b/milabench/cli/run.py
@@ -23,6 +23,7 @@
 from ..report import make_report
 from ..sizer import MemoryUsageExtractor
 from ..summary import make_summary
+from ..system import multirun, apply_system, SizerOptions, option


 # fmt: off
@@ -72,12 +73,7 @@ def _fetch_arch(mp):
         return None


-@tooled
-def cli_run(args=None):
-    """Run the benchmarks."""
-    if args is None:
-        args = arguments()
-
+def run(mp, args, name):
     layers = validation_names(args.validations)

     dash_class = {
@@ -85,13 +81,7 @@
         "long": LongDashFormatter,
         "no": None,
     }.get(args.dash, None)
-
-    mp = get_multipack(run_name=args.run_name)
-    arch = _fetch_arch(mp)
-
-    # Initialize the backend here so we can retrieve GPU stats
-    init_arch(arch)
-
+
     success = run_with_loggers(
         mp.do_run(repeat=args.repeat),
         loggers=[
@@ -136,3 +126,29 @@
     )

     return success
+
+
+@tooled
+def cli_run(args=None):
+    """Run the benchmarks."""
+    if args is None:
+        args = arguments()
+
+    # Load the configuration and system
+    mp = get_multipack(run_name=args.run_name)
+    arch = _fetch_arch(mp)
+
+    # Initialize the backend here so we can retrieve GPU stats
+    init_arch(arch)
+
+    success = 0
+    for name, conf in multirun():
+        run_name = name or args.run_name
+
+        # Note that this function overrides the system config
+        mp = get_multipack(run_name=run_name)
+
+        with apply_system(conf):
+            success += run(mp, args, run_name)
+
+    return success
diff --git a/milabench/commands/__init__.py b/milabench/commands/__init__.py
index e97ac4e58..4a8f1e90a 100644
--- a/milabench/commands/__init__.py
+++ b/milabench/commands/__init__.py
@@ -451,6 +451,11 @@ def _find_node_config(self) -> Dict:
         return {}

     def is_local(self):
+        local = self._is_local()
+        print("is_local", self.host, local)
+        return local
+
+    def _is_local(self):
         localnode = self.pack.config["system"]["self"]

         if localnode is not None:
@@ -581,7 +586,7 @@ def node_address(node):
     """Favour Hostname as it is the most consistent name across machines"""
     host = node.get("hostname")
     ip = node.get("ip")
-    return host or ip
+    return ip or host


 class ForeachNode(ListCommand):
@@ -637,6 +642,7 @@ def executors(self):
             **self.options
         )

+        print(rank, node, node_address(node))
         worker = SSHCommand(
             host=node_address(node),
             user=node["user"],
diff --git a/milabench/compare.py b/milabench/compare.py
index d4d6299ee..32f95c64c 100644
--- a/milabench/compare.py
+++ b/milabench/compare.py
@@ -26,6 +26,7 @@ def fetch_runs(folder, filter):
     runs = []
     ignored = 0
+
     for run in os.listdir(folder):
         if run.startswith("install") or run.startswith("prepare"):
             continue
@@ -43,6 +44,7 @@ def fetch_runs(folder, filter):
             date = retrieve_datetime_from_name(date)
         else:
             name = run
+            date = None

         if date is None:
             date = datetime.fromtimestamp(os.path.getmtime(pth))
diff --git a/milabench/config.py b/milabench/config.py
index 039a85cc4..9a2d519c9 100644
--- a/milabench/config.py
+++ b/milabench/config.py
@@ -100,11 +100,15 @@ def combine_args(args, kwargs):
         yield kwargs
     else:
         key, values = args.popitem()
-        for value in values:
-            kwargs[key] = value
+
+        try:
+            for value in values:
+                kwargs[key] = value
+                yield from combine_args(deepcopy(args), kwargs)
+        except:
+            kwargs[key] = values
             yield from combine_args(deepcopy(args), kwargs)

-
 def expand_matrix(name, bench_config):
     if "matrix" not in bench_config:
         return [(name, bench_config)]
diff --git a/milabench/remote.py b/milabench/remote.py
index cbe9696b2..c92166fdd 100644
--- a/milabench/remote.py
+++ b/milabench/remote.py
@@ -124,7 +124,6 @@ def milabench_remote_setup_plan(pack, setup_for="worker") -> SequenceCommand:
     nodes = pack.config["system"]["nodes"]

     copy = []
-    node_packs = []

     copy_source = copy_folder(pack, INSTALL_FOLDER, setup_for)

@@ -132,7 +131,8 @@ def milabench_remote_setup_plan(pack, setup_for="worker") -> SequenceCommand:
     for i, node in enumerate(nodes):
         if should_run_for(node, setup_for):
-            install.append(pip_install_milabench(node_packs[i], node, INSTALL_FOLDER))
+            node_pack = worker_pack(pack, node)
+            install.append(pip_install_milabench(node_pack, node, INSTALL_FOLDER))

     return SequenceCommand(
         copy_source,
diff --git a/milabench/sizer.py b/milabench/sizer.py
index 75002edb3..00d6d2b67 100644
--- a/milabench/sizer.py
+++ b/milabench/sizer.py
@@ -53,8 +53,7 @@ def to_octet(value: str) -> float:
 class Sizer:
     """Automatically scale the batch size to match GPU spec"""

-    def __init__(self, options=SizerOptions(), scaling_config=None):
-        self.options = options
+    def __init__(self, scaling_config=None):
         self.path = scaling_config

         if scaling_config is None:
@@ -62,6 +61,10 @@ def __init__(self, options=SizerOptions(), scaling_config=None):
         with open(scaling_config, "r") as sconf:
             self.scaling_config = yaml.safe_load(sconf)

+    @property
+    def options(self):
+        return SizerOptions()
+
     def benchscaling(self, benchmark):
         # key
@@ -165,6 +168,10 @@ def find_batch_size(self, benchmark, event):
         return -1

     def argv(self, benchmark, capacity, argv):
+        newargv = self._argv(benchmark, capacity, argv)
+        return newargv
+
+    def _argv(self, benchmark, capacity, argv):
         """Find the batch size and override it with a new value"""

         config = self.benchscaling(benchmark)
@@ -214,11 +221,12 @@

 def batch_sizer() -> Sizer:
-    sizer = sizer_global.get()
-    if sizer is None:
-        sizer_global.set(Sizer())
-        return batch_sizer()
-    return sizer
+    return Sizer()
+    # sizer = sizer_global.get()
+    # if sizer is None:
+    #     sizer_global.set(Sizer())
+    #     return batch_sizer()
+    # return sizer


 def get_batch_size(config, start_event):
@@ -242,8 +250,9 @@ class MemoryUsageExtractor(ValidationLayer):
     """Extract max memory usage per benchmark to populate the memory model"""

     def __init__(self):
+
+        self.filepath = option("sizer.save", str, None)
         sizer = batch_sizer()
-        self.filepath = sizer.options.save
         self.memory = deepcopy(sizer.scaling_config)
         self.scaling = None
         self.benchname = None
diff --git a/milabench/system.py b/milabench/system.py
index 2d5a6ca8e..9aa499750 100644
--- a/milabench/system.py
+++ b/milabench/system.py
@@ -1,4 +1,5 @@
 import contextvars
+from copy import deepcopy
 import ipaddress
 import os
 import socket
@@ -15,7 +16,7 @@
 from .merge import merge

 system_global = contextvars.ContextVar("system", default=None)
-
+multirun_global = contextvars.ContextVar("multirun", default=None)

 def get_gpu_capacity(strict=False):
     try:
@@ -79,6 +80,60 @@ def as_environment_variable(name):
     return "MILABENCH_" + "_".join(map(str.upper, frags))


+def multirun():
+    multirun = multirun_global.get()
+
+    if multirun is None or len(multirun) == 0:
+        yield None, dict()
+        return
+
+    runs = multirun.get("runs", dict())
+
+    from .config import combine_args
+    import time
+    from types import SimpleNamespace
+
+    def unflatten(dct):
+        result = {}
+        for k, v in dct.items():
+            l = result
+            frags = k.split(".")
+            for frag in frags[:-1]:
+                l = l.setdefault(frag, SimpleNamespace())
+            setattr(l, frags[-1], v)
+
+        return result
+
+    for run_matrix in runs:
+        arguments = run_matrix["matrix"]
+
+        for run in combine_args(arguments, dict()):
+            template_name = run_matrix["name"]
+
+            ctx = unflatten(run)
+            ctx['time'] = int(time.time())
+            run_name = template_name.format(**ctx)
+
+            yield run_name, run
+
+
+@contextmanager
+def apply_system(config: dict):
+    system = system_global.get()
+    old = deepcopy(system)
+
+    for k, v in config.items():
+        frags = k.split(".")
+
+        lookup = system.setdefault("options", {})
+        for f in frags[:-1]:
+            lookup = lookup.setdefault(f, {})
+        lookup[frags[-1]] = v
+
+
+    yield
+    system_global.set(old)
+
+
 def option(name, etype, default=None):
     options = dict()
     system = system_global.get()
@@ -401,11 +456,12 @@ def gethostname(host):
 def resolve_hostname(ip):
     try:
         hostname, _, iplist = socket.gethostbyaddr(ip)
-
         for ip in iplist:
             if is_loopback(ip):
                 return hostname, True

+        # FIXME
+        return socket.gethostname(), hostname.startswith(socket.gethostname())
         return hostname, hostname == socket.gethostname()

@@ -465,6 +521,9 @@
     config = merge(defaults, config)

     system = config.get("system", {})
+    multirun = config.get("multirun", {})
+
+    multirun_global.set(multirun)
     system_global.set(system)

     # capacity is only required if batch resizer is enabled
diff --git a/scripts/article/run_rocm.sh b/scripts/article/run_rocm.sh
index fbb9da830..0fc2bf16d 100644
--- a/scripts/article/run_rocm.sh
+++ b/scripts/article/run_rocm.sh
@@ -11,6 +11,7 @@ export ROCM_PATH="/opt/rocm"
 export MILABENCH_BASE="$MILABENCH_WORDIR/results"
 export MILABENCH_VENV="$MILABENCH_WORDIR/env"
 export BENCHMARK_VENV="$MILABENCH_WORDIR/results/venv/torch"
+export MILABENCH_SIZER_SAVE="$MILABENCH_WORDIR/scaling.yaml"

 if [ -z "${MILABENCH_SOURCE}" ]; then
     export MILABENCH_CONFIG="$MILABENCH_WORDIR/milabench/config/standard.yaml"
@@ -24,6 +25,17 @@
 export TORCH_ROCM_ARCH_LIST="$GPU"
 export ROCM_TARGETS="$GPU"
 export PYTORCH_ROCM_ARCH="$GPU"
+if [ -z "${MILABENCH_SOURCE}" ]; then
+    export MILABENCH_CONFIG="$MILABENCH_WORDIR/milabench/config/standard.yaml"
+else
+    export MILABENCH_CONFIG="$MILABENCH_SOURCE/config/standard.yaml"
+fi
+
+
+export GPU="$(/opt/rocm/lib/llvm/bin/amdgpu-arch | head -n 1)"
+export TORCH_ROCM_ARCH_LIST="$GPU"
+export ROCM_TARGETS="$GPU"
+export PYTORCH_ROCM_ARCH="$GPU"

 ARGS="$@"
@@ -75,7 +87,6 @@ install_prepare() {
     # https://github.com/ROCm/jax/releases/tag/rocm-jaxlib-v0.4.30
     pip install https://github.com/ROCm/jax/releases/download/rocm-jaxlib-v0.4.30/jaxlib-0.4.30+rocm611-cp310-cp310-manylinux2014_x86_64.whl
-    pip install https://github.com/ROCm/jax/archive/refs/tags/rocm-jaxlib-v0.4.30.tar.gz

     pip uninstall torch_cluster torch_scatter torch_sparse -y
     FORCE_ONLY_CUDA=1 pip install -U -v --use-pep517 --no-build-isolation git+https://github.com/rusty1s/pytorch_cluster.git
@@ -111,12 +123,19 @@
 else
     . $MILABENCH_WORDIR/env/bin/activate
 fi

+(
+    . $BENCHMARK_VENV/bin/activate
+    pip install xformers --index-url https://download.pytorch.org/whl/rocm6.1
+)
+
+# milabench install $ARGS --system $MILABENCH_WORDIR/system.yaml

-milabench prepare $ARGS
+# milabench prepare $ARGS --system $MILABENCH_WORDIR/system.yaml
+

 #
 # Run the benchmarks
-milabench run $ARGS
+milabench run $ARGS --system $MILABENCH_WORDIR/system.yaml
+

 #
 # Display report
diff --git a/tests/test_system_matrix.py b/tests/test_system_matrix.py
new file mode 100644
index 000000000..ed5378815
--- /dev/null
+++ b/tests/test_system_matrix.py
@@ -0,0 +1,40 @@
+
+
+
+
+from milabench.system import multirun, build_system_config, enable_offline, option, apply_system, SizerOptions
+
+from milabench.testing import official_config
+
+
+def test_system_matrix():
+    with enable_offline(True):
+        sys = build_system_config(official_config("examples/system"))
+
+        n = 0
+        for name, conf in multirun():
+            print(name, conf)
+            n += 1
+
+        assert n == 39
+
+
+def test_apply_system_matrix():
+    with enable_offline(True):
+        sys = build_system_config(official_config("examples/system"))
+
+        for name, conf in multirun():
+            with apply_system(conf):
+
+                # Apply system worked and changed the config
+                for k, v in conf.items():
+                    assert option(k, lambda x: x) == v
+
+                assert SizerOptions().save == option("sizer.save", lambda x: x)
+
+
+
+if __name__ == "__main__":
+    test_apply_system_matrix()
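
For reference, below is a minimal standalone sketch of how one `multirun.runs` entry from config/examples/system.yaml expands into named run configurations. It mirrors the combine_args (milabench/config.py) and unflatten (milabench/system.py) logic added by this patch, but it is an illustration only, not part of the patch: the explicit isinstance() check stands in for the patch's bare try/except, and the matrix values are abbreviated.

    # Standalone sketch (not part of the patch): expanding a multirun matrix.
    import time
    from copy import deepcopy
    from types import SimpleNamespace


    def combine_args(args, kwargs):
        # Yield one flat {option: value} dict per combination of list-valued
        # options; scalar options are copied through unchanged.
        if len(args) == 0:
            yield dict(kwargs)
        else:
            args = dict(args)
            key, values = args.popitem()
            if isinstance(values, list):
                for value in values:
                    kwargs[key] = value
                    yield from combine_args(deepcopy(args), kwargs)
            else:
                kwargs[key] = values
                yield from combine_args(deepcopy(args), kwargs)


    def unflatten(dct):
        # Turn "sizer.batch_size" into ctx["sizer"].batch_size so that
        # "bs{sizer.batch_size}".format(**ctx) can resolve the dotted name.
        result = {}
        for key, value in dct.items():
            node = result
            frags = key.split(".")
            for frag in frags[:-1]:
                node = node.setdefault(frag, SimpleNamespace())
            setattr(node, frags[-1], value)
        return result


    run_matrix = {
        "name": "bs{sizer.batch_size}",
        "matrix": {"sizer.auto": 1, "sizer.batch_size": [1, 2, 4]},
    }

    for conf in combine_args(dict(run_matrix["matrix"]), dict()):
        ctx = unflatten(conf)
        ctx["time"] = int(time.time())
        print(run_matrix["name"].format(**ctx), conf)
    # bs1 {'sizer.batch_size': 1, 'sizer.auto': 1}
    # bs2 {'sizer.batch_size': 2, 'sizer.auto': 1}
    # bs4 {'sizer.batch_size': 4, 'sizer.auto': 1}

Each yielded dict is what cli_run passes to apply_system(), which writes the dotted keys under system["options"]; option() then reads them back from there while that run executes.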