Fix ${{}} in runner #175

Merged
merged 4 commits into from
Nov 21, 2023
4 changes: 3 additions & 1 deletion .github/workflows/docker.yml
@@ -53,10 +53,12 @@ jobs:
uses: actions/checkout@v3

- name: Get Image Tag Name
env:
GITHUB_REF_NAME_ENV: ${{ github.ref_name }}
run: |
REGEX="(.*)v(.*)\.(.*)\.(.*)"
IMAGE_TAG="nightly"
if [[ "${{ github.ref_name }}" =~ $REGEX ]]; then
if [[ "${GITHUB_REF_NAME_ENV}" =~ $REGEX ]]; then
IMAGE_TAG="${GITHUB_REF_NAME##*/}"
fi
echo "IMAGE_TAG=$IMAGE_TAG" >> $GITHUB_ENV
9 changes: 5 additions & 4 deletions .github/workflows/tests.yml
@@ -47,6 +47,7 @@ jobs:
MILABENCH_ARGS: ""
MILABENCH_GPU_ARCH: "${{ matrix.arch }}"
MILABENCH_DASH: "no"
MILABENCH_EXCLUDE: "${{ matrix.exclude }}"

steps:
- uses: actions/checkout@v3
@@ -60,7 +61,7 @@ jobs:

- name: Pytorch Sanity
run: |
if [[ "${{ matrix.arch }}" == "rocm" ]]; then
if [[ "${MILABENCH_GPU_ARCH}" == "rocm" ]]; then
groups
/opt/rocm/bin/rocminfo
fi
@@ -96,16 +97,16 @@ jobs:

- name: install benchmarks
run: |
milabench install --exclude "${{ matrix.exclude }}"
milabench install --exclude "${MILABENCH_EXCLUDE}"

- name: prepare benchmarks
run: |
milabench prepare --exclude "${{ matrix.exclude }}"
milabench prepare --exclude "${MILABENCH_EXCLUDE}"

- name: run benchmarks
run: |
export PATH="/opt/rocm/bin:$PATH"
milabench run --validations all --exclude "${{ matrix.exclude }}"
milabench run --validations all --exclude "${MILABENCH_EXCLUDE}"

- name: Summary
run: |
7 changes: 1 addition & 6 deletions benchmarks/dlrm/voirfile.py
@@ -47,12 +47,7 @@ def instrument_main(ov, options: Config):
yield ov.phases.load_script

# Loss
(
ov.probe("//run > L")
.throttle(1)["L"]
.map(float)
.give("loss")
)
(ov.probe("//run > L").throttle(1)["L"].map(float).give("loss"))

# Compute Start & End + Batch
ov.probe(
6 changes: 3 additions & 3 deletions benchmarks/flops/benchfile.py
@@ -5,15 +5,15 @@ class FlopsBenchmarch(Package):
base_requirements = "requirements.in"
prepare_script = "prepare.py"
main_script = "main.py"

def build_run_plan(self) -> "execs.Executor":
import milabench.executors as execs

main = self.dirs.code / self.main_script
pack = execs.PackExecutor(self, *self.argv, lazy=True)
# pack = execs.VoirExecutor(pack, cwd=main.parent)
pack = execs.ActivatorExecutor(pack, use_stdout=True)
return pack


__pack__ = FlopsBenchmarch
93 changes: 40 additions & 53 deletions benchmarks/flops/main.py
@@ -22,34 +22,37 @@
def _worker(state, queue, func, delay):
import time

while state['running']:
while state["running"]:
queue.put(func())
time.sleep(delay)



class Monitor:
def __init__(self, delay, func):
self.manager = multiprocessing.Manager()
self.state = self.manager.dict()
self.state['running'] = True
self.state["running"] = True
self.results = multiprocessing.Queue()
self.process = multiprocessing.Process(
target=_worker,
target=_worker,
args=(self.state, self.results, func, delay),
)

def start(self):
self.process.start()

def stop(self):
self.state['running'] = False
self.state["running"] = False
self.process.join()


def modelflops(model: torch.nn.Module, shape, repeat=10, dtype=torch.float32, unit=TERA):
def modelflops(
model: torch.nn.Module, shape, repeat=10, dtype=torch.float32, unit=TERA
):
# Not sure how correct thop's computation is;
# it says it returns MACs, but its method may be wrong
from thop import profile

# MAC: Multiply–accumulate operation
batch = torch.randn(*shape, dtype=dtype, device="cuda:0")

@@ -77,108 +80,92 @@ def modelflops(model: torch.nn.Module, shape, repeat=10, dtype=torch.float32, un
return (flops * repeat) / (end - start) / unit



def f(N, R=30, m=5000000, n=256, unit=TERA, dtype=torch.float32, log=None):
torch.cuda.empty_cache()
a = torch.eye(n, dtype=dtype, device="cuda:0")
x = torch.randn((m, n), dtype=dtype, device="cuda:0")
y = torch.zeros_like(x)

F = N * (2 * m * n * n + 2 * m * n * n)
for i in range(R):

for i in range(R):
torch.cuda.synchronize()
ts = -time.time()

for _ in range(N):
# No allocation in main loop using dual-out strategy
y = torch.mm(x, a, out=y)
x = torch.mm(y, a, out=x)

torch.cuda.synchronize()
ts += time.time()

if log is not None:
log({
"task": "train",
"rate": F / ts / unit,
"units": "Tflops"
})

log({"task": "train", "rate": F / ts / unit, "units": "Tflops"})

torch.cuda.empty_cache()


def setupvoir():
# wtf this do
data_file = SmuggleWriter(sys.stdout)
# data_file = sys.stdout

def log(data):
if data_file is not None:
data["t"] = time.time()
print(json.dumps(data), file=data_file)

while not monitor.results.empty():
print(json.dumps(monitor.results.get()), file=data_file)

def monitor_fn():
data = {
gpu["device"]: {
"memory": [
gpu["memory"]["used"],
gpu["memory"]["used"],
gpu["memory"]["total"],
],
"load": gpu["utilization"]["compute"],
"temperature": gpu["temperature"],
"power": gpu["power"]
"power": gpu["power"],
}
for gpu in get_gpu_info()["gpus"].values()
}
return {"task": "main", "gpudata": data, "t": time.time()}

monitor = Monitor(0.5, monitor_fn)
monitor.start()
return log, monitor



def main():
dtypes = {
'bf16': torch.bfloat16,
'fp16': torch.float16,
'fp32': torch.float32,
"bf16": torch.bfloat16,
"fp16": torch.float16,
"fp32": torch.float32,
}

parser = ArgumentParser()
parser.add_argument('--repeat', type=int, default=100)
parser.add_argument('--number', type=int, default=100)
parser.add_argument('--m', type=int, default=256)
parser.add_argument('--n', type=int, default=256)
parser.add_argument('--dtype', type=str, default='fp32', choices=dtypes.keys())
parser.add_argument('--tf32', action='store_true', default=False)
parser.add_argument("--repeat", type=int, default=100)
parser.add_argument("--number", type=int, default=100)
parser.add_argument("--m", type=int, default=256)
parser.add_argument("--n", type=int, default=256)
parser.add_argument("--dtype", type=str, default="fp32", choices=dtypes.keys())
parser.add_argument("--tf32", action="store_true", default=False)

args = parser.parse_args()

torch.backends.cuda.matmul.allow_tf32 = False
if args.tf32:
torch.backends.cuda.matmul.allow_tf32 = True

log, monitor = setupvoir()

f(
args.number,
args.repeat,
args.m,
args.n,
TERA,
dtypes[args.dtype],
log
)
f(args.number, args.repeat, args.m, args.n, TERA, dtypes[args.dtype], log)

monitor.stop()

if __name__ == "__main__":
main()



if __name__ == "__main__":
main()
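A quick sanity check on the FLOP count used in benchmarks/flops/main.py above (a back-of-the-envelope estimate based only on the values visible in the diff): each pass of the inner loop in f() performs two (m, n) x (n, n) matrix multiplications, and a dense matmul of those shapes costs roughly 2*m*n*n floating-point operations, which is where the formula comes from:

F = N * (2*m*n*n + 2*m*n*n) = 4*N*m*n*n
with the argparse defaults N = 100, m = 256, n = 256: F = 4 * 100 * 256^3 ≈ 6.7e9 FLOPs per timed repetition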
9 changes: 5 additions & 4 deletions benchmarks/llama/benchfile.py
@@ -11,12 +11,12 @@ class LLAMA(Package):
def make_env(self):
return {
**super().make_env(),
"OMP_NUM_THREADS": str(self.config.get("cpus_per_gpu", 8))
"OMP_NUM_THREADS": str(self.config.get("cpus_per_gpu", 8)),
}

async def install(self):
await super().install()

def build_prepare_plan(self):
return CmdExecutor(
self,
@@ -36,7 +36,8 @@ def build_run_plan(self):
*self.argv,
"--cache",
str(self.dirs.cache),
use_stdout=True
use_stdout=True,
)


__pack__ = LLAMA