
Sync #313

Merged
merged 36 commits
Nov 22, 2024

Commits
36 commits
dbebc7c
Batch resizing (#286)
Delaunay Sep 19, 2024
1c265f9
Force exactly one monitor tag
satyaog Sep 20, 2024
fd99d8a
Fix llm with torchtune v0.3
satyaog Sep 23, 2024
6caac29
Fix rlhf on trl v0.11.0
satyaog Sep 23, 2024
34f56e7
Merge branch 'hotfix/fix_llm' of https://github.com/satyaog/milabench…
Delaunay Sep 25, 2024
4639c19
Merge branch 'hotfix/monitor_tags' of https://github.com/satyaog/mila…
Delaunay Sep 25, 2024
6a1c120
Add missing monitor tag
Sep 26, 2024
770ca62
Revert "Fix llm with torchtune v0.3"
Sep 30, 2024
6d1e114
Revert "Fix rlhf on trl v0.11.0"
Sep 30, 2024
93015e5
Add latex output as an option
Oct 2, 2024
558c278
Merge pull request #291 from mila-iqia/staging
Delaunay Oct 2, 2024
3b87cb4
Add container recipe
Delaunay Oct 9, 2024
4fb687c
Tweaks
Oct 30, 2024
a849fc2
-
Oct 30, 2024
dafcbf9
Ignore prepare & install runs for reports
Oct 30, 2024
226e455
explore scaling
satyaog Nov 7, 2024
a60a3aa
Revert "explore scaling"
satyaog Nov 7, 2024
783a13b
Update report.py
Delaunay Nov 16, 2024
ecd9214
Consolidate metric pushing logic
Nov 21, 2024
8148e53
Tweaks
Nov 21, 2024
3d27180
Revert bad commit
Nov 21, 2024
3d7d5f1
Tweaks
Nov 21, 2024
08eebc1
Merge pull request #310 from mila-iqia/handle-score==0
Delaunay Nov 21, 2024
8498689
Merge branch 'staging' into H100
Delaunay Nov 21, 2024
4fdf736
Merge pull request #309 from mila-iqia/H100
Delaunay Nov 21, 2024
798a186
Hpu (#292)
Delaunay Nov 21, 2024
0b93d59
Rocm (#293)
Delaunay Nov 21, 2024
491505f
Multirun system (#308)
Delaunay Nov 21, 2024
a66519e
Merge branch 'master' into staging
Delaunay Nov 21, 2024
deb271a
Add monitor tag to templates
Nov 21, 2024
a8415d3
Do not rely on DNS to resolve nodes
Nov 21, 2024
d5cbbf5
Update README
Nov 21, 2024
9a29f06
undo gaudi2 config for llm
Nov 21, 2024
6c15d81
Merge pull request #311 from mila-iqia/staging
Delaunay Nov 22, 2024
06fa1be
Update README.md
Delaunay Nov 22, 2024
ab6b412
Add missing tags to tests config (#312)
Delaunay Nov 22, 2024
121 changes: 66 additions & 55 deletions .pin/constraints-cuda-torch.txt

Large diffs are not rendered by default.

861 changes: 620 additions & 241 deletions .pin/constraints-hpu-torch.txt

Large diffs are not rendered by default.

557 changes: 462 additions & 95 deletions .pin/constraints-rocm-torch.txt

Large diffs are not rendered by default.

141 changes: 83 additions & 58 deletions README.md
@@ -20,62 +20,23 @@ evaluating current and future hardware in a research environment.
* Focussed on training
* Ease of use
* Pytorch focused
* ROCm & NVIDIA
* ROCm, NVIDIA, Intel OneAPI, Habana Gaudi (Synapse)
* Independent

## Getting Started

The easiest way to run milabench is with one of its Docker images, which include all of the necessary data.


# Choose the image you want to use
export MILABENCH_IMAGE=ghcr.io/mila-iqia/milabench:cuda-nightly

# Pull the image we are going to run
docker pull $MILABENCH_IMAGE

# Run milabench
docker run -it --rm --ipc=host --gpus=all \
-v $(pwd)/results:/milabench/envs/runs \
$MILABENCH_IMAGE \
bash -c "milabench prepare && milabench run"

git clone https://github.com/mila-iqia/milabench.git

pip install -e milabench

export MILABENCH_GPU_ARCH=cuda

milabench install --base workspace --config milabench/config/standard.yaml --select fp32

milabench prepare --base workspace --config milabench/config/standard.yaml --select fp32

milabench run --base workspace --config milabench/config/standard.yaml --select fp32

=================
Benchmark results
=================
fail n perf sem% std% peak_memory score weight
bert-fp16 0 8 155.08 0.3% 4.3% 24552 1241.260310 0.00
bert-fp32 0 8 29.52 0.0% 0.5% 31524 236.337218 0.00
bert-tf32 0 8 120.46 0.4% 6.1% 31524 964.713297 0.00
bert-tf32-fp16 0 8 154.76 0.3% 4.1% 24552 1238.477257 3.00
convnext_large-fp16 0 8 337.48 0.9% 14.0% 27658 2741.604444 0.00
convnext_large-fp32 0 8 44.61 0.8% 12.6% 49786 354.207225 0.00
convnext_large-tf32 0 8 135.99 0.7% 11.2% 49786 1089.394916 0.00
convnext_large-tf32-fp16 0 8 338.58 0.8% 13.0% 27658 2744.325170 3.00
davit_large 0 8 312.79 0.3% 6.7% 35058 2515.326450 1.00
davit_large-multi 0 1 2401.65 1.0% 7.7% 42232 2401.651720 5.00
dlrm 0 1 188777.20 1.8% 14.0% 3194 188777.203190 1.00
focalnet 0 8 400.47 0.2% 5.4% 26604 3215.431924 2.00
opt-1_3b 0 1 26.71 0.1% 0.4% 44116 26.714365 5.00
opt-1_3b-multinode 0 2 34.62 0.2% 1.0% 43552 34.618292 10.00
opt-6_7b 0 1 14.32 0.0% 0.1% 55750 14.319587 5.00
opt-6_7b-multinode 0 2 10.79 0.1% 0.7% 49380 10.792595 10.00
reformer 0 8 61.70 0.0% 0.9% 25376 494.110834 1.00
regnet_y_128gf 0 8 99.96 0.2% 5.0% 31840 803.012507 2.00
resnet152 0 8 710.18 0.3% 6.2% 36732 5710.828608 1.00
resnet152-multi 0 1 5367.34 1.0% 8.1% 38638 5367.338469 5.00
resnet50 0 8 984.43 0.9% 19.1% 5026 7927.257351 1.00
rwkv 0 8 428.65 0.2% 3.8% 5546 3435.097716 1.00
stargan 0 8 51.32 1.8% 40.8% 37848 413.238870 1.00
super-slomo 0 8 41.63 0.1% 2.3% 34082 332.395065 1.00
t5 0 8 48.05 0.2% 3.9% 35466 384.317023 2.00
whisper 0 8 248.16 0.0% 0.6% 37006 1985.861017 1.00

Scores
------
Failure rate: 0.00% (PASS)
Score: 219.06


## Details
@@ -84,13 +45,77 @@ The benchmark suite has been validated on the following configurations:

| Python version | GPU | Configuration file |
| - | - | - |
| 3.10 (conda) | 2 node x 8xNVIDIA A100 80GB | config/standard.yaml |
| 3.9.12 (conda) | 8x NVIDIA RTX8000 48GB | config/standard.yaml |
| 3.9.16 (conda) | 2x NVIDIA K80 | config/ci.yaml |
| 3.9.16 (conda) | 2x AMD MI100 | config/ci.yaml |
| 3.9.16 (conda) | 4x AMD MI250 | config/standard.yaml |
| 3.10 | 2 node x 8xNVIDIA A100 80GB | config/standard.yaml |
| 3.10 | 2 node x 8xMI300X | config/standard.yaml |
| 3.10 | 1 node x 8xGaudi2 | config/standard.yaml |

We are working on validating it on more configurations and will update the above table as we do.



## Report

=================
Benchmark results
=================

System
------
cpu: AMD EPYC 7742 64-Core Processor
n_cpu: 128
product: NVIDIA A100-SXM4-80GB
n_gpu: 8
memory: 81920.0

Breakdown
---------
bench | fail | n | ngpu | perf | sem% | std% | peak_memory | score | weight
brax | 0 | 1 | 8 | 730035.71 | 0.1% | 0.4% | 2670 | 730035.71 | 1.00
diffusion-gpus | 0 | 1 | 8 | 117.67 | 1.5% | 11.7% | 59944 | 117.67 | 1.00
diffusion-single | 0 | 8 | 1 | 25.02 | 0.8% | 17.9% | 53994 | 202.10 | 1.00
dimenet | 0 | 8 | 1 | 366.85 | 0.7% | 16.2% | 2302 | 2973.32 | 1.00
dinov2-giant-gpus | 0 | 1 | 8 | 445.68 | 0.4% | 3.0% | 69614 | 445.68 | 1.00
dinov2-giant-single | 0 | 8 | 1 | 53.54 | 0.4% | 9.5% | 74646 | 432.65 | 1.00
dqn | 0 | 8 | 1 | 23089954554.91 | 1.1% | 89.9% | 62106 | 184480810548.20 | 1.00
bf16 | 0 | 8 | 1 | 293.43 | 0.2% | 6.3% | 1788 | 2361.16 | 0.00
fp16 | 0 | 8 | 1 | 289.26 | 0.1% | 3.6% | 1788 | 2321.65 | 0.00
fp32 | 0 | 8 | 1 | 19.14 | 0.0% | 0.7% | 2166 | 153.21 | 0.00
tf32 | 0 | 8 | 1 | 146.63 | 0.1% | 3.6% | 2166 | 1177.04 | 0.00
bert-fp16 | 0 | 8 | 1 | 263.73 | 1.1% | 16.7% | nan | 2165.37 | 0.00
bert-fp32 | 0 | 8 | 1 | 44.84 | 0.6% | 9.6% | 21170 | 364.52 | 0.00
bert-tf32 | 0 | 8 | 1 | 141.95 | 0.9% | 14.1% | 1764 | 1162.94 | 0.00
bert-tf32-fp16 | 0 | 8 | 1 | 265.04 | 1.0% | 15.6% | nan | 2175.59 | 3.00
reformer | 0 | 8 | 1 | 62.29 | 0.3% | 6.0% | 25404 | 501.89 | 1.00
t5 | 0 | 8 | 1 | 51.40 | 0.5% | 9.9% | 34390 | 416.14 | 2.00
whisper | 0 | 8 | 1 | 481.95 | 1.0% | 21.4% | 8520 | 3897.53 | 1.00
lightning | 0 | 8 | 1 | 680.22 | 1.0% | 22.7% | 27360 | 5506.90 | 1.00
lightning-gpus | 0 | 1 | 8 | 3504.74 | 7.9% | 62.9% | 28184 | 3504.74 | 1.00
llava-single | 1 | 8 | 1 | 2.28 | 0.4% | 9.6% | 72556 | 14.12 | 1.00
llama | 0 | 8 | 1 | 484.86 | 4.4% | 80.0% | 27820 | 3680.86 | 1.00
llm-full-mp-gpus | 0 | 1 | 8 | 193.92 | 3.1% | 16.2% | 48470 | 193.92 | 1.00
llm-lora-ddp-gpus | 0 | 1 | 8 | 16738.58 | 0.4% | 2.0% | 36988 | 16738.58 | 1.00
llm-lora-mp-gpus | 0 | 1 | 8 | 1980.63 | 2.2% | 11.8% | 55972 | 1980.63 | 1.00
llm-lora-single | 0 | 8 | 1 | 2724.95 | 0.2% | 3.0% | 49926 | 21861.99 | 1.00
ppo | 0 | 8 | 1 | 3114264.32 | 1.6% | 57.2% | 62206 | 24915954.98 | 1.00
recursiongfn | 0 | 8 | 1 | 7080.67 | 1.2% | 27.1% | 10292 | 57038.34 | 1.00
rlhf-gpus | 0 | 1 | 8 | 6314.94 | 2.1% | 11.2% | 21730 | 6314.94 | 1.00
rlhf-single | 0 | 8 | 1 | 1143.72 | 0.4% | 8.4% | 19566 | 9174.52 | 1.00
focalnet | 0 | 8 | 1 | 375.07 | 0.7% | 14.9% | 23536 | 3038.83 | 2.00
torchatari | 0 | 8 | 1 | 5848.88 | 0.6% | 12.7% | 3834 | 46613.34 | 1.00
convnext_large-fp16 | 0 | 8 | 1 | 330.93 | 1.5% | 22.9% | 27376 | 2711.46 | 0.00
convnext_large-fp32 | 0 | 8 | 1 | 59.49 | 0.6% | 9.8% | 55950 | 483.84 | 0.00
convnext_large-tf32 | 0 | 8 | 1 | 155.41 | 0.9% | 14.3% | 49650 | 1273.31 | 0.00
convnext_large-tf32-fp16 | 0 | 8 | 1 | 322.28 | 1.6% | 24.5% | 27376 | 2637.88 | 3.00
regnet_y_128gf | 0 | 8 | 1 | 119.46 | 0.5% | 10.0% | 29762 | 966.96 | 2.00
resnet152-ddp-gpus | 0 | 1 | 8 | 3843.06 | 5.2% | 39.3% | 27980 | 3843.06 | 0.00
resnet50 | 0 | 8 | 1 | 932.95 | 2.4% | 52.2% | 14848 | 7524.25 | 1.00
resnet50-noio | 0 | 8 | 1 | 1163.88 | 0.3% | 6.7% | 27480 | 9385.35 | 0.00
vjepa-gpus | 0 | 1 | 8 | 130.13 | 5.9% | 46.8% | 64244 | 130.13 | 1.00
vjepa-single | 0 | 8 | 1 | 21.29 | 1.0% | 22.4% | 58552 | 172.11 | 1.00

Scores
------
Failure rate: 0.38% (PASS)
Score: 4175.57

Errors
------
1 errors, details in HTML report.
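The report above ends with a single weighted Score aggregated from the per-benchmark scores and weights (note the weight-0.00 rows, such as the fp16/fp32 precision variants, which do not count). The exact formula lives in milabench's report code and is not shown in this diff; the sketch below is only a plausible illustration of one common choice, a weighted geometric mean, with hypothetical benchmark names and numbers.

```python
import math

def aggregate_score(benches):
    """Illustrative weighted geometric mean of per-benchmark scores.

    `benches` maps name -> (score, weight).  Entries with weight 0 are
    skipped, mirroring how the report lists weight 0.00 for rows that do
    not count toward the final Score.  This is NOT milabench's actual
    formula, just a sketch of weighted aggregation.
    """
    total_w = sum(w for _, w in benches.values() if w > 0)
    log_sum = sum(w * math.log(s) for s, w in benches.values() if w > 0)
    return math.exp(log_sum / total_w)

# Toy example with hypothetical numbers (not taken from the report above)
benches = {
    "bench-a": (2175.59, 3.0),
    "bench-b": (3897.53, 1.0),
    "bench-c": (153.21, 0.0),   # weight 0: excluded from the score
}
print(round(aggregate_score(benches), 2))
```

A geometric rather than arithmetic mean keeps one very fast benchmark (e.g. the dqn row's 10^10-scale score) from dominating the aggregate, which is why benchmark suites often prefer it.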
2 changes: 2 additions & 0 deletions benchmarks/_templates/simple/dev.yaml
@@ -6,3 +6,5 @@ template:
install_group: torch
plan:
method: per_gpu
tags:
- monogpu
3 changes: 2 additions & 1 deletion benchmarks/_templates/stdout/dev.yaml
@@ -3,7 +3,8 @@ _template:
definition: .
install-variant: unpinned
install_group: torch

tags:
- monogpu
#argv:
# --train_batch_size: 32
# --num_epochs: 5
2 changes: 2 additions & 0 deletions benchmarks/_templates/voir/dev.yaml
@@ -6,3 +6,5 @@ template:
install_group: torch
plan:
method: per_gpu
tags:
- monogpu
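The template changes above all add a `tags: [monogpu]` entry next to the benchmark definition, matching the "Force exactly one monitor tag" and "Add missing monitor tag" commits in this PR: each benchmark must carry a tag so it can be grouped and filtered. The snippet below is a hypothetical sketch of how such tag-based selection could work; the config names and the `select_by_tag` helper are illustrative, not milabench's actual API.

```python
# Hypothetical bench configs mirroring the `tags:` entries added above.
bench_configs = {
    "simple-dev":  {"plan": {"method": "per_gpu"}, "tags": ["monogpu"]},
    "stdout-dev":  {"plan": {"method": "njobs"},   "tags": ["monogpu"]},
    "multi-bench": {"plan": {"method": "njobs"},   "tags": ["multigpu"]},
}

def select_by_tag(configs, tag):
    """Return the names of benchmarks carrying the given tag, sorted."""
    return sorted(name for name, cfg in configs.items()
                  if tag in cfg.get("tags", ()))

print(select_by_tag(bench_configs, "monogpu"))
# → ['simple-dev', 'stdout-dev']
```

Requiring exactly one such tag per benchmark makes this kind of filtering unambiguous: every benchmark lands in exactly one monitor group.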
6 changes: 5 additions & 1 deletion benchmarks/brax/benchfile.py
@@ -5,5 +5,9 @@ class BraxBenchmark(Package):
base_requirements = "requirements.in"
main_script = "main.py"


def make_env(self):
env = super().make_env()
env["XLA_PYTHON_CLIENT_PREALLOCATE"] = "False"
return env

__pack__ = BraxBenchmark
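The `make_env` override added to `benchfile.py` sets `XLA_PYTHON_CLIENT_PREALLOCATE=False`, which tells JAX/XLA not to preallocate most of the GPU's memory up front — useful when the benchmark harness runs other GPU work alongside brax. The pattern is: extend the parent's environment dict rather than replace it. The self-contained sketch below imitates that pattern; the `Package` stand-in is a hypothetical minimal stub, not milabench's real base class.

```python
import os

class Package:
    """Hypothetical minimal stand-in for milabench's Package base class."""
    def make_env(self):
        # Start from a copy of the current process environment.
        return dict(os.environ)

class BraxBenchmark(Package):
    def make_env(self):
        # Extend the inherited environment instead of replacing it,
        # so only one variable changes for the child process.
        env = super().make_env()
        # "False" disables JAX/XLA's up-front GPU memory preallocation.
        env["XLA_PYTHON_CLIENT_PREALLOCATE"] = "False"
        return env

env = BraxBenchmark().make_env()
print(env["XLA_PYTHON_CLIENT_PREALLOCATE"])  # → False
```

Copying `os.environ` first means the child still inherits `PATH`, CUDA variables, and the rest of the parent environment unchanged.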
3 changes: 3 additions & 0 deletions benchmarks/brax/main.py
@@ -85,6 +85,9 @@ def run():

args = parser.parse_args()

# args.num_envs = (args.batch_size * args.num_minibatches)


train(
environment=envs.get_environment(env_name=args.env),
num_timesteps=args.num_timesteps,
36 changes: 18 additions & 18 deletions benchmarks/brax/requirements.cuda.txt

Some generated files are not rendered by default.
