Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add worker resolution #225

Merged
merged 10 commits into from
Jun 19, 2024
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Add metric gathering utility for batch x worker run
  • Loading branch information
pierre.delaunay committed Jun 14, 2024

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature. The key has expired.
commit 1fad24ca7e02b65e9591c21ee3a0d28ff33e65b9
124 changes: 124 additions & 0 deletions milabench/cli/gather.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
import argparse
import os
import re
from dataclasses import dataclass, field

import pandas as pd

from ..common import _read_reports
from ..report import make_dataframe, pandas_to_string
from ..summary import make_summary


def default_tags():
return [
"worker=w([a-z0-9]*)",
"multiple=m([0-9]*)",
"power=p([0-9]*)",
"capacity=c([A-Za-z0-9]*(Go)?)",
]


# fmt: off
@dataclass
class Arguments:
runs: str
tags: list = field(default_factory=default_tags)
# fmt: on


def arguments():
parser = argparse.ArgumentParser()
parser.add_argument(
"--runs",
type=str,
help="Run folder",
default="/home/mila/d/delaunap/batch_x_worker/",
)
parser.add_argument(
"--tags",
type=str,
help="Tags defined in run names",
default=default_tags(),
)
return parser.parse_args() # Arguments()


def get_config(reports):
k = list(reports.keys())[0]
config = None
for line in reports[k]:
if line["event"] == "config":
config = line["data"]
break
return config


def extract_tags(name, tags):
for tag, pat in tags.items():
if m := pat.search(name):
value = m.group(1)
yield tag, value
else:
print(f"{tag} not found in {name}")
yield tag, "NA"


def gather_cli(args=None):
"""Gather metrics from runs inside a folder in a neat format.
It can extract tags/flags from the runname and create new columns to uniquely identify runs.

Examples
--------

>>> python -m milabench.cli.gather --runs /home/mila/d/delaunap/batch_x_worker/
bench | fail | n | perf | sem% | std% | peak_memory | score | weight | elapsed | name | worker | multiple | power | capacity
brax | 0 | 1 | 722480.33 | 0.7% | 5.2% | 6448 | 722480.33 | 1.00 | 94 | w16-m8-c4Go | 16 | 8 | NA | 4Go
dlrm | 0 | 1 | 350641.30 | 0.6% | 4.6% | 7624 | 350641.30 | 1.00 | 124 | w16-m8-c4Go | 16 | 8 | NA | 4Go
....
brax | 0 | 1 | 723867.42 | 0.6% | 4.5% | 6448 | 723867.42 | 1.00 | 94 | w2-m8-c8Go | 2 | 8 | NA | 8Go
dlrm | 0 | 1 | 403113.36 | 0.7% | 5.1% | 7420 | 403113.36 | 1.00 | 258 | w2-m8-c8Go | 2 | 8 | NA | 8Go
bf16 | 0 | 8 | 293.08 | 0.3% | 7.5% | 5688 | 2361.09 | 0.00 | 18 | w2-m8-c8Go | 2 | 8 | NA | 8Go
fp16 | 0 | 8 | 290.58 | 0.2% | 4.9% | 5688 | 2335.63 | 0.00 | 29 | w2-m8-c8Go | 2 | 8 | NA | 8Go

"""
if args is None:
args = arguments()

runs = []
for folder in os.listdir(args.runs):
if folder.startswith("prepare"):
continue

if folder.startswith("install"):
continue

path = f"{args.runs}/{folder}"
if os.path.isdir(path):
runs.append(path)

tags = dict()
for tag in args.tags:
name, regex = tag.split("=")
tags[name] = re.compile(regex)

query = ("batch_size", "elapsed")
data = []
for run in runs:
reports = _read_reports(run)
summary = make_summary(reports.values(), query=query)
df = make_dataframe(summary, None, None, query=query)

name = run.split("/")[-1]
df["name"] = name.split(".", maxsplit=1)[0]
for tag, value in extract_tags(name, tags):
df[tag] = value

data.append(df)

gathered = pd.concat(data)
print(pandas_to_string(gathered))


if __name__ == "__main__":
gather_cli()
17 changes: 12 additions & 5 deletions milabench/cli/matrix.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,18 @@
from dataclasses import dataclass
import sys
from dataclasses import dataclass

import yaml
from coleo import Option, tooled

from ..system import build_system_config
from ..common import deduce_arch, build_config, get_base_defaults, merge, is_selected
from ..common import (
build_config,
deduce_arch,
get_base_defaults,
is_selected,
merge,
)
from ..sizer import resolve_argv, scale_argv
from ..system import build_system_config


# fmt: off
@@ -78,13 +84,14 @@ def cli_matrix_run(args=None):

def resolve_args(conf, argv):
from ..pack import Package

pack = Package(conf)

args = []
for k, v in argv.items():
args.append(k)
args.append(v)

sized_args = scale_argv(pack, args)
final_args = resolve_argv(pack, sized_args)

@@ -94,7 +101,7 @@ def resolve_args(conf, argv):
argv[k] = final_args[i + 1]
i += 2
continue

print(f"Missing resolved argument {k}")

return argv
2 changes: 1 addition & 1 deletion milabench/commands/__init__.py
Original file line number Diff line number Diff line change
@@ -666,7 +666,7 @@ def _argv(self, **_) -> List:

resolver = new_argument_resolver(self.pack)

cpu_per_process = resolver(str(self.pack.config['argv']['--cpus_per_gpu']))
cpu_per_process = resolver(str(self.pack.config["argv"]["--cpus_per_gpu"]))
return [
# -- Run the command in the right venv
# This could be inside the SSH Command
2 changes: 1 addition & 1 deletion milabench/common.py
Original file line number Diff line number Diff line change
@@ -15,14 +15,14 @@
from milabench.alt_async import proceed
from milabench.utils import available_layers, blabla, multilogger

from .system import build_system_config
from .config import build_config
from .fs import XPath
from .log import TerminalFormatter
from .merge import merge
from .multi import MultiPackage
from .report import make_report
from .summary import aggregate, make_summary
from .system import build_system_config


def get_pack(defn):
3 changes: 0 additions & 3 deletions milabench/config.py
Original file line number Diff line number Diff line change
@@ -4,14 +4,12 @@
from copy import deepcopy

import psutil
from copy import deepcopy
import yaml
from omegaconf import OmegaConf

from .fs import XPath
from .merge import merge


config_global = contextvars.ContextVar("config", default=None)


@@ -112,7 +110,6 @@ def build_matrix_bench(all_configs):

for name, bench_config in all_configs.items():
for k, v in expand_matrix(name, bench_config):

if k in expanded_config:
raise ValueError("Bench name is not unique")

15 changes: 8 additions & 7 deletions milabench/log.py
Original file line number Diff line number Diff line change
@@ -145,10 +145,11 @@ def __call__(self, entry):
pass

elif event == "config":

def _show(k, entry):
if k in ("meta", "system"):
return

if isinstance(entry, dict):
for k2, v in entry.items():
_show(f"{k}.{k2}", v)
@@ -302,9 +303,9 @@ def on_data(self, entry, data, row):
load = int(data.get("load", 0) * 100)
currm, totalm = data.get("memory", [0, 0])
temp = int(data.get("temperature", 0))
row[f"gpu:{gpuid}"] = (
f"{load}% load | {currm:.0f}/{totalm:.0f} MB | {temp}C"
)
row[
f"gpu:{gpuid}"
] = f"{load}% load | {currm:.0f}/{totalm:.0f} MB | {temp}C"
row["gpu_load"] = f"{load}%"
row["gpu_mem"] = f"{currm:.0f}/{totalm:.0f} MB"
row["gpu_temp"] = f"{temp}C"
@@ -378,9 +379,9 @@ def on_data(self, entry, data, row):
load = int(data.get("load", 0) * 100)
currm, totalm = data.get("memory", [0, 0])
temp = int(data.get("temperature", 0))
row[f"gpu:{gpuid}"] = (
f"{load}% load | {currm:.0f}/{totalm:.0f} MB | {temp}C"
)
row[
f"gpu:{gpuid}"
] = f"{load}% load | {currm:.0f}/{totalm:.0f} MB | {temp}C"
else:
task = data.pop("task", "")
units = data.pop("units", "")
Loading
Loading