From 7ef63682dff8c50245ccc8d5fb9610d0dcb063e5 Mon Sep 17 00:00:00 2001 From: Ashley Wright Date: Wed, 11 Dec 2024 15:07:10 -0800 Subject: [PATCH] Add system info --- base/base/contest.py | 1 - base/base/system_info.py | 45 ++++++++++++++++++++ validator/pyproject.toml | 2 +- validator/submission_tester/api.py | 4 ++ validator/weight_setting/benchmarking_api.py | 6 +++ validator/weight_setting/validator.py | 9 +++- validator/weight_setting/wandb_manager.py | 3 ++ validator/weight_setting/weight_setter.py | 6 ++- 8 files changed, 71 insertions(+), 5 deletions(-) create mode 100644 base/base/system_info.py diff --git a/base/base/contest.py b/base/base/contest.py index 080053b7..44d648a3 100644 --- a/base/base/contest.py +++ b/base/base/contest.py @@ -1,4 +1,3 @@ -import math from dataclasses import dataclass from enum import IntEnum from functools import partial diff --git a/base/base/system_info.py b/base/base/system_info.py new file mode 100644 index 00000000..63bbe856 --- /dev/null +++ b/base/base/system_info.py @@ -0,0 +1,45 @@ +import psutil +from pydantic import BaseModel + + +class SystemInfo(BaseModel): + cpu: str + min_frequency_mhz: float + max_frequency_mhz: float + current_frequency_mhz: float + physical_cores: int + total_cores: int + ram: int + gpu: str + + +def get_system_info() -> SystemInfo: + with open("/proc/cpuinfo", "r") as f: + for line in f: + if "model name" in line: + cpu = line.strip().split(":")[1].strip() + break + + cpu_frequency = psutil.cpu_freq() + min_frequency = cpu_frequency.min + max_frequency = cpu_frequency.max + current_frequency = cpu_frequency.current + + physical_cores = psutil.cpu_count(logical=False) + total_cores = psutil.cpu_count(logical=True) + + ram = psutil.virtual_memory().total + + import torch + gpu = torch.cuda.get_device_name(0) + + return SystemInfo( + cpu=cpu, + min_frequency_mhz=min_frequency, + max_frequency_mhz=max_frequency, + current_frequency_mhz=current_frequency, + physical_cores=physical_cores, + total_cores=total_cores, + ram=ram, + gpu=gpu, + ) diff --git a/validator/pyproject.toml b/validator/pyproject.toml index f9f1fe76..a3e4b36d 100644 --- a/validator/pyproject.toml +++ b/validator/pyproject.toml @@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta" name = "edge-maxxing-validator" description = "The validator which checks models and checkpoints provided by miners" requires-python = ">=3.10,<3.13" -version = "5.6.7" +version = "5.6.8" dependencies = [ "edge-maxxing-base==1.0.0", "opentelemetry-api>=1.28.2", diff --git a/validator/submission_tester/api.py b/validator/submission_tester/api.py index d628e49d..ffdf9cd5 100644 --- a/validator/submission_tester/api.py +++ b/validator/submission_tester/api.py @@ -16,6 +16,7 @@ from base_validator.api_data import BenchmarkingStartRequest, BenchmarkingResults, ApiMetadata, BenchmarkingInitializeRequest from base_validator.auto_updater import AutoUpdater from base_validator.telemetry import init_open_telemetry_logging +from base.system_info import get_system_info, SystemInfo from testing.benchmarker import Benchmarker hotkey = os.getenv("VALIDATOR_HOTKEY_SS58_ADDRESS") @@ -128,6 +129,9 @@ def metadata(request: Request) -> ApiMetadata: compatible_contests=request.state.compatible_contests, ) +@app.get("/hardware") +def hardware() -> SystemInfo: + return get_system_info() @app.post("/initialize") def initialize( diff --git a/validator/weight_setting/benchmarking_api.py b/validator/weight_setting/benchmarking_api.py index 638d3789..e089ac12 100644 --- a/validator/weight_setting/benchmarking_api.py +++ b/validator/weight_setting/benchmarking_api.py @@ -12,6 +12,7 @@ from base.contest import ContestId, RepositoryInfo from base.inputs_api import get_inputs_state from base_validator.api_data import BenchmarkingStartRequest, ApiMetadata, BenchmarkingResults, BenchmarkingInitializeRequest +from base.system_info import SystemInfo logger = get_logger(__name__) @@ -60,6 +61,11 @@ def metadata(self) -> ApiMetadata: response.raise_for_status() return ApiMetadata.model_validate(response.json()) + def hardware(self) -> SystemInfo: + response = requests.get(f"{self._api}/hardware") + response.raise_for_status() + return SystemInfo.model_validate(response.json()) + def results(self) -> BenchmarkingResults: response = requests.get(f"{self._api}/state") response.raise_for_status() diff --git a/validator/weight_setting/validator.py b/validator/weight_setting/validator.py index 44b81ac8..6e24d8de 100644 --- a/validator/weight_setting/validator.py +++ b/validator/weight_setting/validator.py @@ -17,6 +17,7 @@ from base.contest import BenchmarkState from base.inputs_api import get_inputs_state from base.submissions import get_submissions +from base.system_info import SystemInfo from base_validator.api_data import BenchmarkingResults from base_validator.auto_updater import AutoUpdater from base_validator.telemetry import init_open_telemetry_logging @@ -70,6 +71,7 @@ class Validator: weight_setter: WeightSetter benchmarking_apis: list[BenchmarkingApi] + api_hardware: list[SystemInfo] = [] def __init__(self): self.metagraph.sync_nodes() @@ -107,6 +109,7 @@ def __init__(self): keypair=self.keypair, uid=self.uid, contest_state=lambda: self.contest_state, + api_hardware=self.api_hardware, wandb_manager=self.wandb_manager, ) @@ -115,6 +118,7 @@ def __init__(self): self.run() def initialize_apis(self, untested_submissions: Submissions): + self.api_hardware.clear() for api in self.benchmarking_apis: api.initialize( uid=self.uid, @@ -122,6 +126,7 @@ def initialize_apis(self, untested_submissions: Submissions): netuid=self.metagraph.netuid, substrate_url=self.substrate.url, ) + self.api_hardware.append(api.hardware()) send_submissions_to_api( version=self.validator_version, all_apis=self.benchmarking_apis, @@ -167,7 +172,7 @@ def do_step(self): if not untested_submissions: self.contest_state.benchmarking_state = BenchmarkState.FINISHED self.state_manager.save_state(self.contest_state) - self.wandb_manager.send_metrics(self.contest_state) + self.wandb_manager.send_metrics(self.contest_state, self.api_hardware) self.contest_state.sleep_to_next_contest(self._stop_flag) return @@ -228,7 +233,7 @@ def run(self): if self.contest_state: self.contest_state.step += 1 self.state_manager.save_state(self.contest_state) - self.wandb_manager.send_metrics(self.contest_state) + self.wandb_manager.send_metrics(self.contest_state, self.api_hardware) except (ConnectionError, HTTPError) as e: logger.error(f"Error connecting to API, retrying in 10 blocks: {e}") self._stop_flag.wait(BENCHMARK_UPDATE_RATE_BLOCKS * 12) diff --git a/validator/weight_setting/wandb_manager.py b/validator/weight_setting/wandb_manager.py index 45cf09a9..a698708d 100644 --- a/validator/weight_setting/wandb_manager.py +++ b/validator/weight_setting/wandb_manager.py @@ -5,6 +5,7 @@ from wandb.apis.public import Run from base.checkpoint import Uid, Key +from base.system_info import SystemInfo from .contest_state import ContestState @@ -69,6 +70,7 @@ def init_wandb(self, contest_state: ContestState): def send_metrics( self, contest_state: ContestState, + api_hardware: list[SystemInfo], scores: dict[Key, float] | None = None, ranks: dict[Key, int] | None = None ): @@ -77,6 +79,7 @@ def send_metrics( data = { "scores": scores or contest_state.get_scores(contest_state.benchmarks), + "api_hardware": [api.model_dump() for api in api_hardware], "ranks": ranks or contest_state.get_ranks(scores), "num_gpus": len(self.config["benchmarker_api"]), } | contest_state.model_dump() diff --git a/validator/weight_setting/weight_setter.py b/validator/weight_setting/weight_setter.py index 8c4d2f09..173e8aa4 100644 --- a/validator/weight_setting/weight_setter.py +++ b/validator/weight_setting/weight_setter.py @@ -9,6 +9,7 @@ from substrateinterface import SubstrateInterface, Keypair from base.inputs_api import get_blacklist, get_inputs_state +from base.system_info import SystemInfo from weight_setting.contest_state import ContestState from weight_setting.wandb_manager import WandbManager @@ -26,6 +27,7 @@ class WeightSetter: _keypair: Keypair _uid: int _contest_state: Callable[[], ContestState] + _api_hardware: list[SystemInfo] _wandb_manager: WandbManager _weights_version: int @@ -38,6 +40,7 @@ def __init__( keypair: Keypair, uid: int, contest_state: Callable[[], ContestState], + api_hardware: list[SystemInfo], wandb_manager: WandbManager, ): self._epoch_length = epoch_length @@ -46,6 +49,7 @@ def __init__( self._keypair = keypair self._uid = uid self._contest_state = contest_state + self._api_hardware = api_hardware self._wandb_manager = wandb_manager parts: list[str] = version.split(".") @@ -114,7 +118,7 @@ def set_weights(self) -> bool: weights_by_key = contest_state.calculate_weights(ranks=ranks) - self._wandb_manager.send_metrics(contest_state, scores, ranks) + self._wandb_manager.send_metrics(contest_state, self._api_hardware, scores, ranks) return self._set_weights([ weights_by_key.get(key, 0) for key in self._metagraph.nodes.keys()