Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Validator & general code refactor/rewrite #101

Merged
merged 33 commits into from
Nov 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
828c3cf
Rewrite
MsRandom Nov 27, 2024
881932c
Implement weight calculation functions
MsRandom Nov 27, 2024
7d9e82f
Readd submission deduplication, clean up output comparator
MsRandom Nov 27, 2024
e54baf9
Use double empty line for top level classes and functions
MsRandom Nov 27, 2024
dc9a7ec
Fix start script
MsRandom Nov 27, 2024
7ae036a
Update start.sh
MsRandom Nov 27, 2024
9ff390b
Fix dependencies
MsRandom Nov 27, 2024
97573b7
Fix API
MsRandom Nov 27, 2024
5f281f1
Fix autoupdater blocking termination
MsRandom Nov 27, 2024
0485c56
Fix submission script
MsRandom Nov 27, 2024
7eb9407
Fix errors from pytype
MsRandom Nov 27, 2024
0438a14
Update submit.py
MsRandom Nov 27, 2024
cc7deda
Fix benchmarking issues
MsRandom Nov 27, 2024
b4739c7
Fix pytype workflow
MsRandom Nov 28, 2024
6e71c02
add proper benchmarking shutdown, add more logging
MsRandom Nov 28, 2024
9dffbf7
Check popen process, log output
MsRandom Nov 28, 2024
ad0bbee
Fix dockerfile
MsRandom Nov 28, 2024
11c06b4
Fix docker
MsRandom Nov 28, 2024
5d11c70
Optimize update.sh permission setting
MsRandom Nov 28, 2024
d86d906
Add more logging, fix benchmarking
MsRandom Nov 28, 2024
89e75c7
Fix score calculation
MsRandom Nov 28, 2024
698d54e
Fix wandb, fix trying to send to inactive contest
MsRandom Nov 28, 2024
e45a1e7
Fix divide by 0 error
MsRandom Nov 28, 2024
7c7c2ad
Fix ETA
MsRandom Nov 28, 2024
710c1d3
Round ETA
MsRandom Nov 29, 2024
c3af1a7
Fix unsuccessful weight setting attempts marked as successful
MsRandom Nov 29, 2024
8c73e35
Fix calculating scores before baseline is set
MsRandom Nov 29, 2024
0526de1
Fix wrong path for telemetry_attributes
MsRandom Nov 29, 2024
59a0362
Fix ETA
MsRandom Nov 29, 2024
40c10f9
Set contest state before setting weights
MsRandom Nov 29, 2024
5428f08
Add benchmarking state to contest state
MsRandom Nov 29, 2024
8d3a2e9
Increase sleep blocks after failed weight setting attempt
MsRandom Nov 29, 2024
a8fbe34
Fix inference sandbox logging
MsRandom Nov 29, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .dockerignore

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,6 @@ jobs:
uses: astral-sh/setup-uv@v3

- name: Test Types
working-directory: neuron
working-directory: base
run: |
uv run pytype
3 changes: 0 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,4 @@ cython_debug/
.idea/

huggingface/
pip/
miner/model/
miner/baseline_cache.json
validator/compose.yaml
8 changes: 2 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -129,9 +129,7 @@ uv run submit_model \
--netuid {netuid} \
--subtensor.network finney \
--wallet.name {wallet} \
--wallet.hotkey {hotkey} \
--logging.trace \
--logging.debug
--wallet.hotkey {hotkey}
```
5. Follow the interactive prompts to submit the repository link, revision, and contest to participate in
6. Optionally, benchmark your submission locally before submitting (make sure you have the right hardware e.g. NVIDIA GeForce RTX 4090). uv and huggingface are required for benchmarking:
Expand All @@ -150,7 +148,7 @@ If your hardware is not accessed within a container (as in, can use Docker), then

To get started, go to the `validator`, and create a `.env` file with the following contents:
```
VALIDATOR_ARGS=--netuid {netuid} --subtensor.network {network} --wallet.name {wallet} --wallet.hotkey {hotkey} --logging.trace --logging.debug
VALIDATOR_ARGS=--netuid {netuid} --subtensor.network {network} --wallet.name {wallet} --wallet.hotkey {hotkey}
VALIDATOR_HOTKEY_SS58_ADDRESS={ss58-address}
```

Expand Down Expand Up @@ -201,8 +199,6 @@ In another pod/container without a GPU, to run the scoring validator, clone
--subtensor.network {network} \
--wallet.name {wallet} \
--wallet.hotkey {hotkey} \
--logging.trace \
--logging.debug \
--benchmarker_api {API component routes, space separated if multiple}
```

Expand Down
File renamed without changes.
18 changes: 18 additions & 0 deletions base/base/checkpoint.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
from datetime import datetime
from typing import TypeAlias
from zoneinfo import ZoneInfo

from base.contest import Submission, Benchmark

TIMEZONE = ZoneInfo("America/Los_Angeles")
SPEC_VERSION = 7

Uid: TypeAlias = int
Key: TypeAlias = str

Submissions: TypeAlias = dict[Key, Submission]
Benchmarks: TypeAlias = dict[Key, Benchmark]


def current_time() -> datetime:
    """Return the current wall-clock time as a timezone-aware datetime in TIMEZONE."""
    return datetime.now(TIMEZONE)
8 changes: 2 additions & 6 deletions neuron/neuron/config.py → base/base/config.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from argparse import ArgumentParser
from typing import Callable, Any
from typing import Callable

from fiber.constants import FINNEY_NETWORK

Expand Down Expand Up @@ -28,11 +28,7 @@ def get_config(add_args: Callable[[ArgumentParser], None] | None = None):

argument_parser.add_argument("--netuid", type=int, required=True, help="Network UID")

# Deprecated arguments that won't be used
argument_parser.add_argument("--logging.debug", action="store_true", help="Enable debug logging", default=False)
argument_parser.add_argument("--logging.trace", action="store_true", help="Enable trace logging", default=False)

if add_args:
add_args(argument_parser)

return vars(argument_parser.parse_args())
return vars(argument_parser.parse_known_args()[0])
86 changes: 52 additions & 34 deletions neuron/neuron/contest.py → base/base/contest.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,12 @@
SIMILARITY_SCORE_THRESHOLD = 0.7


class BenchmarkState(IntEnum):
    """Lifecycle state of a benchmarking run (persisted as part of the contest state)."""

    NOT_STARTED = 0  # benchmarking has not begun yet
    IN_PROGRESS = 1  # benchmarking is currently running
    FINISHED = 2  # benchmarking has completed


class MetricType(IntEnum):
SIMILARITY_SCORE = 0
GENERATION_TIME = 1
Expand All @@ -21,53 +27,62 @@ class MetricType(IntEnum):
LOAD_TIME = 5


class MetricData(BaseModel):
class ContestId(IntEnum):
FLUX_NVIDIA_4090 = 0
SDXL_NEWDREAM_NVIDIA_4090 = 1


class RepositoryInfo(BaseModel):
url: str
revision: str


class Submission(BaseModel):
repository_info: RepositoryInfo
contest_id: ContestId
block: int

def contest(self) -> "Contest":
return find_contest(self.contest_id)


class Metrics(BaseModel):
generation_time: float
size: int
vram_used: float
watts_used: float
load_time: float


class CheckpointBenchmark(BaseModel):
model: MetricData
class Benchmark(BaseModel):
metrics: Metrics
average_similarity: float
min_similarity: float


class ModelRepositoryInfo(BaseModel):
url: str
revision: str


class ContestId(IntEnum):
FLUX_NVIDIA_4090 = 0
SDXL_NEWDREAM_NVIDIA_4090 = 1


@dataclass
class Contest:
id: ContestId
device: Device
output_comparator: Callable[[], OutputComparator]
baseline_repository: ModelRepositoryInfo
baseline_repository: RepositoryInfo
metric_weights: dict[MetricType, int]

def __init__(
self,
contest_id: ContestId,
device: Device,
output_comparator: Callable[[], OutputComparator],
baseline_repository: ModelRepositoryInfo,
metric_weights: dict[MetricType, int]
self,
contest_id: ContestId,
device: Device,
output_comparator: Callable[[], OutputComparator],
baseline_repository: RepositoryInfo,
metric_weights: dict[MetricType, int]
):
self.id = contest_id
self.device = device
self.output_comparator = output_comparator
self.baseline_repository = baseline_repository
self.metric_weights = metric_weights

def calculate_score(self, baseline: MetricData, benchmark: CheckpointBenchmark) -> float:
def calculate_score(self, baseline: Metrics, benchmark: Benchmark) -> float:
if benchmark.min_similarity < SIMILARITY_SCORE_THRESHOLD:
return 0.0

Expand All @@ -83,22 +98,24 @@ def normalize(baseline_value: float, benchmark_value: float, metric_type: Metric
return (relative_improvement * self.metric_weights.get(metric_type, 0)) / total_weight

score = sum([
normalize(baseline.generation_time, benchmark.model.generation_time, MetricType.GENERATION_TIME),
normalize(baseline.size, benchmark.model.size, MetricType.SIZE),
normalize(baseline.vram_used, benchmark.model.vram_used, MetricType.VRAM_USED),
normalize(baseline.watts_used, benchmark.model.watts_used, MetricType.WATTS_USED),
normalize(baseline.load_time, benchmark.model.load_time, MetricType.LOAD_TIME)
normalize(baseline.generation_time, benchmark.metrics.generation_time, MetricType.GENERATION_TIME),
normalize(baseline.size, benchmark.metrics.size, MetricType.SIZE),
normalize(baseline.vram_used, benchmark.metrics.vram_used, MetricType.VRAM_USED),
normalize(baseline.watts_used, benchmark.metrics.watts_used, MetricType.WATTS_USED),
normalize(baseline.load_time, benchmark.metrics.load_time, MetricType.LOAD_TIME)
])

return score * similarity * self.metric_weights.get(MetricType.SIMILARITY_SCORE, 0) / total_weight


CUDA_4090_DEVICE = CudaDevice(gpu=Gpu.NVIDIA_RTX_4090)

CONTESTS = [
Contest(
contest_id=ContestId.FLUX_NVIDIA_4090,
device=CudaDevice(gpu=Gpu.NVIDIA_RTX_4090),
output_comparator=partial(ImageOutputComparator, "cuda"),
baseline_repository=ModelRepositoryInfo(url="https://github.com/womboai/flux-schnell-edge-inference", revision="fbfb8f0"),
device=CUDA_4090_DEVICE,
output_comparator=partial(ImageOutputComparator, CUDA_4090_DEVICE),
baseline_repository=RepositoryInfo(url="https://github.com/womboai/flux-schnell-edge-inference", revision="fbfb8f0"),
metric_weights={
MetricType.SIMILARITY_SCORE: 3,
MetricType.VRAM_USED: 3,
Expand All @@ -107,9 +124,9 @@ def normalize(baseline_value: float, benchmark_value: float, metric_type: Metric
),
Contest(
contest_id=ContestId.SDXL_NEWDREAM_NVIDIA_4090,
device=CudaDevice(gpu=Gpu.NVIDIA_RTX_4090),
output_comparator=partial(ImageOutputComparator, "cuda"),
baseline_repository=ModelRepositoryInfo(url="https://github.com/womboai/sdxl-newdream-20-inference", revision="1b3f9ea"),
device=CUDA_4090_DEVICE,
output_comparator=partial(ImageOutputComparator, CUDA_4090_DEVICE),
baseline_repository=RepositoryInfo(url="https://github.com/womboai/sdxl-newdream-20-inference", revision="1b3f9ea"),
metric_weights={
MetricType.SIMILARITY_SCORE: 1,
MetricType.GENERATION_TIME: 1,
Expand All @@ -132,5 +149,6 @@ def find_compatible_contests() -> list[ContestId]:
return [contest.id for contest in CONTESTS if contest.device.is_compatible()]


CURRENT_CONTEST: Contest = find_contest(ContestId.FLUX_NVIDIA_4090)
ACTIVE_CONTESTS = { ContestId.FLUX_NVIDIA_4090 }
ACTIVE_CONTESTS = [
ContestId.FLUX_NVIDIA_4090
]
36 changes: 23 additions & 13 deletions neuron/neuron/device.py → base/base/device.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@ class Gpu(Enum):


class Device(ABC):
@abstractmethod
def get_name(self):
...

@abstractmethod
def get_vram_used(self):
...
Expand All @@ -16,11 +20,11 @@ def get_joules(self):
...

@abstractmethod
def is_compatible(self):
def empty_cache(self):
...

@abstractmethod
def validate(self):
def is_compatible(self):
...


Expand All @@ -30,6 +34,9 @@ class CudaDevice(Device):
def __init__(self, gpu: Gpu):
self._gpu = gpu

def get_name(self):
return "cuda"

def get_vram_used(self):
import pynvml
import torch
Expand All @@ -50,23 +57,24 @@ def get_joules(self):
pynvml.nvmlShutdown()
return mj / 1000.0 # convert mJ to J

def is_compatible(self):
def empty_cache(self):
import torch

device_name = torch.cuda.get_device_name()
torch.cuda.synchronize()
torch.cuda.empty_cache()

return device_name == self._gpu.value

def validate(self):
def is_compatible(self):
import torch

device_name = torch.cuda.get_device_name()

if not self.is_compatible():
raise ContestDeviceValidationError(f"Incompatible device {device_name} when {self._gpu.name} is required.")
return device_name == self._gpu.value


class MpsDevice(Device):
def get_name(self):
return "mps"

def get_vram_used(self):
import torch

Expand All @@ -75,15 +83,17 @@ def get_vram_used(self):
def get_joules(self):
return 0 # TODO

def empty_cache(self):
import torch

torch.mps.synchronize()
torch.mps.empty_cache()

def is_compatible(self):
import torch

return torch.backends.mps.is_available()

def validate(self):
if not self.is_compatible():
raise ContestDeviceValidationError("MPS is not available but is required.")


class ContestDeviceValidationError(Exception):
def __init__(self, message: str):
Expand Down
17 changes: 15 additions & 2 deletions neuron/neuron/random_inputs.py → base/base/inputs_api.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
import os
from zoneinfo import ZoneInfo

import requests
from pydantic import RootModel

from pipelines import TextToImageRequest

TIMEZONE = ZoneInfo("America/Los_Angeles")
INPUTS_ENDPOINT = os.getenv("INPUTS_ENDPOINT", "https://edge-inputs.api.wombo.ai")


Expand All @@ -20,3 +18,18 @@ def random_inputs() -> list[TextToImageRequest]:
response.raise_for_status()

return RootModel[list[TextToImageRequest]].model_validate_json(response.text).root


def blacklisted_keys() -> dict:
    """Fetch the key blacklist from the inputs API.

    Returns:
        The decoded JSON payload — expected to contain "hotkeys" and
        "coldkeys" collections, as consumed by is_blacklisted().

    Raises:
        requests.HTTPError: on a non-2xx response.
        requests.Timeout: if the endpoint does not respond in time.
    """
    response = requests.get(
        f"{INPUTS_ENDPOINT}/blacklist",
        headers={
            # NOTE(review): Content-Type on a body-less GET is unusual;
            # presumably the server ignores it — "Accept" may be intended.
            "Content-Type": "application/json"
        },
        # Without a timeout, requests.get can block forever on a hung endpoint.
        timeout=30,
    )

    response.raise_for_status()
    return response.json()


def is_blacklisted(blacklist: dict, hotkey: str, coldkey: str):
    """Return True when either the hotkey or the coldkey appears in the blacklist."""
    if hotkey in blacklist["hotkeys"]:
        return True
    return coldkey in blacklist["coldkeys"]
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from struct import pack, unpack


_UINT_16_SIZE = 2
_UINT_32_SIZE = 4

Expand Down
Loading