Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Example sklearn #5

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .github/workflows/test-type-lint.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ jobs:
run: |
source activate ./ci_env
pip install -e .[dev]
pip install scikit-learn lightning # for docs

- name: Print installed packages
run: |
Expand Down Expand Up @@ -76,7 +77,7 @@ jobs:
sed -i 's/\"auto\"/None/g' README.md
# on Mac: sed -i '' 's/cluster: slurm/cluster: null/g' infra/*.md
# check readmes
pytest --markdown-docs -m markdown-docs `**/*.md`
pytest --markdown-docs -m markdown-docs .

- name: Run basic pylint
run: |
Expand Down
91 changes: 91 additions & 0 deletions docs/infra/example_sklearn.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
"""
A minimalist example with sklearn to show how to develop and explore a model with exca.
"""
import typing as tp
import numpy as np
import pydantic
import sys
import exca
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error


class Dataset(pydantic.BaseModel):
    """Configuration of a synthetic regression dataset and its train/test split.

    Attributes are plain hyperparameters; ``extra="forbid"`` makes pydantic
    reject unknown fields so that typos in a config are reported instead of
    being silently ignored.
    """

    # number of samples passed to make_regression
    n_samples: int = 100
    # noise level passed to make_regression
    noise: float = 0.1
    # seed shared by data generation and by the train/test split
    random_state: int = 42
    # test_size passed to train_test_split
    test_size: float = 0.2
    model_config = pydantic.ConfigDict(extra="forbid")

    def get(self) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
        """Generate the data and split it.

        Returns:
            ``(X_train, X_test, y_train, y_test)`` as produced by
            ``train_test_split``.
        """
        # The annotation lists the four arrays explicitly: a single-element
        # ``Tuple[np.ndarray]`` would describe a 1-tuple, not what is returned.
        # Generate synthetic data
        X, y = make_regression(
            n_samples=self.n_samples,
            noise=self.noise,
            random_state=self.random_state,
        )
        # Split into training and testing datasets
        X_train, X_test, y_train, y_test = train_test_split(
            X,
            y,
            test_size=self.test_size,
            random_state=self.random_state,
        )
        return X_train, X_test, y_train, y_test


class Model(pydantic.BaseModel):
    """Ridge regression experiment: dataset config, hyperparameters and infra.

    ``score`` is wrapped by ``infra.apply``; the infra is configured with
    ``folder='.cache/'`` (presumably where exca stores the result; see the
    exca documentation for the exact semantics).
    """

    data: Dataset = Dataset()
    alpha: float = 1.0
    max_iter: int = 1000
    infra: exca.TaskInfra = exca.TaskInfra(folder='.cache/')

    @infra.apply
    def score(self):
        """Fit a Ridge model on the train split and return the test-split MSE."""
        X_train, X_test, y_train, y_test = self.data.get()

        # Training
        print('Fit...')
        regressor = Ridge(alpha=self.alpha, max_iter=self.max_iter)
        regressor.fit(X_train, y_train)

        # Evaluation on the held-out split
        print('Score...')
        predictions = regressor.predict(X_test)
        return mean_squared_error(y_test, predictions)


def args_to_nested_dict(args: list[str]) -> tp.Dict[str, tp.Any]:
    """Parse Bash-style arguments (e.g. ``--key=value``) into a nested dict.

    Dots in a key denote nesting, so ``--data.noise=0.5`` becomes
    ``{"data": {"noise": "0.5"}}``. Values are kept as strings; only the
    first ``=`` separates key from value, so values may contain ``=``.

    Args:
        args: command-line arguments, typically ``sys.argv[1:]``.

    Returns:
        The nested dictionary built from all arguments (empty for no args).

    Raises:
        ValueError: if an argument has no ``=``, or if a dotted key tries to
            nest under a key that already holds a plain value.
    """
    nested_dict: tp.Dict[str, tp.Any] = {}
    for arg in args:
        # str.lstrip takes a *set* of characters, not a prefix: every leading
        # dash is dropped, which "-" states explicitly.
        key, sep, value = arg.lstrip("-").partition("=")
        if not sep:
            raise ValueError(
                f"Expected an argument of the form --key=value, got {arg!r}"
            )
        # Walk/create the intermediate dicts, then set the leaf
        *parents, leaf = key.split(".")
        current_level = nested_dict
        for k in parents:
            current_level = current_level.setdefault(k, {})
            if not isinstance(current_level, dict):
                raise ValueError(
                    f"Cannot nest {arg!r}: {k!r} already holds a plain value"
                )
        current_level[leaf] = value
    return nested_dict
Comment on lines +64 to +80
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
def args_to_nested_dict(args: list[str]) -> tp.Dict[str, tp.Any]:
"""
Parses a list of Bash-style arguments (e.g., --key=value) into a nested dict.
"""
nested_dict = {}
for arg in args:
# Split argument into key and value
key, value = arg.lstrip("--").split("=", 1)
# Convert flat key into a nested dictionary
keys = key.split(".")
current_level = nested_dict
for k in keys[:-1]:
current_level = current_level.setdefault(k, {})
current_level[keys[-1]] = value
return nested_dict



if __name__ == "__main__":
    # Build the model from command-line overrides such as --data.n_samples=200;
    # instantiating the pydantic model validates the config.
    # NOTE(review): a reviewer suggested replacing the local parser with
    # exca.ConfDict.from_args(sys.argv[1:]) -- confirm that API is available
    # in the installed exca version before switching.
    overrides = args_to_nested_dict(sys.argv[1:])
    experiment = Model(**overrides)
    print(experiment.infra.config)

    # Score
    print(experiment.score())
Loading