AutoML v2 (WIP) #81

Draft · wants to merge 8 commits into base: master

Changes from 7 commits
2 changes: 2 additions & 0 deletions .gitignore
@@ -121,3 +121,5 @@ pytest_report.html
pytest_report_not_long.html
.DS_Store
/docs/*.bat

/*.xlsx
4 changes: 2 additions & 2 deletions aikit/cross_validation.py
@@ -99,10 +99,10 @@ def create_cv(cv=3, y=None, classifier=False, shuffle=False, random_state=None):
and (sklearn.model_selection._split.type_of_target(y) in ("binary", "multiclass"))
):

-    return sklearn.model_selection.StratifiedKFold(cv, shuffle=shuffle, random_state=random_state)
+    return sklearn.model_selection.StratifiedKFold(cv, shuffle=shuffle or random_state is not None, random_state=random_state)

else:
-    return sklearn.model_selection.KFold(cv, shuffle=shuffle, random_state=random_state)
+    return sklearn.model_selection.KFold(cv, shuffle=shuffle or random_state is not None, random_state=random_state)

-    if not hasattr(cv, "split") or isinstance(cv, str):
+    if not isinstance(cv, sklearn.model_selection._split.Iterable) or isinstance(cv, str):
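For context on this change: recent scikit-learn releases (0.24 and later) raise a ValueError when random_state is set while shuffle=False, so the patch turns shuffling on whenever a random_state is given. A minimal standalone sketch of the resulting behavior, not using aikit:

import numpy as np
from sklearn.model_selection import KFold

random_state = 123
shuffle = False

# KFold(3, shuffle=False, random_state=123) raises ValueError on recent
# scikit-learn; OR-ing shuffle with `random_state is not None` avoids that.
cv = KFold(3, shuffle=shuffle or random_state is not None, random_state=random_state)

X = np.arange(12).reshape(-1, 1)
for train_idx, test_idx in cv.split(X):
    print(len(train_idx), len(test_idx))  # 8 4 on each of the 3 folds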
2 changes: 2 additions & 0 deletions aikit/future/__init__.py
@@ -0,0 +1,2 @@
from . import _class_registration

98 changes: 98 additions & 0 deletions aikit/future/_class_registration.py
@@ -0,0 +1,98 @@
from sklearn.pipeline import Pipeline
from aikit.pipeline import GraphPipeline

from aikit.transformers import ColumnsSelector
from aikit.models import OutSamplerTransformer, StackerClassifier, StackerRegressor, KMeansWrapper, DBSCANWrapper, \
    AgglomerativeClusteringWrapper

from aikit.transformers import FeaturesSelectorClassifier, FeaturesSelectorRegressor, TruncatedSVDWrapper, PassThrough
from aikit.transformers import PCAWrapper
from aikit.transformers import BoxCoxTargetTransformer, NumImputer, KMeansTransformer, CdfScaler
from aikit.transformers import Word2VecVectorizer, CountVectorizerWrapper, Char2VecVectorizer
from aikit.transformers import TextNltkProcessing, TextDefaultProcessing, TextDigitAnonymizer
from aikit.transformers import TargetEncoderClassifier, TargetEncoderEntropyClassifier, TargetEncoderRegressor
from aikit.transformers import NumericalEncoder

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, ExtraTreesClassifier, ExtraTreesRegressor
from sklearn.linear_model import LogisticRegression, Ridge, Lasso

from .util import CLASS_REGISTRY

try:
    import nltk
except ImportError:
    nltk = None
    print("NLTK not available, AutoML won't run with NLTK transformers")

try:
    import gensim
except ImportError:
    gensim = None
    print("Gensim not available, AutoML won't run with Gensim models")

try:
    import lightgbm
except ImportError:
    lightgbm = None
    print("LightGBM not available, AutoML won't run with LightGBM models")

# Pipelines
CLASS_REGISTRY.add_klass(PassThrough)
CLASS_REGISTRY.add_klass(Pipeline)
CLASS_REGISTRY.add_klass(GraphPipeline)
CLASS_REGISTRY.add_klass(ColumnsSelector)

# Stacking tools
CLASS_REGISTRY.add_klass(OutSamplerTransformer)
CLASS_REGISTRY.add_klass(StackerRegressor)
CLASS_REGISTRY.add_klass(StackerClassifier)

# Feature selection
CLASS_REGISTRY.add_klass(FeaturesSelectorClassifier)
CLASS_REGISTRY.add_klass(FeaturesSelectorRegressor)

# Text vectorizers
CLASS_REGISTRY.add_klass(CountVectorizerWrapper)
if gensim is not None:
    CLASS_REGISTRY.add_klass(Word2VecVectorizer)
    CLASS_REGISTRY.add_klass(Char2VecVectorizer)

# Text preprocessors
if nltk is not None:
    CLASS_REGISTRY.add_klass(TextNltkProcessing)
CLASS_REGISTRY.add_klass(TextDefaultProcessing)
CLASS_REGISTRY.add_klass(TextDigitAnonymizer)

# Transformers
CLASS_REGISTRY.add_klass(TruncatedSVDWrapper)
CLASS_REGISTRY.add_klass(PCAWrapper)
CLASS_REGISTRY.add_klass(BoxCoxTargetTransformer)
CLASS_REGISTRY.add_klass(NumImputer)
CLASS_REGISTRY.add_klass(CdfScaler)
CLASS_REGISTRY.add_klass(KMeansTransformer)

# Category encoders
CLASS_REGISTRY.add_klass(NumericalEncoder)
CLASS_REGISTRY.add_klass(TargetEncoderClassifier)
CLASS_REGISTRY.add_klass(TargetEncoderEntropyClassifier)
CLASS_REGISTRY.add_klass(TargetEncoderRegressor)

# Classifiers
CLASS_REGISTRY.add_klass(RandomForestClassifier)
CLASS_REGISTRY.add_klass(ExtraTreesClassifier)
CLASS_REGISTRY.add_klass(LogisticRegression)
CLASS_REGISTRY.add_klass(Lasso)
if lightgbm is not None:
    CLASS_REGISTRY.add_klass(lightgbm.LGBMClassifier)

# Regressors
CLASS_REGISTRY.add_klass(RandomForestRegressor)
CLASS_REGISTRY.add_klass(ExtraTreesRegressor)
CLASS_REGISTRY.add_klass(Ridge)
if lightgbm is not None:
    CLASS_REGISTRY.add_klass(lightgbm.LGBMRegressor)

# Clustering
CLASS_REGISTRY.add_klass(KMeansWrapper)
CLASS_REGISTRY.add_klass(DBSCANWrapper)
CLASS_REGISTRY.add_klass(AgglomerativeClusteringWrapper)
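For readers unfamiliar with this pattern: CLASS_REGISTRY (imported from .util) maps names to classes so the AutoML engine can reinstantiate models from serialized specifications. Its real implementation is not part of this diff; a minimal sketch of what an add_klass-style registry might look like:

class ClassRegistry:
    """Name-to-class mapping used to rebuild models from their names."""

    def __init__(self):
        self._klasses = {}

    def add_klass(self, klass):
        # Register the class under its own name; reject duplicate names.
        name = klass.__name__
        if name in self._klasses:
            raise ValueError(f"Class already registered: {name}")
        self._klasses[name] = klass

    def get(self, name):
        return self._klasses[name]


CLASS_REGISTRY = ClassRegistry()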
15 changes: 15 additions & 0 deletions aikit/future/automl/__init__.py
@@ -0,0 +1,15 @@
from ._config import AutoMlConfig
from ._job import JobConfig, load_job_config_from_json
from ._automl import AutoMl, TimeBudget, AutoMlBudget
from . import registry
from ._registry import MODEL_REGISTRY

__all__ = [
    "AutoMlConfig",
    "AutoMl",
    "AutoMlBudget",
    "TimeBudget",
    "JobConfig",
    "load_job_config_from_json",
    "MODEL_REGISTRY"
]
165 changes: 165 additions & 0 deletions aikit/future/automl/__main__.py
@@ -0,0 +1,165 @@
import logging
import os
import uuid
# Silence some warning categories for debugging purposes
from warnings import simplefilter

import pandas as pd
import typer
from sklearn.exceptions import ConvergenceWarning

# import scorers to add custom aikit scorers in scikit-learn SCORERS list
import aikit.scorer # noqa
from aikit.datasets import load_dataset, DatasetEnum
from aikit.future.automl import AutoMl, TimeBudget, AutoMlConfig, load_job_config_from_json, JobConfig
from aikit.future.automl._automl import ModelCountBudget
from aikit.future.automl.backends import get_backend, filter_backend_kwargs
from aikit.future.automl.guider import AutoMlModelGuider
from aikit.future.automl.result import AutoMlResultReader
from aikit.future.automl.serialization import Format

simplefilter(action='ignore', category=FutureWarning)
simplefilter(action='ignore', category=ConvergenceWarning)
simplefilter(action='ignore', category=UserWarning)


app = typer.Typer()

# Configure logging
logging.basicConfig(level=logging.INFO)
# Raise the log level of noisy third-party libraries for debugging purposes
logging.getLogger("gensim").setLevel(logging.WARNING)

_logger = logging.getLogger(__name__)


@app.command()
def run(data: str,
        config_path: str = None,

[sonatype-lift] Incompatible variable type: config_path is declared to have type str but is used as type None. 7 similar findings in this PR: aikit/future/automl/__main__.py lines 41-45, and aikit/future/automl/backends/_dask.py line 17 (twice). A hedged fix using typing.Optional is sketched after the run command below.

        target: str = "target",
        backend: str = "sequential",
        session: str = None,
        cv: int = None,
        baseline: float = None,
        budget_model_count: int = None,
        budget_time: int = None,
        dask_storage_path: str = os.path.join(os.path.expanduser("~"), ".aikit", "working_dir"),
        dask_cluster: str = "local",
        dask_num_workers: int = 1):

    if session is None:
        session = str(uuid.uuid4())
    _logger.info(f"Start AutoML, session: {session}")

    # Register in this dictionary all arguments that must be passed to the backend
    backend_kwargs = {
        "dask_storage_path": dask_storage_path,
        "dask_cluster": dask_cluster,
        "dask_num_workers": dask_num_workers,
    }
    backend_kwargs = filter_backend_kwargs(backend, **backend_kwargs)

    if data in DatasetEnum.alls:
        df_train, y_train, _, _, _ = load_dataset(data)
    else:
        # TODO: load data from filesystem
        raise NotImplementedError(f"Unknown dataset: {data}")
    automl_config = AutoMlConfig(X=df_train, y=y_train)
    automl_config.guess_everything()

    if config_path is not None:
        job_config = load_job_config_from_json(config_path)
    else:
        job_config = JobConfig()
    if cv is not None:
        job_config.cv = cv
    if baseline is not None:
        job_config.baseline_score = baseline
    if job_config.cv is None:
        job_config.guess_cv(automl_config)
    if job_config.scoring is None:
        job_config.guess_scoring(automl_config)

    if budget_time is not None:
        budget = TimeBudget(budget_time)
    elif budget_model_count is not None:
        budget = ModelCountBudget(budget_model_count)
    else:
        raise ValueError("'budget_time' or 'budget_model_count' must be set")

    # TODO: force seed of workers in the backend
    with get_backend(backend, session=session, **backend_kwargs) as backend:
        # TODO: add dedicated methods in backend to write common data
        backend.get_data_loader().write(key="X", path="data", data=df_train, serialization_format=Format.PICKLE)
        backend.get_data_loader().write(key="y", path="data", data=y_train, serialization_format=Format.PICKLE)
        backend.get_data_loader().write(key="groups", path="data", data=None, serialization_format=Format.PICKLE)
        backend.get_data_loader().write(key="automl_config", path="data", data=automl_config,
                                        serialization_format=Format.PICKLE)
        backend.get_data_loader().write(key="job_config", path="data", data=job_config,
                                        serialization_format=Format.PICKLE)

        result_reader = AutoMlResultReader(backend.get_data_loader())

        automl_guider = AutoMlModelGuider(result_reader=result_reader,
                                          job_config=job_config)

        automl = AutoMl(automl_config=automl_config,
                        job_config=job_config,
                        backend=backend,
                        automl_guider=automl_guider,
                        budget=budget,
                        random_state=123)

        automl.search_models()

        df_result = result_reader.load_all_results(aggregate=True)
        print(df_result)

    _logger.info(f"Finished searching models, session: {session}")


@app.command()
def result(session: str,
           output_path: str = ".",
           backend: str = "sequential",
           dask_storage_path: str = os.path.join(os.path.expanduser("~"), ".aikit", "working_dir")):

    # Register in this dictionary all arguments that must be passed to the backend
    backend_kwargs = {
        "dask_storage_path": dask_storage_path,
    }
    backend_kwargs = filter_backend_kwargs(backend, **backend_kwargs)

    with get_backend(backend, session=session, **backend_kwargs) as backend:
        result_reader = AutoMlResultReader(backend.get_data_loader())

        df_results = result_reader.load_all_results()
        df_additional_results = result_reader.load_additional_results()
        df_params = result_reader.load_all_params()
        df_errors = result_reader.load_all_errors()
        df_params_other = result_reader.load_other_params()

        df_merged_result = pd.merge(df_params, df_results, how="inner", on="job_id")
        df_merged_result = pd.merge(df_merged_result, df_params_other, how="inner", on="job_id")
        if df_additional_results.shape[0] > 0:
            df_merged_result = pd.merge(df_merged_result, df_additional_results, how="inner", on="job_id")

        df_merged_error = pd.merge(df_params, df_errors, how="inner", on="job_id")

        result_filename = os.path.join(output_path, "result.xlsx")
        try:
            df_merged_result.to_excel(result_filename, index=False)
            _logger.info(f"Result file saved: {result_filename}")
        except:  # noqa
            _logger.warning(f"Error saving result file ({result_filename})", exc_info=True)

        error_filename = os.path.join(output_path, "error.xlsx")
        try:
            df_merged_error.to_excel(error_filename, index=False)
            _logger.info(f"Error file saved: {error_filename}")
        except:  # noqa
            _logger.warning(f"Error saving error file ({error_filename})", exc_info=True)


if __name__ == '__main__':
    app()
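
Usage sketch for the two commands above, assuming the module is launched with python -m (the file is automl/__main__.py) and typer's default dash-separated option names; the dataset name and session id are illustrative:

# Search models on a built-in dataset with a fixed model-count budget:
python -m aikit.future.automl run titanic --budget-model-count 50

# Export aggregated results of a finished session to result.xlsx / error.xlsx:
python -m aikit.future.automl result 1b2c3d4e --output-path .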