Merge pull request #354 from automl/development
Development
mfeurer authored Sep 19, 2017
2 parents 7d33420 + 42430d1 commit f4b72be
Showing 51 changed files with 911 additions and 1,069 deletions.
3 changes: 0 additions & 3 deletions CHANGES.md

This file was deleted.

19 changes: 19 additions & 0 deletions Dockerfile
@@ -0,0 +1,19 @@
+FROM ubuntu
+
+# System requirements
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    curl \
+    python3-pip \
+    swig \
+    && rm -rf /var/lib/apt/lists/*
+
+# Upgrade pip then install dependencies
+RUN pip3 install --upgrade pip
+RUN curl https://raw.githubusercontent.com/automl/auto-sklearn/master/requirements.txt \
+    | xargs -n 1 -L 1 pip3 install
+
+# Install
+RUN pip3 install \
+    auto-sklearn \
+    jupyter
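The image installs auto-sklearn and Jupyter; a quick smoke test one might run with Python 3 inside the container. This is a sketch under assumptions: the dataset and time limits are our choice, not part of the commit.

# illustrative smoke test for the container
import autosklearn.classification
import sklearn.datasets

X, y = sklearn.datasets.load_digits(return_X_y=True)
automl = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=60, per_run_time_limit=30)
automl.fit(X, y)
print(automl.score(X, y))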
2 changes: 1 addition & 1 deletion autosklearn/__init__.py
@@ -5,7 +5,7 @@

__MANDATORY_PACKAGES__ = '''
numpy>=1.9
-scikit-learn==0.18.1
+scikit-learn>=0.18.1,<0.19
smac==0.5.0
lockfile>=0.10
ConfigSpace>=0.3.3,<0.4
2 changes: 1 addition & 1 deletion autosklearn/__version__.py
@@ -1,4 +1,4 @@
"""Version information."""

# The following line *must* be the last in the module, exactly as formatted:
__version__ = "0.2.0"
__version__ = "0.2.1"
21 changes: 16 additions & 5 deletions autosklearn/automl.py
@@ -171,7 +171,7 @@ def fit(self, X, y,
raise ValueError('No metric given.')
if not isinstance(metric, Scorer):
raise ValueError('Metric must be instance of '
-'autosklearn.metric.Scorer.')
+'autosklearn.metrics.Scorer.')

if feat_type is not None and len(feat_type) != X.shape[1]:
raise ValueError('Array feat_type does not have same number of '
@@ -531,8 +531,8 @@ def predict(self, X, batch_size=None, n_jobs=1):
self._resampling_strategy not in \
['holdout', 'holdout-iterative-fit']:
raise NotImplementedError(
-'Predict is currently only implemented for resampling '
-'strategy %s.' % self._resampling_strategy)
+'Predict is currently not implemented for resampling '
+'strategy %s, please call refit().' % self._resampling_strategy)

if self.models_ is None or len(self.models_) == 0 or \
self.ensemble_ is None:
@@ -764,12 +764,23 @@ def sprint_statistics(self):
'limit: %d\n' % num_memout)
return sio.getvalue()

-def show_models(self):
+def get_models_with_weights(self):
if self.models_ is None or len(self.models_) == 0 or \
self.ensemble_ is None:
self._load_models()

-return self.ensemble_.pprint_ensemble_string(self.models_)
+return self.ensemble_.get_models_with_weights(self.models_)

+def show_models(self):
+    models_with_weights = self.get_models_with_weights()
+
+    with io.StringIO() as sio:
+        sio.write("[")
+        for weight, model in models_with_weights:
+            sio.write("(%f, %s),\n" % (weight, model))
+        sio.write("]")
+
+        return sio.getvalue()

def _create_search_space(self, tmp_dir, backend, datamanager,
include_estimators=None,
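The hunks above split the old show_models() into a structured accessor (get_models_with_weights) and a string renderer. A minimal usage sketch, assuming a fitted object named automl (the variable name is ours, not part of the commit):

# inspect the final ensemble via the new API
for weight, model in automl.get_models_with_weights():
    print('%f -> %s' % (weight, model))
print(automl.show_models())  # the same pairs rendered as a single string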
7 changes: 4 additions & 3 deletions autosklearn/ensembles/abstract_ensemble.py
@@ -42,8 +42,8 @@ def predict(self, base_models_predictions):
self

@abstractmethod
-def pprint_ensemble_string(self, models):
-    """Return a nicely-readable representation of the ensmble.
+def get_models_with_weights(self, models):
+    """Return a list of (weight, model) pairs
Parameters
----------
@@ -53,9 +53,10 @@ def pprint_ensemble_string(self, models):
Returns
-------
-str
+array : [(weight_1, model_1), ..., (weight_n, model_n)]
"""


@abstractmethod
def get_model_identifiers(self):
"""Return identifiers of models in the ensemble.
12 changes: 3 additions & 9 deletions autosklearn/ensembles/ensemble_selection.py
@@ -2,7 +2,6 @@
import random

import numpy as np
-import six

from autosklearn.constants import *
from autosklearn.ensembles.abstract_ensemble import AbstractEnsemble
@@ -204,9 +203,9 @@ def __str__(self):
enumerate(self.identifiers_)
if self.weights_[idx] > 0]))

-def pprint_ensemble_string(self, models):
+def get_models_with_weights(self, models):
output = []
-sio = six.StringIO()
+
for i, weight in enumerate(self.weights_):
identifier = self.identifiers_[i]
model = models[identifier]
@@ -215,12 +214,7 @@ def pprint_ensemble_string(self, models):

output.sort(reverse=True, key=lambda t: t[0])

sio.write("[")
for weight, model in output:
sio.write("(%f, %s),\n" % (weight, model))
sio.write("]")

return sio.getvalue()
return output

def get_model_identifiers(self):
return self.identifiers_
24 changes: 18 additions & 6 deletions autosklearn/estimators.py
@@ -73,6 +73,16 @@ def show_models(self):
"""
return self._automl.show_models()

+def get_models_with_weights(self):
+    """Return a list of (weight, model) pairs of the final ensemble found by auto-sklearn.
+
+    Returns
+    -------
+    [(weight_1, model_1), ..., (weight_n, model_n)]
+    """
+    return self._automl.get_models_with_weights()
+
@property
def cv_results_(self):
return self._automl.cv_results_
@@ -171,15 +181,17 @@ def __init__(self,
resampling_strategy : string, optional ('holdout')
    how to handle overfitting, might need 'resampling_strategy_arguments'
-   * 'holdout': 66:33 (train:test) split
-   * 'holdout-iterative-fit': 66:33 (train:test) split, calls iterative
+   * 'holdout': 67:33 (train:test) split
+   * 'holdout-iterative-fit': 67:33 (train:test) split, calls iterative
      fit where possible
    * 'cv': crossvalidation, requires 'folds'
-resampling_strategy_arguments : dict, optional if 'holdout' (None)
+resampling_strategy_arguments : dict, optional if 'holdout' (train_size default=0.67)
    Additional arguments for resampling_strategy
-   * 'holdout': None
-   * 'holdout-iterative-fit': None
+   ``train_size`` should be between 0.0 and 1.0 and represent the
+   proportion of the dataset to include in the train split.
+   * 'holdout': {'train_size': float}
+   * 'holdout-iterative-fit': {'train_size': float}
    * 'cv': {'folds': int}
tmp_folder : string, optional (None)
tmp_folder : string, optional (None)
@@ -339,7 +351,7 @@ def fit_ensemble(self, y, task=None, metric=None, precision='32',
introduced in `Getting Most out of Ensemble Selection`.
ensemble_size : int
-Size of the ensemble built by `Ensomble Selection`.
+Size of the ensemble built by `Ensemble Selection`.
Returns
-------
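The new resampling_strategy documentation above translates to a constructor call like this sketch (the 0.75 split is illustrative, not from the commit):

import autosklearn.classification

automl = autosklearn.classification.AutoSklearnClassifier(
    resampling_strategy='holdout',
    resampling_strategy_arguments={'train_size': 0.75})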
17 changes: 10 additions & 7 deletions autosklearn/evaluation/__init__.py
@@ -234,21 +234,24 @@ def run(self, config, instance=None,

def get_splitter(self, D):
y = D.data['Y_train'].ravel()

+train_size = 0.67
+if self.resampling_strategy_args:
+    train_size = self.resampling_strategy_args.get('train_size', train_size)
+test_size = 1 - train_size
if D.info['task'] in CLASSIFICATION_TASKS and \
D.info['task'] != MULTILABEL_CLASSIFICATION:

if self.resampling_strategy in ['holdout',
'holdout-iterative-fit']:
try:
-cv = StratifiedShuffleSplit(n_splits=1, train_size=0.67,
-                            test_size=0.33, random_state=1)
+cv = StratifiedShuffleSplit(n_splits=1, train_size=train_size,
+                            test_size=test_size, random_state=1)
test_cv = copy.deepcopy(cv)
next(test_cv.split(y, y))
except ValueError as e:
if 'The least populated class in y has only' in e.args[0]:
-cv = ShuffleSplit(n_splits=1, train_size=0.67,
-                  test_size=0.33, random_state=1)
+cv = ShuffleSplit(n_splits=1, train_size=train_size,
+                  test_size=test_size, random_state=1)
else:
raise

@@ -261,8 +264,8 @@ def get_splitter(self, D):
else:
if self.resampling_strategy in ['holdout',
'holdout-iterative-fit']:
-cv = ShuffleSplit(n_splits=1, train_size=0.67,
-                  test_size=0.33, random_state=1)
+cv = ShuffleSplit(n_splits=1, train_size=train_size,
+                  test_size=test_size, random_state=1)
elif self.resampling_strategy in ['cv', 'partial-cv',
'partial-cv-iterative-fit']:
cv = KFold(n_splits=self.resampling_strategy_args['folds'],
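The stratified-to-plain fallback above can be reproduced in isolation; a sketch of the same pattern with toy data (our example, not part of the commit):

import numpy as np
from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit

y = np.array([0, 0, 0, 1])  # class 1 has a single member
try:
    cv = StratifiedShuffleSplit(n_splits=1, train_size=0.67,
                                test_size=0.33, random_state=1)
    next(cv.split(y.reshape(-1, 1), y))  # raises for too-small classes
except ValueError as e:
    if 'The least populated class in y has only' in e.args[0]:
        # stratification impossible; fall back to a plain shuffle split
        cv = ShuffleSplit(n_splits=1, train_size=0.67,
                          test_size=0.33, random_state=1)
    else:
        raise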
8 changes: 6 additions & 2 deletions autosklearn/pipeline/base.py
@@ -136,8 +136,12 @@ def predict(self, X, batch_size=None):
if batch_size is None:
return super(BasePipeline, self).predict(X).astype(self._output_dtype)
else:
-if type(batch_size) is not int or batch_size <= 0:
-    raise Exception("batch_size must be a positive integer")
+if not isinstance(batch_size, int):
+    raise ValueError("Argument 'batch_size' must be of type int, "
+                     "but is '%s'" % type(batch_size))
+if batch_size <= 0:
+    raise ValueError("Argument 'batch_size' must be positive, "
+                     "but is %d" % batch_size)

else:
if self.num_targets == 1:
8 changes: 6 additions & 2 deletions autosklearn/pipeline/classification.py
@@ -127,8 +127,12 @@ def predict_proba(self, X, batch_size=None):
return self.steps[-1][-1].predict_proba(Xt)

else:
-if type(batch_size) is not int or batch_size <= 0:
-    raise Exception("batch_size must be a positive integer")
+if not isinstance(batch_size, int):
+    raise ValueError("Argument 'batch_size' must be of type int, "
+                     "but is '%s'" % type(batch_size))
+if batch_size <= 0:
+    raise ValueError("Argument 'batch_size' must be positive, "
+                     "but is %d" % batch_size)

else:
# Probe for the target array dimensions
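The stricter validation guards the batched prediction path; the underlying pattern looks roughly like this sketch (the helper name is ours, not from the codebase):

import numpy as np

def predict_proba_in_batches(model, X, batch_size):
    # predict slice by slice to bound peak memory, then stack the results
    chunks = [model.predict_proba(X[start:start + batch_size])
              for start in range(0, X.shape[0], batch_size)]
    return np.vstack(chunks)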
7 changes: 4 additions & 3 deletions autosklearn/pipeline/components/classification/qda.py
@@ -18,7 +18,8 @@ def __init__(self, reg_param, random_state=None):
def fit(self, X, Y):
import sklearn.discriminant_analysis

-estimator = sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis(self.reg_param)
+estimator = sklearn.discriminant_analysis.\
+    QuadraticDiscriminantAnalysis(reg_param=self.reg_param)

if len(Y.shape) == 2 and Y.shape[1] > 1:
self.estimator = sklearn.multiclass.OneVsRestClassifier(estimator, n_jobs=1)
@@ -68,8 +69,8 @@ def get_properties(dataset_properties=None):

@staticmethod
def get_hyperparameter_search_space(dataset_properties=None):
-reg_param = UniformFloatHyperparameter('reg_param', 0.0, 10.0,
-                                       default=0.5)
+reg_param = UniformFloatHyperparameter('reg_param', 0.0, 1.0,
+                                       default=0.0)
cs = ConfigurationSpace()
cs.add_hyperparameter(reg_param)
return cs
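The narrowed range matches scikit-learn's QDA regularization, where reg_param blends the per-class covariance with a diagonal term, so [0, 1] is the meaningful range. A quick sketch at the new default (the data is synthetic, our choice):

import numpy as np
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

rng = np.random.RandomState(1)
X = rng.randn(100, 4)
y = (X[:, 0] > 0).astype(int)
clf = QuadraticDiscriminantAnalysis(reg_param=0.0).fit(X, y)
print(clf.score(X, y))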
@@ -22,7 +22,8 @@ def fit(self, X, Y):
fit_intercept=self.fit_intercept,
tol=self.tol,
copy_X=False,
-    normalize=False)
+    normalize=False,
+    random_state=self.random_state)
self.estimator.fit(X, Y)
return self

4 changes: 2 additions & 2 deletions autosklearn/smbo.py
@@ -530,7 +530,6 @@ def run_smbo(self):

self.scenario = Scenario(scenario_dict)

-
# TODO rebuild target algorithm to be it's own target algorithm
# evaluator, which takes into account that a run can be killed prior
# to the model being fully fitted; thus putting intermediate results
@@ -545,8 +544,9 @@ def run_smbo(self):
include['preprocessor'] = self.include_preprocessors
elif self.exclude_preprocessors is not None:
exclude['preprocessor'] = self.exclude_preprocessors
+
if self.include_estimators is not None and \
-        self.exclude_preprocessors is not None:
+        self.exclude_estimators is not None:
raise ValueError('Cannot specify include_estimators and '
'exclude_estimators.')
elif self.include_estimators is not None:
3 changes: 2 additions & 1 deletion autosklearn/util/pipeline.py
@@ -25,8 +25,9 @@ def get_configuration_space(info,
include['preprocessor'] = include_preprocessors
elif exclude_preprocessors is not None:
exclude['preprocessor'] = exclude_preprocessors
+
if include_estimators is not None and \
-        exclude_preprocessors is not None:
+        exclude_estimators is not None:
raise ValueError('Cannot specify include_estimators and '
'exclude_estimators.')
elif include_estimators is not None:
12 changes: 2 additions & 10 deletions autosklearn/util/stopwatch.py
@@ -81,36 +81,28 @@ def insert_task(self, name, cpu_dur, wall_dur):
self._tasks[name].stop()
self._tasks[name]._wall_dur = wall_dur
self._tasks[name]._cpu_dur = cpu_dur
-else:
-    sys.stderr.write('You are already timing task: %s\n' % name)

def start_task(self, name):
if name not in self._tasks:
self._tasks[name] = TimingTask(name)
-else:
-    sys.stderr.write('You are already timing task: %s\n' % name)

def wall_elapsed(self, name):
tmp = time.time()
-try:
+if name in self._tasks:
if not self._tasks[name].wall_dur:
tsk_start = self._tasks[name].wall_tic
return tmp - tsk_start
else:
return self._tasks[name].wall_dur
-except KeyError:
-    sys.stderr.write('You are already timing task: %s\n' % name)

def cpu_elapsed(self, name):
tmp = time.clock()
-try:
+if name in self._tasks:
if not self._tasks[name].cpu_dur:
tsk_start = self._tasks[name].cpu_tic
return tmp - tsk_start
else:
return self._tasks[name].cpu_dur
-except KeyError:
-    sys.stderr.write('You are already timing task: %s\n' % name)

def stop_task(self, name):
try:
6 changes: 3 additions & 3 deletions doc/manual.rst
@@ -56,8 +56,8 @@ random forests.
>>> automl = autosklearn.classification.AutoSklearnClassifier(
>>> include_estimators=["random_forest", ], exclude_estimators=None,
>>> include_preprocessors=["no_preprocessing", ], exclude_preprocessors=None)
->>> cls.fit(X_train, y_train)
->>> predictions = cls.predict(X_test, y_test)
+>>> automl.fit(X_train, y_train)
+>>> predictions = automl.predict(X_test)

**Note:** The strings used to identify estimators and preprocessors are the filenames without *.py*.

@@ -129,4 +129,4 @@ set ``ensemble_size=1`` and ``initial_configurations_via_metalearning=0``:
An ensemble of size one will result in always choosing the current best model
according to its performance on the validation set. Setting the initial
configurations found by meta-learning to zero makes *auto-sklearn* use the
-regular SMAC algorithm for suggesting new hyperparameter configurations.
\ No newline at end of file
+regular SMAC algorithm for suggesting new hyperparameter configurations.
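Putting the two settings from this paragraph together, a minimal sketch (constructor call only, as described in the manual):

import autosklearn.classification

automl = autosklearn.classification.AutoSklearnClassifier(
    ensemble_size=1,
    initial_configurations_via_metalearning=0)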