Merge pull request #354 from automl/development
Development
mfeurer authored Sep 19, 2017
2 parents 7d33420 + 42430d1 commit f4b72be
Showing 51 changed files with 911 additions and 1,069 deletions.
3 changes: 0 additions & 3 deletions CHANGES.md

This file was deleted.

19 changes: 19 additions & 0 deletions Dockerfile
@@ -0,0 +1,19 @@
+FROM ubuntu
+
+# System requirements
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    curl \
+    python3-pip \
+    swig \
+    && rm -rf /var/lib/apt/lists/*
+
+# Upgrade pip then install dependencies
+RUN pip3 install --upgrade pip
+RUN curl https://raw.githubusercontent.com/automl/auto-sklearn/master/requirements.txt \
+    | xargs -n 1 -L 1 pip3 install
+
+# Install
+RUN pip3 install \
+    auto-sklearn \
+    jupyter
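The image installs auto-sklearn and Jupyter; a quick smoke test one might run with Python 3 inside the container. This is a sketch under assumptions: the dataset and time limits are our choice, not part of the commit.

# illustrative smoke test for the container
import autosklearn.classification
import sklearn.datasets

X, y = sklearn.datasets.load_digits(return_X_y=True)
automl = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=60, per_run_time_limit=30)
automl.fit(X, y)
print(automl.score(X, y))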
2 changes: 1 addition & 1 deletion autosklearn/__init__.py
@@ -5,7 +5,7 @@

__MANDATORY_PACKAGES__ = '''
numpy>=1.9
-scikit-learn==0.18.1
+scikit-learn>=0.18.1,<0.19
smac==0.5.0
lockfile>=0.10
ConfigSpace>=0.3.3,<0.4
2 changes: 1 addition & 1 deletion autosklearn/__version__.py
@@ -1,4 +1,4 @@
"""Version information."""

# The following line *must* be the last in the module, exactly as formatted:
__version__ = "0.2.0"
__version__ = "0.2.1"
21 changes: 16 additions & 5 deletions autosklearn/automl.py
@@ -171,7 +171,7 @@ def fit(self, X, y,
raise ValueError('No metric given.')
if not isinstance(metric, Scorer):
raise ValueError('Metric must be instance of '
-'autosklearn.metric.Scorer.')
+'autosklearn.metrics.Scorer.')

if feat_type is not None and len(feat_type) != X.shape[1]:
raise ValueError('Array feat_type does not have same number of '
@@ -531,8 +531,8 @@ def predict(self, X, batch_size=None, n_jobs=1):
self._resampling_strategy not in \
['holdout', 'holdout-iterative-fit']:
raise NotImplementedError(
-'Predict is currently only implemented for resampling '
-'strategy %s.' % self._resampling_strategy)
+'Predict is currently not implemented for resampling '
+'strategy %s, please call refit().' % self._resampling_strategy)

if self.models_ is None or len(self.models_) == 0 or \
self.ensemble_ is None:
@@ -764,12 +764,23 @@ def sprint_statistics(self):
'limit: %d\n' % num_memout)
return sio.getvalue()

-def show_models(self):
+def get_models_with_weights(self):
if self.models_ is None or len(self.models_) == 0 or \
self.ensemble_ is None:
self._load_models()

-return self.ensemble_.pprint_ensemble_string(self.models_)
+return self.ensemble_.get_models_with_weights(self.models_)

+def show_models(self):
+    models_with_weights = self.get_models_with_weights()
+
+    with io.StringIO() as sio:
+        sio.write("[")
+        for weight, model in models_with_weights:
+            sio.write("(%f, %s),\n" % (weight, model))
+        sio.write("]")
+
+        return sio.getvalue()

def _create_search_space(self, tmp_dir, backend, datamanager,
include_estimators=None,
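The hunks above split the old show_models() into a structured accessor (get_models_with_weights) and a string renderer. A minimal usage sketch, assuming a fitted object named automl (the variable name is ours, not part of the commit):

# inspect the final ensemble via the new API
for weight, model in automl.get_models_with_weights():
    print('%f -> %s' % (weight, model))
print(automl.show_models())  # the same pairs rendered as a single string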
7 changes: 4 additions & 3 deletions autosklearn/ensembles/abstract_ensemble.py
@@ -42,8 +42,8 @@ def predict(self, base_models_predictions):
self

@abstractmethod
-def pprint_ensemble_string(self, models):
-    """Return a nicely-readable representation of the ensmble.
+def get_models_with_weights(self, models):
+    """Return a list of (weight, model) pairs
Parameters
----------
@@ -53,9 +53,10 @@ def pprint_ensemble_string(self, models):
Returns
-------
-str
+array : [(weight_1, model_1), ..., (weight_n, model_n)]
"""


@abstractmethod
def get_model_identifiers(self):
"""Return identifiers of models in the ensemble.
12 changes: 3 additions & 9 deletions autosklearn/ensembles/ensemble_selection.py
@@ -2,7 +2,6 @@
import random

import numpy as np
-import six

from autosklearn.constants import *
from autosklearn.ensembles.abstract_ensemble import AbstractEnsemble
@@ -204,9 +203,9 @@ def __str__(self):
enumerate(self.identifiers_)
if self.weights_[idx] > 0]))

-def pprint_ensemble_string(self, models):
+def get_models_with_weights(self, models):
output = []
-sio = six.StringIO()
+
for i, weight in enumerate(self.weights_):
identifier = self.identifiers_[i]
model = models[identifier]
@@ -215,12 +214,7 @@ def pprint_ensemble_string(self, models):

output.sort(reverse=True, key=lambda t: t[0])

sio.write("[")
for weight, model in output:
sio.write("(%f, %s),\n" % (weight, model))
sio.write("]")

return sio.getvalue()
return output

def get_model_identifiers(self):
return self.identifiers_
24 changes: 18 additions & 6 deletions autosklearn/estimators.py
@@ -73,6 +73,16 @@ def show_models(self):
"""
return self._automl.show_models()

+def get_models_with_weights(self):
+    """Return a list of (weight, model) pairs of the final ensemble found by auto-sklearn.
+
+    Returns
+    -------
+    [(weight_1, model_1), ..., (weight_n, model_n)]
+    """
+    return self._automl.get_models_with_weights()
+
@property
def cv_results_(self):
return self._automl.cv_results_
@@ -171,15 +181,17 @@ def __init__(self,
resampling_strategy : string, optional ('holdout')
    how to handle overfitting, might need 'resampling_strategy_arguments'
-   * 'holdout': 66:33 (train:test) split
-   * 'holdout-iterative-fit': 66:33 (train:test) split, calls iterative
+   * 'holdout': 67:33 (train:test) split
+   * 'holdout-iterative-fit': 67:33 (train:test) split, calls iterative
      fit where possible
    * 'cv': crossvalidation, requires 'folds'
-resampling_strategy_arguments : dict, optional if 'holdout' (None)
+resampling_strategy_arguments : dict, optional if 'holdout' (train_size default=0.67)
    Additional arguments for resampling_strategy
-   * 'holdout': None
-   * 'holdout-iterative-fit': None
+   ``train_size`` should be between 0.0 and 1.0 and represent the
+   proportion of the dataset to include in the train split.
+   * 'holdout': {'train_size': float}
+   * 'holdout-iterative-fit': {'train_size': float}
    * 'cv': {'folds': int}
tmp_folder : string, optional (None)
tmp_folder : string, optional (None)
@@ -339,7 +351,7 @@ def fit_ensemble(self, y, task=None, metric=None, precision='32',
introduced in `Getting Most out of Ensemble Selection`.
ensemble_size : int
-Size of the ensemble built by `Ensomble Selection`.
+Size of the ensemble built by `Ensemble Selection`.
Returns
-------
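The new resampling_strategy documentation above translates to a constructor call like this sketch (the 0.75 split is illustrative, not from the commit):

import autosklearn.classification

automl = autosklearn.classification.AutoSklearnClassifier(
    resampling_strategy='holdout',
    resampling_strategy_arguments={'train_size': 0.75})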
17 changes: 10 additions & 7 deletions autosklearn/evaluation/__init__.py
@@ -234,21 +234,24 @@ def run(self, config, instance=None,

def get_splitter(self, D):
y = D.data['Y_train'].ravel()

+train_size = 0.67
+if self.resampling_strategy_args:
+    train_size = self.resampling_strategy_args.get('train_size', train_size)
+test_size = 1 - train_size
if D.info['task'] in CLASSIFICATION_TASKS and \
D.info['task'] != MULTILABEL_CLASSIFICATION:

if self.resampling_strategy in ['holdout',
'holdout-iterative-fit']:
try:
-cv = StratifiedShuffleSplit(n_splits=1, train_size=0.67,
-                            test_size=0.33, random_state=1)
+cv = StratifiedShuffleSplit(n_splits=1, train_size=train_size,
+                            test_size=test_size, random_state=1)
test_cv = copy.deepcopy(cv)
next(test_cv.split(y, y))
except ValueError as e:
if 'The least populated class in y has only' in e.args[0]:
-cv = ShuffleSplit(n_splits=1, train_size=0.67,
-                  test_size=0.33, random_state=1)
+cv = ShuffleSplit(n_splits=1, train_size=train_size,
+                  test_size=test_size, random_state=1)
else:
raise

@@ -261,8 +264,8 @@ def get_splitter(self, D):
else:
if self.resampling_strategy in ['holdout',
'holdout-iterative-fit']:
-cv = ShuffleSplit(n_splits=1, train_size=0.67,
-                  test_size=0.33, random_state=1)
+cv = ShuffleSplit(n_splits=1, train_size=train_size,
+                  test_size=test_size, random_state=1)
elif self.resampling_strategy in ['cv', 'partial-cv',
'partial-cv-iterative-fit']:
cv = KFold(n_splits=self.resampling_strategy_args['folds'],
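The stratified-to-plain fallback above can be reproduced in isolation; a sketch of the same pattern with toy data (our example, not part of the commit):

import numpy as np
from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit

y = np.array([0, 0, 0, 1])  # class 1 has a single member
try:
    cv = StratifiedShuffleSplit(n_splits=1, train_size=0.67,
                                test_size=0.33, random_state=1)
    next(cv.split(y.reshape(-1, 1), y))  # raises for too-small classes
except ValueError as e:
    if 'The least populated class in y has only' in e.args[0]:
        # stratification impossible; fall back to a plain shuffle split
        cv = ShuffleSplit(n_splits=1, train_size=0.67,
                          test_size=0.33, random_state=1)
    else:
        raise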
8 changes: 6 additions & 2 deletions autosklearn/pipeline/base.py
@@ -136,8 +136,12 @@ def predict(self, X, batch_size=None):
if batch_size is None:
return super(BasePipeline, self).predict(X).astype(self._output_dtype)
else:
-if type(batch_size) is not int or batch_size <= 0:
-    raise Exception("batch_size must be a positive integer")
+if not isinstance(batch_size, int):
+    raise ValueError("Argument 'batch_size' must be of type int, "
+                     "but is '%s'" % type(batch_size))
+if batch_size <= 0:
+    raise ValueError("Argument 'batch_size' must be positive, "
+                     "but is %d" % batch_size)

else:
if self.num_targets == 1:
8 changes: 6 additions & 2 deletions autosklearn/pipeline/classification.py
@@ -127,8 +127,12 @@ def predict_proba(self, X, batch_size=None):
return self.steps[-1][-1].predict_proba(Xt)

else:
-if type(batch_size) is not int or batch_size <= 0:
-    raise Exception("batch_size must be a positive integer")
+if not isinstance(batch_size, int):
+    raise ValueError("Argument 'batch_size' must be of type int, "
+                     "but is '%s'" % type(batch_size))
+if batch_size <= 0:
+    raise ValueError("Argument 'batch_size' must be positive, "
+                     "but is %d" % batch_size)

else:
# Probe for the target array dimensions
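The stricter validation guards the batched prediction path; the underlying pattern looks roughly like this sketch (the helper name is ours, not from the codebase):

import numpy as np

def predict_proba_in_batches(model, X, batch_size):
    # predict slice by slice to bound peak memory, then stack the results
    chunks = [model.predict_proba(X[start:start + batch_size])
              for start in range(0, X.shape[0], batch_size)]
    return np.vstack(chunks)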
7 changes: 4 additions & 3 deletions autosklearn/pipeline/components/classification/qda.py
@@ -18,7 +18,8 @@ def __init__(self, reg_param, random_state=None):
def fit(self, X, Y):
import sklearn.discriminant_analysis

-estimator = sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis(self.reg_param)
+estimator = sklearn.discriminant_analysis.\
+    QuadraticDiscriminantAnalysis(reg_param=self.reg_param)

if len(Y.shape) == 2 and Y.shape[1] > 1:
self.estimator = sklearn.multiclass.OneVsRestClassifier(estimator, n_jobs=1)
@@ -68,8 +69,8 @@ def get_properties(dataset_properties=None):

@staticmethod
def get_hyperparameter_search_space(dataset_properties=None):
-reg_param = UniformFloatHyperparameter('reg_param', 0.0, 10.0,
-                                       default=0.5)
+reg_param = UniformFloatHyperparameter('reg_param', 0.0, 1.0,
+                                       default=0.0)
cs = ConfigurationSpace()
cs.add_hyperparameter(reg_param)
return cs
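The narrowed range matches scikit-learn's QDA regularization, where reg_param blends the per-class covariance with a diagonal term, so [0, 1] is the meaningful range. A quick sketch at the new default (the data is synthetic, our choice):

import numpy as np
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

rng = np.random.RandomState(1)
X = rng.randn(100, 4)
y = (X[:, 0] > 0).astype(int)
clf = QuadraticDiscriminantAnalysis(reg_param=0.0).fit(X, y)
print(clf.score(X, y))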
@@ -22,7 +22,8 @@ def fit(self, X, Y):
fit_intercept=self.fit_intercept,
tol=self.tol,
copy_X=False,
-    normalize=False)
+    normalize=False,
+    random_state=self.random_state)
self.estimator.fit(X, Y)
return self

4 changes: 2 additions & 2 deletions autosklearn/smbo.py
@@ -530,7 +530,6 @@ def run_smbo(self):

self.scenario = Scenario(scenario_dict)

-
# TODO rebuild target algorithm to be it's own target algorithm
# evaluator, which takes into account that a run can be killed prior
# to the model being fully fitted; thus putting intermediate results
@@ -545,8 +544,9 @@ def run_smbo(self):
include['preprocessor'] = self.include_preprocessors
elif self.exclude_preprocessors is not None:
exclude['preprocessor'] = self.exclude_preprocessors
+
if self.include_estimators is not None and \
-        self.exclude_preprocessors is not None:
+        self.exclude_estimators is not None:
raise ValueError('Cannot specify include_estimators and '
'exclude_estimators.')
elif self.include_estimators is not None:
3 changes: 2 additions & 1 deletion autosklearn/util/pipeline.py
@@ -25,8 +25,9 @@ def get_configuration_space(info,
include['preprocessor'] = include_preprocessors
elif exclude_preprocessors is not None:
exclude['preprocessor'] = exclude_preprocessors
+
if include_estimators is not None and \
-        exclude_preprocessors is not None:
+        exclude_estimators is not None:
raise ValueError('Cannot specify include_estimators and '
'exclude_estimators.')
elif include_estimators is not None:
12 changes: 2 additions & 10 deletions autosklearn/util/stopwatch.py
@@ -81,36 +81,28 @@ def insert_task(self, name, cpu_dur, wall_dur):
self._tasks[name].stop()
self._tasks[name]._wall_dur = wall_dur
self._tasks[name]._cpu_dur = cpu_dur
-else:
-    sys.stderr.write('You are already timing task: %s\n' % name)

def start_task(self, name):
if name not in self._tasks:
self._tasks[name] = TimingTask(name)
-else:
-    sys.stderr.write('You are already timing task: %s\n' % name)

def wall_elapsed(self, name):
tmp = time.time()
-try:
+if name in self._tasks:
if not self._tasks[name].wall_dur:
tsk_start = self._tasks[name].wall_tic
return tmp - tsk_start
else:
return self._tasks[name].wall_dur
-except KeyError:
-    sys.stderr.write('You are already timing task: %s\n' % name)

def cpu_elapsed(self, name):
tmp = time.clock()
-try:
+if name in self._tasks:
if not self._tasks[name].cpu_dur:
tsk_start = self._tasks[name].cpu_tic
return tmp - tsk_start
else:
return self._tasks[name].cpu_dur
-except KeyError:
-    sys.stderr.write('You are already timing task: %s\n' % name)

def stop_task(self, name):
try:
6 changes: 3 additions & 3 deletions doc/manual.rst
@@ -56,8 +56,8 @@ random forests.
>>> automl = autosklearn.classification.AutoSklearnClassifier(
>>> include_estimators=["random_forest", ], exclude_estimators=None,
>>> include_preprocessors=["no_preprocessing", ], exclude_preprocessors=None)
->>> cls.fit(X_train, y_train)
->>> predictions = cls.predict(X_test, y_test)
+>>> automl.fit(X_train, y_train)
+>>> predictions = automl.predict(X_test)

**Note:** The strings used to identify estimators and preprocessors are the filenames without *.py*.

@@ -129,4 +129,4 @@ set ``ensemble_size=1`` and ``initial_configurations_via_metalearning=0``:
An ensemble of size one will result in always choosing the current best model
according to its performance on the validation set. Setting the initial
configurations found by meta-learning to zero makes *auto-sklearn* use the
-regular SMAC algorithm for suggesting new hyperparameter configurations.
\ No newline at end of file
+regular SMAC algorithm for suggesting new hyperparameter configurations.
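Putting the two settings from this paragraph together, a minimal sketch (constructor call only, as described in the manual):

import autosklearn.classification

automl = autosklearn.classification.AutoSklearnClassifier(
    ensemble_size=1,
    initial_configurations_via_metalearning=0)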