analysis_baseline.py
import sys
import logging
from sklearn.model_selection import LeaveOneGroupOut, ParameterGrid, cross_val_score
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from dataset import Dataset
from helpers import make_chunks_per_run, make_chunks_per_subjects
def _select_model(modelstr, params):
    '''
    Internal function to instantiate the model with its parameters for the baseline tests.
    sklearn will raise an error if the parameters do not match the model.
    Returns the model instance with the right parameters.
    Inputs:
        modelstr: string
        params: dict
    '''
    if modelstr == 'logistic':
        return LogisticRegression(**params)
    elif modelstr == 'ridge':
        return RidgeClassifier(**params)
    elif modelstr == 'linearsvc':
        return LinearSVC(**params)
    else:
        logging.error('Model {} not implemented. Aborting.'.format(modelstr))
        sys.exit(-1)
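
# Illustrative usage of _select_model (a sketch; the parameter values below are
# hypothetical and only show the expected call shape):
#   model = _select_model('logistic', {'C': 1.0, 'max_iter': 1000})
#   # -> LogisticRegression(C=1.0, max_iter=1000)
#   _select_model('mlp', {})  # unknown model: logs the error and exits
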
def _analysis_baseline_per_model(X, y, model, cv, chunks):
    '''
    Internal function to perform the cross-validation specified by cv and chunks.
    Applies the cross-validation to the model with X, y as samples and labels.
    Returns the array of scores from the sklearn function cross_val_score.
    Inputs:
        X: np.array of size (n_samples, n_features)
        y: np.array of size (n_samples,)
        model: sklearn classifier supported by cross_val_score
        cv: cross-validation generator supported by cross_val_score
        chunks: np.array of size (n_samples,) that contains the groups for cross-validation
    '''
    scores = cross_val_score(
        estimator=model,
        X=X,
        y=y,
        cv=cv,
        groups=chunks,
        scoring='accuracy',
        n_jobs=-1,
        verbose=1
    )
    return scores
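
# Illustrative call with synthetic data (a sketch, assuming numpy is imported
# as np; the shapes mirror the docstring above):
#   X = np.random.rand(20, 5)              # 20 samples, 5 features
#   y = np.random.randint(0, 2, size=20)   # binary labels
#   chunks = np.repeat(np.arange(4), 5)    # 4 groups of 5 samples each
#   scores = _analysis_baseline_per_model(
#       X, y, LogisticRegression(max_iter=1000), LeaveOneGroupOut(), chunks)
#   # scores contains one accuracy per left-out group (4 values here)
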
def analysis_baseline(datadir, cv_strategy, models, params_models, debug=False):
    '''
    Function to perform the baseline analysis and log the results in the logging file specified in run.py.
    The dataset is first loaded in the specified debug mode, then the CV strategy is set up.
    Finally, the function loops through the models and their parameter grids,
    performs CV on the data using the internal functions above, and logs the scores.
    Inputs:
        datadir: string, relative path to where the data is stored
        cv_strategy: string specifying the CV strategy ('per_run', 'per_subs' or 'random')
        models: list of strings naming the models to test
        params_models: list of dicts with the associated parameter grids for the models
        debug: bool to enable debug mode
    '''
    ## Loading data
    dataset = Dataset(datadir, debug)
    X, y = dataset.get_samples(), dataset.get_labels()
    ## Setting up CV strategy
    if cv_strategy == 'per_run':
        cv = LeaveOneGroupOut()
        chunks = make_chunks_per_run(dataset.nb_subs_, dataset.nb_runs_per_sub_)
    elif cv_strategy == 'per_subs':
        cv = LeaveOneGroupOut()
        chunks = make_chunks_per_subjects(dataset.nb_subs_)
    elif cv_strategy == 'random':
        cv = 5  # plain 5-fold CV, no grouping
        chunks = None
    else:
        logging.error('{} CV not implemented for this method. Aborting.'.format(cv_strategy))
        sys.exit(-1)
    for modelstr, params in zip(models, params_models):
        param_grid = ParameterGrid(param_grid=params)
        for p in param_grid:
            logging.info('\n------------------------')
            logging.info('Scores for the model {}'.format(str(modelstr)))
            try:
                model = _select_model(modelstr, p)
                # Scale features inside the pipeline so scaling is fit per CV fold
                pipe = Pipeline([('scaler', StandardScaler()), ('model', model)])
                accuracies = _analysis_baseline_per_model(X, y, pipe, cv, chunks)
                logging.info('Accuracy: {} +/- {}'.format(accuracies.mean(), accuracies.std()))
            except Exception:
                logging.exception('ERROR, could not perform CV')
            logging.info('With parameters: {}'.format(p))
            logging.info('------------------------')
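
# Minimal usage sketch. The real entry point is run.py (which also configures
# logging); the data directory and parameter grids below are hypothetical
# placeholders, shown only to illustrate the expected call shape.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    analysis_baseline(
        datadir='data/',                   # hypothetical path
        cv_strategy='random',              # 'per_run', 'per_subs' or 'random'
        models=['logistic', 'linearsvc'],
        params_models=[
            {'C': [0.1, 1.0], 'max_iter': [1000]},  # grid for LogisticRegression
            {'C': [1.0]},                            # grid for LinearSVC
        ],
        debug=True,
    )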