-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathGBDT.py
131 lines (110 loc) · 5.34 KB
/
GBDT.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
# -*- coding: utf-8 -*-
"""
GBDT (Gradient Boosting) hyper-parameter tuning demo.

Loads a pre-processed training set, fits a baseline
GradientBoostingClassifier, then tunes n_estimators, max_depth,
min_samples_split/leaf and max_features via grid search.

Created on Sat Mar 9 15:39:35 2019
@author: sun_y
"""
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics
# FIX: sklearn.cross_validation and sklearn.grid_search were removed in
# scikit-learn 0.20; GridSearchCV now lives in sklearn.model_selection.
# (The old `cross_validation` import was never used in this script.)
from sklearn.model_selection import GridSearchCV
import matplotlib.pylab as plt

## Load data: expects a CSV with a binary target 'Disbursed' and an 'ID' column.
train = pd.read_csv('train_modified.csv')
target = 'Disbursed'
IDcol = 'ID'
# NOTE(review): this result is discarded in script mode — wrap in print()
# if you want to inspect the class balance.
train['Disbursed'].value_counts()
# All columns except the target and the row identifier are features.
x_columns = [x for x in train.columns if x not in [target, IDcol]]
X = train[x_columns]
y = train['Disbursed']
## Baseline: train with all-default hyper-parameters to get a reference score.
gbm0 = GradientBoostingClassifier(random_state=10)
gbm0.fit(X, y)

# Evaluate on the training set itself (no hold-out here — these numbers
# measure fit, not generalization).
y_pred = gbm0.predict(X)
y_predprob = gbm0.predict_proba(X)[:, 1]

###############################
#####Accuracy : 0.9852
###AUC Score (Train): 0.900531
###############################
train_acc = metrics.accuracy_score(y.values, y_pred)
train_auc = metrics.roc_auc_score(y, y_predprob)
print(f"Accuracy : {train_acc:.4g}")
print(f"AUC Score (Train): {train_auc:f}")
### Find the best parameters, one group at a time.
## A: tune the number of boosting stages (n_estimators) first.
param_test1 = {'n_estimators': list(range(20, 81, 10))}
gsearch1 = GridSearchCV(
    estimator=GradientBoostingClassifier(
        learning_rate=0.1,
        min_samples_split=300,
        min_samples_leaf=20,
        max_depth=8,
        max_features='sqrt',
        subsample=0.8,
        random_state=10),
    param_grid=param_test1,
    scoring='roc_auc',
    # FIX: iid= was deprecated in sklearn 0.22 and removed in 0.24.
    cv=5)
gsearch1.fit(X, y)
# FIX: grid_scores_ was removed in sklearn 0.20 — per-candidate scores now
# live in cv_results_.  Also print() the results: a bare tuple expression is
# silently discarded when running as a script.
print(gsearch1.cv_results_['mean_test_score'],
      gsearch1.best_params_, gsearch1.best_score_)
## B: with n_estimators fixed at 60, tune tree depth and the split threshold.
param_test2 = {'max_depth': list(range(3, 14, 2)),
               'min_samples_split': list(range(100, 801, 200))}
gsearch2 = GridSearchCV(
    estimator=GradientBoostingClassifier(
        learning_rate=0.1,
        n_estimators=60,
        min_samples_leaf=20,
        max_features='sqrt',
        subsample=0.8,
        random_state=10),
    param_grid=param_test2,
    scoring='roc_auc',
    # FIX: iid= was deprecated in sklearn 0.22 and removed in 0.24.
    cv=5)
gsearch2.fit(X, y)
# FIX: grid_scores_ was removed in sklearn 0.20 — use cv_results_, and
# print() so the results are visible when running as a script.
print(gsearch2.cv_results_['mean_test_score'],
      gsearch2.best_params_, gsearch2.best_score_)
### C: with max_depth fixed at 7, jointly tune the leaf-size constraints.
param_test3 = {'min_samples_split': list(range(800, 1900, 200)),
               'min_samples_leaf': list(range(60, 101, 10))}
gsearch3 = GridSearchCV(
    estimator=GradientBoostingClassifier(
        learning_rate=0.1,
        n_estimators=60,
        max_depth=7,
        max_features='sqrt',
        subsample=0.8,
        random_state=10),
    param_grid=param_test3,
    scoring='roc_auc',
    # FIX: iid= was deprecated in sklearn 0.22 and removed in 0.24.
    cv=5)
gsearch3.fit(X, y)
# FIX: grid_scores_ was removed in sklearn 0.20 — use cv_results_, and
# print() so the results are visible when running as a script.
print(gsearch3.cv_results_['mean_test_score'],
      gsearch3.best_params_, gsearch3.best_score_)
### D: finally tune how many features each split may consider.
param_test4 = {'max_features': list(range(7, 20, 2))}
gsearch4 = GridSearchCV(
    estimator=GradientBoostingClassifier(
        learning_rate=0.1,
        n_estimators=60,
        max_depth=7,
        min_samples_leaf=60,
        min_samples_split=1200,
        subsample=0.8,
        random_state=10),
    param_grid=param_test4,
    scoring='roc_auc',
    # FIX: iid= was deprecated in sklearn 0.22 and removed in 0.24.
    cv=5)
gsearch4.fit(X, y)
# FIX: grid_scores_ was removed in sklearn 0.20 — use cv_results_, and
# print() so the results are visible when running as a script.
print(gsearch4.cv_results_['mean_test_score'],
      gsearch4.best_params_, gsearch4.best_score_)
#### Final model: decrease the learning rate 10x and scale n_estimators up 10x
#### (same total shrinkage), using the hyper-parameters found above.
final_params = dict(
    learning_rate=0.01,
    n_estimators=600,
    max_depth=7,
    min_samples_leaf=60,
    min_samples_split=1200,
    max_features=9,
    subsample=0.7,
    random_state=10,
)
gbm_new = GradientBoostingClassifier(**final_params)
gbm_new.fit(X, y)

# Training-set evaluation of the tuned model (same caveat as the baseline:
# these are fit metrics, not hold-out metrics).
y_pred = gbm_new.predict(X)
y_predprob = gbm_new.predict_proba(X)[:, 1]
final_acc = metrics.accuracy_score(y.values, y_pred)
final_auc = metrics.roc_auc_score(y, y_predprob)
print(f"Accuracy : {final_acc:.4g}")
print(f"AUC Score (Train): {final_auc:f}")