-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathMLmodel.py
399 lines (278 loc) · 13.5 KB
/
MLmodel.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
# -*- coding: utf-8 -*-
"""Deliverable3_MLModel.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1ijOhsfSW1xanxiNoCrAv5nDk7gHzE_LS
"""
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pandas as pd
import os
from rds import db_password
from psycopg2 import sql, connect
# Open the PostgreSQL connection that the rest of the script reads from.
try:
    # declare a new PostgreSQL connection object
    conn = connect(
        dbname="car_db",
        user="postgres",
        host="final-project.cn1djdbx7lfi.us-east-1.rds.amazonaws.com",
        port="5432",
        password=db_password,
    )
except Exception as err:
    # Report the failure and re-raise: the code that follows needs a live
    # connection, so carrying on with `conn = None` would only fail later
    # with an unrelated AttributeError.
    print("psycopg2 connect() ERROR:", err)
    raise
else:
    # print the connection if successful
    print("psycopg2 connection:", conn)
# Read the full crash table into memory and wrap it in a DataFrame.
cr = conn.cursor()
try:
    cr.execute('SELECT * FROM total_crash_data;')
    tmp = cr.fetchall()
    # Extract the column names (first field of each cursor description entry)
    col_names = [elt[0] for elt in cr.description]
finally:
    # Release the cursor even if the query or fetch fails.
    cr.close()
# Create the dataframe, passing in the list of col_names extracted from the description
df = pd.DataFrame(tmp, columns=col_names)
df.head()
# Print each column label of the raw frame on its own line.
for column_label in list(df.columns):
    print(column_label)
"""# Preprocessing Data"""
# Working subset: the target ('Crash Severity') plus the features the author
# expects to correlate strongly with crash severity.
selected_columns = [
    'Crash Severity',
    'Person Age',
    'Person Gender',
    'Person Type',
    'Light Condition',
    'Weather Condition',
    'Vehicle Body Style',
    'Vehicle Make',
    'Day of Week',
    'Rating',
]
mock_df = df[selected_columns]
mock_df.head()
print(mock_df.shape)
# Remove every row holding the "99 - UNKNOWN" placeholder in one of the
# screened columns ('Day of Week' is not screened), then report the new shape.
for screened_column in (
    'Crash Severity',
    'Person Age',
    'Person Gender',
    'Person Type',
    'Light Condition',
    'Weather Condition',
    'Vehicle Body Style',
    'Vehicle Make',
    'Rating',
):
    mock_df = mock_df[mock_df[screened_column] != "99 - UNKNOWN"]
print(mock_df.shape)
# Person Type: discard passengers, pedestrians and "other" entries.
excluded_person_types = [
    "2 - PASSENGER/OCCUPANT",
    "4 - PEDESTRIAN",
    "98 - OTHER (EXPLAIN IN NARRATIVE)",
    "6 - PASSENGER/OCCUPANT ON MOTORCYCLE TYPE VEHICLE",
]
mock_df = mock_df[~mock_df['Person Type'].isin(excluded_person_types)]
print(mock_df.shape)
# Vehicle Body Style: discard rows without data and the excluded categories.
excluded_body_styles = [
    "No Data",
    "98 - OTHER (EXPLAIN IN NARRATIVE)",
    "EV - NEV-NEIGHBORHOOD ELECTRIC VEHICLE",
    "FE - FARM EQUIPMENT",
]
mock_df = mock_df[~mock_df['Vehicle Body Style'].isin(excluded_body_styles)]
print(mock_df.shape)
# Vehicle Body Style: swap the coded labels for short names, one label at a time.
body_style_labels = {
    "P4 - PASSENGER CAR, 4-DOOR": "SEDAN(4-DOOR)",
    "SV - SPORT UTILITY VEHICLE": "SUV",
    "PK - PICKUP": "PICKUP TRUCK",
    "P2 - PASSENGER CAR, 2-DOOR": "SEDAN(2-DOOR)",
    "VN - VAN": "VAN",
    "TR - TRUCK": "TRUCK(OTHER)",
    "MC - MOTORCYCLE": "MOTORCYCLE",
}
for coded_label, short_label in body_style_labels.items():
    mock_df.loc[mock_df['Vehicle Body Style'] == coded_label, 'Vehicle Body Style'] = short_label
# The remaining listed styles are pooled into a single "OTHER" bucket.
pooled_body_styles = [
    "TT - TRUCK TRACTOR",
    "PC - POLICE CAR/TRUCK",
    "BU - BUS",
    "AM - AMBULANCE",
    "FT - FIRE TRUCK",
    "SB - YELLOW SCHOOL BUS",
    "PM - POLICE MOTORCYCLE",
]
mock_df.loc[mock_df['Vehicle Body Style'].isin(pooled_body_styles), "Vehicle Body Style"] = "OTHER"
mock_df['Vehicle Body Style'].value_counts()
# Weather Condition: inspect the raw distribution first.
mock_df['Weather Condition'].value_counts()
# Collapse the weather labels into three groups: CLEAR, CLOUDY and OTHER.
for coded_label, short_label in (("1 - CLEAR", "CLEAR"), ("2 - CLOUDY", "CLOUDY")):
    mock_df.loc[mock_df['Weather Condition'] == coded_label, 'Weather Condition'] = short_label
pooled_weather = [
    "3 - RAIN",
    "6 - FOG",
    "4 - SLEET/HAIL",
    "98 - OTHER (EXPLAIN IN NARRATIVE)",
    "5 - SNOW",
    "7 - BLOWING SAND/SNOW",
    "8 - SEVERE CROSSWINDS",
]
mock_df.loc[mock_df['Weather Condition'].isin(pooled_weather), "Weather Condition"] = "OTHER"
mock_df['Weather Condition'].value_counts()
# Person Gender: inspect the distribution, then strip the numeric code prefix
# so the labels read "FEMALE" / "MALE".
mock_df['Person Gender'].value_counts()
gender_labels = {"2 - FEMALE": "FEMALE", "1 - MALE": "MALE"}
for coded_label, short_label in gender_labels.items():
    mock_df.loc[mock_df['Person Gender'] == coded_label, 'Person Gender'] = short_label
# Person Type: inspect the distribution, then shorten the two driver labels.
mock_df['Person Type'].value_counts()
person_type_labels = {
    "1 - DRIVER": "DRIVER",
    "5 - DRIVER OF MOTORCYCLE TYPE VEHICLE": "MOTORCYCLE DRIVER",
}
for coded_label, short_label in person_type_labels.items():
    mock_df.loc[mock_df['Person Type'] == coded_label, 'Person Type'] = short_label
# Crash Severity: inspect the distribution before it is binarised.
mock_df['Crash Severity'].value_counts()
# Collapse a crash-severity label into a binary class:
# 0 for "N - NOT INJURED", 1 for every other value.
def group_crash(X):
    """Return 0 when *X* equals ``N - NOT INJURED`` and 1 otherwise."""
    return 0 if X == "N - NOT INJURED" else 1
# Binarise the target so the models predict "not injured" (0) versus any other severity (1).
severity_labels = mock_df['Crash Severity']
mock_df['Crash Severity'] = severity_labels.apply(group_crash)
mock_df['Crash Severity'].value_counts()
mock_df.dtypes
# Person Age: inspect, drop the "No Data" rows, then cast the column to int.
mock_df['Person Age'].value_counts()
has_age = mock_df['Person Age'] != "No Data"
mock_df = mock_df[has_age]
mock_df["Person Age"] = mock_df["Person Age"].astype(int)
# Clean Vehicle Make Column
mock_df['Vehicle Make'].value_counts()
# Replace every make that occurs in fewer than MIN_MAKE_COUNT rows with "OTHER".
# Only the 'Vehicle Make' column is counted; the frequency of each row's make
# is looked up from that column's own value counts.
MIN_MAKE_COUNT = 900
make_frequency = mock_df['Vehicle Make'].map(mock_df['Vehicle Make'].value_counts())
mock_df['Vehicle Make'] = mock_df['Vehicle Make'].where(make_frequency >= MIN_MAKE_COUNT, "OTHER")
mock_df['Vehicle Make'].value_counts()
# Rating: inspect, then drop the rows whose rating is a placeholder string.
mock_df['Rating'].value_counts()
unrated_markers = ["Not Rated", "Vehicle Not Found."]
mock_df = mock_df[~mock_df['Rating'].isin(unrated_markers)]
mock_df.shape
mock_df['Rating'].value_counts()
mock_df.head()
# Count the missing values per column ...
mock_df.isna().sum()
# ... and drop every row that still has one.
mock_df = mock_df.dropna()
mock_df.head()
# Light Condition: show the raw distribution before binning.
light_counts = mock_df['Light Condition'].value_counts()
print(light_counts)
# Bin a light-condition label into two groups: DAYLIGHT and OTHER.
def clean_light(cond):
    """Return ``DAYLIGHT`` when *cond* equals ``1 - DAYLIGHT``, else ``OTHER``."""
    return "DAYLIGHT" if cond == "1 - DAYLIGHT" else "OTHER"
# Bin the Light Condition column with clean_light.
light_labels = mock_df['Light Condition']
mock_df['Light Condition'] = light_labels.apply(clean_light)
mock_df.head()
mock_df.dtypes
# Cast the rating and the target to int.
for integer_column in ("Rating", 'Crash Severity'):
    mock_df[integer_column] = mock_df[integer_column].astype(int)
"""# Prepare Data for ML Model"""
# Drop Person Type
mock_df = mock_df.drop(columns="Person Type")
mock_df.head()
# Generate the categorical variable list
feat_cat = ['Person Gender', 'Light Condition', 'Weather Condition', 'Vehicle Body Style', 'Vehicle Make', 'Day of Week']
# Create a OneHotEncoder instance. The keyword that requests dense output was
# renamed between scikit-learn releases, so the default sparse result is
# taken and densified explicitly instead.
enc = OneHotEncoder()
encoded_values = enc.fit_transform(mock_df[feat_cat]).toarray()
# Newer scikit-learn exposes get_feature_names_out; older releases only have
# get_feature_names. Use whichever is available.
feature_namer = getattr(enc, "get_feature_names_out", None) or enc.get_feature_names
# Reuse mock_df's index: rows were filtered above, so its index is no longer
# 0..n-1. A fresh RangeIndex here would make the index merge below pair
# encoded rows with the wrong records and silently drop the rest.
encode_df = pd.DataFrame(
    encoded_values,
    columns=feature_namer(feat_cat),
    index=mock_df.index,
)
encode_df.head()
# Merge one-hot encoded features and drop the originals
mock_df2 = mock_df.merge(encode_df, left_index=True, right_index=True).drop(columns=feat_cat)
mock_df2.head()
mock_df2.corr()
# Separate the target column from the feature matrix.
target_column = "Crash Severity"
y = mock_df2[target_column]
X = mock_df2.drop(columns=target_column)
# Stratified train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)
# Fit the scaler on the training features only, then transform both splits.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
"""# Oversampling"""
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
# Randomly oversample the scaled training set and show the resulting class counts.
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_train_scaled, y_train)
Counter(y_resampled)
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=42)
model.fit(X_resampled, y_resampled)
from sklearn.metrics import confusion_matrix
# The model was fitted on standardized features, so it has to be evaluated on
# the standardized test set as well (not on the raw X_test).
y_pred = model.predict(X_test_scaled)
confusion_matrix(y_test, y_pred)
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred)
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))
"""# SMOTEEN"""
from imblearn.over_sampling import SMOTE
# NOTE: despite the "SMOTEEN" section title, this step applies plain SMOTE
# oversampling to the scaled training set.
X_resampled1, y_resampled1 = SMOTE(
    random_state=42,
    sampling_strategy='auto',
).fit_resample(X_train_scaled, y_train)
Counter(y_resampled1)
model = LogisticRegression(solver='lbfgs', random_state=42)
model.fit(X_resampled1, y_resampled1)
# Evaluate on the standardized test set to match the features the model was
# trained on (not on the raw X_test).
y_pred = model.predict(X_test_scaled)
balanced_accuracy_score(y_test, y_pred)
confusion_matrix(y_test, y_pred)
print(classification_report_imbalanced(y_test, y_pred))
"""# Undersampling"""
from imblearn.under_sampling import RandomUnderSampler
# Randomly undersample the scaled training set.
ros = RandomUnderSampler(random_state=1)
X_resampled2, y_resampled2 = ros.fit_resample(X_train_scaled, y_train)
# Show the class counts of the undersampled target (y_resampled2), not the
# earlier oversampled one.
Counter(y_resampled2)
from sklearn.linear_model import LogisticRegression
# n_jobs=-1 lets joblib use the available cores instead of a hard-coded 250 workers.
model = LogisticRegression(solver='lbfgs', random_state=42, max_iter=250, n_jobs=-1)
model.fit(X_resampled2, y_resampled2)
from sklearn.metrics import confusion_matrix
y_pred = model.predict(X_test_scaled)
confusion_matrix(y_test, y_pred)
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred)
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))
"""# Sequential Neural Network"""
#Import Dependencies
import tensorflow as tf
# Take the input width from the training matrix instead of hard-coding it, so
# the network keeps matching the data if the encoded feature set changes.
n_input_features = X_resampled2.shape[1]
# Create the Keras Sequential model
nn_model = tf.keras.models.Sequential()
# Add our first Dense layer, including the input layer. Only this layer needs
# the input size; the later layers take theirs from the layer before them.
nn_model.add(tf.keras.layers.Dense(units=900, activation="relu", input_dim=n_input_features))
# Remaining hidden layers as (units, activation) pairs, in order.
hidden_layers = [
    (800, "relu"),
    (600, "relu"),
    (400, "swish"),
    (200, "swish"),
    (100, "swish"),
    (10, "sigmoid"),
]
for layer_units, layer_activation in hidden_layers:
    nn_model.add(tf.keras.layers.Dense(units=layer_units, activation=layer_activation))
# Add the output layer that uses a probability activation function
nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))
# Check the structure of the Sequential model
nn_model.summary()
from tensorflow.keras.callbacks import ModelCheckpoint
os.makedirs("model_checkpoints/", exist_ok=True)
checkpoint_path = "model_checkpoints/weights.{epoch:02d}.hdf5"
# Compile the Sequential model together and customize metrics
nn_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
# Checkpoint callback: saves the weights only, with save_freq=1000.
cp_callback = ModelCheckpoint(
    filepath=checkpoint_path,
    verbose=1,
    save_weights_only=True,
    save_freq=1000,
)
# Fit the model to the training data. The checkpoint callback must be passed
# here, otherwise no checkpoint is ever written.
fit_model = nn_model.fit(X_resampled2, y_resampled2, epochs=100, callbacks=[cp_callback])
model_loss, model_accuracy = nn_model.evaluate(X_test_scaled, y_test, verbose=2)
print(f'"loss3": {model_loss}')
print(f'"accuracy3": {model_accuracy}')
"""# Decision Tree
## We have decided to use a decision tree model for our Machine Learning Model because we are predicting a binary outcome.
"""
# Import Dependencies
from sklearn import tree
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
# Creating the decision tree classifier instance.
model = tree.DecisionTreeClassifier()
# Fitting the model.
model = model.fit(X_train_scaled, y_train)
# Making predictions using the testing data.
predictions = model.predict(X_test_scaled)
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)
# Displaying results
print("Confusion Matrix")
try:
    # `display` is only defined when running inside a notebook.
    display(cm_df)
except NameError:
    # Plain-script fallback so the run does not abort here.
    print(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))
"""# Logistic Regression"""
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
# Logistic regression fitted on the undersampled, scaled training data and
# scored by plain accuracy on the scaled test set.
classifier = LogisticRegression(
    random_state=1,
    max_iter=200,
    solver='lbfgs',
)
classifier.fit(X_resampled2, y_resampled2)
y_pred = classifier.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, y_pred)
print(test_accuracy)