# MLmodel.py

# -*- coding: utf-8 -*-
"""Deliverable3_MLModel.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1ijOhsfSW1xanxiNoCrAv5nDk7gHzE_LS
"""

# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pandas as pd
import os
from rds import db_password
from psycopg2 import sql, connect

# Open the PostgreSQL connection that the rest of the script reads from.
try:
    conn = connect(
        dbname="car_db",
        user="postgres",
        host="final-project.cn1djdbx7lfi.us-east-1.rds.amazonaws.com",
        port="5432",
        password=db_password,
    )
except Exception as err:
    # Report the failure and re-raise. Every later step needs a live
    # connection; carrying on with conn = None would only defer the crash
    # to an unrelated AttributeError on conn.cursor().
    print("psycopg2 connect() ERROR:", err)
    raise
else:
    # print the connection if successful
    print("psycopg2 connection:", conn)

# Read the whole crash table into memory.
cr = conn.cursor()
cr.execute('SELECT * FROM total_crash_data;')
tmp = cr.fetchall()

# The first field of every cursor.description entry is the column name.
col_names = [elt[0] for elt in cr.description]

# Build the dataframe from the fetched rows, labelled with those names.
df = pd.DataFrame(data=tmp, columns=col_names)

df.head()

# List the available columns, one per line.
for col in col_names:
    print(col)


"""# Preprocessing Data"""

# Target ('Crash Severity') plus the features expected to correlate
# strongly with it.
feature_columns = [
    'Crash Severity',
    'Person Age',
    'Person Gender',
    'Person Type',
    'Light Condition',
    'Weather Condition',
    'Vehicle Body Style',
    'Vehicle Make',
    'Day of Week',
    'Rating',
]
mock_df = df[feature_columns]
mock_df.head()

print(mock_df.shape)

# Remove every row holding the "99 - UNKNOWN" placeholder in one of the
# columns below ('Day of Week' is not checked).
for unknown_col in (
    'Crash Severity',
    'Person Age',
    'Person Gender',
    'Person Type',
    'Light Condition',
    'Weather Condition',
    'Vehicle Body Style',
    'Vehicle Make',
    'Rating',
):
    mock_df = mock_df[mock_df[unknown_col] != "99 - UNKNOWN"]
print(mock_df.shape)

# Discard the passenger, pedestrian and "other" person types.
excluded_person_types = [
    "2 - PASSENGER/OCCUPANT",
    "4 - PEDESTRIAN",
    "98 - OTHER (EXPLAIN IN NARRATIVE)",
    "6 - PASSENGER/OCCUPANT ON MOTORCYCLE TYPE VEHICLE",
]
mock_df = mock_df[~mock_df['Person Type'].isin(excluded_person_types)]
print(mock_df.shape)

# Discard rows whose vehicle body style is missing, "other", a
# neighborhood electric vehicle, or farm equipment.
excluded_body_styles = [
    "No Data",
    "98 - OTHER  (EXPLAIN IN NARRATIVE)",
    "EV - NEV-NEIGHBORHOOD ELECTRIC VEHICLE",
    "FE - FARM EQUIPMENT",
]
mock_df = mock_df[~mock_df['Vehicle Body Style'].isin(excluded_body_styles)]
print(mock_df.shape)

# Replace the coded body-style values with short labels; the less common
# service and heavy vehicles are pooled under "OTHER".
body_style_labels = {
    "P4 - PASSENGER CAR, 4-DOOR": "SEDAN(4-DOOR)",
    "SV - SPORT UTILITY VEHICLE": "SUV",
    "PK - PICKUP": "PICKUP TRUCK",
    "P2 - PASSENGER CAR, 2-DOOR": "SEDAN(2-DOOR)",
    "VN - VAN": "VAN",
    "TR - TRUCK": "TRUCK(OTHER)",
    "MC - MOTORCYCLE": "MOTORCYCLE",
    "TT - TRUCK TRACTOR": "OTHER",
    "PC - POLICE CAR/TRUCK": "OTHER",
    "BU - BUS": "OTHER",
    "AM - AMBULANCE": "OTHER",
    "FT - FIRE TRUCK": "OTHER",
    "SB - YELLOW SCHOOL BUS": "OTHER",
    "PM - POLICE MOTORCYCLE": "OTHER",
}
mock_df['Vehicle Body Style'] = mock_df['Vehicle Body Style'].replace(body_style_labels)

mock_df['Vehicle Body Style'].value_counts()

# Inspect the raw weather categories.
mock_df['Weather Condition'].value_counts()

# Reduce Weather Condition to three labels: CLEAR, CLOUDY and OTHER.
mock_df['Weather Condition'] = mock_df['Weather Condition'].replace(
    {"1 - CLEAR": "CLEAR", "2 - CLOUDY": "CLOUDY"}
)
other_weather = [
    "3 - RAIN",
    "6 - FOG",
    "4 - SLEET/HAIL",
    "98 - OTHER (EXPLAIN IN NARRATIVE)",
    "5 - SNOW",
    "7 - BLOWING SAND/SNOW",
    "8 - SEVERE CROSSWINDS",
]
mock_df.loc[mock_df['Weather Condition'].isin(other_weather), "Weather Condition"] = "OTHER"
mock_df['Weather Condition'].value_counts()

# Inspect the raw gender categories.
mock_df['Person Gender'].value_counts()

# Strip the numeric prefix from the gender codes, leaving the string
# labels "FEMALE" and "MALE".
mock_df['Person Gender'] = mock_df['Person Gender'].replace(
    {"2 - FEMALE": "FEMALE", "1 - MALE": "MALE"}
)

# Inspect the raw person-type categories.
mock_df['Person Type'].value_counts()

# Shorten the two driver codes to readable labels.
mock_df['Person Type'] = mock_df['Person Type'].replace(
    {
        "1 - DRIVER": "DRIVER",
        "5 - DRIVER OF MOTORCYCLE TYPE VEHICLE": "MOTORCYCLE DRIVER",
    }
)

# Inspect the raw crash-severity categories.
mock_df['Crash Severity'].value_counts()

# Collapse crash severity into a binary label: 0 for "not injured", 1 for
# every other value.
def group_crash(X):
    """Return 0 when X is "N - NOT INJURED", otherwise 1."""
    return 0 if X == "N - NOT INJURED" else 1

# Turn Crash Severity into the binary target the models will predict.
mock_df['Crash Severity'] = mock_df['Crash Severity'].map(group_crash)

mock_df['Crash Severity'].value_counts()

mock_df.dtypes

# Inspect the raw age values.
mock_df['Person Age'].value_counts()

# Rows without an age cannot be cast to a number, so drop them first.
has_age = mock_df['Person Age'].ne("No Data")
mock_df = mock_df[has_age]

mock_df["Person Age"] = mock_df["Person Age"].astype(int)

# Clean Vehicle Make Column
mock_df['Vehicle Make'].value_counts()

# Fold every make that occurs fewer than MIN_MAKE_COUNT times into "OTHER"
# so the one-hot encoding stays small. Only the 'Vehicle Make' column is
# counted; the previous version computed value counts for every column of
# the frame just to bin this one.
MIN_MAKE_COUNT = 900
make_counts = mock_df['Vehicle Make'].map(mock_df['Vehicle Make'].value_counts())
# Missing makes have no count (NaN), fail the comparison, and therefore
# also become "OTHER".
mock_df['Vehicle Make'] = mock_df['Vehicle Make'].where(make_counts >= MIN_MAKE_COUNT, "OTHER")

mock_df['Vehicle Make'].value_counts()


# Inspect the raw rating values.
mock_df['Rating'].value_counts()

# Remove rows whose rating is one of the two non-numeric placeholders.
unrated_values = ["Not Rated", "Vehicle Not Found."]
mock_df = mock_df[~mock_df['Rating'].isin(unrated_values)]
mock_df.shape

mock_df['Rating'].value_counts()

mock_df.head()

# Count the missing values per column ...
mock_df.isna().sum()

# ... and discard any row that still has one.
mock_df = mock_df.dropna(axis=0, how="any")
mock_df.head()

# Show the raw light-condition categories before they are binned.
print(mock_df['Light Condition'].value_counts())

# Bin the light condition into two labels: DAYLIGHT and OTHER.
def clean_light(cond):
    """Return "DAYLIGHT" for "1 - DAYLIGHT" and "OTHER" for anything else."""
    if cond != "1 - DAYLIGHT":
        return "OTHER"
    return "DAYLIGHT"

# Bin every Light Condition value with clean_light.
mock_df['Light Condition'] = mock_df['Light Condition'].map(clean_light)
mock_df.head()

mock_df.dtypes

# Cast the rating and the binary target to integers.
mock_df = mock_df.astype({"Rating": int, "Crash Severity": int})

"""# Prepare Data for ML Model"""

# Drop Person Type
mock_df = mock_df.drop(columns="Person Type")
mock_df.head()

# Generate the categorical variable list
feat_cat = ['Person Gender', 'Light Condition', 'Weather Condition', 'Vehicle Body Style', 'Vehicle Make', 'Day of Week']

# Create a dense-output OneHotEncoder. Newer scikit-learn releases renamed
# the `sparse` argument to `sparse_output`; fall back to the old spelling
# when the installed version does not know the new one.
try:
    enc = OneHotEncoder(sparse_output=False)
except TypeError:
    enc = OneHotEncoder(sparse=False)

# Fit and transform the categorical columns. The encoded frame must reuse
# mock_df's index: mock_df has been filtered, so its index has gaps, while
# a default 0..n-1 index would make the index-based merge below pair
# encodings with the wrong rows and silently drop the rest.
encode_df = pd.DataFrame(
    enc.fit_transform(mock_df[feat_cat]),
    index=mock_df.index,
)

# Add the encoded variable names to the dataframe (the accessor was also
# renamed in newer scikit-learn releases).
if hasattr(enc, "get_feature_names_out"):
    encode_df.columns = enc.get_feature_names_out(feat_cat)
else:
    encode_df.columns = enc.get_feature_names(feat_cat)
encode_df.head()

# Merge one-hot encoded features and drop the originals
mock_df2 = mock_df.merge(encode_df, left_index=True, right_index=True).drop(columns=feat_cat)
mock_df2.head()

mock_df2.corr()

# Separate the target column from the feature matrix.
y = mock_df2["Crash Severity"]
X = mock_df2.drop(columns="Crash Severity")

# Stratified train/test split so both parts keep the same class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42
)

# Learn the scaling parameters from the training features only ...
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# ... then apply the same transformation to both parts.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(part) for part in (X_train, X_test)
)


"""# Oversampling"""

from collections import Counter

from imblearn.metrics import classification_report_imbalanced
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score, confusion_matrix

# Balance the classes by randomly duplicating training rows; the test set
# is left untouched.
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_train_scaled, y_train)

Counter(y_resampled)

# Fit a logistic regression on the resampled (scaled) training data.
model = LogisticRegression(solver='lbfgs', random_state=42)
model.fit(X_resampled, y_resampled)

# Predict on the SCALED test features. The model was trained on scaled
# inputs, so feeding it the raw X_test would produce meaningless
# predictions.
y_pred = model.predict(X_test_scaled)
confusion_matrix(y_test, y_pred)

balanced_accuracy_score(y_test, y_pred)

print(classification_report_imbalanced(y_test, y_pred))


"""# SMOTEEN"""

from imblearn.over_sampling import SMOTE

# Oversample the training set with SMOTE (this section uses SMOTE only).
X_resampled1, y_resampled1 = SMOTE(
    random_state=42,
    sampling_strategy='auto',
).fit_resample(X_train_scaled, y_train)

Counter(y_resampled1)

# Fit a logistic regression on the SMOTE-resampled (scaled) training data.
model = LogisticRegression(solver='lbfgs', random_state=42)
model.fit(X_resampled1, y_resampled1)

# Predict on the SCALED test features, matching the scaling the model was
# trained with; the raw X_test would be on a different scale.
y_pred = model.predict(X_test_scaled)
balanced_accuracy_score(y_test, y_pred)

confusion_matrix(y_test, y_pred)

print(classification_report_imbalanced(y_test, y_pred))

"""# Undersampling"""

from imblearn.under_sampling import RandomUnderSampler

# Balance the classes by randomly discarding training rows.
ros = RandomUnderSampler(random_state=1)
X_resampled2, y_resampled2 = ros.fit_resample(X_train_scaled, y_train)
# Inspect the class balance of THIS resampling (the previous version
# looked at y_resampled, the over-sampled labels from an earlier section).
Counter(y_resampled2)

from sklearn.linear_model import LogisticRegression
# n_jobs=250 was dropped: it asked for 250 parallel workers for a single
# binary fit and looks like a copy of the max_iter value.
model = LogisticRegression(solver='lbfgs', random_state=42, max_iter=250)
model.fit(X_resampled2, y_resampled2)

from sklearn.metrics import confusion_matrix
y_pred = model.predict(X_test_scaled)
confusion_matrix(y_test, y_pred)

from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred)

from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

"""# Sequential Neural Network"""

# Import Dependencies
import tensorflow as tf

# Take the input width from the data rather than hard-coding it: the
# number of one-hot columns depends on which categories survive cleaning.
n_features = X_train_scaled.shape[1]

# Create the Keras Sequential model
nn_model = tf.keras.models.Sequential()

# First Dense layer; it is the only one that needs the input size.
nn_model.add(tf.keras.layers.Dense(units=900, activation="relu", input_dim=n_features))

# Remaining hidden layers infer their input size from the layer before.
hidden_layers = [
    (800, "relu"),
    (600, "relu"),
    (400, "swish"),
    (200, "swish"),
    (100, "swish"),
    (10, "sigmoid"),
]
for layer_units, layer_activation in hidden_layers:
    nn_model.add(tf.keras.layers.Dense(units=layer_units, activation=layer_activation))

# Add the output layer that uses a probability activation function
nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the Sequential model
nn_model.summary()

from tensorflow.keras.callbacks import ModelCheckpoint

# Directory and filename pattern for the saved weights; {epoch:02d} is
# filled in by Keras.
os.makedirs("model_checkpoints/", exist_ok=True)
checkpoint_path = "model_checkpoints/weights.{epoch:02d}.hdf5"

# Compile the Sequential model together and customize metrics
nn_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Save the weights periodically during training. An integer save_freq is
# counted in batches, not epochs.
cp_callback = ModelCheckpoint(
    filepath=checkpoint_path,
    verbose=1,
    save_weights_only=True,
    save_freq=1000)

# Fit the model to the under-sampled training data. The checkpoint
# callback has to be passed here; it was previously created but never
# used, so no checkpoint was ever written.
fit_model = nn_model.fit(
    X_resampled2,
    y_resampled2,
    epochs=100,
    callbacks=[cp_callback],
)

# Evaluate on the scaled hold-out set.
model_loss, model_accuracy = nn_model.evaluate(X_test_scaled, y_test, verbose=2)
print(f'"loss3": {model_loss}')
print(f'"accuracy3": {model_accuracy}')

"""# Decision Tree
## We have decided to use a decision tree model for our Machine Learning Model because we are predicting a binary outcome.
"""

# Import Dependencies
from sklearn import tree
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Creating the decision tree classifier instance.
# NOTE(review): no random_state is set, so results may differ between runs.
model = tree.DecisionTreeClassifier()
# Fitting the model on the scaled (not resampled) training data.
model = model.fit(X_train_scaled, y_train)

# Making predictions using the testing data.
predictions = model.predict(X_test_scaled)

# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)

# Displaying results
print("Confusion Matrix")
# display() only exists inside IPython/Jupyter and raises NameError when
# this file runs as a plain script, so print the frame instead.
print(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))


"""# Logistic Regression"""

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Train a logistic regression on the under-sampled training data;
# fit() returns the estimator itself, so the call can be chained.
classifier = LogisticRegression(
    random_state=1,
    max_iter=200,
    solver='lbfgs',
).fit(X_resampled2, y_resampled2)

# Score plain accuracy on the scaled hold-out set.
y_pred = classifier.predict(X_test_scaled)

print(accuracy_score(y_true=y_test, y_pred=y_pred))