-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #5 from machinelearningnanodegree/master
upstream merge
- Loading branch information
Showing
13 changed files
with
595 additions
and
129 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1 @@ | ||
FROM jupyter/scipy-notebook | ||
RUN mkdir results | ||
COPY app.py . | ||
COPY student-data.csv . |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,11 +1,18 @@ | ||
wrangle_data: | ||
docker build -t timing_comparison . | ||
docker run -it -v $(shell pwd):/home/jovyan/work --rm timing_comparison python -m lib.data.wrangler | ||
|
||
single_classfier: | ||
docker build -t timing_comparison . | ||
docker run -it --rm timing_comparison python app.py CLASSIFIER=$(CLASSIFIER) | ||
|
||
all_classifiers: | ||
docker build -t timing_comparison . | ||
docker run -it --rm timing_comparison python app.py | ||
docker run -it --rm timing_comparison python app.py | ||
|
||
notebook_server: | ||
docker build -t timing_comparison . | ||
docker run --rm timing_comparison | ||
|
||
clean: | ||
rm -rf tmp results **/*.pyc **/__pycache__ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,107 +1,21 @@ | ||
# Import libraries | ||
import numpy as np | ||
import pandas as pd | ||
import os | ||
import time | ||
# from matplotlib import pyplot as plt | ||
from sklearn import grid_search | ||
from sklearn.metrics import f1_score | ||
from sklearn import tree, svm, naive_bayes, ensemble, neighbors | ||
from sklearn.cross_validation import train_test_split | ||
import datetime | ||
|
||
# Read student data | ||
student_data = pd.read_csv("student-data.csv") | ||
student_data.reindex(np.random.permutation(student_data.index)) | ||
print("Data read successfully!") | ||
from lib.data.wrangler import readData | ||
from lib.helpers import runTests, writeToCsv | ||
|
||
dtc = tree.DecisionTreeClassifier() | ||
svc = svm.SVC() | ||
nbc = naive_bayes.GaussianNB() | ||
knn = neighbors.KNeighborsClassifier() | ||
rfc = ensemble.RandomForestClassifier() | ||
adc = ensemble.AdaBoostClassifier() | ||
X_test, \ | ||
X_train, \ | ||
y_test, \ | ||
y_train = readData('./tmp/testTrainData.npz') | ||
|
||
models = [dtc, svc, nbc, knn, rfc, adc] | ||
X, y = np.arange(1000).reshape((500, 2)), range(500) | ||
classifiers, \ | ||
train_times, \ | ||
pred_times, \ | ||
f1_trains, \ | ||
f1_tests = runTests(X_test, X_train, y_test, y_train) | ||
|
||
class Model:
    """Pair a classifier with its parameters and record timing/F1 results.

    The methods take the estimator to use as an explicit ``clf`` argument
    (they do not read ``self.classifier``) and store their measurements on
    the instance:

    * ``training_time``   -- seconds spent in ``clf.fit``.
    * ``prediction_time`` -- seconds spent in the most recent ``clf.predict``.
    * ``f1_train`` / ``f1_test`` -- macro-averaged F1 scores, set by
      ``train_predict``.
    """

    def __init__(self, classifier, parameters=None):
        """Store the classifier and its parameters.

        ``parameters`` defaults to a fresh empty list for each instance; a
        literal ``[]`` default would be one list shared by every ``Model``.
        A list supplied by the caller is stored as-is (not copied).
        """
        self.classifier = classifier
        self.parameters = [] if parameters is None else parameters

    def train_classifier(self, clf, X_train, y_train):
        """Fit ``clf`` on the training data and record the elapsed seconds."""
        print("Training {}...".format(clf.__class__.__name__))
        start = time.time()
        clf.fit(X_train, y_train)
        end = time.time()
        self.training_time = end - start
        print(self.training_time)

    def predict_labels(self, clf, features, target):
        """Predict labels for ``features`` and return the macro F1 vs ``target``.

        Overwrites ``self.prediction_time`` on every call, so after several
        calls only the duration of the last prediction is kept.
        """
        start = time.time()
        y_pred = clf.predict(features)
        end = time.time()
        self.prediction_time = end - start
        return f1_score(target, y_pred, average="macro")

    def train_predict(self, clf, X_train, y_train, X_test, y_test):
        """Train ``clf``, score it on both splits, and return the measurements.

        Returns ``[training_time, prediction_time, f1_train, f1_test]``.
        ``prediction_time`` is the one from the test-set prediction, because
        that is the last ``predict_labels`` call made here.
        """
        print("------------------------------------------")
        print("Training set size: {}".format(len(X_train)))
        self.train_classifier(clf, X_train, y_train)
        self.f1_train = self.predict_labels(clf, X_train, y_train)
        self.f1_test = self.predict_labels(clf, X_test, y_test)
        return [self.training_time, self.prediction_time, self.f1_train, self.f1_test]
|
||
|
||
dataframes = [] | ||
|
||
for x in [100, 200, 300]: | ||
classifiers = [] | ||
train_times = [] | ||
pred_times = [] | ||
f1_trains = [] | ||
f1_tests = [] | ||
|
||
for model in models: | ||
clf = Model(model) | ||
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.33, random_state=42) | ||
|
||
output = clf.train_predict(model, X_train, y_train, X_test, y_test) | ||
classifiers.append(model.__class__.__name__) | ||
train_times.append(str(output[0])) | ||
pred_times.append(str(output[1])) | ||
f1_trains.append(output[2]) | ||
f1_tests.append(output[3]) | ||
|
||
df = pd.DataFrame({"Classifier": classifiers, | ||
"Training Time": train_times, | ||
"Prediction Time": pred_times, | ||
"F1 Score on Training Set": f1_trains, | ||
"F1 Score on Test Set": f1_tests}) | ||
dataframes.append(df) | ||
|
||
for i, frame in enumerate(dataframes): | ||
filenumber = i * 100 | ||
filename = "results/{} samples.csv".format(filenumber) | ||
frame.to_csv(filename) | ||
|
||
|
||
# def fit_model(clf,parameters,X,Y): | ||
# clfr = grid_search.GridSearchCV(clf,parameters,scoring="f1",cv=4) | ||
# return clfr.fit(X,Y) | ||
|
||
# clf = fit_model(svc, | ||
# [{"kernel":["poly"], | ||
# "degree":[1,2,3,4,5], | ||
# "C":[1,10,100,1000], | ||
# }],X_train,y_train) | ||
|
||
# print(clf.best_params_) | ||
# print(predict_labels(clf,X_test,y_test)) | ||
writeToCsv(classifiers, train_times, pred_times, f1_trains, f1_tests) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
--- | ||
output: html_document | ||
--- | ||
Adult Data Exploratory Visualization | ||
======================================================= | ||
|
||
# Data Prep | ||
|
||
Loading necessary packages and Adult Data. | ||
|
||
``` {r echo=TRUE, message=FALSE, warning=FALSE, packages} | ||
# Loading useful packages | ||
packs = c("ggplot2","ggthemes","gridExtra") | ||
lapply(packs, function(p) { | ||
if (!(p %in% installed.packages()[,'Package'])) { | ||
install.packages(p) | ||
} | ||
else { | ||
gsub('package', p, 'package already installed') | ||
} | ||
}) | ||
library(ggplot2) | ||
library(ggthemes) | ||
library(gridExtra) | ||
# Loading adult data | ||
cNames = c('Age','Workclass','FnlWgt','Education','EducationNum', | ||
'MaritalStatus','Occupation','Relationship','Race','Sex', | ||
'CapitalGain','CapitalLoss','HoursPerWeek','NativeCountry','Income') | ||
adult = read.csv('../data/adult.data', header=FALSE, col.names = cNames, strip.white = TRUE) | ||
``` | ||
|
||
# Data Overview | ||
```{r echo=TRUE, message=FALSE, warning=FALSE, overview} | ||
# Dimensions of data | ||
m = nrow(adult) | ||
n = ncol(adult) | ||
gsub('ncol',n,gsub('nrow',m,'The dataset has nrow rows and ncol columns')) | ||
# Feature names | ||
colnames(adult) | ||
# Unique classes | ||
unique(adult$Income) | ||
# Summary of features | ||
summary(adult[, names(adult)!='Income']) | ||
# Class balance | ||
table(adult$Income) / m | ||
# Racial balance | ||
table(adult$Race) / m | ||
# Gender balance | ||
table(adult$Sex) / m | ||
``` | ||
|
||
There are 14 features to work with, and the label "Income" takes only two values, so this is a binary classification task. The classes are moderately imbalanced, at roughly 76%-24%. Racially, the dataset is predominantly Caucasian. About two-thirds of the records are male. | ||
|
||
One of the most interesting features, particularly when attempting to model Income, ought to be Education. | ||
|
||
```{r echo=FALSE, message=FALSE, warning=FALSE, Plot_One} | ||
# Set the ggPlot theme to something not gross | ||
theme_set(theme_minimal(12)) | ||
# Factor levels aren't set properly; manually set | ||
adult$Education = sapply(adult$Education, as.character) | ||
levs = c('Preschool','1st-4th','5th-6th','7th-8th','9th','10th','11th','12th', | ||
'HS-grad','Some-college','Assoc-acdm','Assoc-voc','Bachelors','Masters','Prof-school','Doctorate') | ||
adult$Education = factor(adult$Education, levels = levs) | ||
# Education histogram | ||
ggplot(adult, aes(Education)) + | ||
geom_bar(fill="#236B8E") + | ||
geom_vline(aes(xintercept=mean(EducationNum)), | ||
color="black", linetype="dashed",size=0.75) + | ||
ylab("Count") + | ||
ggtitle("Education Histogram w/ Mean") + | ||
theme(axis.text.x = element_text(angle=90, hjust=1), | ||
axis.title.x=element_blank()) | ||
# Education split by gender | ||
# Normalize to discount gender imbalance | ||
edByGender = aggregate(1:nrow(adult), by=list(adult$Sex, adult$Education), length) | ||
names(edByGender) = c('Gender','Education','Num') | ||
edByGender$EdPctOfGender = with(edByGender, ifelse(Gender=='Male', | ||
Num / sum(Num[Gender=='Male']), | ||
Num / sum(Num[Gender=='Female']))) | ||
edByGender$GenderPctOfEd = mapply(function(ed,num) { | ||
return(num / sum(edByGender[edByGender$Education==ed,'EdPctOfGender'])) | ||
}, edByGender$Education, edByGender$EdPctOfGender) | ||
# TODO: make cumulative line plot of Num by Gender | ||
``` |
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
import datetime | ||
import time | ||
import pandas as pd | ||
|
||
from lib.model import Model | ||
from lib.model.classifiers import CLASSIFIERS | ||
|
||
def runTests(X_test, X_train, y_test, y_train):
    """Run every entry of ``CLASSIFIERS`` through a ``Model`` and collect results.

    Each ``(classifier, parameters)`` pair is wrapped in a ``Model``, which is
    then called with the four data splits. What that call does is defined in
    ``lib.model`` -- presumably it trains and scores the classifier; confirm
    there.

    Note the argument order: this function receives
    ``(X_test, X_train, y_test, y_train)`` but calls the model with
    ``(X_train, y_train, X_test, y_test)``.

    Returns five parallel lists, one entry per classifier, in this order:
    class names, training times, prediction times, training F1 scores,
    test F1 scores.
    """
    rows = []

    for estimator, params in CLASSIFIERS:
        model = Model(estimator, params)
        model(X_train, y_train, X_test, y_test)

        # NOTE(review): the "prediction time" column is read from
        # ``train_prediction_time`` -- by its name, the timing on the
        # training set; confirm against lib.model.
        rows.append((
            model.classifier.__class__.__name__,
            model.training_time,
            model.train_prediction_time,
            model.f1_train,
            model.f1_test,
        ))

    classifiers = [row[0] for row in rows]
    train_times = [row[1] for row in rows]
    pred_times = [row[2] for row in rows]
    f1_trains = [row[3] for row in rows]
    f1_tests = [row[4] for row in rows]
    return classifiers, train_times, pred_times, f1_trains, f1_tests
|
||
def writeToCsv(classifiers, train_times, pred_times, f1_trains, f1_tests,
               results_dir='results'):
    """Write one benchmark run to a timestamped CSV file and return its path.

    The five positional arguments are parallel sequences with one entry per
    classifier; they become the columns "Classifier", "Training Time",
    "Prediction Time", "F1 Score on Training Set" and "F1 Score on Test Set".

    ``results_dir`` is the output directory (default ``'results'``); it is
    created if it does not exist.

    Returns the path of the CSV file that was written.
    """
    # Local import: the module's top-level imports do not include ``os``.
    import os

    df = pd.DataFrame({"Classifier": classifiers,
                       "Training Time": train_times,
                       "Prediction Time": pred_times,
                       "F1 Score on Training Set": f1_trains,
                       "F1 Score on Test Set": f1_tests})

    # Name the file after the current time (whole seconds since the epoch) so
    # successive runs do not overwrite each other; a fixed date would always
    # produce the same file name.
    t = time.mktime(datetime.datetime.now().timetuple())
    if not os.path.exists(results_dir):
        os.makedirs(results_dir)
    filename = os.path.join(results_dir, str(t) + '.csv')
    df.to_csv(filename)
    return filename
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.