Merge pull request #5 from machinelearningnanodegree/master
upstream merge
parambharat authored Aug 24, 2016
2 parents 1e62392 + d241a47 commit f147947
Showing 13 changed files with 595 additions and 129 deletions.
3 changes: 0 additions & 3 deletions Dockerfile
@@ -1,4 +1 @@
FROM jupyter/scipy-notebook
RUN mkdir results
COPY app.py .
COPY student-data.csv .
13 changes: 10 additions & 3 deletions Makefile
@@ -1,11 +1,18 @@
wrangle_data:
	docker build -t timing_comparison .
	docker run -it -v $(shell pwd):/home/jovyan/work --rm timing_comparison python -m lib.data.wrangler

single_classifier:
	docker build -t timing_comparison .
	docker run -it --rm timing_comparison python app.py CLASSIFIER=$(CLASSIFIER)

all_classifiers:
	docker build -t timing_comparison .
	docker run -it --rm timing_comparison python app.py

notebook_server:
	docker build -t timing_comparison .
	docker run --rm timing_comparison

clean:
	rm -rf tmp results **/*.pyc **/__pycache__
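
# Example invocations (a sketch; assumes Docker is installed and that the
# CLASSIFIER value matches a name configured in lib/model/classifiers.py;
# "SVC" below is a hypothetical example):
#   make wrangle_data
#   make single_classifier CLASSIFIER=SVC
#   make all_classifiers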
10 changes: 5 additions & 5 deletions README.md
@@ -128,7 +128,7 @@ name
- ensemble.BaggingClassifier (BHARAT)
- ensemble.ExtraTreesClassifier
- ensemble.GradientBoostingClassifier (NASH)
- ensemble.RandomForestClassifier
- ensemble.RandomForestClassifier (MATT)
- ensemble.RandomTreesEmbedding
- ensemble.VotingClassifier (BHARAT)
@@ -139,18 +139,18 @@ name
- multiclass.OneVsOneClassifier
- multiclass.OneVsRestClassifier
- multiclass.OutputCodeClassifier
- naive_bayes.BernoulliNB
- naive_bayes.GaussianNB
- naive_bayes.BernoulliNB (ANDREY)
- naive_bayes.GaussianNB (ANDREY)
- naive_bayes.MultinomialNB
- neighbors.KNeighborsClassifier (MATT)
- neighbors.NearestCentroid
- neighbors.RadiusNeighborsClassifier
- neural_network.BernoulliRBM (MAXIME)
- semi_supervised.LabelPropagation
- svm.LinearSVC
- svm.NuSVC
- svm.NuSVC (ANDREY)
- svm.SVC (MATT)
- tree.DecisionTreeClassifier
- tree.DecisionTreeClassifier (MATT)
- tree.ExtraTreeClassifier
```

110 changes: 12 additions & 98 deletions app.py
@@ -1,107 +1,21 @@
# Import libraries
import numpy as np
import pandas as pd
import os
import time
# from matplotlib import pyplot as plt
from sklearn import grid_search
from sklearn.metrics import f1_score
from sklearn import tree, svm, naive_bayes, ensemble, neighbors
from sklearn.cross_validation import train_test_split
import datetime

# Read student data
student_data = pd.read_csv("student-data.csv")
student_data = student_data.reindex(np.random.permutation(student_data.index))
print("Data read successfully!")
from lib.data.wrangler import readData
from lib.helpers import runTests, writeToCsv

dtc = tree.DecisionTreeClassifier()
svc = svm.SVC()
nbc = naive_bayes.GaussianNB()
knn = neighbors.KNeighborsClassifier()
rfc = ensemble.RandomForestClassifier()
adc = ensemble.AdaBoostClassifier()
X_test, \
X_train, \
y_test, \
y_train = readData('./tmp/testTrainData.npz')

models = [dtc, svc, nbc, knn, rfc, adc]
X, y = np.arange(1000).reshape((500, 2)), range(500)
classifiers, \
train_times, \
pred_times, \
f1_trains, \
f1_tests = runTests(X_test, X_train, y_test, y_train)

class Model:
    def __init__(self, classifier, parameters=[]):
        self.classifier = classifier
        self.parameters = parameters

    def train_classifier(self, clf, X_train, y_train):
        print("Training {}...".format(clf.__class__.__name__))
        # start = np.datetime64(datetime.datetime.now(),"us")
        start = time.time()
        clf.fit(X_train, y_train)
        # end = np.datetime64(datetime.datetime.now(),"us")
        end = time.time()
        self.training_time = end - start
        print(self.training_time)

    def predict_labels(self, clf, features, target):
        # print("Predicting labels using {}...".format(clf.__class__.__name__))
        # start = np.datetime64(datetime.datetime.now(),"us")
        start = time.time()
        y_pred = clf.predict(features)
        # end = np.datetime64(datetime.datetime.now(),"us")
        end = time.time()
        self.prediction_time = end - start
        f1_score_output = f1_score(target, y_pred, average="macro")
        return f1_score_output

    def train_predict(self, clf, X_train, y_train, X_test, y_test):
        print("------------------------------------------")
        print("Training set size: {}".format(len(X_train)))
        self.train_classifier(clf, X_train, y_train)
        self.f1_train = self.predict_labels(clf, X_train, y_train)
        self.f1_test = self.predict_labels(clf, X_test, y_test)
        return [self.training_time, self.prediction_time, self.f1_train, self.f1_test]


dataframes = []

# NOTE: x is never used below; every pass evaluates the same full split.
for x in [100, 200, 300]:
    classifiers = []
    train_times = []
    pred_times = []
    f1_trains = []
    f1_tests = []

    for model in models:
        clf = Model(model)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

        output = clf.train_predict(model, X_train, y_train, X_test, y_test)
        classifiers.append(model.__class__.__name__)
        train_times.append(str(output[0]))
        pred_times.append(str(output[1]))
        f1_trains.append(output[2])
        f1_tests.append(output[3])

    df = pd.DataFrame({"Classifier": classifiers,
                       "Training Time": train_times,
                       "Prediction Time": pred_times,
                       "F1 Score on Training Set": f1_trains,
                       "F1 Score on Test Set": f1_tests})
    dataframes.append(df)

for i, frame in enumerate(dataframes):
    filenumber = (i + 1) * 100  # sizes were 100, 200, 300; i alone starts at 0
    filename = "results/{} samples.csv".format(filenumber)
    frame.to_csv(filename)


# def fit_model(clf,parameters,X,Y):
# clfr = grid_search.GridSearchCV(clf,parameters,scoring="f1",cv=4)
# return clfr.fit(X,Y)

# clf = fit_model(svc,
# [{"kernel":["poly"],
# "degree":[1,2,3,4,5],
# "C":[1,10,100,1000],
# }],X_train,y_train)

# print(clf.best_params_)
# print(predict_labels(clf,X_test,y_test))
writeToCsv(classifiers, train_times, pred_times, f1_trains, f1_tests)
99 changes: 99 additions & 0 deletions doc/EDA.Rmd
@@ -0,0 +1,99 @@
---
output: html_document
---
Adult Data Exploratory Visualization
=======================================================

# Data Prep

Loading necessary packages and Adult Data.

``` {r echo=TRUE, message=FALSE, warning=FALSE, packages}
# Loading useful packages
packs = c("ggplot2","ggthemes","gridExtra")
lapply(packs, function(p) {
  if (!(p %in% installed.packages()[,'Package'])) {
    install.packages(p)
  } else {
    gsub('package', p, 'package already installed')
  }
})
library(ggplot2)
library(ggthemes)
library(gridExtra)
# Loading adult data
cNames = c('Age','Workclass','FnlWgt','Education','EducationNum',
'MaritalStatus','Occupation','Relationship','Race','Sex',
'CapitalGain','CapitalLoss','HoursPerWeek','NativeCountry','Income')
adult = read.csv('../data/adult.data', header=FALSE, col.names = cNames, strip.white = TRUE)
```

# Data Overview
```{r echo=TRUE, message=FALSE, warning=FALSE, overview}
# Dimensions of data
m = nrow(adult)
n = ncol(adult)
gsub('ncol',n,gsub('nrow',m,'The dataset has nrow rows and ncol columns'))
# Feature names
colnames(adult)
# Unique classes
unique(adult$Income)
# Summary of features
summary(adult[, names(adult)!='Income'])
# Class balance
table(adult$Income) / m
# Racial balance
table(adult$Race) / m
# Gender balance
table(adult$Sex) / m
```

There are 14 features to work with, and the binary label "Income" makes this a two-class classification task. The classes are imbalanced but not severely so, at roughly 76%-24%. Racially, the dataset is heavily dominated by Caucasians, and it is about two-thirds male.

One of the most interesting features, particularly when attempting to model Income, ought to be Education.

```{r echo=FALSE, message=FALSE, warning=FALSE, Plot_One}
# Set the ggPlot theme to something not gross
theme_set(theme_minimal(12))
# Factor levels aren't set properly; manually set
adult$Education = sapply(adult$Education, as.character)
levs = c('Preschool','1st-4th','5th-6th','7th-8th','9th','10th','11th','12th',
'HS-grad','Some-college','Assoc-acdm','Assoc-voc','Bachelors','Masters','Prof-school','Doctorate')
adult$Education = factor(adult$Education, levels = levs)
# Education histogram
ggplot(adult, aes(Education)) +
geom_bar(fill="#236B8E") +
geom_vline(aes(xintercept=mean(EducationNum)),
color="black", linetype="dashed",size=0.75) +
ylab("Count") +
ggtitle("Education Histogram w/ Mean") +
theme(axis.text.x = element_text(angle=90, hjust=1),
axis.title.x=element_blank())
# Education split by gender
# Normalize to discount gender imbalance
edByGender = aggregate(1:nrow(adult), by=list(adult$Sex, adult$Education), length)
names(edByGender) = c('Gender','Education','Num')
edByGender$EdPctOfGender = with(edByGender, ifelse(Gender=='Male',
Num / sum(Num[Gender=='Male']),
Num / sum(Num[Gender=='Female'])))
edByGender$GenderPctOfEd = mapply(function(ed,num) {
return(num / sum(edByGender[edByGender$Education==ed,'EdPctOfGender']))
}, edByGender$Education, edByGender$EdPctOfGender)
# TODO: make cumulative line plot of Num by Gender
```
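
A rough pandas equivalent of the gender-normalization step above, for the Python side of this repo (a sketch only: the file path and column names mirror the R chunk, and `groupby`/`transform` stand in for the `aggregate`/`ifelse` logic):

```python
import pandas as pd

# Mirror the R read.csv call: the UCI Adult data file has no header row.
cNames = ['Age', 'Workclass', 'FnlWgt', 'Education', 'EducationNum',
          'MaritalStatus', 'Occupation', 'Relationship', 'Race', 'Sex',
          'CapitalGain', 'CapitalLoss', 'HoursPerWeek', 'NativeCountry', 'Income']
adult = pd.read_csv('../data/adult.data', header=None, names=cNames,
                    skipinitialspace=True)

# Count rows per (gender, education) cell.
counts = (adult.groupby(['Sex', 'Education']).size()
               .rename('Num').reset_index())

# Share of each gender at each education level (discounts the gender imbalance).
counts['EdPctOfGender'] = counts['Num'] / counts.groupby('Sex')['Num'].transform('sum')

# Within each education level, each gender's share of the normalized mass.
counts['GenderPctOfEd'] = (counts['EdPctOfGender'] /
                           counts.groupby('Education')['EdPctOfGender'].transform('sum'))
```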
261 changes: 261 additions & 0 deletions doc/EDA.html

Large diffs are not rendered by default.

39 changes: 39 additions & 0 deletions lib/data/__init__.py
@@ -0,0 +1,39 @@
import datetime
import os
import time
import pandas as pd

from lib.model import Model
from lib.model.classifiers import CLASSIFIERS

def runTests(X_test, X_train, y_test, y_train):
    classifiers = []
    train_times = []
    pred_times = []
    f1_trains = []
    f1_tests = []

    for classifier, parameters in CLASSIFIERS:
        this_model = Model(classifier, parameters)

        this_model(X_train, y_train, X_test, y_test)
        classifiers.append(this_model.classifier.__class__.__name__)
        train_times.append(this_model.training_time)
        pred_times.append(this_model.train_prediction_time)
        f1_trains.append(this_model.f1_train)
        f1_tests.append(this_model.f1_test)

    return classifiers, train_times, pred_times, f1_trains, f1_tests

def writeToCsv(classifiers, train_times, pred_times, f1_trains, f1_tests):
    df = pd.DataFrame({"Classifier": classifiers,
                       "Training Time": train_times,
                       "Prediction Time": pred_times,
                       "F1 Score on Training Set": f1_trains,
                       "F1 Score on Test Set": f1_tests})

    # Timestamp the output so successive runs do not overwrite each other.
    t = time.mktime(datetime.datetime.now().timetuple())
    if not os.path.exists('results'):
        os.makedirs('results')
    filename = str(t) + '.csv'
    df.to_csv('results/' + filename)
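
A minimal driver sketch for these helpers, mirroring the new `app.py` above (note that `app.py` imports them from `lib.helpers` while this file is `lib/data/__init__.py`; the import path below is therefore an assumption):

```python
from lib.data import runTests, writeToCsv
from lib.data.wrangler import readData

# Assumes `make wrangle_data` has already written ./tmp/testTrainData.npz.
X_test, X_train, y_test, y_train = readData('./tmp/testTrainData.npz')
results = runTests(X_test, X_train, y_test, y_train)
writeToCsv(*results)
```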
24 changes: 8 additions & 16 deletions lib/dataWrangler.py → lib/data/wrangler.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,16 @@
"""
usage: dataWrangler.py [-h] [-i RAWDATALOC] [-s CV] [-r RS] [-o TEMPLOC]
DataWrangling Script - reads rawData from the /data folder and creates
features-labels(Test Train Split) and store it in /tmp folder
optional arguments:
-h, --help     show this help message and exit
 -i RAWDATALOC  rawData file location <use absolute file path>.
                default_value: ./data/adult.data
 -s CV          size of the cross_validation set. default_value: 0.30
 -r RS          random_state to use for splitting the data. default_value: 42
 -o TEMPLOC     file location to store temporary binary data for
                test_train_split <use absolute file path>. default_value:
                ./tmp/
"""


from __future__ import print_function, absolute_import
import os
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
import argparse

def readData(fileLoc=None):
    data = np.load(fileLoc)
    print(data.keys())
    return (data[item] for item in data.keys())
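
# Caution: NpzFile key iteration order is not guaranteed to match the order
# the arrays were written with np.savez(**kwargs), so callers that unpack
# this generator positionally (as app.py does) should confirm the printed
# keys line up with their unpacking order.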

def getData(fileLoc=None):
"""
@@ -102,6 +92,8 @@ def storeData(df, fileLoc='./tmp/', cv=0.30, rs=21):
    # and access the train/test split using dictionary formatting.
    # Ex: data['XTrain']
    """
    if not os.path.exists('tmp'):
        os.makedirs('tmp')
    filename = fileLoc + 'testTrainData'
    XTrain, XTest, yTrain, yTest = trainCvSplit(df, cv, rs)
    kwargs = {'XTrain': XTrain,
