Merge pull request #5 from machinelearningnanodegree/master
upstream merge
parambharat authored Aug 24, 2016
2 parents 1e62392 + d241a47 commit f147947
Showing 13 changed files with 595 additions and 129 deletions.
3 changes: 0 additions & 3 deletions Dockerfile
@@ -1,4 +1 @@
FROM jupyter/scipy-notebook
RUN mkdir results
COPY app.py .
COPY student-data.csv .
13 changes: 10 additions & 3 deletions Makefile
@@ -1,11 +1,18 @@
wrangle_data:
	docker build -t timing_comparison .
	docker run -it -v $(shell pwd):/home/jovyan/work --rm timing_comparison python -m lib.data.wrangler

single_classifier:
	docker build -t timing_comparison .
	docker run -it --rm timing_comparison python app.py CLASSIFIER=$(CLASSIFIER)

all_classifiers:
	docker build -t timing_comparison .
	docker run -it --rm timing_comparison python app.py

notebook_server:
	docker build -t timing_comparison .
	docker run --rm timing_comparison

clean:
	rm -rf tmp results **/*.pyc **/__pycache__
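
# Example invocations (a sketch; assumes Docker is installed and that the
# CLASSIFIER value matches a name configured in lib/model/classifiers.py;
# "SVC" below is a hypothetical example):
#   make wrangle_data
#   make single_classifier CLASSIFIER=SVC
#   make all_classifiers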
10 changes: 5 additions & 5 deletions README.md
@@ -128,7 +128,7 @@ name
- ensemble.BaggingClassifier (BHARAT)
- ensemble.ExtraTreesClassifier
- ensemble.GradientBoostingClassifier (NASH)
- ensemble.RandomForestClassifier
- ensemble.RandomForestClassifier (MATT)
- ensemble.RandomTreesEmbedding
- ensemble.VotingClassifier (BHARAT)
@@ -139,18 +139,18 @@ name
- multiclass.OneVsOneClassifier
- multiclass.OneVsRestClassifier
- multiclass.OutputCodeClassifier
- naive_bayes.BernoulliNB
- naive_bayes.GaussianNB
- naive_bayes.BernoulliNB (ANDREY)
- naive_bayes.GaussianNB (ANDREY)
- naive_bayes.MultinomialNB
- neighbors.KNeighborsClassifier (MATT)
- neighbors.NearestCentroid
- neighbors.RadiusNeighborsClassifier
- neural_network.BernoulliRBM (MAXIME)
- semi_supervised.LabelPropagation
- svm.LinearSVC
- svm.NuSVC
- svm.NuSVC (ANDREY)
- svm.SVC (MATT)
- tree.DecisionTreeClassifier
- tree.DecisionTreeClassifier (MATT)
- tree.ExtraTreeClassifier
```

110 changes: 12 additions & 98 deletions app.py
@@ -1,107 +1,21 @@
# Import libraries
import numpy as np
import pandas as pd
import os
import time
# from matplotlib import pyplot as plt
from sklearn import grid_search
from sklearn.metrics import f1_score
from sklearn import tree, svm, naive_bayes, ensemble, neighbors
from sklearn.cross_validation import train_test_split
import datetime

# Read student data
student_data = pd.read_csv("student-data.csv")
student_data = student_data.reindex(np.random.permutation(student_data.index))
print("Data read successfully!")
from lib.data.wrangler import readData
from lib.helpers import runTests, writeToCsv

dtc = tree.DecisionTreeClassifier()
svc = svm.SVC()
nbc = naive_bayes.GaussianNB()
knn = neighbors.KNeighborsClassifier()
rfc = ensemble.RandomForestClassifier()
adc = ensemble.AdaBoostClassifier()
X_test, \
X_train, \
y_test, \
y_train = readData('./tmp/testTrainData.npz')

models = [dtc, svc, nbc, knn, rfc, adc]
X, y = np.arange(1000).reshape((500, 2)), range(500)
classifiers, \
train_times, \
pred_times, \
f1_trains, \
f1_tests = runTests(X_test, X_train, y_test, y_train)

class Model:
    def __init__(self, classifier, parameters=[]):
        self.classifier = classifier
        self.parameters = parameters

    def train_classifier(self, clf, X_train, y_train):
        print("Training {}...".format(clf.__class__.__name__))
        # start = np.datetime64(datetime.datetime.now(),"us")
        start = time.time()
        clf.fit(X_train, y_train)
        # end = np.datetime64(datetime.datetime.now(),"us")
        end = time.time()
        self.training_time = end - start
        print(self.training_time)

    def predict_labels(self, clf, features, target):
        # print("Predicting labels using {}...".format(clf.__class__.__name__))
        # start = np.datetime64(datetime.datetime.now(),"us")
        start = time.time()
        y_pred = clf.predict(features)
        # end = np.datetime64(datetime.datetime.now(),"us")
        end = time.time()
        self.prediction_time = end - start
        f1_score_output = f1_score(target, y_pred, average="macro")
        return f1_score_output

    def train_predict(self, clf, X_train, y_train, X_test, y_test):
        print("------------------------------------------")
        print("Training set size: {}".format(len(X_train)))
        self.train_classifier(clf, X_train, y_train)
        self.f1_train = self.predict_labels(clf, X_train, y_train)
        self.f1_test = self.predict_labels(clf, X_test, y_test)
        return [self.training_time, self.prediction_time, self.f1_train, self.f1_test]


dataframes = []

# NOTE: x is never used below; every pass evaluates the same full split.
for x in [100, 200, 300]:
    classifiers = []
    train_times = []
    pred_times = []
    f1_trains = []
    f1_tests = []

    for model in models:
        clf = Model(model)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

        output = clf.train_predict(model, X_train, y_train, X_test, y_test)
        classifiers.append(model.__class__.__name__)
        train_times.append(str(output[0]))
        pred_times.append(str(output[1]))
        f1_trains.append(output[2])
        f1_tests.append(output[3])

    df = pd.DataFrame({"Classifier": classifiers,
                       "Training Time": train_times,
                       "Prediction Time": pred_times,
                       "F1 Score on Training Set": f1_trains,
                       "F1 Score on Test Set": f1_tests})
    dataframes.append(df)

for i, frame in enumerate(dataframes):
    filenumber = (i + 1) * 100  # sizes were 100, 200, 300; i alone starts at 0
    filename = "results/{} samples.csv".format(filenumber)
    frame.to_csv(filename)


# def fit_model(clf,parameters,X,Y):
# clfr = grid_search.GridSearchCV(clf,parameters,scoring="f1",cv=4)
# return clfr.fit(X,Y)

# clf = fit_model(svc,
# [{"kernel":["poly"],
# "degree":[1,2,3,4,5],
# "C":[1,10,100,1000],
# }],X_train,y_train)

# print(clf.best_params_)
# print(predict_labels(clf,X_test,y_test))
writeToCsv(classifiers, train_times, pred_times, f1_trains, f1_tests)
99 changes: 99 additions & 0 deletions doc/EDA.Rmd
@@ -0,0 +1,99 @@
---
output: html_document
---
Adult Data Exploratory Visualization
=======================================================

# Data Prep

Loading necessary packages and Adult Data.

``` {r echo=TRUE, message=FALSE, warning=FALSE, packages}
# Loading useful packages
packs = c("ggplot2","ggthemes","gridExtra")
lapply(packs, function(p) {
  if (!(p %in% installed.packages()[,'Package'])) {
    install.packages(p)
  } else {
    gsub('package', p, 'package already installed')
  }
})
library(ggplot2)
library(ggthemes)
library(gridExtra)
# Loading adult data
cNames = c('Age','Workclass','FnlWgt','Education','EducationNum',
'MaritalStatus','Occupation','Relationship','Race','Sex',
'CapitalGain','CapitalLoss','HoursPerWeek','NativeCountry','Income')
adult = read.csv('../data/adult.data', header=FALSE, col.names = cNames, strip.white = TRUE)
```

# Data Overview
```{r echo=TRUE, message=FALSE, warning=FALSE, overview}
# Dimensions of data
m = nrow(adult)
n = ncol(adult)
gsub('ncol',n,gsub('nrow',m,'The dataset has nrow rows and ncol columns'))
# Feature names
colnames(adult)
# Unique classes
unique(adult$Income)
# Summary of features
summary(adult[, names(adult)!='Income'])
# Class balance
table(adult$Income) / m
# Racial balance
table(adult$Race) / m
# Gender balance
table(adult$Sex) / m
```

There are 14 features to work with, and the binary label "Income" makes this a two-class classification task. The classes are imbalanced but not severely so, at roughly 76%-24%. Racially, the dataset is heavily dominated by Caucasians, and it is about two-thirds male.

One of the most interesting features, particularly when attempting to model Income, ought to be Education.

```{r echo=FALSE, message=FALSE, warning=FALSE, Plot_One}
# Set the ggPlot theme to something not gross
theme_set(theme_minimal(12))
# Factor levels aren't set properly; manually set
adult$Education = sapply(adult$Education, as.character)
levs = c('Preschool','1st-4th','5th-6th','7th-8th','9th','10th','11th','12th',
'HS-grad','Some-college','Assoc-acdm','Assoc-voc','Bachelors','Masters','Prof-school','Doctorate')
adult$Education = factor(adult$Education, levels = levs)
# Education histogram
ggplot(adult, aes(Education)) +
geom_bar(fill="#236B8E") +
geom_vline(aes(xintercept=mean(EducationNum)),
color="black", linetype="dashed",size=0.75) +
ylab("Count") +
ggtitle("Education Histogram w/ Mean") +
theme(axis.text.x = element_text(angle=90, hjust=1),
axis.title.x=element_blank())
# Education split by gender
# Normalize to discount gender imbalance
edByGender = aggregate(1:nrow(adult), by=list(adult$Sex, adult$Education), length)
names(edByGender) = c('Gender','Education','Num')
edByGender$EdPctOfGender = with(edByGender, ifelse(Gender=='Male',
Num / sum(Num[Gender=='Male']),
Num / sum(Num[Gender=='Female'])))
edByGender$GenderPctOfEd = mapply(function(ed,num) {
return(num / sum(edByGender[edByGender$Education==ed,'EdPctOfGender']))
}, edByGender$Education, edByGender$EdPctOfGender)
# TODO: make cumulative line plot of Num by Gender
```
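
A rough pandas equivalent of the gender-normalization step above, for the Python side of this repo (a sketch only: the file path and column names mirror the R chunk, and `groupby`/`transform` stand in for the `aggregate`/`ifelse` logic):

```python
import pandas as pd

# Mirror the R read.csv call: the UCI Adult data file has no header row.
cNames = ['Age', 'Workclass', 'FnlWgt', 'Education', 'EducationNum',
          'MaritalStatus', 'Occupation', 'Relationship', 'Race', 'Sex',
          'CapitalGain', 'CapitalLoss', 'HoursPerWeek', 'NativeCountry', 'Income']
adult = pd.read_csv('../data/adult.data', header=None, names=cNames,
                    skipinitialspace=True)

# Count rows per (gender, education) cell.
counts = (adult.groupby(['Sex', 'Education']).size()
               .rename('Num').reset_index())

# Share of each gender at each education level (discounts the gender imbalance).
counts['EdPctOfGender'] = counts['Num'] / counts.groupby('Sex')['Num'].transform('sum')

# Within each education level, each gender's share of the normalized mass.
counts['GenderPctOfEd'] = (counts['EdPctOfGender'] /
                           counts.groupby('Education')['EdPctOfGender'].transform('sum'))
```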
261 changes: 261 additions & 0 deletions doc/EDA.html

Large diffs are not rendered by default.

39 changes: 39 additions & 0 deletions lib/data/__init__.py
@@ -0,0 +1,39 @@
import datetime
import os
import time
import pandas as pd

from lib.model import Model
from lib.model.classifiers import CLASSIFIERS

def runTests(X_test, X_train, y_test, y_train):
    classifiers = []
    train_times = []
    pred_times = []
    f1_trains = []
    f1_tests = []

    for classifier, parameters in CLASSIFIERS:
        this_model = Model(classifier, parameters)

        this_model(X_train, y_train, X_test, y_test)
        classifiers.append(this_model.classifier.__class__.__name__)
        train_times.append(this_model.training_time)
        pred_times.append(this_model.train_prediction_time)
        f1_trains.append(this_model.f1_train)
        f1_tests.append(this_model.f1_test)

    return classifiers, train_times, pred_times, f1_trains, f1_tests

def writeToCsv(classifiers, train_times, pred_times, f1_trains, f1_tests):
    df = pd.DataFrame({"Classifier": classifiers,
                       "Training Time": train_times,
                       "Prediction Time": pred_times,
                       "F1 Score on Training Set": f1_trains,
                       "F1 Score on Test Set": f1_tests})

    # Timestamp the output so successive runs do not overwrite each other.
    t = time.mktime(datetime.datetime.now().timetuple())
    if not os.path.exists('results'):
        os.makedirs('results')
    filename = str(t) + '.csv'
    df.to_csv('results/' + filename)
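
A minimal driver sketch for these helpers, mirroring the new `app.py` above (note that `app.py` imports them from `lib.helpers` while this file is `lib/data/__init__.py`; the import path below is therefore an assumption):

```python
from lib.data import runTests, writeToCsv
from lib.data.wrangler import readData

# Assumes `make wrangle_data` has already written ./tmp/testTrainData.npz.
X_test, X_train, y_test, y_train = readData('./tmp/testTrainData.npz')
results = runTests(X_test, X_train, y_test, y_train)
writeToCsv(*results)
```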
24 changes: 8 additions & 16 deletions lib/dataWrangler.py → lib/data/wrangler.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,16 @@
"""
usage: dataWrangler.py [-h] [-i RAWDATALOC] [-s CV] [-r RS] [-o TEMPLOC]
DataWrangling Script - reads rawData from the /data folder and creates
features-labels(Test Train Split) and store it in /tmp folder
optional arguments:
-h, --help     show this help message and exit
 -i RAWDATALOC  rawData file location <use absolute file path>.
                default_value: ./data/adult.data
 -s CV          size of the cross_validation set. default_value: 0.30
 -r RS          random_state to use for splitting the data. default_value: 42
 -o TEMPLOC     file location to store temporary binary data for
                test_train_split <use absolute file path>. default_value:
                ./tmp/
"""


from __future__ import print_function, absolute_import
import os
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
import argparse

def readData(fileLoc=None):
    data = np.load(fileLoc)
    print(data.keys())
    return (data[item] for item in data.keys())
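
# Caution: NpzFile key iteration order is not guaranteed to match the order
# the arrays were written with np.savez(**kwargs), so callers that unpack
# this generator positionally (as app.py does) should confirm the printed
# keys line up with their unpacking order.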

def getData(fileLoc=None):
"""
@@ -102,6 +92,8 @@ def storeData(df, fileLoc='./tmp/', cv=0.30, rs=21):
    # and access the train/test split using dictionary formatting.
    # Ex: data['XTrain']
    """
    if not os.path.exists('tmp'):
        os.makedirs('tmp')
    filename = fileLoc + 'testTrainData'
    XTrain, XTest, yTrain, yTest = trainCvSplit(df, cv, rs)
    kwargs = {'XTrain': XTrain,
