From 31622240c5420aca9eca91c890bdf89fdda25017 Mon Sep 17 00:00:00 2001
From: Matthew Zhou
Date: Wed, 17 Aug 2016 19:13:48 -0400
Subject: [PATCH 1/7] Update README.md

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index ca262c7..adca411 100644
--- a/README.md
+++ b/README.md
@@ -128,7 +128,7 @@ name
 - ensemble.BaggingClassifier (BHARAT)
 - ensemble.ExtraTreesClassifier
 - ensemble.GradientBoostingClassifier (NASH)
-- ensemble.RandomForestClassifier
+- ensemble.RandomForestClassifier (MATT)
 - ensemble.RandomTreesEmbedding
 - ensemble.RandomTreesEmbedding
 - ensemble.VotingClassifier (BHARAT)
@@ -150,7 +150,7 @@ name
 - svm.LinearSVC
 - svm.NuSVC
 - svm.SVC (MATT)
-- tree.DecisionTreeClassifier
+- tree.DecisionTreeClassifier (MATT)
 - tree.ExtraTreeClassifier
 ```

From a21b1b9d11375c45bfdd8cfeeffc1a2e27432674 Mon Sep 17 00:00:00 2001
From: Joshua Cook
Date: Thu, 18 Aug 2016 19:49:17 -0700
Subject: [PATCH 2/7] Update README.md

---
 README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index adca411..24cbd67 100644
--- a/README.md
+++ b/README.md
@@ -139,8 +139,8 @@ name
 - multiclass.OneVsOneClassifier
 - multiclass.OneVsRestClassifier
 - multiclass.OutputCodeClassifier
-- naive_bayes.BernoulliNB
-- naive_bayes.GaussianNB
+- naive_bayes.BernoulliNB (ANDREY)
+- naive_bayes.GaussianNB (ANDREY)
 - naive_bayes.MultinomialNB
 - neighbors.KNeighborsClassifier (MATT)
 - neighbors.NearestCentroid
@@ -148,7 +148,7 @@ name
 - neural_network.BernoulliRBM (MAXIME)
 - semi_supervised.LabelPropagation
 - svm.LinearSVC
-- svm.NuSVC
+- svm.NuSVC (ANDREY)
 - svm.SVC (MATT)
 - tree.DecisionTreeClassifier (MATT)
 - tree.ExtraTreeClassifier

From 9ea3544671e407de5339b051443581c52b460d8a Mon Sep 17 00:00:00 2001
From: Joshua Cook
Date: Fri, 19 Aug 2016 20:50:40 -0700
Subject: [PATCH 3/7] refactor and abstract app.py

---
 app.py             | 106 ++++++++++++---------------------------------
 lib/classifiers.py |  46 ++++++++++++++++++++
 lib/model.py       |  63 +++++++++++++++++++++++++++
 3 files changed, 136 insertions(+), 79 deletions(-)
 create mode 100644 lib/classifiers.py
 create mode 100644 lib/model.py

diff --git a/app.py b/app.py
index a004a0a..a921173 100644
--- a/app.py
+++ b/app.py
@@ -1,91 +1,39 @@
 # Import libraries
 import numpy as np
 import pandas as pd
-import os
-import time
-# from matplotlib import pyplot as plt
 from sklearn import grid_search
-from sklearn.metrics import f1_score
-from sklearn import tree, svm, naive_bayes, ensemble, neighbors
 from sklearn.cross_validation import train_test_split
-import datetime
 
-# Read student data
-student_data = pd.read_csv("student-data.csv")
-student_data.reindex(np.random.permutation(student_data.index))
-print("Data read successfully!")
-
-dtc = tree.DecisionTreeClassifier()
-svc = svm.SVC()
-nbc = naive_bayes.GaussianNB()
-knn = neighbors.KNeighborsClassifier()
-rfc = ensemble.RandomForestClassifier()
-adc = ensemble.AdaBoostClassifier()
-
-models = [dtc, svc, nbc, knn, rfc, adc]
-X, y = np.arange(1000).reshape((500, 2)), range(500)
-
-class Model:
-    def __init__(self, classifier, parameters=[]):
-        self.classifier = classifier
-        self.parameters = parameters
-
-    def train_classifier(self, clf, X_train, y_train):
-        print("Training {}...".format(clf.__class__.__name__))
-        # start = np.datetime64(datetime.datetime.now(),"us")
-        start = time.time()
-        clf.fit(X_train, y_train)
-        # end = np.datetime64(datetime.datetime.now(),"us")
-        end = time.time()
-        self.training_time = end - start
-        print(self.training_time)
-
-    def predict_labels(self, clf, features, target):
-        # print("Predicting labels using {}...".format(clf.__class__.__name__))
-        # start = np.datetime64(datetime.datetime.now(),"us")
-        start = time.time()
-        y_pred = clf.predict(features)
-        # end = np.datetime64(datetime.datetime.now(),"us")
-        end = time.time()
-        self.prediction_time = end - start
-        f1_score_output = f1_score(target, y_pred, average="macro")
-        return f1_score_output
-
-    def train_predict(self, clf, X_train, y_train, X_test, y_test):
-        print("------------------------------------------")
-        print("Training set size: {}".format(len(X_train)))
-        self.train_classifier(clf, X_train, y_train)
-        self.f1_train = self.predict_labels(clf, X_train, y_train)
-        self.f1_test = self.predict_labels(clf, X_test, y_test)
-        return [self.training_time, self.prediction_time, self.f1_train, self.f1_test]
+from lib.classifiers import CLASSIFIERS
+from lib.model import Model
 
+data = np.load('./tmp/testTrainData.npz')
+print(data.keys())
+X_train, X_test, y_test, y_train = (data[item] for item in data.keys())
 
+classifiers = []
 dataframes = []
-
-for x in [100, 200, 300]:
-    classifiers = []
-    train_times = []
-    pred_times = []
-    f1_trains = []
-    f1_tests = []
-
-    for model in models:
-        clf = Model(model)
-        X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.33, random_state=42)
-
-        output = clf.train_predict(model, X_train, y_train, X_test, y_test)
-        classifiers.append(model.__class__.__name__)
-        train_times.append(str(output[0]))
-        pred_times.append(str(output[1]))
-        f1_trains.append(output[2])
-        f1_tests.append(output[3])
-
-    df = pd.DataFrame({"Classifier": classifiers,
-                       "Training Time": train_times,
-                       "Prediction Time": pred_times,
-                       "F1 Score on Training Set": f1_trains,
-                       "F1 Score on Test Set": f1_tests})
-    dataframes.append(df)
+train_times = []
+pred_times = []
+f1_trains = []
+f1_tests = []
+
+for classifier, parameters in CLASSIFIERS:
+    this_model = Model(classifier, parameters)
+
+    this_model(X_train, y_train, X_test, y_test)
+    classifiers.append(this_model.classifier.__class__.__name__)
+    train_times.append(this_model.training_time)
+    pred_times.append(this_model.train_prediction_time)
+    f1_trains.append(this_model.f1_train)
+    f1_tests.append(this_model.f1_test)
+
+df = pd.DataFrame({"Classifier": classifiers,
+                   "Training Time": train_times,
+                   "Prediction Time": pred_times,
+                   "F1 Score on Training Set": f1_trains,
+                   "F1 Score on Test Set": f1_tests})
+dataframes.append(df)
 
 for i, frame in enumerate(dataframes):
     filenumber = i * 100
diff --git a/lib/classifiers.py b/lib/classifiers.py
new file mode 100644
index 0000000..b1a3018
--- /dev/null
+++ b/lib/classifiers.py
@@ -0,0 +1,46 @@
+from sklearn import calibration
+from sklearn import discriminant_analysis
+from sklearn import dummy
+from sklearn import ensemble
+from sklearn import linear_model
+from sklearn import multiclass
+from sklearn import naive_bayes
+from sklearn import neighbors
+from sklearn import neural_network
+from sklearn import semi_supervised
+from sklearn import svm
+from sklearn import tree
+
+CLASSIFIERS = [
+    # calibration.CalibratedClassifierCV,
+    # discriminant_analysis.LinearDiscriminantAnalysis,
+    # discriminant_analysis.QuadraticDiscriminantAnalysis,
+    # dummy.DummyClassifier,
+    # ensemble.AdaBoostClassifier,
+    # ensemble.BaggingClassifier,
+    # ensemble.ExtraTreesClassifier,
+    # ensemble.GradientBoostingClassifier,
+    # ensemble.RandomForestClassifier,
+    # ensemble.RandomTreesEmbedding,
+    # ensemble.VotingClassifier,
+    # linear_model.LogisticRegression,
+    # linear_model.PassiveAggressiveClassifier,
+    # linear_model.RidgeClassifier,
+    # linear_model.SGDClassifier,
+    # multiclass.OneVsOneClassifier,
+    # multiclass.OneVsRestClassifier,
+    # multiclass.OutputCodeClassifier,
+    # naive_bayes.BernoulliNB,
+    # naive_bayes.GaussianNB,
+    # naive_bayes.MultinomialNB,
+    # neighbors.KNeighborsClassifier,
+    # neighbors.NearestCentroid,
+    # neighbors.RadiusNeighborsClassifier,
+    # neural_network.BernoulliRBM,
+    # semi_supervised.LabelPropagation,
+    # svm.LinearSVC,
+    # svm.NuSVC,
+    (svm.SVC, {}),
+    (tree.DecisionTreeClassifier, {'random_state' : 42})
+    # tree.ExtraTreeClassifier
+    ]
diff --git a/lib/model.py b/lib/model.py
new file mode 100644
index 0000000..70ed10b
--- /dev/null
+++ b/lib/model.py
@@ -0,0 +1,63 @@
+import time
+from sklearn.metrics import f1_score
+
+class Model:
+    def __init__(self, classifier, parameters={}):
+        self.classifier = classifier(**parameters)
+        self.training_time = None
+        self.length_training_set = None
+        self.f1_train, self.train_prediction_time = None, None
+        self.f1_test, self.test_prediction_time = None, None
+
+    def train_classifier(self, X_train, y_train):
+        print("Training {}...".format(self.classifier.__class__.__name__))
+        # start = np.datetime64(datetime.datetime.now(),"us")
+        start = time.time()
+        self.classifier.fit(X_train, y_train)
+        # end = np.datetime64(datetime.datetime.now(),"us")
+        end = time.time()
+        self.training_time = end - start
+
+    def predict_labels(self, features, target):
+        # print("Predicting labels using {}...".format(self.classifier.__class__.__name__))
+        # start = np.datetime64(datetime.datetime.now(),"us")
+        start = time.time()
+        y_pred = self.classifier.predict(features)
+        # end = np.datetime64(datetime.datetime.now(),"us")
+        end = time.time()
+        prediction_time = end - start
+        f1_score_output = f1_score(target, y_pred, average="macro")
+        return f1_score_output, prediction_time
+
+    def __str__(self):
+        return """
+Model(classifier: {}
+      length training set: {}
+      training time: {}
+      train f1/pred. time: {} {}
+      test f1/pred. time: {} {})
+        """.format(self.classifier.__class__.__name__,
+                   self.length_training_set,
+                   self.training_time, self.f1_train, self.train_prediction_time,
+                   self.f1_test, self.test_prediction_time)
+
+    def __repr__(self):
+        return """
+Model(classifier: {}
+      length training set: {}
+      training time: {}
+      train f1/pred. time: {} {}
+      test f1/pred. time: {} {})
+
+Detailed Classifier Description:
+{}
+        """.format(self.classifier.__class__.__name__,
+                   self.length_training_set,
+                   self.training_time, self.f1_train, self.train_prediction_time,
+                   self.f1_test, self.test_prediction_time, self.classifier)
+
+    def __call__(self, X_train, y_train, X_test, y_test):
+        self.length_training_set = len(X_train)
+        self.train_classifier(X_train, y_train)
+        self.f1_train, self.train_prediction_time = self.predict_labels(X_train, y_train)
+        self.f1_test, self.test_prediction_time = self.predict_labels(X_test, y_test)

From 2e760f041156f47656ebacc4a70458d5b06bbb38 Mon Sep 17 00:00:00 2001
From: Joshua Cook
Date: Sat, 20 Aug 2016 11:10:48 -0700
Subject: [PATCH 4/7] more abstraction

---
 Dockerfile                                |  3 ---
 Makefile                                  | 13 ++++++++++---
 app.py                                    |  4 ++--
 lib/{dataWrangler.py => data/wrangler.py} |  3 +++
 lib/{model.py => model/__init__.py}       |  0
 lib/{ => model}/classifiers.py            |  0
 test/__init__.py                          |  0
 lib/baseLiner.py => test/test_dtree.py    | 15 +++++++++++----
 8 files changed, 26 insertions(+), 12 deletions(-)
 rename lib/{dataWrangler.py => data/wrangler.py} (98%)
 rename lib/{model.py => model/__init__.py} (100%)
 rename lib/{ => model}/classifiers.py (100%)
 create mode 100644 test/__init__.py
 rename lib/baseLiner.py => test/test_dtree.py (57%)

diff --git a/Dockerfile b/Dockerfile
index 672ab0d..2dee382 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1 @@
 FROM jupyter/scipy-notebook
-RUN mkdir results
-COPY app.py .
-COPY student-data.csv .
diff --git a/Makefile b/Makefile
index 32f8af5..2eab26f 100644
--- a/Makefile
+++ b/Makefile
@@ -1,11 +1,18 @@
+wrangle_data:
+	docker build -t timing_comparison .
+	docker run -it -v $(shell pwd):/home/jovyan/work --rm timing_comparison python -m lib.data.wrangler
+
 single_classfier:
 	docker build -t timing_comparison .
 	docker run -it --rm timing_comparison python app.py CLASSIFIER=$(CLASSIFIER)
-
+
 all_classifiers:
 	docker build -t timing_comparison .
-	docker run -it --rm timing_comparison python app.py
-
+	docker run -it --rm timing_comparison python app.py
+
 notebook_server:
 	docker build -t timing_comparison .
 	docker run --rm timing_comparison
+
+clean:
+	rm -rf tmp results **/*.pyc **/__pycache__
diff --git a/app.py b/app.py
index a921173..0ad8cb6 100644
--- a/app.py
+++ b/app.py
@@ -4,12 +4,12 @@
 from sklearn import grid_search
 from sklearn.cross_validation import train_test_split
 
-from lib.classifiers import CLASSIFIERS
 from lib.model import Model
+from lib.model.classifiers import CLASSIFIERS
 
 data = np.load('./tmp/testTrainData.npz')
 print(data.keys())
-X_train, X_test, y_test, y_train = (data[item] for item in data.keys())
+X_test, X_train, y_test, y_train = (data[item] for item in data.keys())
 
 classifiers = []
 dataframes = []
diff --git a/lib/dataWrangler.py b/lib/data/wrangler.py
similarity index 98%
rename from lib/dataWrangler.py
rename to lib/data/wrangler.py
index 6fd570c..a371c07 100644
--- a/lib/dataWrangler.py
+++ b/lib/data/wrangler.py
@@ -16,6 +16,7 @@
 """
 from __future__ import print_function, absolute_import
+import os
 import pandas as pd
 import numpy as np
 from sklearn.cross_validation import train_test_split
@@ -102,6 +103,8 @@ def storeData(df, fileLoc='./tmp/', cv=0.30, rs=21):
     # and access the train/test split using dictionary formatting.
     # Ex: data['XTrain']
     """
+    if not os.path.exists('tmp'):
+        os.makedirs('tmp')
     filename = fileLoc+'testTrainData'
     XTrain, XTest, yTrain, yTest = trainCvSplit(df, cv, rs)
     kwargs = {'XTrain': XTrain,
diff --git a/lib/model.py b/lib/model/__init__.py
similarity index 100%
rename from lib/model.py
rename to lib/model/__init__.py
diff --git a/lib/classifiers.py b/lib/model/classifiers.py
similarity index 100%
rename from lib/classifiers.py
rename to lib/model/classifiers.py
diff --git a/test/__init__.py b/test/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/lib/baseLiner.py b/test/test_dtree.py
similarity index 57%
rename from lib/baseLiner.py
rename to test/test_dtree.py
index 3f603dd..4d06b76 100644
--- a/lib/baseLiner.py
+++ b/test/test_dtree.py
@@ -10,7 +10,7 @@ def baseline(*args):
     # returns: the classification accuracy_score
     """
     XTrain, XTest, yTrain, yTest = args
-    clf = DecisionTreeClassifier()
+    clf = DecisionTreeClassifier(random_state=42)
     clf.fit(XTrain, yTrain)
     return clf.score(XTest, yTest), clf.feature_importances_
 
@@ -28,6 +28,13 @@ def selectFeatures(k_features=5, *args):
 
 if __name__ == "__main__":
     data = np.load('./tmp/testTrainData.npz')
-    XTrain, XTest, yTrain, yTest = (data[item] for item in data.keys())
-    print(baseline(XTrain, XTest, yTrain, yTest))
-    print(selectFeatures(5, XTrain, yTrain))
+    XTrain, XTest, yTest, yTrain = (data[item] for item in data.keys())
+    dtree_output = baseline(XTrain, XTest, yTrain, yTest)
+    assert(np.allclose(dtree_output[0], 0.81349, atol=1e-5))
+    assert(np.allclose(dtree_output[1],
+           np.array([ 0.18321826,  0.05276996,  0.14053585,  0.16994514,  0.0759449 ,
+                      0.02099475,  0.01722604,  0.17374549,  0.04401084,  0.09797639,
+                      0.02363239]), atol=0.01))
+    k_best_output = selectFeatures(5, XTrain, yTrain)
+    assert(k_best_output.shape == (22792, 5))
+    #assert(np.allclose())

From 92dc2e4ae2dcfb671462d794cada24cbacc4e17c Mon Sep 17 00:00:00 2001
From: Joshua Cook
Date: Sat, 20 Aug 2016 11:29:33 -0700
Subject: [PATCH 5/7] one last refactor

---
 app.py                  | 58 +++++++++--------------------------
 lib/data/__init__.py    | 40 ++++++++++++++++++++++++++++
 lib/data/wrangler.py    | 21 ++++-----------
 lib/helpers/__init__.py | 40 ++++++++++++++++++++++++++++
 4 files changed, 97 insertions(+), 62 deletions(-)
 create mode 100644 lib/data/__init__.py
 create mode 100644 lib/helpers/__init__.py

diff --git a/app.py b/app.py
index 0ad8cb6..2ce2a55 100644
--- a/app.py
+++ b/app.py
@@ -4,52 +4,18 @@
 from sklearn import grid_search
 from sklearn.cross_validation import train_test_split
 
-from lib.model import Model
-from lib.model.classifiers import CLASSIFIERS
+from lib.data.wrangler import readData
+from lib.helpers import runTests, writeToCsv
 
-data = np.load('./tmp/testTrainData.npz')
-print(data.keys())
-X_test, X_train, y_test, y_train = (data[item] for item in data.keys())
+X_test, \
+    X_train, \
+    y_test, \
+    y_train = readData('./tmp/testTrainData.npz')
 
-classifiers = []
-dataframes = []
-train_times = []
-pred_times = []
-f1_trains = []
-f1_tests = []
+classifiers, \
+    train_times, \
+    pred_times, \
+    f1_trains, \
+    f1_tests = runTests(X_test, X_train, y_test, y_train)
 
-for classifier, parameters in CLASSIFIERS:
-    this_model = Model(classifier, parameters)
-
-    this_model(X_train, y_train, X_test, y_test)
-    classifiers.append(this_model.classifier.__class__.__name__)
-    train_times.append(this_model.training_time)
-    pred_times.append(this_model.train_prediction_time)
-    f1_trains.append(this_model.f1_train)
-    f1_tests.append(this_model.f1_test)
-
-df = pd.DataFrame({"Classifier": classifiers,
-                   "Training Time": train_times,
-                   "Prediction Time": pred_times,
-                   "F1 Score on Training Set": f1_trains,
-                   "F1 Score on Test Set": f1_tests})
-dataframes.append(df)
-
-for i, frame in enumerate(dataframes):
-    filenumber = i * 100
-    filename = "results/{} samples.csv".format(filenumber)
-    frame.to_csv(filename)
-
-
-# def fit_model(clf,parameters,X,Y):
-#     clfr = grid_search.GridSearchCV(clf,parameters,scoring="f1",cv=4)
-#     return clfr.fit(X,Y)
-
-# clf = fit_model(svc,
-#                 [{"kernel":["poly"],
-#                   "degree":[1,2,3,4,5],
-#                   "C":[1,10,100,1000],
-#                 }],X_train,y_train)
-
-# print(clf.best_params_)
-    # print(predict_labels(clf,X_test,y_test))
+writeToCsv(classifiers, train_times, pred_times, f1_trains, f1_tests)
diff --git a/lib/data/__init__.py b/lib/data/__init__.py
new file mode 100644
index 0000000..5fc5288
--- /dev/null
+++ b/lib/data/__init__.py
@@ -0,0 +1,40 @@
+import datetime
+import time
+import os
+import pandas as pd
+
+from lib.model import Model
+from lib.model.classifiers import CLASSIFIERS
+
+def runTests(X_test, X_train, y_test, y_train):
+    classifiers = []
+    train_times = []
+    pred_times = []
+    f1_trains = []
+    f1_tests = []
+
+    for classifier, parameters in CLASSIFIERS:
+        this_model = Model(classifier, parameters)
+
+        this_model(X_train, y_train, X_test, y_test)
+        classifiers.append(this_model.classifier.__class__.__name__)
+        train_times.append(this_model.training_time)
+        pred_times.append(this_model.train_prediction_time)
+        f1_trains.append(this_model.f1_train)
+        f1_tests.append(this_model.f1_test)
+
+    return classifiers, train_times, pred_times, f1_trains, f1_tests
+
+def writeToCsv(classifiers, train_times, pred_times, f1_trains, f1_tests):
+    df = pd.DataFrame({"Classifier": classifiers,
+                       "Training Time": train_times,
+                       "Prediction Time": pred_times,
+                       "F1 Score on Training Set": f1_trains,
+                       "F1 Score on Test Set": f1_tests})
+
+    t = datetime.datetime(2011, 10, 21, 0, 0)
+    t = time.mktime(t.timetuple())
+    if not os.path.exists('results'):
+        os.makedirs('results')
+    filename = str(t)+'.csv'
+    df.to_csv('results/'+filename)
diff --git a/lib/data/wrangler.py b/lib/data/wrangler.py
index a371c07..ca50358 100644
--- a/lib/data/wrangler.py
+++ b/lib/data/wrangler.py
@@ -1,19 +1,4 @@
-"""
-usage: dataWrangler.py [-h] [-i RAWDATALOC] [-s CV] [-r RS] [-o TEMPLOC]
-
-DataWrangling Script - reads rawData from the /data folder and creates
-features-labels(Test Train Split) and store it in /tmp folder
-
-optional arguments:
-  -h, --help     show this help message and exit
-  -i RAWDATALOC  rawData file location .
-                 default_value: ./data/adult.data
-  -s CV          size of the cross_validation set. default_value: 0.30
-  -r RS          random_state to use for splitting the data. default_value: 42
-  -o TEMPLOC     file location to store temporary binary data for
-                 test_train_split . default_value:
-                 ./tmp/
-"""
+
 from __future__ import print_function, absolute_import
 
 import os
@@ -22,6 +7,10 @@ from sklearn.cross_validation import train_test_split
 import argparse
 
+def readData(fileLoc=None):
+    data = np.load(fileLoc)
+    print(data.keys())
+    return (data[item] for item in data.keys())
 
 def getData(fileLoc=None):
     """
diff --git a/lib/helpers/__init__.py b/lib/helpers/__init__.py
new file mode 100644
index 0000000..2edf8ef
--- /dev/null
+++ b/lib/helpers/__init__.py
@@ -0,0 +1,40 @@
+import datetime
+import time
+import os
+import pandas as pd
+
+from lib.model import Model
+from lib.model.classifiers import CLASSIFIERS
+
+def runTests(X_test, X_train, y_test, y_train):
+    classifiers = []
+    train_times = []
+    pred_times = []
+    f1_trains = []
+    f1_tests = []
+
+    for classifier, parameters in CLASSIFIERS:
+        this_model = Model(classifier, parameters)
+
+        this_model(X_train, y_train, X_test, y_test)
+        classifiers.append(this_model.classifier.__class__.__name__)
+        train_times.append(this_model.training_time)
+        pred_times.append(this_model.train_prediction_time)
+        f1_trains.append(this_model.f1_train)
+        f1_tests.append(this_model.f1_test)
+
+    return classifiers, train_times, pred_times, f1_trains, f1_tests
+
+def writeToCsv(classifiers, train_times, pred_times, f1_trains, f1_tests):
+    df = pd.DataFrame({"Classifier": classifiers,
+                       "Training Time": train_times,
+                       "Prediction Time": pred_times,
+                       "F1 Score on Training Set": f1_trains,
+                       "F1 Score on Test Set": f1_tests})
+
+    t = datetime.datetime(2011, 10, 21, 0, 0)
+    t = time.mktime(t.timetuple())
+    if not os.path.exists('results'):
+        os.makedirs('results')
+    filename = str(t)+'.csv'
+    df.to_csv('results/'+filename)

From 1efa8343721003b91e191cee58b4c55b96e89074 Mon Sep 17 00:00:00 2001
From: Nash Taylor
Date: Tue, 23 Aug 2016 14:00:12 -0700
Subject: [PATCH 6/7] Initial visualization commit

---
 doc/EDA.Rmd  |  36 ++++++++++
 doc/EDA.html | 182 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 218 insertions(+)
 create mode 100644 doc/EDA.Rmd
 create mode 100644 doc/EDA.html

diff --git a/doc/EDA.Rmd b/doc/EDA.Rmd
new file mode 100644
index 0000000..eacc9d5
--- /dev/null
+++ b/doc/EDA.Rmd
@@ -0,0 +1,36 @@
+---
+output: html_document
+---
+Adult Data Exploratory Visualization
+=======================================================
+
+Loading necessary packages and Adult Data.
+
+``` {r echo=TRUE, message=FALSE, warning=FALSE, packages}
+# Loading useful packages
+packs = c("ggplot2","ggthemes","gridExtra")
+lapply(packs, function(p) {
+  if (!(p %in% installed.packages()[,'Package'])) {
+    install.packages(p)
+  }
+  else {
+    gsub('package', p, 'package already installed')
+  }
+})
+library(ggplot2)
+library(ggthemes)
+library(gridExtra)
+
+# Loading adult data
+cNames = c('Age','Workclass','FnlWgt','Education','EducationNum',
+           'MaritalStatus','Occupation','Relationship','Race','Sex',
+           'CapitalGain','CapitalLoss','HoursPerWeek','NativeCountry','Income')
+adult = read.csv('../data/adult.data', header=FALSE, col.names = cNames)
+```
+
+First test visualization.
+
+``` {r echo=TRUE, message=FALSE, warning=FALSE, testViz}
+# Kernel Density Estimate (KDE) of Age
+plot(density(adult$Age))
+```
\ No newline at end of file
diff --git a/doc/EDA.html b/doc/EDA.html
new file mode 100644
index 0000000..ac81255
--- /dev/null
+++ b/doc/EDA.html
@@ -0,0 +1,182 @@
[182 lines of knitr-generated HTML omitted: the rendered "Adult Data Exploratory Visualization" page, containing the package-loading echo ("ggplot2 already installed", etc.) and the Age kernel-density plot from doc/EDA.Rmd.]

From d241a47307a5dcccb5a18c8de6bd3ea45ea7d65b Mon Sep 17 00:00:00 2001
From: Nash Taylor
Date: Tue, 23 Aug 2016 16:09:44 -0700
Subject: [PATCH 7/7] data overview + first plot completed

---
 doc/EDA.Rmd  | 73 ++++++++++++++++++++++++++++++++++++++++---
 doc/EDA.html | 89 ++++++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 152 insertions(+), 10 deletions(-)

diff --git a/doc/EDA.Rmd b/doc/EDA.Rmd
index eacc9d5..a340696 100644
--- a/doc/EDA.Rmd
+++ b/doc/EDA.Rmd
@@ -4,6 +4,8 @@ output: html_document
 Adult Data Exploratory Visualization
 =======================================================
 
+# Data Prep
+
 Loading necessary packages and Adult Data.
 
 ``` {r echo=TRUE, message=FALSE, warning=FALSE, packages}
@@ -25,12 +27,73 @@ library(gridExtra)
 cNames = c('Age','Workclass','FnlWgt','Education','EducationNum',
            'MaritalStatus','Occupation','Relationship','Race','Sex',
            'CapitalGain','CapitalLoss','HoursPerWeek','NativeCountry','Income')
-adult = read.csv('../data/adult.data', header=FALSE, col.names = cNames)
+adult = read.csv('../data/adult.data', header=FALSE, col.names = cNames, strip.white = TRUE)
+```
+
+# Data Overview
+```{r echo=TRUE, message=FALSE, warning=FALSE, overview}
+# Dimensions of data
+m = nrow(adult)
+n = ncol(adult)
+gsub('ncol',n,gsub('nrow',m,'The dataset has nrow rows and ncol columns'))
+
+# Feature names
+colnames(adult)
+
+# Unique classes
+unique(adult$Income)
+
+# Summary of features
+summary(adult[, names(adult)!='Income'])
+
+# Class balance
+table(adult$Income) / m
+
+# Racial balance
+table(adult$Race) / m
+
+# Gender balance
+table(adult$Sex) / m
+
+```
+
+There are 14 features to work with, and the label "Income" takes only two values, so this is a binary classification task. The classes are moderately unbalanced, at 76%-24%. Racially, the dataset is heavily dominated by Caucasians, and it is made up of about two-thirds males.
+
+One of the most interesting features, particularly when attempting to model Income, ought to be Education.
+
+```{r echo=FALSE, message=FALSE, warning=FALSE, Plot_One}
+# Set the ggPlot theme to something not gross
+theme_set(theme_minimal(12))
+
+# Factor levels aren't set properly; manually set
+adult$Education = sapply(adult$Education, as.character)
+levs = c('Preschool','1st-4th','5th-6th','7th-8th','9th','10th','11th','12th',
+         'HS-grad','Some-college','Assoc-acdm','Assoc-voc','Bachelors','Masters','Prof-school','Doctorate')
+adult$Education = factor(adult$Education, levels = levs)
+
+# Education histogram
+ggplot(adult, aes(Education)) +
+  geom_bar(fill="#236B8E") +
+  geom_vline(aes(xintercept=mean(EducationNum)),
+             color="black", linetype="dashed", size=0.75) +
+  ylab("Count") +
+  ggtitle("Education Histogram w/ Mean") +
+  theme(axis.text.x = element_text(angle=90, hjust=1),
+        axis.title.x=element_blank())
+
+# Education split by gender
+# Normalize to discount gender imbalance
+edByGender = aggregate(1:nrow(adult), by=list(adult$Sex, adult$Education), length)
+names(edByGender) = c('Gender','Education','Num')
+
+edByGender$EdPctOfGender = with(edByGender, ifelse(Gender=='Male',
+                                                   Num / sum(Num[Gender=='Male']),
+                                                   Num / sum(Num[Gender=='Female'])))
+edByGender$GenderPctOfEd = mapply(function(ed,num) {
+  return(num / sum(edByGender[edByGender$Education==ed,'EdPctOfGender']))
+  }, edByGender$Education, edByGender$EdPctOfGender)
+
+# TODO: make cumulative line plot of Num by Gender
+
-```
-
-First test visualization.
-
-``` {r echo=TRUE, message=FALSE, warning=FALSE, testViz}
-# Kernel Density Estimate (KDE) of Age
-plot(density(adult$Age))
 ```
\ No newline at end of file
diff --git a/doc/EDA.html b/doc/EDA.html
index ac81255..4c44c57 100644
--- a/doc/EDA.html
+++ b/doc/EDA.html
@@ -120,6 +120,9 @@
[89 changed lines of regenerated knitr HTML omitted: the rendered page picks up the new "Data Prep" and "Data Overview" sections. The echoed R output reports 32561 rows and 15 columns; an income split of 75.9% <=50K / 24.1% >50K; a racial mix of roughly 85.4% White; a 66.9% Male / 33.1% Female split; the per-feature summary() table; and the Education histogram.]