Skip to content

Commit

Permalink
Added most up-to-date project directory
Browse files Browse the repository at this point in the history
  • Loading branch information
grantathon committed Jul 28, 2015
1 parent 2bb0003 commit 97639a8
Show file tree
Hide file tree
Showing 8 changed files with 863 additions and 0 deletions.
89 changes: 89 additions & 0 deletions project/HierarchicalRandomForest_keraudren.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
from randomforest import *
import numpy as np
import operator
from pprint import pprint


class HierarchicalRandomForest(object):
    """Three-level hierarchy of random forests (keraudren `randomforest` backend).

    Level 0 is trained on the raw features; each subsequent level is trained
    on the soft (per-class probability) output of the level below it.
    Column i of the (n, 3) label matrix supplies the targets for level i.
    """

    def __init__(self, n_estimators=100, n_procs=1):
        # TODO: Try different parameters
        # params = { 'max_depth' : 10,
        #            'min_sample_count' : 5,
        #            'test_count' : 100,
        #            'test_class' : getattr( weakLearner, learner)()
        #          }

        # One independent forest per hierarchy level.
        self.forests = [Forest(ntrees=n_estimators, nprocs=n_procs) for _ in range(3)]

    def train(self, training_labels, training_examples, num_classes, optimize=False):
        """Grow the three chained forests.

        training_labels: (n, 3) integer array; column i holds level-i labels.
        training_examples: (n, d) feature array.
        num_classes, optimize: currently unused; kept for interface parity
        with the sklearn variant of this class.
        """
        size = len(training_examples)

        # Train first RF hierarchy on the raw features.
        self.forests[0].grow(points=training_examples, responses=training_labels[:, 0])

        # Soft leaf distributions of level 0 become level-1 training inputs.
        # NOTE(review): assumes level 0 has exactly 2 classes — confirm.
        prediction_probs = np.ndarray((size, 2))
        for i in range(size):
            # list(...) materializes the dict values so this also works with
            # Python 3 dict views; row order follows the dict's key order.
            prediction_probs[i] = list(
                self.forests[0].predict(point=training_examples[i], soft=True).values())

        # Train second RF hierarchy.
        self.forests[1].grow(points=prediction_probs, responses=training_labels[:, 1])

        # Soft leaf distributions of level 1 become level-2 training inputs.
        # NOTE(review): assumes level 1 has exactly 4 classes — confirm.
        temp_prediction_probs = np.ndarray((size, 4))
        for i in range(size):
            temp_prediction_probs[i] = list(
                self.forests[1].predict(point=prediction_probs[i], soft=True).values())

        # Train third RF hierarchy.
        self.forests[2].grow(points=temp_prediction_probs, responses=training_labels[:, 2])

    def test(self, test_labels, test_examples, num_classes, top_n=[1, 5]):
        """Return a dict mapping 'top N accuracy' to the fraction of examples
        whose true final-level label is within the N most probable classes.

        Raises AttributeError when num_classes is smaller than any requested
        top-n value (top-n accuracy would then be trivially satisfied).
        """
        if num_classes < np.max(top_n):
            # Bug fix for Python 3: `raise Exc, "msg"` is Python-2-only syntax.
            raise AttributeError("The number of classes must be greater than or equal to all of the top-n values during testing.")

        size = len(test_labels)

        top_n_hits = {}
        for t_n in top_n:
            top_n_hits[t_n] = np.ndarray(size)

        for i in range(size):
            # Bug fix: the final forest was trained on level-1 probability
            # vectors, so chain levels 0 and 1 here instead of feeding it
            # the raw test example directly.
            p0 = list(self.forests[0].predict(point=test_examples[i], soft=True).values())
            p1 = list(self.forests[1].predict(point=np.asarray(p0), soft=True).values())
            prediction_probs = self.forests[2].predict(point=np.asarray(p1), soft=True)
            sorted_class_probs = sorted(prediction_probs.items(),
                                        key=operator.itemgetter(1), reverse=True)

            for t_n in top_n_hits.keys():
                # Renamed from `top_n` to stop shadowing the parameter.
                top_classes = [c[0] for c in sorted_class_probs[0:t_n]]

                # Record prediction accuracy
                top_n_hits[t_n][i] = test_labels[i, 2] in top_classes

        # Produce statistics dictionary
        stats = {}
        for key, val in top_n_hits.items():  # .items() works on Py2 and Py3
            stats['top %d accuracy' % key] = len([1 for p in val if p == True]) / float(size)

        return stats

    def optimize(self, training_examples, training_labels):
        """Hyper-parameter optimization (not yet implemented)."""
        # Bug fix for Python 3: use call syntax when raising.
        raise NotImplementedError("The HierarchicalForest.optimize() function has yet to be implemented.")

    def predict(self, examples):
        """Return the final-level soft prediction (class -> probability dict)
        for each example, chaining all three forests.

        Bug fix: this previously referenced `self.classifiers` — an attribute
        this class never defines (copy-paste from the sklearn variant) — and
        returned nothing.
        """
        predictions = []
        for example in examples:
            p0 = list(self.forests[0].predict(point=example, soft=True).values())
            p1 = list(self.forests[1].predict(point=np.asarray(p0), soft=True).values())
            predictions.append(self.forests[2].predict(point=np.asarray(p1), soft=True))
        return predictions
113 changes: 113 additions & 0 deletions project/HierarchicalRandomForest_sklearn.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
from sklearn import tree, ensemble
from randomforest import *
import numpy as np
from pprint import pprint


class HierarchicalRandomForest(object):
    """Three chained sklearn random forests, one per label-hierarchy level.

    Level 0 sees the raw features; each later level sees the previous
    level's class probabilities concatenated with the raw features.
    Column i of the (n, 3) label matrix supplies the level-i targets.
    """

    def __init__(self, n_estimators=100, n_procs=1):
        # One independent forest per hierarchy level.
        self.classifiers = [
            ensemble.RandomForestClassifier(n_estimators=n_estimators, n_jobs=n_procs)
            for _ in range(3)
        ]

    def train(self, training_labels, training_examples, optimize=False):
        """Fit the three chained forests on (n, 3) labels / (n, d) features."""
        if optimize:
            # Bug fix: this previously called the free name `optimize` — the
            # boolean flag itself — which raised a TypeError at runtime.
            self.optimize(training_examples, training_labels)

        # Level 0: raw features -> coarse labels.
        self.classifiers[0].fit(training_examples, training_labels[:, 0])
        probs = self.classifiers[0].predict_proba(training_examples)
        # Levels 1 and 2 train on [previous-level probabilities | raw features].
        hierarchy_inputs = np.concatenate([probs, training_examples], axis=1)
        self.classifiers[1].fit(hierarchy_inputs, training_labels[:, 1])
        probs = self.classifiers[1].predict_proba(hierarchy_inputs)
        hierarchy_inputs = np.concatenate([probs, training_examples], axis=1)
        self.classifiers[2].fit(hierarchy_inputs, training_labels[:, 2])

    def test(self, test_labels, test_examples, num_classes, top_n=[1, 5]):
        """Return a dict mapping 'top N accuracy' to the fraction of examples
        whose true final-level label is within the N most probable classes.

        Raises AttributeError when num_classes is smaller than any requested
        top-n value (top-n accuracy would then be trivially satisfied).
        """
        if num_classes < np.max(top_n):
            # Bug fix for Python 3: `raise Exc, "msg"` is Python-2-only syntax.
            raise AttributeError("The number of classes must be greater than or equal to all of the top-n values during testing.")

        size = len(test_labels)

        # Chain the three forests exactly as in train().
        probs = self.classifiers[0].predict_proba(test_examples)
        hierarchy_inputs = np.concatenate([probs, test_examples], axis=1)
        probs = self.classifiers[1].predict_proba(hierarchy_inputs)
        hierarchy_inputs = np.concatenate([probs, test_examples], axis=1)
        prediction_probs = self.classifiers[2].predict_proba(hierarchy_inputs)

        # predict_proba columns are ordered by the fitted classes_ attribute.
        classes = self.classifiers[2].classes_
        sorted_class_probs = []
        for i in range(size):
            class_probs = [(classes[j], prediction_probs[i][j]) for j in range(num_classes)]
            sorted_class_probs.append(sorted(class_probs, key=lambda tup: tup[1], reverse=True))

        top_n_hits = {}
        for t_n in top_n:
            top_n_hits[t_n] = np.ndarray(size)

        for i in range(size):
            for t_n in top_n_hits.keys():
                # Renamed from `top_n` to stop shadowing the parameter.
                top_classes = [tup[0] for tup in sorted_class_probs[i][0:t_n]]

                # Record prediction accuracy
                top_n_hits[t_n][i] = test_labels[i, 2] in top_classes

        # Produce statistics dictionary
        stats = {}
        for key, val in top_n_hits.items():  # .items() works on Py2 and Py3
            stats['top %d accuracy' % key] = len([1 for p in val if p == True]) / float(size)

        return stats

    def optimize(self, training_examples, training_labels):
        """Hyper-parameter optimization (not yet implemented)."""
        # Bug fix for Python 3: use call syntax when raising.
        raise NotImplementedError("The optimize() function has yet to be implemented.")

    def predict(self, examples):
        """Return final-level class probabilities for `examples`, chaining the
        forests with the same [probs | features] inputs used in train()/test().

        Bug fixes: the chain previously omitted the feature concatenation, and
        the last line assigned the classifier object itself without returning.
        """
        probs = self.classifiers[0].predict_proba(examples)
        hierarchy_inputs = np.concatenate([probs, examples], axis=1)
        probs = self.classifiers[1].predict_proba(hierarchy_inputs)
        hierarchy_inputs = np.concatenate([probs, examples], axis=1)
        return self.classifiers[2].predict_proba(hierarchy_inputs)
Loading

0 comments on commit 97639a8

Please sign in to comment.