Skip to content

Commit

Permalink
Added most up-to-date project directory
Browse files Browse the repository at this point in the history
  • Loading branch information
grantathon committed Jul 28, 2015
1 parent 2bb0003 commit 97639a8
Show file tree
Hide file tree
Showing 8 changed files with 863 additions and 0 deletions.
89 changes: 89 additions & 0 deletions project/HierarchicalRandomForest_keraudren.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
from randomforest import *
import numpy as np
import operator
from pprint import pprint


class HierarchicalRandomForest(object):
    """Three-level hierarchy of random forests (keraudren `randomforest` backend).

    Level 0 is trained on the raw features; each subsequent level is trained
    on the soft (per-class probability) output of the level below it.
    Column i of the (n, 3) label matrix supplies the targets for level i.
    """

    def __init__(self, n_estimators=100, n_procs=1):
        # TODO: Try different parameters
        # params = { 'max_depth' : 10,
        #            'min_sample_count' : 5,
        #            'test_count' : 100,
        #            'test_class' : getattr( weakLearner, learner)()
        #          }

        # One independent forest per hierarchy level.
        self.forests = [Forest(ntrees=n_estimators, nprocs=n_procs) for _ in range(3)]

    def train(self, training_labels, training_examples, num_classes, optimize=False):
        """Grow the three chained forests.

        training_labels: (n, 3) integer array; column i holds level-i labels.
        training_examples: (n, d) feature array.
        num_classes, optimize: currently unused; kept for interface parity
        with the sklearn variant of this class.
        """
        size = len(training_examples)

        # Train first RF hierarchy on the raw features.
        self.forests[0].grow(points=training_examples, responses=training_labels[:, 0])

        # Soft leaf distributions of level 0 become level-1 training inputs.
        # NOTE(review): assumes level 0 has exactly 2 classes — confirm.
        prediction_probs = np.ndarray((size, 2))
        for i in range(size):
            # list(...) materializes the dict values so this also works with
            # Python 3 dict views; row order follows the dict's key order.
            prediction_probs[i] = list(
                self.forests[0].predict(point=training_examples[i], soft=True).values())

        # Train second RF hierarchy.
        self.forests[1].grow(points=prediction_probs, responses=training_labels[:, 1])

        # Soft leaf distributions of level 1 become level-2 training inputs.
        # NOTE(review): assumes level 1 has exactly 4 classes — confirm.
        temp_prediction_probs = np.ndarray((size, 4))
        for i in range(size):
            temp_prediction_probs[i] = list(
                self.forests[1].predict(point=prediction_probs[i], soft=True).values())

        # Train third RF hierarchy.
        self.forests[2].grow(points=temp_prediction_probs, responses=training_labels[:, 2])

    def test(self, test_labels, test_examples, num_classes, top_n=[1, 5]):
        """Return a dict mapping 'top N accuracy' to the fraction of examples
        whose true final-level label is within the N most probable classes.

        Raises AttributeError when num_classes is smaller than any requested
        top-n value (top-n accuracy would then be trivially satisfied).
        """
        if num_classes < np.max(top_n):
            # Bug fix for Python 3: `raise Exc, "msg"` is Python-2-only syntax.
            raise AttributeError("The number of classes must be greater than or equal to all of the top-n values during testing.")

        size = len(test_labels)

        top_n_hits = {}
        for t_n in top_n:
            top_n_hits[t_n] = np.ndarray(size)

        for i in range(size):
            # Bug fix: the final forest was trained on level-1 probability
            # vectors, so chain levels 0 and 1 here instead of feeding it
            # the raw test example directly.
            p0 = list(self.forests[0].predict(point=test_examples[i], soft=True).values())
            p1 = list(self.forests[1].predict(point=np.asarray(p0), soft=True).values())
            prediction_probs = self.forests[2].predict(point=np.asarray(p1), soft=True)
            sorted_class_probs = sorted(prediction_probs.items(),
                                        key=operator.itemgetter(1), reverse=True)

            for t_n in top_n_hits.keys():
                # Renamed from `top_n` to stop shadowing the parameter.
                top_classes = [c[0] for c in sorted_class_probs[0:t_n]]

                # Record prediction accuracy
                top_n_hits[t_n][i] = test_labels[i, 2] in top_classes

        # Produce statistics dictionary
        stats = {}
        for key, val in top_n_hits.items():  # .items() works on Py2 and Py3
            stats['top %d accuracy' % key] = len([1 for p in val if p == True]) / float(size)

        return stats

    def optimize(self, training_examples, training_labels):
        """Hyper-parameter optimization (not yet implemented)."""
        # Bug fix for Python 3: use call syntax when raising.
        raise NotImplementedError("The HierarchicalForest.optimize() function has yet to be implemented.")

    def predict(self, examples):
        """Return the final-level soft prediction (class -> probability dict)
        for each example, chaining all three forests.

        Bug fix: this previously referenced `self.classifiers` — an attribute
        this class never defines (copy-paste from the sklearn variant) — and
        returned nothing.
        """
        predictions = []
        for example in examples:
            p0 = list(self.forests[0].predict(point=example, soft=True).values())
            p1 = list(self.forests[1].predict(point=np.asarray(p0), soft=True).values())
            predictions.append(self.forests[2].predict(point=np.asarray(p1), soft=True))
        return predictions
113 changes: 113 additions & 0 deletions project/HierarchicalRandomForest_sklearn.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
from sklearn import tree, ensemble
from randomforest import *
import numpy as np
from pprint import pprint


class HierarchicalRandomForest(object):
    """Three chained sklearn random forests, one per label-hierarchy level.

    Level 0 sees the raw features; each later level sees the previous
    level's class probabilities concatenated with the raw features.
    Column i of the (n, 3) label matrix supplies the level-i targets.
    """

    def __init__(self, n_estimators=100, n_procs=1):
        # One independent forest per hierarchy level.
        self.classifiers = [
            ensemble.RandomForestClassifier(n_estimators=n_estimators, n_jobs=n_procs)
            for _ in range(3)
        ]

    def train(self, training_labels, training_examples, optimize=False):
        """Fit the three chained forests on (n, 3) labels / (n, d) features."""
        if optimize:
            # Bug fix: this previously called the free name `optimize` — the
            # boolean flag itself — which raised a TypeError at runtime.
            self.optimize(training_examples, training_labels)

        # Level 0: raw features -> coarse labels.
        self.classifiers[0].fit(training_examples, training_labels[:, 0])
        probs = self.classifiers[0].predict_proba(training_examples)
        # Levels 1 and 2 train on [previous-level probabilities | raw features].
        hierarchy_inputs = np.concatenate([probs, training_examples], axis=1)
        self.classifiers[1].fit(hierarchy_inputs, training_labels[:, 1])
        probs = self.classifiers[1].predict_proba(hierarchy_inputs)
        hierarchy_inputs = np.concatenate([probs, training_examples], axis=1)
        self.classifiers[2].fit(hierarchy_inputs, training_labels[:, 2])

    def test(self, test_labels, test_examples, num_classes, top_n=[1, 5]):
        """Return a dict mapping 'top N accuracy' to the fraction of examples
        whose true final-level label is within the N most probable classes.

        Raises AttributeError when num_classes is smaller than any requested
        top-n value (top-n accuracy would then be trivially satisfied).
        """
        if num_classes < np.max(top_n):
            # Bug fix for Python 3: `raise Exc, "msg"` is Python-2-only syntax.
            raise AttributeError("The number of classes must be greater than or equal to all of the top-n values during testing.")

        size = len(test_labels)

        # Chain the three forests exactly as in train().
        probs = self.classifiers[0].predict_proba(test_examples)
        hierarchy_inputs = np.concatenate([probs, test_examples], axis=1)
        probs = self.classifiers[1].predict_proba(hierarchy_inputs)
        hierarchy_inputs = np.concatenate([probs, test_examples], axis=1)
        prediction_probs = self.classifiers[2].predict_proba(hierarchy_inputs)

        # predict_proba columns are ordered by the fitted classes_ attribute.
        classes = self.classifiers[2].classes_
        sorted_class_probs = []
        for i in range(size):
            class_probs = [(classes[j], prediction_probs[i][j]) for j in range(num_classes)]
            sorted_class_probs.append(sorted(class_probs, key=lambda tup: tup[1], reverse=True))

        top_n_hits = {}
        for t_n in top_n:
            top_n_hits[t_n] = np.ndarray(size)

        for i in range(size):
            for t_n in top_n_hits.keys():
                # Renamed from `top_n` to stop shadowing the parameter.
                top_classes = [tup[0] for tup in sorted_class_probs[i][0:t_n]]

                # Record prediction accuracy
                top_n_hits[t_n][i] = test_labels[i, 2] in top_classes

        # Produce statistics dictionary
        stats = {}
        for key, val in top_n_hits.items():  # .items() works on Py2 and Py3
            stats['top %d accuracy' % key] = len([1 for p in val if p == True]) / float(size)

        return stats

    def optimize(self, training_examples, training_labels):
        """Hyper-parameter optimization (not yet implemented)."""
        # Bug fix for Python 3: use call syntax when raising.
        raise NotImplementedError("The optimize() function has yet to be implemented.")

    def predict(self, examples):
        """Return final-level class probabilities for `examples`, chaining the
        forests with the same [probs | features] inputs used in train()/test().

        Bug fixes: the chain previously omitted the feature concatenation, and
        the last line assigned the classifier object itself without returning.
        """
        probs = self.classifiers[0].predict_proba(examples)
        hierarchy_inputs = np.concatenate([probs, examples], axis=1)
        probs = self.classifiers[1].predict_proba(hierarchy_inputs)
        hierarchy_inputs = np.concatenate([probs, examples], axis=1)
        return self.classifiers[2].predict_proba(hierarchy_inputs)
Loading

0 comments on commit 97639a8

Please sign in to comment.