# small dataset 46 & large dataset 78
import pandas as pd
import numpy as np
import copy
import time

# Assignment datasets: first column is the class label, the rest are feature columns
df = pd.read_table('CS170_Small_Data__46.txt', engine='python', delimiter=' ',
                   names=['class label', '1', '2', '3', '4', '5', '6'])
large_df = pd.read_table('CS170_Large_Data__78.txt', engine='python', delimiter=' ',
                         names=['class label', '1', '2', '3', '4', '5', '6'])
test_df = pd.read_table('CS170_Small_Data__88.txt', engine='python', delimiter=' ',
                        names=['class label', '1', '2', '3', '4', '5', '6'])
# scratch data used while testing; not referenced by the search functions below
small_test_df = df.head(20)
arr = test_df.to_numpy()
def calculate_distance(a, b):
    # squared Euclidean distance between two feature vectors; squaring is
    # monotonic, so nearest-neighbor comparisons are unaffected by skipping the sqrt
    sub = a - b
    return np.dot(sub, sub)
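# Illustrative values (3-4-5 triangle), not part of the original script:
# calculate_distance(np.array([0.0, 0.0]), np.array([3.0, 4.0]))  # -> 25.0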
def k_fold_cross_validation_anti_pandas(data, current_set, feature_to_add):  # returns accuracy
    # leave-one-out validation with a 1-nearest-neighbor classifier on the given feature columns
    if len(current_set) == 0 and feature_to_add == 0:
        return 0.5  # no features at all: classification is random, so report 0.5
    features = copy.deepcopy(current_set)
    if feature_to_add != 0:
        features.append(feature_to_add)
    count_of_correctly_classified = 0
    for i in range(len(data)):
        object_to_classify = []
        for p in range(len(features)):
            object_to_classify.append(data[i][features[p]])  # feature values of held-out row i
        object_to_classify = np.array(object_to_classify)
        label_object_to_classify = data[i][0]  # class label of row i
        nearest_neighbor_distance = np.inf
        nearest_neighbor_label = 0
        for k in range(len(data)):
            if k != i:
                temp = []
                for s in range(len(features)):
                    temp.append(data[k][features[s]])  # feature values of candidate neighbor k
                temp = np.array(temp)
                distance = calculate_distance(object_to_classify, temp)
                if distance < nearest_neighbor_distance:
                    nearest_neighbor_distance = distance
                    nearest_neighbor_location = k
                    nearest_neighbor_label = data[nearest_neighbor_location][0]
        if label_object_to_classify == nearest_neighbor_label:
            count_of_correctly_classified += 1
    accuracy = count_of_correctly_classified / len(data)
    accuracy = round(accuracy, 3)
    # print('correctly classified out of 500:', count_of_correctly_classified)
    # print(accuracy)
    return accuracy
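# Illustrative sanity check on a hypothetical 4-row dataset (not one of the
# assignment files): feature column 1 separates the two classes perfectly, so
# leave-one-out 1-NN accuracy should come out as 1.0.
# toy = np.array([[1.0, 0.1], [1.0, 0.2], [2.0, 5.1], [2.0, 5.2]])
# k_fold_cross_validation_anti_pandas(toy, [1], 0)  # -> 1.0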
def feature_search(file_name):
    # greedy forward selection: start with no features and add the single best feature at each level
    # header=None: the data files have no header row (same assumption as the reads above)
    data = pd.read_table(file_name, engine='python', delimiter=' ', header=None)
    pls = data.to_numpy()  # convert once; the evaluator works on a plain numpy array
    current_set_of_features = []
    best_accuracy = 0
    best_features = 0
    # forward selection
    for x in range(len(data.columns) - 1):
        print('On level', x, 'of the search tree')
        feature_to_add_at_this_level = []
        best_so_far_accuracy = 0
        for k in range(len(data.columns) - 1):  # make sure to not count the class label column
            if k + 1 not in current_set_of_features:
                accuracy = k_fold_cross_validation_anti_pandas(pls, current_set_of_features, k + 1)
                print('---Using features', str(current_set_of_features), str(k + 1), 'the accuracy is', accuracy)
                if accuracy > best_so_far_accuracy:
                    best_so_far_accuracy = accuracy
                    feature_to_add_at_this_level = k + 1
        print('On level', str(x), 'I added feature', str(feature_to_add_at_this_level), 'to current set')
        current_set_of_features.append(feature_to_add_at_this_level)
        if best_so_far_accuracy > best_accuracy:
            best_accuracy = best_so_far_accuracy
            best_features = copy.deepcopy(current_set_of_features)
    print('\nSearch finished!')
    print('best accuracy:', best_accuracy)
    print('using these features:', best_features)
    return best_features
def backwards_elimination(file_name):
    # greedy backward elimination: start with every feature and drop the least useful one at each level
    data = pd.read_table(file_name, engine='python', delimiter=' ', header=None)  # same no-header assumption as above
    current_set_of_features = []
    best_accuracy = 0
    best_features = 0
    pls = data.to_numpy()
    for x in range(len(data.columns) - 1):
        current_set_of_features.append(x + 1)  # start from the full feature set
    for x in range(len(data.columns) - 1):
        print('On level', x, 'of the search tree')
        feature_to_remove = []
        best_so_far_accuracy = 0
        for k in range(len(data.columns) - 1):
            exists = current_set_of_features.count(k + 1)
            if exists > 0:
                elimination_test = copy.deepcopy(current_set_of_features)
                elimination_test.remove(k + 1)
                accuracy = k_fold_cross_validation_anti_pandas(pls, elimination_test, 0)
                print('---Using features', str(elimination_test), 'the accuracy is', accuracy)
                if accuracy > best_so_far_accuracy:
                    best_so_far_accuracy = accuracy
                    feature_to_remove = k + 1
        current_set_of_features.remove(feature_to_remove)
        print('On level', str(x), 'I removed feature', str(feature_to_remove), 'from current set')
        if best_so_far_accuracy > best_accuracy:
            best_accuracy = best_so_far_accuracy
            best_features = copy.deepcopy(current_set_of_features)
    print('\nSearch finished!')
    print('best accuracy:', best_accuracy)
    print('using these features:', best_features)
    return best_features
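# Example direct calls (bypassing the interactive menu); the file name below is the
# small dataset already loaded at the top of this script and is assumed to be present locally.
# feature_search('CS170_Small_Data__46.txt')
# backwards_elimination('CS170_Small_Data__46.txt')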
def menu():
    print('**Feature Selection and Nearest Neighbor Classification Algorithm**\n')
    file_name = input('Type in the name of the file to test: ')
    print('\n(1) Forward Selection\n(2) Backwards Elimination')
    algorithm = input('Type the number of the algorithm you want to run: ')
    if algorithm == '1':
        print('\nStarting Forward Selection')
        start = time.time()
        feature_search(file_name)
        end = time.time()
        elapsed = round(end - start, 1)
        print('time elapsed:', elapsed)
    elif algorithm == '2':
        print('\nStarting Backwards Elimination')
        start = time.time()
        backwards_elimination(file_name)
        end = time.time()
        elapsed = round(end - start, 1)
        print('time elapsed:', elapsed)

if __name__ == '__main__':  # only launch the interactive menu when run as a script
    menu()