-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathentro.py
85 lines (73 loc) · 3.18 KB
/
entro.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
# Imtiaz Ahmed
import pandas as pd
import math
def measure(dictionary, method):
# method is of type String, either "entropy" or "gini"
if (method != "entropy" and method != "gini"):
print("param method is of type String, either \"entropy\" or \"gini\"")
return;
# Get total number of values in dictionary
total = 0;
for d in dictionary:
total += dictionary[d]
# Calculate probability and value using specified method
sum = 0;
for d in dictionary:
# multiply by 1.0 to preserve float value
prob = (dictionary[d]*1.0/total)
# -H = sig P(i) * log2 P(i)
if (method == "entropy"):
sum += prob*math.log(prob,2)
# Gini = 1 - sig P(i)^2
elif (method == "gini"):
sum += math.pow(prob,2)
if (method == "entropy"):
return (-sum)
elif (method == "gini"):
return (1-sum)
# This function takes a dataframe, a target feature (String) and a method of
# calculation (String) as parameters and returns the descriptive feature within
# the dataframe that provides the highest information gain.
def rem(df, target, method):
select_features = {}
# Set up dictionary of target feature to find entropy/gini
target_dict = {}
for i, v in df[target].value_counts().iteritems():
target_dict[i] = v
print(method + " of target feature " + target + ": " + str(measure(target_dict, method)) + "\n")
# Calculate rem, excluding the target feature of course
for col in df.drop([target], axis=1):
# Gather all possible values in a column into a list...
uniqueValues = df[col].unique()
sum = 0
# For each possible value in a column
for u in uniqueValues:
# Partition data by each value into table; printed with print(temp)
part = df[df[col] == u]
print(col + " = " + u)
# Gather count for target feature values within the partition
part_dict = {}
for i, v in part[target].value_counts().iteritems():
part_dict[i] = v
print(part_dict)
# rem = sig P(i)/n * H(P(i)/Gini(P(i)), target)
sum += ( part[col].count()*1.0/df[col].count() ) * measure(part_dict,method)
print("rem of " + col + " using " + method + ": " + str(sum))
# IG = H/Gini - rem
print("IG of " + col + " using " + method + ": " + str(measure(target_dict, method)-sum) + "\n")
select_features[col] = measure(target_dict, method)-sum
print("Feature with highest IG using " + method + ": " + str(max(select_features)) + "\n")
return str(max(select_features))
df = pd.read_csv("dataset_a.csv", usecols=['Rain','Sprinkler','Grass'])
target = "Grass"
# Split data into subsets based on values of the first attribute with highest
# information gain.
first = rem(df, target, "entropy")
print("------------------------------");
first_unique = df[first].unique()
for f in first_unique:
print("When " + first + " = " + f)
branch = df[df[first] == f].drop([first], axis = 1)
rem(branch, target, "entropy")
print("------------------------------");
print("End program.")