-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathGet_Intra-cluster_distances
148 lines (109 loc) · 4.8 KB
/
Get_Intra-cluster_distances
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
import gzip
import math
import time

import numpy as np
# function which takes a sim_mat and reduces it to the upper triangle
# the sim_mat can be calculated in the Get_components script
def simmat2triangle(simmat_path):
    """Write the strict upper triangle of a square similarity matrix file.

    The input is a whitespace-separated text matrix, one matrix row per
    line. For a path ending in '.gz' the file is read with gzip and the
    result is written, gzip-compressed, next to it with the suffix
    '.gz' replaced by '[triangle].gz'. Output row i contains the entries
    of input row i for the columns j > i (see write_line), so the last
    output row is an empty line.

    Args:
        simmat_path: path of the similarity matrix file.

    Returns:
        The path of the written triangle file for gzipped input, or None
        for any other input (plain-text input is only validated; writing
        its triangle is not implemented).

    Raises:
        AssertionError: if the first two rows differ in length, i.e. the
            file is not in square format.
    """
    is_gzipped = simmat_path[-3:] == '.gz'
    opener = gzip.open if is_gzipped else open

    # Cheap squareness check: only the first two rows are compared.
    with opener(simmat_path, mode='r') as f:
        l_row1 = len(f.readline().split())
        l_row2 = len(f.readline().split())
    if l_row1 != l_row2:
        raise AssertionError(
            'similarity matrix is not in square format: first row has {} '
            'entries, second row has {}'.format(l_row1, l_row2))

    if not is_gzipped:
        # TODO: plain-text input is not supported yet; nothing is written.
        return None

    new_path = simmat_path[:-3] + '[triangle].gz'
    with gzip.open(simmat_path, mode='r') as f, \
            gzip.open(new_path, mode='w') as g:
        # A fresh line is read for every row; the row count is capped at
        # the number of columns found in the first row.
        for i, raw_line in enumerate(f):
            if i >= l_row1:
                break
            g.write(write_line(raw_line.decode(), i))
            if i and i % 10000 == 0:
                print('Got to the {}-th line!'.format(i - 1))
    return new_path


# takes a read line, cut off the left-most element and return the new line
# perhaps this is easier to achieve when one takes the str.index function and slices the string
def write_line(read_line, i):
    """Return row i of the triangle as an encoded, newline-terminated line.

    The first i + 1 whitespace-separated entries of read_line are dropped,
    the remaining ones are converted to float and joined with two spaces
    (the separator the previous str(list)-based formatting produced).

    Args:
        read_line: one row of the matrix as text.
        i: zero-based index of that row.

    Returns:
        The remaining entries as bytes, ending in a newline. If no entries
        remain, just the newline.
    """
    kept = read_line.split()[i + 1:]
    text = '  '.join(repr(float(token)) for token in kept)
    return (text + '\n').encode()
# get a list of matrix indices
def get_upper_triangle(integer=5, index_list=None):
    """Return all index pairs [i, j] with j > i.

    Args:
        integer: size of the matrix; the indices 0 .. integer - 1 are used
            when index_list is not given.
        index_list: optional iterable of indices to pair up instead. An
            empty iterable yields an empty result (it is no longer
            mistaken for "not given").

    Returns:
        A list of two-element lists [i, j], ordered by the position of i
        and then of j in the index sequence.
    """
    # `is None` rather than truthiness: an empty list is a valid input and
    # array-like inputs have no unambiguous truth value.
    if index_list is None:
        indices = list(range(integer))
    else:
        indices = list(index_list)
    return [[i, j] for i in indices for j in indices if j > i]
# transform the indices to a dictionary, where the key is the row and the values are all the columns to visit for each row
def get_index_dict(index_pairs):
    """Group index pairs by row.

    Args:
        index_pairs: iterable of pairs whose first element is the row index
            and whose second element is the column index.

    Returns:
        A dict mapping str(row) to the list of its column indices, in the
        order in which the pairs appear. Empty input gives an empty dict.
    """
    index_dict = dict()
    # Single pass over the pairs instead of re-scanning them for every row.
    for pair in index_pairs:
        index_dict.setdefault(str(pair[0]), []).append(pair[1])
    return index_dict
# get the intra-cluster distance for an upper triangle matrix [i<j] (as e.g. calculated by simmat2triangle)
# the component should contain the indices of the words
# the similarity matrix has to be unzipped, for better performance!
def get_intracluster_distance_triangle(component, simmat_path, triangle=True):
    """Return the mean pairwise similarity of a component and its angle.

    Every pair (i, j) with j > i of the component's indices is looked up
    in the matrix file, the similarities are averaged, and the average is
    converted to an angle via acos.

    Args:
        component: iterable of matrix indices belonging to the cluster.
        simmat_path: path of the uncompressed, whitespace-separated matrix
            file, one matrix row per line.
        triangle: True (default) if the file is a strict upper triangle,
            where row i only stores the columns i + 1, i + 2, ... so that
            column j sits at position j - i - 1. Pass False for a full
            square matrix, where column j sits at position j.

    Returns:
        A tuple (mean similarity, angle in degrees), or None if the
        component contains no pair (fewer than two distinct indices).

    Raises:
        IndexError: if a required row or column is missing from the file.
    """
    # Initialization
    start = time.time()
    # {row: columns} for all index pairs to visit for the component
    columns_by_row = _columns_by_row(component)
    # no pairs means the component is only a single element - None is returned
    if not columns_by_row:
        print('The intracluster_distance is 1: 0°. The cluster is a single element.')
        return None
    num_pairs = sum(len(columns) for columns in columns_by_row.values())
    max_row = max(columns_by_row)
    pending_rows = set(columns_by_row)

    total = 0.0
    with open(simmat_path, mode='r') as f:
        for row, line in enumerate(f):
            if row > max_row:
                break
            columns = columns_by_row.get(row)
            if columns is None:
                continue
            # float64 instead of float16: half precision keeps only about
            # three significant digits and would also be used for the sum.
            values = np.array(line.split(), dtype=np.float64)
            offset = row + 1 if triangle else 0
            positions = [column - offset for column in columns]
            total += float(values[positions].sum())
            pending_rows.discard(row)
    if pending_rows:
        raise IndexError(
            'rows {} not found in {!r}'.format(sorted(pending_rows), simmat_path))

    # divide the summed similarities by the number of pairs
    intracluster_distance = total / num_pairs
    # convert the result into degrees; clamp so that rounding just outside
    # [-1, 1] does not make acos raise
    rad = math.acos(max(-1.0, min(1.0, intracluster_distance)))
    intracluster_distance_deg = math.degrees(rad)
    end = time.time()
    print('Took me:', end - start, 'seconds.')
    # The first returned value is the mean similarity, not radians.
    print('Mean similarity/Degrees:')
    return intracluster_distance, intracluster_distance_deg


# map every component index to the larger component indices it is paired with
def _columns_by_row(component):
    members = list(component)
    columns_by_row = {}
    for i in members:
        columns = [j for j in members if j > i]
        if columns:
            columns_by_row.setdefault(i, []).extend(columns)
    return columns_by_row
####################################################################
# Script section: these statements run whenever the file is executed.
# Path of the square similarity matrix; left empty here as a placeholder
# (presumably to be filled in before running - TODO confirm).
simmat_path = ''
# get an upper-triangle of the similarity matrix for speed up
simmat2triangle(simmat_path)
# Example call on a gzipped matrix. NOTE(review): the previous comment said
# "unzip the upper triangle", but this call only runs simmat2triangle again;
# nothing in this file unzips the result.
simmat2triangle('test(full).gz')# outputs 'test(full)[triangle].gz'