-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathNo_w2kp_PRGraph.py
104 lines (89 loc) · 3.36 KB
/
No_w2kp_PRGraph.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# No word2keypress version
import itertools
import sys
import csv
import time
import compress_fasttext
import numpy as np
from matplotlib import pyplot as plt
from collections import OrderedDict
from nltk import edit_distance
# Lowercase letter -> digit substitutions used by leet_code().
# a->4, b->8, e->3, l->1, o->0, s->5, t->7; every other character is kept.
_LEET_TABLE = str.maketrans('abelost', '4831057')


def leet_code(string: str):
    """Return *string* with a fixed set of leetspeak substitutions applied.

    The lowercase letters ``a, b, e, l, o, s, t`` are replaced by
    ``4, 8, 3, 1, 0, 5, 7`` respectively. Uppercase letters and all other
    characters are left unchanged.

    Args:
        string: The text to convert.

    Returns:
        A new string with every mapped character substituted.
    """
    # One pass over the string instead of one full-string replace() per
    # matching character; the result is the same because none of the
    # replacement digits is itself a key of the table.
    return string.translate(_LEET_TABLE)
# HEURISTICS
def heuristics(pwd1: str, pwd2: str):
    """Label a password pair as similar (True) or not (False).

    The pair is labelled similar as soon as one of the following holds,
    checked in this order:

    1. both passwords are equal after ``leet_code`` substitution;
    2. both passwords are equal when lowercased;
    3. their ``edit_distance`` (called with ``transpositions=True``) is
       below 5.

    Args:
        pwd1: First password.
        pwd2: Second password.

    Returns:
        True if any rule matches, otherwise False.
    """
    # `or` short-circuits, so the edit distance is only computed when the
    # two cheaper equality checks have both failed.
    return (
        leet_code(pwd1) == leet_code(pwd2)
        or pwd1.lower() == pwd2.lower()
        or edit_distance(pwd1, pwd2, transpositions=True) < 5
    )
def _safe_ratio(numerator: int, denominator: int) -> float:
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float('nan')


def main():
    """Plot precision and recall of the model's similarity score per threshold.

    Command-line arguments (exactly two are required, otherwise the script
    exits with a usage message):

    1. ``<filename>``: a ``:``-delimited file with two fields per row; the
       second field is parsed as a Python list literal of passwords.
    2. ``<compressed_model>``: path handed to
       ``CompressedFastTextKeyedVectors.load``.

    For every row, each unordered pair of passwords is scored with the
    model's ``similarity`` and labelled with ``heuristics``. For each
    threshold in 0.0, 0.1, ..., 1.0 the pair counts as a predicted positive
    when its score is strictly greater than the threshold; TP/FP/FN are
    accumulated per threshold, and precision and recall are then plotted
    against the threshold.
    """
    # Local import: only this function needs it.
    import ast

    # Usage:
    # python3 PRGraph.py <filename> <compressed_model>
    if len(sys.argv) != 3:
        sys.exit("Usage: python3 PRGraph.py <filename> <compressed_model>")
    filename = sys.argv[1]
    compressed_model_file = sys.argv[2]
    small_model = compress_fasttext.models.CompressedFastTextKeyedVectors.load(compressed_model_file)

    # Per-threshold confusion counters (true negatives are not needed for
    # precision or recall, so they are not tracked).
    pos_neg_count = OrderedDict()
    prec_dict = OrderedDict()
    rec_dict = OrderedDict()
    for th in np.arange(0.0, 1.1, 0.1):
        pos_neg_count[th] = {'TP': 0, 'FP': 0, 'FN': 0}

    # newline='' is what the csv module documentation asks for when a file
    # object is handed to csv.reader.
    with open(filename, newline='') as file:
        csv_reader = csv.reader(file, delimiter=':')
        start_time = time.time()
        for i, (_user, pass_keyseq_list) in enumerate(csv_reader):
            # Progress report: time spent on the last batch of 10000 rows.
            if i % 10000 == 0:
                end_time = time.time()
                print("Processed {} lines in {} seconds.".format(i, end_time - start_time))
                start_time = end_time
            # NOTE(review): this used to be eval(), which executes arbitrary
            # code found in the input file. literal_eval only accepts Python
            # literals; this assumes the column holds a plain list literal of
            # strings -- confirm against the dataset.
            user_pass_list = ast.literal_eval(pass_keyseq_list)
            for pwd1, pwd2 in itertools.combinations(user_pass_list, 2):
                # Find similarity percentage using the model
                sim_score = small_model.similarity(pwd1, pwd2)
                ground_truth = heuristics(pwd1, pwd2)
                for th, pos_neg_count_th in pos_neg_count.items():
                    bin_sim_score = sim_score > th
                    if bin_sim_score and ground_truth:
                        pos_neg_count_th['TP'] += 1
                    elif bin_sim_score and not ground_truth:
                        pos_neg_count_th['FP'] += 1
                    elif not bin_sim_score and ground_truth:
                        pos_neg_count_th['FN'] += 1

    for th, pos_neg_count_th in pos_neg_count.items():
        tp = pos_neg_count_th['TP']
        # A threshold with no predicted positives (TP + FP == 0) or no actual
        # positives (TP + FN == 0) has an undefined ratio; store NaN instead
        # of crashing with ZeroDivisionError.
        prec_dict[th] = _safe_ratio(tp, tp + pos_neg_count_th['FP'])
        rec_dict[th] = _safe_ratio(tp, tp + pos_neg_count_th['FN'])

    x, y_p = zip(*prec_dict.items())
    _, y_r = zip(*rec_dict.items())
    plt.plot(x, y_p, label='Precision')
    plt.plot(x, y_r, label='Recall')
    plt.legend()
    plt.grid()
    plt.xticks(np.arange(0.0, 1.1, 0.1))
    plt.yticks(np.arange(0.0, 1.05, 0.05))
    plt.show()
# Run main() only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()