-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathw2kp_PRGraph.py
109 lines (93 loc) · 3.61 KB
/
w2kp_PRGraph.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
# Word2Keypress version
import itertools
import sys
import csv
import time
import gensim
import numpy as np
from matplotlib import pyplot as plt
from collections import OrderedDict
from nltk import edit_distance
from word2keypress import Keyboard
# Module-level word2keypress Keyboard instance; main() uses it to convert
# passwords into key-press sequences before querying the embedding model.
kb = Keyboard()
# Lowercase letter -> leet digit substitutions, compiled once for str.translate.
_LEET_TABLE = str.maketrans({
    'a': '4',
    'b': '8',
    'e': '3',
    'l': '1',
    'o': '0',
    's': '5',
    't': '7',
})


def leet_code(string: str) -> str:
    """Return *string* with selected lowercase letters replaced by leet digits.

    The substitutions are a->4, b->8, e->3, l->1, o->0, s->5 and t->7.
    Uppercase letters and every other character are left untouched.

    Args:
        string: The text to convert.

    Returns:
        A new string of the same length with the substitutions applied.
    """
    # One pass over the input. Substituted characters are digits, none of
    # which is itself a key of the table, so this is equivalent to replacing
    # each letter in turn but without rescanning the string per character.
    return string.translate(_LEET_TABLE)
# HEURISTICS
def heuristics(pwd1: str, pwd2: str, max_distance: int = 5) -> bool:
    """Decide whether two passwords count as similar (the ground-truth label).

    A pair is labelled similar when at least one of the following holds:

    1. both passwords are identical after ``leet_code`` substitution;
    2. both passwords are identical after lowercasing;
    3. their edit distance (``nltk.edit_distance`` with transpositions
       enabled) is strictly less than *max_distance*.

    Args:
        pwd1: First password.
        pwd2: Second password.
        max_distance: Exclusive upper bound on the edit distance for the
            pair to be considered similar. Defaults to 5.

    Returns:
        True if any of the rules matches, False otherwise.
    """
    # NOTE(review): an edit distance can never exceed the length of the longer
    # string, so with the default bound any two passwords that are both
    # shorter than 5 characters are always labelled similar -- confirm that
    # this is intended.
    #
    # The cheap equality checks come first; the edit distance is only
    # computed when both of them fail (short-circuit evaluation).
    return (
        leet_code(pwd1) == leet_code(pwd2)
        or pwd1.lower() == pwd2.lower()
        or edit_distance(pwd1, pwd2, transpositions=True) < max_distance
    )
def _safe_ratio(numerator: int, denominator: int) -> float:
    """Return numerator / denominator, or NaN when the denominator is zero."""
    if denominator == 0:
        return float('nan')
    return numerator / denominator


def main():
    """Plot precision and recall of the model's similarity score per threshold.

    Command line: ``<filename> <big_model>``.

    * ``<filename>`` is read as ':'-delimited rows of exactly two fields; the
      second field is evaluated as a Python expression and iterated as the
      collection of passwords belonging to that row.
    * ``<big_model>`` is loaded with
      ``gensim.models.fasttext.load_facebook_vectors``.

    For every unordered pair of passwords in a row, the ground truth comes
    from ``heuristics`` and the prediction from the model's similarity of the
    two key-press sequences. True positives, false positives and false
    negatives are counted for each threshold in 0.0, 0.1, ..., 1.0, and the
    resulting precision/recall curves are saved to ``big_model.png``.

    Exits through ``sys.exit`` with a usage message when the number of
    command-line arguments is wrong.
    """
    # Usage:
    # python3 PRGraph.py <filename> <big_model>
    if len(sys.argv) != 3:
        sys.exit("Usage: python3 PRGraph.py <filename> <big_model>")
    filename = sys.argv[1]
    big_model_file = sys.argv[2]
    big_model = gensim.models.fasttext.load_facebook_vectors(big_model_file)

    # One TP/FP/FN counter per decision threshold.
    pos_neg_count = OrderedDict(
        (th, {'TP': 0, 'FP': 0, 'FN': 0}) for th in np.arange(0.0, 1.1, 0.1)
    )
    prec_dict = OrderedDict()
    rec_dict = OrderedDict()

    # newline='' is the form recommended by the csv module documentation.
    with open(filename, newline='') as file:
        csv_reader = csv.reader(file, delimiter=':')
        start_time = time.time()
        for i, (_user, pass_keyseq_list) in enumerate(csv_reader):
            # Progress report every 10000 rows, timing each batch separately.
            if i % 10000 == 0:
                end_time = time.time()
                print("Processed {} lines in {} seconds.".format(i, end_time - start_time))
                start_time = end_time

            # SECURITY: eval() executes arbitrary code contained in the input
            # file. Only run this on trusted data; if the field is always a
            # plain literal, ast.literal_eval is the safe replacement.
            user_pass_list = eval(pass_keyseq_list)

            for pwd1, pwd2 in itertools.combinations(user_pass_list, 2):
                ground_truth = heuristics(pwd1, pwd2)

                # Find similarity percentage using the model.
                # NB Passwords must be converted in key-presses, because the
                # model was trained with word2keypress dataset.
                pwd1_kp = kb.print_keyseq(kb.word_to_keyseq(pwd1))
                pwd2_kp = kb.print_keyseq(kb.word_to_keyseq(pwd2))
                sim_score = big_model.similarity(pwd1_kp, pwd2_kp)

                for th, pos_neg_count_th in pos_neg_count.items():
                    predicted = sim_score > th
                    if predicted and ground_truth:
                        pos_neg_count_th['TP'] += 1
                    elif predicted:
                        pos_neg_count_th['FP'] += 1
                    elif ground_truth:
                        pos_neg_count_th['FN'] += 1
                    # True negatives are not needed for precision/recall.

    for th, pos_neg_count_th in pos_neg_count.items():
        tp = pos_neg_count_th['TP']
        # A threshold with no positive predictions (typical for the highest
        # one) or a dataset with no similar pairs makes the metric undefined.
        # NaN is stored instead of raising ZeroDivisionError, so the plot
        # shows a gap at that threshold rather than a made-up value.
        prec_dict[th] = _safe_ratio(tp, tp + pos_neg_count_th['FP'])
        rec_dict[th] = _safe_ratio(tp, tp + pos_neg_count_th['FN'])

    x, y_p = zip(*prec_dict.items())
    _, y_r = zip(*rec_dict.items())
    plt.plot(x, y_p, label='Precision')
    plt.plot(x, y_r, label='Recall')
    plt.legend()
    plt.grid()
    plt.xticks(np.arange(0.0, 1.1, 0.1))
    plt.yticks(np.arange(0.0, 1.05, 0.05))
    plt.savefig("big_model.png")
# Run the evaluation only when executed as a script, not when imported.
if __name__ == "__main__":
    main()