-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathGenerateModel.py
110 lines (102 loc) · 4.06 KB
/
GenerateModel.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
# coding: utf-8
import os
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.externals import joblib
import get_vector
import LexicalProcessing
import ModeAnalysisRestructured
def generate_learning_data(abnormal_path, normal_path, keyword_path):
    """Build the labelled training set from two directory trees.

    Every file found under ``abnormal_path`` is turned into a feature vector
    by ``get_vector.get_vector`` and labelled 1 (string concatenation
    present); every file under ``normal_path`` is labelled 0 (no
    concatenation).  Per the original author's note, the result should be a
    two-dimensional list whose rows hold 9 elements: 8 feature values
    followed by the class label.

    Args:
        abnormal_path: Directory tree of programmes labelled 1.
        normal_path: Directory tree of programmes labelled 0.
        keyword_path: Passed through unchanged to ``get_vector.get_vector``.

    Returns:
        A list of vectors: all abnormal rows first, then all normal rows.
        A directory that is missing or empty contributes no rows.
    """
    abnormal_rows = []
    for root, _dirs, files in os.walk(abnormal_path):
        for name in files:
            # Join with ``root`` rather than the top-level directory so that
            # files inside sub-directories resolve to a real path, and let
            # os.path choose the separator instead of hard-coding '\\'.
            programme_path = os.path.join(root, name)
            vector = get_vector.get_vector(programme_path, keyword_path)
            vector.append(1)  # class label 1: concatenation present
            abnormal_rows.append(vector)

    normal_rows = []
    for root, _dirs, files in os.walk(normal_path):
        for name in files:
            programme_path = os.path.join(root, name)
            try:
                vector = get_vector.get_vector(programme_path, keyword_path)
                vector.append(0)  # class label 0: no concatenation
            except Exception:
                # Best effort, as before: a normal sample that cannot be
                # vectorised is skipped.  ``Exception`` (not a bare except)
                # lets KeyboardInterrupt / SystemExit propagate.
                continue
            normal_rows.append(vector)

    return abnormal_rows + normal_rows
def generate_classifier(raw_data):
    """Fit a linear-kernel SVC on the labelled vectors and save it to disk.

    Args:
        raw_data: Two-dimensional sequence of rows; column 8 of every row is
            read as the class label and the remaining columns as features.

    Returns:
        The fitted classifier, which is also dumped with joblib to
        'concatenation_model.pkl'.
    """
    # Load the rows into a DataFrame so columns can be addressed by index.
    frame = pd.DataFrame(np.array(raw_data))
    # Column 8 is the class (the original note says it holds 0 or 1);
    # dropping it leaves only the feature values.
    labels = frame[8]
    features = frame.drop(8, axis=1)
    model = SVC(kernel='linear')
    model.fit(features, labels)
    joblib.dump(model, 'concatenation_model.pkl')
    return model
def load_and_predict(abnormal_count, plus_equal_percentage, square_bracket_percentage, string_list):
    """Load the saved classifier and classify one feature vector.

    The classifier is read with joblib from 'concatenation_model.pkl', the
    four arguments are forwarded to ``get_vector.get_vector``, and the
    resulting vector is passed to ``predict`` as a one-element list.

    Returns:
        True when the first prediction equals 1, otherwise False.
    """
    model = joblib.load('concatenation_model.pkl')
    # NOTE(review): every other call to get_vector.get_vector in this file
    # passes two arguments (programme path, keyword path); this one passes
    # four -- confirm against get_vector's actual signature.
    features = get_vector.get_vector(abnormal_count, plus_equal_percentage, square_bracket_percentage, string_list)
    predicted = model.predict([features])
    return bool(predicted[0] == 1)
def generate_model(abnormal_path='E:\\encrypted_obfuscated_Javascript_programme_analysis\\Abnormal String Concatenation',
                   normal_path='E:\\encrypted_obfuscated_Javascript_programme_analysis\\Archive',
                   keyword_path='JavaScriptKeywords.txt'):
    """Build the training data and train/save the classifier in one step.

    The three locations used to be hard-coded inside the body; they are now
    keyword parameters whose defaults are the original values, so calling
    ``generate_model()`` with no arguments behaves exactly as before while
    other machines can supply their own paths.

    Args:
        abnormal_path: Directory of samples passed to
            ``generate_learning_data`` as the abnormal set.
        normal_path: Directory of samples passed as the normal set.
        keyword_path: Keyword file path forwarded to
            ``generate_learning_data``.
    """
    training_rows = generate_learning_data(abnormal_path, normal_path, keyword_path)
    generate_classifier(training_rows)
def hand_written_training(abnormal_path, keyword_path, model_path='ExperimentModel'):
    """Average the first six features over all abnormal samples and save them.

    Each file under ``abnormal_path`` is vectorised with
    ``get_vector.get_vector``; elements 0-5 of every vector are summed and
    divided by the number of files.  The resulting six-element list is
    written as ``repr(list)`` to ``model_path``.

    Args:
        abnormal_path: Directory tree containing the abnormal samples.
        keyword_path: Passed through unchanged to ``get_vector.get_vector``.
        model_path: Output file; defaults to the original hard-coded
            'ExperimentModel'.

    Raises:
        ZeroDivisionError: If no files are found (same exception type the
            original division raised); nothing is written in that case.
    """
    # Indices 0-5, using the names the original accumulators carried:
    # plus_equal, average_string_length, count_1_to_5, count_special_char,
    # square_bracket, num_and_char.
    feature_count = 6
    totals = [0] * feature_count
    file_count = 0
    for root, _dirs, files in os.walk(abnormal_path):
        # Accumulate across every directory os.walk visits; plain assignment
        # kept only the last directory's count and skewed the averages.
        file_count += len(files)
        for name in files:
            # Join with ``root`` so files in sub-directories resolve, and let
            # os.path choose the separator instead of hard-coding '\\'.
            programme_path = os.path.join(root, name)
            vector = get_vector.get_vector(programme_path, keyword_path)
            for i in range(feature_count):
                totals[i] += vector[i]

    if file_count == 0:
        raise ZeroDivisionError(
            'no files found under %r; cannot compute feature averages' % (abnormal_path,))

    # float() keeps true division on Python 2 and stores plain floats, so the
    # repr written below is a simple list literal.
    model = [float(total) / file_count for total in totals]
    with open(model_path, 'w') as model_file:
        model_file.write(repr(model))
def calculate_score(programme_path, keyword_path, model_path='ExperimentData'):
    """Score a programme against the stored six-feature model.

    The score starts at ``10 * vector[6] + 10 * vector[7]`` and is reduced by
    the absolute difference between the programme's vector and the model for
    features 0-5; feature 1 is weighted 1, the others 5.

    Args:
        programme_path: Programme to vectorise via ``get_vector.get_vector``.
        keyword_path: Passed through unchanged to ``get_vector.get_vector``.
        model_path: File holding the model as a Python list literal; defaults
            to the original hard-coded 'ExperimentData'.

    Returns:
        The numeric score.

    Raises:
        ValueError, SyntaxError: If the model file is not a plain literal.
    """
    # Function-scope import so the block does not depend on file-level edits.
    import ast

    # NOTE(review): hand_written_training writes its six-element model to
    # 'ExperimentModel' by default, while this function reads
    # 'ExperimentData' -- confirm which file name is intended.
    with open(model_path) as model_file:
        # literal_eval parses the stored list without executing arbitrary
        # code, unlike the eval() call it replaces.
        model = ast.literal_eval(model_file.read())

    vector = get_vector.get_vector(programme_path, keyword_path)
    score = 0
    score += 10 * vector[6]
    score += 10 * vector[7]
    for i in range(6):
        distance = abs(vector[i] - model[i])
        if i == 1:
            score -= distance
        else:
            score -= 5 * distance
    return score