-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathget_vector.py
142 lines (127 loc) · 6.18 KB
/
get_vector.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
# -*- coding: utf -8 -*-
import os
import statistics
def str_with_num_and_char(string_array):
# Returns the share of strings containing numbers or special char in all strings
# 返回含数字或特殊字符的字符串占所有字符串的比例
# 特殊字符串包括 '_', '#', '=', '-', '+', '<', '>', '(', ')', '.', ','
count = 0
for string in string_array:
if contains_num_or_special_char(string):
count += 1
return 0 if len(string_array) == 0 else count / len(string_array)
def contains_num_or_special_char(string):
# Returns a boolean indicating whether the string contains a number or special char
# 返回string是否含有数字或特殊字符
num_char_list = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '_', '#', '=', '-', '+', '<', '>', '(', ')', '.', ',']
for char in string:
if char in num_char_list:
return True
return False
def substrings(target_strings):
# Given a list of strings, return a list of substrings
# 输入一个字符串列表,返回子串列表
substrings = []
for target_string in target_strings:
for i in range(len(target_string)):
for j in range(i + 1, len(target_string) + 1):
substrings.append(target_string[i:j])
return substrings
# All target strings are in lower case
# 目标字符串列表中的字符串全部已转为小写
target_strings = ['wscript.shell', 'readystate', 'regwrite', '%temp%/', 'aplet.dll', 'quit', '.dll', '/elevate', 'responsebody',
'type', 'get', 'savetofile', 'folderexists', 'regsvr32 /s ', 'position', 'send', 'open', 'exists', 'msxml2.xmlhttp', 'arguments',
'div', 'scriptfullname', 'createtextfile', 'finish.scr', 'runas', 'floor', '.exe', 'random', '%username%', '%appdata%', 'sleep',
'fileexists', 'adodb.stream', 'write', 'c:\\users\\', 'run', 'close', 'fromcharcode', 'eval', 'createobject', 'expandenvironmentstrings',
'elevate', 'activexobject', 'split', 'scripting.filesystemobject', 'string.fromcharcode()', 'wscript', '%temp%', 'shellexecute',
'shell.application', 'fullname', 'reg_dword', 'named', 'c:\\programdata\\trava', 'c:\\windows\\syswow64\\', 'c:\\windows\\system32\\',
'c:\\program files (x86)\\', '"c:\\program files\\gbplugin"']
substrings_list = substrings(target_strings)
# count_hit returns the number of hits of substrings. Single letter hits and whole word hits are not counted
# count_hit返回strings中有多少个子串命中。忽略单字母命中,忽略全词命中。
def count_hit(strings):
count = 0
for string in strings:
if (string.lower() in substrings_list) and len(string) > 1 and string not in target_strings:
count += 1
if (len(strings) < 10):
count = 0
return count
#---------------------Main Function---------------------#
#-------------------------主函数-------------------------#
def get_vector(abnormal_count, plus_equal_percentage, square_bracket_percentage, string_list):
# 构建特征向量
feature_vector = []
# f0: +和+=的数量 / 运算符数量 ***
feature_vector.append(plus_equal_percentage)
# f1: 字符串数量 / token数量
#feature_vector.append(string_count / token_count)
# f2: 字符串平均长度 ***
feature_vector.append(ave_string_length(string_list))
# f3: 长度为1-5的字符串数量 / 字符串总数 ***
feature_vector.append(count_1_to_5(string_list))
# f4: 特殊字符(.\/@#%:_&) / 字符串总长度
feature_vector.append(count_special_char(string_list))
# f5: charAt, fromCharCode, eval数量 / token数量
#feature_vector.append(charAt_fromCharCode_eval_percentage)
# f6: [ 数量 / token数量
feature_vector.append(square_bracket_percentage)
# f7: 含数字和特殊字符的字符串数量 / 字符串总数 ***
feature_vector.append(str_with_num_and_char(string_list))
# f8: 命中数 / 字符串总数 ***
feature_vector.append(count_hit(string_list))
# f9: 异常函数调用 / identifier总数 ***
feature_vector.append(abnormal_count)
return feature_vector
def ave_string_length(string_list):
# Returns the average string length
# 返回脚本中所有字符串的平均长度
length_list = []
for string in string_list:
length_list.append(len(string))
return 0 if len(length_list) == 0 else statistics.mean(length_list)
def count_1_to_5(string_list):
# Returns the share of strings of length 1-5 among all strings
# 返回长度为1-5的字符串占所有字符串中的比例
result_count = 0
for i in string_list:
if (len(i) >= 1 and len(i) <= 5):
result_count += 1
return 0 if len(string_list) == 0 else result_count / len(string_list)
def count_special_char(string_list):
# Returns the ratio of special characters (.,\/@#%:_&) to the total string length
# 返回特殊字符(.,\/@#%:_)占字符串总长度的比例
special_set = {'.', ',', '\\', '/', '@', '#', '%', '_', '&'}
special_count = 0
total_string_length = 0
for i in string_list:
if i in special_set:
special_count += 1
total_string_length += len(i)
return 0 if total_string_length == 0 else special_count / total_string_length
#---------------------Execution---------------------#
#vect = get_vector('/Users/chenpengyuan/Desktop/Virus/fb0c9ea4406e98e48b579d3d0a1a3550', '/Users/chenpengyuan/Desktop/JavaScriptKeywords.txt')
#print(vect)
'''
abnormal_result_array = []
l = []
for root, dirs, files in os.walk('/Users/chenpengyuan/Desktop/has_concatenation'):
l.extend(files)
for f in l:
vector = get_vector('/Users/chenpengyuan/Desktop/has_concatenation/' + f, '/Users/chenpengyuan/Desktop/JavaScriptKeywords.txt')
vector.append(1)
# 1是类别,代表有拼接
abnormal_result_array.append(vector)
# print(abnormal_result_array)
# print("---------------------------------------------")
normal_result_array = []
for i in range(1, 101):
try:
vector = get_vector('/Users/chenpengyuan/Desktop/1/File ' + str(i) + '.js', '/Users/chenpengyuan/Desktop/JavaScriptKeywords.txt')
vector.append(0)
# 0是类别,代表无拼接
normal_result_array.append(vector)
except:
pass
'''
# print(normal_result_array)