-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfeatrues_extracter.py
126 lines (107 loc) · 3.72 KB
/
featrues_extracter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# -*- coding: utf-8 -*-
import pandas as pd
import os
import re
import json
import pandas as pd
import logging
import MySQLdb
from multiprocessing import cpu_count, Pool
import signal
# Log everything (DEBUG and up) to a fresh file on each run (filemode='w').
logging.basicConfig(level=logging.DEBUG,
format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
datefmt='%a, %d %b %Y %H:%M:%S',
filename='featrues_extracter.log',
filemode='w')
# Directory holding cuckoo signature source files; each declares signature
# names via lines like:  name = "some_signature"
total_featrue_path = '/data/root/cuckoo/signatures/windows/'
# Root of cuckoo analysis output; reports live at <id>/reports/report.json
signatrues_path = '/data/root/cuckoo/storage/analyses/'
# Captures the quoted value from '  name = "<signature>"' lines.
pattern = re.compile(' name = "(.*)"')
title_list = []
data_list = []
# Analysis ids (or None results) whose report had no usable signatures.
fail_list = []
# Fixed leading CSV columns; signature names are appended later.
title_list = ['md5', 'dumped_buffer', 'dumped_buffer2', 'network_bind']
# /data/root/cuckoo/storage/analyses/1/reports/report.json
def get_md5(i):
    """Return the md5 for cuckoo task *i* from the sandbox database.

    Looks up row i-1 of the ``tasks`` table and derives the md5 from the
    stored sample path (6th path component).

    Fix: the original leaked the MySQL connection and cursor on every
    call (one call per analyzed task); both are now always closed.

    NOTE(review): this fetches the whole table and indexes by position,
    assuming task ids are dense and 1-based -- TODO confirm schema and
    switch to a WHERE clause on the id column.
    """
    conn = MySQLdb.connect(
        host='localhost',
        port=3306,
        user='root',
        passwd='root123',
        db='sandbox',
    )
    try:
        cur = conn.cursor()
        try:
            cur.execute("select * from tasks")
            # Column 1 holds the sample path, e.g.
            # /data/.../binaries/<md5> -- the md5 is path component 5.
            info = cur.fetchall()[i - 1][1]
        finally:
            cur.close()
    finally:
        conn.close()
    md5 = info.split('/')[5]
    return md5
def init_worker():
    """Pool worker initializer.

    Workers ignore SIGINT so a Ctrl-C in the terminal interrupts only the
    parent process, which then shuts the pool down cleanly.
    """
    signal.signal(signal.SIGINT, signal.SIG_IGN)
def extract_total_featrue():
    """Collect every signature name declared under ``total_featrue_path``.

    Scans each signature source file for lines matching ``pattern``
    (' name = "<signature>"') and returns all captured names in a list.
    The result may contain duplicates; the caller dedupes with set().

    Fixes: the regex was evaluated twice per line (findall for the test,
    findall again for the value); the files were opened 'r+' (read/write)
    for a purely read-only scan.
    """
    total_featrue = []
    for entry in os.listdir(total_featrue_path):
        path = os.path.join(total_featrue_path, entry)
        with open(path, 'r') as f:
            for line in f:
                names = pattern.findall(line)
                if names:
                    total_featrue.extend(names)
    return total_featrue
def get_signatrue(file):
    """Extract triggered signature names for one analysis id.

    *file* is an analysis id as a string (directory name under
    ``signatrues_path``). Reads ``<id>/reports/report.json`` and returns:

    - a ``pd.Series`` mapping 'md5' plus each triggered signature name
      to 1, on success;
    - the raw id string, when the report exists but has no signatures;
    - ``None``, when the report is missing/unreadable or any step raises.

    Fixes: replaced Python-2-only ``xrange`` with direct iteration,
    dropped the redundant ``j['signatures'][i]`` re-indexing per loop
    step, and closed the report file before the (slow) DB lookup instead
    of holding it open for the whole function.
    """
    data_series = file
    data = {}
    filename = signatrues_path + file + '/reports/report.json'
    logging.info('{}'.format(filename))
    try:
        with open(filename, 'r') as f:
            report = json.load(f)
        md5 = get_md5(int(file))
        data['md5'] = md5
        if 'signatures' not in report:
            # NOTE(review): this runs inside a Pool worker, so this append
            # mutates a copy of fail_list that is lost when the worker
            # exits; the parent re-detects failures by return type.
            fail_list.append(file)
            logging.warning('{} dont have signatrues'.format(md5))
        else:
            sigs = report['signatures']
            if sigs:
                for sig in sigs:
                    data[sig['name']] = 1
                data_series = pd.Series(data)
            else:
                fail_list.append(file)
                logging.warning('{} {} dont have signatrues'.format(file, md5))
    except Exception as e:
        logging.exception(e)
        return None
    return data_series
# ---- Driver: build the feature matrix across all analyses. ----
total_featrue_list = extract_total_featrue()
# Signature names repeat across source files
# (e.g. 'trojan_jorik', 'fakeav_mutexes', 'antivm_xen_keys'); dedupe.
total_featrue_list = list(set(total_featrue_list))
title_list.extend(total_featrue_list)
label = pd.DataFrame(columns=title_list)
file_list = os.listdir(signatrues_path)
file_list.remove('latest')  # 'latest' is not an analysis id directory
# Sort numerically, not lexically, then go back to strings for path joins.
file_list = [str(n) for n in sorted(int(name) for name in file_list)]
# Leave two cores free, but never ask for a pool of size <= 0.
CPU_COUNT = max(1, cpu_count() - 2)
pool = Pool(processes=CPU_COUNT, initializer=init_worker, maxtasksperchild=400)
data_list = pool.map(get_signatrue, file_list)
pool.close()
pool.join()
# get_signatrue returns a pd.Series on success and a str/None on failure.
# Build a filtered list instead of deleting while iterating: the original
# `del data_list_left[i]` inside the enumerate loop skipped the element
# following every removal, so some failures leaked into the matrix.
ok_list = []
for item in data_list:
    if isinstance(item, pd.Series):
        ok_list.append(item)
    else:
        fail_list.append(item)
# NOTE(review): DataFrame.append is removed in pandas >= 2.0; migrate to
# pd.concat when the environment is upgraded.
label_data = label.append(ok_list, ignore_index=True)
label_data = label_data.fillna('0')
label_data.index = label_data['md5']
del label_data['md5']
label_data.to_csv('dynamic_featrue.csv')
with open('fail_list.txt', 'w+') as f:
    for entry in fail_list:
        # str() guards against None entries, which the original
        # f.write(i) would crash on with a TypeError.
        f.write('{}\n'.format(entry))