-
Notifications
You must be signed in to change notification settings - Fork 30
/
Copy pathMyParser.py
368 lines (315 loc) · 16.4 KB
/
MyParser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
#!/usr/bin/python
#coding:utf-8
import lxml.html
import re
from bs4 import BeautifulSoup
# 受信したレスポンスの解析(遷移先URLの抽出)
class HtmlParser:
    """Parse HTML responses to find crawl targets: links, forms and titles.

    The parser keeps no state; every method works only on its arguments.
    """

    # Parser name handed to BeautifulSoup. Naming it explicitly keeps results
    # independent of whichever optional parsers are installed and silences
    # bs4's "no parser was explicitly specified" warning. This module already
    # imports lxml, which is the parser bs4 prefers when it is available.
    _BS4_PARSER = "lxml"

    # Prefix put in front of every <option> value of a <select>.
    _OPTION_SEPARATOR = "|+|"

    # Label emitted for a parameter that has no usable text label.
    _NO_LABEL = '@'

    # Characters removed from heading/label text before it is reported.
    _STRIP_TABLE = str.maketrans('', '', '!@.:')

    def __init__(self):
        """Create a parser; nothing is stored on the instance."""

    def get_a_tags(self, str_protocol, int_Port, str_fqdn, str_path, str_response):
        """Collect the in-scope ``href`` values of all ``<a>`` tags.

        Args:
            str_protocol: URL scheme of the target, e.g. ``"http"``.
            int_Port: Port of the target; an int or a string is accepted.
            str_fqdn: Host name of the target.
            str_path: Path of the target, appended verbatim after the host.
            str_response: HTML markup to scan.

        Returns:
            A list of strings. An href that starts with the target URL (with
            or without the port) contributes the remainder after that prefix;
            any other href without a scheme, ``:`` or ``@`` is returned as
            written, except empty hrefs and pure ``#`` anchors.
        """
        str_origin = str_protocol + "://" + str_fqdn
        # The with-port form is tested first: when str_path is empty the
        # port-less form is also a prefix of the with-port URL.
        tpl_prefixes = (str_origin + ":" + str(int_Port) + str_path,
                        str_origin + str_path)

        lst_tags = []
        obj_bs4 = BeautifulSoup(str_response, self._BS4_PARSER)
        for obj_anchor in obj_bs4.find_all("a"):
            str_href = obj_anchor.get("href")
            if str_href is None:
                continue

            # Strip exactly the prefix that matched, so a port-less absolute
            # URL is not cut by the length of the with-port form.
            str_prefix = next(
                (p for p in tpl_prefixes if str_href.startswith(p)), None)
            if str_prefix is not None:
                lst_tags.append(str_href[len(str_prefix):])
            # Absolute URL that points somewhere else.
            elif ("http://" in str_href) or ("https://" in str_href):
                print("not target :" + str_href)
            # Other schemes (mailto:, javascript:, ...) or user-info syntax.
            elif (str_href.find(':') > 0) or (str_href.find('@') > 0):
                print("not target :" + str_href)
            elif str_href == '':
                print("not target : /")
            # Relative path: keep it unless it is only an in-page anchor.
            elif not str_href.startswith('#'):
                lst_tags.append(str_href)
        return lst_tags

    def get_input_tags(self, obj_element):
        """Build a ``name=value&...`` string from the ``<input>`` tags.

        Args:
            obj_element: Parsed element offering ``find_all``; its descendant
                ``<input>`` tags are read.

        Returns:
            One ``name=value`` pair per input that has a ``name`` attribute,
            joined with ``&``. A missing ``value`` yields ``name=``. Inputs
            without a name are skipped. Empty string when nothing qualifies.
        """
        lst_pairs = []
        for obj_input in obj_element.find_all('input'):
            str_name = obj_input.get('name')
            if str_name is None:
                continue
            str_value = obj_input.get('value')
            lst_pairs.append(
                str_name + '=' + ('' if str_value is None else str_value))
        return '&'.join(lst_pairs)

    def get_select_tags(self, obj_element):
        """Build a ``name=options&...`` string from the ``<select>`` tags.

        Args:
            obj_element: Parsed element offering ``find_all``; its descendant
                ``<select>`` tags are read.

        Returns:
            One ``name=|+|opt1|+|opt2`` entry per named select, joined with
            ``&``. Each entry lists only the option values of its own select.
        """
        lst_pairs = []
        for obj_select in obj_element.find_all('select'):
            str_name = obj_select.get('name')
            if str_name is None:
                continue
            lst_pairs.append(
                str_name + '=' + self._join_option_values(obj_select))
        return '&'.join(lst_pairs)

    def get_form_tags(self, str_protocol, str_fqdn, int_port, str_path, obj_response):
        """Extract every in-scope form together with label/page-type text.

        Args:
            str_protocol: URL scheme of the target, e.g. ``"http"``.
            str_fqdn: Host name of the target.
            int_port: Port of the target; an int or a string is accepted.
            str_path: Path of the target, appended verbatim after the host.
            obj_response: HTML markup as a string.

        Returns:
            A 3-tuple ``(forms, page_type_text, labels)``:

            * ``forms``: list of tuples ``(method, action, enctype, params,
              response_bytes, types)``. ``params`` is ``name=value`` pairs
              joined by ``&``; ``types`` is the matching control types joined
              by ``,``; ``response_bytes`` is ``obj_response`` encoded as
              UTF-8. A form whose action is an absolute URL outside the
              target is left out.
            * ``page_type_text``: normalised text of all ``h1``-``h6``
              headings followed by the normalised form labels.
            * ``labels``: one label per entry of each form's ``types``,
              joined by ``,``; ``@`` stands for "no label".
        """
        obj_bs4 = BeautifulSoup(obj_response, self._BS4_PARSER)

        # Heading text is reported lower-cased, without punctuation or spaces.
        lst_page_type_text = [
            self._normalize_text(obj_heading.text).replace(' ', '')
            for obj_heading in obj_bs4.find_all(re.compile("^h[1-6]"))
        ]

        str_origin = str_protocol + "://" + str_fqdn
        tpl_prefixes = (str_origin + ":" + str(int_port) + str_path,
                        str_origin + str_path)

        lst_url_list = []
        lst_param_labels = []
        for obj_form in obj_bs4.find_all("form"):
            str_method = obj_form.get('method')
            str_enc_type = obj_form.get('enctype')
            # A form without an action submits to the current page; use ''
            # so the scope checks below operate on a string.
            str_raw_url = obj_form.get('action')
            if str_raw_url is None:
                str_raw_url = ''

            # Labels are gathered per form so that positions line up with
            # this form's own controls.
            lst_labels = self._extract_form_labels(obj_form)
            lst_page_type_text.extend(lst_labels)

            str_params, str_types = self._collect_form_parameters(obj_form)
            lst_param_labels.extend(self._assign_labels(str_types, lst_labels))

            if self._is_in_scope(str_raw_url, tpl_prefixes):
                lst_url_list.append((str_method,
                                     str_raw_url,
                                     str_enc_type,
                                     str_params,
                                     obj_response.encode('utf_8'),
                                     str_types))
        return lst_url_list, lst_page_type_text, ','.join(lst_param_labels)

    def get_title_tag(self, obj_response):
        """Return the text of the first ``<title>`` tag.

        Args:
            obj_response: Object whose ``text`` attribute holds HTML markup.

        Returns:
            The title text, or an empty string when there is no title tag.
        """
        obj_bs4 = BeautifulSoup(obj_response.text, self._BS4_PARSER)
        # find() yields a single tag (or None); the result list returned by
        # find_all() has no .text attribute.
        obj_title_element = obj_bs4.find("title")
        if obj_title_element is None:
            return ''
        return obj_title_element.text

    def get_all_tag(self, obj_diff_string):
        """Return all text nodes found in an HTML fragment.

        Args:
            obj_diff_string: HTML markup to parse with lxml.

        Returns:
            A list with one string per text node, in document order.
        """
        obj_root = lxml.html.fromstring(obj_diff_string)
        # NOTE(review): the expression used to be '//', which is not valid
        # XPath and made lxml raise on every call. '//text()' is the closest
        # valid reading of "all tag strings" -- confirm against the callers.
        return list(obj_root.xpath('//text()'))

    def get_new_parameter_values(self, str_method, str_path, str_params, str_parent_response):
        """Refresh parameter values from the matching form of a parent page.

        A form matches when its method and action equal ``str_method`` /
        ``str_path``, it has as many parameters as ``str_params``, and the
        number of name matches equals the number of its parameters.

        Args:
            str_method: HTTP method of the request (case-insensitive).
            str_path: Action the request is sent to.
            str_params: Current parameters as ``name=value`` pairs joined
                by ``&``.
            str_parent_response: HTML markup of the page holding the form.

        Returns:
            ``str_params`` with every value replaced by the one found in the
            first matching form, or ``str_params`` unchanged if none matches.
        """
        obj_bs4 = BeautifulSoup(str_parent_response, self._BS4_PARSER)
        # A form without a method attribute is submitted with GET.
        str_wanted_method = ('get' if str_method is None else str_method).lower()
        str_wanted_path = '' if str_path is None else str_path
        lst_params = str_params.split('&')

        for obj_form in obj_bs4.find_all("form"):
            # Compare method and path.
            str_parent_method = obj_form.get('method')
            if str_parent_method is None:
                str_parent_method = 'get'
            str_parent_raw_url = obj_form.get('action')
            if str_parent_raw_url is None:
                str_parent_raw_url = ''
            if str_parent_method.lower() != str_wanted_method:
                continue
            if str_parent_raw_url != str_wanted_path:
                continue

            # Joining only the non-empty parts avoids a leading '&' when the
            # form has selects but no named inputs.
            lst_parts = [
                str_part
                for str_part in (self.get_input_tags(obj_form),
                                 self.get_select_tags(obj_form))
                if str_part != ''
            ]
            lst_parent_params = '&'.join(lst_parts).split('&')

            # Compare the number of parameters.
            if len(lst_params) != len(lst_parent_params):
                continue

            # Compare the parameter names; collected per form so that a
            # partial match on one form cannot leak into the next.
            lst_refreshed = []
            for str_param in lst_params:
                str_name = str_param.partition('=')[0]
                if str_name == '':
                    continue
                for str_parent_param in lst_parent_params:
                    # partition keeps any '=' inside the value intact.
                    str_parent_name, _, str_parent_value = \
                        str_parent_param.partition('=')
                    if str_name == str_parent_name:
                        lst_refreshed.append(str_name + '=' + str_parent_value)

            if len(lst_refreshed) == len(lst_parent_params):
                return '&'.join(lst_refreshed)

        # No matching form: hand back the original parameters.
        return str_params

    def _normalize_text(self, str_text):
        """Lower-case *str_text* and drop the characters ``! @ . :``."""
        return str_text.lower().translate(self._STRIP_TABLE)

    def _join_option_values(self, obj_select):
        """Return the option values of one select, each prefixed by ``|+|``."""
        lst_values = (obj_option.get('value')
                      for obj_option in obj_select.find_all('option'))
        return ''.join(self._OPTION_SEPARATOR + str_value
                       for str_value in lst_values
                       if str_value is not None)

    def _extract_form_labels(self, obj_form):
        """Return the normalised text lines of a form that precede a control."""
        lst_candidates = [
            str_line
            for str_line in (str_raw.replace(' ', '')
                             for str_raw in obj_form.text.split('\n'))
            if str_line != ''
        ]

        # Markup of the form's children with spaces and newlines removed, so
        # it can be searched with the space-free text lines above.
        str_markup = ''.join(str(obj_node) for obj_node in obj_form.contents)
        str_markup = str_markup.replace(' ', '').replace('\n', '')

        # A text line counts as a label when a form control appears in the
        # markup between it and the next text line (or the end of the form).
        lst_labels = []
        int_last = len(lst_candidates) - 1
        for int_idx, str_text in enumerate(lst_candidates):
            int_start = str_markup.find(str_text)
            if int_idx == int_last:
                int_end = len(str_markup)
            else:
                int_end = str_markup.find(lst_candidates[int_idx + 1])
            str_segment = str_markup[int_start:int_end]
            if ('<input' in str_segment or '<select' in str_segment
                    or 'textarea' in str_segment):
                lst_labels.append(str_text)

        return [
            str_label
            for str_label in (self._normalize_text(t) for t in lst_labels)
            if str_label != ''
        ]

    def _collect_form_parameters(self, obj_form):
        """Return ``(params, types)`` strings for the named controls of a form."""
        lst_params = []
        lst_types = []
        for obj_control in obj_form.find_all(["input", "select", "textarea"]):
            str_name = obj_control.get('name')
            # A control without a name contributes neither a parameter nor a
            # type, which keeps the two strings aligned entry by entry.
            if str_name is None:
                continue

            str_tag = obj_control.name.lower()
            if str_tag == "select":
                str_value = self._join_option_values(obj_control)
                str_type = "select"
            else:
                str_value = obj_control.get('value')
                if str_value is None:
                    str_value = ''
                str_type = obj_control.get('type')
                if str_type is None:
                    # HTML treats an <input> without a type as a text field;
                    # <textarea> has no type attribute at all.
                    str_type = 'text' if str_tag == "input" else 'none'

            lst_params.append(str_name + '=' + str_value)
            lst_types.append(str_type)
        return '&'.join(lst_params), ','.join(lst_types)

    def _assign_labels(self, str_types, lst_labels):
        """Return one label per entry of the comma-separated *str_types*."""
        lst_result = []
        int_cursor = 0
        # ''.split(',') yields [''], so a form without named controls still
        # produces a single placeholder label.
        for str_type in str_types.split(','):
            str_type = str_type.lower()
            if str_type in ("text", "password"):
                # Fall back to the placeholder when the form shows fewer
                # labels than it has text fields.
                if int_cursor < len(lst_labels):
                    lst_result.append(lst_labels[int_cursor])
                else:
                    lst_result.append(self._NO_LABEL)
                int_cursor += 1
            elif str_type in ("checkbox", "radio", "file"):
                # These consume a label position but never report its text.
                lst_result.append(self._NO_LABEL)
                int_cursor += 1
            else:
                lst_result.append(self._NO_LABEL)
        return lst_result

    def _is_in_scope(self, str_url, tpl_prefixes):
        """Tell whether a form action stays on the target."""
        if str_url.startswith(tpl_prefixes):
            return True
        # Any other absolute URL leaves the target; everything else,
        # including '' and '#', is a relative action and is kept.
        return "http://" not in str_url and "https://" not in str_url