Skip to content

Commit

Permalink
Fix "document is empty" error raised by the html.fromstring function when parsing the page
Browse files Browse the repository at this point in the history
  • Loading branch information
Naibo_Mac_M2 committed Dec 11, 2024
1 parent 2031b09 commit b4d7ddf
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 1 deletion.
2 changes: 1 addition & 1 deletion ExecuteStage/.vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
"justMyCode": false,
// "args": ["--ids", "[7]", "--read_type", "remote", "--headless", "0"]
// "args": ["--ids", "[9]", "--read_type", "remote", "--headless", "0", "--saved_file_name", "YOUTUBE"]
"args": ["--ids", "[5]", "--headless", "0", "--user_data", "0", "--keyboard", "0",
"args": ["--ids", "[35]", "--headless", "0", "--user_data", "0", "--keyboard", "0",
"--read_type", "remote",
]
// "args": "--ids '[97]' --user_data 1 --server_address http://localhost:8074 --config_folder '/Users/naibo/Documents/EasySpider/ElectronJS/' --headless 0 --read_type remote --config_file_name config.json --saved_file_name"
Expand Down
8 changes: 8 additions & 0 deletions ExecuteStage/easyspider_executestage.py
Original file line number Diff line number Diff line change
Expand Up @@ -1148,6 +1148,14 @@ def loopExecute(self, node, loopValue, clickPath="", index=0):
self.history["handle"] = thisHandle
thisHistoryURL = self.browser.current_url
# 快速提取处理
# start = time.time()
try:
tree = html.fromstring(self.browser.page_source)
except Exception as e:
self.print_and_log("解析页面时出错,将切换普通提取模式|Error parsing page, will switch to normal extraction mode")
node["parameters"]["quickExtractable"] = False
# end = time.time()
# print("解析页面秒数:", end - start)
if node["parameters"]["quickExtractable"]:
self.browser.switch_to.default_content() # 切换到主页面
tree = html.fromstring(self.browser.page_source)
Expand Down

0 comments on commit b4d7ddf

Please sign in to comment.