diff --git a/README.md b/README.md index b642de9..13449b1 100644 --- a/README.md +++ b/README.md @@ -97,3 +97,26 @@ yield SeleniumRequest( script='window.scrollTo(0, document.body.scrollHeight);', ) ``` + +#### interact + +When set, the middleware calls this function with the driver as its only argument, and the value it returns is stored in the request `meta` under the `interact_data` key: + +```python +def interact_on_page(driver): + radio_all = driver.find_element(By.CSS_SELECTOR, '.some-class a') + ActionChains(driver).move_to_element(radio_all).click(radio_all).perform() + WebDriverWait(driver, timeout=30).until_not(lambda d: d.find_element(By.CLASS_NAME, '.loading')) + data = driver.execute_script('const data={}; ********; return data;') + return data + +yield SeleniumRequest( + url=url, + callback=self.parse_result, + interact=interact_on_page +) + +def parse_result(self, response): + dynamic_data = response.request.meta['interact_data'] +``` + diff --git a/scrapy_selenium/http.py b/scrapy_selenium/http.py index cddf7bf..41c6ba3 100644 --- a/scrapy_selenium/http.py +++ b/scrapy_selenium/http.py @@ -6,7 +6,7 @@ class SeleniumRequest(Request): """Scrapy ``Request`` subclass providing additional arguments""" - def __init__(self, wait_time=None, wait_until=None, screenshot=False, script=None, *args, **kwargs): + def __init__(self, wait_time=None, wait_until=None, screenshot=False, script=None, interact=None,*args, **kwargs): """Initialize a new selenium request Parameters @@ -28,5 +28,6 @@ def __init__(self, wait_time=None, wait_until=None, screenshot=False, script=Non self.wait_until = wait_until self.screenshot = screenshot self.script = script + self.interact = interact super().__init__(*args, **kwargs) diff --git a/scrapy_selenium/middlewares.py b/scrapy_selenium/middlewares.py index 201db2c..66c4928 100644 --- a/scrapy_selenium/middlewares.py +++ b/scrapy_selenium/middlewares.py @@ -46,11 +46,6 @@ def __init__(self, driver_name, driver_executable_path, for argument in driver_arguments: 
driver_options.add_argument(argument) - driver_kwargs = { - 'executable_path': driver_executable_path, - f'{driver_name}_options': driver_options - } - # locally installed driver if driver_executable_path is not None: driver_kwargs = { @@ -58,6 +53,7 @@ def __init__(self, driver_name, driver_executable_path, f'{driver_name}_options': driver_options } self.driver = driver_klass(**driver_kwargs) + # remote driver elif command_executor is not None: from selenium import webdriver @@ -121,6 +117,9 @@ def process_request(self, request, spider): if request.script: self.driver.execute_script(request.script) + if interact_func := request.interact: + request.meta['interact_data'] = interact_func(self.driver) + body = str.encode(self.driver.page_source) # Expose the driver via the "meta" attribute diff --git a/tests/test_middlewares.py b/tests/test_middlewares.py index fe365e4..18663d4 100644 --- a/tests/test_middlewares.py +++ b/tests/test_middlewares.py @@ -2,8 +2,12 @@ from unittest.mock import patch +from selenium.webdriver.common.by import By +from selenium.webdriver.common.action_chains import ActionChains + from scrapy import Request from scrapy.crawler import Crawler +from selenium.webdriver.support.wait import WebDriverWait from scrapy_selenium.http import SeleniumRequest from scrapy_selenium.middlewares import SeleniumMiddleware @@ -135,3 +139,27 @@ def test_process_request_should_execute_script_if_script_option(self): html_response.selector.xpath('//title/text()').extract_first(), 'scrapy_selenium' ) + + def test_process_request_should_execute_interact_if_interact_option(self): + """Test that the ``process_request`` should call the interact function and store its result in the request meta""" + + def page_interact(driver): + el = driver.find_element(By.CSS_SELECTOR, '#downloads') + ActionChains(driver).move_to_element(el).perform() + + # fake dynamic loading + dl_url = WebDriverWait(driver, timeout=30).until(lambda d: d.find_element(By.CSS_SELECTOR, '#downloads .element-1 
a')).get_attribute('href') + + return {'dl_link': 'fake_url'} + + selenium_request = SeleniumRequest( + url='http://www.python.org', + interact=page_interact + ) + + html_response = self.selenium_middleware.process_request( + request=selenium_request, + spider=None + ) + + self.assertEqual(html_response.request.meta['interact_data']['dl_link'], 'fake_url')