From 2e557f6f2221f2609b5b6c6cf3b609a28387d918 Mon Sep 17 00:00:00 2001 From: Nikita Medyankin Date: Fri, 28 Feb 2020 13:14:56 +0100 Subject: [PATCH] Feature: minimal addition to run remote webdriver (#55) * Add remote Selenium option * Add info about remote driver to readme * Import webdriver properly --- README.md | 5 +++++ scrapy_selenium/middlewares.py | 37 +++++++++++++++++++++++++--------- 2 files changed, 33 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index d16aebf..b642de9 100644 --- a/README.md +++ b/README.md @@ -25,6 +25,11 @@ Optionally, set the path to the browser executable: SELENIUM_BROWSER_EXECUTABLE_PATH = which('firefox') ``` +In order to use a remote Selenium driver, specify `SELENIUM_COMMAND_EXECUTOR` instead of `SELENIUM_DRIVER_EXECUTABLE_PATH`: + ```python + SELENIUM_COMMAND_EXECUTOR = 'http://localhost:4444/wd/hub' + ``` + 2. Add the `SeleniumMiddleware` to the downloader middlewares: ```python DOWNLOADER_MIDDLEWARES = { diff --git a/scrapy_selenium/middlewares.py b/scrapy_selenium/middlewares.py index 3761ca5..201db2c 100644 --- a/scrapy_selenium/middlewares.py +++ b/scrapy_selenium/middlewares.py @@ -13,8 +13,8 @@ class SeleniumMiddleware: """Scrapy middleware handling the requests using selenium""" - def __init__(self, driver_name, driver_executable_path, driver_arguments, - browser_executable_path): + def __init__(self, driver_name, driver_executable_path, + browser_executable_path, command_executor, driver_arguments): """Initialize the selenium webdriver Parameters @@ -27,6 +27,8 @@ def __init__(self, driver_name, driver_executable_path, driver_arguments, A list of arguments to initialize the driver browser_executable_path: str The path of the executable binary of the browser + command_executor: str + Selenium remote server endpoint """ webdriver_base_path = f'selenium.webdriver.{driver_name}' @@ -38,6 +40,7 @@ def __init__(self, driver_name, driver_executable_path, driver_arguments, driver_options_klass = getattr(driver_options_module, 'Options') driver_options = driver_options_klass() + if browser_executable_path: driver_options.binary_location = browser_executable_path for argument in driver_arguments: @@ -48,7 +51,19 @@ def __init__(self, driver_name, driver_executable_path, driver_arguments, f'{driver_name}_options': driver_options } - self.driver = driver_klass(**driver_kwargs) + # locally installed driver + if driver_executable_path is not None: + driver_kwargs = { + 'executable_path': driver_executable_path, + f'{driver_name}_options': driver_options + } + self.driver = driver_klass(**driver_kwargs) + # remote driver + elif command_executor is not None: + from selenium import webdriver + capabilities = driver_options.to_capabilities() + self.driver = webdriver.Remote(command_executor=command_executor, + desired_capabilities=capabilities) @classmethod def from_crawler(cls, crawler): @@ -57,18 +72,22 @@ def from_crawler(cls, crawler): driver_name = crawler.settings.get('SELENIUM_DRIVER_NAME') driver_executable_path = crawler.settings.get('SELENIUM_DRIVER_EXECUTABLE_PATH') browser_executable_path = crawler.settings.get('SELENIUM_BROWSER_EXECUTABLE_PATH') + command_executor = crawler.settings.get('SELENIUM_COMMAND_EXECUTOR') driver_arguments = crawler.settings.get('SELENIUM_DRIVER_ARGUMENTS') - if not driver_name or not driver_executable_path: - raise NotConfigured( - 'SELENIUM_DRIVER_NAME and SELENIUM_DRIVER_EXECUTABLE_PATH must be set' - ) + if driver_name is None: + raise NotConfigured('SELENIUM_DRIVER_NAME must be set') + + if driver_executable_path is None and command_executor is None: + raise NotConfigured('Either SELENIUM_DRIVER_EXECUTABLE_PATH ' + 'or SELENIUM_COMMAND_EXECUTOR must be set') middleware = cls( driver_name=driver_name, driver_executable_path=driver_executable_path, - driver_arguments=driver_arguments, - browser_executable_path=browser_executable_path + browser_executable_path=browser_executable_path, + command_executor=command_executor, + driver_arguments=driver_arguments ) crawler.signals.connect(middleware.spider_closed, signals.spider_closed)