From c5fdccf953b6d1b596ae27fc33179d06a30966d9 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Thu, 23 Nov 2023 16:46:59 +0100 Subject: [PATCH] Scrapy template: Minor code update and docs improvement (#236) --- templates/python-scrapy/src/__main__.py | 19 +++++++++------ templates/python-scrapy/src/items.py | 12 +++++++--- templates/python-scrapy/src/main.py | 4 +++- templates/python-scrapy/src/pipelines.py | 19 +++++++++++---- templates/python-scrapy/src/settings.py | 24 ++++++++++--------- .../python-scrapy/src/spiders/__init__.py | 14 +++++++---- templates/python-scrapy/src/spiders/title.py | 14 +++++++---- .../{projectFolder}/__main__.template.py | 19 +++++++++------ .../{projectFolder}/main.template.py | 4 +++- 9 files changed, 84 insertions(+), 45 deletions(-) mode change 100755 => 100644 templates/python-scrapy/src/spiders/__init__.py mode change 100755 => 100644 templates/python-scrapy/src/spiders/title.py diff --git a/templates/python-scrapy/src/__main__.py b/templates/python-scrapy/src/__main__.py index b6dc7691..b49b8aef 100644 --- a/templates/python-scrapy/src/__main__.py +++ b/templates/python-scrapy/src/__main__.py @@ -4,7 +4,7 @@ This file is specifically designed to be executed when the project is run as an Apify Actor using `apify run` locally or being run on the Apify platform. It is not being executed when running the project as a Scrapy project using -`scrapy crawl`. +`scrapy crawl title_spider`. We recommend you do not modify this file unless you really know what you are doing. """ @@ -13,11 +13,15 @@ # `scrapy.utils.log` before we patch it. import logging from typing import Any -import scrapy.utils.log +from scrapy.utils import log as scrapy_logging +from scrapy.utils.project import get_project_settings from apify.log import ActorLogFormatter -# If you want to change the logging level, change it here -LOGGING_LEVEL = logging.INFO +# To change the logging level, modify the `LOG_LEVEL` field in `settings.py`. If the field is not present in the file, +# Scrapy will default to `DEBUG`. This setting applies to all loggers. If you wish to change the logging level for +# a specific logger, do it in this file. +settings = get_project_settings() +LOGGING_LEVEL = settings['LOG_LEVEL'] handler = logging.StreamHandler() handler.setFormatter(ActorLogFormatter(include_logger_name=True)) @@ -35,7 +39,7 @@ # `disable_existing_loggers` is set to False :facepalm:). We need to monkeypatch Scrapy's `configure_logging` method # like this, so that our handler is attached right after Scrapy calls the `configure_logging` method, because # otherwise we would lose some log messages. 
-old_configure_logging = scrapy.utils.log.configure_logging +old_configure_logging = scrapy_logging.configure_logging def new_configure_logging(*args: Any, **kwargs: Any) -> None: """ @@ -64,7 +68,7 @@ def new_configure_logging(*args: Any, **kwargs: Any) -> None: filelock_logger.setLevel(LOGGING_LEVEL) hpack_logger.setLevel(LOGGING_LEVEL) -scrapy.utils.log.configure_logging = new_configure_logging +scrapy_logging.configure_logging = new_configure_logging # Now we can do the rest of the setup import asyncio @@ -73,7 +77,8 @@ def new_configure_logging(*args: Any, **kwargs: Any) -> None: from scrapy.utils.reactor import install_reactor from .main import main -# This is necessary so that Twisted and AsyncIO work well together +# To ensure seamless compatibility between asynchronous libraries Twisted (used by Scrapy) and AsyncIO (used by Apify), +# it is highly recommended to use AsyncioSelectorReactor as the Twisted reactor install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor') nest_asyncio.apply() diff --git a/templates/python-scrapy/src/items.py b/templates/python-scrapy/src/items.py index bd97a6de..4f75738c 100644 --- a/templates/python-scrapy/src/items.py +++ b/templates/python-scrapy/src/items.py @@ -1,6 +1,12 @@ -# Define the models for your scraped items here -# -# See the Scrapy documentation: http://doc.scrapy.org/en/latest/topics/items.html +""" +Scrapy item models module + +This module defines Scrapy item models for scraped data. Items represent structured data +extracted by spiders. + +For detailed information on creating and utilizing items, refer to the official documentation: +https://docs.scrapy.org/en/latest/topics/items.html +""" from scrapy import Field, Item diff --git a/templates/python-scrapy/src/main.py b/templates/python-scrapy/src/main.py index 77e16b27..f7a8ab99 100644 --- a/templates/python-scrapy/src/main.py +++ b/templates/python-scrapy/src/main.py @@ -86,7 +86,9 @@ async def main() -> None: async with Actor: Actor.log.info('Actor is being executed...') - # Process Actor input - here you can add your own logic for handling Actor input + # Process Actor input - you can customize logic for handling Actor input here + # The `max_depth` option from Actor input overrides Scrapy's `DEPTH_LIMIT` setting + # The `start_urls` option from Actor input is combined with Scrapy's `start_urls` from your spiders actor_input = await Actor.get_input() or {} max_depth = actor_input.get('max_depth', LOCAL_DEFAULT_MAX_DEPTH) start_urls = [start_url.get('url') for start_url in actor_input.get('start_urls', LOCAL_DEFAULT_START_URLS)] diff --git a/templates/python-scrapy/src/pipelines.py b/templates/python-scrapy/src/pipelines.py index 1a04f1d7..da0faf5c 100644 --- a/templates/python-scrapy/src/pipelines.py +++ b/templates/python-scrapy/src/pipelines.py @@ -1,14 +1,23 @@ -# Define your item pipelines here -# -# See the Scrapy documentation: http://doc.scrapy.org/en/latest/topics/item-pipeline.html +""" +Scrapy item pipelines module -import scrapy +This module defines Scrapy item pipelines for scraped data. Item pipelines are processing components +that handle the scraped items, typically used for cleaning, validating, and persisting data. 
+ +For detailed information on creating and utilizing item pipelines, refer to the official documentation: +http://doc.scrapy.org/en/latest/topics/item-pipeline.html +""" + +from scrapy import Spider from .items import TitleItem class TitleItemPipeline: + """ + This item pipeline defines processing steps for TitleItem objects scraped by spiders. + """ - def process_item(self, item: TitleItem, spider: scrapy.Spider) -> TitleItem: + def process_item(self, item: TitleItem, spider: Spider) -> TitleItem: # Do something with the item here, such as cleaning it or persisting it to a database return item diff --git a/templates/python-scrapy/src/settings.py b/templates/python-scrapy/src/settings.py index ac8de229..1ac0c9a5 100644 --- a/templates/python-scrapy/src/settings.py +++ b/templates/python-scrapy/src/settings.py @@ -1,16 +1,18 @@ -# Scrapy settings for this project -# -# For simplicity, this file contains only settings considered important or commonly used. -# -# You can find more settings consulting the documentation: http://doc.scrapy.org/en/latest/topics/settings.html +""" +Scrapy settings module -# Do not change this since it would break the Scrapy <-> Apify interaction -TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor' +This module contains Scrapy settings for the project, defining various configurations and options. -# The following settings can be updated by the user +For more comprehensive details on Scrapy settings, refer to the official documentation: +http://doc.scrapy.org/en/latest/topics/settings.html +""" + +# You can update these options and add new ones BOT_NAME = 'titlebot' -SPIDER_MODULES = ['src.spiders'] +DEPTH_LIMIT = 1 # This will be overridden by the `max_depth` option from Actor input if running using Apify +ITEM_PIPELINES = {'src.pipelines.TitleItemPipeline': 123} +LOG_LEVEL = 'INFO' NEWSPIDER_MODULE = 'src.spiders' REQUEST_FINGERPRINTER_IMPLEMENTATION = '2.7' -ROBOTSTXT_OBEY = True # obey robots.txt rules -ITEM_PIPELINES = {'src.pipelines.TitleItemPipeline': 123} +ROBOTSTXT_OBEY = True +SPIDER_MODULES = ['src.spiders'] diff --git a/templates/python-scrapy/src/spiders/__init__.py b/templates/python-scrapy/src/spiders/__init__.py old mode 100755 new mode 100644 index 4aca8861..dc55fede --- a/templates/python-scrapy/src/spiders/__init__.py +++ b/templates/python-scrapy/src/spiders/__init__.py @@ -1,5 +1,9 @@ -# This package will contain the spiders of your Scrapy project -# -# Please refer to the Scrapy documentation for information on how to create and manage your spiders. -# -# https://docs.scrapy.org/en/latest/topics/spiders.html +""" +Scrapy spiders package + +This package contains the spiders for your Scrapy project. Spiders are the classes that define how to scrape +and process data from websites. 
+ +For detailed information on creating and utilizing spiders, refer to the official documentation: +https://docs.scrapy.org/en/latest/topics/spiders.html +""" diff --git a/templates/python-scrapy/src/spiders/title.py b/templates/python-scrapy/src/spiders/title.py old mode 100755 new mode 100644 index 02e7659d..81092da2 --- a/templates/python-scrapy/src/spiders/title.py +++ b/templates/python-scrapy/src/spiders/title.py @@ -1,11 +1,11 @@ -from typing import Generator, Union +from __future__ import annotations + +from typing import Generator from urllib.parse import urljoin from scrapy import Request, Spider from scrapy.responsetypes import Response -from apify import Actor - from ..items import TitleItem @@ -16,7 +16,11 @@ class TitleSpider(Spider): name = 'title_spider' - def parse(self, response: Response) -> Generator[Union[TitleItem, Request], None, None]: + # The `start_urls` specified in this class will be merged with the `start_urls` value from your Actor input + # when the project is executed using Apify. + start_urls = ['https://apify.com/'] + + def parse(self, response: Response) -> Generator[TitleItem | Request, None, None]: """ Parse the web page response. @@ -26,7 +30,7 @@ def parse(self, response: Response) -> Generator[Union[TitleItem, Request], None Yields: Yields scraped TitleItem and Requests for links. """ - Actor.log.info(f'TitleSpider is parsing {response}...') + self.logger.info('TitleSpider is parsing %s...', response) # Extract and yield the TitleItem url = response.url diff --git a/wrappers/python-scrapy/{projectFolder}/__main__.template.py b/wrappers/python-scrapy/{projectFolder}/__main__.template.py index bcc9decf..bde9cb93 100644 --- a/wrappers/python-scrapy/{projectFolder}/__main__.template.py +++ b/wrappers/python-scrapy/{projectFolder}/__main__.template.py @@ -4,7 +4,7 @@ This file is specifically designed to be executed when the project is run as an Apify Actor using `apify run` locally or being run on the Apify platform. It is not being executed when running the project as a Scrapy project using -`scrapy crawl`. +`scrapy crawl title_spider`. We recommend you do not modify this file unless you really know what you are doing. """ @@ -13,11 +13,15 @@ # `scrapy.utils.log` before we patch it. import logging from typing import Any -import scrapy.utils.log +from scrapy.utils import log as scrapy_logging +from scrapy.utils.project import get_project_settings from apify.log import ActorLogFormatter -# If you want to change the logging level, change it here -LOGGING_LEVEL = logging.INFO +# To change the logging level, modify the `LOG_LEVEL` field in `settings.py`. If the field is not present in the file, +# Scrapy will default to `DEBUG`. This setting applies to all loggers. If you wish to change the logging level for +# a specific logger, do it in this file. +settings = get_project_settings() +LOGGING_LEVEL = settings['LOG_LEVEL'] handler = logging.StreamHandler() handler.setFormatter(ActorLogFormatter(include_logger_name=True)) @@ -35,7 +39,7 @@ # `disable_existing_loggers` is set to False :facepalm:). We need to monkeypatch Scrapy's `configure_logging` method # like this, so that our handler is attached right after Scrapy calls the `configure_logging` method, because # otherwise we would lose some log messages. 
-old_configure_logging = scrapy.utils.log.configure_logging +old_configure_logging = scrapy_logging.configure_logging def new_configure_logging(*args: Any, **kwargs: Any) -> None: """ @@ -64,7 +68,7 @@ def new_configure_logging(*args: Any, **kwargs: Any) -> None: filelock_logger.setLevel(LOGGING_LEVEL) hpack_logger.setLevel(LOGGING_LEVEL) -scrapy.utils.log.configure_logging = new_configure_logging +scrapy_logging.configure_logging = new_configure_logging # Now we can do the rest of the setup import asyncio @@ -73,7 +77,8 @@ def new_configure_logging(*args: Any, **kwargs: Any) -> None: from scrapy.utils.reactor import install_reactor from .main import main -# This is necessary so that Twisted and AsyncIO work well together +# To ensure seamless compatibility between asynchronous libraries Twisted (used by Scrapy) and AsyncIO (used by Apify), +# it is highly recommended to use AsyncioSelectorReactor as the Twisted reactor install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor') nest_asyncio.apply() diff --git a/wrappers/python-scrapy/{projectFolder}/main.template.py b/wrappers/python-scrapy/{projectFolder}/main.template.py index d7cb887f..00d9e399 100644 --- a/wrappers/python-scrapy/{projectFolder}/main.template.py +++ b/wrappers/python-scrapy/{projectFolder}/main.template.py @@ -86,7 +86,9 @@ async def main() -> None: async with Actor: Actor.log.info('Actor is being executed...') - # Process Actor input - here you can add your own logic for handling Actor input + # Process Actor input - you can customize logic for handling Actor input here + # The `max_depth` option from Actor input overrides Scrapy's `DEPTH_LIMIT` setting + # The `start_urls` option from Actor input is combined with Scrapy's `start_urls` from your spiders actor_input = await Actor.get_input() or {} max_depth = actor_input.get('max_depth', LOCAL_DEFAULT_MAX_DEPTH) start_urls = [start_url.get('url') for start_url in actor_input.get('start_urls', LOCAL_DEFAULT_START_URLS)]
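
Reviewer note: the hunks above only show fragments of the patched logging setup in
templates/python-scrapy/src/__main__.py (and its wrapper twin). A minimal sketch of how the
pieces fit together after this change follows; the full body of new_configure_logging and the
exact set of loggers it touches are not visible in the hunks, so those details are assumptions
rather than the literal file contents.

    import logging
    from typing import Any

    from scrapy.utils import log as scrapy_logging
    from scrapy.utils.project import get_project_settings
    from apify.log import ActorLogFormatter

    # LOG_LEVEL is read from settings.py; Scrapy falls back to DEBUG when it is absent.
    settings = get_project_settings()
    LOGGING_LEVEL = settings['LOG_LEVEL']

    handler = logging.StreamHandler()
    handler.setFormatter(ActorLogFormatter(include_logger_name=True))

    old_configure_logging = scrapy_logging.configure_logging

    def new_configure_logging(*args: Any, **kwargs: Any) -> None:
        # Let Scrapy install its own handlers first, then attach the Apify handler,
        # so that no messages emitted during configuration are lost.
        old_configure_logging(*args, **kwargs)
        # filelock and hpack appear in the hunks; the remaining logger names are assumed.
        for name in (None, 'apify', 'apify_client', 'scrapy', 'filelock', 'hpack'):
            logger = logging.getLogger(name)
            logger.addHandler(handler)
            logger.setLevel(LOGGING_LEVEL)

    scrapy_logging.configure_logging = new_configure_logging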
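
The main.py hunk likewise only updates the comment; the code that applies max_depth and
start_urls sits outside the visible context. A hedged sketch of the idea is below - the helper
name make_runner is purely illustrative and is not part of the patch.

    from __future__ import annotations

    from scrapy.crawler import CrawlerRunner
    from scrapy.utils.project import get_project_settings

    def make_runner(max_depth: int) -> CrawlerRunner:
        # Fold the Actor input value into the project settings so that it overrides
        # the DEPTH_LIMIT defined in settings.py.
        settings = get_project_settings()
        settings['DEPTH_LIMIT'] = max_depth
        return CrawlerRunner(settings)

    # The start_urls from Actor input are handled separately: they are combined with the
    # spider's own start_urls (TitleSpider.start_urls in this template) so that both sets
    # of URLs are crawled.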
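
Finally, the new ITEM_PIPELINES = {'src.pipelines.TitleItemPipeline': 123} entry in settings.py
routes every item the spider yields through the pipeline; the integer is the pipeline's order
(lower values run earlier). A sketch of that round trip, assuming the TitleItem fields url and
title used elsewhere in the template:

    from __future__ import annotations

    from scrapy import Field, Item, Spider

    class TitleItem(Item):
        # Field names assumed from the rest of the template.
        url = Field()
        title = Field()

    class TitleItemPipeline:
        def process_item(self, item: TitleItem, spider: Spider) -> TitleItem:
            # Called once per yielded item, in the order given by ITEM_PIPELINES.
            item['title'] = (item.get('title') or '').strip()
            return item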