Scrapy template: Minor code update and docs improvement (#236)
vdusek authored Nov 23, 2023
1 parent f019fa0 commit c5fdccf
Showing 9 changed files with 84 additions and 45 deletions.
19 changes: 12 additions & 7 deletions templates/python-scrapy/src/__main__.py
@@ -4,7 +4,7 @@
This file is specifically designed to be executed when the project is run as an Apify Actor using `apify run` locally
or being run on the Apify platform. It is not being executed when running the project as a Scrapy project using
`scrapy crawl`.
`scrapy crawl title_spider`.
We recommend you do not modify this file unless you really know what you are doing.
"""
@@ -13,11 +13,15 @@
# `scrapy.utils.log` before we patch it.
import logging
from typing import Any
import scrapy.utils.log
from scrapy.utils import log as scrapy_logging
from scrapy.utils.project import get_project_settings
from apify.log import ActorLogFormatter

# If you want to change the logging level, change it here
LOGGING_LEVEL = logging.INFO
# To change the logging level, modify the `LOG_LEVEL` field in `settings.py`. If the field is not present in the file,
# Scrapy will default to `DEBUG`. This setting applies to all loggers. If you wish to change the logging level for
# a specific logger, do it in this file.
settings = get_project_settings()
LOGGING_LEVEL = settings['LOG_LEVEL']

handler = logging.StreamHandler()
handler.setFormatter(ActorLogFormatter(include_logger_name=True))
@@ -35,7 +39,7 @@
# `disable_existing_loggers` is set to False :facepalm:). We need to monkeypatch Scrapy's `configure_logging` method
# like this, so that our handler is attached right after Scrapy calls the `configure_logging` method, because
# otherwise we would lose some log messages.
old_configure_logging = scrapy.utils.log.configure_logging
old_configure_logging = scrapy_logging.configure_logging

def new_configure_logging(*args: Any, **kwargs: Any) -> None:
"""
@@ -64,7 +68,7 @@ def new_configure_logging(*args: Any, **kwargs: Any) -> None:
filelock_logger.setLevel(LOGGING_LEVEL)
hpack_logger.setLevel(LOGGING_LEVEL)

scrapy.utils.log.configure_logging = new_configure_logging
scrapy_logging.configure_logging = new_configure_logging

# Now we can do the rest of the setup
import asyncio
@@ -73,7 +77,8 @@ def new_configure_logging(*args: Any, **kwargs: Any) -> None:
from scrapy.utils.reactor import install_reactor
from .main import main

# This is necessary so that Twisted and AsyncIO work well together
# To ensure seamless compatibility between asynchronous libraries Twisted (used by Scrapy) and AsyncIO (used by Apify),
# it is highly recommended to use AsyncioSelectorReactor as the Twisted reactor
install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor')
nest_asyncio.apply()

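For orientation, the logging setup in this file boils down to the following pattern: read LOG_LEVEL from the Scrapy project settings, build one handler with the Apify log formatter, and wrap Scrapy's configure_logging so the handler is attached right after Scrapy runs its own configuration. This is a minimal sketch assembled from the lines shown above; the specific logger names in the loop are illustrative, not an exact copy of the template.

import logging
from typing import Any

from scrapy.utils import log as scrapy_logging
from scrapy.utils.project import get_project_settings

from apify.log import ActorLogFormatter

# Logging level comes from LOG_LEVEL in settings.py (Scrapy defaults to DEBUG if it is missing).
settings = get_project_settings()
LOGGING_LEVEL = settings['LOG_LEVEL']

# One stream handler, formatted by the Apify log formatter.
handler = logging.StreamHandler()
handler.setFormatter(ActorLogFormatter(include_logger_name=True))

# Keep a reference to Scrapy's original configure_logging and wrap it, so the handler
# is re-attached right after every call and no log messages are lost.
old_configure_logging = scrapy_logging.configure_logging

def new_configure_logging(*args: Any, **kwargs: Any) -> None:
    old_configure_logging(*args, **kwargs)
    for logger_name in (None, 'apify', 'apify_client', 'scrapy'):  # illustrative logger names
        logger = logging.getLogger(logger_name)
        logger.setLevel(LOGGING_LEVEL)
        logger.addHandler(handler)

scrapy_logging.configure_logging = new_configure_logging
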
12 changes: 9 additions & 3 deletions templates/python-scrapy/src/items.py
@@ -1,6 +1,12 @@
# Define the models for your scraped items here
#
# See the Scrapy documentation: http://doc.scrapy.org/en/latest/topics/items.html
"""
Scrapy item models module
This module defines Scrapy item models for scraped data. Items represent structured data
extracted by spiders.
For detailed information on creating and utilizing items, refer to the official documentation:
https://docs.scrapy.org/en/latest/topics/items.html
"""

from scrapy import Field, Item

4 changes: 3 additions & 1 deletion templates/python-scrapy/src/main.py
@@ -86,7 +86,9 @@ async def main() -> None:
async with Actor:
Actor.log.info('Actor is being executed...')

# Process Actor input - here you can add your own logic for handling Actor input
# Process Actor input - you can customize logic for handling Actor input here
# The `max_depth` option from Actor input overrides Scrapy's `DEPTH_LIMIT` setting
# The `start_urls` option from Actor input is combined with Scrapy's `start_urls` from your spiders
actor_input = await Actor.get_input() or {}
max_depth = actor_input.get('max_depth', LOCAL_DEFAULT_MAX_DEPTH)
start_urls = [start_url.get('url') for start_url in actor_input.get('start_urls', LOCAL_DEFAULT_START_URLS)]
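The two comment lines above describe the mapping between Actor input and Scrapy configuration. A minimal sketch of how that mapping could be applied follows; the helper name and the use of Settings.set are illustrative assumptions, and only the input keys max_depth and start_urls and the DEPTH_LIMIT setting come from the template.

from __future__ import annotations

from apify import Actor
from scrapy.settings import Settings
from scrapy.utils.project import get_project_settings


async def settings_from_actor_input(default_max_depth: int, default_start_urls: list[dict]) -> tuple[Settings, list[str]]:
    # Read the Actor input, falling back to local defaults when a field is missing.
    actor_input = await Actor.get_input() or {}
    max_depth = actor_input.get('max_depth', default_max_depth)
    start_urls = [start_url.get('url') for start_url in actor_input.get('start_urls', default_start_urls)]

    # max_depth from Actor input takes precedence over DEPTH_LIMIT from settings.py.
    settings = get_project_settings()
    settings.set('DEPTH_LIMIT', max_depth, priority='cmdline')
    return settings, start_urls
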
19 changes: 14 additions & 5 deletions templates/python-scrapy/src/pipelines.py
@@ -1,14 +1,23 @@
# Define your item pipelines here
#
# See the Scrapy documentation: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
"""
Scrapy item pipelines module
import scrapy
This module defines Scrapy item pipelines for scraped data. Item pipelines are processing components
that handle the scraped items, typically used for cleaning, validating, and persisting data.
For detailed information on creating and utilizing item pipelines, refer to the official documentation:
http://doc.scrapy.org/en/latest/topics/item-pipeline.html
"""

from scrapy import Spider

from .items import TitleItem


class TitleItemPipeline:
"""
This item pipeline defines processing steps for TitleItem objects scraped by spiders.
"""

def process_item(self, item: TitleItem, spider: scrapy.Spider) -> TitleItem:
def process_item(self, item: TitleItem, spider: Spider) -> TitleItem:
# Do something with the item here, such as cleaning it or persisting it to a database
return item
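To make the "cleaning" step mentioned in the comment concrete, a pipeline can normalize the scraped title before passing the item on. This is an illustrative sketch, not part of the template; the title field name is assumed from TitleItem, and the class would still need to be registered in ITEM_PIPELINES in settings.py.

from scrapy import Spider

from .items import TitleItem


class TitleCleaningPipeline:
    """Illustrative pipeline that strips surrounding whitespace from scraped titles."""

    def process_item(self, item: TitleItem, spider: Spider) -> TitleItem:
        title = item.get('title')
        if isinstance(title, str):
            item['title'] = title.strip()
        return item
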
24 changes: 13 additions & 11 deletions templates/python-scrapy/src/settings.py
@@ -1,16 +1,18 @@
# Scrapy settings for this project
#
# For simplicity, this file contains only settings considered important or commonly used.
#
# You can find more settings consulting the documentation: http://doc.scrapy.org/en/latest/topics/settings.html
"""
Scrapy settings module
# Do not change this since it would break the Scrapy <-> Apify interaction
TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'
This module contains Scrapy settings for the project, defining various configurations and options.
# The following settings can be updated by the user
For more comprehensive details on Scrapy settings, refer to the official documentation:
http://doc.scrapy.org/en/latest/topics/settings.html
"""

# You can update these options and add new ones
BOT_NAME = 'titlebot'
SPIDER_MODULES = ['src.spiders']
DEPTH_LIMIT = 1 # This will be overridden by the `max_depth` option from Actor input if running using Apify
ITEM_PIPELINES = {'src.pipelines.TitleItemPipeline': 123}
LOG_LEVEL = 'INFO'
NEWSPIDER_MODULE = 'src.spiders'
REQUEST_FINGERPRINTER_IMPLEMENTATION = '2.7'
ROBOTSTXT_OBEY = True # obey robots.txt rules
ITEM_PIPELINES = {'src.pipelines.TitleItemPipeline': 123}
ROBOTSTXT_OBEY = True
SPIDER_MODULES = ['src.spiders']
14 changes: 9 additions & 5 deletions templates/python-scrapy/src/spiders/__init__.py
100755 → 100644
@@ -1,5 +1,9 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the Scrapy documentation for information on how to create and manage your spiders.
#
# https://docs.scrapy.org/en/latest/topics/spiders.html
"""
Scrapy spiders package
This package contains the spiders for your Scrapy project. Spiders are the classes that define how to scrape
and process data from websites.
For detailed information on creating and utilizing spiders, refer to the official documentation:
https://docs.scrapy.org/en/latest/topics/spiders.html
"""
14 changes: 9 additions & 5 deletions templates/python-scrapy/src/spiders/title.py
100755 → 100644
@@ -1,11 +1,11 @@
from typing import Generator, Union
from __future__ import annotations

from typing import Generator
from urllib.parse import urljoin

from scrapy import Request, Spider
from scrapy.responsetypes import Response

from apify import Actor

from ..items import TitleItem


@@ -16,7 +16,11 @@ class TitleSpider(Spider):

name = 'title_spider'

def parse(self, response: Response) -> Generator[Union[TitleItem, Request], None, None]:
# The `start_urls` specified in this class will be merged with the `start_urls` value from your Actor input
# when the project is executed using Apify.
start_urls = ['https://apify.com/']

def parse(self, response: Response) -> Generator[TitleItem | Request, None, None]:
"""
Parse the web page response.
@@ -26,7 +30,7 @@ def parse(self, response: Response) -> Generator[Union[TitleItem, Request], None
Yields:
Yields scraped TitleItem and Requests for links.
"""
Actor.log.info(f'TitleSpider is parsing {response}...')
self.logger.info('TitleSpider is parsing %s...', response)

# Extract and yield the TitleItem
url = response.url
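Pieced together, the spider's parse method follows roughly this shape: log the response, yield a TitleItem for the current page, then follow links so DEPTH_LIMIT can cap the crawl. The CSS selectors and the TitleItem field names are assumptions for illustration; only the signature, the start_urls value, and the logging call are taken from the diff above.

from __future__ import annotations

from typing import Generator
from urllib.parse import urljoin

from scrapy import Request, Spider
from scrapy.responsetypes import Response

from ..items import TitleItem


class TitleSpider(Spider):
    name = 'title_spider'
    start_urls = ['https://apify.com/']

    def parse(self, response: Response) -> Generator[TitleItem | Request, None, None]:
        self.logger.info('TitleSpider is parsing %s...', response)

        # Yield the scraped item for the current page.
        url = response.url
        title = response.css('title::text').get()
        yield TitleItem(url=url, title=title)

        # Follow links found on the page; DEPTH_LIMIT (or max_depth from Actor input) caps the recursion.
        for href in response.css('a::attr(href)').getall():
            yield Request(urljoin(url, href))
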
19 changes: 12 additions & 7 deletions wrappers/python-scrapy/{projectFolder}/__main__.template.py
@@ -4,7 +4,7 @@
This file is specifically designed to be executed when the project is run as an Apify Actor using `apify run` locally
or being run on the Apify platform. It is not being executed when running the project as a Scrapy project using
`scrapy crawl`.
`scrapy crawl title_spider`.
We recommend you do not modify this file unless you really know what you are doing.
"""
@@ -13,11 +13,15 @@
# `scrapy.utils.log` before we patch it.
import logging
from typing import Any
import scrapy.utils.log
from scrapy.utils import log as scrapy_logging
from scrapy.utils.project import get_project_settings
from apify.log import ActorLogFormatter

# If you want to change the logging level, change it here
LOGGING_LEVEL = logging.INFO
# To change the logging level, modify the `LOG_LEVEL` field in `settings.py`. If the field is not present in the file,
# Scrapy will default to `DEBUG`. This setting applies to all loggers. If you wish to change the logging level for
# a specific logger, do it in this file.
settings = get_project_settings()
LOGGING_LEVEL = settings['LOG_LEVEL']

handler = logging.StreamHandler()
handler.setFormatter(ActorLogFormatter(include_logger_name=True))
@@ -35,7 +39,7 @@
# `disable_existing_loggers` is set to False :facepalm:). We need to monkeypatch Scrapy's `configure_logging` method
# like this, so that our handler is attached right after Scrapy calls the `configure_logging` method, because
# otherwise we would lose some log messages.
old_configure_logging = scrapy.utils.log.configure_logging
old_configure_logging = scrapy_logging.configure_logging

def new_configure_logging(*args: Any, **kwargs: Any) -> None:
"""
@@ -64,7 +68,7 @@ def new_configure_logging(*args: Any, **kwargs: Any) -> None:
filelock_logger.setLevel(LOGGING_LEVEL)
hpack_logger.setLevel(LOGGING_LEVEL)

scrapy.utils.log.configure_logging = new_configure_logging
scrapy_logging.configure_logging = new_configure_logging

# Now we can do the rest of the setup
import asyncio
@@ -73,7 +77,8 @@ def new_configure_logging(*args: Any, **kwargs: Any) -> None:
from scrapy.utils.reactor import install_reactor
from .main import main

# This is necessary so that Twisted and AsyncIO work well together
# To ensure seamless compatibility between asynchronous libraries Twisted (used by Scrapy) and AsyncIO (used by Apify),
# it is highly recommended to use AsyncioSelectorReactor as the Twisted reactor
install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor')
nest_asyncio.apply()

4 changes: 3 additions & 1 deletion wrappers/python-scrapy/{projectFolder}/main.template.py
@@ -86,7 +86,9 @@ async def main() -> None:
async with Actor:
Actor.log.info('Actor is being executed...')

# Process Actor input - here you can add your own logic for handling Actor input
# Process Actor input - you can customize logic for handling Actor input here
# The `max_depth` option from Actor input overrides Scrapy's `DEPTH_LIMIT` setting
# The `start_urls` option from Actor input is combined with Scrapy's `start_urls` from your spiders
actor_input = await Actor.get_input() or {}
max_depth = actor_input.get('max_depth', LOCAL_DEFAULT_MAX_DEPTH)
start_urls = [start_url.get('url') for start_url in actor_input.get('start_urls', LOCAL_DEFAULT_START_URLS)]
