Scrapy template: Minor code update and docs improvement (#236)
vdusek authored Nov 23, 2023
1 parent f019fa0 commit c5fdccf
Showing 9 changed files with 84 additions and 45 deletions.
19 changes: 12 additions & 7 deletions templates/python-scrapy/src/__main__.py
@@ -4,7 +4,7 @@
This file is specifically designed to be executed when the project is run as an Apify Actor using `apify run` locally
or being run on the Apify platform. It is not being executed when running the project as a Scrapy project using
`scrapy crawl`.
`scrapy crawl title_spider`.
We recommend you do not modify this file unless you really know what you are doing.
"""
@@ -13,11 +13,15 @@
# `scrapy.utils.log` before we patch it.
import logging
from typing import Any
import scrapy.utils.log
from scrapy.utils import log as scrapy_logging
from scrapy.utils.project import get_project_settings
from apify.log import ActorLogFormatter

# If you want to change the logging level, change it here
LOGGING_LEVEL = logging.INFO
# To change the logging level, modify the `LOG_LEVEL` field in `settings.py`. If the field is not present in the file,
# Scrapy will default to `DEBUG`. This setting applies to all loggers. If you wish to change the logging level for
# a specific logger, do it in this file.
settings = get_project_settings()
LOGGING_LEVEL = settings['LOG_LEVEL']

handler = logging.StreamHandler()
handler.setFormatter(ActorLogFormatter(include_logger_name=True))
@@ -35,7 +39,7 @@
# `disable_existing_loggers` is set to False :facepalm:). We need to monkeypatch Scrapy's `configure_logging` method
# like this, so that our handler is attached right after Scrapy calls the `configure_logging` method, because
# otherwise we would lose some log messages.
old_configure_logging = scrapy.utils.log.configure_logging
old_configure_logging = scrapy_logging.configure_logging

def new_configure_logging(*args: Any, **kwargs: Any) -> None:
"""
@@ -64,7 +68,7 @@ def new_configure_logging(*args: Any, **kwargs: Any) -> None:
filelock_logger.setLevel(LOGGING_LEVEL)
hpack_logger.setLevel(LOGGING_LEVEL)

scrapy.utils.log.configure_logging = new_configure_logging
scrapy_logging.configure_logging = new_configure_logging

# Now we can do the rest of the setup
import asyncio
@@ -73,7 +77,8 @@ def new_configure_logging(*args: Any, **kwargs: Any) -> None:
from scrapy.utils.reactor import install_reactor
from .main import main

# This is necessary so that Twisted and AsyncIO work well together
# To ensure seamless compatibility between asynchronous libraries Twisted (used by Scrapy) and AsyncIO (used by Apify),
# it is highly recommended to use AsyncioSelectorReactor as the Twisted reactor
install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor')
nest_asyncio.apply()

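For orientation, the logging setup in this file boils down to the following pattern: read LOG_LEVEL from the Scrapy project settings, build one handler with the Apify log formatter, and wrap Scrapy's configure_logging so the handler is attached right after Scrapy runs its own configuration. This is a minimal sketch assembled from the lines shown above; the specific logger names in the loop are illustrative, not an exact copy of the template.

import logging
from typing import Any

from scrapy.utils import log as scrapy_logging
from scrapy.utils.project import get_project_settings

from apify.log import ActorLogFormatter

# Logging level comes from LOG_LEVEL in settings.py (Scrapy defaults to DEBUG if it is missing).
settings = get_project_settings()
LOGGING_LEVEL = settings['LOG_LEVEL']

# One stream handler, formatted by the Apify log formatter.
handler = logging.StreamHandler()
handler.setFormatter(ActorLogFormatter(include_logger_name=True))

# Keep a reference to Scrapy's original configure_logging and wrap it, so the handler
# is re-attached right after every call and no log messages are lost.
old_configure_logging = scrapy_logging.configure_logging

def new_configure_logging(*args: Any, **kwargs: Any) -> None:
    old_configure_logging(*args, **kwargs)
    for logger_name in (None, 'apify', 'apify_client', 'scrapy'):  # illustrative logger names
        logger = logging.getLogger(logger_name)
        logger.setLevel(LOGGING_LEVEL)
        logger.addHandler(handler)

scrapy_logging.configure_logging = new_configure_logging
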
12 changes: 9 additions & 3 deletions templates/python-scrapy/src/items.py
@@ -1,6 +1,12 @@
# Define the models for your scraped items here
#
# See the Scrapy documentation: http://doc.scrapy.org/en/latest/topics/items.html
"""
Scrapy item models module
This module defines Scrapy item models for scraped data. Items represent structured data
extracted by spiders.
For detailed information on creating and utilizing items, refer to the official documentation:
https://docs.scrapy.org/en/latest/topics/items.html
"""

from scrapy import Field, Item

4 changes: 3 additions & 1 deletion templates/python-scrapy/src/main.py
@@ -86,7 +86,9 @@ async def main() -> None:
async with Actor:
Actor.log.info('Actor is being executed...')

# Process Actor input - here you can add your own logic for handling Actor input
# Process Actor input - you can customize logic for handling Actor input here
# The `max_depth` option from Actor input overrides Scrapy's `DEPTH_LIMIT` setting
# The `start_urls` option from Actor input is combined with Scrapy's `start_urls` from your spiders
actor_input = await Actor.get_input() or {}
max_depth = actor_input.get('max_depth', LOCAL_DEFAULT_MAX_DEPTH)
start_urls = [start_url.get('url') for start_url in actor_input.get('start_urls', LOCAL_DEFAULT_START_URLS)]
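The two comment lines above describe the mapping between Actor input and Scrapy configuration. A minimal sketch of how that mapping could be applied follows; the helper name and the use of Settings.set are illustrative assumptions, and only the input keys max_depth and start_urls and the DEPTH_LIMIT setting come from the template.

from __future__ import annotations

from apify import Actor
from scrapy.settings import Settings
from scrapy.utils.project import get_project_settings


async def settings_from_actor_input(default_max_depth: int, default_start_urls: list[dict]) -> tuple[Settings, list[str]]:
    # Read the Actor input, falling back to local defaults when a field is missing.
    actor_input = await Actor.get_input() or {}
    max_depth = actor_input.get('max_depth', default_max_depth)
    start_urls = [start_url.get('url') for start_url in actor_input.get('start_urls', default_start_urls)]

    # max_depth from Actor input takes precedence over DEPTH_LIMIT from settings.py.
    settings = get_project_settings()
    settings.set('DEPTH_LIMIT', max_depth, priority='cmdline')
    return settings, start_urls
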
19 changes: 14 additions & 5 deletions templates/python-scrapy/src/pipelines.py
@@ -1,14 +1,23 @@
# Define your item pipelines here
#
# See the Scrapy documentation: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
"""
Scrapy item pipelines module
import scrapy
This module defines Scrapy item pipelines for scraped data. Item pipelines are processing components
that handle the scraped items, typically used for cleaning, validating, and persisting data.
For detailed information on creating and utilizing item pipelines, refer to the official documentation:
http://doc.scrapy.org/en/latest/topics/item-pipeline.html
"""

from scrapy import Spider

from .items import TitleItem


class TitleItemPipeline:
"""
This item pipeline defines processing steps for TitleItem objects scraped by spiders.
"""

def process_item(self, item: TitleItem, spider: scrapy.Spider) -> TitleItem:
def process_item(self, item: TitleItem, spider: Spider) -> TitleItem:
# Do something with the item here, such as cleaning it or persisting it to a database
return item
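To make the "cleaning" step mentioned in the comment concrete, a pipeline can normalize the scraped title before passing the item on. This is an illustrative sketch, not part of the template; the title field name is assumed from TitleItem, and the class would still need to be registered in ITEM_PIPELINES in settings.py.

from scrapy import Spider

from .items import TitleItem


class TitleCleaningPipeline:
    """Illustrative pipeline that strips surrounding whitespace from scraped titles."""

    def process_item(self, item: TitleItem, spider: Spider) -> TitleItem:
        title = item.get('title')
        if isinstance(title, str):
            item['title'] = title.strip()
        return item
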
24 changes: 13 additions & 11 deletions templates/python-scrapy/src/settings.py
@@ -1,16 +1,18 @@
# Scrapy settings for this project
#
# For simplicity, this file contains only settings considered important or commonly used.
#
# You can find more settings consulting the documentation: http://doc.scrapy.org/en/latest/topics/settings.html
"""
Scrapy settings module
# Do not change this since it would break the Scrapy <-> Apify interaction
TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'
This module contains Scrapy settings for the project, defining various configurations and options.
# The following settings can be updated by the user
For more comprehensive details on Scrapy settings, refer to the official documentation:
http://doc.scrapy.org/en/latest/topics/settings.html
"""

# You can update these options and add new ones
BOT_NAME = 'titlebot'
SPIDER_MODULES = ['src.spiders']
DEPTH_LIMIT = 1 # This will be overridden by the `max_depth` option from Actor input if running using Apify
ITEM_PIPELINES = {'src.pipelines.TitleItemPipeline': 123}
LOG_LEVEL = 'INFO'
NEWSPIDER_MODULE = 'src.spiders'
REQUEST_FINGERPRINTER_IMPLEMENTATION = '2.7'
ROBOTSTXT_OBEY = True # obey robots.txt rules
ITEM_PIPELINES = {'src.pipelines.TitleItemPipeline': 123}
ROBOTSTXT_OBEY = True
SPIDER_MODULES = ['src.spiders']
14 changes: 9 additions & 5 deletions templates/python-scrapy/src/spiders/__init__.py
100755 → 100644
@@ -1,5 +1,9 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the Scrapy documentation for information on how to create and manage your spiders.
#
# https://docs.scrapy.org/en/latest/topics/spiders.html
"""
Scrapy spiders package
This package contains the spiders for your Scrapy project. Spiders are the classes that define how to scrape
and process data from websites.
For detailed information on creating and utilizing spiders, refer to the official documentation:
https://docs.scrapy.org/en/latest/topics/spiders.html
"""
14 changes: 9 additions & 5 deletions templates/python-scrapy/src/spiders/title.py
100755 → 100644
@@ -1,11 +1,11 @@
from typing import Generator, Union
from __future__ import annotations

from typing import Generator
from urllib.parse import urljoin

from scrapy import Request, Spider
from scrapy.responsetypes import Response

from apify import Actor

from ..items import TitleItem


@@ -16,7 +16,11 @@ class TitleSpider(Spider):

name = 'title_spider'

def parse(self, response: Response) -> Generator[Union[TitleItem, Request], None, None]:
# The `start_urls` specified in this class will be merged with the `start_urls` value from your Actor input
# when the project is executed using Apify.
start_urls = ['https://apify.com/']

def parse(self, response: Response) -> Generator[TitleItem | Request, None, None]:
"""
Parse the web page response.
@@ -26,7 +30,7 @@ def parse(self, response: Response) -> Generator[Union[TitleItem, Request], None
Yields:
Yields scraped TitleItem and Requests for links.
"""
Actor.log.info(f'TitleSpider is parsing {response}...')
self.logger.info('TitleSpider is parsing %s...', response)

# Extract and yield the TitleItem
url = response.url
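Pieced together, the spider's parse method follows roughly this shape: log the response, yield a TitleItem for the current page, then follow links so DEPTH_LIMIT can cap the crawl. The CSS selectors and the TitleItem field names are assumptions for illustration; only the signature, the start_urls value, and the logging call are taken from the diff above.

from __future__ import annotations

from typing import Generator
from urllib.parse import urljoin

from scrapy import Request, Spider
from scrapy.responsetypes import Response

from ..items import TitleItem


class TitleSpider(Spider):
    name = 'title_spider'
    start_urls = ['https://apify.com/']

    def parse(self, response: Response) -> Generator[TitleItem | Request, None, None]:
        self.logger.info('TitleSpider is parsing %s...', response)

        # Yield the scraped item for the current page.
        url = response.url
        title = response.css('title::text').get()
        yield TitleItem(url=url, title=title)

        # Follow links found on the page; DEPTH_LIMIT (or max_depth from Actor input) caps the recursion.
        for href in response.css('a::attr(href)').getall():
            yield Request(urljoin(url, href))
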
19 changes: 12 additions & 7 deletions wrappers/python-scrapy/{projectFolder}/__main__.template.py
@@ -4,7 +4,7 @@
This file is specifically designed to be executed when the project is run as an Apify Actor using `apify run` locally
or being run on the Apify platform. It is not being executed when running the project as a Scrapy project using
`scrapy crawl`.
`scrapy crawl title_spider`.
We recommend you do not modify this file unless you really know what you are doing.
"""
@@ -13,11 +13,15 @@
# `scrapy.utils.log` before we patch it.
import logging
from typing import Any
import scrapy.utils.log
from scrapy.utils import log as scrapy_logging
from scrapy.utils.project import get_project_settings
from apify.log import ActorLogFormatter

# If you want to change the logging level, change it here
LOGGING_LEVEL = logging.INFO
# To change the logging level, modify the `LOG_LEVEL` field in `settings.py`. If the field is not present in the file,
# Scrapy will default to `DEBUG`. This setting applies to all loggers. If you wish to change the logging level for
# a specific logger, do it in this file.
settings = get_project_settings()
LOGGING_LEVEL = settings['LOG_LEVEL']

handler = logging.StreamHandler()
handler.setFormatter(ActorLogFormatter(include_logger_name=True))
@@ -35,7 +39,7 @@
# `disable_existing_loggers` is set to False :facepalm:). We need to monkeypatch Scrapy's `configure_logging` method
# like this, so that our handler is attached right after Scrapy calls the `configure_logging` method, because
# otherwise we would lose some log messages.
old_configure_logging = scrapy.utils.log.configure_logging
old_configure_logging = scrapy_logging.configure_logging

def new_configure_logging(*args: Any, **kwargs: Any) -> None:
"""
@@ -64,7 +68,7 @@ def new_configure_logging(*args: Any, **kwargs: Any) -> None:
filelock_logger.setLevel(LOGGING_LEVEL)
hpack_logger.setLevel(LOGGING_LEVEL)

scrapy.utils.log.configure_logging = new_configure_logging
scrapy_logging.configure_logging = new_configure_logging

# Now we can do the rest of the setup
import asyncio
@@ -73,7 +77,8 @@ def new_configure_logging(*args: Any, **kwargs: Any) -> None:
from scrapy.utils.reactor import install_reactor
from .main import main

# This is necessary so that Twisted and AsyncIO work well together
# To ensure seamless compatibility between asynchronous libraries Twisted (used by Scrapy) and AsyncIO (used by Apify),
# it is highly recommended to use AsyncioSelectorReactor as the Twisted reactor
install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor')
nest_asyncio.apply()

4 changes: 3 additions & 1 deletion wrappers/python-scrapy/{projectFolder}/main.template.py
@@ -86,7 +86,9 @@ async def main() -> None:
async with Actor:
Actor.log.info('Actor is being executed...')

# Process Actor input - here you can add your own logic for handling Actor input
# Process Actor input - you can customize logic for handling Actor input here
# The `max_depth` option from Actor input overrides Scrapy's `DEPTH_LIMIT` setting
# The `start_urls` option from Actor input is combined with Scrapy's `start_urls` from your spiders
actor_input = await Actor.get_input() or {}
max_depth = actor_input.get('max_depth', LOCAL_DEFAULT_MAX_DEPTH)
start_urls = [start_url.get('url') for start_url in actor_input.get('start_urls', LOCAL_DEFAULT_START_URLS)]
