Skip to content

Commit

Permalink
initial import
Browse files Browse the repository at this point in the history
  • Loading branch information
stummjr committed Jul 19, 2016
0 parents commit b74e26b
Show file tree
Hide file tree
Showing 8 changed files with 183 additions and 0 deletions.
89 changes: 89 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# IPython Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
venv/
ENV/

# Spyder project settings
.spyderproject

# Rope project settings
.ropeproject
Empty file added books/__init__.py
Empty file.
14 changes: 14 additions & 0 deletions books/items.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class BooksItem(scrapy.Item):
    """Item container for scraped book data.

    No fields are declared yet; add them as needed, e.g.::

        title = scrapy.Field()
    """
11 changes: 11 additions & 0 deletions books/pipelines.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


class BooksPipeline(object):
    """No-op item pipeline: forwards every item to the next stage unchanged."""

    def process_item(self, item, spider):
        """Return *item* as-is; *spider* is accepted per the pipeline API but unused."""
        return item
24 changes: 24 additions & 0 deletions books/settings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# -*- coding: utf-8 -*-

# Scrapy settings for the books project.
# See http://doc.scrapy.org/en/latest/topics/settings.html

BOT_NAME = 'books'

SPIDER_MODULES = ['books.spiders']
NEWSPIDER_MODULE = 'books.spiders'

# Be polite while developing: honour robots.txt and cache responses locally.
ROBOTSTXT_OBEY = True
HTTPCACHE_ENABLED = True

SPIDER_MIDDLEWARES = {
    'scrapy_deltafetch.DeltaFetch': 100,
    'scrapy_magicfields.MagicFieldsMiddleware': 200,
}

# DeltaFetch: skip requests whose items were scraped in a previous run.
DELTAFETCH_ENABLED = True

# MagicFields: stamp every scraped item with the extra fields below.
MAGICFIELDS_ENABLED = True
MAGIC_FIELDS = {
    "timestamp": "$time",
    "spider": "$spider:name",
    "url": "scraped from $response:url",
    # Raw string so the backslashes in \w and \. reach scrapy_magicfields
    # verbatim (in a plain string they are invalid escape sequences and
    # warn on Python 3.6+, error-prone from 3.12 onward).
    # NOTE(review): the trailing "]" looks like a stray character — confirm
    # against the scrapy-magicfields "$response:url,r'...'" regex syntax.
    "domain": r"$response:url,r'https?://([\w\.]+)/']",
}
4 changes: 4 additions & 0 deletions books/spiders/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
30 changes: 30 additions & 0 deletions books/spiders/toscrape.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# -*- coding: utf-8 -*-
import scrapy


class ToscrapeSpider(scrapy.Spider):
    """Crawl books.toscrape.com and yield one dict per book detail page."""

    name = "toscrape"
    allowed_domains = ["books.toscrape.com"]
    start_urls = [
        'http://books.toscrape.com/',
    ]

    def parse(self, response):
        """Follow every book link on a listing page, then the next page.

        Yields a Request (routed to :meth:`parse_book_page`) for each book,
        plus one Request back to :meth:`parse` when a "next" link exists.
        """
        for book_url in response.css("article.product_pod > h3 > a ::attr(href)").extract():
            yield scrapy.Request(response.urljoin(book_url), callback=self.parse_book_page)
        next_page = response.css("li.next > a ::attr(href)").extract_first()
        if next_page:
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)

    def parse_book_page(self, response):
        """Extract title, price, stock count and category from a book page.

        Yields a dict; any field whose selector/regex finds nothing is None.
        """
        item = {}
        product = response.css("div.product_main")
        item["title"] = product.css("h1 ::text").extract_first()
        # Raw string + escaped dot: the original "(\d+.\d+)" let '.' match
        # any character and carried invalid escape sequences (warnings on
        # Python 3.6+). r"(\d+\.\d+)" matches only a literal decimal point.
        item["price"] = product.css("p.price_color ::text").re_first(r"(\d+\.\d+)")
        item["stock"] = product.xpath(
            "normalize-space(./p[contains(@class, 'availability')])"
        ).re_first(r"(\d+) available")
        item['category'] = response.xpath(
            "//ul[@class='breadcrumb']/li[@class='active']/preceding-sibling::li[1]/a/text()"
        ).extract_first()
        yield item
11 changes: 11 additions & 0 deletions scrapy.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = books.settings

[deploy]
#url = http://localhost:6800/
project = books

0 comments on commit b74e26b

Please sign in to comment.