Skip to content

Commit

Permalink
initial import
Browse files Browse the repository at this point in the history
  • Loading branch information
stummjr committed Jul 19, 2016
0 parents commit b74e26b
Show file tree
Hide file tree
Showing 8 changed files with 183 additions and 0 deletions.
89 changes: 89 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# IPython Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
venv/
ENV/

# Spyder project settings
.spyderproject

# Rope project settings
.ropeproject
Empty file added books/__init__.py
Empty file.
14 changes: 14 additions & 0 deletions books/items.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class BooksItem(scrapy.Item):
    """Item container for scraped book data.

    No fields are declared yet; add them as needed, e.g.::

        title = scrapy.Field()
    """
11 changes: 11 additions & 0 deletions books/pipelines.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


class BooksPipeline(object):
    """No-op item pipeline: forwards every item to the next stage unchanged."""

    def process_item(self, item, spider):
        """Return *item* as-is; *spider* is accepted per the pipeline API but unused."""
        return item
24 changes: 24 additions & 0 deletions books/settings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# -*- coding: utf-8 -*-

# Scrapy settings for the books project.
# See http://doc.scrapy.org/en/latest/topics/settings.html

BOT_NAME = 'books'

SPIDER_MODULES = ['books.spiders']
NEWSPIDER_MODULE = 'books.spiders'

# Be polite while developing: honour robots.txt and cache responses locally.
ROBOTSTXT_OBEY = True
HTTPCACHE_ENABLED = True

SPIDER_MIDDLEWARES = {
    'scrapy_deltafetch.DeltaFetch': 100,
    'scrapy_magicfields.MagicFieldsMiddleware': 200,
}

# DeltaFetch: skip requests whose items were scraped in a previous run.
DELTAFETCH_ENABLED = True

# MagicFields: stamp every scraped item with the extra fields below.
MAGICFIELDS_ENABLED = True
MAGIC_FIELDS = {
    "timestamp": "$time",
    "spider": "$spider:name",
    "url": "scraped from $response:url",
    # Raw string so the backslashes in \w and \. reach scrapy_magicfields
    # verbatim (in a plain string they are invalid escape sequences and
    # warn on Python 3.6+, error-prone from 3.12 onward).
    # NOTE(review): the trailing "]" looks like a stray character — confirm
    # against the scrapy-magicfields "$response:url,r'...'" regex syntax.
    "domain": r"$response:url,r'https?://([\w\.]+)/']",
}
4 changes: 4 additions & 0 deletions books/spiders/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
30 changes: 30 additions & 0 deletions books/spiders/toscrape.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# -*- coding: utf-8 -*-
import scrapy


class ToscrapeSpider(scrapy.Spider):
    """Crawl books.toscrape.com and yield one dict per book detail page."""

    name = "toscrape"
    allowed_domains = ["books.toscrape.com"]
    start_urls = [
        'http://books.toscrape.com/',
    ]

    def parse(self, response):
        """Follow every book link on a listing page, then the next page.

        Yields a Request (routed to :meth:`parse_book_page`) for each book,
        plus one Request back to :meth:`parse` when a "next" link exists.
        """
        for book_url in response.css("article.product_pod > h3 > a ::attr(href)").extract():
            yield scrapy.Request(response.urljoin(book_url), callback=self.parse_book_page)
        next_page = response.css("li.next > a ::attr(href)").extract_first()
        if next_page:
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)

    def parse_book_page(self, response):
        """Extract title, price, stock count and category from a book page.

        Yields a dict; any field whose selector/regex finds nothing is None.
        """
        item = {}
        product = response.css("div.product_main")
        item["title"] = product.css("h1 ::text").extract_first()
        # Raw string + escaped dot: the original "(\d+.\d+)" let '.' match
        # any character and carried invalid escape sequences (warnings on
        # Python 3.6+). r"(\d+\.\d+)" matches only a literal decimal point.
        item["price"] = product.css("p.price_color ::text").re_first(r"(\d+\.\d+)")
        item["stock"] = product.xpath(
            "normalize-space(./p[contains(@class, 'availability')])"
        ).re_first(r"(\d+) available")
        item['category'] = response.xpath(
            "//ul[@class='breadcrumb']/li[@class='active']/preceding-sibling::li[1]/a/text()"
        ).extract_first()
        yield item
11 changes: 11 additions & 0 deletions scrapy.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = books.settings

[deploy]
#url = http://localhost:6800/
project = books

0 comments on commit b74e26b

Please sign in to comment.