Commit 8970809: Proxy
tosofto committed Nov 27, 2022 (1 parent: e63e75a)
Showing 20 changed files with 171 additions and 180 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -0,0 +1,2 @@

*.pyc
Binary file modified Raw DB Tasks/__pycache__/settings.cpython-310.pyc
55 changes: 0 additions & 55 deletions Raw DB Tasks/settings.py
@@ -4,61 +4,6 @@

dotenv.load_dotenv()

appId = 1
# database details where products, categories, etc. are stored
db_name = "tasks_db"
products_table = "products"
products_category_table = "productCategory"
products_sellers_table = "productSellers"
product_price_history_table = "productPriceHistory"
app_settings_table = "appSettings"

mongodb_user = "david"
mongodb_password = "YAFV68dBmBQhoNJs"
mongodb_cluster = "uhrfxiy"

# refer to https://www.scraperapi.com/documentation/python/#getting-started to learn more about ScraperAPI
# get a proxy URL from ScraperAPI
def get_proxy():
    scraperAPI = os.getenv('SCRAPER_API_KEY')
    proxies = f"http://scraperapi.country_code=us.device_type=desktop:{scraperAPI}@proxy-server.scraperapi.com:8001"
    return proxies


# User agents to experiment with

# headers = {
#     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
#                   'Chrome/71.0.3578.98 Safari/537.36',
#     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#     'Accept-Language': 'en-US,en;q=0.5',
#     'Accept-Encoding': 'gzip',
#     'DNT': '1',  # Do Not Track Request Header
#     'Connection': 'close'
# }

# headers = {"User-Agent": "Mozilla/5.0",
#            "Accept-Language": "en-US,en;q=0.9"}


headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 '
                  'Safari/537.36 OPR/91.0.4516.95',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip',
    'Referer': 'https://www.google.com/',
    'DNT': '1',  # Do Not Track Request Header
    'Connection': 'close'
}

# headers = {
#     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
#                   'Chrome/105.0.0.0 Safari/537.36 OPR/91.0.4516.95',
#     'Accept-Language': 'en-US,en;q=0.9,es;q=0.8',
#     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,'
#               'application/signed-exchange;v=b3',
#     'Accept-Encoding': 'gzip',
#     'Referer': 'https://www.google.com/',
#     'Upgrade-Insecure-Requests': '1'
# }
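Editor's note: the get_proxy helper deleted here returns a single proxy URL in user:password@host:port form, and the headers dict accompanies outgoing requests. A minimal sketch of how the two would typically be combined with the requests library; the target URL is a placeholder, and verify=False follows ScraperAPI's proxy-mode documentation:

import requests

proxy = get_proxy()
proxies = {"http": proxy, "https": proxy}

# placeholder URL; ScraperAPI's proxy mode is normally called with verify=False
response = requests.get("https://httpbin.org/ip", headers=headers,
                        proxies=proxies, verify=False, timeout=60)
print(response.status_code, response.text)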
Binary file modified __pycache__/config.cpython-310.pyc
Binary file modified extractors/__pycache__/__init__.cpython-310.pyc
Binary file modified extractors/__pycache__/items.cpython-310.pyc
Binary file modified extractors/__pycache__/pipelines.cpython-310.pyc
Binary file modified extractors/__pycache__/settings.cpython-310.pyc
Binary file modified extractors/__pycache__/utils.cpython-310.pyc
26 changes: 16 additions & 10 deletions extractors/pipelines.py
@@ -32,13 +32,17 @@ def open_spider(self, spider):
        self.appSettings = self.db[settings.get('APP_SETTING_COL')]

        self.category = self.categoryCollection.find_one({"appId": settings.get('APP_ID')})
        # get proxy status

        # get proxy status
        spider.meta = {}
        seller = self.productSellersCollection.find_one({"sellerName":spider.name})
        if seller and seller["useProxy"] == 1:
            spider.meta["proxy"] = settings.get('PROXY')

        seller = self.productSellersCollection.find_one({"sellerName": spider.name})
        if seller and seller["useProxy"] == 1:
            spider.meta["proxy"] = settings.get('PROXY')

        # get requestInterval
        appSettings = self.appSettings.find_one({})
        if appSettings:
            spider.requestInterval = appSettings["requestInterval"]

        # get category url from db by appId.
        if self.category is not None:
@@ -47,17 +51,18 @@ def open_spider(self, spider):
            spider.categoryUrl = ""

        # get products list to find new product.
        productLists = self.productCollection.find({"productCategoryId":self.category["_id"]})
        productLists = self.productCollection.find({"productCategoryId": self.category["_id"]})
        spider.productLists = list(map(lambda product: product['productLocalId'], productLists))

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        print('====Item processing======')
        if isinstance(item, MarketItem):
            product = ItemAdapter(item).asdict()

            if product == "NA":
            if product["price"] == "NA":
                return item

            # define the data to save to database.
@@ -92,13 +97,14 @@ def process_item(self, item, spider):

            try:
                price["productPrice"] = float(sub(r'[^\d.]', '', product["price"]))
            except:
            except Exception as ex:
                print(ex)
                price["productPrice"] = float(format(0, '.2f'))

            price["productShippingFee"] = float(format(0, '.2f'))  # currently set to 0.
            productOldPrice = product["oldPrice"]

            if productOldPrice is None:
            if productOldPrice == "NA":
                price["productPriceType"] = "Regular"
                self.productPriceHistoryCollection.insert_one(price)
            else:
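Editor's note: the price handling above strips currency symbols and thousands separators with sub(r'[^\d.]', '', ...) before casting to float, and falls back to 0.00 when parsing fails. A standalone sketch of that conversion; parse_price is a hypothetical name, and the "NA" sentinel mirrors process_item:

from re import sub

def parse_price(raw):
    # hypothetical helper mirroring the pipeline's normalization: "$1,299.99" -> 1299.99
    if raw == "NA":
        return None
    try:
        return float(sub(r'[^\d.]', '', raw))
    except (ValueError, TypeError):
        return 0.0

assert parse_price("$1,299.99") == 1299.99
assert parse_price("NA") is None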
Binary file modified extractors/selectors/__pycache__/amazon.cpython-310.pyc
29 changes: 18 additions & 11 deletions extractors/selectors/amazon.py
@@ -1,6 +1,6 @@

selectors = {
    "products": ['//*[@id="search"]/div[1]/div[1]/div/span[1]/div[1]/div/div/div/div/div/div/div[2]/div/div/div[1]/h2/a[not(contains(@style,"display:none")) or not(contains(@style,"visible:hidden"))]/@href'],
    "products": ['//*[@id="search"]/div[1]/div[1]/div/span[1]/div[1]/div/div/div/div/div/div/div[2]/div/div/div['
                 '1]/h2/a[not(contains(@style,"display:none")) or not(contains(@style,"visible:hidden"))]/@href'],
    "nextPage": ['//a[contains(@class,"s-pagination-next")]/@href'],
    "brand": [
        '//a[@id="bylineInfo"]/text()',
Expand All @@ -11,22 +11,29 @@
"sellerName": [
'//a[@id="sellerProfileTriggerId"]/text()',
'//div[@tabular-attribute-name="Sold by"]/div[@class="tabular-buybox-text a-spacing-none"]/span/text()'

],
"imageLink": ["//script[contains(., 'ImageBlockATF')]/text()"],
"productTitle": ['//span[@id="productTitle"]/text()'],
"stockStatusDesc": ['//div[@id="availabilityInsideBuyBox_feature_div"]/div/div[@id="availability"]/span[@class="a-size-medium a-color-price"]/text()'],
"stockStatusDesc": ['//div[@id="availabilityInsideBuyBox_feature_div"]/div/div[@id="availability"]/span['
'@class="a-size-medium a-color-price"]/text()'],
"userRatingCount": ['//span[@id="acrCustomerReviewText"]/text()'],
"userRatingStar": ['//span[@id="acrPopover"]/@title'],
"price": [
# '//span[contains(@class,"a-price")]/span[1]/text()',
# '//div[@id="centerCol"]/div[@id="apex_desktop"]/div/div/div[@id="corePrice_desktop"]/div/table/tr/td/span',
# '//div[@id="centerCol"]/div[@id="apex_desktop"]/div/div/div[@id="corePrice_desktop"]/div/table/tr/td/span[contains(@class,"a-price") and not(contains(@data-a-strike,"true"))]',
'//div[@id="centerCol"]/div[@id="apex_desktop"]/div/div/div[@id="corePrice_desktop"]/div/table/tr/td/span[contains(@class,"a-price") and not(contains(@data-a-strike,"true"))]/span[1]/text()',
# '//span[contains(@class,"a-price")]/span[1]/text()', '//div[@id="centerCol"]/div[
# @id="apex_desktop"]/div/div/div[@id="corePrice_desktop"]/div/table/tr/td/span',
# '//div[@id="centerCol"]/div[@id="apex_desktop"]/div/div/div[@id="corePrice_desktop"]/div/table/tr/td/span[
# contains(@class,"a-price") and not(contains(@data-a-strike,"true"))]',
'//div[@id="centerCol"]/div[@id="apex_desktop"]/div/div/div[@id="corePrice_desktop"]/div/table/tr/td/span['
'contains(@class,"a-price") and not(contains(@data-a-strike,"true"))]/span[1]/text()',
'//div[@id="centerCol"]/div/div[@id="corePriceDisplay_desktop_feature_div"]/div[1]/span/span/text()'
],
"oldPrice": [
'//div[@id="centerCol"]/div[@id="apex_desktop"]/div/div/div[@id="corePrice_desktop"]/div/table/tr/td/span[contains(@class,"a-price") and @data-a-strike="true"]/span[1]/text()',
'//span[contains(@class,"a-price") and @data-a-strike="true"]/span[1]/text()',
'//div[@id="centerCol"]/div[@id="apex_desktop"]/div/div/div[@id="corePrice_desktop"]/div/table/tr/td/span['
'contains(@class,"a-price") and @data-a-strike="true"]/span[1]/text()',
# '//span[contains(@class,"a-price") and @data-a-strike="true"]/span[1]/text()',
],
"variants": [
'//li[@data-defaultasin]/@data-defaultasin'
]
}
}
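Editor's note: every entry in this dict is a list of candidate XPath expressions, which suggests that getElement (imported from extractors.utils, a file not shown in this diff) tries them in order and returns the first selection that matches, so newer Amazon page layouts can be handled by appending fallbacks. A plausible sketch under that assumption:

# hypothetical sketch of extractors.utils.getElement; the real helper is not part of this diff
def getElement(xpaths, response):
    for xpath in xpaths:
        selection = response.xpath(xpath)
        if selection:
            return selection
    # an empty SelectorList, so callers can still chain .extract_first(default="NA")
    return response.xpath(xpaths[-1])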
5 changes: 3 additions & 2 deletions extractors/settings.py
@@ -13,6 +13,7 @@
# import config

import os, sys

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from config import *

@@ -34,8 +35,8 @@
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
# 'extractors.middlewares.CustomProxyMiddleware': 350,
# 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 400,
# 'extractors.middlewares.CustomProxyMiddleware': 350,
# 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 400,
# }

# Configure item pipelines
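Editor's note: the commented-out DOWNLOADER_MIDDLEWARES block references extractors.middlewares.CustomProxyMiddleware, which this commit does not touch. Assuming it simply forwards the proxy chosen in the pipeline's open_spider (spider.meta['proxy']), a minimal sketch could look like:

# hypothetical sketch; the real extractors/middlewares.py is not shown in this diff
class CustomProxyMiddleware:
    def process_request(self, request, spider):
        # copy the per-spider proxy onto each outgoing request, if one was selected
        proxy = getattr(spider, 'meta', {}).get('proxy')
        if proxy and 'proxy' not in request.meta:
            request.meta['proxy'] = proxy
        return None  # let Scrapy continue processing the request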
Binary file modified extractors/spiders/__pycache__/__init__.cpython-310.pyc
Binary file modified extractors/spiders/__pycache__/amazon_find_spider.cpython-310.pyc
Binary file modified extractors/spiders/__pycache__/newegg.cpython-310.pyc
extractors/spiders/amazon_find_spider.py
@@ -3,9 +3,9 @@
import re
from scrapy.utils.project import get_project_settings

from ..items import MarketItem
from ..utils import getCategoryName, getElement, getRandomUAgents
from ..selectors.amazon import selectors
from extractors.items import MarketItem
from extractors.utils import getCategoryName, getElement, getRandomUAgents
from extractors.selectors.amazon import selectors

from dataclasses import asdict
from itemadapter import ItemAdapter
@@ -18,70 +18,76 @@

settings = get_project_settings()


class AmazonSpider(scrapy.Spider):
    name = "Amazon"

    baseUrl = "https://www.amazon.com"

    # custom_settings = {
    #     'CONCURRENT_REQUESTS': 30,
    #     'DOWNLOAD_DELAY': requestInterval
    # }

    def start_requests(self):
        '''
        """
        This method is to get content of given category url.
        '''
        """
        # url = "https://www.amazon.com/Azzaro-Wanted-Eau-Toilette-5-1/dp/B078P7YZ3L/ref=sxin_15_pa_sp_search_thematic_sspa?content-id=amzn1.sym.ee6a664f-a1c5-4f93-a61f-81d41af42efc%3Aamzn1.sym.ee6a664f-a1c5-4f93-a61f-81d41af42efc&crid=HQB58X9PHWMD&cv_ct_cx=dior+sauvage+men&keywords=dior+sauvage+men&pd_rd_i=B078P7YZ3L&pd_rd_r=1e0d974b-6cda-46c9-a707-8bc83fb8491a&pd_rd_w=YoqOE&pd_rd_wg=0Trhw&pf_rd_p=ee6a664f-a1c5-4f93-a61f-81d41af42efc&pf_rd_r=YZTS4H22J6C2NJ9DG4XD&qid=1669453831&sprefix=dio+savage+me%2Caps%2C340&sr=1-2-cbc80bc4-104b-44f8-8e5c-6397d5250496-spons&psc=1&spLa=ZW5jcnlwdGVkUXVhbGlmaWVyPUEyTVVNNFJKQkc4SjdTJmVuY3J5cHRlZElkPUEwMjM4Nzk4SE42S1dMTzlKTVhDJmVuY3J5cHRlZEFkSWQ9QTA3ODA4NzkxMDBGR1FYSEFNWkRIJndpZGdldE5hbWU9c3Bfc2VhcmNoX3RoZW1hdGljJmFjdGlvbj1jbGlja1JlZGlyZWN0JmRvTm90TG9nQ2xpY2s9dHJ1ZQ=="
        # self.meta["asin"] = "B078P7YZ3L"
        url = "https://www.amazon.com/New-Apple-AirPods-Max-Green/dp/B08PZDSP2Z/ref=sr_1_3?crid=1V8XTXSXHHBI2&keywords=apple+airpods+max&qid=1669453913&sprefix=apple+airpods+max%2Caps%2C335&sr=8-3"
        self.meta["asin"] = "B08PZDSP2Z"
        # request with category url
        # self.custom_request(url=self.categoryUrl, callback=self.parse_category)
        yield scrapy.Request(url=self.categoryUrl, callback=self.parse_category, headers=getRandomUAgents(settings.get('UAGENTS'), settings.get('HEADERS')), meta=self.meta)
        yield scrapy.Request(url=url, callback=self.parse_product,
                             headers=getRandomUAgents(settings.get('UAGENTS'), settings.get('HEADERS')), meta=self.meta)
        # yield scrapy.Request(url=self.categoryUrl, callback=self.parse_category, headers = getRandomUAgents(
        # settings.get('UAGENTS'), settings.get('HEADERS')), meta=self.meta)

    def parse_category(self, response):
        '''
        This method is to extract product pages from given category
        '''
    # def parse_category(self, response):
    #     '''
    #     This method is to extract product pages from given category

        # check if the Captcha exists.
        if response.css('#captchacharacters').extract_first():
            self.log("Captcha found")

        # get products from the category
        products = getElement(selectors["products"], response).getall()

        for productLink in products:

            # get asin
            if re.search(r'dp\/(.*)\/', productLink):
                asin = re.search(r'dp\/(.*)\/', productLink).group(1)
            else:
                asin = ""

            # get current link
            productUrl = urljoin(self.baseUrl, productLink)

            # get rid of unnecessary query params
            if re.search(r'https:\/\/[^\/]+\/[^\/]+\/dp\/[^\/]+', productUrl):
                realProductlink = re.search(r'https:\/\/[^\/]+\/[^\/]+\/dp\/[^\/]+', productUrl).group(0)
            else:
                realProductlink = ""

            # get product page
            if asin:
                if asin not in self.productLists:
                    self.productLists.append(asin)
                    customMeta = copy.deepcopy(self.meta)
                    customMeta['asin'] = asin
                    yield scrapy.Request(url=realProductlink, callback=self.parse_product, headers=getRandomUAgents(settings.get('UAGENTS'), settings.get('HEADERS')), meta=customMeta)

        # get next page url
        nextPage = getElement(selectors["nextPage"], response).extract_first(default="NA")
        if nextPage:
            nextUrl = urljoin(self.baseUrl, nextPage)
            yield scrapy.Request(url=nextUrl, callback=self.parse_category, headers=getRandomUAgents(settings.get('UAGENTS'), settings.get('HEADERS')), meta=self.meta)
    # '''

    # # check if the Captcha exists.
    # if response.css('#captchacharacters').extract_first():
    #     self.log("Captcha found")

    # # get products from the category
    # products = getElement(selectors["products"], response).getall()

    # for productLink in products:

    #     # get asin
    #     if re.search(r'dp\/(.*)\/', productLink):
    #         asin = re.search(r'dp\/(.*)\/', productLink).group(1)
    #     else:
    #         asin = ""

    #     # get current link
    #     productUrl = urljoin(self.baseUrl, productLink)

    #     # get rid of unnecessary query params
    #     if re.search(r'https:\/\/[^\/]+\/[^\/]+\/dp\/[^\/]+',productUrl):
    #         realProductlink = re.search(r'https:\/\/[^\/]+\/[^\/]+\/dp\/[^\/]+',productUrl).group(0)
    #     else:
    #         realProductlink = ""

    # # get product page if asin: if asin not in self.productLists: self.productLists.append(asin) customMeta =
    # copy.deepcopy(self.meta) customMeta['asin'] = asin yield scrapy.Request(url=realProductlink,
    # callback=self.parse_product,headers = getRandomUAgents(settings.get('UAGENTS'), settings.get('HEADERS')),
    # meta=customMeta)

    # # get next page url nextPage = getElement(selectors["nextPage"], response).extract_first(default="NA") if
    # nextPage: nextUrl = urljoin(self.baseUrl, nextPage) yield scrapy.Request(url=nextUrl,
    # callback=self.parse_category, headers = getRandomUAgents(settings.get('UAGENTS'), settings.get('HEADERS')),
    # meta=self.meta)

    def parse_product(self, response):
        '''
        """
        This method is to extract data from product page.
        '''
        """

        # try:
        #     with open('response.html', 'w', encoding='utf-8') as file:
@@ -113,7 +119,7 @@ def parse_product(self, response):
        # description
        productDescription = getElement(selectors["description"], response).getall()

        ## get rid of blank rows.
        # get rid of blank rows.
        while '' in productDescription:
            productDescription.remove('')
        while ' ' in productDescription:
@@ -188,14 +194,28 @@
        }

        # price
        Item["price"] = getElement(selectors["price"], response).extract_first(default = "NA")
        Item["oldPrice"] = getElement(selectors["oldPrice"], response).extract_first(default = "NA")
        Item["price"] = getElement(selectors["price"], response).extract_first(default="NA")
        Item["oldPrice"] = getElement(selectors["oldPrice"], response).extract_first(default="NA")

        #productPricessTime
        Item["productProcessTime"] = round(response.meta.get('download_latency'),2)
        # productProcessTime
        Item["productProcessTime"] = round(response.meta.get('download_latency'), 2)
        # print(download_latency)

        #productProcessSize
        Item["productProcessSize"] = round(len(response.body)/1024,2)
        # productProcessSize
        Item["productProcessSize"] = round(len(response.body) / 1024, 2)

        # other variants
        variants = getElement(selectors["variants"], response).getall()

        base_variant_url = response.url.split("/dp/", 1)[0]
        for variant in variants:
            if variant != response.meta['asin']:
                self.productLists.append(variant)
                customMeta = copy.deepcopy(self.meta)
                customMeta['asin'] = variant
                url = base_variant_url + "/dp/" + variant
                yield scrapy.Request(url=url, callback=self.parse_product,
                                     headers=getRandomUAgents(settings.get('UAGENTS'), settings.get('HEADERS')),
                                     meta=customMeta)

        yield Item
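Editor's note: getRandomUAgents (also from extractors.utils and not shown in this diff) is called for every request, which implies it rotates a random User-Agent from settings.get('UAGENTS') into a copy of the base headers so consecutive requests do not share a fingerprint. A minimal sketch under that assumption:

import random

# hypothetical sketch of extractors.utils.getRandomUAgents; not part of this diff
def getRandomUAgents(uagents, headers):
    randomHeaders = dict(headers)  # copy so the shared base headers stay untouched
    randomHeaders['User-Agent'] = random.choice(uagents)
    return randomHeaders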