Skip to content

Commit

Permalink
update
Browse files Browse the repository at this point in the history
  • Loading branch information
tosofto committed Dec 4, 2022
1 parent 8970809 commit a64a844
Show file tree
Hide file tree
Showing 15 changed files with 237 additions and 84 deletions.
Binary file modified Raw DB Tasks/__pycache__/settings.cpython-310.pyc
Binary file not shown.
Binary file modified __pycache__/config.cpython-310.pyc
Binary file not shown.
Binary file modified extractors/__pycache__/__init__.cpython-310.pyc
Binary file not shown.
Binary file modified extractors/__pycache__/items.cpython-310.pyc
Binary file not shown.
Binary file modified extractors/__pycache__/pipelines.cpython-310.pyc
Binary file not shown.
Binary file modified extractors/__pycache__/settings.cpython-310.pyc
Binary file not shown.
Binary file modified extractors/__pycache__/utils.cpython-310.pyc
Binary file not shown.
2 changes: 2 additions & 0 deletions extractors/items.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,5 @@ class MarketItem(scrapy.Item):
oldPrice = scrapy.Field()
productProcessTime= scrapy.Field()
productProcessSize= scrapy.Field()
variant= scrapy.Field()
discountType= scrapy.Field()
18 changes: 15 additions & 3 deletions extractors/pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,10 @@ def process_item(self, item, spider):
productToSave["productLocalId"] = product["productLocalId"]
productToSave["productProcessTime"] = product["productProcessTime"]
productToSave["productProcessSize"] = product["productProcessSize"]
try:
productToSave["productVariants"] = product["variant"]
except Exception as error:
print("no variant")

# add necessary data related to collections.
productToSave["lastUpdate"] = datetime.timestamp(datetime.now())
Expand Down Expand Up @@ -112,13 +116,21 @@ def process_item(self, item, spider):
try:
oldPrice = float(sub(r'[^\d.]', '', product["oldPrice"]))
currentPrice = float(sub(r'[^\d.]', '', product["price"]))
discountValue = 100 - currentPrice * 100 / oldPrice or 0
price["productDiscountValue"] = float(f'{discountValue:.2f}')
if product["discountType"] == "Percent":
discountValue = 100 - currentPrice * 100 / oldPrice or 0
elif product["discountType"] == "Fixed":
discountValue = oldPrice - currentPrice

discountValue = int(discountValue)
price["productDiscount"] = {
"productDiscountValue" : discountValue,
"productDiscountType" : product["discountType"]
}
price["productOldPrice"] = oldPrice
except Exception as inst:
print(inst)
price["productOldPrice"] = float(format(0, '.2f'))
price["productDiscountValue"] = float(format(0, '.2f'))
price["productDiscount"] = {}

self.productPriceHistoryCollection.insert_one(price)
return item
Binary file modified extractors/selectors/__pycache__/amazon.cpython-310.pyc
Binary file not shown.
53 changes: 42 additions & 11 deletions extractors/selectors/amazon.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,20 +20,51 @@
"userRatingCount": ['//span[@id="acrCustomerReviewText"]/text()'],
"userRatingStar": ['//span[@id="acrPopover"]/@title'],
"price": [
# '//span[contains(@class,"a-price")]/span[1]/text()', '//div[@id="centerCol"]/div[
# @id="apex_desktop"]/div/div/div[@id="corePrice_desktop"]/div/table/tr/td/span',
# '//div[@id="centerCol"]/div[@id="apex_desktop"]/div/div/div[@id="corePrice_desktop"]/div/table/tr/td/span[
# contains(@class,"a-price") and not(contains(@data-a-strike,"true"))]',
'//div[@id="centerCol"]/div[@id="apex_desktop"]/div/div/div[@id="corePrice_desktop"]/div/table/tr/td/span['
'contains(@class,"a-price") and not(contains(@data-a-strike,"true"))]/span[1]/text()',
'//div[@id="centerCol"]/div/div[@id="corePriceDisplay_desktop_feature_div"]/div[1]/span/span/text()'
'//div[@id="centerCol"]/div[@id="apex_desktop"]/div/div/div[@id="corePrice_desktop"]/div/table/tr/td/span[contains(@class,"a-price") and not(contains(@data-a-strike,"true"))]/span[1]/text()',
'//*[@id="corePriceDisplay_desktop_feature_div"]/div[1]/span/span[1]/text()',
'//span[contains(@class, "priceToPay")]/span[1]/text()',
'//*[@id="snsDetailPagePrice"]/span[@id="sns-base-price"]/text()',
'//*[@id="priceblock_ourprice"]/text()',
'//*[@id="corePrice_desktop"]/div/table/tr[2]/td[2]/span[1]/span[1]/text()'
],
"oldPrice": [
'//div[@id="centerCol"]/div[@id="apex_desktop"]/div/div/div[@id="corePrice_desktop"]/div/table/tr/td/span['
'contains(@class,"a-price") and @data-a-strike="true"]/span[1]/text()',
# '//span[contains(@class,"a-price") and @data-a-strike="true"]/span[1]/text()',
'//div[@id="centerCol"]/div[@id="apex_desktop"]/div/div/div[@id="corePrice_desktop"]/div/table/tr/td/span[contains(@class,"a-price") and @data-a-strike="true"]/span[1]/text()',
'//*[@id="corePrice_desktop"]/div/table/tr[1]/td[2]/span[@data-a-strike="true"]/span[1]/text()'

],
"discountType":[
'//*[@id="savingsPercentage"]/text()',
'//*[@id="corePrice_desktop"]/div/table/tr[3]/td[2]/span[1]/text()',

],
"variants": [
'//li[@data-defaultasin]/@data-defaultasin'
'//li[@data-defaultasin]/@data-dp-url',
'//option[@class="dropdownAvailable"]/@value'
],
"variantName":[
'//div[contains(@class,"twisterTextDiv")]/p/text()',
'/@data-a-html-content'
],
'variantPrice':[
'//p[contains(@class,"twisterSwatchPrice")]/text()'
],
'variantGroups':[
'//form[@id="twister"]/div[contains(@id,"variation_")]'
]
}

#price data
# '//span[contains(@class,"a-price")]/span[1]/text()', '//div[@id="centerCol"]/div[
# @id="apex_desktop"]/div/div/div[@id="corePrice_desktop"]/div/table/tr/td/span',
# '//div[@id="centerCol"]/div[@id="apex_desktop"]/div/div/div[@id="corePrice_desktop"]/div/table/tr/td/span[
# contains(@class,"a-price") and not(contains(@data-a-strike,"true"))]',

# '//span[contains(@class, "apexPriceToPay")]/span[1]/text()',
# '//div[@id="centerCol"]/div[@id="apex_desktop"]/div/div/div[@id="corePrice_desktop"]/div/table/tr/td/span[contains(@class,"a-price") and not(contains(@data-a-strike,"true"))]/span[1]/text()',
# '//div[@id="centerCol"]/div/div[@id="corePriceDisplay_desktop_feature_div"]/div[1]/span/span/text()',
# '//*[@id="snsDetailPagePrice"]/span[@id="sns-base-price"]/text()',
# '//*[@id="corePrice_desktop"]/div/table/tr[2]/td[2]/span[1]/span[1]/text()',
# '//*[@id="corePriceDisplay_desktop_feature_div"]/div[1]/span[contains(@class,"priceToPay")]/span[1]/text()'

# old price data
# '//span[contains(@class,"a-price") and @data-a-strike="true"]/span[1]/text()',
Binary file modified extractors/spiders/__pycache__/__init__.cpython-310.pyc
Binary file not shown.
Binary file modified extractors/spiders/__pycache__/newegg.cpython-310.pyc
Binary file not shown.
223 changes: 154 additions & 69 deletions extractors/spiders/amazon.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from scrapy.utils.project import get_project_settings

from extractors.items import MarketItem
from extractors.utils import getCategoryName, getElement, getRandomUAgents
from extractors.utils import getCategoryName, getElement, getRandomUAgents, cleanUrl
from extractors.selectors.amazon import selectors

from dataclasses import asdict
Expand All @@ -13,6 +13,7 @@
from urllib.parse import urljoin
from urllib.parse import unquote
import copy
import uuid

import random

Expand All @@ -24,6 +25,9 @@ class AmazonSpider(scrapy.Spider):

baseUrl = "https://www.amazon.com"

env = "dev"
# env = "prod"

# custom_settings = {
# 'CONCURRENT_REQUESTS':30,
# 'DOWNLOAD_DELAY': requestInterval
Expand All @@ -34,68 +38,76 @@ def start_requests(self):
This method is to get content of given category url.
"""
# url = "https://www.amazon.com/Azzaro-Wanted-Eau-Toilette-5-1/dp/B078P7YZ3L/ref=sxin_15_pa_sp_search_thematic_sspa?content-id=amzn1.sym.ee6a664f-a1c5-4f93-a61f-81d41af42efc%3Aamzn1.sym.ee6a664f-a1c5-4f93-a61f-81d41af42efc&crid=HQB58X9PHWMD&cv_ct_cx=dior+sauvage+men&keywords=dior+sauvage+men&pd_rd_i=B078P7YZ3L&pd_rd_r=1e0d974b-6cda-46c9-a707-8bc83fb8491a&pd_rd_w=YoqOE&pd_rd_wg=0Trhw&pf_rd_p=ee6a664f-a1c5-4f93-a61f-81d41af42efc&pf_rd_r=YZTS4H22J6C2NJ9DG4XD&qid=1669453831&sprefix=dio+savage+me%2Caps%2C340&sr=1-2-cbc80bc4-104b-44f8-8e5c-6397d5250496-spons&psc=1&spLa=ZW5jcnlwdGVkUXVhbGlmaWVyPUEyTVVNNFJKQkc4SjdTJmVuY3J5cHRlZElkPUEwMjM4Nzk4SE42S1dMTzlKTVhDJmVuY3J5cHRlZEFkSWQ9QTA3ODA4NzkxMDBGR1FYSEFNWkRIJndpZGdldE5hbWU9c3Bfc2VhcmNoX3RoZW1hdGljJmFjdGlvbj1jbGlja1JlZGlyZWN0JmRvTm90TG9nQ2xpY2s9dHJ1ZQ=="
# self.meta["asin"] = "B078P7YZ3L"
url = "https://www.amazon.com/New-Apple-AirPods-Max-Green/dp/B08PZDSP2Z/ref=sr_1_3?crid=1V8XTXSXHHBI2&keywords=apple+airpods+max&qid=1669453913&sprefix=apple+airpods+max%2Caps%2C335&sr=8-3"
self.meta["asin"] = "B08PZDSP2Z"
# request with category url
yield scrapy.Request(url=url, callback=self.parse_product,
headers=getRandomUAgents(settings.get('UAGENTS'), settings.get('HEADERS')), meta=self.meta)
# yield scrapy.Request(url=self.categoryUrl, callback=self.parse_category, headers = getRandomUAgents(
# settings.get('UAGENTS'), settings.get('HEADERS')), meta=self.meta)

# def parse_category(self, response):
# '''
# This method is to extract product pages from given category

# '''

# # check if the Captcha exists.
# if response.css('#captchacharacters').extract_first():
# self.log("Captcha found")

# # get products from the category
# products = getElement(selectors["products"], response).getall()

# for productLink in products:

# # get asin
# if re.search(r'dp\/(.*)\/', productLink):
# asin = re.search(r'dp\/(.*)\/', productLink).group(1)
# else:
# asin = ""

# # get current link
# productUrl = urljoin(self.baseUrl, productLink)

# # get rid of unnecessary query params
# if re.search(r'https:\/\/[^\/]+\/[^\/]+\/dp\/[^\/]+',productUrl):
# realProductlink = re.search(r'https:\/\/[^\/]+\/[^\/]+\/dp\/[^\/]+',productUrl).group(0)
# else:
# realProductlink = ""

# # get product page if asin: if asin not in self.productLists: self.productLists.append(asin) customMeta =
# copy.deepcopy(self.meta) customMeta['asin'] = asin yield scrapy.Request(url=realProductlink,
# callback=self.parse_product,headers = getRandomUAgents(settings.get('UAGENTS'), settings.get('HEADERS')),
# meta=customMeta)

# # get next page url nextPage = getElement(selectors["nextPage"], response).extract_first(default="NA") if
# nextPage: nextUrl = urljoin(self.baseUrl, nextPage) yield scrapy.Request(url=nextUrl,
# callback=self.parse_category, headers = getRandomUAgents(settings.get('UAGENTS'), settings.get('HEADERS')),
# meta=self.meta)

def parse_product(self, response):
test_urls = [
'https://www.amazon.com/DreamController-Original-Controller-Compatible-Wireless/dp/B09V37CLLR?th=1',
'https://www.amazon.com/Razer-Universal-Quick-Charging-Xbox-S/dp/B09DHSJ4SZ',
'https://www.amazon.com/CableMod-CM-PCSR-FKIT-NKW-R-Cable-Kit-White/dp/B089KPWW3J?th=1',
'https://www.amazon.com/Azzaro-Most-Wanted-Parfum-Fragrance/dp/B09VN2FCDF/?_encoding=UTF8&pd_rd_w=jVQKE&content-id=amzn1.sym.aa5d5fb8-9ab9-46ea-8709-d60f551faa80&pf_rd_p=aa5d5fb8-9ab9-46ea-8709-d60f551faa80&pf_rd_r=F2CTCZ402NYW0D04S2DQ&pd_rd_wg=7duSD&pd_rd_r=f5ad392d-c089-448e-afc3-213f9cefcfc3&ref_=pd_gw_deals_gi'

]
if self.env == "dev":
for url in test_urls:
# self.meta["asin"] = "B08WC2SMSN"
asin = re.search(r'\/[0-9A-Z]{10}',url).group(0)
asin = asin[1:]
self.meta['asin'] = asin
self.productLists = []
# request with category url
yield scrapy.Request(url=cleanUrl(url), callback=self.parse_product,
headers=getRandomUAgents(settings.get('UAGENTS'), settings.get('HEADERS')), meta=self.meta, cb_kwargs={"isProduct":True})
else:
yield scrapy.Request(url=cleanUrl(self.categoryUrl), callback=self.parse_category, headers = getRandomUAgents(
settings.get('UAGENTS'), settings.get('HEADERS')), meta=self.meta)

def parse_category(self, response):
    """
    Extract product pages from the given category page and follow pagination.

    For every product link found on the page, the ASIN is pulled out of the
    link, the link is reduced to its ``https://<host>/<slug>/dp/<asin>`` form
    and, if that ASIN has not been requested yet, a request for the product
    page is yielded with ``parse_product`` as callback. Afterwards the next
    category page (if any) is requested with this method as callback.

    :param response: response of the category page.
    :return: generator of ``scrapy.Request`` objects.
    """

    # check if the Captcha exists (only logged; parsing continues regardless).
    if response.css('#captchacharacters').extract_first():
        self.log("Captcha found")

    # get products from the category
    products = getElement(selectors["products"], response).getall()

    for productLink in products:

        # get asin
        asinMatch = re.search(r'dp\/(.*)\/', productLink)
        asin = asinMatch.group(1) if asinMatch else ""

        # get current link
        productUrl = urljoin(self.baseUrl, productLink)

        # get rid of unnecessary query params
        linkMatch = re.search(r'https:\/\/[^\/]+\/[^\/]+\/dp\/[^\/]+', productUrl)
        realProductlink = linkMatch.group(0) if linkMatch else ""

        # Without both an ASIN and a clean URL there is nothing to request;
        # scrapy.Request would raise on an empty URL and abort this generator.
        if not asin or not realProductlink:
            continue

        # skip products that were already scheduled
        if asin in self.productLists:
            continue

        # get product page
        self.productLists.append(asin)
        customMeta = copy.deepcopy(self.meta)
        customMeta['asin'] = asin
        yield scrapy.Request(
            url=realProductlink,
            callback=self.parse_product,
            headers=getRandomUAgents(settings.get('UAGENTS'), settings.get('HEADERS')),
            meta=customMeta,
            cb_kwargs={"isProduct": True},
        )

    # get next page url
    nextPage = getElement(selectors["nextPage"], response).extract_first(default="NA")
    # "NA" is the placeholder returned when no next-page link exists; it is a
    # truthy string, so it must be excluded explicitly to stop on the last page.
    if nextPage and nextPage != "NA":
        nextUrl = urljoin(self.baseUrl, nextPage)
        yield scrapy.Request(
            url=cleanUrl(nextUrl),
            callback=self.parse_category,
            headers=getRandomUAgents(settings.get('UAGENTS'), settings.get('HEADERS')),
            meta=self.meta,
        )

def parse_product(self, response, isProduct = False):
"""
This method is to extract data from product page.
"""

# try:
# with open('response.html', 'w', encoding='utf-8') as file:
# file.write(response.body.decode('utf-8'))
# file.close()
# except Exception:
# print(Exception)

# check if the recaptcha exists.
if response.css('#captchacharacters').extract_first():
self.log("Captcha found ")
Expand Down Expand Up @@ -196,6 +208,20 @@ def parse_product(self, response):
# price
Item["price"] = getElement(selectors["price"], response).extract_first(default="NA")
Item["oldPrice"] = getElement(selectors["oldPrice"], response).extract_first(default="NA")
discountTypeList = getElement(selectors["discountType"], response).getall()

if Item["price"] != "NA" and Item["oldPrice"] != "NA":

if len(discountTypeList) > 1:
discountType = discountTypeList[1]
else:
discountType = "Fixed"
else:
discountType = "NA"
if '%' in discountType:
discountType = "Percent"

Item["discountType"] = discountType

# productProcessTime
Item["productProcessTime"] = round(response.meta.get('download_latency'), 2)
Expand All @@ -205,17 +231,76 @@ def parse_product(self, response):
Item["productProcessSize"] = round(len(response.body) / 1024, 2)

# other variants

if isProduct:
variantId = str(uuid.uuid5(uuid.NAMESPACE_DNS, response.meta['asin']))
else:
variantId = response.meta["variantId"]

variantGroups = getElement(selectors["variantGroups"], response)

variants = getElement(selectors["variants"], response).getall()

base_variant_url = response.url.split("/dp/", 1)[0]
for variant in variants:
if variant != response.meta['asin']:
self.productLists.append(variant)
customMeta = copy.deepcopy(self.meta)
customMeta['asin'] = variant
url = base_variant_url + "/dp/" + variant
yield scrapy.Request(url=url, callback=self.parse_product,
headers=getRandomUAgents(settings.get('UAGENTS'), settings.get('HEADERS')),
meta=customMeta)
variantPrices = getElement(selectors["variantPrice"], response).getall()

if len(variantPrices) <2 and len(variantGroups) < 2:
variantId = "NA"
print('HERE?????')
print(len(variantPrices))
print(len(variantGroups))

#variantId
try:
if response.meta["variantId"] != "NA":
Item["variant"] = {
"variantId": response.meta["variantId"],
"variantName": response.meta["variantName"]
}
except Exception as inst:
if len(variantPrices) > 1:
variantName = response.xpath('//li[@data-defaultasin="'+Item['productLocalId']+'"]' + selectors["variantName"][0]).get()
Item["variant"] = {
"variantId": variantId,
"variantName": variantName
}
if len(variantGroups) > 1:
variantName = "Many Variants"
Item["variant"] = {
"variantId": variantId,
"variantName": variantName
}
for temp_variant in variants:
r = re.search(r'\/[A-Z0-9]{10}\/',temp_variant)
if r is not None:
variant = r.group(0)
variant = variant[1:-1]
else:
r = re.search(r',[A-Z0-9]{10}',temp_variant)
if r is not None:
variant = r.group(0)
variant = variant[1:]
else:
variant = ""

if variant != "" and variant != response.meta['asin']:
if variant not in self.productLists:
self.productLists.append(variant)
customMeta = copy.deepcopy(self.meta)
customMeta['asin'] = variant

if len(variantGroups) > 1:
variantName = "Many Variants"
else:
variantName = response.xpath('//li[@data-defaultasin="'+variant+'"]' + selectors["variantName"][0]).get(default = "NA")
if variantName == "NA":
variantName = response.xpath('//option[contains(@value,"'+variant+'")]' + selectors["variantName"][1]).get(default = "NA")

customMeta["variantId"] = variantId
customMeta["variantName"] = variantName
url = re.sub(r'\/[0-9A-Z]{10}','/'+variant, response.url)

yield scrapy.Request(url=cleanUrl(url), callback=self.parse_product,
headers=getRandomUAgents(settings.get('UAGENTS'), settings.get('HEADERS')),
meta=customMeta)

yield Item
Loading

0 comments on commit a64a844

Please sign in to comment.