From a64a8445aeea0c10d3c98ba3144262d0082eff67 Mon Sep 17 00:00:00 2001
From: Richard
Date: Sun, 4 Dec 2022 10:51:48 -0600
Subject: [PATCH] update

---
 .../__pycache__/settings.cpython-310.pyc      | Bin 351 -> 316 bytes
 __pycache__/config.cpython-310.pyc            | Bin 8915 -> 8880 bytes
 .../__pycache__/__init__.cpython-310.pyc      | Bin 193 -> 158 bytes
 extractors/__pycache__/items.cpython-310.pyc  | Bin 699 -> 707 bytes
 .../__pycache__/pipelines.cpython-310.pyc     | Bin 3590 -> 3801 bytes
 .../__pycache__/settings.cpython-310.pyc      | Bin 803 -> 768 bytes
 extractors/__pycache__/utils.cpython-310.pyc  | Bin 949 -> 1359 bytes
 extractors/items.py                           |   2 +
 extractors/pipelines.py                       |  18 +-
 .../__pycache__/amazon.cpython-310.pyc        | Bin 1856 -> 2664 bytes
 extractors/selectors/amazon.py                |  53 ++++-
 .../__pycache__/__init__.cpython-310.pyc      | Bin 201 -> 166 bytes
 .../__pycache__/newegg.cpython-310.pyc        | Bin 1887 -> 1852 bytes
 extractors/spiders/amazon.py                  | 223 ++++++++++++------
 extractors/utils.py                           |  25 +-
 15 files changed, 237 insertions(+), 84 deletions(-)

diff --git a/Raw DB Tasks/__pycache__/settings.cpython-310.pyc b/Raw DB Tasks/__pycache__/settings.cpython-310.pyc
index 3a1f70a..1451921 100644
Binary files a/Raw DB Tasks/__pycache__/settings.cpython-310.pyc and b/Raw DB Tasks/__pycache__/settings.cpython-310.pyc differ
diff --git a/__pycache__/config.cpython-310.pyc b/__pycache__/config.cpython-310.pyc
index bc3cd4e..bf94cfa 100644
Binary files a/__pycache__/config.cpython-310.pyc and b/__pycache__/config.cpython-310.pyc differ
diff --git a/extractors/__pycache__/__init__.cpython-310.pyc b/extractors/__pycache__/__init__.cpython-310.pyc
Binary files a/extractors/__pycache__/__init__.cpython-310.pyc and b/extractors/__pycache__/__init__.cpython-310.pyc differ
diff --git a/extractors/__pycache__/items.cpython-310.pyc b/extractors/__pycache__/items.cpython-310.pyc
Binary files a/extractors/__pycache__/items.cpython-310.pyc and b/extractors/__pycache__/items.cpython-310.pyc differ
diff --git a/extractors/__pycache__/pipelines.cpython-310.pyc b/extractors/__pycache__/pipelines.cpython-310.pyc
index 42ecea5..9db3d1a 100644
Binary files a/extractors/__pycache__/pipelines.cpython-310.pyc and b/extractors/__pycache__/pipelines.cpython-310.pyc differ
diff --git a/extractors/__pycache__/settings.cpython-310.pyc b/extractors/__pycache__/settings.cpython-310.pyc
index 78d6ec9..8ef0207 100644
Binary files a/extractors/__pycache__/settings.cpython-310.pyc and b/extractors/__pycache__/settings.cpython-310.pyc differ
diff --git a/extractors/__pycache__/utils.cpython-310.pyc b/extractors/__pycache__/utils.cpython-310.pyc
index ffbb07f..7819fe6 100644
Binary files a/extractors/__pycache__/utils.cpython-310.pyc and b/extractors/__pycache__/utils.cpython-310.pyc differ
diff --git a/extractors/selectors/__pycache__/amazon.cpython-310.pyc b/extractors/selectors/__pycache__/amazon.cpython-310.pyc
Binary files a/extractors/selectors/__pycache__/amazon.cpython-310.pyc and b/extractors/selectors/__pycache__/amazon.cpython-310.pyc differ
diff --git a/extractors/selectors/amazon.py b/extractors/selectors/amazon.py
index 70b22ec..d871e9c 100644
--- a/extractors/selectors/amazon.py
+++ b/extractors/selectors/amazon.py
@@ -20,20 +20,51 @@
     "userRatingCount": ['//span[@id="acrCustomerReviewText"]/text()'],
     "userRatingStar": ['//span[@id="acrPopover"]/@title'],
     "price": [
-        # '//span[contains(@class,"a-price")]/span[1]/text()', '//div[@id="centerCol"]/div[
-        # @id="apex_desktop"]/div/div/div[@id="corePrice_desktop"]/div/table/tr/td/span',
-        # '//div[@id="centerCol"]/div[@id="apex_desktop"]/div/div/div[@id="corePrice_desktop"]/div/table/tr/td/span[
-        # contains(@class,"a-price") and not(contains(@data-a-strike,"true"))]',
-        '//div[@id="centerCol"]/div[@id="apex_desktop"]/div/div/div[@id="corePrice_desktop"]/div/table/tr/td/span['
-        'contains(@class,"a-price") and not(contains(@data-a-strike,"true"))]/span[1]/text()',
-        '//div[@id="centerCol"]/div/div[@id="corePriceDisplay_desktop_feature_div"]/div[1]/span/span/text()'
+        '//div[@id="centerCol"]/div[@id="apex_desktop"]/div/div/div[@id="corePrice_desktop"]/div/table/tr/td/span[contains(@class,"a-price") and not(contains(@data-a-strike,"true"))]/span[1]/text()',
+        '//*[@id="corePriceDisplay_desktop_feature_div"]/div[1]/span/span[1]/text()',
+        '//span[contains(@class, "priceToPay")]/span[1]/text()',
+        '//*[@id="snsDetailPagePrice"]/span[@id="sns-base-price"]/text()',
+        '//*[@id="priceblock_ourprice"]/text()',
+        '//*[@id="corePrice_desktop"]/div/table/tr[2]/td[2]/span[1]/span[1]/text()'
     ],
     "oldPrice": [
-        '//div[@id="centerCol"]/div[@id="apex_desktop"]/div/div/div[@id="corePrice_desktop"]/div/table/tr/td/span['
-        'contains(@class,"a-price") and @data-a-strike="true"]/span[1]/text()',
-        # '//span[contains(@class,"a-price") and @data-a-strike="true"]/span[1]/text()',
+        '//div[@id="centerCol"]/div[@id="apex_desktop"]/div/div/div[@id="corePrice_desktop"]/div/table/tr/td/span[contains(@class,"a-price") and @data-a-strike="true"]/span[1]/text()',
+        '//*[@id="corePrice_desktop"]/div/table/tr[1]/td[2]/span[@data-a-strike="true"]/span[1]/text()'
+    ],
+    "discountType": [
+        '//*[@id="savingsPercentage"]/text()',
+        '//*[@id="corePrice_desktop"]/div/table/tr[3]/td[2]/span[1]/text()',
+    ],
     "variants": [
-        '//li[@data-defaultasin]/@data-defaultasin'
+        '//li[@data-defaultasin]/@data-dp-url',
+        '//option[@class="dropdownAvailable"]/@value'
+    ],
+    "variantName": [
+        '//div[contains(@class,"twisterTextDiv")]/p/text()',
+        '/@data-a-html-content'
+    ],
+    "variantPrice": [
+        '//p[contains(@class,"twisterSwatchPrice")]/text()'
+    ],
+    "variantGroups": [
+        '//form[@id="twister"]/div[contains(@id,"variation_")]'
     ]
 }
+
+# price data
+# '//span[contains(@class,"a-price")]/span[1]/text()', '//div[@id="centerCol"]/div[
+# @id="apex_desktop"]/div/div/div[@id="corePrice_desktop"]/div/table/tr/td/span',
+# '//div[@id="centerCol"]/div[@id="apex_desktop"]/div/div/div[@id="corePrice_desktop"]/div/table/tr/td/span[
+# contains(@class,"a-price") and not(contains(@data-a-strike,"true"))]',
+
+# '//span[contains(@class, "apexPriceToPay")]/span[1]/text()',
+# '//div[@id="centerCol"]/div[@id="apex_desktop"]/div/div/div[@id="corePrice_desktop"]/div/table/tr/td/span[contains(@class,"a-price") and not(contains(@data-a-strike,"true"))]/span[1]/text()',
+# '//div[@id="centerCol"]/div/div[@id="corePriceDisplay_desktop_feature_div"]/div[1]/span/span/text()',
+# '//*[@id="snsDetailPagePrice"]/span[@id="sns-base-price"]/text()',
+# '//*[@id="corePrice_desktop"]/div/table/tr[2]/td[2]/span[1]/span[1]/text()',
+# '//*[@id="corePriceDisplay_desktop_feature_div"]/div[1]/span[contains(@class,"priceToPay")]/span[1]/text()'
+
+# old price data
+# '//span[contains(@class,"a-price") and @data-a-strike="true"]/span[1]/text()',
\ No newline at end of file
diff --git a/extractors/spiders/__pycache__/__init__.cpython-310.pyc b/extractors/spiders/__pycache__/__init__.cpython-310.pyc
index 4cced2f..ab1fb6b 100644
Binary files a/extractors/spiders/__pycache__/__init__.cpython-310.pyc and b/extractors/spiders/__pycache__/__init__.cpython-310.pyc differ
diff --git a/extractors/spiders/__pycache__/newegg.cpython-310.pyc b/extractors/spiders/__pycache__/newegg.cpython-310.pyc
Binary files a/extractors/spiders/__pycache__/newegg.cpython-310.pyc and b/extractors/spiders/__pycache__/newegg.cpython-310.pyc differ
diff --git a/extractors/spiders/amazon.py b/extractors/spiders/amazon.py
index 580408f..2705d2d 100644
--- a/extractors/spiders/amazon.py
+++ b/extractors/spiders/amazon.py
@@ -4,7 +4,7 @@
 from scrapy.utils.project import get_project_settings
 from extractors.items import MarketItem
-from extractors.utils import getCategoryName, getElement, getRandomUAgents
+from extractors.utils import getCategoryName, getElement, getRandomUAgents, cleanUrl
 from extractors.selectors.amazon import selectors
 from dataclasses import asdict
@@ -13,6 +13,7 @@
 from urllib.parse import urljoin
 from urllib.parse import unquote
 import copy
+import uuid
 import random
@@ -24,6 +25,9 @@ class AmazonSpider(scrapy.Spider):
     baseUrl = "https://www.amazon.com"
 
+    env = "dev"
+    # env = "prod"
+
     # custom_settings = {
     #     'CONCURRENT_REQUESTS':30,
     #     'DOWNLOAD_DELAY': requestInterval
@@ -34,68 +38,76 @@ def start_requests(self):
         This method is to get content of given category url.
         """
 
-        # url = "https://www.amazon.com/Azzaro-Wanted-Eau-Toilette-5-1/dp/B078P7YZ3L/ref=sxin_15_pa_sp_search_thematic_sspa?content-id=amzn1.sym.ee6a664f-a1c5-4f93-a61f-81d41af42efc%3Aamzn1.sym.ee6a664f-a1c5-4f93-a61f-81d41af42efc&crid=HQB58X9PHWMD&cv_ct_cx=dior+sauvage+men&keywords=dior+sauvage+men&pd_rd_i=B078P7YZ3L&pd_rd_r=1e0d974b-6cda-46c9-a707-8bc83fb8491a&pd_rd_w=YoqOE&pd_rd_wg=0Trhw&pf_rd_p=ee6a664f-a1c5-4f93-a61f-81d41af42efc&pf_rd_r=YZTS4H22J6C2NJ9DG4XD&qid=1669453831&sprefix=dio+savage+me%2Caps%2C340&sr=1-2-cbc80bc4-104b-44f8-8e5c-6397d5250496-spons&psc=1&spLa=ZW5jcnlwdGVkUXVhbGlmaWVyPUEyTVVNNFJKQkc4SjdTJmVuY3J5cHRlZElkPUEwMjM4Nzk4SE42S1dMTzlKTVhDJmVuY3J5cHRlZEFkSWQ9QTA3ODA4NzkxMDBGR1FYSEFNWkRIJndpZGdldE5hbWU9c3Bfc2VhcmNoX3RoZW1hdGljJmFjdGlvbj1jbGlja1JlZGlyZWN0JmRvTm90TG9nQ2xpY2s9dHJ1ZQ=="
-        # self.meta["asin"] = "B078P7YZ3L"
-        url = "https://www.amazon.com/New-Apple-AirPods-Max-Green/dp/B08PZDSP2Z/ref=sr_1_3?crid=1V8XTXSXHHBI2&keywords=apple+airpods+max&qid=1669453913&sprefix=apple+airpods+max%2Caps%2C335&sr=8-3"
-        self.meta["asin"] = "B08PZDSP2Z"
-        # request with category url
-        yield scrapy.Request(url=url, callback=self.parse_product,
-                             headers=getRandomUAgents(settings.get('UAGENTS'), settings.get('HEADERS')), meta=self.meta)
-        # yield scrapy.Request(url=self.categoryUrl, callback=self.parse_category, headers = getRandomUAgents(
-        #     settings.get('UAGENTS'), settings.get('HEADERS')), meta=self.meta)
-
-    # def parse_category(self, response):
-    #     '''
-    #     This method is to extract product pages from given category
-
-    #     '''
-
-    #     # check if the Captcha exists.
-    #     if response.css('#captchacharacters').extract_first():
-    #         self.log("Captcha found")
-
-    #     # get products from the category
-    #     products = getElement(selectors["products"], response).getall()
-
-    #     for productLink in products:
-
-    #         # get asin
-    #         if re.search(r'dp\/(.*)\/', productLink):
-    #             asin = re.search(r'dp\/(.*)\/', productLink).group(1)
-    #         else:
-    #             asin = ""
-
-    #         # get current link
-    #         productUrl = urljoin(self.baseUrl, productLink)
-
-    #         # get rid of unnecessary query params
-    #         if re.search(r'https:\/\/[^\/]+\/[^\/]+\/dp\/[^\/]+', productUrl):
-    #             realProductlink = re.search(r'https:\/\/[^\/]+\/[^\/]+\/dp\/[^\/]+', productUrl).group(0)
-    #         else:
-    #             realProductlink = ""
-
-    #         # get product page
-    #         if asin:
-    #             if asin not in self.productLists:
-    #                 self.productLists.append(asin)
-    #                 customMeta = copy.deepcopy(self.meta)
-    #                 customMeta['asin'] = asin
-    #                 yield scrapy.Request(url=realProductlink, callback=self.parse_product, headers=getRandomUAgents(settings.get('UAGENTS'), settings.get('HEADERS')), meta=customMeta)
-
-    #     # get next page url
-    #     nextPage = getElement(selectors["nextPage"], response).extract_first(default="NA")
-    #     if nextPage:
-    #         nextUrl = urljoin(self.baseUrl, nextPage)
-    #         yield scrapy.Request(url=nextUrl, callback=self.parse_category, headers=getRandomUAgents(settings.get('UAGENTS'), settings.get('HEADERS')), meta=self.meta)
-
-    def parse_product(self, response):
+        test_urls = [
+            'https://www.amazon.com/DreamController-Original-Controller-Compatible-Wireless/dp/B09V37CLLR?th=1',
+            'https://www.amazon.com/Razer-Universal-Quick-Charging-Xbox-S/dp/B09DHSJ4SZ',
+            'https://www.amazon.com/CableMod-CM-PCSR-FKIT-NKW-R-Cable-Kit-White/dp/B089KPWW3J?th=1',
+            'https://www.amazon.com/Azzaro-Most-Wanted-Parfum-Fragrance/dp/B09VN2FCDF/?_encoding=UTF8&pd_rd_w=jVQKE&content-id=amzn1.sym.aa5d5fb8-9ab9-46ea-8709-d60f551faa80&pf_rd_p=aa5d5fb8-9ab9-46ea-8709-d60f551faa80&pf_rd_r=F2CTCZ402NYW0D04S2DQ&pd_rd_wg=7duSD&pd_rd_r=f5ad392d-c089-448e-afc3-213f9cefcfc3&ref_=pd_gw_deals_gi'
+        ]
+        if self.env == "dev":
+            for url in test_urls:
+                # self.meta["asin"] = "B08WC2SMSN"
+                asin = re.search(r'\/[0-9A-Z]{10}', url).group(0)
+                asin = asin[1:]
+                self.meta['asin'] = asin
+                self.productLists = []
+                # request each test product url directly
+                yield scrapy.Request(url=cleanUrl(url), callback=self.parse_product,
+                                     headers=getRandomUAgents(settings.get('UAGENTS'), settings.get('HEADERS')),
+                                     meta=self.meta, cb_kwargs={"isProduct": True})
+        else:
+            yield scrapy.Request(url=cleanUrl(self.categoryUrl), callback=self.parse_category,
+                                 headers=getRandomUAgents(settings.get('UAGENTS'), settings.get('HEADERS')),
+                                 meta=self.meta)
+
+    def parse_category(self, response):
+        '''
+        This method extracts product pages from the given category.
+        '''
+
+        # check if the Captcha exists.
+        if response.css('#captchacharacters').extract_first():
+            self.log("Captcha found")
+
+        # get products from the category
+        products = getElement(selectors["products"], response).getall()
+
+        for productLink in products:
+
+            # get asin
+            if re.search(r'dp\/(.*)\/', productLink):
+                asin = re.search(r'dp\/(.*)\/', productLink).group(1)
+            else:
+                asin = ""
+
+            # get current link
+            productUrl = urljoin(self.baseUrl, productLink)
+
+            # get rid of unnecessary query params
+            if re.search(r'https:\/\/[^\/]+\/[^\/]+\/dp\/[^\/]+', productUrl):
+                realProductlink = re.search(r'https:\/\/[^\/]+\/[^\/]+\/dp\/[^\/]+', productUrl).group(0)
+            else:
+                realProductlink = ""
+
+            # get product page
+            if asin:
+                if asin not in self.productLists:
+                    self.productLists.append(asin)
+                    customMeta = copy.deepcopy(self.meta)
+                    customMeta['asin'] = asin
+                    yield scrapy.Request(url=realProductlink, callback=self.parse_product,
+                                         headers=getRandomUAgents(settings.get('UAGENTS'), settings.get('HEADERS')),
+                                         meta=customMeta, cb_kwargs={"isProduct": True})
+
+        # get next page url
+        nextPage = getElement(selectors["nextPage"], response).extract_first(default="NA")
+        if nextPage:
+            nextUrl = urljoin(self.baseUrl, nextPage)
+            yield scrapy.Request(url=cleanUrl(nextUrl), callback=self.parse_category,
+                                 headers=getRandomUAgents(settings.get('UAGENTS'), settings.get('HEADERS')),
+                                 meta=self.meta)
+
+    def parse_product(self, response, isProduct=False):
         """
         This method is to extract data from product page.
         """
 
-        # try:
-        #     with open('response.html', 'w', encoding='utf-8') as file:
-        #         file.write(response.body.decode('utf-8'))
-        #         file.close()
-        # except Exception:
-        #     print(Exception)
-
         # check if the recaptcha exists.
         if response.css('#captchacharacters').extract_first():
             self.log("Captcha found ")
@@ -196,6 +208,20 @@ def parse_product(self, response):
         # price
         Item["price"] = getElement(selectors["price"], response).extract_first(default="NA")
         Item["oldPrice"] = getElement(selectors["oldPrice"], response).extract_first(default="NA")
+        discountTypeList = getElement(selectors["discountType"], response).getall()
+
+        if Item["price"] != "NA" and Item["oldPrice"] != "NA":
+            if len(discountTypeList) > 1:
+                discountType = discountTypeList[1]
+            else:
+                discountType = "Fixed"
+        else:
+            discountType = "NA"
+        if '%' in discountType:
+            discountType = "Percent"
+
+        Item["discountType"] = discountType
 
         # productProcessTime
         Item["productProcessTime"] = round(response.meta.get('download_latency'), 2)
@@ -205,17 +231,76 @@ def parse_product(self, response):
         Item["productProcessSize"] = round(len(response.body) / 1024, 2)
 
         # other variants
+
+        if isProduct:
+            variantId = str(uuid.uuid5(uuid.NAMESPACE_DNS, response.meta['asin']))
+        else:
+            variantId = response.meta["variantId"]
+
+        variantGroups = getElement(selectors["variantGroups"], response)
+
         variants = getElement(selectors["variants"], response).getall()
-        base_variant_url = response.url.split("/dp/", 1)[0]
-        for variant in variants:
-            if variant != response.meta['asin']:
-                self.productLists.append(variant)
-                customMeta = copy.deepcopy(self.meta)
-                customMeta['asin'] = variant
-                url = base_variant_url + "/dp/" + variant
-                yield scrapy.Request(url=url, callback=self.parse_product,
-                                     headers=getRandomUAgents(settings.get('UAGENTS'), settings.get('HEADERS')),
-                                     meta=customMeta)
+        variantPrices = getElement(selectors["variantPrice"], response).getall()
+
+        if len(variantPrices) < 2 and len(variantGroups) < 2:
+            variantId = "NA"
+            print('HERE?????')
+            print(len(variantPrices))
+            print(len(variantGroups))
+
+        # variantId
+        try:
+            if response.meta["variantId"] != "NA":
+                Item["variant"] = {
+                    "variantId": response.meta["variantId"],
+                    "variantName": response.meta["variantName"]
+                }
+        except Exception as inst:
+            if len(variantPrices) > 1:
+                variantName = response.xpath('//li[@data-defaultasin="' + Item['productLocalId'] + '"]' + selectors["variantName"][0]).get()
+                Item["variant"] = {
+                    "variantId": variantId,
+                    "variantName": variantName
+                }
+            if len(variantGroups) > 1:
+                variantName = "Many Variants"
+                Item["variant"] = {
+                    "variantId": variantId,
+                    "variantName": variantName
+                }
+
+        for temp_variant in variants:
+            r = re.search(r'\/[A-Z0-9]{10}\/', temp_variant)
+            if r is not None:
+                variant = r.group(0)
+                variant = variant[1:-1]
+            else:
+                r = re.search(r',[A-Z0-9]{10}', temp_variant)
+                if r is not None:
+                    variant = r.group(0)
+                    variant = variant[1:]
+                else:
+                    variant = ""
+
+            if variant != "" and variant != response.meta['asin']:
+                if variant not in self.productLists:
+                    self.productLists.append(variant)
+                    customMeta = copy.deepcopy(self.meta)
+                    customMeta['asin'] = variant
+
+                    if len(variantGroups) > 1:
+                        variantName = "Many Variants"
+                    else:
+                        variantName = response.xpath('//li[@data-defaultasin="' + variant + '"]' + selectors["variantName"][0]).get(default="NA")
+                        if variantName == "NA":
+                            variantName = response.xpath('//option[contains(@value,"' + variant + '")]' + selectors["variantName"][1]).get(default="NA")
+
+                    customMeta["variantId"] = variantId
+                    customMeta["variantName"] = variantName
+                    url = re.sub(r'\/[0-9A-Z]{10}', '/' + variant, response.url)
+
+                    yield scrapy.Request(url=cleanUrl(url), callback=self.parse_product,
+                                         headers=getRandomUAgents(settings.get('UAGENTS'), settings.get('HEADERS')),
+                                         meta=customMeta)
 
         yield Item
diff --git a/extractors/utils.py b/extractors/utils.py
index 3b29478..f976581 100644
--- a/extractors/utils.py
+++ b/extractors/utils.py
@@ -1,4 +1,5 @@
 import random
+import re
 
 def getCategoryName(name):
     name = name.title()
@@ -26,4 +27,26 @@ def getRandomUAgents(agents, headers):
     randIndex = random.randint(0, len(agents)-1)
     headers["'User-Agent'"] = agents[randIndex]
 
-    return headers
\ No newline at end of file
+    return headers
+
+def cleanUrl(url):
+    try:
+        # detect an asin segment of the form /DHA2423SLA/
+        search_result = re.search(r'https:\/\/.*?\/[0-9A-Z]{10}\/', url)
+
+        if search_result is not None:
+            result = search_result.group(0)
+            result = result[:-1]
+        else:
+            search_result = re.search(r'https:\/\/.*?\/[0-9A-Z]{10}\?', url)
+            if search_result is not None:
+                result = search_result.group(0)
+                result = result[:-1]
+            else:
+                result = url
+
+    except Exception as inst:
+        print(inst)
+        result = url
+
+    return result
\ No newline at end of file
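
Notes:

The discountType logic added to parse_product keys off three signals: both prices being present, the number of matched discount nodes, and a '%' in the matched text. A standalone restatement of that branching for reference (classify_discount is a hypothetical name, not part of the patch):

    def classify_discount(price, old_price, discount_list):
        # both a current and a struck-through price must exist for a discount
        if price != "NA" and old_price != "NA":
            # the patch treats the second matched node as the discount text
            discount = discount_list[1] if len(discount_list) > 1 else "Fixed"
        else:
            discount = "NA"
        # a percent sign anywhere in the text marks a percentage discount
        if "%" in discount:
            discount = "Percent"
        return discount

    print(classify_discount("$12.99", "$19.99", ["-35%", "-35%"]))  # Percent
    print(classify_discount("$12.99", "$19.99", []))                # Fixed
    print(classify_discount("NA", "$19.99", ["-35%"]))              # NA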
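
The variantId scheme relies on uuid.uuid5 being deterministic: hashing the parent ASIN under a fixed namespace yields the same UUID on every run and every machine, so all variant requests spawned from one product share a group id without any shared state. A quick check (the ASIN is illustrative):

    import uuid

    parent_asin = "B08PZDSP2Z"
    variant_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, parent_asin))
    # re-deriving the id from the same ASIN reproduces it exactly
    assert variant_id == str(uuid.uuid5(uuid.NAMESPACE_DNS, parent_asin))
    print(variant_id)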
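
cleanUrl canonicalizes a product URL by truncating right after the first 10-character [0-9A-Z] path segment (the ASIN), whether it is followed by a slash or a query string, and falls back to the original URL otherwise. A minimal check of that behaviour, assuming the extractors package is on PYTHONPATH (the URLs are illustrative):

    from extractors.utils import cleanUrl

    # ASIN followed by '/': the ref= tail and query string are dropped
    print(cleanUrl("https://www.amazon.com/New-Apple-AirPods/dp/B08PZDSP2Z/ref=sr_1_3?crid=X"))
    # -> https://www.amazon.com/New-Apple-AirPods/dp/B08PZDSP2Z

    # ASIN followed by '?': the query string is dropped
    print(cleanUrl("https://www.amazon.com/DreamController/dp/B09V37CLLR?th=1"))
    # -> https://www.amazon.com/DreamController/dp/B09V37CLLR

    # no delimiter after the ASIN: the URL is returned unchanged
    print(cleanUrl("https://www.amazon.com/Razer-Quick-Charging/dp/B09DHSJ4SZ"))
    # -> https://www.amazon.com/Razer-Quick-Charging/dp/B09DHSJ4SZ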