From 29642364346f4e76226a3d87a6fbf0f7cd2702f8 Mon Sep 17 00:00:00 2001 From: Tulio Goncalves Date: Sat, 27 Jun 2020 20:10:46 -0300 Subject: [PATCH] initial commit --- .gitignore | 128 ++++-------------------------------------- app.py | 13 +++++ scrapper_angeloni.py | 74 ++++++++++++++++++++++++ scrapper_carrfour.py | 92 ++++++++++++++++++++++++++++++ scrapper_extra.py | 92 ++++++++++++++++++++++++++++++ scrapper_paoacucar.py | 92 ++++++++++++++++++++++++++++++ 6 files changed, 375 insertions(+), 116 deletions(-) create mode 100644 app.py create mode 100644 scrapper_angeloni.py create mode 100644 scrapper_carrfour.py create mode 100644 scrapper_extra.py create mode 100644 scrapper_paoacucar.py diff --git a/.gitignore b/.gitignore index b6e4761..6d9b785 100644 --- a/.gitignore +++ b/.gitignore @@ -1,129 +1,25 @@ -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class +*~ -# C extensions -*.so +/venv -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -pip-wheel-metadata/ -share/python-wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST +/api/static -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec +*.db -# Installer logs -pip-log.txt -pip-delete-this-directory.txt +*.jpg -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -*.py,cover -.hypothesis/ -.pytest_cache/ - -# Translations -*.mo -*.pot - -# Django stuff: *.log -local_settings.py -db.sqlite3 -db.sqlite3-journal - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - -# pyenv -.python-version - -# pipenv -# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. -# However, in case of collaboration, if having platform-specific dependencies or dependencies -# having no cross-platform support, pipenv may install dependencies that don't work, or not -# install all needed dependencies. -#Pipfile.lock - -# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow
-__pypackages__/
-
-# Celery stuff
-celerybeat-schedule
-celerybeat.pid

-# SageMath parsed files
-*.sage.py
+*.pid

-# Environments
-.env
-.venv
-env/
-venv/
-ENV/
-env.bak/
-venv.bak/
+.venv*

-# Spyder project settings
-.spyderproject
-.spyproject
+*.swp

-# Rope project settings
-.ropeproject
+*.pyc

-# mkdocs documentation
-/site
+credential

-# mypy
-.mypy_cache/
-.dmypy.json
-dmypy.json
+*.csv

-# Pyre type checker
-.pyre/
+*.csv#
diff --git a/app.py b/app.py
new file mode 100644
index 0000000..ef89ac7
--- /dev/null
+++ b/app.py
@@ -0,0 +1,13 @@
+#standard imports
+import os
+
+#third party imports
+import requests
+
+#local modules imports
+
+def get_item_information(supermarket):
+    pass
+
+if __name__ == '__main__':
+    print('run')
diff --git a/scrapper_angeloni.py b/scrapper_angeloni.py
new file mode 100644
index 0000000..8006254
--- /dev/null
+++ b/scrapper_angeloni.py
@@ -0,0 +1,74 @@
+#standard imports
+import os
+import time
+from datetime import date
+
+#third party imports
+import requests
+from bs4 import BeautifulSoup
+import pandas as pd
+
+
+class Scrapper():
+    #angeloni URL sorted low to high price, 48 items/page
+    today = date.today()
+    base_url = [
+        "https://www.angeloni.com.br/super/c?No=",
+        "&Nrpp=48&Ns=dim.product.inStock|1||sku.activePrice|0"
+    ]
+
+    def __init__(self, supermarket='angeloni', index=0):
+        self.supermarket = supermarket
+        self.index = index
+        self.response = 200
+
+
+    def _get_item_information(self):
+        df = pd.DataFrame(columns=[
+            'insert_date',
+            'market_name',
+            'product_description',
+            'product_price',
+            'product_availability'
+        ])
+        #paginate until a results page comes back with no products
+        while self.response == 200:
+            page = requests.get(
+                "{}{}{}".format(
+                    self.base_url[0],
+                    str(self.index),
+                    self.base_url[1]
+                )
+            )
+            soup = BeautifulSoup(page.content, 'html.parser')
+            elements = soup.find_all(class_='box-produto')
+            if not elements:
+                self.response = 404
+            for element in elements:
+                item_desc = element.find('h2', class_='box-produto__desc-prod')
+                item_preco_int = element.find('span', class_='box-produto__preco__valor')
+                item_preco_dec = element.find('span', class_='box-produto__preco__centavos')
+                if not item_preco_int:
+                    disponibilidade = 0
+                    item_preco = 0
+                else:
+                    disponibilidade = 1
+                    item_preco = '{}{}'.format(
+                        item_preco_int.text,
+                        item_preco_dec.text
+                    )
+                new_row = {
+                    'insert_date': self.today.strftime('%m-%d-%Y'),
+                    'market_name': self.supermarket,
+                    'product_description': item_desc.text,
+                    'product_price': item_preco,
+                    'product_availability': disponibilidade
+                }
+                df = df.append(new_row, ignore_index=True)
+            df.to_csv('{}_data.csv'.format(self.supermarket))
+            self.index = self.index + 48
+        print('Extraction of {} data finished without errors!'.format(self.supermarket))
+
+if __name__ == '__main__':
+    scrapper_item = Scrapper()
+    scrapper_item._get_item_information()
diff --git a/scrapper_carrfour.py b/scrapper_carrfour.py
new file mode 100644
index 0000000..cd92cfa
--- /dev/null
+++ b/scrapper_carrfour.py
@@ -0,0 +1,92 @@
+#standard imports
+import time
+import os
+from datetime import date
+import re
+
+#third party imports
+import requests
+from bs4 import BeautifulSoup
+import pandas as pd
+#selenium drives a real browser because the product listings are rendered by JavaScript
+from selenium import webdriver
+from selenium.common.exceptions import TimeoutException
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.common.by import By
+
+
+class Scrapper():
+
+    def __init__(self, supermarket='not-defined', index=0):
+        self.supermarket = supermarket
+        self.index = index
+        self.response = 200
+
+
+    def _get_item_information(self):
+        today = date.today()
+        base_url = ['https://www.clubeextra.com.br/secoes/','?qt=12&p=0>=list']
+        driver = webdriver.Firefox()
+        df_mapping = pd.read_csv('extra_mapping.csv')
+
+        for section in df_mapping['section']:
+            df = pd.DataFrame(columns=[ 'insert_date',
+                                        'market_name',
+                                        'section',
+                                        'brand',
+                                        'product_description',
+                                        'product_price',
+                                        'product_price_reals',
+                                        'product_price_cents'
+                                        ])
+            section_name = section.split('/')[1]
+
+            driver.get("{}{}{}".format(
+                base_url[0],
+                section,
+                base_url[1]
+            ))
+            timeout = 6
+            try:
+                WebDriverWait(driver, timeout).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'p.filter.ng-binding.ng-scope')))
+            except TimeoutException:
+                print("Timed out waiting for page to load")
+            finally:
+                print("Page loaded")
+
+            soup = BeautifulSoup(driver.page_source, 'html.parser')
+            total_items = int(re.findall(r"\d+", str(soup.find_all("p", {'class': "filter ng-binding ng-scope"})))[2])
+            print('Total items on this page: {}'.format(total_items))
+            for i in range(0,(int(total_items/12)+2)):
+                #scroll to the bottom to trigger lazy loading of the next batch of products
+                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+                time.sleep(1)
+                soup = BeautifulSoup(driver.page_source, "html.parser")
+                item_name = soup.find_all("p", {'class': "product-description ng-binding"})
+                item_price = soup.find_all("p", {'class': "normal-price ng-binding ng-scope"})
+                for name, price in zip(item_name, item_price):
+                    name_txt = name.get_text().strip()
+                    brand = ' '.join(re.findall(r"[A-Z]{2,}", name_txt))
+                    price_txt = price.get_text().strip()
+                    price_cents = int(price_txt.split(',')[1])
+                    price_reals = int(price_txt.replace('.','').split(',')[0].split(' ')[1])
+                    new_row = {
+                        'insert_date': today.strftime('%m-%d-%Y'),
+                        'market_name': self.supermarket,
+                        'section': section_name,
+                        'brand': brand,
+                        'product_description': name_txt,
+                        'product_price': price_txt,
+                        'product_price_reals': price_reals,
+                        'product_price_cents': price_cents
+                    }
+                    df = df.append(new_row, ignore_index=True)
+            df = df.drop_duplicates(['product_description'])
+            df.to_csv('{}_data.csv'.format(self.supermarket), mode='a')
+            print('Extraction of section {} for {} finished without errors!'.format(section_name, self.supermarket))
+        print('Extraction for {} finished without errors!'.format(self.supermarket))
+
+if __name__ == '__main__':
+    scrapper_item = Scrapper('extra', 0)
+    scrapper_item._get_item_information()
diff --git a/scrapper_extra.py b/scrapper_extra.py
new file mode 100644
index 0000000..cd92cfa
--- /dev/null
+++ b/scrapper_extra.py
@@ -0,0 +1,92 @@
+#standard imports
+import time
+import os
+from datetime import date
+import re
+
+#third party imports
+import requests
+from bs4 import BeautifulSoup
+import pandas as pd
+#selenium drives a real browser because the product listings are rendered by JavaScript
+from selenium import webdriver
+from selenium.common.exceptions import TimeoutException
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.common.by import By
+
+
+class Scrapper():
+
+    def __init__(self, supermarket='not-defined', index=0):
+        self.supermarket = supermarket
+        self.index = index
+        self.response = 200
+
+
+    def _get_item_information(self):
+        today = date.today()
+        base_url = ['https://www.clubeextra.com.br/secoes/','?qt=12&p=0>=list']
+        driver = webdriver.Firefox()
+        df_mapping = pd.read_csv('extra_mapping.csv')
+
+        for section in df_mapping['section']:
+            df = pd.DataFrame(columns=[ 'insert_date',
+                                        'market_name',
+                                        'section',
+                                        'brand',
+                                        'product_description',
+                                        'product_price',
+                                        'product_price_reals',
+                                        'product_price_cents'
+                                        ])
+            section_name = section.split('/')[1]
+
+            driver.get("{}{}{}".format(
+                base_url[0],
+                section,
+                base_url[1]
+            ))
+            timeout = 6
+            try:
+                WebDriverWait(driver, timeout).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'p.filter.ng-binding.ng-scope')))
+            except TimeoutException:
+                print("Timed out waiting for page to load")
+            finally:
+                print("Page loaded")
+
+            soup = BeautifulSoup(driver.page_source, 'html.parser')
+            total_items = int(re.findall(r"\d+", str(soup.find_all("p", {'class': "filter ng-binding ng-scope"})))[2])
+            print('Total items on this page: {}'.format(total_items))
+            for i in range(0,(int(total_items/12)+2)):
+                #scroll to the bottom to trigger lazy loading of the next batch of products
+                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+                time.sleep(1)
+                soup = BeautifulSoup(driver.page_source, "html.parser")
+                item_name = soup.find_all("p", {'class': "product-description ng-binding"})
+                item_price = soup.find_all("p", {'class': "normal-price ng-binding ng-scope"})
+                for name, price in zip(item_name, item_price):
+                    name_txt = name.get_text().strip()
+                    brand = ' '.join(re.findall(r"[A-Z]{2,}", name_txt))
+                    price_txt = price.get_text().strip()
+                    price_cents = int(price_txt.split(',')[1])
+                    price_reals = int(price_txt.replace('.','').split(',')[0].split(' ')[1])
+                    new_row = {
+                        'insert_date': today.strftime('%m-%d-%Y'),
+                        'market_name': self.supermarket,
+                        'section': section_name,
+                        'brand': brand,
+                        'product_description': name_txt,
+                        'product_price': price_txt,
+                        'product_price_reals': price_reals,
+                        'product_price_cents': price_cents
+                    }
+                    df = df.append(new_row, ignore_index=True)
+            df = df.drop_duplicates(['product_description'])
+            df.to_csv('{}_data.csv'.format(self.supermarket), mode='a')
+            print('Extraction of section {} for {} finished without errors!'.format(section_name, self.supermarket))
+        print('Extraction for {} finished without errors!'.format(self.supermarket))
+
+if __name__ == '__main__':
+    scrapper_item = Scrapper('extra', 0)
+    scrapper_item._get_item_information()
diff --git a/scrapper_paoacucar.py b/scrapper_paoacucar.py
new file mode 100644
index 0000000..7fb3192
--- /dev/null
+++ b/scrapper_paoacucar.py
@@ -0,0 +1,92 @@
+#standard imports
+import time
+import os
+from datetime import date
+import re
+
+#third party imports
+import requests
+from bs4 import BeautifulSoup
+import pandas as pd
+#selenium drives a real browser because the product listings are rendered by JavaScript
+from selenium import webdriver
+from selenium.common.exceptions import TimeoutException
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.common.by import By
+
+
+class Scrapper():
+
+    def __init__(self, supermarket='not-defined', index=0):
+        self.supermarket = supermarket
+        self.index = index
+        self.response = 200
+
+
+    def _get_item_information(self):
+        today = date.today()
+        base_url = ['https://www.paodeacucar.com/secoes/','?qt=12&p=0>=list']
+        driver = webdriver.Firefox()
+        df_mapping = pd.read_csv('pao-de-acucar_mapping.csv')
+
+        for section in df_mapping['section']:
+            df = pd.DataFrame(columns=[ 'insert_date',
+                                        'market_name',
+                                        'section',
+                                        'brand',
+                                        'product_description',
+                                        'product_price',
+                                        'product_price_reals',
+                                        'product_price_cents'
+                                        ])
+            section_name = section.split('/')[1]
+
+            driver.get("{}{}{}".format(
+                base_url[0],
+                section,
+                base_url[1]
+            ))
+            timeout = 6
+            try:
+                WebDriverWait(driver, timeout).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'p.filter.ng-binding.ng-scope')))
+            except TimeoutException:
+                print("Timed out waiting for page to load")
+            finally:
+                print("Page loaded")
+
+            soup = BeautifulSoup(driver.page_source, 'html.parser')
+            total_items = int(re.findall(r"\d+", str(soup.find_all("p", {'class': "filter ng-binding ng-scope"})))[2])
+            print('Total items on this page: {}'.format(total_items))
+            for i in range(0,(int(total_items/12)+2)):
+                #scroll to the bottom to trigger lazy loading of the next batch of products
+                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+                time.sleep(1)
+                soup = BeautifulSoup(driver.page_source, "html.parser")
+                item_name = soup.find_all("p", {'class': "product-description ng-binding"})
+                item_price = soup.find_all("p", {'class': "normal-price ng-binding ng-scope"})
+                for name, price in zip(item_name, item_price):
+                    name_txt = name.get_text().strip()
+                    brand = ' '.join(re.findall(r"[A-Z]{2,}", name_txt))
+                    price_txt = price.get_text().strip()
+                    price_cents = int(price_txt.split(',')[1])
+                    price_reals = int(price_txt.replace('.','').split(',')[0].split(' ')[1])
+                    new_row = {
+                        'insert_date': today.strftime('%m-%d-%Y'),
+                        'market_name': self.supermarket,
+                        'section': section_name,
+                        'brand': brand,
+                        'product_description': name_txt,
+                        'product_price': price_txt,
+                        'product_price_reals': price_reals,
+                        'product_price_cents': price_cents
+                    }
+                    df = df.append(new_row, ignore_index=True)
+            df = df.drop_duplicates(['product_description'])
+            df.to_csv('{}_data.csv'.format(self.supermarket), mode='a')
+            print('Extraction of section {} for {} finished without errors!'.format(section_name, self.supermarket))
+        print('Extraction for {} finished without errors!'.format(self.supermarket))
+
+if __name__ == '__main__':
+    scrapper_item = Scrapper('pao-de-acucar', 0)
+    scrapper_item._get_item_information()