Commit
Merge pull request #1 from tuliocg/feat_initial_data_scrap
initial commit
Showing 6 changed files with 375 additions and 116 deletions.
.gitignore
@@ -1,129 +1,25 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
*~

# C extensions
*.so
/venv

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
/api/static

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
*.db

# Installer logs
pip-log.txt
pip-delete-this-directory.txt
*.jpg

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py
*.pid

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
.venv*

# Spyder project settings
.spyderproject
.spyproject
*.swp

# Rope project settings
.ropeproject
*.pyc

# mkdocs documentation
/site
credential

# mypy
.mypy_cache/
.dmypy.json
dmypy.json
*.csv

# Pyre type checker
.pyre/
*.csv
@@ -0,0 +1,13 @@
#standard imports
import os

#third party imports
import requests

#local modules imports


def get_item_information(supermarket):
    #stub: the per-supermarket scraping is implemented by the Scrapper classes in this commit
    pass


if __name__ == '__main__':
    print('run')
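
This stub only sketches the entry point. A plausible next step, assuming each supermarket eventually gets its own scraper class (as the two Scrapper files below suggest), is a small registry that dispatches on the supermarket argument. Everything in this sketch is an illustrative assumption, not part of the commit:

#minimal dispatch sketch; class and registry names are hypothetical
class AngeloniScrapper:
    def run(self):
        print('scraping angeloni...')

class ExtraScrapper:
    def run(self):
        print('scraping extra...')

SCRAPPERS = {
    'angeloni': AngeloniScrapper,
    'extra': ExtraScrapper,
}

def get_item_information(supermarket):
    #look up the scraper registered for this supermarket and run it
    try:
        scrapper = SCRAPPERS[supermarket]()
    except KeyError:
        raise ValueError('unknown supermarket: {}'.format(supermarket))
    scrapper.run()

if __name__ == '__main__':
    get_item_information('angeloni')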
@@ -0,0 +1,74 @@
#standard imports
from datetime import date

#third party imports
import requests
from bs4 import BeautifulSoup
import pandas as pd


class Scrapper:
    #angeloni URL sorted low to high price, 48 items/page
    today = date.today()
    base_url = [
        "https://www.angeloni.com.br/super/c?No=",
        "&Nrpp=48&Ns=dim.product.inStock|1||sku.activePrice|0"
    ]

    def __init__(self, supermarket='angeloni', index=0):
        self.supermarket = supermarket
        self.index = index
        self.response = 200

    def _get_item_information(self):
        df = pd.DataFrame(columns=[
            'insert_date',
            'market_name',
            'product_description',
            'product_price',
            'product_availability'
        ])
        while self.response == 200:
            #request the next result page; the No= offset advances 48 items at a time
            page = requests.get(
                "{}{}{}".format(
                    self.base_url[0],
                    str(self.index),
                    self.base_url[1]
                )
            )
            soup = BeautifulSoup(page.content, 'html.parser')
            elements = soup.find_all(class_='box-produto')
            if not elements:
                #an empty page means the last result page was passed; stop the loop
                self.response = 404
            for element in elements:
                item_desc = element.find('h2', class_='box-produto__desc-prod')
                item_preco_int = element.find('span', class_='box-produto__preco__valor')
                item_preco_dec = element.find('span', class_='box-produto__preco__centavos')
                if not item_preco_int:
                    #no price element means the product is out of stock
                    disponibilidade = 0
                    item_preco = 0
                else:
                    disponibilidade = 1
                    item_preco = '{}{}'.format(
                        item_preco_int.text,
                        item_preco_dec.text
                    )
                new_row = {
                    'insert_date': self.today.strftime('%m-%d-%Y'),
                    'market_name': self.supermarket,
                    'product_description': item_desc.text,
                    'product_price': item_preco,
                    'product_availability': disponibilidade
                }
                #DataFrame.append was removed in pandas 2.0; concat is the replacement
                df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)
            df.to_csv('{}_data.csv'.format(self.supermarket))
            self.index = self.index + 48
        print('Extraction of {} data finished without errors!'.format(self.supermarket))


if __name__ == '__main__':
    scrapper_item = Scrapper()
    scrapper_item._get_item_information()
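
The loop above pages through the catalogue by advancing the No= offset 48 items at a time (matching Nrpp=48) until a page yields no .box-produto elements. As a quick illustration, using the same base_url pieces, these are the first URLs it requests:

base_url = [
    "https://www.angeloni.com.br/super/c?No=",
    "&Nrpp=48&Ns=dim.product.inStock|1||sku.activePrice|0"
]

#offsets 0, 48 and 96 correspond to result pages 1, 2 and 3
for index in (0, 48, 96):
    print('{}{}{}'.format(base_url[0], index, base_url[1]))

In practice the bare requests.get call could also pass a timeout (e.g. requests.get(url, timeout=30)) and check page.status_code, since the stop condition above only looks at whether any product boxes were parsed; both are optional hardening, not something this commit does.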
@@ -0,0 +1,92 @@
#standard imports
import time
from datetime import date
import re

#third party imports
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By


class Scrapper:

    def __init__(self, supermarket='not-defined', index=0):
        self.supermarket = supermarket
        self.index = index
        self.response = 200

    def _get_item_information(self):
        today = date.today()
        base_url = ['https://www.clubeextra.com.br/secoes/', '?qt=12&p=0>=list']
        driver = webdriver.Firefox()
        df_mapping = pd.read_csv('extra_mapping.csv')

        for section in df_mapping['section']:
            df = pd.DataFrame(columns=[
                'insert_date',
                'market_name',
                'section',
                'brand',
                'product_description',
                'product_price',
                'product_price_reals',
                'product_price_cents'
            ])
            section_name = section.split('/')[1]

            driver.get("{}{}{}".format(
                base_url[0],
                section,
                base_url[1]
            ))
            timeout = 6
            try:
                #By.CLASS_NAME cannot take a compound class, so match it as a CSS selector
                WebDriverWait(driver, timeout).until(
                    EC.presence_of_element_located(
                        (By.CSS_SELECTOR, 'p.filter.ng-binding.ng-scope')
                    )
                )
            except TimeoutException:
                print("Timed out waiting for page to load")
            else:
                print("Page loaded")

            soup = BeautifulSoup(driver.page_source, 'html.parser')
            #assumes the third number in the 'filter' paragraph is the total item count
            total_items = int(re.findall(r"\d+", str(soup.find_all("p", {'class': "filter ng-binding ng-scope"})))[2])
            print('Total items in this section: {}'.format(total_items))
            for i in range(0, int(total_items / 12) + 2):
                #each scroll loads another batch of 12 items (infinite scroll)
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(1)
                soup = BeautifulSoup(driver.page_source, "html.parser")
                item_name = soup.find_all("p", {'class': "product-description ng-binding"})
                item_price = soup.find_all("p", {'class': "normal-price ng-binding ng-scope"})
                for name, price in zip(item_name, item_price):
                    name_txt = name.get_text().strip()
                    #brand heuristic: runs of two or more capital letters in the description
                    brand = ' '.join(re.findall(r"[A-Z]{2,}", name_txt))
                    price_txt = price.get_text().strip()
                    price_cents = int(price_txt.split(',')[1])
                    price_reals = int(price_txt.replace('.', '').split(',')[0].split(' ')[1])
                    new_row = {
                        'insert_date': today.strftime('%m-%d-%Y'),
                        'market_name': self.supermarket,
                        'section': section_name,
                        'brand': brand,
                        'product_description': name_txt,
                        'product_price': price_txt,
                        'product_price_reals': price_reals,
                        'product_price_cents': price_cents
                    }
                    #DataFrame.append was removed in pandas 2.0; concat is the replacement
                    df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)
            #re-parsing the page after every scroll collects repeats, so dedupe before writing
            df = df.drop_duplicates(['product_description'])
            df.to_csv('{}_data.csv'.format(self.supermarket), mode='a')
            print('Extraction of section {} for {} finished without errors!'.format(section_name, self.supermarket))
        driver.quit()
        print('Extraction for {} finished without errors!'.format(self.supermarket))


if __name__ == '__main__':
    scrapper_item = Scrapper('extra', 0)
    scrapper_item._get_item_information()
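
The extra scraper reads its section list from an extra_mapping.csv that is not included in this commit; the only shape the code relies on is a section column whose values contain a slash, with the part after it used as section_name. A hypothetical mapping file, purely to make that shape concrete:

import pandas as pd

#illustrative rows; the real section paths would come from clubeextra.com.br
df_mapping = pd.DataFrame({
    'section': [
        'mercearia/arroz',
        'mercearia/feijao',
        'bebidas/sucos',
    ]
})
df_mapping.to_csv('extra_mapping.csv', index=False)

for section in df_mapping['section']:
    print(section.split('/')[1])  #arroz, feijao, sucos

Note that webdriver.Firefox() also needs geckodriver available on the PATH before any of this will run.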