
Commit

initial commit
tuliocg committed Jun 27, 2020
1 parent 5460b8e commit 2964236
Showing 6 changed files with 375 additions and 116 deletions.
128 changes: 12 additions & 116 deletions .gitignore
@@ -1,129 +1,25 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
*~

# C extensions
*.so
/venv

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
/api/static

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
*.db

# Installer logs
pip-log.txt
pip-delete-this-directory.txt
*.jpg

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py
*.pid

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
.venv*

# Spyder project settings
.spyderproject
.spyproject
*.swp

# Rope project settings
.ropeproject
*.pyc

# mkdocs documentation
/site
credential

# mypy
.mypy_cache/
.dmypy.json
dmypy.json
*.csv

# Pyre type checker
.pyre/
*.csv#
13 changes: 13 additions & 0 deletions app.py
@@ -0,0 +1,13 @@
# standard imports
import os

# third party imports
import requests

# local module imports


def get_item_information(supermarket):
    # placeholder: per-supermarket scraping logic is not implemented yet
    pass


if __name__ == '__main__':
    print('run')
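app.py is still a stub at this point. A minimal sketch of how get_item_information might dispatch to the per-supermarket Scrapper classes added in this commit (the module and class names follow the files below; the dispatch mapping itself is an assumption, not part of the commit):

# sketch only: assumes the scraper modules from this commit are importable from the same directory
import scrapper_angeloni
import scrapper_carrfour


def get_item_information(supermarket):
    # hypothetical dispatch table; the commit does not define this mapping
    scrappers = {
        'angeloni': scrapper_angeloni.Scrapper,
        'extra': scrapper_carrfour.Scrapper,
    }
    scrapper = scrappers[supermarket](supermarket)
    scrapper._get_item_information()


if __name__ == '__main__':
    get_item_information('angeloni')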
74 changes: 74 additions & 0 deletions scrapper_angeloni.py
@@ -0,0 +1,74 @@
# standard imports
import os
import time
from datetime import date

# third party imports
import requests
from bs4 import BeautifulSoup
import pandas as pd


class Scrapper():
    # Angeloni URL sorted by price, low to high, 48 items per page
    today = date.today()
    base_url = [
        "https://www.angeloni.com.br/super/c?No=",
        "&Nrpp=48&Ns=dim.product.inStock|1||sku.activePrice|0"
    ]

    def __init__(self, supermarket='angeloni', index=0):
        self.supermarket = supermarket
        self.index = index
        self.response = 200

    def _get_item_information(self):
        df = pd.DataFrame(columns=[
            'insert_date',
            'market_name',
            'product_description',
            'product_price',
            'product_availability'
        ])
        # paginate until a page comes back with no products
        while self.response == 200:
            page = requests.get(
                "{}{}{}".format(
                    self.base_url[0],
                    str(self.index),
                    self.base_url[1]
                )
            )
            soup = BeautifulSoup(page.content, 'html.parser')
            elements = soup.find_all(class_='box-produto')
            if not elements:
                self.response = 404
            for element in elements:
                item_desc = element.find('h2', class_='box-produto__desc-prod')
                item_preco_int = element.find('span', class_='box-produto__preco__valor')
                item_preco_dec = element.find('span', class_='box-produto__preco__centavos')
                if not item_preco_int:
                    # no price element means the product is unavailable
                    disponibilidade = 0
                    item_preco = 0
                else:
                    disponibilidade = 1
                    item_preco = '{}{}'.format(
                        item_preco_int.text,
                        item_preco_dec.text
                    )
                new_row = {
                    'insert_date': self.today.strftime('%m-%d-%Y'),
                    'market_name': self.supermarket,
                    'product_description': item_desc.text,
                    'product_price': item_preco,
                    'product_availability': disponibilidade
                }
                df = df.append(new_row, ignore_index=True)
            df.to_csv('{}_data.csv'.format(self.supermarket))
            self.index = self.index + 48
        print('Extraction of {} data finished without errors!'.format(self.supermarket))


if __name__ == '__main__':
    scrapper_item = Scrapper()
    scrapper_item._get_item_information()
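The row-by-row df.append above worked on the pandas releases available when this commit was made, but DataFrame.append was later deprecated and removed in pandas 2.0. A hedged alternative (not part of the commit) is to collect the row dicts in a plain list and build the frame once per page:

# sketch only: illustrative values, not scraped data
import pandas as pd

rows = [
    {'insert_date': '06-27-2020', 'market_name': 'angeloni',
     'product_description': 'example item', 'product_price': '1,99',
     'product_availability': 1},
]
# build the frame once instead of calling df.append inside the loop
df = pd.DataFrame(rows, columns=[
    'insert_date', 'market_name', 'product_description',
    'product_price', 'product_availability'
])
df.to_csv('angeloni_data.csv', index=False)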
92 changes: 92 additions & 0 deletions scrapper_carrfour.py
@@ -0,0 +1,92 @@
# standard imports
import os
import time
from datetime import date
import re

# third party imports
import requests
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By


class Scrapper():

    def __init__(self, supermarket='not-defined', index=0):
        self.supermarket = supermarket
        self.index = index
        self.response = 200

    def _get_item_information(self):
        today = date.today()
        base_url = ['https://www.clubeextra.com.br/secoes/', '?qt=12&p=0&gt=list']
        driver = webdriver.Firefox()
        df_mapping = pd.read_csv('extra_mapping.csv')

        for section in df_mapping['section']:
            df = pd.DataFrame(columns=[
                'insert_date',
                'market_name',
                'section',
                'brand',
                'product_description',
                'product_price',
                'product_price_reals',
                'product_price_cents'
            ])
            section_name = section.split('/')[1]

            driver.get("{}{}{}".format(
                base_url[0],
                section,
                base_url[1]
            ))
            timeout = 6
            try:
                # the item counter uses the compound class "filter ng-binding ng-scope",
                # so it needs a CSS selector rather than By.CLASS_NAME
                WebDriverWait(driver, timeout).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, 'p.filter.ng-binding.ng-scope'))
                )
            except TimeoutException:
                print("Timed out waiting for page to load")
            else:
                print("Page loaded")

            soup = BeautifulSoup(driver.page_source, 'html.parser')
            total_items = int(re.findall(r"\d+", str(soup.find_all("p", {'class': "filter ng-binding ng-scope"})))[2])
            print('Total items on this page: {}'.format(total_items))
            # scroll to the bottom repeatedly so the lazy-loaded items render,
            # re-parsing the page after each scroll
            for i in range(0, (int(total_items / 12) + 2)):
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(1)
                soup = BeautifulSoup(driver.page_source, "html.parser")
                item_name = soup.find_all("p", {'class': "product-description ng-binding"})
                item_price = soup.find_all("p", {'class': "normal-price ng-binding ng-scope"})
                for name, price in zip(item_name, item_price):
                    name_txt = name.get_text().strip()
                    brand = ' '.join(re.findall(r"[A-Z]{2,}", name_txt))
                    price_txt = price.get_text().strip()
                    price_cents = int(price_txt.split(',')[1])
                    price_reals = int(price_txt.replace('.', '').split(',')[0].split(' ')[1])
                    new_row = {
                        'insert_date': today.strftime('%m-%d-%Y'),
                        'market_name': self.supermarket,
                        'section': section_name,
                        'brand': brand,
                        'product_description': name_txt,
                        'product_price': price_txt,
                        'product_price_reals': price_reals,
                        'product_price_cents': price_cents
                    }
                    df = df.append(new_row, ignore_index=True)
            df = df.drop_duplicates(['product_description'])
            df.to_csv('{}_data.csv'.format(self.supermarket), mode='a')
            print('Extraction of section {} for {} finished without errors!'.format(section_name, self.supermarket))
        print('Extraction for {} finished without errors!'.format(self.supermarket))


if __name__ == '__main__':
    scrapper_item = Scrapper('extra', 0)
    scrapper_item._get_item_information()
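The price parsing in the inner loop splits the raw text on the decimal comma and strips the "R$" currency prefix. A small worked example with the same expressions (the input string is illustrative; real values come from the page):

# sketch only: same parsing expressions as in _get_item_information
price_txt = 'R$ 1.234,56'
price_cents = int(price_txt.split(',')[1])                                 # 56
price_reals = int(price_txt.replace('.', '').split(',')[0].split(' ')[1])  # 1234
print(price_reals, price_cents)  # 1234 56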