fetcher.py
from multiprocessing.pool import ThreadPool
from multiprocessing import cpu_count
import joblib
import geocoder
from sqlalchemy.orm import sessionmaker
from sqlalchemy import update, func
import settings
from database import Database
from models import Offer
from scraper import Scraper
from settings import PATH_CACHE_GEOCODER
__author__ = 'Antonin'
# Initialization: load the disk-backed geocoding cache
# ('cachedir' was renamed to 'location' in recent joblib releases)
memory = joblib.Memory(location=PATH_CACHE_GEOCODER, verbose=0)
# Initialization: Connect to the database and create a transaction
engine, _ = Database().connect()
Session = sessionmaker(bind=engine)
session = Session()
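# NOTE: this single Session instance is shared by all worker threads below,
# and SQLAlchemy sessions are not thread-safe. A thread-safer setup (a sketch,
# an assumption rather than part of the original script) would use
# scoped_session, which resolves to a thread-local session per thread:
#
#   from sqlalchemy.orm import scoped_session
#   Session = scoped_session(sessionmaker(bind=engine))
#   session = Session()  # each thread gets its own session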
current_total = session.query(func.count(Offer.id)).scalar()
active_total = session.query(func.count(Offer.id)).filter(Offer.active == True).scalar()
# Deactivate all currently active offers; they are re-activated when upserted below.
# The statement must be executed, not just built.
session.execute(update(Offer).where(Offer.active == True).values(active=False))
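# An equivalent bulk update via the query API (a sketch of an alternative,
# not used by this script):
#   session.query(Offer).filter(Offer.active == True).update({'active': False})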

def fetch_and_process_offer(offer_id):
    offer = Scraper.scrap_offer_details_page(offer_id)
    if not offer:
        return
    if not offer.get('location'):
        # Lowercase city/country for better cache hit rates
        location = geocode_city_country(offer['city'].lower(), offer['country'].lower())
        offer['location'] = location or None
    Offer.upsert(session, offer)

@memory.cache
def geocode_city_country(city, country):
    location = '{city}, {country}'.format(city=city, country=country)
    return geocoder.google(location, key=settings.API_KEY_GOOGLE_GEOCODER).wkt
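# Example (assuming a cold cache): the first call for a given (city, country)
# pair issues an HTTP request to the Google geocoder; identical later calls
# are served from the joblib disk cache and make no network request.
#   geocode_city_country('paris', 'france')  # HTTP request, result cached
#   geocode_city_country('paris', 'france')  # cache hit, no request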
# 1) Fetch the number of pages of offers
print("Fetching total info...")
max_page = Scraper.scrap_offers_list_max_page()
print("Total %d pages to fetch" % max_page)
# 2) Fetch the list of offer ids
thread_pool = ThreadPool(processes=cpu_count())
offer_id_pages = thread_pool.map(Scraper.scrap_offers_list_page, range(1, max_page + 1))
thread_pool.close()
thread_pool.join()
# Flatten the per-page id lists into a single list (avoids quadratic sum())
offer_ids = [offer_id for page in offer_id_pages for offer_id in page]
print("OFFER IDS", len(offer_ids), offer_ids)
# 3) Fetch and upsert all the offers
thread_pool = ThreadPool(processes=cpu_count())
# fetch_and_process_offer returns None; it upserts each offer as a side effect
thread_pool.map(fetch_and_process_offer, offer_ids)
thread_pool.close()
thread_pool.join()
print("Processed %d offers" % len(offer_ids))
new_total = session.query(func.count(Offer.id)).scalar()
new_active_total = session.query(func.count(Offer.id)).filter(Offer.active == True).scalar()
session.commit()
print("Before", current_total, "now", new_total)
print("Active count was", active_total, "now", new_active_total)