From a73f4c3f52ce34347305e3deeca11d9b72b4ae06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Reichert?= Date: Wed, 18 Sep 2024 23:52:40 +0200 Subject: [PATCH] fix swimming pool scraper --- public/data/leipzig-swimming-pools.json | 2 +- src/scrapers/leipzig-swimming-pools.js | 101 ++++++++++--------- test/scrapers/leipzig-swimming-pools-test.js | 46 +++++++++ 3 files changed, 100 insertions(+), 49 deletions(-) create mode 100644 test/scrapers/leipzig-swimming-pools-test.js diff --git a/public/data/leipzig-swimming-pools.json b/public/data/leipzig-swimming-pools.json index dd09803..09dfa19 100644 --- a/public/data/leipzig-swimming-pools.json +++ b/public/data/leipzig-swimming-pools.json @@ -1 +1 @@ -[{"title":"Grünauer Welle","address":{"leisure":"Grünauer Welle","house_number":"7","road":"Stuttgarter Allee","neighbourhood":"Wohnkomplex 4","suburb":"Grünau-Siedlung","city_district":"West","city":"Leipzig","state":"Sachsen","ISO3166-2-lvl4":"DE-SN","postcode":"04209","country":"Deutschland","country_code":"de"},"link":"https://www.leipzig.de//detailansicht-adresse/gruenauer-welle","type":"indoor_pool","lat":"51.31507295","lon":"12.291681015468672"},{"title":"Sachsen-Therme","address":{"road":"Schongauerstraße","suburb":"Paunsdorf","city_district":"Ost","city":"Leipzig","state":"Sachsen","ISO3166-2-lvl4":"DE-SN","postcode":"04329","country":"Deutschland","country_code":"de"},"link":"https://www.leipzig.de//detailansicht-adresse/sachsen-therme","type":"indoor_pool","lat":"51.3497763","lon":"12.4739067"},{"title":"Schwimmhalle Mitte","address":{"leisure":"Schwimmhalle Mitte","house_number":"84","road":"Kirschbergstraße","suburb":"Gohlis-Süd","city_district":"Nord","city":"Leipzig","state":"Sachsen","ISO3166-2-lvl4":"DE-SN","postcode":"04155","country":"Deutschland","country_code":"de"},"link":"https://www.leipzig.de//detailansicht-adresse/schwimmhalle-mitte","type":"indoor_pool","lat":"51.36225665","lon":"12.353394654258464"},{"title":"Schwimmhalle Nord","address":{"leisure":"Schwimmhalle Nord","house_number":"54","road":"Kleiststraße","suburb":"Eutritzsch","city_district":"Nord","city":"Leipzig","state":"Sachsen","ISO3166-2-lvl4":"DE-SN","postcode":"04157","country":"Deutschland","country_code":"de"},"link":"https://www.leipzig.de//detailansicht-adresse/schwimmhalle-nord","type":"indoor_pool","lat":"51.37231965","lon":"12.38134070842299"},{"title":"Schwimmhalle Nordost","address":{"leisure":"Schwimmhalle Nordost","house_number":"26","road":"Schönefelder Allee","neighbourhood":"Schönefeld","suburb":"Schönefeld-Abtnaundorf","city_district":"Nordost","city":"Leipzig","state":"Sachsen","ISO3166-2-lvl4":"DE-SN","postcode":"04347","country":"Deutschland","country_code":"de"},"link":"https://www.leipzig.de//detailansicht-adresse/schwimmhalle-nordost","type":"indoor_pool","lat":"51.358358550000005","lon":"12.404955916520343"},{"title":"Schwimmhalle Süd","address":{"leisure":"Schwimmhalle Süd","house_number":"10","road":"Tarostraße","suburb":"Zentrum-Südost","city_district":"Mitte","city":"Leipzig","state":"Sachsen","ISO3166-2-lvl4":"DE-SN","postcode":"04103","country":"Deutschland","country_code":"de"},"link":"https://www.leipzig.de//detailansicht-adresse/schwimmhalle-sued","type":"indoor_pool","lat":"51.3239909","lon":"12.385783649999993"},{"title":"Schwimmhalle Südost","address":{"leisure":"Schwimmhalle Südost","house_number":"35","road":"Kolmstraße","suburb":"Probstheida","city_district":"Südost","city":"Leipzig","state":"Sachsen","ISO3166-2-lvl4":"DE-SN","postcode":"04299","country":"Deutschland","country_code":"de"},"link":"https://www.leipzig.de//detailansicht-adresse/schwimmhalle-suedost","type":"indoor_pool","lat":"51.31632955","lon":"12.42836830011111"},{"title":"Schwimmhalle Universität Leipzig","address":{"leisure":"Universitätsschwimmhalle","house_number":"4","road":"Mainzer Straße","neighbourhood":"Bachviertel","suburb":"Zentrum-West","city_district":"Mitte","city":"Leipzig","state":"Sachsen","ISO3166-2-lvl4":"DE-SN","postcode":"04109","country":"Deutschland","country_code":"de"},"link":"https://www.leipzig.de//detailansicht-adresse/schwimmhalle-universitaet-leipzig","type":"indoor_pool","lat":"51.336847199999994","lon":"12.353675175513555"},{"title":"Schwimmhalle West","address":{"leisure":"Schwimmhalle West","house_number":"52a","road":"Hans-Driesch-Straße","suburb":"Leutzsch","city_district":"Altwest","city":"Leipzig","state":"Sachsen","ISO3166-2-lvl4":"DE-SN","postcode":"04179","country":"Deutschland","country_code":"de"},"link":"https://www.leipzig.de//detailansicht-adresse/schwimmhalle-west","type":"indoor_pool","lat":"51.34652235","lon":"12.311952829626495"},{"title":"Sportbad an der Elster","address":{"leisure":"Sportbad an der Elster","house_number":"8","road":"Antonienstraße","suburb":"Plagwitz","city_district":"Südwest","city":"Leipzig","state":"Sachsen","ISO3166-2-lvl4":"DE-SN","postcode":"04229","country":"Deutschland","country_code":"de"},"link":"https://www.leipzig.de//detailansicht-adresse/sportbad-an-der-elster","type":"indoor_pool","lat":"51.319954300000006","lon":"12.336005139793688"},{"title":"Sprunghalle der Universität Leipzig","address":{"leisure":"Universitätsschwimmhalle","house_number":"4","road":"Mainzer Straße","neighbourhood":"Bachviertel","suburb":"Zentrum-West","city_district":"Mitte","city":"Leipzig","state":"Sachsen","ISO3166-2-lvl4":"DE-SN","postcode":"04109","country":"Deutschland","country_code":"de"},"link":"https://www.leipzig.de//detailansicht-adresse/sprunghalle-der-universitaet-leipzig","type":"indoor_pool","lat":"51.336847199999994","lon":"12.353675175513555"},{"title":"Kinderfreibecken \"Robbe\"","address":{"leisure":"Schwimmhalle Nord","house_number":"54","road":"Kleiststraße","suburb":"Eutritzsch","city_district":"Nord","city":"Leipzig","state":"Sachsen","ISO3166-2-lvl4":"DE-SN","postcode":"04157","country":"Deutschland","country_code":"de"},"link":"https://www.leipzig.de//detailansicht-adresse/kinderfreibecken-robbe","type":"outdoor_pool","lat":"51.37231965","lon":"12.38134070842299"},{"title":"Ökobad Lindenthal","address":{"house_number":"3","road":"Am Freibad","suburb":"Lindenthal","city_district":"Nordwest","city":"Leipzig","state":"Sachsen","ISO3166-2-lvl4":"DE-SN","postcode":"04158","country":"Deutschland","country_code":"de"},"link":"https://www.leipzig.de//detailansicht-adresse/oekobad-lindenthal","type":"outdoor_pool","lat":"51.3935438","lon":"12.3269022"},{"title":"Schreberbad","address":{"leisure":"Schreberbad","house_number":"15","road":"Schreberstraße","neighbourhood":"Bachviertel","suburb":"Zentrum-West","city_district":"Mitte","city":"Leipzig","state":"Sachsen","ISO3166-2-lvl4":"DE-SN","postcode":"04109","country":"Deutschland","country_code":"de"},"link":"https://www.leipzig.de//detailansicht-adresse/schreberbad","type":"outdoor_pool","lat":"51.338967100000005","lon":"12.358802401994044"},{"title":"Sommerbad Kleinzschocher","address":{"house_number":"75","road":"Küchenholzallee","suburb":"Kleinzschocher","city_district":"Südwest","city":"Leipzig","state":"Sachsen","ISO3166-2-lvl4":"DE-SN","postcode":"04229","country":"Deutschland","country_code":"de"},"link":"https://www.leipzig.de//detailansicht-adresse/sommerbad-kleinzschocher","type":"outdoor_pool","lat":"51.3163021","lon":"12.338389"},{"title":"Sommerbad Schönefeld","address":{"house_number":"39","road":"Volbedingstraße","neighbourhood":"Schönefeld","suburb":"Schönefeld-Abtnaundorf","city_district":"Nordost","city":"Leipzig","state":"Sachsen","ISO3166-2-lvl4":"DE-SN","postcode":"04357","country":"Deutschland","country_code":"de"},"link":"https://www.leipzig.de//detailansicht-adresse/sommerbad-schoenefeld","type":"outdoor_pool","lat":"51.3621058","lon":"12.4110359"},{"title":"Sommerbad Südost","address":{"house_number":"173","road":"Oststraße","suburb":"Stötteritz","city_district":"Südost","city":"Leipzig","state":"Sachsen","ISO3166-2-lvl4":"DE-SN","postcode":"04299","country":"Deutschland","country_code":"de"},"link":"https://www.leipzig.de//detailansicht-adresse/sommerbad-suedost","type":"outdoor_pool","lat":"51.3262935","lon":"12.4198936"}] \ No newline at end of file +[{"title":"Grünauer Welle","address":{"leisure":"Grünauer Welle","house_number":"7","road":"Stuttgarter Allee","neighbourhood":"Wohnkomplex 4","suburb":"Grünau-Siedlung","city_district":"West","city":"Leipzig","state":"Sachsen","ISO3166-2-lvl4":"DE-SN","postcode":"04209","country":"Deutschland","country_code":"de"},"lat":"51.31507295","lon":"12.291681015468672","link":"https://www.leipzig.de/freizeit-kultur-und-tourismus/sport/sportstaettenbelegung/detailseite/detailseite/schwimmhalle-gruenauer-welle"},{"title":"Schwimmhalle Mitte","address":{"leisure":"Schwimmhalle Mitte","house_number":"84","road":"Kirschbergstraße","suburb":"Gohlis-Süd","city_district":"Nord","city":"Leipzig","state":"Sachsen","ISO3166-2-lvl4":"DE-SN","postcode":"04155","country":"Deutschland","country_code":"de"},"lat":"51.36225665","lon":"12.353394654258464","link":"https://www.leipzig.de/freizeit-kultur-und-tourismus/sport/sportstaettenbelegung/detailseite/detailseite/schwimmhalle-mitte"},{"title":"Schwimmhalle Nord","address":{"leisure":"Schwimmhalle Nord","house_number":"54","road":"Kleiststraße","suburb":"Eutritzsch","city_district":"Nord","city":"Leipzig","state":"Sachsen","ISO3166-2-lvl4":"DE-SN","postcode":"04157","country":"Deutschland","country_code":"de"},"lat":"51.37231965","lon":"12.38134070842299","link":"https://www.leipzig.de/freizeit-kultur-und-tourismus/sport/sportstaettenbelegung/detailseite/detailseite/schwimmhalle-nord"},{"title":"Schwimmhalle Nordost","address":{"leisure":"Schwimmhalle Nordost","house_number":"26","road":"Schönefelder Allee","neighbourhood":"Schönefeld","suburb":"Schönefeld-Abtnaundorf","city_district":"Nordost","city":"Leipzig","state":"Sachsen","ISO3166-2-lvl4":"DE-SN","postcode":"04347","country":"Deutschland","country_code":"de"},"lat":"51.358358550000005","lon":"12.404955916520343","link":"https://www.leipzig.de/freizeit-kultur-und-tourismus/sport/sportstaettenbelegung/detailseite/detailseite/schwimmhalle-nordost"},{"title":"Schwimmhalle Süd","address":{"leisure":"Schwimmhalle Süd","house_number":"10","road":"Tarostraße","suburb":"Zentrum-Südost","city_district":"Mitte","city":"Leipzig","state":"Sachsen","ISO3166-2-lvl4":"DE-SN","postcode":"04103","country":"Deutschland","country_code":"de"},"lat":"51.3239909","lon":"12.385783649999993","link":"https://www.leipzig.de/freizeit-kultur-und-tourismus/sport/sportstaettenbelegung/detailseite/detailseite/schwimmhalle-sued"},{"title":"Schwimmhalle Südost","address":{"leisure":"Schwimmhalle Südost","house_number":"35","road":"Kolmstraße","suburb":"Probstheida","city_district":"Südost","city":"Leipzig","state":"Sachsen","ISO3166-2-lvl4":"DE-SN","postcode":"04299","country":"Deutschland","country_code":"de"},"lat":"51.31632955","lon":"12.42836830011111","link":"https://www.leipzig.de/freizeit-kultur-und-tourismus/sport/sportstaettenbelegung/detailseite/detailseite/schwimmhalle-suedost"},{"title":"Schwimmhalle West","address":{"leisure":"Schwimmhalle West","house_number":"52a","road":"Hans-Driesch-Straße","suburb":"Leutzsch","city_district":"Altwest","city":"Leipzig","state":"Sachsen","ISO3166-2-lvl4":"DE-SN","postcode":"04179","country":"Deutschland","country_code":"de"},"lat":"51.34652235","lon":"12.311952829626495","link":"https://www.leipzig.de/freizeit-kultur-und-tourismus/sport/sportstaettenbelegung/detailseite/detailseite/schwimmhalle-west"},{"title":"Sportbad an der Elster","address":{"leisure":"Sauna im Sportbad an der Elster","house_number":"8","road":"Antonienstraße","suburb":"Plagwitz","city_district":"Südwest","city":"Leipzig","state":"Sachsen","ISO3166-2-lvl4":"DE-SN","postcode":"04229","country":"Deutschland","country_code":"de"},"lat":"51.3196487","lon":"12.3356885","link":"https://www.leipzig.de/freizeit-kultur-und-tourismus/sport/sportstaettenbelegung/detailseite/detailseite/sportbad-an-der-elster"},{"title":"Sommerbad Kleinzschocher","address":{"shop":"Sommerbad Kleinzschocher","house_number":"75","road":"Küchenholzallee","suburb":"Kleinzschocher","city_district":"Südwest","city":"Leipzig","state":"Sachsen","ISO3166-2-lvl4":"DE-SN","postcode":"04229","country":"Deutschland","country_code":"de"},"lat":"51.3165758","lon":"12.3384644","link":"https://www.leipzig.de/freizeit-kultur-und-tourismus/sport/sportstaettenbelegung/detailseite/detailseite/freibad-kleinzschocher"},{"title":"Schreberbad","address":{"leisure":"Schreberbad","house_number":"15","road":"Schreberstraße","neighbourhood":"Bachviertel","suburb":"Zentrum-West","city_district":"Mitte","city":"Leipzig","state":"Sachsen","ISO3166-2-lvl4":"DE-SN","postcode":"04109","country":"Deutschland","country_code":"de"},"lat":"51.338967100000005","lon":"12.358802401994044","link":"https://www.leipzig.de/freizeit-kultur-und-tourismus/sport/sportstaettenbelegung/detailseite/detailseite/freibad-schreberbad"},{"title":"Sommerbad Schönefeld","address":{"leisure":"Sommerbad Schönefeld","house_number":"39","road":"Volbedingstraße","neighbourhood":"Schönefeld","suburb":"Schönefeld-Abtnaundorf","city_district":"Nordost","city":"Leipzig","state":"Sachsen","ISO3166-2-lvl4":"DE-SN","postcode":"04357","country":"Deutschland","country_code":"de"},"lat":"51.3630321","lon":"12.411970819689838","link":"https://www.leipzig.de/freizeit-kultur-und-tourismus/sport/sportstaettenbelegung/detailseite/detailseite/freibad-schoenefeld"},{"title":"Sommerbad Südost","address":{"house_number":"173","road":"Oststraße","suburb":"Stötteritz","city_district":"Südost","city":"Leipzig","state":"Sachsen","ISO3166-2-lvl4":"DE-SN","postcode":"04299","country":"Deutschland","country_code":"de"},"lat":"51.3262935","lon":"12.4198936","link":"https://www.leipzig.de/freizeit-kultur-und-tourismus/sport/sportstaettenbelegung/detailseite/detailseite/freibad-suedost"},{"title":"Ökobad Lindenthal","address":{"house_number":"3","road":"Am Freibad","suburb":"Lindenthal","city_district":"Nordwest","city":"Leipzig","state":"Sachsen","ISO3166-2-lvl4":"DE-SN","postcode":"04158","country":"Deutschland","country_code":"de"},"lat":"51.3930018","lon":"12.3270532","link":"https://www.leipzig.de/freizeit-kultur-und-tourismus/sport/sportstaettenbelegung/detailseite/detailseite/oekobad-lindenthal"}] \ No newline at end of file diff --git a/src/scrapers/leipzig-swimming-pools.js b/src/scrapers/leipzig-swimming-pools.js index 155e14c..0866e90 100644 --- a/src/scrapers/leipzig-swimming-pools.js +++ b/src/scrapers/leipzig-swimming-pools.js @@ -8,70 +8,75 @@ const url_outdoor_pool = `${domain}/freizeit-kultur-und-tourismus/sport/sportsta const __dirname = new URL('.', import.meta.url).pathname; -Promise.all([scrapeIt(url_indoor_pool, { - list: { - listItem: '.address-list-item', - data: { - title: { - selector: '.link_intern', - attr: 'title' - }, - address: { - selector: '.list.left', - convert: address => address.split('\n').map(d => d.trim()).filter(d => !!d) - }, - link: { - attr: "href", - selector: '.link_intern', - convert: href => `${domain}/${href}` - } - } - } -}), scrapeIt(url_outdoor_pool, { +const scrapeDetailsUrl = { list: { - listItem: '.address-list-item', + listItem: '.project-attributes', data: { title: { - selector: '.link_intern', - attr: 'title' + selector: 'h3 a' }, - address: { - selector: '.list.left', - convert: address => address.split('\n').map(d => d.trim()).filter(d => !!d) - }, - link: { - attr: "href", - selector: '.link_intern', - convert: href => `${domain}/${href}` + detailsUrl: { + selector: 'h3 a', + attr: "href" } } } -})]).then(async ([indoor_pool, outdoor_pool]) => { - let data = [ - ...indoor_pool.data.list.map(sh => ({ ...sh, type: 'indoor_pool' })), - ...outdoor_pool.data.list.map(sh => ({ ...sh, type: 'outdoor_pool' })), - ] +} - let newList = []; - for (let i = 0; i < data.length; i++) { - let element = data[i]; - let q = `${element.address.join(' ').replace(/ \(.*\)/, '').replace('an der Schwimmhalle Nord', '').replace('Vollbedingstraße', 'Volbedingstraße').replace('Kirschbergstraße 84', 'Schwimmhalle Mitte')}`; - console.log(q) - let resp = await search(q) - let search_results = resp.filter(r => ['water_park', 'sports_centre'].includes(r.type)); +const scrapeDetailsData = { + address: { + selector: '.t3booking-t3booking-main-content p', + eq: 0, + convert: value => value.replace(' 04', ', 04').replace('Im Stadtplan anzeigen', '') + } +} + +const enrichWithCoords = async (element) => { + console.log('search for ' + element.address) + try { + const resp = await search(element.address) + const search_results = resp.filter(r => ['water_park', 'sports_centre'].includes(r.type)); if(search_results.length > 0) { - element = { + return { ...element, address: resp[0].address, lat: resp[0].lat, lon: resp[0].lon - } + } } else { console.log(element.title, element.address, 'could not be found') console.log(resp) + return element; } - newList.push(element) + } catch(e) { + console.log(element.title, element.address, 'could not be resolved') + console.log(resp) + return element; + } +} + +const scrapeDetailsUrlIndoor = scrapeIt(url_indoor_pool, scrapeDetailsUrl); +const scrapeDetailsUrlOutdoor = scrapeIt(url_outdoor_pool, scrapeDetailsUrl); + +const handleScrapeResponse = async ([indoor_pool, outdoor_pool]) => { + const data = [ + ...indoor_pool.data.list.map(sh => ({ ...sh, type: 'indoor_pool' })), + ...outdoor_pool.data.list.map(sh => ({ ...sh, type: 'outdoor_pool' })), + ] + const promises = []; + for (let i = 0; i < data.length; i++) { + const element = data[i]; + const detailsUrl = `${domain}${element.detailsUrl}`; + const details = await scrapeIt(detailsUrl, scrapeDetailsData); + const result = await enrichWithCoords(details.data); + promises.push({ + title: element.title, + ...result, + link: detailsUrl + }); } + const details = await Promise.all(promises); + fs.writeFileSync(`${__dirname}../../public/data/leipzig-swimming-pools.json`, JSON.stringify(details), 'utf8') +}; - fs.writeFileSync(`${__dirname}../../public/data/leipzig-swimming-pools.json`, JSON.stringify(newList, null, 2), 'utf8') -}) \ No newline at end of file +Promise.all([scrapeDetailsUrlIndoor, scrapeDetailsUrlOutdoor]).then(handleScrapeResponse) \ No newline at end of file diff --git a/test/scrapers/leipzig-swimming-pools-test.js b/test/scrapers/leipzig-swimming-pools-test.js new file mode 100644 index 0000000..955fc0a --- /dev/null +++ b/test/scrapers/leipzig-swimming-pools-test.js @@ -0,0 +1,46 @@ +import scrapeIt from "scrape-it"; + +const domain = `https://www.leipzig.de`; +const url_indoor_pool = `${domain}/freizeit-kultur-und-tourismus/sport/sportstaetten/schwimmhallen`; + +const scrapeDetailsUrl = { + list: { + listItem: '.project-attributes', + data: { + title: { + selector: 'h3 a' + }, + detailsUrl: { + selector: 'h3 a', + attr: "href" + } + } + } +} + +const scrapeDetailsData = { + address: { + selector: '.t3booking-t3booking-main-content p', + eq: 0, + convert: value => value.replace(' 04', ', 04').replace('Im Stadtplan anzeigen', '') + } +} + +const scrapeDetailsUrlIndoor = scrapeIt(url_indoor_pool, scrapeDetailsUrl); + +const handleScrapeResponse = async ([indoor_pool]) => { + let data = [ + ...indoor_pool.data.list.map(sh => ({ ...sh, type: 'indoor_pool' })), + ] + let element = data[0]; + scrapeIt(`${domain}${element.detailsUrl}`, scrapeDetailsData).then(details => { + console.log(details.data); + const result = { + title: element.title, + address: details.data.address, + } + console.log(JSON.stringify(result, null, 2)); + }) +}; + +Promise.all([scrapeDetailsUrlIndoor]).then(handleScrapeResponse)