From 10356bd93729f74a39d9b6ad620673e1d4dc3d32 Mon Sep 17 00:00:00 2001
From: Chris
Date: Mon, 7 Aug 2023 00:36:05 +0200
Subject: [PATCH 01/18] implemented improved German name scraper

---
 scraper/src/helpers/helpers.js |  40 +------------
 scraper/src/merge_datasets.js  | 103 +++++++++++++++++++++++++++------
 2 files changed, 87 insertions(+), 56 deletions(-)

diff --git a/scraper/src/helpers/helpers.js b/scraper/src/helpers/helpers.js
index ffc6eac72..f13eb5c7a 100644
--- a/scraper/src/helpers/helpers.js
+++ b/scraper/src/helpers/helpers.js
@@ -1,5 +1,3 @@
-import axios from "axios";
-
 /**
  * Sanitizes the column names of the json array
  *
@@ -103,40 +101,4 @@ function getSpreadEnumTyp(spread) {
   return value <= 0.15 ? "narrow" : value <= 0.61 ? "medium" : "wide";
 }
 
-/**
- * Fetches the German name of the plant from wikidata API.
- *
- * @param {*} binomialName
- * @returns {Promise<string>} German name of the plant
- */
-async function fetchGermanName(binomialName) {
-  try {
-    const url = `https://www.wikidata.org/w/api.php?action=wbsearchentities&search=${binomialName}&language=en&format=json`;
-    const response = await axios.get(url);
-    const data = response.data;
-    const results = data.search;
-    if (results.length === 0) {
-      return null;
-    }
-    const result = results[0];
-    const id = result.id;
-    const url2 = `https://www.wikidata.org/w/api.php?action=wbgetentities&ids=${id}&languages=de&format=json`;
-    const response2 = await axios.get(url2);
-    const data2 = response2.data;
-    const entities = data2.entities;
-    const entity = entities[id];
-    const dewiki = await entity["sitelinks"]["dewiki"];
-    if (dewiki) {
-      return dewiki.title;
-    }
-  } catch (error) {}
-  return null;
-}
-
-export {
-  sanitizeColumnNames,
-  getSoilPH,
-  getHeightEnumTyp,
-  getSpreadEnumTyp,
-  fetchGermanName,
-};
+export { sanitizeColumnNames, getSoilPH, getHeightEnumTyp, getSpreadEnumTyp };
diff --git a/scraper/src/merge_datasets.js b/scraper/src/merge_datasets.js
index 5249bcdf3..781e3e09e 100644
--- a/scraper/src/merge_datasets.js
+++ b/scraper/src/merge_datasets.js
@@ -2,28 +2,96 @@ import fs from "fs";
 import { parse as json2csv } from "json2csv";
 import csv from "csvtojson";
 import permapeopleColumnMapping from "./helpers/column_mapping_permapeople.js";
+import axios from "axios";
+import axiosRetry from "axios-retry";
 import {
   sanitizeColumnNames,
   getSoilPH,
   getHeightEnumTyp,
   getSpreadEnumTyp,
-  fetchGermanName,
 } from "./helpers/helpers.js";
 
 /**
- * Fetches the German names for the plants from Wikidata API
+ * Defines the amount of retries we do, if axios encounters errors during a HTTP GET Request.
+ * Increse Delay if we encounter error to prevent 429 Errors.
  */
-const fetchGermanNames = async (plants) => {
-  return Promise.all(
-    plants.map(async (plant) => {
-      if (plant["common_name_de"]) {
-        return plant;
-      }
-      const germanName = await fetchGermanName(plant["unique_name"]);
-      plant["common_name_de"] = germanName;
-      return plant;
-    })
-  );
+axiosRetry(axios, {
+  retries: 5, // number of retries
+  retryDelay: (retryCount) => {
+    return retryCount * 1000; // time interval between retries
+  },
+});
+
+/**
+ * Fetches the German name of the plant from wikidata API.
+ * Sets the 'common_name_de of every plant in the array.
+ * + * @param {*} plants[] + */ +const fetchDataForPlantsArray = async (plants) => { + let GermanNamesFound = 0; + console.log("[INFO] Start fetching German common Names!"); + for (const plant of plants) { + const unique_name = plant["unique_name"]; + if (unique_name == "") { + continue; + } + + try { + await axios + .get( + `https://www.wikidata.org/w/api.php?action=wbsearchentities&search=${unique_name}&language=en&format=json` + ) + .then((response) => { + const data = response.data; + const results = data.search; + + if (!results || results.length === 0) { + return null; + } + + const result = results[0]; + const id = result.id; + + axios + .get( + `https://www.wikidata.org/w/api.php?action=wbgetentities&ids=${id}&languages=de&format=json` + ) + .then((response) => { + const data = response.data; + const entities = data.entities; + const entity = entities[id]; + const dewiki = entity["sitelinks"]["dewiki"]; + + if (dewiki) { + const title = dewiki.title; + let germanName = title.replace(/ \(.*\)/, ""); + germanName = germanName.replace('"', ""); + + germanName = germanName.replace(unique_name, ""); + germanName = germanName.trim(); + if (!germanName || germanName === "true") { + return null; + } + GermanNamesFound++; + //console.log(`${unique_name} is a ${germanName}`) + plant["common_name_de"] = germanName; + } + }) + .catch((error) => { + // The second request fails + console.error("2. Request got an error " + error); + }); + }) + .catch((error) => { + // The first request fails + console.error("1. Request got an error " + error); + }); + } catch (error) { + console.error("Error", error.message); + } + } + console.log(`[INFO] Done! Found ${GermanNamesFound} German Names!`); }; /** @@ -36,10 +104,7 @@ const unifyValueFormat = (plants, columnMapping) => { const mappedColumns = Object.keys(columnMapping).filter( (key) => columnMapping[key] !== null ); - //console.log(mappedColumns) plants.forEach((plant) => { - //console.log(plant) - mappedColumns.forEach((column) => { if (plant[column]) { if (!!columnMapping[column]["valueMapping"]) { @@ -301,14 +366,18 @@ async function writePlantsToCsv(plants) { fs.mkdirSync("data"); } + //const PlantsNew = plants.slice(0, 100); + //let updatedPlants = unifyValueFormat(PlantsNew, permapeopleColumnMapping); + let updatedPlants = unifyValueFormat(plants, permapeopleColumnMapping); - updatedPlants = await fetchGermanNames(updatedPlants); + await fetchDataForPlantsArray(updatedPlants); console.log("[INFO] Writing merged dataset to CSV file..."); console.log("[INFO] Total number of plants: ", updatedPlants.length); const csv = json2csv(updatedPlants); + //fs.writeFileSync("data/mergedDatasets2.csv", csv); fs.writeFileSync("data/mergedDatasets.csv", csv); return updatedPlants; From 6c400957baa3cf3fae63dd9dbe0a417f38a1431d Mon Sep 17 00:00:00 2001 From: Chris Date: Mon, 7 Aug 2023 00:37:55 +0200 Subject: [PATCH 02/18] add axios-retry to package.json --- scraper/package-lock.json | 64 +++++++++++++++++++++++++++++++++++++++ scraper/package.json | 1 + 2 files changed, 65 insertions(+) diff --git a/scraper/package-lock.json b/scraper/package-lock.json index 7205033a1..99fe8629e 100644 --- a/scraper/package-lock.json +++ b/scraper/package-lock.json @@ -11,6 +11,7 @@ "dependencies": { "@playwright/test": "^1.32.0", "axios": "^1.3.4", + "axios-retry": "^3.6.0", "csvtojson": "^2.0.10", "dotenv": "^16.0.3", "json2csv": "^6.0.0-alpha.2", @@ -20,6 +21,17 @@ "playwright": "^1.32.2" } }, + "node_modules/@babel/runtime": { + "version": "7.22.6", + "resolved": 
"https://registry.npmjs.org/@babel/runtime/-/runtime-7.22.6.tgz", + "integrity": "sha512-wDb5pWm4WDdF6LFUde3Jl8WzPA+3ZbxYqkC6xAXuD3irdEHN1k0NfTRrJD8ZD378SJ61miMLCqIOXYhd8x+AJQ==", + "dependencies": { + "regenerator-runtime": "^0.13.11" + }, + "engines": { + "node": ">=6.9.0" + } + }, "node_modules/@playwright/test": { "version": "1.32.3", "resolved": "https://registry.npmjs.org/@playwright/test/-/test-1.32.3.tgz", @@ -71,6 +83,15 @@ "proxy-from-env": "^1.1.0" } }, + "node_modules/axios-retry": { + "version": "3.6.0", + "resolved": "https://registry.npmjs.org/axios-retry/-/axios-retry-3.6.0.tgz", + "integrity": "sha512-jtH4qWTKZ2a17dH6tjq52Y1ssNV0lKge6/Z9Lw67s9Wt01nGTg4hg7/LJBGYfDci44NTANJQlCPHPOT/TSFm9w==", + "dependencies": { + "@babel/runtime": "^7.15.4", + "is-retry-allowed": "^2.2.0" + } + }, "node_modules/bluebird": { "version": "3.7.2", "resolved": "https://registry.npmjs.org/bluebird/-/bluebird-3.7.2.tgz", @@ -281,6 +302,17 @@ "he": "bin/he" } }, + "node_modules/is-retry-allowed": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/is-retry-allowed/-/is-retry-allowed-2.2.0.tgz", + "integrity": "sha512-XVm7LOeLpTW4jV19QSH38vkswxoLud8sQ57YwJVTPWdiaI9I8keEhGFpBlslyVsgdQy4Opg8QOLb8YRgsyZiQg==", + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "node_modules/is-utf8": { "version": "0.2.1", "resolved": "https://registry.npmjs.org/is-utf8/-/is-utf8-0.2.1.tgz", @@ -544,6 +576,11 @@ "resolved": "https://registry.npmjs.org/proxy-from-env/-/proxy-from-env-1.1.0.tgz", "integrity": "sha512-D+zkORCbA9f1tdWRK0RaCR3GPv50cMxcrz4X8k5LTSUD1Dkw47mKJEZQNunItRTkWwgtaUSo1RVFRIG9ZXiFYg==" }, + "node_modules/regenerator-runtime": { + "version": "0.13.11", + "resolved": "https://registry.npmjs.org/regenerator-runtime/-/regenerator-runtime-0.13.11.tgz", + "integrity": "sha512-kY1AZVr2Ra+t+piVaJ4gxaFaReZVH40AKNo7UCX6W+dEwBo/2oZJzqfuN1qLq1oL45o56cPaTXELwrTh8Fpggg==" + }, "node_modules/spex": { "version": "3.2.0", "resolved": "https://registry.npmjs.org/spex/-/spex-3.2.0.tgz", @@ -592,6 +629,14 @@ } }, "dependencies": { + "@babel/runtime": { + "version": "7.22.6", + "resolved": "https://registry.npmjs.org/@babel/runtime/-/runtime-7.22.6.tgz", + "integrity": "sha512-wDb5pWm4WDdF6LFUde3Jl8WzPA+3ZbxYqkC6xAXuD3irdEHN1k0NfTRrJD8ZD378SJ61miMLCqIOXYhd8x+AJQ==", + "requires": { + "regenerator-runtime": "^0.13.11" + } + }, "@playwright/test": { "version": "1.32.3", "resolved": "https://registry.npmjs.org/@playwright/test/-/test-1.32.3.tgz", @@ -632,6 +677,15 @@ "proxy-from-env": "^1.1.0" } }, + "axios-retry": { + "version": "3.6.0", + "resolved": "https://registry.npmjs.org/axios-retry/-/axios-retry-3.6.0.tgz", + "integrity": "sha512-jtH4qWTKZ2a17dH6tjq52Y1ssNV0lKge6/Z9Lw67s9Wt01nGTg4hg7/LJBGYfDci44NTANJQlCPHPOT/TSFm9w==", + "requires": { + "@babel/runtime": "^7.15.4", + "is-retry-allowed": "^2.2.0" + } + }, "bluebird": { "version": "3.7.2", "resolved": "https://registry.npmjs.org/bluebird/-/bluebird-3.7.2.tgz", @@ -761,6 +815,11 @@ "resolved": "https://registry.npmjs.org/he/-/he-1.2.0.tgz", "integrity": "sha512-F/1DnUGPopORZi0ni+CvrCgHQ5FyEAHRLSApuYWMmrbSwoN2Mn/7k+Gl38gJnR7yyDZk6WLXwiGod1JOWNDKGw==" }, + "is-retry-allowed": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/is-retry-allowed/-/is-retry-allowed-2.2.0.tgz", + "integrity": "sha512-XVm7LOeLpTW4jV19QSH38vkswxoLud8sQ57YwJVTPWdiaI9I8keEhGFpBlslyVsgdQy4Opg8QOLb8YRgsyZiQg==" + }, "is-utf8": { "version": "0.2.1", "resolved": 
"https://registry.npmjs.org/is-utf8/-/is-utf8-0.2.1.tgz", @@ -948,6 +1007,11 @@ "resolved": "https://registry.npmjs.org/proxy-from-env/-/proxy-from-env-1.1.0.tgz", "integrity": "sha512-D+zkORCbA9f1tdWRK0RaCR3GPv50cMxcrz4X8k5LTSUD1Dkw47mKJEZQNunItRTkWwgtaUSo1RVFRIG9ZXiFYg==" }, + "regenerator-runtime": { + "version": "0.13.11", + "resolved": "https://registry.npmjs.org/regenerator-runtime/-/regenerator-runtime-0.13.11.tgz", + "integrity": "sha512-kY1AZVr2Ra+t+piVaJ4gxaFaReZVH40AKNo7UCX6W+dEwBo/2oZJzqfuN1qLq1oL45o56cPaTXELwrTh8Fpggg==" + }, "spex": { "version": "3.2.0", "resolved": "https://registry.npmjs.org/spex/-/spex-3.2.0.tgz", diff --git a/scraper/package.json b/scraper/package.json index 6a95cb0b2..ae995437d 100644 --- a/scraper/package.json +++ b/scraper/package.json @@ -22,6 +22,7 @@ "dependencies": { "@playwright/test": "^1.32.0", "axios": "^1.3.4", + "axios-retry": "^3.6.0", "csvtojson": "^2.0.10", "dotenv": "^16.0.3", "json2csv": "^6.0.0-alpha.2", From 37bd431ce40126722f1a5c847b4b52380bc1c576 Mon Sep 17 00:00:00 2001 From: Chris Date: Mon, 7 Aug 2023 00:38:48 +0200 Subject: [PATCH 03/18] removed debug logging. --- scraper/src/merge_datasets.js | 5 ----- 1 file changed, 5 deletions(-) diff --git a/scraper/src/merge_datasets.js b/scraper/src/merge_datasets.js index 781e3e09e..508572391 100644 --- a/scraper/src/merge_datasets.js +++ b/scraper/src/merge_datasets.js @@ -74,7 +74,6 @@ const fetchDataForPlantsArray = async (plants) => { return null; } GermanNamesFound++; - //console.log(`${unique_name} is a ${germanName}`) plant["common_name_de"] = germanName; } }) @@ -366,9 +365,6 @@ async function writePlantsToCsv(plants) { fs.mkdirSync("data"); } - //const PlantsNew = plants.slice(0, 100); - //let updatedPlants = unifyValueFormat(PlantsNew, permapeopleColumnMapping); - let updatedPlants = unifyValueFormat(plants, permapeopleColumnMapping); await fetchDataForPlantsArray(updatedPlants); @@ -377,7 +373,6 @@ async function writePlantsToCsv(plants) { console.log("[INFO] Total number of plants: ", updatedPlants.length); const csv = json2csv(updatedPlants); - //fs.writeFileSync("data/mergedDatasets2.csv", csv); fs.writeFileSync("data/mergedDatasets.csv", csv); return updatedPlants; From 07ee9cb08ae07db94769e3122a1a9c64391a52a7 Mon Sep 17 00:00:00 2001 From: Chris Date: Mon, 7 Aug 2023 00:38:57 +0200 Subject: [PATCH 04/18] added information to readme --- scraper/README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scraper/README.md b/scraper/README.md index c56e0eb55..a0bab05d7 100644 --- a/scraper/README.md +++ b/scraper/README.md @@ -89,6 +89,10 @@ This can be done with the following command: npm run merge:datasets ``` +It iteratively gets the German common names from https://www.wikidata.org, a third-party website, to prevent 429 errors by sending too many requests in a short time. + +This may take quite a while, so don't stop the program while running. + 3. Correct data manually before the insertion (optional) The scraped data can contain inconsistencies and errors. From 626ffb38c59b9056ff29d41b17825299b16a3f63 Mon Sep 17 00:00:00 2001 From: Chris Date: Mon, 7 Aug 2023 00:42:57 +0200 Subject: [PATCH 05/18] add entry to changelog. 
closes #750 --- doc/CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/CHANGELOG.md b/doc/CHANGELOG.md index 5ed47c33d..1da0c93e5 100644 --- a/doc/CHANGELOG.md +++ b/doc/CHANGELOG.md @@ -17,7 +17,7 @@ Syntax: `- short text describing the change _(Your Name)_` - _()_ - _()_ - _()_ -- _()_ +- Improved the scraper: Fixed a bug and improved cleaning for German common names _(temmey)_ - _()_ - _()_ - _()_ From cd68a86bd6b4c8118e7ef1501b301e696b87fb8e Mon Sep 17 00:00:00 2001 From: Chris Date: Tue, 8 Aug 2023 00:24:22 +0200 Subject: [PATCH 06/18] fixed scraper readme --- scraper/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scraper/README.md b/scraper/README.md index a0bab05d7..3ed0a59a9 100644 --- a/scraper/README.md +++ b/scraper/README.md @@ -89,7 +89,7 @@ This can be done with the following command: npm run merge:datasets ``` -It iteratively gets the German common names from https://www.wikidata.org, a third-party website, to prevent 429 errors by sending too many requests in a short time. +It iteratively gets the German common names from https://www.wikidata.org to prevent 429 errors by sending too many requests in a short time. This may take quite a while, so don't stop the program while running. From 005c1f6e1b1147bd20786c4012ded36b0079f316 Mon Sep 17 00:00:00 2001 From: Chris Date: Tue, 8 Aug 2023 02:59:52 +0200 Subject: [PATCH 07/18] improve scraper, faster, more names, cleaner --- scraper/src/helpers/helpers.js | 25 ++++- scraper/src/merge_datasets.js | 170 +++++++++++++++++++++++---------- 2 files changed, 143 insertions(+), 52 deletions(-) diff --git a/scraper/src/helpers/helpers.js b/scraper/src/helpers/helpers.js index f13eb5c7a..b8f499857 100644 --- a/scraper/src/helpers/helpers.js +++ b/scraper/src/helpers/helpers.js @@ -1,3 +1,20 @@ +/** + * Capitalizes the first character of every word in a string. + * + * @param {string} str - The input string. + * @returns {string} The string with first characters of every word capitalized. + */ +function capitalizeWords(str) { + const wordsArray = str.split(" "); + + for (let i = 0; i < wordsArray.length; i++) { + const word = wordsArray[i]; + wordsArray[i] = word.charAt(0).toUpperCase() + word.slice(1); + } + + return wordsArray.join(" "); +} + /** * Sanitizes the column names of the json array * @@ -101,4 +118,10 @@ function getSpreadEnumTyp(spread) { return value <= 0.15 ? "narrow" : value <= 0.61 ? "medium" : "wide"; } -export { sanitizeColumnNames, getSoilPH, getHeightEnumTyp, getSpreadEnumTyp }; +export { + sanitizeColumnNames, + getSoilPH, + getHeightEnumTyp, + getSpreadEnumTyp, + capitalizeWords, +}; diff --git a/scraper/src/merge_datasets.js b/scraper/src/merge_datasets.js index 508572391..e2b07723b 100644 --- a/scraper/src/merge_datasets.js +++ b/scraper/src/merge_datasets.js @@ -9,6 +9,7 @@ import { getSoilPH, getHeightEnumTyp, getSpreadEnumTyp, + capitalizeWords, } from "./helpers/helpers.js"; /** @@ -18,79 +19,146 @@ import { axiosRetry(axios, { retries: 5, // number of retries retryDelay: (retryCount) => { - return retryCount * 1000; // time interval between retries + console.log(`retry attempt: ${retryCount}`); + return retryCount * 3000; // time interval between retries }, }); /** - * Fetches the German name of the plant from wikidata API. - * Sets the 'common_name_de of every plant in the array. + * Fetches the German name of the plant from the Wikidata API. + * Sets the 'common_name_de' property of every plant in the array. 
* - * @param {*} plants[] + * @param {string[]} germanNames - An array with German plant names. + * @param {string} unique_name - A plant name to filter if it's in the German names. */ -const fetchDataForPlantsArray = async (plants) => { - let GermanNamesFound = 0; - console.log("[INFO] Start fetching German common Names!"); - for (const plant of plants) { - const unique_name = plant["unique_name"]; - if (unique_name == "") { +const filterGermanNames = (germanNames, unique_name) => { + const unique_name_filter = unique_name.toLowerCase(); + + const cleanedGermanNames = []; + + for (const singleGermanName of germanNames) { + let germanName = singleGermanName.toLowerCase(); + + germanName = germanName.replace(/ \(.*\)/, ""); + //remove special characters + germanName = germanName.replace(/['"`-]/, ""); + germanName = germanName.trim(); + if (germanName.includes(unique_name_filter)) { continue; } + if (germanName.length !== 0 && germanName !== "true") { + cleanedGermanNames.push(capitalizeWords(germanName)); + } + } + //remove duplicates + const uniqueNameSet = new Set(cleanedGermanNames); + + return Array.from(uniqueNameSet); +}; + +/** + * Fetches the German name of the plant from the Wikidata API. + * Sets the 'common_name_de' property of every plant in the array. + * + * @param {Array} plants - An array containing plant objects. + */ +const fetchGermanNamesForPlantsConcurrent = async (plants) => { + const MAX_CONCURRENT_REQUESTS = 25; + let GermanNamesFound = 0; + + console.log("[INFO] Start fetching German common names!"); + + const processPlants = async (plants) => { + for (const plant of plants) { + const unique_name = plant["unique_name"]; + const germanNames = []; + + if ( + plant.common_name_de && + plant.common_name_de != "" && + plant.common_name_de !== "true" + ) { + const existingGermanNames = plant.common_name_de.split(","); + existingGermanNames.forEach((existingGermanName) => { + germanNames.push(existingGermanName); + }); + } + + plant.common_name_de = null; - try { await axios .get( - `https://www.wikidata.org/w/api.php?action=wbsearchentities&search=${unique_name}&language=en&format=json` + `https://www.wikidata.org/w/api.php?action=wbgetentities&sites=enwiki&titles=${unique_name}&normalize=&languages=de&format=json` ) .then((response) => { const data = response.data; - const results = data.search; + const entities = data.entities; - if (!results || results.length === 0) { - return null; + if (entities[-1]) { + // If there is a -1, the site doesn't exist + // console.log(`Did not find ${unique_name}`) + return; } - const result = results[0]; - const id = result.id; - - axios - .get( - `https://www.wikidata.org/w/api.php?action=wbgetentities&ids=${id}&languages=de&format=json` - ) - .then((response) => { - const data = response.data; - const entities = data.entities; - const entity = entities[id]; - const dewiki = entity["sitelinks"]["dewiki"]; - - if (dewiki) { - const title = dewiki.title; - let germanName = title.replace(/ \(.*\)/, ""); - germanName = germanName.replace('"', ""); - - germanName = germanName.replace(unique_name, ""); - germanName = germanName.trim(); - if (!germanName || germanName === "true") { - return null; - } - GermanNamesFound++; - plant["common_name_de"] = germanName; - } - }) - .catch((error) => { - // The second request fails - console.error("2. 
Request got an error " + error); + const keys = Object.keys(entities); + const entity = entities[keys[0]]; + + const label_entity = entity.labels; + if (label_entity && label_entity.de) { + germanNames.push(label_entity.de.value); + } else { + return; + } + + const dewiki_entity = entity.sitelinks.dewiki; + if (dewiki_entity) { + germanNames.push(dewiki_entity.title); + } + + const aliase_entities = entity.aliases.de; + if (aliase_entities) { + aliase_entities.forEach((alias) => { + germanNames.push(alias.value); }); + } + + if (germanNames.length === 0) { + return; + } + + const cleanedGermanNames = filterGermanNames( + germanNames, + unique_name + ); + + if ( + cleanedGermanNames.length > 0 && + Array.isArray(cleanedGermanNames) + ) { + GermanNamesFound++; + plant["common_name_de"] = cleanedGermanNames; + } }) .catch((error) => { - // The first request fails - console.error("1. Request got an error " + error); + console.error( + `[ERROR] Could not get German names from "${unique_name}": `, + error.message + ); }); - } catch (error) { - console.error("Error", error.message); } + }; + + // Chunk the plants into batches of MAX_CONCURRENT_REQUESTS + const chunks = []; + const chunk_size = plants.length / MAX_CONCURRENT_REQUESTS + 1; + for (let i = 0; i < plants.length; i += chunk_size) { + chunks.push(plants.slice(i, i + chunk_size)); } - console.log(`[INFO] Done! Found ${GermanNamesFound} German Names!`); + + // Process each chunk concurrently using Promise.all + await Promise.all(chunks.map((chunk) => processPlants(chunk))); + + console.log(`[INFO] Done! Found ${GermanNamesFound} German names!`); }; /** @@ -367,7 +435,7 @@ async function writePlantsToCsv(plants) { let updatedPlants = unifyValueFormat(plants, permapeopleColumnMapping); - await fetchDataForPlantsArray(updatedPlants); + await fetchGermanNamesForPlantsConcurrent(updatedPlants); console.log("[INFO] Writing merged dataset to CSV file..."); console.log("[INFO] Total number of plants: ", updatedPlants.length); From a51939dd24d53d29d170b02e8b02c46a2af3873a Mon Sep 17 00:00:00 2001 From: Chris Date: Tue, 8 Aug 2023 03:01:29 +0200 Subject: [PATCH 08/18] updated readme --- scraper/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/scraper/README.md b/scraper/README.md index 3ed0a59a9..da4936fbb 100644 --- a/scraper/README.md +++ b/scraper/README.md @@ -98,6 +98,7 @@ This may take quite a while, so don't stop the program while running. The scraped data can contain inconsistencies and errors. In order to correct these mistakes, we can manually correct the data i.e. change the values in the `mergedDatasets.csv` file. The corrected data in the new file should be stored in the same format as the generated data i.e. columns may not be changed. +If it starts throwing 429 errors, reduce MAX_CONCURRENT_REQUESTS to a lower number, such as 10. 4. Insert the data into the database From 196ebd9cc9246ebb0f43afb5957e97916a73fcde Mon Sep 17 00:00:00 2001 From: Chris Date: Tue, 8 Aug 2023 03:25:12 +0200 Subject: [PATCH 09/18] removed hybrid x, small cleanup --- scraper/src/merge_datasets.js | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/scraper/src/merge_datasets.js b/scraper/src/merge_datasets.js index e2b07723b..f74a7cefc 100644 --- a/scraper/src/merge_datasets.js +++ b/scraper/src/merge_datasets.js @@ -32,7 +32,9 @@ axiosRetry(axios, { * @param {string} unique_name - A plant name to filter if it's in the German names. 
*/ const filterGermanNames = (germanNames, unique_name) => { - const unique_name_filter = unique_name.toLowerCase(); + let unique_name_filter = unique_name.toLowerCase(); + unique_name_filter = unique_name_filter.replace(/(? { germanName = germanName.replace(/ \(.*\)/, ""); //remove special characters germanName = germanName.replace(/['"`-]/, ""); + //remove hybrid x or × symbol + germanName = germanName.replace(/(? { const unique_name = plant["unique_name"]; const germanNames = []; - if ( - plant.common_name_de && - plant.common_name_de != "" && - plant.common_name_de !== "true" - ) { + if (plant.common_name_de && plant.common_name_de !== "true") { const existingGermanNames = plant.common_name_de.split(","); existingGermanNames.forEach((existingGermanName) => { germanNames.push(existingGermanName); @@ -131,10 +133,7 @@ const fetchGermanNamesForPlantsConcurrent = async (plants) => { unique_name ); - if ( - cleanedGermanNames.length > 0 && - Array.isArray(cleanedGermanNames) - ) { + if (cleanedGermanNames.length > 0) { GermanNamesFound++; plant["common_name_de"] = cleanedGermanNames; } From fe7579e5b0316355baefb3eaecc2a6bce815a231 Mon Sep 17 00:00:00 2001 From: Chris Date: Tue, 8 Aug 2023 10:24:58 +0200 Subject: [PATCH 10/18] replaced "" with null for csv --- scraper/src/merge_datasets.js | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/scraper/src/merge_datasets.js b/scraper/src/merge_datasets.js index f74a7cefc..abc1b611b 100644 --- a/scraper/src/merge_datasets.js +++ b/scraper/src/merge_datasets.js @@ -422,6 +422,22 @@ async function mergeDatasets() { return allPlants; } +/** + * Cleans up a JSON array for smoother CSV export. + * + * @param {Array} plants - Array of plants + */ +function cleanUpJsonForCsv(plants) { + const columns = Object.keys(plants[0]); + plants.forEach((plant) => { + columns.forEach((column) => { + if (plant[column] === "") { + plant[column] = null; + } + }); + }); +} + /** * The function writes the merged dataset to a CSV file. * @@ -433,6 +449,7 @@ async function writePlantsToCsv(plants) { } let updatedPlants = unifyValueFormat(plants, permapeopleColumnMapping); + cleanUpJsonForCsv(updatedPlants); await fetchGermanNamesForPlantsConcurrent(updatedPlants); From 13c934f23627cefbfa8bf41450dbcb4b3de04985 Mon Sep 17 00:00:00 2001 From: Chris Date: Sat, 12 Aug 2023 21:09:28 +0200 Subject: [PATCH 11/18] fixed grammer in readme --- scraper/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scraper/README.md b/scraper/README.md index da4936fbb..5699025d8 100644 --- a/scraper/README.md +++ b/scraper/README.md @@ -89,7 +89,7 @@ This can be done with the following command: npm run merge:datasets ``` -It iteratively gets the German common names from https://www.wikidata.org to prevent 429 errors by sending too many requests in a short time. +Iteratively gets the German common names from https://www.wikidata.org to prevent 429 errors by sending too many requests in a short time. This may take quite a while, so don't stop the program while running. 
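The retry and chunking changes above reduce to one pattern: linear backoff on 429 responses via axios-retry, plus splitting the plants array into slices that are worked through sequentially while the slices themselves run in parallel, so at most `MAX_CONCURRENT_REQUESTS` requests are in flight at once. Below is a minimal, self-contained sketch of that pattern, not the patches' exact code: `fetchOne` and `processConcurrently` are hypothetical names used for illustration, and the chunk-size arithmetic uses `Math.ceil` where the patches use `plants.length / MAX_CONCURRENT_REQUESTS + 1`.

```js
import axios from "axios";
import axiosRetry from "axios-retry";

// Retry up to five times, waiting 3s, 6s, 9s, ... between attempts,
// but only when Wikidata answers 429 (Too Many Requests).
axiosRetry(axios, {
  retries: 5,
  retryDelay: (retryCount) => retryCount * 3000,
  retryCondition: (error) => !!error.response && error.response.status === 429,
});

// Hypothetical per-plant request against the wbgetentities endpoint.
async function fetchOne(plant) {
  const url =
    "https://www.wikidata.org/w/api.php?action=wbgetentities&sites=enwiki" +
    `&titles=${plant.unique_name}&normalize=&languages=de&format=json`;
  const { data } = await axios.get(url);
  return data;
}

// Split the plants into at most maxConcurrent slices. Requests inside a
// slice run one after another, so the slice count caps the number of
// requests in flight at the same time.
async function processConcurrently(plants, maxConcurrent = 25) {
  const chunkSize = Math.ceil(plants.length / maxConcurrent);
  const chunks = [];
  for (let i = 0; i < plants.length; i += chunkSize) {
    chunks.push(plants.slice(i, i + chunkSize));
  }
  await Promise.all(
    chunks.map(async (chunk) => {
      for (const plant of chunk) {
        await fetchOne(plant);
      }
    })
  );
}
```

Lowering `maxConcurrent` (the sketch's stand-in for `MAX_CONCURRENT_REQUESTS`) is the knob to turn if Wikidata still responds with 429 despite the retries.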
From 73633536ea85f8b1c1a83d30e6f72a7044878716 Mon Sep 17 00:00:00 2001 From: Chris Date: Sat, 12 Aug 2023 22:37:44 +0200 Subject: [PATCH 12/18] applied requested changes --- scraper/src/merge_datasets.js | 222 +++++++++++++++++++++------------- 1 file changed, 137 insertions(+), 85 deletions(-) diff --git a/scraper/src/merge_datasets.js b/scraper/src/merge_datasets.js index abc1b611b..5ae2c9837 100644 --- a/scraper/src/merge_datasets.js +++ b/scraper/src/merge_datasets.js @@ -12,6 +12,8 @@ import { capitalizeWords, } from "./helpers/helpers.js"; +let GermanNamesFound = 0; + /** * Defines the amount of retries we do, if axios encounters errors during a HTTP GET Request. * Increse Delay if we encounter error to prevent 429 Errors. @@ -22,6 +24,9 @@ axiosRetry(axios, { console.log(`retry attempt: ${retryCount}`); return retryCount * 3000; // time interval between retries }, + retryCondition: (error) => { + return error.response.status === 429; + }, }); /** @@ -35,7 +40,7 @@ const filterGermanNames = (germanNames, unique_name) => { let unique_name_filter = unique_name.toLowerCase(); unique_name_filter = unique_name_filter.replace(/(? { unique_name_filter = unique_name_filter.replace(/\s+/g, " "); germanName = germanName.trim(); - if (germanName.includes(unique_name_filter)) { + if ( + unique_names.some((unique_name_part) => + germanName.includes(unique_name_part) + ) + ) { continue; } + if (germanName.length !== 0 && germanName !== "true") { cleanedGermanNames.push(capitalizeWords(germanName)); } @@ -62,90 +72,110 @@ const filterGermanNames = (germanNames, unique_name) => { return Array.from(uniqueNameSet); }; +/** + * Handles the response from Wikidata. + * Gets the German label, dewiki, and alias entries and adds them to the found possible German names. + * + * @param {*} response - Response object from the axios function. + * @param {Array} germanNames - An array containing the currently found German names. + */ +const handleResponseAddGermanNamesFound = (response, germanNames) => { + const data = response.data; + + if (data.normalized) { + //This isn't the correct plant, it got redirected to somewhere else. + return; + } + + const entities = data.entities; + + if (entities[-1]) { + // If there is a -1, the site doesn't exist + return; + } + + const keys = Object.keys(entities); + const entity = entities[keys[0]]; + + const label_entity = entity.labels; + if (label_entity && label_entity.de) { + germanNames.push(label_entity.de.value); + } else { + //If there is no label for the plant, then there are no other relevant german common names. + return; + } + + const dewiki_entity = entity.sitelinks.dewiki; + if (dewiki_entity) { + germanNames.push(dewiki_entity.title); + } + + const aliase_entities = entity.aliases.de; + if (aliase_entities) { + aliase_entities.forEach((alias) => { + germanNames.push(alias.value); + }); + } +}; + /** * Fetches the German name of the plant from the Wikidata API. * Sets the 'common_name_de' property of every plant in the array. + * Uses the current entry in 'common_name_de' and extends it with data found on Wikidata. * - * @param {Array} plants - An array containing plant objects. + * @param {Array} plants - An array containing a part of all plant objects. 
*/ -const fetchGermanNamesForPlantsConcurrent = async (plants) => { - const MAX_CONCURRENT_REQUESTS = 25; - let GermanNamesFound = 0; +const processPlants = async (plants) => { + for (const plant of plants) { + const unique_name = plant["unique_name"]; + const germanNames = []; + + if (plant.common_name_de && plant.common_name_de !== "true") { + const existingGermanNames = plant.common_name_de.split(","); + existingGermanNames.forEach((existingGermanName) => { + germanNames.push(existingGermanName); + }); + } - console.log("[INFO] Start fetching German common names!"); + plant.common_name_de = null; + + await axios + .get( + `https://www.wikidata.org/w/api.php?action=wbgetentities&sites=enwiki&titles=${unique_name}&normalize=&languages=de&format=json` + ) + .then((response) => { + handleResponseAddGermanNamesFound(response, germanNames); + }) + .catch((error) => { + console.error( + `[ERROR] Could not get German names for "${unique_name}" from Wikidata: `, + error.message + ); + }) + .finally(() => { + if (germanNames.length === 0) { + return; + } + const cleanedGermanNames = filterGermanNames(germanNames, unique_name); - const processPlants = async (plants) => { - for (const plant of plants) { - const unique_name = plant["unique_name"]; - const germanNames = []; + if (cleanedGermanNames.length > 0) { + GermanNamesFound++; + plant["common_name_de"] = cleanedGermanNames.join(", "); + } + }); + } +}; - if (plant.common_name_de && plant.common_name_de !== "true") { - const existingGermanNames = plant.common_name_de.split(","); - existingGermanNames.forEach((existingGermanName) => { - germanNames.push(existingGermanName); - }); - } +/** + * Sets the 'common_name_de' property of every plant in the array. + * Splits up the entire plants array into smaller arrays to improve performance for name fetching. + * + * @param {Array} plants - An array containing all plant objects. 
+ */ +const fetchGermanNamesForPlantsConcurrent = async (plants) => { + const MAX_CONCURRENT_REQUESTS = 25; - plant.common_name_de = null; - - await axios - .get( - `https://www.wikidata.org/w/api.php?action=wbgetentities&sites=enwiki&titles=${unique_name}&normalize=&languages=de&format=json` - ) - .then((response) => { - const data = response.data; - const entities = data.entities; - - if (entities[-1]) { - // If there is a -1, the site doesn't exist - // console.log(`Did not find ${unique_name}`) - return; - } - - const keys = Object.keys(entities); - const entity = entities[keys[0]]; - - const label_entity = entity.labels; - if (label_entity && label_entity.de) { - germanNames.push(label_entity.de.value); - } else { - return; - } - - const dewiki_entity = entity.sitelinks.dewiki; - if (dewiki_entity) { - germanNames.push(dewiki_entity.title); - } - - const aliase_entities = entity.aliases.de; - if (aliase_entities) { - aliase_entities.forEach((alias) => { - germanNames.push(alias.value); - }); - } - - if (germanNames.length === 0) { - return; - } - - const cleanedGermanNames = filterGermanNames( - germanNames, - unique_name - ); - - if (cleanedGermanNames.length > 0) { - GermanNamesFound++; - plant["common_name_de"] = cleanedGermanNames; - } - }) - .catch((error) => { - console.error( - `[ERROR] Could not get German names from "${unique_name}": `, - error.message - ); - }); - } - }; + console.log("[INFO] Start fetching German common names!"); // Chunk the plants into batches of MAX_CONCURRENT_REQUESTS const chunks = []; @@ -157,7 +187,9 @@ const fetchGermanNamesForPlantsConcurrent = async (plants) => { // Process each chunk concurrently using Promise.all await Promise.all(chunks.map((chunk) => processPlants(chunk))); - console.log(`[INFO] Done! Found ${GermanNamesFound} German names!`); + console.log( + `[INFO] Done! Found German names for ${GermanNamesFound} plants!` + ); }; /** @@ -215,6 +247,13 @@ const unifyValueFormat = (plants, columnMapping) => { plant["spread"] = getSpreadEnumTyp(plant["width"]); } + if (plant["unique_name"].startsWith("Solanum lycopersicum (Brandywine)")) { + plant["unique_name"] = plant["unique_name"].replace( + "Solanum lycopersicum (Brandywine)", + "Solanum lycopersicum 'Brandywine'" + ); + } + if (plant["unique_name"].startsWith("Papaver somnif. paeonifl.")) { plant["unique_name"] = plant["unique_name"].replace( "Papaver somnif. paeonifl.", @@ -423,18 +462,29 @@ async function mergeDatasets() { } /** - * Cleans up a JSON array for smoother CSV export. + * Cleans up a JSON array entries for smoother CSV export. 
* * @param {Array} plants - Array of plants */ function cleanUpJsonForCsv(plants) { const columns = Object.keys(plants[0]); - plants.forEach((plant) => { - columns.forEach((column) => { - if (plant[column] === "") { - plant[column] = null; - } - }); + plants.forEach((plant, index) => { + if ( + plant["unique_name"].startsWith("Cimicifuga racemosa (Actaea racemosa)") + ) { + plants.splice(index, 1); // Remove the object that meets the delete condition + } else if (plant["unique_name"].startsWith("'Clover Grass'")) { + plants.splice(index, 1); // Remove the object that meets the delete condition + } else if (plant["unique_name"].startsWith("'Kleegras'")) { + plants.splice(index, 1); // Remove the object that meets the delete condition + } else { + delete plant.subfamily; + columns.forEach((column) => { + if (plant[column] === "") { + plant[column] = null; + } + }); + } }); } @@ -448,6 +498,8 @@ async function writePlantsToCsv(plants) { fs.mkdirSync("data"); } + //const slicedPlants = plants.slice(0,100) + let updatedPlants = unifyValueFormat(plants, permapeopleColumnMapping); cleanUpJsonForCsv(updatedPlants); From 625b85295af180ec33506bf186db62970fedb64f Mon Sep 17 00:00:00 2001 From: Christoph Schreiner Date: Mon, 12 Feb 2024 07:50:35 +0000 Subject: [PATCH 13/18] Separate fetching of german common names and merging of datasets; Add apply overrides functionality --- scraper/package.json | 2 + scraper/src/apply_overrides.js | 110 ++++++++++++++ scraper/src/fetch_german_names.js | 218 +++++++++++++++++++++++++++ scraper/src/merge_datasets.js | 235 +----------------------------- 4 files changed, 336 insertions(+), 229 deletions(-) create mode 100644 scraper/src/apply_overrides.js create mode 100644 scraper/src/fetch_german_names.js diff --git a/scraper/package.json b/scraper/package.json index ae995437d..77aa80b3e 100644 --- a/scraper/package.json +++ b/scraper/package.json @@ -10,6 +10,8 @@ "merge:datasets": "node src/merge_datasets.js", "merge:reinsaat": "node src/merge_reinsaat.js", "merge:csvfiles": "node src/helpers/merge_csv_files.js", + "fetch:germannames": "node src/fetch_german_names.js", + "apply:overrides": "node src/apply_overrides.js", "insert:plants": "node src/insert_plants.js", "insert:relations": "node src/insert_plant_relations.js", "insert": "npm run insert:plants && npm run insert:relations", diff --git a/scraper/src/apply_overrides.js b/scraper/src/apply_overrides.js new file mode 100644 index 000000000..ee2562c0a --- /dev/null +++ b/scraper/src/apply_overrides.js @@ -0,0 +1,110 @@ +import fs from "fs"; +import path from "path"; +import { parse as json2csv } from "json2csv"; +import csv from "csvtojson"; + +const deletionsFile = "00_DELETIONS.csv"; + +async function loadMergedDataset() { + return csv().fromFile("data/mergedDatasets.csv"); +} + +async function applyDeletions(plants) { + console.log(`Deleting plants from data/overrides/${deletionsFile}`); + + const deletePlants = await csv().fromFile(`data/overrides/${deletionsFile}`); + + deletePlants.forEach((overridePlant) => { + // find the plant + const index = plants.findIndex( + (plant) => plant.unique_name === overridePlant.unique_name + ); + + if (index === -1) { + console.log( + `[INFO] Could not find plant with unique_name '${overridePlant.unique_name}' in merged dataset.` + ); + return; + } + + // delete the plant + plants.splice(index, 1); + }); + + return plants; +} + +async function applyOverrides(plants) { + if (!fs.existsSync("data/overrides")) { + fs.mkdirSync("data/overrides"); + } + + // list all csv files 
in data/overrides + const overrideFiles = fs.readdirSync("data/overrides"); + + // apply all overrides, deletions are handled separately + for (const file of overrideFiles) { + if (path.extname(file) !== ".csv" || file === deletionsFile) { + continue; + } + console.log(`Applying data/overrides/${file}`); + + const overridePlants = await csv().fromFile(`data/overrides/${file}`); + + overridePlants.forEach((overridePlant) => { + // find the plant + const index = plants.findIndex( + (plant) => plant.unique_name === overridePlant.unique_name + ); + + if (index === -1) { + console.log( + `[INFO] Could not find plant with unique_name '${overridePlant.unique_name}' in merged dataset.` + ); + return; + } + + // for each column in the override plant, update the plant + Object.keys(overridePlant).forEach((key) => { + if (key !== "unique_name") { + if (key === "new_unique_name") { + // actually override the unique name + key = "unique_name"; + } + + plants[index][key] = overridePlant[key].trim(); + } + }); + }); + } + + return plants; +} + +function cleanUpJsonForCsv(plants) { + const columns = Object.keys(plants[0]); + plants.forEach((plant, index) => { + columns.forEach((column) => { + if (plant[column] === "") { + plant[column] = null; + } + }); + }); + return plants; +} + +async function writePlantsToOverwriteCsv(plants) { + console.log("[INFO] Total number of plants: ", plants.length); + + console.log("[INFO] Writing plants to csv data/final_plants.csv"); + const csvFile = json2csv(cleanUpJsonForCsv(plants)); + fs.writeFileSync("data/final_plants.csv", csvFile); + + return plants; +} + +loadMergedDataset() + .then((plants) => applyDeletions(plants)) + .then((plants) => applyOverrides(plants)) + .then((plants) => writePlantsToOverwriteCsv(plants)) + .catch((error) => console.error(error)); diff --git a/scraper/src/fetch_german_names.js b/scraper/src/fetch_german_names.js new file mode 100644 index 000000000..121bc5b27 --- /dev/null +++ b/scraper/src/fetch_german_names.js @@ -0,0 +1,218 @@ +import axios from "axios"; +import axiosRetry from "axios-retry"; +import fs from "fs"; +import { parse as json2csv } from "json2csv"; +import csv from "csvtojson"; +import { capitalizeWords } from "./helpers/helpers.js"; + +let GermanNamesFound = 0; + +/** + * Defines the amount of retries we do, if axios encounters errors during a HTTP GET Request. + * Increse Delay if we encounter error to prevent 429 Errors. + */ +axiosRetry(axios, { + retries: 5, // number of retries + retryDelay: (retryCount) => { + console.log(`retry attempt: ${retryCount}`); + return retryCount * 3000; // time interval between retries + }, + retryCondition: (error) => { + return error.response.status === 429; + }, +}); + +/** + * Fetches the German name of the plant from the Wikidata API. + * Sets the 'common_name_de' property of every plant in the array. + * + * @param {string[]} germanNames - An array with German plant names. + * @param {string} unique_name - A plant name to filter if it's in the German names. + */ +const filterGermanNames = (germanNames, unique_name) => { + let unique_name_filter = unique_name.toLowerCase(); + unique_name_filter = unique_name_filter.replace(/(? + germanName.includes(unique_name_part) + ) + ) { + continue; + } + + if (germanName.length !== 0 && germanName !== "true") { + cleanedGermanNames.push(capitalizeWords(germanName)); + } + } + //remove duplicates + const uniqueNameSet = new Set(cleanedGermanNames); + + return Array.from(uniqueNameSet); +}; + +/** + * Handles the response from Wikidata. 
+ * Gets the German label, dewiki, and alias entries and adds them to the found possible German names. + * + * @param {*} response - Response object from the axios function. + * @param {Array} germanNames - An array containing the currently found German names. + */ +const handleResponseAddGermanNamesFound = (response, germanNames) => { + const data = response.data; + + if (data.normalized) { + //This isn't the correct plant, it got redirected to somewhere else. + return; + } + + const entities = data.entities; + + if (entities[-1]) { + // If there is a -1, the site doesn't exist + return; + } + + const keys = Object.keys(entities); + const entity = entities[keys[0]]; + + const label_entity = entity.labels; + if (label_entity && label_entity.de) { + germanNames.push(label_entity.de.value); + } else { + //If there is no label for the plant, then there are no other relevant german common names. + return; + } + + const dewiki_entity = entity.sitelinks.dewiki; + if (dewiki_entity) { + germanNames.push(dewiki_entity.title); + } + + const aliase_entities = entity.aliases.de; + if (aliase_entities) { + aliase_entities.forEach((alias) => { + germanNames.push(alias.value); + }); + } +}; + +/** + * Fetches the German name of the plant from the Wikidata API. + * Sets the 'common_name_de' property of every plant in the array. + * Uses the current entry in 'common_name_de' and extends it with data found on Wikidata. + * + * @param {Array} plants - An array containing a part of all plant objects. + */ +const processPlants = async (plants) => { + for (const plant of plants) { + const unique_name = plant["unique_name"]; + const germanNames = []; + + if (plant.common_name_de && plant.common_name_de !== "true") { + const existingGermanNames = plant.common_name_de.split(","); + existingGermanNames.forEach((existingGermanName) => { + germanNames.push(existingGermanName); + }); + } + + plant.common_name_de = null; + + await axios + .get( + `https://www.wikidata.org/w/api.php?action=wbgetentities&sites=enwiki&titles=${unique_name}&normalize=&languages=de&format=json` + ) + .then((response) => { + // log response code + handleResponseAddGermanNamesFound(response, germanNames); + }) + .catch((error) => { + console.error( + `[ERROR] Could not get German names for "${unique_name}" from Wikidata: `, + error.message + ); + }) + .finally(() => { + if (germanNames.length === 0) { + return; + } + const cleanedGermanNames = filterGermanNames(germanNames, unique_name); + + if (cleanedGermanNames.length > 0) { + GermanNamesFound++; + plant["common_name_de"] = cleanedGermanNames.join(", "); + } + }); + } +}; + +/** + * Sets the 'common_name_de' property of every plant in the array. + * Splits up the entire plants array into smaller arrays to improve performance for name fetching. + * + * @param {Array} plants - An array containing all plant objects. + */ +const fetchGermanNamesForPlantsConcurrent = async (plants) => { + const MAX_CONCURRENT_REQUESTS = 25; + + console.log("[INFO] Start fetching German common names!"); + + // Chunk the plants into batches of MAX_CONCURRENT_REQUESTS + const chunks = []; + const chunk_size = plants.length / MAX_CONCURRENT_REQUESTS + 1; + for (let i = 0; i < plants.length; i += chunk_size) { + chunks.push(plants.slice(i, i + chunk_size)); + } + + // Process each chunk concurrently using Promise.all + await Promise.all(chunks.map((chunk) => processPlants(chunk))); + + console.log( + `[INFO] Done! 
Found German names for ${GermanNamesFound} plants!` + ); +}; + +async function fetchGermanNames() { + if (!fs.existsSync("data/overrides")) { + fs.mkdirSync("data/overrides"); + } + + let plants = await csv().fromFile("data/mergedDatasets.csv"); + + // plants = plants.slice(0, 100) // during developement + + await fetchGermanNamesForPlantsConcurrent(plants); + + return plants; +} + +async function writePlantsToOverwriteCsv(plants) { + console.log("[INFO] Writing german common names to CSV file..."); + console.log("[INFO] Total number of plants: ", plants.length); + + const opts = { + fields: ["unique_name", "common_name_de"], + }; + const csvFile = json2csv(plants, opts); + fs.writeFileSync("data/overrides/01_germanCommonNames.csv", csvFile); + + return plants; +} + +fetchGermanNames() + .then((plants) => writePlantsToOverwriteCsv(plants)) + .catch((error) => console.error(error)); diff --git a/scraper/src/merge_datasets.js b/scraper/src/merge_datasets.js index 5ae2c9837..c6f632382 100644 --- a/scraper/src/merge_datasets.js +++ b/scraper/src/merge_datasets.js @@ -2,8 +2,6 @@ import fs from "fs"; import { parse as json2csv } from "json2csv"; import csv from "csvtojson"; import permapeopleColumnMapping from "./helpers/column_mapping_permapeople.js"; -import axios from "axios"; -import axiosRetry from "axios-retry"; import { sanitizeColumnNames, getSoilPH, @@ -12,186 +10,6 @@ import { capitalizeWords, } from "./helpers/helpers.js"; -let GermanNamesFound = 0; - -/** - * Defines the amount of retries we do, if axios encounters errors during a HTTP GET Request. - * Increse Delay if we encounter error to prevent 429 Errors. - */ -axiosRetry(axios, { - retries: 5, // number of retries - retryDelay: (retryCount) => { - console.log(`retry attempt: ${retryCount}`); - return retryCount * 3000; // time interval between retries - }, - retryCondition: (error) => { - return error.response.status === 429; - }, -}); - -/** - * Fetches the German name of the plant from the Wikidata API. - * Sets the 'common_name_de' property of every plant in the array. - * - * @param {string[]} germanNames - An array with German plant names. - * @param {string} unique_name - A plant name to filter if it's in the German names. - */ -const filterGermanNames = (germanNames, unique_name) => { - let unique_name_filter = unique_name.toLowerCase(); - unique_name_filter = unique_name_filter.replace(/(? - germanName.includes(unique_name_part) - ) - ) { - continue; - } - - if (germanName.length !== 0 && germanName !== "true") { - cleanedGermanNames.push(capitalizeWords(germanName)); - } - } - //remove duplicates - const uniqueNameSet = new Set(cleanedGermanNames); - - return Array.from(uniqueNameSet); -}; - -/** - * Handles the response from Wikidata. - * Gets the German label, dewiki, and alias entries and adds them to the found possible German names. - * - * @param {*} response - Response object from the axios function. - * @param {Array} germanNames - An array containing the currently found German names. - */ -const handleResponseAddGermanNamesFound = (response, germanNames) => { - const data = response.data; - - if (data.normalized) { - //This isn't the correct plant, it got redirected to somewhere else. 
- return; - } - - const entities = data.entities; - - if (entities[-1]) { - // If there is a -1, the site doesn't exist - return; - } - - const keys = Object.keys(entities); - const entity = entities[keys[0]]; - - const label_entity = entity.labels; - if (label_entity && label_entity.de) { - germanNames.push(label_entity.de.value); - } else { - //If there is no label for the plant, then there are no other relevant german common names. - return; - } - - const dewiki_entity = entity.sitelinks.dewiki; - if (dewiki_entity) { - germanNames.push(dewiki_entity.title); - } - - const aliase_entities = entity.aliases.de; - if (aliase_entities) { - aliase_entities.forEach((alias) => { - germanNames.push(alias.value); - }); - } -}; - -/** - * Fetches the German name of the plant from the Wikidata API. - * Sets the 'common_name_de' property of every plant in the array. - * Uses the current entry in 'common_name_de' and extends it with data found on Wikidata. - * - * @param {Array} plants - An array containing a part of all plant objects. - */ -const processPlants = async (plants) => { - for (const plant of plants) { - const unique_name = plant["unique_name"]; - const germanNames = []; - - if (plant.common_name_de && plant.common_name_de !== "true") { - const existingGermanNames = plant.common_name_de.split(","); - existingGermanNames.forEach((existingGermanName) => { - germanNames.push(existingGermanName); - }); - } - - plant.common_name_de = null; - - await axios - .get( - `https://www.wikidata.org/w/api.php?action=wbgetentities&sites=enwiki&titles=${unique_name}&normalize=&languages=de&format=json` - ) - .then((response) => { - handleResponseAddGermanNamesFound(response, germanNames); - }) - .catch((error) => { - console.error( - `[ERROR] Could not get German names for "${unique_name}" from Wikidata: `, - error.message - ); - }) - .finally(() => { - if (germanNames.length === 0) { - return; - } - const cleanedGermanNames = filterGermanNames(germanNames, unique_name); - - if (cleanedGermanNames.length > 0) { - GermanNamesFound++; - plant["common_name_de"] = cleanedGermanNames.join(", "); - } - }); - } -}; - -/** - * Sets the 'common_name_de' property of every plant in the array. - * Splits up the entire plants array into smaller arrays to improve performance for name fetching. - * - * @param {Array} plants - An array containing all plant objects. - */ -const fetchGermanNamesForPlantsConcurrent = async (plants) => { - const MAX_CONCURRENT_REQUESTS = 25; - - console.log("[INFO] Start fetching German common names!"); - - // Chunk the plants into batches of MAX_CONCURRENT_REQUESTS - const chunks = []; - const chunk_size = plants.length / MAX_CONCURRENT_REQUESTS + 1; - for (let i = 0; i < plants.length; i += chunk_size) { - chunks.push(plants.slice(i, i + chunk_size)); - } - - // Process each chunk concurrently using Promise.all - await Promise.all(chunks.map((chunk) => processPlants(chunk))); - - console.log( - `[INFO] Done! Found German names for ${GermanNamesFound} plants!` - ); -}; - /** * Custom rules to unify the value format of merged datasets. * @@ -246,35 +64,6 @@ const unifyValueFormat = (plants, columnMapping) => { if ("width" in plant) { plant["spread"] = getSpreadEnumTyp(plant["width"]); } - - if (plant["unique_name"].startsWith("Solanum lycopersicum (Brandywine)")) { - plant["unique_name"] = plant["unique_name"].replace( - "Solanum lycopersicum (Brandywine)", - "Solanum lycopersicum 'Brandywine'" - ); - } - - if (plant["unique_name"].startsWith("Papaver somnif. 
paeonifl.")) { - plant["unique_name"] = plant["unique_name"].replace( - "Papaver somnif. paeonifl.", - "Papaver somniferum paeoniflorum" - ); - } else if (plant["unique_name"].startsWith("Alcea rosea fl. pl.")) { - plant["unique_name"] = plant["unique_name"].replace( - "Alcea rosea fl. pl.", - "Alcea rosea flore pleno" - ); - } else if (plant["unique_name"].startsWith("Campanula lat. macr.")) { - plant["unique_name"] = plant["unique_name"].replace( - "Campanula lat. macr.", - "Campanula latifolia macrantha" - ); - } else if (plant["unique_name"].startsWith("Malva sylvestris ssp. maur.")) { - plant["unique_name"] = plant["unique_name"].replace( - "Malva sylvestris ssp. maur.", - "Malva sylvestris mauritiana" - ); - } }); return plants; @@ -469,22 +258,12 @@ async function mergeDatasets() { function cleanUpJsonForCsv(plants) { const columns = Object.keys(plants[0]); plants.forEach((plant, index) => { - if ( - plant["unique_name"].startsWith("Cimicifuga racemosa (Actaea racemosa)") - ) { - plants.splice(index, 1); // Remove the object that meets the delete condition - } else if (plant["unique_name"].startsWith("'Clover Grass'")) { - plants.splice(index, 1); // Remove the object that meets the delete condition - } else if (plant["unique_name"].startsWith("'Kleegras'")) { - plants.splice(index, 1); // Remove the object that meets the delete condition - } else { - delete plant.subfamily; - columns.forEach((column) => { - if (plant[column] === "") { - plant[column] = null; - } - }); - } + delete plant.subfamily; + columns.forEach((column) => { + if (plant[column] === "") { + plant[column] = null; + } + }); }); } @@ -503,8 +282,6 @@ async function writePlantsToCsv(plants) { let updatedPlants = unifyValueFormat(plants, permapeopleColumnMapping); cleanUpJsonForCsv(updatedPlants); - await fetchGermanNamesForPlantsConcurrent(updatedPlants); - console.log("[INFO] Writing merged dataset to CSV file..."); console.log("[INFO] Total number of plants: ", updatedPlants.length); From 23919a6b88e96ea5c3087d86c58ef08fc2e425b4 Mon Sep 17 00:00:00 2001 From: Christoph Schreiner Date: Sun, 18 Feb 2024 21:16:06 +0000 Subject: [PATCH 14/18] German names separate from other overrides; updated changelog.md --- scraper/README.md | 31 +++++++++---- scraper/package.json | 5 ++- scraper/src/apply_overrides.js | 72 ++++++++----------------------- scraper/src/fetch_german_names.js | 22 ++++------ scraper/src/helpers/helpers.js | 19 ++++++++ scraper/src/helpers/override.js | 45 +++++++++++++++++++ scraper/src/insert_plants.js | 2 +- scraper/src/merge_datasets.js | 19 +------- scraper/src/merge_german_names.js | 31 +++++++++++++ 9 files changed, 149 insertions(+), 97 deletions(-) create mode 100644 scraper/src/helpers/override.js create mode 100644 scraper/src/merge_german_names.js diff --git a/scraper/README.md b/scraper/README.md index 5699025d8..9f0dee482 100644 --- a/scraper/README.md +++ b/scraper/README.md @@ -29,7 +29,7 @@ cp .env.example .env ### Installation Option 1: With a single command -The following command will fetch the data from the sources, merge the datasets and insert the data into the database: +The following command will fetch the data from the sources, merge the datasets, apply the overrides and insert the data into the database: ```shell npm run start:full @@ -46,6 +46,7 @@ npm run start 1. `detail.csv` - scraped from PracticalPlants 2. `permapeopleRawData.csv` - scraped from Permapeople 3. 
`reinsaatRawData.csv` - scraped from Reinsaat and merged from `reinsaatRawDataEN.csv` and `reinsaatRawDataDE.csv` +4. `germanCommonNames.csv` - scraped from wikidata ### Installation Option 2: Step by Step @@ -76,6 +77,7 @@ The scraped data is stored in the `data` directory: - `reinsaatRawDataEN.csv`: This file contains the raw data scraped from the english version of the Reinsaat webpage. - `reinsaatRawDataDE.csv`: This file contains the raw data scraped from the german version of the Reinsaat webpage. - `reinsaatRawData.csv`: This file contains the merged data scraped from the english and german version of the Reinsaat webpage. +- `germanCommonNames.csv`: This file contains the German common names fetched from https://www.wikidata.org 2. Merge the scraped datasets @@ -89,18 +91,29 @@ This can be done with the following command: npm run merge:datasets ``` -Iteratively gets the German common names from https://www.wikidata.org to prevent 429 errors by sending too many requests in a short time. +3. Fetch German common names -This may take quite a while, so don't stop the program while running. +Goes through all unique names from mergedDatasets.csv and fetches the German common names from https://www.wikidata.org concurrently. Then merges them into `mergedDatasets.csv` -3. Correct data manually before the insertion (optional) +If it starts throwing 429 errors, reduce MAX_CONCURRENT_REQUESTS to a lower number, such as 10. + +```shell +npm run fetch:germannames && npm run merge:germannames +``` + +4. Apply overrides The scraped data can contain inconsistencies and errors. -In order to correct these mistakes, we can manually correct the data i.e. change the values in the `mergedDatasets.csv` file. -The corrected data in the new file should be stored in the same format as the generated data i.e. columns may not be changed. -If it starts throwing 429 errors, reduce MAX_CONCURRENT_REQUESTS to a lower number, such as 10. +In order to correct these mistakes, we can create override files. +`data/overrides` may contain any number of `csv` files, which are applied consecutively to `mergedDatasets.csv` to create `finalDataset.csv` + +For details see `data/overrides/README.md` + +```shell +npm run apply:overrides +``` -4. Insert the data into the database +5. Insert the data into the database The scraper also inserts the scraped data into the database: @@ -108,7 +121,7 @@ The scraper also inserts the scraped data into the database: npm run insert:plants ``` -5. Insert relations into the database +6. Insert relations into the database The scraper inserts the relation data into the database. 
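An override file, as `apply_overrides.js` above reads it, is a CSV keyed by `unique_name`: a `new_unique_name` column renames the matched plant, and every other column overwrites that plant's value. The following is a plausible example file, reconstructed from the hard-coded renames this patch series removes from `unifyValueFormat`; the actual files under `data/overrides` are not included in these patches.

```csv
unique_name,new_unique_name
Papaver somnif. paeonifl.,Papaver somniferum paeoniflorum
Alcea rosea fl. pl.,Alcea rosea flore pleno
Campanula lat. macr.,Campanula latifolia macrantha
Malva sylvestris ssp. maur.,Malva sylvestris mauritiana
```

Deletions are handled separately via `data/overrides/00_DELETIONS.csv`, which only needs a `unique_name` column.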
diff --git a/scraper/package.json b/scraper/package.json index 77aa80b3e..5f4d29950 100644 --- a/scraper/package.json +++ b/scraper/package.json @@ -11,12 +11,13 @@ "merge:reinsaat": "node src/merge_reinsaat.js", "merge:csvfiles": "node src/helpers/merge_csv_files.js", "fetch:germannames": "node src/fetch_german_names.js", + "merge:germannames": "node src/merge_german_names.js", "apply:overrides": "node src/apply_overrides.js", "insert:plants": "node src/insert_plants.js", "insert:relations": "node src/insert_plant_relations.js", "insert": "npm run insert:plants && npm run insert:relations", - "start:full": "npm run fetch:permapeople && npm run fetch:practicalplants && npm run fetch:reinsaat && npm run merge:reinsaat && npm run merge:datasets && npm run insert:plants", - "start": "npm run merge:datasets && npm run insert:plants" + "start:full": "npm run fetch:permapeople && npm run fetch:practicalplants && npm run fetch:reinsaat && npm run merge:reinsaat && npm run merge:datasets && npm run fetch:germannames && npm run merge:germannames && npm run insert:plants", + "start": "npm run merge:datasets && npm run merge:germannames && npm run insert:plants" }, "keywords": [], "author": "", diff --git a/scraper/src/apply_overrides.js b/scraper/src/apply_overrides.js index ee2562c0a..367f9f790 100644 --- a/scraper/src/apply_overrides.js +++ b/scraper/src/apply_overrides.js @@ -2,6 +2,8 @@ import fs from "fs"; import path from "path"; import { parse as json2csv } from "json2csv"; import csv from "csvtojson"; +import { cleanUpJsonForCsv } from "./helpers/helpers.js"; +import { applyOverride } from "./helpers/override.js"; const deletionsFile = "00_DELETIONS.csv"; @@ -10,7 +12,7 @@ async function loadMergedDataset() { } async function applyDeletions(plants) { - console.log(`Deleting plants from data/overrides/${deletionsFile}`); + console.log(`[INFO] Deleting plants from data/overrides/${deletionsFile}`); const deletePlants = await csv().fromFile(`data/overrides/${deletionsFile}`); @@ -34,77 +36,41 @@ async function applyDeletions(plants) { return plants; } -async function applyOverrides(plants) { - if (!fs.existsSync("data/overrides")) { - fs.mkdirSync("data/overrides"); +async function applyAllOverrides(plants) { + let overridesDir = "data/overrides"; + if (!fs.existsSync(overridesDir)) { + fs.mkdirSync(overridesDir); } // list all csv files in data/overrides - const overrideFiles = fs.readdirSync("data/overrides"); + const overrideFiles = fs.readdirSync(overridesDir); + overrideFiles.sort(); - // apply all overrides, deletions are handled separately + // apply all overrides for (const file of overrideFiles) { + // deletions were handled separately if (path.extname(file) !== ".csv" || file === deletionsFile) { continue; } - console.log(`Applying data/overrides/${file}`); - - const overridePlants = await csv().fromFile(`data/overrides/${file}`); - - overridePlants.forEach((overridePlant) => { - // find the plant - const index = plants.findIndex( - (plant) => plant.unique_name === overridePlant.unique_name - ); - - if (index === -1) { - console.log( - `[INFO] Could not find plant with unique_name '${overridePlant.unique_name}' in merged dataset.` - ); - return; - } - - // for each column in the override plant, update the plant - Object.keys(overridePlant).forEach((key) => { - if (key !== "unique_name") { - if (key === "new_unique_name") { - // actually override the unique name - key = "unique_name"; - } - - plants[index][key] = overridePlant[key].trim(); - } - }); - }); + await applyOverride(plants, 
`${overridesDir}/${file}`); } return plants; } -function cleanUpJsonForCsv(plants) { - const columns = Object.keys(plants[0]); - plants.forEach((plant, index) => { - columns.forEach((column) => { - if (plant[column] === "") { - plant[column] = null; - } - }); - }); - return plants; -} - async function writePlantsToOverwriteCsv(plants) { - console.log("[INFO] Total number of plants: ", plants.length); - - console.log("[INFO] Writing plants to csv data/final_plants.csv"); - const csvFile = json2csv(cleanUpJsonForCsv(plants)); - fs.writeFileSync("data/final_plants.csv", csvFile); + console.log( + `[INFO] Writing ${plants.length} plants to csv data/finalDataset.csv` + ); + cleanUpJsonForCsv(plants); + const csvFile = json2csv(plants); + fs.writeFileSync("data/finalDataset.csv", csvFile); return plants; } loadMergedDataset() .then((plants) => applyDeletions(plants)) - .then((plants) => applyOverrides(plants)) + .then((plants) => applyAllOverrides(plants)) .then((plants) => writePlantsToOverwriteCsv(plants)) .catch((error) => console.error(error)); diff --git a/scraper/src/fetch_german_names.js b/scraper/src/fetch_german_names.js index 121bc5b27..b606a499b 100644 --- a/scraper/src/fetch_german_names.js +++ b/scraper/src/fetch_german_names.js @@ -4,6 +4,7 @@ import fs from "fs"; import { parse as json2csv } from "json2csv"; import csv from "csvtojson"; import { capitalizeWords } from "./helpers/helpers.js"; +import { log } from "console"; let GermanNamesFound = 0; @@ -14,7 +15,7 @@ let GermanNamesFound = 0; axiosRetry(axios, { retries: 5, // number of retries retryDelay: (retryCount) => { - console.log(`retry attempt: ${retryCount}`); + if (retryCount == 5) console.log(`last retry attempt (${retryCount})`); return retryCount * 3000; // time interval between retries }, retryCondition: (error) => { @@ -47,11 +48,11 @@ const filterGermanNames = (germanNames, unique_name) => { unique_name_filter = unique_name_filter.replace(/\s+/g, " "); germanName = germanName.trim(); - if ( - unique_names.some((unique_name_part) => - germanName.includes(unique_name_part) - ) - ) { + + const areSetsEqual = (a, b) => + a.size === b.size && [...a].every((value) => b.has(value)); + // If the German name is the same as the unique name, skip it + if (areSetsEqual(new Set(germanName.split(" ")), new Set(unique_names))) { continue; } @@ -102,13 +103,6 @@ const handleResponseAddGermanNamesFound = (response, germanNames) => { if (dewiki_entity) { germanNames.push(dewiki_entity.title); } - - const aliase_entities = entity.aliases.de; - if (aliase_entities) { - aliase_entities.forEach((alias) => { - germanNames.push(alias.value); - }); - } }; /** @@ -208,7 +202,7 @@ async function writePlantsToOverwriteCsv(plants) { fields: ["unique_name", "common_name_de"], }; const csvFile = json2csv(plants, opts); - fs.writeFileSync("data/overrides/01_germanCommonNames.csv", csvFile); + fs.writeFileSync("data/germanCommonNames.csv", csvFile); return plants; } diff --git a/scraper/src/helpers/helpers.js b/scraper/src/helpers/helpers.js index b8f499857..135569417 100644 --- a/scraper/src/helpers/helpers.js +++ b/scraper/src/helpers/helpers.js @@ -105,6 +105,24 @@ function getHeightEnumTyp(height) { return value <= 0.25 ? "low" : value <= 0.61 ? "medium" : "high"; } +/** + * Cleans up a JSON array entries for smoother CSV export. 
+ * Changes empty strings to null and removes the subfamily column + * + * @param {Array} plants - Array of plants + */ +function cleanUpJsonForCsv(plants) { + const columns = Object.keys(plants[0]); + plants.forEach((plant, index) => { + delete plant.subfamily; + columns.forEach((column) => { + if (plant[column] === "") { + plant[column] = null; + } + }); + }); +} + /** * Returns the spread enum typ based on the spread/width * @@ -124,4 +142,5 @@ export { getHeightEnumTyp, getSpreadEnumTyp, capitalizeWords, + cleanUpJsonForCsv, }; diff --git a/scraper/src/helpers/override.js b/scraper/src/helpers/override.js new file mode 100644 index 000000000..a46de087f --- /dev/null +++ b/scraper/src/helpers/override.js @@ -0,0 +1,45 @@ +import csv from "csvtojson"; + +/** + * Apply the given override file to the plants + * + * @param {*} plants - Array of plants + * @param {*} file - The relative path of the csv file + * @returns - The plants with the overrides applied + * + */ +async function applyOverride(plants, file) { + console.log(`[INFO] Applying override ${file}`); + + const overridePlants = await csv().fromFile(file); + + overridePlants.forEach((overridePlant) => { + // find the plant + const index = plants.findIndex( + (plant) => plant.unique_name === overridePlant.unique_name + ); + + if (index === -1) { + console.log( + `[INFO] Could not find plant with unique_name '${overridePlant.unique_name}' in merged dataset.` + ); + return; + } + + // for each column in the override plant, update the plant + Object.keys(overridePlant).forEach((key) => { + if (key !== "unique_name") { + if (key === "new_unique_name") { + // actually override the unique name + key = "unique_name"; + } + + plants[index][key] = overridePlant[key].trim(); + } + }); + }); + + return plants; +} + +export { applyOverride }; diff --git a/scraper/src/insert_plants.js b/scraper/src/insert_plants.js index 7cb5c1314..de545022e 100644 --- a/scraper/src/insert_plants.js +++ b/scraper/src/insert_plants.js @@ -8,7 +8,7 @@ config(); const pgp = pgPromise({}); const db = pgp(process.env.DATABASE_URL); -const plantsFilePath = process.argv[2] || "data/mergedDatasets.csv"; +const plantsFilePath = process.argv[2] || "data/finalDataset.csv"; /** * Sanitizes the values of the json array diff --git a/scraper/src/merge_datasets.js b/scraper/src/merge_datasets.js index c6f632382..a9a6a99f1 100644 --- a/scraper/src/merge_datasets.js +++ b/scraper/src/merge_datasets.js @@ -7,7 +7,7 @@ import { getSoilPH, getHeightEnumTyp, getSpreadEnumTyp, - capitalizeWords, + cleanUpJsonForCsv, } from "./helpers/helpers.js"; /** @@ -250,23 +250,6 @@ async function mergeDatasets() { return allPlants; } -/** - * Cleans up a JSON array entries for smoother CSV export. - * - * @param {Array} plants - Array of plants - */ -function cleanUpJsonForCsv(plants) { - const columns = Object.keys(plants[0]); - plants.forEach((plant, index) => { - delete plant.subfamily; - columns.forEach((column) => { - if (plant[column] === "") { - plant[column] = null; - } - }); - }); -} - /** * The function writes the merged dataset to a CSV file. 
* diff --git a/scraper/src/merge_german_names.js b/scraper/src/merge_german_names.js new file mode 100644 index 000000000..596ef5056 --- /dev/null +++ b/scraper/src/merge_german_names.js @@ -0,0 +1,31 @@ +import fs from "fs"; +import { parse as json2csv } from "json2csv"; +import csv from "csvtojson"; +import { cleanUpJsonForCsv } from "./helpers/helpers.js"; +import { applyOverride } from "./helpers/override.js"; + +const germanCommonNames = "data/germanCommonNames.csv"; + +async function loadMergedDataset() { + return csv().fromFile("data/mergedDatasets.csv"); +} + +async function applyGermanNames(plants) { + return applyOverride(plants, germanCommonNames); +} + +async function writePlantsToOverwriteCsv(plants) { + console.log( + `[INFO] Writing ${plants.length} plants to csv data/mergedDatasets.csv` + ); + cleanUpJsonForCsv(plants); + const csvFile = json2csv(plants); + fs.writeFileSync("data/mergedDatasets.csv", csvFile); + + return plants; +} + +loadMergedDataset() + .then((plants) => applyGermanNames(plants)) + .then((plants) => writePlantsToOverwriteCsv(plants)) + .catch((error) => console.error(error)); From 1fed6a4dc7de96c2645ebb56a5621675c599c3cf Mon Sep 17 00:00:00 2001 From: Christoph Schreiner Date: Sun, 18 Feb 2024 22:10:44 +0000 Subject: [PATCH 15/18] Fix broken merge; Make sure unique_name has no leading/trailing whitespace --- scraper/src/helpers/helpers.js | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/scraper/src/helpers/helpers.js b/scraper/src/helpers/helpers.js index 0dc0e719f..58e4dbcb7 100644 --- a/scraper/src/helpers/helpers.js +++ b/scraper/src/helpers/helpers.js @@ -102,12 +102,14 @@ function processMeasurement(value) { const processedValue = processValue(value); if (processedValue === null) return null; - return value <= 0.25 ? "low" : value <= 0.61 ? "medium" : "high"; + return Math.round(processedValue * 100); } /** * Cleans up a JSON array entries for smoother CSV export. - * Changes empty strings to null and removes the subfamily column + * - Changes empty strings to null + * - Removes the subfamily column + * - Trims unique_name column * * @param {Array} plants - Array of plants */ @@ -118,29 +120,17 @@ function cleanUpJsonForCsv(plants) { columns.forEach((column) => { if (plant[column] === "") { plant[column] = null; + } else if (column === "unique_name") { + plant[column] = plant[column].trim(); } }); }); } -/** - * Returns the spread enum typ based on the spread/width - * - * @param {string} spread String containing the spread/width value in meter - * @returns {string} - */ -function getSpreadEnumTyp(spread) { - const value = processValue(spread); - if (value === null) return null; - - return value <= 0.15 ? "narrow" : value <= 0.61 ? 
"medium" : "wide"; -} - export { sanitizeColumnNames, getSoilPH, - getHeightEnumTyp, - getSpreadEnumTyp, capitalizeWords, cleanUpJsonForCsv, + processMeasurement, }; From afdd15775e9fe157df5c91143a11daf780a1b86b Mon Sep 17 00:00:00 2001 From: Christoph Schreiner Date: Thu, 29 Feb 2024 14:44:05 +0100 Subject: [PATCH 16/18] Fixed new unique_name override not being applied --- scraper/README.md | 2 +- scraper/package.json | 4 ++-- scraper/src/fetch_german_names.js | 2 +- scraper/src/helpers/override.js | 11 ++++++----- 4 files changed, 10 insertions(+), 9 deletions(-) diff --git a/scraper/README.md b/scraper/README.md index f1012871e..01a4d83aa 100644 --- a/scraper/README.md +++ b/scraper/README.md @@ -125,7 +125,7 @@ npm run insert:plants The scraper inserts the relation data into the database. -First you need to download the `Companions.csv` and `Antigonist.csv` file from the nextcloud server or export them yourself from the current `Plant_Relations.ods`. +First you need to download the `Companions.csv` and `Antagonist.csv` file from the nextcloud server or export them yourself from the current `Plant_Relations.ods`. Copy them into the /data directory and run: ```shell diff --git a/scraper/package.json b/scraper/package.json index 5f4d29950..aaa0e6fbe 100644 --- a/scraper/package.json +++ b/scraper/package.json @@ -16,8 +16,8 @@ "insert:plants": "node src/insert_plants.js", "insert:relations": "node src/insert_plant_relations.js", "insert": "npm run insert:plants && npm run insert:relations", - "start:full": "npm run fetch:permapeople && npm run fetch:practicalplants && npm run fetch:reinsaat && npm run merge:reinsaat && npm run merge:datasets && npm run fetch:germannames && npm run merge:germannames && npm run insert:plants", - "start": "npm run merge:datasets && npm run merge:germannames && npm run insert:plants" + "start:full": "npm run fetch:permapeople && npm run fetch:practicalplants && npm run fetch:reinsaat && npm run merge:reinsaat && npm run merge:datasets && npm run fetch:germannames && npm run merge:germannames && apply:overrides && npm run insert:plants", + "start": "npm run merge:datasets && npm run merge:germannames && apply:overrides && npm run insert:plants" }, "keywords": [], "author": "", diff --git a/scraper/src/fetch_german_names.js b/scraper/src/fetch_german_names.js index b606a499b..16da1b810 100644 --- a/scraper/src/fetch_german_names.js +++ b/scraper/src/fetch_german_names.js @@ -163,7 +163,7 @@ const processPlants = async (plants) => { const fetchGermanNamesForPlantsConcurrent = async (plants) => { const MAX_CONCURRENT_REQUESTS = 25; - console.log("[INFO] Start fetching German common names!"); + console.log("[INFO] Fetching German common names ..."); // Chunk the plants into batches of MAX_CONCURRENT_REQUESTS const chunks = []; diff --git a/scraper/src/helpers/override.js b/scraper/src/helpers/override.js index a46de087f..f7c235004 100644 --- a/scraper/src/helpers/override.js +++ b/scraper/src/helpers/override.js @@ -27,14 +27,15 @@ async function applyOverride(plants, file) { } // for each column in the override plant, update the plant - Object.keys(overridePlant).forEach((key) => { - if (key !== "unique_name") { - if (key === "new_unique_name") { + Object.keys(overridePlant).forEach((source_key) => { + if (source_key !== "unique_name") { + let destination_key = source_key; + if (source_key === "new_unique_name") { // actually override the unique name - key = "unique_name"; + destination_key = "unique_name"; } - plants[index][key] = overridePlant[key].trim(); + 
plants[index][destination_key] = overridePlant[source_key].trim();
       }
     });
   });

From a40bab2f318e3bf4b4101363bfe773d0bc906dd0 Mon Sep 17 00:00:00 2001
From: Christoph Schreiner
Date: Thu, 28 Mar 2024 18:24:07 +0100
Subject: [PATCH 17/18] Incorporate reviews: Removed commented code; Improved
 code documentation

---
 scraper/src/fetch_german_names.js | 11 ++++-------
 scraper/src/merge_datasets.js     |  2 --
 2 files changed, 4 insertions(+), 9 deletions(-)

diff --git a/scraper/src/fetch_german_names.js b/scraper/src/fetch_german_names.js
index 16da1b810..acc868bf0 100644
--- a/scraper/src/fetch_german_names.js
+++ b/scraper/src/fetch_german_names.js
@@ -4,13 +4,12 @@ import fs from "fs";
 import { parse as json2csv } from "json2csv";
 import csv from "csvtojson";
 import { capitalizeWords } from "./helpers/helpers.js";
-import { log } from "console";
 
 let GermanNamesFound = 0;
 
 /**
  * Defines the amount of retries we do, if axios encounters errors during a HTTP GET Request.
- * Increse Delay if we encounter error to prevent 429 Errors.
+ * Increases the delay between retries to prevent 429 errors.
  */
 axiosRetry(axios, {
   retries: 5, // number of retries
@@ -24,8 +23,7 @@ axiosRetry(axios, {
 });
 
 /**
- * Fetches the German name of the plant from the Wikidata API.
- * Sets the 'common_name_de' property of every plant in the array.
+ * Filters out German names that are identical to the unique name and removes duplicates.
  *
  * @param {string[]} germanNames - An array with German plant names.
  * @param {string} unique_name - A plant name to filter if it's in the German names.
@@ -60,7 +58,8 @@ const filterGermanNames = (germanNames, unique_name) => {
       cleanedGermanNames.push(capitalizeWords(germanName));
     }
   }
-  //remove duplicates
+
+  // remove duplicates
   const uniqueNameSet = new Set(cleanedGermanNames);
 
   return Array.from(uniqueNameSet);
@@ -187,8 +186,6 @@ async function fetchGermanNames() {
 
   let plants = await csv().fromFile("data/mergedDatasets.csv");
 
-  // plants = plants.slice(0, 100) // during developement
-
   await fetchGermanNamesForPlantsConcurrent(plants);
 
   return plants;
diff --git a/scraper/src/merge_datasets.js b/scraper/src/merge_datasets.js
index 168d37ad9..292e19ca0 100644
--- a/scraper/src/merge_datasets.js
+++ b/scraper/src/merge_datasets.js
@@ -270,8 +270,6 @@ async function writePlantsToCsv(plants) {
     fs.mkdirSync("data");
   }
 
-  //const slicedPlants = plants.slice(0,100)
-
   let updatedPlants = unifyValueFormat(plants, permapeopleColumnMapping);
 
   cleanUpJsonForCsv(updatedPlants);
 
From 815ebc3dbe2a0f446a2e45aed549fe5b8e519413 Mon Sep 17 00:00:00 2001
From: Markus Raab
Date: Thu, 4 Apr 2024 10:04:11 +0200
Subject: [PATCH 18/18] use new folder /data, simply copy all csvs

---
 ci/Jenkinsfile | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/ci/Jenkinsfile b/ci/Jenkinsfile
index 866ceb26b..2d134974a 100644
--- a/ci/Jenkinsfile
+++ b/ci/Jenkinsfile
@@ -286,9 +286,7 @@ lock("${env.NODE_NAME}-exclusive") {
       dir('scraper') {
         sh 'npm ci'
         sh 'mkdir ./data/'
-        sh 'cp /nextcloud/Database/scraper-data/mergedDatasets.csv ./data/'
-        sh 'cp /nextcloud/Database/scraper-data/Companions.csv ./data/'
-        sh 'cp /nextcloud/Database/scraper-data/Antagonist.csv ./data/'
+        sh 'cp /data/*.csv ./data/'
         sh 'npm run insert'
         sh 'rm -rf ./data/'
         sh 'rm -rf ./node_modules/'
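The key fix of this series (PATCH 16) is easiest to see in isolation. The following is a minimal, self-contained sketch of the override key mapping from `scraper/src/helpers/override.js`; the plant row and override row are invented example data, but the loop mirrors the patched code:

```js
// Sketch of the override key mapping after PATCH 16 (invented example data).
const plants = [{ unique_name: "Alcea rosea fl. pl.", common_name_de: null }];

const overridePlant = {
  unique_name: "Alcea rosea fl. pl.", // used only to find the matching row
  new_unique_name: "Alcea rosea flore pleno", // renames the plant
  common_name_de: "Stockrose", // any other column overwrites the stored value
};

// Find the plant the override row refers to.
const index = plants.findIndex(
  (plant) => plant.unique_name === overridePlant.unique_name
);

if (index !== -1) {
  Object.keys(overridePlant).forEach((source_key) => {
    if (source_key !== "unique_name") {
      // "new_unique_name" is written back to "unique_name"; every other
      // column keeps its own name and simply replaces the stored value.
      const destination_key =
        source_key === "new_unique_name" ? "unique_name" : source_key;
      plants[index][destination_key] = overridePlant[source_key].trim();
    }
  });
}

console.log(plants[0]);
// { unique_name: 'Alcea rosea flore pleno', common_name_de: 'Stockrose' }
```

Before PATCH 16, the loop reassigned the loop variable itself (`key = "unique_name"`), so the subsequent read `overridePlant[key]` fetched the *old* `unique_name` instead of the `new_unique_name` value — the rename silently became a no-op. Splitting the key into `source_key` (where to read) and `destination_key` (where to write) is exactly what the patch fixes.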