diff --git a/tools/dump-istexid-and-other-ids.js b/tools/dump-istexid-and-other-ids.js index 21b041e..e068bd6 100644 --- a/tools/dump-istexid-and-other-ids.js +++ b/tools/dump-istexid-and-other-ids.js @@ -1,47 +1,48 @@ //var request = require('request'); var request = require('requestretry'); -var urls = [ 'https://api.istex.fr/document/?q=*&output=id,corpusName,doi,pmid,pii&size=1000' ] +var urls = [ 'https://api.istex.fr/document/?q=*&output=id,corpusName,doi,pmid,pii&scroll=1m&size=1000' ] //var urls = [ 'https://api.istex.fr/document/?q=*&size=1000&output=id,corpusName&defaultOperator=OR&from=8229000' ] function getIstexIdsFromOnePage() { - var url = urls.pop(); - request.get({ - url: url, - maxAttempts: 50, // (default) try 5 times - retryDelay: 5000, // (default) wait for 5s before trying again - retryStrategy: request.RetryStrategies.HTTPOrNetworkError // (default) retry on 5xx or network errors - }, function (err, res) { - if (err) { - console.error(err, url); - return; - } - var json = JSON.parse(res.body); - if (json && json.hits) { - json.hits.forEach(function (doc) { - var jsonRes = { - corpusName: doc.corpusName, - istexId: doc.id, - doi: doc.doi || [], - pmid: doc.pmid || [], - pii: doc.pii || [] - }; - console.log(JSON.stringify(jsonRes)); - }); - } else { - console.error('json.hits empty ' + url); - console.error(json); - } - if (json.nextPageURI) { - urls.push(json.nextPageURI); - } else { - console.error('no more nextPageURL ' + url) - } - setTimeout(function () { - getIstexIdsFromOnePage(); - }, 10); - }); + var url = urls.pop(); + request.get({ + url: url, + maxAttempts: 50, // (default) try 5 times + retryDelay: 5000, // (default) wait for 5s before trying again + retryStrategy: request.RetryStrategies.HTTPOrNetworkError // (default) retry on 5xx or network errors + }, + function (err, res) { + if (err) { + console.error(err, url); + return; + } + var json = JSON.parse(res.body); + if (json && json.hits) { + json.hits.forEach(function (doc) { + var jsonRes = { + corpusName: doc.corpusName, + istexId: doc.id, + doi: doc.doi || [], + pmid: doc.pmid || [], + pii: doc.pii || [] + }; + console.log(JSON.stringify(jsonRes)); + }); + } else { + console.error('json.hits empty ' + url); + console.error(json); + } + if (!json.noMoreScrollResults && json.nextScrollURI) { + urls.push(json.nextScrollURI); + } else { + console.error('no more nextPageURL ' + url) + } + setTimeout(function () { + getIstexIdsFromOnePage(); + }, 10); + }); } setTimeout(function () { - getIstexIdsFromOnePage(); + getIstexIdsFromOnePage(); }, 10);