Skip to content

Commit

Permalink
Merge pull request #1 from kermitt2/master
Browse files Browse the repository at this point in the history
Update dump creation with the new API scroll functionality
  • Loading branch information
kerphi authored Jul 18, 2017
2 parents 09ad729 + 7e44a6c commit b062ebe
Showing 1 changed file with 39 additions and 38 deletions.
77 changes: 39 additions & 38 deletions tools/dump-istexid-and-other-ids.js
Original file line number Diff line number Diff line change
@@ -1,47 +1,48 @@
//var request = require('request');
var request = require('requestretry');

// Work queue of page URLs still to fetch. It is seeded with a single query
// that matches every document (q=*), returns only the identifier fields and
// asks for 1000 hits per page. `scroll=1m` turns on the API scroll mode
// (presumably a one-minute scroll keep-alive -- confirm against the ISTEX API docs).
var urls = [ 'https://api.istex.fr/document/?q=*&output=id,corpusName,doi,pmid,pii&scroll=1m&size=1000' ];
// Alternative seed kept for reference: offset-based query using `from=`.
//var urls = [ 'https://api.istex.fr/document/?q=*&size=1000&output=id,corpusName&defaultOperator=OR&from=8229000' ]

/**
 * Fetches one page of the ISTEX document listing and prints its hits.
 *
 * Pops the next URL from the module-level `urls` queue and requests it with
 * `request` (requestretry). For every hit it writes one JSON line to stdout
 * holding `corpusName`, `istexId`, `doi`, `pmid` and `pii`; identifier lists
 * missing from the hit are emitted as `[]`.
 *
 * If the response carries a `nextScrollURI` and does not set
 * `noMoreScrollResults`, that URI is queued and the function schedules itself
 * again 10 ms later. Otherwise the crawl stops. It also stops on a request
 * error or an unparsable body, because no next URI can be derived then.
 *
 * All diagnostics go to stderr so that stdout stays a pure JSON-lines stream.
 * Takes no arguments and returns nothing.
 */
function getIstexIdsFromOnePage() {
  // Nothing queued: the scroll has ended, or another call already took the
  // pending URL. Do not issue a request with an undefined URL.
  if (urls.length === 0) {
    return;
  }
  var url = urls.pop();

  request.get({
    url: url,
    maxAttempts: 50, // try up to 50 times before giving up
    retryDelay: 5000, // wait 5s between attempts
    retryStrategy: request.RetryStrategies.HTTPOrNetworkError // retry on 5xx or network errors
  }, function (err, res) {
    if (err) {
      console.error(err, url);
      return;
    }

    // The body may not be JSON (e.g. an HTML error page); report it instead
    // of letting the exception escape from the callback.
    var json;
    try {
      json = JSON.parse(res.body);
    } catch (parseErr) {
      console.error('invalid JSON response ' + url);
      console.error(parseErr);
      return;
    }

    if (json && json.hits) {
      json.hits.forEach(function (doc) {
        var jsonRes = {
          corpusName: doc.corpusName,
          istexId: doc.id,
          doi: doc.doi || [],
          pmid: doc.pmid || [],
          pii: doc.pii || []
        };
        console.log(JSON.stringify(jsonRes));
      });
    } else {
      console.error('json.hits empty ' + url);
      console.error(json);
    }

    // `json` can legitimately be null (body "null"), hence the extra guard.
    if (json && !json.noMoreScrollResults && json.nextScrollURI) {
      urls.push(json.nextScrollURI);
    } else {
      console.error('no more nextPageURL ' + url);
      return; // last page reached: do not schedule another fetch
    }

    setTimeout(getIstexIdsFromOnePage, 10);
  });
}
// Start the crawl shortly after the script loads. A single initial call is
// enough: each fetched page schedules the fetch of the next one, and a second
// call here would only compete for the same single-entry URL queue.
setTimeout(function () {
  getIstexIdsFromOnePage();
}, 10);

0 comments on commit b062ebe

Please sign in to comment.