diff --git a/richcontext/scholapi/scholapi.py b/richcontext/scholapi/scholapi.py index fea5c04..7f13776 100755 --- a/richcontext/scholapi/scholapi.py +++ b/richcontext/scholapi/scholapi.py @@ -174,7 +174,7 @@ def title_search (self, title): """ t0 = time.time() - url = self.get_api_url(urllib.parse.quote(title)) + url = self.get_api_url() + "title={}".format(urllib.parse.quote(title)) response = requests.get(url).text soup = BeautifulSoup(response, "html.parser") @@ -197,6 +197,28 @@ def title_search (self, title): self.mark_time(t0) return None + def full_text_search (self, search_term,nresults = None): + """ + parse metadata from XML returned from the OpenAIRE API query + """ + t0 = time.time() + base_url = self.get_api_url() + "keywords={}".format(urllib.parse.quote(search_term)) + + if nresults: + search_url = base_url + '&size={}'.format(nresults) + + elif not nresults: + response = requests.get(base_url).text + soup = BeautifulSoup(response, "html.parser") + nresults_response = int(soup.find("total").text) + search_url = base_url + '&size={}'.format(nresults_response) + + response = requests.get(search_url).text + soup = BeautifulSoup(response, "html.parser") + pub_metadata = soup.find_all("oaf:result") + self.mark_time(t0) + return pub_metadata + class ScholInfra_SemanticScholar (ScholInfra): """ @@ -340,20 +362,21 @@ def full_text_search (self, search_term, exact_match = True, nresults = None): if exact_match == False: query = 'search publications in full_data_exact for "{}" return publications[all] limit 1000'.format(search_term) - + if nresults: query = 'search publications in full_data_exact for "\\"{}\\"" return publications[all] limit {}'.format(search_term,nresults) if exact_match == False: query = 'search publications in full_data_exact for "{}" return publications[all] limit {}'.format(search_term,nresults) + self.login() response = self.run_query(query) search_results = response.publications - + self.mark_time(t0) return search_results - + class ScholInfra_RePEc (ScholInfra): """ @@ -627,7 +650,7 @@ def fulltext_id_search (self, search_term, nresults = None): response_count = int([d for d in query_return["eGQueryResult"] if d["DbName"] == "pubmed"][0]["Count"]) if response_count > 0: - if nresults == None: + if nresults == None: handle = Entrez.read(Entrez.esearch(db="pubmed", retmax=response_count, term="\"{}\"".format(search_term) @@ -644,12 +667,14 @@ def fulltext_id_search (self, search_term, nresults = None): ) id_list = handle["IdList"] - return id_list + + return id_list else: return None + def full_text_search (self, search_term, nresults = None): t0 = time.time() @@ -670,14 +695,11 @@ def full_text_search (self, search_term, nresults = None): xml = xmltodict.parse(data) meta_list = json.loads(json.dumps(xml)) meta = meta_list["PubmedArticleSet"]["PubmedArticle"] - + self.mark_time(t0) return meta - - else: - raise Exception("Input to fetch from PubMed is not a list of IDs") - + def journal_lookup (self, issn): """ use the NCBI discovery service for ISSN lookup diff --git a/test.py b/test.py index 2973287..5af36a2 100755 --- a/test.py +++ b/test.py @@ -23,7 +23,15 @@ def test_openaire_title_search (self): print("\ntime: {:.3f} ms - {}".format(schol.openaire.elapsed_time, schol.openaire.name)) self.assertTrue(repr(meta) == "OrderedDict([('url', 'https://europepmc.org/articles/PMC5574185/'), ('authors', ['Taillie, Lindsey Smith', 'Ng, Shu Wen', 'Xue, Ya', 'Harding, Matthew']), ('open', True)])") + + def test_openaire_fulltext_search (self): + schol = rc_scholapi.ScholInfraAPI(config_file="rc.cfg") + search_term = "NHANES" + + meta = schol.openaire.full_text_search(search_term) + print("\ntime: {:.3f} ms - {}".format(schol.openaire.elapsed_time, schol.openaire.name)) + self.assertTrue(len(meta) >= 3300) def test_crossref_publication_lookup (self): schol = rc_scholapi.ScholInfraAPI(config_file="rc.cfg")