diff --git a/data_checks/conftest.py b/data_checks/conftest.py
index 4f0f3ab..d8b04a9 100644
--- a/data_checks/conftest.py
+++ b/data_checks/conftest.py
@@ -10,9 +10,7 @@
 @pytest.fixture(scope="session")
 def articles():
     """Return articles data."""
-    return pd.read_csv(
-        os.path.join("data", "articles.csv"), dtype={"pmid": str}
-    )
+    return pd.read_csv(os.path.join("data", "articles.csv"), dtype={"pmid": str})
 
 
 @pytest.fixture(scope="session")
@@ -42,18 +40,14 @@ def author_wrote_article():
 @pytest.fixture(scope="session")
 def author_affiliated_with_institution():
     """Return affiliation data."""
-    return pd.read_csv(
-        os.path.join("data", "author_affiliated_with_institution.csv")
-    )
+    return pd.read_csv(os.path.join("data", "author_affiliated_with_institution.csv"))
 
 
 @pytest.fixture(scope="session")
 def embedded_article_uids():
     """Return embedding uids."""
     uids = []
-    with open(
-        os.path.join("data", "articles_embedded.jsonl"), "r", encoding="utf-8"
-    ) as articles_file:
+    with open(os.path.join("data", "articles_embedded.jsonl"), "r", encoding="utf-8") as articles_file:
         for line in articles_file:
             embedded_article = json.loads(line)
             uids.append(embedded_article["article_uid"])
diff --git a/pyproject.toml b/pyproject.toml
index 25ad7f2..78ebe12 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -50,12 +50,12 @@ gather_articles = "citations.scripts.gather_articles:main"
 gather_authors = "citations.scripts.gather_authors:main"
 
 [tool.black]
-line-length = 79
+line-length = 120
 preview = true
 
 [tool.isort]
 profile = "black"
-line_length = 79
+line_length = 120
 
 [tool.pytest.ini_options]
 addopts = "--cov=src --cov-report=html --cov-config=.coveragerc"
diff --git a/src/citations/data_sources/bbp.py b/src/citations/data_sources/bbp.py
index bca7e73..a8e59b1 100644
--- a/src/citations/data_sources/bbp.py
+++ b/src/citations/data_sources/bbp.py
@@ -10,9 +10,7 @@
 logger = logging.getLogger(__name__)
 
 
-def get_bbp_author_names(
-    bbp_publications: pd.DataFrame, title: str, is_bbp: bool
-) -> Optional[List[str]]:
+def get_bbp_author_names(bbp_publications: pd.DataFrame, title: str, is_bbp: bool) -> Optional[List[str]]:
     """
     Get names of BBP authors for a particular title.
 
@@ -32,12 +30,8 @@
     """
     try:
         if is_bbp:
-            bbp_row = bbp_publications[
-                bbp_publications["normalized_title"] == normalize_title(title)
-            ].iloc[0]
-            author_names = [
-                name.strip() for name in bbp_row["Author"].split(";")
-            ]
+            bbp_row = bbp_publications[bbp_publications["normalized_title"] == normalize_title(title)].iloc[0]
+            author_names = [name.strip() for name in bbp_row["Author"].split(";")]
         else:
             author_names = None
     except Exception as e:
diff --git a/src/citations/data_sources/europmc.py b/src/citations/data_sources/europmc.py
index e2defab..f688b20 100644
--- a/src/citations/data_sources/europmc.py
+++ b/src/citations/data_sources/europmc.py
@@ -64,9 +64,7 @@ def get_citations(
     return citations, citing_articles
 
 
-def get_article(
-    europmc_id: str, europmc_xml_map: dict | None = None
-) -> Article | None:
+def get_article(europmc_id: str, europmc_xml_map: dict | None = None) -> Article | None:
     r"""Process a citation by fetching its metadata from Euro PMC.
 
     \f
@@ -80,8 +78,7 @@ def get_article(
     Article
     """
     response = get_with_waiting(
-        "https://www.ebi.ac.uk/europepmc/webservices"
-        f"/rest/search?query=EXT_ID:{europmc_id}&resultType=core"
+        "https://www.ebi.ac.uk/europepmc/webservices" f"/rest/search?query=EXT_ID:{europmc_id}&resultType=core"
     )
     root = parse_xml(response.text)
     if root is None:
@@ -186,10 +183,7 @@ def extract_bbp_article(
         europmc_id = europmc_id_elem.text  # type: ignore
     if publication_date is None:
         publication_date_elem = element.find("./firstPublicationDate")
-        if (
-            publication_date_elem is not None
-            and publication_date_elem.text is not None
-        ):
+        if publication_date_elem is not None and publication_date_elem.text is not None:
             publication_date = to_date(publication_date_elem.text)
     europmc_source_elem = element.find("./source")  # type: ignore
     europmc_source = europmc_source_elem.text  # type: ignore
@@ -245,16 +239,10 @@ def extract_authors(element: Element) -> list[str]:
     """
     # Maybe later we might want to add other author id types.
     elements = element.findall("./authorIdList/authorId[@type='ORCID']")
-    return [
-        orcid_id.text
-        for orcid_id in elements
-        if orcid_id is not None and orcid_id.text is not None
-    ]
+    return [orcid_id.text for orcid_id in elements if orcid_id is not None and orcid_id.text is not None]
 
 
-def fetch_citation_ids(
-    europmc_id: str, europmc_source: str, page_size: int = 1000
-) -> list[str] | None:
+def fetch_citation_ids(europmc_id: str, europmc_source: str, page_size: int = 1000) -> list[str] | None:
     r"""Fetch citation IDs for a given article from Europe PMC.
 
     \f
@@ -273,8 +261,7 @@
         A list of citation IDs associated with the given article.
     """
     response = get_with_waiting(
-        f"https://www.ebi.ac.uk/europepmc/webservices/rest/{
-            europmc_source}/{europmc_id}"
+        f"https://www.ebi.ac.uk/europepmc/webservices/rest/{europmc_source}/{europmc_id}"
         f"/citations?page=1&pageSize={page_size}&format=xml"
     )
     root = parse_xml(response.text)
@@ -285,35 +272,21 @@
     if num_citations == 0:
         return []
     pages = math.ceil(num_citations / page_size)
-    citation_ids = [
-        result.text for result in root.findall("./citationList/citation/id")
-    ]
+    citation_ids = [result.text for result in root.findall("./citationList/citation/id")]
     for page in range(2, pages + 1):
         response = get_with_waiting(
-            f"https://www.ebi.ac.uk/europepmc/webservices/rest/{
-                europmc_source}/{europmc_id}"
+            f"https://www.ebi.ac.uk/europepmc/webservices/rest/{europmc_source}/{europmc_id}"
             f"/citations?page={page}&pageSize={page_size}&format=xml"
         )
         root = parse_xml(response.text)
         if root is None:
             continue
-        citation_ids.extend(
-            [
-                result.text
-                for result in root.findall("./citationList/citation/id")
-            ]
-        )
-    citation_ids = [
-        citation_id
-        for citation_id in citation_ids
-        if citation_id != europmc_id
-    ]
+        citation_ids.extend([result.text for result in root.findall("./citationList/citation/id")])
+    citation_ids = [citation_id for citation_id in citation_ids if citation_id != europmc_id]
     return citation_ids  # type: ignore
 
 
-def fetch_article_element(
-    doi: str, isbns: str | None, title: str
-) -> Element | None:
+def fetch_article_element(doi: str, isbns: str | None, title: str) -> Element | None:
     """
     Retrieve the XML element of an article by its DOI, isbn or title.
 
@@ -334,8 +307,7 @@ def fetch_article_element(
     normalized_title = normalize_title(title)
     if doi is not None:
         response = get_with_waiting(
-            "https://www.ebi.ac.uk/europepmc/webservices/rest/"
-            f"search?query=DOI:{doi}&resultType=core"
+            "https://www.ebi.ac.uk/europepmc/webservices/rest/" f"search?query=DOI:{doi}&resultType=core"
         )
         root = parse_xml(response.text)
         if root is not None:
@@ -349,24 +321,20 @@
     if isbns is not None:
         for isbn in isbns.split():
             response = get_with_waiting(
-                "https://www.ebi.ac.uk/europepmc/webservices/rest/"
-                f"search?query=ISBN:{isbn}&resultType=core"
+                "https://www.ebi.ac.uk/europepmc/webservices/rest/" f"search?query=ISBN:{isbn}&resultType=core"
             )
             root = parse_xml(response.text)
             if root is not None:
                 article_element = root.find("./resultList/result")
                 if article_element is not None:
                     result_title = article_element.find("./title").text  # type: ignore
-                    if (
-                        normalize_title(str(result_title)) == normalized_title
-                    ):  # type: ignore
+                    if normalize_title(str(result_title)) == normalized_title:  # type: ignore
                         return article_element
 
     # When using a title in a query we need to replace spaces
     query_title = quote(title)
     response = get_with_waiting(
-        "https://www.ebi.ac.uk/europepmc/webservices"
-        f"/rest/search?query={query_title}&resultType=core"
+        "https://www.ebi.ac.uk/europepmc/webservices" f"/rest/search?query={query_title}&resultType=core"
     )
     root = parse_xml(response.text)
     if root is None:
diff --git a/src/citations/data_sources/orcid.py b/src/citations/data_sources/orcid.py
index 1c8f67c..8d74826 100644
--- a/src/citations/data_sources/orcid.py
+++ b/src/citations/data_sources/orcid.py
@@ -19,11 +19,7 @@
     Institution,
     OrganizationIdSource,
 )
-from citations.utils import (
-    generate_unique_id,
-    get_with_waiting,
-    normalize_title,
-)
+from citations.utils import generate_unique_id, get_with_waiting, normalize_title
 
 logger = logging.getLogger(__name__)
 
@@ -63,9 +59,7 @@
 ]
 
 
-def extract_affiliation_date(
-    position: Element, date_type: Literal["start-date", "end-date"]
-) -> date | None:
+def extract_affiliation_date(position: Element, date_type: Literal["start-date", "end-date"]) -> date | None:
     """
     Extract a date from the given XML element based on the specified date type.
 
@@ -114,9 +108,7 @@ def extract_affiliation_date(
     return dt
 
 
-def fetch_article_orcidids(
-    doi: str | None, pmid: str | None, article_title: str, top_n_orcid: int = 5
-) -> list[str]:
+def fetch_article_orcidids(doi: str | None, pmid: str | None, article_title: str, top_n_orcid: int = 5) -> list[str]:
     """
     Fetch all orcid ids for authors of this article.
 
@@ -137,24 +129,18 @@
     article_orcid_ids = []
     if doi is not None:
         endpoint = f"https://pub.orcid.org/v3.0/search/?q=doi-self:{doi}"
-        article_orcid_ids.extend(
-            get_orcidids_from_endpoint(endpoint)[:top_n_orcid]
-        )
+        article_orcid_ids.extend(get_orcidids_from_endpoint(endpoint)[:top_n_orcid])
 
     if pmid is not None:
         endpoint = f"https://pub.orcid.org/v3.0/search/?q=pmid:{pmid}"
-        article_orcid_ids.extend(
-            get_orcidids_from_endpoint(endpoint)[:top_n_orcid]
-        )
+        article_orcid_ids.extend(get_orcidids_from_endpoint(endpoint)[:top_n_orcid])
 
     article_orcid_ids = list(set(article_orcid_ids))
     article_orcid_ids = filter_orcidids(article_orcid_ids, article_title)
 
     return list(set(article_orcid_ids))
 
 
-def get_orcidids_from_author_names(
-    author_names: list[str], max_author_names: int = 3
-) -> list[str]:
+def get_orcidids_from_author_names(author_names: list[str], max_author_names: int = 3) -> list[str]:
     """
     Fetch all orcid ids for authors of this article.
 
@@ -176,8 +162,10 @@
         family_name = family_name.strip()
         given_names = given_names.strip()
         try:
-            endpoint = f"https://pub.orcid.org/v3.0/search/?q=family-name:{
-                quote(family_name)}+AND+given-names:{quote(given_names)}"
+            endpoint = (
+                f"https://pub.orcid.org/v3.0/search/?q=family-name:"
+                f"{quote(family_name)}+AND+given-names:{quote(given_names)}"
+            )
             response = get_with_waiting(endpoint)
         except HTTPStatusError as e:
             logger.error(f"Error sending request: {str(e)}")
@@ -199,22 +187,15 @@
                 orcid_ids.append(elements[0].text)
             else:  # multiple potential authors
                 found_exact_name = False
-                logger.warning(
-                    f"More than one author found with endpoint {endpoint}"
-                )
+                logger.warning(f"More than one author found with endpoint {endpoint}")
                 exact_name = f"{given_names} {family_name}"
-                logger.warning(
-                    f"Looking for author with exact name {exact_name}"
-                )
+                logger.warning(f"Looking for author with exact name {exact_name}")
                 orcidids_with_exact_name = []
                 for orcid_id_element in elements:
-                    endpoint = f"https://pub.orcid.org/v3.0/{
-                        orcid_id_element.text}/record"
+                    endpoint = f"https://pub.orcid.org/v3.0/{orcid_id_element.text}/record"
                     response = get_with_waiting(endpoint)
-                    record = citations.data_sources.utils.parse_xml(
-                        response.text
-                    )
+                    record = citations.data_sources.utils.parse_xml(response.text)
                     if record is None:
                         continue
                     name = get_author_name(record)
 
@@ -229,9 +210,7 @@
                     try:
                         record = ET.fromstring(response.text)
                     except ET.ParseError as e:
-                        logger.error(
-                            f"Error parsing result of call to {endpoint}"
-                        )
+                        logger.error(f"Error parsing result of call to {endpoint}")
                         logger.error(f"Response text: {response.text}")
                         raise e
 
@@ -246,9 +225,7 @@
                         orcid_ids.append(elements[0].text)
                     if name == exact_name:
                         if orcid_id_element.text is not None:
-                            orcidids_with_exact_name.append(
-                                orcid_id_element.text
-                            )
+                            orcidids_with_exact_name.append(orcid_id_element.text)
                 if len(orcidids_with_exact_name) == 0:
                     logger.warning(f"Cannot find exact name '{exact_name}'.")
                     for i in range(len(elements)):
@@ -257,13 +234,9 @@
                             orcid_ids.append(elements[i].text)  # type: ignore
                 elif len(orcidids_with_exact_name) > 1:
                     logger.warning(
-                        "There is more than one id with exact name '{}'.".format(
-                            f"{given_names} {family_name}"
-                        )
-                    )
-                    logger.warning(
-                        f"Choosing id: {orcidids_with_exact_name[0]}"
+                        "There is more than one id with exact name '{}'.".format(f"{given_names} {family_name}")
                     )
+                    logger.warning(f"Choosing id: {orcidids_with_exact_name[0]}")
                     orcid_ids.append(orcidids_with_exact_name[0])
                 else:
                     orcid_ids.append(orcidids_with_exact_name[0])
@@ -324,9 +297,7 @@ def filter_orcidids(orcid_ids: list[str], article_title: str) -> list[str]:
     normalized_article_title = normalize_title(article_title)
     actual_author_orcidids = []
    for orcidid in orcid_ids:
-        response = get_with_waiting(
-            f"https://pub.orcid.org/v3.0/{orcidid}/record"
-        )
+        response = get_with_waiting(f"https://pub.orcid.org/v3.0/{orcidid}/record")
 
         root = citations.data_sources.utils.parse_xml(response.text)
         if root is None:
@@ -336,11 +307,7 @@
             "./activities:activities-summary/activities:*/activities:group/*/*/common:title",
             namespaces=NAMESPACES,
         )
-        normalized_titles = {
-            normalize_title(element.text)
-            for element in elements
-            if element.text is not None
-        }
+        normalized_titles = {normalize_title(element.text) for element in elements if element.text is not None}
         if normalized_article_title in normalized_titles:
             actual_author_orcidids.append(orcidid)
     return actual_author_orcidids
@@ -375,9 +342,7 @@ def get_author_orcid_information(
     -------
     None
     """
-    author_wrote_article = AuthorWroteArticle(
-        author_uid=author_uid, article_uid=current_article_uid
-    )
+    author_wrote_article = AuthorWroteArticle(author_uid=author_uid, article_uid=current_article_uid)
 
     endpoint = f"https://pub.orcid.org/v3.0/{orcidid}/record"
     response = get_with_waiting(endpoint)
@@ -415,20 +380,11 @@ def get_author_name(record: Element) -> str | None:
     """
     element = record.find(".//person:name", namespaces=NAMESPACES)
     if element is not None:
-        given_names_element = element.find(
-            ".//personal-details:given-names", namespaces=NAMESPACES
-        )
-        family_name_element = element.find(
-            ".//personal-details:family-name", namespaces=NAMESPACES
-        )
+        given_names_element = element.find(".//personal-details:given-names", namespaces=NAMESPACES)
+        family_name_element = element.find(".//personal-details:family-name", namespaces=NAMESPACES)
         if given_names_element is not None and family_name_element is not None:
-            if (
-                given_names_element.text is not None
-                and family_name_element.text is not None
-            ):
-                name = (
-                    given_names_element.text + " " + family_name_element.text
-                )
+            if given_names_element.text is not None and family_name_element.text is not None:
+                name = given_names_element.text + " " + family_name_element.text
                 return name
     return None
 
diff --git a/src/citations/data_sources/serp.py b/src/citations/data_sources/serp.py
index feba0b2..f2f53cd 100644
--- a/src/citations/data_sources/serp.py
+++ b/src/citations/data_sources/serp.py
@@ -71,7 +71,5 @@ def get_all_bbp_publications(
     bbp_wip_theses = pd.read_csv(bbp_theses_wip_path)
     bbp_wip_theses["is_published"] = False
     bbp_publications = pd.concat([bbp_publications, bbp_wip_theses])
-    bbp_publications["normalized_title"] = bbp_publications["Title"].apply(
-        lambda title: normalize_title(title)
-    )
+    bbp_publications["normalized_title"] = bbp_publications["Title"].apply(lambda title: normalize_title(title))
     return bbp_publications
diff --git a/src/citations/data_sources/utils.py b/src/citations/data_sources/utils.py
index 8ceaf47..23731f7 100644
--- a/src/citations/data_sources/utils.py
+++ b/src/citations/data_sources/utils.py
@@ -20,9 +20,7 @@
 logger = logging.getLogger(__name__)
 
 
-def load_authors_state(
-    checkpoint_dir: str, articles: pd.DataFrame, only_get_bbp_authors: bool
-) -> tuple:
+def load_authors_state(checkpoint_dir: str, articles: pd.DataFrame, only_get_bbp_authors: bool) -> tuple:
     """
     Load a checkpoint for the gather authors script if exists or return an empty state.
 
@@ -61,9 +59,7 @@ def load_authors_state(
     ckpt_exists = False
     if checkpoint_dir is not None:
         os.makedirs(checkpoint_dir, exist_ok=True)
-        articles_processed_path = os.path.join(
-            checkpoint_dir, "articles_processed.csv"
-        )
+        articles_processed_path = os.path.join(checkpoint_dir, "articles_processed.csv")
         ckpt_exists = os.path.exists(articles_processed_path)
 
     if ckpt_exists:
@@ -86,36 +82,22 @@ def load_authors_state(
         authors_df = pd.read_csv(os.path.join(checkpoint_dir, "authors.csv"))
         authors = [dict(author) for _, author in authors_df.iterrows()]
         all_author_uids.update([author["uid"] for author in authors])
-        institutions_df = pd.read_csv(
-            os.path.join(checkpoint_dir, "institutions.csv")
-        )
-        institutions = [
-            dict(institution) for _, institution in institutions_df.iterrows()
-        ]
+        institutions_df = pd.read_csv(os.path.join(checkpoint_dir, "institutions.csv"))
+        institutions = [dict(institution) for _, institution in institutions_df.iterrows()]
         all_institution_uids.update([inst["uid"] for inst in institutions])
-        author_wrote_article_df = pd.read_csv(
-            os.path.join(checkpoint_dir, "author_wrote_article.csv")
-        )
-        author_wrote_article = [
-            dict(wrote) for _, wrote in author_wrote_article_df.iterrows()
-        ]
+        author_wrote_article_df = pd.read_csv(os.path.join(checkpoint_dir, "author_wrote_article.csv"))
+        author_wrote_article = [dict(wrote) for _, wrote in author_wrote_article_df.iterrows()]
         author_aff_institution_df = pd.read_csv(
             os.path.join(
                 checkpoint_dir,
                 "author_affiliated_with_institution.csv",
             )
         )
-        author_aff_institution = [
-            dict(aff) for _, aff in author_aff_institution_df.iterrows()
-        ]
+        author_aff_institution = [dict(aff) for _, aff in author_aff_institution_df.iterrows()]
 
         # Subtract already processed articles
-        merged_df = articles.merge(
-            articles_processed, how="left", indicator=True
-        )
-        remaining_articles = merged_df[
-            merged_df["_merge"] == "left_only"
-        ].drop(columns="_merge")
+        merged_df = articles.merge(articles_processed, how="left", indicator=True)
+        remaining_articles = merged_df[merged_df["_merge"] == "left_only"].drop(columns="_merge")
     else:
         articles_processed = pd.DataFrame(columns=articles.columns)
         authors = []
@@ -185,18 +167,12 @@ def save_authors_results(
     df = pd.DataFrame(institutions, columns=INSTITUTION_COLUMNS)
     if len(df) > 0:
         df.sort_values(by="uid", inplace=True)
-    df.to_csv(
-        os.path.join(output_dir, "institutions.csv"), index=False, header=True
-    )
+    df.to_csv(os.path.join(output_dir, "institutions.csv"), index=False, header=True)
     df = pd.DataFrame(authors, columns=AUTHOR_COLUMNS)
     if len(df) > 0:
         df.sort_values(by="uid", inplace=True)
-    df.to_csv(
-        os.path.join(output_dir, "authors.csv"), index=False, header=True
-    )
-    df = pd.DataFrame(
-        author_wrote_article, columns=AUTHOR_WROTE_ARTICLE_COLUMNS
-    )
+    df.to_csv(os.path.join(output_dir, "authors.csv"), index=False, header=True)
+    df = pd.DataFrame(author_wrote_article, columns=AUTHOR_WROTE_ARTICLE_COLUMNS)
     if len(df) > 0:
         df.sort_values(by=["author_uid", "article_uid"], inplace=True)
         df.to_csv(
@@ -236,23 +212,11 @@ def get_author_ids(
 
     Returns
     -------
     List[Dict[str, Optional[str]]]
-        A list of dictionaries, where each dictionary represents an author and their associated information. The dictionary contains the following keys:
-        - "orcidid" : str, optional
-            The ORCID ID of the author.
-        - "google_scholar_id" : str, optional
-            The Google Scholar ID of the author.
-        - "author_name" : str, optional
-            The name of the author.
+        A list of dictionaries, where each dictionary represents an author and their associated information.
     """
-    article_orcid_ids = list(
-        europmc_wrote[europmc_wrote["article_uid"] == row.uid]["author_uid"]
-    )
-    article_orcid_ids.extend(
-        orcid.fetch_article_orcidids(
-            doi, pmid, row.title, top_n_orcid=top_n_orcid
-        )
-    )
+    article_orcid_ids = list(europmc_wrote[europmc_wrote["article_uid"] == row.uid]["author_uid"])
+    article_orcid_ids.extend(orcid.fetch_article_orcidids(doi, pmid, row.title, top_n_orcid=top_n_orcid))
     article_orcid_ids = sorted(set(article_orcid_ids))
     author_ids = [
         {
diff --git a/src/citations/schemas.py b/src/citations/schemas.py
index 39c40f7..d449a62 100644
--- a/src/citations/schemas.py
+++ b/src/citations/schemas.py
@@ -5,9 +5,7 @@
 
 from pydantic import BaseModel, ConfigDict
 
-OrganizationIdSource = Literal[
-    "LEI", "FUNDREF", "GRID", "RINGGOLD", "ROR", "sha256"
-]
+OrganizationIdSource = Literal["LEI", "FUNDREF", "GRID", "RINGGOLD", "ROR", "sha256"]
 
 
 class Article(BaseModel):
diff --git a/src/citations/utils.py b/src/citations/utils.py
index b81007f..11990bd 100644
--- a/src/citations/utils.py
+++ b/src/citations/utils.py
@@ -17,9 +17,7 @@
 logger = logging.getLogger(__name__)
 
 
-def get_with_waiting(
-    endpoint: str, retry_times: int = 5, wait: float = 30
-) -> Response:
+def get_with_waiting(endpoint: str, retry_times: int = 5, wait: float = 30) -> Response:
     """
     Attempt to send a GET request to the specified endpoint.
 
diff --git a/tests/data_sources/test_europmc.py b/tests/data_sources/test_europmc.py
index a2c97f5..edb71ce 100644
--- a/tests/data_sources/test_europmc.py
+++ b/tests/data_sources/test_europmc.py
@@ -17,9 +17,7 @@
 from citations.utils import to_date
 
 
-def generate_citation_response_xml(
-    article_ids: list[str], source: str, hit_count: int
-) -> str:
+def generate_citation_response_xml(article_ids: list[str], source: str, hit_count: int) -> str:
     """
     Generate an XML string for a given list of IDs.
 
@@ -85,15 +83,11 @@ def generate_single_result_xml(
     if author_list:
         authors = ET.SubElement(result, "authorIdList")
         for orcid_id in author_list:
-            ET.SubElement(
-                authors, "authorId", attrib={"type": "ORCID"}
-            ).text = orcid_id
+            ET.SubElement(authors, "authorId", attrib={"type": "ORCID"}).text = orcid_id
     return result
 
 
-def generate_article_search_results_xml(
-    article_ids, sources, pmids, dois, titles, isbns
-):
+def generate_article_search_results_xml(article_ids, sources, pmids, dois, titles, isbns):
     response_wrapper = ET.Element(
         "responseWrapper",
         {
@@ -134,9 +128,7 @@ def test_fetch_citation_ids():
         "citations.data_sources.europmc.get_with_waiting",
         side_effect=[response1, response2, response3],
     ):
-        citation_ids = fetch_citation_ids(
-            "example_id", "example_source", page_size=2
-        )
+        citation_ids = fetch_citation_ids("example_id", "example_source", page_size=2)
 
     assert citation_ids == ["1", "2", "3", "4", "5", "6"]
 
@@ -217,9 +209,7 @@ def test_fetch_article_element_doi():
     isbn = "isbn1"
     response = httpx.Response(
         status_code=200,
-        text=generate_article_search_results_xml(
-            [article_id], [source], [pmid], [doi1], [title], [isbn]
-        ),
+        text=generate_article_search_results_xml([article_id], [source], [pmid], [doi1], [title], [isbn]),
     )
     with patch(
         "citations.data_sources.europmc.get_with_waiting",
@@ -249,17 +239,13 @@ def test_fetch_article_element_isbns():
     )
     response3 = httpx.Response(
         status_code=200,
-        text=generate_article_search_results_xml(
-            [article_id], [source], [pmid], [doi1], [title], [isbn]
-        ),
+        text=generate_article_search_results_xml([article_id], [source], [pmid], [doi1], [title], [isbn]),
     )
     with patch(
         "citations.data_sources.europmc.get_with_waiting",
         side_effect=[response1, response2, response3],
     ):
-        article_element = fetch_article_element(
-            "bad doi", "isbn1 isbn2", title
-        )
+        article_element = fetch_article_element("bad doi", "isbn1 isbn2", title)
     assert article_element.find("./id").text == article_id
     assert article_element.find("./source").text == source
     assert article_element.find("./pmid").text == pmid
@@ -276,9 +262,7 @@ def test_fetch_article_element_title():
     isbn = "isbn1"
     response = httpx.Response(
         status_code=200,
-        text=generate_article_search_results_xml(
-            [article_id], [source], [pmid], [doi1], [title], [isbn]
-        ),
+        text=generate_article_search_results_xml([article_id], [source], [pmid], [doi1], [title], [isbn]),
     )
     with patch(
         "citations.data_sources.europmc.get_with_waiting",
@@ -312,9 +296,7 @@ def test_extract_bbp_article():
     isbn = "isbn1"
     pub_date = "2020-01-01"
 
-    result = generate_single_result_xml(
-        article_id, "europmc", pmid, doi1, title, isbn, pub_date
-    )
+    result = generate_single_result_xml(article_id, "europmc", pmid, doi1, title, isbn, pub_date)
     article, _, _ = extract_bbp_article(result, title, isbns=isbn)
     assert article.uid == article_id
     assert article.pmid == pmid
@@ -383,9 +365,7 @@ def generate_get_article_response_xml(
 def test_get_article(doi, title, abstract, urls, pmid, europmc_id, date):
     response = httpx.Response(
         status_code=200,
-        text=generate_get_article_response_xml(
-            doi, title, abstract, urls, pmid, europmc_id, date
-        ),
+        text=generate_get_article_response_xml(doi, title, abstract, urls, pmid, europmc_id, date),
     )
     with patch(
         "citations.data_sources.europmc.get_with_waiting",
@@ -406,9 +386,7 @@
 
 
 def test_get_article_no_article():
-    response = httpx.Response(
-        status_code=200, text=generate_get_article_response_xml()
-    )
+    response = httpx.Response(status_code=200, text=generate_get_article_response_xml())
     with patch(
         "citations.data_sources.europmc.get_with_waiting",
         return_value=response,
@@ -449,9 +427,7 @@ def test_get_citations(citation_ids, articles):
         "citations.data_sources.europmc.fetch_citation_ids",
         return_value=citation_ids,
    ):
-        with patch(
-            "citations.data_sources.europmc.get_article", side_effect=articles
-        ):
+        with patch("citations.data_sources.europmc.get_article", side_effect=articles):
             citations, citing_articles = get_citations("uid3", "MED")
 
     for citation, citing_article in zip(citations, citing_articles):
diff --git a/tests/data_sources/test_orcid.py b/tests/data_sources/test_orcid.py
index 89a7701..9400c88 100644
--- a/tests/data_sources/test_orcid.py
+++ b/tests/data_sources/test_orcid.py
@@ -14,9 +14,7 @@
 )
 
 
-def generate_affiliation_xml(
-    organization, start_date, end_date, pos_type="education"
-):
+def generate_affiliation_xml(organization, start_date, end_date, pos_type="education"):
     xml_string = f"""
     <{pos_type}:{pos_type}-summary xmlns:{pos_type}="http://www.orcid.org/ns/{pos_type}"
         xmlns:common="http://www.orcid.org/ns/common">
@@ -50,9 +48,7 @@ def test_extract_affiliation_date(date_type):
     organization = "org1"
     start_date = date(2020, 1, 1)
     end_date = date(2021, 1, 1)
-    element = generate_affiliation_xml(
-        organization, start_date, end_date, pos_type="education"
-    )
+    element = generate_affiliation_xml(organization, start_date, end_date, pos_type="education")
 
     if date_type == "start-date":
         expected_dt = start_date
@@ -63,9 +59,7 @@ def test_extract_affiliation_date(date_type):
     dt = extract_affiliation_date(element, date_type)
     assert dt == expected_dt
 
-    element = generate_affiliation_xml(
-        organization, start_date, end_date, pos_type="work"
-    )
+    element = generate_affiliation_xml(organization, start_date, end_date, pos_type="work")
 
     if date_type == "start-date":
         expected_dt = start_date
@@ -92,9 +86,7 @@ def generate_orcid_list(orcid_ids: list[str]):
     return ET.tostring(response_wrapper, encoding="unicode")
 
 
-def generate_author_record(
-    titles: list[str], article_organizations=None, element=False
-):
+def generate_author_record(titles: list[str], article_organizations=None, element=False):
     response_wrapper = ET.Element(
         "record:record",
         {
@@ -104,9 +96,7 @@ def generate_author_record(
             "xmlns:common": NAMESPACES["common"],
         },
     )
-    activities = ET.SubElement(
-        response_wrapper, "activities:activities-summary"
-    )
+    activities = ET.SubElement(response_wrapper, "activities:activities-summary")
     works = ET.SubElement(activities, "activities:works")
     group = ET.SubElement(works, "activities:group")
     for i, title in enumerate(titles):
@@ -121,12 +111,8 @@ def generate_author_record(
             organization = ET.SubElement(work_summary, "common:organization")
             ET.SubElement(organization, "common:name").text = org_name
             dis_org = ET.SubElement(organization, "disambiguated-organization")
-            ET.SubElement(
-                dis_org, "common:disambiguated-organization"
-            ).text = org_id
-            ET.SubElement(
-                dis_org, "common:disambiguated-organization-identifier"
-            ).text = org_source
+            ET.SubElement(dis_org, "common:disambiguated-organization").text = org_id
+            ET.SubElement(dis_org, "common:disambiguated-organization-identifier").text = org_source
 
     if element:
         return response_wrapper
@@ -141,15 +127,11 @@ def test_filter_orcidids():
     good_orcid_id = "orcid2"
     response1 = httpx.Response(
         status_code=200,
-        text=generate_author_record(
-            ["From Big Data to Big Displays", "A Physically Plausible Model"]
-        ),
+        text=generate_author_record(["From Big Data to Big Displays", "A Physically Plausible Model"]),
     )
     response2 = httpx.Response(
         status_code=200,
-        text=generate_author_record(
-            ["In Silico Brain Imaging", "Large Volume Imaging of Rodent Brain"]
-        ),
+        text=generate_author_record(["In Silico Brain Imaging", "Large Volume Imaging of Rodent Brain"]),
     )
     with patch(
         "citations.data_sources.orcid.get_with_waiting",
@@ -162,9 +144,7 @@ def test_filter_orcidids():
 
 def test_fetch_article_authors_doi():
     orcid_ids = ["orcid1", "orcid2"]
-    response = httpx.Response(
-        status_code=200, text=generate_orcid_list(orcid_ids)
-    )
+    response = httpx.Response(status_code=200, text=generate_orcid_list(orcid_ids))
     with patch(
         "citations.data_sources.orcid.get_with_waiting",
         return_value=response,
@@ -180,9 +160,7 @@ def test_fetch_article_authors_doi():
 def test_fetch_article_authors_pmid():
     orcid_ids = ["orcid1", "orcid2"]
     response1 = httpx.Response(status_code=400, text="")
-    response2 = httpx.Response(
-        status_code=200, text=generate_orcid_list(orcid_ids)
-    )
+    response2 = httpx.Response(status_code=200, text=generate_orcid_list(orcid_ids))
     with patch(
         "citations.data_sources.orcid.get_with_waiting",
         side_effect=[response1, response2],
@@ -204,9 +182,7 @@ def test_get_author_affiliation():
         {"name": "org1_name", "id": "org1_id", "source": "ROR"},
         {"name": "org2_name", "id": "org2_id", "source": "GRID"},
     ]
-    record = generate_author_record(
-        titles, expected_institutions, element=True
-    )
+    record = generate_author_record(titles, expected_institutions, element=True)
     from citations.data_sources.orcid import get_author_affiliations
 
     institutions, affiliations = get_author_affiliations(orcidid, record)
diff --git a/tests/test_utils.py b/tests/test_utils.py
index e3a9110..b3b2208 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -3,11 +3,7 @@
 import pytest
 from httpx import RequestError
 
-from citations.utils import (
-    generate_unique_id,
-    get_with_waiting,
-    normalize_title,
-)
+from citations.utils import generate_unique_id, get_with_waiting, normalize_title
 
 
 def test_get_with_waiting(httpx_mock):
@@ -39,14 +35,8 @@ def test_generate_unique_id_different_input():
     "input1, input2",
     [
         (
-            (
-                "From Big Data to Big Displays High-Performance Visualization"
-                " at Blue Brain"
-            ),
-            (
-                "From Big Data To big Displays High-Performance visualization"
-                " at Blue Brain"
-            ),
+            ("From Big Data to Big Displays High-Performance Visualization" " at Blue Brain"),
+            ("From Big Data To big Displays High-Performance visualization" " at Blue Brain"),
         ),
         (
             "The Scientific Case for Brain Simulations",