From de8f6356b221388356f0182f66024a8d6f672a1c Mon Sep 17 00:00:00 2001 From: Elizabeth Seiver Date: Thu, 21 Sep 2017 17:53:39 -0700 Subject: [PATCH 01/24] new prod search --- allofplos/Production team investigates.ipynb | 98 +++++++++++++++++++- 1 file changed, 97 insertions(+), 1 deletion(-) diff --git a/allofplos/Production team investigates.ipynb b/allofplos/Production team investigates.ipynb index 6c90ac6f..c68f04e8 100644 --- a/allofplos/Production team investigates.ipynb +++ b/allofplos/Production team investigates.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -226,6 +226,102 @@ " writer.writerow(item)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Q: Which articles have a series of table-wrap elements?" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "metadata": {}, + "outputs": [], + "source": [ + "example_doi = '10.1371/journal.pone.0068090'\n", + "article_xml = get_articleXML_content(doi_to_file(example_doi), tag_path_elements=('/',\n", + " 'article',\n", + " 'body'))\n", + "def find_table_wraps(article):\n", + " \"\"\"\n", + " find all articles with a `table-wrap` element. 
of those, if there is no immediate sub-tag of\n", + " 'alternative' in table\n", + " \"\"\"\n", + "# sections = article_xml[0].getchildren()\n", + "# for section in sections:\n", + "# for subsections in sections:\n", + "# for parts in subsections:\n", + "# print(parts.tag)\n", + " article_tree = et.parse(doi_to_file(example_doi))\n", + " table_wraps = article_tree.findall('.//table-wrap')\n", + " if table_wraps:\n", + " for table_wrap in table_wraps:\n", + " table_parts = table_wrap.getchildren()\n", + " if all('alternatives' not in table_part.tag for table_part in table_parts):\n", + " danger = table_wrap.iterfind('.//graphic')\n", + " if danger:\n", + " return 'danger table'\n", + "# if all('')\n", + " else:\n", + " pass\n", + "# for table_part in table_parts:\n", + "# if 'alternatives' not in table_part.tag:\n", + "# pass\n", + "# else:\n", + "# return 'danger table'\n", + " return \"table ok\"\n", + " else:\n", + " return 'no table'" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "danger table\n", + "danger table\n", + "danger table\n", + "danger table\n", + "danger table\n", + "danger table\n", + "danger table\n", + "danger table\n", + "danger table\n", + "danger table\n" + ] + } + ], + "source": [ + "for article_file in listdir_nohidden(corpusdir)[180000:180010]:\n", + " print(find_table_wraps(article_file))" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'table ok'" + ] + }, + "execution_count": 96, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "find_table_wraps(doi_to_file(example_doi))" + ] + }, { "cell_type": "code", "execution_count": null, From fc7d6999aa0cba8da2c9c9aac32749db418b6d10 Mon Sep 17 00:00:00 2001 From: Elizabeth Seiver Date: Fri, 22 Sep 2017 18:14:17 -0700 Subject: [PATCH 02/24] +new prod searches --- 
allofplos/Production team investigates.ipynb | 126 +++++++++++-------- 1 file changed, 76 insertions(+), 50 deletions(-) diff --git a/allofplos/Production team investigates.ipynb b/allofplos/Production team investigates.ipynb index c68f04e8..2f898f00 100644 --- a/allofplos/Production team investigates.ipynb +++ b/allofplos/Production team investigates.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -235,91 +235,117 @@ }, { "cell_type": "code", - "execution_count": 101, + "execution_count": 190, "metadata": {}, "outputs": [], "source": [ "example_doi = '10.1371/journal.pone.0068090'\n", - "article_xml = get_articleXML_content(doi_to_file(example_doi), tag_path_elements=('/',\n", - " 'article',\n", - " 'body'))\n", + "search_1_file = 'xml_testing/Search-1_TRUE.xml'\n", + "search_2_file = 'xml_testing/Search-2_TRUE.xml'\n", + "intro_file = doi_to_file(example_doi)\n", + "fail_file = doi_to_file('10.1371/journal.pone.0182980')\n", + "test_list = [intro_file, search_1_file, search_2_file, fail_file]\n", + "\n", + "intro_condition = []\n", + "search_1 = []\n", + "search_2 = []\n", + "\n", "def find_table_wraps(article):\n", " \"\"\"\n", " find all articles with a `table-wrap` element. 
of those, if there is no immediate sub-tag of\n", " 'alternative' in table\n", " \"\"\"\n", - "# sections = article_xml[0].getchildren()\n", - "# for section in sections:\n", - "# for subsections in sections:\n", - "# for parts in subsections:\n", - "# print(parts.tag)\n", - " article_tree = et.parse(doi_to_file(example_doi))\n", + " intro_condition = False\n", + " search_1 = False\n", + " search_2 = False\n", + "\n", + " article_tree = et.parse(article)\n", " table_wraps = article_tree.findall('.//table-wrap')\n", " if table_wraps:\n", " for table_wrap in table_wraps:\n", - " table_parts = table_wrap.getchildren()\n", - " if all('alternatives' not in table_part.tag for table_part in table_parts):\n", - " danger = table_wrap.iterfind('.//graphic')\n", + " try:\n", + " if all('alternatives' not in table_part.tag for table_part in table_wrap) and \\\n", + " all('graphic' not in table_part.tag for table_part in table_wrap):\n", + " intro_condition = True\n", + " except TypeError:\n", + " # this is an imperfect work-around. 
if alternatives were a sub-sub-element,\n", + " # it would be incorrectly excluded from intro_\n", + " alternatives = table_wrap.findall('.//alternatives')\n", + " if alternatives == 0:\n", + " intro_condition = True\n", + " if intro_condition:\n", + " danger = table_wrap.findall('.//graphic')\n", " if danger:\n", - " return 'danger table'\n", - "# if all('')\n", + " search_1 = True\n", + " danger2 = table_wrap.findall('.//inline-graphic')\n", + " if danger2:\n", + " search_2 = True\n", " else:\n", " pass\n", - "# for table_part in table_parts:\n", - "# if 'alternatives' not in table_part.tag:\n", - "# pass\n", - "# else:\n", - "# return 'danger table'\n", - " return \"table ok\"\n", + " \n", + "# for table_part in table_parts:\n", + "# if 'alternatives' in table_part.tag:\n", + "# print('alternatives')\n", + "\n", " else:\n", - " return 'no table'" + " pass\n", + "\n", + " return intro_condition, search_1, search_2\n" ] }, { "cell_type": "code", - "execution_count": 102, + "execution_count": 196, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "danger table\n", - "danger table\n", - "danger table\n", - "danger table\n", - "danger table\n", - "danger table\n", - "danger table\n", - "danger table\n", - "danger table\n", - "danger table\n" + "allofplos_xml/journal.pone.0068090.xml True False False\n", + "xml_testing/Search-1_TRUE.xml True True False\n", + "xml_testing/Search-2_TRUE.xml True True True\n", + "allofplos_xml/journal.pone.0182980.xml False False False\n" ] } ], "source": [ - "for article_file in listdir_nohidden(corpusdir)[180000:180010]:\n", - " print(find_table_wraps(article_file))" + "table_results = []\n", + "for article_file in test_list:\n", + " intro_condition, search_1, search_2 = find_table_wraps(article_file)\n", + " print(article_file, intro_condition, search_1, search_2)" ] }, { "cell_type": "code", - "execution_count": 96, + "execution_count": 197, "metadata": {}, - "outputs": [ - { - "data": { - 
"text/plain": [ - "'table ok'" - ] - }, - "execution_count": 96, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "find_table_wraps(doi_to_file(example_doi))" + "table_results = []\n", + "file_list = listdir_nohidden(corpusdir)\n", + "for article_file in file_list:\n", + " intro_condition, search_1, search_2 = find_table_wraps(article_file)\n", + " if intro_condition:\n", + " result = [file_to_doi(article_file), search_1, search_2]\n", + " table_results.append(result)\n", + "\n", + "# print(table_results)\n", + "with open('table_search_results_revised.csv', 'w') as f:\n", + " writer = csv.writer(f)\n", + " writer.writerow(['DOI', 'Search 1', 'Search 2'])\n", + " for doi_result in sorted(table_results):\n", + " writer.writerow(doi_result)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for article_file in listdir_nohidden(corpusdir)[180000:180010]:\n", + " print(find_table_wraps(article_file))" ] }, { From fee19af070d745c702e898fdf91e5ecacc875bb6 Mon Sep 17 00:00:00 2001 From: Elizabeth Seiver Date: Tue, 3 Oct 2017 18:07:06 -0700 Subject: [PATCH 03/24] update last merge --- .gitignore | 12 +- allofplos/.gitignore | 14 +- allofplos/Corpus_Analysis_Examples.ipynb | 257 ------ allofplos/Corpus_QA-Copy1.ipynb | 450 ---------- allofplos/Corpus_QA.ipynb | 512 ----------- ...is basics.ipynb => allofplos_basics.ipynb} | 0 allofplos/jupyternb/Corpus_Analysis-old.ipynb | 792 ------------------ ...S article XML from journals.plos.org.ipynb | 236 ------ ...thly integrity check for PLOS corpus.ipynb | 148 ---- allofplos/plos_corpus.py | 136 ++- allofplos/plos_regex.py | 12 +- allofplos/plospmc.py | 77 -- allofplos/samples/corpus_analysis.py | 580 ++----------- 13 files changed, 153 insertions(+), 3073 deletions(-) delete mode 100644 allofplos/Corpus_Analysis_Examples.ipynb delete mode 100644 allofplos/Corpus_QA-Copy1.ipynb delete mode 100644 allofplos/Corpus_QA.ipynb 
rename allofplos/{Corpus Analysis basics.ipynb => allofplos_basics.ipynb} (100%) delete mode 100644 allofplos/jupyternb/Corpus_Analysis-old.ipynb delete mode 100644 allofplos/jupyternb/Download PLOS article XML from journals.plos.org.ipynb delete mode 100644 allofplos/jupyternb/Monthly integrity check for PLOS corpus.ipynb delete mode 100644 allofplos/plospmc.py diff --git a/.gitignore b/.gitignore index 08d34087..3b691732 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,5 @@ *.xml -AllofPLOS_article_XML/ +allofplos_xml/ *.zip *.log .ipynb_checkpoints/* @@ -7,19 +7,11 @@ AllofPLOS_article_XML/ *.json *.swp *.pyc -CaSSius/ -Experiments/lens/ *.js -lens-starter/ -lens/ -Python Challenge.ipynb -simple-writer/ -twoto3_nb.py/ !requirements.txt *.xlsx *.csv !doi_to_pmc.csv -Python Tutorial How to Parse and Combine RSS News headlines using feedparser.ipynb *.iml */.ipynb_checkpoints/* -zip_info.txt +zip_info.txt diff --git a/allofplos/.gitignore b/allofplos/.gitignore index 1d7412de..9f669896 100644 --- a/allofplos/.gitignore +++ b/allofplos/.gitignore @@ -1,5 +1,5 @@ *.xml -AllofPLOS_article_XML/ +allofplos_xml/ *.zip *.log */.ipynb_checkpoints/* @@ -9,18 +9,8 @@ AllofPLOS_article_XML/ *.json *.swp *.pyc -CaSSius/ -Experiments/lens/ *.js -lens-starter/ -lens/ -Python Challenge.ipynb -simple-writer/ -twoto3_nb.py *.xlsx *.csv -*.txt -Python Tutorial How to Parse and Combine RSS News headlines using feedparser.ipynb *.iml -*.gz -Scratchpad.ipynb +*.gz \ No newline at end of file diff --git a/allofplos/Corpus_Analysis_Examples.ipynb b/allofplos/Corpus_Analysis_Examples.ipynb deleted file mode 100644 index bc5f45bb..00000000 --- a/allofplos/Corpus_Analysis_Examples.ipynb +++ /dev/null @@ -1,257 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Required functions" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "code_folding": [], - "collapsed": true - }, - "outputs": [], - "source": [ - "from 
samples.corpus_analysis import *" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# PLOS article types" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## JATS-standard NLM article types" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "editable": false, - "run_control": { - "frozen": true - }, - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "15 types of articles found.\n", - "[('research-article', 204109), ('correction', 9113), ('article-commentary', 1284), ('discussion', 1087), ('review-article', 612), ('other', 584), ('editorial', 340), ('letter', 300), ('retraction', 79), ('book-review', 77), ('meeting-report', 38), ('case-report', 23), ('expression-of-concern', 13), ('obituary', 10), ('brief-report', 1)]\n" - ] - } - ], - "source": [ - "jats_article_type_list = get_jats_article_type_list()\n", - "print(jats_article_type_list)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## PLOS article types" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "editable": false, - "run_control": { - "frozen": true - }, - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "60 types of articles found.\n", - "[('Research Article', 202673), ('Correction', 9124), ('Synopsis', 1093), ('Perspective', 801), ('Review', 555), ('Editorial', 486), ('Pearls', 438), ('Essay', 379), ('Policy Forum', 309), ('Correspondence', 287), ('Primer', 237), ('Viewpoints', 209), ('Community Page', 139), ('Opinion', 136), ('Health in Action', 118), ('Education', 103), ('Retraction', 79), ('Book Review/Science in the Media', 76), ('Message from ISCB', 70), ('Symposium', 70), ('Policy Platform', 54), ('Feature', 53), ('Formal Comment', 52), ('Research in Translation', 51), ('Guidelines and Guidance', 51), ('Collection Review', 50), ('Research Matters', 44), 
('Interview', 44), ('The PLoS Medicine Debate', 38), ('Historical Profiles and Perspectives', 38), ('Unsolved Mystery', 34), ('Overview', 34), ('Neglected Diseases', 29), ('Expert Commentary', 29), ('Learning Forum', 27), ('From Innovation to Application', 24), ('Obituary', 22), ('Quiz', 21), ('Correspondence and Other Communications', 13), ('Expression of Concern', 13), ('Journal Club', 12), ('Meta-Research Article', 12), ('Student Forum', 12), ('Open Highlights', 11), ('Topic Page', 11), ('Case Report', 10), ('Photo Quiz', 10), ('Best Practice', 5), ('Deep Reads', 4), ('Historical and Philosophical Perspectives', 3), ('Special Report', 3), ('Book Review', 2), ('Message from the Founders', 1), ('Message from PLoS', 1), ('Short Reports', 1), ('Methods and Resources', 1), ('Technical Report', 1), ('Message from the PLoS Founders', 1), ('Collection Review ', 1), ('Debate', 1)]\n" - ] - } - ], - "source": [ - "PLOS_article_type_list = get_plos_article_type_list()\n", - "print(PLOS_article_type_list)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Taking random samples of DOIs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "random_sample_of_dois = get_random_list_of_DOIs() # returns 100 DOIs by default" - ] - }, - { - "cell_type": "code", - "execution_count": 203, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['journal.pone.0074820', 'journal.pone.0063497', 'journal.pone.0126357', 'journal.pntd.0004807', 'journal.pone.0031896', 'journal.pone.0045503', 'journal.pone.0138217', 'journal.pbio.0050002', 'journal.pone.0122848', 'journal.pone.0099248']\n" - ] - } - ], - "source": [ - "random_sample_of_articles = [doi_to_article(doi) for doi in random_sample_of_dois]\n", - "print(random_sample_of_articles[0:10])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ 
- "# Retracted and corrected articles" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Get list of retracted articles" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "editable": false, - "run_control": { - "frozen": true - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "79 retracted articles found.\n", - "['journal.pbio.0030123', 'journal.pbio.0050005', 'journal.pbio.0050146', 'journal.pbio.1001212', 'journal.pcbi.1002308', 'journal.pgen.1003361', 'journal.pgen.1003791', 'journal.pgen.1005586', 'journal.pgen.1000424', 'journal.pmed.1001214', 'journal.pone.0072333', 'journal.pone.0084127', 'journal.pone.0027571', 'journal.pone.0046410', 'journal.pone.0080145', 'journal.pone.0019652', 'journal.pone.0075928', 'journal.pone.0075046', 'journal.pone.0062178', 'journal.pone.0051549', 'journal.pone.0093095', 'journal.pone.0069669', 'journal.pone.0133525', 'journal.pone.0115980', 'journal.pone.0115741', 'journal.pone.0139044', 'journal.pone.0146193', 'journal.pone.0045667', 'journal.pone.0040789', 'journal.pone.0094830', 'journal.pone.0031943', 'journal.pone.0097700', 'journal.pone.0047218', 'journal.pone.0090951', 'journal.pone.0014232', 'journal.pone.0090318', 'journal.pone.0072895', 'journal.pone.0065651', 'journal.pone.0059556', 'journal.pone.0076809', 'journal.pone.0099630', 'journal.pone.0121549', 'journal.pone.0048402', 'journal.pone.0062170', 'journal.pone.0020152', 'journal.pone.0164571', 'journal.pone.0164378', 'journal.pone.0116682', 'journal.pone.0125542', 'journal.pone.0047110', 'journal.pone.0026503', 'journal.pone.0037102', 'journal.pone.0014163', 'journal.pone.0043204', 'journal.pone.0001276', 'journal.pone.0035142', 'journal.pone.0011299', 'journal.pone.0005373', 'journal.pone.0030980', 'journal.pone.0000306', 'journal.pone.0064576', 'journal.pone.0016011', 'journal.pone.0001444', 'journal.pone.0043406', 'journal.pone.0029192', 'journal.pone.0001908', 
'journal.pone.0016256', 'journal.pone.0013512', 'journal.pone.0045965', 'journal.pone.0022730', 'journal.pone.0006333', 'journal.pone.0004168', 'journal.pone.0035453', 'journal.pone.0032853', 'journal.ppat.1003435', 'journal.ppat.1002062', 'journal.ppat.1000915', 'journal.ppat.1000210', 'journal.ppat.0020025']\n" - ] - } - ], - "source": [ - "retractions_article_list, retracted_article_list = get_retracted_article_list()\n", - "print(retracted_article_list)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Get list of corrected articles" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": { - "editable": false, - "run_control": { - "frozen": true - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "journal.pcbi.1003582.xml has incorrect linked DOI: journal.10.1371/journal.pcbi.1003490\n", - "journal.pcbi.1003732.xml has incorrect linked DOI: journal.10.1371/journal.pcbi.1003159\n", - "journal.pone.0101541.xml has incorrect linked DOI: journal.PONE-D-13-26510\n", - "journal.pone.0104353.xml has incorrect linked DOI: journal.\n", - "journal.pone.0104472.xml has incorrect linked DOI: journal.\n", - "journal.pone.0104581.xml has incorrect linked DOI: journal.\n", - "journal.pone.0104601.xml has incorrect linked DOI: journal.\n", - "journal.pone.0105485.xml has incorrect linked DOI: journal.\n", - "journal.pone.0105486.xml has incorrect linked DOI: journal.\n", - "journal.pone.0105490.xml has incorrect linked DOI: journal.\n", - "journal.pone.0105658.xml has incorrect linked DOI: journal.\n", - "journal.pone.0105668.xml has incorrect linked DOI: journal.\n", - "journal.pone.0105669.xml has incorrect linked DOI: journal.\n", - "9127 corrected articles found.\n" - ] - } - ], - "source": [ - "corrections_article_list, corrected_article_list = get_corrected_article_list()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": 
"python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.2" - }, - "toc": { - "colors": { - "hover_highlight": "#DAA520", - "navigate_num": "#000000", - "navigate_text": "#333333", - "running_highlight": "#FF0000", - "selected_highlight": "#FFD700", - "sidebar_border": "#EEEEEE", - "wrapper_background": "#FFFFFF" - }, - "moveMenuLeft": true, - "nav_menu": { - "height": "174px", - "width": "252px" - }, - "navigate_menu": true, - "number_sections": true, - "sideBar": true, - "threshold": 4, - "toc_cell": false, - "toc_section_display": "block", - "toc_window_display": false, - "widenNotebook": false - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/allofplos/Corpus_QA-Copy1.ipynb b/allofplos/Corpus_QA-Copy1.ipynb deleted file mode 100644 index b7ccf708..00000000 --- a/allofplos/Corpus_QA-Copy1.ipynb +++ /dev/null @@ -1,450 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Required functions" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "code_folding": [] - }, - "outputs": [], - "source": [ - "from samples.corpus_analysis import *" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# PLOS/NLM article type mapping" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'i' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# For mapping the JATS article type onto the PLOS 
article type, while taking NLM DTD into account.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0marticle_types_map\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_article_types_map\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mPLOS_article_types_structured\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcounter\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marticle_types_map\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmost_common\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mPLOS_article_types_structured\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/PLOS_Corpus_Project/allofplos/allofplos/samples/corpus_analysis.py\u001b[0m in \u001b[0;36mget_article_types_map\u001b[0;34m(directory)\u001b[0m\n\u001b[1;32m 181\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0marticle_file\u001b[0m \u001b[0;32min\u001b[0m \u001b[0marticle_files\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 182\u001b[0m \u001b[0mjats_article_type\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_article_type\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marticle_file\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 183\u001b[0;31m \u001b[0mplos_article_type\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_plos_article_type\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marticle_file\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 184\u001b[0m \u001b[0mdtd_version\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_article_dtd\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marticle_file\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 185\u001b[0m \u001b[0mtypes\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0;34m[\u001b[0m\u001b[0mjats_article_type\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mplos_article_type\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtd_version\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/PLOS_Corpus_Project/allofplos/allofplos/samples/corpus_analysis.py\u001b[0m in \u001b[0;36mget_plos_article_type\u001b[0;34m(article_file)\u001b[0m\n\u001b[1;32m 138\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0msubject\u001b[0m \u001b[0;32min\u001b[0m \u001b[0msubject_list\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 139\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0msubject\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'subj-group-type'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m\"heading\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 140\u001b[0;31m \u001b[0msubject_instance\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msubject_list\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 141\u001b[0m \u001b[0ms\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m''\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 142\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mtext\u001b[0m \u001b[0;32min\u001b[0m \u001b[0msubject_instance\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mitertext\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mNameError\u001b[0m: name 'i' is not defined" - ] - } - ], - "source": [ - "# For mapping the JATS article type onto the PLOS article type, while taking NLM DTD into account.\n", - "article_types_map = get_article_types_map()\n", - "PLOS_article_types_structured = counter(article_types_map).most_common()\n", - "print(PLOS_article_types_structured)" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# create .csv file mapping JATS to PLOS article types\n", - "article_types_map_to_csv(article_types_map)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Retracted and corrected articles" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Get list of retracted articles" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1 retracted articles found.\n" - ] - } - ], - "source": [ - "# article_list = [doi_to_file(doi) for doi in get_random_list_of_dois(count=5000)]\n", - "retractions_doi_list, retracted_doi_list = get_retracted_doi_list(article_list=article_list)" - ] - }, - { - "cell_type": "code", - "execution_count": 65, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['10.1371/journal.pbio.1002215']" - ] - }, - "execution_count": 65, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "retractions_doi_list" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "article_list = [doi_to_file('10.1371/journal.pbio.1002215')]" - ] - }, - { - "cell_type": "code", - "execution_count": 61, - "metadata": {}, - "outputs": [], - "source": [ - "def get_retracted_doi_list(article_list=None, directory=corpusdir):\n", - " \"\"\"\n", - " Scans through articles in a directory to see if they are retraction notifications,\n", - " scans articles that are that type to find DOIs of retracted articles\n", - " :return: tuple of lists of DOIs for retractions articles, and retracted articles\n", - " \"\"\"\n", - " retractions_doi_list = []\n", - " retracted_doi_list = []\n", - " if article_list is None:\n", - " article_list = listdir_nohidden(directory)\n", - " for article_file in article_list:\n", - " if check_if_retraction_article(article_file):\n", 
- " retractions_doi_list.append(file_to_doi(article_file))\n", - " # Look in those articles to find actual articles that are retracted\n", - " retracted_doi = get_related_retraction_article(article_file)[0]\n", - " retracted_doi_list.append(retracted_doi)\n", - " # check linked DOI for accuracy\n", - " if make_regex_bool(full_doi_regex_match.search(retracted_doi)) is False:\n", - " print(\"{} has incorrect linked DOI field: '{}'\".format(article_file, retracted_doi))\n", - " if len(retractions_doi_list) == len(retracted_doi_list):\n", - " print(len(retracted_doi_list), 'retracted articles found.')\n", - " else:\n", - " print('Number of retraction articles and retracted articles are different: ',\n", - " '{} vs. {}'.format(len(retractions_article_list), len(retracted_article_list)))\n", - " return retractions_doi_list, retracted_doi_list" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Get list of corrected articles" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "5 corrected articles found.\n" - ] - } - ], - "source": [ - "article_list = [doi_to_file(doi) for doi in get_random_list_of_dois(count=100)]\n", - "corrections_article_list, corrected_article_list = get_corrected_article_list(article_list=article_list)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['10.1371/journal.pone.0065474', '10.1371/journal.pone.0144760', '10.1371/journal.pone.0050818', '10.1371/journal.pmed.1001786', '10.1371/journal.ppat.1003068']\n" - ] - } - ], - "source": [ - "print(corrected_article_list)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Check raw XML for article updates" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# By default, 
checks only the 30,000 most recent articles\n", - "articles_different_list = revisiondate_sanity_check()\n", - "print(articles_different_list)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# DOI and filename sanity check" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Check if article filenames match their full DOIs & that DOI fields are correct\n", - "messed_up_plos_list = article_doi_sanity_check()\n", - "messed_up_pmc_list = article_doi_sanity_check(directory=pmcdir, article_list=None, source='PMC')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# PubMed Corpus" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Get all local, solr, and PMC DOIs" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[1mArticles that needs to be re-indexed on Solr:\n", - "\u001b[0m10.1371/journal.pone.0076809\n" - ] - } - ], - "source": [ - "plos_articles = get_all_plos_dois()\n", - "doi_to_pmc = get_articles_by_doi_field(check_new=False)\n", - "pmc_articles = list(doi_to_pmc.keys())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Compare PLOS's copy to PMC" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For sets of PLOS's corpus from PMC and PLOS, see which article are missing from PLOS's version of the corpus by:\n", - "* removing Currents articles\n", - "* checking if articles are live on journals.plos.org\n", - "* checking that the DOIs resolve" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "missing_plos_articles = process_missing_plos_articles(pmc_articles=pmc_articles, plos_articles=plos_articles)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Compare PMC's 
copy to PLOS" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For sets of PLOS's corpus from PMC and PLOS, see which article are missing from PMC's version of the Corpus by:\n", - "* updating the PMCID:DOI mapping document\n", - "* removing articles too recent to be indexed (pubdate less than 3 weeks ago)\n", - "* excluding uncorrected proofs\n", - "* excluding PLOS Medicine quizzes" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "ename": "KeyboardInterrupt", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mmissing_pmc_articles\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mprocess_missing_pmc_articles\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpmc_articles\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mpmc_articles\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mplos_articles\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mplos_articles\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m~/PLOS_Corpus_Project/allofplos/allofplos/samples/corpus_analysis.py\u001b[0m in \u001b[0;36mprocess_missing_pmc_articles\u001b[0;34m(pmc_articles, plos_articles)\u001b[0m\n\u001b[1;32m 730\u001b[0m \u001b[0mplos_articles\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_all_plos_dois\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 731\u001b[0m \u001b[0mmissing_pmc_dois\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mplos_articles\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m-\u001b[0m 
\u001b[0mset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpmc_articles\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 732\u001b[0;31m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 733\u001b[0m \u001b[0;31m# Query for PMC updates & update DOI-to-PMCID dictionary\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 734\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mmissing_pmc_dois\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/PLOS_Corpus_Project/allofplos/allofplos/samples/corpus_analysis.py\u001b[0m in \u001b[0;36mupdate_pmc_dict_by_doi\u001b[0;34m(id_list)\u001b[0m\n\u001b[1;32m 562\u001b[0m '''\n\u001b[1;32m 563\u001b[0m \u001b[0mdoi_to_pmc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_articles_by_doi_field\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcheck_new\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 564\u001b[0;31m \u001b[0mdoi_to_pmc2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdois_not_in_pmc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_pmc_doi_dict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mid_list\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 565\u001b[0m \u001b[0mfull_pmc_dict\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0mdoi_to_pmc2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mdoi_to_pmc\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 566\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpmc_csv\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'w'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mfile\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/PLOS_Corpus_Project/allofplos/allofplos/samples/corpus_analysis.py\u001b[0m in \u001b[0;36mget_pmc_doi_dict\u001b[0;34m(id_list, 
chunk_size)\u001b[0m\n\u001b[1;32m 536\u001b[0m \u001b[0mpmc_doi_query\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpmc_doi_query_url\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mpmc_doi_string\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 537\u001b[0m \u001b[0;31m# Parse the results & create dict entry for each result\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 538\u001b[0;31m \u001b[0mpmc_response\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mrequests\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpmc_doi_query\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 539\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mpmc_response\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstatus_code\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m500\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 540\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Error for DOI chunk; retry with smaller chunk size'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/miniconda3/envs/py3/lib/python3.6/site-packages/requests/api.py\u001b[0m in \u001b[0;36mget\u001b[0;34m(url, params, **kwargs)\u001b[0m\n\u001b[1;32m 70\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 71\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msetdefault\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'allow_redirects'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 72\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mrequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'get'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparams\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mparams\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 73\u001b[0m 
\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 74\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/miniconda3/envs/py3/lib/python3.6/site-packages/requests/api.py\u001b[0m in \u001b[0;36mrequest\u001b[0;34m(method, url, **kwargs)\u001b[0m\n\u001b[1;32m 56\u001b[0m \u001b[0;31m# cases, and look like a memory leak in others.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 57\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0msessions\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSession\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0msession\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 58\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0msession\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 59\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 60\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/miniconda3/envs/py3/lib/python3.6/site-packages/requests/sessions.py\u001b[0m in \u001b[0;36mrequest\u001b[0;34m(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)\u001b[0m\n\u001b[1;32m 506\u001b[0m }\n\u001b[1;32m 507\u001b[0m \u001b[0msend_kwargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msettings\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 508\u001b[0;31m \u001b[0mresp\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mprep\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0;34m**\u001b[0m\u001b[0msend_kwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 509\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 510\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresp\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/miniconda3/envs/py3/lib/python3.6/site-packages/requests/sessions.py\u001b[0m in \u001b[0;36msend\u001b[0;34m(self, request, **kwargs)\u001b[0m\n\u001b[1;32m 616\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 617\u001b[0m \u001b[0;31m# Send the request\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 618\u001b[0;31m \u001b[0mr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0madapter\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 619\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 620\u001b[0m \u001b[0;31m# Total elapsed time of the request (approximately)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/miniconda3/envs/py3/lib/python3.6/site-packages/requests/adapters.py\u001b[0m in \u001b[0;36msend\u001b[0;34m(self, request, stream, timeout, verify, cert, proxies)\u001b[0m\n\u001b[1;32m 438\u001b[0m \u001b[0mdecode_content\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 439\u001b[0m \u001b[0mretries\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmax_retries\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 440\u001b[0;31m \u001b[0mtimeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 441\u001b[0m )\n\u001b[1;32m 442\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - 
"\u001b[0;32m~/miniconda3/envs/py3/lib/python3.6/site-packages/urllib3/connectionpool.py\u001b[0m in \u001b[0;36murlopen\u001b[0;34m(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)\u001b[0m\n\u001b[1;32m 599\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtimeout_obj\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 600\u001b[0m \u001b[0mbody\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mbody\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mheaders\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 601\u001b[0;31m chunked=chunked)\n\u001b[0m\u001b[1;32m 602\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 603\u001b[0m \u001b[0;31m# If we're going to release the connection in ``finally:``, then\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/miniconda3/envs/py3/lib/python3.6/site-packages/urllib3/connectionpool.py\u001b[0m in \u001b[0;36m_make_request\u001b[0;34m(self, conn, method, url, timeout, chunked, **httplib_request_kw)\u001b[0m\n\u001b[1;32m 344\u001b[0m \u001b[0;31m# Trigger any extra validation we need to do.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 345\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 346\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_validate_conn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 347\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mSocketTimeout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mBaseSSLError\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 348\u001b[0m \u001b[0;31m# Py2 raises this as a 
BaseSSLError, Py3 raises it as socket timeout.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/miniconda3/envs/py3/lib/python3.6/site-packages/urllib3/connectionpool.py\u001b[0m in \u001b[0;36m_validate_conn\u001b[0;34m(self, conn)\u001b[0m\n\u001b[1;32m 848\u001b[0m \u001b[0;31m# Force connect early to allow us to validate the connection.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 849\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mgetattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'sock'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;31m# AppEngine might not have `.sock`\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 850\u001b[0;31m \u001b[0mconn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconnect\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 851\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 852\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mconn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mis_verified\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/miniconda3/envs/py3/lib/python3.6/site-packages/urllib3/connection.py\u001b[0m in \u001b[0;36mconnect\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 324\u001b[0m \u001b[0mca_cert_dir\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mca_cert_dir\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 325\u001b[0m \u001b[0mserver_hostname\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mhostname\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 326\u001b[0;31m ssl_context=context)\n\u001b[0m\u001b[1;32m 327\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 328\u001b[0m \u001b[0;32mif\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0massert_fingerprint\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/miniconda3/envs/py3/lib/python3.6/site-packages/urllib3/util/ssl_.py\u001b[0m in \u001b[0;36mssl_wrap_socket\u001b[0;34m(sock, keyfile, certfile, cert_reqs, ca_certs, server_hostname, ssl_version, ciphers, ssl_context, ca_cert_dir)\u001b[0m\n\u001b[1;32m 327\u001b[0m \u001b[0mcontext\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload_cert_chain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcertfile\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkeyfile\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 328\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mHAS_SNI\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;31m# Platform-specific: OpenSSL with enabled SNI\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 329\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mcontext\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwrap_socket\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msock\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mserver_hostname\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mserver_hostname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 330\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 331\u001b[0m warnings.warn(\n", - "\u001b[0;32m~/miniconda3/envs/py3/lib/python3.6/site-packages/urllib3/contrib/pyopenssl.py\u001b[0m in \u001b[0;36mwrap_socket\u001b[0;34m(self, sock, server_side, do_handshake_on_connect, suppress_ragged_eofs, server_hostname)\u001b[0m\n\u001b[1;32m 439\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 440\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 441\u001b[0;31m \u001b[0mcnx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdo_handshake\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 442\u001b[0m 
\u001b[0;32mexcept\u001b[0m \u001b[0mOpenSSL\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSSL\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mWantReadError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 443\u001b[0m \u001b[0mrd\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mutil\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwait_for_read\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msock\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msock\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgettimeout\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/miniconda3/envs/py3/lib/python3.6/site-packages/OpenSSL/SSL.py\u001b[0m in \u001b[0;36mdo_handshake\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1713\u001b[0m \u001b[0;34m:\u001b[0m\u001b[0;32mreturn\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1714\u001b[0m \"\"\"\n\u001b[0;32m-> 1715\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_lib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSSL_do_handshake\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_ssl\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1716\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_raise_ssl_error\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_ssl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1717\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mKeyboardInterrupt\u001b[0m: " - ] - } - ], - "source": [ - "missing_pmc_articles = process_missing_pmc_articles(pmc_articles=pmc_articles, plos_articles=plos_articles)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Save lists of missing articles to text files if needed" - ] - }, - { - "cell_type": "code", - "execution_count": null, - 
"metadata": {}, - "outputs": [], - "source": [ - "with open('missing_plos_articles.txt', 'w') as file:\n", - " for item in sorted(set(missing_plos_articles)):\n", - " file.write(\"%s\\n\" % item)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "id_list=listdir_nohidden(pmcdir, extension='.nxml')" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "ename": "KeyError", - "evalue": "'doi'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdoi_to_pmc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_pmc_doi_dict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mid_list\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m~/PLOS_Corpus_Project/allofplos/allofplos/samples/corpus_analysis.py\u001b[0m in \u001b[0;36mget_pmc_doi_dict\u001b[0;34m(id_list, chunk_size)\u001b[0m\n\u001b[1;32m 545\u001b[0m \u001b[0mpmc_results\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpmc_results\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgetchildren\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;31m# exclude echo header\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 546\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mpmc_results\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 547\u001b[0;31m \u001b[0mdoi\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mresult\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mattrib\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'doi'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 548\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 549\u001b[0m \u001b[0mpmcid\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mattrib\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'pmcid'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32msrc/lxml/lxml.etree.pyx\u001b[0m in \u001b[0;36mlxml.etree._Attrib.__getitem__ (src/lxml/lxml.etree.c:70679)\u001b[0;34m()\u001b[0m\n", - "\u001b[0;31mKeyError\u001b[0m: 'doi'" - ] - } - ], - "source": [ - "doi_to_pmc = get_pmc_doi_dict(id_list)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "with open('missing_pmc_articles.txt', 'w') as file:\n", - " for item in sorted(set(missing_pmc_articles)):\n", - " file.write(\"%s\\n\" % item)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "py3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.2" - }, - "toc": { - "nav_menu": { - "height": "174px", - "width": "252px" - }, - "number_sections": true, - "sideBar": true, - "skip_h1_title": false, - "toc_cell": false, - "toc_position": {}, - "toc_section_display": "block", - "toc_window_display": false - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/allofplos/Corpus_QA.ipynb b/allofplos/Corpus_QA.ipynb deleted file mode 100644 index 1007c803..00000000 --- a/allofplos/Corpus_QA.ipynb +++ /dev/null @@ -1,512 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 
Required functions" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "code_folding": [] - }, - "outputs": [], - "source": [ - "from samples.corpus_analysis import *" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# PLOS/NLM article type mapping" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "scrolled": true - }, - "outputs": [], - "source": [ - "# For mapping the JATS article type onto the PLOS article type, while taking NLM DTD into account.\n", - "article_types_map = get_article_types_map()\n", - "PLOS_article_types_structured = counter(article_types_map).most_common()\n", - "print(PLOS_article_types_structured)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# create .csv file mapping JATS to PLOS article types\n", - "article_types_map_to_csv(article_types_map)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Retracted and corrected articles" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Get list of retracted articles" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "deletable": false, - "editable": false, - "run_control": { - "frozen": true - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "79 retracted articles found.\n", - "['journal.pbio.0030123', 'journal.pbio.0050005', 'journal.pbio.0050146', 'journal.pbio.1001212', 'journal.pcbi.1002308', 'journal.pgen.1003361', 'journal.pgen.1003791', 'journal.pgen.1005586', 'journal.pgen.1000424', 'journal.pmed.1001214', 'journal.pone.0072333', 'journal.pone.0084127', 'journal.pone.0027571', 'journal.pone.0046410', 'journal.pone.0080145', 'journal.pone.0019652', 'journal.pone.0075928', 'journal.pone.0075046', 'journal.pone.0062178', 'journal.pone.0051549', 'journal.pone.0093095', 
'journal.pone.0069669', 'journal.pone.0133525', 'journal.pone.0115980', 'journal.pone.0115741', 'journal.pone.0139044', 'journal.pone.0146193', 'journal.pone.0045667', 'journal.pone.0040789', 'journal.pone.0094830', 'journal.pone.0031943', 'journal.pone.0097700', 'journal.pone.0047218', 'journal.pone.0090951', 'journal.pone.0014232', 'journal.pone.0090318', 'journal.pone.0072895', 'journal.pone.0065651', 'journal.pone.0059556', 'journal.pone.0076809', 'journal.pone.0099630', 'journal.pone.0121549', 'journal.pone.0048402', 'journal.pone.0062170', 'journal.pone.0020152', 'journal.pone.0164571', 'journal.pone.0164378', 'journal.pone.0116682', 'journal.pone.0125542', 'journal.pone.0047110', 'journal.pone.0026503', 'journal.pone.0037102', 'journal.pone.0014163', 'journal.pone.0043204', 'journal.pone.0001276', 'journal.pone.0035142', 'journal.pone.0011299', 'journal.pone.0005373', 'journal.pone.0030980', 'journal.pone.0000306', 'journal.pone.0064576', 'journal.pone.0016011', 'journal.pone.0001444', 'journal.pone.0043406', 'journal.pone.0029192', 'journal.pone.0001908', 'journal.pone.0016256', 'journal.pone.0013512', 'journal.pone.0045965', 'journal.pone.0022730', 'journal.pone.0006333', 'journal.pone.0004168', 'journal.pone.0035453', 'journal.pone.0032853', 'journal.ppat.1003435', 'journal.ppat.1002062', 'journal.ppat.1000915', 'journal.ppat.1000210', 'journal.ppat.0020025']\n" - ] - } - ], - "source": [ - "retractions_article_list, retracted_article_list = get_retracted_article_list()\n", - "print(retracted_article_list)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Get list of corrected articles" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": { - "deletable": false, - "editable": false, - "run_control": { - "frozen": true - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "journal.pcbi.1003582.xml has incorrect linked DOI: journal.10.1371/journal.pcbi.1003490\n", - 
"journal.pcbi.1003732.xml has incorrect linked DOI: journal.10.1371/journal.pcbi.1003159\n", - "journal.pone.0101541.xml has incorrect linked DOI: journal.PONE-D-13-26510\n", - "journal.pone.0104353.xml has incorrect linked DOI: journal.\n", - "journal.pone.0104472.xml has incorrect linked DOI: journal.\n", - "journal.pone.0104581.xml has incorrect linked DOI: journal.\n", - "journal.pone.0104601.xml has incorrect linked DOI: journal.\n", - "journal.pone.0105485.xml has incorrect linked DOI: journal.\n", - "journal.pone.0105486.xml has incorrect linked DOI: journal.\n", - "journal.pone.0105490.xml has incorrect linked DOI: journal.\n", - "journal.pone.0105658.xml has incorrect linked DOI: journal.\n", - "journal.pone.0105668.xml has incorrect linked DOI: journal.\n", - "journal.pone.0105669.xml has incorrect linked DOI: journal.\n", - "9127 corrected articles found.\n" - ] - } - ], - "source": [ - "corrections_article_list, corrected_article_list = get_corrected_article_list()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Check raw XML for article updates" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "deletable": false, - "editable": false, - "run_control": { - "frozen": true - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "downloaded new version of journal.pone.0182022.xml\n", - "downloaded new version of journal.pone.0175323.xml\n", - "downloaded new version of journal.pone.0171255.xml\n", - "downloaded new version of journal.pone.0158499.xml\n", - "30000 article checked for updates.\n", - "4 articles have updates.\n", - "['journal.pone.0182022.xml', 'journal.pone.0175323.xml', 'journal.pone.0171255.xml', 'journal.pone.0158499.xml']\n" - ] - } - ], - "source": [ - "# By default, checks only the 30,000 most recent articles\n", - "articles_different_list = revisiondate_sanity_check()\n", - "print(articles_different_list)" - ] - }, - { - "cell_type": "markdown", 
- "metadata": {}, - "source": [ - "# DOI and filename sanity check" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "deletable": false, - "editable": false, - "run_control": { - "frozen": true - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "All article file names match DOIs.\n", - "PMC2687079.nxml has invalid DOI field: '10.1371/annotation/1cdc7975-50d7-40a5-99ca-83580df2982f '\n" - ] - } - ], - "source": [ - "# Check if article filenames match their full DOIs & that DOI fields are correct\n", - "messed_up_plos_list = article_doi_sanity_check()\n", - "messed_up_pmc_list = article_doi_sanity_check(directory=pmcdir, article_list=None, source='PMC')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# PubMed Corpus" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Get all local, solr, and PMC DOIs" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "deletable": false, - "editable": false, - "run_control": { - "frozen": true - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[1mArticles that needs to be re-indexed on Solr:\n", - "\u001b[0m10.1371/journal.pone.0076809\n" - ] - } - ], - "source": [ - "plos_articles = compare_local_and_solr()\n", - "doi_to_pmc = get_articles_by_doi_field(check_new=False)\n", - "pmc_articles = list(doi_to_pmc.keys())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Compare PLOS's copy to PMC" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For sets of PLOS's corpus from PMC and PLOS, see which article are missing from PLOS's version of the corpus by:\n", - "* removing Currents articles\n", - "* checking if articles are live on journals.plos.org\n", - "* checking that the DOIs resolve" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "deletable": false, - 
"editable": false, - "run_control": { - "frozen": true - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[1mPMC DOI fields with spaces in them:\n", - "\u001b[0m\"10.1371/annotation/1cdc7975-50d7-40a5-99ca-83580df2982f \" \n", - "\n", - "\u001b[1mWorking articles that need to be re-indexed on Solr:\n", - "\u001b[0m10.1371/annotation/1391941e-93d3-48d3-8c9a-b7c6d98f9527\n", - "10.1371/annotation/a81b1fab-890c-447b-a308-5bc8ca3eb21d\n", - "10.1371/annotation/df340d50-1f94-4d8b-a252-1a82a7fa5cc7 \n", - "\n", - "\u001b[1mArticles on PMC but not on solr or journals:\n", - "\u001b[0m10.1371/journal.pone.0002957\n", - "10.1371/annotation/b83e925b-2f2a-47b9-b939-0a1eeab18324\n", - "10.1371/journal.pbio.0020201\n", - "10.1371/annotation/011969ee-3f4b-4260-8d95-1b9a4ca39008\n", - "10.1371/annotation/8f2ddf91-3499-4627-9a91-449b78465f9d\n", - "10.1371/annotation/33d82b59-59a3-4412-9853-e78e49af76b9 \n", - "\n", - "\u001b[1mMissing PLOS articles where DOI resolves to different DOI:\n", - "\u001b[0m 10.1371/annotation/5e4082fd-6d86-441f-b946-a6e87a22ea57 resolves to: 10.1371/annotation/d9496d01-8c5d-4d24-8287-94449ada5064\n", - "\u001b[0m 10.1371/annotation/b8b66a84-4919-4a3e-ba3e-bb11f3853755 resolves to: 10.1371/annotation/5fbbf39a-fb47-4ce1-8069-acd830b3d41f\n", - "\n", - " \u001b[1mOther articles on PMC that aren't working correctly for PLOS:\n", - "\u001b[0m10.1371/annotation/363b6074-caec-4238-b88f-acbf45de498f\n", - "10.1371/annotation/2259f958-a68e-4e57-92b5-2ef003070cf1 \n", - "\n" - ] - } - ], - "source": [ - "missing_plos_articles = process_missing_plos_articles(pmc_articles=pmc_articles, plos_articles=plos_articles)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Compare PMC's copy to PLOS" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For sets of PLOS's corpus from PMC and PLOS, see which article are missing from PMC's version of the Corpus by:\n", - "* updating 
the PMCID:DOI mapping document\n", - "* removing articles too recent to be indexed (pubdate less than 3 weeks ago)\n", - "* excluding uncorrected proofs\n", - "* excluding PLOS Medicine quizzes" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "deletable": false, - "editable": false, - "run_control": { - "frozen": true - }, - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[1mArticles missing from PMC:\n", - "\u001b[0m10.1371/annotation/08286cd8-527f-4f14-856f-57267107efa8\n", - "10.1371/annotation/0bbea8d3-1f94-48af-915c-aec02da2f5c3\n", - "10.1371/annotation/0c5390b8-72b0-4b7e-85a3-b8c0fd9f62bf\n", - "10.1371/annotation/0ccac188-950f-4908-b232-35fb44ba7847\n", - "10.1371/annotation/0cfd3d5f-c1d0-48f8-ad69-34a95e31a8d2\n", - "10.1371/annotation/0e045706-ea24-41db-be90-27d1cbcd35b1\n", - "10.1371/annotation/17310bbb-e5bf-4901-8b6e-529577a280db\n", - "10.1371/annotation/1c419628-f1b5-45de-9f8a-43f834309ebb\n", - "10.1371/annotation/1dc00176-e096-4621-9494-2d848dac8262\n", - "10.1371/annotation/1e464689-3c86-4399-b229-1e00d65593a5\n", - "10.1371/annotation/1f110857-27d7-4e83-9eb3-4e5f51950a26\n", - "10.1371/annotation/21379809-1376-4250-b4c2-bf51eac58a98\n", - "10.1371/annotation/221e5f19-370e-4a52-add8-f882437bc85d\n", - "10.1371/annotation/230cca90-58e9-4aa1-b6b2-a1d744524fbd\n", - "10.1371/annotation/23bca9d0-f934-400e-8bb9-f5ff07f9e625\n", - "10.1371/annotation/270b432d-50ec-41f1-ad4d-ddd9f51f62a5\n", - "10.1371/annotation/2b218d50-a9d5-45b2-80d0-0e806e530749\n", - "10.1371/annotation/2c275a1b-2d36-4492-b36a-192bddf14f78\n", - "10.1371/annotation/2ca25d9c-7347-4b09-bd7a-09d6d37ff322\n", - "10.1371/annotation/2f278ed8-d5e7-440a-9e49-c8d1df20d1f1\n", - "10.1371/annotation/31412345-fc86-4d67-b37c-93d42f5f0a59\n", - "10.1371/annotation/3265139d-64c7-4c4c-83d3-1e139031e7df\n", - "10.1371/annotation/34304231-e54b-4080-af70-6f957f32d552\n", - 
"10.1371/annotation/39b41d98-b117-41cf-b5de-b8486a67b1cd\n", - "10.1371/annotation/4290dfee-64fd-4157-89e3-8edbba912420\n", - "10.1371/annotation/44f67041-2f8e-42df-826a-82172ae05a22\n", - "10.1371/annotation/49257f53-8cb1-431b-be64-7b410598b845\n", - "10.1371/annotation/4993e0e2-c580-4547-90d8-3227b87e6ae9\n", - "10.1371/annotation/4a8d9f38-1d0d-4389-a284-9f2564e1ac0b\n", - "10.1371/annotation/4b9340db-455b-4e0d-86e5-b6783747111f\n", - "10.1371/annotation/4bb6b73b-b5bb-4143-9ec3-99c90b93f3ad\n", - "10.1371/annotation/4d6c4127-82e4-408d-af89-5f2e207d523b\n", - "10.1371/annotation/4f08219c-2d7b-4309-8351-d3fe2378993f\n", - "10.1371/annotation/5487e265-8175-47cb-b9a4-d85862a4a96f\n", - "10.1371/annotation/59bcbe81-eddd-46a4-90dc-88c1ea70df72\n", - "10.1371/annotation/5e0195b6-60b9-4c03-84ae-c6c31e625be1\n", - "10.1371/annotation/6130c605-086b-46af-8f6f-6c76b8eb9c84\n", - "10.1371/annotation/638b42e3-a351-4827-a612-17fe29b48e28\n", - "10.1371/annotation/677fdf34-651e-4dc8-a0be-d0d633237a85\n", - "10.1371/annotation/712bb339-6073-4e62-9f68-b285caedd913\n", - "10.1371/annotation/730cdfd0-78c5-48fc-a095-f633905ff2f0\n", - "10.1371/annotation/7645d066-aa98-45d6-8c3e-3a30d9e03e4d\n", - "10.1371/annotation/7e304601-fc5c-40fe-857c-d6ea894d1647\n", - "10.1371/annotation/7f73ed17-709e-4d7f-9aae-aab1f4a34985\n", - "10.1371/annotation/865eaad7-8547-49ac-a42d-47e9d0755bb3\n", - "10.1371/annotation/87e2a80b-3ed7-4ef9-96cb-1268d91b6366\n", - "10.1371/annotation/8941aee3-4bb8-42a0-b09a-e7c416beeef7\n", - "10.1371/annotation/8c6eaae4-72a7-460a-8b1a-f855731f3706\n", - "10.1371/annotation/8fa70b21-32e7-4ed3-b397-ab776b5bbf30\n", - "10.1371/annotation/9239a129-5677-43b0-8fe1-0c1e75e988df\n", - "10.1371/annotation/93141e7a-61f3-48bd-87bd-216b030d773d\n", - "10.1371/annotation/936a4359-1bf5-4c33-be7d-1468e75eaa8b\n", - "10.1371/annotation/93d63399-0e71-4a25-a45c-311910ee6da5\n", - "10.1371/annotation/9630862b-4676-4b82-9869-8d8fbb2a2e65\n", - 
"10.1371/annotation/974531b0-9da4-4575-b3d1-955b0163fde0\n", - "10.1371/annotation/98908e14-e9fd-458f-9cea-ba4bec139f20\n", - "10.1371/annotation/b03fbc42-8f70-4873-9cce-854e48249a13\n", - "10.1371/annotation/b0e62f4f-812f-40b1-aef8-365b229eb2cf\n", - "10.1371/annotation/b4e623eb-4950-48d9-8d85-8d70426d95a3\n", - "10.1371/annotation/b60d4ec5-4c6f-43ab-9f63-322e3cd59636\n", - "10.1371/annotation/bae9fc08-fbfa-45b5-9d1d-0b8254d6efd5\n", - "10.1371/annotation/bc97a85c-1ecd-4cd8-ab61-0aef01f949a1\n", - "10.1371/annotation/c066bb84-13ea-4b36-a481-f149df8ce929\n", - "10.1371/annotation/c313df3a-52bd-4cbe-af14-6676480d1a43\n", - "10.1371/annotation/c81daa7c-5375-4349-970b-c63d288947eb\n", - "10.1371/annotation/caf130c3-5026-41cd-9dda-5eac7c0f016f\n", - "10.1371/annotation/d271d9c1-5588-4b43-85c3-d3de58ab61a4\n", - "10.1371/annotation/dfa05103-fc65-4f07-b30f-72a6e91613ff\n", - "10.1371/annotation/ea14adcb-033d-492d-8f8b-e047aa080cd4\n", - "10.1371/annotation/ebea4bd5-2b96-4842-b110-2f7c156e5060\n", - "10.1371/annotation/eff6e471-306a-41bd-88e3-13857af094af\n", - "10.1371/annotation/f016476b-5b84-4c9a-899f-fe8b8bc927b5\n", - "10.1371/annotation/f216b2b0-ab6b-45d8-b6ba-134a477b79b7\n", - "10.1371/annotation/f32bc670-c9cf-4bb0-9376-cd8cfd1053c1\n", - "10.1371/annotation/f8605b0a-d01c-41aa-ac9b-b605d7903a28\n", - "10.1371/annotation/f9660803-198b-4d0d-8200-719a2eb2a443\n", - "10.1371/annotation/fcca88ac-d684-46e0-a483-62af67e777bd\n", - "10.1371/annotation/fd9f9796-b42d-480d-b9f4-0adfbb919148\n", - "10.1371/annotation/fddd2ff3-c991-4c2f-8b84-a27eb20fba91\n", - "10.1371/annotation/ff089043-990a-48c2-a90f-15606c11cc98\n", - "10.1371/journal.pcbi.1005632\n", - "10.1371/journal.pcbi.1005676\n", - "10.1371/journal.pcbi.1005677\n", - "10.1371/journal.pcbi.1005692\n", - "10.1371/journal.pgen.1006910\n", - "10.1371/journal.pone.0181246\n", - "10.1371/journal.pone.0182517\n", - "10.1371/journal.ppat.1006535\n", - "10.1371/journal.ppat.1006543 \n", - "\n" - ] - } - ], - "source": [ - 
"missing_pmc_articles = process_missing_pmc_articles(pmc_articles=pmc_articles, plos_articles=plos_articles)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Save lists of missing articles to text files if needed" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "with open('missing_plos_articles.txt', 'w') as file:\n", - " for item in sorted(set(missing_plos_articles)):\n", - " file.write(\"%s\\n\" % item)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "code_folding": [], - "collapsed": true - }, - "outputs": [], - "source": [ - "with open('missing_pmc_articles.txt', 'w') as file:\n", - " for item in sorted(set(missing_pmc_articles)):\n", - " file.write(\"%s\\n\" % item)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "py3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.2" - }, - "toc": { - "nav_menu": { - "height": "174px", - "width": "252px" - }, - "number_sections": true, - "sideBar": true, - "skip_h1_title": false, - "toc_cell": false, - "toc_position": {}, - "toc_section_display": "block", - "toc_window_display": false - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/allofplos/Corpus Analysis basics.ipynb b/allofplos/allofplos_basics.ipynb similarity index 100% rename from allofplos/Corpus Analysis basics.ipynb rename to allofplos/allofplos_basics.ipynb diff --git a/allofplos/jupyternb/Corpus_Analysis-old.ipynb b/allofplos/jupyternb/Corpus_Analysis-old.ipynb deleted file mode 100644 index 4c590f64..00000000 --- a/allofplos/jupyternb/Corpus_Analysis-old.ipynb +++ /dev/null @@ -1,792 +0,0 @@ -{ - "cells": [ - { - "cell_type": 
"markdown", - "metadata": {}, - "source": [ - "# Required functions" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "code_folding": [], - "collapsed": true - }, - "outputs": [], - "source": [ - "from Samples.corpus_analysis import *" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "heading_collapsed": true - }, - "source": [ - "# PLOS article types" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "hidden": true - }, - "source": [ - "## JATS-standard NLM article types" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "editable": false, - "hidden": true, - "run_control": { - "frozen": true - }, - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "15 types of articles found.\n", - "[('research-article', 204109), ('correction', 9113), ('article-commentary', 1284), ('discussion', 1087), ('review-article', 612), ('other', 584), ('editorial', 340), ('letter', 300), ('retraction', 79), ('book-review', 77), ('meeting-report', 38), ('case-report', 23), ('expression-of-concern', 13), ('obituary', 10), ('brief-report', 1)]\n" - ] - } - ], - "source": [ - "jats_article_type_list = get_jats_article_type_list()\n", - "print(jats_article_type_list)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "hidden": true - }, - "source": [ - "## PLOS article types" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "editable": false, - "hidden": true, - "run_control": { - "frozen": true - }, - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "60 types of articles found.\n", - "[('Research Article', 202673), ('Correction', 9124), ('Synopsis', 1093), ('Perspective', 801), ('Review', 555), ('Editorial', 486), ('Pearls', 438), ('Essay', 379), ('Policy Forum', 309), ('Correspondence', 287), ('Primer', 237), ('Viewpoints', 209), ('Community Page', 139), ('Opinion', 136), 
('Health in Action', 118), ('Education', 103), ('Retraction', 79), ('Book Review/Science in the Media', 76), ('Message from ISCB', 70), ('Symposium', 70), ('Policy Platform', 54), ('Feature', 53), ('Formal Comment', 52), ('Research in Translation', 51), ('Guidelines and Guidance', 51), ('Collection Review', 50), ('Research Matters', 44), ('Interview', 44), ('The PLoS Medicine Debate', 38), ('Historical Profiles and Perspectives', 38), ('Unsolved Mystery', 34), ('Overview', 34), ('Neglected Diseases', 29), ('Expert Commentary', 29), ('Learning Forum', 27), ('From Innovation to Application', 24), ('Obituary', 22), ('Quiz', 21), ('Correspondence and Other Communications', 13), ('Expression of Concern', 13), ('Journal Club', 12), ('Meta-Research Article', 12), ('Student Forum', 12), ('Open Highlights', 11), ('Topic Page', 11), ('Case Report', 10), ('Photo Quiz', 10), ('Best Practice', 5), ('Deep Reads', 4), ('Historical and Philosophical Perspectives', 3), ('Special Report', 3), ('Book Review', 2), ('Message from the Founders', 1), ('Message from PLoS', 1), ('Short Reports', 1), ('Methods and Resources', 1), ('Technical Report', 1), ('Message from the PLoS Founders', 1), ('Collection Review ', 1), ('Debate', 1)]\n" - ] - } - ], - "source": [ - "PLOS_article_type_list = get_plos_article_type_list()\n", - "print(PLOS_article_type_list)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "heading_collapsed": true, - "hidden": true - }, - "source": [ - "## PLOS/NLM article type mapping" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "hidden": true, - "scrolled": true - }, - "outputs": [], - "source": [ - "article_types_map = get_article_types_map()\n", - "PLOS_article_types_structured = counter(article_types_map).most_common()\n", - "print(PLOS_article_types_structured)" - ] - }, - { - "cell_type": "code", - "execution_count": 85, - "metadata": { - "collapsed": true, - "hidden": true - }, - "outputs": [], - 
"source": [ - "# create .csv file mapping JATS to PLOS article types\n", - "article_types_map_to_csv(article_types_map)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "heading_collapsed": true - }, - "source": [ - "# Taking random samples of DOIs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "hidden": true - }, - "outputs": [], - "source": [ - "random_sample_of_dois = get_random_list_of_DOIs() # returns 100 DOIs by default" - ] - }, - { - "cell_type": "code", - "execution_count": 203, - "metadata": { - "hidden": true, - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['journal.pone.0074820', 'journal.pone.0063497', 'journal.pone.0126357', 'journal.pntd.0004807', 'journal.pone.0031896', 'journal.pone.0045503', 'journal.pone.0138217', 'journal.pbio.0050002', 'journal.pone.0122848', 'journal.pone.0099248']\n" - ] - } - ], - "source": [ - "random_sample_of_articles = [doi_to_article(doi) for doi in random_sample_of_dois]\n", - "print(random_sample_of_articles[0:10])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "heading_collapsed": true - }, - "source": [ - "# Retracted and corrected articles" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "hidden": true - }, - "source": [ - "## Get list of retracted articles" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "editable": false, - "hidden": true, - "run_control": { - "frozen": true - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "79 retracted articles found.\n", - "['journal.pbio.0030123', 'journal.pbio.0050005', 'journal.pbio.0050146', 'journal.pbio.1001212', 'journal.pcbi.1002308', 'journal.pgen.1003361', 'journal.pgen.1003791', 'journal.pgen.1005586', 'journal.pgen.1000424', 'journal.pmed.1001214', 'journal.pone.0072333', 'journal.pone.0084127', 'journal.pone.0027571', 'journal.pone.0046410', 
'journal.pone.0080145', 'journal.pone.0019652', 'journal.pone.0075928', 'journal.pone.0075046', 'journal.pone.0062178', 'journal.pone.0051549', 'journal.pone.0093095', 'journal.pone.0069669', 'journal.pone.0133525', 'journal.pone.0115980', 'journal.pone.0115741', 'journal.pone.0139044', 'journal.pone.0146193', 'journal.pone.0045667', 'journal.pone.0040789', 'journal.pone.0094830', 'journal.pone.0031943', 'journal.pone.0097700', 'journal.pone.0047218', 'journal.pone.0090951', 'journal.pone.0014232', 'journal.pone.0090318', 'journal.pone.0072895', 'journal.pone.0065651', 'journal.pone.0059556', 'journal.pone.0076809', 'journal.pone.0099630', 'journal.pone.0121549', 'journal.pone.0048402', 'journal.pone.0062170', 'journal.pone.0020152', 'journal.pone.0164571', 'journal.pone.0164378', 'journal.pone.0116682', 'journal.pone.0125542', 'journal.pone.0047110', 'journal.pone.0026503', 'journal.pone.0037102', 'journal.pone.0014163', 'journal.pone.0043204', 'journal.pone.0001276', 'journal.pone.0035142', 'journal.pone.0011299', 'journal.pone.0005373', 'journal.pone.0030980', 'journal.pone.0000306', 'journal.pone.0064576', 'journal.pone.0016011', 'journal.pone.0001444', 'journal.pone.0043406', 'journal.pone.0029192', 'journal.pone.0001908', 'journal.pone.0016256', 'journal.pone.0013512', 'journal.pone.0045965', 'journal.pone.0022730', 'journal.pone.0006333', 'journal.pone.0004168', 'journal.pone.0035453', 'journal.pone.0032853', 'journal.ppat.1003435', 'journal.ppat.1002062', 'journal.ppat.1000915', 'journal.ppat.1000210', 'journal.ppat.0020025']\n" - ] - } - ], - "source": [ - "retractions_article_list, retracted_article_list = get_retracted_article_list()\n", - "print(retracted_article_list)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "hidden": true - }, - "source": [ - "## Get list of corrected articles" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": { - "editable": false, - "hidden": true, - "run_control": { - "frozen": true - } - 
}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "journal.pcbi.1003582.xml has incorrect linked DOI: journal.10.1371/journal.pcbi.1003490\n", - "journal.pcbi.1003732.xml has incorrect linked DOI: journal.10.1371/journal.pcbi.1003159\n", - "journal.pone.0101541.xml has incorrect linked DOI: journal.PONE-D-13-26510\n", - "journal.pone.0104353.xml has incorrect linked DOI: journal.\n", - "journal.pone.0104472.xml has incorrect linked DOI: journal.\n", - "journal.pone.0104581.xml has incorrect linked DOI: journal.\n", - "journal.pone.0104601.xml has incorrect linked DOI: journal.\n", - "journal.pone.0105485.xml has incorrect linked DOI: journal.\n", - "journal.pone.0105486.xml has incorrect linked DOI: journal.\n", - "journal.pone.0105490.xml has incorrect linked DOI: journal.\n", - "journal.pone.0105658.xml has incorrect linked DOI: journal.\n", - "journal.pone.0105668.xml has incorrect linked DOI: journal.\n", - "journal.pone.0105669.xml has incorrect linked DOI: journal.\n", - "9127 corrected articles found.\n" - ] - } - ], - "source": [ - "corrections_article_list, corrected_article_list = get_corrected_article_list()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": true - }, - "source": [ - "# What's going on with revision_dates & article updates?" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Keep an eye on this URL for any changes. On PMC, was updated in the last few months, but that might not have has time to propagate. 
https://www.ncbi.nlm.nih.gov/pmc/oai/oai.cgi?verb=GetRecord&identifier=oai:pubmedcentral.nih.gov:3913708&metadataPrefix=pmc" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "heading_collapsed": true - }, - "source": [ - "## Step 1: Query solr for revision_date field" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": { - "hidden": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "URL for solr query: http://api.plos.org/search?q=*:*&fq=doc_type:full+-doi:image&fl=id,publication_date&wt=json&indent=true&sort=%20id%20asc&fq=publication_date:[2017-08-17T00:00:00Z+TO+2017-08-25T23:59:59Z]&rows=1000\n", - "613 results returned from this search.\n", - "['2017-08-21T00:00:00Z', '2017-08-18T00:00:00Z', '2017-08-22T00:00:00Z', '2017-08-18T00:00:00Z', '2017-08-24T00:00:00Z', '2017-08-24T00:00:00Z', '2017-08-18T00:00:00Z', '2017-08-23T00:00:00Z', '2017-08-24T00:00:00Z', '2017-08-17T00:00:00Z']\n" - ] - } - ], - "source": [ - "# This should print 10 date strings \n", - "publication_dates_list = get_solr_records(days_ago=8, item='publication_date')\n", - "print(publication_dates_list[0:10])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "hidden": true - }, - "outputs": [], - "source": [ - "# This should return an error\n", - "revision_dates_list = get_solr_records(days_ago=8, item='revision_date')\n", - "print(revision_dates_list[0:10])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Step 2: Peek inside raw XML for any changes" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "downloaded new version of journal.pone.0182022.xml\n", - "downloaded new version of journal.pone.0175323.xml\n", - "downloaded new version of journal.pone.0171255.xml\n", - "downloaded new version of journal.pone.0158499.xml\n", - "30000 
article checked for updates.\n", - "4 articles have updates.\n", - "['journal.pone.0182022.xml', 'journal.pone.0175323.xml', 'journal.pone.0171255.xml', 'journal.pone.0158499.xml']\n" - ] - } - ], - "source": [ - "articles_different_list = revisiondate_sanity_check()\n", - "print(articles_different_list)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# DOI and filename sanity check" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Check if article filenames match their full DOIs & that DOI fields are correct\n", - "# NOT WORKING AND MUST BE FIXED!\n", - "messed_up_plos_list = article_doi_sanity_check()\n", - "messed_up_pmc_list = article_doi_sanity_check(directory=pmcdir, article_list=None, source='PMC')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# PubMed Corpus" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Get all local, solr, and PMC DOIs" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[1mArticles that needs to be re-indexed on Solr:\n", - "\u001b[0m10.1371/journal.pone.0076809\n" - ] - } - ], - "source": [ - "plos_articles = compare_local_and_solr()\n", - "doi_to_pmc = get_articles_by_doi_field(check_new=False)\n", - "pmc_articles = list(doi_to_pmc.keys())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Compare PLOS's copy to PMC" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For sets of PLOS's corpus from PMC and PLOS, see which article are missing from PLOS's version of the corpus by:\n", - "* removing Currents articles\n", - "* checking if articles are live on journals.plos.org\n", - "* checking that the DOIs resolve" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - 
"outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[1mPMC DOI fields with spaces in them:\n", - "\u001b[0m\"10.1371/annotation/1cdc7975-50d7-40a5-99ca-83580df2982f \" \n", - "\n", - "\u001b[1mWorking articles that need to be re-indexed on Solr:\n", - "\u001b[0m10.1371/annotation/1391941e-93d3-48d3-8c9a-b7c6d98f9527\n", - "10.1371/annotation/a81b1fab-890c-447b-a308-5bc8ca3eb21d\n", - "10.1371/annotation/df340d50-1f94-4d8b-a252-1a82a7fa5cc7 \n", - "\n", - "\u001b[1mArticles on PMC but not on solr or journals:\n", - "\u001b[0m10.1371/journal.pone.0002957\n", - "10.1371/annotation/b83e925b-2f2a-47b9-b939-0a1eeab18324\n", - "10.1371/journal.pbio.0020201\n", - "10.1371/annotation/011969ee-3f4b-4260-8d95-1b9a4ca39008\n", - "10.1371/annotation/8f2ddf91-3499-4627-9a91-449b78465f9d\n", - "10.1371/annotation/33d82b59-59a3-4412-9853-e78e49af76b9 \n", - "\n", - "\u001b[1mMissing PLOS articles where DOI resolves to different DOI:\n", - "\u001b[0m 10.1371/annotation/5e4082fd-6d86-441f-b946-a6e87a22ea57 resolves to: 10.1371/annotation/d9496d01-8c5d-4d24-8287-94449ada5064\n", - "\u001b[0m 10.1371/annotation/b8b66a84-4919-4a3e-ba3e-bb11f3853755 resolves to: 10.1371/annotation/5fbbf39a-fb47-4ce1-8069-acd830b3d41f\n", - "\n", - " \u001b[1mOther articles on PMC that aren't working correctly for PLOS:\n", - "\u001b[0m10.1371/annotation/363b6074-caec-4238-b88f-acbf45de498f\n", - "10.1371/annotation/2259f958-a68e-4e57-92b5-2ef003070cf1 \n", - "\n" - ] - } - ], - "source": [ - "missing_plos_articles = process_missing_plos_articles(pmc_articles=pmc_articles, plos_articles=plos_articles)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Compare PMC's copy to PLOS" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For sets of PLOS's corpus from PMC and PLOS, see which article are missing from PMC's version of the Corpus by:\n", - "* updating the PMCID:DOI mapping document\n", - "* removing articles too 
recent to be indexed (pubdate less than 3 weeks ago)\n", - "* excluding uncorrected proofs\n", - "* excluding PLOS Medicine quizzes" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[1mArticles missing from PMC:\n", - "\u001b[0m10.1371/annotation/08286cd8-527f-4f14-856f-57267107efa8\n", - "10.1371/annotation/0bbea8d3-1f94-48af-915c-aec02da2f5c3\n", - "10.1371/annotation/0c5390b8-72b0-4b7e-85a3-b8c0fd9f62bf\n", - "10.1371/annotation/0ccac188-950f-4908-b232-35fb44ba7847\n", - "10.1371/annotation/0cfd3d5f-c1d0-48f8-ad69-34a95e31a8d2\n", - "10.1371/annotation/0e045706-ea24-41db-be90-27d1cbcd35b1\n", - "10.1371/annotation/17310bbb-e5bf-4901-8b6e-529577a280db\n", - "10.1371/annotation/1c419628-f1b5-45de-9f8a-43f834309ebb\n", - "10.1371/annotation/1dc00176-e096-4621-9494-2d848dac8262\n", - "10.1371/annotation/1e464689-3c86-4399-b229-1e00d65593a5\n", - "10.1371/annotation/1f110857-27d7-4e83-9eb3-4e5f51950a26\n", - "10.1371/annotation/21379809-1376-4250-b4c2-bf51eac58a98\n", - "10.1371/annotation/221e5f19-370e-4a52-add8-f882437bc85d\n", - "10.1371/annotation/230cca90-58e9-4aa1-b6b2-a1d744524fbd\n", - "10.1371/annotation/23bca9d0-f934-400e-8bb9-f5ff07f9e625\n", - "10.1371/annotation/270b432d-50ec-41f1-ad4d-ddd9f51f62a5\n", - "10.1371/annotation/2b218d50-a9d5-45b2-80d0-0e806e530749\n", - "10.1371/annotation/2c275a1b-2d36-4492-b36a-192bddf14f78\n", - "10.1371/annotation/2ca25d9c-7347-4b09-bd7a-09d6d37ff322\n", - "10.1371/annotation/2f278ed8-d5e7-440a-9e49-c8d1df20d1f1\n", - "10.1371/annotation/31412345-fc86-4d67-b37c-93d42f5f0a59\n", - "10.1371/annotation/3265139d-64c7-4c4c-83d3-1e139031e7df\n", - "10.1371/annotation/34304231-e54b-4080-af70-6f957f32d552\n", - "10.1371/annotation/39b41d98-b117-41cf-b5de-b8486a67b1cd\n", - "10.1371/annotation/4290dfee-64fd-4157-89e3-8edbba912420\n", - 
"10.1371/annotation/44f67041-2f8e-42df-826a-82172ae05a22\n", - "10.1371/annotation/49257f53-8cb1-431b-be64-7b410598b845\n", - "10.1371/annotation/4993e0e2-c580-4547-90d8-3227b87e6ae9\n", - "10.1371/annotation/4a8d9f38-1d0d-4389-a284-9f2564e1ac0b\n", - "10.1371/annotation/4b9340db-455b-4e0d-86e5-b6783747111f\n", - "10.1371/annotation/4bb6b73b-b5bb-4143-9ec3-99c90b93f3ad\n", - "10.1371/annotation/4d6c4127-82e4-408d-af89-5f2e207d523b\n", - "10.1371/annotation/4f08219c-2d7b-4309-8351-d3fe2378993f\n", - "10.1371/annotation/5487e265-8175-47cb-b9a4-d85862a4a96f\n", - "10.1371/annotation/59bcbe81-eddd-46a4-90dc-88c1ea70df72\n", - "10.1371/annotation/5e0195b6-60b9-4c03-84ae-c6c31e625be1\n", - "10.1371/annotation/6130c605-086b-46af-8f6f-6c76b8eb9c84\n", - "10.1371/annotation/638b42e3-a351-4827-a612-17fe29b48e28\n", - "10.1371/annotation/677fdf34-651e-4dc8-a0be-d0d633237a85\n", - "10.1371/annotation/712bb339-6073-4e62-9f68-b285caedd913\n", - "10.1371/annotation/730cdfd0-78c5-48fc-a095-f633905ff2f0\n", - "10.1371/annotation/7645d066-aa98-45d6-8c3e-3a30d9e03e4d\n", - "10.1371/annotation/7e304601-fc5c-40fe-857c-d6ea894d1647\n", - "10.1371/annotation/7f73ed17-709e-4d7f-9aae-aab1f4a34985\n", - "10.1371/annotation/865eaad7-8547-49ac-a42d-47e9d0755bb3\n", - "10.1371/annotation/87e2a80b-3ed7-4ef9-96cb-1268d91b6366\n", - "10.1371/annotation/8941aee3-4bb8-42a0-b09a-e7c416beeef7\n", - "10.1371/annotation/8c6eaae4-72a7-460a-8b1a-f855731f3706\n", - "10.1371/annotation/8fa70b21-32e7-4ed3-b397-ab776b5bbf30\n", - "10.1371/annotation/9239a129-5677-43b0-8fe1-0c1e75e988df\n", - "10.1371/annotation/93141e7a-61f3-48bd-87bd-216b030d773d\n", - "10.1371/annotation/936a4359-1bf5-4c33-be7d-1468e75eaa8b\n", - "10.1371/annotation/93d63399-0e71-4a25-a45c-311910ee6da5\n", - "10.1371/annotation/9630862b-4676-4b82-9869-8d8fbb2a2e65\n", - "10.1371/annotation/974531b0-9da4-4575-b3d1-955b0163fde0\n", - "10.1371/annotation/98908e14-e9fd-458f-9cea-ba4bec139f20\n", - 
"10.1371/annotation/b03fbc42-8f70-4873-9cce-854e48249a13\n", - "10.1371/annotation/b0e62f4f-812f-40b1-aef8-365b229eb2cf\n", - "10.1371/annotation/b4e623eb-4950-48d9-8d85-8d70426d95a3\n", - "10.1371/annotation/b60d4ec5-4c6f-43ab-9f63-322e3cd59636\n", - "10.1371/annotation/bae9fc08-fbfa-45b5-9d1d-0b8254d6efd5\n", - "10.1371/annotation/bc97a85c-1ecd-4cd8-ab61-0aef01f949a1\n", - "10.1371/annotation/c066bb84-13ea-4b36-a481-f149df8ce929\n", - "10.1371/annotation/c313df3a-52bd-4cbe-af14-6676480d1a43\n", - "10.1371/annotation/c81daa7c-5375-4349-970b-c63d288947eb\n", - "10.1371/annotation/caf130c3-5026-41cd-9dda-5eac7c0f016f\n", - "10.1371/annotation/d271d9c1-5588-4b43-85c3-d3de58ab61a4\n", - "10.1371/annotation/dfa05103-fc65-4f07-b30f-72a6e91613ff\n", - "10.1371/annotation/ea14adcb-033d-492d-8f8b-e047aa080cd4\n", - "10.1371/annotation/ebea4bd5-2b96-4842-b110-2f7c156e5060\n", - "10.1371/annotation/eff6e471-306a-41bd-88e3-13857af094af\n", - "10.1371/annotation/f016476b-5b84-4c9a-899f-fe8b8bc927b5\n", - "10.1371/annotation/f216b2b0-ab6b-45d8-b6ba-134a477b79b7\n", - "10.1371/annotation/f32bc670-c9cf-4bb0-9376-cd8cfd1053c1\n", - "10.1371/annotation/f8605b0a-d01c-41aa-ac9b-b605d7903a28\n", - "10.1371/annotation/f9660803-198b-4d0d-8200-719a2eb2a443\n", - "10.1371/annotation/fcca88ac-d684-46e0-a483-62af67e777bd\n", - "10.1371/annotation/fd9f9796-b42d-480d-b9f4-0adfbb919148\n", - "10.1371/annotation/fddd2ff3-c991-4c2f-8b84-a27eb20fba91\n", - "10.1371/annotation/ff089043-990a-48c2-a90f-15606c11cc98\n", - "10.1371/journal.pcbi.1005632\n", - "10.1371/journal.pcbi.1005676\n", - "10.1371/journal.pcbi.1005677\n", - "10.1371/journal.pcbi.1005692\n", - "10.1371/journal.pgen.1006910\n", - "10.1371/journal.pone.0181246\n", - "10.1371/journal.pone.0182517\n", - "10.1371/journal.ppat.1006535\n", - "10.1371/journal.ppat.1006543 \n", - "\n" - ] - } - ], - "source": [ - "missing_pmc_articles = process_missing_pmc_articles(pmc_articles=pmc_articles, plos_articles=plos_articles)" - ] - }, - { - 
"cell_type": "markdown", - "metadata": {}, - "source": [ - "## Save lists of missing articles to text files if needed" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "with open('missing_plos_articles.txt', 'w') as file:\n", - " for item in sorted(set(missing_plos_articles)):\n", - " file.write(\"%s\\n\" % item)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "with open('missing_pmc_articles.txt', 'w') as file:\n", - " for item in sorted(set(missing_pmc_articles)):\n", - " file.write(\"%s\\n\" % item)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Count of articles by pubdate" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## How many articles published each day? month? year? For a period of time?" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "heading_collapsed": true - }, - "source": [ - "### Could consider making graphs of this..." 
- ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": { - "hidden": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[('Aug 2013', 2629), ('Dec 2013', 8), ('Jan 2014', 5), ('Jul 2013', 2627), ('Jun 2013', 2542), ('Jun 2014', 1), ('Mar 2014', 3), ('Mar 2015', 2), ('May 2013', 932), ('May 2014', 1), ('Nov 2013', 20), ('Oct 2013', 47), ('Sep 2013', 1183)]\n" - ] - } - ], - "source": [ - "import collections\n", - "counter = collections.Counter\n", - "\n", - "example_article = 'journal.pone.0012380.xml'\n", - "pubdate_list = []\n", - "article_files = listdir_nohidden(corpusdir)\n", - "pubdate_list = [get_article_pubdate(article_file) for article_file in listdir_nohidden(corpusdir)[90000:100000]]\n", - "# monthly_pubdate_list = [date.replace(day=1,hour=0,minute=0,second=0,microsecond=0) for date in pubdate_list]\n", - "monthly_pubdate_list = [date.strftime('%b %Y') for date in pubdate_list]\n", - "monthly_pubdate_list = sorted(monthly_pubdate_list)\n", - "pubdate_count = sorted(counter(monthly_pubdate_list).most_common())\n", - "print(pubdate_count)\n", - "# month_list = [x.strftime('%b %Y') for x[0] in pubdate_count]\n", - "# month_list = [x[0].strftime('%b %Y') for x in pubdate_count]" - ] - }, - { - "cell_type": "code", - "execution_count": 60, - "metadata": { - "hidden": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['May 2013', 'Jun 2013', 'Jul 2013', 'Aug 2013', 'Sep 2013', 'Oct 2013', 'Dec 2013']\n" - ] - } - ], - "source": [ - "month_list = [x[0].strftime('%b %Y') for x in pubdate_count]\n", - "print(month_list)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Count of articles published in each journal" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "import collections\n", - "counter = collections.Counter\n", - "\n", - "journal_list = 
[]\n", - "for article_file in listdir_nohidden(corpusdir):\n", - " r = get_articleXML_content(corpusdir,\n", - " article_file,\n", - " tag_path_elements=[\"/\",\n", - " \"article\",\n", - " \"front\",\n", - " \"journal-meta\",\n", - " \"journal-title-group\",\n", - " \"journal-title\"])\n", - "\n", - " journal = r[0].text\n", - " journal_list.append(journal)\n", - "\n", - "print(len(set(journal_list)), 'PLOS journals found.')\n", - "journals_structured = counter(journal_list).most_common()\n", - "print(journals_structured)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "py3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.2" - }, - "toc": { - "colors": { - "hover_highlight": "#DAA520", - "navigate_num": "#000000", - "navigate_text": "#333333", - "running_highlight": "#FF0000", - "selected_highlight": "#FFD700", - "sidebar_border": "#EEEEEE", - "wrapper_background": "#FFFFFF" - }, - "moveMenuLeft": true, - "nav_menu": { - "height": "174px", - "width": "252px" - }, - "navigate_menu": true, - "number_sections": true, - "sideBar": true, - "threshold": 4, - "toc_cell": false, - "toc_section_display": "block", - "toc_window_display": false, - "widenNotebook": false - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/allofplos/jupyternb/Download PLOS article XML from journals.plos.org.ipynb b/allofplos/jupyternb/Download PLOS article XML from journals.plos.org.ipynb deleted file mode 100644 index 7788ea5e..00000000 --- a/allofplos/jupyternb/Download PLOS article XML from journals.plos.org.ipynb +++ /dev/null @@ -1,236 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For each article in articleerrors.txt, \n", - "* go to journals.plos.org[article] URL to 
grab the raw XML \n", - "* download the xml from that webpage \n", - "* write file name based on name of article \n", - "* save xml to file \n", - "* add time delay " - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "import lxml.etree as et\n", - "import os\n", - "import time" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "First go through text list of XML files in pre-defined list articleerrors.txt, convert to Python list, and truncate characters so it fits the PLOS URL scheme. NOTE: journal name in prefix does not matter." - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "with open(\"articleerrors.txt\",\"r\") as f:\n", - " article_list = [x[:-5] for x in f.readlines()]\n", - " article_list.pop(0)" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "sample_article_list = article_list[350:360]" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "prefix = 'http://journals.plos.org/plosone/article/file?id=10.1371/'\n", - "suffix = '&type=manuscript'" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For each article in the article list, grab the XML from the constructed URL, parse with etree, and save to new XML file. Counter for every 50 articles. 
Time delay added so as not to overwhelm server" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0.1%\n", - "6.0%\n", - "11.9%\n", - "17.8%\n", - "23.7%\n", - "29.6%\n", - "35.5%\n", - "41.4%\n", - "47.4%\n", - "53.3%\n", - "59.2%\n", - "65.1%\n", - "71.0%\n", - "76.9%\n", - "82.8%\n", - "88.7%\n", - "94.6%\n" - ] - } - ], - "source": [ - "for i, article in enumerate(article_list):\n", - " url = prefix + article + suffix\n", - " articleXML = et.parse(url)\n", - " article_path = os.path.join(\"fixed_XML_articles\", article + \".xml\")\n", - " with open(article_path, 'w') as f:\n", - " f.write(et.tostring(articleXML, method = 'xml', encoding = 'unicode'))\n", - " if i%75 ==0:\n", - " print(\"{:.1%}\".format((i+1)/len(article_list)))\n", - " time.sleep(5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# if __name__ == __main__:\n", - " # main()\n", - " # this allows you to use python your_file.py " - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "1269" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "val = !ls fixed_XML_articles/\n", - "len(val)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "ename": "AttributeError", - "evalue": "'builtin_function_or_method' object has no attribute 'lower'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mstupidlist\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mget_ipython\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgetoutput\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'ls AllofPLOS_article_XML/'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mstupidlist\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrename\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlower\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;31mAttributeError\u001b[0m: 'builtin_function_or_method' object has no attribute 'lower'" - ] - } - ], - "source": [ - "import os\n", - "stupidlist = !ls AllofPLOS_article_XML/\n", - "for x in stupidlist:\n", - " os.rename.lower()" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [], - "source": [ - "a = \"hi\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "anaconda-cloud": {}, - "kernelspec": { - "display_name": "py3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.1" - }, - "toc": { - "colors": { - "hover_highlight": "#DAA520", - "navigate_num": "#000000", - "navigate_text": "#333333", - "running_highlight": "#FF0000", - "selected_highlight": "#FFD700", - "sidebar_border": "#EEEEEE", - "wrapper_background": "#FFFFFF" - }, - "moveMenuLeft": true, - "nav_menu": { - "height": "12px", - "width": "252px" - }, - "navigate_menu": true, - "number_sections": true, - "sideBar": true, - "threshold": 4, - "toc_cell": 
false, - "toc_section_display": "block", - "toc_window_display": false, - "widenNotebook": false - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/allofplos/jupyternb/Monthly integrity check for PLOS corpus.ipynb b/allofplos/jupyternb/Monthly integrity check for PLOS corpus.ipynb deleted file mode 100644 index c5a0b1d0..00000000 --- a/allofplos/jupyternb/Monthly integrity check for PLOS corpus.ipynb +++ /dev/null @@ -1,148 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# What to check:\n", - "* Maybe as part of existing monthly maintenance?\n", - "* First test this on a small subset of articles\n", - "For each file in folder, make sure filename == relevant DOI field in XML\n", - " If so, pass_file_name_test is True\n", - " else pass_file_name_test is False\n", - "List of solr query DOIs == list of DOIs in article XML folder == list of DOIs in zip file\n", - " if DOIs in solr and not folder, download those from solr & add to folder & zip\n", - " and if it's that one messed-up article, only if it's been fixed\n", - " if it's been fixed, print note to remove this logic from the code\n", - " if DOIs in folder in solr, write those DOIs to error-list & txt file & email with warning\n", - " if no error proceed to XML content testing\n", - " if error print that content still needs to be checked\n", - " \n", - "Content of content-repo XML == Content of article folder XML == Content of zip file XML\n", - " if content in repo doesn't match article folder via https://bitbucket.org/ianb/formencode/src/tip/formencode/doctest_xml_compare.py?fileviewer=file-view-default#cl-70\n", - " if uncorrected proof vs vor_update, download vor_update\n", - " otherwise save diff and return error (or: preserve old version and make content-repo default and take diff via https://www.logilab.org/859 )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - 
"file_list = drive.ListFile({'q': \"'root' in parents and trashed=false\"}).GetList()\n", - "gdrive_zip_file = [item for item in file_list if item[\"id\"] == gd_id]\n", - "gdrive_zip_filename = (item for item in gdrive_zip_file['originalFilename'])\n", - "current_zipname = str(glob(prefix_zip_name+\"*.zip\")[0])\n", - "if gdrive_filename == current_zipname: \n", - " print(\"Zip file up-to-date on Google drive. No changes made.\")" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "import filecmp\n", - "import os" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "filecmp.cmp('test_file.txt', 'accman_to_check_list.txt', shallow=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "os.stat_result(st_mode=33188, st_ino=10556917, st_dev=16777220, st_nlink=1, st_uid=738185890, st_gid=984564325, st_size=903, st_atime=1490388647, st_mtime=1490388644, st_ctime=1490388644)" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "py3", - "language": "python", - "name": "py3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.1" - }, - "toc": { - "colors": { - "hover_highlight": "#DAA520", - "navigate_num": "#000000", - "navigate_text": "#333333", - "running_highlight": 
"#FF0000", - "selected_highlight": "#FFD700", - "sidebar_border": "#EEEEEE", - "wrapper_background": "#FFFFFF" - }, - "moveMenuLeft": true, - "nav_menu": { - "height": "30px", - "width": "252px" - }, - "navigate_menu": true, - "number_sections": true, - "sideBar": true, - "threshold": 4, - "toc_cell": false, - "toc_section_display": "block", - "toc_window_display": false, - "widenNotebook": false - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/allofplos/plos_corpus.py b/allofplos/plos_corpus.py index 59a6ee0a..8820f80e 100644 --- a/allofplos/plos_corpus.py +++ b/allofplos/plos_corpus.py @@ -34,8 +34,7 @@ import requests from tqdm import tqdm -from plos_regex import (regex_match_prefix, regex_body_match, full_doi_regex_match, full_doi_regex_search, - make_regex_bool, validate_doi, validate_file, validate_url) +from plos_regex import validate_doi help_str = "This program downloads a zip file with all PLOS articles and checks for updates" @@ -253,7 +252,7 @@ def extract_filenames(directory, extension='.xml'): :return: A list with all the file names inside this directory, excluding extensions """ filenames = [os.path.basename(article_file).rstrip(extension) for article_file in listdir_nohidden(directory, extension) if - isfile(article_file)] + os.path.isfile(article_file)] return filenames @@ -427,7 +426,7 @@ def move_articles(source, destination): shutil.rmtree(source) -def get_articleXML_content(article_file, tag_path_elements=None): +def get_article_xml(article_file, tag_path_elements=None): """ For a local article file, read its XML tree Can also interpret DOIs @@ -473,9 +472,9 @@ def check_article_type(article_file): :param article_file: the xml file for a single article :return: JATS article_type at that xpath location """ - article_type = get_articleXML_content(article_file=article_file, - tag_path_elements=["/", - "article"]) + article_type = get_article_xml(article_file=article_file, + tag_path_elements=["/", + "article"]) return 
article_type[0].attrib['article-type'] @@ -488,12 +487,12 @@ def get_related_article_doi(article_file, corrected=True): :param corrected: default true, part of the Corrections workflow, more strict in tag search :return: tuple of partial doi string at that xpath location, related_article_type """ - r = get_articleXML_content(article_file=article_file, - tag_path_elements=["/", - "article", - "front", - "article-meta", - "related-article"]) + r = get_article_xml(article_file=article_file, + tag_path_elements=["/", + "article", + "front", + "article-meta", + "related-article"]) related_article = '' if corrected: for x in r: @@ -511,6 +510,56 @@ def get_related_article_doi(article_file, corrected=True): return related_article, related_article_type +def get_article_pubdate(article_file, date_format='%d %m %Y'): + """ + For an individual article, get its date of publication + :param article_file: file path/DOI of the article + :param date_format: string format used to convert to datetime object + :return: datetime object with the date of publication + """ + day = '' + month = '' + year = '' + raw_xml = get_article_xml(article_file=article_file, + tag_path_elements=["/", + "article", + "front", + "article-meta", + "pub-date"]) + for x in raw_xml: + for name, value in x.items(): + if value == 'epub': + date_fields = x + for y in date_fields: + if y.tag == 'day': + day = y.text + if y.tag == 'month': + month = y.text + if y.tag == 'year': + year = y.text + date = (day, month, year) + string_date = ' '.join(date) + pubdate = datetime.datetime.strptime(string_date, date_format) + return pubdate + + +def compare_article_pubdate(article, days=22): + """ + Check if an article's publication date was more than 3 weeks ago. 
+ :param article: doi/file of the article + :param days: how long ago to compare the publication date (default 22 days) + :return: boolean for whether the pubdate was older than the days value + """ + try: + pubdate = get_article_pubdate(article) + today = datetime.datetime.now() + three_wks_ago = datetime.timedelta(days) + compare_date = today - three_wks_ago + return pubdate < compare_date + except OSError: + print("Pubdate error in {}".format(article)) + + def download_updated_xml(article_file, tempdir=newarticledir, vor_check=False): @@ -617,7 +666,7 @@ def check_if_uncorrected_proof(article_file): :param article: Partial DOI/filename of the article :return: Boolean for whether article is an uncorrected proof (true = yes, false = no) """ - tree = get_articleXML_content(article_file) + tree = get_article_xml(article_file) for subtree in tree: if subtree.text == 'uncorrected-proof': return True @@ -699,7 +748,8 @@ def check_for_vor_updates(uncorrected_list=None): if uncorrected_list is None: uncorrected_list = get_uncorrected_proofs_list() # Make it check a single article - if isinstance(uncorrected_list, str): uncorrected_list = [uncorrected_list] + if isinstance(uncorrected_list, str): + uncorrected_list = [uncorrected_list] # Create article list chunks for Solr query no longer than 10 DOIs at a time list_chunks = [uncorrected_list[x:x+10] for x in range(0, len(uncorrected_list), 10)] @@ -824,28 +874,30 @@ def download_check_and_move(article_list, text_list, tempdir, destination): move_articles(tempdir, destination) -def download_file_from_google_drive(id, filename, destination=corpusdir): +def download_file_from_google_drive(id, filename, destination=corpusdir, file_size=None): """ General method for downloading from Google Drive. 
Doesn't require using API or having credentials :param id: Google Drive id for file (constant even if filename change) :param filename: name of the zip file :param destination: directory where to download the zip file, defaults to corpusdir + :param file_size: size of the file being downloaded :return: None """ URL = "https://docs.google.com/uc?export=download" - session = requests.Session() + file_path = os.path.join(destination, filename) + if not os.path.isfile(file_path): + session = requests.Session() - response = session.get(URL, params={'id': id}, stream=True) - token = get_confirm_token(response) + response = session.get(URL, params={'id': id}, stream=True) + token = get_confirm_token(response) - if token: - params = {'id': id, 'confirm': token} - response = session.get(URL, params=params, stream=True) - r = requests.get(URL, params=params, stream=True) - file_path = os.path.join(destination, filename) - save_response_content(response, file_path) + if token: + params = {'id': id, 'confirm': token} + response = session.get(URL, params=params, stream=True) + r = requests.get(URL, params=params, stream=True) + save_response_content(response, file_path, file_size=file_size) return file_path @@ -862,20 +914,21 @@ def get_confirm_token(response): return None -def save_response_content(response, download_path): +def save_response_content(response, download_path, file_size=None): """ Saves the downloaded file parts from Google Drive to local file Includes progress bar for download % :param response: session-based google query :param download_path: path to local zip file + :param file_size: size of the file being downloaded :return: None """ CHUNK_SIZE = 32768 # for downloading zip file if os.path.basename(download_path) == local_zip: with open(download_path, "wb") as f: - size = zip_size - pieces = size / CHUNK_SIZE + size = file_size + pieces = round(size / CHUNK_SIZE) with tqdm(total=pieces) as pbar: for chunk in response.iter_content(CHUNK_SIZE): pbar.update(1) 
@@ -894,17 +947,17 @@ def get_zip_metadata(method='initial'): Gets metadata txt file from Google Drive, that has info about zip file Used to get the file name, as well as byte size for progress bar Includes progress bar for download % - :param method: TODO: COMPLETE HERE - :return: TODO: COMPLETE HERE + :param method: boolean if initializing the PLOS Corpus (defaults to True) + :return: tuple of data about zip file: date zip created, zip size, and location of metadata txt file """ if method == 'initial': - download_file_from_google_drive(metadata_id, zip_metadata) - with open(zip_metadata) as f: + metadata_path = download_file_from_google_drive(metadata_id, zip_metadata) + with open(metadata_path) as f: zip_stats = f.read().splitlines() zip_datestring = zip_stats[0] zip_date = datetime.datetime.strptime(zip_datestring, time_formatting) zip_size = int(zip_stats[1]) - return zip_date, zip_size + return zip_date, zip_size, metadata_path def unzip_articles(file_path, @@ -921,7 +974,7 @@ def unzip_articles(file_path, :return: None """ try: - os.makedirs(directory) + os.makedirs(extract_directory) except OSError as e: if e.errno != errno.EEXIST: raise @@ -954,11 +1007,11 @@ def create_local_plos_corpus(corpusdir=corpusdir, rm_metadata=True): if os.path.isdir(corpusdir) is False: os.mkdir(corpusdir) print('Creating folder for article xml') - zip_date, zip_size = get_zip_metadata() - zip_path = download_file_from_google_drive(zip_id, local_zip) + zip_date, zip_size, metadata_path = get_zip_metadata() + zip_path = download_file_from_google_drive(zip_id, local_zip, file_size=zip_size) unzip_articles(file_path=zip_path) if rm_metadata: - os.remove(zip_metadata) + os.remove(metadata_path) if __name__ == "__main__": parser = argparse.ArgumentParser() @@ -968,9 +1021,12 @@ def create_local_plos_corpus(corpusdir=corpusdir, rm_metadata=True): URL_TMP = INT_URL_TMP else: URL_TMP = EXT_URL_TMP - # Step 0: Initialize first copy of repository - corpus_files = [name for name in 
os.listdir(corpusdir) if os.path.isfile( - os.path.join(corpusdir, name))] + # Step 0: Initialize first copy of repository] + try: + corpus_files = [name for name in os.listdir(corpusdir) if os.path.isfile( + os.path.join(corpusdir, name))] + except FileNotFoundError: + corpus_files = [] if len(corpus_files) < min_files_for_valid_corpus: print('Not enough articles in corpusdir, re-downloading zip file') # TODO: check if zip file is in top-level directory before downloading diff --git a/allofplos/plos_regex.py b/allofplos/plos_regex.py index aa59e4fc..bf36b536 100644 --- a/allofplos/plos_regex.py +++ b/allofplos/plos_regex.py @@ -1,9 +1,9 @@ -import numpy as np -import re - """ The following RegEx pertains to the 7 main PLOS journals and the defunct PLOS Clinical Trials, as well as PLOS Currents. """ + +import re + corpusdir = 'allofplos_xml/' corpusdir_regex = re.escape(corpusdir) 'http://journals.plos.org/plosone/article/file?id=' @@ -73,8 +73,7 @@ def show_invalid_dois(doi_list): if linked DOI fields in other articles (such as retractions and corrections) are correct. :return: list of DOI candidates that don't match PLOS's pattern """ - nonmatches = np.array([not validate_doi(x) for x in doi_list]) - return list(np.array(doi_list)[nonmatches]) + return list(filter(lambda x: not validate_doi(x), doi_list)) def currents_doi_filter(doi_list): @@ -83,5 +82,4 @@ def currents_doi_filter(doi_list): if linked DOI fields in PMC articles are correct. 
:return: list of DOI candidates that don't match Currents' pattern """ - nonmatches = np.array([not bool(currents_doi_regex.search(x)) for x in doi_list]) - return list(np.array(doi_list)[nonmatches]) \ No newline at end of file + return list(filter(lambda x: not bool(currents_doi_regex.search(x)), doi_list)) diff --git a/allofplos/plospmc.py b/allofplos/plospmc.py deleted file mode 100644 index 22cb1943..00000000 --- a/allofplos/plospmc.py +++ /dev/null @@ -1,77 +0,0 @@ -""" Small stand-alone script for getting all the PMC IDs for PLOS articles. -""" - -import requests -import time - -def get_all_pmc_dois(retstart=0, retmax=80000, count=None): - """Query the entrez database to get a comprehensive list of all PMCIDs associated with all PLOS journals, - individually included in the search url. - Supposedly can return 100,000, but based on the maximum not working for another function, lowered to 80K to be safe. - :param restart: the first record to return - :param retmax: the maximum number of records to return - :return: the full list of PMCIDs in PMC for PLOS articles - """ - pmc_allplos_query_url = ('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pmc&term=' - '(((((("PLoS+ONE"[Journal])+OR+"PLoS+Genetics"[Journal])+OR+"PLoS+Pathogens"[Journal])' - 'OR+"PLoS+Neglected+Tropical+Diseases"[Journal])+OR+"PLoS+Computational+Biology"[Journal])' - 'OR+"PLoS+Biology"[Journal])+OR+"PLoS+Medicine"[Journal]+OR+"plos+currents"[Journal]' - '&retmode=json&tool=corpustest&email=email@provider.com') - - pmcidlist = [] - r = requests.get(pmc_allplos_query_url).json() - if count is None: - count = int(r['esearchresult']['count']) - print(count, "articles found in PMC") - while retstart < count: - query = pmc_allplos_query_url + '&retstart={0}&retmax={1}'.format(retstart, retmax) - r = requests.get(query).json() - idlist = r['esearchresult']['idlist'] - for id in idlist: - pmcidlist.append('PMC' + id) - retstart += retmax - time.sleep(1) - pmcidlist = 
sorted(list(set(pmcidlist))) - - print(len(pmcidlist), "articles found") - return pmcidlist - - -def get_pmc_doi_dict(doi_list, chunk_size=150): - '''Using the PMC ID query API, return the accompanying PMCID for each DOI in a given list. - Can (ostensibly) query up to 200 DOIs at a time but sometimes that doesn't work. - :param doi list: a list of valid PLOS DOIs - :param chunk_size: number of DOIs to query at a single time - :return: tuple of dictionary mapping DOI to PMCID, list of DOIs not found in PMC - ''' - - doi_to_pmc = {} - dois_not_in_pmc = [] - # Make chunks of 200 DOIs at a time - list_chunks = [doi_list[x:x+chunk_size] for x in range(0, len(doi_list), chunk_size)] - for chunk in list_chunks: - pmc_doi_string = ','.join(chunk) - # Create the search URL - pmc_doi_query = pmc_doi_query_url + pmc_doi_string - # Parse the results & create dict entry for each result - pmc_response = requests.get(pmc_doi_query) - if pmc_response.status_code == 500: - print('Error for DOI chunk; retry with smaller chunk size') - else: - pmc_results = et.XML(pmc_response.content) - pmc_results = pmc_results.getchildren()[1:] # exclude echo header - for result in pmc_results: - doi = result.attrib['doi'] - try: - pmcid = result.attrib['pmcid'] - doi_to_pmc[doi] = pmcid - except KeyError: - if result.attrib['status'] == 'error': - dois_not_in_pmc.append(doi) - else: - print('Weird error for', doi) - time.sleep(1) - return doi_to_pmc, dois_not_in_pmc - -if __name__ == '__main__': - pmcidlist = get_all_pmc_dois() diff --git a/allofplos/samples/corpus_analysis.py b/allofplos/samples/corpus_analysis.py index 1a67c441..3ab1a0df 100644 --- a/allofplos/samples/corpus_analysis.py +++ b/allofplos/samples/corpus_analysis.py @@ -9,54 +9,23 @@ import collections import csv -import datetime -import lxml.etree as et -from glob import glob -from os.path import join -from os import (listdir, rmdir, mkdir) import os import progressbar -import re +import random import requests -from shutil import 
move, rmtree -import time -from download import download -import numpy as np -from plos_corpus import (listdir_nohidden, extract_filenames, check_article_type, get_articleXML_content, - get_related_article_doi, download_updated_xml, unzip_articles, get_all_solr_dois, - file_to_doi, doi_to_file, check_if_uncorrected_proof, newarticledir) -from plos_regex import (regex_match_prefix, regex_body_match, regex_body_currents, full_doi_regex_match, - full_doi_regex_search, currents_doi_regex, make_regex_bool, validate_doi, validate_file, - validate_url, find_valid_dois, show_invalid_dois, currents_doi_filter) +from plos_corpus import (listdir_nohidden, check_article_type, get_article_xml, + get_related_article_doi, download_updated_xml, get_all_solr_dois, + file_to_doi, newarticledir, get_article_pubdate) +from plos_regex import (full_doi_regex_match, validate_doi, validate_file, validate_url, currents_doi_filter) counter = collections.Counter -newpmcarticledir = "new_pmc_articles" -USER_EMAIL = 'elizabeth.seiver@gmail.com' - -pmcdir = "pmc_articles/" corpusdir = 'allofplos_xml' -pmc_csv = 'doi_to_pmc.csv' -# xml URL takes PMC identifier minus 'PMC' -pmc_xml_url = 'https://www.ncbi.nlm.nih.gov/pmc/oai/oai.cgi?verb=GetRecord&identifier=oai:pubmedcentral.nih.gov:' -pmc_xml_url_suffix = '&metadataPrefix=pmc' -# can query up to 200 DOIs from PMC -pmc_doi_query_url = 'https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?tool=corpustest&email={0}&ids='.format(USER_EMAIL) -pmc_doi_query_url_suffix = '&versions=no&format=json' -pmc_pmcid_query_url = 'https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi?id=' -pmc_allplos_query_url = ('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pmc&term=' - '(((((((("PLoS+ONE"[Journal])+OR+"PLoS+Genetics"[Journal])+OR+"PLoS+Pathogens"[Journal])' - 'OR+"PLoS+Neglected+Tropical+Diseases"[Journal])+OR+"PLoS+Computational+Biology"[Journal])' - 'OR+"PLoS+Biology"[Journal])+OR+"PLoS+Medicine"[Journal])+OR+"plos+currents"[Journal])' - 
'+OR+"PLoS+Clinical+Trials"[Journal])&retmax=1000&retmode=json&tool=corpustest' - '&email={0}'.format(USER_EMAIL)) -PMC_FTP_URL = 'ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/' -pmc_file_list = 'oa_file_list.txt' -newpmcarticledir = "new_pmc_articles" - - -def validate_corpus(): +max_invalid_files_to_print = 100 + + +def validate_corpus(corpusdir=corpusdir): """ For every local article file and DOI listed on Solr, validate file names, DOIs, URLs in terms of regular expressions. @@ -83,21 +52,25 @@ def validate_corpus(): # check files and filenames plos_files = listdir_nohidden(corpusdir) - plos_valid_filenames = [article for article in plos_files if validate_file(article)] - if len(plos_valid_dois) == len(plos_valid_filenames): - pass - else: - print("Invalid filenames: {}".format(set(plos_valid_dois) - set(plos_valid_filenames))) - return False - plos_valid_files = [article for article in plos_valid_filenames if os.path.isfile(article)] - valid_files_count = len(plos_valid_files) - if set(plos_valid_filenames) == set(plos_valid_files): - return True - else: - if valid_files_count > 220000: - print("Invalid files: {}".format(set(plos_valid_filenames) - set(plos_valid_files))) + if plos_files: + plos_valid_filenames = [article for article in plos_files if validate_file(article)] + if len(plos_valid_dois) == len(plos_valid_filenames): + pass else: - print("Not enough valid PLOS local article files. 
Corpus may need to be redownloaded") + print("Invalid filenames: {}".format(set(plos_valid_dois) - set(plos_valid_filenames))) + return False + plos_valid_files = [article for article in plos_valid_filenames if os.path.isfile(article)] + if set(plos_valid_filenames) == set(plos_valid_files): + return True + else: + invalid_files = set(plos_valid_filenames) - set(plos_valid_files) + if len(invalid_files) > max_invalid_files_to_print: + print("Too many invalid files to print: {}".format(len(invalid_files))) + else: + print("Invalid files: {}".format(invalid_files)) + return False + else: + print("Corpus directory empty. Re-download by running create_local_plos_corpus()") return False # These functions are for getting the article types of all PLOS articles. @@ -119,13 +92,12 @@ def get_jats_article_type_list(article_list=None, directory=corpusdir): def get_plos_article_type(article_file): - article_categories = get_articleXML_content( - article_file=article_file, - tag_path_elements=["/", - "article", - "front", - "article-meta", - "article-categories"]) + article_categories = get_article_xml(article_file=article_file, + tag_path_elements=["/", + "article", + "front", + "article-meta", + "article-categories"]) subject_list = article_categories[0].getchildren() for subject in subject_list: @@ -156,10 +128,9 @@ def get_plos_article_type_list(article_list=None): def get_article_dtd(article_file): try: - dtd = get_articleXML_content( - article_file=article_file, - tag_path_elements=["/", - "article"]) + dtd = get_article_xml(article_file=article_file, + tag_path_elements=["/", + "article"]) dtd = dtd[0].attrib['dtd-version'] except KeyError: print('Error parsing DTD from', article_file) @@ -331,55 +302,14 @@ def revisiondate_sanity_check(article_list=None, tempdir=newarticledir, director # These functions are for getting & analyzing the PLOS Corpus from PMC -def get_pmc_articles(): - """ - :return: a list of all article files in PMC folder - """ - # step 1: download 
tarball file if needed - pmc_url = 'ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/articles.O-Z.xml.tar.gz' - pmcdir = 'PMC_articles/' - pmc_local_tar = 'PMC_files.tar.gz' - pmc_path = os.path.join(pmcdir, pmc_local_tar) - if os.path.isdir(pmcdir) is False: - os.mkdir(pmcdir) - print('Creating folder for PMC article xml') - - if len([name for name in os.listdir(pmcdir) if os.path.isfile(os.path.join(pmcdir, name))]) < 200000: - print('Not enough articles in pmcdir, re-downloading zip file') - path = download(pmc_url, pmc_path) - - # Step 2: unzip archive - unzip_articles(directory=pmcdir, filetype='tar', file=pmc_local_tar) - - # Step 3: delete non-PLOS folders - listdirs = glob("PMC_articles/*/") - print(len(listdirs), "folders for all O-Z journals") - for directory in list(listdirs): - if directory.lower().startswith('pmc_articles/plos') is False: - rmtree(directory) - listdirs.remove(directory) - print(len(listdirs), "folders remaining for PLOS journals") - - # Step 4: put all PLOS articles in higher level pmcdir folder & flatten hierarchy - root = pmcdir - print("moving PMC articles to top-level folder") - for dirrr in list(listdirs): - files = [f for dp, dn, filenames in os.walk(dirrr) for f in filenames if os.path.splitext(f)[1] == '.nxml'] - for file in files: - move(join(dirrr, file), join(root, file)) - rmtree(dirrr) - pmc_articles = listdir_nohidden(pmcdir, extension='.nxml') - - return pmc_articles - def get_article_doi(article_file): - raw_xml = get_articleXML_content(article_file=article_file, - tag_path_elements=["/", - "article", - "front", - "article-meta", - "article-id"]) + raw_xml = get_article_xml(article_file=article_file, + tag_path_elements=["/", + "article", + "front", + "article-meta", + "article-id"]) for x in raw_xml: for name, value in x.items(): if value == 'doi': @@ -470,45 +400,6 @@ def get_articles_by_doi_field(directory=pmcdir, article_list=None, check_new=Tru return doi_to_pmc -def get_article_pubdate(article_file, date_format='%d %m %Y'): - 
day = '' - month = '' - year = '' - raw_xml = get_articleXML_content(article_file=article_file, - tag_path_elements=["/", - "article", - "front", - "article-meta", - "pub-date"]) - for x in raw_xml: - for name, value in x.items(): - if value == 'epub': - date_fields = x - for y in date_fields: - if y.tag == 'day': - day = y.text - if y.tag == 'month': - month = y.text - if y.tag == 'year': - year = y.text - date = (day, month, year) - string_date = ' '.join(date) - pubdate = datetime.datetime.strptime(string_date, date_format) - return pubdate - - -def compare_article_pubdate(article, days=22): - try: - pubdate = get_article_pubdate(article) - today = datetime.datetime.now() - three_wks_ago = datetime.timedelta(days) - compare_date = today - three_wks_ago - return pubdate < compare_date - except OSError: - print("Pubdate error in {}".format(article)) - pass - - def check_solr_doi(doi): ''' For an article doi, see if there's a record of it in Solr. @@ -518,73 +409,6 @@ def check_solr_doi(doi): return bool(article_search['response']['numFound']) -def get_pmc_doi_dict(id_list=None, chunk_size=150): - ''' - Using the PMC ID query API, return the accompanying PMCID for each identifier in a given list. - Can (ostensibly) query up to 200 identifiers at a time. 
Can accept lists of DOIs or PMC IDs - :return: tuple of dictionary mapping DOI to PMCID, list of DOIs not found in PMC - ''' - if id_list is None: - id_list = extract_filenames(pmcdir, extension='.nxml') - doi_to_pmc = {} - dois_not_in_pmc = [] - # Make chunks of 200 DOIs at a time - list_chunks = [id_list[x:x+chunk_size] for x in range(0, len(id_list), chunk_size)] - for chunk in list_chunks: - pmc_doi_string = ','.join(chunk) - # Create the search URL - pmc_doi_query = pmc_doi_query_url + pmc_doi_string - # Parse the results & create dict entry for each result - pmc_response = requests.get(pmc_doi_query) - if pmc_response.status_code == 500: - print('Error for DOI chunk; retry with smaller chunk size') - else: - pmc_results = et.XML(pmc_response.content) - pmc_results = pmc_results.getchildren()[1:] # exclude echo header - for result in pmc_results: - doi = result.attrib['doi'] - try: - pmcid = result.attrib['pmcid'] - doi_to_pmc[doi] = pmcid - except KeyError: - if result.attrib['status'] == 'error': - dois_not_in_pmc.append(doi) - else: - print('Weird error for', doi) - time.sleep(1) - return doi_to_pmc, dois_not_in_pmc - - -def update_pmc_dict_by_doi(id_list): - ''' - With a list of identifiers, query PMC ID service to check for PMCIDs for articles. Print to .csv - :return: tuple of full dictionary of DOIs to PMC IDs, DOIs without matching PMCIDs - ''' - doi_to_pmc = get_articles_by_doi_field(check_new=False) - doi_to_pmc2, dois_not_in_pmc = get_pmc_doi_dict(id_list) - full_pmc_dict = {**doi_to_pmc2, **doi_to_pmc} - with open(pmc_csv, 'w') as file: - writer = csv.writer(file) - writer.writerow(['DOI', 'PMC ID']) - for key, value in full_pmc_dict.items(): - writer.writerow([key, value]) - return full_pmc_dict, dois_not_in_pmc - - -def exclude_recent_dois(doi_list): - ''' - For arriving at a list of DOIs ostensibly missing from PMC, remove the most recent articles - which likely have not yet had the opportunity to propagate. 
- :return: a list of missing DOIs which are old enough to be expected to be on PMC. - ''' - missing_pmc_articles = [] - for doi in doi_list: - article_file = doi_to_file(doi) - if compare_article_pubdate(article_file): - missing_pmc_articles.append(doi) - return missing_pmc_articles - - def check_if_doi_resolves(doi, plos_valid=True): """ Return metadata for a given DOI. If the link works, make sure that it points to the same DOI @@ -606,81 +430,6 @@ def check_if_doi_resolves(doi, plos_valid=True): return "doesn't work" -def process_missing_plos_articles(plos_articles=None, pmc_articles=None): - ''' - For sets of PLOS's corpus from PMC and PLOS, see which article are missing from PLOS's version - of the Corpus by removing Currents articles, checking if articles are live on journals.plos.org, - and checking that the DOIs resolve. Prints the different kinds of errors that can occur. - :return: list of missing articles - ''' - if plos_articles is None or not plos_articles: - plos_articles = get_all_plos_dois() - if pmc_articles is None or not pmc_articles: - doi_to_pmc = get_articles_by_doi_field(check_new=False) - pmc_articles = list(doi_to_pmc.keys()) - missing_plos_articles = list(set(pmc_articles) - set(plos_articles)) - - # remove Currents articles - for article in missing_plos_articles: - if article.startswith('10.1371/currents') or \ - len(article) == 21 or \ - article == '10.1371/198d344bc40a75f927c9bc5024279815': - missing_plos_articles.remove(article) - - # check if articles are live on journals.plos.org - # check if DOIs resolve - missing_articles_link_works = [] - missing_articles_404_error = [] - doi_works = [] - doi_doesnt_work = [] - doi_mismatch = [] - doi_has_space = [] - for doi in missing_plos_articles: - if ' ' in doi: - doi_has_space.append(doi) - continue - doi_check = check_if_doi_resolves(doi) - if doi_check == 'works': - doi_works.append(doi) - elif doi_check == "doesn't work": - doi_doesnt_work.append(doi) - else: - doi_mismatch.append(doi) - 
continue - url = doi_to_url(doi) - article_exists = check_if_link_works(url) - if article_exists: - missing_articles_link_works.append(doi) - else: - missing_articles_404_error.append(doi) - - doi_mismatch = sorted(doi_mismatch) - link404_invalid_doi = sorted(list(set(missing_articles_404_error).intersection(doi_doesnt_work))) - linkworks_valid_doi = sorted(list(set(missing_articles_link_works).intersection(doi_works))) - - if doi_has_space: - print('\033[1m' + 'PMC DOI fields with spaces in them:') - for doi in doi_has_space: - print('\033[0m' + '"' + doi + '" \n') - if linkworks_valid_doi: - print('\033[1m' + 'Working articles that need to be re-indexed on Solr:') - print('\033[0m' + '\n'.join(linkworks_valid_doi), '\n') - if link404_invalid_doi: - print('\033[1m' + 'Articles on PMC but not on solr or journals:') - print('\033[0m' + '\n'.join(missing_articles_404_error), '\n') - if doi_mismatch: - print('\033[1m' + 'Missing PLOS articles where DOI resolves to different DOI:') - for doi in doi_mismatch: - print('\033[0m', doi, 'resolves to:', check_if_doi_resolves(doi)) - - remainder = set(missing_plos_articles) - set(linkworks_valid_doi + missing_articles_404_error + - doi_mismatch + doi_has_space) - if remainder: - print('\n \033[1m' + "Other articles on PMC that aren't working correctly for PLOS:") - print('\033[0m' + '\n'.join(remainder), '\n') - return missing_plos_articles - - def get_all_plos_dois(local_articles=None, solr_articles=None): ''' Collects lists of articles for local and solr, calculates the difference. 
@@ -694,7 +443,8 @@ def get_all_plos_dois(local_articles=None, solr_articles=None): local_articles = [file_to_doi(article_file) for article_file in listdir_nohidden(corpusdir)] missing_local_articles = set(solr_articles) - set(local_articles) if missing_local_articles: - print('re-run plos_corpus.py to download latest {} PLOS articles locally.'.format(len(missing_local_articles))) + print('re-run plos_corpus.py to download latest {0} PLOS articles locally.' + .format(len(missing_local_articles))) missing_solr_articles = set(local_articles) - set(solr_articles) plos_articles = set(solr_articles + local_articles) if missing_solr_articles: @@ -706,251 +456,17 @@ def get_all_plos_dois(local_articles=None, solr_articles=None): def get_random_list_of_dois(directory=corpusdir, count=100): ''' - Gets a list of random DOIs. Tries first to construct from local files in corpusdir, otherwise tries Solr DOI list - as backup. + Gets a list of random DOIs. Tries first to construct from local files in + corpusdir, otherwise tries Solr DOI list as backup. 
:param directory: defaults to searching corpusdir :param count: specify how many DOIs are to be returned :return: a list of random DOIs for analysis ''' try: article_list = listdir_nohidden(directory) - np_list = np.array(article_list) - sample_file_list = list(np.random.choice(np_list, size=count, replace=False)) + sample_file_list = random.sample(article_list, count) sample_doi_list = [file_to_doi(file) for file in sample_file_list] except OSError: doi_list = get_all_solr_dois() - np_list = np.array(doi_list) - sample_doi_list = list(np.random.choice(np_list, size=count, replace=False)) + sample_doi_list = random.sample(doi_list, count) return sample_doi_list - - -def process_missing_pmc_articles(pmc_articles=None, plos_articles=None): - ''' - For sets of PLOS's corpus from PMC and PLOS, see which article are missing from PMC's version - of the Corpus by updating the PMCID:DOI mapping document, removing articles too recent to be indexed - (pubdate less than 3 weeks ago), and excluding uncorrected proofs. 
- :return: list of missing articles from PMC - ''' - if pmc_articles is None: - doi_to_pmc = get_articles_by_doi_field(check_new=False) - pmc_articles = list(doi_to_pmc.keys()) - - if plos_articles is None: - plos_articles = get_all_plos_dois() - missing_pmc_dois = list(set(plos_articles) - set(pmc_articles)) - - # Query for PMC updates & update DOI-to-PMCID dictionary - if missing_pmc_dois: - full_pmc_dict, dois_not_in_pmc = update_pmc_dict_by_doi(missing_pmc_dois) - - # Exclude PLOS Medicine quizzes - for doi in dois_not_in_pmc: - if "pmed" in doi: - article = doi_to_article(doi) - article_type = get_plos_article_type(article) - if article_type == 'Quiz': - dois_not_in_pmc.remove(doi) - - # Remove articles too recent to have been indexed on PMC - if dois_not_in_pmc: - missing_pmc_dois = exclude_recent_dois(dois_not_in_pmc) - - # Remove uncorrected proofs - if missing_pmc_dois: - for doi in missing_pmc_dois: - article_file = doi_to_file(doi) - if check_if_uncorrected_proof(article_file): - missing_pmc_dois.remove(doi) - - # Make sure that the DOI resolves - for doi in missing_pmc_dois: - resolves = check_if_doi_resolves(doi) - if resolves != "works": - print('DOI not working for this PLOS DOI:', doi, resolves) - missing_pmc_dois.remove(doi) - - if len(missing_pmc_dois) == 0: - print('No PMC articles missing.') - else: - for doi in missing_pmc_dois: - if ' ' in doi: - print('There is a space in this DOI: ' + '"' + doi + '"') - print('\033[1m' + 'Articles missing from PMC:') - print('\033[0m' + '\n'.join(sorted(missing_pmc_dois)), '\n') - - return missing_pmc_dois - - -def get_all_pmc_dois(retstart=0, retmax=80000, count=None): - """ - Query the entrez database to get a comprehensive list of all PMCIDs associated with all PLOS journals, - individually included in the search url. 
- See https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch for more info on search parameters - :return: the full list of PMCIDs in PMC for PLOS articles - """ - pmc_allplos_query_url = ('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pmc&term=' - '((((((((("PLoS+ONE"[Journal])+OR+"PLoS+Genetics"[Journal])+OR+"PLoS+Pathogens"[Journal])' - 'OR+"PLoS+Neglected+Tropical+Diseases"[Journal])+OR+"PLoS+Computational+Biology"[Journal])' - 'OR+"PLoS+Biology"[Journal])+OR+"PLoS+Medicine"[Journal])+OR+"plos+currents"[Journal])+OR+' - '"PLoS Clinical Trials"[Journal])' - '&retmode=json&tool=corpustest&email={0}'.format(USER_EMAIL)) - - pmcidlist = [] - r = requests.get(pmc_allplos_query_url).json() - if count is None: - count = int(r['esearchresult']['count']) - print(count, "articles found in PMC") - while retstart < count: - query = pmc_allplos_query_url + '&retstart={0}&retmax={1}'.format(retstart, retmax) - r = requests.get(query).json() - idlist = r['esearchresult']['idlist'] - for id in idlist: - pmcidlist.append('PMC' + id) - retstart += retmax - time.sleep(1) - pmcidlist = sorted(list(set(pmcidlist))) - if pmcidlist != count: - print("Error in number of IDs returned. Got {} when expected {}.".format(len(pmcidlist), count)) - - return pmcidlist - - -def update_local_pmc_from_remote(): - ''' - Using the current set of articles indexed live on PMC, compare them to the locally maintained index. - If any of them are missing, download them to the local .csv dictionary. 
- :return: full dictionary of PMC IDs''' - remote_pmc_ids = get_all_pmc_dois() - local_pmc_dict = get_articles_by_doi_field() - local_pmc_ids = list(local_pmc_dict.values()) - missing_pmcids = list(set(remote_pmc_ids) - set(local_pmc_ids)) - if missing_pmcids: - full_pmc_dict, dois_not_in_pmc = update_pmc_dict_by_doi(missing_pmcids) - else: - full_pmc_dict = doi_to_pmc - weird_pmc_ids = list(set(local_pmc_ids) - set(remote_pmc_ids)) - if 0 < weird_pmc_ids < 10000: - print("Some articles on local not on remote:", print(weird_pmc_ids)) - return full_pmc_dict - - -def get_needed_pmc_articles(): - """ - Compare local to remote set of PLOS PMC IDs. - TO DO: Add check for latest update date - :return: tuple of doi dict, and list of DOIs that are on remote and not local, to be downloaded. - """ - doi_to_pmc = get_articles_by_doi_field(check_new=False) - remote_pmc_ids = list(doi_to_pmc.values()) - local_pmc_ids = extract_filenames(pmcdir, extension='.nxml') - missing_pmc_articles = list(set(remote_pmc_ids) - set(local_pmc_ids)) - return doi_to_pmc, missing_pmc_articles - - -def get_pmc_article_zip_links(): - """ - Creates a dictionary mapping every PMC ID to the partial PMC download URL - Based on txt file hosted by PMC - TO DO: see if there's a way to download monthly, weekly, etc from PMC - :return: dictionary mapping PMC IDs to partial download links - """ - - # write info file to disk if it doesn't exist already or is too old - try: - mod_date = datetime.datetime.fromtimestamp(os.path.getmtime(pmc_file_list)) - file_age = datetime.datetime.now() - mod_date - if file_age > datetime.timedelta(days=1): - os.remove(pmc_file_list) - except FileNotFoundError: - pass - if os.path.isfile(pmc_file_list) is False: - with open(pmc_file_list, 'w') as f: - f.write(requests.get('http://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_file_list.txt').text) - - # parse file by line - with open(pmc_file_list) as f: - pmc_lists = [x.strip().split('\t') for x in f] - - # turn into dictionary mapping of 
PMCID to partial PMC URL - pmc_urls = {d[2]: d[0] for d in pmc_lists[1:]} - - return pmc_urls - - -def download_pmc_article_xml(missing_pmc_articles=None, pmc_urls=None): - """ - Get missing PMC articles. Get dictionary mapping them to partial URLs. Download and unzip the tarballs. - Keep and rename the nxml files and delete the others. - NOTE: This hasn't worked very well. PMC connections are unreliable & there are a lot of timeouts. - :return: list of files downloaded from PMC - """ - new_pmc_articles = [] - if missing_pmc_articles is None: - doi_to_pmc, missing_pmc_articles = get_needed_pmc_articles() - print(len(missing_pmc_articles), "PMC articles to download.") - if missing_pmc_articles: - if pmc_urls is None: - pmc_urls = get_pmc_article_zip_links() - # download and unzip tarballs - for article in missing_pmc_articles: - dl_url = PMC_FTP_URL + pmc_urls[article] - filename = (pmc_urls[article]).split("/")[3] - local_file = os.path.join(newpmcarticledir, filename) - if os.path.isfile(local_file) is False: - try: - download(dl_url, local_file) - unzip_articles(directory=newpmcarticledir, filetype='tar', file=filename) - except RuntimeError: - print('Error downloading', article) - continue - - # get rid of non-.nxml files - allfiles = glob.glob('New_PMC_articles/*/*') - for file in allfiles: - if file.endswith('.nxml') is False: - os.remove(file) - - # move and process the nxml files - files = glob.glob('New_PMC_articles/*/*') - for old_file in files: - # make sure directory and linked doi line up - directory = (old_file).split('/')[1] - linked_doi = doi_to_pmc[get_article_doi(article_file=old_file)] - if linked_doi == directory: - # rename file from directory & move to higher level directory - new_file = '/'.join(((old_file).split('/'))[0:2]) + '.nxml' - shutil.move(old_file, new_file) - new_pmc_articles.append(new_file) - else: - print('error:', linked_doi, directory) - for directory in glob.glob('New_PMC_articles/*/'): - os.rmdir(directory) - - return 
new_pmc_articles - - -def move_pmc_articles(source, destination): - """ - Move PMC articles from one folder to another - :param source: Temporary directory of new article files - :param destination: Directory where files are copied to - """ - oldnum_destination = len(listdir_nohidden(destination, extension='.nxml')) - oldnum_source = len(listdir_nohidden(source, extension='.nxml')) - if oldnum_source > 0: - print("PMC Corpus started with", - oldnum_destination, - "articles.\nFile moving procedure initiated, please hold...") - copytree(source, destination, ignore=ignore_func) - newnum_destination = len(listdir_nohidden(destination)) - if newnum_destination - oldnum_destination > 0: - print(newnum_destination - oldnum_destination, - "files moved. PMC Corpus now has", - newnum_destination, "articles.") - logging.info("New article files moved successfully") - else: - print("No files found to move in source directory.") - logging.info("No article files moved") - # Delete temporary folder in most cases - if source == newarticledir: - shutil.rmtree(source) From 59048f1812f239a3b33cb00679ab6c1c14ff1664 Mon Sep 17 00:00:00 2001 From: Elizabeth Seiver Date: Tue, 3 Oct 2017 18:07:43 -0700 Subject: [PATCH 04/24] re-add analysis --- allofplos/Corpus_Analysis_Examples.ipynb | 257 ++++++ allofplos/Corpus_QA-Copy1.ipynb | 450 ++++++++++ allofplos/Corpus_QA.ipynb | 512 +++++++++++ allofplos/Production team investigates.ipynb | 393 +++++++++ allofplos/jupyternb/Corpus_Analysis-old.ipynb | 792 ++++++++++++++++++ ...S article XML from journals.plos.org.ipynb | 236 ++++++ ...thly integrity check for PLOS corpus.ipynb | 148 ++++ allofplos/plos_pmc.py | 532 ++++++++++++ allofplos/twoto3_nb.py | 80 ++ 9 files changed, 3400 insertions(+) create mode 100644 allofplos/Corpus_Analysis_Examples.ipynb create mode 100644 allofplos/Corpus_QA-Copy1.ipynb create mode 100644 allofplos/Corpus_QA.ipynb create mode 100644 allofplos/Production team investigates.ipynb create mode 100644 
allofplos/jupyternb/Corpus_Analysis-old.ipynb create mode 100644 allofplos/jupyternb/Download PLOS article XML from journals.plos.org.ipynb create mode 100644 allofplos/jupyternb/Monthly integrity check for PLOS corpus.ipynb create mode 100644 allofplos/plos_pmc.py create mode 100755 allofplos/twoto3_nb.py diff --git a/allofplos/Corpus_Analysis_Examples.ipynb b/allofplos/Corpus_Analysis_Examples.ipynb new file mode 100644 index 00000000..bc5f45bb --- /dev/null +++ b/allofplos/Corpus_Analysis_Examples.ipynb @@ -0,0 +1,257 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Required functions" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "code_folding": [], + "collapsed": true + }, + "outputs": [], + "source": [ + "from samples.corpus_analysis import *" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# PLOS article types" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## JATS-standard NLM article types" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "editable": false, + "run_control": { + "frozen": true + }, + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "15 types of articles found.\n", + "[('research-article', 204109), ('correction', 9113), ('article-commentary', 1284), ('discussion', 1087), ('review-article', 612), ('other', 584), ('editorial', 340), ('letter', 300), ('retraction', 79), ('book-review', 77), ('meeting-report', 38), ('case-report', 23), ('expression-of-concern', 13), ('obituary', 10), ('brief-report', 1)]\n" + ] + } + ], + "source": [ + "jats_article_type_list = get_jats_article_type_list()\n", + "print(jats_article_type_list)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## PLOS article types" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "editable": false, + "run_control": { + 
"frozen": true + }, + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "60 types of articles found.\n", + "[('Research Article', 202673), ('Correction', 9124), ('Synopsis', 1093), ('Perspective', 801), ('Review', 555), ('Editorial', 486), ('Pearls', 438), ('Essay', 379), ('Policy Forum', 309), ('Correspondence', 287), ('Primer', 237), ('Viewpoints', 209), ('Community Page', 139), ('Opinion', 136), ('Health in Action', 118), ('Education', 103), ('Retraction', 79), ('Book Review/Science in the Media', 76), ('Message from ISCB', 70), ('Symposium', 70), ('Policy Platform', 54), ('Feature', 53), ('Formal Comment', 52), ('Research in Translation', 51), ('Guidelines and Guidance', 51), ('Collection Review', 50), ('Research Matters', 44), ('Interview', 44), ('The PLoS Medicine Debate', 38), ('Historical Profiles and Perspectives', 38), ('Unsolved Mystery', 34), ('Overview', 34), ('Neglected Diseases', 29), ('Expert Commentary', 29), ('Learning Forum', 27), ('From Innovation to Application', 24), ('Obituary', 22), ('Quiz', 21), ('Correspondence and Other Communications', 13), ('Expression of Concern', 13), ('Journal Club', 12), ('Meta-Research Article', 12), ('Student Forum', 12), ('Open Highlights', 11), ('Topic Page', 11), ('Case Report', 10), ('Photo Quiz', 10), ('Best Practice', 5), ('Deep Reads', 4), ('Historical and Philosophical Perspectives', 3), ('Special Report', 3), ('Book Review', 2), ('Message from the Founders', 1), ('Message from PLoS', 1), ('Short Reports', 1), ('Methods and Resources', 1), ('Technical Report', 1), ('Message from the PLoS Founders', 1), ('Collection Review ', 1), ('Debate', 1)]\n" + ] + } + ], + "source": [ + "PLOS_article_type_list = get_plos_article_type_list()\n", + "print(PLOS_article_type_list)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Taking random samples of DOIs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + 
"collapsed": true + }, + "outputs": [], + "source": [ + "random_sample_of_dois = get_random_list_of_DOIs() # returns 100 DOIs by default" + ] + }, + { + "cell_type": "code", + "execution_count": 203, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['journal.pone.0074820', 'journal.pone.0063497', 'journal.pone.0126357', 'journal.pntd.0004807', 'journal.pone.0031896', 'journal.pone.0045503', 'journal.pone.0138217', 'journal.pbio.0050002', 'journal.pone.0122848', 'journal.pone.0099248']\n" + ] + } + ], + "source": [ + "random_sample_of_articles = [doi_to_article(doi) for doi in random_sample_of_dois]\n", + "print(random_sample_of_articles[0:10])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Retracted and corrected articles" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Get list of retracted articles" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "editable": false, + "run_control": { + "frozen": true + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "79 retracted articles found.\n", + "['journal.pbio.0030123', 'journal.pbio.0050005', 'journal.pbio.0050146', 'journal.pbio.1001212', 'journal.pcbi.1002308', 'journal.pgen.1003361', 'journal.pgen.1003791', 'journal.pgen.1005586', 'journal.pgen.1000424', 'journal.pmed.1001214', 'journal.pone.0072333', 'journal.pone.0084127', 'journal.pone.0027571', 'journal.pone.0046410', 'journal.pone.0080145', 'journal.pone.0019652', 'journal.pone.0075928', 'journal.pone.0075046', 'journal.pone.0062178', 'journal.pone.0051549', 'journal.pone.0093095', 'journal.pone.0069669', 'journal.pone.0133525', 'journal.pone.0115980', 'journal.pone.0115741', 'journal.pone.0139044', 'journal.pone.0146193', 'journal.pone.0045667', 'journal.pone.0040789', 'journal.pone.0094830', 'journal.pone.0031943', 'journal.pone.0097700', 'journal.pone.0047218', 
'journal.pone.0090951', 'journal.pone.0014232', 'journal.pone.0090318', 'journal.pone.0072895', 'journal.pone.0065651', 'journal.pone.0059556', 'journal.pone.0076809', 'journal.pone.0099630', 'journal.pone.0121549', 'journal.pone.0048402', 'journal.pone.0062170', 'journal.pone.0020152', 'journal.pone.0164571', 'journal.pone.0164378', 'journal.pone.0116682', 'journal.pone.0125542', 'journal.pone.0047110', 'journal.pone.0026503', 'journal.pone.0037102', 'journal.pone.0014163', 'journal.pone.0043204', 'journal.pone.0001276', 'journal.pone.0035142', 'journal.pone.0011299', 'journal.pone.0005373', 'journal.pone.0030980', 'journal.pone.0000306', 'journal.pone.0064576', 'journal.pone.0016011', 'journal.pone.0001444', 'journal.pone.0043406', 'journal.pone.0029192', 'journal.pone.0001908', 'journal.pone.0016256', 'journal.pone.0013512', 'journal.pone.0045965', 'journal.pone.0022730', 'journal.pone.0006333', 'journal.pone.0004168', 'journal.pone.0035453', 'journal.pone.0032853', 'journal.ppat.1003435', 'journal.ppat.1002062', 'journal.ppat.1000915', 'journal.ppat.1000210', 'journal.ppat.0020025']\n" + ] + } + ], + "source": [ + "retractions_article_list, retracted_article_list = get_retracted_article_list()\n", + "print(retracted_article_list)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Get list of corrected articles" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "editable": false, + "run_control": { + "frozen": true + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "journal.pcbi.1003582.xml has incorrect linked DOI: journal.10.1371/journal.pcbi.1003490\n", + "journal.pcbi.1003732.xml has incorrect linked DOI: journal.10.1371/journal.pcbi.1003159\n", + "journal.pone.0101541.xml has incorrect linked DOI: journal.PONE-D-13-26510\n", + "journal.pone.0104353.xml has incorrect linked DOI: journal.\n", + "journal.pone.0104472.xml has incorrect linked DOI: journal.\n", + 
"journal.pone.0104581.xml has incorrect linked DOI: journal.\n", + "journal.pone.0104601.xml has incorrect linked DOI: journal.\n", + "journal.pone.0105485.xml has incorrect linked DOI: journal.\n", + "journal.pone.0105486.xml has incorrect linked DOI: journal.\n", + "journal.pone.0105490.xml has incorrect linked DOI: journal.\n", + "journal.pone.0105658.xml has incorrect linked DOI: journal.\n", + "journal.pone.0105668.xml has incorrect linked DOI: journal.\n", + "journal.pone.0105669.xml has incorrect linked DOI: journal.\n", + "9127 corrected articles found.\n" + ] + } + ], + "source": [ + "corrections_article_list, corrected_article_list = get_corrected_article_list()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.2" + }, + "toc": { + "colors": { + "hover_highlight": "#DAA520", + "navigate_num": "#000000", + "navigate_text": "#333333", + "running_highlight": "#FF0000", + "selected_highlight": "#FFD700", + "sidebar_border": "#EEEEEE", + "wrapper_background": "#FFFFFF" + }, + "moveMenuLeft": true, + "nav_menu": { + "height": "174px", + "width": "252px" + }, + "navigate_menu": true, + "number_sections": true, + "sideBar": true, + "threshold": 4, + "toc_cell": false, + "toc_section_display": "block", + "toc_window_display": false, + "widenNotebook": false + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/allofplos/Corpus_QA-Copy1.ipynb b/allofplos/Corpus_QA-Copy1.ipynb new file mode 100644 index 00000000..b7ccf708 --- /dev/null +++ b/allofplos/Corpus_QA-Copy1.ipynb @@ -0,0 +1,450 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Required functions" + ] + }, + { + "cell_type": "code", + 
"execution_count": 2, + "metadata": { + "code_folding": [] + }, + "outputs": [], + "source": [ + "from samples.corpus_analysis import *" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# PLOS/NLM article type mapping" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'i' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# For mapping the JATS article type onto the PLOS article type, while taking NLM DTD into account.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0marticle_types_map\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_article_types_map\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mPLOS_article_types_structured\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcounter\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marticle_types_map\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmost_common\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mPLOS_article_types_structured\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/PLOS_Corpus_Project/allofplos/allofplos/samples/corpus_analysis.py\u001b[0m in \u001b[0;36mget_article_types_map\u001b[0;34m(directory)\u001b[0m\n\u001b[1;32m 181\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0marticle_file\u001b[0m \u001b[0;32min\u001b[0m \u001b[0marticle_files\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 182\u001b[0m 
\u001b[0mjats_article_type\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_article_type\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marticle_file\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 183\u001b[0;31m \u001b[0mplos_article_type\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_plos_article_type\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marticle_file\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 184\u001b[0m \u001b[0mdtd_version\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_article_dtd\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marticle_file\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 185\u001b[0m \u001b[0mtypes\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mjats_article_type\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mplos_article_type\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtd_version\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/PLOS_Corpus_Project/allofplos/allofplos/samples/corpus_analysis.py\u001b[0m in \u001b[0;36mget_plos_article_type\u001b[0;34m(article_file)\u001b[0m\n\u001b[1;32m 138\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0msubject\u001b[0m \u001b[0;32min\u001b[0m \u001b[0msubject_list\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 139\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0msubject\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'subj-group-type'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m\"heading\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 140\u001b[0;31m \u001b[0msubject_instance\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msubject_list\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 141\u001b[0m \u001b[0ms\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0;34m''\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 142\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mtext\u001b[0m \u001b[0;32min\u001b[0m \u001b[0msubject_instance\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mitertext\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mNameError\u001b[0m: name 'i' is not defined" + ] + } + ], + "source": [ + "# For mapping the JATS article type onto the PLOS article type, while taking NLM DTD into account.\n", + "article_types_map = get_article_types_map()\n", + "PLOS_article_types_structured = counter(article_types_map).most_common()\n", + "print(PLOS_article_types_structured)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# create .csv file mapping JATS to PLOS article types\n", + "article_types_map_to_csv(article_types_map)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Retracted and corrected articles" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Get list of retracted articles" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1 retracted articles found.\n" + ] + } + ], + "source": [ + "# article_list = [doi_to_file(doi) for doi in get_random_list_of_dois(count=5000)]\n", + "retractions_doi_list, retracted_doi_list = get_retracted_doi_list(article_list=article_list)" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['10.1371/journal.pbio.1002215']" + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "retractions_doi_list" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "article_list = 
[doi_to_file('10.1371/journal.pbio.1002215')]" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [], + "source": [ + "def get_retracted_doi_list(article_list=None, directory=corpusdir):\n", + " \"\"\"\n", + " Scans through articles in a directory to see if they are retraction notifications,\n", + " scans articles that are that type to find DOIs of retracted articles\n", + " :return: tuple of lists of DOIs for retractions articles, and retracted articles\n", + " \"\"\"\n", + " retractions_doi_list = []\n", + " retracted_doi_list = []\n", + " if article_list is None:\n", + " article_list = listdir_nohidden(directory)\n", + " for article_file in article_list:\n", + " if check_if_retraction_article(article_file):\n", + " retractions_doi_list.append(file_to_doi(article_file))\n", + " # Look in those articles to find actual articles that are retracted\n", + " retracted_doi = get_related_retraction_article(article_file)[0]\n", + " retracted_doi_list.append(retracted_doi)\n", + " # check linked DOI for accuracy\n", + " if make_regex_bool(full_doi_regex_match.search(retracted_doi)) is False:\n", + " print(\"{} has incorrect linked DOI field: '{}'\".format(article_file, retracted_doi))\n", + " if len(retractions_doi_list) == len(retracted_doi_list):\n", + " print(len(retracted_doi_list), 'retracted articles found.')\n", + " else:\n", + " print('Number of retraction articles and retracted articles are different: ',\n", + " '{} vs. 
{}'.format(len(retractions_article_list), len(retracted_article_list)))\n", + " return retractions_doi_list, retracted_doi_list" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Get list of corrected articles" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "5 corrected articles found.\n" + ] + } + ], + "source": [ + "article_list = [doi_to_file(doi) for doi in get_random_list_of_dois(count=100)]\n", + "corrections_article_list, corrected_article_list = get_corrected_article_list(article_list=article_list)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['10.1371/journal.pone.0065474', '10.1371/journal.pone.0144760', '10.1371/journal.pone.0050818', '10.1371/journal.pmed.1001786', '10.1371/journal.ppat.1003068']\n" + ] + } + ], + "source": [ + "print(corrected_article_list)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Check raw XML for article updates" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# By default, checks only the 30,000 most recent articles\n", + "articles_different_list = revisiondate_sanity_check()\n", + "print(articles_different_list)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# DOI and filename sanity check" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Check if article filenames match their full DOIs & that DOI fields are correct\n", + "messed_up_plos_list = article_doi_sanity_check()\n", + "messed_up_pmc_list = article_doi_sanity_check(directory=pmcdir, article_list=None, source='PMC')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# PubMed Corpus" + ] + }, + { + 
"cell_type": "markdown", + "metadata": {}, + "source": [ + "## Get all local, solr, and PMC DOIs" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1mArticles that needs to be re-indexed on Solr:\n", + "\u001b[0m10.1371/journal.pone.0076809\n" + ] + } + ], + "source": [ + "plos_articles = get_all_plos_dois()\n", + "doi_to_pmc = get_articles_by_doi_field(check_new=False)\n", + "pmc_articles = list(doi_to_pmc.keys())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Compare PLOS's copy to PMC" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For sets of PLOS's corpus from PMC and PLOS, see which article are missing from PLOS's version of the corpus by:\n", + "* removing Currents articles\n", + "* checking if articles are live on journals.plos.org\n", + "* checking that the DOIs resolve" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "missing_plos_articles = process_missing_plos_articles(pmc_articles=pmc_articles, plos_articles=plos_articles)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Compare PMC's copy to PLOS" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For sets of PLOS's corpus from PMC and PLOS, see which article are missing from PMC's version of the Corpus by:\n", + "* updating the PMCID:DOI mapping document\n", + "* removing articles too recent to be indexed (pubdate less than 3 weeks ago)\n", + "* excluding uncorrected proofs\n", + "* excluding PLOS Medicine quizzes" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + 
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mmissing_pmc_articles\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mprocess_missing_pmc_articles\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpmc_articles\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mpmc_articles\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mplos_articles\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mplos_articles\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m~/PLOS_Corpus_Project/allofplos/allofplos/samples/corpus_analysis.py\u001b[0m in \u001b[0;36mprocess_missing_pmc_articles\u001b[0;34m(pmc_articles, plos_articles)\u001b[0m\n\u001b[1;32m 730\u001b[0m \u001b[0mplos_articles\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_all_plos_dois\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 731\u001b[0m \u001b[0mmissing_pmc_dois\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mplos_articles\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpmc_articles\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 732\u001b[0;31m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 733\u001b[0m \u001b[0;31m# Query for PMC updates & update DOI-to-PMCID dictionary\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 734\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mmissing_pmc_dois\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/PLOS_Corpus_Project/allofplos/allofplos/samples/corpus_analysis.py\u001b[0m in \u001b[0;36mupdate_pmc_dict_by_doi\u001b[0;34m(id_list)\u001b[0m\n\u001b[1;32m 562\u001b[0m 
'''\n\u001b[1;32m 563\u001b[0m \u001b[0mdoi_to_pmc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_articles_by_doi_field\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcheck_new\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 564\u001b[0;31m \u001b[0mdoi_to_pmc2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdois_not_in_pmc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_pmc_doi_dict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mid_list\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 565\u001b[0m \u001b[0mfull_pmc_dict\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0mdoi_to_pmc2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mdoi_to_pmc\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 566\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpmc_csv\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'w'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mfile\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/PLOS_Corpus_Project/allofplos/allofplos/samples/corpus_analysis.py\u001b[0m in \u001b[0;36mget_pmc_doi_dict\u001b[0;34m(id_list, chunk_size)\u001b[0m\n\u001b[1;32m 536\u001b[0m \u001b[0mpmc_doi_query\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpmc_doi_query_url\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mpmc_doi_string\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 537\u001b[0m \u001b[0;31m# Parse the results & create dict entry for each result\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 538\u001b[0;31m \u001b[0mpmc_response\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mrequests\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpmc_doi_query\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 539\u001b[0m \u001b[0;32mif\u001b[0m 
\u001b[0mpmc_response\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstatus_code\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m500\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 540\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Error for DOI chunk; retry with smaller chunk size'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/miniconda3/envs/py3/lib/python3.6/site-packages/requests/api.py\u001b[0m in \u001b[0;36mget\u001b[0;34m(url, params, **kwargs)\u001b[0m\n\u001b[1;32m 70\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 71\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msetdefault\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'allow_redirects'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 72\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mrequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'get'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparams\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mparams\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 73\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 74\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/miniconda3/envs/py3/lib/python3.6/site-packages/requests/api.py\u001b[0m in \u001b[0;36mrequest\u001b[0;34m(method, url, **kwargs)\u001b[0m\n\u001b[1;32m 56\u001b[0m \u001b[0;31m# cases, and look like a memory leak in others.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 57\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0msessions\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSession\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0msession\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 58\u001b[0;31m 
\u001b[0;32mreturn\u001b[0m \u001b[0msession\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 59\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 60\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/miniconda3/envs/py3/lib/python3.6/site-packages/requests/sessions.py\u001b[0m in \u001b[0;36mrequest\u001b[0;34m(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)\u001b[0m\n\u001b[1;32m 506\u001b[0m }\n\u001b[1;32m 507\u001b[0m \u001b[0msend_kwargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msettings\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 508\u001b[0;31m \u001b[0mresp\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mprep\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0msend_kwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 509\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 510\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresp\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/miniconda3/envs/py3/lib/python3.6/site-packages/requests/sessions.py\u001b[0m in \u001b[0;36msend\u001b[0;34m(self, request, **kwargs)\u001b[0m\n\u001b[1;32m 616\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 617\u001b[0m \u001b[0;31m# Send the request\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 618\u001b[0;31m \u001b[0mr\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0madapter\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 619\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 620\u001b[0m \u001b[0;31m# Total elapsed time of the request (approximately)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/miniconda3/envs/py3/lib/python3.6/site-packages/requests/adapters.py\u001b[0m in \u001b[0;36msend\u001b[0;34m(self, request, stream, timeout, verify, cert, proxies)\u001b[0m\n\u001b[1;32m 438\u001b[0m \u001b[0mdecode_content\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 439\u001b[0m \u001b[0mretries\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmax_retries\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 440\u001b[0;31m \u001b[0mtimeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 441\u001b[0m )\n\u001b[1;32m 442\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/miniconda3/envs/py3/lib/python3.6/site-packages/urllib3/connectionpool.py\u001b[0m in \u001b[0;36murlopen\u001b[0;34m(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)\u001b[0m\n\u001b[1;32m 599\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtimeout_obj\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 600\u001b[0m \u001b[0mbody\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mbody\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mheaders\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 601\u001b[0;31m 
chunked=chunked)\n\u001b[0m\u001b[1;32m 602\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 603\u001b[0m \u001b[0;31m# If we're going to release the connection in ``finally:``, then\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/miniconda3/envs/py3/lib/python3.6/site-packages/urllib3/connectionpool.py\u001b[0m in \u001b[0;36m_make_request\u001b[0;34m(self, conn, method, url, timeout, chunked, **httplib_request_kw)\u001b[0m\n\u001b[1;32m 344\u001b[0m \u001b[0;31m# Trigger any extra validation we need to do.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 345\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 346\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_validate_conn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 347\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mSocketTimeout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mBaseSSLError\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 348\u001b[0m \u001b[0;31m# Py2 raises this as a BaseSSLError, Py3 raises it as socket timeout.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/miniconda3/envs/py3/lib/python3.6/site-packages/urllib3/connectionpool.py\u001b[0m in \u001b[0;36m_validate_conn\u001b[0;34m(self, conn)\u001b[0m\n\u001b[1;32m 848\u001b[0m \u001b[0;31m# Force connect early to allow us to validate the connection.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 849\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mgetattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'sock'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m 
\u001b[0;31m# AppEngine might not have `.sock`\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 850\u001b[0;31m \u001b[0mconn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconnect\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 851\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 852\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mconn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mis_verified\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/miniconda3/envs/py3/lib/python3.6/site-packages/urllib3/connection.py\u001b[0m in \u001b[0;36mconnect\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 324\u001b[0m \u001b[0mca_cert_dir\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mca_cert_dir\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 325\u001b[0m \u001b[0mserver_hostname\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mhostname\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 326\u001b[0;31m ssl_context=context)\n\u001b[0m\u001b[1;32m 327\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 328\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0massert_fingerprint\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/miniconda3/envs/py3/lib/python3.6/site-packages/urllib3/util/ssl_.py\u001b[0m in \u001b[0;36mssl_wrap_socket\u001b[0;34m(sock, keyfile, certfile, cert_reqs, ca_certs, server_hostname, ssl_version, ciphers, ssl_context, ca_cert_dir)\u001b[0m\n\u001b[1;32m 327\u001b[0m \u001b[0mcontext\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload_cert_chain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcertfile\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkeyfile\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 328\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mHAS_SNI\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;31m# 
Platform-specific: OpenSSL with enabled SNI\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 329\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mcontext\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwrap_socket\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msock\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mserver_hostname\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mserver_hostname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 330\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 331\u001b[0m warnings.warn(\n", + "\u001b[0;32m~/miniconda3/envs/py3/lib/python3.6/site-packages/urllib3/contrib/pyopenssl.py\u001b[0m in \u001b[0;36mwrap_socket\u001b[0;34m(self, sock, server_side, do_handshake_on_connect, suppress_ragged_eofs, server_hostname)\u001b[0m\n\u001b[1;32m 439\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 440\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 441\u001b[0;31m \u001b[0mcnx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdo_handshake\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 442\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mOpenSSL\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSSL\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mWantReadError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 443\u001b[0m \u001b[0mrd\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mutil\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwait_for_read\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msock\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msock\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgettimeout\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/miniconda3/envs/py3/lib/python3.6/site-packages/OpenSSL/SSL.py\u001b[0m in \u001b[0;36mdo_handshake\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1713\u001b[0m 
\u001b[0;34m:\u001b[0m\u001b[0;32mreturn\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1714\u001b[0m \"\"\"\n\u001b[0;32m-> 1715\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_lib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSSL_do_handshake\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_ssl\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1716\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_raise_ssl_error\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_ssl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1717\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "missing_pmc_articles = process_missing_pmc_articles(pmc_articles=pmc_articles, plos_articles=plos_articles)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Save lists of missing articles to text files if needed" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "with open('missing_plos_articles.txt', 'w') as file:\n", + " for item in sorted(set(missing_plos_articles)):\n", + " file.write(\"%s\\n\" % item)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "id_list=listdir_nohidden(pmcdir, extension='.nxml')" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "ename": "KeyError", + "evalue": "'doi'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in 
\u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdoi_to_pmc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_pmc_doi_dict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mid_list\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m~/PLOS_Corpus_Project/allofplos/allofplos/samples/corpus_analysis.py\u001b[0m in \u001b[0;36mget_pmc_doi_dict\u001b[0;34m(id_list, chunk_size)\u001b[0m\n\u001b[1;32m 545\u001b[0m \u001b[0mpmc_results\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpmc_results\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgetchildren\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;31m# exclude echo header\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 546\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mpmc_results\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 547\u001b[0;31m \u001b[0mdoi\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mattrib\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'doi'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 548\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 549\u001b[0m \u001b[0mpmcid\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mattrib\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'pmcid'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32msrc/lxml/lxml.etree.pyx\u001b[0m in \u001b[0;36mlxml.etree._Attrib.__getitem__ (src/lxml/lxml.etree.c:70679)\u001b[0;34m()\u001b[0m\n", + "\u001b[0;31mKeyError\u001b[0m: 'doi'" + ] + } + ], + "source": [ + "doi_to_pmc = get_pmc_doi_dict(id_list)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "with 
open('missing_pmc_articles.txt', 'w') as file:\n", + " for item in sorted(set(missing_pmc_articles)):\n", + " file.write(\"%s\\n\" % item)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "py3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.2" + }, + "toc": { + "nav_menu": { + "height": "174px", + "width": "252px" + }, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "toc_cell": false, + "toc_position": {}, + "toc_section_display": "block", + "toc_window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/allofplos/Corpus_QA.ipynb b/allofplos/Corpus_QA.ipynb new file mode 100644 index 00000000..1007c803 --- /dev/null +++ b/allofplos/Corpus_QA.ipynb @@ -0,0 +1,512 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Required functions" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "code_folding": [] + }, + "outputs": [], + "source": [ + "from samples.corpus_analysis import *" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# PLOS/NLM article type mapping" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "scrolled": true + }, + "outputs": [], + "source": [ + "# For mapping the JATS article type onto the PLOS article type, while taking NLM DTD into account.\n", + "article_types_map = get_article_types_map()\n", + "PLOS_article_types_structured = counter(article_types_map).most_common()\n", + "print(PLOS_article_types_structured)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# create .csv file mapping JATS to PLOS 
article types\n", + "article_types_map_to_csv(article_types_map)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Retracted and corrected articles" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Get list of retracted articles" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "deletable": false, + "editable": false, + "run_control": { + "frozen": true + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "79 retracted articles found.\n", + "['journal.pbio.0030123', 'journal.pbio.0050005', 'journal.pbio.0050146', 'journal.pbio.1001212', 'journal.pcbi.1002308', 'journal.pgen.1003361', 'journal.pgen.1003791', 'journal.pgen.1005586', 'journal.pgen.1000424', 'journal.pmed.1001214', 'journal.pone.0072333', 'journal.pone.0084127', 'journal.pone.0027571', 'journal.pone.0046410', 'journal.pone.0080145', 'journal.pone.0019652', 'journal.pone.0075928', 'journal.pone.0075046', 'journal.pone.0062178', 'journal.pone.0051549', 'journal.pone.0093095', 'journal.pone.0069669', 'journal.pone.0133525', 'journal.pone.0115980', 'journal.pone.0115741', 'journal.pone.0139044', 'journal.pone.0146193', 'journal.pone.0045667', 'journal.pone.0040789', 'journal.pone.0094830', 'journal.pone.0031943', 'journal.pone.0097700', 'journal.pone.0047218', 'journal.pone.0090951', 'journal.pone.0014232', 'journal.pone.0090318', 'journal.pone.0072895', 'journal.pone.0065651', 'journal.pone.0059556', 'journal.pone.0076809', 'journal.pone.0099630', 'journal.pone.0121549', 'journal.pone.0048402', 'journal.pone.0062170', 'journal.pone.0020152', 'journal.pone.0164571', 'journal.pone.0164378', 'journal.pone.0116682', 'journal.pone.0125542', 'journal.pone.0047110', 'journal.pone.0026503', 'journal.pone.0037102', 'journal.pone.0014163', 'journal.pone.0043204', 'journal.pone.0001276', 'journal.pone.0035142', 'journal.pone.0011299', 'journal.pone.0005373', 'journal.pone.0030980', 
'journal.pone.0000306', 'journal.pone.0064576', 'journal.pone.0016011', 'journal.pone.0001444', 'journal.pone.0043406', 'journal.pone.0029192', 'journal.pone.0001908', 'journal.pone.0016256', 'journal.pone.0013512', 'journal.pone.0045965', 'journal.pone.0022730', 'journal.pone.0006333', 'journal.pone.0004168', 'journal.pone.0035453', 'journal.pone.0032853', 'journal.ppat.1003435', 'journal.ppat.1002062', 'journal.ppat.1000915', 'journal.ppat.1000210', 'journal.ppat.0020025']\n" + ] + } + ], + "source": [ + "retractions_article_list, retracted_article_list = get_retracted_article_list()\n", + "print(retracted_article_list)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Get list of corrected articles" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "deletable": false, + "editable": false, + "run_control": { + "frozen": true + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "journal.pcbi.1003582.xml has incorrect linked DOI: journal.10.1371/journal.pcbi.1003490\n", + "journal.pcbi.1003732.xml has incorrect linked DOI: journal.10.1371/journal.pcbi.1003159\n", + "journal.pone.0101541.xml has incorrect linked DOI: journal.PONE-D-13-26510\n", + "journal.pone.0104353.xml has incorrect linked DOI: journal.\n", + "journal.pone.0104472.xml has incorrect linked DOI: journal.\n", + "journal.pone.0104581.xml has incorrect linked DOI: journal.\n", + "journal.pone.0104601.xml has incorrect linked DOI: journal.\n", + "journal.pone.0105485.xml has incorrect linked DOI: journal.\n", + "journal.pone.0105486.xml has incorrect linked DOI: journal.\n", + "journal.pone.0105490.xml has incorrect linked DOI: journal.\n", + "journal.pone.0105658.xml has incorrect linked DOI: journal.\n", + "journal.pone.0105668.xml has incorrect linked DOI: journal.\n", + "journal.pone.0105669.xml has incorrect linked DOI: journal.\n", + "9127 corrected articles found.\n" + ] + } + ], + "source": [ + 
"corrections_article_list, corrected_article_list = get_corrected_article_list()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Check raw XML for article updates" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "deletable": false, + "editable": false, + "run_control": { + "frozen": true + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "downloaded new version of journal.pone.0182022.xml\n", + "downloaded new version of journal.pone.0175323.xml\n", + "downloaded new version of journal.pone.0171255.xml\n", + "downloaded new version of journal.pone.0158499.xml\n", + "30000 article checked for updates.\n", + "4 articles have updates.\n", + "['journal.pone.0182022.xml', 'journal.pone.0175323.xml', 'journal.pone.0171255.xml', 'journal.pone.0158499.xml']\n" + ] + } + ], + "source": [ + "# By default, checks only the 30,000 most recent articles\n", + "articles_different_list = revisiondate_sanity_check()\n", + "print(articles_different_list)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# DOI and filename sanity check" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "deletable": false, + "editable": false, + "run_control": { + "frozen": true + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "All article file names match DOIs.\n", + "PMC2687079.nxml has invalid DOI field: '10.1371/annotation/1cdc7975-50d7-40a5-99ca-83580df2982f '\n" + ] + } + ], + "source": [ + "# Check if article filenames match their full DOIs & that DOI fields are correct\n", + "messed_up_plos_list = article_doi_sanity_check()\n", + "messed_up_pmc_list = article_doi_sanity_check(directory=pmcdir, article_list=None, source='PMC')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# PubMed Corpus" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Get all 
local, solr, and PMC DOIs" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "deletable": false, + "editable": false, + "run_control": { + "frozen": true + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1mArticles that needs to be re-indexed on Solr:\n", + "\u001b[0m10.1371/journal.pone.0076809\n" + ] + } + ], + "source": [ + "plos_articles = compare_local_and_solr()\n", + "doi_to_pmc = get_articles_by_doi_field(check_new=False)\n", + "pmc_articles = list(doi_to_pmc.keys())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Compare PLOS's copy to PMC" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For sets of PLOS's corpus from PMC and PLOS, see which article are missing from PLOS's version of the corpus by:\n", + "* removing Currents articles\n", + "* checking if articles are live on journals.plos.org\n", + "* checking that the DOIs resolve" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "deletable": false, + "editable": false, + "run_control": { + "frozen": true + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1mPMC DOI fields with spaces in them:\n", + "\u001b[0m\"10.1371/annotation/1cdc7975-50d7-40a5-99ca-83580df2982f \" \n", + "\n", + "\u001b[1mWorking articles that need to be re-indexed on Solr:\n", + "\u001b[0m10.1371/annotation/1391941e-93d3-48d3-8c9a-b7c6d98f9527\n", + "10.1371/annotation/a81b1fab-890c-447b-a308-5bc8ca3eb21d\n", + "10.1371/annotation/df340d50-1f94-4d8b-a252-1a82a7fa5cc7 \n", + "\n", + "\u001b[1mArticles on PMC but not on solr or journals:\n", + "\u001b[0m10.1371/journal.pone.0002957\n", + "10.1371/annotation/b83e925b-2f2a-47b9-b939-0a1eeab18324\n", + "10.1371/journal.pbio.0020201\n", + "10.1371/annotation/011969ee-3f4b-4260-8d95-1b9a4ca39008\n", + "10.1371/annotation/8f2ddf91-3499-4627-9a91-449b78465f9d\n", + 
"10.1371/annotation/33d82b59-59a3-4412-9853-e78e49af76b9 \n", + "\n", + "\u001b[1mMissing PLOS articles where DOI resolves to different DOI:\n", + "\u001b[0m 10.1371/annotation/5e4082fd-6d86-441f-b946-a6e87a22ea57 resolves to: 10.1371/annotation/d9496d01-8c5d-4d24-8287-94449ada5064\n", + "\u001b[0m 10.1371/annotation/b8b66a84-4919-4a3e-ba3e-bb11f3853755 resolves to: 10.1371/annotation/5fbbf39a-fb47-4ce1-8069-acd830b3d41f\n", + "\n", + " \u001b[1mOther articles on PMC that aren't working correctly for PLOS:\n", + "\u001b[0m10.1371/annotation/363b6074-caec-4238-b88f-acbf45de498f\n", + "10.1371/annotation/2259f958-a68e-4e57-92b5-2ef003070cf1 \n", + "\n" + ] + } + ], + "source": [ + "missing_plos_articles = process_missing_plos_articles(pmc_articles=pmc_articles, plos_articles=plos_articles)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Compare PMC's copy to PLOS" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For sets of PLOS's corpus from PMC and PLOS, see which article are missing from PMC's version of the Corpus by:\n", + "* updating the PMCID:DOI mapping document\n", + "* removing articles too recent to be indexed (pubdate less than 3 weeks ago)\n", + "* excluding uncorrected proofs\n", + "* excluding PLOS Medicine quizzes" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "deletable": false, + "editable": false, + "run_control": { + "frozen": true + }, + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1mArticles missing from PMC:\n", + "\u001b[0m10.1371/annotation/08286cd8-527f-4f14-856f-57267107efa8\n", + "10.1371/annotation/0bbea8d3-1f94-48af-915c-aec02da2f5c3\n", + "10.1371/annotation/0c5390b8-72b0-4b7e-85a3-b8c0fd9f62bf\n", + "10.1371/annotation/0ccac188-950f-4908-b232-35fb44ba7847\n", + "10.1371/annotation/0cfd3d5f-c1d0-48f8-ad69-34a95e31a8d2\n", + "10.1371/annotation/0e045706-ea24-41db-be90-27d1cbcd35b1\n", + 
"10.1371/annotation/17310bbb-e5bf-4901-8b6e-529577a280db\n", + "10.1371/annotation/1c419628-f1b5-45de-9f8a-43f834309ebb\n", + "10.1371/annotation/1dc00176-e096-4621-9494-2d848dac8262\n", + "10.1371/annotation/1e464689-3c86-4399-b229-1e00d65593a5\n", + "10.1371/annotation/1f110857-27d7-4e83-9eb3-4e5f51950a26\n", + "10.1371/annotation/21379809-1376-4250-b4c2-bf51eac58a98\n", + "10.1371/annotation/221e5f19-370e-4a52-add8-f882437bc85d\n", + "10.1371/annotation/230cca90-58e9-4aa1-b6b2-a1d744524fbd\n", + "10.1371/annotation/23bca9d0-f934-400e-8bb9-f5ff07f9e625\n", + "10.1371/annotation/270b432d-50ec-41f1-ad4d-ddd9f51f62a5\n", + "10.1371/annotation/2b218d50-a9d5-45b2-80d0-0e806e530749\n", + "10.1371/annotation/2c275a1b-2d36-4492-b36a-192bddf14f78\n", + "10.1371/annotation/2ca25d9c-7347-4b09-bd7a-09d6d37ff322\n", + "10.1371/annotation/2f278ed8-d5e7-440a-9e49-c8d1df20d1f1\n", + "10.1371/annotation/31412345-fc86-4d67-b37c-93d42f5f0a59\n", + "10.1371/annotation/3265139d-64c7-4c4c-83d3-1e139031e7df\n", + "10.1371/annotation/34304231-e54b-4080-af70-6f957f32d552\n", + "10.1371/annotation/39b41d98-b117-41cf-b5de-b8486a67b1cd\n", + "10.1371/annotation/4290dfee-64fd-4157-89e3-8edbba912420\n", + "10.1371/annotation/44f67041-2f8e-42df-826a-82172ae05a22\n", + "10.1371/annotation/49257f53-8cb1-431b-be64-7b410598b845\n", + "10.1371/annotation/4993e0e2-c580-4547-90d8-3227b87e6ae9\n", + "10.1371/annotation/4a8d9f38-1d0d-4389-a284-9f2564e1ac0b\n", + "10.1371/annotation/4b9340db-455b-4e0d-86e5-b6783747111f\n", + "10.1371/annotation/4bb6b73b-b5bb-4143-9ec3-99c90b93f3ad\n", + "10.1371/annotation/4d6c4127-82e4-408d-af89-5f2e207d523b\n", + "10.1371/annotation/4f08219c-2d7b-4309-8351-d3fe2378993f\n", + "10.1371/annotation/5487e265-8175-47cb-b9a4-d85862a4a96f\n", + "10.1371/annotation/59bcbe81-eddd-46a4-90dc-88c1ea70df72\n", + "10.1371/annotation/5e0195b6-60b9-4c03-84ae-c6c31e625be1\n", + "10.1371/annotation/6130c605-086b-46af-8f6f-6c76b8eb9c84\n", + 
"10.1371/annotation/638b42e3-a351-4827-a612-17fe29b48e28\n", + "10.1371/annotation/677fdf34-651e-4dc8-a0be-d0d633237a85\n", + "10.1371/annotation/712bb339-6073-4e62-9f68-b285caedd913\n", + "10.1371/annotation/730cdfd0-78c5-48fc-a095-f633905ff2f0\n", + "10.1371/annotation/7645d066-aa98-45d6-8c3e-3a30d9e03e4d\n", + "10.1371/annotation/7e304601-fc5c-40fe-857c-d6ea894d1647\n", + "10.1371/annotation/7f73ed17-709e-4d7f-9aae-aab1f4a34985\n", + "10.1371/annotation/865eaad7-8547-49ac-a42d-47e9d0755bb3\n", + "10.1371/annotation/87e2a80b-3ed7-4ef9-96cb-1268d91b6366\n", + "10.1371/annotation/8941aee3-4bb8-42a0-b09a-e7c416beeef7\n", + "10.1371/annotation/8c6eaae4-72a7-460a-8b1a-f855731f3706\n", + "10.1371/annotation/8fa70b21-32e7-4ed3-b397-ab776b5bbf30\n", + "10.1371/annotation/9239a129-5677-43b0-8fe1-0c1e75e988df\n", + "10.1371/annotation/93141e7a-61f3-48bd-87bd-216b030d773d\n", + "10.1371/annotation/936a4359-1bf5-4c33-be7d-1468e75eaa8b\n", + "10.1371/annotation/93d63399-0e71-4a25-a45c-311910ee6da5\n", + "10.1371/annotation/9630862b-4676-4b82-9869-8d8fbb2a2e65\n", + "10.1371/annotation/974531b0-9da4-4575-b3d1-955b0163fde0\n", + "10.1371/annotation/98908e14-e9fd-458f-9cea-ba4bec139f20\n", + "10.1371/annotation/b03fbc42-8f70-4873-9cce-854e48249a13\n", + "10.1371/annotation/b0e62f4f-812f-40b1-aef8-365b229eb2cf\n", + "10.1371/annotation/b4e623eb-4950-48d9-8d85-8d70426d95a3\n", + "10.1371/annotation/b60d4ec5-4c6f-43ab-9f63-322e3cd59636\n", + "10.1371/annotation/bae9fc08-fbfa-45b5-9d1d-0b8254d6efd5\n", + "10.1371/annotation/bc97a85c-1ecd-4cd8-ab61-0aef01f949a1\n", + "10.1371/annotation/c066bb84-13ea-4b36-a481-f149df8ce929\n", + "10.1371/annotation/c313df3a-52bd-4cbe-af14-6676480d1a43\n", + "10.1371/annotation/c81daa7c-5375-4349-970b-c63d288947eb\n", + "10.1371/annotation/caf130c3-5026-41cd-9dda-5eac7c0f016f\n", + "10.1371/annotation/d271d9c1-5588-4b43-85c3-d3de58ab61a4\n", + "10.1371/annotation/dfa05103-fc65-4f07-b30f-72a6e91613ff\n", + 
"10.1371/annotation/ea14adcb-033d-492d-8f8b-e047aa080cd4\n", + "10.1371/annotation/ebea4bd5-2b96-4842-b110-2f7c156e5060\n", + "10.1371/annotation/eff6e471-306a-41bd-88e3-13857af094af\n", + "10.1371/annotation/f016476b-5b84-4c9a-899f-fe8b8bc927b5\n", + "10.1371/annotation/f216b2b0-ab6b-45d8-b6ba-134a477b79b7\n", + "10.1371/annotation/f32bc670-c9cf-4bb0-9376-cd8cfd1053c1\n", + "10.1371/annotation/f8605b0a-d01c-41aa-ac9b-b605d7903a28\n", + "10.1371/annotation/f9660803-198b-4d0d-8200-719a2eb2a443\n", + "10.1371/annotation/fcca88ac-d684-46e0-a483-62af67e777bd\n", + "10.1371/annotation/fd9f9796-b42d-480d-b9f4-0adfbb919148\n", + "10.1371/annotation/fddd2ff3-c991-4c2f-8b84-a27eb20fba91\n", + "10.1371/annotation/ff089043-990a-48c2-a90f-15606c11cc98\n", + "10.1371/journal.pcbi.1005632\n", + "10.1371/journal.pcbi.1005676\n", + "10.1371/journal.pcbi.1005677\n", + "10.1371/journal.pcbi.1005692\n", + "10.1371/journal.pgen.1006910\n", + "10.1371/journal.pone.0181246\n", + "10.1371/journal.pone.0182517\n", + "10.1371/journal.ppat.1006535\n", + "10.1371/journal.ppat.1006543 \n", + "\n" + ] + } + ], + "source": [ + "missing_pmc_articles = process_missing_pmc_articles(pmc_articles=pmc_articles, plos_articles=plos_articles)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Save lists of missing articles to text files if needed" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "with open('missing_plos_articles.txt', 'w') as file:\n", + " for item in sorted(set(missing_plos_articles)):\n", + " file.write(\"%s\\n\" % item)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "code_folding": [], + "collapsed": true + }, + "outputs": [], + "source": [ + "with open('missing_pmc_articles.txt', 'w') as file:\n", + " for item in sorted(set(missing_pmc_articles)):\n", + " file.write(\"%s\\n\" % item)" + ] + } + ], + "metadata": { + "kernelspec": { + 
"display_name": "py3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.2" + }, + "toc": { + "nav_menu": { + "height": "174px", + "width": "252px" + }, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "toc_cell": false, + "toc_position": {}, + "toc_section_display": "block", + "toc_window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/allofplos/Production team investigates.ipynb b/allofplos/Production team investigates.ipynb new file mode 100644 index 00000000..2f898f00 --- /dev/null +++ b/allofplos/Production team investigates.ipynb @@ -0,0 +1,393 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from plos_corpus import *\n", + "from samples.corpus_analysis import *\n", + "corpusdir = 'allofplos_xml'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Q: Are annotation DOIs resolving correctly?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def make_annotation_dict(save_output=True):\n", + " \"\"\"\n", + " For every article file whose DOI contains the word \"annotation\", check whether its DOI resolves correctly\n", + " by creating a dictionary of the resolution status.\n", + " :return: dictionary where each key is a DOI, each value is associated resolution of that DOI via doi.org.\n", + " :param save_output: exports dictionary to csv\n", + " \"\"\"\n", + " dois = [file_to_doi(file) for file in listdir_nohidden(corpusdir)]\n", + " annotation_list = [x for x in dois if x.startswith('10.1371/annotation')]\n", + " anno_dict = {doi: check_if_doi_resolves(doi) for doi in annotation_list}\n", + " \n", + " if save_output:\n", + " with open('annotations.csv', 'w') as f:\n", + " writer = csv.writer(f)\n", + " writer.writerow(['DOI', 'Resolution'])\n", + " for key, value in anno_dict.items():\n", + " writer.writerow([key, value])\n", + "\n", + " return anno_dict" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# run this\n", + "make_annotation_dict()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "# Q: Which `` elements follow a certain pattern?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_tina_test_set():\n", + " \"\"\"\n", + " Return a list of DOIs good for Tina's function\n", + " \"\"\"\n", + " random_list_of_dois = get_random_list_of_dois(count=10)\n", + " random_list_of_articles = [doi_to_file(doi) for doi in random_list_of_dois if 'annotation' not in doi]\n", + " search_1_dois = ('10.1371/journal.pmed.1002035', '10.1371/journal.pone.0047559', '10.1371/journal.pone.0047944')\n", + " search_1_articles = [doi_to_file(doi) for doi in search_1_dois]\n", + " search_test_set = list(set(random_list_of_articles + search_1_articles))\n", + " return search_test_set\n", + "\n", + "def find_contrib_pattern(article_list=None, csv=True):\n", + " \"\"\"\n", + " Three separate searches would be most helpful:\n", + " Search #1: Find all articles where a element contains an element. \n", + " Example: pmed.1002035, pone.0047559, and pone.0047944 should all be found by this search.\n", + " Search #2: Find all articles where a element that contains an element is\n", + " immediately followed by element that contains a element.\n", + " Example: pone.0047559 and pone.0047944 should both be found by this search, but not pmed.1002035.\n", + " Search #3: Find all articles where a element that contains an element is\n", + " immediately followed by element that contains a element that contains a .\n", + " Example: pone.0047944 should be found by this search, but not pmed.1002035 or pone.0047559.)\n", + " To test this function, use get_tina_test_set() to run on a subset of articles\n", + " \"\"\"\n", + " if article_list is None:\n", + " article_list = listdir_nohidden(corpusdir)\n", + "\n", + " search_1_results = []\n", + " search_2_results = []\n", + " search_3_results = []\n", + "\n", + " for article_file in article_list:\n", + " tag_path_elements = ('/',\n", + " 'article',\n", + " 'front',\n", + " 'article-meta')\n", + " article_xml = 
get_articleXML_content(article_file, tag_path_elements=tag_path_elements)\n", + " meta_categories = article_xml[0].getchildren()\n", + " contrib_groups = [category for category in meta_categories if category.tag == 'contrib-group']\n", + " for contrib_group in contrib_groups:\n", + " for contributor in contrib_group:\n", + " for element in contributor:\n", + " if element.tag == 'on-behalf-of':\n", + " search_1_results.append(file_to_doi(article_file))\n", + " next_element = contributor.getnext()\n", + " if next_element is not None:\n", + " for elem in next_element:\n", + " if elem.tag == 'collab':\n", + " search_2_results.append(file_to_doi(article_file))\n", + " for subelem in elem:\n", + " if subelem.tag == 'contrib-group':\n", + " search_3_results.append(file_to_doi(article_file))\n", + " break\n", + "\n", + " search_1_results = set(search_1_results)\n", + " search_2_results = set(search_2_results)\n", + " search_3_results = set(search_3_results)\n", + " search_results = list(set(search_1_results + search_2_results + search_3_results))\n", + " doi_results = []\n", + " for doi in search_results:\n", + " if doi in search_1_results:\n", + " s1 = 'yes'\n", + " else:\n", + " s1 = 'no'\n", + " if doi in search_2_results:\n", + " s2 = 'yes'\n", + " else:\n", + " s2 = 'no'\n", + " if doi in search_3_results:\n", + " s3 = 'yes'\n", + " else:\n", + " s3 = 'no'\n", + " doi_result = (doi, s1, s2, s3)\n", + " doi_results.append(doi_result)\n", + " if csv:\n", + " with open('search_results.csv', 'w') as f:\n", + " writer = csv.writer(f)\n", + " writer.writerow(['DOI', 'Search 1', 'Search 2', 'Search 3'])\n", + " for doi_result in sorted(doi_results):\n", + " writer.writerow(doi_result)\n", + " return doi_results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# test this function\n", + "test_list = get_tina_test_set()\n", + "doi_results = find_contrib_pattern(article_list=test_list, csv=False)" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(doi_results)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# run this function for real\n", + "doi_results = find_contrib_pattern()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Q: Which articles after 2015 have 2 or more corrections attached?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "corrections_article_list, corrected_article_list = get_corrected_article_list()\n", + "multiple_corrections = set([article for article in corrected_article_list\n", + " if corrected_article_list.count(article) > 1])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "multiple_corrections.remove('10.1371/journal.')\n", + "multiple_corrections_post_2015 = [article for article in multiple_corrections\n", + " if get_article_pubdate(doi_to_file(article)).year >= 2015]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "multiple_corrections_post_2015\n", + "with open('2_or_more_corrections.csv', 'w') as f:\n", + " writer = csv.writer(f)\n", + " writer.writerow(['DOI'])\n", + " for item in multiple_corrections_post_2015:\n", + " writer.writerow(item)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Q: Which articles have a series of table-wrap elements?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 190, + "metadata": {}, + "outputs": [], + "source": [ + "example_doi = '10.1371/journal.pone.0068090'\n", + "search_1_file = 'xml_testing/Search-1_TRUE.xml'\n", + "search_2_file = 'xml_testing/Search-2_TRUE.xml'\n", + "intro_file = doi_to_file(example_doi)\n", + "fail_file = doi_to_file('10.1371/journal.pone.0182980')\n", + "test_list = [intro_file, search_1_file, search_2_file, fail_file]\n", + "\n", + "intro_condition = []\n", + "search_1 = []\n", + "search_2 = []\n", + "\n", + "def find_table_wraps(article):\n", + " \"\"\"\n", + " find all articles with a `table-wrap` element. of those, if there is no immediate sub-tag of\n", + " 'alternative' in table\n", + " \"\"\"\n", + " intro_condition = False\n", + " search_1 = False\n", + " search_2 = False\n", + "\n", + " article_tree = et.parse(article)\n", + " table_wraps = article_tree.findall('.//table-wrap')\n", + " if table_wraps:\n", + " for table_wrap in table_wraps:\n", + " try:\n", + " if all('alternatives' not in table_part.tag for table_part in table_wrap) and \\\n", + " all('graphic' not in table_part.tag for table_part in table_wrap):\n", + " intro_condition = True\n", + " except TypeError:\n", + " # this is an imperfect work-around. 
if alternatives were a sub-sub-element,\n", + " # it would be incorrectly excluded from intro_\n", + " alternatives = table_wrap.findall('.//alternatives')\n", + " if alternatives == 0:\n", + " intro_condition = True\n", + " if intro_condition:\n", + " danger = table_wrap.findall('.//graphic')\n", + " if danger:\n", + " search_1 = True\n", + " danger2 = table_wrap.findall('.//inline-graphic')\n", + " if danger2:\n", + " search_2 = True\n", + " else:\n", + " pass\n", + " \n", + "# for table_part in table_parts:\n", + "# if 'alternatives' in table_part.tag:\n", + "# print('alternatives')\n", + "\n", + " else:\n", + " pass\n", + "\n", + " return intro_condition, search_1, search_2\n" + ] + }, + { + "cell_type": "code", + "execution_count": 196, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "allofplos_xml/journal.pone.0068090.xml True False False\n", + "xml_testing/Search-1_TRUE.xml True True False\n", + "xml_testing/Search-2_TRUE.xml True True True\n", + "allofplos_xml/journal.pone.0182980.xml False False False\n" + ] + } + ], + "source": [ + "table_results = []\n", + "for article_file in test_list:\n", + " intro_condition, search_1, search_2 = find_table_wraps(article_file)\n", + " print(article_file, intro_condition, search_1, search_2)" + ] + }, + { + "cell_type": "code", + "execution_count": 197, + "metadata": {}, + "outputs": [], + "source": [ + "table_results = []\n", + "file_list = listdir_nohidden(corpusdir)\n", + "for article_file in file_list:\n", + " intro_condition, search_1, search_2 = find_table_wraps(article_file)\n", + " if intro_condition:\n", + " result = [file_to_doi(article_file), search_1, search_2]\n", + " table_results.append(result)\n", + "\n", + "# print(table_results)\n", + "with open('table_search_results_revised.csv', 'w') as f:\n", + " writer = csv.writer(f)\n", + " writer.writerow(['DOI', 'Search 1', 'Search 2'])\n", + " for doi_result in sorted(table_results):\n", + " 
writer.writerow(doi_result)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for article_file in listdir_nohidden(corpusdir)[180000:180010]:\n", + " print(find_table_wraps(article_file))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "py3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.2" + }, + "toc": { + "nav_menu": { + "height": "12px", + "width": "252px" + }, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "toc_cell": false, + "toc_position": {}, + "toc_section_display": "block", + "toc_window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/allofplos/jupyternb/Corpus_Analysis-old.ipynb b/allofplos/jupyternb/Corpus_Analysis-old.ipynb new file mode 100644 index 00000000..4c590f64 --- /dev/null +++ b/allofplos/jupyternb/Corpus_Analysis-old.ipynb @@ -0,0 +1,792 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Required functions" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "code_folding": [], + "collapsed": true + }, + "outputs": [], + "source": [ + "from Samples.corpus_analysis import *" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "heading_collapsed": true + }, + "source": [ + "# PLOS article types" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "hidden": true + }, + "source": [ + "## JATS-standard NLM article types" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "editable": false, + "hidden": true, + "run_control": { + "frozen": true + }, + 
"scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "15 types of articles found.\n", + "[('research-article', 204109), ('correction', 9113), ('article-commentary', 1284), ('discussion', 1087), ('review-article', 612), ('other', 584), ('editorial', 340), ('letter', 300), ('retraction', 79), ('book-review', 77), ('meeting-report', 38), ('case-report', 23), ('expression-of-concern', 13), ('obituary', 10), ('brief-report', 1)]\n" + ] + } + ], + "source": [ + "jats_article_type_list = get_jats_article_type_list()\n", + "print(jats_article_type_list)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "hidden": true + }, + "source": [ + "## PLOS article types" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "editable": false, + "hidden": true, + "run_control": { + "frozen": true + }, + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "60 types of articles found.\n", + "[('Research Article', 202673), ('Correction', 9124), ('Synopsis', 1093), ('Perspective', 801), ('Review', 555), ('Editorial', 486), ('Pearls', 438), ('Essay', 379), ('Policy Forum', 309), ('Correspondence', 287), ('Primer', 237), ('Viewpoints', 209), ('Community Page', 139), ('Opinion', 136), ('Health in Action', 118), ('Education', 103), ('Retraction', 79), ('Book Review/Science in the Media', 76), ('Message from ISCB', 70), ('Symposium', 70), ('Policy Platform', 54), ('Feature', 53), ('Formal Comment', 52), ('Research in Translation', 51), ('Guidelines and Guidance', 51), ('Collection Review', 50), ('Research Matters', 44), ('Interview', 44), ('The PLoS Medicine Debate', 38), ('Historical Profiles and Perspectives', 38), ('Unsolved Mystery', 34), ('Overview', 34), ('Neglected Diseases', 29), ('Expert Commentary', 29), ('Learning Forum', 27), ('From Innovation to Application', 24), ('Obituary', 22), ('Quiz', 21), ('Correspondence and Other Communications', 13), 
('Expression of Concern', 13), ('Journal Club', 12), ('Meta-Research Article', 12), ('Student Forum', 12), ('Open Highlights', 11), ('Topic Page', 11), ('Case Report', 10), ('Photo Quiz', 10), ('Best Practice', 5), ('Deep Reads', 4), ('Historical and Philosophical Perspectives', 3), ('Special Report', 3), ('Book Review', 2), ('Message from the Founders', 1), ('Message from PLoS', 1), ('Short Reports', 1), ('Methods and Resources', 1), ('Technical Report', 1), ('Message from the PLoS Founders', 1), ('Collection Review ', 1), ('Debate', 1)]\n" + ] + } + ], + "source": [ + "PLOS_article_type_list = get_plos_article_type_list()\n", + "print(PLOS_article_type_list)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "heading_collapsed": true, + "hidden": true + }, + "source": [ + "## PLOS/NLM article type mapping" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "hidden": true, + "scrolled": true + }, + "outputs": [], + "source": [ + "article_types_map = get_article_types_map()\n", + "PLOS_article_types_structured = counter(article_types_map).most_common()\n", + "print(PLOS_article_types_structured)" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "metadata": { + "collapsed": true, + "hidden": true + }, + "outputs": [], + "source": [ + "# create .csv file mapping JATS to PLOS article types\n", + "article_types_map_to_csv(article_types_map)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "heading_collapsed": true + }, + "source": [ + "# Taking random samples of DOIs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "hidden": true + }, + "outputs": [], + "source": [ + "random_sample_of_dois = get_random_list_of_DOIs() # returns 100 DOIs by default" + ] + }, + { + "cell_type": "code", + "execution_count": 203, + "metadata": { + "hidden": true, + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + 
"text": [ + "['journal.pone.0074820', 'journal.pone.0063497', 'journal.pone.0126357', 'journal.pntd.0004807', 'journal.pone.0031896', 'journal.pone.0045503', 'journal.pone.0138217', 'journal.pbio.0050002', 'journal.pone.0122848', 'journal.pone.0099248']\n" + ] + } + ], + "source": [ + "random_sample_of_articles = [doi_to_article(doi) for doi in random_sample_of_dois]\n", + "print(random_sample_of_articles[0:10])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "heading_collapsed": true + }, + "source": [ + "# Retracted and corrected articles" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "hidden": true + }, + "source": [ + "## Get list of retracted articles" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "editable": false, + "hidden": true, + "run_control": { + "frozen": true + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "79 retracted articles found.\n", + "['journal.pbio.0030123', 'journal.pbio.0050005', 'journal.pbio.0050146', 'journal.pbio.1001212', 'journal.pcbi.1002308', 'journal.pgen.1003361', 'journal.pgen.1003791', 'journal.pgen.1005586', 'journal.pgen.1000424', 'journal.pmed.1001214', 'journal.pone.0072333', 'journal.pone.0084127', 'journal.pone.0027571', 'journal.pone.0046410', 'journal.pone.0080145', 'journal.pone.0019652', 'journal.pone.0075928', 'journal.pone.0075046', 'journal.pone.0062178', 'journal.pone.0051549', 'journal.pone.0093095', 'journal.pone.0069669', 'journal.pone.0133525', 'journal.pone.0115980', 'journal.pone.0115741', 'journal.pone.0139044', 'journal.pone.0146193', 'journal.pone.0045667', 'journal.pone.0040789', 'journal.pone.0094830', 'journal.pone.0031943', 'journal.pone.0097700', 'journal.pone.0047218', 'journal.pone.0090951', 'journal.pone.0014232', 'journal.pone.0090318', 'journal.pone.0072895', 'journal.pone.0065651', 'journal.pone.0059556', 'journal.pone.0076809', 'journal.pone.0099630', 'journal.pone.0121549', 'journal.pone.0048402', 
'journal.pone.0062170', 'journal.pone.0020152', 'journal.pone.0164571', 'journal.pone.0164378', 'journal.pone.0116682', 'journal.pone.0125542', 'journal.pone.0047110', 'journal.pone.0026503', 'journal.pone.0037102', 'journal.pone.0014163', 'journal.pone.0043204', 'journal.pone.0001276', 'journal.pone.0035142', 'journal.pone.0011299', 'journal.pone.0005373', 'journal.pone.0030980', 'journal.pone.0000306', 'journal.pone.0064576', 'journal.pone.0016011', 'journal.pone.0001444', 'journal.pone.0043406', 'journal.pone.0029192', 'journal.pone.0001908', 'journal.pone.0016256', 'journal.pone.0013512', 'journal.pone.0045965', 'journal.pone.0022730', 'journal.pone.0006333', 'journal.pone.0004168', 'journal.pone.0035453', 'journal.pone.0032853', 'journal.ppat.1003435', 'journal.ppat.1002062', 'journal.ppat.1000915', 'journal.ppat.1000210', 'journal.ppat.0020025']\n" + ] + } + ], + "source": [ + "retractions_article_list, retracted_article_list = get_retracted_article_list()\n", + "print(retracted_article_list)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "hidden": true + }, + "source": [ + "## Get list of corrected articles" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "editable": false, + "hidden": true, + "run_control": { + "frozen": true + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "journal.pcbi.1003582.xml has incorrect linked DOI: journal.10.1371/journal.pcbi.1003490\n", + "journal.pcbi.1003732.xml has incorrect linked DOI: journal.10.1371/journal.pcbi.1003159\n", + "journal.pone.0101541.xml has incorrect linked DOI: journal.PONE-D-13-26510\n", + "journal.pone.0104353.xml has incorrect linked DOI: journal.\n", + "journal.pone.0104472.xml has incorrect linked DOI: journal.\n", + "journal.pone.0104581.xml has incorrect linked DOI: journal.\n", + "journal.pone.0104601.xml has incorrect linked DOI: journal.\n", + "journal.pone.0105485.xml has incorrect linked DOI: journal.\n", + 
"journal.pone.0105486.xml has incorrect linked DOI: journal.\n", + "journal.pone.0105490.xml has incorrect linked DOI: journal.\n", + "journal.pone.0105658.xml has incorrect linked DOI: journal.\n", + "journal.pone.0105668.xml has incorrect linked DOI: journal.\n", + "journal.pone.0105669.xml has incorrect linked DOI: journal.\n", + "9127 corrected articles found.\n" + ] + } + ], + "source": [ + "corrections_article_list, corrected_article_list = get_corrected_article_list()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "# What's going on with revision_dates & article updates?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Keep an eye on this URL for any changes. On PMC, it was updated in the last few months, but that might not have had time to propagate. https://www.ncbi.nlm.nih.gov/pmc/oai/oai.cgi?verb=GetRecord&identifier=oai:pubmedcentral.nih.gov:3913708&metadataPrefix=pmc" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "heading_collapsed": true + }, + "source": [ + "## Step 1: Query solr for revision_date field" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "hidden": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "URL for solr query: http://api.plos.org/search?q=*:*&fq=doc_type:full+-doi:image&fl=id,publication_date&wt=json&indent=true&sort=%20id%20asc&fq=publication_date:[2017-08-17T00:00:00Z+TO+2017-08-25T23:59:59Z]&rows=1000\n", + "613 results returned from this search.\n", + "['2017-08-21T00:00:00Z', '2017-08-18T00:00:00Z', '2017-08-22T00:00:00Z', '2017-08-18T00:00:00Z', '2017-08-24T00:00:00Z', '2017-08-24T00:00:00Z', '2017-08-18T00:00:00Z', '2017-08-23T00:00:00Z', '2017-08-24T00:00:00Z', '2017-08-17T00:00:00Z']\n" + ] + } + ], + "source": [ + "# This should print 10 date strings \n", + "publication_dates_list = get_solr_records(days_ago=8, item='publication_date')\n", +
"print(publication_dates_list[0:10])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "hidden": true + }, + "outputs": [], + "source": [ + "# This should return an error\n", + "revision_dates_list = get_solr_records(days_ago=8, item='revision_date')\n", + "print(revision_dates_list[0:10])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 2: Peek inside raw XML for any changes" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "downloaded new version of journal.pone.0182022.xml\n", + "downloaded new version of journal.pone.0175323.xml\n", + "downloaded new version of journal.pone.0171255.xml\n", + "downloaded new version of journal.pone.0158499.xml\n", + "30000 article checked for updates.\n", + "4 articles have updates.\n", + "['journal.pone.0182022.xml', 'journal.pone.0175323.xml', 'journal.pone.0171255.xml', 'journal.pone.0158499.xml']\n" + ] + } + ], + "source": [ + "articles_different_list = revisiondate_sanity_check()\n", + "print(articles_different_list)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# DOI and filename sanity check" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Check if article filenames match their full DOIs & that DOI fields are correct\n", + "# NOT WORKING AND MUST BE FIXED!\n", + "messed_up_plos_list = article_doi_sanity_check()\n", + "messed_up_pmc_list = article_doi_sanity_check(directory=pmcdir, article_list=None, source='PMC')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# PubMed Corpus" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Get all local, solr, and PMC DOIs" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + 
{ + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1mArticles that needs to be re-indexed on Solr:\n", + "\u001b[0m10.1371/journal.pone.0076809\n" + ] + } + ], + "source": [ + "plos_articles = compare_local_and_solr()\n", + "doi_to_pmc = get_articles_by_doi_field(check_new=False)\n", + "pmc_articles = list(doi_to_pmc.keys())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Compare PLOS's copy to PMC" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For sets of PLOS's corpus from PMC and PLOS, see which articles are missing from PLOS's version of the corpus by:\n", + "* removing Currents articles\n", + "* checking if articles are live on journals.plos.org\n", + "* checking that the DOIs resolve" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1mPMC DOI fields with spaces in them:\n", + "\u001b[0m\"10.1371/annotation/1cdc7975-50d7-40a5-99ca-83580df2982f \" \n", + "\n", + "\u001b[1mWorking articles that need to be re-indexed on Solr:\n", + "\u001b[0m10.1371/annotation/1391941e-93d3-48d3-8c9a-b7c6d98f9527\n", + "10.1371/annotation/a81b1fab-890c-447b-a308-5bc8ca3eb21d\n", + "10.1371/annotation/df340d50-1f94-4d8b-a252-1a82a7fa5cc7 \n", + "\n", + "\u001b[1mArticles on PMC but not on solr or journals:\n", + "\u001b[0m10.1371/journal.pone.0002957\n", + "10.1371/annotation/b83e925b-2f2a-47b9-b939-0a1eeab18324\n", + "10.1371/journal.pbio.0020201\n", + "10.1371/annotation/011969ee-3f4b-4260-8d95-1b9a4ca39008\n", + "10.1371/annotation/8f2ddf91-3499-4627-9a91-449b78465f9d\n", + "10.1371/annotation/33d82b59-59a3-4412-9853-e78e49af76b9 \n", + "\n", + "\u001b[1mMissing PLOS articles where DOI resolves to different DOI:\n", + "\u001b[0m 10.1371/annotation/5e4082fd-6d86-441f-b946-a6e87a22ea57 resolves to: 10.1371/annotation/d9496d01-8c5d-4d24-8287-94449ada5064\n", + "\u001b[0m
10.1371/annotation/b8b66a84-4919-4a3e-ba3e-bb11f3853755 resolves to: 10.1371/annotation/5fbbf39a-fb47-4ce1-8069-acd830b3d41f\n", + "\n", + " \u001b[1mOther articles on PMC that aren't working correctly for PLOS:\n", + "\u001b[0m10.1371/annotation/363b6074-caec-4238-b88f-acbf45de498f\n", + "10.1371/annotation/2259f958-a68e-4e57-92b5-2ef003070cf1 \n", + "\n" + ] + } + ], + "source": [ + "missing_plos_articles = process_missing_plos_articles(pmc_articles=pmc_articles, plos_articles=plos_articles)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Compare PMC's copy to PLOS" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For sets of PLOS's corpus from PMC and PLOS, see which articles are missing from PMC's version of the Corpus by:\n", + "* updating the PMCID:DOI mapping document\n", + "* removing articles too recent to be indexed (pubdate less than 3 weeks ago)\n", + "* excluding uncorrected proofs\n", + "* excluding PLOS Medicine quizzes" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1mArticles missing from PMC:\n", + "\u001b[0m10.1371/annotation/08286cd8-527f-4f14-856f-57267107efa8\n", + "10.1371/annotation/0bbea8d3-1f94-48af-915c-aec02da2f5c3\n", + "10.1371/annotation/0c5390b8-72b0-4b7e-85a3-b8c0fd9f62bf\n", + "10.1371/annotation/0ccac188-950f-4908-b232-35fb44ba7847\n", + "10.1371/annotation/0cfd3d5f-c1d0-48f8-ad69-34a95e31a8d2\n", + "10.1371/annotation/0e045706-ea24-41db-be90-27d1cbcd35b1\n", + "10.1371/annotation/17310bbb-e5bf-4901-8b6e-529577a280db\n", + "10.1371/annotation/1c419628-f1b5-45de-9f8a-43f834309ebb\n", + "10.1371/annotation/1dc00176-e096-4621-9494-2d848dac8262\n", + "10.1371/annotation/1e464689-3c86-4399-b229-1e00d65593a5\n", + "10.1371/annotation/1f110857-27d7-4e83-9eb3-4e5f51950a26\n", + "10.1371/annotation/21379809-1376-4250-b4c2-bf51eac58a98\n", +
"10.1371/annotation/221e5f19-370e-4a52-add8-f882437bc85d\n", + "10.1371/annotation/230cca90-58e9-4aa1-b6b2-a1d744524fbd\n", + "10.1371/annotation/23bca9d0-f934-400e-8bb9-f5ff07f9e625\n", + "10.1371/annotation/270b432d-50ec-41f1-ad4d-ddd9f51f62a5\n", + "10.1371/annotation/2b218d50-a9d5-45b2-80d0-0e806e530749\n", + "10.1371/annotation/2c275a1b-2d36-4492-b36a-192bddf14f78\n", + "10.1371/annotation/2ca25d9c-7347-4b09-bd7a-09d6d37ff322\n", + "10.1371/annotation/2f278ed8-d5e7-440a-9e49-c8d1df20d1f1\n", + "10.1371/annotation/31412345-fc86-4d67-b37c-93d42f5f0a59\n", + "10.1371/annotation/3265139d-64c7-4c4c-83d3-1e139031e7df\n", + "10.1371/annotation/34304231-e54b-4080-af70-6f957f32d552\n", + "10.1371/annotation/39b41d98-b117-41cf-b5de-b8486a67b1cd\n", + "10.1371/annotation/4290dfee-64fd-4157-89e3-8edbba912420\n", + "10.1371/annotation/44f67041-2f8e-42df-826a-82172ae05a22\n", + "10.1371/annotation/49257f53-8cb1-431b-be64-7b410598b845\n", + "10.1371/annotation/4993e0e2-c580-4547-90d8-3227b87e6ae9\n", + "10.1371/annotation/4a8d9f38-1d0d-4389-a284-9f2564e1ac0b\n", + "10.1371/annotation/4b9340db-455b-4e0d-86e5-b6783747111f\n", + "10.1371/annotation/4bb6b73b-b5bb-4143-9ec3-99c90b93f3ad\n", + "10.1371/annotation/4d6c4127-82e4-408d-af89-5f2e207d523b\n", + "10.1371/annotation/4f08219c-2d7b-4309-8351-d3fe2378993f\n", + "10.1371/annotation/5487e265-8175-47cb-b9a4-d85862a4a96f\n", + "10.1371/annotation/59bcbe81-eddd-46a4-90dc-88c1ea70df72\n", + "10.1371/annotation/5e0195b6-60b9-4c03-84ae-c6c31e625be1\n", + "10.1371/annotation/6130c605-086b-46af-8f6f-6c76b8eb9c84\n", + "10.1371/annotation/638b42e3-a351-4827-a612-17fe29b48e28\n", + "10.1371/annotation/677fdf34-651e-4dc8-a0be-d0d633237a85\n", + "10.1371/annotation/712bb339-6073-4e62-9f68-b285caedd913\n", + "10.1371/annotation/730cdfd0-78c5-48fc-a095-f633905ff2f0\n", + "10.1371/annotation/7645d066-aa98-45d6-8c3e-3a30d9e03e4d\n", + "10.1371/annotation/7e304601-fc5c-40fe-857c-d6ea894d1647\n", + 
"10.1371/annotation/7f73ed17-709e-4d7f-9aae-aab1f4a34985\n", + "10.1371/annotation/865eaad7-8547-49ac-a42d-47e9d0755bb3\n", + "10.1371/annotation/87e2a80b-3ed7-4ef9-96cb-1268d91b6366\n", + "10.1371/annotation/8941aee3-4bb8-42a0-b09a-e7c416beeef7\n", + "10.1371/annotation/8c6eaae4-72a7-460a-8b1a-f855731f3706\n", + "10.1371/annotation/8fa70b21-32e7-4ed3-b397-ab776b5bbf30\n", + "10.1371/annotation/9239a129-5677-43b0-8fe1-0c1e75e988df\n", + "10.1371/annotation/93141e7a-61f3-48bd-87bd-216b030d773d\n", + "10.1371/annotation/936a4359-1bf5-4c33-be7d-1468e75eaa8b\n", + "10.1371/annotation/93d63399-0e71-4a25-a45c-311910ee6da5\n", + "10.1371/annotation/9630862b-4676-4b82-9869-8d8fbb2a2e65\n", + "10.1371/annotation/974531b0-9da4-4575-b3d1-955b0163fde0\n", + "10.1371/annotation/98908e14-e9fd-458f-9cea-ba4bec139f20\n", + "10.1371/annotation/b03fbc42-8f70-4873-9cce-854e48249a13\n", + "10.1371/annotation/b0e62f4f-812f-40b1-aef8-365b229eb2cf\n", + "10.1371/annotation/b4e623eb-4950-48d9-8d85-8d70426d95a3\n", + "10.1371/annotation/b60d4ec5-4c6f-43ab-9f63-322e3cd59636\n", + "10.1371/annotation/bae9fc08-fbfa-45b5-9d1d-0b8254d6efd5\n", + "10.1371/annotation/bc97a85c-1ecd-4cd8-ab61-0aef01f949a1\n", + "10.1371/annotation/c066bb84-13ea-4b36-a481-f149df8ce929\n", + "10.1371/annotation/c313df3a-52bd-4cbe-af14-6676480d1a43\n", + "10.1371/annotation/c81daa7c-5375-4349-970b-c63d288947eb\n", + "10.1371/annotation/caf130c3-5026-41cd-9dda-5eac7c0f016f\n", + "10.1371/annotation/d271d9c1-5588-4b43-85c3-d3de58ab61a4\n", + "10.1371/annotation/dfa05103-fc65-4f07-b30f-72a6e91613ff\n", + "10.1371/annotation/ea14adcb-033d-492d-8f8b-e047aa080cd4\n", + "10.1371/annotation/ebea4bd5-2b96-4842-b110-2f7c156e5060\n", + "10.1371/annotation/eff6e471-306a-41bd-88e3-13857af094af\n", + "10.1371/annotation/f016476b-5b84-4c9a-899f-fe8b8bc927b5\n", + "10.1371/annotation/f216b2b0-ab6b-45d8-b6ba-134a477b79b7\n", + "10.1371/annotation/f32bc670-c9cf-4bb0-9376-cd8cfd1053c1\n", + 
"10.1371/annotation/f8605b0a-d01c-41aa-ac9b-b605d7903a28\n", + "10.1371/annotation/f9660803-198b-4d0d-8200-719a2eb2a443\n", + "10.1371/annotation/fcca88ac-d684-46e0-a483-62af67e777bd\n", + "10.1371/annotation/fd9f9796-b42d-480d-b9f4-0adfbb919148\n", + "10.1371/annotation/fddd2ff3-c991-4c2f-8b84-a27eb20fba91\n", + "10.1371/annotation/ff089043-990a-48c2-a90f-15606c11cc98\n", + "10.1371/journal.pcbi.1005632\n", + "10.1371/journal.pcbi.1005676\n", + "10.1371/journal.pcbi.1005677\n", + "10.1371/journal.pcbi.1005692\n", + "10.1371/journal.pgen.1006910\n", + "10.1371/journal.pone.0181246\n", + "10.1371/journal.pone.0182517\n", + "10.1371/journal.ppat.1006535\n", + "10.1371/journal.ppat.1006543 \n", + "\n" + ] + } + ], + "source": [ + "missing_pmc_articles = process_missing_pmc_articles(pmc_articles=pmc_articles, plos_articles=plos_articles)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Save lists of missing articles to text files if needed" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "with open('missing_plos_articles.txt', 'w') as file:\n", + " for item in sorted(set(missing_plos_articles)):\n", + " file.write(\"%s\\n\" % item)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "with open('missing_pmc_articles.txt', 'w') as file:\n", + " for item in sorted(set(missing_pmc_articles)):\n", + " file.write(\"%s\\n\" % item)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Count of articles by pubdate" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## How many articles published each day? month? year? For a period of time?" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "heading_collapsed": true + }, + "source": [ + "### Could consider making graphs of this..." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": { + "hidden": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[('Aug 2013', 2629), ('Dec 2013', 8), ('Jan 2014', 5), ('Jul 2013', 2627), ('Jun 2013', 2542), ('Jun 2014', 1), ('Mar 2014', 3), ('Mar 2015', 2), ('May 2013', 932), ('May 2014', 1), ('Nov 2013', 20), ('Oct 2013', 47), ('Sep 2013', 1183)]\n" + ] + } + ], + "source": [ + "import collections\n", + "counter = collections.Counter\n", + "\n", + "example_article = 'journal.pone.0012380.xml'\n", + "pubdate_list = []\n", + "article_files = listdir_nohidden(corpusdir)\n", + "pubdate_list = [get_article_pubdate(article_file) for article_file in listdir_nohidden(corpusdir)[90000:100000]]\n", + "# monthly_pubdate_list = [date.replace(day=1,hour=0,minute=0,second=0,microsecond=0) for date in pubdate_list]\n", + "monthly_pubdate_list = [date.strftime('%b %Y') for date in pubdate_list]\n", + "monthly_pubdate_list = sorted(monthly_pubdate_list)\n", + "pubdate_count = sorted(counter(monthly_pubdate_list).most_common())\n", + "print(pubdate_count)\n", + "# month_list = [x.strftime('%b %Y') for x[0] in pubdate_count]\n", + "# month_list = [x[0].strftime('%b %Y') for x in pubdate_count]" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": { + "hidden": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['May 2013', 'Jun 2013', 'Jul 2013', 'Aug 2013', 'Sep 2013', 'Oct 2013', 'Dec 2013']\n" + ] + } + ], + "source": [ + "month_list = [x[0].strftime('%b %Y') for x in pubdate_count]\n", + "print(month_list)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Count of articles published in each journal" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import collections\n", + "counter = collections.Counter\n", + "\n", + "journal_list = 
[]\n", + "for article_file in listdir_nohidden(corpusdir):\n", + " r = get_articleXML_content(corpusdir,\n", + " article_file,\n", + " tag_path_elements=[\"/\",\n", + " \"article\",\n", + " \"front\",\n", + " \"journal-meta\",\n", + " \"journal-title-group\",\n", + " \"journal-title\"])\n", + "\n", + " journal = r[0].text\n", + " journal_list.append(journal)\n", + "\n", + "print(len(set(journal_list)), 'PLOS journals found.')\n", + "journals_structured = counter(journal_list).most_common()\n", + "print(journals_structured)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "py3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.2" + }, + "toc": { + "colors": { + "hover_highlight": "#DAA520", + "navigate_num": "#000000", + "navigate_text": "#333333", + "running_highlight": "#FF0000", + "selected_highlight": "#FFD700", + "sidebar_border": "#EEEEEE", + "wrapper_background": "#FFFFFF" + }, + "moveMenuLeft": true, + "nav_menu": { + "height": "174px", + "width": "252px" + }, + "navigate_menu": true, + "number_sections": true, + "sideBar": true, + "threshold": 4, + "toc_cell": false, + "toc_section_display": "block", + "toc_window_display": false, + "widenNotebook": false + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/allofplos/jupyternb/Download PLOS article XML from journals.plos.org.ipynb b/allofplos/jupyternb/Download PLOS article XML from journals.plos.org.ipynb new file mode 100644 index 00000000..7788ea5e --- /dev/null +++ b/allofplos/jupyternb/Download PLOS article XML from journals.plos.org.ipynb @@ -0,0 +1,236 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For each article in articleerrors.txt, \n", + "* go to journals.plos.org[article] URL to grab 
the raw XML \n", + "* download the xml from that webpage \n", + "* write file name based on name of article \n", + "* save xml to file \n", + "* add time delay " + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "import lxml.etree as et\n", + "import os\n", + "import time" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First go through text list of XML files in pre-defined list articleerrors.txt, convert to Python list, and truncate characters so it fits the PLOS URL scheme. NOTE: journal name in prefix does not matter." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "with open(\"articleerrors.txt\",\"r\") as f:\n", + " article_list = [x[:-5] for x in f.readlines()]\n", + " article_list.pop(0)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "sample_article_list = article_list[350:360]" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "prefix = 'http://journals.plos.org/plosone/article/file?id=10.1371/'\n", + "suffix = '&type=manuscript'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For each article in the article list, grab the XML from the constructed URL, parse with etree, and save to new XML file. Counter for every 50 articles. 
Time delay added so as not to overwhelm server" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.1%\n", + "6.0%\n", + "11.9%\n", + "17.8%\n", + "23.7%\n", + "29.6%\n", + "35.5%\n", + "41.4%\n", + "47.4%\n", + "53.3%\n", + "59.2%\n", + "65.1%\n", + "71.0%\n", + "76.9%\n", + "82.8%\n", + "88.7%\n", + "94.6%\n" + ] + } + ], + "source": [ + "for i, article in enumerate(article_list):\n", + " url = prefix + article + suffix\n", + " articleXML = et.parse(url)\n", + " article_path = os.path.join(\"fixed_XML_articles\", article + \".xml\")\n", + " with open(article_path, 'w') as f:\n", + " f.write(et.tostring(articleXML, method = 'xml', encoding = 'unicode'))\n", + " if i%75 ==0:\n", + " print(\"{:.1%}\".format((i+1)/len(article_list)))\n", + " time.sleep(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# if __name__ == __main__:\n", + " # main()\n", + " # this allows you to use python your_file.py " + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1269" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "val = !ls fixed_XML_articles/\n", + "len(val)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "ename": "AttributeError", + "evalue": "'builtin_function_or_method' object has no attribute 'lower'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mstupidlist\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mget_ipython\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgetoutput\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'ls AllofPLOS_article_XML/'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mstupidlist\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrename\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlower\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m: 'builtin_function_or_method' object has no attribute 'lower'" + ] + } + ], + "source": [ + "import os\n", + "stupidlist = !ls AllofPLOS_article_XML/\n", + "for x in stupidlist:\n", + " os.rename.lower()" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "a = \"hi\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "py3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.1" + }, + "toc": { + "colors": { + "hover_highlight": "#DAA520", + "navigate_num": "#000000", + "navigate_text": "#333333", + "running_highlight": "#FF0000", + "selected_highlight": "#FFD700", + "sidebar_border": "#EEEEEE", + "wrapper_background": "#FFFFFF" + }, + "moveMenuLeft": true, + "nav_menu": { + "height": "12px", + "width": "252px" + }, + "navigate_menu": true, + "number_sections": true, + "sideBar": true, + "threshold": 4, + "toc_cell": 
false, + "toc_section_display": "block", + "toc_window_display": false, + "widenNotebook": false + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/allofplos/jupyternb/Monthly integrity check for PLOS corpus.ipynb b/allofplos/jupyternb/Monthly integrity check for PLOS corpus.ipynb new file mode 100644 index 00000000..c5a0b1d0 --- /dev/null +++ b/allofplos/jupyternb/Monthly integrity check for PLOS corpus.ipynb @@ -0,0 +1,148 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# What to check:\n", + "* Maybe as part of existing monthly maintenance?\n", + "* First test this on a small subset of articles\n", + "For each file in folder, make sure filename == relevant DOI field in XML\n", + " If so, pass_file_name_test is True\n", + " else pass_file_name_test is False\n", + "List of solr query DOIs == list of DOIs in article XML folder == list of DOIs in zip file\n", + " if DOIs in solr and not folder, download those from solr & add to folder & zip\n", + " and if it's that one messed-up article, only if it's been fixed\n", + " if it's been fixed, print note to remove this logic from the code\n", + " if DOIs in folder in solr, write those DOIs to error-list & txt file & email with warning\n", + " if no error proceed to XML content testing\n", + " if error print that content still needs to be checked\n", + " \n", + "Content of content-repo XML == Content of article folder XML == Content of zip file XML\n", + " if content in repo doesn't match article folder via https://bitbucket.org/ianb/formencode/src/tip/formencode/doctest_xml_compare.py?fileviewer=file-view-default#cl-70\n", + " if uncorrected proof vs vor_update, download vor_update\n", + " otherwise save diff and return error (or: preserve old version and make content-repo default and take diff via https://www.logilab.org/859 )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + 
"file_list = drive.ListFile({'q': \"'root' in parents and trashed=false\"}).GetList()\n", + "gdrive_zip_file = [item for item in file_list if item[\"id\"] == gd_id]\n", + "gdrive_zip_filename = (item for item in gdrive_zip_file['originalFilename'])\n", + "current_zipname = str(glob(prefix_zip_name+\"*.zip\")[0])\n", + "if gdrive_filename == current_zipname: \n", + " print(\"Zip file up-to-date on Google drive. No changes made.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import filecmp\n", + "import os" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "filecmp.cmp('test_file.txt', 'accman_to_check_list.txt', shallow=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "os.stat_result(st_mode=33188, st_ino=10556917, st_dev=16777220, st_nlink=1, st_uid=738185890, st_gid=984564325, st_size=903, st_atime=1490388647, st_mtime=1490388644, st_ctime=1490388644)" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "py3", + "language": "python", + "name": "py3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.1" + }, + "toc": { + "colors": { + "hover_highlight": "#DAA520", + "navigate_num": "#000000", + "navigate_text": "#333333", + "running_highlight": 
"#FF0000", + "selected_highlight": "#FFD700", + "sidebar_border": "#EEEEEE", + "wrapper_background": "#FFFFFF" + }, + "moveMenuLeft": true, + "nav_menu": { + "height": "30px", + "width": "252px" + }, + "navigate_menu": true, + "number_sections": true, + "sideBar": true, + "threshold": 4, + "toc_cell": false, + "toc_section_display": "block", + "toc_window_display": false, + "widenNotebook": false + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/allofplos/plos_pmc.py b/allofplos/plos_pmc.py new file mode 100644 index 00000000..3b2ff048 --- /dev/null +++ b/allofplos/plos_pmc.py @@ -0,0 +1,532 @@ +""" Small stand-alone script for getting all the PMC IDs for PLOS articles. +""" + +import requests +import time +import datetime +from glob import glob +from shutil import move, rmtree + +import lxml.etree as et +from download import download + +from plos_corpus import (listdir_nohidden, extract_filenames, check_article_type, get_article_xml, + get_related_article_doi, download_updated_xml, unzip_articles, get_all_solr_dois, + file_to_doi, doi_to_file, check_if_uncorrected_proof, newarticledir, get_article_pubdate, + compare_article_pubdate) +from plos_regex import (regex_match_prefix, regex_body_match, regex_body_currents, full_doi_regex_match, + full_doi_regex_search, currents_doi_regex, validate_doi, validate_file, + validate_url, find_valid_dois, show_invalid_dois, currents_doi_filter) + + +newpmcarticledir = "new_pmc_articles" +pmc_csv = 'doi_to_pmc.csv' +pmcdir = "pmc_articles/" +# xml URL takes PMC identifier minus 'PMC' +pmc_xml_url = 'https://www.ncbi.nlm.nih.gov/pmc/oai/oai.cgi?verb=GetRecord&identifier=oai:pubmedcentral.nih.gov:' +pmc_xml_url_suffix = '&metadataPrefix=pmc' + +# can query up to 200 DOIs from PMC +USER_EMAIL = 'elizabeth.seiver@gmail.com' +pmc_doi_query_url = 'https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?tool=corpustest&email={0}&ids='.format(USER_EMAIL) +pmc_doi_query_url_suffix = '&versions=no&format=json' 
+pmc_pmcid_query_url = 'https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi?id=' +pmc_allplos_query_url = ('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pmc&term=' + '(((((((("PLoS+ONE"[Journal])+OR+"PLoS+Genetics"[Journal])+OR+"PLoS+Pathogens"[Journal])' + 'OR+"PLoS+Neglected+Tropical+Diseases"[Journal])+OR+"PLoS+Computational+Biology"[Journal])' + 'OR+"PLoS+Biology"[Journal])+OR+"PLoS+Medicine"[Journal])+OR+"plos+currents"[Journal])' + '+OR+"PLoS+Clinical+Trials"[Journal])&retmax=1000&retmode=json&tool=corpustest' + '&email={0}'.format(USER_EMAIL)) +PMC_FTP_URL = 'ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/' +pmc_file_list = 'oa_file_list.txt' +newpmcarticledir = "new_pmc_articles" + +def get_all_pmc_dois(retstart=0, retmax=80000, count=None): + """Query the entrez database to get a comprehensive list of all PMCIDs associated with all PLOS journals, + individually included in the search url. + Supposedly can return 100,000, but based on the maximum not working for another function, lowered to 80K to be safe. 
+ :param restart: the first record to return + :param retmax: the maximum number of records to return + :return: the full list of PMCIDs in PMC for PLOS articles + """ + pmc_allplos_query_url = ('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pmc&term=' + '(((((("PLoS+ONE"[Journal])+OR+"PLoS+Genetics"[Journal])+OR+"PLoS+Pathogens"[Journal])' + 'OR+"PLoS+Neglected+Tropical+Diseases"[Journal])+OR+"PLoS+Computational+Biology"[Journal])' + 'OR+"PLoS+Biology"[Journal])+OR+"PLoS+Medicine"[Journal]+OR+"plos+currents"[Journal]' + '&retmode=json&tool=corpustest&email=email@provider.com') + + pmcidlist = [] + r = requests.get(pmc_allplos_query_url).json() + if count is None: + count = int(r['esearchresult']['count']) + print(count, "articles found in PMC") + while retstart < count: + query = pmc_allplos_query_url + '&retstart={0}&retmax={1}'.format(retstart, retmax) + r = requests.get(query).json() + idlist = r['esearchresult']['idlist'] + for id in idlist: + pmcidlist.append('PMC' + id) + retstart += retmax + time.sleep(1) + pmcidlist = sorted(list(set(pmcidlist))) + + print(len(pmcidlist), "articles found") + return pmcidlist + + +def get_pmc_doi_dict(doi_list, chunk_size=150): + '''Using the PMC ID query API, return the accompanying PMCID for each DOI in a given list. + Can (ostensibly) query up to 200 DOIs at a time but sometimes that doesn't work. 
+ :param doi list: a list of valid PLOS DOIs + :param chunk_size: number of DOIs to query at a single time + :return: tuple of dictionary mapping DOI to PMCID, list of DOIs not found in PMC + ''' + + doi_to_pmc = {} + dois_not_in_pmc = [] + # Make chunks of 200 DOIs at a time + list_chunks = [doi_list[x:x+chunk_size] for x in range(0, len(doi_list), chunk_size)] + for chunk in list_chunks: + pmc_doi_string = ','.join(chunk) + # Create the search URL + pmc_doi_query = pmc_doi_query_url + pmc_doi_string + # Parse the results & create dict entry for each result + pmc_response = requests.get(pmc_doi_query) + if pmc_response.status_code == 500: + print('Error for DOI chunk; retry with smaller chunk size') + else: + pmc_results = et.XML(pmc_response.content) + pmc_results = pmc_results.getchildren()[1:] # exclude echo header + for result in pmc_results: + doi = result.attrib['doi'] + try: + pmcid = result.attrib['pmcid'] + doi_to_pmc[doi] = pmcid + except KeyError: + if result.attrib['status'] == 'error': + dois_not_in_pmc.append(doi) + else: + print('Weird error for', doi) + time.sleep(1) + return doi_to_pmc, dois_not_in_pmc + + +def get_pmc_articles(): + """ + :return: a list of all article files in PMC folder + """ + # step 1: download tarball file if needed + pmc_url = 'ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/articles.O-Z.xml.tar.gz' + pmcdir = 'pmc_articles/' + pmc_local_tar = 'pmc_files.tar.gz' + pmc_path = os.path.join(pmcdir, pmc_local_tar) + if os.path.isdir(pmcdir) is False: + os.mkdir(pmcdir) + print('Creating folder for PMC article xml') + + if len([name for name in os.listdir(pmcdir) if os.path.isfile(os.path.join(pmcdir, name))]) < 200000: + print('Not enough articles in pmcdir, re-downloading zip file') + path = download(pmc_url, pmc_path) + + # Step 2: unzip archive + unzip_articles(file_path=pmc_path, extract_directory=pmcdir, filetype='tar') + + # Step 3: delete non-PLOS folders + listdirs = glob("pmc_articles/*/") + print(len(listdirs), "folders for all O-Z 
journals") + for directory in list(listdirs): + if directory.lower().startswith('pmc_articles/plos') is False: + rmtree(directory) + listdirs.remove(directory) + print(len(listdirs), "folders remaining for PLOS journals") + + # Step 4: put all PLOS articles in higher level pmcdir folder & flatten hierarchy + root = pmcdir + print("moving PMC articles to top-level folder") + for dirrr in list(listdirs): + files = [f for dp, dn, filenames in os.walk(dirrr) for f in filenames if os.path.splitext(f)[1] == '.nxml'] + for file in files: + move(join(dirrr, file), join(root, file)) + rmtree(dirrr) + pmc_articles = listdir_nohidden(pmcdir, extension='.nxml') + + return pmc_articles + + +def get_pmc_doi_dict(id_list=None, chunk_size=150): + ''' + Using the PMC ID query API, return the accompanying PMCID for each identifier in a given list. + Can (ostensibly) query up to 200 identifiers at a time. Can accept lists of DOIs or PMC IDs + :return: tuple of dictionary mapping DOI to PMCID, list of DOIs not found in PMC + ''' + if id_list is None: + id_list = extract_filenames(pmcdir, extension='.nxml') + doi_to_pmc = {} + dois_not_in_pmc = [] + # Make chunks of 200 DOIs at a time + list_chunks = [id_list[x:x+chunk_size] for x in range(0, len(id_list), chunk_size)] + for chunk in list_chunks: + pmc_doi_string = ','.join(chunk) + # Create the search URL + pmc_doi_query = pmc_doi_query_url + pmc_doi_string + # Parse the results & create dict entry for each result + pmc_response = requests.get(pmc_doi_query) + if pmc_response.status_code == 500: + print('Error for DOI chunk; retry with smaller chunk size') + else: + pmc_results = et.XML(pmc_response.content) + pmc_results = pmc_results.getchildren()[1:] # exclude echo header + for result in pmc_results: + doi = result.attrib['doi'] + try: + pmcid = result.attrib['pmcid'] + doi_to_pmc[doi] = pmcid + except KeyError: + if result.attrib['status'] == 'error': + dois_not_in_pmc.append(doi) + else: + print('Weird error for', doi) + 
time.sleep(1) + return doi_to_pmc, dois_not_in_pmc + + +def update_pmc_dict_by_doi(id_list): + ''' + With a list of identifiers, query PMC ID service to check for PMCIDs for articles. Print to .csv + :return: tuple of full dictionary of DOIs to PMC IDs, DOIs without matching PMCIDs + ''' + doi_to_pmc = get_articles_by_doi_field(check_new=False) + doi_to_pmc2, dois_not_in_pmc = get_pmc_doi_dict(id_list) + full_pmc_dict = {**doi_to_pmc2, **doi_to_pmc} + with open(pmc_csv, 'w') as file: + writer = csv.writer(file) + writer.writerow(['DOI', 'PMC ID']) + for key, value in full_pmc_dict.items(): + writer.writerow([key, value]) + return full_pmc_dict, dois_not_in_pmc + + +def exclude_recent_dois(doi_list): + ''' + For arriving at a list of DOIs ostensibly missing from PMC, remove the most recent articles + which likely have not yet had the opportunity to propagate. + :return: a list of missing DOIs which are old enough to be expected to be on PMC. + ''' + missing_pmc_articles = [] + for doi in doi_list: + article_file = doi_to_file(doi) + if compare_article_pubdate(article_file): + missing_pmc_articles.append(doi) + return missing_pmc_articles + + +def process_missing_plos_articles(plos_articles=None, pmc_articles=None): + ''' + For sets of PLOS's corpus from PMC and PLOS, see which article are missing from PLOS's version + of the Corpus by removing Currents articles, checking if articles are live on journals.plos.org, + and checking that the DOIs resolve. Prints the different kinds of errors that can occur. 
+ :return: list of missing articles + ''' + if plos_articles is None or not plos_articles: + plos_articles = get_all_plos_dois() + if pmc_articles is None or not pmc_articles: + doi_to_pmc = get_articles_by_doi_field(check_new=False) + pmc_articles = list(doi_to_pmc.keys()) + missing_plos_articles = list(set(pmc_articles) - set(plos_articles)) + + # remove Currents articles + for article in missing_plos_articles: + if article.startswith('10.1371/currents') or \ + len(article) == 21 or \ + article == '10.1371/198d344bc40a75f927c9bc5024279815': + missing_plos_articles.remove(article) + + # check if articles are live on journals.plos.org + # check if DOIs resolve + missing_articles_link_works = [] + missing_articles_404_error = [] + doi_works = [] + doi_doesnt_work = [] + doi_mismatch = [] + doi_has_space = [] + for doi in missing_plos_articles: + if ' ' in doi: + doi_has_space.append(doi) + continue + doi_check = check_if_doi_resolves(doi) + if doi_check == 'works': + doi_works.append(doi) + elif doi_check == "doesn't work": + doi_doesnt_work.append(doi) + else: + doi_mismatch.append(doi) + continue + url = doi_to_url(doi) + article_exists = check_if_link_works(url) + if article_exists: + missing_articles_link_works.append(doi) + else: + missing_articles_404_error.append(doi) + + doi_mismatch = sorted(doi_mismatch) + link404_invalid_doi = sorted(list(set(missing_articles_404_error).intersection(doi_doesnt_work))) + linkworks_valid_doi = sorted(list(set(missing_articles_link_works).intersection(doi_works))) + + if doi_has_space: + print('\033[1m' + 'PMC DOI fields with spaces in them:') + for doi in doi_has_space: + print('\033[0m' + '"' + doi + '" \n') + if linkworks_valid_doi: + print('\033[1m' + 'Working articles that need to be re-indexed on Solr:') + print('\033[0m' + '\n'.join(linkworks_valid_doi), '\n') + if link404_invalid_doi: + print('\033[1m' + 'Articles on PMC but not on solr or journals:') + print('\033[0m' + '\n'.join(missing_articles_404_error), '\n') + 
def process_missing_pmc_articles(pmc_articles=None, plos_articles=None):
    '''
    For sets of PLOS's corpus from PMC and PLOS, see which articles are missing from PMC's version
    of the Corpus by updating the PMCID:DOI mapping document, removing articles too recent to be indexed
    (pubdate less than 3 weeks ago), and excluding uncorrected proofs.

    :param pmc_articles: DOIs already known to be on PMC; when None, the keys of the mapping
        returned by `get_articles_by_doi_field(check_new=False)` are used
    :param plos_articles: DOIs in the PLOS corpus; when None, `get_all_plos_dois()` is used
    :return: list of missing articles from PMC
    '''
    if pmc_articles is None:
        doi_to_pmc = get_articles_by_doi_field(check_new=False)
        pmc_articles = list(doi_to_pmc.keys())

    if plos_articles is None:
        plos_articles = get_all_plos_dois()
    missing_pmc_dois = list(set(plos_articles) - set(pmc_articles))

    # Everything below only makes sense when there is at least one candidate;
    # guarding the whole section also keeps `dois_not_in_pmc` from being read
    # before it has been assigned.
    if missing_pmc_dois:
        # Query for PMC updates & update DOI-to-PMCID dictionary
        _full_pmc_dict, dois_not_in_pmc = update_pmc_dict_by_doi(missing_pmc_dois)

        # Exclude PLOS Medicine quizzes. Build a new list rather than calling
        # .remove() on the list being iterated, which silently skips the
        # element that follows every removed one.
        non_quiz_dois = []
        for doi in dois_not_in_pmc:
            if "pmed" in doi and get_plos_article_type(doi_to_article(doi)) == 'Quiz':
                continue
            non_quiz_dois.append(doi)
        dois_not_in_pmc = non_quiz_dois

        # Remove articles too recent to have been indexed on PMC
        # NOTE(review): when dois_not_in_pmc is empty the original set difference
        # is kept as-is (unchanged from the previous behaviour) -- confirm that
        # this is intended rather than "nothing is missing".
        if dois_not_in_pmc:
            missing_pmc_dois = exclude_recent_dois(dois_not_in_pmc)

        # Remove uncorrected proofs
        missing_pmc_dois = [doi for doi in missing_pmc_dois
                            if not check_if_uncorrected_proof(doi_to_file(doi))]

        # Make sure that the DOI resolves
        resolving_dois = []
        for doi in missing_pmc_dois:
            resolves = check_if_doi_resolves(doi)
            if resolves != "works":
                print('DOI not working for this PLOS DOI:', doi, resolves)
            else:
                resolving_dois.append(doi)
        missing_pmc_dois = resolving_dois

    if len(missing_pmc_dois) == 0:
        print('No PMC articles missing.')
    else:
        for doi in missing_pmc_dois:
            if ' ' in doi:
                print('There is a space in this DOI: ' + '"' + doi + '"')
        print('\033[1m' + 'Articles missing from PMC:')
        print('\033[0m' + '\n'.join(sorted(missing_pmc_dois)), '\n')

    return missing_pmc_dois
def update_local_pmc_from_remote():
    '''
    Using the current set of articles indexed live on PMC, compare them to the locally maintained index.
    If any of them are missing, download them to the local .csv dictionary.

    :return: full dictionary of PMC IDs (the mapping returned by `update_pmc_dict_by_doi` when
        something was missing locally, otherwise the local mapping unchanged)
    '''
    remote_pmc_ids = get_all_pmc_dois()
    local_pmc_dict = get_articles_by_doi_field()
    local_pmc_ids = list(local_pmc_dict.values())
    missing_pmcids = list(set(remote_pmc_ids) - set(local_pmc_ids))
    if missing_pmcids:
        # NOTE(review): update_pmc_dict_by_doi is handed PMC IDs here, not DOIs --
        # confirm that helper accepts them.
        full_pmc_dict, _dois_not_in_pmc = update_pmc_dict_by_doi(missing_pmcids)
    else:
        # Nothing to fetch: the local mapping already is the full mapping.
        # (Previously this read `doi_to_pmc`, a name never assigned in this function.)
        full_pmc_dict = local_pmc_dict
    weird_pmc_ids = list(set(local_pmc_ids) - set(remote_pmc_ids))
    # Compare the *number* of stray IDs, not the list itself, against the bounds;
    # very large differences are not printed.
    if 0 < len(weird_pmc_ids) < 10000:
        print("Some articles on local not on remote:", weird_pmc_ids)
    return full_pmc_dict
def download_pmc_article_xml(missing_pmc_articles=None, pmc_urls=None):
    """
    Get missing PMC articles. Get dictionary mapping them to partial URLs. Download and unzip the tarballs.
    Keep and rename the nxml files and delete the others.
    NOTE: This hasn't worked very well. PMC connections are unreliable & there are a lot of timeouts.

    :param missing_pmc_articles: PMC IDs to download; when None they are computed with
        `get_needed_pmc_articles()`
    :param pmc_urls: mapping of PMC ID to partial download URL; when None it is built with
        `get_pmc_article_zip_links()`
    :return: list of files downloaded from PMC
    """
    new_pmc_articles = []
    doi_to_pmc = None
    if missing_pmc_articles is None:
        doi_to_pmc, missing_pmc_articles = get_needed_pmc_articles()
    print(len(missing_pmc_articles), "PMC articles to download.")
    if not missing_pmc_articles:
        return new_pmc_articles

    if pmc_urls is None:
        pmc_urls = get_pmc_article_zip_links()
    if doi_to_pmc is None:
        # The caller supplied the article list, so the DOI-to-PMCID mapping has not
        # been loaded yet; it is needed below to validate the extracted files.
        doi_to_pmc = get_articles_by_doi_field(check_new=False)

    # download and unzip tarballs
    for article in missing_pmc_articles:
        partial_url = pmc_urls.get(article)
        if partial_url is None:
            # Not every PMC ID is listed in the open-access file list; skip instead of crashing.
            print('No PMC download link found for', article)
            continue
        dl_url = PMC_FTP_URL + partial_url
        filename = partial_url.split("/")[3]
        local_file = os.path.join(newpmcarticledir, filename)
        if os.path.isfile(local_file) is False:
            try:
                download(dl_url, local_file)
                unzip_articles(directory=newpmcarticledir, filetype='tar', file=filename)
            except RuntimeError:
                print('Error downloading', article)
                continue

    # get rid of non-.nxml files
    extracted_pattern = os.path.join(newpmcarticledir, '*', '*')
    for extracted_file in glob.glob(extracted_pattern):
        if not extracted_file.endswith('.nxml'):
            os.remove(extracted_file)

    # move and process the nxml files
    for old_file in glob.glob(extracted_pattern):
        # make sure directory and linked PMC ID line up
        article_dir = os.path.dirname(old_file)
        directory = os.path.basename(article_dir)
        linked_pmcid = doi_to_pmc.get(get_article_doi(article_file=old_file))
        if linked_pmcid == directory:
            # rename file from directory & move to higher level directory
            new_file = article_dir + '.nxml'
            shutil.move(old_file, new_file)
            new_pmc_articles.append(new_file)
        else:
            print('error:', linked_pmcid, directory)

    # remove the now-empty per-article directories; a directory that still holds a
    # mismatched file is reported and left in place instead of aborting the run
    for directory in glob.glob(os.path.join(newpmcarticledir, '*', '')):
        try:
            os.rmdir(directory)
        except OSError:
            print('Could not remove directory', directory)

    return new_pmc_articles
def refactor_notebook_inplace(rt, path):
    """
    Run 2to3 over every code cell of the notebook at `path` and write the result back in place.

    :param rt: a lib2to3 `RefactoringTool` used to refactor each cell's source
    :param path: path of the notebook file (read and then overwritten)
    """

    def refactor_cell(src, index):
        # Cells that 2to3 cannot parse are returned unchanged.
        try:
            tree = rt.refactor_string(src + '\n', str(path) + '/cell-%d' % index)
        except (lib2to3.pgen2.parse.ParseError,
                lib2to3.pgen2.tokenize.TokenError):
            return src
        else:
            # drop the trailing newline that was appended for the parser
            return str(tree)[:-1]

    print("Refactoring:", path)
    nb = read(str(path), as_version=4)

    # Run 2to3 on code
    for i, cell in enumerate(nb.cells, start=1):
        if cell.cell_type != 'code':
            continue
        if cell.execution_count in (' ', '*'):
            cell.execution_count = None

        if cell.source.startswith('%%'):
            # For cell magics, try to refactor the body, in case it's
            # valid python. A magic cell consisting of a single line has
            # no body, so it is left untouched (unpacking a two-way split
            # would raise ValueError there).
            head, newline, body = cell.source.partition('\n')
            if newline:
                cell.source = head + '\n' + refactor_cell(body, i)
        else:
            cell.source = refactor_cell(cell.source, i)

    # Update notebook metadata
    nb.metadata.kernelspec = {
        'display_name': 'Python 3',
        'name': 'python3',
        'language': 'python',
    }
    if 'language_info' in nb.metadata:
        nb.metadata.language_info.codemirror_mode = {
            'name': 'ipython',
            'version': 3,
        }
        nb.metadata.language_info.pygments_lexer = 'ipython3'
        nb.metadata.language_info.pop('version', None)

    write(nb, str(path))
def get_articles_by_doi_field(directory=pmcdir, article_list=None, check_new=True):
    """
    Build a dictionary keyed by article DOI.

    For `directory == pmcdir` the DOI-to-PMCID mapping is read from the `pmc_csv` file
    (created from the .nxml files in `pmcdir` if the csv does not exist yet), optionally
    extended with articles that are not in it, and written back when it changed.
    For any other directory the mapping is DOI -> article file as listed.

    :param directory: directory whose articles are mapped; defaults to `pmcdir`
    :param article_list: articles to consider; when None they are listed from `directory`
    :param check_new: when True, add articles missing from the csv-backed mapping
    :return: dictionary mapping DOI to PMC ID (or to the article file for non-PMC directories)
    """
    doi_to_pmc = {}
    if directory == pmcdir and article_list is None:
        article_list = get_pmc_articles()
    elif article_list is None:
        article_list = listdir_nohidden(directory)
        # fall back to .nxml files when the default listing finds nothing
        # (the previous `== 0` test could never be true for a list)
        if not article_list:
            article_list = listdir_nohidden(directory, extension='.nxml')

    if directory != pmcdir:
        for article in article_list:
            doi_to_pmc[get_article_doi(article_file=article)] = article
        return doi_to_pmc

    scratch = False
    n = 0
    try:
        # read doi_to_pmc dict from csv
        with open(pmc_csv, 'r', newline='') as csv_file:
            reader = csv.reader(csv_file)
            next(reader, None)  # skip the header row
            # ignore blank/short rows instead of failing in dict()
            doi_to_pmc = {row[0]: row[1] for row in reader if len(row) >= 2}
    except FileNotFoundError:
        print('Creating doi_to_pmc dictionary from scratch.')
        scratch = True
        file_list = listdir_nohidden(pmcdir, extension='.nxml')
        doi_to_pmc = {get_article_doi(pmc_file): os.path.splitext(os.path.basename(pmc_file))[0]
                      for pmc_file in file_list}
    else:
        if check_new:
            known_pmcids = set(doi_to_pmc.values())
            for article in article_list:
                # os.path.splitext removes exactly one extension; str.rstrip('.nxml')
                # strips any trailing run of the characters '.', 'n', 'x', 'm', 'l'.
                pmcid = os.path.splitext(os.path.basename(article))[0]
                # NOTE(review): membership is tested on the derived PMC ID so entries given
                # as file names/paths are recognised too -- confirm what get_pmc_articles() returns.
                if pmcid not in known_pmcids:
                    doi_to_pmc[get_article_doi(article)] = pmcid
                    known_pmcids.add(pmcid)
                    n = n + 1
            if n:
                print(n, 'DOI/PMCID pairs added to dictionary.')

    # write doi_to_pmc dict to csv
    if scratch or n > 0:
        with open(pmc_csv, 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(['DOI', 'PMC ID'])
            # iterate key/value pairs; iterating the dict itself yields only keys
            for key, value in doi_to_pmc.items():
                writer.writerow([key, value])
        print('DOI, PMC ID list exported to', pmc_csv)

    return doi_to_pmc
From 749b61b0427b1a03283beda8d93a4cc84ba5fe6e Mon Sep 17 00:00:00 2001 From: Elizabeth Seiver Date: Fri, 13 Oct 2017 15:16:19 -0700 Subject: [PATCH 06/24] pmc fix --- allofplos/samples/corpus_analysis.py | 1 + 1 file changed, 1 insertion(+) diff --git a/allofplos/samples/corpus_analysis.py b/allofplos/samples/corpus_analysis.py index 3ab1a0df..353291a8 100644 --- a/allofplos/samples/corpus_analysis.py +++ b/allofplos/samples/corpus_analysis.py @@ -22,6 +22,7 @@ counter = collections.Counter corpusdir = 'allofplos_xml' +pmcdir = "pmc_articles" max_invalid_files_to_print = 100 From b0d70bf1013bdcf6e20ad86f855b92ef73ccfef8 Mon Sep 17 00:00:00 2001 From: Elizabeth Seiver Date: Mon, 16 Oct 2017 00:51:34 -0700 Subject: [PATCH 07/24] + joey aperta query --- allofplos/Production team investigates.ipynb | 201 +++++++++++++++++-- 1 file changed, 182 insertions(+), 19 deletions(-) diff --git a/allofplos/Production team investigates.ipynb b/allofplos/Production team investigates.ipynb index 2f898f00..1e35c340 100644 --- a/allofplos/Production team investigates.ipynb +++ b/allofplos/Production team investigates.ipynb @@ -3,12 +3,14 @@ { "cell_type": "code", "execution_count": 2, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "from plos_corpus import *\n", "from samples.corpus_analysis import *\n", - "corpusdir = 'allofplos_xml'" + "corpusdir_prod = '../../allofplos/allofplos/allofplos_xml/'" ] }, { @@ -21,7 +23,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "def make_annotation_dict(save_output=True):\n", @@ -48,7 +52,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "# run this\n", @@ -67,7 +73,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "def get_tina_test_set():\n", 
@@ -156,7 +164,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "# test this function\n", @@ -167,7 +177,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "print(doi_results)" @@ -176,7 +188,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "# run this function for real\n", @@ -193,7 +207,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "corrections_article_list, corrected_article_list = get_corrected_article_list()\n", @@ -204,7 +220,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "multiple_corrections.remove('10.1371/journal.')\n", @@ -215,7 +233,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "multiple_corrections_post_2015\n", @@ -236,7 +256,9 @@ { "cell_type": "code", "execution_count": 190, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "example_doi = '10.1371/journal.pone.0068090'\n", @@ -319,7 +341,9 @@ { "cell_type": "code", "execution_count": 197, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "table_results = []\n", @@ -341,7 +365,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "for article_file in listdir_nohidden(corpusdir)[180000:180010]:\n", @@ -349,16 +375,140 @@ ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "# Which Aperta articles have a group collaboration 
contributor element?" + ] + }, + { + "cell_type": "markdown", "metadata": {}, + "source": [ + "## Example: 10.1371/journal.pmed.1002170\n", + "\n", + "\n", + "International Ebola Response Team\n", + "\n", + "\n", + "\n", + "\n", + "

\n", + "¶ The International Ebola Response Team comprises the authors listed in this article in alphabetical order\n", + "

\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "collapsed": true + }, "outputs": [], - "source": [] + "source": [ + " def get_article_collab(doi, corpusdir=corpusdir_prod):\n", + " \"\"\"\n", + " For a given PLOS article, see if there is a collaborator group in the authors list. Print data if so\n", + " :return: tuple of doi, collaborators, and the footnote number if so\n", + " \"\"\"\n", + " tag_path_elements = ('/',\n", + " 'article',\n", + " 'front',\n", + " 'article-meta')\n", + " article_xml = get_article_xml(doi_to_file(doi, directory=corpusdir), tag_path_elements=tag_path_elements)\n", + " meta_categories = article_xml[0].getchildren()\n", + " contrib_groups = [category for category in meta_categories if category.tag == 'contrib-group']\n", + " collab = False\n", + " rid = ''\n", + " footnote = False\n", + " collab_tuple = ''\n", + " try:\n", + " for contrib_group in contrib_groups:\n", + " for contrib in contrib_group:\n", + " if contrib.attrib['contrib-type'] == 'author':\n", + " for child in contrib:\n", + " if child.tag == \"collab\":\n", + " collab = True\n", + " collaborators = child.text\n", + " continue\n", + " if child.tag == 'role':\n", + " continue\n", + " elif child.tag == 'xref':\n", + " rid = (child.attrib['rid'])\n", + " if collab and rid:\n", + " break\n", + "\n", + " except IndexError:\n", + " print('No authors found for {}'.format(doi))\n", + " return False\n", + "\n", + " if collab and rid:\n", + " tag_path_elements = ('/',\n", + " 'article',\n", + " 'front',\n", + " 'article-meta',\n", + " 'author-notes')\n", + "\n", + " article_xml = get_article_xml(doi_to_file(doi, directory=corpusdir), tag_path_elements=tag_path_elements)\n", + " notes = article_xml[0].getchildren()\n", + " for note in notes:\n", + " if note.tag == 'fn' and rid in note.attrib.values():\n", + " footnote = True\n", + " if footnote is False:\n", + " print('footnote not found for {}'.format(doi))\n", + "\n", + " collab_tuple = (doi, 
collaborators, rid)\n", + "\n", + " elif collab:\n", + " print('rid not found for {}'.format(doi))\n", + "\n", + " if collab_tuple:\n", + " print(collab_tuple)\n", + "\n", + " return collab_tuple" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Restrict to PLOS Biology Aperta articles\n", + "article_list = [article for article in listdir_nohidden(corpusdir_prod) if 'pbio.2' in article] \n", + "doi_list = [file_to_doi(article) for article in article_list]\n", + "doi_list.append('10.1371/journal.pmed.1002170')" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "('10.1371/journal.pbio.2001069', 'CycliX consortium', 'fn001')\n", + "('10.1371/journal.pbio.2001855', 'BEEHIVE collaboration', 'fn001')\n", + "('10.1371/journal.pmed.1002170', 'International Ebola Response Team', 'fn001')\n" + ] + } + ], + "source": [ + "for doi in doi_list:\n", + " get_article_collab(doi)" + ] } ], "metadata": { "kernelspec": { - "display_name": "py3", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -375,17 +525,30 @@ "version": "3.6.2" }, "toc": { + "colors": { + "hover_highlight": "#DAA520", + "navigate_num": "#000000", + "navigate_text": "#333333", + "running_highlight": "#FF0000", + "selected_highlight": "#FFD700", + "sidebar_border": "#EEEEEE", + "wrapper_background": "#FFFFFF" + }, + "moveMenuLeft": true, "nav_menu": { "height": "12px", "width": "252px" }, + "navigate_menu": true, "number_sections": true, "sideBar": true, "skip_h1_title": false, + "threshold": 4, "toc_cell": false, "toc_position": {}, "toc_section_display": "block", - "toc_window_display": false + "toc_window_display": false, + "widenNotebook": false } }, "nbformat": 4, From 9b97481cd537653e74a609b106c634ffd1d66af0 Mon Sep 17 00:00:00 2001 From: Elizabeth Seiver Date: Mon, 16 Oct 2017 18:19:25 
-0700 Subject: [PATCH 08/24] updated notebooks plus new ones --- allofplos/PLOS Medicine Colleen and NLP.ipynb | 116 +++++++ allofplos/PLOS Medicine and NLP.ipynb | 42 +++ allofplos/Production team investigates.ipynb | 311 +++++++++++------- 3 files changed, 359 insertions(+), 110 deletions(-) create mode 100644 allofplos/PLOS Medicine Colleen and NLP.ipynb create mode 100644 allofplos/PLOS Medicine and NLP.ipynb diff --git a/allofplos/PLOS Medicine Colleen and NLP.ipynb b/allofplos/PLOS Medicine Colleen and NLP.ipynb new file mode 100644 index 00000000..1edb75ae --- /dev/null +++ b/allofplos/PLOS Medicine Colleen and NLP.ipynb @@ -0,0 +1,116 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import datetime\n", + "import lxml.etree as et\n", + "import csv\n", + "pmcdir = 'pmc_articles'\n", + "from plos_corpus import (corpusdir, get_article_pubdate, check_if_uncorrected_proof, listdir_nohidden,\n", + " get_article_xml, file_to_doi, doi_to_file, get_all_solr_dois, download_check_and_move)\n", + "\n", + "from samples.corpus_analysis import (get_plos_article_type, get_article_dtd, get_random_list_of_dois, \n", + " get_related_retraction_article, check_article_type, get_plos_journal,\n", + " get_article_title, parse_article_date, get_corpus_metadata,\n", + " get_article_abstract, corpus_metadata_to_csv, get_article_dates,\n", + " read_corpus_metadata_from_csv)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This notebook is for tracking Colleen Crangle's requests for test corpuses for NLP. First up is all PLOS ONE articles for which \"diabet\" (for diabetes, diabetic, etc) appears in the abstract." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "one_list = [article for article in listdir_nohidden(corpusdir) if 'pone' in article]\n", + "\n", + "def assemble_diabetes_corpus(article_list):\n", + " \"\"\"\n", + " Find all PLOS ONE articles that say something about diabetes or technology in the abstract.\n", + " \"\"\"\n", + " diabetes_article_list = [article for article in article_list if 'diabet' in get_article_abstract(article).lower()]\n", + " return diabetes_article_list" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "diabetes_article_list = assemble_diabetes_corpus(one_list)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "diabetes_metadata = get_corpus_metadata(article_list=diabetes_article_list)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "corpus_metadata_to_csv(diabetes_metadata)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "py3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.3" + }, + "toc": { + "nav_menu": { + "height": "12px", + "width": "252px" + }, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "toc_cell": false, + "toc_position": {}, + "toc_section_display": "block", + "toc_window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/allofplos/PLOS Medicine and NLP.ipynb b/allofplos/PLOS Medicine and NLP.ipynb new file mode 100644 index 00000000..b0c90608 
--- /dev/null +++ b/allofplos/PLOS Medicine and NLP.ipynb @@ -0,0 +1,42 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "py3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.3" + }, + "toc": { + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "toc_cell": false, + "toc_position": {}, + "toc_section_display": "block", + "toc_window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/allofplos/Production team investigates.ipynb b/allofplos/Production team investigates.ipynb index 1e35c340..6213b793 100644 --- a/allofplos/Production team investigates.ipynb +++ b/allofplos/Production team investigates.ipynb @@ -2,10 +2,8 @@ "cells": [ { "cell_type": "code", - "execution_count": 2, - "metadata": { - "collapsed": true - }, + "execution_count": 1, + "metadata": {}, "outputs": [], "source": [ "from plos_corpus import *\n", @@ -15,7 +13,9 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "heading_collapsed": true + }, "source": [ "# Q: Are annotation DOIs resolving correctly?" ] @@ -24,7 +24,8 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true + "collapsed": true, + "hidden": true }, "outputs": [], "source": [ @@ -53,7 +54,8 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true + "collapsed": true, + "hidden": true }, "outputs": [], "source": [ @@ -64,7 +66,8 @@ { "cell_type": "markdown", "metadata": { - "collapsed": true + "collapsed": true, + "heading_collapsed": true }, "source": [ "# Q: Which `` elements follow a certain pattern?" 
@@ -72,9 +75,9 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": { - "collapsed": true + "hidden": true }, "outputs": [], "source": [ @@ -83,9 +86,9 @@ " Return a list of DOIs good for Tina's function\n", " \"\"\"\n", " random_list_of_dois = get_random_list_of_dois(count=10)\n", - " random_list_of_articles = [doi_to_file(doi) for doi in random_list_of_dois if 'annotation' not in doi]\n", + " random_list_of_articles = [doi_to_path(doi) for doi in random_list_of_dois if 'annotation' not in doi]\n", " search_1_dois = ('10.1371/journal.pmed.1002035', '10.1371/journal.pone.0047559', '10.1371/journal.pone.0047944')\n", - " search_1_articles = [doi_to_file(doi) for doi in search_1_dois]\n", + " search_1_articles = [doi_to_path(doi) for doi in search_1_dois]\n", " search_test_set = list(set(random_list_of_articles + search_1_articles))\n", " return search_test_set\n", "\n", @@ -121,15 +124,15 @@ " for contributor in contrib_group:\n", " for element in contributor:\n", " if element.tag == 'on-behalf-of':\n", - " search_1_results.append(file_to_doi(article_file))\n", + " search_1_results.append(filename_to_doi(article_file))\n", " next_element = contributor.getnext()\n", " if next_element is not None:\n", " for elem in next_element:\n", " if elem.tag == 'collab':\n", - " search_2_results.append(file_to_doi(article_file))\n", + " search_2_results.append(filename_to_doi(article_file))\n", " for subelem in elem:\n", " if subelem.tag == 'contrib-group':\n", - " search_3_results.append(file_to_doi(article_file))\n", + " search_3_results.append(filename_to_doi(article_file))\n", " break\n", "\n", " search_1_results = set(search_1_results)\n", @@ -163,11 +166,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": { - "collapsed": true + "hidden": true }, - "outputs": [], + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'get_random_list_of_dois' is not defined", + "output_type": "error", + 
"traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# test this function\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mtest_list\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_tina_test_set\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mdoi_results\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfind_contrib_pattern\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marticle_list\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtest_list\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcsv\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m\u001b[0m in \u001b[0;36mget_tina_test_set\u001b[0;34m()\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mReturn\u001b[0m \u001b[0ma\u001b[0m \u001b[0mlist\u001b[0m \u001b[0mof\u001b[0m \u001b[0mDOIs\u001b[0m \u001b[0mgood\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mTina\u001b[0m\u001b[0;31m'\u001b[0m\u001b[0ms\u001b[0m \u001b[0mfunction\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \"\"\"\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mrandom_list_of_dois\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_random_list_of_dois\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcount\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0mrandom_list_of_articles\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mdoi_to_path\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdoi\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mdoi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrandom_list_of_dois\u001b[0m \u001b[0;32mif\u001b[0m 
\u001b[0;34m'annotation'\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mdoi\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0msearch_1_dois\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;34m'10.1371/journal.pmed.1002035'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'10.1371/journal.pone.0047559'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'10.1371/journal.pone.0047944'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mNameError\u001b[0m: name 'get_random_list_of_dois' is not defined" + ] + } + ], "source": [ "# test this function\n", "test_list = get_tina_test_set()\n", @@ -178,7 +194,8 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true + "collapsed": true, + "hidden": true }, "outputs": [], "source": [ @@ -189,7 +206,8 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true + "collapsed": true, + "hidden": true }, "outputs": [], "source": [ @@ -199,7 +217,9 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "heading_collapsed": true + }, "source": [ "# Q: Which articles after 2015 have 2 or more corrections attached?" ] @@ -208,7 +228,8 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true + "collapsed": true, + "hidden": true }, "outputs": [], "source": [ @@ -221,7 +242,8 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true + "collapsed": true, + "hidden": true }, "outputs": [], "source": [ @@ -234,7 +256,8 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true + "collapsed": true, + "hidden": true }, "outputs": [], "source": [ @@ -248,29 +271,28 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "heading_collapsed": true + }, "source": [ - "# Q: Which articles have a series of table-wrap elements?" + "# Q: Which articles have a series of table-wrap graphic elements?" 
] }, { "cell_type": "code", - "execution_count": 190, + "execution_count": 5, "metadata": { - "collapsed": true + "hidden": true }, "outputs": [], "source": [ "example_doi = '10.1371/journal.pone.0068090'\n", "search_1_file = 'xml_testing/Search-1_TRUE.xml'\n", "search_2_file = 'xml_testing/Search-2_TRUE.xml'\n", - "intro_file = doi_to_file(example_doi)\n", - "fail_file = doi_to_file('10.1371/journal.pone.0182980')\n", - "test_list = [intro_file, search_1_file, search_2_file, fail_file]\n", - "\n", - "intro_condition = []\n", - "search_1 = []\n", - "search_2 = []\n", + "intro_file = doi_to_path(example_doi, directory=corpusdir_prod)\n", + "search_1_fail_list = []\n", + "fail_file = doi_to_path('10.1371/journal.pone.0183466', directory=corpusdir_prod)\n", + "test_list = [fail_file, intro_file, search_1_file, search_2_file]\n", "\n", "def find_table_wraps(article):\n", " \"\"\"\n", @@ -278,106 +300,182 @@ " 'alternative' in table\n", " \"\"\"\n", " intro_condition = False\n", - " search_1 = False\n", - " search_2 = False\n", + " intro_condition_overall = False\n", + " search1_ids = []\n", + " search2_ids = []\n", + " alternative_graphic_ids = []\n", "\n", - " article_tree = et.parse(article)\n", + " article_tree = et.parse(article, parser=et.XMLParser(remove_comments=True)) # exclude commented-out tables\n", " table_wraps = article_tree.findall('.//table-wrap')\n", " if table_wraps:\n", " for table_wrap in table_wraps:\n", - " try:\n", - " if all('alternatives' not in table_part.tag for table_part in table_wrap) and \\\n", - " all('graphic' not in table_part.tag for table_part in table_wrap):\n", - " intro_condition = True\n", - " except TypeError:\n", - " # this is an imperfect work-around. 
if alternatives were a sub-sub-element,\n", - " # it would be incorrectly excluded from intro_\n", - " alternatives = table_wrap.findall('.//alternatives')\n", - " if alternatives == 0:\n", - " intro_condition = True\n", + " table_parts = table_wrap.getchildren()\n", + " # intro condition 1: table-wrap element does not include a direct child of \n", + " alternatives_parts = [table_part for table_part in table_parts if 'alternatives' in table_part.tag]\n", + " if not alternatives_parts:\n", + " intro_condition_1 = True\n", + " else:\n", + " for table_part in alternatives_parts:\n", + " table_subparts = table_part.getchildren()\n", + " if all('graphic' not in table_subpart.tag for table_subpart in table_subparts):\n", + " intro_condition_1 = True\n", + " else:\n", + " intro_condition_1 = False\n", + " new_alternative_graphic_ids = [table_subpart.attrib['id'] for table_subpart in table_subparts if 'graphic' in table_subpart.tag]\n", + " alternative_graphic_ids.extend(new_alternative_graphic_ids)\n", + "\n", + " # intro condition 2: table-wrap element does not include a direct child of \n", + " if all('graphic' not in table_part.tag for table_part in table_parts):\n", + " intro_condition_2 = True\n", + " else:\n", + " intro_condition_2 = False\n", + " \n", + " if intro_condition_1 and intro_condition_2:\n", + " intro_condition = True\n", + " # keep track of articles that have any table match intro condition\n", + " intro_condition_overall = True\n", + "\n", " if intro_condition:\n", - " danger = table_wrap.findall('.//graphic')\n", - " if danger:\n", - " search_1 = True\n", - " danger2 = table_wrap.findall('.//inline-graphic')\n", - " if danger2:\n", - " search_2 = True\n", + " graphics = table_wrap.findall('.//graphic')\n", + " if graphics:\n", + " new_search1_ids = [graphic.attrib['id'] for graphic in graphics]\n", + " search1_ids.extend(new_search1_ids)\n", + " inline_graphics = table_wrap.findall('.//inline-graphic')\n", + " if inline_graphics:\n", + " try:\n", + " 
search2_ids = [inline.attrib['id'] for inline in inline_graphics]\n", + " except KeyError:\n", + " print('{} has search 2 results but no ids: {}'.format(article, inline_graphics))\n", + " search2_ids = [inline.attrib for inline in inline_graphics]\n", " else:\n", " pass\n", - " \n", - "# for table_part in table_parts:\n", - "# if 'alternatives' in table_part.tag:\n", - "# print('alternatives')\n", "\n", " else:\n", - " pass\n", + " return intro_condition_overall, bool(search1_ids), bool(search2_ids)\n", "\n", - " return intro_condition, search_1, search_2\n" + " if search1_ids and alternative_graphic_ids:\n", + " # exclude graphics elements that are already accounted for under an tag\n", + " search1_ids = [did for did in search1_ids if did not in alternative_graphic_ids]\n", + " if not search1_ids:\n", + " search1_ids = False\n", + " elif len(search1_ids) == 1:\n", + " search1_ids = search1_ids[0]\n", + " if not search2_ids:\n", + " search2_ids = False\n", + " elif len(search2_ids) == 1:\n", + " search2_ids = search2_ids[0]\n", + " return intro_condition_overall, search1_ids, search2_ids\n" ] }, { "cell_type": "code", - "execution_count": 196, - "metadata": {}, + "execution_count": 6, + "metadata": { + "code_folding": [], + "hidden": true + }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "allofplos_xml/journal.pone.0068090.xml True False False\n", - "xml_testing/Search-1_TRUE.xml True True False\n", - "xml_testing/Search-2_TRUE.xml True True True\n", - "allofplos_xml/journal.pone.0182980.xml False False False\n" + "../../allofplos/allofplos/allofplos_xml/journal.pone.0183466.xml False False False\n", + "../../allofplos/allofplos/allofplos_xml/journal.pone.0068090.xml True False False\n", + "xml_testing/Search-1_TRUE.xml True pmed.1002397.e001g False\n", + "xml_testing/Search-2_TRUE.xml True False pmed.1002397.e001g\n" ] } ], "source": [ - "table_results = []\n", + "# testing the code\n", "for article_file in test_list:\n", - " 
intro_condition, search_1, search_2 = find_table_wraps(article_file)\n", - " print(article_file, intro_condition, search_1, search_2)" + " intro_condition, search1_ids, search2_ids = find_table_wraps(article_file)\n", + " print(article_file, intro_condition, search1_ids, search2_ids)" ] }, { "cell_type": "code", - "execution_count": 197, + "execution_count": 13, "metadata": { - "collapsed": true + "hidden": true }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "221852\n", + "['../../allofplos/allofplos/allofplos_xml/journal.ppat.1000896.xml', '../../allofplos/allofplos/allofplos_xml/journal.pone.0065590.xml', '../../allofplos/allofplos/allofplos_xml/journal.pone.0036030.xml', '../../allofplos/allofplos/allofplos_xml/journal.pone.0026652.xml', '../../allofplos/allofplos/allofplos_xml/journal.pone.0029438.xml', '../../allofplos/allofplos/allofplos_xml/journal.pgen.1000989.xml', '../../allofplos/allofplos/allofplos_xml/journal.pone.0089988.xml', '../../allofplos/allofplos/allofplos_xml/journal.pone.0015594.xml', '../../allofplos/allofplos/allofplos_xml/journal.pone.0149634.xml', '../../allofplos/allofplos/allofplos_xml/journal.pone.0000707.xml']\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 12% ( 27042 of 221852) |############# | Elapsed Time: 0:04:33 ETA: 0:57:29" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "../../allofplos/allofplos/allofplos_xml/journal.pone.0002468.xml has search 2 results but no ids: [, , , , ]\n", + "../../allofplos/allofplos/allofplos_xml/journal.pone.0002468.xml has search 2 results but no ids: [, , , , ]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 52% (116850 of 221852) |######################################################## | Elapsed Time: 0:19:57 ETA: 0:18:13" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + 
"../../allofplos/allofplos/allofplos_xml/journal.pone.0075851.xml has search 2 results but no ids: []\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100% (221852 of 221852) |##########################################################################################################| Elapsed Time: 0:38:14 Time: 0:38:14\n" + ] + } + ], "source": [ + "# running over entire corpus, randomized, with a progressbar\n", + "import progressbar\n", + "from random import shuffle\n", + "\n", "table_results = []\n", - "file_list = listdir_nohidden(corpusdir)\n", - "for article_file in file_list:\n", - " intro_condition, search_1, search_2 = find_table_wraps(article_file)\n", + "file_list = listdir_nohidden(corpusdir_prod)\n", + "shuffle(file_list)\n", + "\n", + "bar = progressbar.ProgressBar(redirect_stdout=True, max_value=len(file_list))\n", + "for i, article_file in enumerate(file_list):\n", + " intro_condition, search1_ids, search2_ids = find_table_wraps(article_file)\n", " if intro_condition:\n", - " result = [file_to_doi(article_file), search_1, search_2]\n", + " result = [filename_to_doi(article_file), search1_ids, search2_ids]\n", " table_results.append(result)\n", + " bar.update(i+1)\n", + "bar.finish()\n", "\n", "# print(table_results)\n", - "with open('table_search_results_revised.csv', 'w') as f:\n", + "with open('table_graphics_search_results.csv', 'w') as f:\n", " writer = csv.writer(f)\n", " writer.writerow(['DOI', 'Search 1', 'Search 2'])\n", " for doi_result in sorted(table_results):\n", " writer.writerow(doi_result)" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "for article_file in listdir_nohidden(corpusdir)[180000:180010]:\n", - " print(find_table_wraps(article_file))" - ] - }, { "cell_type": "markdown", "metadata": { - "collapsed": true + "collapsed": true, + "heading_collapsed": true }, "source": [ "# Which Aperta articles have a group 
collaboration contributor element?" @@ -385,7 +483,9 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "hidden": true + }, "source": [ "## Example: 10.1371/journal.pmed.1002170\n", "\n", @@ -405,7 +505,8 @@ "cell_type": "code", "execution_count": 18, "metadata": { - "collapsed": true + "collapsed": true, + "hidden": true }, "outputs": [], "source": [ @@ -475,7 +576,8 @@ "cell_type": "code", "execution_count": 21, "metadata": { - "collapsed": true + "collapsed": true, + "hidden": true }, "outputs": [], "source": [ @@ -488,7 +590,9 @@ { "cell_type": "code", "execution_count": 22, - "metadata": {}, + "metadata": { + "hidden": true + }, "outputs": [ { "name": "stdout", @@ -508,7 +612,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "py3", "language": "python", "name": "python3" }, @@ -522,33 +626,20 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.2" + "version": "3.6.3" }, "toc": { - "colors": { - "hover_highlight": "#DAA520", - "navigate_num": "#000000", - "navigate_text": "#333333", - "running_highlight": "#FF0000", - "selected_highlight": "#FFD700", - "sidebar_border": "#EEEEEE", - "wrapper_background": "#FFFFFF" - }, - "moveMenuLeft": true, "nav_menu": { "height": "12px", "width": "252px" }, - "navigate_menu": true, "number_sections": true, "sideBar": true, "skip_h1_title": false, - "threshold": 4, "toc_cell": false, "toc_position": {}, "toc_section_display": "block", - "toc_window_display": false, - "widenNotebook": false + "toc_window_display": false } }, "nbformat": 4, From 8589c9a7de3ad10335007a25870f8ba88e76eee8 Mon Sep 17 00:00:00 2001 From: Elizabeth Seiver Date: Tue, 17 Oct 2017 23:22:07 -0700 Subject: [PATCH 09/24] update regex + unit test reflecting new file structure --- allofplos/file_rename.py | 18 ++++++++++++++++++ allofplos/plos_regex.py | 2 +- allofplos/tests/unittests.py | 2 +- 3 files changed, 20 insertions(+), 2 deletions(-) create mode 
100644 allofplos/file_rename.py diff --git a/allofplos/file_rename.py b/allofplos/file_rename.py new file mode 100644 index 00000000..86e2e3c9 --- /dev/null +++ b/allofplos/file_rename.py @@ -0,0 +1,18 @@ +import os +import re + +from plos_corpus import listdir_nohidden, corpusdir +from plos_regex import validate_file + +annotation_articles = [article for article in listdir_nohidden(corpusdir) if 'correction' in article] + +for article in annotation_articles: + count = 0 + parts = re.split('\/|\.', article) + new_filename = os.path.join(corpusdir, 'plos.correction.' + parts[-2] + '.xml') + if validate_file(new_filename) and new_filename != article: + os.rename(article, new_filename) + count += 1 + else: + pass +print('{} files renamed'.format(count)) diff --git a/allofplos/plos_regex.py b/allofplos/plos_regex.py index 5d596e27..3be21b2b 100644 --- a/allofplos/plos_regex.py +++ b/allofplos/plos_regex.py @@ -18,7 +18,7 @@ r"|([a-zA-Z0-9]{13}$)" r"|([a-zA-Z0-9]{32}$))") regex_file_search = (r"((journal\.p[a-zA-Z]{3}\.[\d]{7})" - r"|(journal\.p[a-zA-Z]{3}\.correction\.[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{12}))") + r"|(plos\.correction\.[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{12}))") full_doi_regex_match = re.compile(regex_match_prefix+regex_body_match) full_doi_regex_search = re.compile(r"10\.1371/journal\.p[a-zA-Z]{3}\.[\d]{7}" "|10\.1371/annotation/[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{12}") diff --git a/allofplos/tests/unittests.py b/allofplos/tests/unittests.py index a46f2ac4..ccbd20dc 100644 --- a/allofplos/tests/unittests.py +++ b/allofplos/tests/unittests.py @@ -18,7 +18,7 @@ example_url2_int = 'http://contentrepo.plos.org:8002/v1/objects/mogilefs-'\ 'prod-repo?key=10.1371/annotation/3155a3e9-5fbe-435c-a'\ '07a-e9a4846ec0b6.XML' -example_file2 = 'allofplos_xml/journal.pone.correction.3155a3e9-5fbe-435c'\ +example_file2 = 
'allofplos_xml/plos.correction.3155a3e9-5fbe-435c'\ '-a07a-e9a4846ec0b6.xml' example_doi2 = '10.1371/annotation/3155a3e9-5fbe-435c-a07a-e9a4846ec0b6' From d3027f3302991d42c3897b99c139ccf3fd688475 Mon Sep 17 00:00:00 2001 From: Elizabeth Seiver Date: Wed, 18 Oct 2017 00:01:43 -0700 Subject: [PATCH 10/24] transform function fixes (incomplete) --- allofplos/plos_corpus.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/allofplos/plos_corpus.py b/allofplos/plos_corpus.py index 61160bd0..f89bef51 100644 --- a/allofplos/plos_corpus.py +++ b/allofplos/plos_corpus.py @@ -23,7 +23,6 @@ import errno import logging import os -from os.path import isfile, join import shutil import time import tarfile @@ -103,7 +102,7 @@ def filename_to_url(filename, plos_network=False): :return: online location of a PLOS article's XML """ if correction in filename: - article = 'annotation/' + (filename.split('.', 4)[3]) + article = 'annotation/' + (filename.split('.', 4)[2]) else: article = os.path.splitext((os.path.basename(filename)))[0] doi = prefix + article @@ -122,7 +121,7 @@ def filename_to_doi(filename): :return: full unique identifier for a PLOS article """ if correction in filename and validate_file(filename): - article = 'annotation/' + (filename.split('.', 4)[3]) + article = 'annotation/' + (filename.split('.', 4)[2]) doi = prefix + article elif validate_file(filename): doi = prefix + os.path.splitext((os.path.basename(filename)))[0] @@ -219,11 +218,7 @@ def doi_to_path(doi, directory=corpusdir): :return: relative path to local XML file """ if doi.startswith(annotation_doi) and validate_doi(doi): - try: - url = doi_to_url(doi) - article_file = url_to_path(url) - except KeyError: - print("error, can't find linked DOI for {0}".format(doi)) + article_file = os.path.join(directory, "plos.correction." 
+ doi.split('/')[-1] + suffix_lower) elif validate_doi(doi): article_file = os.path.join(directory, doi.lstrip(prefix) + suffix_lower) elif validate_file(doi): From 1f3937e3ea1b53e20ebb0864a9d892a5f3e1b208 Mon Sep 17 00:00:00 2001 From: Elizabeth Seiver Date: Wed, 18 Oct 2017 15:01:15 -0700 Subject: [PATCH 11/24] rm files that didn't belong --- allofplos/Corpus_Analysis_Examples.ipynb | 257 ------- allofplos/Corpus_QA-Copy1.ipynb | 450 ------------ allofplos/Corpus_QA.ipynb | 512 -------------- allofplos/PLOS Medicine Colleen and NLP.ipynb | 116 ---- allofplos/PLOS Medicine and NLP.ipynb | 42 -- allofplos/Production team investigates.ipynb | 647 ------------------ allofplos/csvfile.ipynb | 288 -------- allofplos/file_rename.py | 18 - allofplos/plos_pmc.py | 583 ---------------- allofplos/twoto3_nb.py | 80 --- 10 files changed, 2993 deletions(-) delete mode 100644 allofplos/Corpus_Analysis_Examples.ipynb delete mode 100644 allofplos/Corpus_QA-Copy1.ipynb delete mode 100644 allofplos/Corpus_QA.ipynb delete mode 100644 allofplos/PLOS Medicine Colleen and NLP.ipynb delete mode 100644 allofplos/PLOS Medicine and NLP.ipynb delete mode 100644 allofplos/Production team investigates.ipynb delete mode 100644 allofplos/csvfile.ipynb delete mode 100644 allofplos/file_rename.py delete mode 100644 allofplos/plos_pmc.py delete mode 100755 allofplos/twoto3_nb.py diff --git a/allofplos/Corpus_Analysis_Examples.ipynb b/allofplos/Corpus_Analysis_Examples.ipynb deleted file mode 100644 index bc5f45bb..00000000 --- a/allofplos/Corpus_Analysis_Examples.ipynb +++ /dev/null @@ -1,257 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Required functions" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "code_folding": [], - "collapsed": true - }, - "outputs": [], - "source": [ - "from samples.corpus_analysis import *" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# PLOS article types" - ] - 
}, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## JATS-standard NLM article types" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "editable": false, - "run_control": { - "frozen": true - }, - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "15 types of articles found.\n", - "[('research-article', 204109), ('correction', 9113), ('article-commentary', 1284), ('discussion', 1087), ('review-article', 612), ('other', 584), ('editorial', 340), ('letter', 300), ('retraction', 79), ('book-review', 77), ('meeting-report', 38), ('case-report', 23), ('expression-of-concern', 13), ('obituary', 10), ('brief-report', 1)]\n" - ] - } - ], - "source": [ - "jats_article_type_list = get_jats_article_type_list()\n", - "print(jats_article_type_list)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## PLOS article types" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "editable": false, - "run_control": { - "frozen": true - }, - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "60 types of articles found.\n", - "[('Research Article', 202673), ('Correction', 9124), ('Synopsis', 1093), ('Perspective', 801), ('Review', 555), ('Editorial', 486), ('Pearls', 438), ('Essay', 379), ('Policy Forum', 309), ('Correspondence', 287), ('Primer', 237), ('Viewpoints', 209), ('Community Page', 139), ('Opinion', 136), ('Health in Action', 118), ('Education', 103), ('Retraction', 79), ('Book Review/Science in the Media', 76), ('Message from ISCB', 70), ('Symposium', 70), ('Policy Platform', 54), ('Feature', 53), ('Formal Comment', 52), ('Research in Translation', 51), ('Guidelines and Guidance', 51), ('Collection Review', 50), ('Research Matters', 44), ('Interview', 44), ('The PLoS Medicine Debate', 38), ('Historical Profiles and Perspectives', 38), ('Unsolved Mystery', 34), ('Overview', 34), 
('Neglected Diseases', 29), ('Expert Commentary', 29), ('Learning Forum', 27), ('From Innovation to Application', 24), ('Obituary', 22), ('Quiz', 21), ('Correspondence and Other Communications', 13), ('Expression of Concern', 13), ('Journal Club', 12), ('Meta-Research Article', 12), ('Student Forum', 12), ('Open Highlights', 11), ('Topic Page', 11), ('Case Report', 10), ('Photo Quiz', 10), ('Best Practice', 5), ('Deep Reads', 4), ('Historical and Philosophical Perspectives', 3), ('Special Report', 3), ('Book Review', 2), ('Message from the Founders', 1), ('Message from PLoS', 1), ('Short Reports', 1), ('Methods and Resources', 1), ('Technical Report', 1), ('Message from the PLoS Founders', 1), ('Collection Review ', 1), ('Debate', 1)]\n" - ] - } - ], - "source": [ - "PLOS_article_type_list = get_plos_article_type_list()\n", - "print(PLOS_article_type_list)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Taking random samples of DOIs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "random_sample_of_dois = get_random_list_of_DOIs() # returns 100 DOIs by default" - ] - }, - { - "cell_type": "code", - "execution_count": 203, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['journal.pone.0074820', 'journal.pone.0063497', 'journal.pone.0126357', 'journal.pntd.0004807', 'journal.pone.0031896', 'journal.pone.0045503', 'journal.pone.0138217', 'journal.pbio.0050002', 'journal.pone.0122848', 'journal.pone.0099248']\n" - ] - } - ], - "source": [ - "random_sample_of_articles = [doi_to_article(doi) for doi in random_sample_of_dois]\n", - "print(random_sample_of_articles[0:10])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Retracted and corrected articles" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Get list of retracted 
articles" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "editable": false, - "run_control": { - "frozen": true - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "79 retracted articles found.\n", - "['journal.pbio.0030123', 'journal.pbio.0050005', 'journal.pbio.0050146', 'journal.pbio.1001212', 'journal.pcbi.1002308', 'journal.pgen.1003361', 'journal.pgen.1003791', 'journal.pgen.1005586', 'journal.pgen.1000424', 'journal.pmed.1001214', 'journal.pone.0072333', 'journal.pone.0084127', 'journal.pone.0027571', 'journal.pone.0046410', 'journal.pone.0080145', 'journal.pone.0019652', 'journal.pone.0075928', 'journal.pone.0075046', 'journal.pone.0062178', 'journal.pone.0051549', 'journal.pone.0093095', 'journal.pone.0069669', 'journal.pone.0133525', 'journal.pone.0115980', 'journal.pone.0115741', 'journal.pone.0139044', 'journal.pone.0146193', 'journal.pone.0045667', 'journal.pone.0040789', 'journal.pone.0094830', 'journal.pone.0031943', 'journal.pone.0097700', 'journal.pone.0047218', 'journal.pone.0090951', 'journal.pone.0014232', 'journal.pone.0090318', 'journal.pone.0072895', 'journal.pone.0065651', 'journal.pone.0059556', 'journal.pone.0076809', 'journal.pone.0099630', 'journal.pone.0121549', 'journal.pone.0048402', 'journal.pone.0062170', 'journal.pone.0020152', 'journal.pone.0164571', 'journal.pone.0164378', 'journal.pone.0116682', 'journal.pone.0125542', 'journal.pone.0047110', 'journal.pone.0026503', 'journal.pone.0037102', 'journal.pone.0014163', 'journal.pone.0043204', 'journal.pone.0001276', 'journal.pone.0035142', 'journal.pone.0011299', 'journal.pone.0005373', 'journal.pone.0030980', 'journal.pone.0000306', 'journal.pone.0064576', 'journal.pone.0016011', 'journal.pone.0001444', 'journal.pone.0043406', 'journal.pone.0029192', 'journal.pone.0001908', 'journal.pone.0016256', 'journal.pone.0013512', 'journal.pone.0045965', 'journal.pone.0022730', 'journal.pone.0006333', 'journal.pone.0004168', 
'journal.pone.0035453', 'journal.pone.0032853', 'journal.ppat.1003435', 'journal.ppat.1002062', 'journal.ppat.1000915', 'journal.ppat.1000210', 'journal.ppat.0020025']\n" - ] - } - ], - "source": [ - "retractions_article_list, retracted_article_list = get_retracted_article_list()\n", - "print(retracted_article_list)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Get list of corrected articles" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": { - "editable": false, - "run_control": { - "frozen": true - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "journal.pcbi.1003582.xml has incorrect linked DOI: journal.10.1371/journal.pcbi.1003490\n", - "journal.pcbi.1003732.xml has incorrect linked DOI: journal.10.1371/journal.pcbi.1003159\n", - "journal.pone.0101541.xml has incorrect linked DOI: journal.PONE-D-13-26510\n", - "journal.pone.0104353.xml has incorrect linked DOI: journal.\n", - "journal.pone.0104472.xml has incorrect linked DOI: journal.\n", - "journal.pone.0104581.xml has incorrect linked DOI: journal.\n", - "journal.pone.0104601.xml has incorrect linked DOI: journal.\n", - "journal.pone.0105485.xml has incorrect linked DOI: journal.\n", - "journal.pone.0105486.xml has incorrect linked DOI: journal.\n", - "journal.pone.0105490.xml has incorrect linked DOI: journal.\n", - "journal.pone.0105658.xml has incorrect linked DOI: journal.\n", - "journal.pone.0105668.xml has incorrect linked DOI: journal.\n", - "journal.pone.0105669.xml has incorrect linked DOI: journal.\n", - "9127 corrected articles found.\n" - ] - } - ], - "source": [ - "corrections_article_list, corrected_article_list = get_corrected_article_list()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": 
"text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.2" - }, - "toc": { - "colors": { - "hover_highlight": "#DAA520", - "navigate_num": "#000000", - "navigate_text": "#333333", - "running_highlight": "#FF0000", - "selected_highlight": "#FFD700", - "sidebar_border": "#EEEEEE", - "wrapper_background": "#FFFFFF" - }, - "moveMenuLeft": true, - "nav_menu": { - "height": "174px", - "width": "252px" - }, - "navigate_menu": true, - "number_sections": true, - "sideBar": true, - "threshold": 4, - "toc_cell": false, - "toc_section_display": "block", - "toc_window_display": false, - "widenNotebook": false - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/allofplos/Corpus_QA-Copy1.ipynb b/allofplos/Corpus_QA-Copy1.ipynb deleted file mode 100644 index b7ccf708..00000000 --- a/allofplos/Corpus_QA-Copy1.ipynb +++ /dev/null @@ -1,450 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Required functions" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "code_folding": [] - }, - "outputs": [], - "source": [ - "from samples.corpus_analysis import *" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# PLOS/NLM article type mapping" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'i' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# For mapping the JATS article type onto the PLOS article type, while taking NLM DTD into account.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m 
\u001b[0marticle_types_map\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_article_types_map\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mPLOS_article_types_structured\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcounter\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marticle_types_map\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmost_common\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mPLOS_article_types_structured\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/PLOS_Corpus_Project/allofplos/allofplos/samples/corpus_analysis.py\u001b[0m in \u001b[0;36mget_article_types_map\u001b[0;34m(directory)\u001b[0m\n\u001b[1;32m 181\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0marticle_file\u001b[0m \u001b[0;32min\u001b[0m \u001b[0marticle_files\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 182\u001b[0m \u001b[0mjats_article_type\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_article_type\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marticle_file\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 183\u001b[0;31m \u001b[0mplos_article_type\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_plos_article_type\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marticle_file\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 184\u001b[0m \u001b[0mdtd_version\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_article_dtd\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marticle_file\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 185\u001b[0m \u001b[0mtypes\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mjats_article_type\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mplos_article_type\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mdtd_version\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/PLOS_Corpus_Project/allofplos/allofplos/samples/corpus_analysis.py\u001b[0m in \u001b[0;36mget_plos_article_type\u001b[0;34m(article_file)\u001b[0m\n\u001b[1;32m 138\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0msubject\u001b[0m \u001b[0;32min\u001b[0m \u001b[0msubject_list\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 139\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0msubject\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'subj-group-type'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m\"heading\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 140\u001b[0;31m \u001b[0msubject_instance\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msubject_list\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 141\u001b[0m \u001b[0ms\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m''\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 142\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mtext\u001b[0m \u001b[0;32min\u001b[0m \u001b[0msubject_instance\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mitertext\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mNameError\u001b[0m: name 'i' is not defined" - ] - } - ], - "source": [ - "# For mapping the JATS article type onto the PLOS article type, while taking NLM DTD into account.\n", - "article_types_map = get_article_types_map()\n", - "PLOS_article_types_structured = counter(article_types_map).most_common()\n", - "print(PLOS_article_types_structured)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# create .csv file mapping JATS to PLOS article types\n", - 
"article_types_map_to_csv(article_types_map)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Retracted and corrected articles" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Get list of retracted articles" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1 retracted articles found.\n" - ] - } - ], - "source": [ - "# article_list = [doi_to_file(doi) for doi in get_random_list_of_dois(count=5000)]\n", - "retractions_doi_list, retracted_doi_list = get_retracted_doi_list(article_list=article_list)" - ] - }, - { - "cell_type": "code", - "execution_count": 65, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['10.1371/journal.pbio.1002215']" - ] - }, - "execution_count": 65, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "retractions_doi_list" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "article_list = [doi_to_file('10.1371/journal.pbio.1002215')]" - ] - }, - { - "cell_type": "code", - "execution_count": 61, - "metadata": {}, - "outputs": [], - "source": [ - "def get_retracted_doi_list(article_list=None, directory=corpusdir):\n", - " \"\"\"\n", - " Scans through articles in a directory to see if they are retraction notifications,\n", - " scans articles that are that type to find DOIs of retracted articles\n", - " :return: tuple of lists of DOIs for retractions articles, and retracted articles\n", - " \"\"\"\n", - " retractions_doi_list = []\n", - " retracted_doi_list = []\n", - " if article_list is None:\n", - " article_list = listdir_nohidden(directory)\n", - " for article_file in article_list:\n", - " if check_if_retraction_article(article_file):\n", - " retractions_doi_list.append(file_to_doi(article_file))\n", - " # Look in those articles to find actual articles that are 
retracted\n", - " retracted_doi = get_related_retraction_article(article_file)[0]\n", - " retracted_doi_list.append(retracted_doi)\n", - " # check linked DOI for accuracy\n", - " if make_regex_bool(full_doi_regex_match.search(retracted_doi)) is False:\n", - " print(\"{} has incorrect linked DOI field: '{}'\".format(article_file, retracted_doi))\n", - " if len(retractions_doi_list) == len(retracted_doi_list):\n", - " print(len(retracted_doi_list), 'retracted articles found.')\n", - " else:\n", - " print('Number of retraction articles and retracted articles are different: ',\n", - " '{} vs. {}'.format(len(retractions_article_list), len(retracted_article_list)))\n", - " return retractions_doi_list, retracted_doi_list" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Get list of corrected articles" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "5 corrected articles found.\n" - ] - } - ], - "source": [ - "article_list = [doi_to_file(doi) for doi in get_random_list_of_dois(count=100)]\n", - "corrections_article_list, corrected_article_list = get_corrected_article_list(article_list=article_list)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['10.1371/journal.pone.0065474', '10.1371/journal.pone.0144760', '10.1371/journal.pone.0050818', '10.1371/journal.pmed.1001786', '10.1371/journal.ppat.1003068']\n" - ] - } - ], - "source": [ - "print(corrected_article_list)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Check raw XML for article updates" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# By default, checks only the 30,000 most recent articles\n", - "articles_different_list = revisiondate_sanity_check()\n", - 
"print(articles_different_list)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# DOI and filename sanity check" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Check if article filenames match their full DOIs & that DOI fields are correct\n", - "messed_up_plos_list = article_doi_sanity_check()\n", - "messed_up_pmc_list = article_doi_sanity_check(directory=pmcdir, article_list=None, source='PMC')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# PubMed Corpus" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Get all local, solr, and PMC DOIs" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[1mArticles that needs to be re-indexed on Solr:\n", - "\u001b[0m10.1371/journal.pone.0076809\n" - ] - } - ], - "source": [ - "plos_articles = get_all_plos_dois()\n", - "doi_to_pmc = get_articles_by_doi_field(check_new=False)\n", - "pmc_articles = list(doi_to_pmc.keys())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Compare PLOS's copy to PMC" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For sets of PLOS's corpus from PMC and PLOS, see which article are missing from PLOS's version of the corpus by:\n", - "* removing Currents articles\n", - "* checking if articles are live on journals.plos.org\n", - "* checking that the DOIs resolve" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "missing_plos_articles = process_missing_plos_articles(pmc_articles=pmc_articles, plos_articles=plos_articles)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Compare PMC's copy to PLOS" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For sets of PLOS's 
corpus from PMC and PLOS, see which article are missing from PMC's version of the Corpus by:\n", - "* updating the PMCID:DOI mapping document\n", - "* removing articles too recent to be indexed (pubdate less than 3 weeks ago)\n", - "* excluding uncorrected proofs\n", - "* excluding PLOS Medicine quizzes" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "ename": "KeyboardInterrupt", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mmissing_pmc_articles\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mprocess_missing_pmc_articles\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpmc_articles\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mpmc_articles\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mplos_articles\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mplos_articles\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m~/PLOS_Corpus_Project/allofplos/allofplos/samples/corpus_analysis.py\u001b[0m in \u001b[0;36mprocess_missing_pmc_articles\u001b[0;34m(pmc_articles, plos_articles)\u001b[0m\n\u001b[1;32m 730\u001b[0m \u001b[0mplos_articles\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_all_plos_dois\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 731\u001b[0m \u001b[0mmissing_pmc_dois\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mplos_articles\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpmc_articles\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 732\u001b[0;31m 
\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 733\u001b[0m \u001b[0;31m# Query for PMC updates & update DOI-to-PMCID dictionary\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 734\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mmissing_pmc_dois\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/PLOS_Corpus_Project/allofplos/allofplos/samples/corpus_analysis.py\u001b[0m in \u001b[0;36mupdate_pmc_dict_by_doi\u001b[0;34m(id_list)\u001b[0m\n\u001b[1;32m 562\u001b[0m '''\n\u001b[1;32m 563\u001b[0m \u001b[0mdoi_to_pmc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_articles_by_doi_field\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcheck_new\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 564\u001b[0;31m \u001b[0mdoi_to_pmc2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdois_not_in_pmc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_pmc_doi_dict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mid_list\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 565\u001b[0m \u001b[0mfull_pmc_dict\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0mdoi_to_pmc2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mdoi_to_pmc\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 566\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpmc_csv\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'w'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mfile\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/PLOS_Corpus_Project/allofplos/allofplos/samples/corpus_analysis.py\u001b[0m in \u001b[0;36mget_pmc_doi_dict\u001b[0;34m(id_list, chunk_size)\u001b[0m\n\u001b[1;32m 536\u001b[0m \u001b[0mpmc_doi_query\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpmc_doi_query_url\u001b[0m \u001b[0;34m+\u001b[0m 
\u001b[0mpmc_doi_string\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 537\u001b[0m \u001b[0;31m# Parse the results & create dict entry for each result\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 538\u001b[0;31m \u001b[0mpmc_response\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mrequests\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpmc_doi_query\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 539\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mpmc_response\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstatus_code\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m500\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 540\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Error for DOI chunk; retry with smaller chunk size'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/miniconda3/envs/py3/lib/python3.6/site-packages/requests/api.py\u001b[0m in \u001b[0;36mget\u001b[0;34m(url, params, **kwargs)\u001b[0m\n\u001b[1;32m 70\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 71\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msetdefault\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'allow_redirects'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 72\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mrequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'get'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparams\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mparams\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 73\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 74\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - 
"\u001b[0;32m~/miniconda3/envs/py3/lib/python3.6/site-packages/requests/api.py\u001b[0m in \u001b[0;36mrequest\u001b[0;34m(method, url, **kwargs)\u001b[0m\n\u001b[1;32m 56\u001b[0m \u001b[0;31m# cases, and look like a memory leak in others.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 57\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0msessions\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSession\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0msession\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 58\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0msession\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 59\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 60\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/miniconda3/envs/py3/lib/python3.6/site-packages/requests/sessions.py\u001b[0m in \u001b[0;36mrequest\u001b[0;34m(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)\u001b[0m\n\u001b[1;32m 506\u001b[0m }\n\u001b[1;32m 507\u001b[0m \u001b[0msend_kwargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msettings\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 508\u001b[0;31m \u001b[0mresp\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mprep\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0msend_kwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 509\u001b[0m 
\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 510\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresp\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/miniconda3/envs/py3/lib/python3.6/site-packages/requests/sessions.py\u001b[0m in \u001b[0;36msend\u001b[0;34m(self, request, **kwargs)\u001b[0m\n\u001b[1;32m 616\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 617\u001b[0m \u001b[0;31m# Send the request\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 618\u001b[0;31m \u001b[0mr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0madapter\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 619\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 620\u001b[0m \u001b[0;31m# Total elapsed time of the request (approximately)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/miniconda3/envs/py3/lib/python3.6/site-packages/requests/adapters.py\u001b[0m in \u001b[0;36msend\u001b[0;34m(self, request, stream, timeout, verify, cert, proxies)\u001b[0m\n\u001b[1;32m 438\u001b[0m \u001b[0mdecode_content\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 439\u001b[0m \u001b[0mretries\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmax_retries\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 440\u001b[0;31m \u001b[0mtimeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 441\u001b[0m )\n\u001b[1;32m 442\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/miniconda3/envs/py3/lib/python3.6/site-packages/urllib3/connectionpool.py\u001b[0m in \u001b[0;36murlopen\u001b[0;34m(self, method, url, body, headers, retries, redirect, 
assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)\u001b[0m\n\u001b[1;32m 599\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtimeout_obj\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 600\u001b[0m \u001b[0mbody\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mbody\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mheaders\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 601\u001b[0;31m chunked=chunked)\n\u001b[0m\u001b[1;32m 602\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 603\u001b[0m \u001b[0;31m# If we're going to release the connection in ``finally:``, then\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/miniconda3/envs/py3/lib/python3.6/site-packages/urllib3/connectionpool.py\u001b[0m in \u001b[0;36m_make_request\u001b[0;34m(self, conn, method, url, timeout, chunked, **httplib_request_kw)\u001b[0m\n\u001b[1;32m 344\u001b[0m \u001b[0;31m# Trigger any extra validation we need to do.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 345\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 346\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_validate_conn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 347\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mSocketTimeout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mBaseSSLError\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 348\u001b[0m \u001b[0;31m# Py2 raises this as a BaseSSLError, Py3 raises it as socket timeout.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - 
"\u001b[0;32m~/miniconda3/envs/py3/lib/python3.6/site-packages/urllib3/connectionpool.py\u001b[0m in \u001b[0;36m_validate_conn\u001b[0;34m(self, conn)\u001b[0m\n\u001b[1;32m 848\u001b[0m \u001b[0;31m# Force connect early to allow us to validate the connection.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 849\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mgetattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'sock'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;31m# AppEngine might not have `.sock`\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 850\u001b[0;31m \u001b[0mconn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconnect\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 851\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 852\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mconn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mis_verified\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/miniconda3/envs/py3/lib/python3.6/site-packages/urllib3/connection.py\u001b[0m in \u001b[0;36mconnect\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 324\u001b[0m \u001b[0mca_cert_dir\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mca_cert_dir\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 325\u001b[0m \u001b[0mserver_hostname\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mhostname\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 326\u001b[0;31m ssl_context=context)\n\u001b[0m\u001b[1;32m 327\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 328\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0massert_fingerprint\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - 
"\u001b[0;32m~/miniconda3/envs/py3/lib/python3.6/site-packages/urllib3/util/ssl_.py\u001b[0m in \u001b[0;36mssl_wrap_socket\u001b[0;34m(sock, keyfile, certfile, cert_reqs, ca_certs, server_hostname, ssl_version, ciphers, ssl_context, ca_cert_dir)\u001b[0m\n\u001b[1;32m 327\u001b[0m \u001b[0mcontext\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload_cert_chain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcertfile\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkeyfile\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 328\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mHAS_SNI\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;31m# Platform-specific: OpenSSL with enabled SNI\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 329\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mcontext\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwrap_socket\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msock\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mserver_hostname\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mserver_hostname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 330\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 331\u001b[0m warnings.warn(\n", - "\u001b[0;32m~/miniconda3/envs/py3/lib/python3.6/site-packages/urllib3/contrib/pyopenssl.py\u001b[0m in \u001b[0;36mwrap_socket\u001b[0;34m(self, sock, server_side, do_handshake_on_connect, suppress_ragged_eofs, server_hostname)\u001b[0m\n\u001b[1;32m 439\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 440\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 441\u001b[0;31m \u001b[0mcnx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdo_handshake\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 442\u001b[0m \u001b[0;32mexcept\u001b[0m 
\u001b[0mOpenSSL\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSSL\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mWantReadError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 443\u001b[0m \u001b[0mrd\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mutil\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwait_for_read\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msock\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msock\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgettimeout\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/miniconda3/envs/py3/lib/python3.6/site-packages/OpenSSL/SSL.py\u001b[0m in \u001b[0;36mdo_handshake\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1713\u001b[0m \u001b[0;34m:\u001b[0m\u001b[0;32mreturn\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1714\u001b[0m \"\"\"\n\u001b[0;32m-> 1715\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_lib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSSL_do_handshake\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_ssl\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1716\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_raise_ssl_error\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_ssl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1717\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mKeyboardInterrupt\u001b[0m: " - ] - } - ], - "source": [ - "missing_pmc_articles = process_missing_pmc_articles(pmc_articles=pmc_articles, plos_articles=plos_articles)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Save lists of missing articles to text files if needed" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], 
- "source": [ - "with open('missing_plos_articles.txt', 'w') as file:\n", - " for item in sorted(set(missing_plos_articles)):\n", - " file.write(\"%s\\n\" % item)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "id_list=listdir_nohidden(pmcdir, extension='.nxml')" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "ename": "KeyError", - "evalue": "'doi'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdoi_to_pmc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_pmc_doi_dict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mid_list\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m~/PLOS_Corpus_Project/allofplos/allofplos/samples/corpus_analysis.py\u001b[0m in \u001b[0;36mget_pmc_doi_dict\u001b[0;34m(id_list, chunk_size)\u001b[0m\n\u001b[1;32m 545\u001b[0m \u001b[0mpmc_results\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpmc_results\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgetchildren\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;31m# exclude echo header\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 546\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mpmc_results\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 547\u001b[0;31m \u001b[0mdoi\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mattrib\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'doi'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 548\u001b[0m 
\u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 549\u001b[0m \u001b[0mpmcid\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mattrib\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'pmcid'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32msrc/lxml/lxml.etree.pyx\u001b[0m in \u001b[0;36mlxml.etree._Attrib.__getitem__ (src/lxml/lxml.etree.c:70679)\u001b[0;34m()\u001b[0m\n", - "\u001b[0;31mKeyError\u001b[0m: 'doi'" - ] - } - ], - "source": [ - "doi_to_pmc = get_pmc_doi_dict(id_list)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "with open('missing_pmc_articles.txt', 'w') as file:\n", - " for item in sorted(set(missing_pmc_articles)):\n", - " file.write(\"%s\\n\" % item)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "py3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.2" - }, - "toc": { - "nav_menu": { - "height": "174px", - "width": "252px" - }, - "number_sections": true, - "sideBar": true, - "skip_h1_title": false, - "toc_cell": false, - "toc_position": {}, - "toc_section_display": "block", - "toc_window_display": false - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/allofplos/Corpus_QA.ipynb b/allofplos/Corpus_QA.ipynb deleted file mode 100644 index 1007c803..00000000 --- a/allofplos/Corpus_QA.ipynb +++ /dev/null @@ -1,512 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Required functions" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "code_folding": [] - }, - "outputs": [], - "source": [ - "from samples.corpus_analysis import *" - ] - }, - 
{ - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# PLOS/NLM article type mapping" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "scrolled": true - }, - "outputs": [], - "source": [ - "# For mapping the JATS article type onto the PLOS article type, while taking NLM DTD into account.\n", - "article_types_map = get_article_types_map()\n", - "PLOS_article_types_structured = counter(article_types_map).most_common()\n", - "print(PLOS_article_types_structured)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# create .csv file mapping JATS to PLOS article types\n", - "article_types_map_to_csv(article_types_map)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Retracted and corrected articles" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Get list of retracted articles" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "deletable": false, - "editable": false, - "run_control": { - "frozen": true - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "79 retracted articles found.\n", - "['journal.pbio.0030123', 'journal.pbio.0050005', 'journal.pbio.0050146', 'journal.pbio.1001212', 'journal.pcbi.1002308', 'journal.pgen.1003361', 'journal.pgen.1003791', 'journal.pgen.1005586', 'journal.pgen.1000424', 'journal.pmed.1001214', 'journal.pone.0072333', 'journal.pone.0084127', 'journal.pone.0027571', 'journal.pone.0046410', 'journal.pone.0080145', 'journal.pone.0019652', 'journal.pone.0075928', 'journal.pone.0075046', 'journal.pone.0062178', 'journal.pone.0051549', 'journal.pone.0093095', 'journal.pone.0069669', 'journal.pone.0133525', 'journal.pone.0115980', 'journal.pone.0115741', 'journal.pone.0139044', 'journal.pone.0146193', 'journal.pone.0045667', 'journal.pone.0040789', 'journal.pone.0094830', 
'journal.pone.0031943', 'journal.pone.0097700', 'journal.pone.0047218', 'journal.pone.0090951', 'journal.pone.0014232', 'journal.pone.0090318', 'journal.pone.0072895', 'journal.pone.0065651', 'journal.pone.0059556', 'journal.pone.0076809', 'journal.pone.0099630', 'journal.pone.0121549', 'journal.pone.0048402', 'journal.pone.0062170', 'journal.pone.0020152', 'journal.pone.0164571', 'journal.pone.0164378', 'journal.pone.0116682', 'journal.pone.0125542', 'journal.pone.0047110', 'journal.pone.0026503', 'journal.pone.0037102', 'journal.pone.0014163', 'journal.pone.0043204', 'journal.pone.0001276', 'journal.pone.0035142', 'journal.pone.0011299', 'journal.pone.0005373', 'journal.pone.0030980', 'journal.pone.0000306', 'journal.pone.0064576', 'journal.pone.0016011', 'journal.pone.0001444', 'journal.pone.0043406', 'journal.pone.0029192', 'journal.pone.0001908', 'journal.pone.0016256', 'journal.pone.0013512', 'journal.pone.0045965', 'journal.pone.0022730', 'journal.pone.0006333', 'journal.pone.0004168', 'journal.pone.0035453', 'journal.pone.0032853', 'journal.ppat.1003435', 'journal.ppat.1002062', 'journal.ppat.1000915', 'journal.ppat.1000210', 'journal.ppat.0020025']\n" - ] - } - ], - "source": [ - "retractions_article_list, retracted_article_list = get_retracted_article_list()\n", - "print(retracted_article_list)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Get list of corrected articles" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": { - "deletable": false, - "editable": false, - "run_control": { - "frozen": true - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "journal.pcbi.1003582.xml has incorrect linked DOI: journal.10.1371/journal.pcbi.1003490\n", - "journal.pcbi.1003732.xml has incorrect linked DOI: journal.10.1371/journal.pcbi.1003159\n", - "journal.pone.0101541.xml has incorrect linked DOI: journal.PONE-D-13-26510\n", - "journal.pone.0104353.xml has incorrect 
linked DOI: journal.\n", - "journal.pone.0104472.xml has incorrect linked DOI: journal.\n", - "journal.pone.0104581.xml has incorrect linked DOI: journal.\n", - "journal.pone.0104601.xml has incorrect linked DOI: journal.\n", - "journal.pone.0105485.xml has incorrect linked DOI: journal.\n", - "journal.pone.0105486.xml has incorrect linked DOI: journal.\n", - "journal.pone.0105490.xml has incorrect linked DOI: journal.\n", - "journal.pone.0105658.xml has incorrect linked DOI: journal.\n", - "journal.pone.0105668.xml has incorrect linked DOI: journal.\n", - "journal.pone.0105669.xml has incorrect linked DOI: journal.\n", - "9127 corrected articles found.\n" - ] - } - ], - "source": [ - "corrections_article_list, corrected_article_list = get_corrected_article_list()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Check raw XML for article updates" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "deletable": false, - "editable": false, - "run_control": { - "frozen": true - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "downloaded new version of journal.pone.0182022.xml\n", - "downloaded new version of journal.pone.0175323.xml\n", - "downloaded new version of journal.pone.0171255.xml\n", - "downloaded new version of journal.pone.0158499.xml\n", - "30000 article checked for updates.\n", - "4 articles have updates.\n", - "['journal.pone.0182022.xml', 'journal.pone.0175323.xml', 'journal.pone.0171255.xml', 'journal.pone.0158499.xml']\n" - ] - } - ], - "source": [ - "# By default, checks only the 30,000 most recent articles\n", - "articles_different_list = revisiondate_sanity_check()\n", - "print(articles_different_list)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# DOI and filename sanity check" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "deletable": false, - "editable": false, - "run_control": { - 
"frozen": true - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "All article file names match DOIs.\n", - "PMC2687079.nxml has invalid DOI field: '10.1371/annotation/1cdc7975-50d7-40a5-99ca-83580df2982f '\n" - ] - } - ], - "source": [ - "# Check if article filenames match their full DOIs & that DOI fields are correct\n", - "messed_up_plos_list = article_doi_sanity_check()\n", - "messed_up_pmc_list = article_doi_sanity_check(directory=pmcdir, article_list=None, source='PMC')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# PubMed Corpus" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Get all local, solr, and PMC DOIs" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "deletable": false, - "editable": false, - "run_control": { - "frozen": true - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[1mArticles that needs to be re-indexed on Solr:\n", - "\u001b[0m10.1371/journal.pone.0076809\n" - ] - } - ], - "source": [ - "plos_articles = compare_local_and_solr()\n", - "doi_to_pmc = get_articles_by_doi_field(check_new=False)\n", - "pmc_articles = list(doi_to_pmc.keys())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Compare PLOS's copy to PMC" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For sets of PLOS's corpus from PMC and PLOS, see which article are missing from PLOS's version of the corpus by:\n", - "* removing Currents articles\n", - "* checking if articles are live on journals.plos.org\n", - "* checking that the DOIs resolve" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "deletable": false, - "editable": false, - "run_control": { - "frozen": true - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[1mPMC DOI fields with spaces in them:\n", - 
"\u001b[0m\"10.1371/annotation/1cdc7975-50d7-40a5-99ca-83580df2982f \" \n", - "\n", - "\u001b[1mWorking articles that need to be re-indexed on Solr:\n", - "\u001b[0m10.1371/annotation/1391941e-93d3-48d3-8c9a-b7c6d98f9527\n", - "10.1371/annotation/a81b1fab-890c-447b-a308-5bc8ca3eb21d\n", - "10.1371/annotation/df340d50-1f94-4d8b-a252-1a82a7fa5cc7 \n", - "\n", - "\u001b[1mArticles on PMC but not on solr or journals:\n", - "\u001b[0m10.1371/journal.pone.0002957\n", - "10.1371/annotation/b83e925b-2f2a-47b9-b939-0a1eeab18324\n", - "10.1371/journal.pbio.0020201\n", - "10.1371/annotation/011969ee-3f4b-4260-8d95-1b9a4ca39008\n", - "10.1371/annotation/8f2ddf91-3499-4627-9a91-449b78465f9d\n", - "10.1371/annotation/33d82b59-59a3-4412-9853-e78e49af76b9 \n", - "\n", - "\u001b[1mMissing PLOS articles where DOI resolves to different DOI:\n", - "\u001b[0m 10.1371/annotation/5e4082fd-6d86-441f-b946-a6e87a22ea57 resolves to: 10.1371/annotation/d9496d01-8c5d-4d24-8287-94449ada5064\n", - "\u001b[0m 10.1371/annotation/b8b66a84-4919-4a3e-ba3e-bb11f3853755 resolves to: 10.1371/annotation/5fbbf39a-fb47-4ce1-8069-acd830b3d41f\n", - "\n", - " \u001b[1mOther articles on PMC that aren't working correctly for PLOS:\n", - "\u001b[0m10.1371/annotation/363b6074-caec-4238-b88f-acbf45de498f\n", - "10.1371/annotation/2259f958-a68e-4e57-92b5-2ef003070cf1 \n", - "\n" - ] - } - ], - "source": [ - "missing_plos_articles = process_missing_plos_articles(pmc_articles=pmc_articles, plos_articles=plos_articles)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Compare PMC's copy to PLOS" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For sets of PLOS's corpus from PMC and PLOS, see which article are missing from PMC's version of the Corpus by:\n", - "* updating the PMCID:DOI mapping document\n", - "* removing articles too recent to be indexed (pubdate less than 3 weeks ago)\n", - "* excluding uncorrected proofs\n", - "* excluding PLOS Medicine quizzes" 
- ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "deletable": false, - "editable": false, - "run_control": { - "frozen": true - }, - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[1mArticles missing from PMC:\n", - "\u001b[0m10.1371/annotation/08286cd8-527f-4f14-856f-57267107efa8\n", - "10.1371/annotation/0bbea8d3-1f94-48af-915c-aec02da2f5c3\n", - "10.1371/annotation/0c5390b8-72b0-4b7e-85a3-b8c0fd9f62bf\n", - "10.1371/annotation/0ccac188-950f-4908-b232-35fb44ba7847\n", - "10.1371/annotation/0cfd3d5f-c1d0-48f8-ad69-34a95e31a8d2\n", - "10.1371/annotation/0e045706-ea24-41db-be90-27d1cbcd35b1\n", - "10.1371/annotation/17310bbb-e5bf-4901-8b6e-529577a280db\n", - "10.1371/annotation/1c419628-f1b5-45de-9f8a-43f834309ebb\n", - "10.1371/annotation/1dc00176-e096-4621-9494-2d848dac8262\n", - "10.1371/annotation/1e464689-3c86-4399-b229-1e00d65593a5\n", - "10.1371/annotation/1f110857-27d7-4e83-9eb3-4e5f51950a26\n", - "10.1371/annotation/21379809-1376-4250-b4c2-bf51eac58a98\n", - "10.1371/annotation/221e5f19-370e-4a52-add8-f882437bc85d\n", - "10.1371/annotation/230cca90-58e9-4aa1-b6b2-a1d744524fbd\n", - "10.1371/annotation/23bca9d0-f934-400e-8bb9-f5ff07f9e625\n", - "10.1371/annotation/270b432d-50ec-41f1-ad4d-ddd9f51f62a5\n", - "10.1371/annotation/2b218d50-a9d5-45b2-80d0-0e806e530749\n", - "10.1371/annotation/2c275a1b-2d36-4492-b36a-192bddf14f78\n", - "10.1371/annotation/2ca25d9c-7347-4b09-bd7a-09d6d37ff322\n", - "10.1371/annotation/2f278ed8-d5e7-440a-9e49-c8d1df20d1f1\n", - "10.1371/annotation/31412345-fc86-4d67-b37c-93d42f5f0a59\n", - "10.1371/annotation/3265139d-64c7-4c4c-83d3-1e139031e7df\n", - "10.1371/annotation/34304231-e54b-4080-af70-6f957f32d552\n", - "10.1371/annotation/39b41d98-b117-41cf-b5de-b8486a67b1cd\n", - "10.1371/annotation/4290dfee-64fd-4157-89e3-8edbba912420\n", - "10.1371/annotation/44f67041-2f8e-42df-826a-82172ae05a22\n", - 
"10.1371/annotation/49257f53-8cb1-431b-be64-7b410598b845\n", - "10.1371/annotation/4993e0e2-c580-4547-90d8-3227b87e6ae9\n", - "10.1371/annotation/4a8d9f38-1d0d-4389-a284-9f2564e1ac0b\n", - "10.1371/annotation/4b9340db-455b-4e0d-86e5-b6783747111f\n", - "10.1371/annotation/4bb6b73b-b5bb-4143-9ec3-99c90b93f3ad\n", - "10.1371/annotation/4d6c4127-82e4-408d-af89-5f2e207d523b\n", - "10.1371/annotation/4f08219c-2d7b-4309-8351-d3fe2378993f\n", - "10.1371/annotation/5487e265-8175-47cb-b9a4-d85862a4a96f\n", - "10.1371/annotation/59bcbe81-eddd-46a4-90dc-88c1ea70df72\n", - "10.1371/annotation/5e0195b6-60b9-4c03-84ae-c6c31e625be1\n", - "10.1371/annotation/6130c605-086b-46af-8f6f-6c76b8eb9c84\n", - "10.1371/annotation/638b42e3-a351-4827-a612-17fe29b48e28\n", - "10.1371/annotation/677fdf34-651e-4dc8-a0be-d0d633237a85\n", - "10.1371/annotation/712bb339-6073-4e62-9f68-b285caedd913\n", - "10.1371/annotation/730cdfd0-78c5-48fc-a095-f633905ff2f0\n", - "10.1371/annotation/7645d066-aa98-45d6-8c3e-3a30d9e03e4d\n", - "10.1371/annotation/7e304601-fc5c-40fe-857c-d6ea894d1647\n", - "10.1371/annotation/7f73ed17-709e-4d7f-9aae-aab1f4a34985\n", - "10.1371/annotation/865eaad7-8547-49ac-a42d-47e9d0755bb3\n", - "10.1371/annotation/87e2a80b-3ed7-4ef9-96cb-1268d91b6366\n", - "10.1371/annotation/8941aee3-4bb8-42a0-b09a-e7c416beeef7\n", - "10.1371/annotation/8c6eaae4-72a7-460a-8b1a-f855731f3706\n", - "10.1371/annotation/8fa70b21-32e7-4ed3-b397-ab776b5bbf30\n", - "10.1371/annotation/9239a129-5677-43b0-8fe1-0c1e75e988df\n", - "10.1371/annotation/93141e7a-61f3-48bd-87bd-216b030d773d\n", - "10.1371/annotation/936a4359-1bf5-4c33-be7d-1468e75eaa8b\n", - "10.1371/annotation/93d63399-0e71-4a25-a45c-311910ee6da5\n", - "10.1371/annotation/9630862b-4676-4b82-9869-8d8fbb2a2e65\n", - "10.1371/annotation/974531b0-9da4-4575-b3d1-955b0163fde0\n", - "10.1371/annotation/98908e14-e9fd-458f-9cea-ba4bec139f20\n", - "10.1371/annotation/b03fbc42-8f70-4873-9cce-854e48249a13\n", - 
"10.1371/annotation/b0e62f4f-812f-40b1-aef8-365b229eb2cf\n", - "10.1371/annotation/b4e623eb-4950-48d9-8d85-8d70426d95a3\n", - "10.1371/annotation/b60d4ec5-4c6f-43ab-9f63-322e3cd59636\n", - "10.1371/annotation/bae9fc08-fbfa-45b5-9d1d-0b8254d6efd5\n", - "10.1371/annotation/bc97a85c-1ecd-4cd8-ab61-0aef01f949a1\n", - "10.1371/annotation/c066bb84-13ea-4b36-a481-f149df8ce929\n", - "10.1371/annotation/c313df3a-52bd-4cbe-af14-6676480d1a43\n", - "10.1371/annotation/c81daa7c-5375-4349-970b-c63d288947eb\n", - "10.1371/annotation/caf130c3-5026-41cd-9dda-5eac7c0f016f\n", - "10.1371/annotation/d271d9c1-5588-4b43-85c3-d3de58ab61a4\n", - "10.1371/annotation/dfa05103-fc65-4f07-b30f-72a6e91613ff\n", - "10.1371/annotation/ea14adcb-033d-492d-8f8b-e047aa080cd4\n", - "10.1371/annotation/ebea4bd5-2b96-4842-b110-2f7c156e5060\n", - "10.1371/annotation/eff6e471-306a-41bd-88e3-13857af094af\n", - "10.1371/annotation/f016476b-5b84-4c9a-899f-fe8b8bc927b5\n", - "10.1371/annotation/f216b2b0-ab6b-45d8-b6ba-134a477b79b7\n", - "10.1371/annotation/f32bc670-c9cf-4bb0-9376-cd8cfd1053c1\n", - "10.1371/annotation/f8605b0a-d01c-41aa-ac9b-b605d7903a28\n", - "10.1371/annotation/f9660803-198b-4d0d-8200-719a2eb2a443\n", - "10.1371/annotation/fcca88ac-d684-46e0-a483-62af67e777bd\n", - "10.1371/annotation/fd9f9796-b42d-480d-b9f4-0adfbb919148\n", - "10.1371/annotation/fddd2ff3-c991-4c2f-8b84-a27eb20fba91\n", - "10.1371/annotation/ff089043-990a-48c2-a90f-15606c11cc98\n", - "10.1371/journal.pcbi.1005632\n", - "10.1371/journal.pcbi.1005676\n", - "10.1371/journal.pcbi.1005677\n", - "10.1371/journal.pcbi.1005692\n", - "10.1371/journal.pgen.1006910\n", - "10.1371/journal.pone.0181246\n", - "10.1371/journal.pone.0182517\n", - "10.1371/journal.ppat.1006535\n", - "10.1371/journal.ppat.1006543 \n", - "\n" - ] - } - ], - "source": [ - "missing_pmc_articles = process_missing_pmc_articles(pmc_articles=pmc_articles, plos_articles=plos_articles)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 
Save lists of missing articles to text files if needed" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "with open('missing_plos_articles.txt', 'w') as file:\n", - " for item in sorted(set(missing_plos_articles)):\n", - " file.write(\"%s\\n\" % item)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "code_folding": [], - "collapsed": true - }, - "outputs": [], - "source": [ - "with open('missing_pmc_articles.txt', 'w') as file:\n", - " for item in sorted(set(missing_pmc_articles)):\n", - " file.write(\"%s\\n\" % item)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "py3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.2" - }, - "toc": { - "nav_menu": { - "height": "174px", - "width": "252px" - }, - "number_sections": true, - "sideBar": true, - "skip_h1_title": false, - "toc_cell": false, - "toc_position": {}, - "toc_section_display": "block", - "toc_window_display": false - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/allofplos/PLOS Medicine Colleen and NLP.ipynb b/allofplos/PLOS Medicine Colleen and NLP.ipynb deleted file mode 100644 index 1edb75ae..00000000 --- a/allofplos/PLOS Medicine Colleen and NLP.ipynb +++ /dev/null @@ -1,116 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import datetime\n", - "import lxml.etree as et\n", - "import csv\n", - "pmcdir = 'pmc_articles'\n", - "from plos_corpus import (corpusdir, get_article_pubdate, check_if_uncorrected_proof, listdir_nohidden,\n", - " get_article_xml, file_to_doi, doi_to_file, get_all_solr_dois, 
download_check_and_move)\n", - "\n", - "from samples.corpus_analysis import (get_plos_article_type, get_article_dtd, get_random_list_of_dois, \n", - " get_related_retraction_article, check_article_type, get_plos_journal,\n", - " get_article_title, parse_article_date, get_corpus_metadata,\n", - " get_article_abstract, corpus_metadata_to_csv, get_article_dates,\n", - " read_corpus_metadata_from_csv)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This notebook is for tracking Colleen Crangle's requests for test corpuses for NLP. First up is all PLOS ONE articles for which \"diabet\" (for diabetes, diabetic, etc) appears in the abstract." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "one_list = [article for article in listdir_nohidden(corpusdir) if 'pone' in article]\n", - "\n", - "def assemble_diabetes_corpus(article_list):\n", - " \"\"\"\n", - " Find all PLOS ONE articles that say something about diabetes or technology in the abstract.\n", - " \"\"\"\n", - " diabetes_article_list = [article for article in article_list if 'diabet' in get_article_abstract(article).lower()]\n", - " return diabetes_article_list" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "diabetes_article_list = assemble_diabetes_corpus(one_list)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "diabetes_metadata = get_corpus_metadata(article_list=diabetes_article_list)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "corpus_metadata_to_csv(diabetes_metadata)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "py3", - "language": "python", - "name": "python3" - }, - "language_info": { 
- "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.3" - }, - "toc": { - "nav_menu": { - "height": "12px", - "width": "252px" - }, - "number_sections": true, - "sideBar": true, - "skip_h1_title": false, - "toc_cell": false, - "toc_position": {}, - "toc_section_display": "block", - "toc_window_display": false - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/allofplos/PLOS Medicine and NLP.ipynb b/allofplos/PLOS Medicine and NLP.ipynb deleted file mode 100644 index b0c90608..00000000 --- a/allofplos/PLOS Medicine and NLP.ipynb +++ /dev/null @@ -1,42 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "py3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.3" - }, - "toc": { - "nav_menu": {}, - "number_sections": true, - "sideBar": true, - "skip_h1_title": false, - "toc_cell": false, - "toc_position": {}, - "toc_section_display": "block", - "toc_window_display": false - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/allofplos/Production team investigates.ipynb b/allofplos/Production team investigates.ipynb deleted file mode 100644 index 6213b793..00000000 --- a/allofplos/Production team investigates.ipynb +++ /dev/null @@ -1,647 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "from plos_corpus import *\n", - "from samples.corpus_analysis import *\n", - "corpusdir_prod = '../../allofplos/allofplos/allofplos_xml/'" - 
] - }, - { - "cell_type": "markdown", - "metadata": { - "heading_collapsed": true - }, - "source": [ - "# Q: Are annotation DOIs resolving correctly?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "hidden": true - }, - "outputs": [], - "source": [ - "def make_annotation_dict(save_output=True):\n", - " \"\"\"\n", - " For every article file whose DOI contains the word \"annotation\", check whether its DOI resolves correctly\n", - " by creating a dictionary of the resolution status.\n", - " :return: dictionary where each key is a DOI, each value is associated resolution of that DOI via doi.org.\n", - " :param save_output: exports dictionary to csv\n", - " \"\"\"\n", - " dois = [file_to_doi(file) for file in listdir_nohidden(corpusdir)]\n", - " annotation_list = [x for x in dois if x.startswith('10.1371/annotation')]\n", - " anno_dict = {doi: check_if_doi_resolves(doi) for doi in annotation_list}\n", - " \n", - " if save_output:\n", - " with open('annotations.csv', 'w') as f:\n", - " writer = csv.writer(f)\n", - " writer.writerow(['DOI', 'Resolution'])\n", - " for key, value in anno_dict.items():\n", - " writer.writerow([key, value])\n", - "\n", - " return anno_dict" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "hidden": true - }, - "outputs": [], - "source": [ - "# run this\n", - "make_annotation_dict()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": true, - "heading_collapsed": true - }, - "source": [ - "# Q: Which `` elements follow a certain pattern?" 
- ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "hidden": true - }, - "outputs": [], - "source": [ - "def get_tina_test_set():\n", - " \"\"\"\n", - " Return a list of DOIs good for Tina's function\n", - " \"\"\"\n", - " random_list_of_dois = get_random_list_of_dois(count=10)\n", - " random_list_of_articles = [doi_to_path(doi) for doi in random_list_of_dois if 'annotation' not in doi]\n", - " search_1_dois = ('10.1371/journal.pmed.1002035', '10.1371/journal.pone.0047559', '10.1371/journal.pone.0047944')\n", - " search_1_articles = [doi_to_path(doi) for doi in search_1_dois]\n", - " search_test_set = list(set(random_list_of_articles + search_1_articles))\n", - " return search_test_set\n", - "\n", - "def find_contrib_pattern(article_list=None, csv=True):\n", - " \"\"\"\n", - " Three separate searches would be most helpful:\n", - " Search #1: Find all articles where a element contains an element. \n", - " Example: pmed.1002035, pone.0047559, and pone.0047944 should all be found by this search.\n", - " Search #2: Find all articles where a element that contains an element is\n", - " immediately followed by element that contains a element.\n", - " Example: pone.0047559 and pone.0047944 should both be found by this search, but not pmed.1002035.\n", - " Search #3: Find all articles where a element that contains an element is\n", - " immediately followed by element that contains a element that contains a .\n", - " Example: pone.0047944 should be found by this search, but not pmed.1002035 or pone.0047559.)\n", - " To test this function, use get_tina_test_set() to run on a subset of articles\n", - " \"\"\"\n", - " if article_list is None:\n", - " article_list = listdir_nohidden(corpusdir)\n", - "\n", - " search_1_results = []\n", - " search_2_results = []\n", - " search_3_results = []\n", - "\n", - " for article_file in article_list:\n", - " tag_path_elements = ('/',\n", - " 'article',\n", - " 'front',\n", - " 'article-meta')\n", - " article_xml = 
get_articleXML_content(article_file, tag_path_elements=tag_path_elements)\n", - " meta_categories = article_xml[0].getchildren()\n", - " contrib_groups = [category for category in meta_categories if category.tag == 'contrib-group']\n", - " for contrib_group in contrib_groups:\n", - " for contributor in contrib_group:\n", - " for element in contributor:\n", - " if element.tag == 'on-behalf-of':\n", - " search_1_results.append(filename_to_doi(article_file))\n", - " next_element = contributor.getnext()\n", - " if next_element is not None:\n", - " for elem in next_element:\n", - " if elem.tag == 'collab':\n", - " search_2_results.append(filename_to_doi(article_file))\n", - " for subelem in elem:\n", - " if subelem.tag == 'contrib-group':\n", - " search_3_results.append(filename_to_doi(article_file))\n", - " break\n", - "\n", - " search_1_results = set(search_1_results)\n", - " search_2_results = set(search_2_results)\n", - " search_3_results = set(search_3_results)\n", - " search_results = list(set(search_1_results + search_2_results + search_3_results))\n", - " doi_results = []\n", - " for doi in search_results:\n", - " if doi in search_1_results:\n", - " s1 = 'yes'\n", - " else:\n", - " s1 = 'no'\n", - " if doi in search_2_results:\n", - " s2 = 'yes'\n", - " else:\n", - " s2 = 'no'\n", - " if doi in search_3_results:\n", - " s3 = 'yes'\n", - " else:\n", - " s3 = 'no'\n", - " doi_result = (doi, s1, s2, s3)\n", - " doi_results.append(doi_result)\n", - " if csv:\n", - " with open('search_results.csv', 'w') as f:\n", - " writer = csv.writer(f)\n", - " writer.writerow(['DOI', 'Search 1', 'Search 2', 'Search 3'])\n", - " for doi_result in sorted(doi_results):\n", - " writer.writerow(doi_result)\n", - " return doi_results" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "hidden": true - }, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'get_random_list_of_dois' is not defined", - "output_type": "error", - "traceback": [ - 
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# test this function\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mtest_list\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_tina_test_set\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mdoi_results\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfind_contrib_pattern\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marticle_list\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtest_list\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcsv\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m\u001b[0m in \u001b[0;36mget_tina_test_set\u001b[0;34m()\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mReturn\u001b[0m \u001b[0ma\u001b[0m \u001b[0mlist\u001b[0m \u001b[0mof\u001b[0m \u001b[0mDOIs\u001b[0m \u001b[0mgood\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mTina\u001b[0m\u001b[0;31m'\u001b[0m\u001b[0ms\u001b[0m \u001b[0mfunction\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \"\"\"\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mrandom_list_of_dois\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_random_list_of_dois\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcount\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0mrandom_list_of_articles\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mdoi_to_path\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdoi\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mdoi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrandom_list_of_dois\u001b[0m \u001b[0;32mif\u001b[0m 
\u001b[0;34m'annotation'\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mdoi\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0msearch_1_dois\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;34m'10.1371/journal.pmed.1002035'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'10.1371/journal.pone.0047559'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'10.1371/journal.pone.0047944'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mNameError\u001b[0m: name 'get_random_list_of_dois' is not defined" - ] - } - ], - "source": [ - "# test this function\n", - "test_list = get_tina_test_set()\n", - "doi_results = find_contrib_pattern(article_list=test_list, csv=False)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "hidden": true - }, - "outputs": [], - "source": [ - "print(doi_results)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "hidden": true - }, - "outputs": [], - "source": [ - "# run this function for real\n", - "doi_results = find_contrib_pattern()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "heading_collapsed": true - }, - "source": [ - "# Q: Which articles after 2015 have 2 or more corrections attached?" 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "hidden": true - }, - "outputs": [], - "source": [ - "corrections_article_list, corrected_article_list = get_corrected_article_list()\n", - "multiple_corrections = set([article for article in corrected_article_list\n", - " if corrected_article_list.count(article) > 1])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "hidden": true - }, - "outputs": [], - "source": [ - "multiple_corrections.remove('10.1371/journal.')\n", - "multiple_corrections_post_2015 = [article for article in multiple_corrections\n", - " if get_article_pubdate(doi_to_file(article)).year >= 2015]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "hidden": true - }, - "outputs": [], - "source": [ - "multiple_corrections_post_2015\n", - "with open('2_or_more_corrections.csv', 'w') as f:\n", - " writer = csv.writer(f)\n", - " writer.writerow(['DOI'])\n", - " for item in multiple_corrections_post_2015:\n", - " writer.writerow(item)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "heading_collapsed": true - }, - "source": [ - "# Q: Which articles have a series of table-wrap graphic elements?" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "hidden": true - }, - "outputs": [], - "source": [ - "example_doi = '10.1371/journal.pone.0068090'\n", - "search_1_file = 'xml_testing/Search-1_TRUE.xml'\n", - "search_2_file = 'xml_testing/Search-2_TRUE.xml'\n", - "intro_file = doi_to_path(example_doi, directory=corpusdir_prod)\n", - "search_1_fail_list = []\n", - "fail_file = doi_to_path('10.1371/journal.pone.0183466', directory=corpusdir_prod)\n", - "test_list = [fail_file, intro_file, search_1_file, search_2_file]\n", - "\n", - "def find_table_wraps(article):\n", - " \"\"\"\n", - " find all articles with a `table-wrap` element. 
of those, if there is no immediate sub-tag of\n", - " 'alternative' in table\n", - " \"\"\"\n", - " intro_condition = False\n", - " intro_condition_overall = False\n", - " search1_ids = []\n", - " search2_ids = []\n", - " alternative_graphic_ids = []\n", - "\n", - " article_tree = et.parse(article, parser=et.XMLParser(remove_comments=True)) # exclude commented-out tables\n", - " table_wraps = article_tree.findall('.//table-wrap')\n", - " if table_wraps:\n", - " for table_wrap in table_wraps:\n", - " table_parts = table_wrap.getchildren()\n", - " # intro condition 1: table-wrap element does not include a direct child of \n", - " alternatives_parts = [table_part for table_part in table_parts if 'alternatives' in table_part.tag]\n", - " if not alternatives_parts:\n", - " intro_condition_1 = True\n", - " else:\n", - " for table_part in alternatives_parts:\n", - " table_subparts = table_part.getchildren()\n", - " if all('graphic' not in table_subpart.tag for table_subpart in table_subparts):\n", - " intro_condition_1 = True\n", - " else:\n", - " intro_condition_1 = False\n", - " new_alternative_graphic_ids = [table_subpart.attrib['id'] for table_subpart in table_subparts if 'graphic' in table_subpart.tag]\n", - " alternative_graphic_ids.extend(new_alternative_graphic_ids)\n", - "\n", - " # intro condition 2: table-wrap element does not include a direct child of \n", - " if all('graphic' not in table_part.tag for table_part in table_parts):\n", - " intro_condition_2 = True\n", - " else:\n", - " intro_condition_2 = False\n", - " \n", - " if intro_condition_1 and intro_condition_2:\n", - " intro_condition = True\n", - " # keep track of articles that have any table match intro condition\n", - " intro_condition_overall = True\n", - "\n", - " if intro_condition:\n", - " graphics = table_wrap.findall('.//graphic')\n", - " if graphics:\n", - " new_search1_ids = [graphic.attrib['id'] for graphic in graphics]\n", - " search1_ids.extend(new_search1_ids)\n", - " inline_graphics = 
table_wrap.findall('.//inline-graphic')\n", - " if inline_graphics:\n", - " try:\n", - " search2_ids = [inline.attrib['id'] for inline in inline_graphics]\n", - " except KeyError:\n", - " print('{} has search 2 results but no ids: {}'.format(article, inline_graphics))\n", - " search2_ids = [inline.attrib for inline in inline_graphics]\n", - " else:\n", - " pass\n", - "\n", - " else:\n", - " return intro_condition_overall, bool(search1_ids), bool(search2_ids)\n", - "\n", - " if search1_ids and alternative_graphic_ids:\n", - " # exclude graphics elements that are already accounted for under an tag\n", - " search1_ids = [did for did in search1_ids if did not in alternative_graphic_ids]\n", - " if not search1_ids:\n", - " search1_ids = False\n", - " elif len(search1_ids) == 1:\n", - " search1_ids = search1_ids[0]\n", - " if not search2_ids:\n", - " search2_ids = False\n", - " elif len(search2_ids) == 1:\n", - " search2_ids = search2_ids[0]\n", - " return intro_condition_overall, search1_ids, search2_ids\n" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "code_folding": [], - "hidden": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "../../allofplos/allofplos/allofplos_xml/journal.pone.0183466.xml False False False\n", - "../../allofplos/allofplos/allofplos_xml/journal.pone.0068090.xml True False False\n", - "xml_testing/Search-1_TRUE.xml True pmed.1002397.e001g False\n", - "xml_testing/Search-2_TRUE.xml True False pmed.1002397.e001g\n" - ] - } - ], - "source": [ - "# testing the code\n", - "for article_file in test_list:\n", - " intro_condition, search1_ids, search2_ids = find_table_wraps(article_file)\n", - " print(article_file, intro_condition, search1_ids, search2_ids)" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "hidden": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "221852\n", - 
"['../../allofplos/allofplos/allofplos_xml/journal.ppat.1000896.xml', '../../allofplos/allofplos/allofplos_xml/journal.pone.0065590.xml', '../../allofplos/allofplos/allofplos_xml/journal.pone.0036030.xml', '../../allofplos/allofplos/allofplos_xml/journal.pone.0026652.xml', '../../allofplos/allofplos/allofplos_xml/journal.pone.0029438.xml', '../../allofplos/allofplos/allofplos_xml/journal.pgen.1000989.xml', '../../allofplos/allofplos/allofplos_xml/journal.pone.0089988.xml', '../../allofplos/allofplos/allofplos_xml/journal.pone.0015594.xml', '../../allofplos/allofplos/allofplos_xml/journal.pone.0149634.xml', '../../allofplos/allofplos/allofplos_xml/journal.pone.0000707.xml']\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " 12% ( 27042 of 221852) |############# | Elapsed Time: 0:04:33 ETA: 0:57:29" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "../../allofplos/allofplos/allofplos_xml/journal.pone.0002468.xml has search 2 results but no ids: [, , , , ]\n", - "../../allofplos/allofplos/allofplos_xml/journal.pone.0002468.xml has search 2 results but no ids: [, , , , ]\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " 52% (116850 of 221852) |######################################################## | Elapsed Time: 0:19:57 ETA: 0:18:13" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "../../allofplos/allofplos/allofplos_xml/journal.pone.0075851.xml has search 2 results but no ids: []\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100% (221852 of 221852) |##########################################################################################################| Elapsed Time: 0:38:14 Time: 0:38:14\n" - ] - } - ], - "source": [ - "# running over entire corpus, randomized, with a progressbar\n", - "import progressbar\n", - "from random import shuffle\n", - "\n", - "table_results = []\n", - "file_list = 
listdir_nohidden(corpusdir_prod)\n", - "shuffle(file_list)\n", - "\n", - "bar = progressbar.ProgressBar(redirect_stdout=True, max_value=len(file_list))\n", - "for i, article_file in enumerate(file_list):\n", - " intro_condition, search1_ids, search2_ids = find_table_wraps(article_file)\n", - " if intro_condition:\n", - " result = [filename_to_doi(article_file), search1_ids, search2_ids]\n", - " table_results.append(result)\n", - " bar.update(i+1)\n", - "bar.finish()\n", - "\n", - "# print(table_results)\n", - "with open('table_graphics_search_results.csv', 'w') as f:\n", - " writer = csv.writer(f)\n", - " writer.writerow(['DOI', 'Search 1', 'Search 2'])\n", - " for doi_result in sorted(table_results):\n", - " writer.writerow(doi_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": true, - "heading_collapsed": true - }, - "source": [ - "# Which Aperta articles have a group collaboration contributor element?" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "hidden": true - }, - "source": [ - "## Example: 10.1371/journal.pmed.1002170\n", - "\n", - "\n", - "International Ebola Response Team\n", - "\n", - "\n", - "\n", - "\n", - "

\n", - "¶ The International Ebola Response Team comprises the authors listed in this article in alphabetical order\n", - "

\n", - "
" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": { - "collapsed": true, - "hidden": true - }, - "outputs": [], - "source": [ - " def get_article_collab(doi, corpusdir=corpusdir_prod):\n", - " \"\"\"\n", - " For a given PLOS article, see if there is a collaborator group in the authors list. Print data if so\n", - " :return: tuple of doi, collaborators, and the footnote number if so\n", - " \"\"\"\n", - " tag_path_elements = ('/',\n", - " 'article',\n", - " 'front',\n", - " 'article-meta')\n", - " article_xml = get_article_xml(doi_to_file(doi, directory=corpusdir), tag_path_elements=tag_path_elements)\n", - " meta_categories = article_xml[0].getchildren()\n", - " contrib_groups = [category for category in meta_categories if category.tag == 'contrib-group']\n", - " collab = False\n", - " rid = ''\n", - " footnote = False\n", - " collab_tuple = ''\n", - " try:\n", - " for contrib_group in contrib_groups:\n", - " for contrib in contrib_group:\n", - " if contrib.attrib['contrib-type'] == 'author':\n", - " for child in contrib:\n", - " if child.tag == \"collab\":\n", - " collab = True\n", - " collaborators = child.text\n", - " continue\n", - " if child.tag == 'role':\n", - " continue\n", - " elif child.tag == 'xref':\n", - " rid = (child.attrib['rid'])\n", - " if collab and rid:\n", - " break\n", - "\n", - " except IndexError:\n", - " print('No authors found for {}'.format(doi))\n", - " return False\n", - "\n", - " if collab and rid:\n", - " tag_path_elements = ('/',\n", - " 'article',\n", - " 'front',\n", - " 'article-meta',\n", - " 'author-notes')\n", - "\n", - " article_xml = get_article_xml(doi_to_file(doi, directory=corpusdir), tag_path_elements=tag_path_elements)\n", - " notes = article_xml[0].getchildren()\n", - " for note in notes:\n", - " if note.tag == 'fn' and rid in note.attrib.values():\n", - " footnote = True\n", - " if footnote is False:\n", - " print('footnote not found for {}'.format(doi))\n", - "\n", - " collab_tuple = (doi, 
collaborators, rid)\n", - "\n", - " elif collab:\n", - " print('rid not found for {}'.format(doi))\n", - "\n", - " if collab_tuple:\n", - " print(collab_tuple)\n", - "\n", - " return collab_tuple" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": { - "collapsed": true, - "hidden": true - }, - "outputs": [], - "source": [ - "# Restrict to PLOS Biology Aperta articles\n", - "article_list = [article for article in listdir_nohidden(corpusdir_prod) if 'pbio.2' in article] \n", - "doi_list = [file_to_doi(article) for article in article_list]\n", - "doi_list.append('10.1371/journal.pmed.1002170')" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": { - "hidden": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "('10.1371/journal.pbio.2001069', 'CycliX consortium', 'fn001')\n", - "('10.1371/journal.pbio.2001855', 'BEEHIVE collaboration', 'fn001')\n", - "('10.1371/journal.pmed.1002170', 'International Ebola Response Team', 'fn001')\n" - ] - } - ], - "source": [ - "for doi in doi_list:\n", - " get_article_collab(doi)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "py3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.3" - }, - "toc": { - "nav_menu": { - "height": "12px", - "width": "252px" - }, - "number_sections": true, - "sideBar": true, - "skip_h1_title": false, - "toc_cell": false, - "toc_position": {}, - "toc_section_display": "block", - "toc_window_display": false - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/allofplos/csvfile.ipynb b/allofplos/csvfile.ipynb deleted file mode 100644 index 0d0d79c3..00000000 --- a/allofplos/csvfile.ipynb +++ /dev/null @@ -1,288 +0,0 @@ -{ - "cells": [ - { - "cell_type": 
"code", - "execution_count": 3, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "import os\n", - "import datetime\n", - "import lxml.etree as et\n", - "import csv\n", - "pmcdir = 'pmc_articles'\n", - "from plos_corpus import (corpusdir, get_article_pubdate, check_if_uncorrected_proof, listdir_nohidden,\n", - " get_article_xml, file_to_doi, doi_to_file, get_all_solr_dois, download_check_and_move)\n", - "\n", - "from samples.corpus_analysis import (get_plos_article_type, get_article_dtd, get_random_list_of_dois, \n", - " get_related_retraction_article, check_article_type, get_plos_journal,\n", - " get_article_title, parse_article_date, get_corpus_metadata,\n", - " get_article_abstract, corpus_metadata_to_csv, get_article_dates,\n", - " read_corpus_metadata_from_csv)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "corpus_metadata, wrong_dates = get_corpus_metadata(article_list=article_list)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - " 27% (60403 of 221314) |#### | Elapsed Time: 0:33:25 ETA: 1:40:03" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Error getting history dates for allofplos_xml/journal.pone.0034143.xml\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " 46% (103042 of 221314) |####### | Elapsed Time: 0:57:26 ETA: 1:01:12" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Error parsing DTD from allofplos_xml/journal.pone.0076809.xml\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " 79% (176482 of 221314) |############# | Elapsed Time: 1:37:52 ETA: 0:25:51" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "No abstract found for research article 10.1371/journal.pone.0150341\n" - ] - }, - { - 
"name": "stderr", - "output_type": "stream", - "text": [ - " 84% (186376 of 221314) |############## | Elapsed Time: 1:43:42 ETA: 0:20:36" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "No abstract found for research article 10.1371/journal.pone.0160248\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " 85% (189962 of 221314) |############## | Elapsed Time: 1:45:51 ETA: 0:18:34" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "No abstract found for research article 10.1371/journal.pone.0163841\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " 90% (199535 of 221314) |############### | Elapsed Time: 1:51:29 ETA: 0:12:43" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "No abstract found for research article 10.1371/journal.pone.0173427\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " 90% (200365 of 221314) |############### | Elapsed Time: 1:51:57 ETA: 0:12:10" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "No abstract found for research article 10.1371/journal.pone.0174259\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " 94% (210201 of 221314) |################ | Elapsed Time: 1:57:46 ETA: 0:06:38" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "No abstract found for research article 10.1371/journal.pone.0184204\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " 96% (212865 of 221314) |################ | Elapsed Time: 1:58:41 ETA: 0:04:23" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Error parsing article body: allofplos_xml/journal.pone.correction.5fbbf39a-fb47-4ce1-8069-acd830b3d41f.xml\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100% (221314 of 221314) |#################| Elapsed Time: 2:02:36 Time: 2:02:36\n" - ] - } - ], - 
"source": [ - "article_list = listdir_nohidden(corpusdir)\n", - "corpus_metadata_to_csv(article_list=article_list)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "#to execute: update corpus_metadata with a list of DOIs. check that functions can handle an overlapping list, and make\n", - "# sure that appending is working correctly. there were some errors reading in a csv that had been appended/extended." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "corpus_metadata_reconstructed = read_corpus_metadata_from_csv()" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "('10.1371/journal.pbio.0000001',\n", - " 'journal.pbio.0000001',\n", - " 'A Functional Analysis of the Spacer of V(D)J Recombination Signal Sequences',\n", - " 'PLOS Biology',\n", - " 'research-article',\n", - " 'Research Article',\n", - " 'NLM 3.0',\n", - " '2003-10-13',\n", - " '2003-06-01',\n", - " '2003-07-10',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '9942',\n", - " '',\n", - " 'During lymphocyte development, V(D)J recombination assembles antigen receptor genes from component V, D, and J gene segments. These gene segments are flanked by a recombination signal sequence (RSS), which serves as the binding site for the recombination machinery. The murine Jβ2.6 gene segment is a recombinationally inactive pseudogene, but examination of its RSS reveals no obvious reason for its failure to recombine. Mutagenesis of the Jβ2.6 RSS demonstrates that the sequences of the heptamer, nonamer, and spacer are all important. Strikingly, changes solely in the spacer sequence can result in dramatic differences in the level of recombination. 
The subsequent analysis of a library of more than 4,000 spacer variants revealed that spacer residues of particular functional importance are correlated with their degree of conservation. Biochemical assays indicate distinct cooperation between the spacer and heptamer/nonamer along each step of the reaction pathway. The results suggest that the spacer serves not only to ensure the appropriate distance between the heptamer and nonamer but also regulates RSS activity by providing additional RAG:RSS interaction surfaces. We conclude that while RSSs are defined by a “digital” requirement for absolutely conserved nucleotides, the quality of RSS function is determined in an “analog” manner by numerous complex interactions between the RAG proteins and the less-well conserved nucleotides in the heptamer, the nonamer, and, importantly, the spacer. Those modulatory effects are accurately predicted by a new computational algorithm for “RSS information content.” The interplay between such binary and multiplicative modes of interactions provides a general model for analyzing protein–DNA interactions in various biological systems.')" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "corpus_metadata_reconstructed[0]" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.2" - }, - "toc": { - "colors": { - "hover_highlight": "#DAA520", - "navigate_num": "#000000", - "navigate_text": "#333333", - "running_highlight": "#FF0000", - "selected_highlight": "#FFD700", - "sidebar_border": "#EEEEEE", - "wrapper_background": "#FFFFFF" - }, - "moveMenuLeft": true, - "nav_menu": { - "height": "12px", - "width": "252px" - 
}, - "navigate_menu": true, - "number_sections": true, - "sideBar": true, - "skip_h1_title": false, - "threshold": 4, - "toc_cell": false, - "toc_position": {}, - "toc_section_display": "block", - "toc_window_display": false, - "widenNotebook": false - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/allofplos/file_rename.py b/allofplos/file_rename.py deleted file mode 100644 index 86e2e3c9..00000000 --- a/allofplos/file_rename.py +++ /dev/null @@ -1,18 +0,0 @@ -import os -import re - -from plos_corpus import listdir_nohidden, corpusdir -from plos_regex import validate_file - -annotation_articles = [article for article in listdir_nohidden(corpusdir) if 'correction' in article] - -for article in annotation_articles: - count = 0 - parts = re.split('\/|\.', article) - new_filename = os.path.join(corpusdir, 'plos.correction.' + parts[-2] + '.xml') - if validate_file(new_filename) and new_filename != article: - os.rename(article, new_filename) - count += 1 - else: - pass -print('{} files renamed'.format(count)) diff --git a/allofplos/plos_pmc.py b/allofplos/plos_pmc.py deleted file mode 100644 index 46bd8ba9..00000000 --- a/allofplos/plos_pmc.py +++ /dev/null @@ -1,583 +0,0 @@ -""" Small stand-alone script for getting all the PMC IDs for PLOS articles. 
-""" - -import requests -import time -import datetime -from glob import glob -from shutil import move, rmtree - -import lxml.etree as et -from download import download - -from plos_corpus import (listdir_nohidden, extract_filenames, check_article_type, get_article_xml, - get_related_article_doi, download_updated_xml, unzip_articles, get_all_solr_dois, - file_to_doi, doi_to_file, check_if_uncorrected_proof, newarticledir, get_article_pubdate, - compare_article_pubdate) -from plos_regex import (regex_match_prefix, regex_body_match, regex_body_currents, full_doi_regex_match, - full_doi_regex_search, currents_doi_regex, validate_doi, validate_file, - validate_url, find_valid_dois, show_invalid_dois, currents_doi_filter) - - -newpmcarticledir = "new_pmc_articles" -pmc_csv = 'doi_to_pmc.csv' -pmcdir = "pmc_articles/" -# xml URL takes PMC identifier minus 'PMC' -pmc_xml_url = 'https://www.ncbi.nlm.nih.gov/pmc/oai/oai.cgi?verb=GetRecord&identifier=oai:pubmedcentral.nih.gov:' -pmc_xml_url_suffix = '&metadataPrefix=pmc' - -# can query up to 200 DOIs from PMC -USER_EMAIL = 'elizabeth.seiver@gmail.com' -pmc_doi_query_url = 'https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?tool=corpustest&email={0}&ids='.format(USER_EMAIL) -pmc_doi_query_url_suffix = '&versions=no&format=json' -pmc_pmcid_query_url = 'https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi?id=' -pmc_allplos_query_url = ('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pmc&term=' - '(((((((("PLoS+ONE"[Journal])+OR+"PLoS+Genetics"[Journal])+OR+"PLoS+Pathogens"[Journal])' - 'OR+"PLoS+Neglected+Tropical+Diseases"[Journal])+OR+"PLoS+Computational+Biology"[Journal])' - 'OR+"PLoS+Biology"[Journal])+OR+"PLoS+Medicine"[Journal])+OR+"plos+currents"[Journal])' - '+OR+"PLoS+Clinical+Trials"[Journal])&retmax=1000&retmode=json&tool=corpustest' - '&email={0}'.format(USER_EMAIL)) -PMC_FTP_URL = 'ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/' -pmc_file_list = 'oa_file_list.txt' -newpmcarticledir = "new_pmc_articles" - - -def 
get_all_pmc_dois(retstart=0, retmax=80000, count=None): - """Query the entrez database to get a comprehensive list of all PMCIDs associated with all PLOS journals, - individually included in the search url. - Supposedly can return 100,000, but based on the maximum not working for another function, lowered to 80K to be safe. - :param restart: the first record to return - :param retmax: the maximum number of records to return - :return: the full list of PMCIDs in PMC for PLOS articles - """ - pmc_allplos_query_url = ('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pmc&term=' - '(((((("PLoS+ONE"[Journal])+OR+"PLoS+Genetics"[Journal])+OR+"PLoS+Pathogens"[Journal])' - 'OR+"PLoS+Neglected+Tropical+Diseases"[Journal])+OR+"PLoS+Computational+Biology"[Journal])' - 'OR+"PLoS+Biology"[Journal])+OR+"PLoS+Medicine"[Journal]+OR+"plos+currents"[Journal]' - '&retmode=json&tool=corpustest&email=email@provider.com') - - pmcidlist = [] - r = requests.get(pmc_allplos_query_url).json() - if count is None: - count = int(r['esearchresult']['count']) - print(count, "articles found in PMC") - while retstart < count: - query = pmc_allplos_query_url + '&retstart={0}&retmax={1}'.format(retstart, retmax) - r = requests.get(query).json() - idlist = r['esearchresult']['idlist'] - for id in idlist: - pmcidlist.append('PMC' + id) - retstart += retmax - time.sleep(1) - pmcidlist = sorted(list(set(pmcidlist))) - - print(len(pmcidlist), "articles found") - return pmcidlist - - -def get_articles_by_doi_field(directory=pmcdir, article_list=None, check_new=True): - doi_to_pmc = {} - if directory == pmcdir and article_list is None: - article_list = get_pmc_articles() - elif article_list is None: - article_list = listdir_nohidden(directory) - if article_list == 0: - article_list = listdir_nohidden(directory, extension='.nxml') - - if directory != pmcdir: - for article in article_list: - doi = get_article_doi(article_file=article) - doi_to_pmc[doi] = article - else: - try: - # read 
doi_to_pmc dict from csv - with open(pmc_csv, 'r') as csv_file: - reader = csv.reader(csv_file) - next(reader, None) - doi_to_pmc = dict(reader) - - scratch = False - n = 0 - if check_new: - for article in article_list: - if article not in doi_to_pmc.values(): - doi = get_article_doi(article) - doi_to_pmc[doi] = os.path.basename(article).rstrip('.nxml').rstrip('.xml') - n = n + 1 - if n: - print(n, 'DOI/PMCID pairs added to dictionary.') - - except FileNotFoundError: - print('Creating doi_to_pmc dictionary from scratch.') - scratch = True - n = 0 - file_list = listdir_nohidden(pmcdir, extension='.nxml') - doi_to_pmc = {get_article_doi(pmc_file): os.path.basename(pmc_file).rstrip('.nxml') for pmc_file in file_list} - # write doi_to_pmc dict to csv - if scratch or n > 0: - with open(pmc_csv, 'w') as f: - writer = csv.writer(f) - writer.writerow(['DOI', 'PMC ID']) - for key, value in doi_to_pmc: - writer.writerow([key, value]) - print('DOI, PMC ID list exported to', pmc_csv) - - return doi_to_pmc - - -def get_pmc_doi_dict(doi_list, chunk_size=150): - '''Using the PMC ID query API, return the accompanying PMCID for each DOI in a given list. - Can (ostensibly) query up to 200 DOIs at a time but sometimes that doesn't work. 
- :param doi list: a list of valid PLOS DOIs - :param chunk_size: number of DOIs to query at a single time - :return: tuple of dictionary mapping DOI to PMCID, list of DOIs not found in PMC - ''' - - doi_to_pmc = {} - dois_not_in_pmc = [] - # Make chunks of 200 DOIs at a time - list_chunks = [doi_list[x:x+chunk_size] for x in range(0, len(doi_list), chunk_size)] - for chunk in list_chunks: - pmc_doi_string = ','.join(chunk) - # Create the search URL - pmc_doi_query = pmc_doi_query_url + pmc_doi_string - # Parse the results & create dict entry for each result - pmc_response = requests.get(pmc_doi_query) - if pmc_response.status_code == 500: - print('Error for DOI chunk; retry with smaller chunk size') - else: - pmc_results = et.XML(pmc_response.content) - pmc_results = pmc_results.getchildren()[1:] # exclude echo header - for result in pmc_results: - doi = result.attrib['doi'] - try: - pmcid = result.attrib['pmcid'] - doi_to_pmc[doi] = pmcid - except KeyError: - if result.attrib['status'] == 'error': - dois_not_in_pmc.append(doi) - else: - print('Weird error for', doi) - time.sleep(1) - return doi_to_pmc, dois_not_in_pmc - - -def get_pmc_articles(): - """ - :return: a list of all article files in PMC folder - """ - # step 1: download tarball file if needed - pmc_url = 'ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/articles.O-Z.xml.tar.gz' - pmcdir = 'pmc_articles/' - pmc_local_tar = 'pmc_files.tar.gz' - pmc_path = os.path.join(pmcdir, pmc_local_tar) - if os.path.isdir(pmcdir) is False: - os.mkdir(pmcdir) - print('Creating folder for PMC article xml') - - if len([name for name in os.listdir(pmcdir) if os.path.isfile(os.path.join(pmcdir, name))]) < 200000: - print('Not enough articles in pmcdir, re-downloading zip file') - path = download(pmc_url, pmc_path) - - # Step 2: unzip archive - unzip_articles(file_path=pmc_path, extract_directory=pmcdir, filetype='tar') - - # Step 3: delete non-PLOS folders - listdirs = glob("pmc_articles/*/") - print(len(listdirs), "folders for all O-Z 
journals") - for directory in list(listdirs): - if directory.lower().startswith('pmc_articles/plos') is False: - rmtree(directory) - listdirs.remove(directory) - print(len(listdirs), "folders remaining for PLOS journals") - - # Step 4: put all PLOS articles in higher level pmcdir folder & flatten hierarchy - root = pmcdir - print("moving PMC articles to top-level folder") - for dirrr in list(listdirs): - files = [f for dp, dn, filenames in os.walk(dirrr) for f in filenames if os.path.splitext(f)[1] == '.nxml'] - for file in files: - move(join(dirrr, file), join(root, file)) - rmtree(dirrr) - pmc_articles = listdir_nohidden(pmcdir, extension='.nxml') - - return pmc_articles - - -def get_pmc_doi_dict(id_list=None, chunk_size=150): - ''' - Using the PMC ID query API, return the accompanying PMCID for each identifier in a given list. - Can (ostensibly) query up to 200 identifiers at a time. Can accept lists of DOIs or PMC IDs - :return: tuple of dictionary mapping DOI to PMCID, list of DOIs not found in PMC - ''' - if id_list is None: - id_list = extract_filenames(pmcdir, extension='.nxml') - doi_to_pmc = {} - dois_not_in_pmc = [] - # Make chunks of 200 DOIs at a time - list_chunks = [id_list[x:x+chunk_size] for x in range(0, len(id_list), chunk_size)] - for chunk in list_chunks: - pmc_doi_string = ','.join(chunk) - # Create the search URL - pmc_doi_query = pmc_doi_query_url + pmc_doi_string - # Parse the results & create dict entry for each result - pmc_response = requests.get(pmc_doi_query) - if pmc_response.status_code == 500: - print('Error for DOI chunk; retry with smaller chunk size') - else: - pmc_results = et.XML(pmc_response.content) - pmc_results = pmc_results.getchildren()[1:] # exclude echo header - for result in pmc_results: - doi = result.attrib['doi'] - try: - pmcid = result.attrib['pmcid'] - doi_to_pmc[doi] = pmcid - except KeyError: - if result.attrib['status'] == 'error': - dois_not_in_pmc.append(doi) - else: - print('Weird error for', doi) - 
time.sleep(1) - return doi_to_pmc, dois_not_in_pmc - - -def update_pmc_dict_by_doi(id_list): - ''' - With a list of identifiers, query PMC ID service to check for PMCIDs for articles. Print to .csv - :return: tuple of full dictionary of DOIs to PMC IDs, DOIs without matching PMCIDs - ''' - doi_to_pmc = get_articles_by_doi_field(check_new=False) - doi_to_pmc2, dois_not_in_pmc = get_pmc_doi_dict(id_list) - full_pmc_dict = {**doi_to_pmc2, **doi_to_pmc} - with open(pmc_csv, 'w') as file: - writer = csv.writer(file) - writer.writerow(['DOI', 'PMC ID']) - for key, value in full_pmc_dict.items(): - writer.writerow([key, value]) - return full_pmc_dict, dois_not_in_pmc - - -def exclude_recent_dois(doi_list): - ''' - For arriving at a list of DOIs ostensibly missing from PMC, remove the most recent articles - which likely have not yet had the opportunity to propagate. - :return: a list of missing DOIs which are old enough to be expected to be on PMC. - ''' - missing_pmc_articles = [] - for doi in doi_list: - article_file = doi_to_file(doi) - if compare_article_pubdate(article_file): - missing_pmc_articles.append(doi) - return missing_pmc_articles - - -def process_missing_plos_articles(plos_articles=None, pmc_articles=None): - ''' - For sets of PLOS's corpus from PMC and PLOS, see which article are missing from PLOS's version - of the Corpus by removing Currents articles, checking if articles are live on journals.plos.org, - and checking that the DOIs resolve. Prints the different kinds of errors that can occur. 
- :return: list of missing articles - ''' - if plos_articles is None or not plos_articles: - plos_articles = get_all_plos_dois() - if pmc_articles is None or not pmc_articles: - doi_to_pmc = get_articles_by_doi_field(check_new=False) - pmc_articles = list(doi_to_pmc.keys()) - missing_plos_articles = list(set(pmc_articles) - set(plos_articles)) - - # remove Currents articles - for article in missing_plos_articles: - if article.startswith('10.1371/currents') or \ - len(article) == 21 or \ - article == '10.1371/198d344bc40a75f927c9bc5024279815': - missing_plos_articles.remove(article) - - # check if articles are live on journals.plos.org - # check if DOIs resolve - missing_articles_link_works = [] - missing_articles_404_error = [] - doi_works = [] - doi_doesnt_work = [] - doi_mismatch = [] - doi_has_space = [] - for doi in missing_plos_articles: - if ' ' in doi: - doi_has_space.append(doi) - continue - doi_check = check_if_doi_resolves(doi) - if doi_check == 'works': - doi_works.append(doi) - elif doi_check == "doesn't work": - doi_doesnt_work.append(doi) - else: - doi_mismatch.append(doi) - continue - url = doi_to_url(doi) - article_exists = check_if_link_works(url) - if article_exists: - missing_articles_link_works.append(doi) - else: - missing_articles_404_error.append(doi) - - doi_mismatch = sorted(doi_mismatch) - link404_invalid_doi = sorted(list(set(missing_articles_404_error).intersection(doi_doesnt_work))) - linkworks_valid_doi = sorted(list(set(missing_articles_link_works).intersection(doi_works))) - - if doi_has_space: - print('\033[1m' + 'PMC DOI fields with spaces in them:') - for doi in doi_has_space: - print('\033[0m' + '"' + doi + '" \n') - if linkworks_valid_doi: - print('\033[1m' + 'Working articles that need to be re-indexed on Solr:') - print('\033[0m' + '\n'.join(linkworks_valid_doi), '\n') - if link404_invalid_doi: - print('\033[1m' + 'Articles on PMC but not on solr or journals:') - print('\033[0m' + '\n'.join(missing_articles_404_error), '\n') - 
if doi_mismatch: - print('\033[1m' + 'Missing PLOS articles where DOI resolves to different DOI:') - for doi in doi_mismatch: - print('\033[0m', doi, 'resolves to:', check_if_doi_resolves(doi)) - - remainder = set(missing_plos_articles) - set(linkworks_valid_doi + missing_articles_404_error + - doi_mismatch + doi_has_space) - if remainder: - print('\n \033[1m' + "Other articles on PMC that aren't working correctly for PLOS:") - print('\033[0m' + '\n'.join(remainder), '\n') - return missing_plos_articles - - -def process_missing_pmc_articles(pmc_articles=None, plos_articles=None): - ''' - For sets of PLOS's corpus from PMC and PLOS, see which article are missing from PMC's version - of the Corpus by updating the PMCID:DOI mapping document, removing articles too recent to be indexed - (pubdate less than 3 weeks ago), and excluding uncorrected proofs. - :return: list of missing articles from PMC - ''' - if pmc_articles is None: - doi_to_pmc = get_articles_by_doi_field(check_new=False) - pmc_articles = list(doi_to_pmc.keys()) - - if plos_articles is None: - plos_articles = get_all_plos_dois() - missing_pmc_dois = list(set(plos_articles) - set(pmc_articles)) - - # Query for PMC updates & update DOI-to-PMCID dictionary - if missing_pmc_dois: - full_pmc_dict, dois_not_in_pmc = update_pmc_dict_by_doi(missing_pmc_dois) - - # Exclude PLOS Medicine quizzes - for doi in dois_not_in_pmc: - if "pmed" in doi: - article = doi_to_article(doi) - article_type = get_plos_article_type(article) - if article_type == 'Quiz': - dois_not_in_pmc.remove(doi) - - # Remove articles too recent to have been indexed on PMC - if dois_not_in_pmc: - missing_pmc_dois = exclude_recent_dois(dois_not_in_pmc) - - # Remove uncorrected proofs - if missing_pmc_dois: - for doi in missing_pmc_dois: - article_file = doi_to_file(doi) - if check_if_uncorrected_proof(article_file): - missing_pmc_dois.remove(doi) - - # Make sure that the DOI resolves - for doi in missing_pmc_dois: - resolves = 
check_if_doi_resolves(doi) - if resolves != "works": - print('DOI not working for this PLOS DOI:', doi, resolves) - missing_pmc_dois.remove(doi) - - if len(missing_pmc_dois) == 0: - print('No PMC articles missing.') - else: - for doi in missing_pmc_dois: - if ' ' in doi: - print('There is a space in this DOI: ' + '"' + doi + '"') - print('\033[1m' + 'Articles missing from PMC:') - print('\033[0m' + '\n'.join(sorted(missing_pmc_dois)), '\n') - - return missing_pmc_dois - - -def get_all_pmc_dois(retstart=0, retmax=80000, count=None): - """ - Query the entrez database to get a comprehensive list of all PMCIDs associated with all PLOS journals, - individually included in the search url. - See https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch for more info on search parameters - :return: the full list of PMCIDs in PMC for PLOS articles - """ - pmc_allplos_query_url = ('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pmc&term=' - '((((((((("PLoS+ONE"[Journal])+OR+"PLoS+Genetics"[Journal])+OR+"PLoS+Pathogens"[Journal])' - 'OR+"PLoS+Neglected+Tropical+Diseases"[Journal])+OR+"PLoS+Computational+Biology"[Journal])' - 'OR+"PLoS+Biology"[Journal])+OR+"PLoS+Medicine"[Journal])+OR+"plos+currents"[Journal])+OR+' - '"PLoS Clinical Trials"[Journal])' - '&retmode=json&tool=corpustest&email={0}'.format(USER_EMAIL)) - - pmcidlist = [] - r = requests.get(pmc_allplos_query_url).json() - if count is None: - count = int(r['esearchresult']['count']) - print(count, "articles found in PMC") - while retstart < count: - query = pmc_allplos_query_url + '&retstart={0}&retmax={1}'.format(retstart, retmax) - r = requests.get(query).json() - idlist = r['esearchresult']['idlist'] - for id in idlist: - pmcidlist.append('PMC' + id) - retstart += retmax - time.sleep(1) - pmcidlist = sorted(list(set(pmcidlist))) - if pmcidlist != count: - print("Error in number of IDs returned. Got {0} when expected {1}." 
- .format(len(pmcidlist), count)) - - return pmcidlist - - -def update_local_pmc_from_remote(): - ''' - Using the current set of articles indexed live on PMC, compare them to the locally maintained index. - If any of them are missing, download them to the local .csv dictionary. - :return: full dictionary of PMC IDs''' - remote_pmc_ids = get_all_pmc_dois() - local_pmc_dict = get_articles_by_doi_field() - local_pmc_ids = list(local_pmc_dict.values()) - missing_pmcids = list(set(remote_pmc_ids) - set(local_pmc_ids)) - if missing_pmcids: - full_pmc_dict, dois_not_in_pmc = update_pmc_dict_by_doi(missing_pmcids) - else: - full_pmc_dict = doi_to_pmc - weird_pmc_ids = list(set(local_pmc_ids) - set(remote_pmc_ids)) - if 0 < weird_pmc_ids < 10000: - print("Some articles on local not on remote:", print(weird_pmc_ids)) - return full_pmc_dict - - -def get_needed_pmc_articles(): - """ - Compare local to remote set of PLOS PMC IDs. - TO DO: Add check for latest update date - :return: tuple of doi dict, and list of DOIs that are on remote and not local, to be downloaded. 
- """ - doi_to_pmc = get_articles_by_doi_field(check_new=False) - remote_pmc_ids = list(doi_to_pmc.values()) - local_pmc_ids = extract_filenames(pmcdir, extension='.nxml') - missing_pmc_articles = list(set(remote_pmc_ids) - set(local_pmc_ids)) - return doi_to_pmc, missing_pmc_articles - - -def get_pmc_article_zip_links(): - """ - Creates a dictionary mapping every PMC ID to the partial PMC download URL - Based on txt file hosted by PMC - TO DO: see if there's a way to download monthly, weekly, etc from PMC - :return: dictionary mapping PMC IDs to partial download links - """ - - # write info file to disk if it doesn't exist already or is too old - try: - mod_date = datetime.datetime.fromtimestamp(os.path.getmtime(pmc_file_list)) - file_age = datetime.datetime.now() - mod_date - if file_age > datetime.timedelta(days=1): - os.remove(pmc_file_list) - except FileNotFoundError: - pass - if os.path.isfile(pmc_file_list) is False: - with open(pmc_file_list, 'w') as f: - f.write(requests.get('http://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_file_list.txt').text) - - # parse file by line - with open(pmc_file_list) as f: - pmc_lists = [x.strip().split('\t') for x in f] - - # turn into dictionary mapping of PMCID to partial PMC URL - pmc_urls = {d[2]: d[0] for d in pmc_lists[1:]} - - return pmc_urls - - -def download_pmc_article_xml(missing_pmc_articles=None, pmc_urls=None): - """ - Get missing PMC articles. Get dictionary mapping them to partial URLs. Download and unzip the tarballs. - Keep and rename the nxml files and delete the others. - NOTE: This hasn't worked very well. PMC connections are unreliable & there are a lot of timeouts. 
- :return: list of files downloaded from PMC - """ - new_pmc_articles = [] - if missing_pmc_articles is None: - doi_to_pmc, missing_pmc_articles = get_needed_pmc_articles() - print(len(missing_pmc_articles), "PMC articles to download.") - if missing_pmc_articles: - if pmc_urls is None: - pmc_urls = get_pmc_article_zip_links() - # download and unzip tarballs - for article in missing_pmc_articles: - dl_url = PMC_FTP_URL + pmc_urls[article] - filename = (pmc_urls[article]).split("/")[3] - local_file = os.path.join(newpmcarticledir, filename) - if os.path.isfile(local_file) is False: - try: - download(dl_url, local_file) - unzip_articles(directory=newpmcarticledir, filetype='tar', file=filename) - except RuntimeError: - print('Error downloading', article) - continue - - # get rid of non-.nxml files - allfiles = glob.glob('new_pmc_articles/*/*') - for file in allfiles: - if file.endswith('.nxml') is False: - os.remove(file) - - # move and process the nxml files - files = glob.glob('new_pmc_articles/*/*') - for old_file in files: - # make sure directory and linked doi line up - directory = (old_file).split('/')[1] - linked_doi = doi_to_pmc[get_article_doi(article_file=old_file)] - if linked_doi == directory: - # rename file from directory & move to higher level directory - new_file = '/'.join(((old_file).split('/'))[0:2]) + '.nxml' - shutil.move(old_file, new_file) - new_pmc_articles.append(new_file) - else: - print('error:', linked_doi, directory) - for directory in glob.glob('new_pmc_articles/*/'): - os.rmdir(directory) - - return new_pmc_articles - - -def move_pmc_articles(source, destination): - """ - Move PMC articles from one folder to another - :param source: Temporary directory of new article files - :param destination: Directory where files are copied to - """ - oldnum_destination = len(listdir_nohidden(destination, extension='.nxml')) - oldnum_source = len(listdir_nohidden(source, extension='.nxml')) - if oldnum_source > 0: - print("PMC Corpus started with", - 
oldnum_destination, - "articles.\nFile moving procedure initiated, please hold...") - copytree(source, destination, ignore=ignore_func) - newnum_destination = len(listdir_nohidden(destination)) - if newnum_destination - oldnum_destination > 0: - print(newnum_destination - oldnum_destination, - "files moved. PMC Corpus now has", - newnum_destination, "articles.") - logging.info("New article files moved successfully") - else: - print("No files found to move in source directory.") - logging.info("No article files moved") - # Delete temporary folder in most cases - if source == newarticledir: - shutil.rmtree(source) - - -if __name__ == '__main__': - pmcidlist = get_all_pmc_dois() diff --git a/allofplos/twoto3_nb.py b/allofplos/twoto3_nb.py deleted file mode 100755 index 32fa9160..00000000 --- a/allofplos/twoto3_nb.py +++ /dev/null @@ -1,80 +0,0 @@ -#!/usr/bin/env python3 -""" -To run: python3 nb2to3.py notebook-or-directory -""" -# Authors: Thomas Kluyver, Fernando Perez -# See: https://gist.github.com/takluyver/c8839593c615bb2f6e80 - -import argparse -import pathlib -from nbformat import read, write - -import lib2to3 -from lib2to3.refactor import RefactoringTool, get_fixers_from_package - - -def refactor_notebook_inplace(rt, path): - - def refactor_cell(src): - #print('\n***SRC***\n', src) - try: - tree = rt.refactor_string(src+'\n', str(path) + '/cell-%d' % i) - except (lib2to3.pgen2.parse.ParseError, - lib2to3.pgen2.tokenize.TokenError): - return src - else: - return str(tree)[:-1] - - - print("Refactoring:", path) - nb = read(str(path), as_version=4) - - # Run 2to3 on code - for i, cell in enumerate(nb.cells, start=1): - if cell.cell_type == 'code': - if cell.execution_count in (' ', '*'): - cell.execution_count = None - - if cell.source.startswith('%%'): - # For cell magics, try to refactor the body, in case it's - # valid python - head, source = cell.source.split('\n', 1) - cell.source = head + '\n' + refactor_cell(source) - else: - cell.source = 
refactor_cell(cell.source) - - - # Update notebook metadata - nb.metadata.kernelspec = { - 'display_name': 'Python 3', - 'name': 'python3', - 'language': 'python', - } - if 'language_info' in nb.metadata: - nb.metadata.language_info.codemirror_mode = { - 'name': 'ipython', - 'version': 3, - } - nb.metadata.language_info.pygments_lexer = 'ipython3' - nb.metadata.language_info.pop('version', None) - - write(nb, str(path)) - -def main(argv=None): - ap = argparse.ArgumentParser() - ap.add_argument('path', type=pathlib.Path, - help="Notebook or directory containing notebooks") - - options = ap.parse_args(argv) - - avail_fixes = set(get_fixers_from_package('lib2to3.fixes')) - rt = RefactoringTool(avail_fixes) - - if options.path.is_dir(): - for nb_path in options.path.rglob('*.ipynb'): - refactor_notebook_inplace(rt, nb_path) - else: - refactor_notebook_inplace(rt, options.path) - -if __name__ == '__main__': - main() From 6d7e7d62910920dada2b470da5f6e6e203e6cfde Mon Sep 17 00:00:00 2001 From: Elizabeth Seiver Date: Wed, 18 Oct 2017 15:31:25 -0700 Subject: [PATCH 12/24] rm old notebooks --- allofplos/jupyternb/Corpus_Analysis-old.ipynb | 792 ------------------ ...S article XML from journals.plos.org.ipynb | 236 ------ ...thly integrity check for PLOS corpus.ipynb | 148 ---- 3 files changed, 1176 deletions(-) delete mode 100644 allofplos/jupyternb/Corpus_Analysis-old.ipynb delete mode 100644 allofplos/jupyternb/Download PLOS article XML from journals.plos.org.ipynb delete mode 100644 allofplos/jupyternb/Monthly integrity check for PLOS corpus.ipynb diff --git a/allofplos/jupyternb/Corpus_Analysis-old.ipynb b/allofplos/jupyternb/Corpus_Analysis-old.ipynb deleted file mode 100644 index 4c590f64..00000000 --- a/allofplos/jupyternb/Corpus_Analysis-old.ipynb +++ /dev/null @@ -1,792 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Required functions" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - 
"code_folding": [], - "collapsed": true - }, - "outputs": [], - "source": [ - "from Samples.corpus_analysis import *" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "heading_collapsed": true - }, - "source": [ - "# PLOS article types" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "hidden": true - }, - "source": [ - "## JATS-standard NLM article types" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "editable": false, - "hidden": true, - "run_control": { - "frozen": true - }, - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "15 types of articles found.\n", - "[('research-article', 204109), ('correction', 9113), ('article-commentary', 1284), ('discussion', 1087), ('review-article', 612), ('other', 584), ('editorial', 340), ('letter', 300), ('retraction', 79), ('book-review', 77), ('meeting-report', 38), ('case-report', 23), ('expression-of-concern', 13), ('obituary', 10), ('brief-report', 1)]\n" - ] - } - ], - "source": [ - "jats_article_type_list = get_jats_article_type_list()\n", - "print(jats_article_type_list)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "hidden": true - }, - "source": [ - "## PLOS article types" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "editable": false, - "hidden": true, - "run_control": { - "frozen": true - }, - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "60 types of articles found.\n", - "[('Research Article', 202673), ('Correction', 9124), ('Synopsis', 1093), ('Perspective', 801), ('Review', 555), ('Editorial', 486), ('Pearls', 438), ('Essay', 379), ('Policy Forum', 309), ('Correspondence', 287), ('Primer', 237), ('Viewpoints', 209), ('Community Page', 139), ('Opinion', 136), ('Health in Action', 118), ('Education', 103), ('Retraction', 79), ('Book Review/Science in the Media', 76), ('Message from ISCB', 70), ('Symposium', 70), 
('Policy Platform', 54), ('Feature', 53), ('Formal Comment', 52), ('Research in Translation', 51), ('Guidelines and Guidance', 51), ('Collection Review', 50), ('Research Matters', 44), ('Interview', 44), ('The PLoS Medicine Debate', 38), ('Historical Profiles and Perspectives', 38), ('Unsolved Mystery', 34), ('Overview', 34), ('Neglected Diseases', 29), ('Expert Commentary', 29), ('Learning Forum', 27), ('From Innovation to Application', 24), ('Obituary', 22), ('Quiz', 21), ('Correspondence and Other Communications', 13), ('Expression of Concern', 13), ('Journal Club', 12), ('Meta-Research Article', 12), ('Student Forum', 12), ('Open Highlights', 11), ('Topic Page', 11), ('Case Report', 10), ('Photo Quiz', 10), ('Best Practice', 5), ('Deep Reads', 4), ('Historical and Philosophical Perspectives', 3), ('Special Report', 3), ('Book Review', 2), ('Message from the Founders', 1), ('Message from PLoS', 1), ('Short Reports', 1), ('Methods and Resources', 1), ('Technical Report', 1), ('Message from the PLoS Founders', 1), ('Collection Review ', 1), ('Debate', 1)]\n" - ] - } - ], - "source": [ - "PLOS_article_type_list = get_plos_article_type_list()\n", - "print(PLOS_article_type_list)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "heading_collapsed": true, - "hidden": true - }, - "source": [ - "## PLOS/NLM article type mapping" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "hidden": true, - "scrolled": true - }, - "outputs": [], - "source": [ - "article_types_map = get_article_types_map()\n", - "PLOS_article_types_structured = counter(article_types_map).most_common()\n", - "print(PLOS_article_types_structured)" - ] - }, - { - "cell_type": "code", - "execution_count": 85, - "metadata": { - "collapsed": true, - "hidden": true - }, - "outputs": [], - "source": [ - "# create .csv file mapping JATS to PLOS article types\n", - "article_types_map_to_csv(article_types_map)" - ] - }, - { - "cell_type": 
"markdown", - "metadata": { - "heading_collapsed": true - }, - "source": [ - "# Taking random samples of DOIs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "hidden": true - }, - "outputs": [], - "source": [ - "random_sample_of_dois = get_random_list_of_DOIs() # returns 100 DOIs by default" - ] - }, - { - "cell_type": "code", - "execution_count": 203, - "metadata": { - "hidden": true, - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['journal.pone.0074820', 'journal.pone.0063497', 'journal.pone.0126357', 'journal.pntd.0004807', 'journal.pone.0031896', 'journal.pone.0045503', 'journal.pone.0138217', 'journal.pbio.0050002', 'journal.pone.0122848', 'journal.pone.0099248']\n" - ] - } - ], - "source": [ - "random_sample_of_articles = [doi_to_article(doi) for doi in random_sample_of_dois]\n", - "print(random_sample_of_articles[0:10])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "heading_collapsed": true - }, - "source": [ - "# Retracted and corrected articles" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "hidden": true - }, - "source": [ - "## Get list of retracted articles" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "editable": false, - "hidden": true, - "run_control": { - "frozen": true - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "79 retracted articles found.\n", - "['journal.pbio.0030123', 'journal.pbio.0050005', 'journal.pbio.0050146', 'journal.pbio.1001212', 'journal.pcbi.1002308', 'journal.pgen.1003361', 'journal.pgen.1003791', 'journal.pgen.1005586', 'journal.pgen.1000424', 'journal.pmed.1001214', 'journal.pone.0072333', 'journal.pone.0084127', 'journal.pone.0027571', 'journal.pone.0046410', 'journal.pone.0080145', 'journal.pone.0019652', 'journal.pone.0075928', 'journal.pone.0075046', 'journal.pone.0062178', 'journal.pone.0051549', 
'journal.pone.0093095', 'journal.pone.0069669', 'journal.pone.0133525', 'journal.pone.0115980', 'journal.pone.0115741', 'journal.pone.0139044', 'journal.pone.0146193', 'journal.pone.0045667', 'journal.pone.0040789', 'journal.pone.0094830', 'journal.pone.0031943', 'journal.pone.0097700', 'journal.pone.0047218', 'journal.pone.0090951', 'journal.pone.0014232', 'journal.pone.0090318', 'journal.pone.0072895', 'journal.pone.0065651', 'journal.pone.0059556', 'journal.pone.0076809', 'journal.pone.0099630', 'journal.pone.0121549', 'journal.pone.0048402', 'journal.pone.0062170', 'journal.pone.0020152', 'journal.pone.0164571', 'journal.pone.0164378', 'journal.pone.0116682', 'journal.pone.0125542', 'journal.pone.0047110', 'journal.pone.0026503', 'journal.pone.0037102', 'journal.pone.0014163', 'journal.pone.0043204', 'journal.pone.0001276', 'journal.pone.0035142', 'journal.pone.0011299', 'journal.pone.0005373', 'journal.pone.0030980', 'journal.pone.0000306', 'journal.pone.0064576', 'journal.pone.0016011', 'journal.pone.0001444', 'journal.pone.0043406', 'journal.pone.0029192', 'journal.pone.0001908', 'journal.pone.0016256', 'journal.pone.0013512', 'journal.pone.0045965', 'journal.pone.0022730', 'journal.pone.0006333', 'journal.pone.0004168', 'journal.pone.0035453', 'journal.pone.0032853', 'journal.ppat.1003435', 'journal.ppat.1002062', 'journal.ppat.1000915', 'journal.ppat.1000210', 'journal.ppat.0020025']\n" - ] - } - ], - "source": [ - "retractions_article_list, retracted_article_list = get_retracted_article_list()\n", - "print(retracted_article_list)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "hidden": true - }, - "source": [ - "## Get list of corrected articles" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": { - "editable": false, - "hidden": true, - "run_control": { - "frozen": true - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "journal.pcbi.1003582.xml has incorrect linked DOI: 
journal.10.1371/journal.pcbi.1003490\n", - "journal.pcbi.1003732.xml has incorrect linked DOI: journal.10.1371/journal.pcbi.1003159\n", - "journal.pone.0101541.xml has incorrect linked DOI: journal.PONE-D-13-26510\n", - "journal.pone.0104353.xml has incorrect linked DOI: journal.\n", - "journal.pone.0104472.xml has incorrect linked DOI: journal.\n", - "journal.pone.0104581.xml has incorrect linked DOI: journal.\n", - "journal.pone.0104601.xml has incorrect linked DOI: journal.\n", - "journal.pone.0105485.xml has incorrect linked DOI: journal.\n", - "journal.pone.0105486.xml has incorrect linked DOI: journal.\n", - "journal.pone.0105490.xml has incorrect linked DOI: journal.\n", - "journal.pone.0105658.xml has incorrect linked DOI: journal.\n", - "journal.pone.0105668.xml has incorrect linked DOI: journal.\n", - "journal.pone.0105669.xml has incorrect linked DOI: journal.\n", - "9127 corrected articles found.\n" - ] - } - ], - "source": [ - "corrections_article_list, corrected_article_list = get_corrected_article_list()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": true - }, - "source": [ - "# What's going on with revision_dates & article updates?" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Keep an eye on this URL for any changes. On PMC, was updated in the last few months, but that might not have has time to propagate. 
https://www.ncbi.nlm.nih.gov/pmc/oai/oai.cgi?verb=GetRecord&identifier=oai:pubmedcentral.nih.gov:3913708&metadataPrefix=pmc" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "heading_collapsed": true - }, - "source": [ - "## Step 1: Query solr for revision_date field" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": { - "hidden": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "URL for solr query: http://api.plos.org/search?q=*:*&fq=doc_type:full+-doi:image&fl=id,publication_date&wt=json&indent=true&sort=%20id%20asc&fq=publication_date:[2017-08-17T00:00:00Z+TO+2017-08-25T23:59:59Z]&rows=1000\n", - "613 results returned from this search.\n", - "['2017-08-21T00:00:00Z', '2017-08-18T00:00:00Z', '2017-08-22T00:00:00Z', '2017-08-18T00:00:00Z', '2017-08-24T00:00:00Z', '2017-08-24T00:00:00Z', '2017-08-18T00:00:00Z', '2017-08-23T00:00:00Z', '2017-08-24T00:00:00Z', '2017-08-17T00:00:00Z']\n" - ] - } - ], - "source": [ - "# This should print 10 date strings \n", - "publication_dates_list = get_solr_records(days_ago=8, item='publication_date')\n", - "print(publication_dates_list[0:10])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "hidden": true - }, - "outputs": [], - "source": [ - "# This should return an error\n", - "revision_dates_list = get_solr_records(days_ago=8, item='revision_date')\n", - "print(revision_dates_list[0:10])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Step 2: Peek inside raw XML for any changes" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "downloaded new version of journal.pone.0182022.xml\n", - "downloaded new version of journal.pone.0175323.xml\n", - "downloaded new version of journal.pone.0171255.xml\n", - "downloaded new version of journal.pone.0158499.xml\n", - "30000 
article checked for updates.\n", - "4 articles have updates.\n", - "['journal.pone.0182022.xml', 'journal.pone.0175323.xml', 'journal.pone.0171255.xml', 'journal.pone.0158499.xml']\n" - ] - } - ], - "source": [ - "articles_different_list = revisiondate_sanity_check()\n", - "print(articles_different_list)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# DOI and filename sanity check" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Check if article filenames match their full DOIs & that DOI fields are correct\n", - "# NOT WORKING AND MUST BE FIXED!\n", - "messed_up_plos_list = article_doi_sanity_check()\n", - "messed_up_pmc_list = article_doi_sanity_check(directory=pmcdir, article_list=None, source='PMC')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# PubMed Corpus" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Get all local, solr, and PMC DOIs" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[1mArticles that needs to be re-indexed on Solr:\n", - "\u001b[0m10.1371/journal.pone.0076809\n" - ] - } - ], - "source": [ - "plos_articles = compare_local_and_solr()\n", - "doi_to_pmc = get_articles_by_doi_field(check_new=False)\n", - "pmc_articles = list(doi_to_pmc.keys())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Compare PLOS's copy to PMC" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For sets of PLOS's corpus from PMC and PLOS, see which article are missing from PLOS's version of the corpus by:\n", - "* removing Currents articles\n", - "* checking if articles are live on journals.plos.org\n", - "* checking that the DOIs resolve" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - 
"outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[1mPMC DOI fields with spaces in them:\n", - "\u001b[0m\"10.1371/annotation/1cdc7975-50d7-40a5-99ca-83580df2982f \" \n", - "\n", - "\u001b[1mWorking articles that need to be re-indexed on Solr:\n", - "\u001b[0m10.1371/annotation/1391941e-93d3-48d3-8c9a-b7c6d98f9527\n", - "10.1371/annotation/a81b1fab-890c-447b-a308-5bc8ca3eb21d\n", - "10.1371/annotation/df340d50-1f94-4d8b-a252-1a82a7fa5cc7 \n", - "\n", - "\u001b[1mArticles on PMC but not on solr or journals:\n", - "\u001b[0m10.1371/journal.pone.0002957\n", - "10.1371/annotation/b83e925b-2f2a-47b9-b939-0a1eeab18324\n", - "10.1371/journal.pbio.0020201\n", - "10.1371/annotation/011969ee-3f4b-4260-8d95-1b9a4ca39008\n", - "10.1371/annotation/8f2ddf91-3499-4627-9a91-449b78465f9d\n", - "10.1371/annotation/33d82b59-59a3-4412-9853-e78e49af76b9 \n", - "\n", - "\u001b[1mMissing PLOS articles where DOI resolves to different DOI:\n", - "\u001b[0m 10.1371/annotation/5e4082fd-6d86-441f-b946-a6e87a22ea57 resolves to: 10.1371/annotation/d9496d01-8c5d-4d24-8287-94449ada5064\n", - "\u001b[0m 10.1371/annotation/b8b66a84-4919-4a3e-ba3e-bb11f3853755 resolves to: 10.1371/annotation/5fbbf39a-fb47-4ce1-8069-acd830b3d41f\n", - "\n", - " \u001b[1mOther articles on PMC that aren't working correctly for PLOS:\n", - "\u001b[0m10.1371/annotation/363b6074-caec-4238-b88f-acbf45de498f\n", - "10.1371/annotation/2259f958-a68e-4e57-92b5-2ef003070cf1 \n", - "\n" - ] - } - ], - "source": [ - "missing_plos_articles = process_missing_plos_articles(pmc_articles=pmc_articles, plos_articles=plos_articles)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Compare PMC's copy to PLOS" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For sets of PLOS's corpus from PMC and PLOS, see which article are missing from PMC's version of the Corpus by:\n", - "* updating the PMCID:DOI mapping document\n", - "* removing articles too 
recent to be indexed (pubdate less than 3 weeks ago)\n", - "* excluding uncorrected proofs\n", - "* excluding PLOS Medicine quizzes" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[1mArticles missing from PMC:\n", - "\u001b[0m10.1371/annotation/08286cd8-527f-4f14-856f-57267107efa8\n", - "10.1371/annotation/0bbea8d3-1f94-48af-915c-aec02da2f5c3\n", - "10.1371/annotation/0c5390b8-72b0-4b7e-85a3-b8c0fd9f62bf\n", - "10.1371/annotation/0ccac188-950f-4908-b232-35fb44ba7847\n", - "10.1371/annotation/0cfd3d5f-c1d0-48f8-ad69-34a95e31a8d2\n", - "10.1371/annotation/0e045706-ea24-41db-be90-27d1cbcd35b1\n", - "10.1371/annotation/17310bbb-e5bf-4901-8b6e-529577a280db\n", - "10.1371/annotation/1c419628-f1b5-45de-9f8a-43f834309ebb\n", - "10.1371/annotation/1dc00176-e096-4621-9494-2d848dac8262\n", - "10.1371/annotation/1e464689-3c86-4399-b229-1e00d65593a5\n", - "10.1371/annotation/1f110857-27d7-4e83-9eb3-4e5f51950a26\n", - "10.1371/annotation/21379809-1376-4250-b4c2-bf51eac58a98\n", - "10.1371/annotation/221e5f19-370e-4a52-add8-f882437bc85d\n", - "10.1371/annotation/230cca90-58e9-4aa1-b6b2-a1d744524fbd\n", - "10.1371/annotation/23bca9d0-f934-400e-8bb9-f5ff07f9e625\n", - "10.1371/annotation/270b432d-50ec-41f1-ad4d-ddd9f51f62a5\n", - "10.1371/annotation/2b218d50-a9d5-45b2-80d0-0e806e530749\n", - "10.1371/annotation/2c275a1b-2d36-4492-b36a-192bddf14f78\n", - "10.1371/annotation/2ca25d9c-7347-4b09-bd7a-09d6d37ff322\n", - "10.1371/annotation/2f278ed8-d5e7-440a-9e49-c8d1df20d1f1\n", - "10.1371/annotation/31412345-fc86-4d67-b37c-93d42f5f0a59\n", - "10.1371/annotation/3265139d-64c7-4c4c-83d3-1e139031e7df\n", - "10.1371/annotation/34304231-e54b-4080-af70-6f957f32d552\n", - "10.1371/annotation/39b41d98-b117-41cf-b5de-b8486a67b1cd\n", - "10.1371/annotation/4290dfee-64fd-4157-89e3-8edbba912420\n", - 
"10.1371/annotation/44f67041-2f8e-42df-826a-82172ae05a22\n", - "10.1371/annotation/49257f53-8cb1-431b-be64-7b410598b845\n", - "10.1371/annotation/4993e0e2-c580-4547-90d8-3227b87e6ae9\n", - "10.1371/annotation/4a8d9f38-1d0d-4389-a284-9f2564e1ac0b\n", - "10.1371/annotation/4b9340db-455b-4e0d-86e5-b6783747111f\n", - "10.1371/annotation/4bb6b73b-b5bb-4143-9ec3-99c90b93f3ad\n", - "10.1371/annotation/4d6c4127-82e4-408d-af89-5f2e207d523b\n", - "10.1371/annotation/4f08219c-2d7b-4309-8351-d3fe2378993f\n", - "10.1371/annotation/5487e265-8175-47cb-b9a4-d85862a4a96f\n", - "10.1371/annotation/59bcbe81-eddd-46a4-90dc-88c1ea70df72\n", - "10.1371/annotation/5e0195b6-60b9-4c03-84ae-c6c31e625be1\n", - "10.1371/annotation/6130c605-086b-46af-8f6f-6c76b8eb9c84\n", - "10.1371/annotation/638b42e3-a351-4827-a612-17fe29b48e28\n", - "10.1371/annotation/677fdf34-651e-4dc8-a0be-d0d633237a85\n", - "10.1371/annotation/712bb339-6073-4e62-9f68-b285caedd913\n", - "10.1371/annotation/730cdfd0-78c5-48fc-a095-f633905ff2f0\n", - "10.1371/annotation/7645d066-aa98-45d6-8c3e-3a30d9e03e4d\n", - "10.1371/annotation/7e304601-fc5c-40fe-857c-d6ea894d1647\n", - "10.1371/annotation/7f73ed17-709e-4d7f-9aae-aab1f4a34985\n", - "10.1371/annotation/865eaad7-8547-49ac-a42d-47e9d0755bb3\n", - "10.1371/annotation/87e2a80b-3ed7-4ef9-96cb-1268d91b6366\n", - "10.1371/annotation/8941aee3-4bb8-42a0-b09a-e7c416beeef7\n", - "10.1371/annotation/8c6eaae4-72a7-460a-8b1a-f855731f3706\n", - "10.1371/annotation/8fa70b21-32e7-4ed3-b397-ab776b5bbf30\n", - "10.1371/annotation/9239a129-5677-43b0-8fe1-0c1e75e988df\n", - "10.1371/annotation/93141e7a-61f3-48bd-87bd-216b030d773d\n", - "10.1371/annotation/936a4359-1bf5-4c33-be7d-1468e75eaa8b\n", - "10.1371/annotation/93d63399-0e71-4a25-a45c-311910ee6da5\n", - "10.1371/annotation/9630862b-4676-4b82-9869-8d8fbb2a2e65\n", - "10.1371/annotation/974531b0-9da4-4575-b3d1-955b0163fde0\n", - "10.1371/annotation/98908e14-e9fd-458f-9cea-ba4bec139f20\n", - 
"10.1371/annotation/b03fbc42-8f70-4873-9cce-854e48249a13\n", - "10.1371/annotation/b0e62f4f-812f-40b1-aef8-365b229eb2cf\n", - "10.1371/annotation/b4e623eb-4950-48d9-8d85-8d70426d95a3\n", - "10.1371/annotation/b60d4ec5-4c6f-43ab-9f63-322e3cd59636\n", - "10.1371/annotation/bae9fc08-fbfa-45b5-9d1d-0b8254d6efd5\n", - "10.1371/annotation/bc97a85c-1ecd-4cd8-ab61-0aef01f949a1\n", - "10.1371/annotation/c066bb84-13ea-4b36-a481-f149df8ce929\n", - "10.1371/annotation/c313df3a-52bd-4cbe-af14-6676480d1a43\n", - "10.1371/annotation/c81daa7c-5375-4349-970b-c63d288947eb\n", - "10.1371/annotation/caf130c3-5026-41cd-9dda-5eac7c0f016f\n", - "10.1371/annotation/d271d9c1-5588-4b43-85c3-d3de58ab61a4\n", - "10.1371/annotation/dfa05103-fc65-4f07-b30f-72a6e91613ff\n", - "10.1371/annotation/ea14adcb-033d-492d-8f8b-e047aa080cd4\n", - "10.1371/annotation/ebea4bd5-2b96-4842-b110-2f7c156e5060\n", - "10.1371/annotation/eff6e471-306a-41bd-88e3-13857af094af\n", - "10.1371/annotation/f016476b-5b84-4c9a-899f-fe8b8bc927b5\n", - "10.1371/annotation/f216b2b0-ab6b-45d8-b6ba-134a477b79b7\n", - "10.1371/annotation/f32bc670-c9cf-4bb0-9376-cd8cfd1053c1\n", - "10.1371/annotation/f8605b0a-d01c-41aa-ac9b-b605d7903a28\n", - "10.1371/annotation/f9660803-198b-4d0d-8200-719a2eb2a443\n", - "10.1371/annotation/fcca88ac-d684-46e0-a483-62af67e777bd\n", - "10.1371/annotation/fd9f9796-b42d-480d-b9f4-0adfbb919148\n", - "10.1371/annotation/fddd2ff3-c991-4c2f-8b84-a27eb20fba91\n", - "10.1371/annotation/ff089043-990a-48c2-a90f-15606c11cc98\n", - "10.1371/journal.pcbi.1005632\n", - "10.1371/journal.pcbi.1005676\n", - "10.1371/journal.pcbi.1005677\n", - "10.1371/journal.pcbi.1005692\n", - "10.1371/journal.pgen.1006910\n", - "10.1371/journal.pone.0181246\n", - "10.1371/journal.pone.0182517\n", - "10.1371/journal.ppat.1006535\n", - "10.1371/journal.ppat.1006543 \n", - "\n" - ] - } - ], - "source": [ - "missing_pmc_articles = process_missing_pmc_articles(pmc_articles=pmc_articles, plos_articles=plos_articles)" - ] - }, - { - 
"cell_type": "markdown", - "metadata": {}, - "source": [ - "## Save lists of missing articles to text files if needed" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "with open('missing_plos_articles.txt', 'w') as file:\n", - " for item in sorted(set(missing_plos_articles)):\n", - " file.write(\"%s\\n\" % item)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "with open('missing_pmc_articles.txt', 'w') as file:\n", - " for item in sorted(set(missing_pmc_articles)):\n", - " file.write(\"%s\\n\" % item)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Count of articles by pubdate" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## How many articles published each day? month? year? For a period of time?" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "heading_collapsed": true - }, - "source": [ - "### Could consider making graphs of this..." 
- ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": { - "hidden": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[('Aug 2013', 2629), ('Dec 2013', 8), ('Jan 2014', 5), ('Jul 2013', 2627), ('Jun 2013', 2542), ('Jun 2014', 1), ('Mar 2014', 3), ('Mar 2015', 2), ('May 2013', 932), ('May 2014', 1), ('Nov 2013', 20), ('Oct 2013', 47), ('Sep 2013', 1183)]\n" - ] - } - ], - "source": [ - "import collections\n", - "counter = collections.Counter\n", - "\n", - "example_article = 'journal.pone.0012380.xml'\n", - "pubdate_list = []\n", - "article_files = listdir_nohidden(corpusdir)\n", - "pubdate_list = [get_article_pubdate(article_file) for article_file in listdir_nohidden(corpusdir)[90000:100000]]\n", - "# monthly_pubdate_list = [date.replace(day=1,hour=0,minute=0,second=0,microsecond=0) for date in pubdate_list]\n", - "monthly_pubdate_list = [date.strftime('%b %Y') for date in pubdate_list]\n", - "monthly_pubdate_list = sorted(monthly_pubdate_list)\n", - "pubdate_count = sorted(counter(monthly_pubdate_list).most_common())\n", - "print(pubdate_count)\n", - "# month_list = [x.strftime('%b %Y') for x[0] in pubdate_count]\n", - "# month_list = [x[0].strftime('%b %Y') for x in pubdate_count]" - ] - }, - { - "cell_type": "code", - "execution_count": 60, - "metadata": { - "hidden": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['May 2013', 'Jun 2013', 'Jul 2013', 'Aug 2013', 'Sep 2013', 'Oct 2013', 'Dec 2013']\n" - ] - } - ], - "source": [ - "month_list = [x[0].strftime('%b %Y') for x in pubdate_count]\n", - "print(month_list)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Count of articles published in each journal" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "import collections\n", - "counter = collections.Counter\n", - "\n", - "journal_list = 
[]\n", - "for article_file in listdir_nohidden(corpusdir):\n", - " r = get_articleXML_content(corpusdir,\n", - " article_file,\n", - " tag_path_elements=[\"/\",\n", - " \"article\",\n", - " \"front\",\n", - " \"journal-meta\",\n", - " \"journal-title-group\",\n", - " \"journal-title\"])\n", - "\n", - " journal = r[0].text\n", - " journal_list.append(journal)\n", - "\n", - "print(len(set(journal_list)), 'PLOS journals found.')\n", - "journals_structured = counter(journal_list).most_common()\n", - "print(journals_structured)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "py3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.2" - }, - "toc": { - "colors": { - "hover_highlight": "#DAA520", - "navigate_num": "#000000", - "navigate_text": "#333333", - "running_highlight": "#FF0000", - "selected_highlight": "#FFD700", - "sidebar_border": "#EEEEEE", - "wrapper_background": "#FFFFFF" - }, - "moveMenuLeft": true, - "nav_menu": { - "height": "174px", - "width": "252px" - }, - "navigate_menu": true, - "number_sections": true, - "sideBar": true, - "threshold": 4, - "toc_cell": false, - "toc_section_display": "block", - "toc_window_display": false, - "widenNotebook": false - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/allofplos/jupyternb/Download PLOS article XML from journals.plos.org.ipynb b/allofplos/jupyternb/Download PLOS article XML from journals.plos.org.ipynb deleted file mode 100644 index 7788ea5e..00000000 --- a/allofplos/jupyternb/Download PLOS article XML from journals.plos.org.ipynb +++ /dev/null @@ -1,236 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For each article in articleerrors.txt, \n", - "* go to journals.plos.org[article] URL to 
grab the raw XML \n", - "* download the xml from that webpage \n", - "* write file name based on name of article \n", - "* save xml to file \n", - "* add time delay " - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "import lxml.etree as et\n", - "import os\n", - "import time" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "First go through text list of XML files in pre-defined list articleerrors.txt, convert to Python list, and truncate characters so it fits the PLOS URL scheme. NOTE: journal name in prefix does not matter." - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "with open(\"articleerrors.txt\",\"r\") as f:\n", - " article_list = [x[:-5] for x in f.readlines()]\n", - " article_list.pop(0)" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "sample_article_list = article_list[350:360]" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "prefix = 'http://journals.plos.org/plosone/article/file?id=10.1371/'\n", - "suffix = '&type=manuscript'" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For each article in the article list, grab the XML from the constructed URL, parse with etree, and save to new XML file. Counter for every 50 articles. 
Time delay added so as not to overwhelm server" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0.1%\n", - "6.0%\n", - "11.9%\n", - "17.8%\n", - "23.7%\n", - "29.6%\n", - "35.5%\n", - "41.4%\n", - "47.4%\n", - "53.3%\n", - "59.2%\n", - "65.1%\n", - "71.0%\n", - "76.9%\n", - "82.8%\n", - "88.7%\n", - "94.6%\n" - ] - } - ], - "source": [ - "for i, article in enumerate(article_list):\n", - " url = prefix + article + suffix\n", - " articleXML = et.parse(url)\n", - " article_path = os.path.join(\"fixed_XML_articles\", article + \".xml\")\n", - " with open(article_path, 'w') as f:\n", - " f.write(et.tostring(articleXML, method = 'xml', encoding = 'unicode'))\n", - " if i%75 ==0:\n", - " print(\"{:.1%}\".format((i+1)/len(article_list)))\n", - " time.sleep(5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# if __name__ == __main__:\n", - " # main()\n", - " # this allows you to use python your_file.py " - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "1269" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "val = !ls fixed_XML_articles/\n", - "len(val)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "ename": "AttributeError", - "evalue": "'builtin_function_or_method' object has no attribute 'lower'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mstupidlist\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mget_ipython\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgetoutput\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'ls AllofPLOS_article_XML/'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mstupidlist\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrename\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlower\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;31mAttributeError\u001b[0m: 'builtin_function_or_method' object has no attribute 'lower'" - ] - } - ], - "source": [ - "import os\n", - "stupidlist = !ls AllofPLOS_article_XML/\n", - "for x in stupidlist:\n", - " os.rename.lower()" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [], - "source": [ - "a = \"hi\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "anaconda-cloud": {}, - "kernelspec": { - "display_name": "py3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.1" - }, - "toc": { - "colors": { - "hover_highlight": "#DAA520", - "navigate_num": "#000000", - "navigate_text": "#333333", - "running_highlight": "#FF0000", - "selected_highlight": "#FFD700", - "sidebar_border": "#EEEEEE", - "wrapper_background": "#FFFFFF" - }, - "moveMenuLeft": true, - "nav_menu": { - "height": "12px", - "width": "252px" - }, - "navigate_menu": true, - "number_sections": true, - "sideBar": true, - "threshold": 4, - "toc_cell": 
false, - "toc_section_display": "block", - "toc_window_display": false, - "widenNotebook": false - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/allofplos/jupyternb/Monthly integrity check for PLOS corpus.ipynb b/allofplos/jupyternb/Monthly integrity check for PLOS corpus.ipynb deleted file mode 100644 index c5a0b1d0..00000000 --- a/allofplos/jupyternb/Monthly integrity check for PLOS corpus.ipynb +++ /dev/null @@ -1,148 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# What to check:\n", - "* Maybe as part of existing monthly maintenance?\n", - "* First test this on a small subset of articles\n", - "For each file in folder, make sure filename == relevant DOI field in XML\n", - " If so, pass_file_name_test is True\n", - " else pass_file_name_test is False\n", - "List of solr query DOIs == list of DOIs in article XML folder == list of DOIs in zip file\n", - " if DOIs in solr and not folder, download those from solr & add to folder & zip\n", - " and if it's that one messed-up article, only if it's been fixed\n", - " if it's been fixed, print note to remove this logic from the code\n", - " if DOIs in folder in solr, write those DOIs to error-list & txt file & email with warning\n", - " if no error proceed to XML content testing\n", - " if error print that content still needs to be checked\n", - " \n", - "Content of content-repo XML == Content of article folder XML == Content of zip file XML\n", - " if content in repo doesn't match article folder via https://bitbucket.org/ianb/formencode/src/tip/formencode/doctest_xml_compare.py?fileviewer=file-view-default#cl-70\n", - " if uncorrected proof vs vor_update, download vor_update\n", - " otherwise save diff and return error (or: preserve old version and make content-repo default and take diff via https://www.logilab.org/859 )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - 
"file_list = drive.ListFile({'q': \"'root' in parents and trashed=false\"}).GetList()\n", - "gdrive_zip_file = [item for item in file_list if item[\"id\"] == gd_id]\n", - "gdrive_zip_filename = (item for item in gdrive_zip_file['originalFilename'])\n", - "current_zipname = str(glob(prefix_zip_name+\"*.zip\")[0])\n", - "if gdrive_filename == current_zipname: \n", - " print(\"Zip file up-to-date on Google drive. No changes made.\")" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "import filecmp\n", - "import os" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "filecmp.cmp('test_file.txt', 'accman_to_check_list.txt', shallow=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "os.stat_result(st_mode=33188, st_ino=10556917, st_dev=16777220, st_nlink=1, st_uid=738185890, st_gid=984564325, st_size=903, st_atime=1490388647, st_mtime=1490388644, st_ctime=1490388644)" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "py3", - "language": "python", - "name": "py3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.1" - }, - "toc": { - "colors": { - "hover_highlight": "#DAA520", - "navigate_num": "#000000", - "navigate_text": "#333333", - "running_highlight": 
"#FF0000", - "selected_highlight": "#FFD700", - "sidebar_border": "#EEEEEE", - "wrapper_background": "#FFFFFF" - }, - "moveMenuLeft": true, - "nav_menu": { - "height": "30px", - "width": "252px" - }, - "navigate_menu": true, - "number_sections": true, - "sideBar": true, - "threshold": 4, - "toc_cell": false, - "toc_section_display": "block", - "toc_window_display": false, - "widenNotebook": false - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} From 5e096a196c99cfef0ce625322e4b4ea84a600398 Mon Sep 17 00:00:00 2001 From: Elizabeth Seiver Date: Wed, 18 Oct 2017 15:55:00 -0700 Subject: [PATCH 13/24] redo url_to_path --- allofplos/plos_corpus.py | 47 ++++++++++------------------------------ 1 file changed, 11 insertions(+), 36 deletions(-) diff --git a/allofplos/plos_corpus.py b/allofplos/plos_corpus.py index f89bef51..b0975e66 100644 --- a/allofplos/plos_corpus.py +++ b/allofplos/plos_corpus.py @@ -140,44 +140,19 @@ def url_to_path(url, directory=corpusdir, plos_network=False): :param directory: defaults to corpusdir, containing article files :return: relative path to local XML file in the corpusdir directory """ - doi_prefix = '' + annot_prefix = 'plos.correction.' 
if url.startswith(annotation_url) or url.startswith(annotation_url_int): - # go into online XML and grab the journal name - try: - articleXML = et.parse(url) - # if offline, try finding a local copy of the article but still check linked DOI - except (OSError, et.XMLSyntaxError): - if not plos_network: - article = url[len(annotation_url):url.index(url_suffix)] - else: - article = url[len(annotation_url_int):url.index(INT_URL_SUFFIX)] - for article_file in listdir_nohidden(directory): - if article in article_file: - break - articleXML = et.parse(article_file) - path_parts = ["/", "article", "front", "article-meta", "related-article"] - r = articleXML.xpath("/".join(path_parts)) - r = r[0].attrib - try: - linked_doi = r['{http://www.w3.org/1999/xlink}href'] - doi = linked_doi.lstrip('info:doi/10.1371/') - doi_prefix = ".".join(doi.split('.')[:2]) + ".correction." - except KeyError: - print('DOI error in {0}'.format(url)) - if 'annotation' in url: - if not plos_network: - annotation_code = url[url.index('annotation')+len('annotation')+1: - url.index('&type=manuscript')] - else: - annotation_code = url[url.index('annotation')+len('annotation')+1: - url.index('.XML')] - file = os.path.join(directory, doi_prefix + annotation_code + '.xml') + file_ = os.path.join(directory, + annot_prefix + + url[url.index(annotation_doi + '/')+len(annotation_doi + '/'):]. + replace(url_suffix, ''). + replace(INT_URL_SUFFIX, '') + '.xml') else: - file = os.path.join(directory, - url[url.index(prefix)+len(prefix):]. - replace(url_suffix, ''). - replace(INT_URL_SUFFIX, '') + '.xml') - return file + file_ = os.path.join(directory, + url[url.index(prefix)+len(prefix):]. + replace(url_suffix, ''). 
+ replace(INT_URL_SUFFIX, '') + '.xml') + return file_ def url_to_doi(url): From 7dd63c33e3f7d5c20dedf12d320de895fc042623 Mon Sep 17 00:00:00 2001 From: Sebastian Bassi Date: Thu, 19 Oct 2017 00:10:59 -0700 Subject: [PATCH 14/24] WiP --- allofplos/plos_corpus.py | 14 +++++++++----- allofplos/plos_regex.py | 10 +++++----- allofplos/tests/unittests.py | 13 ++++++------- 3 files changed, 20 insertions(+), 17 deletions(-) diff --git a/allofplos/plos_corpus.py b/allofplos/plos_corpus.py index b0975e66..3c427df8 100644 --- a/allofplos/plos_corpus.py +++ b/allofplos/plos_corpus.py @@ -33,7 +33,7 @@ import requests from tqdm import tqdm -from plos_regex import validate_doi, validate_file +from plos_regex import validate_doi, validate_filename help_str = "This program downloads a zip file with all PLOS articles and checks for updates" @@ -115,16 +115,18 @@ def filename_to_doi(filename): Includes transform for the 'annotation' DOIs Uses regex to make sure it's a file and not a DOI Example: - filename_to_doi('allofplos_xml/journal.pone.1000001.xml') = '10.1371/journal.pone.1000001' + filename_to_doi('journal.pone.1000001.xml') = '10.1371/journal.pone.1000001' :param article_file: relative path to local XML file in the corpusdir directory :param directory: defaults to corpusdir, containing article files :return: full unique identifier for a PLOS article """ - if correction in filename and validate_file(filename): + #import pdb; pdb.set_trace() + if correction in filename and validate_filename(filename): article = 'annotation/' + (filename.split('.', 4)[2]) doi = prefix + article - elif validate_file(filename): + elif validate_filename(filename): doi = prefix + os.path.splitext((os.path.basename(filename)))[0] + # NOTE: A filename should never validate as a DOI, so the next elif is wrong. elif validate_doi(filename): doi = filename return doi @@ -142,6 +144,7 @@ def url_to_path(url, directory=corpusdir, plos_network=False): """ annot_prefix = 'plos.correction.' 
if url.startswith(annotation_url) or url.startswith(annotation_url_int): + # NOTE: REDO THIS! file_ = os.path.join(directory, annot_prefix + url[url.index(annotation_doi + '/')+len(annotation_doi + '/'):]. @@ -196,7 +199,8 @@ def doi_to_path(doi, directory=corpusdir): article_file = os.path.join(directory, "plos.correction." + doi.split('/')[-1] + suffix_lower) elif validate_doi(doi): article_file = os.path.join(directory, doi.lstrip(prefix) + suffix_lower) - elif validate_file(doi): + # NOTE: The following check is weird, a DOI should never validate as a file name. + elif validate_filename(doi): article_file = doi return article_file diff --git a/allofplos/plos_regex.py b/allofplos/plos_regex.py index 3be21b2b..00e98e23 100644 --- a/allofplos/plos_regex.py +++ b/allofplos/plos_regex.py @@ -18,13 +18,12 @@ r"|([a-zA-Z0-9]{13}$)" r"|([a-zA-Z0-9]{32}$))") regex_file_search = (r"((journal\.p[a-zA-Z]{3}\.[\d]{7})" - r"|(plos\.correction\.[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{12}))") + r"|(journal\.p[a-zA-Z]{3}\.correction\.[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{12}))") full_doi_regex_match = re.compile(regex_match_prefix+regex_body_match) full_doi_regex_search = re.compile(r"10\.1371/journal\.p[a-zA-Z]{3}\.[\d]{7}" "|10\.1371/annotation/[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{12}") currents_doi_regex = re.compile(regex_match_prefix+regex_body_currents) -corpus_file_regex_match = re.compile(corpusdir_regex+regex_file_search+r"\.xml") -newarticle_file_regex_match = re.compile(newarticledir_regex+regex_file_search+r"\.xml") +file_regex_match = re.compile(regex_file_search+r"\.xml") base_url = 'http://journals.plos.org/plosone/article/file?id=' url_suffix = '&type=manuscript' external_url_regex_match = re.compile(re.escape(base_url) + @@ -42,14 +41,15 @@ def validate_doi(doi): return bool(full_doi_regex_match.search(doi)) -def validate_file(article_file): +def 
validate_filename(filename): """ For an individual string, tests whether the full string is in a valid article file in corpusdir or newarticledir format or not. Example: 'allofplos_xml/journal.pbio.2000777.xml' is True, but 'allofplos_xml/journal.pbio.20007779.xml' is False + :filename: A string with a file name :return: True if string is in a valid PLOS corpus article format; False if not """ - if bool(corpus_file_regex_match.search(article_file)) or bool(newarticle_file_regex_match.search(article_file)): + if file_regex_match.search(filename): return True else: return False diff --git a/allofplos/tests/unittests.py b/allofplos/tests/unittests.py index ccbd20dc..fd64ed0a 100644 --- a/allofplos/tests/unittests.py +++ b/allofplos/tests/unittests.py @@ -11,15 +11,14 @@ 'journal.pbio.2001413&type=manuscript' example_url_int = 'http://contentrepo.plos.org:8002/v1/objects/mogilefs-prod-'\ 'repo?key=10.1371/journal.pbio.2001413.XML' -example_file = 'allofplos_xml/journal.pbio.2001413.xml' +example_file = 'journal.pbio.2001413.xml' example_doi = '10.1371/journal.pbio.2001413' example_url2 = 'http://journals.plos.org/plosone/article/file?id=10.1371/'\ 'annotation/3155a3e9-5fbe-435c-a07a-e9a4846ec0b6&type=manuscript' example_url2_int = 'http://contentrepo.plos.org:8002/v1/objects/mogilefs-'\ 'prod-repo?key=10.1371/annotation/3155a3e9-5fbe-435c-a'\ '07a-e9a4846ec0b6.XML' -example_file2 = 'allofplos_xml/plos.correction.3155a3e9-5fbe-435c'\ - '-a07a-e9a4846ec0b6.xml' +example_file2 = 'journal.ppat.correction.e92d19e0-996a-4bfa-afdd-20dce770ed75.xml' example_doi2 = '10.1371/annotation/3155a3e9-5fbe-435c-a07a-e9a4846ec0b6' @@ -30,7 +29,7 @@ def test_doi_conversions(self): TODO: What this tests are about! 
""" self.assertEqual(example_file, doi_to_path(example_doi), "{0} does not transform to {1}".format(example_doi, example_file)) - self.assertEqual(example_file2, doi_to_path(example_doi2), "{0} does not transform to {1}".format(example_doi2, example_file2)) + self.assertEqual(example_file2, doi_to_path(example_doi2, ''), "{0} does not transform to {1}".format(example_doi2, example_file2)) self.assertEqual(example_url2, doi_to_url(example_doi2), "{0} does not transform to {1}".format(example_doi2, example_url2)) self.assertEqual(example_url, doi_to_url(example_doi), "In doi_to_url, {0} does not transform to {1}".format(example_doi, example_url)) self.assertEqual(example_url2_int, doi_to_url(example_doi2, plos_network=True), @@ -69,15 +68,15 @@ def test_url_conversions(self): "{0} does not transform to {1}".format(example_url, example_doi)) self.assertEqual(example_doi2, url_to_doi(example_url2), "{0} does not transform to {1}".format(example_url2, example_doi2)) - self.assertEqual(example_file, url_to_path(example_url), + self.assertEqual(example_file, url_to_path(example_url, ''), "{0} does not transform to {1}".format(example_url, example_file)) - self.assertEqual(example_file2, url_to_path(example_url2), + self.assertEqual(example_file2, url_to_path(example_url2, ''), "{0} does not transform to {1}".format(example_url2, example_file2)) self.assertEqual(example_doi, url_to_doi(example_url_int), "{0} does not transform to {1}".format(example_url_int, example_doi)) self.assertEqual(example_doi2, url_to_doi(example_url2_int), "{0} does not transform to {1}".format(example_url2_int, example_doi2)) - self.assertEqual(example_file, url_to_path(example_url_int), + self.assertEqual(example_file, url_to_path(example_url_int, ''), "{0} does not transform to {1}".format(example_url_int, example_file)) # Test temporary commented out. 
#self.assertEqual(example_file2, url_to_path(example_url2_int, plos_network=True), From 040e8add426e3cc74db160d0acfe9d4ed30c2a8f Mon Sep 17 00:00:00 2001 From: Elizabeth Seiver Date: Thu, 19 Oct 2017 11:01:53 -0700 Subject: [PATCH 15/24] update function name --- allofplos/samples/corpus_analysis.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/allofplos/samples/corpus_analysis.py b/allofplos/samples/corpus_analysis.py index eb1df1fd..634bda43 100644 --- a/allofplos/samples/corpus_analysis.py +++ b/allofplos/samples/corpus_analysis.py @@ -20,7 +20,7 @@ from plos_corpus import (listdir_nohidden, check_article_type, get_article_xml, uncorrected_proofs_text_list, get_related_article_doi, download_updated_xml, get_all_solr_dois, doi_to_path, filename_to_doi, newarticledir, get_article_pubdate, doi_to_url, download_check_and_move) -from plos_regex import (full_doi_regex_match, validate_doi, validate_file, validate_url, currents_doi_filter) +from plos_regex import (full_doi_regex_match, validate_doi, validate_filename, validate_url, currents_doi_filter) counter = collections.Counter corpusdir = 'allofplos_xml' From 9049b12ae3627a34c3f29a8d7dee22510083b7c4 Mon Sep 17 00:00:00 2001 From: Elizabeth Seiver Date: Thu, 19 Oct 2017 12:58:21 -0700 Subject: [PATCH 16/24] fix article path --- allofplos/plos_corpus.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/allofplos/plos_corpus.py b/allofplos/plos_corpus.py index 3c427df8..92ce99a5 100644 --- a/allofplos/plos_corpus.py +++ b/allofplos/plos_corpus.py @@ -570,7 +570,7 @@ def download_updated_xml(article_file, try: articletree_local = et.parse(article_file) except OSError: - article_file_alt = os.path.join(tempdir, os.path.basename(article_file)) + article_file_alt = os.path.join(tempdir, os.path.basename(article_file) + 'xml') articletree_local = et.parse(article_file_alt) articleXML_local = et.tostring(articletree_local, method='xml', encoding='unicode') From 
18dc63d2ba5f3667a6aa896f23b828ad80c296de Mon Sep 17 00:00:00 2001 From: Elizabeth Seiver Date: Thu, 19 Oct 2017 13:05:43 -0700 Subject: [PATCH 17/24] + '.' --- allofplos/plos_corpus.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/allofplos/plos_corpus.py b/allofplos/plos_corpus.py index 92ce99a5..342ce4cf 100644 --- a/allofplos/plos_corpus.py +++ b/allofplos/plos_corpus.py @@ -570,7 +570,7 @@ def download_updated_xml(article_file, try: articletree_local = et.parse(article_file) except OSError: - article_file_alt = os.path.join(tempdir, os.path.basename(article_file) + 'xml') + article_file_alt = os.path.join(tempdir, os.path.basename(article_file) + '.xml') articletree_local = et.parse(article_file_alt) articleXML_local = et.tostring(articletree_local, method='xml', encoding='unicode') From 4b740c88c5adae9b559c6f91d819a2df3f659422 Mon Sep 17 00:00:00 2001 From: Elizabeth Seiver Date: Thu, 19 Oct 2017 13:09:34 -0700 Subject: [PATCH 18/24] fix new regex --- allofplos/plos_regex.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/allofplos/plos_regex.py b/allofplos/plos_regex.py index 00e98e23..e3416319 100644 --- a/allofplos/plos_regex.py +++ b/allofplos/plos_regex.py @@ -18,7 +18,7 @@ r"|([a-zA-Z0-9]{13}$)" r"|([a-zA-Z0-9]{32}$))") regex_file_search = (r"((journal\.p[a-zA-Z]{3}\.[\d]{7})" - r"|(journal\.p[a-zA-Z]{3}\.correction\.[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{12}))") + r"|(plos\.correction\.[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{12}))") full_doi_regex_match = re.compile(regex_match_prefix+regex_body_match) full_doi_regex_search = re.compile(r"10\.1371/journal\.p[a-zA-Z]{3}\.[\d]{7}" "|10\.1371/annotation/[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{12}") From 3024dc3628eb24556a1e207ab3737d813b30c821 Mon Sep 17 00:00:00 2001 From: Elizabeth Seiver Date: Thu, 19 Oct 2017 13:24:04 -0700 Subject: [PATCH 19/24] fix file path --- 
allofplos/plos_corpus.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/allofplos/plos_corpus.py b/allofplos/plos_corpus.py index 342ce4cf..44dd504a 100644 --- a/allofplos/plos_corpus.py +++ b/allofplos/plos_corpus.py @@ -568,7 +568,7 @@ def download_updated_xml(article_file, articletree_remote = et.parse(url) articleXML_remote = et.tostring(articletree_remote, method='xml', encoding='unicode') try: - articletree_local = et.parse(article_file) + articletree_local = et.parse(os.path.join(corpusdir, os.path.basename(article_file) + '.xml')) except OSError: article_file_alt = os.path.join(tempdir, os.path.basename(article_file) + '.xml') articletree_local = et.parse(article_file_alt) From db0bda0f181b686cea333f7f18a15d9c8876f7e8 Mon Sep 17 00:00:00 2001 From: Elizabeth Seiver Date: Thu, 19 Oct 2017 13:26:51 -0700 Subject: [PATCH 20/24] rm extension --- allofplos/plos_corpus.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/allofplos/plos_corpus.py b/allofplos/plos_corpus.py index 44dd504a..7a7a6d2c 100644 --- a/allofplos/plos_corpus.py +++ b/allofplos/plos_corpus.py @@ -568,7 +568,7 @@ def download_updated_xml(article_file, articletree_remote = et.parse(url) articleXML_remote = et.tostring(articletree_remote, method='xml', encoding='unicode') try: - articletree_local = et.parse(os.path.join(corpusdir, os.path.basename(article_file) + '.xml')) + articletree_local = et.parse(os.path.join(corpusdir, os.path.basename(article_file))) except OSError: article_file_alt = os.path.join(tempdir, os.path.basename(article_file) + '.xml') articletree_local = et.parse(article_file_alt) From d68cd6187a3c936c3f97f11c317d22836bf65b56 Mon Sep 17 00:00:00 2001 From: Sebastian Bassi Date: Thu, 19 Oct 2017 17:46:47 -0700 Subject: [PATCH 21/24] Fixing xml endings --- allofplos/plos_corpus.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/allofplos/plos_corpus.py b/allofplos/plos_corpus.py index 7a7a6d2c..3016a2a7 100644 --- 
a/allofplos/plos_corpus.py +++ b/allofplos/plos_corpus.py @@ -567,10 +567,12 @@ def download_updated_xml(article_file, url = URL_TMP.format(doi) articletree_remote = et.parse(url) articleXML_remote = et.tostring(articletree_remote, method='xml', encoding='unicode') + if not article_file.endswith('.xml'): + article_file += '.xml' try: articletree_local = et.parse(os.path.join(corpusdir, os.path.basename(article_file))) except OSError: - article_file_alt = os.path.join(tempdir, os.path.basename(article_file) + '.xml') + article_file_alt = os.path.join(tempdir, os.path.basename(article_file)) articletree_local = et.parse(article_file_alt) articleXML_local = et.tostring(articletree_local, method='xml', encoding='unicode') From fcdcc38b7e3c8cbe575117e680444774721e28c5 Mon Sep 17 00:00:00 2001 From: Sebastian Bassi Date: Thu, 19 Oct 2017 22:29:46 -0700 Subject: [PATCH 22/24] update HISTORY with the changes --- HISTORY.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/HISTORY.txt b/HISTORY.txt index d4a35d9e..63347d0d 100644 --- a/HISTORY.txt +++ b/HISTORY.txt @@ -1,3 +1,5 @@ +0.8.3 Filename structure for annotation DOI does not depend on journal name. Improved tests. + 0.8.2 Adding a method to generate a CSV file with all PLOS articles. 0.8.1 Adding entry point. Thanks Chris Haumesser. 
From be74696d1dfc78d0e50bff1b287615146c874564 Mon Sep 17 00:00:00 2001 From: Sebastian Bassi Date: Thu, 19 Oct 2017 22:31:15 -0700 Subject: [PATCH 23/24] Change version number --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 2890d21d..a83068df 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ setup( name='allofplos', # https://packaging.python.org/en/latest/single_source_version.html - version='0.8.2', + version='0.8.3', description='Get and analyze all PLOS articles', long_description=long_description, url='https://github.com/PLOS/allofplos', From 4f87434c482e4011a111c40627416a7dceaf35c9 Mon Sep 17 00:00:00 2001 From: Sebastian Bassi Date: Thu, 19 Oct 2017 22:33:53 -0700 Subject: [PATCH 24/24] update readme --- README.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 2e167ed6..ec89b1a1 100644 --- a/README.rst +++ b/README.rst @@ -13,7 +13,8 @@ corpus for further analysis. Use this program to download all PLOS XML article files instead of doing web scraping. **NOTE**: This software is not stable, we consider it beta state and will -be in this stage until version 1.0. +be in this stage until version 1.0. This means that programming interface +may change and after a new version a full corpus download may be required. Installation instructions -------------------------