Skip to content

Commit

Permalink
Merge pull request #25 from PLOS/csvfile
Browse files Browse the repository at this point in the history
Csv file creation
  • Loading branch information
sbassi authored Oct 11, 2017
2 parents d732feb + 29818bb commit 89cf95e
Show file tree
Hide file tree
Showing 8 changed files with 773 additions and 68 deletions.
2 changes: 2 additions & 0 deletions HISTORY.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
0.8.2 Adding a method to generate a CSV file with all PLOS articles.

0.8.1 Adding entry point. Thanks Chris Haumesser.

0.8.0 First public release.
2 changes: 1 addition & 1 deletion allofplos/allofplos_basics.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
"source": [
"example_dois = get_random_list_of_dois(count=10)\n",
"example_doi = example_dois[0]\n",
"example_file = doi_to_file(example_doi)\n",
"example_file = doi_to_path(example_doi)\n",
"example_url = doi_to_url(example_doi)\n",
"print(\"Three ways to represent an article\\nArticle as DOI: {}\\nArticle as local file: {}\\nArticle as url: {}\" \\\n",
" .format(example_doi, example_file, example_url))"
Expand Down
288 changes: 288 additions & 0 deletions allofplos/csvfile.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,288 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import os\n",
"import datetime\n",
"import lxml.etree as et\n",
"import csv\n",
"pmcdir = 'pmc_articles'\n",
"from plos_corpus import (corpusdir, get_article_pubdate, check_if_uncorrected_proof, listdir_nohidden,\n",
" get_article_xml, file_to_doi, doi_to_file, get_all_solr_dois, download_check_and_move)\n",
"\n",
"from samples.corpus_analysis import (get_plos_article_type, get_article_dtd, get_random_list_of_dois, \n",
" get_related_retraction_article, check_article_type, get_plos_journal,\n",
" get_article_title, parse_article_date, get_corpus_metadata,\n",
" get_article_abstract, corpus_metadata_to_csv, get_article_dates,\n",
" read_corpus_metadata_from_csv)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Build the article list in this cell so it runs on a fresh kernel without relying on a later cell.\n",
"article_list = listdir_nohidden(corpusdir)\n",
"# get_corpus_metadata is unpacked into two values: the metadata and the entries flagged as wrong dates.\n",
"corpus_metadata, wrong_dates = get_corpus_metadata(article_list=article_list)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
" 27% (60403 of 221314) |#### | Elapsed Time: 0:33:25 ETA: 1:40:03"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Error getting history dates for allofplos_xml/journal.pone.0034143.xml\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 46% (103042 of 221314) |####### | Elapsed Time: 0:57:26 ETA: 1:01:12"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Error parsing DTD from allofplos_xml/journal.pone.0076809.xml\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 79% (176482 of 221314) |############# | Elapsed Time: 1:37:52 ETA: 0:25:51"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"No abstract found for research article 10.1371/journal.pone.0150341\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 84% (186376 of 221314) |############## | Elapsed Time: 1:43:42 ETA: 0:20:36"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"No abstract found for research article 10.1371/journal.pone.0160248\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 85% (189962 of 221314) |############## | Elapsed Time: 1:45:51 ETA: 0:18:34"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"No abstract found for research article 10.1371/journal.pone.0163841\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 90% (199535 of 221314) |############### | Elapsed Time: 1:51:29 ETA: 0:12:43"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"No abstract found for research article 10.1371/journal.pone.0173427\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 90% (200365 of 221314) |############### | Elapsed Time: 1:51:57 ETA: 0:12:10"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"No abstract found for research article 10.1371/journal.pone.0174259\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 94% (210201 of 221314) |################ | Elapsed Time: 1:57:46 ETA: 0:06:38"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"No abstract found for research article 10.1371/journal.pone.0184204\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 96% (212865 of 221314) |################ | Elapsed Time: 1:58:41 ETA: 0:04:23"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Error parsing article body: allofplos_xml/journal.pone.correction.5fbbf39a-fb47-4ce1-8069-acd830b3d41f.xml\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100% (221314 of 221314) |#################| Elapsed Time: 2:02:36 Time: 2:02:36\n"
]
}
],
"source": [
"article_list = listdir_nohidden(corpusdir)\n",
"corpus_metadata_to_csv(article_list=article_list)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# TODO: update corpus_metadata with a list of DOIs. Check that the functions can handle an overlapping list, and make\n",
"# sure that appending works correctly. There were some errors reading in a CSV that had been appended/extended."
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"corpus_metadata_reconstructed = read_corpus_metadata_from_csv()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"('10.1371/journal.pbio.0000001',\n",
" 'journal.pbio.0000001',\n",
" 'A Functional Analysis of the Spacer of V(D)J Recombination Signal Sequences',\n",
" 'PLOS Biology',\n",
" 'research-article',\n",
" 'Research Article',\n",
" 'NLM 3.0',\n",
" '2003-10-13',\n",
" '2003-06-01',\n",
" '2003-07-10',\n",
" '',\n",
" '',\n",
" '',\n",
" '',\n",
" '9942',\n",
" '',\n",
" 'During lymphocyte development, V(D)J recombination assembles antigen receptor genes from component V, D, and J gene segments. These gene segments are flanked by a recombination signal sequence (RSS), which serves as the binding site for the recombination machinery. The murine Jβ2.6 gene segment is a recombinationally inactive pseudogene, but examination of its RSS reveals no obvious reason for its failure to recombine. Mutagenesis of the Jβ2.6 RSS demonstrates that the sequences of the heptamer, nonamer, and spacer are all important. Strikingly, changes solely in the spacer sequence can result in dramatic differences in the level of recombination. The subsequent analysis of a library of more than 4,000 spacer variants revealed that spacer residues of particular functional importance are correlated with their degree of conservation. Biochemical assays indicate distinct cooperation between the spacer and heptamer/nonamer along each step of the reaction pathway. The results suggest that the spacer serves not only to ensure the appropriate distance between the heptamer and nonamer but also regulates RSS activity by providing additional RAG:RSS interaction surfaces. We conclude that while RSSs are defined by a “digital” requirement for absolutely conserved nucleotides, the quality of RSS function is determined in an “analog” manner by numerous complex interactions between the RAG proteins and the less-well conserved nucleotides in the heptamer, the nonamer, and, importantly, the spacer. Those modulatory effects are accurately predicted by a new computational algorithm for “RSS information content.” The interplay between such binary and multiplicative modes of interactions provides a general model for analyzing protein–DNA interactions in various biological systems.')"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"corpus_metadata_reconstructed[0]"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.2"
},
"toc": {
"colors": {
"hover_highlight": "#DAA520",
"navigate_num": "#000000",
"navigate_text": "#333333",
"running_highlight": "#FF0000",
"selected_highlight": "#FFD700",
"sidebar_border": "#EEEEEE",
"wrapper_background": "#FFFFFF"
},
"moveMenuLeft": true,
"nav_menu": {
"height": "12px",
"width": "252px"
},
"navigate_menu": true,
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"threshold": 4,
"toc_cell": false,
"toc_position": {},
"toc_section_display": "block",
"toc_window_display": false,
"widenNotebook": false
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading

0 comments on commit 89cf95e

Please sign in to comment.