diff --git a/markdown_generator/PubsFromBib.ipynb b/markdown_generator/PubsFromBib.ipynb new file mode 100644 index 0000000000000..df898a7128007 --- /dev/null +++ b/markdown_generator/PubsFromBib.ipynb @@ -0,0 +1,223 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Publications markdown generator for academicpages\n", + "\n", + "Takes a set of BibTeX files of publications and converts them for use with [academicpages.github.io](https://academicpages.github.io). This is an interactive Jupyter notebook ([see more info here](http://jupyter-notebook-beginner-guide.readthedocs.io/en/latest/what_is_jupyter.html)). \n", + "\n", + "The core python code is also in `pubsFromBib.py`. \n", + "Run either from the `markdown_generator` folder after updating the publist dictionary with:\n", + "* bib file names\n", + "* specific venue keys based on your bib file preferences\n", + "* any specific pre-text for specific files\n", + "* Collection Name (future feature)\n", + "\n", + "TODO: Make this work with other databases of citations\n", + "TODO: Merge this with the existing TSV parsing solution" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pybtex.database.input import bibtex\n", + "import pybtex.database.input.bibtex \n", + "from time import strptime\n", + "import string\n", + "import html\n", + "import os\n", + "import re" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#todo: incorporate different collection types rather than a catch all publications, requires other changes to template\n", + "publist = {\n", + " \"proceeding\": {\n", + " \"file\" : \"proceedings.bib\",\n", + " \"venuekey\": \"booktitle\",\n", + " \"venue-pretext\": \"In the proceedings of \",\n", + " \"collection\" : {\"name\":\"publications\",\n", + " \"permalink\":\"/publication/\"}\n", + " \n", + " },\n", + " \"journal\":{\n", + " 
\"file\": \"pubs.bib\",\n", + " \"venuekey\" : \"journal\",\n", + " \"venue-pretext\" : \"\",\n", + " \"collection\" : {\"name\":\"publications\",\n", + " \"permalink\":\"/publication/\"}\n", + " } \n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "html_escape_table = {\n", + " \"&\": \"&\",\n", + " '\"': \""\",\n", + " \"'\": \"'\"\n", + " }\n", + "\n", + "def html_escape(text):\n", + " \"\"\"Produce entities within text.\"\"\"\n", + " return \"\".join(html_escape_table.get(c,c) for c in text)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "for pubsource in publist:\n", + " parser = bibtex.Parser()\n", + " bibdata = parser.parse_file(publist[pubsource][\"file\"])\n", + "\n", + " #loop through the individual references in a given bibtex file\n", + " for bib_id in bibdata.entries:\n", + " #reset default date\n", + " pub_year = \"1900\"\n", + " pub_month = \"01\"\n", + " pub_day = \"01\"\n", + " \n", + " b = bibdata.entries[bib_id].fields\n", + " \n", + " try:\n", + " pub_year = f'{b[\"year\"]}'\n", + "\n", + " #todo: this hack for month and day needs some cleanup\n", + " if \"month\" in b.keys(): \n", + " if(len(b[\"month\"])<3):\n", + " pub_month = \"0\"+b[\"month\"]\n", + " pub_month = pub_month[-2:]\n", + " elif(b[\"month\"] not in range(12)):\n", + " tmnth = strptime(b[\"month\"][:3],'%b').tm_mon \n", + " pub_month = \"{:02d}\".format(tmnth) \n", + " else:\n", + " pub_month = str(b[\"month\"])\n", + " if \"day\" in b.keys(): \n", + " pub_day = str(b[\"day\"])\n", + "\n", + " \n", + " pub_date = pub_year+\"-\"+pub_month+\"-\"+pub_day\n", + " \n", + " #strip out {} as needed (some bibtex entries that maintain formatting)\n", + " clean_title = b[\"title\"].replace(\"{\", \"\").replace(\"}\",\"\").replace(\"\\\\\",\"\").replace(\" \",\"-\") \n", + "\n", + " url_slug = 
re.sub(\"\\\\[.*\\\\]|[^a-zA-Z0-9_-]\", \"\", clean_title)\n", + " url_slug = url_slug.replace(\"--\",\"-\")\n", + "\n", + " md_filename = (str(pub_date) + \"-\" + url_slug + \".md\").replace(\"--\",\"-\")\n", + " html_filename = (str(pub_date) + \"-\" + url_slug).replace(\"--\",\"-\")\n", + "\n", + " #Build Citation from text\n", + " citation = \"\"\n", + "\n", + " #citation authors - todo - add highlighting for primary author?\n", + " for author in bibdata.entries[bib_id].persons[\"author\"]:\n", + " citation = citation+\" \"+author.first_names[0]+\" \"+author.last_names[0]+\", \"\n", + "\n", + " #citation title\n", + " citation = citation + \"\\\"\" + html_escape(b[\"title\"].replace(\"{\", \"\").replace(\"}\",\"\").replace(\"\\\\\",\"\")) + \".\\\"\"\n", + "\n", + " #add venue logic depending on citation type\n", + " venue = publist[pubsource][\"venue-pretext\"]+b[publist[pubsource][\"venuekey\"]].replace(\"{\", \"\").replace(\"}\",\"\").replace(\"\\\\\",\"\")\n", + "\n", + " citation = citation + \" \" + html_escape(venue)\n", + " citation = citation + \", \" + pub_year + \".\"\n", + "\n", + " \n", + " ## YAML variables\n", + " md = \"---\\ntitle: \\\"\" + html_escape(b[\"title\"].replace(\"{\", \"\").replace(\"}\",\"\").replace(\"\\\\\",\"\")) + '\"\\n'\n", + " \n", + " md += \"\"\"collection: \"\"\" + publist[pubsource][\"collection\"][\"name\"]\n", + "\n", + " md += \"\"\"\\npermalink: \"\"\" + publist[pubsource][\"collection\"][\"permalink\"] + html_filename\n", + " \n", + " note = False\n", + " if \"note\" in b.keys():\n", + " if len(str(b[\"note\"])) > 5:\n", + " md += \"\\nexcerpt: '\" + html_escape(b[\"note\"]) + \"'\"\n", + " note = True\n", + "\n", + " md += \"\\ndate: \" + str(pub_date) \n", + "\n", + " md += \"\\nvenue: '\" + html_escape(venue) + \"'\"\n", + " \n", + " url = False\n", + " if \"url\" in b.keys():\n", + " if len(str(b[\"url\"])) > 5:\n", + " md += \"\\npaperurl: '\" + b[\"url\"] + \"'\"\n", + " url = True\n", + "\n", + " md += 
\"\\ncitation: '\" + html_escape(citation) + \"'\"\n", + "\n", + " md += \"\\n---\"\n", + "\n", + " \n", + " ## Markdown description for individual page\n", + " if note:\n", + " md += \"\\n\" + html_escape(b[\"note\"]) + \"\\n\"\n", + "\n", + " if url:\n", + " md += \"\\n[Access paper here](\" + b[\"url\"] + \"){:target=\\\"_blank\\\"}\\n\" \n", + " else:\n", + " md += \"\\nUse [Google Scholar](https://scholar.google.com/scholar?q=\"+html.escape(clean_title.replace(\"-\",\"+\"))+\"){:target=\\\"_blank\\\"} for full citation\"\n", + "\n", + " md_filename = os.path.basename(md_filename)\n", + "\n", + " with open(\"../_publications/\" + md_filename, 'w') as f:\n", + " f.write(md)\n", + " print(f'SUCESSFULLY PARSED {bib_id}: \\\"', b[\"title\"][:60],\"...\"*(len(b['title'])>60),\"\\\"\")\n", + " # field may not exist for a reference\n", + " except KeyError as e:\n", + " print(f'WARNING Missing Expected Field {e} from entry {bib_id}: \\\"', b[\"title\"][:30],\"...\"*(len(b['title'])>30),\"\\\"\")\n", + " continue\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/markdown_generator/pubsFromBib.py b/markdown_generator/pubsFromBib.py new file mode 100644 index 0000000000000..92b4d02f942f3 --- /dev/null +++ b/markdown_generator/pubsFromBib.py @@ -0,0 +1,160 @@ +#!/usr/bin/env python +# coding: utf-8 + +# # Publications markdown generator for academicpages +# +# Takes a set of bibtex of publications and converts them for use with [academicpages.github.io](academicpages.github.io). 
# This is an interactive Jupyter notebook ([see more info here](http://jupyter-notebook-beginner-guide.readthedocs.io/en/latest/what_is_jupyter.html)).
#
# The core python code is also in `pubsFromBib.py`.
# Run either from the `markdown_generator` folder after updating the publist dictionary with:
# * bib file names
# * specific venue keys based on your bib file preferences
# * any specific pre-text for specific files
# * Collection Name (future feature)
#
# TODO: Make this work with other databases of citations
# TODO: Merge this with the existing TSV parsing solution

"""Generate academicpages publication markdown files from BibTeX files.

For every source configured in ``publist`` the BibTeX file is parsed with
pybtex and one markdown page (YAML front matter plus a short body) is written
to ``OUTPUT_DIR`` per entry.  Run as a script from the ``markdown_generator``
folder.
"""

from time import strptime
from urllib.parse import quote_plus
import string
import html
import os
import re

#todo: incorporate different collection types rather than a catch all publications, requires other changes to template
publist = {
    "proceeding": {
        "file": "proceedings.bib",
        "venuekey": "booktitle",
        "venue-pretext": "In the proceedings of ",
        "collection": {"name": "publications",
                       "permalink": "/publication/"},
    },
    "journal": {
        "file": "pubs.bib",
        "venuekey": "journal",
        "venue-pretext": "",
        "collection": {"name": "publications",
                       "permalink": "/publication/"},
    },
}

# Characters that would terminate the quoted YAML strings written below,
# mapped to the HTML entities that replace them.
html_escape_table = {
    "&": "&amp;",
    '"': "&quot;",
    "'": "&apos;",
}

# Directory (relative to the working directory) that receives the pages.
OUTPUT_DIR = "../_publications/"

# Date components used when an entry has no usable month/day field.
DEFAULT_MONTH = "01"
DEFAULT_DAY = "01"

# `note` / `url` values of this many characters or fewer are ignored
# (presumably to skip placeholder values -- TODO confirm the intent).
_MIN_FIELD_LENGTH = 5

# Deletes the characters some BibTeX entries use to preserve formatting.
_BIBTEX_MARKUP = str.maketrans("", "", "{}\\")

# Bracketed spans and every character that is not URL-slug safe.
_SLUG_JUNK = re.compile(r"\[.*\]|[^a-zA-Z0-9_-]")


def html_escape(text):
    """Return *text* with ``&``, ``"`` and ``'`` replaced by HTML entities."""
    return "".join(html_escape_table.get(c, c) for c in text)


def _strip_bibtex_markup(text):
    """Return *text* as a string without ``{``, ``}`` and backslash characters."""
    return str(text).translate(_BIBTEX_MARKUP)


def _parse_month(raw_month):
    """Convert a BibTeX ``month`` value to a zero-padded two-digit string.

    Month numbers ("3", "03", "12") and month names or abbreviations that
    ``strptime('%b')`` recognises ("mar", "March") are accepted; any other
    value yields ``DEFAULT_MONTH`` instead of aborting the whole run.
    """
    value = str(raw_month).strip()
    if value.isdecimal():
        number = int(value)
    else:
        try:
            # Only the first three letters are matched, so full month names
            # and dotted abbreviations parse as well.
            number = strptime(value[:3], "%b").tm_mon
        except ValueError:
            return DEFAULT_MONTH
    return f"{number:02d}" if 1 <= number <= 12 else DEFAULT_MONTH


def _parse_day(raw_day):
    """Convert a BibTeX ``day`` value to a zero-padded two-digit string.

    Values that are not a day number between 1 and 31 yield ``DEFAULT_DAY``.
    """
    value = str(raw_day).strip()
    if value.isdecimal() and 1 <= int(value) <= 31:
        return f"{int(value):02d}"
    return DEFAULT_DAY


def _author_name(person):
    """Return "First Last" from a pybtex person's first given and family name.

    Missing parts are skipped, so an author with no given name (for example an
    organisation) no longer raises ``IndexError``.
    """
    return " ".join(list(person.first_names[:1]) + list(person.last_names[:1]))


def build_page(fields, author_names, source):
    """Render one bibliography entry as an academicpages markdown page.

    Args:
        fields: Mapping of BibTeX field names to values for the entry.
        author_names: Display names of the authors, in citation order.
        source: The ``publist`` configuration the entry was read from.

    Returns:
        A ``(md_filename, md_text)`` tuple: the bare file name of the page
        and its complete contents.

    Raises:
        KeyError: If ``year``, ``title`` or the configured venue field is
            missing from *fields*.
    """
    pub_year = f'{fields["year"]}'
    pub_month = _parse_month(fields["month"]) if "month" in fields else DEFAULT_MONTH
    pub_day = _parse_day(fields["day"]) if "day" in fields else DEFAULT_DAY
    pub_date = pub_year + "-" + pub_month + "-" + pub_day

    title = _strip_bibtex_markup(fields["title"])

    # Single-pass "--" collapsing is kept as-is so permalinks stay stable.
    url_slug = _SLUG_JUNK.sub("", title.replace(" ", "-")).replace("--", "-")
    html_filename = (pub_date + "-" + url_slug).replace("--", "-")
    md_filename = os.path.basename(html_filename + ".md")

    #add venue logic depending on citation type
    venue = source["venue-pretext"] + _strip_bibtex_markup(fields[source["venuekey"]])

    # The citation is assembled from unescaped parts and escaped exactly once
    # when written; escaping the parts first would double-escape "&".
    citation = (
        "".join(name + ", " for name in author_names)
        + '"' + title + '."'
        + " " + venue
        + ", " + pub_year + "."
    )

    note = str(fields["note"]) if "note" in fields else ""
    has_note = len(note) > _MIN_FIELD_LENGTH
    url = str(fields["url"]) if "url" in fields else ""
    has_url = len(url) > _MIN_FIELD_LENGTH

    ## YAML variables
    front_matter = [
        "---",
        'title: "' + html_escape(title) + '"',
        "collection: " + source["collection"]["name"],
        "permalink: " + source["collection"]["permalink"] + html_filename,
    ]
    if has_note:
        front_matter.append("excerpt: '" + html_escape(note) + "'")
    front_matter.append("date: " + pub_date)
    front_matter.append("venue: '" + html_escape(venue) + "'")
    if has_url:
        front_matter.append("paperurl: '" + url + "'")
    front_matter.append("citation: '" + html_escape(citation) + "'")
    front_matter.append("---")
    md = "\n".join(front_matter)

    ## Markdown description for individual page
    if has_note:
        md += "\n" + html_escape(note) + "\n"

    if has_url:
        md += "\n[Access paper here](" + url + "){:target=\"_blank\"}\n"
    else:
        # The title goes into a URL query string, so it is URL-encoded.
        md += "\nUse [Google Scholar](https://scholar.google.com/scholar?q=" + quote_plus(title) + "){:target=\"_blank\"} for full citation"

    return md_filename, md


def main():
    """Parse every configured BibTeX file and write one page per entry."""
    # pybtex is imported here rather than at module level so the helpers
    # above stay importable without the third-party dependency.
    from pybtex.database.input import bibtex

    for pubsource in publist:
        source = publist[pubsource]
        parser = bibtex.Parser()
        bibdata = parser.parse_file(source["file"])

        #loop through the individual references in a given bibtex file
        for bib_id in bibdata.entries:
            entry = bibdata.entries[bib_id]
            b = entry.fields
            # Looked up defensively: the title itself may be the missing field.
            shown_title = str(b["title"]) if "title" in b else ""

            try:
                author_names = [_author_name(a) for a in entry.persons["author"]]
                md_filename, md = build_page(b, author_names, source)
            # field may not exist for a reference
            except KeyError as e:
                print(f'WARNING Missing Expected Field {e} from entry {bib_id}: \"', shown_title[:30], "..."*(len(shown_title) > 30), "\"")
                continue

            with open(os.path.join(OUTPUT_DIR, md_filename), 'w', encoding="utf-8") as f:
                f.write(md)
            print(f'SUCESSFULLY PARSED {bib_id}: \"', shown_title[:60], "..."*(len(shown_title) > 60), "\"")


if __name__ == "__main__":
    main()