#!/usr/bin/env python
"""Download all accepted-paper PDFs for one NIPS year from https://papers.nips.cc.

Usage:
    python get_all_pdf_nips.py <year>

PDFs are saved into a local directory named ``NIPS<year>`` (created if needed).

Requires third-party packages (imported lazily inside ``main``):
    pip install requests beautifulsoup4
"""

import logging
import os
import sys
from os import path
from urllib.parse import urljoin

BASE_URL = "https://papers.nips.cc"

# Volume 1 of "Advances in Neural Information Processing Systems" covers the
# 1987 conference, so the volume number for a given year is year - 1987.
# (The original script hard-coded volume 29, which is only correct for 2016.)
_FIRST_YEAR = 1987


def book_url(year):
    """Return the proceedings index URL for *year* (int or numeric str)."""
    volume = int(year) - _FIRST_YEAR
    return "%s/book/advances-in-neural-information-processing-systems-%d-%s" % (
        BASE_URL, volume, year)


def pdf_name(href):
    """Map a paper href like ``/paper/1234-title`` to its local file name.

    Equivalent to the original ``href[7:]`` (strip the ``/paper/`` prefix)
    but robust to a missing leading path.
    """
    return path.basename(href) + ".pdf"


def main(argv=None):
    """Fetch the proceedings page and download every linked paper PDF."""
    # Third-party imports are function-local so the module can be imported
    # (e.g. to reuse the URL helpers) without requests/bs4 installed.
    from requests import get
    from bs4 import BeautifulSoup

    argv = sys.argv[1:] if argv is None else argv
    if not argv:
        # Original crashed with IndexError when the year was omitted.
        sys.exit("usage: python get_all_pdf_nips.py <year>")
    year = str(argv[0])

    out_dir = "NIPS" + year
    os.makedirs(out_dir, exist_ok=True)  # idempotent: safe on reruns

    url = book_url(year)
    logging.info("Connecting to URL: %s", url)
    soup = BeautifulSoup(get(url).content, "html.parser")

    # The first <li> is page chrome; every following <li> is one paper entry
    # whose first <a> links to the abstract page (href ends without ".pdf").
    papers = soup.find_all("li")[1:]
    n_pdfs = 0
    for entry in papers:
        href = entry.find_all("a")[0]["href"]
        print(href)
        response = get(urljoin(BASE_URL, href + ".pdf"))
        with open(path.join(out_dir, pdf_name(href)), "wb") as pdf:
            pdf.write(response.content)
        n_pdfs += 1
    print("Downloaded %d PDFs to %s" % (n_pdfs, out_dir))


if __name__ == "__main__":
    main()