#!/usr/bin/env python
"""Download all accepted-paper PDFs for one NIPS year from https://papers.nips.cc.

Usage:
    python get_all_pdf_nips.py <year>

PDFs are saved into a local directory named ``NIPS<year>`` (created if needed).

Requires third-party packages (imported lazily inside ``main``):
    pip install requests beautifulsoup4
"""

import logging
import os
import sys
from os import path
from urllib.parse import urljoin

BASE_URL = "https://papers.nips.cc"

# Volume 1 of "Advances in Neural Information Processing Systems" covers the
# 1987 conference, so the volume number for a given year is year - 1987.
# (The original script hard-coded volume 29, which is only correct for 2016.)
_FIRST_YEAR = 1987


def book_url(year):
    """Return the proceedings index URL for *year* (int or numeric str)."""
    volume = int(year) - _FIRST_YEAR
    return "%s/book/advances-in-neural-information-processing-systems-%d-%s" % (
        BASE_URL, volume, year)


def pdf_name(href):
    """Map a paper href like ``/paper/1234-title`` to its local file name.

    Equivalent to the original ``href[7:]`` (strip the ``/paper/`` prefix)
    but robust to a missing leading path.
    """
    return path.basename(href) + ".pdf"


def main(argv=None):
    """Fetch the proceedings page and download every linked paper PDF."""
    # Third-party imports are function-local so the module can be imported
    # (e.g. to reuse the URL helpers) without requests/bs4 installed.
    from requests import get
    from bs4 import BeautifulSoup

    argv = sys.argv[1:] if argv is None else argv
    if not argv:
        # Original crashed with IndexError when the year was omitted.
        sys.exit("usage: python get_all_pdf_nips.py <year>")
    year = str(argv[0])

    out_dir = "NIPS" + year
    os.makedirs(out_dir, exist_ok=True)  # idempotent: safe on reruns

    url = book_url(year)
    logging.info("Connecting to URL: %s", url)
    soup = BeautifulSoup(get(url).content, "html.parser")

    # The first <li> is page chrome; every following <li> is one paper entry
    # whose first <a> links to the abstract page (href ends without ".pdf").
    papers = soup.find_all("li")[1:]
    n_pdfs = 0
    for entry in papers:
        href = entry.find_all("a")[0]["href"]
        print(href)
        response = get(urljoin(BASE_URL, href + ".pdf"))
        with open(path.join(out_dir, pdf_name(href)), "wb") as pdf:
            pdf.write(response.content)
        n_pdfs += 1
    print("Downloaded %d PDFs to %s" % (n_pdfs, out_dir))


if __name__ == "__main__":
    main()