pia_scraper.py (forked from 18F/privacy-tools)
from urllib.parse import urlparse
import csv
import glob
import pdb
import re
import time

import requests
from bs4 import BeautifulSoup
from pdfminer.high_level import extract_text

GSAS_PIA_URL = "https://www.gsa.gov/reference/gsa-privacy-program/privacy-impact-assessments-pia"


class Agency:
    def __init__(self):
        '''
        Agency holds the URL to the list of PIAs and the scraped PIAs.
        '''
        self.url = GSAS_PIA_URL
        self.pias = []
    def get_pia_urls(self):
        result = requests.get(self.url)
        soup = BeautifulSoup(result.text, 'html.parser')
        for link in soup.find_all('a'):  # Find all anchor tags.
            href = link.get('href')
            if href and "/cdnstatic/" in href:  # All the PDFs are stored on GSA's CDN; anchors without an href yield None, so guard first.
                full_url = "https://gsa.gov" + href
                pia = PIA(full_url)
                self.pias.append(pia)
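    # Note: the index links are site-relative (e.g. "/cdnstatic/<name>_PIA_<Month><Year>.pdf",
    # a filename shape inferred from the handling in PIA.download_pdf below),
    # which is why the absolute URL is rebuilt by prefixing "https://gsa.gov".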

    def load_local_pias_from_txt(self):
        txts = glob.glob('pias/*.txt')
        for txt_path in txts:
            pia = PIA(txt_path=txt_path)
            self.pias.append(pia)

    def write_all_to_csv(self):
        with open("gsa_pias.csv", "w", newline="") as f:  # newline="" avoids blank rows on Windows.
            writer = csv.writer(f)
            writer.writerow(['System Name', 'Authority', 'URL'])
            for pia in self.pias:
                writer.writerow([pia.system_name, pia.authority, pia.pdf_url])
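    # The resulting gsa_pias.csv looks roughly like (illustrative row, not real output):
    #   System Name,Authority,URL
    #   Example System,5 U.S.C. 301,https://gsa.gov/cdnstatic/Example_PIA_August2019.pdf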


class PIA:
    def __init__(self, pdf_url=None, pdf_path=None, txt_path=None):
        self.pdf_url = pdf_url
        self.pdf_path = pdf_path
        self.txt_path = txt_path
        self.full_text = None
        self.system_name = None
        self.authority = None

    def download_pdf(self):
        print(self.pdf_url)
        filename = urlparse(self.pdf_url).path
        filename = filename.split("/")[2]  # Drop the leading "/cdnstatic/".
        self.pdf_path = 'pias/' + filename
        result = requests.get(self.pdf_url)
        with open(self.pdf_path, 'wb') as f:
            f.write(result.content)

    def get_text_from_pdf(self):
        self.txt_path = self.pdf_path.rsplit(".", 1)[0] + ".txt"  # Replace the .pdf extension (rsplit tolerates dots in the name).
        self.full_text = extract_text(self.pdf_path)
        with open(self.txt_path, 'w') as f:
            f.write(self.full_text)

    def get_text_from_txt(self):
        with open(self.txt_path) as f:
            self.full_text = f.read()

    def get_system_name(self):
        # Two different regex patterns run in order to get the most complete set of system names.
        words_after_system_names = r"January|February|March|April|May|June|July|August|September|October|November|December|Privacy|\d+/"
        pattern_one = r"(.*?)(?=%s)" % words_after_system_names
        pattern_two = r"(?<=Assessment)(.*?)(?=%s)" % words_after_system_names
        match = re.search(pattern_one, self.full_text, flags=re.IGNORECASE | re.DOTALL)
        self.system_name = match.group(1).replace('\n', '').strip() if match else None
        if not self.system_name:
            match = re.search(pattern_two, self.full_text, flags=re.IGNORECASE | re.DOTALL)
            if match:
                self.system_name = match.group(1).replace('\n', '').strip()
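    # Worked example (hypothetical PIA header, for illustration only): given
    # full_text beginning "Privacy Impact Assessment\nE-Travel System\nJanuary 2020",
    # pattern_one captures the empty string at position 0 (the lookahead sees
    # "Privacy" immediately), so the pattern_two fallback runs and captures
    # "\nE-Travel System\n" between "Assessment" and "January" instead.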

    def get_authority(self):
        pattern_just_cfr = r"(\d+\s+(CFR|U\.S\.C\.|U\.S\. Code § |USC § )\s*\d+\.*\d+)"
        pattern_full_authority = r"\d\.\d.+authority.*?\?(.+?)\d\.\d\s+"
        match = re.search(pattern_just_cfr, self.full_text, flags=re.DOTALL)
        if match:
            self.authority = ", ".join(match.groups()).replace("\n", "")
        # Fallback, currently disabled: capture the full answer to the numbered
        # "authority" question instead of just the citation.
        # if not self.authority:
        #     match = re.search(pattern_full_authority, self.full_text, flags=re.DOTALL)
        #     try:
        #         self.authority = match.group(1).replace("\n", "")
        #     except AttributeError:
        #         pass
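

# A minimal end-to-end sketch (not part of the original script) showing how the
# pieces above compose: scrape the index page, download each PDF, extract its
# text, pull out the metadata, and write gsa_pias.csv. The one-second pause
# between downloads is an assumption, added to be polite to gsa.gov; everything
# else uses only the methods defined above.
def scrape_and_export():
    agency = Agency()
    agency.get_pia_urls()
    for pia in agency.pias:
        pia.download_pdf()
        pia.get_text_from_pdf()
        pia.get_system_name()
        pia.get_authority()
        time.sleep(1)  # Assumed rate limit, not in the original.
    agency.write_all_to_csv()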


if __name__ == '__main__':
    # pia = PIA(txt_path="pias/Government%20Retirement%20Benefits_PIA_August2019.txt")
    # pia.get_text_from_txt()
    # pia.get_authority()
    # print(pia.authority)
    # pdb.set_trace()
    agency = Agency()
    agency.load_local_pias_from_txt()
    for pia in agency.pias:
        # pia.pdf_url = "https://www.gsa.gov/cdnstatic/" + pia.txt_path.replace("pias/", "").replace("txt", "pdf")
        pia.get_text_from_txt()
        # pia.get_system_name()
        pia.get_authority()
        print(pia.authority)
        print(pia.txt_path)
    # agency.write_all_to_csv()
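    # To run the full scrape-and-download pipeline instead of this local-text
    # pass, call scrape_and_export() (sketch defined above).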