-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathhcris_load_extract.py
239 lines (208 loc) · 9.68 KB
/
hcris_load_extract.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
import mysql.connector
import requests, zipfile, io
import os, codecs
import datetime
import config
def main():
# Define data directory for data dump and MySQL LOAD DATA INFILE statements
data_dir = "./data"
current_year = datetime.datetime.now().year
# Function to Download raw HCRIS data from CMS website
def download(form, year=None):
"""Downloads HCRIS files from CMS website based on year and form (i.e. 2552-96 or 2552-10)."""
if form == "2552-96" and year is not None:
r = requests.get(f"http://downloads.cms.gov/Files/hcris/HOSPFY{year}.zip")
elif form == "2552-10" and year is not None:
r = requests.get(f"http://downloads.cms.gov/Files/hcris/HOSP10FY{year}.zip")
elif form == "2552-96" and year is None:
r = requests.get("http://downloads.cms.gov/files/hcris/HOSP-REPORTS.ZIP")
elif form == "2552-10" and year is None:
r = requests.get("http://downloads.cms.gov/files/hcris/hosp10-reports.zip")
# Read content stream of Zip file and extract to data directory
z = zipfile.ZipFile(io.BytesIO(r.content))
z.extractall(f"{data_dir}/")
z.close()
# Function to Remove 'Byte Order Mark' from report file of earlier years 1996-2011 form 2552-96
def remove_bom(file):
"""Removes the 'Byte Order Mark' at the beginning of CSV file, which causes LOAD INFILE to fail"""
bufsize = 4096
bomlen = len(codecs.BOM_UTF8)
with open(file, "r+b") as fp:
chunk = fp.read(bufsize)
if chunk.startswith(codecs.BOM_UTF8):
i = 0
chunk = chunk[bomlen:]
while chunk:
fp.seek(i)
fp.write(chunk)
i += len(chunk)
fp.seek(bomlen, os.SEEK_CUR)
chunk = fp.read(bufsize)
fp.seek(-bomlen, os.SEEK_CUR)
fp.truncate()
# Connect to MySQL database
cnx = mysql.connector.connect(
user=config.database.user,
password=config.database.password,
host='localhost',
database='HCRIS'
)
cursor = cnx.cursor()
# Load variable names and locations in HCRIS files
print("Loading features to extract")
cursor.execute("""
LOAD DATA LOCAL INFILE './feature.csv'
INTO TABLE feature
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\\r\\n'
IGNORE 1 LINES
""")
cnx.commit()
# Load variable names and locations in HCRIS files
print("Loading feature locations")
cursor.execute("""
LOAD DATA LOCAL INFILE './feature_location.csv'
INTO TABLE feature_location
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\\r\\n'
IGNORE 1 LINES
""")
cnx.commit()
# Generic load Providers SQL Statement, with placeholder for data file
providers_load_sql = """
LOAD DATA LOCAL INFILE %s
IGNORE INTO TABLE provider # IGNORE here skips duplicates
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\\r\\n'
IGNORE 1 LINES
(provider_id, @FYB, @FYE, @`STATUS`, @CTRL_TYPE, hospital_name, street_address, po_box, city, @state, @zip_code, county, @Rural)
SET state = CASE
WHEN @state = "CONNECTICUT" THEN "CT"
WHEN @state = "MICHIGAN" THEN "MI"
WHEN @state = "NEW YORK" THEN "NY"
WHEN @state = "TEXAS" THEN "TX"
ELSE @state
END
, zip_code = TRIM(TRAILING '-' FROM @zip_code)
"""
# Generic load Reports SQL Statement, with placeholders for data file, year, and form
report_load_sql = """
LOAD DATA LOCAL INFILE %s
INTO TABLE report
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\\r\\n'
(report_id, @control_type_id, provider_id, @npi, report_status, @fiscal_year_start, @fiscal_year_end, @process_date, @INITL_RPT_SW, @LAST_RPT_SW, @TRNSMTL_NUM, @FI_NUM, @ADR_VNDR_CD, @FI_CREAT_DT, @medicare_utilization, @NPR_DT, @SPEC_IND, @FI_RCPT_DT)
SET form = %s
, report_year = %s
, control_type_id = IF(@control_type_id = '', NULL, @control_type_id)
, npi = NULLIF(@npi, '')
, fiscal_year_start = DATE_FORMAT(STR_TO_DATE(@fiscal_year_start, '%m/%d/%Y'), '%Y-%m-%d')
, fiscal_year_end = DATE_FORMAT(STR_TO_DATE(@fiscal_year_end, '%m/%d/%Y'), '%Y-%m-%d')
, process_date = DATE_FORMAT(STR_TO_DATE(@process_date, '%m/%d/%Y'), '%Y-%m-%d')
, medicare_utilization = CASE
WHEN @medicare_utilization = "L" THEN "Low"
WHEN @medicare_utilization = "N" THEN "None"
ELSE "Full"
END
"""
# Generic load Alpha SQL Statement into temporary table, with placeholders for data file and form
alpha_load_sql = """
LOAD DATA LOCAL INFILE %s
INTO TABLE alpha_temp
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\\r\\n'
(report_id, worksheet_code, line_number, column_number, item_text)
SET form = %s
"""
# Extract and insert only those desired alpha variables in the features table (join of alpha_temp and extracted)
alpha_insert_sql = """
INSERT INTO alpha
SELECT report_id, %s AS form, feature_id, MAX(item_text) AS item_text FROM alpha_temp AS a
JOIN feature_location AS l
ON a.worksheet_code = l.worksheet_code
AND a.line_number BETWEEN l.from_line AND l.to_line
AND a.column_number BETWEEN l.from_column AND l.to_column
JOIN feature USING(feature_id)
WHERE `extract` = 1 AND variable_type = "Alpha" AND l.form = %s
GROUP BY report_id, feature_id
"""
# Generic load Numeric SQL Statement into temporary table, with placeholders for data file and form
numeric_load_sql = """
LOAD DATA LOCAL INFILE %s
INTO TABLE numeric_temp
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\\r\\n'
(report_id, worksheet_code, line_number, column_number, item_value)
SET form = %s
"""
# Extract and insert only those desired numeric variables in the features table (join of alpha_temp and extracted)
numeric_insert_sql = """
INSERT INTO `numeric`
SELECT report_id, %s AS form, feature_id, SUM(item_value) AS item_value FROM numeric_temp AS n
JOIN feature_location AS l
ON n.worksheet_code = l.worksheet_code
AND n.line_number BETWEEN l.from_line AND l.to_line
AND n.column_number BETWEEN l.from_column AND l.to_column
JOIN feature USING(feature_id)
WHERE `extract` = 1 AND variable_type = "Numeric" AND l.form = %s
GROUP BY report_id, feature_id
"""
# Set form for years 1996-2011 and download provider data
form = '2552-96'
print(f"Downloading provider data for {form}")
download(form)
# Load provider data for 1996-2011
provider_file = f"{data_dir}/HOSPITAL_PROVIDER_ID_INFO.csv"
print(f"Loading provider data for {form}")
cursor.execute(providers_load_sql, (provider_file,))
cnx.commit()
# Loop for years 1996-2011 to download, load, and extract data
for year in range(1996, 2012):
print(f"Downloading report data for {year}")
download(form, year)
report_file = f"{data_dir}/hosp_{year}_RPT.CSV"
remove_bom(report_file)
alpha_file = f"{data_dir}/hosp_{year}_ALPHA.CSV"
numeric_file = f"{data_dir}/hosp_{year}_NMRC.CSV"
print(f"Loading reports for {year}")
cursor.execute(report_load_sql, (report_file, form, year))
print(f"Loading alpha for {year}")
cursor.execute(alpha_load_sql, (alpha_file, form))
cursor.execute(alpha_insert_sql, (form, form))
cursor.execute("TRUNCATE alpha_temp")
print(f"Loading numeric for {year}")
cursor.execute(numeric_load_sql, (numeric_file, form))
cursor.execute(numeric_insert_sql, (form, form))
cursor.execute("TRUNCATE numeric_temp")
cnx.commit()
# Set form for years 2010-present and download provider data
form = '2552-10'
print(f"Downloading provider data for {form}")
download(form)
# Load provider data for 2010-present
provider_file = f"{data_dir}/HOSPITAL_PROVIDER_ID_INFO.csv"
print(f"Loading provider data for {form}")
cursor.execute(providers_load_sql, (provider_file,))
cnx.commit()
# Loop for years 2010-present to download, load, and extract data
for year in range(2010, current_year):
print(f"Downloading report data for {year}")
download(form, year)
report_file = f"{data_dir}/hosp10_{year}_RPT.CSV"
alpha_file = f"{data_dir}/hosp10_{year}_ALPHA.CSV"
numeric_file = f"{data_dir}/hosp10_{year}_NMRC.CSV"
print(f"Loading reports for {year}")
cursor.execute(report_load_sql, (report_file, form, year))
print(f"Loading alpha for {year}")
cursor.execute(alpha_load_sql, (alpha_file, form))
cursor.execute(alpha_insert_sql, (form, form))
cursor.execute("TRUNCATE alpha_temp")
print(f"Loading numeric for {year}")
cursor.execute(numeric_load_sql, (numeric_file, form))
cursor.execute(numeric_insert_sql, (form, form))
cursor.execute("TRUNCATE numeric_temp")
cnx.commit()
cursor.close()
cnx.close()
if __name__ == '__main__':
main()