-
Notifications
You must be signed in to change notification settings - Fork 20
/
Copy pathget_target_data.R
156 lines (140 loc) · 6.09 KB
/
get_target_data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
# System and Libraries ----
library(dplyr)
library(tidyr)
library(lubridate)
# Workflow ----
# Read files from the US SMH RSV GitHub Repository, please consult the
# repository for more information
# - RSV-NET data
# - Location information
# Location
df_loc <- read.csv("auxiliary-data/location_census/locations.csv")
location2fips <- setNames(df_loc$location, df_loc$location_name)
# Age Group
age2st_age <-
setNames(c("0-130", "0-130",
"0-4",
"5-17",
"18-49",
"50-64",
"65-130", "65-130",
"0-0.49", "0-0.49",
"0.5-0.99", "0.5-0.99",
"75-130", "75-130",
"65-74",
"1-1.99", "1-1.99",
"2-4", "2-4",
"18-130", "18-130", "18-130"),
c("Overall", "All",
"0-4 years",
"5-17 years",
"18-49 years",
"50-64 years",
"65+ years", "≥65 years",
"----0-<6 months", "0-<6 months",
"6mo-<12 months", "----6-<12 months",
"75+ years", "≥75 years",
"65-74 years",
"----1-<2 years", "1-<2 years",
"----2-4 years", "2-4 years",
"18+ (Adults)", "18+ years (Adults)", "≥18 years (Adults)"))
# Census - From US Census Bureau
# - Add columns for 2023 and 2024 with the same values as 2022
census_pop <- read.csv("auxiliary-data/location_census/state_pop_data.csv")
census_pop <- dplyr::mutate(
census_pop,
fips = gsub("00", "US", ifelse(nchar(STATE) < 2, paste0("0", STATE),
as.character(STATE)))) %>%
dplyr::filter(SEX == 0) %>%
dplyr::select(fips, age = AGE, contains("POPEST")) %>%
dplyr::mutate(POPEST2023_CIV = POPEST2022_CIV) %>%
dplyr::mutate(POPEST2024_CIV = POPEST2022_CIV) %>%
tidyr::pivot_longer(cols = contains("POPEST"), names_to = "year") %>%
dplyr::mutate(year = as.numeric(gsub("[[:alpha:]]|_", "", year)))
# Calculate census data by age group of interest
# For the age groups: "0-0.49" and "0.5-0.99", use the year 0 divided by 2.
# The other age groups include all the years included in the range:
# - "0-4" include all the years from 0 to 4: 0, 1, 2, 3, and 4.
# - "1-1.99" include the year 1
# - etc.
census_agegroup <- lapply(unique(age2st_age), function(age_grp) {
age_min = as.numeric(strsplit(age_grp, "-")[[1]][1])
age_max = as.numeric(strsplit(age_grp, "-")[[1]][2])
div = 1
if (age_min < 1 & age_max < 1) div = 2
if (age_min < 1) age_min <- 0
if (age_max < 1) age_max <- 0
df <- dplyr::filter(census_pop, age >= age_min, age <= age_max)
df_age_group <- dplyr::group_by(df, fips, year) %>%
dplyr::summarise(tot_pop = sum(value) / div, .groups = "keep") %>%
dplyr::ungroup() %>%
dplyr::mutate(age_group = age_grp)
return(df_age_group)
}) %>% bind_rows()
# RSV-Net
df <- arrow::read_parquet(
tail(sort(dir("auxiliary-data/rsv/rsv-net/", full.names = TRUE)), 1))
# Standardize:
# - Load relevant age groups
# - Filter overall race, sex
# - Remove seasonal summaries
# - Standardize date format in "YYYY-MM-DD" format
# - Add missing value in the time series
# - Recode age group information to US SMH format
# - Standardize column names (lower case, without space, dot)
rsv <- df %>%
dplyr::mutate(
date = as.Date(`Week ending date`,
tryFormats = c("%m/%d/%Y", "%Y-%m-%d"))) %>%
dplyr::filter(
Sex == "All" & Race == "All" &
`Age Category` %in%
c("Overall", "All", "0-4 years", "5-17 years", "18-49 years",
"50-64 years", "65+ years", "≥65 years", "----0-<6 months",
"0-<6 months", "6mo-<12 months", "----6-<12 months", "75+ years",
"≥75 years", "65-74 years", "----1-<2 years", "1-<2 years",
"----2-4 years", "2-4 years", "18+ (Adults)", "18+ years (Adults)",
"≥18 years (Adults)")
)
full_ts <- seq(min(rsv$date),max(rsv$date), by = "week")
full_df <- tidyr::expand(rsv, tidyr::nesting(State, `Age Category`),
full_ts) %>%
dplyr::rename(date = full_ts)
rsv_standard <- dplyr::full_join(rsv, full_df,
by = c("State", "Age Category", "date")) %>%
dplyr::mutate(
week = lubridate::epiweek(date),
year = lubridate::epiyear(date),
location = gsub("Entire Network \\(RSV-NET\\)|RSV-NET", "US", State),
fips = location2fips[location],
age_group = age2st_age[`Age Category`]) %>%
dplyr::rename(value_rate = Rate, value_cumul_rate = `Cumulative Rate`,
season = Season) %>%
dplyr::select(location, fips, age_group, date, week, year, season, value_rate,
value_cumul_rate)
# Calculate hospitalization number:
# - add population data per year, age_group, state
# - apply function: rate * population size / 100000
rsv_pop <- dplyr::left_join(rsv_standard, census_agegroup,
by = c("age_group", "fips", "year")) %>%
dplyr::mutate(inc_hosp = value_rate * tot_pop / 100000) %>%
dplyr::rename(rate_hosp = value_rate)
# Standardization to US SMH format
rsv_output <-
tidyr::pivot_longer(
rsv_pop, contains("_hosp"), names_to = "target", values_to = "value") %>%
dplyr::mutate(target = gsub("_", " ", target)) %>%
dplyr::select(location = fips, date, age_group, target, value,
population = tot_pop)
# Append previous season (removed on the new version of the data (2023-11-10))
# Use last version of the file containing the 2014-2015 and 2015-2016 seasons
# from the RSV SMH GitHub Repository (commit #f183e8a)
rsv_past_season <- read.csv("https://raw.githubusercontent.com/midas-network/rsv-scenario-modeling-hub/f183e8a1a8d2387f02c2e007527af48226370d03/target-data/rsvnet_hospitalization.csv")
rsv_past_season <- dplyr::filter(rsv_past_season, date < min(rsv_output$date))
# Archive past files
old_files <- dir("target-data/", full.names = TRUE, pattern = "rsvnet_hospitalization.csv")
file.rename(old_files, gsub("target-data/", "target-data/archive", old_files))
# Write output
rsv_output <- rbind(rsv_output, rsv_past_season)
write.csv(rsv_output, paste0("target-data/", Sys.Date(), "_rsvnet_hospitalization.csv"),
row.names = FALSE)