00-read-data.Rmd

---
title: "Read in and process data"
output:
  html_document:
    theme: lumen
    toc: true
    toc_float: false
    code_download: true
    highlight: tango
knit: (function(inputFile, encoding) {
  rmarkdown::render(inputFile, encoding = encoding, output_dir = "docs") })
---

```{r}
library(dplyr)
library(tidyr)
library(readr)
library(readxl)
library(httr)
library(purrr)
library(stringi)
library(tibble)
# library(gt)
# library(gtsummary)
source('utils.R')
```

## Download data

```{r message=FALSE}
github_link <- 'https://raw.githubusercontent.com/covidclinical/Phase1.1AggregateDataPerSite/master/'
demo_link <- paste0(github_link, 'Demographics_persite_fakeID.csv')
diag_link <- paste0(github_link, 'Diagnoses_persite_fakeID.csv')

# Download one CSV from the repository and parse it with read_csv().
#
# Each call uses its own temporary file (the previous code aliased a single
# path under two names and only worked because the file was unlinked between
# the two downloads). The HTTP status is checked so that an authentication or
# "not found" response raises an error instead of being parsed as data.
#
# url: full URL of the CSV file.
# Returns the parsed data frame.
download_csv <- function(url) {
  tmp_path <- tempfile(fileext = ".csv")
  # Remove the temporary file when the function exits, even on error.
  on.exit(unlink(tmp_path), add = TRUE)
  resp <- GET(
    url,
    # authenticate using GITHUB_PAT
    authenticate(Sys.getenv("GITHUB_PAT"), ""),
    # write result to disk
    write_disk(path = tmp_path)
  )
  stop_for_status(resp)
  read_csv(tmp_path)
}

demo_all <- download_csv(demo_link)
diag_all <- download_csv(diag_link)
```

Notes:

- `num_patients_ever_severe_icd1`: number of patients with the ICD code who were ever severe
- `num_patients_never_severe_icd1`: number of patients with the ICD code who were never severe
- `num_patients_ever_severe_icd0`: number of patients without the ICD code who were ever severe
- `num_patients_never_severe_icd0`: number of patients without the ICD code who were never severe

## Read in local files

```{r}
# Site-to-country lookup: drop the pediatric flag and title-case every country
# name except 'USA', which is kept verbatim.
site_country <- read_csv('data/SiteID_Map_Non_Pediatric_3digit_toshare.csv') %>%
  select(-Peds.only) %>%
  mutate(
    Country = if_else(Country == 'USA', Country, stringr::str_to_title(Country))
  )
```

<!-- Two rows have country SINGAPORE_SPAIN because each of those countries has only one site. -->
<!-- If they were not combined, each country's data could be traced back to a specific site. -->

```{r}
# Neurological ICD-10 reference table; the code column is renamed to `icd`.
neuro_icds_10 <- read_excel('data/2020-09-10_neuro-icd10.xlsx') %>%
  rename(icd = `ICD-10`)

# ICD-10 -> ICD-9 crosswalk. The disease category is looked up through the
# ICD-10 code, then the three-digit ICD-9 code becomes the `icd` key.
neuro_icds_9 <- read_excel('data/neuroicd10to9withdescript.xlsx') %>%
  left_join(
    select(neuro_icds_10, `Neurological Disease Category`, icd),
    by = c(ICD10_CODE = 'icd')
  ) %>%
  # select(- c(ICD9_CODE, ICD9_DESCRIPTION, ICD10_CODE)) %>%
  rename(icd = ICD9_THREE_DIGIT) %>%
  # ignore ICD10 code G46 and G65 not mapped to any ICD_9 code
  filter(!is.na(icd))

# Display ICD-9 codes that are linked to more than one ICD-10 code.
filter(
  count(distinct(neuro_icds_9, icd, ICD10_CODE), icd),
  n > 1
)
```


## Manual grouping of four ICD-9 codes

For the two codes with non-specific descriptions (780 and 781), we can include (in the Methods, figure legend, or supplementary material) the descriptions of all subcodes for each.
See <http://dbmi-ncats-test01.dbmi.pitt.edu/webclient/> (no login required) and the link listed with each code below.
I added the disease groups in parentheses for these four codes.
The only group that is not consistent with the grouping based on ICD-10 codes is vision/smell/taste.
(For ICD-10 codes, we separate vision and neuropathy, the latter of which includes smell and taste.)

- V41: Problems with special senses and other special functions (vision/smell/taste)
- 437: Other and ill-defined cerebrovascular disease (vascular)
- 780: General symptoms (Other), <https://icd.codes/icd9cm/780>
- 781: Symptoms involving nervous and musculoskeletal systems (Other), <https://icd.codes/icd9cm/781>


```{r}
# ICD-9 code descriptions taken from the rolled-up phecode map.
phecode_icd9 <- select(
  read_csv('data/phecode_icd9_rolled.csv'),
  icd = ICD9, icd9_desc = `ICD9 String`
)

neuro_icds_9 <- left_join(neuro_icds_9, phecode_icd9, by = 'icd')

# Show the four codes that are overridden by hand below.
filter(neuro_icds_9, icd %in% c('V41', '780', '781', '437'))

# Manual overrides: set the description for V41/781/437/780 and the disease
# category for V41/781/780 (437 keeps the category it already has), then keep
# one row per (category, code, description).
neuro_icds_9 <- neuro_icds_9 %>%
  mutate(
    icd9_desc = case_when(
      icd == 'V41' ~ 'Problems with special senses and other special functions',
      icd == '781' ~ 'Symptoms involving nervous and musculoskeletal systems',
      icd == '437' ~ 'Other and ill-defined cerebrovascular disease',
      icd == '780' ~ 'General symptoms',
      TRUE ~ icd9_desc
    ),
    `Neurological Disease Category` = case_when(
      icd == 'V41' ~ 'Vision/smell/taste',
      icd == '781' ~ 'Neuropathy', # and Consciousness
      icd == '780' ~ 'Other',
      TRUE ~ `Neurological Disease Category`
    )
  ) %>%
  select(`Neurological Disease Category`, icd, icd9_desc) %>%
  distinct() %>%
  arrange(icd)

write_csv(neuro_icds_9, 'results/icd9_tab.csv')
```

Notes: 

"-99" indicates masked small numbers (for obfuscation) and "-999" indicates missing, mismatched, or not-applicable values.

# Descriptive statistics

`Diagnoses_persite_fakeID.csv` has `r length(unique(diag_all$siteid))` unique siteid's.
`Demographics_persite_fakeID.csv` has `r length(unique(demo_all$siteid))` unique siteid's.


# Mold data frames

```{r}
# Totals for SITE734 are computed manually by summing its race == 'all',
# age_group == 'all' rows across sex.
site734_totals <- demo_all %>%
  filter(siteid == 'SITE734', race == 'all', age_group == 'all') %>%
  group_by(siteid) %>%
  summarise(
    num_patients_all = sum(num_patients_all),
    num_patients_ever_severe = sum(num_patients_ever_severe),
    .groups = 'drop'
  )

# Per-site totals: the all/all/all stratum of every site, plus the manual
# SITE734 row, with the never-severe count derived as all minus ever-severe.
demo_ana <- demo_all %>%
  filter(race == 'all', age_group == 'all', sex == 'all') %>%
  # left_join(site_to_country, by = 'siteid') %>%
  select(-c(sex, age_group, race)) %>%
  bind_rows(site734_totals) %>%
  mutate(num_patients_never_severe = num_patients_all - num_patients_ever_severe)


# Split count-column names into their trailing two-token suffix and the
# remaining prefix, e.g. "num_patients_all_before_admission" becomes
# c("before_admission", "num_patients_all").
#
# A single end-anchored regular expression replaces the earlier
# reverse / substitute / reverse approach, so the helper needs base R only.
#
# x: character vector of underscore-separated names (assumed to contain no
#    spaces, since a space is used as the temporary separator).
# Returns a list with one character vector per element of `x`: suffix first,
# prefix second. A name with fewer than three tokens is returned unchanged as
# a length-one vector.
split_patient_type <- function(x) {
  # Greedy `(.*)` leaves exactly the last two tokens for the second group.
  swapped <- sub('^(.*)_([^_]+_[^_]+)$', '\\2 \\1', x)
  strsplit(swapped, ' ')
}

# Reshape the per-site diagnosis counts into long format: one row per
# site / ICD code / time window / severity label, with country attached.
diag_ana <- diag_all %>% 
  # Derive never-severe counts as (all - ever severe) for each time window.
  mutate(
    num_patients_never_severe_before_admission = num_patients_all_before_admission - num_patients_ever_severe_before_admission,
    num_patients_never_severe_since_admission = num_patients_all_since_admission - num_patients_ever_severe_since_admission,
  ) %>% 
  # Stack every column whose name starts with 'num' into (patient_type, count).
  pivot_longer(cols = starts_with('num'), names_to = 'patient_type', values_to = 'count') %>% 
  # Split each patient_type name into two pieces and bind them on as the
  # columns `time` and `severe` (in that order, as returned by
  # split_patient_type()).
  bind_cols(
    split_patient_type(.[['patient_type']]) %>% 
      do.call(rbind, .) %>% 
      as.data.frame() %>% 
      `colnames<-`(c('time', 'severe'))
  ) %>% 
  # Relabel the severity prefixes and the time-window suffixes.
  mutate(severe = recode(severe, 
                         num_patients_all = 'num_patients_icd',
                         num_patients_ever_severe= 'num_patients_ever_severe_icd1',
                         num_patients_never_severe= 'num_patients_never_severe_icd1'),
         time = recode(time,
                       before_admission = 'Before admission',
                       since_admission = 'After admission')) %>% 
  rename('icd' = icd_code_3chars) %>% 
  select(- patient_type) %>% 
  # Drop every row that has an NA in any column.
  drop_na() %>% 
  # Attach the site_country columns, matching siteid to SiteID.new.
  left_join(site_country, by = c('siteid' = 'SiteID.new')) %>% 
  # `{.}` returns its input unchanged; it only terminates the pipe chain.
  {.}

# Manually add back ICD-9 code 728 for SITE974 (Italy) with a count of 0 for
# both time windows and for the 'num_patients_icd' and
# 'num_patients_ever_severe_icd1' labels.
#
# The rows are built with explicit column names rather than by positional
# cbind() followed by `colnames<-`, so the values no longer depend on the
# column order of diag_ana; bind_rows() matches columns by name.
# NOTE(review): no 'num_patients_never_severe_icd1' row is added, so that
# value will be NA for this site/code after pivoting -- confirm this is
# intended.
add_back_974 <- crossing(
  time = c('Before admission', 'After admission'),
  severe = c('num_patients_icd', 'num_patients_ever_severe_icd1')
) %>%
  mutate(
    siteid = 'SITE974',
    icd = '728',
    icd_version = 9,
    count = 0,
    Country = 'Italy'
  ) %>%
  select(siteid, icd, icd_version, count, time, severe, Country)

diag_ana <- bind_rows(diag_ana, add_back_974) # 0 as count

# Build the wide per-site table for one ICD version.
#
# The ICD-10 and ICD-9 tables were produced by two copies of the same
# pipeline that differed only in the version filter, the reference table and
# the description column; this helper holds the shared steps.
#
# diag:     long diagnosis table with columns siteid, icd, icd_version,
#           severe and count.
# demo:     per-site totals providing num_patients_never_severe and
#           num_patients_ever_severe, joined on siteid.
# icd_ref:  reference table of neurological codes with an `icd` column.
# icd_ver:  ICD version to keep (compared against `icd_version`).
# desc_col: name (string) of the description column in `icd_ref`, used to
#           build `full_icd`.
# Returns one row per site / code / time window with counts in wide form,
# the *_icd0 complements, and rows lacking num_patients_icd removed.
build_diag_icd <- function(diag, demo, icd_ref, icd_ver, desc_col) {
  diag %>%
    # `!!` forces the function argument, not a same-named data column.
    filter(icd_version == !!icd_ver) %>%
    select(-icd_version) %>%
    pivot_wider(names_from = severe, values_from = count) %>%
    right_join(icd_ref, by = 'icd') %>%
    distinct() %>%
    left_join(demo, by = 'siteid') %>%
    mutate(
      full_icd = paste0(.data[[desc_col]], ' (', icd, ')'),
      # Patients without the code = site total minus patients with the code.
      num_patients_never_severe_icd0 = num_patients_never_severe - num_patients_never_severe_icd1,
      num_patients_ever_severe_icd0 = num_patients_ever_severe - num_patients_ever_severe_icd1
    ) %>%
    # Reference codes that matched no site row are dropped here.
    drop_na(num_patients_icd)
}

diag_icd_10 <- build_diag_icd(diag_ana, demo_ana, neuro_icds_10,
                              icd_ver = 10, desc_col = 'ICD-10 Description')

diag_icd_9 <- build_diag_icd(diag_ana, demo_ana, neuro_icds_9,
                             icd_ver = 9, desc_col = 'icd9_desc')
```

`r length(unique(diag_icd_10$siteid))` sites report ICD-10 codes.
`r length(unique(diag_icd_9$siteid))` sites report ICD-9 codes.

```{r}
# Country-level patient totals among ICD-10 sites: each site's
# num_patients_all is counted once, then summed within country.
country_sum_icd10 <- diag_icd_10 %>%
  distinct(siteid, Country, num_patients_all) %>%
  group_by(Country) %>%
  summarise(all_pats_country = sum(num_patients_all), .groups = 'drop')

# Same totals among ICD-9 sites.
country_sum_icd9 <- diag_icd_9 %>%
  distinct(siteid, Country, num_patients_all) %>%
  group_by(Country) %>%
  summarise(all_pats_country = sum(num_patients_all), .groups = 'drop')

# Attach the country totals to every diagnosis row.
diag_icd_10 <- merge(diag_icd_10, country_sum_icd10, by = 'Country')
diag_icd_9 <- merge(diag_icd_9, country_sum_icd9, by = 'Country')
```

# Write data out to .Rdata

```{r}
# Persist the three processed tables in a single .Rdata file.
save(
  list = c('demo_ana', 'diag_icd_10', 'diag_icd_9'),
  file = 'data/processed-data.Rdata'
)
```

# Quality checks

## Check if 'all' equals the sum of the rest
```{r}
# For each demographic dimension, sum the per-level counts within a site and
# compare with the site's 'all' total from demo_ana. `mismatches` is
# (site total - sum over levels); sites with no per-level rows get NA.
# The joins name their key explicitly: siteid is the only column the two
# sides share, and stating it avoids relying on (and printing a message
# about) an implicit natural join.
age_group_check <- demo_all %>%
  filter(sex == 'all', race == 'all', age_group != 'all') %>%
  group_by(siteid) %>%
  summarise(age_group_sum = sum(num_patients_all), .groups = 'drop') %>%
  right_join(demo_ana, by = 'siteid') %>%
  mutate(mismatches = num_patients_all - age_group_sum) %>%
  # filter(abs(mismatches) != 0) %>%
  select(-contains('severe')) %>%
  arrange(desc(abs(mismatches)))

race_check <- demo_all %>%
  filter(sex == 'all', race != 'all', age_group == 'all') %>%
  group_by(siteid) %>%
  summarise(race_sum = sum(num_patients_all), .groups = 'drop') %>%
  right_join(demo_ana, by = 'siteid') %>%
  mutate(mismatches = num_patients_all - race_sum) %>%
  # filter(abs(mismatches) != 0) %>%
  select(-contains('severe')) %>%
  arrange(desc(abs(mismatches)))

sex_check <- demo_all %>%
  filter(race == 'all', sex != 'all', age_group == 'all') %>%
  group_by(siteid) %>%
  summarise(sex_sum = sum(num_patients_all), .groups = 'drop') %>%
  right_join(demo_ana, by = 'siteid') %>%
  mutate(mismatches = num_patients_all - sex_sum) %>%
  # filter(mismatches != 0) %>%
  select(-contains('severe')) %>%
  arrange(desc(abs(mismatches)))

# Proportion of ever-severe patients per site.
demo_check <- demo_ana %>%
  mutate(perc_severe = num_patients_ever_severe / num_patients_all)

write_csv(race_check, 'data/race_check.csv')
write_csv(sex_check, 'data/sex_check.csv')
write_csv(age_group_check, 'data/age_group_check.csv')
```

## Generate characteristics table

```{r}
# Symbol of the form `N = <total>`, where <total> is the sum of
# num_patients_all over all rows of demo_ana.
sum_sum <- sym(paste0('N = ', sum(demo_ana[['num_patients_all']])))
```

```{r}
# Sites with no marginal age rows (age_group_sum is NA in age_group_check):
# rebuild their age distribution from the fully stratified rows.
sites_no_age <- demo_all %>%
  filter(siteid %in% (age_group_check %>% filter(is.na(age_group_sum)) %>% pull(siteid)),
         age_group != 'all', sex != 'all', race != 'all') %>%
  group_by(siteid, age_group) %>%
  summarise(size = sum(num_patients_all), .groups = 'drop')

# One row per site with a column per age group, the row total, and the count
# not attributed to a specific age group (`unknown_other`).
# Uses TRUE rather than the reassignable shorthand T, and names the join key
# explicitly (siteid is the only column shared with age_group_check).
age_group_desc <- demo_all %>%
  filter(sex == 'all', race == 'all', age_group != 'all') %>%
  group_by(age_group, siteid) %>%
  summarise(size = sum(num_patients_all, na.rm = TRUE), .groups = 'drop') %>%
  bind_rows(sites_no_age) %>%
  pivot_wider(names_from = age_group, values_from = size, values_fill = list(size = 0)) %>%
  # Column 1 is siteid; the remaining columns are the age-group counts.
  mutate(sum_age_cols = rowSums(.[, -1])) %>%
  right_join(age_group_check, by = 'siteid') %>%
  replace(is.na(.), 0) %>%
  mutate(unknown_other = other + num_patients_all - sum_age_cols)

# Cumulative counts across the "<lo>to<hi>" age columns, one row per site,
# passed to median_age() (defined outside this chunk).
age_comp <- age_group_desc %>%
  column_to_rownames('siteid') %>%
  rename('80to100' = `80plus`) %>%
  select(contains('to')) %>%
  apply(., 1, cumsum) %>%
  t()
# Spot check: median age for the second site.
median_age(age_comp[2,])
age_med_df <- data.frame(median_age = apply(age_comp, 1, median_age)) %>%
  rownames_to_column('siteid')

age_group_desc <- age_group_desc %>%
  left_join(age_med_df, by = 'siteid')
```

```{r}
# Sites with no marginal race rows (race_sum is NA in race_check): rebuild
# their race distribution from the fully stratified rows.
sites_no_race <- demo_all %>%
  filter(siteid %in% (race_check %>% filter(is.na(race_sum)) %>% pull(siteid)),
         age_group != 'all', sex != 'all', race != 'all') %>%
  group_by(siteid, race) %>%
  summarise(size = sum(num_patients_all), .groups = 'drop')

# Display those sites together with their country.
sites_no_race %>%
  left_join(site_country, by = c('siteid' = 'SiteID.new'))

# One row per site with a column per race level, the row total, and the
# count not attributed to a specific race (`unknown_other_race`).
# Uses TRUE rather than the reassignable shorthand T, and names the join key
# explicitly (siteid is the only column shared with race_check).
race_desc <- demo_all %>%
  filter(sex == 'all', race != 'all', age_group == 'all') %>%
  group_by(race, siteid) %>%
  summarise(size = sum(num_patients_all, na.rm = TRUE), .groups = 'drop') %>%
  bind_rows(sites_no_race) %>%
  pivot_wider(names_from = race, values_from = size, values_fill = list(size = 0)) %>%
  # Column 1 is siteid; the remaining columns are the race counts.
  mutate(sum_race_cols = rowSums(.[, -1])) %>%
  right_join(race_check, by = 'siteid') %>%
  replace(is.na(.), 0) %>%
  mutate(unknown_other_race = other + num_patients_all - sum_race_cols)
```


```{r}
# One row per site with a column per sex level, the row total, and the count
# not attributed to a specific sex (`unknown_other`).
# Uses TRUE rather than the reassignable shorthand T, and names the join key
# explicitly (siteid is the only column shared with sex_check).
sex_desc <- demo_all %>%
  filter(sex != 'all', race == 'all', age_group == 'all') %>%
  group_by(sex, siteid) %>%
  summarise(size = sum(num_patients_all, na.rm = TRUE), .groups = 'drop') %>%
  pivot_wider(names_from = sex, values_from = size, values_fill = list(size = 0)) %>%
  # Column 1 is siteid; the remaining columns are the per-sex counts.
  mutate(sum_sex_cols = rowSums(.[, -1])) %>%
  right_join(sex_check, by = 'siteid') %>%
  replace(is.na(.), 0) %>%
  mutate(unknown_other = other + num_patients_all - sex_sum)

# race_desc <- demo_all %>%
#   filter(sex == 'all', race != 'all', age_group == 'all') %>% 
#   group_by(race, siteid) %>% 
#   summarise(size = sum(num_patients_all, na.rm = T), .groups = 'drop') %>% 
#   pivot_wider(names_from = race, values_from = size, values_fill = list(size = 0)) %>% 
#   mutate(sum_race_cols = rowSums(.[, -1])) %>% 
#   right_join(race_check) %>% 
#   replace(is.na(.), 0) %>% 
#   mutate(unknown_other_race = other + num_patients_all - race_sum) %>% 
#   {.}

# Combine the sex, age and race breakdowns into one per-site table.
# Columns shared by the sex and age tables get the suffixes "_sex" and
# "_age_group".
demo_desc <- left_join(sex_desc, age_group_desc, by = 'siteid', suffix = c("_sex", "_age_group")) %>%
  left_join(race_desc, by = 'siteid') %>%
  # Reduce(function(...) left_join(..., by = 'siteid', , '_race')),
  #                   list(, race_desc), ) %>%
  # Drop helper columns: every column whose name contains 'um' (the num_* and
  # *sum* columns) or 'mismatches'; the site totals come back from demo_ana.
  select(- c(contains('um'), contains('mismatches'))) %>%
  right_join(demo_ana, ., by = 'siteid') %>%
  left_join(site_country, by = c('siteid' = 'SiteID.new')) %>%
  select(siteid, Country, everything())

# Express every count as a percentage of the site total, rounded to one
# decimal place. The identifier columns, the total and the median age are
# left untouched. across() replaces the superseded mutate_at()/vars() form.
demo_perc <- demo_desc %>%
  mutate(across(
    -c(num_patients_all, siteid, Country, median_age),
    ~ round(.x / .data$num_patients_all * 100, 1)
  ))

demo_desc %>% write_csv('results/demo_prelim.csv')
demo_perc %>% write_csv('results/demo_perc_prelim.csv')
# Some NAs occur because a site simply has no row with that value.
```


```{r}
# Per-site counts for every sex x age-group combination, built from the
# fully stratified rows, with site totals and country attached.
# The join with age_group_check names its key explicitly (siteid is the only
# column the two sides share).
sex_age <- demo_all %>%
  filter(age_group != 'all', sex != 'all', race != 'all') %>%
  group_by(siteid, age_group, sex) %>%
  summarise(size = sum(num_patients_all), .groups = 'drop') %>%
  pivot_wider(names_from = c(sex, age_group), values_from = size, values_fill = list(size = 0)) %>%
  # Column 1 is siteid; the remaining columns are the sex_age counts.
  mutate(sum_age_cols = rowSums(.[, -1])) %>%
  right_join(age_group_check, by = 'siteid') %>%
  replace(is.na(.), 0) %>%
  mutate(unknown_other = female_other + male_other + other_other + num_patients_all - sum_age_cols) %>%
  # Drop helper columns whose names contain 'um' or 'mismatches'; the site
  # totals come back from demo_ana in the next step.
  select(- c(contains('um'), contains('mismatches'))) %>%
  right_join(demo_ana, ., by = 'siteid') %>%
  left_join(site_country, by = c('siteid' = 'SiteID.new')) %>%
  select(siteid, Country, everything())
```


```{r}
# sex_desc <- demo_all %>% 
#   filter(sex != 'all', race == 'all', age_group == 'all') %>% 
#   group_by(sex) %>% 
#   summarise(!!sum_sum := sum(num_patients_all), .groups = 'drop') %>% 
#   mutate(category = 'Sex') %>% 
#   rename('Characteristics' = sex)
# 
# race_desc <- demo_all %>% 
#   filter(sex == 'all', race != 'all', age_group == 'all') %>% 
#   group_by(race) %>% 
#   summarise(!!sum_sum := sum(num_patients_all), .groups = 'drop') %>% 
#   mutate(category = 'Race/Ethnicity') %>% 
#   rename('Characteristics' = race)
# 
# bind_rows(age_group_desc, sex_desc, race_desc) %>% 
#   group_by(category) %>% 
#   gt() %>% 
#   tab_footnote(
#     footnote = "Obfuscation at each site may affect the total counts.",
#     locations = cells_column_labels(
#       columns = 2)
#   ) %>% 
#   {.}
```


Check NAs:
```{r}
# Rows of diag_ana (siteid, icd and any column whose name contains 'num')
# that have at least one missing value.
na_diag <- diag_ana %>%
  select(siteid, icd, contains('num')) %>%
  filter(!complete.cases(.))
```

<!-- # More quality check: -->

```{r include=FALSE, eval=FALSE}
# check_all <- demo_all %>% 
#   filter(race == 'all') 

# check whether the rows with race = all are actual sum of the rest
# check_race <- demo_all %>%
#   filter(race != 'all') %>%
#   group_by(siteid, sex, age_group) %>%
#   summarise(num_patients_sum = sum(num_patients_all)) %>%
#   left_join(check_all)


# check_na <- check_race %>% 
#   filter(is.na(num_patients_all)) %>% 
#   pull(siteid) %>% 
#   unique()

# length(unique(diag_all$siteid))
# Sanity check (this chunk is marked eval=FALSE, so it does not run when the
# document is knit): for each site, take the largest per-code patient count
# and list the sites where it exceeds the site's total patient count.
# NOTE(review): this refers to columns `icd`, `num_patients_ever_severe_icd1`
# and `num_patients_never_severe_icd1` on diag_all, whereas the evaluated
# chunks read `icd_code_3chars` and `num_patients_*_{before,since}_admission`
# from diag_all -- this looks written for a different schema; confirm before
# enabling it.
check_diag <- diag_all %>% 
  filter(!is.na(num_patients_ever_severe_icd1)) %>% 
  # Patients with the code = ever severe + never severe; strip the 'icd:' prefix.
  mutate(num_patients_icd = num_patients_ever_severe_icd1 + num_patients_never_severe_icd1,
         icd = gsub('icd:', '', icd)) %>% 
  group_by(siteid) %>%
  summarise(max_icd_pats_all = max(num_patients_icd, na.rm = T), .groups = 'drop') %>%
  # No `by` is given, so the join uses whatever columns the two tables share.
  left_join(demo_ana)

# Sites where a single code has more patients than the site total.
check_diag %>% 
  filter(max_icd_pats_all > num_patients_all)
```


```{r include=FALSE, eval=FALSE}
# country_pats <- demo_ana %>%
#   group_by(Country) %>%
#   summarise(total_patients_per_country = sum(num_patients_all), .groups = 'drop')

# dot_dat <- diag_ana %>%
#   group_by(siteid, time, icd) %>%
#   summarise(pats_time_icd_ctry = sum(num_patients_icd), .groups = 'drop') %>%
#   left_join(site_pats, by = 'siteid') %>%
#   mutate(percent_pats = pats_time_icd_ctry/num_patients_all)
# 
# dot_dat %>%
#   ggplot(aes(y = forcats::fct_reorder(icd, percent_pats), x = percent_pats, group = icd)) +
#     geom_point(aes(color = time)) +
#     geom_line() +
#     facet_grid(rows = vars(siteid)) +
#   # scale_y_continuous(breaks = seq(0, 1, 0.1), labels = scales::percent_format(accuracy = 1)) +
#   scale_color_carto_d(palette = 4) +
#   labs(x = NULL, y = 'Percent patients')
    
```


```{r}
# va_vec <- c("VA1", "VA10", "VA12", "VA15", "VA16", "VA17", "VA19", "VA2", "VA20", "VA21", "VA22", "VA23", "VA4", "VA5", "VA6", "VA7", "VA8", "VA9")
# sites <- c("APHP", "ASSTPAVIA", "BIDMC", "C2WF", "FRBDX", "H12O", "HPG23", "ICSM1", "KUMC", "MCWCTSI", "MGB", "MUSC", "NUH", "NWU", "POLIMI", "RIVHS", "SLHN", "UCLA", "UKER", "UKFR", "UKY", "UMICH", "UMM", "UNICZ", "UPenn", "UPITT", "UTSW", "VA")
# 
# obfuscation_df <- googlesheets4::read_sheet(
#   # 'https://docs.google.com/spreadsheets/d/1VhtKIbzEOeGFG1Iw27D_gq_l3eJpKn0LiJr_f_hFdwc/edit?ts=5ec17a70#gid=0',
#   'https://docs.google.com/spreadsheets/d/1Xl9juDBXt86P3xQtsoTaBl2zPl1BIiAG9DI3Rotyqp8/edit#gid=212461777',
#   skip = 1,
#   col_names = c('SLHN', 'site_name', 'contact_name', 'contact_email', 'city', 'country',
#                 'low_count_threshold', 'small_count_rows_deleted', 'blur_range', 'notes',
#                 'irb_statement')) %>%
#   filter(SLHN != 'FICHOS') %>%
#   mutate(low_count_threshold = low_count_threshold %>%
#            replace(low_count_threshold == '<=10', '<11') %>%
#            replace(low_count_threshold == '<=5', '<6'))
# 
# library(ggplot2)

# obfuscation_df %>%
#   ggplot(aes(x = low_count_threshold)) +
#   geom_bar()
```

```{r}
# obfus_1 <- obfuscation_df %>%
#   filter(SLHN %in% sites)
# table(obfus_1$low_count_threshold)
# sum(table(obfus_1$low_count_threshold))
```