-
Notifications
You must be signed in to change notification settings - Fork 256
/
Copy pathde_dupe_linelist.R
57 lines (44 loc) · 2.06 KB
/
de_dupe_linelist.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
#Hashing for de-dupe
#Samuel V. Scarpino ([email protected])
#Jan. 30th 2020
#see http://amunategui.github.io/feature-hashing/
#You need to create a folder called "secrets" containing two files: google_sheet_name.txt (the path to the Google Sheet) and service_google_api_key.json (your service API key)
###########
#libraries#
###########
library(FeatureHashing)
library(glmnet)
library(googlesheets4)
library(googledrive)
###############
#Global Params#
###############
# Core line-list columns (no coordinates).
cols_to_use <- c(
  "ID", "age", "sex", "city", "province", "country",
  "date_onset_symptoms", "date_admission_hospital", "date_confirmation",
  "symptoms", "lives_in_Wuhan", "travel_history_dates",
  "travel_history_location", "reported_market_exposure",
  "sequence_available", "outcome", "source"
)
# Columns pulled from each sheet before de-duplication: the core columns plus
# latitude/longitude, inserted immediately after "country".
cols_to_match <- append(
  cols_to_use,
  c("latitude", "longitude"),
  after = match("country", cols_to_use)
)
# Identifier of the target Google Sheet, read from the local secrets folder.
google_sheet_name <- readLines(con = "secrets/google_sheet_name.txt")
# Authenticate googlesheets4 with the service API key stored alongside it.
sheets_auth(
  use_oob = TRUE,
  path = "secrets/service_google_api_key.json"
)
###############
#Acc functions#
###############
# Load the helper definitions used by this script. main(), called at the end
# of the script, is not defined in this file and is presumably provided
# here -- TODO confirm.
source("de_dupe_functions.R")
######
#Data#
######
# Fetch the spreadsheet metadata once and reuse it for both tabs, so the same
# request is not issued twice.
line_list_ss <- sheets_get(ss = google_sheet_name)

# "Hubei" tab.
wuhan_data <- read_sheet(ss = line_list_ss, sheet = "Hubei")

# If the Hubei tab names the residency column "Wuhan_resident", rename it to
# "lives_in_Wuhan" (the name used in the outside_Hubei sheet) so both tabs can
# be subset with cols_to_match.
find_Wuhan_resident <- which(colnames(wuhan_data) == "Wuhan_resident")
if (length(find_Wuhan_resident) == 1) {
  colnames(wuhan_data)[find_Wuhan_resident] <- "lives_in_Wuhan"
}

# Tag each ID with its tab of origin.
wuhan_data$ID <- paste0(wuhan_data$ID, "-Wuhan")

# "outside_Hubei" tab.
outside_wuhan_data <- read_sheet(ss = line_list_ss, sheet = "outside_Hubei")
outside_wuhan_data$ID <- paste0(outside_wuhan_data$ID, "-Outside-Wuhan")

# Stack both tabs, restricted to the matching columns, then store age and
# lives_in_Wuhan as character.
full_data <- rbind(
  wuhan_data[, cols_to_match],
  outside_wuhan_data[, cols_to_match]
)
full_data$age <- as.character(full_data$age)
full_data$lives_in_Wuhan <- as.character(full_data$lives_in_Wuhan)
############
#Find dupes#
############
# Run the de-duplication entry point on the combined line list. main() is not
# defined in this file (presumably it comes from de_dupe_functions.R -- TODO
# confirm), so the shape of `dupes` cannot be stated from here.
dupes <- main(data = full_data)