-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathprepare_data.R
41 lines (32 loc) · 1.69 KB
/
prepare_data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
library(tidyverse)
library(readxl)
library(viridis)
library(CoordinateCleaner)
# Build data/metadata.tsv from the raw isolate spreadsheet.
# NOTE(review): column names are assigned by position, so this assumes the
# sheet has exactly these eight columns in this order -- confirm when the
# spreadsheet changes.
metadata <- readxl::read_xlsx(file.path("data", "SCYLV metadata.xlsx"))
names(metadata) <- c(
  "isolate", "virus", "country", "host",
  "collection_date", "date", "authors", "accession"
)

# The accession is emitted twice: once as the strain identifier and once
# under its own name. Copy it explicitly rather than selecting one column
# under two names.
metadata <- metadata %>%
  mutate(strain = accession) %>%
  select(strain, virus, accession, date, country, authors)
# Example data set cols: strain virus accession date region country division city db segment authors url title journal paper_url

# Split each date into year / month / day. `drop = FALSE` keeps the matrix
# shape when there is only one record; without it the single row collapses
# to a vector and the parts end up in rows instead of columns.
date_parts <- as.data.frame(
  str_match(
    metadata$date,
    pattern = "([0-9]{4})-?([0-9]{0,2})-?([0-9]{0,2})"
  )[, 2:4, drop = FALSE],
  stringsAsFactors = FALSE
)

# The year group is mandatory in the pattern, so a missing year means the
# whole date was missing or did not match.
if (anyNA(date_parts[[1]])) {
  warning(
    sum(is.na(date_parts[[1]])),
    " record(s) have a missing or unparseable date; their date is left as NA",
    call. = FALSE
  )
}

# A missing month or day defaults to "01"; one-digit values are zero-padded
# so every date comes out as YYYY-MM-DD (padding leaves the 4-digit year
# untouched).
date_parts[] <- lapply(date_parts, function(x) {
  str_pad(ifelse(x == "", "01", x), width = 2, side = "left", pad = "0")
})

# Vectorised paste instead of apply() over a data frame. Unparsed dates stay
# NA rather than turning into the literal string "NA-NA-NA".
metadata$date <- ifelse(
  is.na(date_parts[[1]]),
  NA_character_,
  paste(date_parts[[1]], date_parts[[2]], date_parts[[3]], sep = "-")
)

# Makes it match the lat/long data source
metadata$country[metadata$country %in% "Reunion Island"] <- "Reunion"
# Any value starting with "USA-" is collapsed to the country name.
metadata$country[grepl("^USA-", metadata$country)] <- "United States"

write_tsv(metadata, file = file.path("data", "metadata.tsv"))
## Config files
### Dropped strains
### Color
# One viridis colour per distinct country, in order of first appearance.
# viridis() returns "#RRGGBBAA"; keeping the first seven characters drops the
# alpha channel so the file holds plain "#RRGGBB" values.
color_data <- tibble(
  scale = "country",
  country = unique(metadata$country)
) %>%
  mutate(color = substr(viridis(n()), start = 1, stop = 7))

# Written without a header row.
write_tsv(
  color_data,
  file = file.path("config", "colors.tsv"),
  col_names = FALSE
)
### lat_longs
# Country centroids from CoordinateCleaner's `countryref`, keeping the first
# entry for each country name.
country_data <- as_tibble(countryref[!duplicated(countryref$name), ])

# Join on the distinct countries, not on the per-isolate metadata, so that
# lat_longs.tsv gets exactly one row per country instead of one per isolate.
coord_data <- metadata %>%
  distinct(country) %>%
  left_join(country_data, by = c(country = "name")) %>%
  transmute(
    scale = "country",
    country = country,
    lat = centroid.lat,
    long = centroid.lon
  )

# Surface countries that did not match the reference table instead of
# silently writing NA coordinates; the rows are still written unchanged.
if (anyNA(coord_data$lat) || anyNA(coord_data$long)) {
  warning(
    "No centroid found for: ",
    paste(
      coord_data$country[is.na(coord_data$lat) | is.na(coord_data$long)],
      collapse = ", "
    ),
    call. = FALSE
  )
}

# Written without a header row.
write_tsv(
  coord_data,
  file = file.path("config", "lat_longs.tsv"),
  col_names = FALSE
)
###