-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathData Assimilation.R
119 lines (93 loc) · 3.68 KB
/
Data Assimilation.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
## Included get_data.R and get_googletrends_data.R to have everything in once place
## Google Trends Data (get_googletrends_data.R)
setwd("/home/carya/TeamGOATS")
#install gtrendsR package that pulls Google trend data
install.packages("gtrendsR")
library(gtrendsR)
#Data Source: Google Zika Trends (https://www.google.com/trends/explore#q=zika&geo=CO&date=9%2F2015%207m&cmpt=q&tz=Etc%2FGMT%2B5)
gconnect("[email protected]","ge585zika")
#Pulling Google trend data for Colombia from September 2015 to present
## Change start date to within the last 3 months to get daily updates (using February 1st, 2016 below)
zika_trend.old=gtrends("zika",geo=c("CO"), start_date="2015-11-01",end_date= "2016-01-31")
zika_trend=gtrends("zika",geo=c("CO"), start_date= "2016-02-01")
zika = rbind(zika_trend.old$trend,zika_trend$trend)
plot(zika)
save.image("googleZika.RData")
#######################################################################################
## Buzzfeed Data (get_data.R)
#!/usr/bin/env Rscript
.libPaths(c("/home/carya/R/library",.libPaths()))
.libPaths()
setwd("/home/carya/TeamGOATS")
## since libraries will be pulled, make sure repository is set
repos = "http://cran.us.r-project.org"
get.pkg <- function(pkg){
loaded <- do.call("require",list(package=pkg))
if(!loaded){
print(paste("trying to install",pkg))
install.packages(pkg,dependencies=TRUE,repos=repos)
loaded <- do.call("require",list(package=pkg))
if(loaded){
print(paste(pkg,"installed and loaded"))
}
else {
stop(paste("could not install",pkg))
}
}
}
get.pkg("RCurl")
print(require("XML"))
get.pkg("ncdf4")
get.pkg("devtools")
get.pkg("MODISTools")
print("loaded packages")
load("zika.RData")
update.old = update
data.repo = "https://github.com/BuzzFeedNews/zika-data/tree/master/data/parsed/colombia"
# Scrape Github for new data
## Grab raw html
zika_URL <- getURL(data.repo)
## Read the Github reference table data into R
zika_table = readHTMLTable(zika_URL)[[1]]
muni_data = as.matrix(zika_table[3:nrow(zika_table),]) # Extract the municipal data
## Store the dates of when each file was updated in the object "update"
update = as.Date(substr(muni_data[,2],20,29)) # Extract dates from .csv file names
## Plot histogram of when new files were uploaded
#hist(update,"days",col="grey")
raw.path = "https://raw.githubusercontent.com/BuzzFeedNews/zika-data/master/data/parsed/colombia"
data = list()
total = matrix(data=NA,nrow=length(update))
for(i in 1:length(update)){
data[[i]] = read.csv(paste0(raw.path,"/",muni_data[i,2]))
total[i] = sum(data[[i]][,"zika_total"])
#myfile = file.path("C:/Users/Emily/Documents/TeamGOATS/",paste0("file",i,".csv"))
#datafile = read.csv(paste0(raw.path,"/",muni_data[i,2]))
#write.csv(datafile,file=myfile)
}
jpeg("web/Cases.jpg")
plot(update,total,xlab="Time",ylab="Total confirmed and suspected")
dev.off()
save.image("zika.RData")
#save
time = update
y = as.integer(total)
#######################################################################################
## Combine Buzzfeed and Google Trends data
# Create new column for buzzfeed data
x = NA
new.column = rep(x,times = 146)
# Create new matrix with both data sources
combined.data = cbind(zika,new.column)
colnames(combined.data) = c("Date","Searches","Cases")
google.time = as.Date(combined.data$Date)
combined.data[,1] = google.time
search.sum = cumsum(combined.data$Searches) # Make google data sum up to match format of buzzfeed data
combined.data[,2] = search.sum
# Add buzzfeed data to combined.data matrix
for(i in 1:146){
for(j in 1:7){
if(combined.data$Date[i] == time[j]) {
combined.data$Cases[i] = total[j] # Add buzzfeed data in when dates match, the rest are NA
}
}
}