-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlqc_analysis.R
147 lines (115 loc) · 5.51 KB
/
lqc_analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
# The encoding of the script is UTF-8
# R version 3.1.1 (2014-07-10) -- "Sock it to Me"
# Platform: i386-w64-mingw32/i386 (32-bit)
# Operating system: Windows 7 (SP1)
# 20140821 qingcheng
# Couresra-[Reproducible Research][repdata-005][Course Project 2]
# This project involves exploring the U.S. National Oceanic and Atmospheric
# Administration's (NOAA) storm database. This database tracks characteristics
# of major storms and weather events in the United States, including when and
# where they occur, as well as estimates of any fatalities, injuries, and
# property damage.
# change the locale in English environment
Sys.setlocale('LC_ALL', 'English')
sessionInfo()
# Download and read the dataset
DataUrl<-
"https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
filename = "StormData.csv.bz2"
if (file.exists(filename)){
print("storm data file already exists locally")
}else{
print("storm data file not found locally, so dowload from website")
download.file(DataUrl, destfile=filename)
dateDownloaded <- date()
dateDownloaded
}
if(exists("stormData")){
print("stormData already loaded into R")
}else{
print("load stormData")
stormData <- read.csv(bzfile(filename))
}
# Size of loaded data
print(object.size(stormData), units = "MB")
str(stormData)
# selecting only columns relevant to the analysis
data <- stormData[, c("BGN_DATE", "STATE", "EVTYPE", "FATALITIES", "INJURIES",
"PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP")]
#column names to lower case
colnames(data) <- tolower(colnames(data))
data$date <- as.Date(data$bgn_date, format = "%m/%d/%Y")
mindate <- min(data$date)
maxdate <- max(data$date)
#all event types to upper case and refactored
data$evtype <- factor(toupper(data$evtype))
# all exponents defining unit of measurement of costs to upper case and refactored
data$propdmgexp <- factor(toupper(data$propdmgexp))
data$cropdmgexp <- factor(toupper(data$cropdmgexp))
levels(data$cropdmgexp)
levels(data$propdmgexp)
# define multipliers for valid exponents
validExpo <- data.frame(c("", "0", "H", "K", "M", "B"),
c(1, 1, 10^2, 10^3, 10^6, 10^9))
colnames(validExpo) <- c("validexp", "multiplier")
# subset data retaining only records with valid exponent
data <- subset(data, (cropdmgexp %in% validExpo$validexp) &
(propdmgexp %in% validExpo$validexp))
nrow(data)
# convert damage values in number
colnames(validExpo) <- c("validexp", "propdmgmultiplier")
data <- merge(data, validExpo, by.x = "propdmgexp", by.y = "validexp")
data$propdmg <- (data$propdmg * data$propdmgmultiplier)
colnames(validExpo) <- c("validexp", "cropdmgmultiplier")
data <- merge(data, validExpo, by.x = "cropdmgexp", by.y = "validexp")
data$cropdmg <- (data$cropdmg * data$cropdmgmultiplier)
# Population Health
library(plyr)
Fatalities <- aggregate(x = data$fatalities, by = list(data$evtype), FUN = sum)
names(Fatalities) <- c("evtype", "fatalities")
Fatalities <- arrange(Fatalities, fatalities, decreasing = TRUE)
Top10Fatalities <- head(Fatalities, n = 10)
Top10Fatalities <- within(Top10Fatalities,
evtype <- factor(x = evtype, levels = Fatalities$evtype))
Injuries <- aggregate(x = data$injuries, by = list(data$evtype), FUN = sum)
names(Injuries) <- c("evtype", "injuries")
Injuries <- arrange(Injuries, injuries, decreasing = TRUE)
Top10Injuries <- head(Injuries, n = 10)
Top10Injuries <- within(Top10Injuries,
evtype <- factor(x = evtype, levels = Top10Injuries$evtype))
# Economic Consequences
PropertyDmg <- aggregate(x = data$propdmg, by = list(data$evtype), FUN = sum)
names(PropertyDmg) <- c("evtype", "propdmg")
PropertyDmg <- arrange(PropertyDmg, propdmg, decreasing = TRUE)
Top10PropertyDmg <- head(PropertyDmg, n = 10)
Top10PropertyDmg <- within(Top10PropertyDmg, evtype <-
factor(x = evtype, levels = Top10PropertyDmg$evtype))
CropDmg <- aggregate(x = data$cropdmg, by = list(data$evtype), FUN = sum)
names(CropDmg) <- c("evtype", "cropdmg")
CropDmg <- arrange(CropDmg, cropdmg, decreasing = TRUE)
Top10CropDmg <- head(CropDmg, n = 10)
Top10CropDmg <- within(Top10CropDmg,
evtype <- factor(x = evtype, levels = Top10CropDmg$evtype))
# Results
library(ggplot2)
library(gridExtra)
plot1 <- qplot(evtype, data = Top10Fatalities, weight = fatalities, geom = "bar",
binwidth = 1) + scale_y_continuous("Number of Fatalities") +
theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
xlab("Event Type") + ggtitle("Total Fatalities by Event Type in the U.S.")
plot2 <- qplot(evtype, data = Top10Injuries, weight = injuries, geom = "bar",
binwidth = 1) + scale_y_continuous("Number of Injuries") +
theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
xlab("Event Type") + ggtitle("Total Injuries by Event Type in the U.S.")
grid.arrange(plot1, plot2, ncol = 2)
plot3 <- qplot(evtype, data = Top10PropertyDmg, weight = propdmg, geom = "bar",
binwidth = 1) +
theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
ylab("Property Damage in US dollars") +
xlab("Event Type") + ggtitle("Total Property Damage by Event Type in the U.S.")
plot4 <- qplot(evtype, data = Top10CropDmg, weight = cropdmg, geom = "bar",
binwidth = 1) +
theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
ylab("Crop Damage in US dollars") +
xlab("Event Type") + ggtitle("Total Crop Damage by Event Type in the U.S.")
grid.arrange(plot3, plot4, ncol = 2)