.Rhistory

count.high.SES.line
# High-SES Subway model: radial-kernel SVM (caret method "svmRadial") trained
# on the above-average Subways and used to predict the below-average Subways.
subway_lm_high <- train(
  rating ~ .,
  data = above.avg.subway,
  method = "svmRadial",
  trControl = subway.above.ctrl,
  # caret's argument is spelled `tuneLength`; R argument matching is
  # case-sensitive, so `tunelength` would not set the tuning grid size.
  tuneLength = 10
)
#subway_lm_high_2 <- lm(formula = rating ~ reviewCount * price, data = above.avg.subway.train)
summary(subway_lm_high)
predict_subway_lm_high <- predict(subway_lm_high, below.avg.subway)
#postResample(predict_yelp_svm_high, below.avg$rating)
# Residuals: actual minus predicted.
delta.subway.high.SES <- below.avg.subway$rating - predict_subway_lm_high
# Share of below-average Subways whose actual rating is under the prediction.
count.subway.high.SES.line <-
  sum(delta.subway.high.SES < 0, na.rm = TRUE) / length(delta.subway.high.SES)
# Print the value computed above (not the unrelated count.high.SES.line).
count.subway.high.SES.line
# Filter only Subways for high and low SES.
above.avg.subway <- filter(above.avg, name %in% "Subway")
below.avg.subway <- filter(below.avg, name %in% "Subway")
check.unique.dummy.variable(above.avg.subway)
check.unique.dummy.variable(below.avg.subway)
View(above.avg.subway)
# Outcome (rating) plus the predictor columns kept for modeling. The same
# list is applied to both SES groups so their columns match.
subway.model.columns <- c(
  "rating", "reviewCount", "recentHealthInspectionScore",
  "totalInspectionScore", "totalInspectionCount", "avgInspectionScore",
  "restaurantTotalMonths", "restaurantMaxSeats",
  "recentHealthInspectionResultSatisfactory",
  "recentHealthInspectionResultUnsatisfactory",
  "recentHealthViolationType.1", "recentHealthViolationTypeblue",
  "recentHealthViolationTypered", "recentHealthInspectionGrade.1",
  "recentHealthInspectionGrade.2", "recentHealthInspectionGrade.3",
  "recentHealthInspectionGrade.4"
)
above.avg.subway <- subset(above.avg.subway, select = subway.model.columns)
# The below-average subset is stored in below.avg.subway. Assigning it to
# above.avg.subway would overwrite the high-SES rows with low-SES rows.
below.avg.subway <- subset(below.avg.subway, select = subway.model.columns)
# Repeated k-fold cross-validation with k = (number of rows - 1), repeated
# 3 times, sized separately for each SES group.
subway.above.ctrl <- trainControl(
  method = "repeatedcv",
  number = nrow(above.avg.subway) - 1,
  repeats = 3
)
subway.below.ctrl <- trainControl(
  method = "repeatedcv",
  number = nrow(below.avg.subway) - 1,
  repeats = 3
)
above.avg.subway
View(above.avg.subway)
View(below.avg.subway)
# Split each Subway data set into a train+validate part and a test part.
split_proportion <- 0.8

# ---- below-average household income ----
# Outcome column only (rating); it drives the partition below.
below.avg.subway.outcome <- dplyr::select(subway.below.avg.wo.census, rating)
# Randomly chosen row indices for the train/validate set.
below.avg.subway.train_ind <- createDataPartition(
  below.avg.subway.outcome$rating,
  p = split_proportion,
  list = FALSE
)
# Selected rows form the training data; the remaining rows form the test data.
below.avg.subway.train <- subway.below.avg.wo.census[below.avg.subway.train_ind, ]
below.avg.subway.test <- subway.below.avg.wo.census[-below.avg.subway.train_ind, ]

# ---- above-average household income ----
# Outcome column only (rating); it drives the partition below.
above.avg.subway.outcome <- dplyr::select(subway.above.avg.wo.census, rating)
# Randomly chosen row indices for the train/validate set.
above.avg.subway.train_ind <- createDataPartition(
  above.avg.subway.outcome$rating,
  p = split_proportion,
  list = FALSE
)
# Selected rows form the training data; the remaining rows form the test data.
above.avg.subway.train <- subway.above.avg.wo.census[above.avg.subway.train_ind, ]
above.avg.subway.test <- subway.above.avg.wo.census[-above.avg.subway.train_ind, ]
# Filter only Subways for high and low SES.
above.avg.subway <- filter(above.avg, name %in% "Subway")
below.avg.subway <- filter(below.avg, name %in% "Subway")
# Outcome (rating) plus the predictors kept for modeling. Relative to the
# full dummy-variable list, recentHealthViolationTypered and
# recentHealthInspectionGrade.2/.3/.4 are left out of BOTH data sets (the
# original note says those columns aren't unique). An earlier pass that kept
# those columns was overwritten immediately, so it is not repeated here.
subway.model.columns <- c(
  "rating", "reviewCount", "recentHealthInspectionScore",
  "totalInspectionScore", "totalInspectionCount", "avgInspectionScore",
  "restaurantTotalMonths", "restaurantMaxSeats",
  "recentHealthInspectionResultSatisfactory",
  "recentHealthInspectionResultUnsatisfactory",
  "recentHealthViolationType.1", "recentHealthViolationTypeblue",
  "recentHealthInspectionGrade.1"
)
above.avg.subway <- subset(above.avg.subway, select = subway.model.columns)
below.avg.subway <- subset(below.avg.subway, select = subway.model.columns)
# Repeated k-fold cross-validation with k = (number of rows - 1), repeated
# 3 times, sized separately for each SES group.
subway.above.ctrl <- trainControl(
  method = "repeatedcv",
  number = nrow(above.avg.subway) - 1,
  repeats = 3
)
subway.below.ctrl <- trainControl(
  method = "repeatedcv",
  number = nrow(below.avg.subway) - 1,
  repeats = 3
)
# High-SES Subway model: linear regression (caret method "lm") trained on the
# above-average Subways and used to predict the below-average Subways.
# The history ran this block twice; the first run printed an unrelated
# variable, so a single corrected run is kept.
subway_lm_high <- train(
  rating ~ .,
  data = above.avg.subway,
  method = "lm",
  trControl = subway.above.ctrl
)
#subway_lm_high_2 <- lm(formula = rating ~ reviewCount * price, data = above.avg.subway.train)
summary(subway_lm_high)
predict_subway_lm_high <- predict(subway_lm_high, below.avg.subway)
#postResample(predict_yelp_svm_high, below.avg$rating)
# Residuals: actual minus predicted.
delta.subway.high.SES <- below.avg.subway$rating - predict_subway_lm_high
# Share of below-average Subways whose actual rating is under the prediction.
count.subway.high.SES.line <-
  sum(delta.subway.high.SES < 0, na.rm = TRUE) / length(delta.subway.high.SES)
# Print the value computed above (not count.high.SES.line).
count.subway.high.SES.line
# Low-SES Subway model: caret "lm" fit on the below-average Subways, then
# scored against the above-average Subways.
subway_lm_low <- train(
  rating ~ .,
  data = below.avg.subway,
  method = "lm",
  trControl = subway.below.ctrl
)
predict_subway_lm_low <- predict(subway_lm_low, above.avg.subway)
#postResample(predict_yelp_svm_high, below.avg$rating)
# residual = observed rating - predicted rating
delta.subway.low.SES <- above.avg.subway$rating - predict_subway_lm_low
# fraction of residuals that are negative
count.subway.low.SES.line <-
  sum(delta.subway.low.SES < 0, na.rm = TRUE) / length(delta.subway.low.SES)
count.subway.low.SES.line
# High-SES Subway model: linear regression (caret method "lm") trained on the
# above-average Subways and used to predict the below-average Subways.
# The history ran this block twice; the first run failed on the "on line"
# count, so a single corrected run is kept.
subway_lm_high <- train(
  rating ~ .,
  data = above.avg.subway,
  method = "lm",
  trControl = subway.above.ctrl
)
#subway_lm_high_2 <- lm(formula = rating ~ reviewCount * price, data = above.avg.subway.train)
summary(subway_lm_high)
predict_subway_lm_high <- predict(subway_lm_high, below.avg.subway)
#postResample(predict_yelp_svm_high, below.avg$rating)
# Residuals: actual minus predicted.
delta.subway.high.SES <- below.avg.subway$rating - predict_subway_lm_high
n.subway.high.SES <- length(delta.subway.high.SES)
# Shares of residuals below, above, and exactly on the prediction.
count.subway.high.SES.below.line <-
  sum(delta.subway.high.SES < 0, na.rm = TRUE) / n.subway.high.SES
count.subway.high.SES.below.line
count.subway.high.SES.above.line <-
  sum(delta.subway.high.SES > 0, na.rm = TRUE) / n.subway.high.SES
count.subway.high.SES.above.line
# The comparison needs `==`. A single `=` inside which() is parsed as a
# named argument, not a test.
# NOTE(review): exact equality on floating-point residuals will rarely be
# TRUE; consider a tolerance if "on the line" is meant loosely.
count.subway.high.SES.on.line <-
  sum(delta.subway.high.SES == 0, na.rm = TRUE) / n.subway.high.SES
count.subway.high.SES.on.line
# Low-SES Subway model: caret "lm" fit on the below-average Subways, then
# scored against the above-average Subways.
subway_lm_low <- train(
  rating ~ .,
  data = below.avg.subway,
  method = "lm",
  trControl = subway.below.ctrl
)
predict_subway_lm_low <- predict(subway_lm_low, above.avg.subway)
#postResample(predict_yelp_svm_high, below.avg$rating)
# residual = observed rating - predicted rating
delta.subway.low.SES <- above.avg.subway$rating - predict_subway_lm_low
# fraction of residuals under the prediction
count.subway.low.SES.below.line <-
  sum(delta.subway.low.SES < 0, na.rm = TRUE) / length(delta.subway.low.SES)
count.subway.low.SES.below.line
# fraction of residuals over the prediction
count.subway.low.SES.above.line <-
  sum(delta.subway.low.SES > 0, na.rm = TRUE) / length(delta.subway.low.SES)
count.subway.low.SES.above.line
# fraction of residuals exactly equal to zero
count.subway.low.SES.on.line <-
  sum(delta.subway.low.SES == 0, na.rm = TRUE) / length(delta.subway.low.SES)
count.subway.low.SES.on.line
# NOTE(review): this passes the caret train object itself to ggplot();
# whether that produces a useful plot for an "lm" fit is not verifiable
# from here — confirm or drop.
ggplot(data=subway_lm_high) + geom_point()
# Below-average Subways with the high-SES model's predictions attached
# (the layers below read the prediction column as `pred`).
subway_highSES_grid <- below.avg.subway %>%
  gather_predictions(subway_lm_high)
# Actual ratings (points) and predicted ratings (line) against
# totalInspectionCount. Earlier attempts were dropped because they could not
# draw this: one mapped colour to `rm` (a base function, not a column), one
# gave geom_point() no y aesthetic, and one put rating on x while the
# prediction line was on y.
ggplot(below.avg.subway, aes(totalInspectionCount, rating)) +
  geom_point() +
  geom_line(data = subway_highSES_grid, aes(y = pred))
# Attach residuals (actual minus predicted) to the below-average Subways.
subway_high_with_residual <- below.avg.subway %>%
  mutate(
    residuals = rating - predict_subway_lm_high
  )
# Residuals against the actual rating, with a horizontal reference at zero.
# x is mapped to `rating`; the earlier `aes(x)` named no visible column.
ggplot(subway_high_with_residual, aes(rating)) +
  geom_point(aes(y = residuals)) +
  geom_abline(intercept = 0, slope = 0)
# ---- High-SES Subway model (caret "lm") scored on below-average Subways ----
subway_lm_high <- train(
  rating ~ .,
  data = above.avg.subway,
  method = "lm",
  trControl = subway.above.ctrl
)
#subway_lm_high_2 <- lm(formula = rating ~ reviewCount * price, data = above.avg.subway.train)
summary(subway_lm_high)
predict_subway_lm_high <- predict(subway_lm_high, below.avg.subway)
#postResample(predict_yelp_svm_high, below.avg$rating)
# residual = observed rating - predicted rating
delta.subway.high.SES <- below.avg.subway$rating - predict_subway_lm_high
# fractions of residuals under / over / exactly at zero
count.subway.high.SES.below.line <-
  sum(delta.subway.high.SES < 0, na.rm = TRUE) / length(delta.subway.high.SES)
count.subway.high.SES.below.line
count.subway.high.SES.above.line <-
  sum(delta.subway.high.SES > 0, na.rm = TRUE) / length(delta.subway.high.SES)
count.subway.high.SES.above.line
count.subway.high.SES.on.line <-
  sum(delta.subway.high.SES == 0, na.rm = TRUE) / length(delta.subway.high.SES)
count.subway.high.SES.on.line
# residual column added to the scored data, then plotted against rating
subway_high_with_residual <- mutate(
  below.avg.subway,
  residuals = rating - predict_subway_lm_high
)
ggplot(subway_high_with_residual, aes(rating)) +
  geom_point(aes(y = residuals)) +
  geom_abline(intercept = 0, slope = 0)
View(subway_high_with_residual)

# ---- Low-SES Subway model (caret "lm") scored on above-average Subways ----
subway_lm_low <- train(
  rating ~ .,
  data = below.avg.subway,
  method = "lm",
  trControl = subway.below.ctrl
)
predict_subway_lm_low <- predict(subway_lm_low, above.avg.subway)
#postResample(predict_yelp_svm_high, below.avg$rating)
# residual = observed rating - predicted rating
delta.subway.low.SES <- above.avg.subway$rating - predict_subway_lm_low
# fractions of residuals under / over / exactly at zero
count.subway.low.SES.below.line <-
  sum(delta.subway.low.SES < 0, na.rm = TRUE) / length(delta.subway.low.SES)
count.subway.low.SES.below.line
count.subway.low.SES.above.line <-
  sum(delta.subway.low.SES > 0, na.rm = TRUE) / length(delta.subway.low.SES)
count.subway.low.SES.above.line
count.subway.low.SES.on.line <-
  sum(delta.subway.low.SES == 0, na.rm = TRUE) / length(delta.subway.low.SES)
count.subway.low.SES.on.line
# residual column added to the scored data, then plotted against rating
subway_low_with_residual <- mutate(
  above.avg.subway,
  residuals = rating - predict_subway_lm_low
)
ggplot(subway_low_with_residual, aes(rating)) +
  geom_point(aes(y = residuals)) +
  geom_abline(intercept = 0, slope = 0)
# High-SES SVM model: radial-kernel SVM trained on the above-average training
# split and scored on the below-average test split.
# The history trained this model twice; the first pass built the residual
# column incorrectly, so a single corrected pass is kept.
model_svm_high <- train(
  rating ~ .,
  data = above.avg.train,
  method = "svmRadial", # Radial kernel
  trControl = ctrl,
  tuneLength = 10
)
predict_yelp_svm_high <- predict(model_svm_high, below.avg.test)
#postResample(predict_yelp_svm_high, below.avg$rating)
# Residuals: actual minus predicted.
delta.high.SES <- below.avg.test$rating - predict_yelp_svm_high
n.high.SES <- length(delta.high.SES)
# Shares of residuals below, above, and exactly on the prediction.
count.high.SES.below.line <- sum(delta.high.SES < 0, na.rm = TRUE) / n.high.SES
count.high.SES.below.line
count.high.SES.above.line <- sum(delta.high.SES > 0, na.rm = TRUE) / n.high.SES
count.high.SES.above.line
count.high.SES.on.line <- sum(delta.high.SES == 0, na.rm = TRUE) / n.high.SES
count.high.SES.on.line
# Residuals come from the predictions; the fitted model object
# (model_svm_high) cannot be subtracted from a rating.
highSES_with_residual <- below.avg.test %>%
  mutate(
    residuals = rating - predict_yelp_svm_high
  )
# Residuals against the actual rating, with a horizontal reference at zero.
ggplot(highSES_with_residual, aes(rating)) +
  geom_point(aes(y = residuals)) +
  geom_abline(intercept = 0, slope = 0)
# Residuals against row position. geom_point() needs an x aesthetic, so the
# row index is supplied explicitly.
ggplot(highSES_with_residual, aes(seq_len(nrow(highSES_with_residual)))) +
  geom_point(aes(y = residuals)) +
  geom_abline(intercept = 0, slope = 0) +
  labs(x = "Row index")
# Low-SES SVM model: fit on the below-average training split, scored on the
# above-average test split.
model_svm_low <- train(
  rating ~ .,
  data = below.avg.train,
  method = "svmRadial",
  trControl = ctrl, # Radial kernel
  tuneLength = 10
)
predict_yelp_svm_low <- predict(model_svm_low, above.avg.test)
#postResample(predict_yelp_svm_low, below.avg$rating)
# residual = observed rating - predicted rating
delta.low.SES <- above.avg.test$rating - predict_yelp_svm_low
# fractions of residuals under / over / exactly at zero
count.low.SES.below.line <-
  sum(delta.low.SES < 0, na.rm = TRUE) / length(delta.low.SES)
count.low.SES.below.line
count.low.SES.above.line <-
  sum(delta.low.SES > 0, na.rm = TRUE) / length(delta.low.SES)
count.low.SES.above.line
count.low.SES.on.line <-
  sum(delta.low.SES == 0, na.rm = TRUE) / length(delta.low.SES)
count.low.SES.on.line
# residual column added to the scored data, then plotted against rating
lowSES_with_residual <- mutate(
  above.avg.test,
  residuals = rating - predict_yelp_svm_low
)
ggplot(lowSES_with_residual, aes(rating)) +
  geom_point(aes(y = residuals)) +
  geom_abline(intercept = 0, slope = 0)
View(predict_subway_lm_high)
# Residuals plotted against row position, with a horizontal reference at
# zero. nrow() must be given the data frame (a bare nrow() is an error), and
# seq_len() is used so an empty data frame yields an empty index rather than
# c(1, 0).
ggplot(lowSES_with_residual, aes(seq_len(nrow(lowSES_with_residual)))) +
  geom_point(aes(y = residuals)) +
  geom_abline(intercept = 0, slope = 0) +
  labs(x = "Row index")
ggplot(highSES_with_residual, aes(seq_len(nrow(highSES_with_residual)))) +
  geom_point(aes(y = residuals)) +
  geom_abline(intercept = 0, slope = 0) +
  labs(x = "Row index")
# Low-SES SVM model: fit on the below-average training split, scored on the
# above-average test split.
model_svm_low <- train(
  rating ~ .,
  data = below.avg.train,
  method = "svmRadial",
  trControl = ctrl, # Radial kernel
  tuneLength = 10
)
predict_yelp_svm_low <- predict(model_svm_low, above.avg.test)
#postResample(predict_yelp_svm_low, below.avg$rating)
# residual = observed rating - predicted rating
delta.low.SES <- above.avg.test$rating - predict_yelp_svm_low
# fractions of residuals under / over / exactly at zero
count.low.SES.below.line <-
  sum(delta.low.SES < 0, na.rm = TRUE) / length(delta.low.SES)
count.low.SES.below.line
count.low.SES.above.line <-
  sum(delta.low.SES > 0, na.rm = TRUE) / length(delta.low.SES)
count.low.SES.above.line
count.low.SES.on.line <-
  sum(delta.low.SES == 0, na.rm = TRUE) / length(delta.low.SES)
count.low.SES.on.line
# residual column added to the scored data, then plotted by row position
lowSES_with_residual <- mutate(
  above.avg.test,
  residuals = rating - predict_yelp_svm_low
)
ggplot(lowSES_with_residual, aes(1:nrow(lowSES_with_residual))) +
  geom_point(aes(y = residuals)) +
  geom_abline(intercept = 0, slope = 0)
# row counts of the three income groups
nrow(above.avg)
nrow(mid.avg)
nrow(below.avg)
# Subway residuals against the actual rating
ggplot(subway_high_with_residual, aes(rating)) +
  geom_point(aes(y = residuals)) +
  geom_abline(intercept = 0, slope = 0)
# SVM residuals by row position, both on a fixed -3..3 y range
ggplot(highSES_with_residual, aes(1:nrow(highSES_with_residual))) +
  geom_point(aes(y = residuals)) +
  geom_abline(intercept = 0, slope = 0) +
  ylim(-3, 3)
ggplot(lowSES_with_residual, aes(1:nrow(lowSES_with_residual))) +
  geom_point(aes(y = residuals)) +
  geom_abline(intercept = 0, slope = 0) +
  ylim(-3, 3)
# Bar chart of restaurant counts per census tract in the high-SES group,
# with bars filled by the tract's median household income.
# The legend belongs to the `fill` aesthetic, so its title is set with
# labs(fill = ...). labs(legend = ...), labs(color = ...), labs(colour = ...)
# and labs(col = ...) do not rename it. The history's repeated
# re-assignments of this plot are reduced to the one working definition.
highSES <- ggplot(sorted.highSES, aes(censusTract, fill = censusMedianHHIncome)) +
  geom_bar() +
  # rotate tract codes so they do not overlap
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(
    title = "Seattle Restaurants in High Socio-Economic Neighborhoods on Yelp",
    y = "Number of Restaurants",
    x = "Seattle Census Tract Code",
    fill = "Median Household Income ($)",
    caption = "(based on data from Yelp Fusion API and Seattle, FCC Data, Census Reporter, and the U.S. Census Bureau)"
  )
highSES
# One bar chart per income group: restaurant counts per census tract, tracts
# ordered by median household income, bars filled by that income.

# ---- high SES ----
sorted.highSES <- transform(
  above.avg,
  censusTract = reorder(censusTract, censusMedianHHIncome)
)
highSES <- ggplot(sorted.highSES, aes(censusTract, fill = censusMedianHHIncome)) +
  geom_bar() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(
    title = "Seattle Restaurants in High Socio-Economic Neighborhoods on Yelp",
    y = "Number of Restaurants",
    x = "Seattle Census Tract Code",
    fill = "Median Household Income ($)",
    caption = "(based on data from Yelp Fusion API and Seattle, FCC Data, Census Reporter, and the U.S. Census Bureau)"
  )

# ---- median SES ----
sorted.midSES <- transform(
  mid.avg,
  censusTract = reorder(censusTract, censusMedianHHIncome)
)
midSES <- ggplot(sorted.midSES, aes(censusTract, fill = censusMedianHHIncome)) +
  geom_bar() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(
    title = "Seattle Restaurants in Median Socio-Economic Neighborhoods on Yelp",
    y = "Number of Restaurants",
    x = "Seattle Census Tract Code",
    fill = "Median Household Income ($)",
    caption = "(based on data from Yelp Fusion API and Seattle, FCC Data, Census Reporter, and the U.S. Census Bureau)"
  )

# ---- low SES ----
sorted.lowSES <- transform(
  below.avg,
  censusTract = reorder(censusTract, censusMedianHHIncome)
)
lowSES <- ggplot(sorted.lowSES, aes(censusTract, fill = censusMedianHHIncome)) +
  geom_bar() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(
    title = "Seattle Restaurants in Low Socio-Economic Neighborhoods on Yelp",
    y = "Number of Restaurants",
    x = "Seattle Census Tract Code",
    fill = "Median Household Income ($)",
    caption = "(based on data from Yelp Fusion API and Seattle, FCC Data, Census Reporter, and the U.S. Census Bureau)"
  )

# draw the three charts
highSES
midSES
lowSES
# Grouped bar chart of restaurant counts (Freq) per price level (Var1), one
# bar per socio-economic level (Var2), read from priceFreq.
# The legend title is wrapped with the newline escape "\n". "/b" and "/n"
# are ordinary characters and would be printed literally. The history's
# repeated re-assignments are reduced to the one working definition.
restaurantsByPrice <- ggplot(priceFreq, aes(fill = Var2, y = Freq, x = Var1)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(
    title = "Distribution of Seattle Restaurants by Price Level on Yelp",
    y = "Number of Restaurants",
    x = "Price Level on Yelp ($ - $$$$)",
    fill = "Socio-Economic Level \n(based on Median Household Income)",
    caption = "(based on data from Yelp Fusion API and Seattle, Census Reporter, and the U.S. Census Bureau)"
  )
restaurantsByPrice
# Proportion of restaurants at each Yelp rating, drawn as a line.
# table() yields one row per distinct rating (columns Var1 and Freq).
ratingFreq <- as.data.frame(table(yelp.data$rating))
ratingFreq$percentage <- ratingFreq$Freq / sum(ratingFreq$Freq)
# group = 1 joins all ratings into a single line; Var1 is a factor, so
# without it each level would be its own group.
# The history built this chart twice; only the later definition, whose
# caption cites the Yelp Fusion API alone, is kept.
restaurantsByRating <- ggplot(ratingFreq, aes(y = percentage, x = Var1, group = 1)) +
  geom_line() +
  labs(
    title = "Distribution of Seattle Restaurants by Rating on Yelp",
    y = "Total Proportion of Restaurants",
    x = "Average Rating on Yelp",
    caption = "(based on data from Yelp Fusion API)"
  )
restaurantsByRating
ratingViolinPlot <- ggplot(yelp.data, aes(as.factor(rating), censusMedianHHIncome)) +
geom_violin() +
geom_boxplot(width=0.1) +
stat_summary(fun.y=mean, geom="point", shape=23, size=2, color="blue") +
stat_summary(fun.y=median, geom="point", size=2, color="red")
ratingViolinPlot
ratingViolinPlot <- ggplot(yelp.data, aes(as.factor(rating), censusMedianHHIncome)) +
geom_violin() +
geom_boxplot(width=0.1) +
stat_summary(fun.y=mean, geom="point", shape=23, size=2, color="blue",show.legend =TRUE) +
stat_summary(fun.y=median, geom="point", size=2, color="red") +
labs(title="Distribution of Seattle Restaurants Rating \nin various Median Household Income on Yelp", y="Median Household Income", x="Average Rating on Yelp", caption="(based on data from Yelp Fusion API and Seattle, Census Reporter, and the U.S. Census Bureau)")
ratingViolinPlot
ratingViolinPlot <- ggplot(yelp.data, aes(as.factor(rating), censusMedianHHIncome)) +
geom_violin() +
geom_boxplot(width=0.1) +
stat_summary(fun.y=mean, geom="point", shape=23, size=2, color="blue",show.legend =TRUE) +
stat_summary(fun.y=median, geom="point", size=2, color="red",show.legend = TRUE) +
labs(title="Distribution of Seattle Restaurants Rating \nin various Median Household Income on Yelp", y="Median Household Income", x="Average Rating on Yelp", caption="(based on data from Yelp Fusion API and Seattle, Census Reporter, and the U.S. Census Bureau)")
ratingViolinPlot
ratingViolinPlot <- ggplot(yelp.data, aes(as.factor(rating), censusMedianHHIncome)) +
geom_violin() +
geom_boxplot(width=0.1) +
stat_summary(fun.y=mean, geom="point", shape=23, size=2, color="blue") +
stat_summary(fun.y=median, geom="point", size=2, color="red") + scale_shape_manual("", values=c("mean"="x")) +
labs(title="Distribution of Seattle Restaurants Rating \nin various Median Household Income on Yelp", y="Median Household Income ($)", x="Average Rating on Yelp", caption="(based on data from Yelp Fusion API and Seattle, Census Reporter, and the U.S. Census Bureau)")
ratingViolinPlot
ratingViolinPlot <- ggplot(yelp.data, aes(as.factor(rating), censusMedianHHIncome)) +
geom_violin() +
geom_boxplot(width=0.1) +
stat_summary(fun.y=mean, geom="point", shape=23, size=2, color="blue") +
stat_summary(fun.y=median, geom="point", size=2, color="red", show.legend = TRUE) + scale_shape_manual("", values=c("mean"="x")) +
labs(title="Distribution of Seattle Restaurants Rating \nin various Median Household Income on Yelp", y="Median Household Income ($)", x="Average Rating on Yelp", caption="(based on data from Yelp Fusion API and Seattle, Census Reporter, and the U.S. Census Bureau)")
ratingViolinPlot
# Violin + box plot of censusMedianHHIncome per Yelp rating level, with the
# mean (blue, shape 23) and the median (red) overlaid as summary points.
ratingViolinPlot <- ggplot(
  data = yelp.data,
  mapping = aes(x = as.factor(rating), y = censusMedianHHIncome)
) +
  geom_violin() +
  geom_boxplot(width = 0.1) +
  stat_summary(geom = "point", fun.y = mean, color = "blue", shape = 23, size = 2) +
  stat_summary(geom = "point", fun.y = median, color = "red", size = 2) +
  # NOTE(review): colour names are supplied as values of a *shape* scale and
  # no layer maps shape, so this call presumably changes nothing -- confirm.
  scale_shape_manual("", values = c("blue", "red")) +
  labs(
    title = "Distribution of Seattle Restaurants Rating \nin various Median Household Income on Yelp",
    x = "Average Rating on Yelp",
    y = "Median Household Income ($)",
    caption = "(based on data from Yelp Fusion API and Seattle, Census Reporter, and the U.S. Census Bureau)"
  )
ratingViolinPlot
# Violin + box plot of censusMedianHHIncome per Yelp rating level, with the
# mean (blue, shape 23) and the median (red) overlaid as summary points.
ratingViolinPlot <- ggplot(
  data = yelp.data,
  mapping = aes(x = as.factor(rating), y = censusMedianHHIncome)
) +
  geom_violin() +
  geom_boxplot(width = 0.1) +
  # NOTE(review): colour is both mapped to the constant label "mean" and set
  # to "blue" as a fixed parameter; the fixed value is expected to win, which
  # would leave the mapping without effect -- confirm.
  stat_summary(
    mapping = aes(color = "mean"),
    geom = "point",
    fun.y = mean,
    color = "blue",
    shape = 23,
    size = 2
  ) +
  stat_summary(geom = "point", fun.y = median, color = "red", size = 2) +
  labs(
    title = "Distribution of Seattle Restaurants Rating \nin various Median Household Income on Yelp",
    x = "Average Rating on Yelp",
    y = "Median Household Income ($)",
    caption = "(based on data from Yelp Fusion API and Seattle, Census Reporter, and the U.S. Census Bureau)"
  )
ratingViolinPlot
# Violin + box plot of censusMedianHHIncome per Yelp rating level. The first
# summary layer uses mean_cl_normal as its fun.data summary and draws it as a
# blue shape-23 point; the second draws the median as a red point.
ratingViolinPlot <- ggplot(
  data = yelp.data,
  mapping = aes(x = as.factor(rating), y = censusMedianHHIncome)
) +
  geom_violin() +
  geom_boxplot(width = 0.1) +
  # NOTE(review): colour is both mapped in aes() and fixed to "blue"; the
  # fixed value is expected to take precedence -- confirm.
  stat_summary(
    mapping = aes(color = "mean"),
    geom = "point",
    fun.data = mean_cl_normal,
    color = "blue",
    shape = 23,
    size = 2
  ) +
  stat_summary(geom = "point", fun.y = median, color = "red", size = 2) +
  labs(
    title = "Distribution of Seattle Restaurants Rating \nin various Median Household Income on Yelp",
    x = "Average Rating on Yelp",
    y = "Median Household Income ($)",
    caption = "(based on data from Yelp Fusion API and Seattle, Census Reporter, and the U.S. Census Bureau)"
  )
ratingViolinPlot
# Violin + box plot of censusMedianHHIncome per Yelp rating level, with the
# mean (blue, shape 23) and the median (red) overlaid as summary points.
ratingViolinPlot <- ggplot(yelp.data, aes(as.factor(rating), censusMedianHHIncome)) +
  geom_violin() +
  geom_boxplot(width = 0.1) +
  # mean() returns a single number, so it must be supplied through fun.y;
  # fun.data is reserved for functions that return a data frame of summary
  # columns (y, ymin, ymax). This matches how the median layer is specified.
  # NOTE(review): colour is both mapped (col = "mean") and fixed to "blue";
  # the fixed value is expected to take precedence -- confirm.
  stat_summary(aes(col = "mean"), fun.y = mean, geom = "point", shape = 23, size = 2, color = "blue") +
  stat_summary(fun.y = median, geom = "point", size = 2, color = "red") +
  labs(
    title = "Distribution of Seattle Restaurants Rating \nin various Median Household Income on Yelp",
    y = "Median Household Income ($)",
    x = "Average Rating on Yelp",
    caption = "(based on data from Yelp Fusion API and Seattle, Census Reporter, and the U.S. Census Bureau)"
  )
ratingViolinPlot
# Partial rebuild of the violin plot: violin, slim box plot and the mean
# marker (blue, shape 23) only.
ratingViolinPlot <- ggplot(yelp.data, aes(as.factor(rating), censusMedianHHIncome)) +
  geom_violin() +
  geom_boxplot(width = 0.1) +
  # A comma is required between the aes() mapping and fun.y. The statement
  # also ends here without a trailing `+`, so the assignment on the following
  # line is parsed as its own statement rather than as a continuation.
  stat_summary(aes(color = "mean"), fun.y = mean, geom = "point", shape = 23, size = 2, color = "blue")
# Violin + box plot of censusMedianHHIncome per Yelp rating level, with the
# mean (blue, shape 23) and the median (red) overlaid as summary points.
ratingViolinPlot <- ggplot(
  data = yelp.data,
  mapping = aes(x = as.factor(rating), y = censusMedianHHIncome)
) +
  geom_violin() +
  geom_boxplot(width = 0.1) +
  # NOTE(review): colour is both mapped in aes() and fixed to "blue"; the
  # fixed value is expected to take precedence -- confirm.
  stat_summary(
    mapping = aes(color = "mean"),
    geom = "point",
    fun.y = mean,
    color = "blue",
    shape = 23,
    size = 2
  ) +
  stat_summary(geom = "point", fun.y = median, color = "red", size = 2) +
  # NOTE(review): colour names are given to a *shape* scale and no layer maps
  # shape, so this call presumably changes nothing -- confirm.
  scale_shape_manual("", values = c("blue", "red")) +
  labs(
    title = "Distribution of Seattle Restaurants Rating \nin various Median Household Income on Yelp",
    x = "Average Rating on Yelp",
    y = "Median Household Income ($)",
    caption = "(based on data from Yelp Fusion API and Seattle, Census Reporter, and the U.S. Census Bureau)"
  )
ratingViolinPlot
# Violin + box plot of censusMedianHHIncome per Yelp rating level, with the
# mean (blue, shape 23) and the median (red) overlaid as summary points.
ratingViolinPlot <- ggplot(yelp.data, aes(as.factor(rating), censusMedianHHIncome)) +
  geom_violin() +
  geom_boxplot(width = 0.1) +
  stat_summary(fun.y = mean, geom = "point", shape = 23, size = 2, color = "blue") +
  # The trailing `+` is required: without it the plot expression ends here and
  # labs() below is evaluated as a separate statement, so the title, axis
  # labels and caption never reach the plot.
  stat_summary(fun.y = median, geom = "point", size = 2, color = "red") +
  labs(
    title = "Distribution of Seattle Restaurants Rating \nin various Median Household Income on Yelp",
    y = "Median Household Income ($)",
    x = "Average Rating on Yelp",
    caption = "(based on data from Yelp Fusion API and Seattle, Census Reporter, and the U.S. Census Bureau)"
  )
ratingViolinPlot
# Violin + box plot of censusMedianHHIncome per Yelp rating level, with the
# mean (blue, shape 23) and the median (red) overlaid as summary points.
ratingViolinPlot <- ggplot(
  data = yelp.data,
  mapping = aes(x = as.factor(rating), y = censusMedianHHIncome)
) +
  geom_violin() +
  geom_boxplot(width = 0.1) +
  stat_summary(geom = "point", fun.y = mean, color = "blue", shape = 23, size = 2) +
  stat_summary(geom = "point", fun.y = median, color = "red", size = 2) +
  labs(
    title = "Distribution of Seattle Restaurants Rating \nin various Median Household Income on Yelp",
    x = "Average Rating on Yelp",
    y = "Median Household Income ($)",
    caption = "(based on data from Yelp Fusion API and Seattle, Census Reporter, and the U.S. Census Bureau)"
  )
# Render the finished plot.
ratingViolinPlot
# Scatter of the `residuals` column of highSES_with_residual against row
# index, with a horizontal reference line at zero and the y axis limited to
# [-3, 3]. No x label is set here, so the axis title defaults to the mapped
# expression.
# seq_len(nrow(.)) is used instead of 1:nrow(.) so that an empty data frame
# yields an empty index rather than c(1, 0).
ggplot(highSES_with_residual, aes(seq_len(nrow(highSES_with_residual)))) +
  geom_point(aes(y = residuals)) +
  geom_abline(intercept = 0, slope = 0) +
  ylim(-3, 3) +
  labs(title = "High Socio-Economic Predicted Model Residuals \nfrom Prediction with Low Socio-Economic Restaurants")
# Scatter of the `residuals` column of highSES_with_residual against row
# index, with a horizontal reference line at zero and the y axis limited to
# [-3, 3].
# seq_len(nrow(.)) is used instead of 1:nrow(.) so that an empty data frame
# yields an empty index rather than c(1, 0).
ggplot(highSES_with_residual, aes(seq_len(nrow(highSES_with_residual)))) +
  geom_point(aes(y = residuals)) +
  geom_abline(intercept = 0, slope = 0) +
  ylim(-3, 3) +
  labs(
    title = "High Socio-Economic Predicted Model Residuals \nfrom Prediction with Low Socio-Economic Restaurants",
    x = "Row Number"
  )
# Scatter of the `residuals` column of highSES_with_residual against row
# index, with a horizontal reference line at zero and the y axis limited to
# [-3, 3].
# seq_len(nrow(.)) is used instead of 1:nrow(.) so that an empty data frame
# yields an empty index rather than c(1, 0).
ggplot(highSES_with_residual, aes(seq_len(nrow(highSES_with_residual)))) +
  geom_point(aes(y = residuals)) +
  geom_abline(intercept = 0, slope = 0) +
  ylim(-3, 3) +
  labs(
    title = "High Socio-Economic Predicted Model Residuals \nfrom Prediction with Low Socio-Economic Restaurants",
    y = "Residuals",
    x = "Row Number"
  )
# Scatter of the `residuals` column of highSES_with_residual against row
# index, with a horizontal reference line at zero and the y axis limited to
# [-3, 3].
# seq_len(nrow(.)) is used instead of 1:nrow(.) so that an empty data frame
# yields an empty index rather than c(1, 0).
ggplot(highSES_with_residual, aes(seq_len(nrow(highSES_with_residual)))) +
  geom_point(aes(y = residuals)) +
  geom_abline(intercept = 0, slope = 0) +
  ylim(-3, 3) +
  labs(
    title = "High Socio-Economic Predicted Model Residuals \nwhen predicting with Low Socio-Economic Restaurants",
    y = "Residuals",
    x = "Row Number"
  )
# Scatter of the `residuals` column of lowSES_with_residual against row
# index, with a horizontal reference line at zero and the y axis limited to
# [-3, 3].
# seq_len(nrow(.)) is used instead of 1:nrow(.) so that an empty data frame
# yields an empty index rather than c(1, 0).
ggplot(lowSES_with_residual, aes(seq_len(nrow(lowSES_with_residual)))) +
  geom_point(aes(y = residuals)) +
  geom_abline(intercept = 0, slope = 0) +
  ylim(-3, 3) +
  labs(
    title = "Low Socio-Economic Predicted Model Residuals \nwhen predicting with High Socio-Economic Restaurants",
    y = "Residuals",
    x = "Row Number"
  )
# Scatter of the `residuals` column of subway_high_with_residual against its
# `rating` column, with a horizontal reference line at zero and the y axis
# limited to [-2, 2].
ggplot(subway_high_with_residual, aes(rating)) +
  geom_point(aes(y = residuals)) +
  geom_abline(intercept = 0, slope = 0) +
  ylim(-2, 2) +
  # The x aesthetic is the `rating` column, not a row index, so the axis is
  # labelled accordingly.
  # NOTE(review): if a row-index axis was intended instead, change the
  # mapping rather than this label -- confirm.
  labs(
    title = "Subway High Socio-Economic Predicted Model Residuals \nwhen predicting with Low Socio-Economic Subways",
    y = "Residuals",
    x = "Rating"
  )
# Scatter of the `residuals` column of subway_low_with_residual against its
# `rating` column, with a horizontal reference line at zero and the y axis
# limited to [-2, 2].
ggplot(subway_low_with_residual, aes(rating)) +
  geom_point(aes(y = residuals)) +
  geom_abline(intercept = 0, slope = 0) +
  ylim(-2, 2) +
  # The x aesthetic is the `rating` column, not a row index, so the axis is
  # labelled accordingly.
  # NOTE(review): if a row-index axis was intended instead, change the
  # mapping rather than this label -- confirm.
  labs(
    title = "Subway Low Socio-Economic Predicted Model Residuals \nwhen predicting with High Socio-Economic Subways",
    y = "Residuals",
    x = "Rating"
  )
# Print the two Subway-specific counts at the console. Both objects are
# defined outside this chunk.
# NOTE(review): presumably the share of residuals lying above the zero line
# for each Subway model -- confirm against their definitions.
count.subway.low.SES.above.line
count.subway.high.SES.above.line