-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathFinal_project code.R
27 lines (22 loc) · 1003 Bytes
/
Final_project code.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
library(dplyr)
library(ggplot2)
library(reshape2)
library(fmsb)
library(tidyr)
library(car)
# Load the YouTube sample data. `stringsAsFactors = FALSE` is set explicitly so
# text columns stay character on every R version (R < 4.0 defaulted to factors,
# which would turn the later "Other" recode of `category_id` into NA).
# NOTE(review): the absolute path only resolves on the original author's
# machine; adjust it (or switch to a project-relative path) before running
# elsewhere.
d <- read.csv(
  "/Users/jenni/Desktop/STAT_318/Final_project/youtube_data_test.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
# Per-column summary of the raw data, kept for inspection.
summaryStats <- summary(d)
# Mask the college name in video titles: every whole-word, case-insensitive
# occurrence of "Wellesley" becomes "XXX".
d$title <- gsub(
  pattern = "(?i)\\bWellesley\\b",
  replacement = "XXX",
  x = d$title,
  perl = TRUE
)
# Binary indicator for the official channel (1 = official, 0 = any other).
d$official_channel <- as.integer(x = d$channel_title == "WellesleyCollege")
# Tally of the indicator (author's note: 23/100 are published by the official
# account).
count <- table(d$official_channel)
# Collapse every category other than the three main ones into "Other";
# missing values are left untouched.
d$category_id[
  !is.na(d$category_id) &
    !(d$category_id %in% c("Education", "News & Politics", "People & Blogs"))
] <- "Other"
## Visualization
# Bar chart of video counts per category. `category_id` has already been
# collapsed to the three main categories plus "Other" by this point, so the
# chart shows the recoded (not the raw) categories.
# The plot is print()ed explicitly so it is also drawn when the script is run
# through source(), where top-level values are not auto-printed.
print(
  ggplot(d, aes(x = category_id)) +
    geom_bar(fill = "skyblue", color = "black") +
    theme_minimal() +
    labs(title = "Frequency of Categories", x = "Category", y = "Frequency") +
    # Tilt the x-axis labels so longer category names do not overlap.
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
)