core-methods-in-edm · ssz2119 · Nov 6, 2020
diff --git a/Assignment 4.Rmd b/Assignment 4.Rmd
@@ -8,14 +8,17 @@ https://www.cs.uic.edu/~wilkinson/Applets/cluster.html
 
 
 ```{r}
-library()
+library(dplyr)
+library(tidyr)
+library(ggplot2)
 ```
 
 Now, upload the file "Class_Motivation.csv" from the Assignment 4 Repository as a data frame called "K1""
 ```{r}
 
-K1 <- read.csv(...)
-
+K1 <- read.csv(file = "Class_Motivation.csv", header = TRUE)
+K1b <- K1 %>% gather(key = week, value = measure, 2:6)  # wide -> long: columns 2-6 become week/measure pairs
+plot(x = as.factor(K1b$week), y = K1b$measure)  # factor x with numeric y: one box of scores per week
 ```
 
 This file contains the self-reported motivation scores for a class over five weeks. We are going to look for patterns in motivation over this time and sort people into clusters based on those patterns.
@@ -26,7 +29,7 @@ The algorithm will treat each row as a value belonging to a person, so we need t
 
 ```{r}
 
-K2 <- 
+K2 <- K1 %>% select(2:6)  # keep only columns 2-6, the same five columns reshaped into "week" above
 
 ```
 
@@ -40,14 +43,16 @@ We will remove people with missing values for this assignment, but keep in mind
 ```{r}
 
 K3 <- na.omit(K2) #This command create a data frame with only those people with no missing values. It "omits" all rows with missing values, also known as a "listwise deletion". EG - It runs down the list deleting rows as it goes.
+# Keep the listwise-deleted K3 from na.omit() above: recoding NA as 0 would enter k-means as a real, extreme score.
+stopifnot(!anyNA(K3))  # guard: kmeans() stops with an error if any NA reaches it
 
 ```
 
 Another pre-processing step used in K-means is to standardize the values so that they have the same range. We do this because we want to treat each week as equally important - if we do not standardise then the week with the largest range will have the greatest impact on which clusters are formed. We standardise the values by using the "scale()" command.
 
 ```{r}
 
-K3 <- 
+K3 <- scale(K3, center = TRUE, scale = TRUE)  # z-score each column (mean 0, sd 1); the result is a numeric matrix
 
 ```
 
@@ -66,20 +71,37 @@ Also, we need to choose the number of clusters we think are in the data. We will
 
 ```{r}
 
-fit <- 
+fit1a <- kmeans(x = K3, centers = 2)  # three independent 2-cluster runs; no seed is set,
+fit1b <- kmeans(x = K3, centers = 2)  # so each run draws its own random starting centers
+fit1c <- kmeans(x = K3, centers = 2)  # and the runs can be compared with each other
 
 #We have created an object called "fit" that contains all the details of our clustering including which observations belong to each cluster.
 
 #We can access the list of clusters by typing "fit$cluster", the top row corresponds to the original order the rows were in. Notice we have deleted some rows.
 
-
+fit1b$cluster  # print the cluster id (1 or 2) assigned to each row of K3 in the second run
 
 #We can also attach these clusters to the original dataframe by using the "data.frame" command to create a new data frame called K4.
 
-K4
+K4 <- data.frame(K3, fit1a$cluster, fit1b$cluster, fit1c$cluster)  # assignments from all three runs side by side; overwritten below
+# Within-cluster sum of squares, one value per cluster, for each run.
+fit1a$withinss
+fit1b$withinss
+fit1c$withinss
+# Total within-cluster sum of squares (the sum of the per-cluster values) for each run.
+fit1a$tot.withinss
+fit1b$tot.withinss
+fit1c$tot.withinss
+# Between-cluster sum of squares for each run.
+fit1a$betweenss
+fit1b$betweenss
+fit1c$betweenss
+# Keep only the first run's assignment; this replaces the three-run K4 built at the top of this group.
+K4 <- data.frame(K3, fit1a$cluster)
 
 #Have a look at the K4 dataframe. Lets change the names of the variables to make it more convenient with the names() command.
 
+names(K4) <- c(as.character(1:5), "cluster")  # week columns are named "1".."5" so as.numeric() can convert them later
 
 ```
 
@@ -88,14 +110,14 @@ Now we need to visualize the clusters we have created. To do so we want to play
 First lets use tidyr to convert from wide to long format.
 ```{r}
 
-K5 <- gather(K4, "week", "motivation", 1:5)
+K5 <- K4 %>% tidyr::gather(key = "week", value = "motivation", 1:5)  # long format: one row per (person, week), cluster kept as-is
 ```
 
 Now lets use dplyr to average our motivation values by week and by cluster.
 
 ```{r}
 
-K6 <- K5 %>% group_by(week, cluster) %>% summarise(K6, avg = mean(motivation))
+K6 <- K5 %>% group_by(week, cluster) %>% summarise(avg = mean(motivation)) %>% ungroup()  # mean per week x cluster; ungroup so the later K6$week rewrite cannot leave stale grouping
 
 ```
 
@@ -113,9 +135,9 @@ Likewise, since "cluster" is not numeric but rather a categorical label we want
 
 ```{r}
 
-K6$week <- 
+K6$week <- as.numeric(K6[["week"]])  # week labels "1".."5" become the numbers 1-5
 
-K6$cluster <- 
+K6$cluster <- as.factor(K6[["cluster"]])  # cluster ids are labels, not quantities, so store them as a factor
 
 ```
 
@@ -139,21 +161,62 @@ What patterns do you see in the plot?
 It would be useful to determine how many people are in each cluster. We can do this easily with dplyr.
 
 ```{r}
-K7 <- count(K4, cluster)
+K7 <- K4 %>% dplyr::count(cluster)  # number of rows (people) in each cluster
 ```
 
 Look at the number of people in each cluster, now repeat this process for 3 rather than 2 clusters. Which cluster grouping do you think is more informative? Write your answer below:
 
+I think the 2-cluster grouping is more informative.
+
 ##Part II
 
 Using the data collected in the HUDK4050 entrance survey (HUDK4050-cluster.csv) use K-means to cluster the students first according location (lat/long) and then according to their answers to the questions, each student should belong to two clusters.
+```{r}
+library(tidyverse)
+M1 <- read.csv("HUDK405020-cluster.csv", header = TRUE)
+M2 <- select(M1, 4:9)  # columns 4-9; presumably the six survey answers listed in ML below - confirm against the CSV
+set.seed(4050)  # fix the RNG so kmeans() picks the same random starts (and cluster labels) on every knit
+fit2a <- kmeans(M2, 1)
+fit2b <- kmeans(M2, 2)
+fit2c <- kmeans(M2, 3)
+fit2d <- kmeans(M2, 4)
+fit2e <- kmeans(M2, 5)
+fit2f <- kmeans(M2, 6)
+fit2g <- kmeans(M2, 7)
+# First 7 values: total within-cluster SS for k = 1..7; last 7 values: between-cluster SS for k = 1..7.
+mss <- c(fit2a$tot.withinss, fit2b$tot.withinss, fit2c$tot.withinss, fit2d$tot.withinss, fit2e$tot.withinss, fit2f$tot.withinss, fit2g$tot.withinss, fit2a$betweenss, fit2b$betweenss, fit2c$betweenss, fit2d$betweenss, fit2e$betweenss, fit2f$betweenss, fit2g$betweenss)
+# x positions (k = 1..7, once per series) and colours: blue = within-cluster SS, red = between-cluster SS.
+clusters <- c(seq(1, 7, 1), seq(1, 7, 1))
+col <- c(rep("blue", 7), rep("red", 7))
+plot(clusters, mss, col = col)
+# Location clustering on columns 2-3 of M1, which are read below as L1$long and L1$lat.
+L1 <- select(M1, 2:3)
+plot(L1$long, L1$lat)
+fit3a <- kmeans(L1, 2)
+fit3b <- kmeans(L1, 2)
+fit3c <- kmeans(L1, 2)
+# Compare the total within-cluster SS of the three 2-cluster location runs.
+fit3a$tot.withinss
+fit3b$tot.withinss
+fit3c$tot.withinss
+# One row per student: six answer columns + the 3-cluster answer label (fit2c), lat/long + the 2-cluster location label (fit3a).
+ML <- data.frame(M1$compare.features, M1$math.accuracy, M1$planner.use, M1$enjoy.discuss, M1$enjoy.group, M1$meet.deadline, fit2c$cluster, M1$lat, M1$long, fit3a$cluster)
+pairs(ML)  # scatterplot matrix of every pair of ML columns
+```
+
 
 ##Part III
 
 Create a visualization that shows the overlap between the two clusters each student belongs to in Part II. IE - Are there geographical patterns that correspond to the answers? 
 
 ```{r}
+DF <- data.frame(table(ML$fit2c.cluster, ML$fit3a.cluster))  # base-R cross-tabulation (columns Var1, Var2, Freq); not used by the plot below
+ML2 <- ML %>% group_by(fit2c.cluster, fit3a.cluster) %>% summarize(count = n()) %>% ungroup()  # students per (answer cluster, location cluster) pair
+ggplot(ML2, aes(x = factor(fit2c.cluster), y = factor(fit3a.cluster), size = count)) + geom_point() + labs(x = "answer cluster (fit2c)", y = "location cluster (fit3a)")  # factor(): cluster ids are labels, so use discrete axes
 
+library(vcd)  # provides structable() and mosaic()
+P1 <- structable(fit2c$cluster ~ fit3a$cluster)  # cross-classify each student's answer cluster against their location cluster
+mosaic(P1, shade=TRUE, legend=TRUE)  # mosaic plot of that table with vcd's shading and its legend switched on
 ```