From 8b9e24cfcbf0fe4bc6b7402c69260a41e79eaa94 Mon Sep 17 00:00:00 2001 From: Nicole SCHLOSBERG Date: Thu, 1 Oct 2020 17:25:51 -0500 Subject: [PATCH 01/10] Uploading Assignment 2. --- Assignment 2-2020.Rmd | 150 +++++++-- Assignment-2-2020.html | 708 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 835 insertions(+), 23 deletions(-) create mode 100644 Assignment-2-2020.html diff --git a/Assignment 2-2020.Rmd b/Assignment 2-2020.Rmd index 0b235a3..744151e 100644 --- a/Assignment 2-2020.Rmd +++ b/Assignment 2-2020.Rmd @@ -1,10 +1,12 @@ --- title: "Assignment 2" -author: "Charles Lang" -date: "September 24, 2020" +author: "Nicole Schlosberg" +date: "September 29, 2020" output: html_document --- -#Part I + + +## Part I ## Data Wrangling In the hackathon a project was proposed to collect data from student video watching, a sample of this data is available in the file video-data.csv. @@ -79,6 +81,7 @@ D4 <- filter(D1, stid == 4|stid == 20| stid == 22) D4 <- droplevels(D4) boxplot(D4$watch.time~D4$stid, xlab = "Student", ylab = "Watch Time") ``` + ## Pairs ```{r} #Use matrix notation to select columns 2, 5, 6, and 7 @@ -86,31 +89,50 @@ D5 <- D1[,c(2,5,6,7)] #Draw a matrix of plots for every combination of variables pairs(D5) ``` + ## Part II 1. Create a simulated data set containing 100 students, each with a score from 1-100 representing performance in an educational game. The scores should tend to cluster around 75. Also, each student should be given a classification that reflects one of four interest groups: sport, music, nature, literature. ```{r} -#rnorm(100, 75, 15) creates a random sample with a mean of 75 and standard deviation of 20 -#pmax sets a maximum value, pmin sets a minimum value +#rnorm(100, 75, 15) creates a random sample with a mean of 75 and standard deviation of 15 +#filter() can be used to set max min value #round rounds numbers to whole number values -#sample draws a random samples from the groups vector according to a uniform distribution +#sample() draws a random samples from the groups vector according to a uniform distribution +score <- rnorm(100,75,15) +hist(score,breaks = 30) +S1 <- data.frame(score) -``` +library(dplyr) +S1 <- filter(S1, score <= 100) +hist(S1$score) -2. Using base R commands, draw a histogram of the scores. Change the breaks in your histogram until you think they best represent your data. +S2 <- data.frame(rep(100,5)) # +names(S2) <- "score" +S3 <- bind_rows(S1,S2) + +interest <- c("sport", "music", "nature", "liturature") + +S3$interest <- sample(interest, 100, replace = TRUE) + +S3$stid <- seq(1,100,1) -```{r} ``` +2. **Using base R commands, draw a histogram of the scores. Change the breaks in your histogram until you think they best represent your data. + +```{r} +hist(S3$score, breaks = 10) +``` 3. Create a new variable that groups the scores according to the breaks in your histogram. ```{r} #cut() divides the range of scores into intervals and codes the values in scores according to which interval they fall. We use a vector called `letters` as the labels, `letters` is a vector made up of the letters of the alphabet. - +label <- letters[1:10] +S3$breaks <- cut(S3$score, breaks = 10, labels = label) ``` 4. Now using the colorbrewer package (RColorBrewer; http://colorbrewer2.org/#type=sequential&scheme=BuGn&n=3) design a pallette and assign it to the groups in your data on the histogram. 
@@ -118,48 +140,67 @@ pairs(D5) ```{r} library(RColorBrewer) #Let's look at the available palettes in RColorBrewer - +display.brewer.all() #The top section of palettes are sequential, the middle section are qualitative, and the lower section are diverging. #Make RColorBrewer palette available to R and assign to your bins +S3$colors <- brewer.pal(10, "BrBG") #Use named palette in histogram - +hist(S3$score, col = S3$colors) ``` - 5. Create a boxplot that visualizes the scores for each interest group and color each interest group a different color. ```{r} #Make a vector of the colors from RColorBrewer +interest.col <- brewer.pal(4, "BuPu") +boxplot(score ~ interest, S3, col = interest.col) ``` - 6. Now simulate a new variable that describes the number of logins that students made to the educational game. They should vary from 1-25. ```{r} - +S3$login <- sample(1:25, 100, replace = TRUE) ``` 7. Plot the relationships between logins and scores. Give the plot a title and color the dots according to interest group. ```{r} - - +plot(S3$score, S3$login, main = "Login vs. Score", xlab = "Score", ylab = "Login", col = interest.col) ``` - 8. R contains several inbuilt data sets, one of these in called AirPassengers. Plot a line graph of the the airline passengers over time using this data set. ```{r} - +plot(AirPassengers, type = "l", xlab = "Date", ylab = "Passenger numbers") ``` - -9. Using another inbuilt data set, iris, plot the relationships between all of the variables in the data set. Which of these relationships is it appropraiet to run a correlation on? +9. Using another inbuilt data set, iris, plot the relationships between all of the variables in the data set. Which of these relationships is it appropriate to run a correlation on? ```{r} +plot(iris) +plot(iris$Sepal.Length,iris$Sepal.Width) +plot(iris$Petal.Length,iris$Petal.Width) +plot(iris$Petal.Length,iris$Sepal.Length) +plot(iris$Petal.Width,iris$Sepal.Width) +plot(iris$Petal.Width,iris$Sepal.Length) +plot(iris$Petal.Length,iris$Sepal.Width) +plot(iris$Species,iris$Sepal.Width, xlab = "Species", ylab = "Sepal Width") +plot(iris$Species,iris$Sepal.Length, xlab = "Species", ylab = "Sepal Length") +plot(iris$Species,iris$Petal.Width, xlab = "Species", ylab = "Petal Width") +plot(iris$Species,iris$Petal.Length, xlab = "Species", ylab = "Petal Length") + +#Correlation between Sepal Length and Width +corOfSepalLW <- cor(iris$Sepal.Length, iris$Sepal.Width) +#Correlation between Petal Length and Width +corOfPetalLW <- cor(iris$Petal.Length, iris$Petal.Width) +#Correlation between Petal Length and Sepal Length +corOfLengthPS <- cor(iris$Petal.Length, iris$Sepal.Length) +#Correlation between Petal Width and Sepal Width +corOfWidthPS <- cor(iris$Petal.Width, iris$Sepal.Width) + ``` # Part III - Analyzing Swirl @@ -171,6 +212,7 @@ In this repository you will find data describing Swirl activity from the class s ### Instructions 1. Insert a new code block + 2. Create a data frame from the `swirl-data.csv` file called `DF1` The variables are: @@ -188,16 +230,78 @@ The variables are: 4. Use the `group_by` function to create a data frame that sums all the attempts for each `hash` by each `lesson_name` called `DF3` +```{r} +#2 +DF1 <- read.csv("swirl-data.csv", header = TRUE) + +#3 +DF2 <- DF1[,c(2,5,8)] + +#4 +DF3 <- DF2 %>% group_by(lesson_name, hash) %>% summarise(sum_key = sum(attempt)) + +``` + 5. On a scrap piece of paper draw what you think `DF3` would look like if all the lesson names were column names -6. Convert `DF3` to this format +6. 
Convert `DF3` to this format + +```{r} +DF3 <- spread(DF3, lesson_name, sum_key) +DF3 <- DF3[-c(2)] +``` 7. Create a new data frame from `DF1` called `DF4` that only includes the variables `hash`, `lesson_name` and `correct` -8. Convert the `correct` variable so that `TRUE` is coded as the **number** `1` and `FALSE` is coded as `0` +8. Convert the `correct` variable so that `TRUE` is coded as the **number** `1` and `FALSE` is coded as `0` + +```{r} +#7 +DF4 <- DF1[,c(2,4,8)] + +#8 +#Correct misspelled FALS at line 809 in swirl_data.csv and subsequent DF4 dataframe from DF1 +DF4$correct <- ifelse(DF4$correct == "FALS", "FALSE", DF4$correct) +#str(DF4$correct) + +#Convert the chr that was created with last back to logi +DF4$correct <- type.convert(DF4$correct) +#str(DF4$correct) + +#Get rid of the NAs so the next steps do not throw "NAs introduced by coercion" +DF4 <- DF4[complete.cases(DF4$correct),] + +#Converts logi to num so 0s and 1s instead of FALSE and TRUE +DF4$correct <- as.numeric(DF4$correct) +#str(DF4$correct) + +``` 9. Create a new data frame called `DF5` that provides a mean score for each student on each course +```{r} +DF5 <- DF4 %>% group_by(hash, lesson_name) %>% summarise(mean_key = mean(correct)) +``` + 10. **Extra credit** Convert the `datetime` variable into month-day-year format and create a new data frame (`DF6`) that shows the average correct for each day +```{r} + +DF6 <- DF1[,c(7,4)] + +#steps to get TRUE/FALSE to 1/0 +DF6$correct <- ifelse(DF6$correct == "FALS", "FALSE", DF6$correct) +DF6$correct <- type.convert(DF6$correct) +DF6 <- DF6[complete.cases(DF6$correct),] +DF6$correct <- as.numeric(DF6$correct) + +#Creating average correct for each day +DF6 <- DF6 %>% group_by(datetime) %>% summarise(meanByDay = mean(correct)) + +#Convert 'datetime' to month-day-year by converting the parsed num*** +library(lubridate) +dateConverted <- mdy_hms(DF6$datetime) + +``` + Finally use the knitr function to generate an html document from your work. Commit, Push and Pull Request your work back to the main branch of the repository. Make sure you include both the .Rmd file and the .html file. diff --git a/Assignment-2-2020.html b/Assignment-2-2020.html new file mode 100644 index 0000000..3f9f6f0 --- /dev/null +++ b/Assignment-2-2020.html @@ -0,0 +1,708 @@ + + + + + + + + + + + + + + + +Assignment 2 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
[Rendered HTML (new file Assignment-2-2020.html), Part I: page boilerplate, tidyverse attach and conflict messages, and the Data Wrangling, Histograms, Plots, and Pairs sections, echoing the Part I chunks of the .Rmd diff above; plot images omitted.]
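One detail worth noting about the Part I histogram chunk above: when `hist()` is given unequal break widths, as in `breaks = c(0,5,20,25,35)`, base R switches from counts to densities (`freq` defaults to `TRUE` only for equidistant breaks). A minimal sketch with simulated stand-in data, since video-data.csv itself is not reproduced in this patch text:

```{r}
# Stand-in for D2$watch.time; the real values come from video-data.csv.
set.seed(1)
watch.time <- runif(200, min = 0, max = 35)

# Equal-width breaks: bar heights are counts (freq defaults to TRUE).
hist(watch.time, breaks = 100)

# Unequal-width breaks: bar heights become densities (freq defaults to FALSE);
# forcing freq = TRUE here makes R warn that the bar areas are misleading.
hist(watch.time, breaks = c(0, 5, 20, 25, 35))
```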

[Rendered HTML, Part II: the simulated-score histograms, the cut() and RColorBrewer palette steps, the interest-group boxplot, the login-vs-score scatterplot, the AirPassengers line graph, and the iris plots with their correlations, echoing the Part II chunks of the .Rmd diff above.]
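The simulation chunk's comment mentions `pmax()`/`pmin()` for capping values, but the committed code instead filters out scores above 100 and appends five hard-coded 100s. A minimal sketch of the clamping approach the comment alludes to; the seed and the data-frame name `S` are illustrative, not part of the assignment:

```{r}
set.seed(42)

# Clamp the simulated scores into the 1-100 range instead of dropping rows,
# so the sample stays at exactly 100 students with the intended bell shape.
score <- round(pmin(pmax(rnorm(100, mean = 75, sd = 15), 1), 100))

interest <- sample(c("sport", "music", "nature", "literature"),
                   size = 100, replace = TRUE)

S <- data.frame(stid = 1:100, score = score, interest = interest)
hist(S$score, breaks = 10)
```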

[Rendered HTML, Part III (Analyzing Swirl): the DF1-DF6 wrangling chunks and their console output (summarise regrouping messages, a tibble .name_repair warning after spread(), lubridate attach messages, and an "All formats failed to parse" warning from mdy_hms()), echoing the Part III chunks of the .Rmd diff above.]
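On the reshaping step in question 6: `spread()` still works, but tidyr has since superseded it with `pivot_wider()`, which performs the same long-to-wide conversion. A small sketch on toy data; the `toy` values are made up, since the real input, swirl-data.csv, is not included in this patch text:

```{r}
library(dplyr)
library(tidyr)

# Toy stand-in for DF3: summed attempts per student hash and lesson.
toy <- tibble(hash        = c("a", "a", "b"),
              lesson_name = c("Basic Building Blocks", "Vectors",
                              "Basic Building Blocks"),
              sum_key     = c(12, 7, 5))

# pivot_wider() is the current replacement for spread(); combinations that
# never occur (here hash "b" with "Vectors") come out as NA, as with spread().
toy %>% pivot_wider(names_from = lesson_name, values_from = sum_key)
```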
+ + + + + + + + + + + + + + + From 798d807d726699f7c30df68dd7fdedf76c4ac730 Mon Sep 17 00:00:00 2001 From: Nicole SCHLOSBERG Date: Sun, 4 Oct 2020 07:52:59 -0500 Subject: [PATCH 02/10] Uploading Assignment 2. --- Assignment 2-2020.Rmd | 27 ++++++--------- Assignment-2-2020.html | 76 ++++++++++++++++++------------------------ 2 files changed, 43 insertions(+), 60 deletions(-) diff --git a/Assignment 2-2020.Rmd b/Assignment 2-2020.Rmd index 744151e..981e874 100644 --- a/Assignment 2-2020.Rmd +++ b/Assignment 2-2020.Rmd @@ -23,7 +23,7 @@ key,points = how many times a student skipped or increased the speed of a video #Load the package(s) you just installed -library(tidyverse) +#library(tidyverse) library(tidyr) library(dplyr) @@ -71,7 +71,7 @@ barplot(table1) #Create a data frame of the average total key points for each year and plot the two against each other as a lines -D3 <- D1 %>% group_by(year) %>% summarise(mean_key = mean(key.points)) +D3 <- D1 %>% group_by(year) %>% summarise(mean_key = mean(key.points), .groups = "keep") plot(D3$year, D3$mean_key, type = "l", lty = "dashed") @@ -104,7 +104,7 @@ score <- rnorm(100,75,15) hist(score,breaks = 30) S1 <- data.frame(score) -library(dplyr) +#library(dplyr) S1 <- filter(S1, score <= 100) hist(S1$score) @@ -117,8 +117,6 @@ interest <- c("sport", "music", "nature", "liturature") S3$interest <- sample(interest, 100, replace = TRUE) S3$stid <- seq(1,100,1) - - ``` 2. **Using base R commands, draw a histogram of the scores. Change the breaks in your histogram until you think they best represent your data. @@ -155,7 +153,6 @@ hist(S3$score, col = S3$colors) #Make a vector of the colors from RColorBrewer interest.col <- brewer.pal(4, "BuPu") boxplot(score ~ interest, S3, col = interest.col) - ``` 6. Now simulate a new variable that describes the number of logins that students made to the educational game. They should vary from 1-25. @@ -179,7 +176,6 @@ plot(AirPassengers, type = "l", xlab = "Date", ylab = "Passenger numbers") 9. Using another inbuilt data set, iris, plot the relationships between all of the variables in the data set. Which of these relationships is it appropriate to run a correlation on? ```{r} - plot(iris) plot(iris$Sepal.Length,iris$Sepal.Width) plot(iris$Petal.Length,iris$Petal.Width) @@ -192,6 +188,7 @@ plot(iris$Species,iris$Sepal.Length, xlab = "Species", ylab = "Sepal Length") plot(iris$Species,iris$Petal.Width, xlab = "Species", ylab = "Petal Width") plot(iris$Species,iris$Petal.Length, xlab = "Species", ylab = "Petal Length") +#Which of these relationships is it appropriate to run a correlation on? #Correlation between Sepal Length and Width corOfSepalLW <- cor(iris$Sepal.Length, iris$Sepal.Width) #Correlation between Petal Length and Width @@ -200,7 +197,6 @@ corOfPetalLW <- cor(iris$Petal.Length, iris$Petal.Width) corOfLengthPS <- cor(iris$Petal.Length, iris$Sepal.Length) #Correlation between Petal Width and Sepal Width corOfWidthPS <- cor(iris$Petal.Width, iris$Sepal.Width) - ``` # Part III - Analyzing Swirl @@ -238,8 +234,7 @@ DF1 <- read.csv("swirl-data.csv", header = TRUE) DF2 <- DF1[,c(2,5,8)] #4 -DF3 <- DF2 %>% group_by(lesson_name, hash) %>% summarise(sum_key = sum(attempt)) - +DF3 <- DF2 %>% group_by(lesson_name, hash) %>% summarise(sum_key = sum(attempt), .groups = "keep") ``` 5. 
On a scrap piece of paper draw what you think `DF3` would look like if all the lesson names were column names @@ -262,25 +257,22 @@ DF4 <- DF1[,c(2,4,8)] #8 #Correct misspelled FALS at line 809 in swirl_data.csv and subsequent DF4 dataframe from DF1 DF4$correct <- ifelse(DF4$correct == "FALS", "FALSE", DF4$correct) -#str(DF4$correct) #Convert the chr that was created with last back to logi DF4$correct <- type.convert(DF4$correct) -#str(DF4$correct) #Get rid of the NAs so the next steps do not throw "NAs introduced by coercion" DF4 <- DF4[complete.cases(DF4$correct),] #Converts logi to num so 0s and 1s instead of FALSE and TRUE DF4$correct <- as.numeric(DF4$correct) -#str(DF4$correct) ``` 9. Create a new data frame called `DF5` that provides a mean score for each student on each course ```{r} -DF5 <- DF4 %>% group_by(hash, lesson_name) %>% summarise(mean_key = mean(correct)) +DF5 <- DF4 %>% group_by(hash, lesson_name) %>% summarise(mean_key = mean(correct), .groups = "keep") ``` 10. **Extra credit** Convert the `datetime` variable into month-day-year format and create a new data frame (`DF6`) that shows the average correct for each day @@ -296,11 +288,12 @@ DF6 <- DF6[complete.cases(DF6$correct),] DF6$correct <- as.numeric(DF6$correct) #Creating average correct for each day -DF6 <- DF6 %>% group_by(datetime) %>% summarise(meanByDay = mean(correct)) +DF6 <- DF6 %>% group_by(datetime, correct) %>% summarise(meanByDay = mean(correct), .groups = "keep") #Convert 'datetime' to month-day-year by converting the parsed num*** -library(lubridate) -dateConverted <- mdy_hms(DF6$datetime) +#library(lubridate) +#dateConverted <- mdy_hms(DF6$datetime) +#DF6 <- separate_rows(DF6, DF6$datetime, sep = "0") ``` diff --git a/Assignment-2-2020.html b/Assignment-2-2020.html index 3f9f6f0..f417300 100644 --- a/Assignment-2-2020.html +++ b/Assignment-2-2020.html @@ -381,19 +381,18 @@

[Diff hunks of the re-knitted Assignment-2-2020.html: the tidyverse startup messages are replaced by dplyr masking messages, the summarise regrouping messages disappear where .groups is supplied, and the swirl and lubridate chunks are updated to match the .Rmd changes above.]
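Patch 02's main change is passing `.groups = "keep"` to every `summarise()` call, which silences the "(re)grouping output" message that dplyr 1.0 prints by default. `.groups = "drop"` is the other common choice when the grouped result is not needed afterwards; a small sketch on made-up data:

```{r}
library(dplyr)

# Toy stand-in for the video data grouped by year.
d <- tibble(year = c(2018, 2018, 2019), key.points = c(3, 5, 4))

# "keep" retains the grouping structure on the result (what the patch uses);
# "drop" returns an ungrouped tibble. Both silence the regrouping message.
d %>% group_by(year) %>% summarise(mean_key = mean(key.points), .groups = "keep")
d %>% group_by(year) %>% summarise(mean_key = mean(key.points), .groups = "drop")
```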

      From 46fd1d8faeb1a7029f3b1b7c909e2f0f98a845e5 Mon Sep 17 00:00:00 2001 From: Nicole SCHLOSBERG Date: Mon, 5 Oct 2020 17:29:32 -0400 Subject: [PATCH 03/10] Uploading Assignment 2 --- Assignment 2-2020.Rmd | 38 +++++++++++++++------------ Assignment-2-2020.html | 59 ++++++++++++++++++++++-------------------- 2 files changed, 52 insertions(+), 45 deletions(-) diff --git a/Assignment 2-2020.Rmd b/Assignment 2-2020.Rmd index 981e874..98bdd9f 100644 --- a/Assignment 2-2020.Rmd +++ b/Assignment 2-2020.Rmd @@ -71,7 +71,7 @@ barplot(table1) #Create a data frame of the average total key points for each year and plot the two against each other as a lines -D3 <- D1 %>% group_by(year) %>% summarise(mean_key = mean(key.points), .groups = "keep") +D3 <- D1 %>% group_by(year) %>% summarise(mean_key = mean(key.points)) plot(D3$year, D3$mean_key, type = "l", lty = "dashed") @@ -96,26 +96,29 @@ pairs(D5) ```{r} #rnorm(100, 75, 15) creates a random sample with a mean of 75 and standard deviation of 15 -#filter() can be used to set max min value -#round rounds numbers to whole number values +#filter() can be used to set max min value and can only work with a data frame, for rows +#select() for columns +#round() rounds numbers to whole number values #sample() draws a random samples from the groups vector according to a uniform distribution score <- rnorm(100,75,15) hist(score,breaks = 30) S1 <- data.frame(score) -#library(dplyr) +#Top and tail the scores S1 <- filter(S1, score <= 100) hist(S1$score) -S2 <- data.frame(rep(100,5)) # +S2 <- data.frame(rep(100,5)) #repeat 100 5 times and names the column a random name that is not helpful so use the names() command to rename names(S2) <- "score" -S3 <- bind_rows(S1,S2) +S3 <- bind_rows(S1,S2) #must make sure that the names of the columns and the type match -interest <- c("sport", "music", "nature", "liturature") +#S3$score <- ifelse(S3$score >= 100, 100, S3$score) -S3$interest <- sample(interest, 100, replace = TRUE) +S3$score <-round(S3$score,0) +interest <- c("sport", "music", "nature", "liturature") +S3$interest <- sample(interest, 100, replace = TRUE) S3$stid <- seq(1,100,1) ``` @@ -231,10 +234,10 @@ The variables are: DF1 <- read.csv("swirl-data.csv", header = TRUE) #3 -DF2 <- DF1[,c(2,5,8)] +DF2<- select(DF1, hash, lesson_name, attempt) #4 -DF3 <- DF2 %>% group_by(lesson_name, hash) %>% summarise(sum_key = sum(attempt), .groups = "keep") +DF3 <- DF2 %>% group_by(hash, lesson_name) %>% summarise(sum_key = sum(attempt), .groups = "keep") ``` 5. On a scrap piece of paper draw what you think `DF3` would look like if all the lesson names were column names @@ -242,8 +245,10 @@ DF3 <- DF2 %>% group_by(lesson_name, hash) %>% summarise(sum_key = sum(attempt) 6. Convert `DF3` to this format ```{r} +#6 +#Get rid of the NAs so the next step does not throw error and add extra column of NAs +DF3 <- na.omit(DF3) DF3 <- spread(DF3, lesson_name, sum_key) -DF3 <- DF3[-c(2)] ``` 7. Create a new data frame from `DF1` called `DF4` that only includes the variables `hash`, `lesson_name` and `correct` @@ -252,7 +257,7 @@ DF3 <- DF3[-c(2)] ```{r} #7 -DF4 <- DF1[,c(2,4,8)] +DF4 <- select(DF1, hash, lesson_name, correct) #8 #Correct misspelled FALS at line 809 in swirl_data.csv and subsequent DF4 dataframe from DF1 @@ -266,20 +271,20 @@ DF4 <- DF4[complete.cases(DF4$correct),] #Converts logi to num so 0s and 1s instead of FALSE and TRUE DF4$correct <- as.numeric(DF4$correct) - ``` 9. 
Create a new data frame called `DF5` that provides a mean score for each student on each course ```{r} +#9 DF5 <- DF4 %>% group_by(hash, lesson_name) %>% summarise(mean_key = mean(correct), .groups = "keep") ``` 10. **Extra credit** Convert the `datetime` variable into month-day-year format and create a new data frame (`DF6`) that shows the average correct for each day ```{r} - -DF6 <- DF1[,c(7,4)] +#10 +DF6 <- select(DF1, hash, lesson_name, datetime, correct) #steps to get TRUE/FALSE to 1/0 DF6$correct <- ifelse(DF6$correct == "FALS", "FALSE", DF6$correct) @@ -288,13 +293,12 @@ DF6 <- DF6[complete.cases(DF6$correct),] DF6$correct <- as.numeric(DF6$correct) #Creating average correct for each day -DF6 <- DF6 %>% group_by(datetime, correct) %>% summarise(meanByDay = mean(correct), .groups = "keep") +DF6 <- DF6 %>% group_by(hash, datetime, correct) %>% summarise(meanByDay = mean(correct), .groups = "keep") #Convert 'datetime' to month-day-year by converting the parsed num*** #library(lubridate) #dateConverted <- mdy_hms(DF6$datetime) #DF6 <- separate_rows(DF6, DF6$datetime, sep = "0") - ``` Finally use the knitr function to generate an html document from your work. Commit, Push and Pull Request your work back to the main branch of the repository. Make sure you include both the .Rmd file and the .html file. diff --git a/Assignment-2-2020.html b/Assignment-2-2020.html index f417300..de9e430 100644 --- a/Assignment-2-2020.html +++ b/Assignment-2-2020.html @@ -434,9 +434,9 @@

[Diff hunks of the re-knitted Assignment-2-2020.html for patch 03: updated Part II comments, select()-based column picks, na.omit() before spread(), and the regrouped DF5/DF6 summaries, matching the .Rmd changes above.]
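For the extra-credit step, the committed `mdy_hms()` attempt could not parse the `datetime` column (the knitted output reports "All formats failed to parse"), which suggests the values are not month-day-year strings; they may be numeric Unix timestamps. A heavily hedged sketch, assuming epoch seconds purely for illustration; the `toy` values are invented, not taken from swirl-data.csv:

```{r}
library(dplyr)
library(lubridate)

# Invented epoch-second timestamps standing in for DF1$datetime.
toy <- tibble(datetime = c(1601510400, 1601510460, 1601596800),
              correct  = c(1, 0, 1))

# as_datetime() turns epoch seconds into POSIXct; format() gives the
# month-day-year label; grouping on the day alone yields one mean per day.
daily <- toy %>%
  mutate(day = format(as_datetime(datetime), "%m-%d-%Y")) %>%
  group_by(day) %>%
  summarise(meanByDay = mean(correct), .groups = "drop")
daily
```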

    DF6$correct <- as.numeric(DF6$correct) #Creating average correct for each day -DF6 <- DF6 %>% group_by(datetime, correct) %>% summarise(meanByDay = mean(correct), .groups = "keep") +DF6 <- DF6 %>% group_by(hash, datetime, correct) %>% summarise(meanByDay = mean(correct), .groups = "keep") #Convert 'datetime' to month-day-year by converting the parsed num*** #library(lubridate) From 2f61289d4f95a78bd4bfa34a7b4b046b7390bd6d Mon Sep 17 00:00:00 2001 From: Nicole Schlosberg <70542380+nicole-schlosberg@users.noreply.github.com> Date: Tue, 17 Aug 2021 17:45:58 -0400 Subject: [PATCH 04/10] Updating --- PrincipleComponentAnalysis.Rmd | 154 ++++++++ PrincipleComponentAnalysis.html | 647 ++++++++++++++++++++++++++++++++ README.md | 24 +- 3 files changed, 818 insertions(+), 7 deletions(-) create mode 100644 PrincipleComponentAnalysis.Rmd create mode 100644 PrincipleComponentAnalysis.html diff --git a/PrincipleComponentAnalysis.Rmd b/PrincipleComponentAnalysis.Rmd new file mode 100644 index 0000000..2df2b66 --- /dev/null +++ b/PrincipleComponentAnalysis.Rmd @@ -0,0 +1,154 @@ +--- +title: "Principle Component Analysis" +author: "Nicole Schlosberg" +date: "11/19/2020" +output: html_document +--- + +Data: The data comes from the Assistments online intelligent tutoring system (https://www.assistments.org/). It describes students working through online math problems. Each student has the following data associated with them: + + +## Part I +Uploading the data +```{r} +D1 <- read.csv("Assistments-confidence.csv", header=TRUE) +``` + +Create a correlation matrix of the relationships between the variables, including correlation coefficients for each pair of variables/features. + +```{r} +library(ggplot2) +library(GGally) +library(corrplot) + +ggpairs(D1, 2:8, progress = FALSE) #ggpairs() draws a correlation plot between all the columns you identify by number (second option, you don't need the first column as it is the student ID) and progress = FALSE stops a progress bar appearing as it renders your plot + +ggcorr(D1[,-1], method = c("everything", "pearson")) #ggcorr() doesn't have an explicit option to choose variables so we need to use matrix notation to drop the id variable. We then need to choose a "method" which determines how to treat missing values (here we choose to keep everything, and then which kind of correlation calculation to use, here we are using Pearson correlation, the other options are "kendall" or "spearman") + +#Note of what is strongly related to the outcome variable of interest, mean_correct. +``` + +Create a new data frame with the mean_correct variable removed, we want to keep that variable intact. The other variables will be included in our PCA. + +```{r} +library(dplyr) + +D2 <- select(D1,-id,-mean_correct) +``` + +Now run the PCA on the new data frame + +```{r} +pca <- prcomp(D2, scale. = TRUE) +``` + +Although princomp does not generate the eigenvalues directly for us, we can print a list of the standard deviation of the variance accounted for by each component. + +```{r} +pca$sdev + +#To convert this into variance accounted for we can square it, these numbers are proportional to the eigenvalue +pca$sdev^2 + +#A summary of our pca will give us the proportion of variance accounted for by each component +summary(pca) + +#We can look at this to get an idea of which components we should keep and which we should drop +plot(pca, type = "lines") +``` + +Decide which components you would drop and remove them from your data set. + +ANSWER: PC5 and PC6 would be components to drop. 
PC5 only has 12.20% of variance, meaning it only represents 12.20% of the variance. PC6 only has 8.93% of the variance, which means it only represents 8.93% of the variance. Since it is such small amount of the variance it can be removed. + + +## Part II + +```{r} +#Now create a data frame of the transformed data from your pca. +D3 <- data.frame(pca$x) + +#Attach the variable "mean_correct" from your original data frame to D3. +D3 <- data.frame(D3,D1$mean_correct) + +#Now re-run your correlation plots between the transformed data and mean_correct. If you had dropped some components would you have lost important information about mean_correct? +ggcorr(D3, method = c("everything", "pearson")) +``` + +If you had dropped some components would you have lost important information about mean_correct? + +ANSWER: You would lose important information about mean_correct. Components with some small variance representation still contribute some information even if small. PC6 which only makes up 8.93% of the variance is strongly correlated to mean_correct. This means that even though it is the best option to drop, we risk dropping something that correlates with the thing we are interested in. + +Now print out the loadings for the components you generated: + +```{r} +library(ggbiplot) +library(tidyr) + +pca$rotation + +#Examine the eigenvectors, notice that they are a little difficult to interpret. It is much easier to make sense of them if we make them proportional within each component +loadings <- abs(pca$rotation) #abs() will make all eigenvectors positive + +#Now examine your components and try to come up with substantive descriptions of what some might represent? +L1 <- as_tibble(loadings) +labels <- c("prior_prob_count","prior_percent_correct","problems_attempted","mean_hint","mean_attempt","mean_confidence") +L2 <- cbind(labels,L1) +L3 <- L2 %>% mutate(PC1 = PC1/sum(PC1)) %>% mutate(PC2 = PC2/sum(PC2)) %>% mutate(PC3 = PC3/sum(PC3)) %>% mutate(PC4 = PC4/sum(PC4)) %>% mutate(PC5 = PC5/sum(PC5)) %>% mutate(PC6 = PC6/sum(PC6)) %>% print + +#You can generate a biplot to help you, though these can be a bit confusing. They plot the transformed data by the first two components. Therefore, the axes represent the direction of maximum variance accounted for. Then mapped onto this point cloud are the original directions of the variables, depicted as red arrows. It is supposed to provide a visualization of which variables "go together". Variables that possibly represent the same underlying construct point in the same direction. +ggbiplot(pca) +#ggbiplot(pca,choices=c(3,4)) +#ggbiplot(pca,choices=c(5,6)) +``` + +Now examine your components and try to come up with substantive descriptions of what some might represent? + +ANSWER: Having the most variance of a specific category means that the component contributes the most to its variance. PC1 has the most variance in mean_hint (30.22% of variance), mean_attempt (25.86% of variance), and problems_attempted (21.74% of the variance). PC2 contributes the most to prior_percent_correct (44.68% of variance), prior_prob_count (25.08% of variance), and problems_attempted (17.34% of variance). PC3 contributes the most to mean_confidence (45.79% of variance), prior_prob_count (22.10% of variance), and problems_attempted (20.06% of variance). PC4 contributes the most to prior_prob_count (31.52% of variance) and mean_confidence (22.61% of variance). PC5 contributes the most to problems_attempted (30.39% of variance) and mean_attempt (35.77% of variance). 
PC6 only contributes the most to mean_hint (35.61% of variance). + + +## Part III + +Also in this repository is a data set collected from TC students (tc-program-combos.csv) that shows how many students thought that a TC program was related to another TC program. Students were shown three program names at a time and were asked which two of the three were most similar. + +```{r} +library(ggplot2) +library(GGally) +library(dplyr) + +R1 <- read.csv("tc-program-combos.csv",header=TRUE) + +#Organize the data +R3 <- R1 +rownames(R3)<-R3$program +R3 <- select(R3, -program) +R3 <- R3[order(rownames(R3)),] +R3 <- R3[,sort(colnames(R3))] + +#PCA on data +pca3 <- prcomp(R3, scale. = TRUE) +pca3$sdev +pca3$sdev^2 +summary(pca3) + +plot(pca3, type = "lines") + +#pca3$rotation +loadings3 <- abs(pca3$rotation) +``` + +Use PCA to look for components that represent related programs. Explain why you think there are relationships between these programs (explain the meaning of the components). + +ANSWER: Based off of the percentages of variance of the PCs, I have concluded that yes many of the PCs correspond to related classes. Below are the make up of the first 6 components, which compose 38.05% (cumulative proportion) of all the variance just within those 6 components. The remaining 61.95% is spread across the other 59 components, thus with 38.05% within 6 components and 61.95% spread across 59 components, the first components hold larger proportions (decreasing as they move away from PC1 towards PC67) of the variance. + +PC1: Change.Leadership (27.70%), Economics.and.Education (23.09%), Education.Policy (22.09%), Arts.Administration (21.93%), Politics (21.88%), School.Principals (21.75%), Social.Organizational.Psychology (21.19), Private.School.Leadership (20.48%), Cooperation.and.Conflict.Resolution (20.42%), and Leadership (20.06%) make up the highest percent variance within PC1. This simply means that these programs contribute their respective amounts to PC1. These programs are all related to leadership, organization, and administration concepts. + +PC2: Clinical.Psychology (25.31%), Neuroscience (25.29%), Kinesiology (25.15), Physiology (24.43%), Psychology (22.37%), Health.Education (22.13%), Behavioral.Analysis (21.26%), Nursing (21.21%), Physical.Education (21.08%), and Counseling.Psychology (19.57%) are all within PC2 with relatively high percentages of variance making up most of PC2. These programs are all related to health including mental health or education of the subjects. + +PC3: Design.and.Development.of.Digital.Games (31.52%), Cognitive.Science (31.04%), Mathematics (27.94%), Learning.Analytics (27.93), Education.Technology (26.90%), Creative.Technologies (26.10%), Instructional.Technology.and.Media (25.66%), and Measurement.Evaluation.and.Statistics (24.67) are all within PC3 with relatively high percentages of variance making up most of PC3. These programs are all related to technology, data science, and statistical measures. + +PC4: Linguistics (34.79), English.Education (34.07), Teaching.English (27.46), and Literacy (24.96) all are relatively high percentages of variance making up PC4 and all relate to learning language and reading. As you read down past Literacy and look at the other programs with high variance within PC4, they also are in some way related to learning. 
+
+PC5: History (32.73%) accounts for the largest share of PC5; the next closest program is Music (24.55%), which, judging by the program names, is unrelated to History, so PC5 is dominated by a single program that does not fit with the others in the component, and the gap between the top two is large.
+
+PC6: Science.Education (35.53%) and Higher.and.Postsecondary.Education (32.91%) make up the majority of the PC6 variance, and both are related to education.
\ No newline at end of file
diff --git a/PrincipleComponentAnalysis.html b/PrincipleComponentAnalysis.html
new file mode 100644
index 0000000..fd9a403
--- /dev/null
+++ b/PrincipleComponentAnalysis.html
@@ -0,0 +1,647 @@
    + + + + + + + +

    Data: The data comes from the Assistments online intelligent tutoring system (https://www.assistments.org/). It describes students working through online math problems. Each student has the following data associated with them:

    +
    +

    Part I

    +

    Uploading the data

    +
    D1 <- read.csv("Assistments-confidence.csv", header=TRUE)
    +

    Create a correlation matrix of the relationships between the variables, including correlation coefficients for each pair of variables/features.

    +
    library(ggplot2)
    +library(GGally)
    +
    ## Registered S3 method overwritten by 'GGally':
    +##   method from   
    +##   +.gg   ggplot2
    +
    library(corrplot)
    +
    ## corrplot 0.84 loaded
    +
    ggpairs(D1, 2:8, progress = FALSE) #ggpairs() draws a correlation plot between all the columns you identify by number (second option, you don't need the first column as it is the student ID) and progress = FALSE stops a progress bar appearing as it renders your plot
    +

    +
    ggcorr(D1[,-1], method = c("everything", "pearson")) #ggcorr() doesn't have an explicit option to choose variables so we need to use matrix notation to drop the id variable. We then need to choose a "method" which determines how to treat missing values (here we choose to keep everything, and then which kind of correlation calculation to use, here we are using Pearson correlation, the other options are "kendall" or "spearman")
    +

    +
    #Note of what is strongly related to the outcome variable of interest, mean_correct. 
    +

    Create a new data frame with the mean_correct variable removed, we want to keep that variable intact. The other variables will be included in our PCA.

    +
    library(dplyr)
    +
    ## 
    +## Attaching package: 'dplyr'
    +
    ## The following objects are masked from 'package:stats':
    +## 
    +##     filter, lag
    +
    ## The following objects are masked from 'package:base':
    +## 
    +##     intersect, setdiff, setequal, union
    +
    D2 <- select(D1,-id,-mean_correct)
    +

    Now run the PCA on the new data frame

    +
    pca <- prcomp(D2, scale. = TRUE)
    +

Although prcomp does not report the eigenvalues directly, we can print the standard deviation of each component, which reflects the variance it accounts for.

    +
    pca$sdev
    +
    ## [1] 1.2825140 1.0543565 1.0245688 0.9621486 0.8556715 0.7320146
    +
    #To convert this into variance accounted for we can square it, these numbers are proportional to the eigenvalue
    +pca$sdev^2
    +
    ## [1] 1.6448423 1.1116675 1.0497412 0.9257299 0.7321737 0.5358454
    +
    #A summary of our pca will give us the proportion of variance accounted for by each component
    +summary(pca)
    +
    ## Importance of components:
    +##                           PC1    PC2    PC3    PC4    PC5     PC6
    +## Standard deviation     1.2825 1.0544 1.0246 0.9621 0.8557 0.73201
    +## Proportion of Variance 0.2741 0.1853 0.1750 0.1543 0.1220 0.08931
    +## Cumulative Proportion  0.2741 0.4594 0.6344 0.7887 0.9107 1.00000
    +
    #We can look at this to get an idea of which components we should keep and which we should drop
    +plot(pca, type = "lines")
    +

    +

    Decide which components you would drop and remove them from your data set.

    +

ANSWER: PC5 and PC6 are the components I would drop. PC5 accounts for only 12.20% of the variance and PC6 for only 8.93%. Because each captures such a small share of the variance, they can be removed.
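One way to act on this decision (a minimal sketch, assuming the `pca` object fitted above; the name `D2_reduced` is just illustrative) is to keep only the scores of the retained components:

```{r}
#Sketch: keep the scores for PC1-PC4 and drop PC5 and PC6 from the transformed data
D2_reduced <- data.frame(pca$x[, 1:4])
head(D2_reduced)
```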

    +
    +
    +

    Part II

    +
    #Now create a data frame of the transformed data from your pca.
    +D3 <- data.frame(pca$x)
    +
    +#Attach the variable "mean_correct" from your original data frame to D3.
    +D3 <- data.frame(D3,D1$mean_correct)
    + 
    +#Now re-run your correlation plots between the transformed data and mean_correct. If you had dropped some components would you have lost important information about mean_correct?
    +ggcorr(D3, method = c("everything", "pearson")) 
    +

    +

    If you had dropped some components would you have lost important information about mean_correct?

    +

ANSWER: Yes, you would lose important information about mean_correct. Components that capture only a small share of the variance can still carry useful information. PC6, which accounts for only 8.93% of the variance, is strongly correlated with mean_correct, so even though it looks like the best candidate to drop, dropping it risks discarding a component that correlates with the very outcome we are interested in.
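As a concrete check, here is a quick sketch (assuming the `D3` data frame built above; `data.frame()` will have named the appended outcome column `D1.mean_correct`) that correlates each component with the outcome:

```{r}
#Sketch: correlation of each principal component with mean_correct
round(cor(D3[, 1:6], D3$D1.mean_correct), 2)
```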

    +

    Now print out the loadings for the components you generated:

    +
    library(ggbiplot)
    +
    ## Loading required package: plyr
    +
    ## ------------------------------------------------------------------------------
    +
    ## You have loaded plyr after dplyr - this is likely to cause problems.
    +## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
    +## library(plyr); library(dplyr)
    +
    ## ------------------------------------------------------------------------------
    +
    ## 
    +## Attaching package: 'plyr'
    +
    ## The following objects are masked from 'package:dplyr':
    +## 
    +##     arrange, count, desc, failwith, id, mutate, rename, summarise,
    +##     summarize
    +
    ## Loading required package: scales
    +
    ## Loading required package: grid
    +
    library(tidyr)
    +
    +pca$rotation
    +
    ##                               PC1         PC2         PC3        PC4
    +## prior_prob_count      -0.26034140  0.45818753 -0.40090679 -0.6897642
    +## prior_percent_correct  0.16840319  0.81617867  0.09267306  0.2640040
    +## problems_attempted    -0.45568733  0.31685183  0.36387724  0.3168141
    +## mean_hint             -0.63337594 -0.12501620 -0.08008842 -0.1122586
    +## mean_attempt          -0.54200011 -0.08510858 -0.04585364  0.3108682
    +## mean_confidence        0.03581325  0.02547483 -0.83051917  0.4948890
    +##                                PC5         PC6
    +## prior_prob_count      -0.007142834 -0.29280482
    +## prior_percent_correct  0.298843852  0.37134715
    +## problems_attempted    -0.592336569 -0.32911025
    +## mean_hint             -0.102302115  0.74412634
    +## mean_attempt           0.697232132 -0.33781385
    +## mean_confidence       -0.251357022 -0.01452143
    +
    #Examine the eigenvectors, notice that they are a little difficult to interpret. It is much easier to make sense of them if we make them proportional within each component
    +loadings <- abs(pca$rotation) #abs() will make all eigenvectors positive
    +
    +#Now examine your components and try to come up with substantive descriptions of what some might represent?
    +L1 <- as_tibble(loadings)
    +labels <- c("prior_prob_count","prior_percent_correct","problems_attempted","mean_hint","mean_attempt","mean_confidence") 
    +L2 <- cbind(labels,L1)
    +L3 <- L2 %>% mutate(PC1 = PC1/sum(PC1)) %>% mutate(PC2 = PC2/sum(PC2)) %>% mutate(PC3 = PC3/sum(PC3)) %>% mutate(PC4 = PC4/sum(PC4)) %>% mutate(PC5 = PC5/sum(PC5)) %>%  mutate(PC6 = PC6/sum(PC6)) %>% print
    +
    ##                  labels        PC1        PC2        PC3        PC4         PC5
    +## 1      prior_prob_count 0.12423113 0.25081186 0.22101700 0.31516257 0.003664468
    +## 2 prior_percent_correct 0.08035956 0.44677621 0.05108998 0.12062699 0.153315014
    +## 3    problems_attempted 0.21744737 0.17344469 0.20060288 0.14475664 0.303884750
    +## 4             mean_hint 0.30223780 0.06843387 0.04415217 0.05129246 0.052483764
    +## 5          mean_attempt 0.25863458 0.04658844 0.02527878 0.14203987 0.357699023
    +## 6       mean_confidence 0.01708956 0.01394492 0.45785919 0.22612148 0.128952980
    +##          PC6
    +## 1 0.14011651
    +## 2 0.17770154
    +## 3 0.15748983
    +## 4 0.35608836
    +## 5 0.16165478
    +## 6 0.00694897
    +
    #You can generate a biplot to help you, though these can be a bit confusing. They plot the transformed data by the first two components. Therefore, the axes represent the direction of maximum variance accounted for. Then mapped onto this point cloud are the original directions of the variables, depicted as red arrows. It is supposed to provide a visualization of which variables "go together". Variables that possibly represent the same underlying construct point in the same direction.  
    +ggbiplot(pca)
    +

    +
    #ggbiplot(pca,choices=c(3,4))
    +#ggbiplot(pca,choices=c(5,6))
    +

    Now examine your components and try to come up with substantive descriptions of what some might represent?

    +

ANSWER: The proportional loadings show which variables contribute most to each component. PC1 loads most heavily on mean_hint (30.22% of its loadings), mean_attempt (25.86%), and problems_attempted (21.74%). PC2 loads most heavily on prior_percent_correct (44.68%), prior_prob_count (25.08%), and problems_attempted (17.34%). PC3 loads most heavily on mean_confidence (45.79%), prior_prob_count (22.10%), and problems_attempted (20.06%). PC4 loads most heavily on prior_prob_count (31.52%) and mean_confidence (22.61%). PC5 loads most heavily on mean_attempt (35.77%) and problems_attempted (30.39%). PC6 loads most heavily on mean_hint (35.61%).
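The same ranking can be extracted programmatically. Here is a sketch in base R (assuming the proportional loadings table `L3` created above; `top_vars` is just an illustrative name) that lists the three largest shares per component:

```{r}
#Sketch: for each component, list the three variables with the largest proportional loading
top_vars <- lapply(paste0("PC", 1:6), function(pc) {
  ord <- order(L3[[pc]], decreasing = TRUE)
  data.frame(component = pc,
             variable = L3$labels[ord][1:3],
             share = round(L3[[pc]][ord][1:3], 3))
})
do.call(rbind, top_vars)
```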

    +
    +
    +

    Part III

    +

    Also in this repository is a data set collected from TC students (tc-program-combos.csv) that shows how many students thought that a TC program was related to another TC program. Students were shown three program names at a time and were asked which two of the three were most similar.

    +
    library(ggplot2)
    +library(GGally)
    +library(dplyr)
    +
    +R1 <- read.csv("tc-program-combos.csv",header=TRUE)
    +
    +#Organize the data
    +R3 <- R1
    +rownames(R3)<-R3$program
    +R3 <- select(R3, -program)
    +R3 <- R3[order(rownames(R3)),]
    +R3 <- R3[,sort(colnames(R3))]
    +
    +#PCA on data
    +pca3 <- prcomp(R3, scale. = TRUE)
    +pca3$sdev
    +
    ##  [1] 2.66699514 2.33303087 2.03824332 1.80893489 1.71451092 1.60411744
    +##  [7] 1.58798960 1.49222150 1.46424566 1.39138869 1.33520786 1.32516917
    +## [13] 1.31212863 1.26312253 1.25366254 1.22338737 1.21896028 1.18649390
    +## [19] 1.13127469 1.12814038 1.10432926 1.06319093 1.01168384 0.99665812
    +## [25] 0.96528383 0.95048841 0.93256896 0.90507518 0.85160824 0.83479067
    +## [31] 0.81879538 0.78538963 0.76079365 0.73350908 0.72278124 0.67319166
    +## [37] 0.66343310 0.64839067 0.62448974 0.60331242 0.56846989 0.55769066
    +## [43] 0.51031628 0.49442626 0.47128286 0.44551299 0.43288829 0.41344476
    +## [49] 0.37259965 0.36653720 0.35015529 0.33278376 0.32799766 0.30414406
    +## [55] 0.28040415 0.27066834 0.23729873 0.21156010 0.17616906 0.16541514
    +## [61] 0.14778289 0.14204907 0.11092615 0.07054620 0.04430024 0.03588606
    +## [67] 0.01241193
    +
    pca3$sdev^2
    +
    ##  [1] 7.1128630864 5.4430330560 4.1544358466 3.2722454371 2.9395476996
    +##  [6] 2.5731927459 2.5217109546 2.2267250142 2.1440153558 1.9359624893
    +## [11] 1.7827800336 1.7560733413 1.7216815395 1.5954785267 1.5716697682
    +## [16] 1.4966766496 1.4858641675 1.4077677636 1.2797824290 1.2727007063
    +## [21] 1.2195431178 1.1303749458 1.0235041934 0.9933274149 0.9317728773
    +## [26] 0.9034282205 0.8696848576 0.8191610815 0.7252366024 0.6968754570
    +## [31] 0.6704258671 0.6168368672 0.5788069731 0.5380355701 0.5224127176
    +## [36] 0.4531870174 0.4401434846 0.4204104623 0.3899874305 0.3639858772
    +## [41] 0.3231580154 0.3110188729 0.2604227077 0.2444573271 0.2221075356
    +## [46] 0.1984818264 0.1873922684 0.1709365720 0.1388304984 0.1343495215
    +## [51] 0.1226087294 0.1107450296 0.1075824634 0.0925036076 0.0786264855
    +## [56] 0.0732613524 0.0563106886 0.0447576745 0.0310355381 0.0273621679
    +## [61] 0.0218397813 0.0201779378 0.0123046098 0.0049767662 0.0019625111
    +## [66] 0.0012878092 0.0001540561
    +
    summary(pca3)
    +
    ## Importance of components:
    +##                           PC1     PC2     PC3     PC4     PC5     PC6     PC7
    +## Standard deviation     2.6670 2.33303 2.03824 1.80893 1.71451 1.60412 1.58799
    +## Proportion of Variance 0.1062 0.08124 0.06201 0.04884 0.04387 0.03841 0.03764
    +## Cumulative Proportion  0.1062 0.18740 0.24941 0.29825 0.34212 0.38053 0.41816
    +##                            PC8    PC9    PC10    PC11    PC12   PC13    PC14
    +## Standard deviation     1.49222 1.4642 1.39139 1.33521 1.32517 1.3121 1.26312
    +## Proportion of Variance 0.03323 0.0320 0.02889 0.02661 0.02621 0.0257 0.02381
    +## Cumulative Proportion  0.45140 0.4834 0.51229 0.53890 0.56511 0.5908 0.61462
    +##                           PC15    PC16    PC17    PC18   PC19   PC20   PC21
    +## Standard deviation     1.25366 1.22339 1.21896 1.18649 1.1313 1.1281 1.1043
    +## Proportion of Variance 0.02346 0.02234 0.02218 0.02101 0.0191 0.0190 0.0182
    +## Cumulative Proportion  0.63808 0.66042 0.68260 0.70361 0.7227 0.7417 0.7599
    +##                           PC22    PC23    PC24    PC25    PC26    PC27    PC28
    +## Standard deviation     1.06319 1.01168 0.99666 0.96528 0.95049 0.93257 0.90508
    +## Proportion of Variance 0.01687 0.01528 0.01483 0.01391 0.01348 0.01298 0.01223
    +## Cumulative Proportion  0.77678 0.79205 0.80688 0.82079 0.83427 0.84725 0.85948
    +##                           PC29   PC30    PC31    PC32    PC33    PC34   PC35
    +## Standard deviation     0.85161 0.8348 0.81880 0.78539 0.76079 0.73351 0.7228
    +## Proportion of Variance 0.01082 0.0104 0.01001 0.00921 0.00864 0.00803 0.0078
    +## Cumulative Proportion  0.87030 0.8807 0.89071 0.89992 0.90856 0.91659 0.9244
    +##                           PC36    PC37    PC38    PC39    PC40    PC41    PC42
    +## Standard deviation     0.67319 0.66343 0.64839 0.62449 0.60331 0.56847 0.55769
    +## Proportion of Variance 0.00676 0.00657 0.00627 0.00582 0.00543 0.00482 0.00464
    +## Cumulative Proportion  0.93115 0.93772 0.94399 0.94981 0.95524 0.96007 0.96471
    +##                           PC43    PC44    PC45    PC46   PC47    PC48    PC49
    +## Standard deviation     0.51032 0.49443 0.47128 0.44551 0.4329 0.41344 0.37260
    +## Proportion of Variance 0.00389 0.00365 0.00332 0.00296 0.0028 0.00255 0.00207
    +## Cumulative Proportion  0.96860 0.97224 0.97556 0.97852 0.9813 0.98387 0.98594
    +##                           PC50    PC51    PC52    PC53    PC54    PC55    PC56
    +## Standard deviation     0.36654 0.35016 0.33278 0.32800 0.30414 0.28040 0.27067
    +## Proportion of Variance 0.00201 0.00183 0.00165 0.00161 0.00138 0.00117 0.00109
    +## Cumulative Proportion  0.98795 0.98978 0.99143 0.99304 0.99442 0.99559 0.99668
    +##                           PC57    PC58    PC59    PC60    PC61   PC62    PC63
    +## Standard deviation     0.23730 0.21156 0.17617 0.16542 0.14778 0.1420 0.11093
    +## Proportion of Variance 0.00084 0.00067 0.00046 0.00041 0.00033 0.0003 0.00018
    +## Cumulative Proportion  0.99752 0.99819 0.99866 0.99906 0.99939 0.9997 0.99987
    +##                           PC64    PC65    PC66    PC67
    +## Standard deviation     0.07055 0.04430 0.03589 0.01241
    +## Proportion of Variance 0.00007 0.00003 0.00002 0.00000
    +## Cumulative Proportion  0.99995 0.99998 1.00000 1.00000
    +
    plot(pca3, type = "lines")
    +

    +
    #pca3$rotation
    +loadings3 <- abs(pca3$rotation) 
    +

    Use PCA to look for components that represent related programs. Explain why you think there are relationships between these programs (explain the meaning of the components).

    +

ANSWER: Based on the loadings of the PCs, many of the components do correspond to groups of related programs. Below is the makeup of the first six components, which together account for 38.05% (cumulative proportion) of the variance; the remaining 61.95% is spread across the other 61 components, so the leading components each carry a much larger share of the variance, decreasing from PC1 toward PC67.

    +

PC1: Change.Leadership (27.70%), Economics.and.Education (23.09%), Education.Policy (22.09%), Arts.Administration (21.93%), Politics (21.88%), School.Principals (21.75%), Social.Organizational.Psychology (21.19%), Private.School.Leadership (20.48%), Cooperation.and.Conflict.Resolution (20.42%), and Leadership (20.06%) carry the largest loadings within PC1, meaning these programs contribute the most to this component. They are all related to leadership, organization, and administration.

    +

PC2: Clinical.Psychology (25.31%), Neuroscience (25.29%), Kinesiology (25.15%), Physiology (24.43%), Psychology (22.37%), Health.Education (22.13%), Behavioral.Analysis (21.26%), Nursing (21.21%), Physical.Education (21.08%), and Counseling.Psychology (19.57%) carry the largest loadings within PC2. These programs are all related to health, including mental health, or to education about those subjects.

    +

PC3: Design.and.Development.of.Digital.Games (31.52%), Cognitive.Science (31.04%), Mathematics (27.94%), Learning.Analytics (27.93%), Education.Technology (26.90%), Creative.Technologies (26.10%), Instructional.Technology.and.Media (25.66%), and Measurement.Evaluation.and.Statistics (24.67%) carry the largest loadings within PC3. These programs are all related to technology, data science, and statistical measurement.

    +

PC4: Linguistics (34.79%), English.Education (34.07%), Teaching.English (27.46%), and Literacy (24.96%) carry the largest loadings within PC4 and all relate to language learning and reading. Reading further down the loadings past Literacy, the other programs with high loadings on PC4 are also related to learning in some way.

    +

PC5: History (32.73%) contributes the most to PC5, with the next closest program being Music (24.55%). Since Music is, at least by name, unrelated to History, and the gap between the two is large, PC5 is dominated by a single program that is not clearly related to the other programs in the component.

    +

PC6: Science.Education (35.53%) and Higher.and.Postsecondary.Education (32.91%) make up the majority of PC6's loadings, and both are related to education.
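For reference, the loadings quoted above can be pulled out programmatically. This is a sketch (assuming `loadings3 <- abs(pca3$rotation)` from the chunk above; `prop3` and `top_programs` are illustrative names); `sweep()` rescales each column to proportions, matching the per-component normalization used in Part II:

```{r}
#Sketch: rescale each component's absolute loadings to proportions and list its top programs
prop3 <- sweep(loadings3, 2, colSums(loadings3), "/")
top_programs <- function(pc, n = 10) {
  sort(round(prop3[, pc], 4), decreasing = TRUE)[1:n]
}
top_programs("PC1")
top_programs("PC6")
```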

    +
    + + + + +
    + + + + + + + + + + + + + + + diff --git a/README.md b/README.md index 8ab360d..470dcb2 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,22 @@ -# Assignment 2 -### Data Wrangling and Visualization +# Principal Component Analysis -In Assignment 2 we will be practicing data manipulation including use of the tidyverse. +I used data from the Assistments Intelligent Tutoring system. This system gives students hints based on how they perform on math problems. I wanted to see if I can build a decision tree to help teachers decide which students to follow up with, based on students' performance in Assignments. I create three groups ("teacher should intervene", "teacher should monitor student progress" and "no action") based on students' previous use of the system and how many hints they use. I will be building a decision tree using the "party" package. The party package builds decision trees based on a set of statistical stopping rules. -The instructions to Assignment 2 are in the Assignment 2-2020.rmd file. Assignments are structured in three parts, in the first part you can just follow along with the code, in the second part you will need to apply the code, and in the third part is completely freestyle and you are expected to apply your new knowledge in a new way. +Examine the PrincipleComponentAnalysis.html for the results. -**Please complete as much as you can by midnight EDT, 10/05/20** +# Codebook +id - student id +prior_prob_count - The number of problems a student has done in the system prior to the surrent session +score - The score the student achieved in the current session +hints - The number of hints the student requested in the current session +hint.y - Whether or not the student asked for hints in the current session +complete - Whether or not the student completed the cirrent session +action - The action suggested by the system to a teacher about a given student based on their performance -Once you have finished, commit, push and pull your assignment back to the main branch. Include both the .Rmd file and the .html file. +- prior_percent_correct: The percentage of problems a student has answered correctly prior to this session +- problems_attempted: The number of problems the student has attempted in the current session +- mean_correct: The average number of correct answers a student made on their first attempt at problems in the current session +- mean_hint: The average number of hints a student asked for in the current session +- mean_attempt: The average number of attempts a student took to answer a problem in the current session +- mean_confidence: The average confidence each student has in their ability to answer the problems in the current session -Good luck! 
From 0db897c3f575545a43b109fac99c791127254687 Mon Sep 17 00:00:00 2001 From: Nicole Schlosberg <70542380+nicole-schlosberg@users.noreply.github.com> Date: Tue, 17 Aug 2021 17:46:09 -0400 Subject: [PATCH 05/10] Delete Assignment 2-2020.Rmd --- Assignment 2-2020.Rmd | 304 ------------------------------------------ 1 file changed, 304 deletions(-) delete mode 100644 Assignment 2-2020.Rmd diff --git a/Assignment 2-2020.Rmd b/Assignment 2-2020.Rmd deleted file mode 100644 index 98bdd9f..0000000 --- a/Assignment 2-2020.Rmd +++ /dev/null @@ -1,304 +0,0 @@ ---- -title: "Assignment 2" -author: "Nicole Schlosberg" -date: "September 29, 2020" -output: html_document ---- - - -## Part I - -## Data Wrangling -In the hackathon a project was proposed to collect data from student video watching, a sample of this data is available in the file video-data.csv. - -stid = student id -year = year student watched video -participation = whether or not the student opened the video -watch.time = how long the student watched the video for -confusion.points = how many times a student rewatched a section of a video -key,points = how many times a student skipped or increased the speed of a video - -```{r} -#Install the 'tidyverse' package or if that does not work, install the 'dplyr' and 'tidyr' packages. - -#Load the package(s) you just installed - -#library(tidyverse) -library(tidyr) -library(dplyr) - -D1 <- read.csv("video-data.csv", header = TRUE) - -#Create a data frame that only contains the years 2018 -D2 <- filter(D1, year == 2018) -``` - -## Histograms -```{r} -#Generate a histogram of the watch time for the year 2018 - -hist(D2$watch.time) - -#Change the number of breaks to 100, do you get the same impression? - -hist(D2$watch.time, breaks = 100) - -#Cut the y-axis off at 10 - -hist(D2$watch.time, breaks = 100, ylim = c(0,10)) - -#Restore the y-axis and change the breaks so that they are 0-5, 5-20, 20-25, 25-35 - -hist(D2$watch.time, breaks = c(0,5,20,25,35)) - -``` - -## Plots -```{r} -#Plot the number of confusion points against the watch time - -plot(D1$confusion.points, D1$watch.time) - -#Create two variables x & y -x <- c(1,3,2,7,6,4,4) -y <- c(2,4,2,3,2,4,3) - -#Create a table from x & y -table1 <- table(x,y) - -#Display the table as a Barplot -barplot(table1) - -#Create a data frame of the average total key points for each year and plot the two against each other as a lines - -D3 <- D1 %>% group_by(year) %>% summarise(mean_key = mean(key.points)) - -plot(D3$year, D3$mean_key, type = "l", lty = "dashed") - -#Create a boxplot of total enrollment for three students -D4 <- filter(D1, stid == 4|stid == 20| stid == 22) -#The drop levels command will remove all the schools from the variable with no data -D4 <- droplevels(D4) -boxplot(D4$watch.time~D4$stid, xlab = "Student", ylab = "Watch Time") -``` - -## Pairs -```{r} -#Use matrix notation to select columns 2, 5, 6, and 7 -D5 <- D1[,c(2,5,6,7)] -#Draw a matrix of plots for every combination of variables -pairs(D5) -``` - -## Part II - -1. Create a simulated data set containing 100 students, each with a score from 1-100 representing performance in an educational game. The scores should tend to cluster around 75. Also, each student should be given a classification that reflects one of four interest groups: sport, music, nature, literature. 
- -```{r} -#rnorm(100, 75, 15) creates a random sample with a mean of 75 and standard deviation of 15 -#filter() can be used to set max min value and can only work with a data frame, for rows -#select() for columns -#round() rounds numbers to whole number values -#sample() draws a random samples from the groups vector according to a uniform distribution - -score <- rnorm(100,75,15) -hist(score,breaks = 30) -S1 <- data.frame(score) - -#Top and tail the scores -S1 <- filter(S1, score <= 100) -hist(S1$score) - -S2 <- data.frame(rep(100,5)) #repeat 100 5 times and names the column a random name that is not helpful so use the names() command to rename -names(S2) <- "score" -S3 <- bind_rows(S1,S2) #must make sure that the names of the columns and the type match - -#S3$score <- ifelse(S3$score >= 100, 100, S3$score) - -S3$score <-round(S3$score,0) - -interest <- c("sport", "music", "nature", "liturature") -S3$interest <- sample(interest, 100, replace = TRUE) -S3$stid <- seq(1,100,1) -``` - -2. **Using base R commands, draw a histogram of the scores. Change the breaks in your histogram until you think they best represent your data. - -```{r} -hist(S3$score, breaks = 10) -``` - -3. Create a new variable that groups the scores according to the breaks in your histogram. - -```{r} -#cut() divides the range of scores into intervals and codes the values in scores according to which interval they fall. We use a vector called `letters` as the labels, `letters` is a vector made up of the letters of the alphabet. -label <- letters[1:10] -S3$breaks <- cut(S3$score, breaks = 10, labels = label) -``` - -4. Now using the colorbrewer package (RColorBrewer; http://colorbrewer2.org/#type=sequential&scheme=BuGn&n=3) design a pallette and assign it to the groups in your data on the histogram. - -```{r} -library(RColorBrewer) -#Let's look at the available palettes in RColorBrewer -display.brewer.all() -#The top section of palettes are sequential, the middle section are qualitative, and the lower section are diverging. -#Make RColorBrewer palette available to R and assign to your bins - -S3$colors <- brewer.pal(10, "BrBG") -#Use named palette in histogram -hist(S3$score, col = S3$colors) -``` - -5. Create a boxplot that visualizes the scores for each interest group and color each interest group a different color. - -```{r} -#Make a vector of the colors from RColorBrewer -interest.col <- brewer.pal(4, "BuPu") -boxplot(score ~ interest, S3, col = interest.col) -``` - -6. Now simulate a new variable that describes the number of logins that students made to the educational game. They should vary from 1-25. - -```{r} -S3$login <- sample(1:25, 100, replace = TRUE) -``` - -7. Plot the relationships between logins and scores. Give the plot a title and color the dots according to interest group. - -```{r} -plot(S3$score, S3$login, main = "Login vs. Score", xlab = "Score", ylab = "Login", col = interest.col) -``` - -8. R contains several inbuilt data sets, one of these in called AirPassengers. Plot a line graph of the the airline passengers over time using this data set. - -```{r} -plot(AirPassengers, type = "l", xlab = "Date", ylab = "Passenger numbers") -``` - -9. Using another inbuilt data set, iris, plot the relationships between all of the variables in the data set. Which of these relationships is it appropriate to run a correlation on? 
- -```{r} -plot(iris) -plot(iris$Sepal.Length,iris$Sepal.Width) -plot(iris$Petal.Length,iris$Petal.Width) -plot(iris$Petal.Length,iris$Sepal.Length) -plot(iris$Petal.Width,iris$Sepal.Width) -plot(iris$Petal.Width,iris$Sepal.Length) -plot(iris$Petal.Length,iris$Sepal.Width) -plot(iris$Species,iris$Sepal.Width, xlab = "Species", ylab = "Sepal Width") -plot(iris$Species,iris$Sepal.Length, xlab = "Species", ylab = "Sepal Length") -plot(iris$Species,iris$Petal.Width, xlab = "Species", ylab = "Petal Width") -plot(iris$Species,iris$Petal.Length, xlab = "Species", ylab = "Petal Length") - -#Which of these relationships is it appropriate to run a correlation on? -#Correlation between Sepal Length and Width -corOfSepalLW <- cor(iris$Sepal.Length, iris$Sepal.Width) -#Correlation between Petal Length and Width -corOfPetalLW <- cor(iris$Petal.Length, iris$Petal.Width) -#Correlation between Petal Length and Sepal Length -corOfLengthPS <- cor(iris$Petal.Length, iris$Sepal.Length) -#Correlation between Petal Width and Sepal Width -corOfWidthPS <- cor(iris$Petal.Width, iris$Sepal.Width) -``` - -# Part III - Analyzing Swirl - -## Data - -In this repository you will find data describing Swirl activity from the class so far this semester. Please connect RStudio to this repository. - -### Instructions - -1. Insert a new code block - -2. Create a data frame from the `swirl-data.csv` file called `DF1` - -The variables are: - -`course_name` - the name of the R course the student attempted -`lesson_name` - the lesson name -`question_number` - the question number attempted -`correct` - whether the question was answered correctly -`attempt` - how many times the student attempted the question -`skipped` - whether the student skipped the question -`datetime` - the date and time the student attempted the question -`hash` - anonymyzed student ID - -3. Create a new data frame that only includes the variables `hash`, `lesson_name` and `attempt` called `DF2` - -4. Use the `group_by` function to create a data frame that sums all the attempts for each `hash` by each `lesson_name` called `DF3` - -```{r} -#2 -DF1 <- read.csv("swirl-data.csv", header = TRUE) - -#3 -DF2<- select(DF1, hash, lesson_name, attempt) - -#4 -DF3 <- DF2 %>% group_by(hash, lesson_name) %>% summarise(sum_key = sum(attempt), .groups = "keep") -``` - -5. On a scrap piece of paper draw what you think `DF3` would look like if all the lesson names were column names - -6. Convert `DF3` to this format - -```{r} -#6 -#Get rid of the NAs so the next step does not throw error and add extra column of NAs -DF3 <- na.omit(DF3) -DF3 <- spread(DF3, lesson_name, sum_key) -``` - -7. Create a new data frame from `DF1` called `DF4` that only includes the variables `hash`, `lesson_name` and `correct` - -8. Convert the `correct` variable so that `TRUE` is coded as the **number** `1` and `FALSE` is coded as `0` - -```{r} -#7 -DF4 <- select(DF1, hash, lesson_name, correct) - -#8 -#Correct misspelled FALS at line 809 in swirl_data.csv and subsequent DF4 dataframe from DF1 -DF4$correct <- ifelse(DF4$correct == "FALS", "FALSE", DF4$correct) - -#Convert the chr that was created with last back to logi -DF4$correct <- type.convert(DF4$correct) - -#Get rid of the NAs so the next steps do not throw "NAs introduced by coercion" -DF4 <- DF4[complete.cases(DF4$correct),] - -#Converts logi to num so 0s and 1s instead of FALSE and TRUE -DF4$correct <- as.numeric(DF4$correct) -``` - -9. 
Create a new data frame called `DF5` that provides a mean score for each student on each course - -```{r} -#9 -DF5 <- DF4 %>% group_by(hash, lesson_name) %>% summarise(mean_key = mean(correct), .groups = "keep") -``` - -10. **Extra credit** Convert the `datetime` variable into month-day-year format and create a new data frame (`DF6`) that shows the average correct for each day - -```{r} -#10 -DF6 <- select(DF1, hash, lesson_name, datetime, correct) - -#steps to get TRUE/FALSE to 1/0 -DF6$correct <- ifelse(DF6$correct == "FALS", "FALSE", DF6$correct) -DF6$correct <- type.convert(DF6$correct) -DF6 <- DF6[complete.cases(DF6$correct),] -DF6$correct <- as.numeric(DF6$correct) - -#Creating average correct for each day -DF6 <- DF6 %>% group_by(hash, datetime, correct) %>% summarise(meanByDay = mean(correct), .groups = "keep") - -#Convert 'datetime' to month-day-year by converting the parsed num*** -#library(lubridate) -#dateConverted <- mdy_hms(DF6$datetime) -#DF6 <- separate_rows(DF6, DF6$datetime, sep = "0") -``` - -Finally use the knitr function to generate an html document from your work. Commit, Push and Pull Request your work back to the main branch of the repository. Make sure you include both the .Rmd file and the .html file. From 88c4c1d3cb064f4305b77cc87e60e65e5e2d66ee Mon Sep 17 00:00:00 2001 From: Nicole Schlosberg <70542380+nicole-schlosberg@users.noreply.github.com> Date: Tue, 17 Aug 2021 17:46:17 -0400 Subject: [PATCH 06/10] Delete Assignment-2-2020.html --- Assignment-2-2020.html | 701 ----------------------------------------- 1 file changed, 701 deletions(-) delete mode 100644 Assignment-2-2020.html diff --git a/Assignment-2-2020.html b/Assignment-2-2020.html deleted file mode 100644 index de9e430..0000000 --- a/Assignment-2-2020.html +++ /dev/null @@ -1,701 +0,0 @@ - - - - - - - - - - - - - - - -Assignment 2 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    - - - - - - - -
    -

    Part I

    -
    -
    -

    Data Wrangling

    -

    In the hackathon a project was proposed to collect data from student video watching, a sample of this data is available in the file video-data.csv.

    -

    stid = student id year = year student watched video participation = whether or not the student opened the video watch.time = how long the student watched the video for confusion.points = how many times a student rewatched a section of a video key,points = how many times a student skipped or increased the speed of a video

    -
    #Install the 'tidyverse' package or if that does not work, install the 'dplyr' and 'tidyr' packages.
    -
    -#Load the package(s) you just installed
    -
    -#library(tidyverse)
    -library(tidyr)
    -library(dplyr)
    -
    ## 
    -## Attaching package: 'dplyr'
    -
    ## The following objects are masked from 'package:stats':
    -## 
    -##     filter, lag
    -
    ## The following objects are masked from 'package:base':
    -## 
    -##     intersect, setdiff, setequal, union
    -
    D1 <- read.csv("video-data.csv", header = TRUE)
    -
    -#Create a data frame that only contains the years 2018
    -D2 <- filter(D1, year == 2018)
    -
    -
    -

    Histograms

    -
    #Generate a histogram of the watch time for the year 2018
    -
    -hist(D2$watch.time)
    -

    -
    #Change the number of breaks to 100, do you get the same impression?
    -
    -hist(D2$watch.time, breaks = 100)
    -

    -
    #Cut the y-axis off at 10
    -
    -hist(D2$watch.time, breaks = 100, ylim = c(0,10))
    -

    -
    #Restore the y-axis and change the breaks so that they are 0-5, 5-20, 20-25, 25-35
    -
    -hist(D2$watch.time, breaks = c(0,5,20,25,35))
    -

    -
    -
    -

    Plots

    -
    #Plot the number of confusion points against the watch time
    -
    -plot(D1$confusion.points, D1$watch.time)
    -

    -
    #Create two variables x & y
    -x <- c(1,3,2,7,6,4,4)
    -y <- c(2,4,2,3,2,4,3)
    -
    -#Create a table from x & y
    -table1 <- table(x,y)
    -
    -#Display the table as a Barplot
    -barplot(table1)
    -

    -
    #Create a data frame of the average total key points for each year and plot the two against each other as a lines
    -
    -D3 <- D1 %>% group_by(year) %>% summarise(mean_key = mean(key.points))
    -
    ## `summarise()` ungrouping output (override with `.groups` argument)
    -
    plot(D3$year, D3$mean_key, type = "l", lty = "dashed")
    -

    -
    #Create a boxplot of total enrollment for three students
    -D4 <- filter(D1, stid == 4|stid == 20| stid == 22)
    -#The drop levels command will remove all the schools from the variable with no data  
    -D4 <- droplevels(D4)
    -boxplot(D4$watch.time~D4$stid, xlab = "Student", ylab = "Watch Time")
    -

    -
    -
    -

    Pairs

    -
    #Use matrix notation to select columns 2, 5, 6, and 7
    -D5 <- D1[,c(2,5,6,7)]
    -#Draw a matrix of plots for every combination of variables
    -pairs(D5)
    -

    -
    -
    -

    Part II

    -
      -
    1. Create a simulated data set containing 100 students, each with a score from 1-100 representing performance in an educational game. The scores should tend to cluster around 75. Also, each student should be given a classification that reflects one of four interest groups: sport, music, nature, literature.
    2. -
    -
    #rnorm(100, 75, 15) creates a random sample with a mean of 75 and standard deviation of 15
    -#filter() can be used to set max min value and can only work with a data frame, for rows
    -#select() for columns
    -#round() rounds numbers to whole number values
    -#sample() draws a random samples from the groups vector according to a uniform distribution
    -
    -score <- rnorm(100,75,15)
    -hist(score,breaks = 30)
    -

    -
    S1 <- data.frame(score)
    -
    -#Top and tail the scores
    -S1 <- filter(S1, score <= 100)
    -hist(S1$score)
    -

    -
    S2 <- data.frame(rep(100,5)) #repeat 100 5 times and names the column a random name that is not helpful so use the names() command to rename
    -names(S2) <- "score"
    -S3 <- bind_rows(S1,S2) #must make sure that the names of the columns and the type match
    -
    -#S3$score <- ifelse(S3$score >= 100, 100, S3$score)
    -
    -S3$score <-round(S3$score,0)
    -
    -interest <- c("sport", "music", "nature", "liturature")
    -S3$interest <- sample(interest, 100, replace = TRUE)
    -S3$stid <- seq(1,100,1)
    -
      -
    1. **Using base R commands, draw a histogram of the scores. Change the breaks in your histogram until you think they best represent your data.
    2. -
    -
    hist(S3$score, breaks = 10)
    -

    -
      -
    1. Create a new variable that groups the scores according to the breaks in your histogram.
    2. -
    -
    #cut() divides the range of scores into intervals and codes the values in scores according to which interval they fall. We use a vector called `letters` as the labels, `letters` is a vector made up of the letters of the alphabet.
    -label <- letters[1:10]
    -S3$breaks <- cut(S3$score, breaks = 10, labels = label)
    -
      -
    1. Now using the colorbrewer package (RColorBrewer; http://colorbrewer2.org/#type=sequential&scheme=BuGn&n=3) design a pallette and assign it to the groups in your data on the histogram.
    2. -
    -
    library(RColorBrewer)
    -#Let's look at the available palettes in RColorBrewer
    -display.brewer.all()
    -

    -
    #The top section of palettes are sequential, the middle section are qualitative, and the lower section are diverging.
    -#Make RColorBrewer palette available to R and assign to your bins
    -
    -S3$colors <- brewer.pal(10, "BrBG")
    -#Use named palette in histogram
    -hist(S3$score, col = S3$colors)
    -

    -
      -
    1. Create a boxplot that visualizes the scores for each interest group and color each interest group a different color.
    2. -
    -
    #Make a vector of the colors from RColorBrewer
    -interest.col <- brewer.pal(4, "BuPu")
    -boxplot(score ~ interest, S3, col = interest.col)
    -

    -
      -
    1. Now simulate a new variable that describes the number of logins that students made to the educational game. They should vary from 1-25.
    2. -
    -
    S3$login <- sample(1:25, 100, replace = TRUE)
    -
      -
    1. Plot the relationships between logins and scores. Give the plot a title and color the dots according to interest group.
    2. -
    -
    plot(S3$score, S3$login, main = "Login vs. Score", xlab = "Score", ylab = "Login", col = interest.col) 
    -

    -
      -
    1. R contains several inbuilt data sets, one of these in called AirPassengers. Plot a line graph of the the airline passengers over time using this data set.
    2. -
    -
    plot(AirPassengers, type = "l", xlab = "Date", ylab = "Passenger numbers") 
    -

    -
      -
    1. Using another inbuilt data set, iris, plot the relationships between all of the variables in the data set. Which of these relationships is it appropriate to run a correlation on?
    2. -
    -
    plot(iris)
    -

    -
    plot(iris$Sepal.Length,iris$Sepal.Width)
    -

    -
    plot(iris$Petal.Length,iris$Petal.Width)
    -

    -
    plot(iris$Petal.Length,iris$Sepal.Length)
    -

    -
    plot(iris$Petal.Width,iris$Sepal.Width)
    -

    -
    plot(iris$Petal.Width,iris$Sepal.Length)
    -

    -
    plot(iris$Petal.Length,iris$Sepal.Width)
    -

    -
    plot(iris$Species,iris$Sepal.Width, xlab = "Species", ylab = "Sepal Width")
    -

    -
    plot(iris$Species,iris$Sepal.Length, xlab = "Species", ylab = "Sepal Length")
    -

    -
    plot(iris$Species,iris$Petal.Width, xlab = "Species", ylab = "Petal Width")
    -

    -
    plot(iris$Species,iris$Petal.Length, xlab = "Species", ylab = "Petal Length")
    -

    -
    #Which of these relationships is it appropriate to run a correlation on?
    -#Correlation between Sepal Length and Width
    -corOfSepalLW <- cor(iris$Sepal.Length, iris$Sepal.Width)
    -#Correlation between Petal Length and Width
    -corOfPetalLW <- cor(iris$Petal.Length, iris$Petal.Width)
    -#Correlation between Petal Length and Sepal Length
    -corOfLengthPS <- cor(iris$Petal.Length, iris$Sepal.Length)
    -#Correlation between Petal Width and Sepal Width
    -corOfWidthPS <- cor(iris$Petal.Width, iris$Sepal.Width)
    -
    -
    -

    Part III - Analyzing Swirl

    -
    -

    Data

    -

    In this repository you will find data describing Swirl activity from the class so far this semester. Please connect RStudio to this repository.

    -
    -

    Instructions

    -
      -
    1. Insert a new code block

    2. -
    3. Create a data frame from the swirl-data.csv file called DF1

    4. -
    -

    The variables are:

    -

    course_name - the name of the R course the student attempted
    -lesson_name - the lesson name
    -question_number - the question number attempted correct - whether the question was answered correctly
    -attempt - how many times the student attempted the question
    -skipped - whether the student skipped the question
    -datetime - the date and time the student attempted the question
    -hash - anonymyzed student ID

    -
      -
    1. Create a new data frame that only includes the variables hash, lesson_name and attempt called DF2

    2. -
    3. Use the group_by function to create a data frame that sums all the attempts for each hash by each lesson_name called DF3

    4. -
    -
    #2
    -DF1 <- read.csv("swirl-data.csv", header = TRUE)
    -
    -#3
    -DF2<- select(DF1, hash, lesson_name, attempt)
    -
    -#4
    -DF3 <-  DF2 %>% group_by(hash, lesson_name) %>% summarise(sum_key = sum(attempt), .groups = "keep")
    -
      -
    1. On a scrap piece of paper draw what you think DF3 would look like if all the lesson names were column names

    2. -
    3. Convert DF3 to this format

    4. -
    -
    #6
    -#Get rid of the NAs so the next step does not throw error and add extra column of NAs
    -DF3 <- na.omit(DF3)
    -DF3 <- spread(DF3, lesson_name, sum_key) 
    -
      -
    1. Create a new data frame from DF1 called DF4 that only includes the variables hash, lesson_name and correct

    2. -
    3. Convert the correct variable so that TRUE is coded as the number 1 and FALSE is coded as 0

    4. -
    -
    #7
    -DF4 <- select(DF1, hash, lesson_name, correct)
    -
    -#8
    -#Correct misspelled FALS at line 809 in swirl_data.csv and subsequent DF4 dataframe from DF1
    -DF4$correct <- ifelse(DF4$correct == "FALS", "FALSE", DF4$correct)
    -
    -#Convert the chr that was created with last back to logi
    -DF4$correct <- type.convert(DF4$correct) 
    -
    -#Get rid of the NAs so the next steps do not throw "NAs introduced by coercion"
    -DF4 <- DF4[complete.cases(DF4$correct),]
    -
    -#Converts logi to num so 0s and 1s instead of FALSE and TRUE
    -DF4$correct <- as.numeric(DF4$correct) 
    -
      -
    1. Create a new data frame called DF5 that provides a mean score for each student on each course
    2. -
    -
    #9
    -DF5 <- DF4 %>% group_by(hash, lesson_name) %>% summarise(mean_key = mean(correct), .groups = "keep")
    -
      -
    1. Extra credit Convert the datetime variable into month-day-year format and create a new data frame (DF6) that shows the average correct for each day
    2. -
    -
    #10
    -DF6 <- select(DF1, hash, lesson_name, datetime, correct)
    -
    -#steps to get TRUE/FALSE to 1/0
    -DF6$correct <- ifelse(DF6$correct == "FALS", "FALSE", DF6$correct)
    -DF6$correct <- type.convert(DF6$correct) 
    -DF6 <- DF6[complete.cases(DF6$correct),]
    -DF6$correct <- as.numeric(DF6$correct) 
    -
    -#Creating average correct for each day
    -DF6 <- DF6 %>% group_by(hash, datetime, correct) %>% summarise(meanByDay = mean(correct), .groups = "keep")
    -
    -#Convert 'datetime' to month-day-year by converting the parsed num***
    -#library(lubridate)
    -#dateConverted <- mdy_hms(DF6$datetime)
    -#DF6 <- separate_rows(DF6, DF6$datetime, sep = "0")
    -

    Finally use the knitr function to generate an html document from your work. Commit, Push and Pull Request your work back to the main branch of the repository. Make sure you include both the .Rmd file and the .html file.

    -
    -
    -
    - - - - -
    - - - - - - - - - - - - - - - From c5dd65d4ac9ae3db070b6baeb677c1d470d1ba6e Mon Sep 17 00:00:00 2001 From: Nicole Schlosberg <70542380+nicole-schlosberg@users.noreply.github.com> Date: Tue, 17 Aug 2021 17:47:25 -0400 Subject: [PATCH 07/10] Fixing Upload --- Data-Wrangling-Visualization.Rmd | 285 ++++++++++++ Data-Wrangling-Visualization.Rproj | 13 + Data-Wrangling-Visualization.html | 689 +++++++++++++++++++++++++++++ README.md | 24 +- 4 files changed, 994 insertions(+), 17 deletions(-) create mode 100644 Data-Wrangling-Visualization.Rmd create mode 100644 Data-Wrangling-Visualization.Rproj create mode 100644 Data-Wrangling-Visualization.html diff --git a/Data-Wrangling-Visualization.Rmd b/Data-Wrangling-Visualization.Rmd new file mode 100644 index 0000000..94cf740 --- /dev/null +++ b/Data-Wrangling-Visualization.Rmd @@ -0,0 +1,285 @@ +--- +title: "Data-Wrangling-Visualization" +author: "Nicole Schlosberg" +date: "September 29, 2020" +output: html_document +--- + + +## Part I + +## Data Wrangling +In the hackathon a project was proposed to collect data from student video watching, a sample of this data is available in the file video-data.csv. + +```{r} +#library(tidyverse) +library(tidyr) +library(dplyr) + +D1 <- read.csv("video-data.csv", header = TRUE) + +#Create a data frame that only contains the years 2018 +D2 <- filter(D1, year == 2018) +``` + +## Histograms +```{r} +#Generate a histogram of the watch time for the year 2018 +hist(D2$watch.time) + +#Change the number of breaks to 100, do you get the same impression? +hist(D2$watch.time, breaks = 100) + +#Cut the y-axis off at 10 +hist(D2$watch.time, breaks = 100, ylim = c(0,10)) + +#Restore the y-axis and change the breaks so that they are 0-5, 5-20, 20-25, 25-35 +hist(D2$watch.time, breaks = c(0,5,20,25,35)) +``` + +## Plots +```{r} +#Plot the number of confusion points against the watch time +plot(D1$confusion.points, D1$watch.time) + +#Create two variables x & y +x <- c(1,3,2,7,6,4,4) +y <- c(2,4,2,3,2,4,3) + +#Create a table from x & y +table1 <- table(x,y) + +#Display the table as a Barplot +barplot(table1) + +#Create a data frame of the average total key points for each year and plot the two against each other as a lines +D3 <- D1 %>% group_by(year) %>% summarise(mean_key = mean(key.points)) +plot(D3$year, D3$mean_key, type = "l", lty = "dashed") + +#Create a boxplot of total enrollment for three students +D4 <- filter(D1, stid == 4|stid == 20| stid == 22) +#The drop levels command will remove all the schools from the variable with no data +D4 <- droplevels(D4) +boxplot(D4$watch.time~D4$stid, xlab = "Student", ylab = "Watch Time") +``` + +## Pairs +```{r} +#Use matrix notation to select columns 2, 5, 6, and 7 +D5 <- D1[,c(2,5,6,7)] +#Draw a matrix of plots for every combination of variables +pairs(D5) +``` + +## Part II + +1. Create a simulated data set containing 100 students, each with a score from 1-100 representing performance in an educational game. The scores should tend to cluster around 75. Also, each student should be given a classification that reflects one of four interest groups: sport, music, nature, literature. 
+ +```{r} +#rnorm(100, 75, 15) creates a random sample with a mean of 75 and standard deviation of 15 +#filter() can be used to set max min value and can only work with a data frame, for rows +#select() for columns +#round() rounds numbers to whole number values +#sample() draws a random samples from the groups vector according to a uniform distribution + +score <- rnorm(100,75,15) +hist(score,breaks = 30) +S1 <- data.frame(score) + +#Top and tail the scores +S1 <- filter(S1, score <= 100) +hist(S1$score) + +S2 <- data.frame(rep(100,5)) #repeat 100 5 times and names the column a random name that is not helpful so use the names() command to rename +names(S2) <- "score" +S3 <- bind_rows(S1,S2) #must make sure that the names of the columns and the type match + +#S3$score <- ifelse(S3$score >= 100, 100, S3$score) + +S3$score <-round(S3$score,0) + +interest <- c("sport", "music", "nature", "liturature") +S3$interest <- sample(interest, 100, replace = TRUE) +S3$stid <- seq(1,100,1) +``` + +2. **Using base R commands, draw a histogram of the scores. Change the breaks in your histogram until you think they best represent your data. + +```{r} +hist(S3$score, breaks = 10) +``` + +3. Create a new variable that groups the scores according to the breaks in your histogram. + +```{r} +#cut() divides the range of scores into intervals and codes the values in scores according to which interval they fall. We use a vector called `letters` as the labels, `letters` is a vector made up of the letters of the alphabet. +label <- letters[1:10] +S3$breaks <- cut(S3$score, breaks = 10, labels = label) +``` + +4. Now using the colorbrewer package (RColorBrewer; http://colorbrewer2.org/#type=sequential&scheme=BuGn&n=3) design a pallette and assign it to the groups in your data on the histogram. + +```{r} +library(RColorBrewer) +#Let's look at the available palettes in RColorBrewer +display.brewer.all() +#The top section of palettes are sequential, the middle section are qualitative, and the lower section are diverging. +#Make RColorBrewer palette available to R and assign to your bins + +S3$colors <- brewer.pal(10, "BrBG") +#Use named palette in histogram +hist(S3$score, col = S3$colors) +``` + +5. Create a boxplot that visualizes the scores for each interest group and color each interest group a different color. + +```{r} +#Make a vector of the colors from RColorBrewer +interest.col <- brewer.pal(4, "BuPu") +boxplot(score ~ interest, S3, col = interest.col) +``` + +6. Now simulate a new variable that describes the number of logins that students made to the educational game. They should vary from 1-25. + +```{r} +S3$login <- sample(1:25, 100, replace = TRUE) +``` + +7. Plot the relationships between logins and scores. Give the plot a title and color the dots according to interest group. + +```{r} +plot(S3$score, S3$login, main = "Login vs. Score", xlab = "Score", ylab = "Login", col = interest.col) +``` + +8. R contains several inbuilt data sets, one of these in called AirPassengers. Plot a line graph of the the airline passengers over time using this data set. + +```{r} +plot(AirPassengers, type = "l", xlab = "Date", ylab = "Passenger numbers") +``` + +9. Using another inbuilt data set, iris, plot the relationships between all of the variables in the data set. Which of these relationships is it appropriate to run a correlation on? 
+ +```{r} +plot(iris) +plot(iris$Sepal.Length,iris$Sepal.Width) +plot(iris$Petal.Length,iris$Petal.Width) +plot(iris$Petal.Length,iris$Sepal.Length) +plot(iris$Petal.Width,iris$Sepal.Width) +plot(iris$Petal.Width,iris$Sepal.Length) +plot(iris$Petal.Length,iris$Sepal.Width) +plot(iris$Species,iris$Sepal.Width, xlab = "Species", ylab = "Sepal Width") +plot(iris$Species,iris$Sepal.Length, xlab = "Species", ylab = "Sepal Length") +plot(iris$Species,iris$Petal.Width, xlab = "Species", ylab = "Petal Width") +plot(iris$Species,iris$Petal.Length, xlab = "Species", ylab = "Petal Length") + +#Which of these relationships is it appropriate to run a correlation on? +#Correlation between Sepal Length and Width +corOfSepalLW <- cor(iris$Sepal.Length, iris$Sepal.Width) +#Correlation between Petal Length and Width +corOfPetalLW <- cor(iris$Petal.Length, iris$Petal.Width) +#Correlation between Petal Length and Sepal Length +corOfLengthPS <- cor(iris$Petal.Length, iris$Sepal.Length) +#Correlation between Petal Width and Sepal Width +corOfWidthPS <- cor(iris$Petal.Width, iris$Sepal.Width) +``` + +# Part III - Analyzing Swirl + +## Data + +In this repository you will find data describing Swirl activity from the class so far this semester. Please connect RStudio to this repository. + +### Instructions + +1. Insert a new code block + +2. Create a data frame from the `swirl-data.csv` file called `DF1` + +The variables are: + +`course_name` - the name of the R course the student attempted +`lesson_name` - the lesson name +`question_number` - the question number attempted +`correct` - whether the question was answered correctly +`attempt` - how many times the student attempted the question +`skipped` - whether the student skipped the question +`datetime` - the date and time the student attempted the question +`hash` - anonymyzed student ID + +3. Create a new data frame that only includes the variables `hash`, `lesson_name` and `attempt` called `DF2` + +4. Use the `group_by` function to create a data frame that sums all the attempts for each `hash` by each `lesson_name` called `DF3` + +```{r} +#2 +DF1 <- read.csv("swirl-data.csv", header = TRUE) + +#3 +DF2<- select(DF1, hash, lesson_name, attempt) + +#4 +DF3 <- DF2 %>% group_by(hash, lesson_name) %>% summarise(sum_key = sum(attempt), .groups = "keep") +``` + +5. On a scrap piece of paper draw what you think `DF3` would look like if all the lesson names were column names + +6. Convert `DF3` to this format + +```{r} +#6 +#Get rid of the NAs so the next step does not throw error and add extra column of NAs +DF3 <- na.omit(DF3) +DF3 <- spread(DF3, lesson_name, sum_key) +``` + +7. Create a new data frame from `DF1` called `DF4` that only includes the variables `hash`, `lesson_name` and `correct` + +8. Convert the `correct` variable so that `TRUE` is coded as the **number** `1` and `FALSE` is coded as `0` + +```{r} +#7 +DF4 <- select(DF1, hash, lesson_name, correct) + +#8 +#Correct misspelled FALS at line 809 in swirl_data.csv and subsequent DF4 dataframe from DF1 +DF4$correct <- ifelse(DF4$correct == "FALS", "FALSE", DF4$correct) + +#Convert the chr that was created with last back to logi +DF4$correct <- type.convert(DF4$correct) + +#Get rid of the NAs so the next steps do not throw "NAs introduced by coercion" +DF4 <- DF4[complete.cases(DF4$correct),] + +#Converts logi to num so 0s and 1s instead of FALSE and TRUE +DF4$correct <- as.numeric(DF4$correct) +``` + +9. 
Create a new data frame called `DF5` that provides a mean score for each student on each course + +```{r} +#9 +DF5 <- DF4 %>% group_by(hash, lesson_name) %>% summarise(mean_key = mean(correct), .groups = "keep") +``` + +10. Convert the `datetime` variable into month-day-year format and create a new data frame (`DF6`) that shows the average correct for each day + +```{r} +#10 +DF6 <- select(DF1, hash, lesson_name, datetime, correct) + +#steps to get TRUE/FALSE to 1/0 +DF6$correct <- ifelse(DF6$correct == "FALS", "FALSE", DF6$correct) +DF6$correct <- type.convert(DF6$correct) +DF6 <- DF6[complete.cases(DF6$correct),] +DF6$correct <- as.numeric(DF6$correct) + +#Creating average correct for each day +DF6 <- DF6 %>% group_by(hash, datetime, correct) %>% summarise(meanByDay = mean(correct), .groups = "keep") + +#Convert 'datetime' to month-day-year by converting the parsed num*** +#library(lubridate) +#dateConverted <- mdy_hms(DF6$datetime) +#DF6 <- separate_rows(DF6, DF6$datetime, sep = "0") +``` + + diff --git a/Data-Wrangling-Visualization.Rproj b/Data-Wrangling-Visualization.Rproj new file mode 100644 index 0000000..8e3c2eb --- /dev/null +++ b/Data-Wrangling-Visualization.Rproj @@ -0,0 +1,13 @@ +Version: 1.0 + +RestoreWorkspace: Default +SaveWorkspace: Default +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 2 +Encoding: UTF-8 + +RnwWeave: Sweave +LaTeX: pdfLaTeX diff --git a/Data-Wrangling-Visualization.html b/Data-Wrangling-Visualization.html new file mode 100644 index 0000000..cfd95da --- /dev/null +++ b/Data-Wrangling-Visualization.html @@ -0,0 +1,689 @@ + + + + + + + + + + + + + + + +Data-Wrangling-Visualization + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + + + + + + + +
    +

    Part I

    +
    +
    +

    Data Wrangling

    +

    In the hackathon a project was proposed to collect data from student video watching, a sample of this data is available in the file video-data.csv.

    +
    #library(tidyverse)
    +library(tidyr)
    +library(dplyr)
    +
    ## 
    +## Attaching package: 'dplyr'
    +
    ## The following objects are masked from 'package:stats':
    +## 
    +##     filter, lag
    +
    ## The following objects are masked from 'package:base':
    +## 
    +##     intersect, setdiff, setequal, union
    +
    D1 <- read.csv("video-data.csv", header = TRUE)
    +
    +#Create a data frame that only contains the years 2018
    +D2 <- filter(D1, year == 2018)
    +
    +
    +

    Histograms

    +
    #Generate a histogram of the watch time for the year 2018
    +hist(D2$watch.time)
    +

    +
    #Change the number of breaks to 100, do you get the same impression?
    +hist(D2$watch.time, breaks = 100)
    +

    +
    #Cut the y-axis off at 10
    +hist(D2$watch.time, breaks = 100, ylim = c(0,10))
    +

    +
    #Restore the y-axis and change the breaks so that they are 0-5, 5-20, 20-25, 25-35
    +hist(D2$watch.time, breaks = c(0,5,20,25,35))
    +

    +
    +
    +

    Plots

    +
    #Plot the number of confusion points against the watch time
    +plot(D1$confusion.points, D1$watch.time)
    +

    +
    #Create two variables x & y
    +x <- c(1,3,2,7,6,4,4)
    +y <- c(2,4,2,3,2,4,3)
    +
    +#Create a table from x & y
    +table1 <- table(x,y)
    +
    +#Display the table as a Barplot
    +barplot(table1)
    +

    +
#Create a data frame of the average total key points for each year and plot the two against each other as a line
    +D3 <- D1 %>% group_by(year) %>% summarise(mean_key = mean(key.points))
    +
    ## `summarise()` ungrouping output (override with `.groups` argument)
    +
    plot(D3$year, D3$mean_key, type = "l", lty = "dashed")
    +

    +
#Create a boxplot of watch time for three students
    +D4 <- filter(D1, stid == 4|stid == 20| stid == 22)
+#The droplevels command will remove the unused levels (the students with no data) from the variable
    +D4 <- droplevels(D4)
    +boxplot(D4$watch.time~D4$stid, xlab = "Student", ylab = "Watch Time")
    +

    +
    +
    +

    Pairs

    +
    #Use matrix notation to select columns 2, 5, 6, and 7
    +D5 <- D1[,c(2,5,6,7)]
    +#Draw a matrix of plots for every combination of variables
    +pairs(D5)
    +

    +
    +
    +

    Part II

    +
      +
1. Create a simulated data set containing 100 students, each with a score from 1-100 representing performance in an educational game. The scores should tend to cluster around 75. Also, each student should be given a classification that reflects one of four interest groups: sport, music, nature, literature.
    +
    #rnorm(100, 75, 15) creates a random sample with a mean of 75 and standard deviation of 15
+#filter() can be used to set a max/min value; it only works on a data frame and picks out rows
+#select() picks out columns
    +#round() rounds numbers to whole number values
+#sample() draws random samples from the groups vector according to a uniform distribution
    +
    +score <- rnorm(100,75,15)
    +hist(score,breaks = 30)
    +

    +
    S1 <- data.frame(score)
    +
    +#Top and tail the scores
    +S1 <- filter(S1, score <= 100)
    +hist(S1$score)
    +

    +
S2 <- data.frame(rep(100,5)) #repeat 100 five times; data.frame() gives the column an unhelpful default name, so rename it with names()
    +names(S2) <- "score"
    +S3 <- bind_rows(S1,S2) #must make sure that the names of the columns and the type match
    +
    +#S3$score <- ifelse(S3$score >= 100, 100, S3$score)
    +
    +S3$score <-round(S3$score,0)
    +
    +interest <- c("sport", "music", "nature", "liturature")
    +S3$interest <- sample(interest, 100, replace = TRUE)
    +S3$stid <- seq(1,100,1)
    +
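An alternative way to keep the simulated scores inside 1-100 is to cap them with base R's pmin() and pmax() rather than filtering out the values above 100 and appending replacement rows. A minimal sketch (the name `capped` is illustrative only, not part of the chunk above):

```{r}
#Cap simulated scores at 1 and 100 instead of dropping and replacing rows
capped <- round(rnorm(100, 75, 15))
capped <- pmax(pmin(capped, 100), 1)
hist(capped, breaks = 10)
```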
      +
2. Using base R commands, draw a histogram of the scores. Change the breaks in your histogram until you think they best represent your data.
    +
    hist(S3$score, breaks = 10)
    +

    +
      +
3. Create a new variable that groups the scores according to the breaks in your histogram.
    +
    #cut() divides the range of scores into intervals and codes the values in scores according to which interval they fall. We use a vector called `letters` as the labels, `letters` is a vector made up of the letters of the alphabet.
    +label <- letters[1:10]
    +S3$breaks <- cut(S3$score, breaks = 10, labels = label)
    +
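A quick way to check that the grouping variable behaves as intended is to tabulate it; this sketch only assumes the `breaks` column created just above:

```{r}
#Count how many scores fall into each labelled interval
table(S3$breaks)
```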
      +
4. Now using the colorbrewer package (RColorBrewer; http://colorbrewer2.org/#type=sequential&scheme=BuGn&n=3) design a palette and assign it to the groups in your data on the histogram.
    +
    library(RColorBrewer)
    +#Let's look at the available palettes in RColorBrewer
    +display.brewer.all()
    +

    +
#The top section of palettes is sequential, the middle section is qualitative, and the lower section is diverging.
    +#Make RColorBrewer palette available to R and assign to your bins
    +
    +S3$colors <- brewer.pal(10, "BrBG")
    +#Use named palette in histogram
    +hist(S3$score, col = S3$colors)
    +

    +
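Since brewer.pal() returns one colour per bin rather than one per student, another way to colour the histogram (a sketch, not a replacement for the answer above) is to hand the palette straight to hist() and ask for roughly one break per colour:

```{r}
#breaks = 10 is only a suggestion to hist(), so the colours recycle if it picks a different bar count
hist(S3$score, breaks = 10, col = brewer.pal(10, "BrBG"))
```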
      +
5. Create a boxplot that visualizes the scores for each interest group and color each interest group a different color.
    +
    #Make a vector of the colors from RColorBrewer
    +interest.col <- brewer.pal(4, "BuPu")
    +boxplot(score ~ interest, S3, col = interest.col)
    +

    +
      +
6. Now simulate a new variable that describes the number of logins that students made to the educational game. They should vary from 1-25.
    +
    S3$login <- sample(1:25, 100, replace = TRUE)
    +
      +
7. Plot the relationships between logins and scores. Give the plot a title and color the dots according to interest group.
    +
    plot(S3$score, S3$login, main = "Login vs. Score", xlab = "Score", ylab = "Login", col = interest.col) 
    +

    +
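If the intent is for each dot's colour to reflect its interest group, one option is to index the palette by the interest factor; `group.col` below is a helper name used only for this sketch:

```{r}
#Index the 4-colour palette by the factor codes so every point gets its group's colour
group.col <- interest.col[as.factor(S3$interest)]
plot(S3$score, S3$login, main = "Login vs. Score", xlab = "Score", ylab = "Login",
     pch = 19, col = group.col)
legend("topright", legend = levels(as.factor(S3$interest)), col = interest.col, pch = 19)
```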
      +
8. R contains several inbuilt data sets; one of these is called AirPassengers. Plot a line graph of the airline passengers over time using this data set.
    +
    plot(AirPassengers, type = "l", xlab = "Date", ylab = "Passenger numbers") 
    +

    +
      +
9. Using another inbuilt data set, iris, plot the relationships between all of the variables in the data set. Which of these relationships is it appropriate to run a correlation on?
    +
    plot(iris)
    +

    +
    plot(iris$Sepal.Length,iris$Sepal.Width)
    +

    +
    plot(iris$Petal.Length,iris$Petal.Width)
    +

    +
    plot(iris$Petal.Length,iris$Sepal.Length)
    +

    +
    plot(iris$Petal.Width,iris$Sepal.Width)
    +

    +
    plot(iris$Petal.Width,iris$Sepal.Length)
    +

    +
    plot(iris$Petal.Length,iris$Sepal.Width)
    +

    +
    plot(iris$Species,iris$Sepal.Width, xlab = "Species", ylab = "Sepal Width")
    +

    +
    plot(iris$Species,iris$Sepal.Length, xlab = "Species", ylab = "Sepal Length")
    +

    +
    plot(iris$Species,iris$Petal.Width, xlab = "Species", ylab = "Petal Width")
    +

    +
    plot(iris$Species,iris$Petal.Length, xlab = "Species", ylab = "Petal Length")
    +

    +
    #Which of these relationships is it appropriate to run a correlation on?
    +#Correlation between Sepal Length and Width
    +corOfSepalLW <- cor(iris$Sepal.Length, iris$Sepal.Width)
    +#Correlation between Petal Length and Width
    +corOfPetalLW <- cor(iris$Petal.Length, iris$Petal.Width)
    +#Correlation between Petal Length and Sepal Length
    +corOfLengthPS <- cor(iris$Petal.Length, iris$Sepal.Length)
    +#Correlation between Petal Width and Sepal Width
    +corOfWidthPS <- cor(iris$Petal.Width, iris$Sepal.Width)
    +
    +
    +
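A compact way to inspect all the numeric relationships at once is a correlation matrix; Species is a factor, so it is left out (a sketch):

```{r}
#Pairwise correlations among the four numeric iris measurements
round(cor(iris[, 1:4]), 2)
```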

    Part III - Analyzing Swirl

    +
    +

    Data

    +

    In this repository you will find data describing Swirl activity from the class so far this semester. Please connect RStudio to this repository.

    +
    +

    Instructions

    +
      +
    1. Insert a new code block

2. Create a data frame from the swirl-data.csv file called DF1
    +

    The variables are:

    +

    course_name - the name of the R course the student attempted
    +lesson_name - the lesson name
+question_number - the question number attempted
+correct - whether the question was answered correctly
    +attempt - how many times the student attempted the question
    +skipped - whether the student skipped the question
    +datetime - the date and time the student attempted the question
+hash - anonymized student ID

    +
      +
3. Create a new data frame that only includes the variables hash, lesson_name and attempt called DF2

4. Use the group_by function to create a data frame that sums all the attempts for each hash by each lesson_name called DF3
    +
    #2
    +DF1 <- read.csv("swirl-data.csv", header = TRUE)
    +
    +#3
    +DF2<- select(DF1, hash, lesson_name, attempt)
    +
    +#4
    +DF3 <-  DF2 %>% group_by(hash, lesson_name) %>% summarise(sum_key = sum(attempt), .groups = "keep")
    +
      +
5. On a scrap piece of paper draw what you think DF3 would look like if all the lesson names were column names

6. Convert DF3 to this format
    +
    #6
    +#Get rid of the NAs so the next step does not throw error and add extra column of NAs
    +DF3 <- na.omit(DF3)
    +DF3 <- spread(DF3, lesson_name, sum_key) 
    +
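In current tidyr, spread() is superseded by pivot_wider(); the equivalent call on the long hash / lesson_name / sum_key table would look roughly like this (a sketch, kept commented out because it would replace the spread() step rather than run after it):

```{r}
#One column per lesson_name, filled with the summed attempts (assumes DF3 is still in long form)
#DF3_wide <- pivot_wider(DF3, names_from = lesson_name, values_from = sum_key)
```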
      +
7. Create a new data frame from DF1 called DF4 that only includes the variables hash, lesson_name and correct

8. Convert the correct variable so that TRUE is coded as the number 1 and FALSE is coded as 0
    +
    #7
    +DF4 <- select(DF1, hash, lesson_name, correct)
    +
    +#8
+#Correct the misspelled "FALS" at line 809 of swirl-data.csv, which carried over into DF4 from DF1
    +DF4$correct <- ifelse(DF4$correct == "FALS", "FALSE", DF4$correct)
    +
+#Convert the character column created by the last step back to logical
    +DF4$correct <- type.convert(DF4$correct) 
    +
    +#Get rid of the NAs so the next steps do not throw "NAs introduced by coercion"
    +DF4 <- DF4[complete.cases(DF4$correct),]
    +
    +#Converts logi to num so 0s and 1s instead of FALSE and TRUE
    +DF4$correct <- as.numeric(DF4$correct) 
    +
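The same recoding can be written as one dplyr pipeline with case_when(), which makes the handling of the misspelled "FALS" value explicit; `DF4_alt` is just an illustrative name for this sketch:

```{r}
#Recode correct to 1/0 in one step: TRUE -> 1, FALSE/FALS -> 0, anything else dropped
DF4_alt <- DF1 %>%
  select(hash, lesson_name, correct) %>%
  mutate(correct = case_when(
    correct == "TRUE" ~ 1,
    correct %in% c("FALSE", "FALS") ~ 0,
    TRUE ~ NA_real_
  )) %>%
  filter(!is.na(correct))
```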
      +
9. Create a new data frame called DF5 that provides a mean score for each student on each course
    +
    #9
    +DF5 <- DF4 %>% group_by(hash, lesson_name) %>% summarise(mean_key = mean(correct), .groups = "keep")
    +
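Question 9 asks for a mean per student on each course; if that is the intent, course_name has to be carried along from DF1 before grouping. A sketch of that variant (`DF5_course` is an illustrative name):

```{r}
#Mean of the 0/1 correct flag for each student on each course
DF5_course <- DF1 %>%
  select(hash, course_name, correct) %>%
  mutate(correct = ifelse(correct == "TRUE", 1,
                   ifelse(correct %in% c("FALSE", "FALS"), 0, NA))) %>%
  filter(!is.na(correct)) %>%
  group_by(hash, course_name) %>%
  summarise(mean_correct = mean(correct), .groups = "keep")
```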
      +
10. Convert the datetime variable into month-day-year format and create a new data frame (DF6) that shows the average correct for each day
    +
    #10
    +DF6 <- select(DF1, hash, lesson_name, datetime, correct)
    +
    +#steps to get TRUE/FALSE to 1/0
    +DF6$correct <- ifelse(DF6$correct == "FALS", "FALSE", DF6$correct)
    +DF6$correct <- type.convert(DF6$correct) 
    +DF6 <- DF6[complete.cases(DF6$correct),]
    +DF6$correct <- as.numeric(DF6$correct) 
    +
    +#Creating average correct for each day
    +DF6 <- DF6 %>% group_by(hash, datetime, correct) %>% summarise(meanByDay = mean(correct), .groups = "keep")
    +
    +#Convert 'datetime' to month-day-year by converting the parsed num***
    +#library(lubridate)
    +#dateConverted <- mdy_hms(DF6$datetime)
    +#DF6 <- separate_rows(DF6, DF6$datetime, sep = "0")
    +
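One way to finish the commented-out idea above, assuming datetime is stored as a Unix timestamp (seconds since 1970 - an assumption, since the raw file is not shown here), is to build a month-day-year date with lubridate and then average correct over each calendar day:

```{r}
#as_datetime() treats a numeric column as seconds since 1970; swap in mdy_hms() or similar if datetime is text
library(lubridate)
DF6_day <- DF1 %>%
  select(datetime, correct) %>%
  mutate(correct = ifelse(correct == "TRUE", 1,
                   ifelse(correct %in% c("FALSE", "FALS"), 0, NA))) %>%
  filter(!is.na(correct)) %>%
  mutate(day = format(as_datetime(datetime), "%m-%d-%Y")) %>%
  group_by(day) %>%
  summarise(mean_correct = mean(correct), .groups = "drop")
```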
    +
    +
    + + + + + + + + + + + + + + + diff --git a/README.md b/README.md index 470dcb2..8ab360d 100644 --- a/README.md +++ b/README.md @@ -1,22 +1,12 @@ -# Principal Component Analysis +# Assignment 2 +### Data Wrangling and Visualization -I used data from the Assistments Intelligent Tutoring system. This system gives students hints based on how they perform on math problems. I wanted to see if I can build a decision tree to help teachers decide which students to follow up with, based on students' performance in Assignments. I create three groups ("teacher should intervene", "teacher should monitor student progress" and "no action") based on students' previous use of the system and how many hints they use. I will be building a decision tree using the "party" package. The party package builds decision trees based on a set of statistical stopping rules. +In Assignment 2 we will be practicing data manipulation including use of the tidyverse. -Examine the PrincipleComponentAnalysis.html for the results. +The instructions to Assignment 2 are in the Assignment 2-2020.rmd file. Assignments are structured in three parts, in the first part you can just follow along with the code, in the second part you will need to apply the code, and in the third part is completely freestyle and you are expected to apply your new knowledge in a new way. -# Codebook -id - student id -prior_prob_count - The number of problems a student has done in the system prior to the surrent session -score - The score the student achieved in the current session -hints - The number of hints the student requested in the current session -hint.y - Whether or not the student asked for hints in the current session -complete - Whether or not the student completed the cirrent session -action - The action suggested by the system to a teacher about a given student based on their performance +**Please complete as much as you can by midnight EDT, 10/05/20** -- prior_percent_correct: The percentage of problems a student has answered correctly prior to this session -- problems_attempted: The number of problems the student has attempted in the current session -- mean_correct: The average number of correct answers a student made on their first attempt at problems in the current session -- mean_hint: The average number of hints a student asked for in the current session -- mean_attempt: The average number of attempts a student took to answer a problem in the current session -- mean_confidence: The average confidence each student has in their ability to answer the problems in the current session +Once you have finished, commit, push and pull your assignment back to the main branch. Include both the .Rmd file and the .html file. +Good luck! From 6f8be466aea1325fd41d493435f7e095e201c705 Mon Sep 17 00:00:00 2001 From: Nicole Schlosberg <70542380+nicole-schlosberg@users.noreply.github.com> Date: Tue, 17 Aug 2021 17:47:39 -0400 Subject: [PATCH 08/10] Delete PrincipleComponentAnalysis.html --- PrincipleComponentAnalysis.html | 647 -------------------------------- 1 file changed, 647 deletions(-) delete mode 100644 PrincipleComponentAnalysis.html diff --git a/PrincipleComponentAnalysis.html b/PrincipleComponentAnalysis.html deleted file mode 100644 index fd9a403..0000000 --- a/PrincipleComponentAnalysis.html +++ /dev/null @@ -1,647 +0,0 @@ - - - - - - - - - - - - - - - -Principle Component Analysis - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

    Data: The data comes from the Assistments online intelligent tutoring system (https://www.assistments.org/). It describes students working through online math problems. Each student has the following data associated with them:

    -
    -

    Part I

    -

    Uploading the data

    -
    D1 <- read.csv("Assistments-confidence.csv", header=TRUE)
    -

    Create a correlation matrix of the relationships between the variables, including correlation coefficients for each pair of variables/features.

    -
    library(ggplot2)
    -library(GGally)
    -
    ## Registered S3 method overwritten by 'GGally':
    -##   method from   
    -##   +.gg   ggplot2
    -
    library(corrplot)
    -
    ## corrplot 0.84 loaded
    -
    ggpairs(D1, 2:8, progress = FALSE) #ggpairs() draws a correlation plot between all the columns you identify by number (second option, you don't need the first column as it is the student ID) and progress = FALSE stops a progress bar appearing as it renders your plot
    -

    -
    ggcorr(D1[,-1], method = c("everything", "pearson")) #ggcorr() doesn't have an explicit option to choose variables so we need to use matrix notation to drop the id variable. We then need to choose a "method" which determines how to treat missing values (here we choose to keep everything, and then which kind of correlation calculation to use, here we are using Pearson correlation, the other options are "kendall" or "spearman")
    -

    -
    #Note of what is strongly related to the outcome variable of interest, mean_correct. 
    -
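The chunk above loads corrplot but never calls it; purely as an illustration, the same correlations could be drawn with it like this (a sketch, not part of the original analysis):

```{r}
#Correlation matrix of everything except the id column, drawn with corrplot
corrplot(cor(D1[,-1], use = "pairwise.complete.obs"), method = "circle")
```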

    Create a new data frame with the mean_correct variable removed, we want to keep that variable intact. The other variables will be included in our PCA.

    -
    library(dplyr)
    -
    ## 
    -## Attaching package: 'dplyr'
    -
    ## The following objects are masked from 'package:stats':
    -## 
    -##     filter, lag
    -
    ## The following objects are masked from 'package:base':
    -## 
    -##     intersect, setdiff, setequal, union
    -
    D2 <- select(D1,-id,-mean_correct)
    -

    Now run the PCA on the new data frame

    -
    pca <- prcomp(D2, scale. = TRUE)
    -

    Although princomp does not generate the eigenvalues directly for us, we can print a list of the standard deviation of the variance accounted for by each component.

    -
    pca$sdev
    -
    ## [1] 1.2825140 1.0543565 1.0245688 0.9621486 0.8556715 0.7320146
    -
    #To convert this into variance accounted for we can square it, these numbers are proportional to the eigenvalue
    -pca$sdev^2
    -
    ## [1] 1.6448423 1.1116675 1.0497412 0.9257299 0.7321737 0.5358454
    -
    #A summary of our pca will give us the proportion of variance accounted for by each component
    -summary(pca)
    -
    ## Importance of components:
    -##                           PC1    PC2    PC3    PC4    PC5     PC6
    -## Standard deviation     1.2825 1.0544 1.0246 0.9621 0.8557 0.73201
    -## Proportion of Variance 0.2741 0.1853 0.1750 0.1543 0.1220 0.08931
    -## Cumulative Proportion  0.2741 0.4594 0.6344 0.7887 0.9107 1.00000
    -
    #We can look at this to get an idea of which components we should keep and which we should drop
    -plot(pca, type = "lines")
    -
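The proportions that summary(pca) reports can also be computed directly from the standard deviations, which makes the idea of "variance accounted for" concrete (a sketch):

```{r}
#Eigenvalue of each component divided by the total variance
prop_var <- pca$sdev^2 / sum(pca$sdev^2)
round(prop_var, 3)          #proportion of variance per component
round(cumsum(prop_var), 3)  #cumulative proportion
```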

    -

    Decide which components you would drop and remove them from your data set.

    -

    ANSWER: PC5 and PC6 would be components to drop. PC5 only has 12.20% of variance, meaning it only represents 12.20% of the variance. PC6 only has 8.93% of the variance, which means it only represents 8.93% of the variance. Since it is such small amount of the variance it can be removed.

    -
    -
    -

    Part II

    -
    #Now create a data frame of the transformed data from your pca.
    -D3 <- data.frame(pca$x)
    -
    -#Attach the variable "mean_correct" from your original data frame to D3.
    -D3 <- data.frame(D3,D1$mean_correct)
    - 
    -#Now re-run your correlation plots between the transformed data and mean_correct. If you had dropped some components would you have lost important information about mean_correct?
    -ggcorr(D3, method = c("everything", "pearson")) 
    -

    -

    If you had dropped some components would you have lost important information about mean_correct?

    -

    ANSWER: You would lose important information about mean_correct. Components with some small variance representation still contribute some information even if small. PC6 which only makes up 8.93% of the variance is strongly correlated to mean_correct. This means that even though it is the best option to drop, we risk dropping something that correlates with the thing we are interested in.

    -

    Now print out the loadings for the components you generated:

    -
    library(ggbiplot)
    -
    ## Loading required package: plyr
    -
    ## ------------------------------------------------------------------------------
    -
    ## You have loaded plyr after dplyr - this is likely to cause problems.
    -## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
    -## library(plyr); library(dplyr)
    -
    ## ------------------------------------------------------------------------------
    -
    ## 
    -## Attaching package: 'plyr'
    -
    ## The following objects are masked from 'package:dplyr':
    -## 
    -##     arrange, count, desc, failwith, id, mutate, rename, summarise,
    -##     summarize
    -
    ## Loading required package: scales
    -
    ## Loading required package: grid
    -
    library(tidyr)
    -
    -pca$rotation
    -
    ##                               PC1         PC2         PC3        PC4
    -## prior_prob_count      -0.26034140  0.45818753 -0.40090679 -0.6897642
    -## prior_percent_correct  0.16840319  0.81617867  0.09267306  0.2640040
    -## problems_attempted    -0.45568733  0.31685183  0.36387724  0.3168141
    -## mean_hint             -0.63337594 -0.12501620 -0.08008842 -0.1122586
    -## mean_attempt          -0.54200011 -0.08510858 -0.04585364  0.3108682
    -## mean_confidence        0.03581325  0.02547483 -0.83051917  0.4948890
    -##                                PC5         PC6
    -## prior_prob_count      -0.007142834 -0.29280482
    -## prior_percent_correct  0.298843852  0.37134715
    -## problems_attempted    -0.592336569 -0.32911025
    -## mean_hint             -0.102302115  0.74412634
    -## mean_attempt           0.697232132 -0.33781385
    -## mean_confidence       -0.251357022 -0.01452143
    -
    #Examine the eigenvectors, notice that they are a little difficult to interpret. It is much easier to make sense of them if we make them proportional within each component
    -loadings <- abs(pca$rotation) #abs() will make all eigenvectors positive
    -
    -#Now examine your components and try to come up with substantive descriptions of what some might represent?
    -L1 <- as_tibble(loadings)
    -labels <- c("prior_prob_count","prior_percent_correct","problems_attempted","mean_hint","mean_attempt","mean_confidence") 
    -L2 <- cbind(labels,L1)
    -L3 <- L2 %>% mutate(PC1 = PC1/sum(PC1)) %>% mutate(PC2 = PC2/sum(PC2)) %>% mutate(PC3 = PC3/sum(PC3)) %>% mutate(PC4 = PC4/sum(PC4)) %>% mutate(PC5 = PC5/sum(PC5)) %>%  mutate(PC6 = PC6/sum(PC6)) %>% print
    -
    ##                  labels        PC1        PC2        PC3        PC4         PC5
    -## 1      prior_prob_count 0.12423113 0.25081186 0.22101700 0.31516257 0.003664468
    -## 2 prior_percent_correct 0.08035956 0.44677621 0.05108998 0.12062699 0.153315014
    -## 3    problems_attempted 0.21744737 0.17344469 0.20060288 0.14475664 0.303884750
    -## 4             mean_hint 0.30223780 0.06843387 0.04415217 0.05129246 0.052483764
    -## 5          mean_attempt 0.25863458 0.04658844 0.02527878 0.14203987 0.357699023
    -## 6       mean_confidence 0.01708956 0.01394492 0.45785919 0.22612148 0.128952980
    -##          PC6
    -## 1 0.14011651
    -## 2 0.17770154
    -## 3 0.15748983
    -## 4 0.35608836
    -## 5 0.16165478
    -## 6 0.00694897
    -
    #You can generate a biplot to help you, though these can be a bit confusing. They plot the transformed data by the first two components. Therefore, the axes represent the direction of maximum variance accounted for. Then mapped onto this point cloud are the original directions of the variables, depicted as red arrows. It is supposed to provide a visualization of which variables "go together". Variables that possibly represent the same underlying construct point in the same direction.  
    -ggbiplot(pca)
    -

    -
    #ggbiplot(pca,choices=c(3,4))
    -#ggbiplot(pca,choices=c(5,6))
    -
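The chain of mutate() calls above normalises each column of the absolute loadings one at a time; the same proportional loadings can be produced in a single step with base R's sweep() (a sketch that is equivalent in intent, not the original code):

```{r}
#Divide every column of the absolute loadings by its column sum
prop_loadings <- sweep(loadings, 2, colSums(loadings), "/")
round(prop_loadings, 3)
```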

    Now examine your components and try to come up with substantive descriptions of what some might represent?

    -

    ANSWER: Having the most variance of a specific category means that the component contributes the most to its variance. PC1 has the most variance in mean_hint (30.22% of variance), mean_attempt (25.86% of variance), and problems_attempted (21.74% of the variance). PC2 contributes the most to prior_percent_correct (44.68% of variance), prior_prob_count (25.08% of variance), and problems_attempted (17.34% of variance). PC3 contributes the most to mean_confidence (45.79% of variance), prior_prob_count (22.10% of variance), and problems_attempted (20.06% of variance). PC4 contributes the most to prior_prob_count (31.52% of variance) and mean_confidence (22.61% of variance). PC5 contributes the most to problems_attempted (30.39% of variance) and mean_attempt (35.77% of variance). PC6 only contributes the most to mean_hint (35.61% of variance).

    -
    -
    -

    Part III

    -

    Also in this repository is a data set collected from TC students (tc-program-combos.csv) that shows how many students thought that a TC program was related to another TC program. Students were shown three program names at a time and were asked which two of the three were most similar.

    -
    library(ggplot2)
    -library(GGally)
    -library(dplyr)
    -
    -R1 <- read.csv("tc-program-combos.csv",header=TRUE)
    -
    -#Organize the data
    -R3 <- R1
    -rownames(R3)<-R3$program
    -R3 <- select(R3, -program)
    -R3 <- R3[order(rownames(R3)),]
    -R3 <- R3[,sort(colnames(R3))]
    -
    -#PCA on data
    -pca3 <- prcomp(R3, scale. = TRUE)
    -pca3$sdev
    -
    ##  [1] 2.66699514 2.33303087 2.03824332 1.80893489 1.71451092 1.60411744
    -##  [7] 1.58798960 1.49222150 1.46424566 1.39138869 1.33520786 1.32516917
    -## [13] 1.31212863 1.26312253 1.25366254 1.22338737 1.21896028 1.18649390
    -## [19] 1.13127469 1.12814038 1.10432926 1.06319093 1.01168384 0.99665812
    -## [25] 0.96528383 0.95048841 0.93256896 0.90507518 0.85160824 0.83479067
    -## [31] 0.81879538 0.78538963 0.76079365 0.73350908 0.72278124 0.67319166
    -## [37] 0.66343310 0.64839067 0.62448974 0.60331242 0.56846989 0.55769066
    -## [43] 0.51031628 0.49442626 0.47128286 0.44551299 0.43288829 0.41344476
    -## [49] 0.37259965 0.36653720 0.35015529 0.33278376 0.32799766 0.30414406
    -## [55] 0.28040415 0.27066834 0.23729873 0.21156010 0.17616906 0.16541514
    -## [61] 0.14778289 0.14204907 0.11092615 0.07054620 0.04430024 0.03588606
    -## [67] 0.01241193
    -
    pca3$sdev^2
    -
    ##  [1] 7.1128630864 5.4430330560 4.1544358466 3.2722454371 2.9395476996
    -##  [6] 2.5731927459 2.5217109546 2.2267250142 2.1440153558 1.9359624893
    -## [11] 1.7827800336 1.7560733413 1.7216815395 1.5954785267 1.5716697682
    -## [16] 1.4966766496 1.4858641675 1.4077677636 1.2797824290 1.2727007063
    -## [21] 1.2195431178 1.1303749458 1.0235041934 0.9933274149 0.9317728773
    -## [26] 0.9034282205 0.8696848576 0.8191610815 0.7252366024 0.6968754570
    -## [31] 0.6704258671 0.6168368672 0.5788069731 0.5380355701 0.5224127176
    -## [36] 0.4531870174 0.4401434846 0.4204104623 0.3899874305 0.3639858772
    -## [41] 0.3231580154 0.3110188729 0.2604227077 0.2444573271 0.2221075356
    -## [46] 0.1984818264 0.1873922684 0.1709365720 0.1388304984 0.1343495215
    -## [51] 0.1226087294 0.1107450296 0.1075824634 0.0925036076 0.0786264855
    -## [56] 0.0732613524 0.0563106886 0.0447576745 0.0310355381 0.0273621679
    -## [61] 0.0218397813 0.0201779378 0.0123046098 0.0049767662 0.0019625111
    -## [66] 0.0012878092 0.0001540561
    -
    summary(pca3)
    -
    ## Importance of components:
    -##                           PC1     PC2     PC3     PC4     PC5     PC6     PC7
    -## Standard deviation     2.6670 2.33303 2.03824 1.80893 1.71451 1.60412 1.58799
    -## Proportion of Variance 0.1062 0.08124 0.06201 0.04884 0.04387 0.03841 0.03764
    -## Cumulative Proportion  0.1062 0.18740 0.24941 0.29825 0.34212 0.38053 0.41816
    -##                            PC8    PC9    PC10    PC11    PC12   PC13    PC14
    -## Standard deviation     1.49222 1.4642 1.39139 1.33521 1.32517 1.3121 1.26312
    -## Proportion of Variance 0.03323 0.0320 0.02889 0.02661 0.02621 0.0257 0.02381
    -## Cumulative Proportion  0.45140 0.4834 0.51229 0.53890 0.56511 0.5908 0.61462
    -##                           PC15    PC16    PC17    PC18   PC19   PC20   PC21
    -## Standard deviation     1.25366 1.22339 1.21896 1.18649 1.1313 1.1281 1.1043
    -## Proportion of Variance 0.02346 0.02234 0.02218 0.02101 0.0191 0.0190 0.0182
    -## Cumulative Proportion  0.63808 0.66042 0.68260 0.70361 0.7227 0.7417 0.7599
    -##                           PC22    PC23    PC24    PC25    PC26    PC27    PC28
    -## Standard deviation     1.06319 1.01168 0.99666 0.96528 0.95049 0.93257 0.90508
    -## Proportion of Variance 0.01687 0.01528 0.01483 0.01391 0.01348 0.01298 0.01223
    -## Cumulative Proportion  0.77678 0.79205 0.80688 0.82079 0.83427 0.84725 0.85948
    -##                           PC29   PC30    PC31    PC32    PC33    PC34   PC35
    -## Standard deviation     0.85161 0.8348 0.81880 0.78539 0.76079 0.73351 0.7228
    -## Proportion of Variance 0.01082 0.0104 0.01001 0.00921 0.00864 0.00803 0.0078
    -## Cumulative Proportion  0.87030 0.8807 0.89071 0.89992 0.90856 0.91659 0.9244
    -##                           PC36    PC37    PC38    PC39    PC40    PC41    PC42
    -## Standard deviation     0.67319 0.66343 0.64839 0.62449 0.60331 0.56847 0.55769
    -## Proportion of Variance 0.00676 0.00657 0.00627 0.00582 0.00543 0.00482 0.00464
    -## Cumulative Proportion  0.93115 0.93772 0.94399 0.94981 0.95524 0.96007 0.96471
    -##                           PC43    PC44    PC45    PC46   PC47    PC48    PC49
    -## Standard deviation     0.51032 0.49443 0.47128 0.44551 0.4329 0.41344 0.37260
    -## Proportion of Variance 0.00389 0.00365 0.00332 0.00296 0.0028 0.00255 0.00207
    -## Cumulative Proportion  0.96860 0.97224 0.97556 0.97852 0.9813 0.98387 0.98594
    -##                           PC50    PC51    PC52    PC53    PC54    PC55    PC56
    -## Standard deviation     0.36654 0.35016 0.33278 0.32800 0.30414 0.28040 0.27067
    -## Proportion of Variance 0.00201 0.00183 0.00165 0.00161 0.00138 0.00117 0.00109
    -## Cumulative Proportion  0.98795 0.98978 0.99143 0.99304 0.99442 0.99559 0.99668
    -##                           PC57    PC58    PC59    PC60    PC61   PC62    PC63
    -## Standard deviation     0.23730 0.21156 0.17617 0.16542 0.14778 0.1420 0.11093
    -## Proportion of Variance 0.00084 0.00067 0.00046 0.00041 0.00033 0.0003 0.00018
    -## Cumulative Proportion  0.99752 0.99819 0.99866 0.99906 0.99939 0.9997 0.99987
    -##                           PC64    PC65    PC66    PC67
    -## Standard deviation     0.07055 0.04430 0.03589 0.01241
    -## Proportion of Variance 0.00007 0.00003 0.00002 0.00000
    -## Cumulative Proportion  0.99995 0.99998 1.00000 1.00000
    -
    plot(pca3, type = "lines")
    -

    -
    #pca3$rotation
    -loadings3 <- abs(pca3$rotation) 
    -
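Instead of scanning the full loading matrix, a small helper can list the programs with the largest loadings on a given component; top_programs() below is a made-up name for this sketch, not part of the original analysis:

```{r}
#Return the n programs with the largest absolute loadings on one component
top_programs <- function(loadings, pc, n = 10) {
  sort(loadings[, pc], decreasing = TRUE)[1:n]
}
top_programs(loadings3, "PC1")
top_programs(loadings3, "PC2")
```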

    Use PCA to look for components that represent related programs. Explain why you think there are relationships between these programs (explain the meaning of the components).

    -

    ANSWER: Based off of the percentages of variance of the PCs, I have concluded that yes many of the PCs correspond to related classes. Below are the make up of the first 6 components, which compose 38.05% (cumulative proportion) of all the variance just within those 6 components. The remaining 61.95% is spread across the other 59 components, thus with 38.05% within 6 components and 61.95% spread across 59 components, the first components hold larger proportions (decreasing as they move away from PC1 towards PC67) of the variance.

    -

    PC1: Change.Leadership (27.70%), Economics.and.Education (23.09%), Education.Policy (22.09%), Arts.Administration (21.93%), Politics (21.88%), School.Principals (21.75%), Social.Organizational.Psychology (21.19), Private.School.Leadership (20.48%), Cooperation.and.Conflict.Resolution (20.42%), and Leadership (20.06%) make up the highest percent variance within PC1. This simply means that these programs contribute their respective amounts to PC1. These programs are all related to leadership, organization, and administration concepts.

    -

    PC2: Clinical.Psychology (25.31%), Neuroscience (25.29%), Kinesiology (25.15), Physiology (24.43%), Psychology (22.37%), Health.Education (22.13%), Behavioral.Analysis (21.26%), Nursing (21.21%), Physical.Education (21.08%), and Counseling.Psychology (19.57%) are all within PC2 with relatively high percentages of variance making up most of PC2. These programs are all related to health including mental health or education of the subjects.

    -

    PC3: Design.and.Development.of.Digital.Games (31.52%), Cognitive.Science (31.04%), Mathematics (27.94%), Learning.Analytics (27.93), Education.Technology (26.90%), Creative.Technologies (26.10%), Instructional.Technology.and.Media (25.66%), and Measurement.Evaluation.and.Statistics (24.67) are all within PC3 with relatively high percentages of variance making up most of PC3. These programs are all related to technology, data science, and statistical measures.

    -

    PC4: Linguistics (34.79), English.Education (34.07), Teaching.English (27.46), and Literacy (24.96) all are relatively high percentages of variance making up PC4 and all relate to learning language and reading. As you read down past Literacy and look at the other programs with high variance within PC4, they also are in some way related to learning.

    -

    PC5: History (32.73%) makes up the most of PC5 with the next closest percentage being Music (24.55%), which at least based on the program is not related to the subject of History and therefore PC5 is made up mostly of a program unrelated to the other classes within the component. The gap between them is large.

    -

    PC6: Science.Education (35.53%) and Higher.and.Postsecondary.Education (32.91%) make up the majority of PC6 variance, and are both related to education.

    -
    - - - - -
    - - - - - - - - - - - - - - - From c1488287cccf07d3d4d23de342e2d268b09ee500 Mon Sep 17 00:00:00 2001 From: Nicole Schlosberg <70542380+nicole-schlosberg@users.noreply.github.com> Date: Tue, 17 Aug 2021 17:47:47 -0400 Subject: [PATCH 09/10] Delete PrincipleComponentAnalysis.Rmd --- PrincipleComponentAnalysis.Rmd | 154 --------------------------------- 1 file changed, 154 deletions(-) delete mode 100644 PrincipleComponentAnalysis.Rmd diff --git a/PrincipleComponentAnalysis.Rmd b/PrincipleComponentAnalysis.Rmd deleted file mode 100644 index 2df2b66..0000000 --- a/PrincipleComponentAnalysis.Rmd +++ /dev/null @@ -1,154 +0,0 @@ ---- -title: "Principle Component Analysis" -author: "Nicole Schlosberg" -date: "11/19/2020" -output: html_document ---- - -Data: The data comes from the Assistments online intelligent tutoring system (https://www.assistments.org/). It describes students working through online math problems. Each student has the following data associated with them: - - -## Part I -Uploading the data -```{r} -D1 <- read.csv("Assistments-confidence.csv", header=TRUE) -``` - -Create a correlation matrix of the relationships between the variables, including correlation coefficients for each pair of variables/features. - -```{r} -library(ggplot2) -library(GGally) -library(corrplot) - -ggpairs(D1, 2:8, progress = FALSE) #ggpairs() draws a correlation plot between all the columns you identify by number (second option, you don't need the first column as it is the student ID) and progress = FALSE stops a progress bar appearing as it renders your plot - -ggcorr(D1[,-1], method = c("everything", "pearson")) #ggcorr() doesn't have an explicit option to choose variables so we need to use matrix notation to drop the id variable. We then need to choose a "method" which determines how to treat missing values (here we choose to keep everything, and then which kind of correlation calculation to use, here we are using Pearson correlation, the other options are "kendall" or "spearman") - -#Note of what is strongly related to the outcome variable of interest, mean_correct. -``` - -Create a new data frame with the mean_correct variable removed, we want to keep that variable intact. The other variables will be included in our PCA. - -```{r} -library(dplyr) - -D2 <- select(D1,-id,-mean_correct) -``` - -Now run the PCA on the new data frame - -```{r} -pca <- prcomp(D2, scale. = TRUE) -``` - -Although princomp does not generate the eigenvalues directly for us, we can print a list of the standard deviation of the variance accounted for by each component. - -```{r} -pca$sdev - -#To convert this into variance accounted for we can square it, these numbers are proportional to the eigenvalue -pca$sdev^2 - -#A summary of our pca will give us the proportion of variance accounted for by each component -summary(pca) - -#We can look at this to get an idea of which components we should keep and which we should drop -plot(pca, type = "lines") -``` - -Decide which components you would drop and remove them from your data set. - -ANSWER: PC5 and PC6 would be components to drop. PC5 only has 12.20% of variance, meaning it only represents 12.20% of the variance. PC6 only has 8.93% of the variance, which means it only represents 8.93% of the variance. Since it is such small amount of the variance it can be removed. - - -## Part II - -```{r} -#Now create a data frame of the transformed data from your pca. -D3 <- data.frame(pca$x) - -#Attach the variable "mean_correct" from your original data frame to D3. 
-D3 <- data.frame(D3,D1$mean_correct) - -#Now re-run your correlation plots between the transformed data and mean_correct. If you had dropped some components would you have lost important information about mean_correct? -ggcorr(D3, method = c("everything", "pearson")) -``` - -If you had dropped some components would you have lost important information about mean_correct? - -ANSWER: You would lose important information about mean_correct. Components with some small variance representation still contribute some information even if small. PC6 which only makes up 8.93% of the variance is strongly correlated to mean_correct. This means that even though it is the best option to drop, we risk dropping something that correlates with the thing we are interested in. - -Now print out the loadings for the components you generated: - -```{r} -library(ggbiplot) -library(tidyr) - -pca$rotation - -#Examine the eigenvectors, notice that they are a little difficult to interpret. It is much easier to make sense of them if we make them proportional within each component -loadings <- abs(pca$rotation) #abs() will make all eigenvectors positive - -#Now examine your components and try to come up with substantive descriptions of what some might represent? -L1 <- as_tibble(loadings) -labels <- c("prior_prob_count","prior_percent_correct","problems_attempted","mean_hint","mean_attempt","mean_confidence") -L2 <- cbind(labels,L1) -L3 <- L2 %>% mutate(PC1 = PC1/sum(PC1)) %>% mutate(PC2 = PC2/sum(PC2)) %>% mutate(PC3 = PC3/sum(PC3)) %>% mutate(PC4 = PC4/sum(PC4)) %>% mutate(PC5 = PC5/sum(PC5)) %>% mutate(PC6 = PC6/sum(PC6)) %>% print - -#You can generate a biplot to help you, though these can be a bit confusing. They plot the transformed data by the first two components. Therefore, the axes represent the direction of maximum variance accounted for. Then mapped onto this point cloud are the original directions of the variables, depicted as red arrows. It is supposed to provide a visualization of which variables "go together". Variables that possibly represent the same underlying construct point in the same direction. -ggbiplot(pca) -#ggbiplot(pca,choices=c(3,4)) -#ggbiplot(pca,choices=c(5,6)) -``` - -Now examine your components and try to come up with substantive descriptions of what some might represent? - -ANSWER: Having the most variance of a specific category means that the component contributes the most to its variance. PC1 has the most variance in mean_hint (30.22% of variance), mean_attempt (25.86% of variance), and problems_attempted (21.74% of the variance). PC2 contributes the most to prior_percent_correct (44.68% of variance), prior_prob_count (25.08% of variance), and problems_attempted (17.34% of variance). PC3 contributes the most to mean_confidence (45.79% of variance), prior_prob_count (22.10% of variance), and problems_attempted (20.06% of variance). PC4 contributes the most to prior_prob_count (31.52% of variance) and mean_confidence (22.61% of variance). PC5 contributes the most to problems_attempted (30.39% of variance) and mean_attempt (35.77% of variance). PC6 only contributes the most to mean_hint (35.61% of variance). - - -## Part III - -Also in this repository is a data set collected from TC students (tc-program-combos.csv) that shows how many students thought that a TC program was related to another TC program. Students were shown three program names at a time and were asked which two of the three were most similar. 
- -```{r} -library(ggplot2) -library(GGally) -library(dplyr) - -R1 <- read.csv("tc-program-combos.csv",header=TRUE) - -#Organize the data -R3 <- R1 -rownames(R3)<-R3$program -R3 <- select(R3, -program) -R3 <- R3[order(rownames(R3)),] -R3 <- R3[,sort(colnames(R3))] - -#PCA on data -pca3 <- prcomp(R3, scale. = TRUE) -pca3$sdev -pca3$sdev^2 -summary(pca3) - -plot(pca3, type = "lines") - -#pca3$rotation -loadings3 <- abs(pca3$rotation) -``` - -Use PCA to look for components that represent related programs. Explain why you think there are relationships between these programs (explain the meaning of the components). - -ANSWER: Based off of the percentages of variance of the PCs, I have concluded that yes many of the PCs correspond to related classes. Below are the make up of the first 6 components, which compose 38.05% (cumulative proportion) of all the variance just within those 6 components. The remaining 61.95% is spread across the other 59 components, thus with 38.05% within 6 components and 61.95% spread across 59 components, the first components hold larger proportions (decreasing as they move away from PC1 towards PC67) of the variance. - -PC1: Change.Leadership (27.70%), Economics.and.Education (23.09%), Education.Policy (22.09%), Arts.Administration (21.93%), Politics (21.88%), School.Principals (21.75%), Social.Organizational.Psychology (21.19), Private.School.Leadership (20.48%), Cooperation.and.Conflict.Resolution (20.42%), and Leadership (20.06%) make up the highest percent variance within PC1. This simply means that these programs contribute their respective amounts to PC1. These programs are all related to leadership, organization, and administration concepts. - -PC2: Clinical.Psychology (25.31%), Neuroscience (25.29%), Kinesiology (25.15), Physiology (24.43%), Psychology (22.37%), Health.Education (22.13%), Behavioral.Analysis (21.26%), Nursing (21.21%), Physical.Education (21.08%), and Counseling.Psychology (19.57%) are all within PC2 with relatively high percentages of variance making up most of PC2. These programs are all related to health including mental health or education of the subjects. - -PC3: Design.and.Development.of.Digital.Games (31.52%), Cognitive.Science (31.04%), Mathematics (27.94%), Learning.Analytics (27.93), Education.Technology (26.90%), Creative.Technologies (26.10%), Instructional.Technology.and.Media (25.66%), and Measurement.Evaluation.and.Statistics (24.67) are all within PC3 with relatively high percentages of variance making up most of PC3. These programs are all related to technology, data science, and statistical measures. - -PC4: Linguistics (34.79), English.Education (34.07), Teaching.English (27.46), and Literacy (24.96) all are relatively high percentages of variance making up PC4 and all relate to learning language and reading. As you read down past Literacy and look at the other programs with high variance within PC4, they also are in some way related to learning. - -PC5: History (32.73%) makes up the most of PC5 with the next closest percentage being Music (24.55%), which at least based on the program is not related to the subject of History and therefore PC5 is made up mostly of a program unrelated to the other classes within the component. The gap between them is large. - -PC6: Science.Education (35.53%) and Higher.and.Postsecondary.Education (32.91%) make up the majority of PC6 variance, and are both related to education. 
\ No newline at end of file From 2b6a83c1c1e927b37cdf523085a720d298c86c61 Mon Sep 17 00:00:00 2001 From: Nicole Schlosberg <70542380+nicole-schlosberg@users.noreply.github.com> Date: Tue, 17 Aug 2021 17:48:16 -0400 Subject: [PATCH 10/10] Add files via upload --- README.md | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 8ab360d..0eda8e4 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,11 @@ # Assignment 2 ### Data Wrangling and Visualization -In Assignment 2 we will be practicing data manipulation including use of the tidyverse. - -The instructions to Assignment 2 are in the Assignment 2-2020.rmd file. Assignments are structured in three parts, in the first part you can just follow along with the code, in the second part you will need to apply the code, and in the third part is completely freestyle and you are expected to apply your new knowledge in a new way. - -**Please complete as much as you can by midnight EDT, 10/05/20** - -Once you have finished, commit, push and pull your assignment back to the main branch. Include both the .Rmd file and the .html file. - -Good luck! +Practicing data manipulation including use of the tidyverse. + +stid = student id +year = year student watched video +participation = whether or not the student opened the video +watch.time = how long the student watched the video for +confusion.points = how many times a student rewatched a section of a video +key,points = how many times a student skipped or increased the speed of a video