AlexsLemonade · jashapiro · Jun 17, 2024 · Jun 17, 2024 · Jun 17, 2024 · Jun 17, 2024
diff --git a/pathway-analysis/03-gene_set_variation_analysis.Rmd b/pathway-analysis/03-gene_set_variation_analysis.Rmd
@@ -5,7 +5,7 @@ output:
     toc: true
     toc_float: true
 author: CCDL for ALSF
-date: 2020
+date: 2024
 ---
 
 ## Objectives
@@ -147,21 +147,30 @@ The output is a gene set by sample matrix of GSVA scores.
 
 ### Perform GSVA
 
+The main work for GSVA is done by the `gsva()` function, which can actually do a variety of different enrichment score calculations.
+To specify that we want the algorithm used by [Hänzelmann _et al._ (2013)](https://doi.org/10.1186/1471-2105-14-7), we will pass as the first argument a call to the `gsvaParam()` function, which is where we will put our data, gene sets, and other parameters specific to that algorithm.
+
 ```{r run_gsva}
-gsva_results <- gsva(rnaseq_mat,
-                     hallmarks_list,
-                     method = "gsva",
-                     # Appropriate for our transformed data on log2-like scale
-                     kcdf = "Gaussian",
-                     # Minimum gene set size
-                     min.sz = 15,
-                     # Maximum gene set size
-                     max.sz = 500,
-                     # Compute Gaussian-distributed scores
-                     mx.diff = TRUE)
+gsva_results <- gsva(
+  gsvaParam(
+    exprData = rnaseq_mat,
+    geneSets = hallmarks_list,
+    # Minimum gene set size
+    minSize = 15,
+    # Maximum gene set size
+    maxSize = 500,
+    # Kernel for estimation 
+    # Gaussian for our transformed data on log2-like scale
+    kcdf = "Gaussian",
+    # enrichment score is the difference between largest positive and negative
+    maxDiff = TRUE
+  ),
+  # Use 4 cores for multiprocessing
+  BPPARAM = BiocParallel::MulticoreParam(4)
+)
 ```
 
-**Note: the `gsva()` documentation says we can use `kcdf = "Gaussian"` if we had RNA-seq log-CPMs, log-RPKMs or log-TPMs, but we would use `kcdf = "Poisson"` on integer counts.**
+**Note: the `gsvaParam()` documentation says we can use `kcdf = "Gaussian"` if we had RNA-seq log-CPMs, log-RPKMs or log-TPMs, but we would use `kcdf = "Poisson"` on integer counts.**
 
 ```{r gsva_peek}
 # Let's explore what the output of gsva() looks like
@@ -230,18 +239,18 @@ random_gene_sets <- random_gene_sets |>
 Run GSVA on our dataset with the same parameters as before, but now with random gene sets.
 
 ```{r random_gsva, live = TRUE}
-random_gsva_results <- gsva(rnaseq_mat,
-                            random_gene_sets,
-                            method = "gsva",
-                            # Appropriate for our transformed data on
-                            # log2-like scale
-                            kcdf = "Gaussian",
-                            # Minimum gene set size
-                            min.sz = 15,
-                            # Maximum gene set size
-                            max.sz = 500,
-                            # Compute Gaussian-distributed scores
-                            mx.diff = TRUE)
+random_gsva_results <- gsva(
+  gsvaParam(
+    exprData = rnaseq_mat,
+    geneSets = random_gene_sets,
+    minSize = 15,
+    maxSize = 500,
+    kcdf = "Gaussian",
+    maxDiff = TRUE
+  ),
+  # Use 4 cores for multiprocessing
+  BPPARAM = BiocParallel::MulticoreParam(4)
+)
 ```
 
 Now let's make a plot to look at the distribution of scores from random gene sets.
@@ -254,9 +263,11 @@ random_long_df <- random_gsva_results |>
   # Gene set names are rownames
   tibble::rownames_to_column("gene_set") |>
   # Get into long format
-  tidyr::pivot_longer(cols = -gene_set,
-                      names_to = "Kids_First_Biospecimen_ID",
-                      values_to = "gsva_score") |>
+  tidyr::pivot_longer(
+    cols = -gene_set,
+    names_to = "Kids_First_Biospecimen_ID",
+    values_to = "gsva_score"
+  ) |>
   # Remove the .version added by make.names()
   dplyr::mutate(gene_set = stringr::str_remove(gene_set, "\\..*")) |>
   # Add a column that keeps track of the gene set size

diff --git a/scRNA-seq-advanced/diagrams/subramanian_fig1.jpg b/scRNA-seq-advanced/diagrams/subramanian_fig1.jpg
diff --git a/scripts/render-live.sh b/scripts/render-live.sh
@@ -41,9 +41,9 @@ files=(
   # machine-learning/02-openpbta_consensus_clustering.Rmd
   # machine-learning/03-openpbta_PLIER.Rmd
   # machine-learning/04-openpbta_plot_LV.Rmd
-  # pathway-analysis/01-overrepresentation_analysis.Rmd
-  # pathway-analysis/02-gene_set_enrichment_analysis.Rmd
-  # pathway-analysis/03-gene_set_variation_analysis.Rmd
+  pathway-analysis/01-overrepresentation_analysis.Rmd
+  pathway-analysis/02-gene_set_enrichment_analysis.Rmd
+  pathway-analysis/03-gene_set_variation_analysis.Rmd
 )
 for file in ${files[@]}
 do