codebase/1DP_11_MCAO_scsn.Rmd

---
title: "MCAO_scsn"
author: "Daniel Zucha"
date: "2023-06-07"
output: html_document
---

A markdown for the analysis of the processed single-nucleus and single-cell datasets.


=== Essential chunks. Always load. === 

```{r libraries}
library(Seurat)
library(SeuratWrappers)
library(dplyr)
library(tibble)
library(magrittr)
library(dittoSeq)
library(openxlsx)
library(stringr)
library(ggplot2)
library(RColorBrewer)
library(gridExtra)
library(patchwork)
library(plotly)
library(pheatmap)


options(Seurat.object.assay.version = "v3")

## sourcing helper functions
source("code/supporting_functions_MCAO.R")
```

```{r working space and lists}
# Output directory for figures and tables; created on first use.
ws <- "ws"
if (!dir.exists(ws)) {
  dir.create(ws)
}

# Containers that collect plots and result tables throughout the notebook
seurat.plots <- list()
seurat.results <- list()
```

load data
```{r load seurats}
# Spatial dataset
spatial.seurat <- readRDS(file = file.path("data", "seurat_spatial_1DP_06.Rds"))

# Single-nucleus dataset; the existing `cell_type_1` annotation becomes the
# active identity.
sn.seurat <- SetIdent(
  object = readRDS(file = "data/seurat_all_UMAP_soupX.rds"),
  value = "cell_type_1"
)
```

=== All cell types. ===

Basic QC filtering
```{r QC filtering}
# Check and remove doublets.
# The diagnostics are wrapped in print(): inside a `{}` block only the value of
# the last expression is auto-printed, so the table and the UMAP would
# otherwise be silently discarded.
{
  print(table(sn.seurat$Doublet))
  sn.seurat %<>% SetIdent(value = "Doublet")
  print(DimPlot(sn.seurat, reduction = "umap", pt.size = 1.5))
  sn.seurat %<>% subset(idents = "Singlet") ## keep only singlets
  sn.seurat %<>% SetIdent(value = "cell_type_1")
}

# nCount, nFeature, mito and ribo content (computed on the RNA assay)
sn.seurat %<>% PercentageFeatureSet(pattern = "^mt-|^Mt-", col.name = "percent.mt", assay = "RNA")
sn.seurat %<>% PercentageFeatureSet(pattern = "^Rpl|^Rps", col.name = "percent.rib", assay = "RNA")
sn.seurat$nCount_RNA %>% summary
sn.seurat$nFeature_RNA %>% summary
sn.seurat$percent.mt %>% hist(breaks = 100)
sn.seurat$percent.rib %>% summary

# Cell-type composition, overall and among nuclei with > 10 % mito reads.
# barplot() draws immediately and returns the bar midpoints, so these list
# entries hold bar positions, not a re-drawable plot object.
# With horiz = TRUE the counts run along the x axis, hence xlab = "Cell count".
## cell_type_1 counts
seurat.plots[["BarPlot_celltypes1"]] <- sn.seurat$cell_type_1 %>% table %>%
  barplot(col = col.list$alloursn, xlab = "Cell count", ylab = "cell types", cex.names = 0.8, las = 1, horiz = TRUE)

seurat.plots[["BarPlot_celltypes1_HighpercMT"]] <- sn.seurat$cell_type_1[(sn.seurat$percent.mt > 10)] %>% table %>%
  barplot(col = col.list$alloursn, xlab = "Cell count", ylab = "cell types", cex.names = 0.8, las = 1, horiz = TRUE)

sn.seurat %>% VlnPlot(features = "percent.mt", group.by = "cell_type_1", cols = col.list$alloursn)
sn.seurat %>% VlnPlot(features = "nFeature_RNA", group.by = "cell_type_1", cols = col.list$alloursn)

# Filter low QC cells: keep nuclei with > 200 detected genes and < 15 % mito reads
{
  sn.seurat %<>% subset(nFeature_RNA > 200)
  sn.seurat %<>% subset(percent.mt < 15)
  sn.seurat %>% glimpse
}

# QC metrics projected on the UMAP
seurat.plots[["basicQC"]] <- sn.seurat %>%
  FeaturePlot(features = c("nCount_RNA", "nFeature_RNA", "percent.mt", "percent.rib"), reduction = "umap", pt.size = 1, max.cutoff = "q99", label = TRUE, ncol = 2) & NoAxes()
```

A look at the dataset.
```{r sn seurat plots}
# UMAP coloured by the cell_type_1 annotation, with cluster labels
seurat.plots[["UMAP_Celltype1"]] <- DimPlot(
  sn.seurat,
  reduction = "umap", pt.size = 1, label = TRUE,
  group.by = "cell_type_1", cols = col.list$alloursn
) + NoLegend()

ggsave(
  filename = file.path(ws, "UMAPplot_ourSN_AllCelltypes_labeled.tiff"),
  plot = seurat.plots[["UMAP_Celltype1"]],
  device = "tiff", dpi = 300, units = "cm", width = 12, height = 12
)

# Same UMAP without labels, legend, axes, title or background
seurat.plots[["UMAP_Celltype1_v2"]] <- DimPlot(
  sn.seurat,
  reduction = "umap", pt.size = 0.25, label = FALSE,
  group.by = "cell_type_1", cols = col.list$alloursn
) +
  NoLegend() +
  NoAxes() +
  ggtitle(label = NULL) +
  theme(plot.background = element_blank(), panel.background = element_blank())

ggsave(
  filename = file.path(ws, "UMAPplot_ourSN_AllCelltypes.tiff"),
  plot = seurat.plots[["UMAP_Celltype1_v2"]],
  device = "tiff", dpi = 300, units = "cm", width = 12, height = 12
)
```

Cluster using RNA assay
```{r log(RNA)-driven UMAP and clustering}
# Normalize and scale
{
  DefaultAssay(sn.seurat) <- "RNA"
  sn.seurat %<>% NormalizeData(normalization.method = "LogNormalize", verbose = TRUE)
  sn.seurat %<>% FindVariableFeatures(selection.method = "vst", nfeatures = 3000)
  sn.seurat %<>% ScaleData()
}

# Explore the gene expression distribution across cells
## per-gene totals of the log-normalized expression (`data` slot), computed once
rna_gene_totals <- sn.seurat[["RNA"]]@data %>% rowSums()
rna_gene_totals %>% summary
rna_gene_totals %>% sort(decreasing = TRUE) %>% head(50) ## the most expressed genes overall
rna_gene_totals %>% hist(breaks = 100)

## gene exp positivity across cells (% of cells with non-zero expression, per gene)
seurat.results[["nonzero genes"]] <- (sn.seurat[["RNA"]]@data > 0) %>% rowMeans() * 100
seurat.results[["nonzero genes"]] %>% hist(breaks = 100) ## >30k genes are expressed in less than 1 % of cells

## closer look at the variable genes
seurat.results[["rna_varfeat"]] <- sn.seurat %>% HVFInfo(assay = "RNA")
## top 30 variance contributors; slice_max() ranks explicitly by the named
## column instead of relying on top_n()'s implicit "last column" weighting
seurat.results[["rna_varfeat"]] %>% slice_max(order_by = variance.standardized, n = 30)
sn.seurat %>% VariableFeaturePlot(assay = "RNA")

## select variable features with standardized variance >1.5
selected.var.features <- rownames(seurat.results[["rna_varfeat"]])[(seurat.results[["rna_varfeat"]][["variance.standardized"]] > 1.5)]
selected.var.features %<>% stringr::str_subset(pattern = "^mt-|^Mt-", negate = TRUE) # remove mitochondrial genes

## how many cells express these highly variable genes?
var_gene_positivity <- seurat.results[["nonzero genes"]][names(seurat.results[["nonzero genes"]]) %in% selected.var.features]
var_gene_positivity %>%
  hist(breaks = 100, main = "In how many % of cells are the variable genes expressed?", xlab = "% of gene positive cells")
sum(var_gene_positivity < 5) ## 1k (= half) var genes are expressed in less than 5% of cells!

## how many counts do these low cell-present genes have?
lowpresent.vars <- var_gene_positivity %>% sort() %>% head(400) ## take the bottom 400 genes (= <1% cell-positive genes)
seurat.results[["gene_sums"]] <- sn.seurat[["RNA"]]@counts %>% rowSums()
seurat.results[["gene_sums"]][names(seurat.results[["gene_sums"]]) %in% names(lowpresent.vars)] %>%
  hist(breaks = 100, main = "Summed expression of variable features for the bottom % cell-positive genes", xlab = "Summed gene count") ## they have maximum of 300 counts across 8k cells!

## in conclusion, let's evade using the bottom 400 genes, as we want to include only more robust expressers
present.var.features <- selected.var.features[!(selected.var.features %in% names(lowpresent.vars))]

## PCA
## print() is required for every plot that is not the last expression of the
## `{}` block, otherwise it is never displayed.
{
  sn.seurat %<>% RunPCA(assay = "RNA", features = present.var.features, npcs = 50, verbose = TRUE)
  print(sn.seurat %>% DimPlot(reduction = "pca", dims = c(1, 2), cols = col.list$alloursn))
  print(sn.seurat %>% ElbowPlot(ndims = 50) | sn.seurat %>% PC_var_explained())
  sn.seurat %>% DimHeatmap(cells = 200, reduction = "pca", balanced = TRUE, dims = 1:15)
}

## Explore the 3D PCA
{
  dims <- c(1, 2, 3) ## PCs to plot
  df <- data.frame(
      "ElementName" = sn.seurat@reductions$pca@cell.embeddings %>% rownames(),
      "X" = sn.seurat@reductions$pca@cell.embeddings[, dims[1]],
      "Y" = sn.seurat@reductions$pca@cell.embeddings[, dims[2]],
      "Z" = sn.seurat@reductions$pca@cell.embeddings[, dims[3]],
      "Annotation" = sn.seurat$cell_type_1
  )
  # (optional) subsample to increase plotting speed; slice_sample() keeps all
  # rows when fewer than 1000 are available, where sample_n() would error
  df %<>% slice_sample(n = 1000)
  df %>% plot_ly(x = .[["X"]], y = .[["Y"]], z = .[["Z"]], type = "scatter3d", mode = "markers", color = .[["Annotation"]], colors = col.list$alloursn)
}

## UMAP
pc_dims <- 1:15
{
  sn.seurat %<>% RunUMAP(reduction = "pca", dims = pc_dims, verbose = TRUE)
  sn.seurat %>% DimPlot(reduction = "umap", group.by = "cell_type_1", label = TRUE)
}

## Clustering at several resolutions (algorithm = 1)
{
  sn.seurat %<>% FindNeighbors(reduction = "pca", dims = pc_dims, verbose = TRUE)
  for (res in c(0.6, 0.8, 1, 1.2, 1.4)) {
    sn.seurat %<>% FindClusters(resolution = res, algorithm = 1, verbose = TRUE)
  }
}

seurat.plots[["UMAP_RNA_res1.4"]] <- sn.seurat %>% DimPlot(reduction = "umap", pt.size = 1, group.by = "RNA_snn_res.1.4", cols = col.list$big_col_palette, label = TRUE)
```

Cluster using SCT
```{r SCT-driven UMAP and clustering}
# Start from a clean state: RNA as default assay, any previous SCT assay dropped
DefaultAssay(sn.seurat) <- "RNA"
sn.seurat[["SCT"]] <- NULL
sn.seurat %<>% SetIdent(value = "cell_type_1")

## compute SCT assay and identify variable features
sn.seurat %<>% SCTransform(variable.features.n = 2000, return.only.var.genes = TRUE)
sn.seurat %>% VariableFeaturePlot()

## let's hand-select genes that are robustly expressed, are of nuclear origin
{
  seurat.results[["sct_varfeat"]] <- sn.seurat[["SCT"]] %>% VariableFeatures()
  seurat.results[["sct_varfeat"]] %<>% stringr::str_subset(pattern = "^mt-|^Mt-", negate = TRUE) # remove mitochondrial genes
  seurat.results[["sct_varfeat"]] %>% length()
}

## in how many cells are the variable genes present?
## (reuses the per-gene positivity stored in seurat.results[["nonzero genes"]])
seurat.results[["nonzero genes"]][names(seurat.results[["nonzero genes"]]) %in% seurat.results[["sct_varfeat"]]] %>%
  hist(breaks = 100, main = "% of cells expressing the variable gene")

## PCA
## print() is required for every plot that is not the last expression of the
## `{}` block, otherwise it is never displayed.
{
  sn.seurat %<>% RunPCA(assay = "SCT", features = seurat.results[["sct_varfeat"]], npcs = 50, verbose = TRUE, reduction.name = "sct.pca")
  print(sn.seurat %>% DimPlot(reduction = "sct.pca", dims = c(1, 2), cols = col.list$alloursn))
  # ElbowPlot() defaults to reduction = "pca" (the RNA-based PCA); point it at
  # the SCT-based reduction computed above.
  print(sn.seurat %>% ElbowPlot(ndims = 50, reduction = "sct.pca"))
  # NOTE(review): PC_var_explained() is a project helper - confirm whether it
  # reads "sct.pca" or the RNA-based "pca" when called without arguments.
  print(sn.seurat %>% PC_var_explained())
  sn.seurat %>% DimHeatmap(cells = 200, reduction = "sct.pca", balanced = TRUE, dims = 1:15)
}

# exploring PCA: a few PC pairs, each printed explicitly
for (pc_pair in list(c(1, 2), c(1, 3), c(3, 4), c(10, 11))) {
  print(DimPlot(sn.seurat, reduction = "sct.pca", dims = pc_pair, cols = col.list$alloursn))
}

## exploring 3D PCA plot
{
  dims <- c(11, 12, 13) ## PCs to plot
  df <- data.frame(
      "ElementName" = sn.seurat@reductions$sct.pca@cell.embeddings %>% rownames(),
      "X" = sn.seurat@reductions$sct.pca@cell.embeddings[, dims[1]],
      "Y" = sn.seurat@reductions$sct.pca@cell.embeddings[, dims[2]],
      "Z" = sn.seurat@reductions$sct.pca@cell.embeddings[, dims[3]],
      "Annotation" = sn.seurat$cell_type_1
  )
  df %<>% sample_n(size = 1000) # (optional) to increase plotting speed
  df %>% plot_ly(x = .[["X"]], y = .[["Y"]], z = .[["Z"]], type = "scatter3d", mode = "markers", color = .[["Annotation"]], colors = col.list$alloursn)
}

## UMAP
pc_dims <- 1:15
{
  sn.seurat %<>% RunUMAP(reduction = "sct.pca", dims = pc_dims, verbose = TRUE, reduction.name = "sct.umap")
  sn.seurat %>% DimPlot(reduction = "sct.umap", group.by = "cell_type_1", label = TRUE)
}
## Clustering
{
  sn.seurat %<>% FindNeighbors(reduction = "sct.pca", dims = pc_dims, verbose = TRUE)
  for (res in c(0.6, 0.8, 1, 1.2)) {
    sn.seurat %<>% FindClusters(resolution = res, algorithm = 1, verbose = TRUE)
  }
}

## plot the different resolutions
## (entries UMAP_SCT_res0.6, UMAP_SCT_res0.8, UMAP_SCT_res1, UMAP_SCT_res1.2)
for (res in c(0.6, 0.8, 1, 1.2)) {
  seurat.plots[[paste0("UMAP_SCT_res", res)]] <- sn.seurat %>%
    DimPlot(reduction = "sct.umap", group.by = paste0("SCT_snn_res.", res), label = TRUE)
}

```

Since both variants produce similar output, we decided to stick with the RNA assay, as it is easier to interpret and does not require re-normalization after every subset. Also, Ahlmann-Eltze & Huber, Nat Methods (2023) discussed that, although SCT does not perform poorly, the robustness of log(RNA) for analysis overshadows these more sophisticated tools.

```{r Removing SCT assay}
# SCTransform() leaves "SCT" as the default assay, and a Seurat object does not
# allow its default assay to be deleted, so switch back to RNA before removal.
DefaultAssay(sn.seurat) <- "RNA"
sn.seurat[["SCT"]] <- NULL
```

Exploring seurat to filter low quality clusters
```{r explore for cluster removal}
# Use the RNA-based res 1.4 clustering as the active identity for this chunk
sn.seurat %<>% SetIdent(value = "RNA_snn_res.1.4")

## UMAP with current RNA clustering
seurat.plots[["UMAP_RNA_res1.4"]] <- sn.seurat %>%
  DimPlot(reduction = "umap", group.by = "RNA_snn_res.1.4", label = TRUE, pt.size = 1, cols = col.list$big_col_palette) & NoLegend() & NoAxes()

## UMAP with Conditions
seurat.plots[["UMAP_Cond"]] <- sn.seurat %>%
  DimPlot(reduction = "umap", group.by = "Condition", pt.size = 1, cols = col.list$Condition, shuffle = TRUE) & NoAxes()

## dittoBarplot with Condition
seurat.plots[["dittoBarplot_Condition"]] <- sn.seurat %>%
  dittoBarPlot(var = "Condition", group.by = "RNA_snn_res.1.4", color.panel = col.list$Condition, retain.factor.levels = TRUE) + xlab("")

## dittoBarplot with Phase
seurat.plots[["dittoBarplot_Phase"]] <- sn.seurat %>%
  dittoBarPlot(var = "Phase", group.by = "RNA_snn_res.1.4", color.panel = col.list$Phase, retain.factor.levels = TRUE) + xlab("")

## previous cell_type_1 annotation in the new clusters
seurat.plots[["UMAP_celltype1"]] <- sn.seurat %>%
  DimPlot(reduction = "umap", group.by = "cell_type_1", label = TRUE) & NoAxes() + theme(legend.position = "none")

## Markers of the res1.4 clusters (positive only, adjusted p < 0.01),
## returned as one data frame per cluster
seurat.results[["Markers_res1.4"]] <- sn.seurat %>%
  FindAllMarkers(assay = "RNA", logfc.threshold = 1, min.pct = 0.3, only.pos = TRUE, verbose = TRUE) %>%
  filter(p_val_adj < 0.01) %>%
  arrange(p_val_adj) %>%
  split(f = .[["cluster"]])

## Background? Even after soupX, there is a lot of neuronal RNA in all cells!
seurat.plots[["NeuronalBackground"]] <- (sn.seurat %>% FeaturePlot(
  features = c("Meg3", "Rbfox3"), max.cutoff = "q99", repel = TRUE, label.size = 1,
  reduction = "umap", label = TRUE, raster = FALSE, pt.size = 0.2, ncol = 1, cols = c("white", "#AE0900")) & NoAxes() & NoLegend()) | (
    sn.seurat %>% DotPlot(features = c("Meg3", "Rbfox3"), cols = c("white", "#AE0900"), assay = "RNA")
  )

## Marker genes and their FeaturePlot
seurat.results[["canonical_markers"]] <- c(
  "Rbfox3", "Slc17a7",  "Gad1", # Neurons all, GLUT, GABA
  "Sox4", # Neuroblast
  "Ccdc153", # Ependymal cells
  "Slc1a3", "Gfap", # Astrocyte
  "Plp1", "Klk6", # Oligodendrocyte
  "Serpina3n",
  "Pdgfra", # OPC
  "Csf1r", "Spp1", # Microglia
  "Ptgds", # VLMC
  "Flt1" # Vasculature
)

seurat.plots[["FeaturePlot_CanMarkers"]] <- sn.seurat %>% FeaturePlot(
  features = seurat.results[["canonical_markers"]], max.cutoff = "q99", reduction = "umap",
  label = FALSE, raster = FALSE, pt.size = 0.2, ncol = 5, cols = c("white", "#AE0900")) & NoAxes() & theme(legend.position = "none")

## VlnPlot for Neuronal markers
## `assay` selects the RNA assay and `layer` the normalized data; the previous
## `layer = "RNA"` passed an assay name where a layer name is expected.
seurat.plots[["VlnPlot_NeuronalMarkers"]] <- sn.seurat %>%
  VlnPlot(features = c("Rbfox3", "Slc17a7",  "Gad1"), group.by = "RNA_snn_res.1.4", pt.size = 0, assay = "RNA", layer = "data", raster = FALSE, ncol = 1, cols = col.list$big_col_palette) & geom_boxplot(width = 0.1, fill = "white", outlier.size = 0.2) & NoLegend()

## VlnPlot for Glial markers
seurat.plots[["VlnPlot_GlialMarkers"]] <- sn.seurat %>%
  VlnPlot(features = c("Slc1a3", "Plp1", "Pdgfra"), group.by = "RNA_snn_res.1.4", pt.size = 0, assay = "RNA", layer = "data", raster = FALSE, ncol = 1, cols = col.list$big_col_palette) & geom_boxplot(width = 0.1, fill = "white", outlier.size = 0.2) & NoLegend()


## Ridgeplot on nFeatures per cluster (observing double or triple peaks?)
seurat.plots[["RidgePlot_nFeature_RNA"]] <- sn.seurat %>%
  RidgePlot(features = "nFeature_RNA", cols = col.list$big_col_palette) & NoLegend()

## Ridgeplot on percent mt
seurat.plots[["RidgePlot_percentMT"]] <- sn.seurat %>%
  RidgePlot(features = "percent.mt", cols = col.list$big_col_palette) + theme(legend.position = "none")


## FeaturePlot percent mt
seurat.plots[["FeatPlot_mito"]] <- sn.seurat %>% FeaturePlot(
  features = "percent.mt", reduction = "umap",
  max.cutoff = "q99", label = FALSE, raster = FALSE, pt.size = 0.5,
  ncol = 1, cols = RColorBrewer::brewer.pal(n = 9, name = "Blues")) + theme(legend.position = "none")

## featureplot percent ribosomal
seurat.plots[["FeatPlot_rib"]] <- sn.seurat %>% FeaturePlot(
  features = "percent.rib", reduction = "umap",
  max.cutoff = "q99", label = FALSE, raster = FALSE, pt.size = 0.5,
  ncol = 1, cols = RColorBrewer::brewer.pal(n = 9, name = "Greens")) + theme(legend.position = "none")


## ScatterPlot
sn.seurat %>% FeatureScatter(feature1 = "nCount_RNA", feature2 = "Apoe", group.by = "cell_type_1", cols = col.list$alloursn, pt.size = 1.5)


# == Wrap Plots ==
# Panels are placed by the letters of `design`, in the order listed below.
# NOTE(review): the nuclei count in the subtitle is hard-coded - confirm it
# still matches ncol(sn.seurat) at this point of the analysis.
seurat.plots[["WrapPlots_beforeAnnotation"]] <- wrap_plots(seurat.plots[c("UMAP_celltype1", "UMAP_RNA_res1.4", "dittoBarplot_Condition",
                                                                          "FeaturePlot_CanMarkers", "RidgePlot_nFeature_RNA", "RidgePlot_percentMT",
                                                                          "FeatPlot_mito", "FeatPlot_rib", "dittoBarplot_Phase"
                                                                          )], ncol = 6,
                                                           design = "AABBCC
                                                           DDDEEF
                                                           DDDEEF
                                                           GGHHII") + plot_annotation(
                                                             title = "Zucha 2023: Exploratory view pre-annotation",
                                                             subtitle = "After basic QC. n = 7,558 nuclei.")

seurat.plots[["WrapPlots_beforeAnnotation"]] %>% ggsave_tiff(plotname = "WrapPlots_sn_Zucha2023_AfterBasicQC_preAnnotation", height = 30, width = 30)

```

Cluster annotation
```{r cluster annotation and save QC filtered seurat}
# annotate
# One row per RNA_snn_res.1.4 cluster: broad cell type and numbered subtype.
# Clusters absent from this table end up as NA in both columns.
cluster_annotation <- tibble::tribble(
  ~cluster, ~Celltypes,         ~CelltypesDetailed,
  "0",      "OLs",              "OLs1",
  "1",      "NeuronsGABA",      "NeuronsGABA1",
  "2",      "NeuronsGABA",      "NeuronsGABA2",
  "3",      "NeuronsGLUT",      "NeuronsGLUT1",
  "4",      "Astrocytes",       "Astrocytes",
  "5",      "NeuronsGLUT",      "NeuronsGLUT2",
  "6",      "NeuronsGABA",      "NeuronsGABA3",
  "7",      "NeuronsGABA",      "NeuronsGABA4",
  "8",      "NeuronsGLUT",      "NeuronsGLUT3",
  "9",      "OLs",              "OLs2",
  "10",     "Microglia",        "Microglia1",
  "11",     "NeuronsGLUT",      "NeuronsGLUT4",
  "12",     "NeuronsGLUT",      "NeuronsGLUT5",
  "13",     "NeuronsGABA",      "NeuronsGABA5",
  "14",     "NeuronsGLUT",      "NeuronsGLUT6",
  "15",     "OPCs",             "OPCs",
  "16",     "NeuronsGLUT",      "NeuronsGLUT7",
  "17",     "EndothelialCells", "EndothelialCells",
  "18",     "NeuronsGLUT",      "NeuronsGLUT8",
  "19",     "VLMCs",            "VLMCs",
  "20",     "debris",           "debris",
  "21",     "Astrocytes",       "Astrocytes2",
  "22",     "EpendymalCells",   "EpendymalCells",
  "23",     "NeuronsGLUT",      "NeuronsGLUT9",
  "24",     "Neuroblasts",      "Neuroblasts",
  "25",     "NeuronsGABA",      "NeuronsGABA6",
  "26",     "Microglia",        "Microglia2"
)

# Row of the annotation table for every nucleus (NA when the cluster is unlisted)
annotation_row <- match(as.character(sn.seurat@meta.data$RNA_snn_res.1.4), cluster_annotation$cluster)
sn.seurat@meta.data$Celltypes <- factor(cluster_annotation$Celltypes[annotation_row])
sn.seurat@meta.data$CelltypesDetailed <- factor(cluster_annotation$CelltypesDetailed[annotation_row])

## remove debris
sn.seurat %<>% SetIdent(value = "Celltypes")
sn.seurat %<>% subset(idents = c("debris"), invert = TRUE)
sn.seurat$Celltypes %<>% droplevels()
sn.seurat$CelltypesDetailed %<>% droplevels()

# Re-cluster and re-embed the debris-free object; the new embedding is stored
# under "umap.filt", leaving the original "umap" reduction untouched.
{
  sn.seurat %<>% FindNeighbors(reduction = "pca", dims = 1:15)
  sn.seurat %<>% FindClusters(resolution = 1, cluster.name = "RNA_DebrisFilt_res1")
  sn.seurat %<>% RunUMAP(reduction = "pca", dims = 1:15, reduction.name = "umap.filt")
}
# Plot on the freshly computed "umap.filt" embedding (previously the stale
# pre-filtering "umap" was shown and "umap.filt" was never used).
seurat.plots[["UMAP_DebrisFiltered"]] <- sn.seurat %>%
  DimPlot(reduction = "umap.filt", group.by = "RNA_DebrisFilt_res1", label = TRUE, pt.size = 1, cols = col.list$big_col_palette) & NoLegend() & NoAxes()

# save the filtered seurat
saveRDS(sn.seurat, file = "data/Seurat_sn_MCAO_Zucha2023_QCFiltered.Rds")
```

Markers of annotated clusters
```{r markers of annotated clusters}
DefaultAssay(sn.seurat) <- "RNA"
sn.seurat <- SetIdent(sn.seurat, value = "Celltypes")

# Positive markers per annotated cell type (adjusted p < 0.01), most
# significant first, stored as one data frame per cell type.
celltype_markers <- FindAllMarkers(
  sn.seurat,
  logfc.threshold = 1, assay = "RNA", verbose = TRUE, only.pos = TRUE
)
celltype_markers <- celltype_markers %>%
  filter(p_val_adj < 0.01) %>%
  arrange(p_val_adj)
seurat.results[["allmarkers"]] <- split(celltype_markers, f = celltype_markers[["cluster"]])
```

Summarizing plots of finalized sn dataset
```{r summary plots of whole sn dataset}
DefaultAssay(sn.seurat) <- "RNA"
sn.seurat <- SetIdent(sn.seurat, value = "Celltypes")

# UMAP of Celltypes
seurat.plots[["UMAP of Celltypes"]] <- DimPlot(
  sn.seurat,
  reduction = "umap", pt.size = 1, label = TRUE,
  group.by = "Celltypes", cols = col.list$Celltypes
) + NoLegend() + NoAxes()

# UMAP of Cell subtypes
seurat.plots[["UMAP of Cell Subtypes"]] <- DimPlot(
  sn.seurat,
  reduction = "umap", pt.size = 1, label = FALSE,
  group.by = "CelltypesDetailed", cols = col.list$CelltypesDetailed
) + NoLegend() + NoAxes()

# Cell type frequencies
seurat.plots[["Barplot of Celltypes"]] <- total_barplot(sn.seurat, groupvar = "Celltypes", ylim = 2550)

# Cell Subtype frequencies
seurat.plots[["Barplot of CellSubtypes"]] <- total_barplot(sn.seurat, groupvar = "CelltypesDetailed", ylim = 900)

# Marker panel for the dot plot, grouped by the population each gene marks
features <- c(
  "Stmn1", "Sox4", "Ccnd2", "Dcx",                     # Neuroblast
  "Usp29", "Snhg11", "Nhs",                            # Other Neurons
  "Gad1", "Gad2",                                      # GABAergic
  "Meg3", "Rbfox3", "Nrgn", "Slc17a6", "Slc17a7",      # Glutamatergic
  "Ccdc153", "Foxj1",                                  # Ependymal
  "Aqp4", "Aldh1l1", "Slc1a3", "Gfap",                 # Astrocyte
  "Pdgfra", "Cspg4", "Vcan",                           # OPC
  "Mbp", "Mobp", "Plp1", "Cldn11",                     # Oligo
  "Cx3cr1", "Csf1r",                                   # Microglia
  "Slc47a1", "Mgp", "Ptgds",                           # VLMC
  "Pdgfrb", "Flt1", "Pecam1"                           # Vasculature
)

seurat.plots[["Dotplot_Markers"]] <- dittoSeq::dittoDotPlot(
  sn.seurat,
  vars = features, group.by = "Celltypes", scale = TRUE
)

# Expression of a single gene of interest across cell types
dittoSeq::dittoPlot(
  sn.seurat,
  var = "Apoe", group.by = "Celltypes", plots = c("ridgeplot", "jitter"),
  assay = "RNA", slot = "data", color.panel = col.list$Celltypes
) + theme(legend.position = "none")


```

Population proportions 
```{r proportions}
## proportions
library(dittoSeq)
sn.seurat$Condition %>% table

sn.seurat %>% dittoBarPlot(var = "Condition", group.by = "Celltypes", scale = "count")
sn.seurat %>% dittoBarPlot(var = "Condition", group.by = "Celltypes", scale = "percent")

sn.seurat[["Condition2"]] <- case_when( ## simplifying the conditions
  sn.seurat$Condition == "sham" ~ "Ctrl",
  sn.seurat$Condition == "D1" ~ "D1",
  sn.seurat$Condition == "D3" ~ "D3",
  sn.seurat$Condition == "D7" ~ "D7") %>% factor(levels = c("Ctrl", "D1", "D3", "D7"))

# Condition x cell-type contingency table (rows = Condition2, columns =
# cell_type_1_simplified). table() replaces the nested counting loop; class
# and dimnames names are stripped so that melt() yields Var1 / Var2 / value.
ids <- sn.seurat@meta.data$Condition2 %>% levels
categories <- sn.seurat@meta.data$cell_type_1_simplified %>% levels
counts <- unclass(table(sn.seurat@meta.data$Condition2, sn.seurat@meta.data$cell_type_1_simplified))
dimnames(counts) <- unname(dimnames(counts))

counts_mm_0 <- reshape2::melt(counts, id = "rownames") ## absolute counts

# Raw counts per cell type and condition (hence "Number", not "Proportion")
ggplot(data = counts_mm_0, aes(x = Var2, y = value, fill = Var1)) +
  geom_col(position = "dodge") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  ylab("Number of cells")

# Normalize each condition (row) to sum to 1
counts <- counts / rowSums(counts)
counts_mm_1 <- reshape2::melt(counts, id = "rownames") ## within-condition proportions

## BarPlot on percentage of count proportions
## (the stray trailing `; p` was removed: no object `p` exists at this point)
seurat.plots[["BarPlot_sn_CellTypes1simp_bySample"]] <- ggplot(data = counts_mm_1, aes(x = Var2, y = value, fill = Var1)) +
    geom_col(position = "dodge") +
    theme_classic() +
    theme(axis.text.x = element_text(angle = 105, hjust = 1, vjust = 0), axis.text.y = element_text(angle = 90, hjust = 0.5, vjust = 0.5)) +
    ylab("Proportion of nuclei") +
    xlab("") + ggtitle("") + NoLegend() +
    theme(axis.text = element_text(family = "OpenSans", size = 25, color = "black"),
        axis.title = element_text(family = "OpenSans", size = 25),
        plot.background = element_rect(fill = "transparent", color = NA),
        panel.background = element_rect(fill = "transparent", color = NA)) +
    scale_fill_manual(values = c("#525475", "#F1D7AB", "#E6B188", "#D18080"))


seurat.plots[["BarPlot_sn_CellTypes1simp_bySample"]]
```

=== Astroependymal populations ===

Loading the seurat
```{r seurat load}
# Astroependymal subset; SCT is made the default assay for this section
astro.seurat <- readRDS(file = "data/seurat_ASTRO_names_soupX.rds")
DefaultAssay(object = astro.seurat) <- "SCT"
```

```{r seurat object clean up}
astro.seurat %<>% SetIdent(value = "cell_type_2")
astro.seurat$cell_type_2 %>% levels
astro.seurat %<>% subset(idents = "mix", invert = TRUE) ## removing the Meg3+ astrocytes (maternal non-coding RNA), as in Habib et al 2020

# recode the cell type annotations and conditions to display names and fix
# their factor order; values not listed in `levels` become NA
astro.seurat@meta.data %<>%
  mutate(cell_type_2 = recode(cell_type_2,
                              "ASTRO_TE" = "TE Astrocytes",
                              "ASTRO_DE" = "DE Astrocytes",
                              "progenitor Astro" = "Activated Astrocytes",
                              "reactive Astro" = "Reactive Astrocytes",
                              "EPEN" = "Ependymal",
                              "NB" = "Neuroblasts",
                              "CHORO" = "ChoroidPlexus"
                              ) %>%
           factor(levels = c("Neuroblasts",
                             "Ependymal",
                             "ChoroidPlexus",
                             "TE Astrocytes",
                             "DE Astrocytes",
                             "Activated Astrocytes",
                             "Reactive Astrocytes")),
         Condition = recode(Condition,
                            "sham" = "Sham",
                            "D1" = "1DPI",
                            "D3" = "3DPI",
                            "D7" = "7DPI") %>%
           factor(levels = c("Sham",
                             "1DPI",
                             "3DPI",
                             "7DPI"))
         )
astro.seurat %<>% SetIdent(value = "cell_type_2")

# add ribo and mito percentages
# Patterns are anchored at the start of the gene symbol, like the QC of the
# full dataset, so symbols merely containing "mt-", "Rpl" or "Rps" are not
# counted. No `assay` is given, so the object's default assay is used.
astro.seurat %<>% PercentageFeatureSet(pattern = "^mt-", col.name = "percent_mt")
astro.seurat %<>% PercentageFeatureSet(pattern = "^Rpl|^Rps", col.name = "percent_rib")

```

Overall Markers
```{r Markers}
# Positive markers of every astroependymal population (SCT assay), ordered by
# decreasing log2 fold change and stored as one data frame per population.
astro_markers <- FindAllMarkers(
  astro.seurat,
  logfc.threshold = 0.58,
  assay = "SCT",
  verbose = TRUE,
  only.pos = TRUE
)
astro_markers <- arrange(astro_markers, desc(avg_log2FC))
astro_markers <- split(astro_markers, f = astro_markers[["cluster"]])
seurat.results[["Markers_ast_all"]] <- setNames(
  astro_markers,
  paste0("SN_Ast_", names(astro_markers))
)

# One sheet per population
write.xlsx(
  seurat.results[["Markers_ast_all"]],
  file = file.path(ws, "Markers_sn_AstroEpe.xlsx"),
  overwrite = TRUE
)

FeaturePlot(astro.seurat, features = c("Dcx", "Foxj1", "Tmem72"), label = TRUE)

## dotplot
levels(astro.seurat$cell_type_2)

# Marker panel, ordered to follow the cell_type_2 factor levels
features_test <- c(
  "Dcx", "Nrxn3", "Adarb2",                                            # Neuroblasts
  "Foxj1", "Adamts20", "Tmem212",                                      # Ependymal
  "Tmem72", "Htr2c", "Otx2os1",                                        # Choroid Plexus cells
  "Grm3", "Rgs7", "Htra1",                                             # TE Astro
  "Slc1a3",                                                            # astro general
  "Gria1", "Fry", "Slc6a11", "Mgat4c",                                 # DE Astro
  "Ogt", "Fam107a", "Cox8a", "Aldoc", "Shank1", "Nrgn",                # Activated Astro
  "Ctsd", "Gfap", "Clu", "Apoe", "Ptn", "Gpm6b", "Cd9", "Vim", "Thbs4" # Reactive astro
)

p <- DotPlot(
  astro.seurat,
  assay = "SCT", features = features_test, group.by = "cell_type_2",
  cols = c("#FFF7FB", "#014636")
) +
  theme_mk +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)) +
  xlab(NULL) +
  ylab(NULL)
print(p)
```

Gene ontology analysis for Reactive Astrocytes
```{r Reactive AST markers and GO}
# Direct marker comparison between Reactive and Homeostatic (DE + TE) AST.
# Keeps positive markers with adjusted p < 0.01 and drops symbols starting
# with "Rp" or "mt".
# NOTE(review): "^Rp" also removes non-ribosomal genes such as Rph3a or Rpa1 -
# confirm whether "^Rp[ls]" was intended.
seurat.results[["Markers_ast_ReactiveAST"]] <- astro.seurat %>%
  FindMarkers(
    ident.1 = "Reactive Astrocytes",
    ident.2 = c("DE Astrocytes", "TE Astrocytes"),
    logfc.threshold = 0.58,
    only.pos = TRUE,
    assay = "SCT",
    verbose = TRUE
  ) %>%
  arrange(desc(avg_log2FC)) %>%
  rownames_to_column(var = "GeneSymbol") %>%
  filter(p_val_adj < 0.01 & !str_detect(GeneSymbol, "^Rp") & !str_detect(GeneSymbol, "^mt"))

# Explicit overwrite, like every other export in this notebook, so the chunk
# can be re-run regardless of the installed openxlsx default.
write.xlsx(seurat.results[["Markers_ast_ReactiveAST"]],
           file = file.path(ws, "Markers_sn_ReacAstro_vs_HomeoAstro.xlsx"), overwrite = TRUE)

# Gene Ontology over-representation analysis of the marker genes against all
# genes of the SCT assay, followed by simplify() on the enrichment result
library(clusterProfiler)
library("org.Mm.eg.db", character.only = TRUE)
organism <- org.Mm.eg.db
background <- rownames(astro.seurat[["SCT"]])
marker_genes <- seurat.results[["Markers_ast_ReactiveAST"]] %>% pull(GeneSymbol)
seurat.results[["ORA_ast_ReactiveAST"]] <-
  clusterProfiler::enrichGO(gene = marker_genes,
                            universe = background, # 'potential DEGs'
                            keyType = "SYMBOL", # or ENSEMBL
                            OrgDb = organism,
                            ont = "ALL", # BP = biological process, MF = molecular function, CC = cellular component
                            pAdjustMethod = "fdr",
                            pvalueCutoff = 0.01, #
                            qvalueCutoff = 0.2,
                            readable = FALSE,
                            minGSSize = 5) %>%
  clusterProfiler::simplify()

# Add as module: the score is stored in the "ReactiveAST_module1" column
astro.seurat %<>% AddModuleScore(features = list(marker_genes), name = "ReactiveAST_module", assay = "SCT")
astro.seurat %>% FeaturePlot(features = "ReactiveAST_module1", reduction = "umap")
astro.seurat %>%
  enh_vlnplot(feature = "ReactiveAST_module1",
              grouping = "cell_type_2",
              colors = col.list$astro_cols,
              compare_means = FALSE)

# save the results
seurat.results[["ORA_ast_ReactiveAST"]] %>%
  write.xlsx(file = file.path(ws, "ORAlist_sn_ReacAstro_vs_HomeoAstro.xlsx"), overwrite = TRUE)
```

Gene Ontology analysis Activated Astrocytes
```{r Activated AST markers and GO}
# Direct marker comparison between Activated and Homeostatic (DE + TE) AST.
# Keeps positive markers with adjusted p < 0.01 and drops symbols starting
# with "Rp" or "mt".
activated_markers <- FindMarkers(
  astro.seurat,
  ident.1 = "Activated Astrocytes",
  ident.2 = c("DE Astrocytes", "TE Astrocytes"),
  logfc.threshold = 0.58,
  only.pos = TRUE,
  assay = "SCT",
  verbose = TRUE
)
seurat.results[["Markers_ast_ActivatedAST"]] <- activated_markers %>%
  arrange(desc(avg_log2FC)) %>%
  rownames_to_column(var = "GeneSymbol") %>%
  filter(
    p_val_adj < 0.01,
    !str_detect(GeneSymbol, "^Rp"),
    !str_detect(GeneSymbol, "^mt")
  )

# Gene Ontology over-representation analysis against all genes of the SCT assay
library(clusterProfiler)
library("org.Mm.eg.db", character.only = TRUE)
organism <- org.Mm.eg.db
background <- rownames(astro.seurat[["SCT"]])
marker_genes <- seurat.results[["Markers_ast_ActivatedAST"]][["GeneSymbol"]]
activated_ora <- clusterProfiler::enrichGO(
  gene = marker_genes,
  universe = background, # 'potential DEGs'
  keyType = "SYMBOL", # or ENSEMBL
  OrgDb = organism,
  ont = "ALL", # BP = biological process, MF = molecular function, CC = cellular component
  pAdjustMethod = "fdr",
  pvalueCutoff = 0.01,
  qvalueCutoff = 0.2,
  readable = FALSE,
  minGSSize = 5
)
seurat.results[["ORA_ast_ActivatedAST"]] <- clusterProfiler::simplify(activated_ora)

# Add as module: the score is stored in the "ActivatedAST_module1" column
astro.seurat <- AddModuleScore(
  astro.seurat,
  features = list(marker_genes), name = "ActivatedAST_module", assay = "SCT"
)
FeaturePlot(astro.seurat, features = "ActivatedAST_module1", reduction = "umap")
enh_vlnplot(
  astro.seurat,
  feature = "ActivatedAST_module1",
  grouping = "cell_type_2",
  colors = col.list$astro_cols,
  compare_means = FALSE
)

# save the results
write.xlsx(
  seurat.results[["ORA_ast_ActivatedAST"]],
  file = file.path(ws, "ORAlist_sn_ActivatedAstro_vs_HomeoAstro.xlsx"),
  overwrite = TRUE
)
```

Comparing markers from published datasets with our reactive population. Visualizing the metadata list in our spatial data.
```{r astrocytes metanalysis}
DefaultAssay(astro.seurat) <- "SCT"
# Reload the reactive-astrocyte marker table from the working space when it is
# not part of the results list
if (!("Markers_ast_ReactiveAST" %in% names(seurat.results))) {
  seurat.results[["Markers_ast_ReactiveAST"]] <- read_all_sheets(
    file = file.path(ws, "Markers_sn_ReacAstro_vs_HomeoAstro.xlsx")
  )
}

# Published marker lists: keep the sheets whose name matches "astro" or "DAA"
# and reduce each sheet to its first column as a character vector
seurat.results[["Metadata_reactive_astro"]] <-
  read_all_sheets(
    file = "data/SupplementaryTable4_Metanalysis_GlialReactivityMarkers.xlsx"
  ) %>%
  .[grep(pattern = "astro|DAA", x = names(.), ignore.case = TRUE)] %>%
  lapply(function(sheet) as.character(sheet[[1]]))

# Intersections of our reactive-astrocyte markers with each published list
meta.intersects <- Map(
  function(meta_name, meta_values) {
    our_markers <- pull(seurat.results[["Markers_ast_ReactiveAST"]], GeneSymbol)
    shared <- intersect(our_markers, meta_values)
    cat(crayon::bgBlack(paste0(length(shared), " genes intersect with ", meta_name, ".", "\n")))
    shared
  },
  names(seurat.results[["Metadata_reactive_astro"]]),
  seurat.results[["Metadata_reactive_astro"]]
)

# Hypergeometric test of each intersection (urn model: published genes are the
# red marbles, all other SCT genes the green ones, our markers are the draw)
meta.signif <- Map(
  function(meta_name, meta_values) {
    n_background <- nrow(astro.seurat[["SCT"]])
    n_published <- length(seurat.results[["Metadata_reactive_astro"]][[meta_name]])
    phyper(
      q = length(meta_values) - 1, ## overlap - 1, because lower.tail = FALSE gives P(X > q)
      m = n_published, ## number of red marbles in urn
      n = n_background - n_published, ## number of green marbles in urn
      k = nrow(seurat.results[["Markers_ast_ReactiveAST"]]), ## number of drawn marbles
      lower.tail = FALSE
    )
  },
  names(meta.intersects),
  meta.intersects
)
meta.signif <- p.adjust(meta.signif, method = "fdr")
meta.signif

# Gene intersection with Habib's Reactive Astrocytes
meta.intersects[["Habib2020_DAA_5XFAD"]]


# Module differences between the astrocyte populations
glimpse(astro.seurat@meta.data)
enh_vlnplot(
  astro.seurat,
  feature = "Astro_DAA_vs_homeostatic_Habib2020_up_1",
  grouping = "cell_type_2",
  idents = c("TE Astrocytes", "DE Astrocytes", "Reactive Astrocytes"),
  colors = col.list$astro_cols,
  compare_means = FALSE,
  ref.group = "Reactive Astrocytes",
  stat_test = "t.test",
  paired = FALSE
) +
  theme_mk +
  remove_grid +
  NoLegend() +
  xlab("MCAO-induced Astrocyte Populations") +
  ylab("Module Score") +
  ylim(c(-0.11, 0.9)) +
  labs(title = "DAA Signature") +
  # significance label is hard-coded text placed over the third group
  annotate(geom = "text", x = 3, y = 0.9, label = expression(italic("***P < 0.001")), size = 2.45)
```

Reactive astrocytes (ours and the published metadata signatures) in the spatial data
```{r astrocytes in spatial}
# Score every published astrocyte signature on the spatial data.
# The score columns are referenced below with a numeric suffix appended to the
# signature name (e.g. "Habib2020_DAA_5XFAD3").
# NOTE(review): no `assay` is passed here, whereas the microglia chunk scores
# on assay = "SCT" -- confirm the default assay of spatial.seurat is intended.
spatial.seurat %<>%
  AddModuleScore(
    features = seurat.results[["Metadata_reactive_astro"]],
    name = names(seurat.results[["Metadata_reactive_astro"]])
  )

# Score our own reactive and activated astrocyte marker sets on the spatial data
spatial.seurat %<>%
  AddModuleScore(
    features = list(
      seurat.results[["Markers_ast_ReactiveAST"]] %>% pull(GeneSymbol),
      seurat.results[["Markers_ast_ActivatedAST"]] %>% pull(GeneSymbol)
    ),
    name = c("Zucha_sn_reactive_astro", "Zucha_sn_activated_astro")
  )

## Show the astro modules
spatial.seurat %>% SpatialPlot(
  features = c("Habib2020_DAA_5XFAD3", "Zucha_sn_reactive_astro1", "Zucha_sn_activated_astro2"),
  images = c("Ctrl", "1DPI", "3DPI", "7DPI"),
  image.alpha = 0.1,
  stroke = 0,
  crop = TRUE
) &
  scale_fill_gradientn(colours = c("#4D4D4D", "#FFFFFF", "#016C59")) &
  # `range` must be named: an unnamed vector is taken as the scale name and
  # leaves the alpha range at its default
  scale_alpha_continuous(range = c(0.8, 1)) &
  theme_mk &
  remove_grid &
  NoLegend() &
  NoAxes()

# Pearson correlation between the published DAA score and our reactive score
cor.test(spatial.seurat@meta.data[["Habib2020_DAA_5XFAD3"]],
         spatial.seurat@meta.data[["Zucha_sn_reactive_astro1"]],
         method = "pearson")

# quick base-graphics look at the same relationship
plot(x = spatial.seurat@meta.data[["Habib2020_DAA_5XFAD3"]],
     y = spatial.seurat@meta.data[["Zucha_sn_reactive_astro1"]])

# Scatter plot with a linear fit.
# NOTE(review): the coefficient and P value in the annotations are hard-coded
# text, and the label uses rho although the test above is Pearson -- update
# them by hand whenever the data change.
spatial.seurat@meta.data %>%
  ggplot(aes(x = Zucha_sn_reactive_astro1, y = Habib2020_DAA_5XFAD3)) +
  geom_point(aes(color = Zucha_sn_reactive_astro1), size = 0.5) +
  scale_color_gradientn(colors = c("#3690C0", "#014636")) +
  geom_smooth(method = "lm", se = FALSE, color = "black") +
  annotate(geom = "text", x = 1.2, y = 0.2, label = expression(italic(rho == 0.946)), hjust = 1, size = 2.45) +
  annotate(geom = "text", x = 1.2, y = 0.1, label = expression(italic("***P < 0.001")), hjust = 1, size = 2.45) +
  xlab("Reactive Astrocyte signature") +
  ylab("DAA signature") +
  theme_mk +
  theme(legend.position = "none")
```

Projections of the regional origin of the astro populations, based on the Zeisel et al. annotation. Calculation of the projections is in a separate markdown.
```{r Brain region projections}
## list the available assays, then switch to the region-prediction assay
names(astro.seurat@assays)
DefaultAssay(astro.seurat) <- "prediction_ASTRO_region" ## azimuth projection based on Zeisel et al
dimnames(astro.seurat@assays$prediction_ASTRO_region)[1] ## brain region names

## where are the reactive astro populations from?
VlnPlot(astro.seurat, features = "Telencephalon", group.by = "cell_type_2") ## telencephalon
VlnPlot(
  astro.seurat,
  features = "Hypothalamus,Thalamus,Midbrain dorsal,Midbrain ventral,Pons,Medulla,Spinal cord",
  group.by = "cell_type_2"
) ## diencephalon


## telencephalon projection on the embedding
FeaturePlot(
  astro.seurat,
  features = "Telencephalon",
  pt.size = 0.25,
  cols = brewer.pal(n = 9, name = "BuGn")[3:9]
) +
  theme(plot.background = element_blank(), panel.background = element_blank())


## diencephalon projection on the embedding
FeaturePlot(
  astro.seurat,
  features = "Hypothalamus,Thalamus,Midbrain dorsal,Midbrain ventral,Pons,Medulla,Spinal cord",
  pt.size = 0.25,
  cols = brewer.pal(n = 9, name = "BuGn")[3:9]
) +
  theme(plot.background = element_blank(), panel.background = element_blank())

```

save processed astro for visual markdown
```{r save astro seurat}
# Persist the processed astrocyte object
saveRDS(object = astro.seurat, file = "data/seurat_ASTRO_1DP_11.rds")
```

=== Microglial populations. ===

Loading the microglial sn seurat
```{r seurat load}
# Read the microglial object and make SCT its default assay
mg.seurat <- readRDS(file = "data/seurat_MG_names_soupX.rds")
DefaultAssay(object = mg.seurat) <- "SCT"
```

```{r seurat clean up}
# Use cell_type_2 as identity and drop everything labelled "debris"
Idents(mg.seurat) <- "cell_type_2"
levels(mg.seurat$cell_type_2)
mg.seurat <- subset(mg.seurat, idents = "debris", invert = TRUE) ## remove dead cells

# Rename populations and conditions and fix their factor order
mg.seurat@meta.data <- mutate(
  mg.seurat@meta.data,
  cell_type_2 = factor(
    recode(cell_type_2,
           "homeostatic" = "Homeostatic",
           "TAM" = "Reactive"),
    levels = c("Homeostatic", "Reactive")
  ),
  Condition = factor(
    recode(Condition,
           "sham" = "Sham",
           "D1" = "1DPI",
           "D3" = "3DPI",
           "D7" = "7DPI"),
    levels = c("Sham", "1DPI", "3DPI", "7DPI")
  )
)

glimpse(mg.seurat@meta.data)
```

Marker genes, dotplot, volcano plot.
```{r Markers and Volcano}
Idents(mg.seurat) <- "cell_type_2"
# Positive markers per microglial population: sorted by log2FC, filtered on
# adjusted P value, then split into one table per cluster
seurat.results[["Markers_sn_MG"]] <- FindAllMarkers(
  mg.seurat,
  assay = "SCT",
  logfc.threshold = 0.58,
  verbose = TRUE,
  only.pos = TRUE
) %>%
  arrange(desc(avg_log2FC)) %>%
  filter(p_val_adj < 0.01) %>%
  split(f = .[["cluster"]])
# prefix the per-cluster tables with "SN_MG_"
seurat.results[["Markers_sn_MG"]] <- setNames(
  seurat.results[["Markers_sn_MG"]],
  paste0("SN_MG_", names(seurat.results[["Markers_sn_MG"]]))
)

# write one sheet per population
write.xlsx(
  seurat.results[["Markers_sn_MG"]],
  file = file.path(ws, "Markers_sn_MG.xlsx"),
  overwrite = TRUE
)

## Dotplot ####
# Selected homeostatic and reactive microglia genes, shown per cell_type_2 group
features_test <- c(
  "Hexb", "Cx3cr1", "P2ry12", "Tmem119", # homeo MG
  "Apoe", "Ctsb", "Abca1", "Hif1a", "Spp1", "Sash1", "H2-D1", "Lyz2", "Trem2", "Igf1", "Lgals3", "Gpnmb" ## reactive mg
)

p <- DotPlot(
  mg.seurat,
  assay = "SCT",
  features = features_test,
  group.by = "cell_type_2",
  cols = c("#FFF7FB", "#A50F15")
) +
  theme_mk +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)) +
  xlab(NULL) +
  ylab(NULL)
print(p)

  
## Volcano plot ####
library(EnhancedVolcano)
# `Markers_sn_MG` holds positive markers only, so it cannot fill both arms of
# a volcano plot. Unless a two-sided table named `mg.seurat.markers.TAM`
# already exists in the session, compute it here (Reactive vs Homeostatic,
# no log2FC pre-filter) so that the chunk runs on its own.
if (!exists("mg.seurat.markers.TAM")) {
  mg.seurat.markers.TAM <- FindMarkers(
    mg.seurat,
    ident.1 = "Reactive",
    ident.2 = "Homeostatic",
    group.by = "cell_type_2",
    assay = "SCT",
    logfc.threshold = 0,
    only.pos = FALSE,
    verbose = TRUE
  )
}
marker.df <- mg.seurat.markers.TAM

# Per-gene point colour: blue = down, red = up (|log2FC| > 0.58 and
# adjusted P < 0.01), grey otherwise
keyvals.colour <- ifelse(
  marker.df$avg_log2FC < -0.58 & marker.df$p_val_adj < 0.01, "#3288BD",
  ifelse(marker.df$avg_log2FC > 0.58 & marker.df$p_val_adj < 0.01, "#D53E4F",
         "#E7E7E7"))
# the names of the colour vector give the legend label of each colour
names(keyvals.colour)[keyvals.colour == "#3288BD"] <- "downregulated"
names(keyvals.colour)[keyvals.colour == "#D53E4F"] <- "upregulated"
names(keyvals.colour)[keyvals.colour == "#E7E7E7"] <- "insignificant"

p <- EnhancedVolcano(toptable = marker.df,
                     lab = rownames(marker.df),
                     x = "avg_log2FC",
                     y = "p_val_adj",
                     axisLabSize = 20,
                     titleLabSize = 14,
                     title = "",
                     pCutoff = 0.05,
                     colAlpha = 0.7,
                     colCustom = keyvals.colour,
                     subtitleLabSize = 9,
                     FCcutoff = 0.4,
                     labSize = 10,
                     pointSize = 3.0,
                     drawConnectors = TRUE,
                     max.overlaps = 10,
                     maxoverlapsConnectors = NULL,
                     arrowheads = FALSE,
                     boxedLabels = TRUE,
                     selectLab = c("P2ry12", "Cx3cr1", "Lyz2", "Gpnmb", "Apoe", "Trem2", "Spp1", "Igf1", "Ctsb", "Lpl", "Sash1", "H2-D1"), ## select genes to highlight
                     labFace = "bold",
                     cutoffLineType = "blank") +
  theme_light() + theme(axis.text = element_text(size = 30), axis.title = element_text(size = 30)) +
  NoLegend()
# an assignment alone does not draw anything; show the plot like the dotplot
print(p)
```

Gene ontology analysis
```{r GO analysis}
levels(mg.seurat$cell_type_2)
# Reload the marker tables from the working space when they are not in memory
if (!("Markers_sn_MG" %in% names(seurat.results))) {
  seurat.results[["Markers_sn_MG"]] <- read_all_sheets(file = file.path(ws, "Markers_sn_MG.xlsx"))
}

# Gene Ontology
library(clusterProfiler)
library("org.Mm.eg.db", character.only = TRUE)
organism <- org.Mm.eg.db
background <- rownames(mg.seurat[["SCT"]]) # every gene of the SCT assay is the universe
# Take the symbols from the `gene` column rather than from the row names:
# a table re-read from the xlsx file has no gene row names, so
# rownames_to_column() would return row indices instead of symbols.
marker_genes <-
  seurat.results[["Markers_sn_MG"]][["SN_MG_Reactive"]] %>%
  filter(p_val_adj < 0.01 & !str_detect(gene, "^Rp") & !str_detect(gene, "^mt")) %>%
  pull(gene)


# GO over-representation analysis of the reactive microglia markers, followed
# by clusterProfiler::simplify()
seurat.results[["ORA_mg_ReactiveMG"]] <- clusterProfiler::simplify(
  clusterProfiler::enrichGO(
    gene = marker_genes,
    universe = background, # 'potential DEGs'
    keyType = "SYMBOL", # or ENSEMBL
    OrgDb = organism,
    ont = "ALL", # BP = biological process, MF = molecular function, CC = cellular component
    pAdjustMethod = "fdr",
    pvalueCutoff = 0.01,
    qvalueCutoff = 0.2,
    readable = FALSE,
    minGSSize = 5
  )
)

# Score the marker set as a module and inspect it on the UMAP and per population
mg.seurat <- AddModuleScore(
  mg.seurat,
  features = list(marker_genes),
  name = "ReactiveMG_module",
  assay = "SCT"
)
FeaturePlot(mg.seurat, features = "ReactiveMG_module1", reduction = "umap")
enh_vlnplot(
  mg.seurat,
  feature = "ReactiveMG_module1",
  grouping = "cell_type_2",
  colors = col.list$mg_cols,
  compare_means = FALSE
)

# save the ORA result
write.xlsx(
  seurat.results[["ORA_mg_ReactiveMG"]],
  file = file.path(ws, "ORA_sn_MG_Reactive_vs_Homeostatic.xlsx"),
  overwrite = TRUE
)

## selecting terms for the barplot
terms <- c(
           ## BP
           "GO:1905952", # regulation of lipid localization
           "GO:0030335", # positive regulation of cell migration
           "GO:0048771", # tissue remodeling
           "GO:1903530", # regulation of secretion by cell
           "GO:0050670", # regulation of lymphocyte proliferation

           ## CC
           "GO:0030312", # external encapsulating structure
           "GO:0030055", # cell-substrate junction

           ## MF
           "GO:0005178", # integrin binding
           "GO:0071813", # lipoprotein particle binding
           "GO:0050839" # cell adhesion molecule binding
           )

# Horizontal bar plot of -log10(adjusted P) for the selected GO terms, saved as
# a tiff in the working space.
{
  population <- "Reactive"
  # Use the ORA result computed in this chunk; the previously referenced
  # `ORA.list` object is not created anywhere in this section.
  subset.ORA <- seurat.results[["ORA_mg_ReactiveMG"]]@result %>% filter(ID %in% terms)
  ORA.data <- data.frame(
    'Description' = subset.ORA[["Description"]] %>% stringr::str_to_title(),
    'Pval' = subset.ORA[["p.adjust"]] %>% log10 %>% '*'(-1))
  # reverse the factor order so the first term ends up on top after coord_flip()
  ORA.data$Description %<>% factor(levels = rev(unique(.)))
  rm(subset.ORA)

  (ggplot(data = ORA.data, aes(x = Description, y = Pval)) +
    geom_bar(stat = "identity", width = 0.6, size = 0.2, colour = "black", alpha = 0.75, fill = col.list[["mg_cols"]][[population]]) +
    ylim(0, max(ORA.data$Pval) + 1) +
    geom_hline(yintercept = 1.3, size = 0.2) + # 1.3 ~ -log10(0.05)
    coord_flip() +
    theme(text = element_text(family = "OpenSans", size = 40),
          axis.text.x = element_text(vjust = 1, size = 40),
          axis.text.y = element_text(vjust = 0.5, size = 40, color = 'black'),
          axis.ticks = element_blank(),
          panel.background = element_blank(), #transparent panel bg
          plot.background = element_blank(), #transparent plot bg
          panel.grid.major.x = element_blank(),
          panel.grid.major.y = element_line(colour = "#A1A19C", size = 0.2)
          ) +
          xlab("") + ylab(expression(paste("-log(Padj)")))) %>% ggsave(filename = paste0(ws, "/Barplot_sn_mg_GO_", population, ".tiff"), device = "tiff", dpi = 300, width = 7, height = 5)
}
```

Comparing the reactive population with the published datasets.
```{r metanalysis}
# Published marker lists: keep the sheets whose name matches "_MG_"
# (microglia-related populations) and reduce each sheet to its first column
# as a character vector
seurat.results[["Metadata_reactive_mg"]] <-
  read_all_sheets(
    file = "data/SupplementaryTable4_Metanalysis_GlialReactivityMarkers.xlsx"
  ) %>%
  .[grep(pattern = "_MG_", x = names(.), ignore.case = TRUE)] %>%
  lapply(function(sheet) as.character(sheet[[1]]))

## our markers: reload from the working space when not in memory
if (!("Markers_sn_MG" %in% names(seurat.results))) {
  seurat.results[["Markers_sn_MG"]] <- read_all_sheets(file = file.path(ws, "Markers_sn_MG.xlsx"))
}


# Intersections of our reactive-microglia markers with each published list
meta.intersects <- Map(
  function(meta_name, meta_values) {
    our_markers <- pull(seurat.results[["Markers_sn_MG"]][["SN_MG_Reactive"]], gene)
    shared <- intersect(our_markers, meta_values)
    cat(crayon::bgBlack(paste0(length(shared), " genes intersect with ", meta_name, ".", "\n")))
    shared
  },
  names(seurat.results[["Metadata_reactive_mg"]]),
  seurat.results[["Metadata_reactive_mg"]]
)

# Hypergeometric test of each intersection (urn model: published genes are the
# red marbles, all other SCT genes the green ones, our markers are the draw)
meta.signif <- Map(
  function(meta_name, meta_values) {
    n_background <- nrow(mg.seurat[["SCT"]])
    n_published <- length(seurat.results[["Metadata_reactive_mg"]][[meta_name]])
    phyper(
      q = length(meta_values) - 1, ## overlap - 1, because lower.tail = FALSE gives P(X > q)
      m = n_published, ## number of red marbles in urn
      n = n_background - n_published, ## number of green marbles in urn
      k = nrow(seurat.results[["Markers_sn_MG"]][["SN_MG_Reactive"]]), ## number of drawn marbles
      lower.tail = FALSE
    )
  },
  names(meta.intersects),
  meta.intersects
)
meta.signif <- p.adjust(meta.signif, method = "fdr")
meta.signif

# Score every published microglia signature on mg.seurat
mg.seurat <- AddModuleScore(
  mg.seurat,
  features = seurat.results[["Metadata_reactive_mg"]],
  name = names(seurat.results[["Metadata_reactive_mg"]])
)

# the newest metadata columns hold the module scores
tail(colnames(mg.seurat@meta.data))

# ARM signature score per microglia population
enh_vlnplot(
  mg.seurat,
  feature = "SF2019_MG_ARM_5XFAD5",
  grouping = "cell_type_2",
  colors = col.list$mg_cols,
  compare_means = FALSE,
  ref.group = "Homeostatic",
  stat_test = "t.test"
) +
  theme_mk +
  remove_grid +
  NoLegend() +
  xlab("MCAO-induced Microglia Populations") +
  ylab("Module Score") +
  ylim(c(-0.18, 0.7)) +
  labs(title = "ARM Signature") +
  # significance label is hard-coded text placed over the second group
  annotate(geom = "text", x = 2, y = 0.65, label = expression(italic("***P < 0.001")), size = 2.45)
```

Visualizing microglial reactive profiles in spatial data
```{r spatial microglia metanalysis}
# Score every published microglia signature on the spatial data (SCT assay).
# The score columns are referenced below with a numeric suffix appended to the
# signature name (e.g. "SF2019_MG_ARM_5XFAD5").
spatial.seurat %<>%
  AddModuleScore(
    features = seurat.results[["Metadata_reactive_mg"]],
    name = names(seurat.results[["Metadata_reactive_mg"]]),
    assay = "SCT"
  )

# Score our own reactive microglia marker set on the spatial data
spatial.seurat %<>%
  AddModuleScore(
    features = list(seurat.results[["Markers_sn_MG"]][["SN_MG_Reactive"]] %>% pull(gene)),
    name = "Zucha_SN_MG_Reactive",
    assay = "SCT"
  )

# show the microglial modules spatially
spatial.seurat %>% SpatialPlot(
  features = c("SF2019_MG_ARM_5XFAD5", "SF2019_MG_IRM_5XFAD6", "Zucha_SN_MG_Reactive1"),
  images = c("Ctrl", "1DPI", "3DPI", "7DPI"),
  image.alpha = 0.1,
  stroke = 0,
  crop = TRUE,
  max.cutoff = "q99"
) &
  scale_fill_gradientn(colours = c("#4D4D4D", "#FFFFFF", "#A50F15")) &
  # `range` must be named: an unnamed vector is taken as the scale name and
  # leaves the alpha range at its default
  scale_alpha_continuous(range = c(0.8, 1)) &
  theme_mk &
  remove_grid &
  NoLegend() &
  NoAxes()


# Pearson correlation between the published ARM score and our reactive score
cor.test(spatial.seurat@meta.data[["SF2019_MG_ARM_5XFAD5"]],
         spatial.seurat@meta.data[["Zucha_SN_MG_Reactive1"]],
         method = "pearson")

# quick base-graphics look at the same relationship
plot(x = spatial.seurat@meta.data[["SF2019_MG_ARM_5XFAD5"]],
     y = spatial.seurat@meta.data[["Zucha_SN_MG_Reactive1"]])

# Scatter plot with a linear fit.
# NOTE(review): the coefficient and P value in the annotations are hard-coded
# text, and the label uses rho although the test above is Pearson -- update
# them by hand whenever the data change.
spatial.seurat@meta.data %>%
  ggplot(aes(x = Zucha_SN_MG_Reactive1, y = SF2019_MG_ARM_5XFAD5)) +
  geom_point(aes(color = Zucha_SN_MG_Reactive1), size = 0.5) +
  scale_color_gradientn(colors = c("#FC9272", "#A50F15")) +
  geom_smooth(method = "lm", se = FALSE, color = "black") +
  annotate(geom = "text", x = 0.7, y = 0.3, label = expression(italic(rho == 0.864)), hjust = 1, size = 2.45) +
  annotate(geom = "text", x = 0.7, y = 0.2, label = expression(italic("***P < 0.001")), hjust = 1, size = 2.45) +
  xlab("Reactive Microglia signature") +
  ylab("ARM signature") +
  theme_mk +
  theme(legend.position = "none")
```

MG markers expression in the spatial plot
```{r Spatial plot}
{## spatial plot of microglial markers
  # One tiff per marker gene is written to the working space.
  features <- c("Csf1r", "Cd68", 'Hexb', 'Aif1', 'Lyz2', 'Spp1', 'Cst7', 'Cenpa', 'B2m', 'Ctsb', 'Apoe', 'Trem2')
  # lapply() is used for its side effect only (saving files); invisible()
  # keeps the list of return values out of the chunk output.
  invisible(lapply(features, function(feature){
    p <- (spatial.seurat %>%
      SpatialPlot(features = feature, crop = FALSE, pt.size.factor = 1.5, image.alpha = 0.2, alpha = c(0.8, 1), stroke = 0) &
      NoLegend() &
      theme(text = element_text(family = 'OpenSans')) & ggplot2::scale_fill_gradientn(colours = brewer.pal(n = 11, name = "RdBu") %>% rev)
    )
    p %>% ggsave(filename = paste0(ws, "/SpatialPlot_mg_marker_", feature, ".tiff"), device = 'tiff', dpi = 300, width = 20, height = 5, units = 'cm')
  }))
}
```

save processed microglia seurat
```{r save microglial seurat}
# Persist the processed microglia object
saveRDS(object = mg.seurat, file = file.path("data", "seurat_MG_1DP_11.rds"))
```


=== Oligodendrocytes ===
## single-nucleus OLs
 
```{r ol results and plot lists}
# Containers for the oligodendrocyte section: intermediate results and plots
oligo.results <- list()
oligo.plots <- list()
```

```{r subset and processing sn OLs}
# Subset the OL-lineage cells ("OLs" and "OPCs") of the single-nucleus object
Idents(sn.seurat) <- "Celltypes"
oligo.seurat <- subset(sn.seurat, idents = c("OLs", "OPCs"))

# Normalize, find variable features and scale on the RNA assay
DefaultAssay(oligo.seurat) <- "RNA"
oligo.seurat <- oligo.seurat %>%
  NormalizeData(normalization.method = "LogNormalize", verbose = TRUE) %>%
  FindVariableFeatures(selection.method = "vst", nfeatures = 3000) %>%
  ScaleData()

# Total counts per gene: distribution and the 50 most expressed genes overall
summary(rowSums(oligo.seurat[["RNA"]]@counts))
head(sort(rowSums(oligo.seurat[["RNA"]]@counts), decreasing = TRUE), 50)

# Genes with a total of at least five counts across all cells
oligo.results[["GenesAboveFive"]] <- local({
  total_counts <- rowSums(oligo.seurat[["RNA"]]@counts)
  total_counts[total_counts >= 5]
})
# Variable-feature statistics of the RNA assay
oligo.results[["rna_varfeat"]] <- HVFInfo(oligo.seurat, assay = "RNA")
top_n(arrange(oligo.results[["rna_varfeat"]], desc(variance.standardized)), 20)
# highly variable genes: standardized variance of at least 1.5
selected.var.features <- rownames(
  dplyr::filter(oligo.results[["rna_varfeat"]], variance.standardized >= 1.5)
)
# fraction of the selected variable genes that pass the >= 5 total counts filter
sum(selected.var.features %in% names(oligo.results[["GenesAboveFive"]])) / length(selected.var.features)

# total-count distribution of the selected variable genes
oligo.results[["GenesAboveFive"]] %>% .[names(.) %in% selected.var.features] %>% hist(breaks = 1000)

# PCA on the selected variable features, followed by diagnostic plots
{
  oligo.seurat %<>% RunPCA(assay = "RNA", features = selected.var.features, npcs = 50, verbose = TRUE)
  # Within a braced block only the value of the last expression is
  # auto-printed, so the intermediate plots are printed explicitly.
  print(oligo.seurat %>% DimPlot(reduction = "pca", dims = c(1, 2), cols = col.list$Celltypes))
  print(oligo.seurat %>% ElbowPlot(ndims = 50) | oligo.seurat %>% PC_var_explained())
  oligo.seurat %>% DimHeatmap(cells = 200, reduction = "pca", balanced = TRUE, dims = 1:15)
}

# the 3D PCA: interactive scatter of three principal components, coloured by
# the detailed cell type annotation
{
  dims <- c(1, 2, 3) ## PCs to plot
  df <- data.frame(
      "ElementName" = oligo.seurat@reductions$pca@cell.embeddings %>% rownames(),
      "X" = oligo.seurat@reductions$pca@cell.embeddings[, dims[1]],
      "Y" = oligo.seurat@reductions$pca@cell.embeddings[, dims[2]],
      "Z" = oligo.seurat@reductions$pca@cell.embeddings[, dims[3]],
      # Pass the annotation as a plain vector: a one-column data.frame keeps
      # its own column name inside data.frame(), which would leave
      # df[["Annotation"]] empty and the points uncoloured.
      "Annotation" = oligo.seurat@meta.data[["CelltypesDetailed"]]
  )
  # (optional) downsample to increase plotting speed; never ask for more rows
  # than are available, which would make sample_n() fail
  df <- sample_n(df, size = min(1000, nrow(df)))
  df %>% plot_ly(x = .[["X"]], y = .[["Y"]], z = .[["Z"]], type="scatter3d", mode = "markers", color = .[["Annotation"]], colors = col.list$CelltypesDetailed)
}

## UMAP on the first ten principal components
pc_dims <- 1:10
oligo.seurat <- RunUMAP(oligo.seurat, reduction = "pca", dims = pc_dims, verbose = TRUE)
DimPlot(
  oligo.seurat,
  reduction = "umap",
  group.by = "CelltypesDetailed",
  label = TRUE,
  cols = col.list$CelltypesDetailed
)

## Clustering at several resolutions on the same principal components
oligo.seurat <- FindNeighbors(oligo.seurat, reduction = "pca", dims = pc_dims, verbose = TRUE)
for (res in c(0.6, 0.8, 1, 1.2, 1.4)) {
  oligo.seurat <- FindClusters(oligo.seurat, resolution = res, algorithm = 1, verbose = TRUE)
}

# UMAP coloured by the resolution 1.4 clusters
oligo.plots[["UMAP_RNA_res1.4"]] <- DimPlot(
  oligo.seurat,
  reduction = "umap",
  pt.size = 1,
  group.by = "RNA_snn_res.1.4",
  cols = col.list$big_col_palette,
  label = TRUE
)

# quick look at two genes of interest
FeaturePlot(oligo.seurat, features = "Serpina3n", pt.size = 1.5)

dittoPlot(
  oligo.seurat,
  var = "Tnfrsf12a",
  group.by = "RNA_snn_res.1.4",
  plots = c("vlnplot", "boxplot", "jitter")
)
```
 
```{r markers}
DefaultAssay(oligo.seurat) <- "RNA"
Idents(oligo.seurat) <- "RNA_snn_res.1.4"

# Positive markers of every resolution 1.4 cluster, sorted and filtered on
# adjusted P value, one table per cluster
oligo.results[["markers_RNAres1.4"]] <- oligo.seurat %>%
  FindAllMarkers(logfc.threshold = 1, assay = "RNA", verbose = TRUE, only.pos = TRUE) %>%
  arrange(p_val_adj) %>%
  filter(p_val_adj < 0.01) %>%
  split(f = .[["cluster"]])

```

```{r Annotating sn OLs}
# Marker genes used to recognise the OL populations
canonical.features <- c(
  "Plp1", "Mobp", ## OL-lineage cells
  "Nckap5", "Tcf7l2", ## NFOL
  "Sepp1", "S100b", ## MOL2
  "Opalin", "Ptgds", ## MOL5/6
  "Ifi27", "H2-D1", ## IFN-responsive
  "Serpina3n", "Klk6", ## MOL DA1
  "Tnfrsf12a", "Cdkn1a" ## MOL DA2
)

oligo.plots[["FeaturePlots_sn_CanonicalMarkers"]] <- FeaturePlot(
  oligo.seurat,
  features = canonical.features,
  max.cutoff = "q99",
  repel = TRUE,
  label.size = 3,
  slot = "data",
  reduction = "umap",
  label = TRUE,
  raster = FALSE,
  pt.size = 0.2,
  ncol = 5,
  cols = c("white", "#AE0900")
) & NoAxes() & NoLegend()


# Translate the resolution 1.4 clusters into OL cell types; clusters not listed
# here become NA
oligo.seurat@meta.data <- mutate(
  oligo.seurat@meta.data,
  "OLCelltypes" = factor(
    recode(
      as.character(RNA_snn_res.1.4),
      "0" = "MOL5/6",
      "1" = "MOL5/6",
      "2" = "MOL2",
      "3" = "MOL2",
      "4" = "OPCs",
      "5" = "MOL5/6",
      "6" = "MOL5/6",
      "7" = "NFOLs",
      "8" = "MOL5/6",
      "9" = "OPCs",
      "10" = "MOL_DA1",
      "11" = "MOL5/6",
      "12" = "OPCs",
      .default = NA_character_
    ),
    levels = c("OPCs", "NFOLs", "MOL2", "MOL5/6", "MOL_DA1")
  )
)

Idents(oligo.seurat) <- "OLCelltypes"
```

```{r Plotting sn OLs}
# UMAP by cluster (kept in the results list, as before)
oligo.results[["UMAP_snOLs_RNAres1.4"]] <- DimPlot(
  oligo.seurat,
  group.by = "RNA_snn_res.1.4",
  pt.size = 1
)

# UMAP by annotated OL cell type
oligo.plots[["UMAP_snOLs_OLCelltypes"]] <- DimPlot(
  oligo.seurat,
  group.by = "OLCelltypes",
  pt.size = 1.5,
  cols = col.list$ol_cols,
  label = TRUE,
  label.box = 5,
  label.size = 5
) & NoAxes() & theme(legend.position = "none")

# Condition composition of every OL cell type
oligo.plots[["Barplot_snOLs_Conditions_byOLCelltypes"]] <- dittoBarPlot(
  oligo.seurat,
  var = "Condition",
  group.by = "OLCelltypes",
  color.panel = col.list[["Condition"]],
  retain.factor.levels = TRUE
)
```

```{r save sn OLs}
# Persist the annotated single-nucleus OL object for the integration step
saveRDS(object = oligo.seurat, file = "data/Seurat_snOLs_ReadyForIntegration.Rds")
```

## single-cell OLs

Subset and processing sc OLs
```{r subset and processing sc OLs}
# Keep the single-cell part ("scRNA_MCAO_all") of the integrated OL object
oligo.integrated <- readRDS("data/seurat_OLIGO_integrated_UMAP_named.rds")
oligo.integrated <- subset(oligo.integrated, subset = orig.ident == "scRNA_MCAO_all")

# Rebuild a fresh object from the RNA counts and a selection of metadata columns
ol.assay <- GetAssayData(oligo.integrated, assay = "RNA", layer = "counts")
ol.metadata <- oligo.integrated@meta.data[c(
  "orig.ident", "Sample", "Condition", "Doublet", "cell_type_1",
  "Phase", "percent.mt", "percent.rib", "scsnOLAnno"
)]

oligo.seurat.sc <- CreateSeuratObject(
  counts = ol.assay,
  meta.data = ol.metadata,
  project = "Zucha2023_scOLs"
)
oligo.seurat.sc <- subset(oligo.seurat.sc, subset = Doublet == "Singlet")
Idents(oligo.seurat.sc) <- "scsnOLAnno"

# Normalize, find variable features and scale on the RNA assay
DefaultAssay(oligo.seurat.sc) <- "RNA"
oligo.seurat.sc <- oligo.seurat.sc %>%
  NormalizeData(normalization.method = "LogNormalize", verbose = TRUE) %>%
  FindVariableFeatures(selection.method = "vst", nfeatures = 3000) %>%
  ScaleData()

# Total counts per gene: distribution and the 50 most expressed genes overall
summary(rowSums(oligo.seurat.sc[["RNA"]]@counts))
head(sort(rowSums(oligo.seurat.sc[["RNA"]]@counts), decreasing = TRUE), 50)

# Genes with a total of at least five counts across all cells
# (overwrites the single-nucleus entries of the same name in oligo.results)
oligo.results[["GenesAboveFive"]] <- local({
  total_counts <- rowSums(oligo.seurat.sc[["RNA"]]@counts)
  total_counts[total_counts >= 5]
})
# Variable-feature statistics of the RNA assay
oligo.results[["rna_varfeat"]] <- HVFInfo(oligo.seurat.sc, assay = "RNA")
top_n(arrange(oligo.results[["rna_varfeat"]], desc(variance.standardized)), 20)
# highly variable genes: standardized variance of at least 1.5
selected.var.features <- rownames(
  dplyr::filter(oligo.results[["rna_varfeat"]], variance.standardized >= 1.5)
)
# fraction of the selected variable genes that pass the >= 5 total counts filter
sum(selected.var.features %in% names(oligo.results[["GenesAboveFive"]])) / length(selected.var.features)

# total-count distribution of the selected variable genes
oligo.results[["GenesAboveFive"]] %>% .[names(.) %in% selected.var.features] %>% hist(breaks = 1000)

# PCA on the selected variable features, followed by diagnostic plots
{
  oligo.seurat.sc %<>% RunPCA(assay = "RNA", features = selected.var.features, npcs = 50, verbose = TRUE)
  # Within a braced block only the value of the last expression is
  # auto-printed, so the intermediate plots are printed explicitly.
  print(oligo.seurat.sc %>% DimPlot(reduction = "pca", dims = c(1, 2), cols = col.list$big_col_palette))
  print(oligo.seurat.sc %>% ElbowPlot(ndims = 50) | oligo.seurat.sc %>% PC_var_explained())
  oligo.seurat.sc %>% DimHeatmap(cells = 200, reduction = "pca", balanced = TRUE, dims = 1:15)
}

# the 3D PCA: interactive scatter of three principal components, coloured by
# the scsnOLAnno annotation
{
  dims <- c(1, 2, 3) ## PCs to plot
  df <- data.frame(
      "ElementName" = oligo.seurat.sc@reductions$pca@cell.embeddings %>% rownames(),
      "X" = oligo.seurat.sc@reductions$pca@cell.embeddings[, dims[1]],
      "Y" = oligo.seurat.sc@reductions$pca@cell.embeddings[, dims[2]],
      "Z" = oligo.seurat.sc@reductions$pca@cell.embeddings[, dims[3]],
      # Pass the annotation as a plain vector: a one-column data.frame keeps
      # its own column name inside data.frame(), which would leave
      # df[["Annotation"]] empty and the points uncoloured.
      "Annotation" = oligo.seurat.sc@meta.data[["scsnOLAnno"]]
  )
  # (optional) downsample to increase plotting speed; never ask for more rows
  # than are available, which would make sample_n() fail
  df <- sample_n(df, size = min(1000, nrow(df)))
  df %>% plot_ly(x = .[["X"]], y = .[["Y"]], z = .[["Z"]], type="scatter3d", mode = "markers", color = .[["Annotation"]], colors = col.list$ol_cols)
}


## UMAP on the first ten principal components
pc_dims <- 1:10
oligo.seurat.sc <- RunUMAP(oligo.seurat.sc, reduction = "pca", dims = pc_dims, verbose = TRUE)
DimPlot(
  oligo.seurat.sc,
  reduction = "umap",
  group.by = "scsnOLAnno",
  label = TRUE,
  cols = col.list$ol_cols
)

## Clustering at several resolutions on the same principal components
oligo.seurat.sc <- FindNeighbors(oligo.seurat.sc, reduction = "pca", dims = pc_dims, verbose = TRUE)
for (res in c(0.6, 0.8, 1, 1.2, 1.4)) {
  oligo.seurat.sc <- FindClusters(oligo.seurat.sc, resolution = res, algorithm = 1, verbose = TRUE)
}

Idents(oligo.seurat.sc) <- "RNA_snn_res.1.4"

# UMAP coloured by the resolution 1.4 clusters
oligo.plots[["UMAP_sc_RNA_res1.4"]] <- DimPlot(
  oligo.seurat.sc,
  reduction = "umap",
  pt.size = 1,
  group.by = "RNA_snn_res.1.4",
  cols = col.list$big_col_palette,
  label = TRUE
)

```

```{r Annotating sc OLs}
# Marker genes used to recognise the OL populations
canonical.features <- c(
  "Plp1", "Mobp", ## OL-lineage cells
  "Nckap5", "Tcf7l2", ## NFOL
  "Sepp1", "S100b", ## MOL2
  "Opalin", "Ptgds", ## MOL5/6
  "Ifi27", "H2-D1", ## IFN-responsive
  "Serpina3n", "Klk6", ## MOL DA1
  "Tnfrsf12a", "Cdkn1a" ## MOL DA2
)

oligo.plots[["FeaturePlots_sc_CanonicalMarkers"]] <- FeaturePlot(
  oligo.seurat.sc,
  features = canonical.features,
  max.cutoff = "q99",
  repel = TRUE,
  label.size = 3,
  slot = "data",
  reduction = "umap",
  label = TRUE,
  raster = FALSE,
  pt.size = 0.2,
  ncol = 5,
  cols = c("white", "#AE0900")
) & NoAxes() & NoLegend()


# Translate the resolution 1.4 clusters into OL cell types; clusters not listed
# here become NA
oligo.seurat.sc@meta.data <- mutate(
  oligo.seurat.sc@meta.data,
  "OLCelltypes" = factor(
    recode(
      as.character(RNA_snn_res.1.4),
      "0" = "MOL5/6",
      "1" = "MOL5/6",
      "2" = "MOL5/6",
      "3" = "MOL5/6",
      "4" = "MOL2",
      "5" = "MOL5/6",
      "6" = "MOL5/6",
      "7" = "MOL5/6",
      "8" = "MOL5/6",
      "9" = "MOL2",
      "10" = "MOL_DA1",
      "11" = "MOL5/6",
      "12" = "MOL5/6",
      "13" = "MOL_DA2",
      "14" = "MOL2",
      .default = NA_character_
    ),
    levels = c("MOL2", "MOL5/6", "MOL_DA1", "MOL_DA2")
  )
)


Idents(oligo.seurat.sc) <- "OLCelltypes"
```

```{r Plotting sc OLs}
# UMAP by cluster (kept in the results list, as before)
oligo.results[["UMAP_scOLs_RNAres1.4"]] <- DimPlot(
  oligo.seurat.sc,
  group.by = "RNA_snn_res.1.4",
  pt.size = 1
)

# UMAP by annotated OL cell type
oligo.plots[["UMAP_scOLs_OLCelltypes"]] <- DimPlot(
  oligo.seurat.sc,
  group.by = "OLCelltypes",
  pt.size = 1.5,
  cols = col.list$ol_cols,
  label = TRUE,
  label.box = 5,
  label.size = 5
) & NoAxes() & theme(legend.position = "none")

# Condition composition of every OL cell type
oligo.plots[["Barplot_scOLs_Conditions_byOLCelltypes"]] <- dittoBarPlot(
  oligo.seurat.sc,
  var = "Condition",
  group.by = "OLCelltypes",
  color.panel = col.list[["Condition"]],
  retain.factor.levels = TRUE
)
```

```{r save sc OLs}
# Persist the annotated single-cell OL object for the integration step
saveRDS(object = oligo.seurat.sc, file = "data/Seurat_scOLs_ReadyForIntegration.Rds")
```

## Integration of sn and sc Oligos
```{r load respective ol seurats v4}
# Single-nucleus OLs: load, show the subtype counts, then drop the OPCs.
oligo.seurat <- readRDS("data/Seurat_snOLs_ReadyForIntegration.Rds")
oligo.seurat$OLCelltypes %>% table
oligo.seurat <- subset(oligo.seurat, subset = OLCelltypes != "OPCs")

# Single-cell OLs: load and show the subtype counts.
oligo.seurat.sc <- readRDS("data/Seurat_scOLs_ReadyForIntegration.Rds")
oligo.seurat.sc$OLCelltypes %>% table

# From here on new assays are created as v5 assays.
options(Seurat.object.assay.version = "v5")

# Both datasets, keyed by technology.
seurat.list <- list(sn = oligo.seurat, sc = oligo.seurat.sc)
```

The Seurat objects were prepared with Seurat v4; for smoother integration and dataset longevity, we convert them to v5.
```{r transform seurats v4->v5}
## clean up and convert to v5
## For each dataset: keep only the RNA assay's counts, coerce the assay to
## Assay5, and rebuild a Seurat object around it with the original metadata.
seuratv5.list <- lapply(seurat.list, function(x) {
  DefaultAssay(x) <- "RNA"
  x %<>% DietSeurat(assays = "RNA", layers = "counts")
  assay5 <- as(x[["RNA"]], Class = "Assay5")
  CreateSeuratObject(assay5, meta.data = x@meta.data)
})

## merge into a single big seurat v5 object
names(seuratv5.list)
merged.seurat <- merge(seuratv5.list[["sn"]],
                       seuratv5.list[["sc"]])

## keep non-NA metadata and merge the count matrix
## drop = FALSE keeps meta.data a data frame even if one column survives
merged.seurat@meta.data %<>% .[, colSums(is.na(.)) == 0, drop = FALSE]
merged.seurat %<>% JoinLayers()
Idents(merged.seurat) <- "orig.ident"

## non-near-zero genes: total counts >= 5 AND detected in >= 5 cells.
## The filter is defined on raw counts, so read the joined `counts` layer;
## the objects were stripped to counts above, so `data` is not a reliable
## source at this point.
genes.to.keep <- local({
  joined.counts <- merged.seurat[["RNA"]]$counts
  enough.counts <- Matrix::rowSums(joined.counts) >= 5    ## genes with >= 5 counts
  enough.cells <- Matrix::rowSums(joined.counts > 0) >= 5 ## genes present in >= 5 cells
  rownames(joined.counts)[enough.counts & enough.cells]
})

## split based on the sequencing batches
merged.seurat[["RNA"]] <- split(merged.seurat[["RNA"]], f = merged.seurat$orig.ident)
```

```{r Process without integration}
# Normalisation, scaling (restricted to genes.to.keep) and variable features.
# NOTE(review): scaling runs before FindVariableFeatures, so variable features
# outside genes.to.keep would have no scaled values; confirm this is intended.
{
  gc(full = TRUE)
  merged.seurat <- merged.seurat %>% NormalizeData()
  merged.seurat <- merged.seurat %>% ScaleData(features = genes.to.keep)
  merged.seurat <- merged.seurat %>%
    FindVariableFeatures(nfeatures = 3000, verbose = TRUE)
}

# PCA with 50 components, then a PC plot coloured by batch
{
  merged.seurat <- merged.seurat %>% RunPCA(npcs = 50, verbose = TRUE)
  PCAPlot(
    merged.seurat,
    group.by = "orig.ident",
    cols = col.list$big_col_palette,
    raster = FALSE,
    shuffle = TRUE
  ) + NoLegend()
}

# PCA QC: variance-explained helper next to the elbow plot, then heatmaps of
# PCs 1-30 in two panels
PC_var_explained(Seurat = merged.seurat) | ElbowPlot(merged.seurat, ndims = 50)
DimHeatmap(merged.seurat, dims = 1:15, cells = 300, balanced = TRUE)
DimHeatmap(merged.seurat, dims = 16:30, cells = 300, balanced = TRUE)

# UMAP on the first 15 PCs, stored under its own reduction name
pca.dims <- seq_len(15)
{
  merged.seurat <- merged.seurat %>%
    RunUMAP(
      dims = pca.dims,
      reduction = "pca",
      verbose = TRUE,
      reduction.name = "umap_unintegrated"
    )
  ## strong batch effect is present
  DimPlot(
    merged.seurat,
    reduction = "umap_unintegrated",
    group.by = 'orig.ident',
    cols = col.list$big_col_palette,
    raster = FALSE,
    combine = TRUE,
    shuffle = TRUE
  )
}
```

A strong technical batch effect is present, so we integrate the two datasets.
```{r Integrate}
gc(full = TRUE)

# CCA integration of the split layers, starting from the "pca" reduction and
# writing the result to a new "cca" reduction.
merged.seurat <- merged.seurat %>%
  IntegrateLayers(
    method = CCAIntegration,
    orig.reduction = "pca",
    new.reduction = "cca",
    verbose = TRUE
  )

# Collapse the per-batch layers back into single layers.
merged.seurat <- merged.seurat %>% JoinLayers()
```

Clusters and UMAP in the integrated OL dataset
```{r integrated clusters and umap}
pca.dims <- seq_len(15)
## cca: neighbour graph, clustering (res 1.4) and UMAP on the integrated space
{
  merged.seurat <- merged.seurat %>%
    FindNeighbors(reduction = "cca", dims = pca.dims)
  merged.seurat <- merged.seurat %>%
    FindClusters(resolution = 1.4, cluster.name = "cca_clusters")
  merged.seurat <- merged.seurat %>%
    RunUMAP(reduction = "cca", dims = pca.dims, reduction.name = "umap.cca")
}


# Integrated UMAP coloured by the new clusters
oligo.plots[["UMAP_int_ccaclusters"]] <- DimPlot(
  merged.seurat,
  reduction = "umap.cca",
  pt.size = 1.5,
  group.by = "cca_clusters",
  cols = col.list$big_col_palette,
  shuffle = TRUE,
  label = TRUE,
  label.box = TRUE,
  label.size = 4
) & NoAxes()

# Integrated UMAP coloured by the per-dataset OL annotation
oligo.plots[["UMAP_int_OLCelltypes"]] <- DimPlot(
  merged.seurat,
  reduction = "umap.cca",
  pt.size = 1,
  group.by = "OLCelltypes",
  cols = col.list$ol_cols,
  label = TRUE,
  label.box = TRUE,
  label.size = 4,
  shuffle = TRUE
)
# Same embedding coloured by batch (printed, not stored)
DimPlot(merged.seurat, reduction = "umap.cca", pt.size = 1, group.by = "orig.ident")

# Violin plots of selected marker genes across the active identities
VlnPlot(
  merged.seurat,
  features = c("Irf7", "H2-D1", "Ifi27"),
  cols = col.list$big_col_palette
)
VlnPlot(
  merged.seurat,
  features = c("Serpina3n", "Tnfrsf12a"),
  cols = col.list$big_col_palette
)
```

CCA integration preserves the cell-type structure observed in the two individual datasets.
```{r Canonical Markers in the integrated dataset}
# Marker panel for the integrated dataset; the trailing comment on each row
# names the population the genes are used for.
canonical.features <- c(
  "Plp1", "Mobp",            # OL-lineage cells
  "Nckap5", "Tcf7l2",        # NFOL
  "Sepp1", "S100b",          # MOL2
  "Opalin", "Ptgds",         # MOL5/6
  "Irf7", "Ifi27", "H2-D1",  # IFN-responsive
  "Serpina3n", "Klk6",       # MOL DA1
  "Tnfrsf12a", "Cdkn1a"      # MOL DA2
)

# Feature plots of the panel on the CCA UMAP; stored in the plot list.
oligo.plots[["FeaturePlots_int_CanonicalMarkers_cca"]] <- FeaturePlot(
  merged.seurat,
  features = canonical.features,
  max.cutoff = 'q99',
  repel = TRUE,
  label.size = 3,
  slot = "data",
  reduction = "umap.cca",
  label = TRUE,
  raster = FALSE,
  pt.size = 0.2,
  ncol = 5,
  cols = c("white", "#AE0900")
) & NoAxes() & NoLegend()
```

Calculating markers in cca clusters
```{r OL Markers in cca clusters}
# Work on the RNA assay with the CCA clusters as the active identity.
DefaultAssay(merged.seurat) <- "RNA"
Idents(merged.seurat) <- "cca_clusters"


# Positive markers per cluster (logfc.threshold = 1), ordered by adjusted
# p-value, kept when below 0.01, and split into one table per cluster.
oligo.results[["FindAllMarkers_int_ccaclusters"]] <- merged.seurat %>%
  FindAllMarkers(
    logfc.threshold = 1,
    assay = 'RNA',
    verbose = TRUE,
    only.pos = TRUE
  ) %>%
  arrange(p_val_adj) %>%
  filter(p_val_adj < 0.01) %>%
  split(f = .$cluster)
```

Annotating the integrated oligo dataset
```{r annotating integrated OL dataset}
# Map each CCA cluster (0-15) to an integrated OL subtype. Clusters that are
# not listed in any arm fall through and become NA.
merged.seurat@meta.data <- merged.seurat@meta.data %>%
  mutate(
    "intOLSubtypes" = factor(
      case_when(
        as.character(cca_clusters) == "13" ~ "NFOLs",
        as.character(cca_clusters) %in% c("2", "7", "9") ~ "MOL2",
        as.character(cca_clusters) %in%
          c("0", "1", "3", "4", "5", "6", "8", "11", "15") ~ "MOL5/6",
        as.character(cca_clusters) == "14" ~ "MOL_IFN",
        as.character(cca_clusters) == "10" ~ "MOL_DA1",
        as.character(cca_clusters) == "12" ~ "MOL_DA2"
      ),
      levels = c("NFOLs", "MOL2", "MOL5/6", "MOL_IFN", "MOL_DA1", "MOL_DA2")
    )
  )

# CCA UMAP coloured by the integrated subtype annotation.
oligo.plots[["UMAP_intOLSubtypes"]] <- DimPlot(
  merged.seurat,
  reduction = "umap.cca",
  group.by = "intOLSubtypes",
  cols = col.list$intOLSubtypes,
  pt.size = 1.2,
  shuffle = TRUE
) & NoAxes()
```

save integrated annotated Zucha 2023 OLs
```{r save int anno seurat}
# Write the integrated, annotated OL object to disk.
merged.seurat %>%
  saveRDS(file = "data/Seurat_OLs_integrated_annotated.Rds")
```