satijalab · zskylarli · Feb 23, 2024 · Feb 23, 2024 · Feb 26, 2024 · Mar 12, 2024
diff --git a/mouse_pansci/Snakefile b/mouse_pansci/Snakefile
@@ -0,0 +1,82 @@
+############################## Config #########################################
+container: "docker://satijalab/azimuth-references:latest" 
+
+############################## All  ###########################################
+rule all:
+    input:
+        "reference/ref.Rds",
+        "reference/idx.annoy",
+        "reference/demo.rds"
+
+############################## Reference #######################################
+rule download:
+    params:
+        data_url = "https://zenodo.org/records/14042645/files/pansci_filtered.rds"
+    output:
+        "logs/download_data.log",
+        "data/obj.rds"
+    shell:
+        """
+        wget {params.data_url} -P data
+        mv data/pansci_filtered.rds data/obj.rds
+        echo "Mouse PanSci data downloaded on: $(date)" > logs/download_data.log
+        """
+
+rule build_reference:
+    input:
+        script = "scripts/build_reference.R",
+        obj = "data/obj.rds",
+        annotations = "data/pansci_genenames.csv",
+    output:
+        ref = "reference/ref.Rds",
+        idx = "reference/idx.annoy",
+        obj = "full_reference.Rds"
+    shell:
+        """
+        Rscript {input.script} {input.obj} {input.genenames} {output.ref} {output.idx} {output.obj}
+        """
+
+############################## Demo ############################################
+rule download_demo:
+    params:
+        data_url = "https://datasets.cellxgene.cziscience.com/4103ff27-c13c-4092-87a7-5cd71ce57d26.rds",
+    output:
+        "logs/download_demo_data.log",
+        "data/demo/adipose_emont_2022.rds"
+    shell:
+        """
+        wget {params.data_url} -P data/demo
+        mv data/demo/4103ff27-c13c-4092-87a7-5cd71ce57d26.rds data/demo/adipose_emont_2022.rds
+        echo "Mouse adipose demo data downloaded on: $(date)" > logs/download_demo_data.log
+        """
+
+rule setup_demo:
+    input:
+        data = "data/demo/adipose_emont_2022.rds"
+        script = "scripts/setup_demo.R",
+
+    output:
+        "reference/adipose_emont_2022_subset.rds"
+    shell:
+        """
+        Rscript {input.script} {input.data} {output} > logs/setup_demo.Rout 2>&1
+        """
+
+rule export_zarr:
+    input: 
+        ref = "reference/ref.Rds",
+        fullref = "full_reference.Rds",
+        script1 = "scripts/convert_to_h5ad.R",
+        script2 = "scripts/convert_to_zarr.py"
+    output:
+        h5Seurat = "vitessce/vitessce_ref.h5Seurat",
+        h5ad = "vitessce/vitessce_ref.h5ad",
+        zarr = directory("vitessce/vitessce_ref.zarr")
+    container:
+        "docker://satijalab/azimuth-references:vitessce"
+    shell:
+        """
+        mkdir -p vitessce
+        Rscript {input.script1} {input.ref} {input.fullref} {output.h5Seurat} > logs/export_zarr_anndata.Rout 2>&1
+        python3 {input.script2} {output.h5ad} {output.zarr}
+        """
diff --git a/mouse_pansci/reference/config.json b/mouse_pansci/reference/config.json
@@ -0,0 +1,13 @@
+{
+    "Azimuth.app.reference": "/reference-data/mouse_pansci/",
+    "Azimuth.app.demodataset": "/reference-data/mouse_pansci/demo.rds",
+    "Azimuth.app.max_cells": 100000,
+    "Azimuth.app.default_gene": "GLNY",
+    "Azimuth.app.default_metadata": "subclass",
+    "Azimuth.app.do_adt": "FALSE",
+    "Azimuth.app.welcomebox": "div(\n    h3(HTML(\"Please upload a dataset to map to the <b>mouse PanSci</b> reference\")),\n    \"Upload a counts matrix from an scRNA-seq dataset of mouse data in one\n        of the following formats: hdf5, rds, h5ad, h5seurat. For testing, we\n        also provide a demo dataset of 20000 mouse adipose cells (10x Genomics v3),\n        which is loaded automatically with the 'Load demo dataset' button.\",\n    width = 12\n  )",
+    "Azimuth.app.refdescriptor": "<div class='refdescriptor'><br>This reference was generated from the PanSci mouse organismal aging atlas originally described in <a href='https://www.biorxiv.org/content/10.1101/2024.03.01.583001v1'>Zhang et al., 2024</a>, this data was integrated across 3 genetic backgrounds. Annotations are provided at the level of <i>main cell type</i>, <i>sub cell type</i>, and <i>organ</i>. </div>",
+    "Azimuth.app.refuri": "https://seurat.nygenome.org/azimuth/references/v1.0.0/mouse_pansci",
+    "shiny.maxRequestSize": 1048576000,
+    "Azimuth.app.do_adt": "FALSE"
+  }
diff --git a/mouse_pansci/scripts/build_demo.R b/mouse_pansci/scripts/build_demo.R
@@ -0,0 +1,15 @@
+#!/usr/bin/env Rscript
+
+# Script to build demo
+
+library(Seurat)
+library(SeuratDisk)
+
+args = commandArgs(trailingOnly=TRUE)
+demo.path <- args[1] 
+demo.output <- args[2]
+
+demo <- readRDS(demo.path, gene.column = 1)
+demo <- subset(demo, cells = sample(Cells(demo), size = 20000))
+
+saveRDS(demo.obj, demo.output)
diff --git a/mouse_pansci/scripts/build_reference.R b/mouse_pansci/scripts/build_reference.R
@@ -0,0 +1,59 @@
+#!/usr/bin/env Rscript
+
+# Script to build mouse pansci reference 1.0.0
+
+# Rscript {input.script} {input.counts} {input.genenames} {input.dr} {output.ref} {output.idx} {output.obj}
+
+library(Seurat)
+library(Azimuth)
+
+args = commandArgs(trailingOnly=TRUE)
+obj_original <- readRDS(args[1])
+genenames <- read.csv(args[2], row.names = 1)
+azimuth.reference <- args[3]
+azimuth.index <- args[4]
+full.reference <- args[5]
+
+# Convert gene IDs to gene symbols based on annotations file 
+pansci.mtx <- LayerData(obj_original, assay="RNA", layer="counts")
+gene_names <- read.csv(genenames)
+rownames(pansci.mtx) <- gene_names$gene_name
+subset.pansci.mtx <- pansci.mtx[grep("^ENSMUSG", rownames(pansci.mtx), invert = TRUE), ]
+
+# Rebuild object from RNA assay and converted gene names
+pansci.assay <- CreateAssayObject(counts = subset.pansci.mtx)
+obj <- CreateSeuratObject(counts=pansci.assay, assay="RNA")
+obj <- AddMetaData(obj, obj_original[[]])
+
+# Normalize wtih SCT
+obj <- SCTransform(obj, variable.features.n = 5000)
+
+# Run PCA on the SCT residuals 
+obj <- RunPCA(obj, assay = "SCT", reduction.name = "pca", npcs = 100)
+
+# Build UMAP model 
+obj <- RunUMAP(obj, dims = 1:100, return.model = T)
+
+saveRDS(obj, full.reference)
+
+# Build Azimuth Reference
+azimuth.obj <- AzimuthReference(obj,
+                        refUMAP='umap',
+                        refDR='pca',
+                        refAssay = 'SCT',
+                        dims = 1:100,
+                        plotref='umap',
+                        metadata = c("Main_cell_type","Sub_cell_type", "Sub_cell_type_organ"),
+                        reference.version = "1.0.0"
+)
+
+ValidateAzimuthReference(object = azimuth.obj)
+
+SaveAnnoyIndex(
+  object = azimuth.obj[["refdr.annoy.neighbors"]],
+  file = azimuth.index
+)
+saveRDS(
+  object = azimuth.obj,
+  file = azimuth.reference
+)
diff --git a/mouse_pansci/scripts/convert_to_h5ad.R b/mouse_pansci/scripts/convert_to_h5ad.R
@@ -0,0 +1,35 @@
+#!/usr/bin/env Rscript
+library(Seurat)
+library(SeuratDisk)
+args <- commandArgs(trailingOnly = TRUE)
+
+ref <- readRDS(file = args[1])
+fullref <- readRDS(file = args[2])
+fullref <- subset(x = fullref, cells = Cells(x = ref))
+fullref[['umap']] <- ref[['refUMAP']]
+Key(object = fullref[['umap']]) <- "umap_"
+DefaultAssay(object = fullref[['umap']]) <- "RNA"
+
+DefaultAssay(object = fullref) <- "RNA"
+fullref <- NormalizeData(object = fullref)
+fullref <- DietSeurat(
+  object = fullref,
+  dimreducs = "umap",
+  assays = "RNA"
+)
+for (i in colnames(x = fullref[[]])) {
+  fullref[[i]] <- NULL
+}
+fullref <- AddMetaData(object = fullref, metadata = ref[[]])
+Misc(object = fullref[['umap']], slot = "model") <- NULL
+
+fullref <- RenameCells(object = fullref, new.names = paste0("cell", 1:ncol(x = fullref)))
+
+for (i in colnames(x = fullref[[]])) {
+  if (is.factor(x = fullref[[i, drop = TRUE]])) {
+    fullref[[i]] <- as.character(x = fullref[[i, drop = TRUE]])
+  }
+}
+
+SaveH5Seurat(object = fullref, file = args[3], overwrite = TRUE)
+Convert(args[3], dest = "h5ad", overwrite = TRUE)
diff --git a/mouse_pansci/scripts/convert_to_zarr.py b/mouse_pansci/scripts/convert_to_zarr.py
@@ -0,0 +1,13 @@
+#!/usr/bin/python
+import scanpy as sc
+import sys
+import zarr
+from scipy import sparse
+
+adata = sc.read_h5ad(sys.argv[1])
+del adata.raw
+adata.var.index = adata.var.index.astype('str')
+adata.obs.index = adata.obs.index.astype('str')
+adata.var_names = adata.var_names.astype(str)
+adata.X = adata.X.tocsc()
+adata.write_zarr(sys.argv[2], [adata.shape[0], 10])