From 1d818dbbba3a907f4d4c987692fd39e806a763dd Mon Sep 17 00:00:00 2001
From: Gesmira <gesmiramolla@gmail.com>
Date: Fri, 23 Feb 2024 16:50:31 -0500
Subject: [PATCH 1/7] init pansci

---
 mouse_pansci/processing.R | 49 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)
 create mode 100644 mouse_pansci/processing.R

diff --git a/mouse_pansci/processing.R b/mouse_pansci/processing.R
new file mode 100644
index 0000000..934b22a
--- /dev/null
+++ b/mouse_pansci/processing.R
@@ -0,0 +1,49 @@
+
+
+# Convert to seurat object
+# sceasy::convertFormat("/brahms/mollag/azimuth/pansci/20240222_PanSci_all_cells_adata_sampled_by_subtype.h5ad", from="anndata", 
+# to="seurat", outFile = "/brahms/mollag/azimuth/pansci/20240222_PanSci_all_cells_adata_sampled_by_subtype.rds")
+
+a <- readRDS("/brahms/mollag/azimuth/pansci/20240222_PanSci_all_cells_adata_sampled_by_subtype.rds")
+
+a <- NormalizeData(a)
+a <- FindVariableFeatures(a, nfeatures = 5000)
+a <- ScaleData(a)
+a <- RunPCA(a, npcs = 100)
+a <- RunUMAP(a, dims = 1:100)
+
+##### Post processing ####
+a <- readRDS("/brahms/mollag/azimuth/pansci/pansci_processed_umap_100_dims.rds")
+
+DimPlot(a, group.by = "Main_cell_type", label = T, repel = T, alpha = 0.1) + NoLegend()
+DimPlot(a, group.by = "Lineage", label = T, repel = T, alpha = 0.1) + NoLegend()
+DimPlot(a, group.by = "Organ_name", label = T, repel = T, alpha = 0.1) + NoLegend()
+saveRDS(obj, full.reference)
+
+
+
+# Build Azimuth Reference
+azimuth.obj <- AzimuthReference(
+  object = obj,
+  refUMAP = "umap",
+  plotref = "umap",
+  refDR = "pca",
+  refAssay = "RNA",
+  metadata = c("celltype.l1", "celltype.l2"),
+  dims = 1:100,
+  reference.version = "2.0.0"
+)
+
+########### SCTransform VERSION #########
+a <- readRDS("/brahms/mollag/azimuth/pansci/20240222_PanSci_all_cells_adata_sampled_by_subtype.rds")
+
+a <- SCTransform(a)
+a <- FindVariableFeatures(a, nfeatures = 5000)
+a <- ScaleData(a)
+a <- RunPCA(a, npcs = 100)
+a <- RunUMAP(a, dims = 1:100)
+
+
+
+# they actually include "sub_umap" coordinates in the metadata? not sure if they computed the umap themselves? 
+# Skylar looked into this and they dont look the best

From 970dd0282626d3fae5ed615332eac73cd172dabc Mon Sep 17 00:00:00 2001
From: Gesmira <gesmiramolla@gmail.com>
Date: Fri, 23 Feb 2024 16:57:34 -0500
Subject: [PATCH 2/7] init pansci

---
 mouse_pansci/processing.R | 28 ++++++----------------------
 1 file changed, 6 insertions(+), 22 deletions(-)

diff --git a/mouse_pansci/processing.R b/mouse_pansci/processing.R
index 934b22a..df5a10c 100644
--- a/mouse_pansci/processing.R
+++ b/mouse_pansci/processing.R
@@ -1,4 +1,4 @@
-
+# Script to process pansci downsampled dataset
 
 # Convert to seurat object
 # sceasy::convertFormat("/brahms/mollag/azimuth/pansci/20240222_PanSci_all_cells_adata_sampled_by_subtype.h5ad", from="anndata", 
@@ -10,7 +10,8 @@ a <- NormalizeData(a)
 a <- FindVariableFeatures(a, nfeatures = 5000)
 a <- ScaleData(a)
 a <- RunPCA(a, npcs = 100)
-a <- RunUMAP(a, dims = 1:100)
+a <- RunUMAP(a, dims = 1:100, return.model = T)
+saveRDS(a, "/brahms/mollag/azimuth/pansci/pansci_processed_umap_100_dims.rds")
 
 ##### Post processing ####
 a <- readRDS("/brahms/mollag/azimuth/pansci/pansci_processed_umap_100_dims.rds")
@@ -18,32 +19,15 @@ a <- readRDS("/brahms/mollag/azimuth/pansci/pansci_processed_umap_100_dims.rds")
 DimPlot(a, group.by = "Main_cell_type", label = T, repel = T, alpha = 0.1) + NoLegend()
 DimPlot(a, group.by = "Lineage", label = T, repel = T, alpha = 0.1) + NoLegend()
 DimPlot(a, group.by = "Organ_name", label = T, repel = T, alpha = 0.1) + NoLegend()
-saveRDS(obj, full.reference)
 
 
 
-# Build Azimuth Reference
-azimuth.obj <- AzimuthReference(
-  object = obj,
-  refUMAP = "umap",
-  plotref = "umap",
-  refDR = "pca",
-  refAssay = "RNA",
-  metadata = c("celltype.l1", "celltype.l2"),
-  dims = 1:100,
-  reference.version = "2.0.0"
-)
 
 ########### SCTransform VERSION #########
-a <- readRDS("/brahms/mollag/azimuth/pansci/20240222_PanSci_all_cells_adata_sampled_by_subtype.rds")
+b <- readRDS("/brahms/mollag/azimuth/pansci/20240222_PanSci_all_cells_adata_sampled_by_subtype.rds")
 
-a <- SCTransform(a)
-a <- FindVariableFeatures(a, nfeatures = 5000)
-a <- ScaleData(a)
+b <- SCTransform(b, variable.features.n = 5000)
+b <- ScaleData(b)
 a <- RunPCA(a, npcs = 100)
 a <- RunUMAP(a, dims = 1:100)
 
-
-
-# they actually include "sub_umap" coordinates in the metadata? not sure if they computed the umap themselves? 
-# Skylar looked into this and they dont look the best

From 63aaa8caee490c24bda3fefe90305891fd9388ac Mon Sep 17 00:00:00 2001
From: Gesmira <gesmiramolla@gmail.com>
Date: Mon, 26 Feb 2024 09:46:59 -0500
Subject: [PATCH 3/7] sct processing

---
 mouse_pansci/processing.R | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/mouse_pansci/processing.R b/mouse_pansci/processing.R
index df5a10c..73b45e1 100644
--- a/mouse_pansci/processing.R
+++ b/mouse_pansci/processing.R
@@ -27,7 +27,10 @@ DimPlot(a, group.by = "Organ_name", label = T, repel = T, alpha = 0.1) + NoLegen
 b <- readRDS("/brahms/mollag/azimuth/pansci/20240222_PanSci_all_cells_adata_sampled_by_subtype.rds")
 
 b <- SCTransform(b, variable.features.n = 5000)
-b <- ScaleData(b)
-a <- RunPCA(a, npcs = 100)
-a <- RunUMAP(a, dims = 1:100)
+b <- RunPCA(b, npcs = 100)
+b  <- RunUMAP(b, dims = 1:100)
+saveRDS(b, "/brahms/mollag/azimuth/pansci/pansci_SCT_processed_umap_100_dims.rds")
 
+DimPlot(b, group.by = "Main_cell_type", label = T, repel = T, alpha = 0.1) + NoLegend()
+DimPlot(b, group.by = "Lineage", label = T, repel = T, alpha = 0.1) + NoLegend()
+DimPlot(b, group.by = "Organ_name", label = T, repel = T, alpha = 0.1) + NoLegend()

From 8edb4fe6ca89d0ee81668f2e176d9447f518dfa7 Mon Sep 17 00:00:00 2001
From: zskylarli <zl2876@columbia.edu>
Date: Tue, 12 Mar 2024 15:04:59 -0400
Subject: [PATCH 4/7] add ref build script

---
 mouse_pansci/build_reference.R | 59 ++++++++++++++++++++++++++++++++++
 1 file changed, 59 insertions(+)
 create mode 100644 mouse_pansci/build_reference.R

diff --git a/mouse_pansci/build_reference.R b/mouse_pansci/build_reference.R
new file mode 100644
index 0000000..1635665
--- /dev/null
+++ b/mouse_pansci/build_reference.R
@@ -0,0 +1,59 @@
+#!/usr/bin/env Rscript
+
+# Script to build mouse pansci reference 1.0.0
+
+# Rscript {input.script} {input.counts} {input.annotations} {input.dr} {output.ref} {output.idx} {output.obj}
+
+library(Seurat)
+library(Azimuth)
+
+args = commandArgs(trailingOnly=TRUE)
+obj_original <- readRDS(args[1])
+annotations <- read.csv(args[2], row.names = 1)
+azimuth.reference <- args[3]
+azimuth.index <- args[4]
+full.reference <- args[5]
+
+# Convert gene IDs to gene symbols based on annotations file 
+pansci.mtx <- LayerData(obj_original, assay="RNA", layer="counts")
+gene_names <- read.csv(annotations)
+rownames(pansci.mtx) <- gene_names$gene_name
+subset.pansci.mtx <- pansci.mtx[grep("^ENSMUSG", rownames(pansci.mtx), invert = TRUE), ]
+
+# Rebuild object from RNA assay and converted gene names
+pansci.assay <- CreateAssayObject(counts = subset.pansci.mtx)
+obj <- CreateSeuratObject(counts=pansci.assay, assay="RNA")
+obj <- AddMetaData(obj, obj_original[[]])
+
+# Normalize wtih SCT
+obj <- SCTransform(obj, variable.features.n = 5000)
+
+# Run PCA on the SCT residuals 
+obj <- RunPCA(obj, assay = "SCT", reduction.name = "pca", npcs = 100)
+
+# Build UMAP model 
+obj <- RunUMAP(obj, dims = 1:100, return.model = T)
+
+saveRDS(obj, full.reference)
+
+# Build Azimuth Reference
+azimuth.obj <- AzimuthReference(obj,
+                        refUMAP='umap',
+                        refDR='pca',
+                        refAssay = 'SCT',
+                        dims = 1:100,
+                        plotref='umap',
+                        metadata = c("Main_cell_type","Sub_cell_type", "Sub_cell_type_organ"),
+                        reference.version = "1.0.0"
+)
+
+ValidateAzimuthReference(object = azimuth.obj)
+
+SaveAnnoyIndex(
+  object = azimuth.obj[["refdr.annoy.neighbors"]],
+  file = azimuth.index
+)
+saveRDS(
+  object = azimuth.obj,
+  file = azimuth.reference
+)
\ No newline at end of file

From 2f6e4d86d52b6b6ad3fab6db8def8032315b4ac5 Mon Sep 17 00:00:00 2001
From: zskylarli <zl2876@columbia.edu>
Date: Tue, 12 Mar 2024 17:37:07 -0400
Subject: [PATCH 5/7] add demo build file

---
 mouse_pansci/build_demo.R | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)
 create mode 100644 mouse_pansci/build_demo.R

diff --git a/mouse_pansci/build_demo.R b/mouse_pansci/build_demo.R
new file mode 100644
index 0000000..472bdc0
--- /dev/null
+++ b/mouse_pansci/build_demo.R
@@ -0,0 +1,17 @@
+#!/usr/bin/env Rscript
+
+# Script to build human liver demo
+
+# Demo object can be downloaded at https://cellxgene.cziscience.com/collections/fe0e718d-2ee9-42cc-894b-0b490f437dfd
+
+library(Seurat)
+library(SeuratDisk)
+
+args = commandArgs(trailingOnly=TRUE)
+demo.path <- args[1] 
+demo.output <- args[2]
+
+demo <- readRDS(demo.path, gene.column = 1)
+demo <- subset(demo, cells = sample(Cells(demo), size = 20000))
+
+saveRDS(demo.obj, demo.output)

From 8fc0bffc3c6fa8ce70fd547968f29ef8eb24f365 Mon Sep 17 00:00:00 2001
From: Gesmira <gesmiramolla@gmail.com>
Date: Wed, 26 Jun 2024 13:58:40 -0400
Subject: [PATCH 6/7] updating scripts

---
 mouse_pansci/Snakefile                  | 129 ++++++++++++++++++++++++
 mouse_pansci/scripts/build_demo.R       |  17 ++++
 mouse_pansci/scripts/build_reference.R  |  59 +++++++++++
 mouse_pansci/scripts/convert_to_h5ad.R  |  35 +++++++
 mouse_pansci/scripts/convert_to_zarr.py |  13 +++
 5 files changed, 253 insertions(+)
 create mode 100644 mouse_pansci/Snakefile
 create mode 100644 mouse_pansci/scripts/build_demo.R
 create mode 100644 mouse_pansci/scripts/build_reference.R
 create mode 100644 mouse_pansci/scripts/convert_to_h5ad.R
 create mode 100644 mouse_pansci/scripts/convert_to_zarr.py

diff --git a/mouse_pansci/Snakefile b/mouse_pansci/Snakefile
new file mode 100644
index 0000000..f886301
--- /dev/null
+++ b/mouse_pansci/Snakefile
@@ -0,0 +1,129 @@
+############################## Config #########################################
+container: "docker://satijalab/azimuth-references:human_adipose-1.0.0" 
+
+############################## All ############################################
+rule all:
+    input:
+        "reference/ref.Rds",
+        "reference/idx.annoy"
+
+############################## Reference ######################################
+rule download:
+    params:
+        aizarani_url = "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE124nnn/GSE124395/suppl/GSE124395_Normalhumanlivercellatlasdata.txt.gz",
+        macparland_url = "'https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE115469&format=file&file=GSE115469%5FData%2Ecsv%2Egz'",
+        payen_url = "'https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE158723&format=file'", 
+        ramachandran_url = "'https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE136103&format=file'", 
+        zhang_url = "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE138nnn/GSE138709/suppl/GSE138709_RAW.tar"
+    output:
+        "logs/download_data.log",
+        "data/GSE124395_Normalhumanlivercellatlasdata.txt",
+        "data/natcomm_GSE115469.csv.gz",
+        "data/mockfile.txt"
+        #"data/payen/*.gz",
+        #"data/ramachandran/*.gz",
+        #"data/zhang/*.gz"
+    shell:
+        """
+        wget {params.aizarani_url} -P data
+        gunzip data/GSE124395_Normalhumanlivercellatlasdata.txt.gz 
+        wget -O data/natcomm_GSE115469.csv.gz \
+        {params.macparland_url}
+        wget -O data/JHEP_GSE158723.tar \
+        'https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE158723&format=file'
+        mkdir data/payen/
+        tar -xvf data/JHEP_GSE158723.tar -C data/payen/
+        FILES=$(ls data/payen/*.gz)
+        for f in $FILES ; do
+            basename=$(basename ${{f}})
+            num=${{basename:8:2}}
+            substring=$(echo $f| cut --complement -d'_' -f 1,2)
+            mv "$f" "$substring"
+            mkdir -p data/payen/GSM"$num"
+            mv "$substring" data/payen/GSM"$num"/
+            gunzip data/payen/GSM"$num"/"$substring"
+        done
+        wget -O data/GSE136103_RAW.tar {params.ramachandran_url}
+        mkdir data/ramachandran/
+        tar -xvf data/GSE136103_RAW.tar -C data/ramachandran/
+        FILES=$(ls data/ramachandran/*healthy[0-9]*)
+        for f in $FILES ; do
+            basename=$(basename ${{f}})
+            num=${{basename:8:2}}
+            substring=$(echo $f| cut --complement -d'_' -f 1,2,3)
+            mv "$f" "$substring"
+            mkdir -p data/ramachandran/GSM"$num"
+            mv "$substring" data/ramachandran/GSM"$num"/
+            gunzip data/ramachandran/GSM"$num"/"$substring"
+        done
+        rm data/ramachandran/*.gz
+        wget {params.zhang_url} -P data
+        mkdir data/zhang/
+        tar -xvf data/GSE138709_RAW.tar -C data/zhang/
+        touch data/mockfile.txt
+        echo "Liver reference data downloaded on: $(date)" > logs/download_data.log 
+        """
+
+rule build_reference:
+    input:
+        script = "scripts/build_reference.R",
+        aizarani = "data/GSE124395_Normalhumanlivercellatlasdata.txt",
+        macparland = "data/natcomm_GSE115469.csv.gz",
+        payen = "data/payen",
+        ramachandran = "data/ramachandran",
+        zhang = "data/zhang",
+        annotations = "data/annotations.csv"
+    output:
+        ref = "reference/ref.Rds",
+        idx = "reference/idx.annoy",
+        obj = "full_reference.Rds"
+    shell:
+        """
+        Rscript {input.script} {input.aizarani} {input.macparland} {input.payen} {input.ramachandran} {input.zhang} {input.annotations} {output.ref} {output.idx} {output.obj} 
+        """
+############################## Demo ######################################
+rule download_demo:
+    params:
+        demo_url = "https://www.livercellatlas.org/data_files/toDownload/rawData_human.zip"
+    output:
+        "logs/download_data.log",
+        "data/demo/rawData_human/countTable_human/barcodes.tsv.gz",
+        "data/demo/rawData_human/countTable_human/features.tsv.gz",
+        "data/demo/rawData_human/countTable_human/matrix.mtx.gz"
+    shell:
+        """
+        wget {params.demo_url} - P data/demo
+        gunzip rawData_human 
+        echo "Demo data downloaded on: $(date)" > logs/download_data.log 
+        """
+
+rule build_demo:
+    input:
+        script = "scripts/build_demo.R",
+        demo = "data/demo/rawData_human/countTable_human/"
+    output:
+        obj = "reference/demo.Rds"
+    shell:
+        """
+        Rscript {input.script} {input.demo} {output.obj}
+        """
+   
+############################## Explorer  ########################################
+rule export_zarr:
+    input:
+        ref = "reference/ref.Rds",
+        fullref = "full_reference.Rds",
+        script1 = "scripts/convert_to_h5ad.R",
+        script2 = "scripts/convert_to_zarr.py"
+    output:
+        h5Seurat = "vitessce/vitessce_ref.h5Seurat",
+        h5ad = "vitessce/vitessce_ref.h5ad",
+        zarr = directory("vitessce/vitessce_ref.zarr")
+    container:
+        "docker://satijalab/azimuth-references:vitessce"
+    shell:
+        """ 
+        mkdir -p vitessce
+        Rscript {input.script1} {input.ref} {input.fullref} {output.h5Seurat}
+        python3 {input.script2} {output.h5ad} {output.zarr}
+        """ 
\ No newline at end of file
diff --git a/mouse_pansci/scripts/build_demo.R b/mouse_pansci/scripts/build_demo.R
new file mode 100644
index 0000000..472bdc0
--- /dev/null
+++ b/mouse_pansci/scripts/build_demo.R
@@ -0,0 +1,17 @@
+#!/usr/bin/env Rscript
+
+# Script to build human liver demo
+
+# Demo object can be downloaded at https://cellxgene.cziscience.com/collections/fe0e718d-2ee9-42cc-894b-0b490f437dfd
+
+library(Seurat)
+library(SeuratDisk)
+
+args = commandArgs(trailingOnly=TRUE)
+demo.path <- args[1] 
+demo.output <- args[2]
+
+demo <- readRDS(demo.path, gene.column = 1)
+demo <- subset(demo, cells = sample(Cells(demo), size = 20000))
+
+saveRDS(demo.obj, demo.output)
diff --git a/mouse_pansci/scripts/build_reference.R b/mouse_pansci/scripts/build_reference.R
new file mode 100644
index 0000000..1635665
--- /dev/null
+++ b/mouse_pansci/scripts/build_reference.R
@@ -0,0 +1,59 @@
+#!/usr/bin/env Rscript
+
+# Script to build mouse pansci reference 1.0.0
+
+# Rscript {input.script} {input.counts} {input.annotations} {input.dr} {output.ref} {output.idx} {output.obj}
+
+library(Seurat)
+library(Azimuth)
+
+args = commandArgs(trailingOnly=TRUE)
+obj_original <- readRDS(args[1])
+annotations <- read.csv(args[2], row.names = 1)
+azimuth.reference <- args[3]
+azimuth.index <- args[4]
+full.reference <- args[5]
+
+# Convert gene IDs to gene symbols based on annotations file 
+pansci.mtx <- LayerData(obj_original, assay="RNA", layer="counts")
+gene_names <- read.csv(annotations)
+rownames(pansci.mtx) <- gene_names$gene_name
+subset.pansci.mtx <- pansci.mtx[grep("^ENSMUSG", rownames(pansci.mtx), invert = TRUE), ]
+
+# Rebuild object from RNA assay and converted gene names
+pansci.assay <- CreateAssayObject(counts = subset.pansci.mtx)
+obj <- CreateSeuratObject(counts=pansci.assay, assay="RNA")
+obj <- AddMetaData(obj, obj_original[[]])
+
+# Normalize wtih SCT
+obj <- SCTransform(obj, variable.features.n = 5000)
+
+# Run PCA on the SCT residuals 
+obj <- RunPCA(obj, assay = "SCT", reduction.name = "pca", npcs = 100)
+
+# Build UMAP model 
+obj <- RunUMAP(obj, dims = 1:100, return.model = T)
+
+saveRDS(obj, full.reference)
+
+# Build Azimuth Reference
+azimuth.obj <- AzimuthReference(obj,
+                        refUMAP='umap',
+                        refDR='pca',
+                        refAssay = 'SCT',
+                        dims = 1:100,
+                        plotref='umap',
+                        metadata = c("Main_cell_type","Sub_cell_type", "Sub_cell_type_organ"),
+                        reference.version = "1.0.0"
+)
+
+ValidateAzimuthReference(object = azimuth.obj)
+
+SaveAnnoyIndex(
+  object = azimuth.obj[["refdr.annoy.neighbors"]],
+  file = azimuth.index
+)
+saveRDS(
+  object = azimuth.obj,
+  file = azimuth.reference
+)
\ No newline at end of file
diff --git a/mouse_pansci/scripts/convert_to_h5ad.R b/mouse_pansci/scripts/convert_to_h5ad.R
new file mode 100644
index 0000000..29e99f3
--- /dev/null
+++ b/mouse_pansci/scripts/convert_to_h5ad.R
@@ -0,0 +1,35 @@
+#!/usr/bin/env Rscript
+library(Seurat)
+library(SeuratDisk)
+args <- commandArgs(trailingOnly = TRUE)
+
+ref <- readRDS(file = args[1])
+fullref <- readRDS(file = args[2])
+fullref <- subset(x = fullref, cells = Cells(x = ref))
+fullref[['umap']] <- ref[['refUMAP']]
+Key(object = fullref[['umap']]) <- "umap_"
+DefaultAssay(object = fullref[['umap']]) <- "RNA"
+
+DefaultAssay(object = fullref) <- "RNA"
+fullref <- NormalizeData(object = fullref)
+fullref <- DietSeurat(
+  object = fullref,
+  dimreducs = "umap",
+  assays = "RNA"
+)
+for (i in colnames(x = fullref[[]])) {
+  fullref[[i]] <- NULL
+}
+fullref <- AddMetaData(object = fullref, metadata = ref[[]])
+Misc(object = fullref[['umap']], slot = "model") <- NULL
+
+fullref <- RenameCells(object = fullref, new.names = paste0("cell", 1:ncol(x = fullref)))
+
+for (i in colnames(x = fullref[[]])) {
+  if (is.factor(x = fullref[[i, drop = TRUE]])) {
+    fullref[[i]] <- as.character(x = fullref[[i, drop = TRUE]])
+  }
+}
+
+SaveH5Seurat(object = fullref, file = args[3], overwrite = TRUE)
+Convert(args[3], dest = "h5ad", overwrite = TRUE)
diff --git a/mouse_pansci/scripts/convert_to_zarr.py b/mouse_pansci/scripts/convert_to_zarr.py
new file mode 100644
index 0000000..2e7a66f
--- /dev/null
+++ b/mouse_pansci/scripts/convert_to_zarr.py
@@ -0,0 +1,13 @@
+#!/usr/bin/python
+import scanpy as sc
+import sys
+import zarr
+from scipy import sparse
+
+adata = sc.read_h5ad(sys.argv[1])
+del adata.raw
+adata.var.index = adata.var.index.astype('str')
+adata.obs.index = adata.obs.index.astype('str')
+adata.var_names = adata.var_names.astype(str)
+adata.X = adata.X.tocsc()
+adata.write_zarr(sys.argv[2], [adata.shape[0], 10])

From 888a2b3d2027014bfda32da81c5efdaf64577ba8 Mon Sep 17 00:00:00 2001
From: zskylarli <zl2876@columbia.edu>
Date: Tue, 5 Nov 2024 15:43:22 -0500
Subject: [PATCH 7/7] formalize scripts for public release

---
 mouse_pansci/Snakefile                       | 82 ++++++++++++++++++++
 mouse_pansci/processing.R                    | 36 ---------
 mouse_pansci/reference/config.json           | 13 ++++
 mouse_pansci/{ => scripts}/build_demo.R      |  4 +-
 mouse_pansci/{ => scripts}/build_reference.R |  6 +-
 mouse_pansci/scripts/convert_to_h5ad.R       | 35 +++++++++
 mouse_pansci/scripts/convert_to_zarr.py      | 13 ++++
 7 files changed, 147 insertions(+), 42 deletions(-)
 create mode 100644 mouse_pansci/Snakefile
 delete mode 100644 mouse_pansci/processing.R
 create mode 100644 mouse_pansci/reference/config.json
 rename mouse_pansci/{ => scripts}/build_demo.R (65%)
 rename mouse_pansci/{ => scripts}/build_reference.R (89%)
 create mode 100644 mouse_pansci/scripts/convert_to_h5ad.R
 create mode 100644 mouse_pansci/scripts/convert_to_zarr.py

diff --git a/mouse_pansci/Snakefile b/mouse_pansci/Snakefile
new file mode 100644
index 0000000..5ffd1ab
--- /dev/null
+++ b/mouse_pansci/Snakefile
@@ -0,0 +1,82 @@
+############################## Config #########################################
+container: "docker://satijalab/azimuth-references:latest" 
+
+############################## All  ###########################################
+rule all:
+    input:
+        "reference/ref.Rds",
+        "reference/idx.annoy",
+        "reference/demo.rds"
+
+############################## Reference #######################################
+rule download:
+    params:
+        data_url = "https://zenodo.org/records/14042645/files/pansci_filtered.rds"
+    output:
+        "logs/download_data.log",
+        "data/obj.rds"
+    shell:
+        """
+        wget {params.data_url} -P data
+        mv data/pansci_filtered.rds data/obj.rds
+        echo "Mouse PanSci data downloaded on: $(date)" > logs/download_data.log
+        """
+
+rule build_reference:
+    input:
+        script = "scripts/build_reference.R",
+        obj = "data/obj.rds",
+        annotations = "data/pansci_genenames.csv",
+    output:
+        ref = "reference/ref.Rds",
+        idx = "reference/idx.annoy",
+        obj = "full_reference.Rds"
+    shell:
+        """
+        Rscript {input.script} {input.obj} {input.genenames} {output.ref} {output.idx} {output.obj}
+        """
+
+############################## Demo ############################################
+rule download_demo:
+    params:
+        data_url = "https://datasets.cellxgene.cziscience.com/4103ff27-c13c-4092-87a7-5cd71ce57d26.rds",
+    output:
+        "logs/download_demo_data.log",
+        "data/demo/adipose_emont_2022.rds"
+    shell:
+        """
+        wget {params.data_url} -P data/demo
+        mv data/demo/4103ff27-c13c-4092-87a7-5cd71ce57d26.rds data/demo/adipose_emont_2022.rds
+        echo "Mouse adipose demo data downloaded on: $(date)" > logs/download_demo_data.log
+        """
+
+rule setup_demo:
+    input:
+        data = "data/demo/adipose_emont_2022.rds"
+        script = "scripts/setup_demo.R",
+
+    output:
+        "reference/adipose_emont_2022_subset.rds"
+    shell:
+        """
+        Rscript {input.script} {input.data} {output} > logs/setup_demo.Rout 2>&1
+        """
+
+rule export_zarr:
+    input: 
+        ref = "reference/ref.Rds",
+        fullref = "full_reference.Rds",
+        script1 = "scripts/convert_to_h5ad.R",
+        script2 = "scripts/convert_to_zarr.py"
+    output:
+        h5Seurat = "vitessce/vitessce_ref.h5Seurat",
+        h5ad = "vitessce/vitessce_ref.h5ad",
+        zarr = directory("vitessce/vitessce_ref.zarr")
+    container:
+        "docker://satijalab/azimuth-references:vitessce"
+    shell:
+        """
+        mkdir -p vitessce
+        Rscript {input.script1} {input.ref} {input.fullref} {output.h5Seurat} > logs/export_zarr_anndata.Rout 2>&1
+        python3 {input.script2} {output.h5ad} {output.zarr}
+        """
\ No newline at end of file
diff --git a/mouse_pansci/processing.R b/mouse_pansci/processing.R
deleted file mode 100644
index 73b45e1..0000000
--- a/mouse_pansci/processing.R
+++ /dev/null
@@ -1,36 +0,0 @@
-# Script to process pansci downsampled dataset
-
-# Convert to seurat object
-# sceasy::convertFormat("/brahms/mollag/azimuth/pansci/20240222_PanSci_all_cells_adata_sampled_by_subtype.h5ad", from="anndata", 
-# to="seurat", outFile = "/brahms/mollag/azimuth/pansci/20240222_PanSci_all_cells_adata_sampled_by_subtype.rds")
-
-a <- readRDS("/brahms/mollag/azimuth/pansci/20240222_PanSci_all_cells_adata_sampled_by_subtype.rds")
-
-a <- NormalizeData(a)
-a <- FindVariableFeatures(a, nfeatures = 5000)
-a <- ScaleData(a)
-a <- RunPCA(a, npcs = 100)
-a <- RunUMAP(a, dims = 1:100, return.model = T)
-saveRDS(a, "/brahms/mollag/azimuth/pansci/pansci_processed_umap_100_dims.rds")
-
-##### Post processing ####
-a <- readRDS("/brahms/mollag/azimuth/pansci/pansci_processed_umap_100_dims.rds")
-
-DimPlot(a, group.by = "Main_cell_type", label = T, repel = T, alpha = 0.1) + NoLegend()
-DimPlot(a, group.by = "Lineage", label = T, repel = T, alpha = 0.1) + NoLegend()
-DimPlot(a, group.by = "Organ_name", label = T, repel = T, alpha = 0.1) + NoLegend()
-
-
-
-
-########### SCTransform VERSION #########
-b <- readRDS("/brahms/mollag/azimuth/pansci/20240222_PanSci_all_cells_adata_sampled_by_subtype.rds")
-
-b <- SCTransform(b, variable.features.n = 5000)
-b <- RunPCA(b, npcs = 100)
-b  <- RunUMAP(b, dims = 1:100)
-saveRDS(b, "/brahms/mollag/azimuth/pansci/pansci_SCT_processed_umap_100_dims.rds")
-
-DimPlot(b, group.by = "Main_cell_type", label = T, repel = T, alpha = 0.1) + NoLegend()
-DimPlot(b, group.by = "Lineage", label = T, repel = T, alpha = 0.1) + NoLegend()
-DimPlot(b, group.by = "Organ_name", label = T, repel = T, alpha = 0.1) + NoLegend()
diff --git a/mouse_pansci/reference/config.json b/mouse_pansci/reference/config.json
new file mode 100644
index 0000000..a1398e5
--- /dev/null
+++ b/mouse_pansci/reference/config.json
@@ -0,0 +1,13 @@
+{
+    "Azimuth.app.reference": "/reference-data/mouse_pansci/",
+    "Azimuth.app.demodataset": "/reference-data/mouse_pansci/demo.rds",
+    "Azimuth.app.max_cells": 100000,
+    "Azimuth.app.default_gene": "GLNY",
+    "Azimuth.app.default_metadata": "subclass",
+    "Azimuth.app.do_adt": "FALSE",
+    "Azimuth.app.welcomebox": "div(\n    h3(HTML(\"Please upload a dataset to map to the <b>mouse PanSci</b> reference\")),\n    \"Upload a counts matrix from an scRNA-seq dataset of mouse data in one\n        of the following formats: hdf5, rds, h5ad, h5seurat. For testing, we\n        also provide a demo dataset of 20000 mouse adipose cells (10x Genomics v3),\n        which is loaded automatically with the 'Load demo dataset' button.\",\n    width = 12\n  )",
+    "Azimuth.app.refdescriptor": "<div class='refdescriptor'><br>This reference was generated from the PanSci mouse organismal aging atlas originally described in <a href='https://www.biorxiv.org/content/10.1101/2024.03.01.583001v1'>Zhang et al., 2024</a>, this data was integrated across 3 genetic backgrounds. Annotations are provided at the level of <i>main cell type</i>, <i>sub cell type</i>, and <i>organ</i>. </div>",
+    "Azimuth.app.refuri": "https://seurat.nygenome.org/azimuth/references/v1.0.0/mouse_pansci",
+    "shiny.maxRequestSize": 1048576000,
+    "Azimuth.app.do_adt": "FALSE"
+  }
\ No newline at end of file
diff --git a/mouse_pansci/build_demo.R b/mouse_pansci/scripts/build_demo.R
similarity index 65%
rename from mouse_pansci/build_demo.R
rename to mouse_pansci/scripts/build_demo.R
index 472bdc0..c15f646 100644
--- a/mouse_pansci/build_demo.R
+++ b/mouse_pansci/scripts/build_demo.R
@@ -1,8 +1,6 @@
 #!/usr/bin/env Rscript
 
-# Script to build human liver demo
-
-# Demo object can be downloaded at https://cellxgene.cziscience.com/collections/fe0e718d-2ee9-42cc-894b-0b490f437dfd
+# Script to build demo
 
 library(Seurat)
 library(SeuratDisk)
diff --git a/mouse_pansci/build_reference.R b/mouse_pansci/scripts/build_reference.R
similarity index 89%
rename from mouse_pansci/build_reference.R
rename to mouse_pansci/scripts/build_reference.R
index 1635665..2a5c32d 100644
--- a/mouse_pansci/build_reference.R
+++ b/mouse_pansci/scripts/build_reference.R
@@ -2,21 +2,21 @@
 
 # Script to build mouse pansci reference 1.0.0
 
-# Rscript {input.script} {input.counts} {input.annotations} {input.dr} {output.ref} {output.idx} {output.obj}
+# Rscript {input.script} {input.counts} {input.genenames} {input.dr} {output.ref} {output.idx} {output.obj}
 
 library(Seurat)
 library(Azimuth)
 
 args = commandArgs(trailingOnly=TRUE)
 obj_original <- readRDS(args[1])
-annotations <- read.csv(args[2], row.names = 1)
+genenames <- read.csv(args[2], row.names = 1)
 azimuth.reference <- args[3]
 azimuth.index <- args[4]
 full.reference <- args[5]
 
 # Convert gene IDs to gene symbols based on annotations file 
 pansci.mtx <- LayerData(obj_original, assay="RNA", layer="counts")
-gene_names <- read.csv(annotations)
+gene_names <- read.csv(genenames)
 rownames(pansci.mtx) <- gene_names$gene_name
 subset.pansci.mtx <- pansci.mtx[grep("^ENSMUSG", rownames(pansci.mtx), invert = TRUE), ]
 
diff --git a/mouse_pansci/scripts/convert_to_h5ad.R b/mouse_pansci/scripts/convert_to_h5ad.R
new file mode 100644
index 0000000..f09dc27
--- /dev/null
+++ b/mouse_pansci/scripts/convert_to_h5ad.R
@@ -0,0 +1,35 @@
+#!/usr/bin/env Rscript
+library(Seurat)
+library(SeuratDisk)
+args <- commandArgs(trailingOnly = TRUE)
+
+ref <- readRDS(file = args[1])
+fullref <- readRDS(file = args[2])
+fullref <- subset(x = fullref, cells = Cells(x = ref))
+fullref[['umap']] <- ref[['refUMAP']]
+Key(object = fullref[['umap']]) <- "umap_"
+DefaultAssay(object = fullref[['umap']]) <- "RNA"
+
+DefaultAssay(object = fullref) <- "RNA"
+fullref <- NormalizeData(object = fullref)
+fullref <- DietSeurat(
+  object = fullref,
+  dimreducs = "umap",
+  assays = "RNA"
+)
+for (i in colnames(x = fullref[[]])) {
+  fullref[[i]] <- NULL
+}
+fullref <- AddMetaData(object = fullref, metadata = ref[[]])
+Misc(object = fullref[['umap']], slot = "model") <- NULL
+
+fullref <- RenameCells(object = fullref, new.names = paste0("cell", 1:ncol(x = fullref)))
+
+for (i in colnames(x = fullref[[]])) {
+  if (is.factor(x = fullref[[i, drop = TRUE]])) {
+    fullref[[i]] <- as.character(x = fullref[[i, drop = TRUE]])
+  }
+}
+
+SaveH5Seurat(object = fullref, file = args[3], overwrite = TRUE)
+Convert(args[3], dest = "h5ad", overwrite = TRUE)
\ No newline at end of file
diff --git a/mouse_pansci/scripts/convert_to_zarr.py b/mouse_pansci/scripts/convert_to_zarr.py
new file mode 100644
index 0000000..79ec200
--- /dev/null
+++ b/mouse_pansci/scripts/convert_to_zarr.py
@@ -0,0 +1,13 @@
+#!/usr/bin/python
+import scanpy as sc
+import sys
+import zarr
+from scipy import sparse
+
+adata = sc.read_h5ad(sys.argv[1])
+del adata.raw
+adata.var.index = adata.var.index.astype('str')
+adata.obs.index = adata.obs.index.astype('str')
+adata.var_names = adata.var_names.astype(str)
+adata.X = adata.X.tocsc()
+adata.write_zarr(sys.argv[2], [adata.shape[0], 10])
\ No newline at end of file