From 1d818dbbba3a907f4d4c987692fd39e806a763dd Mon Sep 17 00:00:00 2001 From: Gesmira Date: Fri, 23 Feb 2024 16:50:31 -0500 Subject: [PATCH 1/7] init pansci --- mouse_pansci/processing.R | 49 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 mouse_pansci/processing.R diff --git a/mouse_pansci/processing.R b/mouse_pansci/processing.R new file mode 100644 index 0000000..934b22a --- /dev/null +++ b/mouse_pansci/processing.R @@ -0,0 +1,49 @@ + + +# Convert to seurat object +# sceasy::convertFormat("/brahms/mollag/azimuth/pansci/20240222_PanSci_all_cells_adata_sampled_by_subtype.h5ad", from="anndata", +# to="seurat", outFile = "/brahms/mollag/azimuth/pansci/20240222_PanSci_all_cells_adata_sampled_by_subtype.rds") + +a <- readRDS("/brahms/mollag/azimuth/pansci/20240222_PanSci_all_cells_adata_sampled_by_subtype.rds") + +a <- NormalizeData(a) +a <- FindVariableFeatures(a, nfeatures = 5000) +a <- ScaleData(a) +a <- RunPCA(a, npcs = 100) +a <- RunUMAP(a, dims = 1:100) + +##### Post processing #### +a <- readRDS("/brahms/mollag/azimuth/pansci/pansci_processed_umap_100_dims.rds") + +DimPlot(a, group.by = "Main_cell_type", label = T, repel = T, alpha = 0.1) + NoLegend() +DimPlot(a, group.by = "Lineage", label = T, repel = T, alpha = 0.1) + NoLegend() +DimPlot(a, group.by = "Organ_name", label = T, repel = T, alpha = 0.1) + NoLegend() +saveRDS(obj, full.reference) + + + +# Build Azimuth Reference +azimuth.obj <- AzimuthReference( + object = obj, + refUMAP = "umap", + plotref = "umap", + refDR = "pca", + refAssay = "RNA", + metadata = c("celltype.l1", "celltype.l2"), + dims = 1:100, + reference.version = "2.0.0" +) + +########### SCTransform VERSION ######### +a <- readRDS("/brahms/mollag/azimuth/pansci/20240222_PanSci_all_cells_adata_sampled_by_subtype.rds") + +a <- SCTransform(a) +a <- FindVariableFeatures(a, nfeatures = 5000) +a <- ScaleData(a) +a <- RunPCA(a, npcs = 100) +a <- RunUMAP(a, dims = 1:100) + + + +# they actually include "sub_umap" coordinates in the metadata? not sure if they computed the umap themselves? +# Skylar looked into this and they dont look the best From 970dd0282626d3fae5ed615332eac73cd172dabc Mon Sep 17 00:00:00 2001 From: Gesmira Date: Fri, 23 Feb 2024 16:57:34 -0500 Subject: [PATCH 2/7] init pansci --- mouse_pansci/processing.R | 28 ++++++---------------------- 1 file changed, 6 insertions(+), 22 deletions(-) diff --git a/mouse_pansci/processing.R b/mouse_pansci/processing.R index 934b22a..df5a10c 100644 --- a/mouse_pansci/processing.R +++ b/mouse_pansci/processing.R @@ -1,4 +1,4 @@ - +# Script to process pansci downsampled dataset # Convert to seurat object # sceasy::convertFormat("/brahms/mollag/azimuth/pansci/20240222_PanSci_all_cells_adata_sampled_by_subtype.h5ad", from="anndata", @@ -10,7 +10,8 @@ a <- NormalizeData(a) a <- FindVariableFeatures(a, nfeatures = 5000) a <- ScaleData(a) a <- RunPCA(a, npcs = 100) -a <- RunUMAP(a, dims = 1:100) +a <- RunUMAP(a, dims = 1:100, return.model = T) +saveRDS(a, "/brahms/mollag/azimuth/pansci/pansci_processed_umap_100_dims.rds") ##### Post processing #### a <- readRDS("/brahms/mollag/azimuth/pansci/pansci_processed_umap_100_dims.rds") @@ -18,32 +19,15 @@ a <- readRDS("/brahms/mollag/azimuth/pansci/pansci_processed_umap_100_dims.rds") DimPlot(a, group.by = "Main_cell_type", label = T, repel = T, alpha = 0.1) + NoLegend() DimPlot(a, group.by = "Lineage", label = T, repel = T, alpha = 0.1) + NoLegend() DimPlot(a, group.by = "Organ_name", label = T, repel = T, alpha = 0.1) + NoLegend() -saveRDS(obj, full.reference) -# Build Azimuth Reference -azimuth.obj <- AzimuthReference( - object = obj, - refUMAP = "umap", - plotref = "umap", - refDR = "pca", - refAssay = "RNA", - metadata = c("celltype.l1", "celltype.l2"), - dims = 1:100, - reference.version = "2.0.0" -) ########### SCTransform VERSION ######### -a <- readRDS("/brahms/mollag/azimuth/pansci/20240222_PanSci_all_cells_adata_sampled_by_subtype.rds") +b <- readRDS("/brahms/mollag/azimuth/pansci/20240222_PanSci_all_cells_adata_sampled_by_subtype.rds") -a <- SCTransform(a) -a <- FindVariableFeatures(a, nfeatures = 5000) -a <- ScaleData(a) +b <- SCTransform(b, variable.features.n = 5000) +b <- ScaleData(b) a <- RunPCA(a, npcs = 100) a <- RunUMAP(a, dims = 1:100) - - -# they actually include "sub_umap" coordinates in the metadata? not sure if they computed the umap themselves? -# Skylar looked into this and they dont look the best From 63aaa8caee490c24bda3fefe90305891fd9388ac Mon Sep 17 00:00:00 2001 From: Gesmira Date: Mon, 26 Feb 2024 09:46:59 -0500 Subject: [PATCH 3/7] sct processing --- mouse_pansci/processing.R | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/mouse_pansci/processing.R b/mouse_pansci/processing.R index df5a10c..73b45e1 100644 --- a/mouse_pansci/processing.R +++ b/mouse_pansci/processing.R @@ -27,7 +27,10 @@ DimPlot(a, group.by = "Organ_name", label = T, repel = T, alpha = 0.1) + NoLegen b <- readRDS("/brahms/mollag/azimuth/pansci/20240222_PanSci_all_cells_adata_sampled_by_subtype.rds") b <- SCTransform(b, variable.features.n = 5000) -b <- ScaleData(b) -a <- RunPCA(a, npcs = 100) -a <- RunUMAP(a, dims = 1:100) +b <- RunPCA(b, npcs = 100) +b <- RunUMAP(b, dims = 1:100) +saveRDS(b, "/brahms/mollag/azimuth/pansci/pansci_SCT_processed_umap_100_dims.rds") +DimPlot(b, group.by = "Main_cell_type", label = T, repel = T, alpha = 0.1) + NoLegend() +DimPlot(b, group.by = "Lineage", label = T, repel = T, alpha = 0.1) + NoLegend() +DimPlot(b, group.by = "Organ_name", label = T, repel = T, alpha = 0.1) + NoLegend() From 8edb4fe6ca89d0ee81668f2e176d9447f518dfa7 Mon Sep 17 00:00:00 2001 From: zskylarli Date: Tue, 12 Mar 2024 15:04:59 -0400 Subject: [PATCH 4/7] add ref build script --- mouse_pansci/build_reference.R | 59 ++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 mouse_pansci/build_reference.R diff --git a/mouse_pansci/build_reference.R b/mouse_pansci/build_reference.R new file mode 100644 index 0000000..1635665 --- /dev/null +++ b/mouse_pansci/build_reference.R @@ -0,0 +1,59 @@ +#!/usr/bin/env Rscript + +# Script to build mouse pansci reference 1.0.0 + +# Rscript {input.script} {input.counts} {input.annotations} {input.dr} {output.ref} {output.idx} {output.obj} + +library(Seurat) +library(Azimuth) + +args = commandArgs(trailingOnly=TRUE) +obj_original <- readRDS(args[1]) +annotations <- read.csv(args[2], row.names = 1) +azimuth.reference <- args[3] +azimuth.index <- args[4] +full.reference <- args[5] + +# Convert gene IDs to gene symbols based on annotations file +pansci.mtx <- LayerData(obj_original, assay="RNA", layer="counts") +gene_names <- read.csv(annotations) +rownames(pansci.mtx) <- gene_names$gene_name +subset.pansci.mtx <- pansci.mtx[grep("^ENSMUSG", rownames(pansci.mtx), invert = TRUE), ] + +# Rebuild object from RNA assay and converted gene names +pansci.assay <- CreateAssayObject(counts = subset.pansci.mtx) +obj <- CreateSeuratObject(counts=pansci.assay, assay="RNA") +obj <- AddMetaData(obj, obj_original[[]]) + +# Normalize wtih SCT +obj <- SCTransform(obj, variable.features.n = 5000) + +# Run PCA on the SCT residuals +obj <- RunPCA(obj, assay = "SCT", reduction.name = "pca", npcs = 100) + +# Build UMAP model +obj <- RunUMAP(obj, dims = 1:100, return.model = T) + +saveRDS(obj, full.reference) + +# Build Azimuth Reference +azimuth.obj <- AzimuthReference(obj, + refUMAP='umap', + refDR='pca', + refAssay = 'SCT', + dims = 1:100, + plotref='umap', + metadata = c("Main_cell_type","Sub_cell_type", "Sub_cell_type_organ"), + reference.version = "1.0.0" +) + +ValidateAzimuthReference(object = azimuth.obj) + +SaveAnnoyIndex( + object = azimuth.obj[["refdr.annoy.neighbors"]], + file = azimuth.index +) +saveRDS( + object = azimuth.obj, + file = azimuth.reference +) \ No newline at end of file From 2f6e4d86d52b6b6ad3fab6db8def8032315b4ac5 Mon Sep 17 00:00:00 2001 From: zskylarli Date: Tue, 12 Mar 2024 17:37:07 -0400 Subject: [PATCH 5/7] add demo build file --- mouse_pansci/build_demo.R | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 mouse_pansci/build_demo.R diff --git a/mouse_pansci/build_demo.R b/mouse_pansci/build_demo.R new file mode 100644 index 0000000..472bdc0 --- /dev/null +++ b/mouse_pansci/build_demo.R @@ -0,0 +1,17 @@ +#!/usr/bin/env Rscript + +# Script to build human liver demo + +# Demo object can be downloaded at https://cellxgene.cziscience.com/collections/fe0e718d-2ee9-42cc-894b-0b490f437dfd + +library(Seurat) +library(SeuratDisk) + +args = commandArgs(trailingOnly=TRUE) +demo.path <- args[1] +demo.output <- args[2] + +demo <- readRDS(demo.path, gene.column = 1) +demo <- subset(demo, cells = sample(Cells(demo), size = 20000)) + +saveRDS(demo.obj, demo.output) From 8fc0bffc3c6fa8ce70fd547968f29ef8eb24f365 Mon Sep 17 00:00:00 2001 From: Gesmira Date: Wed, 26 Jun 2024 13:58:40 -0400 Subject: [PATCH 6/7] updating scripts --- mouse_pansci/Snakefile | 129 ++++++++++++++++++++++++ mouse_pansci/scripts/build_demo.R | 17 ++++ mouse_pansci/scripts/build_reference.R | 59 +++++++++++ mouse_pansci/scripts/convert_to_h5ad.R | 35 +++++++ mouse_pansci/scripts/convert_to_zarr.py | 13 +++ 5 files changed, 253 insertions(+) create mode 100644 mouse_pansci/Snakefile create mode 100644 mouse_pansci/scripts/build_demo.R create mode 100644 mouse_pansci/scripts/build_reference.R create mode 100644 mouse_pansci/scripts/convert_to_h5ad.R create mode 100644 mouse_pansci/scripts/convert_to_zarr.py diff --git a/mouse_pansci/Snakefile b/mouse_pansci/Snakefile new file mode 100644 index 0000000..f886301 --- /dev/null +++ b/mouse_pansci/Snakefile @@ -0,0 +1,129 @@ +############################## Config ######################################### +container: "docker://satijalab/azimuth-references:human_adipose-1.0.0" + +############################## All ############################################ +rule all: + input: + "reference/ref.Rds", + "reference/idx.annoy" + +############################## Reference ###################################### +rule download: + params: + aizarani_url = "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE124nnn/GSE124395/suppl/GSE124395_Normalhumanlivercellatlasdata.txt.gz", + macparland_url = "'https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE115469&format=file&file=GSE115469%5FData%2Ecsv%2Egz'", + payen_url = "'https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE158723&format=file'", + ramachandran_url = "'https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE136103&format=file'", + zhang_url = "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE138nnn/GSE138709/suppl/GSE138709_RAW.tar" + output: + "logs/download_data.log", + "data/GSE124395_Normalhumanlivercellatlasdata.txt", + "data/natcomm_GSE115469.csv.gz", + "data/mockfile.txt" + #"data/payen/*.gz", + #"data/ramachandran/*.gz", + #"data/zhang/*.gz" + shell: + """ + wget {params.aizarani_url} -P data + gunzip data/GSE124395_Normalhumanlivercellatlasdata.txt.gz + wget -O data/natcomm_GSE115469.csv.gz \ + {params.macparland_url} + wget -O data/JHEP_GSE158723.tar \ + 'https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE158723&format=file' + mkdir data/payen/ + tar -xvf data/JHEP_GSE158723.tar -C data/payen/ + FILES=$(ls data/payen/*.gz) + for f in $FILES ; do + basename=$(basename ${{f}}) + num=${{basename:8:2}} + substring=$(echo $f| cut --complement -d'_' -f 1,2) + mv "$f" "$substring" + mkdir -p data/payen/GSM"$num" + mv "$substring" data/payen/GSM"$num"/ + gunzip data/payen/GSM"$num"/"$substring" + done + wget -O data/GSE136103_RAW.tar {params.ramachandran_url} + mkdir data/ramachandran/ + tar -xvf data/GSE136103_RAW.tar -C data/ramachandran/ + FILES=$(ls data/ramachandran/*healthy[0-9]*) + for f in $FILES ; do + basename=$(basename ${{f}}) + num=${{basename:8:2}} + substring=$(echo $f| cut --complement -d'_' -f 1,2,3) + mv "$f" "$substring" + mkdir -p data/ramachandran/GSM"$num" + mv "$substring" data/ramachandran/GSM"$num"/ + gunzip data/ramachandran/GSM"$num"/"$substring" + done + rm data/ramachandran/*.gz + wget {params.zhang_url} -P data + mkdir data/zhang/ + tar -xvf data/GSE138709_RAW.tar -C data/zhang/ + touch data/mockfile.txt + echo "Liver reference data downloaded on: $(date)" > logs/download_data.log + """ + +rule build_reference: + input: + script = "scripts/build_reference.R", + aizarani = "data/GSE124395_Normalhumanlivercellatlasdata.txt", + macparland = "data/natcomm_GSE115469.csv.gz", + payen = "data/payen", + ramachandran = "data/ramachandran", + zhang = "data/zhang", + annotations = "data/annotations.csv" + output: + ref = "reference/ref.Rds", + idx = "reference/idx.annoy", + obj = "full_reference.Rds" + shell: + """ + Rscript {input.script} {input.aizarani} {input.macparland} {input.payen} {input.ramachandran} {input.zhang} {input.annotations} {output.ref} {output.idx} {output.obj} + """ +############################## Demo ###################################### +rule download_demo: + params: + demo_url = "https://www.livercellatlas.org/data_files/toDownload/rawData_human.zip" + output: + "logs/download_data.log", + "data/demo/rawData_human/countTable_human/barcodes.tsv.gz", + "data/demo/rawData_human/countTable_human/features.tsv.gz", + "data/demo/rawData_human/countTable_human/matrix.mtx.gz" + shell: + """ + wget {params.demo_url} - P data/demo + gunzip rawData_human + echo "Demo data downloaded on: $(date)" > logs/download_data.log + """ + +rule build_demo: + input: + script = "scripts/build_demo.R", + demo = "data/demo/rawData_human/countTable_human/" + output: + obj = "reference/demo.Rds" + shell: + """ + Rscript {input.script} {input.demo} {output.obj} + """ + +############################## Explorer ######################################## +rule export_zarr: + input: + ref = "reference/ref.Rds", + fullref = "full_reference.Rds", + script1 = "scripts/convert_to_h5ad.R", + script2 = "scripts/convert_to_zarr.py" + output: + h5Seurat = "vitessce/vitessce_ref.h5Seurat", + h5ad = "vitessce/vitessce_ref.h5ad", + zarr = directory("vitessce/vitessce_ref.zarr") + container: + "docker://satijalab/azimuth-references:vitessce" + shell: + """ + mkdir -p vitessce + Rscript {input.script1} {input.ref} {input.fullref} {output.h5Seurat} + python3 {input.script2} {output.h5ad} {output.zarr} + """ \ No newline at end of file diff --git a/mouse_pansci/scripts/build_demo.R b/mouse_pansci/scripts/build_demo.R new file mode 100644 index 0000000..472bdc0 --- /dev/null +++ b/mouse_pansci/scripts/build_demo.R @@ -0,0 +1,17 @@ +#!/usr/bin/env Rscript + +# Script to build human liver demo + +# Demo object can be downloaded at https://cellxgene.cziscience.com/collections/fe0e718d-2ee9-42cc-894b-0b490f437dfd + +library(Seurat) +library(SeuratDisk) + +args = commandArgs(trailingOnly=TRUE) +demo.path <- args[1] +demo.output <- args[2] + +demo <- readRDS(demo.path, gene.column = 1) +demo <- subset(demo, cells = sample(Cells(demo), size = 20000)) + +saveRDS(demo.obj, demo.output) diff --git a/mouse_pansci/scripts/build_reference.R b/mouse_pansci/scripts/build_reference.R new file mode 100644 index 0000000..1635665 --- /dev/null +++ b/mouse_pansci/scripts/build_reference.R @@ -0,0 +1,59 @@ +#!/usr/bin/env Rscript + +# Script to build mouse pansci reference 1.0.0 + +# Rscript {input.script} {input.counts} {input.annotations} {input.dr} {output.ref} {output.idx} {output.obj} + +library(Seurat) +library(Azimuth) + +args = commandArgs(trailingOnly=TRUE) +obj_original <- readRDS(args[1]) +annotations <- read.csv(args[2], row.names = 1) +azimuth.reference <- args[3] +azimuth.index <- args[4] +full.reference <- args[5] + +# Convert gene IDs to gene symbols based on annotations file +pansci.mtx <- LayerData(obj_original, assay="RNA", layer="counts") +gene_names <- read.csv(annotations) +rownames(pansci.mtx) <- gene_names$gene_name +subset.pansci.mtx <- pansci.mtx[grep("^ENSMUSG", rownames(pansci.mtx), invert = TRUE), ] + +# Rebuild object from RNA assay and converted gene names +pansci.assay <- CreateAssayObject(counts = subset.pansci.mtx) +obj <- CreateSeuratObject(counts=pansci.assay, assay="RNA") +obj <- AddMetaData(obj, obj_original[[]]) + +# Normalize wtih SCT +obj <- SCTransform(obj, variable.features.n = 5000) + +# Run PCA on the SCT residuals +obj <- RunPCA(obj, assay = "SCT", reduction.name = "pca", npcs = 100) + +# Build UMAP model +obj <- RunUMAP(obj, dims = 1:100, return.model = T) + +saveRDS(obj, full.reference) + +# Build Azimuth Reference +azimuth.obj <- AzimuthReference(obj, + refUMAP='umap', + refDR='pca', + refAssay = 'SCT', + dims = 1:100, + plotref='umap', + metadata = c("Main_cell_type","Sub_cell_type", "Sub_cell_type_organ"), + reference.version = "1.0.0" +) + +ValidateAzimuthReference(object = azimuth.obj) + +SaveAnnoyIndex( + object = azimuth.obj[["refdr.annoy.neighbors"]], + file = azimuth.index +) +saveRDS( + object = azimuth.obj, + file = azimuth.reference +) \ No newline at end of file diff --git a/mouse_pansci/scripts/convert_to_h5ad.R b/mouse_pansci/scripts/convert_to_h5ad.R new file mode 100644 index 0000000..29e99f3 --- /dev/null +++ b/mouse_pansci/scripts/convert_to_h5ad.R @@ -0,0 +1,35 @@ +#!/usr/bin/env Rscript +library(Seurat) +library(SeuratDisk) +args <- commandArgs(trailingOnly = TRUE) + +ref <- readRDS(file = args[1]) +fullref <- readRDS(file = args[2]) +fullref <- subset(x = fullref, cells = Cells(x = ref)) +fullref[['umap']] <- ref[['refUMAP']] +Key(object = fullref[['umap']]) <- "umap_" +DefaultAssay(object = fullref[['umap']]) <- "RNA" + +DefaultAssay(object = fullref) <- "RNA" +fullref <- NormalizeData(object = fullref) +fullref <- DietSeurat( + object = fullref, + dimreducs = "umap", + assays = "RNA" +) +for (i in colnames(x = fullref[[]])) { + fullref[[i]] <- NULL +} +fullref <- AddMetaData(object = fullref, metadata = ref[[]]) +Misc(object = fullref[['umap']], slot = "model") <- NULL + +fullref <- RenameCells(object = fullref, new.names = paste0("cell", 1:ncol(x = fullref))) + +for (i in colnames(x = fullref[[]])) { + if (is.factor(x = fullref[[i, drop = TRUE]])) { + fullref[[i]] <- as.character(x = fullref[[i, drop = TRUE]]) + } +} + +SaveH5Seurat(object = fullref, file = args[3], overwrite = TRUE) +Convert(args[3], dest = "h5ad", overwrite = TRUE) diff --git a/mouse_pansci/scripts/convert_to_zarr.py b/mouse_pansci/scripts/convert_to_zarr.py new file mode 100644 index 0000000..2e7a66f --- /dev/null +++ b/mouse_pansci/scripts/convert_to_zarr.py @@ -0,0 +1,13 @@ +#!/usr/bin/python +import scanpy as sc +import sys +import zarr +from scipy import sparse + +adata = sc.read_h5ad(sys.argv[1]) +del adata.raw +adata.var.index = adata.var.index.astype('str') +adata.obs.index = adata.obs.index.astype('str') +adata.var_names = adata.var_names.astype(str) +adata.X = adata.X.tocsc() +adata.write_zarr(sys.argv[2], [adata.shape[0], 10]) From 888a2b3d2027014bfda32da81c5efdaf64577ba8 Mon Sep 17 00:00:00 2001 From: zskylarli Date: Tue, 5 Nov 2024 15:43:22 -0500 Subject: [PATCH 7/7] formalize scripts for public release --- mouse_pansci/Snakefile | 82 ++++++++++++++++++++ mouse_pansci/processing.R | 36 --------- mouse_pansci/reference/config.json | 13 ++++ mouse_pansci/{ => scripts}/build_demo.R | 4 +- mouse_pansci/{ => scripts}/build_reference.R | 6 +- mouse_pansci/scripts/convert_to_h5ad.R | 35 +++++++++ mouse_pansci/scripts/convert_to_zarr.py | 13 ++++ 7 files changed, 147 insertions(+), 42 deletions(-) create mode 100644 mouse_pansci/Snakefile delete mode 100644 mouse_pansci/processing.R create mode 100644 mouse_pansci/reference/config.json rename mouse_pansci/{ => scripts}/build_demo.R (65%) rename mouse_pansci/{ => scripts}/build_reference.R (89%) create mode 100644 mouse_pansci/scripts/convert_to_h5ad.R create mode 100644 mouse_pansci/scripts/convert_to_zarr.py diff --git a/mouse_pansci/Snakefile b/mouse_pansci/Snakefile new file mode 100644 index 0000000..5ffd1ab --- /dev/null +++ b/mouse_pansci/Snakefile @@ -0,0 +1,82 @@ +############################## Config ######################################### +container: "docker://satijalab/azimuth-references:latest" + +############################## All ########################################### +rule all: + input: + "reference/ref.Rds", + "reference/idx.annoy", + "reference/demo.rds" + +############################## Reference ####################################### +rule download: + params: + data_url = "https://zenodo.org/records/14042645/files/pansci_filtered.rds" + output: + "logs/download_data.log", + "data/obj.rds" + shell: + """ + wget {params.data_url} -P data + mv data/pansci_filtered.rds data/obj.rds + echo "Mouse PanSci data downloaded on: $(date)" > logs/download_data.log + """ + +rule build_reference: + input: + script = "scripts/build_reference.R", + obj = "data/obj.rds", + annotations = "data/pansci_genenames.csv", + output: + ref = "reference/ref.Rds", + idx = "reference/idx.annoy", + obj = "full_reference.Rds" + shell: + """ + Rscript {input.script} {input.obj} {input.genenames} {output.ref} {output.idx} {output.obj} + """ + +############################## Demo ############################################ +rule download_demo: + params: + data_url = "https://datasets.cellxgene.cziscience.com/4103ff27-c13c-4092-87a7-5cd71ce57d26.rds", + output: + "logs/download_demo_data.log", + "data/demo/adipose_emont_2022.rds" + shell: + """ + wget {params.data_url} -P data/demo + mv data/demo/4103ff27-c13c-4092-87a7-5cd71ce57d26.rds data/demo/adipose_emont_2022.rds + echo "Mouse adipose demo data downloaded on: $(date)" > logs/download_demo_data.log + """ + +rule setup_demo: + input: + data = "data/demo/adipose_emont_2022.rds" + script = "scripts/setup_demo.R", + + output: + "reference/adipose_emont_2022_subset.rds" + shell: + """ + Rscript {input.script} {input.data} {output} > logs/setup_demo.Rout 2>&1 + """ + +rule export_zarr: + input: + ref = "reference/ref.Rds", + fullref = "full_reference.Rds", + script1 = "scripts/convert_to_h5ad.R", + script2 = "scripts/convert_to_zarr.py" + output: + h5Seurat = "vitessce/vitessce_ref.h5Seurat", + h5ad = "vitessce/vitessce_ref.h5ad", + zarr = directory("vitessce/vitessce_ref.zarr") + container: + "docker://satijalab/azimuth-references:vitessce" + shell: + """ + mkdir -p vitessce + Rscript {input.script1} {input.ref} {input.fullref} {output.h5Seurat} > logs/export_zarr_anndata.Rout 2>&1 + python3 {input.script2} {output.h5ad} {output.zarr} + """ \ No newline at end of file diff --git a/mouse_pansci/processing.R b/mouse_pansci/processing.R deleted file mode 100644 index 73b45e1..0000000 --- a/mouse_pansci/processing.R +++ /dev/null @@ -1,36 +0,0 @@ -# Script to process pansci downsampled dataset - -# Convert to seurat object -# sceasy::convertFormat("/brahms/mollag/azimuth/pansci/20240222_PanSci_all_cells_adata_sampled_by_subtype.h5ad", from="anndata", -# to="seurat", outFile = "/brahms/mollag/azimuth/pansci/20240222_PanSci_all_cells_adata_sampled_by_subtype.rds") - -a <- readRDS("/brahms/mollag/azimuth/pansci/20240222_PanSci_all_cells_adata_sampled_by_subtype.rds") - -a <- NormalizeData(a) -a <- FindVariableFeatures(a, nfeatures = 5000) -a <- ScaleData(a) -a <- RunPCA(a, npcs = 100) -a <- RunUMAP(a, dims = 1:100, return.model = T) -saveRDS(a, "/brahms/mollag/azimuth/pansci/pansci_processed_umap_100_dims.rds") - -##### Post processing #### -a <- readRDS("/brahms/mollag/azimuth/pansci/pansci_processed_umap_100_dims.rds") - -DimPlot(a, group.by = "Main_cell_type", label = T, repel = T, alpha = 0.1) + NoLegend() -DimPlot(a, group.by = "Lineage", label = T, repel = T, alpha = 0.1) + NoLegend() -DimPlot(a, group.by = "Organ_name", label = T, repel = T, alpha = 0.1) + NoLegend() - - - - -########### SCTransform VERSION ######### -b <- readRDS("/brahms/mollag/azimuth/pansci/20240222_PanSci_all_cells_adata_sampled_by_subtype.rds") - -b <- SCTransform(b, variable.features.n = 5000) -b <- RunPCA(b, npcs = 100) -b <- RunUMAP(b, dims = 1:100) -saveRDS(b, "/brahms/mollag/azimuth/pansci/pansci_SCT_processed_umap_100_dims.rds") - -DimPlot(b, group.by = "Main_cell_type", label = T, repel = T, alpha = 0.1) + NoLegend() -DimPlot(b, group.by = "Lineage", label = T, repel = T, alpha = 0.1) + NoLegend() -DimPlot(b, group.by = "Organ_name", label = T, repel = T, alpha = 0.1) + NoLegend() diff --git a/mouse_pansci/reference/config.json b/mouse_pansci/reference/config.json new file mode 100644 index 0000000..a1398e5 --- /dev/null +++ b/mouse_pansci/reference/config.json @@ -0,0 +1,13 @@ +{ + "Azimuth.app.reference": "/reference-data/mouse_pansci/", + "Azimuth.app.demodataset": "/reference-data/mouse_pansci/demo.rds", + "Azimuth.app.max_cells": 100000, + "Azimuth.app.default_gene": "GLNY", + "Azimuth.app.default_metadata": "subclass", + "Azimuth.app.do_adt": "FALSE", + "Azimuth.app.welcomebox": "div(\n h3(HTML(\"Please upload a dataset to map to the mouse PanSci reference\")),\n \"Upload a counts matrix from an scRNA-seq dataset of mouse data in one\n of the following formats: hdf5, rds, h5ad, h5seurat. For testing, we\n also provide a demo dataset of 20000 mouse adipose cells (10x Genomics v3),\n which is loaded automatically with the 'Load demo dataset' button.\",\n width = 12\n )", + "Azimuth.app.refdescriptor": "

This reference was generated from the PanSci mouse organismal aging atlas originally described in Zhang et al., 2024, this data was integrated across 3 genetic backgrounds. Annotations are provided at the level of main cell type, sub cell type, and organ.
", + "Azimuth.app.refuri": "https://seurat.nygenome.org/azimuth/references/v1.0.0/mouse_pansci", + "shiny.maxRequestSize": 1048576000, + "Azimuth.app.do_adt": "FALSE" + } \ No newline at end of file diff --git a/mouse_pansci/build_demo.R b/mouse_pansci/scripts/build_demo.R similarity index 65% rename from mouse_pansci/build_demo.R rename to mouse_pansci/scripts/build_demo.R index 472bdc0..c15f646 100644 --- a/mouse_pansci/build_demo.R +++ b/mouse_pansci/scripts/build_demo.R @@ -1,8 +1,6 @@ #!/usr/bin/env Rscript -# Script to build human liver demo - -# Demo object can be downloaded at https://cellxgene.cziscience.com/collections/fe0e718d-2ee9-42cc-894b-0b490f437dfd +# Script to build demo library(Seurat) library(SeuratDisk) diff --git a/mouse_pansci/build_reference.R b/mouse_pansci/scripts/build_reference.R similarity index 89% rename from mouse_pansci/build_reference.R rename to mouse_pansci/scripts/build_reference.R index 1635665..2a5c32d 100644 --- a/mouse_pansci/build_reference.R +++ b/mouse_pansci/scripts/build_reference.R @@ -2,21 +2,21 @@ # Script to build mouse pansci reference 1.0.0 -# Rscript {input.script} {input.counts} {input.annotations} {input.dr} {output.ref} {output.idx} {output.obj} +# Rscript {input.script} {input.counts} {input.genenames} {input.dr} {output.ref} {output.idx} {output.obj} library(Seurat) library(Azimuth) args = commandArgs(trailingOnly=TRUE) obj_original <- readRDS(args[1]) -annotations <- read.csv(args[2], row.names = 1) +genenames <- read.csv(args[2], row.names = 1) azimuth.reference <- args[3] azimuth.index <- args[4] full.reference <- args[5] # Convert gene IDs to gene symbols based on annotations file pansci.mtx <- LayerData(obj_original, assay="RNA", layer="counts") -gene_names <- read.csv(annotations) +gene_names <- read.csv(genenames) rownames(pansci.mtx) <- gene_names$gene_name subset.pansci.mtx <- pansci.mtx[grep("^ENSMUSG", rownames(pansci.mtx), invert = TRUE), ] diff --git a/mouse_pansci/scripts/convert_to_h5ad.R b/mouse_pansci/scripts/convert_to_h5ad.R new file mode 100644 index 0000000..f09dc27 --- /dev/null +++ b/mouse_pansci/scripts/convert_to_h5ad.R @@ -0,0 +1,35 @@ +#!/usr/bin/env Rscript +library(Seurat) +library(SeuratDisk) +args <- commandArgs(trailingOnly = TRUE) + +ref <- readRDS(file = args[1]) +fullref <- readRDS(file = args[2]) +fullref <- subset(x = fullref, cells = Cells(x = ref)) +fullref[['umap']] <- ref[['refUMAP']] +Key(object = fullref[['umap']]) <- "umap_" +DefaultAssay(object = fullref[['umap']]) <- "RNA" + +DefaultAssay(object = fullref) <- "RNA" +fullref <- NormalizeData(object = fullref) +fullref <- DietSeurat( + object = fullref, + dimreducs = "umap", + assays = "RNA" +) +for (i in colnames(x = fullref[[]])) { + fullref[[i]] <- NULL +} +fullref <- AddMetaData(object = fullref, metadata = ref[[]]) +Misc(object = fullref[['umap']], slot = "model") <- NULL + +fullref <- RenameCells(object = fullref, new.names = paste0("cell", 1:ncol(x = fullref))) + +for (i in colnames(x = fullref[[]])) { + if (is.factor(x = fullref[[i, drop = TRUE]])) { + fullref[[i]] <- as.character(x = fullref[[i, drop = TRUE]]) + } +} + +SaveH5Seurat(object = fullref, file = args[3], overwrite = TRUE) +Convert(args[3], dest = "h5ad", overwrite = TRUE) \ No newline at end of file diff --git a/mouse_pansci/scripts/convert_to_zarr.py b/mouse_pansci/scripts/convert_to_zarr.py new file mode 100644 index 0000000..79ec200 --- /dev/null +++ b/mouse_pansci/scripts/convert_to_zarr.py @@ -0,0 +1,13 @@ +#!/usr/bin/python +import scanpy as sc +import sys +import zarr +from scipy import sparse + +adata = sc.read_h5ad(sys.argv[1]) +del adata.raw +adata.var.index = adata.var.index.astype('str') +adata.obs.index = adata.obs.index.astype('str') +adata.var_names = adata.var_names.astype(str) +adata.X = adata.X.tocsc() +adata.write_zarr(sys.argv[2], [adata.shape[0], 10]) \ No newline at end of file