Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Mouse PanSci reference #26

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 82 additions & 0 deletions mouse_pansci/Snakefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
############################## Config #########################################
container: "docker://satijalab/azimuth-references:latest"

############################## All ###########################################
rule all:
input:
"reference/ref.Rds",
"reference/idx.annoy",
"reference/demo.rds"

############################## Reference #######################################
rule download:
params:
data_url = "https://zenodo.org/records/14042645/files/pansci_filtered.rds"
output:
"logs/download_data.log",
"data/obj.rds"
shell:
"""
wget {params.data_url} -P data
mv data/pansci_filtered.rds data/obj.rds
echo "Mouse PanSci data downloaded on: $(date)" > logs/download_data.log
"""

rule build_reference:
input:
script = "scripts/build_reference.R",
obj = "data/obj.rds",
annotations = "data/pansci_genenames.csv",
output:
ref = "reference/ref.Rds",
idx = "reference/idx.annoy",
obj = "full_reference.Rds"
shell:
"""
Rscript {input.script} {input.obj} {input.genenames} {output.ref} {output.idx} {output.obj}
"""

############################## Demo ############################################
rule download_demo:
params:
data_url = "https://datasets.cellxgene.cziscience.com/4103ff27-c13c-4092-87a7-5cd71ce57d26.rds",
output:
"logs/download_demo_data.log",
"data/demo/adipose_emont_2022.rds"
shell:
"""
wget {params.data_url} -P data/demo
mv data/demo/4103ff27-c13c-4092-87a7-5cd71ce57d26.rds data/demo/adipose_emont_2022.rds
echo "Mouse adipose demo data downloaded on: $(date)" > logs/download_demo_data.log
"""

rule setup_demo:
input:
data = "data/demo/adipose_emont_2022.rds"
script = "scripts/setup_demo.R",

output:
"reference/adipose_emont_2022_subset.rds"
shell:
"""
Rscript {input.script} {input.data} {output} > logs/setup_demo.Rout 2>&1
"""

rule export_zarr:
input:
ref = "reference/ref.Rds",
fullref = "full_reference.Rds",
script1 = "scripts/convert_to_h5ad.R",
script2 = "scripts/convert_to_zarr.py"
output:
h5Seurat = "vitessce/vitessce_ref.h5Seurat",
h5ad = "vitessce/vitessce_ref.h5ad",
zarr = directory("vitessce/vitessce_ref.zarr")
container:
"docker://satijalab/azimuth-references:vitessce"
shell:
"""
mkdir -p vitessce
Rscript {input.script1} {input.ref} {input.fullref} {output.h5Seurat} > logs/export_zarr_anndata.Rout 2>&1
python3 {input.script2} {output.h5ad} {output.zarr}
"""
13 changes: 13 additions & 0 deletions mouse_pansci/reference/config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{
"Azimuth.app.reference": "/reference-data/mouse_pansci/",
"Azimuth.app.demodataset": "/reference-data/mouse_pansci/demo.rds",
"Azimuth.app.max_cells": 100000,
"Azimuth.app.default_gene": "GLNY",
"Azimuth.app.default_metadata": "subclass",
"Azimuth.app.do_adt": "FALSE",
"Azimuth.app.welcomebox": "div(\n h3(HTML(\"Please upload a dataset to map to the <b>mouse PanSci</b> reference\")),\n \"Upload a counts matrix from an scRNA-seq dataset of mouse data in one\n of the following formats: hdf5, rds, h5ad, h5seurat. For testing, we\n also provide a demo dataset of 20000 mouse adipose cells (10x Genomics v3),\n which is loaded automatically with the 'Load demo dataset' button.\",\n width = 12\n )",
"Azimuth.app.refdescriptor": "<div class='refdescriptor'><br>This reference was generated from the PanSci mouse organismal aging atlas originally described in <a href='https://www.biorxiv.org/content/10.1101/2024.03.01.583001v1'>Zhang et al., 2024</a>, this data was integrated across 3 genetic backgrounds. Annotations are provided at the level of <i>main cell type</i>, <i>sub cell type</i>, and <i>organ</i>. </div>",
"Azimuth.app.refuri": "https://seurat.nygenome.org/azimuth/references/v1.0.0/mouse_pansci",
"shiny.maxRequestSize": 1048576000,
"Azimuth.app.do_adt": "FALSE"
}
15 changes: 15 additions & 0 deletions mouse_pansci/scripts/build_demo.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#!/usr/bin/env Rscript

# Script to build demo

library(Seurat)
library(SeuratDisk)

args = commandArgs(trailingOnly=TRUE)
demo.path <- args[1]
demo.output <- args[2]

demo <- readRDS(demo.path, gene.column = 1)
demo <- subset(demo, cells = sample(Cells(demo), size = 20000))

saveRDS(demo.obj, demo.output)
59 changes: 59 additions & 0 deletions mouse_pansci/scripts/build_reference.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
#!/usr/bin/env Rscript

# Script to build mouse pansci reference 1.0.0

# Rscript {input.script} {input.counts} {input.genenames} {input.dr} {output.ref} {output.idx} {output.obj}

library(Seurat)
library(Azimuth)

args = commandArgs(trailingOnly=TRUE)
obj_original <- readRDS(args[1])
genenames <- read.csv(args[2], row.names = 1)
azimuth.reference <- args[3]
azimuth.index <- args[4]
full.reference <- args[5]

# Convert gene IDs to gene symbols based on annotations file
pansci.mtx <- LayerData(obj_original, assay="RNA", layer="counts")
gene_names <- read.csv(genenames)
rownames(pansci.mtx) <- gene_names$gene_name
subset.pansci.mtx <- pansci.mtx[grep("^ENSMUSG", rownames(pansci.mtx), invert = TRUE), ]

# Rebuild object from RNA assay and converted gene names
pansci.assay <- CreateAssayObject(counts = subset.pansci.mtx)
obj <- CreateSeuratObject(counts=pansci.assay, assay="RNA")
obj <- AddMetaData(obj, obj_original[[]])

# Normalize wtih SCT
obj <- SCTransform(obj, variable.features.n = 5000)

# Run PCA on the SCT residuals
obj <- RunPCA(obj, assay = "SCT", reduction.name = "pca", npcs = 100)

# Build UMAP model
obj <- RunUMAP(obj, dims = 1:100, return.model = T)

saveRDS(obj, full.reference)

# Build Azimuth Reference
azimuth.obj <- AzimuthReference(obj,
refUMAP='umap',
refDR='pca',
refAssay = 'SCT',
dims = 1:100,
plotref='umap',
metadata = c("Main_cell_type","Sub_cell_type", "Sub_cell_type_organ"),
reference.version = "1.0.0"
)

ValidateAzimuthReference(object = azimuth.obj)

SaveAnnoyIndex(
object = azimuth.obj[["refdr.annoy.neighbors"]],
file = azimuth.index
)
saveRDS(
object = azimuth.obj,
file = azimuth.reference
)
35 changes: 35 additions & 0 deletions mouse_pansci/scripts/convert_to_h5ad.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#!/usr/bin/env Rscript
library(Seurat)
library(SeuratDisk)
args <- commandArgs(trailingOnly = TRUE)

ref <- readRDS(file = args[1])
fullref <- readRDS(file = args[2])
fullref <- subset(x = fullref, cells = Cells(x = ref))
fullref[['umap']] <- ref[['refUMAP']]
Key(object = fullref[['umap']]) <- "umap_"
DefaultAssay(object = fullref[['umap']]) <- "RNA"

DefaultAssay(object = fullref) <- "RNA"
fullref <- NormalizeData(object = fullref)
fullref <- DietSeurat(
object = fullref,
dimreducs = "umap",
assays = "RNA"
)
for (i in colnames(x = fullref[[]])) {
fullref[[i]] <- NULL
}
fullref <- AddMetaData(object = fullref, metadata = ref[[]])
Misc(object = fullref[['umap']], slot = "model") <- NULL

fullref <- RenameCells(object = fullref, new.names = paste0("cell", 1:ncol(x = fullref)))

for (i in colnames(x = fullref[[]])) {
if (is.factor(x = fullref[[i, drop = TRUE]])) {
fullref[[i]] <- as.character(x = fullref[[i, drop = TRUE]])
}
}

SaveH5Seurat(object = fullref, file = args[3], overwrite = TRUE)
Convert(args[3], dest = "h5ad", overwrite = TRUE)
13 changes: 13 additions & 0 deletions mouse_pansci/scripts/convert_to_zarr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/usr/bin/python
import scanpy as sc
import sys
import zarr
from scipy import sparse

adata = sc.read_h5ad(sys.argv[1])
del adata.raw
adata.var.index = adata.var.index.astype('str')
adata.obs.index = adata.obs.index.astype('str')
adata.var_names = adata.var_names.astype(str)
adata.X = adata.X.tocsc()
adata.write_zarr(sys.argv[2], [adata.shape[0], 10])