From 8b8789e57907c9a00bdbdeb2d9dae35bf2c47ce6 Mon Sep 17 00:00:00 2001 From: Andrew Butler Date: Fri, 9 Apr 2021 21:14:33 -0400 Subject: [PATCH] add zarr export rules --- .gitignore | 4 ++ docker/{ => main}/Dockerfile | 0 human_motorcortex/Snakefile | 23 +++++++++- human_motorcortex/scripts/convert_to_h5ad.R | 37 +++++++++++++++ human_motorcortex/scripts/convert_to_zarr.py | 13 ++++++ human_pancreas/Snakefile | 20 ++++++++ human_pancreas/scripts/convert_to_h5ad.R | 37 +++++++++++++++ human_pancreas/scripts/convert_to_zarr.py | 13 ++++++ human_pbmc/Snakefile | 41 +++++++++++++++++ human_pbmc/scripts/convert_to_h5ad.R | 48 ++++++++++++++++++++ human_pbmc/scripts/convert_to_zarr.py | 13 ++++++ mouse_motorcortex/Snakefile | 20 ++++++++ mouse_motorcortex/scripts/convert_to_h5ad.R | 37 +++++++++++++++ mouse_motorcortex/scripts/convert_to_zarr.py | 13 ++++++ 14 files changed, 317 insertions(+), 2 deletions(-) rename docker/{ => main}/Dockerfile (100%) create mode 100644 human_motorcortex/scripts/convert_to_h5ad.R create mode 100644 human_motorcortex/scripts/convert_to_zarr.py create mode 100644 human_pancreas/scripts/convert_to_h5ad.R create mode 100644 human_pancreas/scripts/convert_to_zarr.py create mode 100644 human_pbmc/scripts/convert_to_h5ad.R create mode 100644 human_pbmc/scripts/convert_to_zarr.py create mode 100644 mouse_motorcortex/scripts/convert_to_h5ad.R create mode 100644 mouse_motorcortex/scripts/convert_to_zarr.py diff --git a/.gitignore b/.gitignore index 1ceabd4..184c4c2 100644 --- a/.gitignore +++ b/.gitignore @@ -13,6 +13,10 @@ src/*.dll *.annoy **/logs **/data +**/seurat-objects/ +**/seurat_objects/ +**/vitessce/ **/.snakemake **/.snakemake_timestamp **/.wget-hsts +**/**/.cache diff --git a/docker/Dockerfile b/docker/main/Dockerfile similarity index 100% rename from docker/Dockerfile rename to docker/main/Dockerfile diff --git a/human_motorcortex/Snakefile b/human_motorcortex/Snakefile index 2ec3a7b..e2d864a 100644 --- a/human_motorcortex/Snakefile +++ b/human_motorcortex/Snakefile @@ -69,11 +69,30 @@ rule setup_demo: input: data = "data/demo/matrix.csv.gz", metadata = "data/demo/metadata.csv", - script = "scripts/setup_demo.R", - + script = "scripts/setup_demo.R" output: "reference/allen_m1c_2019_ssv4.rds" shell: """ Rscript {input.script} {input.data} {input.metadata} {output} > logs/setup_demo.Rout 2>&1 """ + +rule export_zarr: + input: + ref = "reference/ref.Rds", + fullref = "seurat_objects/fullref.Rds", + script1 = "scripts/convert_to_h5ad.R", + script2 = "scripts/convert_to_zarr.py" + output: + h5Seurat = "vitessce/vitessce_ref.h5Seurat", + h5ad = "vitessce/vitessce_ref.h5ad", + zarr = directory("vitessce/vitessce_ref.zarr") + container: + "docker://satijalab/azimuth-references:vitessce" + shell: + """ + mkdir -p vitessce + Rscript {input.script1} {input.ref} {input.fullref} {output.h5Seurat} > logs/export_zarr_anndata.Rout 2>&1 + python3 {input.script2} {output.h5ad} {output.zarr} + """ + diff --git a/human_motorcortex/scripts/convert_to_h5ad.R b/human_motorcortex/scripts/convert_to_h5ad.R new file mode 100644 index 0000000..85fb2ce --- /dev/null +++ b/human_motorcortex/scripts/convert_to_h5ad.R @@ -0,0 +1,37 @@ +#!/usr/bin/env Rscript +library(Seurat) +library(SeuratDisk) +args <- commandArgs(trailingOnly = TRUE) + +ref <- readRDS(file = args[1]) +fullref <- readRDS(file = args[2]) +fullref <- subset(x = fullref, cells = Cells(x = ref)) +fullref[['umap']] <- ref[['refUMAP']] +Key(object = fullref[['umap']]) <- "umap_" +DefaultAssay(object = fullref[['umap']]) <- "RNA" + +DefaultAssay(object = fullref) <- "RNA" +fullref <- NormalizeData(object = fullref) +fullref <- DietSeurat( + object = fullref, + dimreducs = "umap", + assays = "RNA" +) +for (i in colnames(x = fullref[[]])) { + fullref[[i]] <- NULL +} +fullref <- AddMetaData(object = fullref, metadata = ref[[]]) +Misc(object = fullref[['umap']], slot = "model") <- NULL + +fullref <- RenameCells(object = fullref, new.names = paste0("cell", 1:ncol(x = fullref))) + +for (i in colnames(x = fullref[[]])) { + if (is.factor(x = fullref[[i, drop = TRUE]])) { + fullref[[i]] <- as.character(x = fullref[[i, drop = TRUE]]) + } +} + +SaveH5Seurat(object = fullref, file = args[3], overwrite = TRUE) +Convert(args[3], dest = "h5ad", overwrite = TRUE) + + diff --git a/human_motorcortex/scripts/convert_to_zarr.py b/human_motorcortex/scripts/convert_to_zarr.py new file mode 100644 index 0000000..2e7a66f --- /dev/null +++ b/human_motorcortex/scripts/convert_to_zarr.py @@ -0,0 +1,13 @@ +#!/usr/bin/python +import scanpy as sc +import sys +import zarr +from scipy import sparse + +adata = sc.read_h5ad(sys.argv[1]) +del adata.raw +adata.var.index = adata.var.index.astype('str') +adata.obs.index = adata.obs.index.astype('str') +adata.var_names = adata.var_names.astype(str) +adata.X = adata.X.tocsc() +adata.write_zarr(sys.argv[2], [adata.shape[0], 10]) diff --git a/human_pancreas/Snakefile b/human_pancreas/Snakefile index 0364105..9bdd4a6 100644 --- a/human_pancreas/Snakefile +++ b/human_pancreas/Snakefile @@ -110,3 +110,23 @@ rule setup_demo: Rscript {input.script} {input.data} {output} > logs/setup_peng.Rout 2>&1 """ +rule export_zarr: + input: + ref = "reference/ref.Rds", + fullref = "seurat_objects/fullref.Rds", + script1 = "scripts/convert_to_h5ad.R", + script2 = "scripts/convert_to_zarr.py" + output: + h5Seurat = "vitessce/vitessce_ref.h5Seurat", + h5ad = "vitessce/vitessce_ref.h5ad", + zarr = directory("vitessce/vitessce_ref.zarr") + container: + "docker://satijalab/azimuth-references:vitessce" + shell: + """ + mkdir -p vitessce + Rscript {input.script1} {input.ref} {input.fullref} {output.h5Seurat} > logs/export_zarr_anndata.Rout 2>&1 + python3 {input.script2} {output.h5ad} {output.zarr} + """ + + diff --git a/human_pancreas/scripts/convert_to_h5ad.R b/human_pancreas/scripts/convert_to_h5ad.R new file mode 100644 index 0000000..85fb2ce --- /dev/null +++ b/human_pancreas/scripts/convert_to_h5ad.R @@ -0,0 +1,37 @@ +#!/usr/bin/env Rscript +library(Seurat) +library(SeuratDisk) +args <- commandArgs(trailingOnly = TRUE) + +ref <- readRDS(file = args[1]) +fullref <- readRDS(file = args[2]) +fullref <- subset(x = fullref, cells = Cells(x = ref)) +fullref[['umap']] <- ref[['refUMAP']] +Key(object = fullref[['umap']]) <- "umap_" +DefaultAssay(object = fullref[['umap']]) <- "RNA" + +DefaultAssay(object = fullref) <- "RNA" +fullref <- NormalizeData(object = fullref) +fullref <- DietSeurat( + object = fullref, + dimreducs = "umap", + assays = "RNA" +) +for (i in colnames(x = fullref[[]])) { + fullref[[i]] <- NULL +} +fullref <- AddMetaData(object = fullref, metadata = ref[[]]) +Misc(object = fullref[['umap']], slot = "model") <- NULL + +fullref <- RenameCells(object = fullref, new.names = paste0("cell", 1:ncol(x = fullref))) + +for (i in colnames(x = fullref[[]])) { + if (is.factor(x = fullref[[i, drop = TRUE]])) { + fullref[[i]] <- as.character(x = fullref[[i, drop = TRUE]]) + } +} + +SaveH5Seurat(object = fullref, file = args[3], overwrite = TRUE) +Convert(args[3], dest = "h5ad", overwrite = TRUE) + + diff --git a/human_pancreas/scripts/convert_to_zarr.py b/human_pancreas/scripts/convert_to_zarr.py new file mode 100644 index 0000000..2e7a66f --- /dev/null +++ b/human_pancreas/scripts/convert_to_zarr.py @@ -0,0 +1,13 @@ +#!/usr/bin/python +import scanpy as sc +import sys +import zarr +from scipy import sparse + +adata = sc.read_h5ad(sys.argv[1]) +del adata.raw +adata.var.index = adata.var.index.astype('str') +adata.obs.index = adata.obs.index.astype('str') +adata.var_names = adata.var_names.astype(str) +adata.X = adata.X.tocsc() +adata.write_zarr(sys.argv[2], [adata.shape[0], 10]) diff --git a/human_pbmc/Snakefile b/human_pbmc/Snakefile index dc7d36d..b4dc6cf 100644 --- a/human_pbmc/Snakefile +++ b/human_pbmc/Snakefile @@ -53,3 +53,44 @@ rule download_demo: wget {params.url} -P reference echo "PBMC demo data downloaded on: $(date)" > logs/download_demo_data.log """ + +rule download_pbmc_expression: + params: + url = "'https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE164378&format=file'" + output: + "data/GSM5008737_RNA_3P-matrix.mtx.gz", + "data/GSM5008737_RNA_3P-barcodes.tsv.gz", + "data/GSM5008737_RNA_3P-features.tsv.gz" + shell: + """ + wget -O data/GSE164378_RAW.tar {params.url} + tar -xvf data/GSE164378_RAW.tar -C data/ + rm data/*TCR* + rm data/*BCR* + rm data/*5P* + rm data/*ADT_3P* + rm data/*HTO_3P* + """ + +rule export_zarr: + input: + ref = "reference/ref.Rds", + fullref = "data/pbmc_multimodal.h5seurat", + rna_mat = "data/GSM5008737_RNA_3P-matrix.mtx.gz", + rna_cells = "data/GSM5008737_RNA_3P-barcodes.tsv.gz", + rna_features = "data/GSM5008737_RNA_3P-features.tsv.gz", + script1 = "scripts/convert_to_h5ad.R", + script2 = "scripts/convert_to_zarr.py" + output: + h5Seurat = "vitessce/vitessce_ref.h5Seurat", + h5ad = "vitessce/vitessce_ref.h5ad", + zarr = directory("vitessce/vitessce_ref.zarr") + container: + "docker://satijalab/azimuth-references:vitessce" + shell: + """ + mkdir -p vitessce + Rscript {input.script1} {input.ref} {input.fullref} {input.rna_mat} {input.rna_cells} {input.rna_features} {output.h5Seurat} > logs/export_zarr_anndata.Rout 2>&1 + python3 {input.script2} {output.h5ad} {output.zarr} + """ + diff --git a/human_pbmc/scripts/convert_to_h5ad.R b/human_pbmc/scripts/convert_to_h5ad.R new file mode 100644 index 0000000..97f56ee --- /dev/null +++ b/human_pbmc/scripts/convert_to_h5ad.R @@ -0,0 +1,48 @@ +#!/usr/bin/env Rscript +library(Seurat) +library(SeuratDisk) +args <- commandArgs(trailingOnly = TRUE) + +ref <- readRDS(file = args[1]) +fullref <- LoadH5Seurat(file = args[2], assays = "ADT", reductions = FALSE) +fullref <- subset(x = fullref, cells = Cells(x = ref)) +dat_3p <- ReadMtx(mtx = args[3], cells = args[4], features = args[5]) + +fullref[["RNA"]] <- CreateAssayObject(counts = dat_3p[, Cells(x = ref)]) +DefaultAssay(object = fullref) <- "RNA" +fullref <- NormalizeData(object = fullref) +adt.mat <- GetAssayData(object = fullref[["ADT"]], slot = "data") +rownames(x = adt.mat) <- paste0("ADT-", rownames(x = adt.mat)) + +combined.rna.adt <- CreateAssayObject(data = rbind( + GetAssayData(object = fullref[["RNA"]], slot = "data"), + adt.mat) +) +fullref[["RNA"]] <- combined.rna.adt + +fullref[['umap']] <- ref[['refUMAP']] +Key(object = fullref[['umap']]) <- "umap_" +DefaultAssay(object = fullref[['umap']]) <- "RNA" +fullref <- DietSeurat( + object = fullref, + dimreducs = "umap", + assays = "RNA" +) +for (i in colnames(x = fullref[[]])) { + fullref[[i]] <- NULL +} +fullref <- AddMetaData(object = fullref, metadata = ref[[]]) +Misc(object = fullref[['umap']], slot = "model") <- NULL + +fullref <- RenameCells(object = fullref, new.names = paste0("cell", 1:ncol(x = fullref))) + +for (i in colnames(x = fullref[[]])) { + if (is.factor(x = fullref[[i, drop = TRUE]])) { + fullref[[i]] <- as.character(x = fullref[[i, drop = TRUE]]) + } +} + +SaveH5Seurat(object = fullref, file = args[6], overwrite = TRUE) +Convert(args[6], dest = "h5ad", overwrite = TRUE) + + diff --git a/human_pbmc/scripts/convert_to_zarr.py b/human_pbmc/scripts/convert_to_zarr.py new file mode 100644 index 0000000..2e7a66f --- /dev/null +++ b/human_pbmc/scripts/convert_to_zarr.py @@ -0,0 +1,13 @@ +#!/usr/bin/python +import scanpy as sc +import sys +import zarr +from scipy import sparse + +adata = sc.read_h5ad(sys.argv[1]) +del adata.raw +adata.var.index = adata.var.index.astype('str') +adata.obs.index = adata.obs.index.astype('str') +adata.var_names = adata.var_names.astype(str) +adata.X = adata.X.tocsc() +adata.write_zarr(sys.argv[2], [adata.shape[0], 10]) diff --git a/mouse_motorcortex/Snakefile b/mouse_motorcortex/Snakefile index f83b62f..6cf82c1 100644 --- a/mouse_motorcortex/Snakefile +++ b/mouse_motorcortex/Snakefile @@ -79,3 +79,23 @@ rule setup_demo: """ Rscript {input.script} {input.data} {input.metadata} {output} > logs/setup_demo.Rout 2>&1 """ + +rule export_zarr: + input: + ref = "reference/ref.Rds", + fullref = "seurat_objects/fullref.Rds", + script1 = "scripts/convert_to_h5ad.R", + script2 = "scripts/convert_to_zarr.py" + output: + h5Seurat = "vitessce/vitessce_ref.h5Seurat", + h5ad = "vitessce/vitessce_ref.h5ad", + zarr = directory("vitessce/vitessce_ref.zarr") + container: + "docker://satijalab/azimuth-references:vitessce" + shell: + """ + mkdir -p vitessce + Rscript {input.script1} {input.ref} {input.fullref} {output.h5Seurat} > logs/export_zarr_anndata.Rout 2>&1 + python3 {input.script2} {output.h5ad} {output.zarr} + """ + diff --git a/mouse_motorcortex/scripts/convert_to_h5ad.R b/mouse_motorcortex/scripts/convert_to_h5ad.R new file mode 100644 index 0000000..85fb2ce --- /dev/null +++ b/mouse_motorcortex/scripts/convert_to_h5ad.R @@ -0,0 +1,37 @@ +#!/usr/bin/env Rscript +library(Seurat) +library(SeuratDisk) +args <- commandArgs(trailingOnly = TRUE) + +ref <- readRDS(file = args[1]) +fullref <- readRDS(file = args[2]) +fullref <- subset(x = fullref, cells = Cells(x = ref)) +fullref[['umap']] <- ref[['refUMAP']] +Key(object = fullref[['umap']]) <- "umap_" +DefaultAssay(object = fullref[['umap']]) <- "RNA" + +DefaultAssay(object = fullref) <- "RNA" +fullref <- NormalizeData(object = fullref) +fullref <- DietSeurat( + object = fullref, + dimreducs = "umap", + assays = "RNA" +) +for (i in colnames(x = fullref[[]])) { + fullref[[i]] <- NULL +} +fullref <- AddMetaData(object = fullref, metadata = ref[[]]) +Misc(object = fullref[['umap']], slot = "model") <- NULL + +fullref <- RenameCells(object = fullref, new.names = paste0("cell", 1:ncol(x = fullref))) + +for (i in colnames(x = fullref[[]])) { + if (is.factor(x = fullref[[i, drop = TRUE]])) { + fullref[[i]] <- as.character(x = fullref[[i, drop = TRUE]]) + } +} + +SaveH5Seurat(object = fullref, file = args[3], overwrite = TRUE) +Convert(args[3], dest = "h5ad", overwrite = TRUE) + + diff --git a/mouse_motorcortex/scripts/convert_to_zarr.py b/mouse_motorcortex/scripts/convert_to_zarr.py new file mode 100644 index 0000000..2e7a66f --- /dev/null +++ b/mouse_motorcortex/scripts/convert_to_zarr.py @@ -0,0 +1,13 @@ +#!/usr/bin/python +import scanpy as sc +import sys +import zarr +from scipy import sparse + +adata = sc.read_h5ad(sys.argv[1]) +del adata.raw +adata.var.index = adata.var.index.astype('str') +adata.obs.index = adata.obs.index.astype('str') +adata.var_names = adata.var_names.astype(str) +adata.X = adata.X.tocsc() +adata.write_zarr(sys.argv[2], [adata.shape[0], 10])