Skip to content

Commit

Permalink
add zarr export rules
Browse files Browse the repository at this point in the history
  • Loading branch information
andrewwbutler committed Apr 10, 2021
1 parent 5f5255d commit 8b8789e
Show file tree
Hide file tree
Showing 14 changed files with 317 additions and 2 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@ src/*.dll
*.annoy
**/logs
**/data
**/seurat-objects/
**/seurat_objects/
**/vitessce/
**/.snakemake
**/.snakemake_timestamp
**/.wget-hsts
**/**/.cache
File renamed without changes.
23 changes: 21 additions & 2 deletions human_motorcortex/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -69,11 +69,30 @@ rule setup_demo:
input:
data = "data/demo/matrix.csv.gz",
metadata = "data/demo/metadata.csv",
script = "scripts/setup_demo.R",

script = "scripts/setup_demo.R"
output:
"reference/allen_m1c_2019_ssv4.rds"
shell:
"""
Rscript {input.script} {input.data} {input.metadata} {output} > logs/setup_demo.Rout 2>&1
"""

rule export_zarr:
input:
ref = "reference/ref.Rds",
fullref = "seurat_objects/fullref.Rds",
script1 = "scripts/convert_to_h5ad.R",
script2 = "scripts/convert_to_zarr.py"
output:
h5Seurat = "vitessce/vitessce_ref.h5Seurat",
h5ad = "vitessce/vitessce_ref.h5ad",
zarr = directory("vitessce/vitessce_ref.zarr")
container:
"docker://satijalab/azimuth-references:vitessce"
shell:
"""
mkdir -p vitessce
Rscript {input.script1} {input.ref} {input.fullref} {output.h5Seurat} > logs/export_zarr_anndata.Rout 2>&1
python3 {input.script2} {output.h5ad} {output.zarr}
"""

37 changes: 37 additions & 0 deletions human_motorcortex/scripts/convert_to_h5ad.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#!/usr/bin/env Rscript
library(Seurat)
library(SeuratDisk)
args <- commandArgs(trailingOnly = TRUE)

ref <- readRDS(file = args[1])
fullref <- readRDS(file = args[2])
fullref <- subset(x = fullref, cells = Cells(x = ref))
fullref[['umap']] <- ref[['refUMAP']]
Key(object = fullref[['umap']]) <- "umap_"
DefaultAssay(object = fullref[['umap']]) <- "RNA"

DefaultAssay(object = fullref) <- "RNA"
fullref <- NormalizeData(object = fullref)
fullref <- DietSeurat(
object = fullref,
dimreducs = "umap",
assays = "RNA"
)
for (i in colnames(x = fullref[[]])) {
fullref[[i]] <- NULL
}
fullref <- AddMetaData(object = fullref, metadata = ref[[]])
Misc(object = fullref[['umap']], slot = "model") <- NULL

fullref <- RenameCells(object = fullref, new.names = paste0("cell", 1:ncol(x = fullref)))

for (i in colnames(x = fullref[[]])) {
if (is.factor(x = fullref[[i, drop = TRUE]])) {
fullref[[i]] <- as.character(x = fullref[[i, drop = TRUE]])
}
}

SaveH5Seurat(object = fullref, file = args[3], overwrite = TRUE)
Convert(args[3], dest = "h5ad", overwrite = TRUE)


13 changes: 13 additions & 0 deletions human_motorcortex/scripts/convert_to_zarr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/usr/bin/python
import scanpy as sc
import sys
import zarr
from scipy import sparse

adata = sc.read_h5ad(sys.argv[1])
del adata.raw
adata.var.index = adata.var.index.astype('str')
adata.obs.index = adata.obs.index.astype('str')
adata.var_names = adata.var_names.astype(str)
adata.X = adata.X.tocsc()
adata.write_zarr(sys.argv[2], [adata.shape[0], 10])
20 changes: 20 additions & 0 deletions human_pancreas/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -110,3 +110,23 @@ rule setup_demo:
Rscript {input.script} {input.data} {output} > logs/setup_peng.Rout 2>&1
"""

rule export_zarr:
input:
ref = "reference/ref.Rds",
fullref = "seurat_objects/fullref.Rds",
script1 = "scripts/convert_to_h5ad.R",
script2 = "scripts/convert_to_zarr.py"
output:
h5Seurat = "vitessce/vitessce_ref.h5Seurat",
h5ad = "vitessce/vitessce_ref.h5ad",
zarr = directory("vitessce/vitessce_ref.zarr")
container:
"docker://satijalab/azimuth-references:vitessce"
shell:
"""
mkdir -p vitessce
Rscript {input.script1} {input.ref} {input.fullref} {output.h5Seurat} > logs/export_zarr_anndata.Rout 2>&1
python3 {input.script2} {output.h5ad} {output.zarr}
"""


37 changes: 37 additions & 0 deletions human_pancreas/scripts/convert_to_h5ad.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#!/usr/bin/env Rscript
library(Seurat)
library(SeuratDisk)
args <- commandArgs(trailingOnly = TRUE)

ref <- readRDS(file = args[1])
fullref <- readRDS(file = args[2])
fullref <- subset(x = fullref, cells = Cells(x = ref))
fullref[['umap']] <- ref[['refUMAP']]
Key(object = fullref[['umap']]) <- "umap_"
DefaultAssay(object = fullref[['umap']]) <- "RNA"

DefaultAssay(object = fullref) <- "RNA"
fullref <- NormalizeData(object = fullref)
fullref <- DietSeurat(
object = fullref,
dimreducs = "umap",
assays = "RNA"
)
for (i in colnames(x = fullref[[]])) {
fullref[[i]] <- NULL
}
fullref <- AddMetaData(object = fullref, metadata = ref[[]])
Misc(object = fullref[['umap']], slot = "model") <- NULL

fullref <- RenameCells(object = fullref, new.names = paste0("cell", 1:ncol(x = fullref)))

for (i in colnames(x = fullref[[]])) {
if (is.factor(x = fullref[[i, drop = TRUE]])) {
fullref[[i]] <- as.character(x = fullref[[i, drop = TRUE]])
}
}

SaveH5Seurat(object = fullref, file = args[3], overwrite = TRUE)
Convert(args[3], dest = "h5ad", overwrite = TRUE)


13 changes: 13 additions & 0 deletions human_pancreas/scripts/convert_to_zarr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/usr/bin/python
import scanpy as sc
import sys
import zarr
from scipy import sparse

adata = sc.read_h5ad(sys.argv[1])
del adata.raw
adata.var.index = adata.var.index.astype('str')
adata.obs.index = adata.obs.index.astype('str')
adata.var_names = adata.var_names.astype(str)
adata.X = adata.X.tocsc()
adata.write_zarr(sys.argv[2], [adata.shape[0], 10])
41 changes: 41 additions & 0 deletions human_pbmc/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -53,3 +53,44 @@ rule download_demo:
wget {params.url} -P reference
echo "PBMC demo data downloaded on: $(date)" > logs/download_demo_data.log
"""

rule download_pbmc_expression:
params:
url = "'https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE164378&format=file'"
output:
"data/GSM5008737_RNA_3P-matrix.mtx.gz",
"data/GSM5008737_RNA_3P-barcodes.tsv.gz",
"data/GSM5008737_RNA_3P-features.tsv.gz"
shell:
"""
wget -O data/GSE164378_RAW.tar {params.url}
tar -xvf data/GSE164378_RAW.tar -C data/
rm data/*TCR*
rm data/*BCR*
rm data/*5P*
rm data/*ADT_3P*
rm data/*HTO_3P*
"""

rule export_zarr:
input:
ref = "reference/ref.Rds",
fullref = "data/pbmc_multimodal.h5seurat",
rna_mat = "data/GSM5008737_RNA_3P-matrix.mtx.gz",
rna_cells = "data/GSM5008737_RNA_3P-barcodes.tsv.gz",
rna_features = "data/GSM5008737_RNA_3P-features.tsv.gz",
script1 = "scripts/convert_to_h5ad.R",
script2 = "scripts/convert_to_zarr.py"
output:
h5Seurat = "vitessce/vitessce_ref.h5Seurat",
h5ad = "vitessce/vitessce_ref.h5ad",
zarr = directory("vitessce/vitessce_ref.zarr")
container:
"docker://satijalab/azimuth-references:vitessce"
shell:
"""
mkdir -p vitessce
Rscript {input.script1} {input.ref} {input.fullref} {input.rna_mat} {input.rna_cells} {input.rna_features} {output.h5Seurat} > logs/export_zarr_anndata.Rout 2>&1
python3 {input.script2} {output.h5ad} {output.zarr}
"""

48 changes: 48 additions & 0 deletions human_pbmc/scripts/convert_to_h5ad.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
#!/usr/bin/env Rscript
library(Seurat)
library(SeuratDisk)
args <- commandArgs(trailingOnly = TRUE)

ref <- readRDS(file = args[1])
fullref <- LoadH5Seurat(file = args[2], assays = "ADT", reductions = FALSE)
fullref <- subset(x = fullref, cells = Cells(x = ref))
dat_3p <- ReadMtx(mtx = args[3], cells = args[4], features = args[5])

fullref[["RNA"]] <- CreateAssayObject(counts = dat_3p[, Cells(x = ref)])
DefaultAssay(object = fullref) <- "RNA"
fullref <- NormalizeData(object = fullref)
adt.mat <- GetAssayData(object = fullref[["ADT"]], slot = "data")
rownames(x = adt.mat) <- paste0("ADT-", rownames(x = adt.mat))

combined.rna.adt <- CreateAssayObject(data = rbind(
GetAssayData(object = fullref[["RNA"]], slot = "data"),
adt.mat)
)
fullref[["RNA"]] <- combined.rna.adt

fullref[['umap']] <- ref[['refUMAP']]
Key(object = fullref[['umap']]) <- "umap_"
DefaultAssay(object = fullref[['umap']]) <- "RNA"
fullref <- DietSeurat(
object = fullref,
dimreducs = "umap",
assays = "RNA"
)
for (i in colnames(x = fullref[[]])) {
fullref[[i]] <- NULL
}
fullref <- AddMetaData(object = fullref, metadata = ref[[]])
Misc(object = fullref[['umap']], slot = "model") <- NULL

fullref <- RenameCells(object = fullref, new.names = paste0("cell", 1:ncol(x = fullref)))

for (i in colnames(x = fullref[[]])) {
if (is.factor(x = fullref[[i, drop = TRUE]])) {
fullref[[i]] <- as.character(x = fullref[[i, drop = TRUE]])
}
}

SaveH5Seurat(object = fullref, file = args[6], overwrite = TRUE)
Convert(args[6], dest = "h5ad", overwrite = TRUE)


13 changes: 13 additions & 0 deletions human_pbmc/scripts/convert_to_zarr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/usr/bin/python
import scanpy as sc
import sys
import zarr
from scipy import sparse

adata = sc.read_h5ad(sys.argv[1])
del adata.raw
adata.var.index = adata.var.index.astype('str')
adata.obs.index = adata.obs.index.astype('str')
adata.var_names = adata.var_names.astype(str)
adata.X = adata.X.tocsc()
adata.write_zarr(sys.argv[2], [adata.shape[0], 10])
20 changes: 20 additions & 0 deletions mouse_motorcortex/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -79,3 +79,23 @@ rule setup_demo:
"""
Rscript {input.script} {input.data} {input.metadata} {output} > logs/setup_demo.Rout 2>&1
"""

rule export_zarr:
input:
ref = "reference/ref.Rds",
fullref = "seurat_objects/fullref.Rds",
script1 = "scripts/convert_to_h5ad.R",
script2 = "scripts/convert_to_zarr.py"
output:
h5Seurat = "vitessce/vitessce_ref.h5Seurat",
h5ad = "vitessce/vitessce_ref.h5ad",
zarr = directory("vitessce/vitessce_ref.zarr")
container:
"docker://satijalab/azimuth-references:vitessce"
shell:
"""
mkdir -p vitessce
Rscript {input.script1} {input.ref} {input.fullref} {output.h5Seurat} > logs/export_zarr_anndata.Rout 2>&1
python3 {input.script2} {output.h5ad} {output.zarr}
"""

37 changes: 37 additions & 0 deletions mouse_motorcortex/scripts/convert_to_h5ad.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#!/usr/bin/env Rscript
library(Seurat)
library(SeuratDisk)
args <- commandArgs(trailingOnly = TRUE)

ref <- readRDS(file = args[1])
fullref <- readRDS(file = args[2])
fullref <- subset(x = fullref, cells = Cells(x = ref))
fullref[['umap']] <- ref[['refUMAP']]
Key(object = fullref[['umap']]) <- "umap_"
DefaultAssay(object = fullref[['umap']]) <- "RNA"

DefaultAssay(object = fullref) <- "RNA"
fullref <- NormalizeData(object = fullref)
fullref <- DietSeurat(
object = fullref,
dimreducs = "umap",
assays = "RNA"
)
for (i in colnames(x = fullref[[]])) {
fullref[[i]] <- NULL
}
fullref <- AddMetaData(object = fullref, metadata = ref[[]])
Misc(object = fullref[['umap']], slot = "model") <- NULL

fullref <- RenameCells(object = fullref, new.names = paste0("cell", 1:ncol(x = fullref)))

for (i in colnames(x = fullref[[]])) {
if (is.factor(x = fullref[[i, drop = TRUE]])) {
fullref[[i]] <- as.character(x = fullref[[i, drop = TRUE]])
}
}

SaveH5Seurat(object = fullref, file = args[3], overwrite = TRUE)
Convert(args[3], dest = "h5ad", overwrite = TRUE)


13 changes: 13 additions & 0 deletions mouse_motorcortex/scripts/convert_to_zarr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/usr/bin/python
import scanpy as sc
import sys
import zarr
from scipy import sparse

adata = sc.read_h5ad(sys.argv[1])
del adata.raw
adata.var.index = adata.var.index.astype('str')
adata.obs.index = adata.obs.index.astype('str')
adata.var_names = adata.var_names.astype(str)
adata.X = adata.X.tocsc()
adata.write_zarr(sys.argv[2], [adata.shape[0], 10])

0 comments on commit 8b8789e

Please sign in to comment.