Skip to content

Commit

Permalink
Merge pull request #4 from satijalab/feat/lung
Browse files Browse the repository at this point in the history
Add human lung reference
  • Loading branch information
andrewwbutler authored Jun 2, 2021
2 parents 7733a9e + 377efa3 commit 95436aa
Show file tree
Hide file tree
Showing 11 changed files with 271 additions and 22 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,5 @@ src/*.dll
**/.snakemake
**/.snakemake_timestamp
**/.wget-hsts
**/.synapseCache
**/**/.cache
10 changes: 4 additions & 6 deletions docker/main/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,13 +1,11 @@
# Dockerfile for building references
# NOTE(review): two FROM lines follow. This text appears to be a rendered diff
# with removed and added lines interleaved; presumably only one base image is
# meant to remain -- confirm against the real Dockerfile.
FROM satijalab/seurat:4.0.0
FROM satijalab/azimuth:0.3.2

# Install other R dependencies
# The first RUN writes a default package repository option into Rprofile.site
# under R_HOME; the second installs the 'feather' package.
RUN echo "options(repos = 'https://cloud.r-project.org')" > $(R --no-echo --no-save -e "cat(Sys.getenv('R_HOME'))")/etc/Rprofile.site
RUN R --no-echo --no-restore --no-save -e "install.packages('feather')"

# Install Azimuth (0.3.0)
# NOTE(review): these steps install Azimuth v0.3.0 and its dependencies by hand;
# they look like the removed side of the diff given the azimuth:0.3.2 base
# image above -- confirm.
RUN R --no-echo --no-restore --no-save -e "remotes::install_github('immunogenomics/presto')"
RUN R --no-echo --no-restore --no-save -e "install.packages(c('DT', 'googlesheets4', 'shinyBS', 'shinydashboard', 'shinyjs'))"
RUN R --no-echo --no-restore --no-save -e "BiocManager::install('glmGamPoi')"
RUN R --no-echo --no-restore --no-save -e "remotes::install_github('satijalab/azimuth', ref = 'v0.3.0', dependencies = FALSE)"
# Install synapse for download
RUN pip3 install synapseclient

# Default command: start R.
CMD [ "R" ]
11 changes: 1 addition & 10 deletions docker/vitessce/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,14 +1,5 @@
# Dockerfile for building references
FROM satijalab/seurat:4.0.1

# Install other R dependencies
RUN R --no-echo --no-restore --no-save -e "install.packages('feather')"

# Install Azimuth (0.3.2)
RUN R --no-echo --no-restore --no-save -e "remotes::install_github('immunogenomics/presto')"
RUN R --no-echo --no-restore --no-save -e "install.packages(c('DT', 'googlesheets4', 'shinyBS', 'shinydashboard', 'shinyjs'))"
RUN R --no-echo --no-restore --no-save -e "BiocManager::install('glmGamPoi')"
RUN R --no-echo --no-restore --no-save -e "remotes::install_github('satijalab/azimuth', ref = 'v0.3.0', dependencies = FALSE)"
FROM satijalab/azimuth:0.4.0

# Install python dependencies
RUN pip3 install scanpy
Expand Down
104 changes: 104 additions & 0 deletions human_lung/Snakefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
############################## Config #########################################
# Container image used by every rule that does not declare its own.
container:
    "docker://satijalab/azimuth-references:latest"

# Environment variables the workflow depends on (read via os.environ in
# rule download).
envvars:
    "SYNAPSE_ID",
    "SYNAPSE_KEY"

############################## All ############################################
# Default target: the reference object, its Annoy index and the demo dataset,
# all of which live under reference/.
rule all:
    input:
        expand(
            "reference/{target}",
            target=["ref.Rds", "idx.annoy", "braga_lung_demo.Rds"]
        )

############################## Reference ######################################
# Download the two Synapse entities (syn21560510, syn21560409) into data/.
# They are expected to produce the CSV files declared as outputs.
# Credentials come from the SYNAPSE_ID / SYNAPSE_KEY environment variables.
rule download:
    params:
        user = os.environ["SYNAPSE_ID"],
        key = os.environ["SYNAPSE_KEY"]
    output:
        "data/krasnow_hlca_10x_UMIs.csv",
        "data/krasnow_hlca_10x_metadata.csv"
    shell:
        # The :q flag shell-quotes the credentials so that spaces or shell
        # metacharacters in them cannot break or alter the command.
        """
        mkdir -p data/
        mkdir -p logs
        cd data
        synapse -u {params.user:q} -p {params.key:q} get syn21560510
        synapse -u {params.user:q} -p {params.key:q} get syn21560409
        echo "Krasnow data downloaded on: $(date)" > ../logs/download_data.log
        """

# Build the Seurat object from the downloaded count matrix and metadata.
rule setup:
    input:
        script = "scripts/setup.R",
        data = "data/krasnow_hlca_10x_UMIs.csv",
        metadata = "data/krasnow_hlca_10x_metadata.csv"
    output:
        "seurat_objects/krasnow_lung.rds"
    # Declared as a log so Snakemake creates logs/ before the job runs; the
    # directory is otherwise only created by rule download, which is skipped
    # when the data files already exist.
    log:
        "logs/setup.Rout"
    shell:
        """
        Rscript {input.script} {input.data} {input.metadata} {output} > {log} 2>&1
        """

# Run scripts/export.R on the Seurat object. The script writes the three
# declared outputs to fixed paths; only the input object is passed to it.
rule export:
    input:
        script = "scripts/export.R",
        ob = "seurat_objects/krasnow_lung.rds"
    output:
        "reference/ref.Rds",
        "reference/idx.annoy",
        "seurat_objects/fullref.Rds"
    # Declared as a log so Snakemake creates logs/ before the redirect happens.
    log:
        "logs/export.Rout"
    shell:
        """
        Rscript {input.script} {input.ob} > {log} 2>&1
        """

############################## Demo ##########################################
# Download the GSE130148 demo data (counts and barcode/cell-type table) from GEO.
rule download_demo:
    params:
        data_url = "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE130nnn/GSE130148/suppl/GSE130148%5Fraw%5Fcounts%2ERData%2Egz",
        metadata_url = "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE130nnn/GSE130148/suppl/GSE130148%5Fbarcodes%5Fcell%5Ftypes%2Etxt%2Egz"
    output:
        "logs/download_demo_data.log",
        "data/GSE130148_raw_counts.RData",
        "data/GSE130148_barcodes_cell_types.txt.gz"
    shell:
        # Explicit -O targets overwrite any stale or partial download instead
        # of saving a ".1" copy next to it, and do not depend on how wget
        # decodes the percent-escaped file name. gzip -f likewise overwrites a
        # leftover decompressed file.
        """
        mkdir -p data logs
        wget {params.data_url:q} -O data/GSE130148_raw_counts.RData.gz
        gzip -df data/GSE130148_raw_counts.RData.gz
        wget {params.metadata_url:q} -O data/GSE130148_barcodes_cell_types.txt.gz
        echo "Lung demo data downloaded on: $(date)" > logs/download_demo_data.log
        """

# Build the demo Seurat object from the GEO counts and cell-type table.
rule setup_demo:
    input:
        script = "scripts/setup_demo.R",
        data = "data/GSE130148_raw_counts.RData",
        metadata = "data/GSE130148_barcodes_cell_types.txt.gz"
    output:
        "reference/braga_lung_demo.Rds"
    # Declared as a log so Snakemake creates logs/ even when download_demo
    # did not run in this invocation.
    log:
        "logs/setup_demo.log"
    shell:
        """
        Rscript {input.script} {input.data} {input.metadata} {output} > {log} 2>&1
        """

############################## Explorer ########################################
# Convert the reference to h5Seurat/h5ad (R script) and then to a Zarr store
# (Python script). Runs in the vitessce container instead of the default one.
rule export_zarr:
    input:
        ref = "reference/ref.Rds",
        fullref = "seurat_objects/fullref.Rds",
        script1 = "scripts/convert_to_h5ad.R",
        script2 = "scripts/convert_to_zarr.py"
    output:
        h5Seurat = "vitessce/vitessce_ref.h5Seurat",
        h5ad = "vitessce/vitessce_ref.h5ad",
        zarr = directory("vitessce/vitessce_ref.zarr")
    # Declared as a log so Snakemake creates logs/ before the redirect happens.
    log:
        "logs/export_zarr_anndata.Rout"
    container:
        "docker://satijalab/azimuth-references:vitessce"
    shell:
        # The R script derives the .h5ad path from the .h5Seurat path, so
        # {output.h5ad} is only passed to the Python step.
        """
        mkdir -p vitessce
        Rscript {input.script1} {input.ref} {input.fullref} {output.h5Seurat} > {log} 2>&1
        python3 {input.script2} {output.h5ad} {output.zarr}
        """
14 changes: 14 additions & 0 deletions human_lung/reference/config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{
"Azimuth.app.reference": "/reference-data/",
"Azimuth.app.demodataset": "/reference-data/braga_lung_demo.Rds",
"Azimuth.app.max_cells": 100000,
"Azimuth.app.default_gene": "EMP2",
"Azimuth.app.default_metadata": "annotation.l1",
"Azimuth.app.homologs": "/reference-data/homologs.rds",
"Azimuth.app.welcomebox": "div(\n h3(HTML(\"Please upload a dataset to map to our <b>human lung</b> reference\")),\n \"Upload a counts matrix from an scRNA-seq dataset of human lung in one\n of the following formats: hdf5, rds, h5ad, h5seurat\",\n width = 12\n )",
"Azimuth.app.refdescriptor": "<div class='refdescriptor'><br>This reference consists of 65,662 human lung cells (10x Genomics v2) from <a href='https://www.nature.com/articles/s41586-020-2922-4' target='blank'>Travaglini et al. 2020</a>. Cells were sourced from three donor patients and from all lung tissue compartments (bronchi, bronchiole, alveoli) as well as circulating blood. For testing, we also provide a demo dataset of 10,360 human lung cells from <a href='https://www.nature.com/articles/s41591-019-0468-5', target='_blank'>Vieira-Braga et al, 2019</a> (Drop-seq), which is loaded automatically with the 'Load demo dataset' button or available for download <a href='https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE130148', target='blank'>here</a>.</div>",
"shiny.maxRequestSize": 1048576000,
"Azimuth.app.refuri": "https://seurat.nygenome.org/azimuth/references/v1.0.0/human_lung",
"Azimuth.app.do_adt": "FALSE"
}

37 changes: 37 additions & 0 deletions human_lung/scripts/convert_to_h5ad.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#!/usr/bin/env Rscript
# Build a slimmed-down Seurat object from the reference and the full object,
# save it as h5Seurat and convert that file to h5ad.
#
# Usage: convert_to_h5ad.R <ref.Rds> <fullref.Rds> <out.h5Seurat>
library(Seurat)
library(SeuratDisk)
args <- commandArgs(trailingOnly = TRUE)
if (length(x = args) < 3) {
  stop("Usage: convert_to_h5ad.R <ref.Rds> <fullref.Rds> <out.h5Seurat>")
}

ref <- readRDS(file = args[1])
fullref <- readRDS(file = args[2])

# Keep only the cells present in the reference and reuse its UMAP embedding.
fullref <- subset(x = fullref, cells = Cells(x = ref))
fullref[['umap']] <- ref[['refUMAP']]
Key(object = fullref[['umap']]) <- "umap_"
DefaultAssay(object = fullref[['umap']]) <- "RNA"

# Normalize the RNA assay, then drop every other assay and reduction.
DefaultAssay(object = fullref) <- "RNA"
fullref <- NormalizeData(object = fullref)
fullref <- DietSeurat(
  object = fullref,
  dimreducs = "umap",
  assays = "RNA"
)

# Replace the full object's metadata with the reference's metadata.
for (i in colnames(x = fullref[[]])) {
  fullref[[i]] <- NULL
}
fullref <- AddMetaData(object = fullref, metadata = ref[[]])
Misc(object = fullref[['umap']], slot = "model") <- NULL

# Rename cells to cell1..cellN. seq_len() yields an empty sequence for an
# empty object, whereas 1:0 would produce c(1, 0).
fullref <- RenameCells(
  object = fullref,
  new.names = paste0("cell", seq_len(length.out = ncol(x = fullref)))
)

# Store factor metadata columns as plain character vectors
# (presumably so they convert cleanly to h5ad -- confirm).
for (i in colnames(x = fullref[[]])) {
  if (is.factor(x = fullref[[i, drop = TRUE]])) {
    fullref[[i]] <- as.character(x = fullref[[i, drop = TRUE]])
  }
}

SaveH5Seurat(object = fullref, file = args[3], overwrite = TRUE)
Convert(args[3], dest = "h5ad", overwrite = TRUE)


13 changes: 13 additions & 0 deletions human_lung/scripts/convert_to_zarr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/usr/bin/python
"""Convert an h5ad file into a Zarr store.

Usage: convert_to_zarr.py <input.h5ad> <output.zarr>
"""
import scanpy as sc
import sys
import zarr
from scipy import sparse

if len(sys.argv) < 3:
    sys.exit("Usage: convert_to_zarr.py <input.h5ad> <output.zarr>")

adata = sc.read_h5ad(sys.argv[1])
# Drop the raw slot so it is not written to the Zarr store.
del adata.raw
# Force string-typed observation and variable names.
adata.var.index = adata.var.index.astype('str')
adata.obs.index = adata.obs.index.astype('str')
adata.var_names = adata.var_names.astype(str)
# Only sparse matrices have .tocsc(); a dense X is written unchanged instead
# of crashing with AttributeError.
if sparse.issparse(adata.X):
    adata.X = adata.X.tocsc()
# Chunk shape: all observations by 10 variables per chunk.
adata.write_zarr(sys.argv[2], [adata.shape[0], 10])
64 changes: 64 additions & 0 deletions human_lung/scripts/export.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
#!/usr/bin/env Rscript
library(Seurat)
library(Azimuth)

# Helper: rename groups of identity classes.
#   obj  - object whose identities are renamed via RenameIdents()
#   curr - list of vectors (or a single vector) of current identity names
#   new  - one new name per element of `curr`
# Returns the object with every name in curr[[i]] renamed to new[i].
annotate <- function(obj, curr, new) {
  if (!is.list(x = curr)) {
    curr <- list(curr)
  }
  # Numeric identity names are compared as strings.
  to.character <- function(members) {
    if (is.numeric(x = members)) {
      return(as.character(x = members))
    }
    members
  }
  curr <- lapply(X = curr, FUN = to.character)
  stopifnot(length(x = curr) == length(x = new))
  # Repeat each new name once per member of its group, then key the result
  # by the old names.
  group.sizes <- sapply(X = curr, FUN = length)
  relabel <- as.list(x = rep(x = new, times = group.sizes))
  names(x = relabel) <- unlist(x = curr)
  RenameIdents(obj, relabel)
}

args <- commandArgs(trailingOnly = TRUE)
ref.dir <- "reference/"
ob.dir <- "seurat_objects/"

ob <- readRDS(file = args[1])

# Level-2 annotation is the dataset's free_annotation column.
ob[['annotation.l2']] <- ob[['free_annotation']]
Idents(object = ob) <- 'annotation.l2'

# Level-1 label -> the level-2 labels merged into it, in the order they are
# applied.
l1.groups <- list(
  'Basophil/Mast' = c('Basophil/Mast 1', 'Basophil/Mast 2'),
  'Ciliated' = c('Ciliated', 'Proximal Ciliated'),
  'Alveolar Epithelial Type 2' = c(
    'Signaling Alveolar Epithelial Type 2', 'Alveolar Epithelial Type 2'
  ),
  'Bronchial Vessel' = c('Bronchial Vessel 1', 'Bronchial Vessel 2'),
  'Capillary Intermediate' = c(
    'Capillary Intermediate 1', 'Capillary Intermediate 2'
  ),
  'Basal' = c('Proximal Basal', 'Proliferating Basal', 'Differentiating Basal'),
  'Dendritic' = c(
    'TREM2+ Dendritic', 'IGSF21+ Dendritic', 'Myeloid Dendritic Type 1',
    'Myeloid Dendritic Type 2', 'EREG+ Dendritic'
  ),
  'CD16+ Monocyte' = c('Nonclassical Monocyte', 'Intermediate Monocyte'),
  'CD14+ Monocyte' = c('Classical Monocyte', 'OLR1+ Classical Monocyte'),
  'Smooth Muscle' = c('Airway Smooth Muscle', 'Vascular Smooth Muscle'),
  'Fibroblast' = c('Alveolar Fibroblast', 'Adventitial Fibroblast'),
  'CD8 T' = c('CD8+ Naive T', 'CD8+ Memory/Effector T'),
  'CD4 T' = c('CD4+ Memory/Effector T', 'CD4+ Naive T')
)

# annotate() takes the groups and their new names as two parallel arguments.
ob <- annotate(ob, unname(obj = l1.groups), names(x = l1.groups))
ob[['annotation.l1']] <- Idents(object = ob)

ref <- AzimuthReference(
  object = ob,
  refUMAP = "umap",
  refDR = "pca",
  refAssay = "SCT",
  metadata = c("annotation.l1", "annotation.l2"),
  dims = 1:50,
  k.param = 31,
  reference.version = "1.0.0"
)

# Write the neighbor index and the reference, then the full annotated object.
annoy.path <- file.path(ref.dir, "idx.annoy")
ref.path <- file.path(ref.dir, "ref.Rds")
fullref.path <- file.path(ob.dir, "fullref.Rds")
SaveAnnoyIndex(object = ref[["refdr.annoy.neighbors"]], file = annoy.path)
saveRDS(object = ref, file = ref.path)
saveRDS(object = ob, file = fullref.path)
16 changes: 16 additions & 0 deletions human_lung/scripts/setup.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/usr/bin/env Rscript
# Build the Seurat object (SCTransform, PCA, UMAP) from a UMI count CSV and a
# metadata CSV.
# Arguments: <counts.csv> <metadata.csv> <output.rds>
library(Seurat)
library(data.table)

cli <- commandArgs(trailingOnly = TRUE)
counts.csv <- cli[1]
meta.csv <- cli[2]
out.rds <- cli[3]

# Read the counts, use the first column as row names, and store them sparsely.
counts <- fread(file = counts.csv, header = TRUE, sep = ",")
counts <- as.matrix(x = counts, rownames = 1)
counts <- as(object = counts, Class = 'dgCMatrix')

# The first metadata column supplies the row names.
cell.meta <- read.csv(file = meta.csv, row.names = 1)

ob <- CreateSeuratObject(counts = counts, meta.data = cell.meta)
ob <- SCTransform(object = ob)
ob <- RunPCA(object = ob, verbose = FALSE)
# return.model = TRUE asks RunUMAP to keep the fitted model with the result.
ob <- RunUMAP(object = ob, dims = 1:50, return.model = TRUE)

saveRDS(object = ob, file = out.rds)
11 changes: 11 additions & 0 deletions human_lung/scripts/setup_demo.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/usr/bin/env Rscript
# Build the demo Seurat object from an .RData file holding `raw_counts` and a
# tab-separated barcode/cell-type table.
# Arguments: <raw_counts.RData> <barcodes_cell_types.txt[.gz]> <output.Rds>
library(Seurat)

args <- commandArgs(trailingOnly = TRUE)

# load() restores objects under whatever names they were saved with. Load into
# a private environment and check for the expected object, so a changed file
# fails with a clear message instead of "object 'raw_counts' not found".
loaded <- new.env()
load(file = args[1], envir = loaded)
if (!exists(x = "raw_counts", envir = loaded, inherits = FALSE)) {
  stop("Expected an object named 'raw_counts' in ", args[1])
}
raw_counts <- get(x = "raw_counts", envir = loaded, inherits = FALSE)

# The first column supplies the row names; keep only the three columns used.
meta <- read.table(file = args[2], header = TRUE, sep = "\t", row.names = 1)
meta <- meta[, c("ID", "location", "celltype")]

ob <- CreateSeuratObject(counts = raw_counts, meta.data = meta)
saveRDS(object = ob, file = args[3])
12 changes: 6 additions & 6 deletions human_pancreas/reference/config.json
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
{
"Azimuth.app.reference": "/reference-data/human_pancreas/",
"Azimuth.app.demodataset": "/reference-data/human_pancreas/peng_n7.rds",
"Azimuth.app.reference": "/reference-data/human_lung/",
"Azimuth.app.demodataset": "/reference-data/human_lung/braga_lung_demo.Rds",
"Azimuth.app.max_cells": 100000,
"Azimuth.app.default_gene": "INS",
"Azimuth.app.default_gene": "EMP2",
"Azimuth.app.default_metadata": "annotation.l1",
"Azimuth.app.welcomebox": "div(\n h3(HTML(\"Please upload a dataset to map to our <b>human pancreas</b> reference\")),\n \"Upload a counts matrix from an scRNA-seq dataset of human pancreas in one\n of the following formats: hdf5, rds, h5ad, h5seurat. For testing, we also provide a demo dataset of 1,114 human pancreas cells from \",a(\"Peng et al, 2019\", href=\"https://www.nature.com/articles/s41422-019-0195-y\", target=\"_blank\"),\" (10x Genomics v2), which is loaded automatically with the 'Load demo dataset' button or available for download \",\n a(\"here.\",\n href=\"https://zenodo.org/record/3969339\",\n target=\"blank\"),\n width = 12\n )",
"Azimuth.app.refdescriptor": "<div class='refdescriptor'><br>This reference consists of 35,290 pancreatic cells integrated across six technologies from the following studies: <a href='https://dx.doi.org/10.1016%2Fj.stem.2016.05.010' target='blank'>Grün et al. 2016</a>, <a href='https://doi.org/10.1016/j.cels.2016.09.002' target='blank'>Muraro, Dharmadhikari et al. 2016</a>, <a href='https://doi.org/10.1016/j.cmet.2016.08.020' target='blank'>Segerstolpe, Palasantza, et al. 2016</a>, <a href='https://doi.org/10.1101/gr.212720.116' target='blank'>Lawlor, George, et al. 2017</a>, <a href='https://doi.org/10.1016/j.cels.2016.08.011' target='blank'>Baron, Veres, Wolock, Faust et al. 2016</a>, and <a href='https://doi.org/10.3791/59866' target='blank'>Xin et al. 2019</a>. All samples, which were generated using six different technologies, were integrated together and used to define a reference UMAP visualization and list of cell type annotations. </div>",
"Azimuth.app.refuri": "https://seurat.nygenome.org/azimuth/references/v1.0.0/human_pancreas",
"Azimuth.app.homologs": "/reference-data/homologs.rds",
"Azimuth.app.welcomebox": "div(\n h3(HTML(\"Please upload a dataset to map to our <b>human lung</b> reference\")),\n \"Upload a counts matrix from an scRNA-seq dataset of human lung in one\n of the following formats: hdf5, rds, h5ad, h5seurat\",\n width = 12\n )",
"Azimuth.app.refdescriptor": "<div class='refdescriptor'><br>This reference consists of 65,662 human lung cells (10x Genomics v2) from <a href='https://www.nature.com/articles/s41586-020-2922-4' target='blank'>Travaglini et al. 2020</a>. Cells were sourced from three donor patients and from all lung tissue compartments (bronchi, bronchiole, alveoli) as well as circulating blood. For testing, we also provide a demo dataset of 10,360 human lung cells from <a href='https://www.nature.com/articles/s41591-019-0468-5', target='_blank'>Vieira-Braga et al, 2019</a> (Drop-seq), which is loaded automatically with the 'Load demo dataset' button or available for download <a href='https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE130148', target='blank'>here</a>.</div>",
"shiny.maxRequestSize": 1048576000,
"Azimuth.app.do_adt": "FALSE"
}
Expand Down

0 comments on commit 95436aa

Please sign in to comment.