Skip to content

Commit

Permalink
improve heatmap performance by distribution of computations, downsam…
Browse files Browse the repository at this point in the history
…pling and feature selection #4
  • Loading branch information
sreichl committed Jun 28, 2024
1 parent 2f6f049 commit 12f669f
Show file tree
Hide file tree
Showing 13 changed files with 216 additions and 96 deletions.
6 changes: 4 additions & 2 deletions config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,14 @@ umap:

##### HEATMAP #####
# information on the ComplexHeatmap parameters: https://jokergoo.github.io/ComplexHeatmap-reference/book/index.html
# distance metrics: for rows and columns. all metrics that are supported by stats::dist() (https://www.rdocumentation.org/packages/stats/versions/3.6.2/topics/dist) and 'pearson','spearman', and 'kendall'.
# distance metrics: for rows and columns. all metrics that are supported by scipy.spatial.distance.pdist (https://docs.scipy.org/doc/scipy-1.14.0/reference/generated/scipy.spatial.distance.pdist.html)
# clustering methods: methods for hierarchical clustering that are supported by stats::hclust() (https://www.rdocumentation.org/packages/stats/versions/3.6.2/topics/hclust)
# it is the most resource (memory) intensive method, leave empty [] if not required
heatmap:
metrics: ['spearman']
metrics: ['correlation','cosine']
hclust_methods: ['complete']
n_observations: 1000 # randomly sampled observations: proportion as float [0-1] or absolute number as integer
n_features: 0.5 # highly variable features: percentage as float [0-1] or absolute number as integer

##### LEIDEN #####
# Leiden clustering applied on UMAP KNN graphs specified by the respective parameters (metric, n_neighbors).
Expand Down
9 changes: 7 additions & 2 deletions workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,12 @@ rule all:
n_components=[dims for dims in config["umap"]["n_components"] if dims in [2,3]]
) if 2 in config["umap"]["n_components"] or 3 in config["umap"]["n_components"] else [],
# Heatmap
heatmap_plots = expand(os.path.join(result_path,'{sample}','Heatmap','plots','Heatmap_{method}_{metric}.png'),
# distance_matrices = expand(os.path.join(result_path,'{sample}','Heatmap','DistanceMatrix_{metric}_{type}.csv'),
# sample=list(annot.index),
# metric=config["heatmap"]["metrics"],
# type=["observations","features"],
# ),
heatmap_plots = expand(os.path.join(result_path,'{sample}','Heatmap','plots','Heatmap_{metric}_{method}.png'),
sample=list(annot.index),
method=config["heatmap"]["hclust_methods"],
metric=config["heatmap"]["metrics"],
Expand All @@ -161,7 +166,7 @@ rule all:
sample=list(annot.index),
index_type = ["external", "internal"] if config["sample_proportion"]>0 else ["external"],
) if len(cluster_methods)>0 else [],
envs = expand(os.path.join(config["result_path"],'envs',module_name,'{env}.yaml'),env=['clusterCrit','clustree','ComplexHeatmap','ggplot','umap_leiden','plotly','pymcdm','sklearn']),
envs = expand(os.path.join(config["result_path"],'envs',module_name,'{env}.yaml'),env=['clusterCrit','clustree','ComplexHeatmap','ggplot','umap_leiden','plotly','pymcdm']),
configs = os.path.join(config["result_path"],'configs',module_name,'{}_config.yaml'.format(config["project_name"])),
annotations = os.path.join(config["result_path"],'configs',module_name,'{}_annot.csv'.format(config["project_name"])),
resources:
Expand Down
12 changes: 12 additions & 0 deletions workflow/envs/fastdist_UNUSED.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- scikit-learn
- pandas=1.5.0
- numpy
- numba
- pip
- pip:
- fastdist==1.1.6
File renamed without changes.
File renamed without changes.
2 changes: 1 addition & 1 deletion workflow/rules/cluster_validation.smk
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ rule validation_external:
mem_mb=config.get("mem", "16000"),
threads: config.get("threads", 1)
conda:
"../envs/sklearn.yaml"
"../envs/umap_leiden.yaml"
log:
os.path.join("logs","rules","validation_external_{sample}.log"),
params:
Expand Down
2 changes: 1 addition & 1 deletion workflow/rules/clustering.smk
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ rule clustification:
mem_mb=config.get("mem", "16000"),
threads: 8#config.get("threads", 1)
conda:
"../envs/sklearn.yaml"
"../envs/umap_leiden.yaml"
log:
os.path.join("logs","rules","clustification_{sample}_clusterings.log"),
params:
Expand Down
89 changes: 40 additions & 49 deletions workflow/rules/common.smk
Original file line number Diff line number Diff line change
Expand Up @@ -14,66 +14,59 @@ def get_data_orientation(wildcards):

def get_umap_sample_paths(wildcards):
return [annot.loc[wildcards.sample,'data'],
os.path.join(config["result_path"],'unsupervised_analysis','{}'.format(wildcards.sample),'UMAP','UMAP_{}_'.format(wildcards.metric)+'{}'.format(max(config["umap"]["n_neighbors"]))+'_graph.pickle')]
os.path.join(result_path,'{}'.format(wildcards.sample),'UMAP','UMAP_{}_'.format(wildcards.metric)+'{}'.format(max(config["umap"]["n_neighbors"]))+'_graph.pickle')]

def get_dimred_paths(wildcards):
path_dict = {}

if wildcards.method=="PCA":
path_dict['dimred_data'] = os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'PCA','PCA_{wildcards.parameters}_data_small.csv'.format(wildcards=wildcards))
path_dict['dimred_axes'] = os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'PCA','PCA_{wildcards.parameters}_axes.csv'.format(wildcards=wildcards))
path_dict['dimred_var'] = os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'PCA','PCA_{wildcards.parameters}_var.csv'.format(wildcards=wildcards))
path_dict['dimred_loadings'] = os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'PCA','PCA_{wildcards.parameters}_loadings_small.csv'.format(wildcards=wildcards))
# return {
# 'dimred_data': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'PCA','PCA_{wildcards.parameters}_data_small.csv'.format(wildcards=wildcards)),
# 'dimred_axes': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'PCA','PCA_{wildcards.parameters}_axes.csv'.format(wildcards=wildcards)),
# 'dimred_var': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'PCA','PCA_{wildcards.parameters}_var.csv'.format(wildcards=wildcards)),
# 'dimred_loadings': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'PCA','PCA_{wildcards.parameters}_loadings_small.csv'.format(wildcards=wildcards)),
# 'metadata': annot.loc[wildcards.sample,"metadata"],
# 'metadata_features': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'metadata_features.csv')
# }
path_dict['dimred_data'] = os.path.join(result_path,wildcards.sample,'PCA','PCA_{wildcards.parameters}_data_small.csv'.format(wildcards=wildcards))
path_dict['dimred_axes'] = os.path.join(result_path,wildcards.sample,'PCA','PCA_{wildcards.parameters}_axes.csv'.format(wildcards=wildcards))
path_dict['dimred_var'] = os.path.join(result_path,wildcards.sample,'PCA','PCA_{wildcards.parameters}_var.csv'.format(wildcards=wildcards))
path_dict['dimred_loadings'] = os.path.join(result_path,wildcards.sample,'PCA','PCA_{wildcards.parameters}_loadings_small.csv'.format(wildcards=wildcards))
else:
path_dict['dimred_data'] = os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,wildcards.method,'{wildcards.method}_{wildcards.parameters}_{wildcards.n_components}_data.csv'.format(wildcards=wildcards))
path_dict['dimred_axes'] = os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,wildcards.method,'{wildcards.method}_{wildcards.parameters}_{wildcards.n_components}_axes.csv'.format(wildcards=wildcards))
# return {
# 'dimred_data': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,wildcards.method,'{wildcards.method}_{wildcards.parameters}_{wildcards.n_components}_data.csv'.format(wildcards=wildcards)),
# 'dimred_axes': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,wildcards.method,'{wildcards.method}_{wildcards.parameters}_{wildcards.n_components}_axes.csv'.format(wildcards=wildcards)),
# 'metadata': annot.loc[wildcards.sample,"metadata"],
# 'metadata_features': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'metadata_features.csv')
# }
path_dict['dimred_data'] = os.path.join(result_path,wildcards.sample,wildcards.method,'{wildcards.method}_{wildcards.parameters}_{wildcards.n_components}_data.csv'.format(wildcards=wildcards))
path_dict['dimred_axes'] = os.path.join(result_path,wildcards.sample,wildcards.method,'{wildcards.method}_{wildcards.parameters}_{wildcards.n_components}_axes.csv'.format(wildcards=wildcards))

# add metadata
path_dict['metadata'] = annot.loc[wildcards.sample,"metadata"]
# add features
path_dict['metadata_features'] = os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'metadata_features.csv')
path_dict['metadata_features'] = os.path.join(result_path,wildcards.sample,'metadata_features.csv')
# add clustering results
if len(cluster_methods) > 0:
path_dict['metadata_clusterings'] = os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'metadata_clusterings.csv')
path_dict['metadata_clusterings'] = os.path.join(result_path,wildcards.sample,'metadata_clusterings.csv')

return path_dict

def get_dimred_features_paths(wildcards):

if wildcards.method=="PCA":
return {
'dimred_data': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'PCA','PCA_{wildcards.parameters}_data_small.csv'.format(wildcards=wildcards)),
'dimred_axes': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'PCA','PCA_{wildcards.parameters}_axes.csv'.format(wildcards=wildcards)),
'metadata': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'metadata_features.csv')
'dimred_data': os.path.join(result_path,wildcards.sample,'PCA','PCA_{wildcards.parameters}_data_small.csv'.format(wildcards=wildcards)),
'dimred_axes': os.path.join(result_path,wildcards.sample,'PCA','PCA_{wildcards.parameters}_axes.csv'.format(wildcards=wildcards)),
'metadata': os.path.join(result_path,wildcards.sample,'metadata_features.csv')
}
else:
return {
'dimred_data': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,wildcards.method,'{wildcards.method}_{wildcards.parameters}_{wildcards.n_components}_data.csv'.format(wildcards=wildcards)),
'dimred_axes': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,wildcards.method,'{wildcards.method}_{wildcards.parameters}_{wildcards.n_components}_axes.csv'.format(wildcards=wildcards)),
'metadata': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'metadata_features.csv')
'dimred_data': os.path.join(result_path,wildcards.sample,wildcards.method,'{wildcards.method}_{wildcards.parameters}_{wildcards.n_components}_data.csv'.format(wildcards=wildcards)),
'dimred_axes': os.path.join(result_path,wildcards.sample,wildcards.method,'{wildcards.method}_{wildcards.parameters}_{wildcards.n_components}_axes.csv'.format(wildcards=wildcards)),
'metadata': os.path.join(result_path,wildcards.sample,'metadata_features.csv')
}

########## HEATMAPS ##########
def get_heatmap_paths(wildcards):
    """Collect the input paths needed to plot one heatmap.

    Uses ``wildcards.sample`` to look up the sample's data and metadata
    paths in ``annot`` and ``wildcards.metric`` to locate the two
    distance-matrix CSVs (observations and features) inside
    ``<result_path>/<sample>/Heatmap``.

    Returns a dict with the keys ``data``, ``metadata``,
    ``observations_distance`` and ``features_distance``.
    """
    sample = wildcards.sample
    # Both distance matrices live in the same per-sample Heatmap folder.
    heatmap_dir = os.path.join(result_path, sample, 'Heatmap')

    return {
        'data': annot.loc[sample, 'data'],
        'metadata': annot.loc[sample, "metadata"],
        'observations_distance': os.path.join(
            heatmap_dir,
            'DistanceMatrix_{}_observations.csv'.format(wildcards.metric),
        ),
        'features_distance': os.path.join(
            heatmap_dir,
            'DistanceMatrix_{}_features.csv'.format(wildcards.metric),
        ),
    }

########## CLUSTERING ##########

# get paths for clustification
def get_clustification_paths(wildcards):
return [annot.loc[wildcards.sample,'data'],
os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'{}'.format(config["clustification"]["method"]),'{}_clusterings.csv'.format(config["clustification"]["method"]))
os.path.join(result_path,wildcards.sample,'{}'.format(config["clustification"]["method"]),'{}_clusterings.csv'.format(config["clustification"]["method"]))
]

# get all clustering results of one method to be aggregated into {method}/{method}_clusterings.csv
Expand All @@ -90,33 +83,31 @@ def get_clustering_paths(wildcards):
else:
leiden_parameters.append("{}_NA".format(partition_type))

path_list = path_list + expand(os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'Leiden','Leiden_{metric}_{n_neighbors}_{leiden_parameters}_clustering.csv'),
path_list = path_list + expand(os.path.join(result_path,wildcards.sample,'Leiden','Leiden_{metric}_{n_neighbors}_{leiden_parameters}_clustering.csv'),
metric=config["leiden"]["metrics"],
n_neighbors=config["leiden"]["n_neighbors"],
leiden_parameters=leiden_parameters,
# partition_type=config["leiden"]["partition_types"],
# resolution=config["leiden"]["resolutions"]
)
return path_list

# get all aggregated clustering results across methods to be aggregated into {sample}/metadata_clusterings.csv
def get_aggregated_clustering_paths(wildcards):
return expand(os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'{method}','{method}_clusterings.csv'), method=cluster_methods)
return expand(os.path.join(result_path,wildcards.sample,'{method}','{method}_clusterings.csv'), method=cluster_methods)

# get the aggregated clustering results across methods for visualization
def get_metadata_clustering_paths(wildcards):

if wildcards.method=="PCA":
return {
'dimred_data': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'PCA','PCA_{wildcards.parameters}_data_small.csv'.format(wildcards=wildcards)),
'dimred_axes': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'PCA','PCA_{wildcards.parameters}_axes.csv'.format(wildcards=wildcards)),
'metadata': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'metadata_clusterings.csv')
'dimred_data': os.path.join(result_path,wildcards.sample,'PCA','PCA_{wildcards.parameters}_data_small.csv'.format(wildcards=wildcards)),
'dimred_axes': os.path.join(result_path,wildcards.sample,'PCA','PCA_{wildcards.parameters}_axes.csv'.format(wildcards=wildcards)),
'metadata': os.path.join(result_path,wildcards.sample,'metadata_clusterings.csv')
}
else:
return {
'dimred_data': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,wildcards.method,'{wildcards.method}_{wildcards.parameters}_{wildcards.n_components}_data.csv'.format(wildcards=wildcards)),
'dimred_axes': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,wildcards.method,'{wildcards.method}_{wildcards.parameters}_{wildcards.n_components}_axes.csv'.format(wildcards=wildcards)),
'metadata': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'metadata_clusterings.csv')
'dimred_data': os.path.join(result_path,wildcards.sample,wildcards.method,'{wildcards.method}_{wildcards.parameters}_{wildcards.n_components}_data.csv'.format(wildcards=wildcards)),
'dimred_axes': os.path.join(result_path,wildcards.sample,wildcards.method,'{wildcards.method}_{wildcards.parameters}_{wildcards.n_components}_axes.csv'.format(wildcards=wildcards)),
'metadata': os.path.join(result_path,wildcards.sample,'metadata_clusterings.csv')
}

########## CLUSTER VALIDATION ##########
Expand All @@ -126,36 +117,36 @@ def get_clustree_paths(wildcards):

if wildcards.content=="features":
return {
'metadata_clustering': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample, "metadata_clusterings.csv"),
'metadata': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'metadata_features.csv')
'metadata_clustering': os.path.join(result_path,wildcards.sample, "metadata_clusterings.csv"),
'metadata': os.path.join(result_path,wildcards.sample,'metadata_features.csv')
}
else:
return {
'metadata_clustering': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample, "metadata_clusterings.csv"),
'metadata_clustering': os.path.join(result_path,wildcards.sample, "metadata_clusterings.csv"),
'metadata': annot.loc[wildcards.sample,"metadata"]
}

# get paths to determine external cluster indices
def get_external_validation_paths(wildcards):
return {'clusterings': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample, "metadata_clusterings.csv"),
return {'clusterings': os.path.join(result_path,wildcards.sample, "metadata_clusterings.csv"),
'metadata': annot.loc[wildcards.sample,"metadata"]
}

# get paths to determine internal cluster indices
def get_internal_validation_paths(wildcards):
return {#'data': annot.loc[wildcards.sample,'data'],
'metadata': annot.loc[wildcards.sample,"metadata"],
'clusterings': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample, "metadata_clusterings.csv"),
'pca': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'PCA','PCA_default_data.csv'),
'pca_var': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'PCA','PCA_default_var.csv')
'clusterings': os.path.join(result_path,wildcards.sample, "metadata_clusterings.csv"),
'pca': os.path.join(result_path,wildcards.sample,'PCA','PCA_{}_{}_data.csv'.format(config["pca"]["svd_solver"],config["pca"]["n_components"])),
'pca_var': os.path.join(result_path,wildcards.sample,'PCA','PCA_{}_{}_var.csv'.format(config["pca"]["svd_solver"],config["pca"]["n_components"]))
}

# for plotting heatmaps of cluster indices
def get_validation_paths(wildcards):
if wildcards.type=="external":
return {
idx: os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample, "cluster_validation", "external_index_{}.csv".format(idx)) for idx in indices_external
idx: os.path.join(result_path,wildcards.sample, "cluster_validation", "external_index_{}.csv".format(idx)) for idx in indices_external
}
else:
return {"ranked_internal_indices": os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample, "cluster_validation", "internal_indices_ranked.csv")}
return {"ranked_internal_indices": os.path.join(result_path,wildcards.sample, "cluster_validation", "internal_indices_ranked.csv")}

Loading

0 comments on commit 12f669f

Please sign in to comment.