improve heatmap performance by distribution of computations, downsampling and feature selection #4
epigen · Jun 28, 2024 · 12f669f · 12f669f
1 parent 2f6f049
commit 12f669f
Show file tree

Hide file tree

Showing 13 changed files with 216 additions and 96 deletions.
diff --git a/config/config.yaml b/config/config.yaml
@@ -35,12 +35,14 @@ umap:
 
 ##### HEATMAP #####
 # information on the ComplexHeatmap parameters: https://jokergoo.github.io/ComplexHeatmap-reference/book/index.html
-# distance metrics: for rows and columns. all metrics that are supported by stats::dist() (https://www.rdocumentation.org/packages/stats/versions/3.6.2/topics/dist) and 'pearson','spearman', and 'kendall'.
+# distance metrics: for rows and columns. all metrics that are supported by scipy.spatial.distance.pdist (https://docs.scipy.org/doc/scipy-1.14.0/reference/generated/scipy.spatial.distance.pdist.html)
 # clustering methods: methods for hierarchical clustering that are supported by stats::hclust() (https://www.rdocumentation.org/packages/stats/versions/3.6.2/topics/hclust)
 # it is the most resource (memory) intensive method, leave empty [] if not required
 heatmap:
-    metrics: ['spearman']
+    metrics: ['correlation','cosine']
     hclust_methods: ['complete']
+    n_observations: 1000 # randomly sampled observations: proportion as float [0-1] or absolute number as integer
+    n_features: 0.5 # highly variable features: percentage as float [0-1] or absolute number as integer
 
 ##### LEIDEN #####
 # Leiden clustering applied on UMAP KNN graphs specified by the respective parameters (metric, n_neighbors).

diff --git a/workflow/Snakefile b/workflow/Snakefile
@@ -142,7 +142,12 @@ rule all:
                             n_components=[dims for dims in config["umap"]["n_components"] if dims in [2,3]]
                                ) if 2 in config["umap"]["n_components"] or 3 in config["umap"]["n_components"] else [],
         # Heatmap
-        heatmap_plots = expand(os.path.join(result_path,'{sample}','Heatmap','plots','Heatmap_{method}_{metric}.png'),
+#         distance_matrices = expand(os.path.join(result_path,'{sample}','Heatmap','DistanceMatrix_{metric}_{type}.csv'),
+#                                sample=list(annot.index),
+#                                metric=config["heatmap"]["metrics"],
+#                                type=["observations","features"],
+#                           ),
+        heatmap_plots = expand(os.path.join(result_path,'{sample}','Heatmap','plots','Heatmap_{metric}_{method}.png'),
                                sample=list(annot.index),
                                method=config["heatmap"]["hclust_methods"],
                                metric=config["heatmap"]["metrics"],
@@ -161,7 +166,7 @@ rule all:
                                              sample=list(annot.index),
                                              index_type = ["external", "internal"] if config["sample_proportion"]>0 else ["external"],
                                             ) if len(cluster_methods)>0 else [],
-        envs = expand(os.path.join(config["result_path"],'envs',module_name,'{env}.yaml'),env=['clusterCrit','clustree','ComplexHeatmap','ggplot','umap_leiden','plotly','pymcdm','sklearn']),
+        envs = expand(os.path.join(config["result_path"],'envs',module_name,'{env}.yaml'),env=['clusterCrit','clustree','ComplexHeatmap','ggplot','umap_leiden','plotly','pymcdm']),
         configs = os.path.join(config["result_path"],'configs',module_name,'{}_config.yaml'.format(config["project_name"])),
         annotations = os.path.join(config["result_path"],'configs',module_name,'{}_annot.csv'.format(config["project_name"])),
     resources:

diff --git a/workflow/envs/fastdist_UNUSED.yaml b/workflow/envs/fastdist_UNUSED.yaml
@@ -0,0 +1,12 @@
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - scikit-learn
+  - pandas=1.5.0
+  - numpy
+  - numba
+  - pip
+  - pip:
+    - fastdist==1.1.6
diff --git a/workflow/envs/sklearn.yaml → workflow/envs/sklearn_UNUSED.yaml b/workflow/envs/sklearn.yaml → workflow/envs/sklearn_UNUSED.yaml
diff --git a/workflow/envs/umap_unused.yaml → workflow/envs/umap_UNUSED.yaml b/workflow/envs/umap_unused.yaml → workflow/envs/umap_UNUSED.yaml
diff --git a/workflow/rules/cluster_validation.smk b/workflow/rules/cluster_validation.smk
@@ -82,7 +82,7 @@ rule validation_external:
         mem_mb=config.get("mem", "16000"),
     threads: config.get("threads", 1)
     conda:
-        "../envs/sklearn.yaml"
+        "../envs/umap_leiden.yaml"
     log:
         os.path.join("logs","rules","validation_external_{sample}.log"),
     params:

diff --git a/workflow/rules/clustering.smk b/workflow/rules/clustering.smk
@@ -32,7 +32,7 @@ rule clustification:
         mem_mb=config.get("mem", "16000"),
     threads: 8#config.get("threads", 1)
     conda:
-        "../envs/sklearn.yaml"
+        "../envs/umap_leiden.yaml"
     log:
         os.path.join("logs","rules","clustification_{sample}_clusterings.log"),
     params:

diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
@@ -14,66 +14,59 @@ def get_data_orientation(wildcards):
 
 def get_umap_sample_paths(wildcards):
     return [annot.loc[wildcards.sample,'data'],
-           os.path.join(config["result_path"],'unsupervised_analysis','{}'.format(wildcards.sample),'UMAP','UMAP_{}_'.format(wildcards.metric)+'{}'.format(max(config["umap"]["n_neighbors"]))+'_graph.pickle')]
+           os.path.join(result_path,'{}'.format(wildcards.sample),'UMAP','UMAP_{}_'.format(wildcards.metric)+'{}'.format(max(config["umap"]["n_neighbors"]))+'_graph.pickle')]
 
 def get_dimred_paths(wildcards):
     path_dict = {}
 
     if wildcards.method=="PCA":
-        path_dict['dimred_data'] = os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'PCA','PCA_{wildcards.parameters}_data_small.csv'.format(wildcards=wildcards))
-        path_dict['dimred_axes'] = os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'PCA','PCA_{wildcards.parameters}_axes.csv'.format(wildcards=wildcards))
-        path_dict['dimred_var'] = os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'PCA','PCA_{wildcards.parameters}_var.csv'.format(wildcards=wildcards))
-        path_dict['dimred_loadings'] = os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'PCA','PCA_{wildcards.parameters}_loadings_small.csv'.format(wildcards=wildcards))
-#         return {
-#             'dimred_data': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'PCA','PCA_{wildcards.parameters}_data_small.csv'.format(wildcards=wildcards)),
-#             'dimred_axes': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'PCA','PCA_{wildcards.parameters}_axes.csv'.format(wildcards=wildcards)),
-#             'dimred_var': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'PCA','PCA_{wildcards.parameters}_var.csv'.format(wildcards=wildcards)),
-#             'dimred_loadings': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'PCA','PCA_{wildcards.parameters}_loadings_small.csv'.format(wildcards=wildcards)),
-#             'metadata': annot.loc[wildcards.sample,"metadata"],
-#             'metadata_features': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'metadata_features.csv')
-#                }
+        path_dict['dimred_data'] = os.path.join(result_path,wildcards.sample,'PCA','PCA_{wildcards.parameters}_data_small.csv'.format(wildcards=wildcards))
+        path_dict['dimred_axes'] = os.path.join(result_path,wildcards.sample,'PCA','PCA_{wildcards.parameters}_axes.csv'.format(wildcards=wildcards))
+        path_dict['dimred_var'] = os.path.join(result_path,wildcards.sample,'PCA','PCA_{wildcards.parameters}_var.csv'.format(wildcards=wildcards))
+        path_dict['dimred_loadings'] = os.path.join(result_path,wildcards.sample,'PCA','PCA_{wildcards.parameters}_loadings_small.csv'.format(wildcards=wildcards))
     else:
-        path_dict['dimred_data'] = os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,wildcards.method,'{wildcards.method}_{wildcards.parameters}_{wildcards.n_components}_data.csv'.format(wildcards=wildcards))
-        path_dict['dimred_axes'] = os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,wildcards.method,'{wildcards.method}_{wildcards.parameters}_{wildcards.n_components}_axes.csv'.format(wildcards=wildcards))
-#         return {
-#             'dimred_data': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,wildcards.method,'{wildcards.method}_{wildcards.parameters}_{wildcards.n_components}_data.csv'.format(wildcards=wildcards)),
-#             'dimred_axes': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,wildcards.method,'{wildcards.method}_{wildcards.parameters}_{wildcards.n_components}_axes.csv'.format(wildcards=wildcards)),
-#             'metadata': annot.loc[wildcards.sample,"metadata"],
-#             'metadata_features': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'metadata_features.csv')
-#                }
+        path_dict['dimred_data'] = os.path.join(result_path,wildcards.sample,wildcards.method,'{wildcards.method}_{wildcards.parameters}_{wildcards.n_components}_data.csv'.format(wildcards=wildcards))
+        path_dict['dimred_axes'] = os.path.join(result_path,wildcards.sample,wildcards.method,'{wildcards.method}_{wildcards.parameters}_{wildcards.n_components}_axes.csv'.format(wildcards=wildcards))
 
     # add metadata
     path_dict['metadata'] = annot.loc[wildcards.sample,"metadata"]
     # add features
-    path_dict['metadata_features'] = os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'metadata_features.csv')
+    path_dict['metadata_features'] = os.path.join(result_path,wildcards.sample,'metadata_features.csv')
     # add clustering results
     if len(cluster_methods) > 0:
-        path_dict['metadata_clusterings'] = os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'metadata_clusterings.csv')
+        path_dict['metadata_clusterings'] = os.path.join(result_path,wildcards.sample,'metadata_clusterings.csv')
 
     return path_dict
 
 def get_dimred_features_paths(wildcards):
 
     if wildcards.method=="PCA":
         return {
-            'dimred_data': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'PCA','PCA_{wildcards.parameters}_data_small.csv'.format(wildcards=wildcards)),
-            'dimred_axes': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'PCA','PCA_{wildcards.parameters}_axes.csv'.format(wildcards=wildcards)),
-            'metadata': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'metadata_features.csv')
+            'dimred_data': os.path.join(result_path,wildcards.sample,'PCA','PCA_{wildcards.parameters}_data_small.csv'.format(wildcards=wildcards)),
+            'dimred_axes': os.path.join(result_path,wildcards.sample,'PCA','PCA_{wildcards.parameters}_axes.csv'.format(wildcards=wildcards)),
+            'metadata': os.path.join(result_path,wildcards.sample,'metadata_features.csv')
                }
     else:
         return {
-            'dimred_data': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,wildcards.method,'{wildcards.method}_{wildcards.parameters}_{wildcards.n_components}_data.csv'.format(wildcards=wildcards)),
-            'dimred_axes': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,wildcards.method,'{wildcards.method}_{wildcards.parameters}_{wildcards.n_components}_axes.csv'.format(wildcards=wildcards)),
-            'metadata': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'metadata_features.csv')
+            'dimred_data': os.path.join(result_path,wildcards.sample,wildcards.method,'{wildcards.method}_{wildcards.parameters}_{wildcards.n_components}_data.csv'.format(wildcards=wildcards)),
+            'dimred_axes': os.path.join(result_path,wildcards.sample,wildcards.method,'{wildcards.method}_{wildcards.parameters}_{wildcards.n_components}_axes.csv'.format(wildcards=wildcards)),
+            'metadata': os.path.join(result_path,wildcards.sample,'metadata_features.csv')
                }
 
+########## HEATMAPS ##########
+def get_heatmap_paths(wildcards):
+    return {'data': annot.loc[wildcards.sample,'data'],
+           'metadata': annot.loc[wildcards.sample,"metadata"],
+            'observations_distance': os.path.join(result_path,wildcards.sample,'Heatmap','DistanceMatrix_{wildcards.metric}_observations.csv'.format(wildcards=wildcards)),
+            'features_distance': os.path.join(result_path,wildcards.sample,'Heatmap','DistanceMatrix_{wildcards.metric}_features.csv'.format(wildcards=wildcards)),
+           }
 
 ########## CLUSTERING ##########
 
 # get paths for clustification
 def get_clustification_paths(wildcards):
     return [annot.loc[wildcards.sample,'data'],
-            os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'{}'.format(config["clustification"]["method"]),'{}_clusterings.csv'.format(config["clustification"]["method"]))
+            os.path.join(result_path,wildcards.sample,'{}'.format(config["clustification"]["method"]),'{}_clusterings.csv'.format(config["clustification"]["method"]))
            ]
 
 # get all clustering results of one method to be aggregated into {method}/{method}_clusterings.csv
@@ -90,33 +83,31 @@ def get_clustering_paths(wildcards):
             else:
                 leiden_parameters.append("{}_NA".format(partition_type))
 
-        path_list = path_list + expand(os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'Leiden','Leiden_{metric}_{n_neighbors}_{leiden_parameters}_clustering.csv'),
+        path_list = path_list + expand(os.path.join(result_path,wildcards.sample,'Leiden','Leiden_{metric}_{n_neighbors}_{leiden_parameters}_clustering.csv'),
                                 metric=config["leiden"]["metrics"],
                                 n_neighbors=config["leiden"]["n_neighbors"],
                                 leiden_parameters=leiden_parameters,
-#                                 partition_type=config["leiden"]["partition_types"],
-#                                 resolution=config["leiden"]["resolutions"]
                                    )
     return path_list
 
 # get all aggregated clustering results across methods to be aggregated into {sample}/metadata_clusterings.csv
 def get_aggregated_clustering_paths(wildcards):
-    return expand(os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'{method}','{method}_clusterings.csv'), method=cluster_methods)
+    return expand(os.path.join(result_path,wildcards.sample,'{method}','{method}_clusterings.csv'), method=cluster_methods)
 
 # get the aggregated clustering results across methods for visualization
 def get_metadata_clustering_paths(wildcards):
 
     if wildcards.method=="PCA":
         return {
-            'dimred_data': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'PCA','PCA_{wildcards.parameters}_data_small.csv'.format(wildcards=wildcards)),
-            'dimred_axes': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'PCA','PCA_{wildcards.parameters}_axes.csv'.format(wildcards=wildcards)),
-            'metadata': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'metadata_clusterings.csv')
+            'dimred_data': os.path.join(result_path,wildcards.sample,'PCA','PCA_{wildcards.parameters}_data_small.csv'.format(wildcards=wildcards)),
+            'dimred_axes': os.path.join(result_path,wildcards.sample,'PCA','PCA_{wildcards.parameters}_axes.csv'.format(wildcards=wildcards)),
+            'metadata': os.path.join(result_path,wildcards.sample,'metadata_clusterings.csv')
                }
     else:
         return {
-            'dimred_data': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,wildcards.method,'{wildcards.method}_{wildcards.parameters}_{wildcards.n_components}_data.csv'.format(wildcards=wildcards)),
-            'dimred_axes': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,wildcards.method,'{wildcards.method}_{wildcards.parameters}_{wildcards.n_components}_axes.csv'.format(wildcards=wildcards)),
-            'metadata': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'metadata_clusterings.csv')
+            'dimred_data': os.path.join(result_path,wildcards.sample,wildcards.method,'{wildcards.method}_{wildcards.parameters}_{wildcards.n_components}_data.csv'.format(wildcards=wildcards)),
+            'dimred_axes': os.path.join(result_path,wildcards.sample,wildcards.method,'{wildcards.method}_{wildcards.parameters}_{wildcards.n_components}_axes.csv'.format(wildcards=wildcards)),
+            'metadata': os.path.join(result_path,wildcards.sample,'metadata_clusterings.csv')
                }
 
 ########## CLUSTER VALIDATION ##########
@@ -126,36 +117,36 @@ def get_clustree_paths(wildcards):
 
     if wildcards.content=="features":
         return {
-            'metadata_clustering': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample, "metadata_clusterings.csv"),
-            'metadata': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'metadata_features.csv')
+            'metadata_clustering': os.path.join(result_path,wildcards.sample, "metadata_clusterings.csv"),
+            'metadata': os.path.join(result_path,wildcards.sample,'metadata_features.csv')
         }
     else:
         return {
-            'metadata_clustering': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample, "metadata_clusterings.csv"),
+            'metadata_clustering': os.path.join(result_path,wildcards.sample, "metadata_clusterings.csv"),
             'metadata': annot.loc[wildcards.sample,"metadata"]
         }
 
 # get paths to determine external cluster indices
 def get_external_validation_paths(wildcards):
-    return {'clusterings': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample, "metadata_clusterings.csv"),
+    return {'clusterings': os.path.join(result_path,wildcards.sample, "metadata_clusterings.csv"),
             'metadata': annot.loc[wildcards.sample,"metadata"]
            }
 
 # get paths to determine internal cluster indices
 def get_internal_validation_paths(wildcards):
     return {#'data': annot.loc[wildcards.sample,'data'],
             'metadata': annot.loc[wildcards.sample,"metadata"],
-            'clusterings': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample, "metadata_clusterings.csv"),
-            'pca': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'PCA','PCA_default_data.csv'),
-            'pca_var': os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample,'PCA','PCA_default_var.csv')
+            'clusterings': os.path.join(result_path,wildcards.sample, "metadata_clusterings.csv"),
+            'pca': os.path.join(result_path,wildcards.sample,'PCA','PCA_{}_{}_data.csv'.format(config["pca"]["svd_solver"],config["pca"]["n_components"])),
+            'pca_var': os.path.join(result_path,wildcards.sample,'PCA','PCA_{}_{}_var.csv'.format(config["pca"]["svd_solver"],config["pca"]["n_components"]))
            }
 
 # for plotting heatmaps of cluster indices
 def get_validation_paths(wildcards):
     if wildcards.type=="external":
         return {
-            idx: os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample, "cluster_validation", "external_index_{}.csv".format(idx)) for idx in indices_external
+            idx: os.path.join(result_path,wildcards.sample, "cluster_validation", "external_index_{}.csv".format(idx)) for idx in indices_external
         }
     else:
-        return {"ranked_internal_indices": os.path.join(config["result_path"],'unsupervised_analysis',wildcards.sample, "cluster_validation", "internal_indices_ranked.csv")}
+        return {"ranked_internal_indices": os.path.join(result_path,wildcards.sample, "cluster_validation", "internal_indices_ranked.csv")}