cleanup

cbg-ethz · Jun 7, 2024 · 0ce2ee8 · 0ce2ee8
1 parent 6cc4408
commit 0ce2ee8
Show file tree

Hide file tree

Showing 60 changed files with 1,251 additions and 5,335 deletions.
diff --git a/.gitignore b/.gitignore
@@ -7,3 +7,5 @@ experiments/.snakemake/*
 .Rapp.history
 *.Rhistory
 experiments/data/markdowns/
+experiments/data/htmls/*files
+experiments/logs/
diff --git a/CTC_SCITE_katharina/Beta_Distr.h → CTC_SCITE/Beta_Distr.h b/CTC_SCITE_katharina/Beta_Distr.h → CTC_SCITE/Beta_Distr.h
diff --git a/CTC_SCITE_katharina/CTC_SCITE → CTC_SCITE/CTC_SCITE b/CTC_SCITE_katharina/CTC_SCITE → CTC_SCITE/CTC_SCITE
diff --git a/CTC_SCITE_katharina/CTC_SCITE_PAR → CTC_SCITE/CTC_SCITE_PAR b/CTC_SCITE_katharina/CTC_SCITE_PAR → CTC_SCITE/CTC_SCITE_PAR
diff --git a/CTC_SCITE_katharina/CTC_treeScoring.cpp → CTC_SCITE/CTC_treeScoring.cpp b/CTC_SCITE_katharina/CTC_treeScoring.cpp → CTC_SCITE/CTC_treeScoring.cpp
diff --git a/CTC_SCITE_katharina/CTC_treeScoring.h → CTC_SCITE/CTC_treeScoring.h b/CTC_SCITE_katharina/CTC_treeScoring.h → CTC_SCITE/CTC_treeScoring.h
diff --git a/CTC_SCITE_katharina/README → CTC_SCITE/README b/CTC_SCITE_katharina/README → CTC_SCITE/README
diff --git a/CTC_SCITE_katharina/binTree_output.cpp → CTC_SCITE/binTree_output.cpp b/CTC_SCITE_katharina/binTree_output.cpp → CTC_SCITE/binTree_output.cpp
diff --git a/CTC_SCITE_katharina/binTree_output.h → CTC_SCITE/binTree_output.h b/CTC_SCITE_katharina/binTree_output.h → CTC_SCITE/binTree_output.h
diff --git a/CTC_SCITE_katharina/doublets.cpp → CTC_SCITE/doublets.cpp b/CTC_SCITE_katharina/doublets.cpp → CTC_SCITE/doublets.cpp
diff --git a/CTC_SCITE_katharina/doublets.h → CTC_SCITE/doublets.h b/CTC_SCITE_katharina/doublets.h → CTC_SCITE/doublets.h
diff --git a/CTC_SCITE_katharina/enum.cpp → CTC_SCITE/enum.cpp b/CTC_SCITE_katharina/enum.cpp → CTC_SCITE/enum.cpp
diff --git a/CTC_SCITE_katharina/enum.h → CTC_SCITE/enum.h b/CTC_SCITE_katharina/enum.h → CTC_SCITE/enum.h
diff --git a/CTC_SCITE_katharina/findBestTrees.cpp → CTC_SCITE/findBestTrees.cpp b/CTC_SCITE_katharina/findBestTrees.cpp → CTC_SCITE/findBestTrees.cpp
diff --git a/CTC_SCITE_katharina/matrices.cpp → CTC_SCITE/matrices.cpp b/CTC_SCITE_katharina/matrices.cpp → CTC_SCITE/matrices.cpp
diff --git a/CTC_SCITE_katharina/matrices.h → CTC_SCITE/matrices.h b/CTC_SCITE_katharina/matrices.h → CTC_SCITE/matrices.h
diff --git a/CTC_SCITE_katharina/mcmc.cpp → CTC_SCITE/mcmc.cpp b/CTC_SCITE_katharina/mcmc.cpp → CTC_SCITE/mcmc.cpp
diff --git a/CTC_SCITE_katharina/mcmc.h → CTC_SCITE/mcmc.h b/CTC_SCITE_katharina/mcmc.h → CTC_SCITE/mcmc.h
diff --git a/CTC_SCITE_katharina/mcmcBinTreeMove.cpp → CTC_SCITE/mcmcBinTreeMove.cpp b/CTC_SCITE_katharina/mcmcBinTreeMove.cpp → CTC_SCITE/mcmcBinTreeMove.cpp
diff --git a/CTC_SCITE_katharina/mcmcBinTreeMove.h → CTC_SCITE/mcmcBinTreeMove.h b/CTC_SCITE_katharina/mcmcBinTreeMove.h → CTC_SCITE/mcmcBinTreeMove.h
diff --git a/CTC_SCITE_katharina/mcmcTreeMove.cpp → CTC_SCITE/mcmcTreeMove.cpp b/CTC_SCITE_katharina/mcmcTreeMove.cpp → CTC_SCITE/mcmcTreeMove.cpp
diff --git a/CTC_SCITE_katharina/mcmcTreeMove.h → CTC_SCITE/mcmcTreeMove.h b/CTC_SCITE_katharina/mcmcTreeMove.h → CTC_SCITE/mcmcTreeMove.h
diff --git a/CTC_SCITE_katharina/output.cpp → CTC_SCITE/output.cpp b/CTC_SCITE_katharina/output.cpp → CTC_SCITE/output.cpp
diff --git a/CTC_SCITE_katharina/output.h → CTC_SCITE/output.h b/CTC_SCITE_katharina/output.h → CTC_SCITE/output.h
diff --git a/CTC_SCITE_katharina/rand.cpp → CTC_SCITE/rand.cpp b/CTC_SCITE_katharina/rand.cpp → CTC_SCITE/rand.cpp
diff --git a/CTC_SCITE_katharina/rand.h → CTC_SCITE/rand.h b/CTC_SCITE_katharina/rand.h → CTC_SCITE/rand.h
diff --git a/CTC_SCITE_katharina/recMut.cpp → CTC_SCITE/recMut.cpp b/CTC_SCITE_katharina/recMut.cpp → CTC_SCITE/recMut.cpp
diff --git a/CTC_SCITE_katharina/recMut.h → CTC_SCITE/recMut.h b/CTC_SCITE_katharina/recMut.h → CTC_SCITE/recMut.h
diff --git a/CTC_SCITE_katharina/scoreBinTree.cpp → CTC_SCITE/scoreBinTree.cpp b/CTC_SCITE_katharina/scoreBinTree.cpp → CTC_SCITE/scoreBinTree.cpp
diff --git a/CTC_SCITE_katharina/scoreBinTree.h → CTC_SCITE/scoreBinTree.h b/CTC_SCITE_katharina/scoreBinTree.h → CTC_SCITE/scoreBinTree.h
diff --git a/CTC_SCITE_katharina/scoreTree.cpp → CTC_SCITE/scoreTree.cpp b/CTC_SCITE_katharina/scoreTree.cpp → CTC_SCITE/scoreTree.cpp
diff --git a/CTC_SCITE_katharina/scoreTree.h → CTC_SCITE/scoreTree.h b/CTC_SCITE_katharina/scoreTree.h → CTC_SCITE/scoreTree.h
diff --git a/CTC_SCITE_katharina/treelist.cpp → CTC_SCITE/treelist.cpp b/CTC_SCITE_katharina/treelist.cpp → CTC_SCITE/treelist.cpp
diff --git a/CTC_SCITE_katharina/treelist.h → CTC_SCITE/treelist.h b/CTC_SCITE_katharina/treelist.h → CTC_SCITE/treelist.h
diff --git a/CTC_SCITE_katharina/trees.cpp → CTC_SCITE/trees.cpp b/CTC_SCITE_katharina/trees.cpp → CTC_SCITE/trees.cpp
diff --git a/CTC_SCITE_katharina/trees.h → CTC_SCITE/trees.h b/CTC_SCITE_katharina/trees.h → CTC_SCITE/trees.h
diff --git a/README → README.md b/README → README.md
diff --git a/cancel_cluster_jobs.sh b/cancel_cluster_jobs.sh
diff --git a/compare_filters.py b/compare_filters.py
diff --git a/experiments/config/config.yaml b/experiments/config/config.yaml
@@ -1,2 +1,3 @@
+#sample: ['Lu2']
 sample: ['Br11', 'Br7', 'Br61', 'Br38', 'LM2', 'Pr9', 'Br23','Br39', 'Br57', 'Lu2', 'Br16_AC', 'Br16_B', 'Br16_C', 'Br26', 'Br44', 'Lu7', 'Br30', 'Br45', 'Ov8', 'Br37', 'Br46', 'Brx50', 'Pr6']
 author: Johannes Gawron
diff --git a/experiments/data/htmls/Br11.html b/experiments/data/htmls/Br11.html
diff --git a/experiments/data/htmls/Br16_AC.html b/experiments/data/htmls/Br16_AC.html
diff --git a/experiments/data/htmls/Br16_B.html b/experiments/data/htmls/Br16_B.html
diff --git a/experiments/data/htmls/Br23.html b/experiments/data/htmls/Br23.html
diff --git a/experiments/data/htmls/Br26.html b/experiments/data/htmls/Br26.html
diff --git a/experiments/data/htmls/Br37.html b/experiments/data/htmls/Br37.html
diff --git a/experiments/data/htmls/Br38.html b/experiments/data/htmls/Br38.html
diff --git a/experiments/data/htmls/Br39.html b/experiments/data/htmls/Br39.html
diff --git a/experiments/data/htmls/Br44.html b/experiments/data/htmls/Br44.html
diff --git a/experiments/data/htmls/Br45.html b/experiments/data/htmls/Br45.html
diff --git a/experiments/data/htmls/Br46.html b/experiments/data/htmls/Br46.html
diff --git a/experiments/data/htmls/Br61.html b/experiments/data/htmls/Br61.html
diff --git a/experiments/data/htmls/Brx50.html b/experiments/data/htmls/Brx50.html
diff --git a/experiments/data/htmls/LM2.html b/experiments/data/htmls/LM2.html
diff --git a/experiments/data/htmls/Lu2.html b/experiments/data/htmls/Lu2.html
diff --git a/experiments/data/htmls/Ov8.html b/experiments/data/htmls/Ov8.html
diff --git a/experiments/data/htmls/Pr9.html b/experiments/data/htmls/Pr9.html
diff --git a/experiments/workflow/resources/functions.R b/experiments/workflow/resources/functions.R
@@ -234,40 +234,44 @@ produce_Distance_Posterior <- function(leaf1, leaf2,postSampling, treeName,nCell
 
   tryCatch(
     expr = {
-      plot(
-        ggplot(data, aes(x = StatisticsOfMutationPlacement)) +
-          geom_histogram(bins = 10, fill = "skyblue", color = "skyblue", alpha = 0.7)+ 
-          xlab("S") + ylab("total count") +
-          ggtitle("Posterior sampling of branching probabilites") +
-          geom_vline(xintercept = mean(StatisticsOfMutationPlacement),color = "blue", linetype = "dashed", linewidth = 1) +
-          labs(subtitle = sprintf("Tree %s - %s", treeName, clusterName),caption = "mean indicated by dashed blue line") +
-          theme_minimal() +
-          theme(
-            plot.title = element_text(size = 20, face = "bold"),
-            axis.title.x = element_text(size = 18),
-            axis.title.y = element_text(size = 18),
-            plot.subtitle = element_text(size= 18),
-            axis.text = element_text(size = 16) 
-          )
-      ) 
+      histo <- ggplot(data, aes(x = StatisticsOfMutationPlacement)) +
+        geom_histogram(bins = 10, fill = "skyblue", color = "skyblue", alpha = 0.7)+ 
+        xlab("Splitting score") + ylab("total count") +
+        ggtitle("Posterior sampling of branching probabilites") +
+        geom_vline(xintercept = mean(StatisticsOfMutationPlacement),color = "blue", linetype = "dashed", linewidth = 1) +
+        labs(subtitle = sprintf("Tree %s - %s", treeName, clusterName)) +
+        theme_minimal() +
+        theme(
+          plot.title = element_text(size = 20, face = "bold"),
+          axis.title.x = element_text(size = 18),
+          axis.title.y = element_text(size = 18),
+          plot.subtitle = element_text(size= 18),
+          axis.text = element_text(size = 16) 
+        )
+      hist_data <- ggplot_build(histo)$data[[1]]
+      max_y <- max(hist_data$count)
+      histo <- histo + annotate("text", x = mean(StatisticsOfMutationPlacement) + 0.08, y = 0.9 * max_y, label="mean",  color = "blue", size = 7)
+      print(histo)
     },
     error = function(e){
-      plot(
-        ggplot(data, aes(x = log(StatisticsOfMutationPlacement))) +
-          geom_histogram(bins = 10, fill = "skyblue", color = "skyblue", alpha = 0.7)+ 
-          xlab("Maximal probability of branching evolution") + ylab("total count") +
-          ggtitle("Posterior sampling of branching probabilites - Logarithmic Scale") +
-          geom_vline(xintercept = log(mean(StatisticsOfMutationPlacement)),color = "blue", linetype = "dashed", linewidth = 1) +
-          labs(subtitle = sprintf("Tree %s - %s", treeName, clusterName),caption = "mean indicated by dashed red line") +
-          theme_minimal() +
-          theme(
-            plot.title = element_text(size = 20, face = "bold"),
-            axis.title.x = element_text(size = 18),
-            axis.title.y = element_text(size = 18),
-            plot.subtitle = element_text(size= 18),
-            axis.text = element_text(size = 16) 
-          )
-      )
+      histo <- ggplot(data, aes(x = log(StatisticsOfMutationPlacement))) +
+        geom_histogram(bins = 10, fill = "skyblue", color = "skyblue", alpha = 0.7)+ 
+        xlab("log(Splitting Score") + ylab("total count") +
+        ggtitle("Posterior sampling of branching probabilites - Logarithmic Scale") +
+        geom_vline(xintercept = log(mean(StatisticsOfMutationPlacement)),color = "blue", linetype = "dashed", linewidth = 1) +
+        labs(subtitle = sprintf("Tree %s - %s", treeName, clusterName),caption = "mean indicated by dashed red line") +
+        theme_minimal() +
+        theme(
+          plot.title = element_text(size = 20, face = "bold"),
+          axis.title.x = element_text(size = 18),
+          axis.title.y = element_text(size = 18),
+          plot.subtitle = element_text(size= 18),
+          axis.text = element_text(size = 16) 
+        )
+      hist_data <- ggplot_build(histo)$data[[1]]
+      max_y <- max(hist_data$count)
+      histo <- histo + annotate("text", x = log(mean(StatisticsOfMutationPlacement)) + 0.08, y = 0.9 * max_y, label="log(mean)",  color = "blue", size = 7)
+      print(histo)
     }
   )
 
@@ -424,12 +428,12 @@ computeClusterSplits <- function(sampleDescription, postSampling, treeName, nCel
     )
   }
 
-  plot(
-  splittingProbs %>% group_by(Cluster) %>% summarize(meanSplittingProbability = mean(Splitting_probability)) %>%
-  ggplot(aes(x = Cluster, y = meanSplittingProbability)) +
-    geom_col() +
-    theme_minimal()
-  )
+ # plot(
+#  splittingProbs %>% group_by(Cluster) %>% summarize(meanSplittingProbability = mean(Splitting_probability)) %>%
+#  ggplot(aes(x = Cluster, y = meanSplittingProbability)) +
+#    geom_col() +
+#    theme_minimal()
+ # )
 
   return(list(splittingProbs = splittingProbs, aggregatedBranchingProbabilities = aggregatedProbabilities))
 }

diff --git a/experiments/workflow/resources/template.Rmd b/experiments/workflow/resources/template.Rmd
@@ -2,7 +2,9 @@
 title: "__tree__"
 author: "__author__"
 date: "__date__"
-output: html_document
+output:
+  html_document:
+    keep_md: yes
 ---
 
 ```{r setup, include=FALSE}
@@ -13,7 +15,7 @@ knitr::opts_chunk$set(echo = TRUE)
 
 This code analyses splitting statistics for CTC-clusters.
 
-The analysis takes a list of trees sampled from its posterior distribution as input and samples mutations placements for each of the trees.
+The analysis takes a list of trees sampled from its posterior distribution as input and computes the mutation placement probability distribution for each one of them. From this distribution we derive a score that quantifies the probability that two cells have experienced divergent evolution. This score is called the splitting score.
 
 
 ## Configure the script
@@ -26,11 +28,11 @@ nMutationSamplingEvents <- __nSamplingEvents__
 ```
 
 ## Loading data
-```{r load}
-quiet(source("__functionsScript__"))
+```{r load, results="hide"}
+source("__functionsScript__")
 
 
-input <- quiet(load_data(inputFolder, treeName))
+input <- load_data(inputFolder, treeName)
 ```
 
 
@@ -44,14 +46,14 @@ Column description:
   - color: Indicates the color of the cluster in the tree, as described in the nodeDescription.tsv
           file.
 
-```{r Describe samples}
-print(sampleDescription)
+```{r sample-description}
+print(input$sample_description)
 ```
 
 
 
 ## General overview
-We sample __nSamplingEvents__ many trees.
+We sample __nSamplingEvents__ trees.
 
 For each pair of cells in the same cluster and each sampled tree we compute the splitting score, that is, the probability that the two cells have experienced divergent evolution. A low splitting score (close to 0) indicates that the two cells are likely genealogically closely related, while a high splitting score (close to 1) indicates that the two cells have evolved in a divergent manner.
 
@@ -68,7 +70,7 @@ Finally, we print the empirical distribution of the the splitting scores for all
 The latter is used to specify the cutoff for oligo-clonality: It is defined as the 95%-percentile of the aggregated distribution of splitting scores.
 
 
-```{r}
+```{r computing-simulated-clusters, results="hide", dev='png'}
 cutoffsSplittingProbs <- data.frame(clusterSize = vector(), Cutoff = vector())
 cutoffsBranchingProbabilities <- data.frame(clusterSize = vector(), Cutoff = vector())
 
@@ -77,7 +79,7 @@ for (clusterSize in 2:5){
   {treeNameSimulated <- paste(treeName, clusterSize, sep = '_')
 
 
-  inputSimulated <- quiet(load_data(simulationInputFolder, treeNameSimulated))
+  inputSimulated <- load_data(simulationInputFolder, treeNameSimulated)
 
   sampleDescriptionSimulated <- inputSimulated$sample_description
   
@@ -107,14 +109,14 @@ print(cutoffsBranchingProbabilities)
 
 Now we can compute the aggregated splitting score distributions for each cluster. The distribution's mean is compared to the cutoffs computed above, and if it is higher than the cutoff, we call the cluster oligo-clonal.
 
-```{r}
+```{r computing-real-clusters}
 nTumorClusters <- 0
 nOligoclonalClusters2 <- 0
 splittingSummary2 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector())
 
 for(clusterSize in 2:5){
   try({
-    clusterColor <- sampleDescription %>%
+    clusterColor <- input$sample_description %>%
     filter(WBC ==0 &  color != 'gray93') %>%
     group_by(color) %>%
     filter(n() == clusterSize) %>%
@@ -131,7 +133,7 @@ for(clusterSize in 2:5){
 
       splittingProbs <- mean(distance$splittingProbs$Splitting_probability)
       branchingProbs <- mean(distance$aggregatedBranchingProbabilities)
-    
+      
       nTumorClusters <- nTumorClusters + 1
       oligoclonal <- FALSE
 
@@ -145,7 +147,7 @@ for(clusterSize in 2:5){
 }
 
 
-numberOfCancerClusters <- sampleDescription %>%
+numberOfCancerClusters <- input$sample_description %>%
     filter(WBC ==0 &  color != 'gray93') %>%
     group_by(color) %>%
     filter(n() > 1) %>%

diff --git a/experiments/workflow/rules/base.smk b/experiments/workflow/rules/base.smk
@@ -34,5 +34,5 @@ rule render_markdown_file:
         PROJECT_DIR / 'logs' / 'render_markdown_file.{SAMPLE}.log',
     shell:
         """
-        ( Rscript -e "rmarkdown::render('{input}', output_file = '{output}')" ) %> {log}
+        ( Rscript -e "rmarkdown::render('{input}', output_file = '{output}')" ) &> {log}
         """