From c546d0a4f3f09bfd8de6b767436a7110cebd637b Mon Sep 17 00:00:00 2001
From: JohannesGawron <johannes.gawron@bsse.ethz.ch>
Date: Thu, 9 Jan 2025 11:41:08 +0100
Subject: [PATCH 1/5] bugfix

---
 .../workflow/resources/functions.R                   |  8 +++++++-
 .../workflow/resources/mutations_placement.cpp       |  5 ++++-
 .../workflow/scripts/simulateCTCclusters.R           | 12 +++++++++---
 3 files changed, 20 insertions(+), 5 deletions(-)
diff --git a/experiments/assessing_cluster_clonality/workflow/resources/functions.R b/experiments/assessing_cluster_clonality/workflow/resources/functions.R
index d92dfea..12df533 100755
--- a/experiments/assessing_cluster_clonality/workflow/resources/functions.R
+++ b/experiments/assessing_cluster_clonality/workflow/resources/functions.R
@@ -275,6 +275,7 @@ computeClusterSplits <- function(sampleDescription, postSampling, treeName,
     counter <- 1
     system.time(
       for (it in cellPairSelection) {
+        print(it)
         leaf1 <- which(sampleDescription$ClusterName == it[1]) - 1
         leaf2 <- which(sampleDescription$ClusterName == it[2]) - 1
 
@@ -309,9 +310,11 @@ computeClusterSplits <- function(sampleDescription, postSampling, treeName,
   } else if (class(cellPairSelection) == "character") {
     CTCclusters <- unique(cellPairSelection)
     CTCclusters <- CTCclusters[!(CTCclusters %in% c("ghostwhite", "gray93"))]
+    print(CTCclusters)
+
     system.time(
       for (it in CTCclusters) {
-        cellsInCluster <- which(sampleDescription$color %in% it) - 1
+        cellsInCluster <- which(sampleDescription$color == it) - 1
         ## Make sure array indication is compatible with cpp
         cluster_done <- 0
         for (i in cellsInCluster) {
@@ -332,6 +335,7 @@ computeClusterSplits <- function(sampleDescription, postSampling, treeName,
             print(paste(paste("Computing genomic distances of leaves:", i,
               sep = " "
             ), j, sep = " "))
+
             posterior <- produce_Distance_Posterior(i, j, postSampling,
               treeName, nCells,
               nMutations, nClusters,
@@ -344,6 +348,7 @@ computeClusterSplits <- function(sampleDescription, postSampling, treeName,
                 nMutationSamplingEvents,
               clusterName = it
             )
+
             splittingProbs <- rbind(
               splittingProbs,
               data.frame(
@@ -400,6 +405,7 @@ computeClusterSplits <- function(sampleDescription, postSampling, treeName,
                 nMutationSamplingEvents,
               clusterName = it
             )
+            print("Posterior computed")
             splittingProbs <- rbind(
               splittingProbs,
               data.frame(
diff --git a/experiments/assessing_cluster_clonality/workflow/resources/mutations_placement.cpp b/experiments/assessing_cluster_clonality/workflow/resources/mutations_placement.cpp
index 485f900..6754274 100644
--- a/experiments/assessing_cluster_clonality/workflow/resources/mutations_placement.cpp
+++ b/experiments/assessing_cluster_clonality/workflow/resources/mutations_placement.cpp
@@ -519,6 +519,7 @@ std::vector<std::vector<double>> computeMutationDistribution(
   // double logScore = 0.0;
 
   std::vector<std::vector<double>> logAttachmentScores;
+
   logAttachmentScores.reserve(n + 1);
   for (int mut = 0; mut < n;
        mut++) { // compute score separately for each mutation
@@ -563,9 +564,9 @@ std::vector<std::vector<double>> computeMutationDistribution(
     for (int att = 0; att < (2 * m) - 1; att++) {
       // cout << "wbcBelowCount.at(" << att << ") = " << wbcBelowCount.at(att)
       // << endl;
-
       double logAttachmentScore = 0.0;
       double wbc_penalty = chi_wbc * wbcBelowCount.at(att);
+
       for (int sample = 0; sample < sampleCount; sample++) { // for each sample
         // cout << "sample " << sample << endl;
         int expCount = expVarAlleleCount[sample][att];
@@ -589,6 +590,7 @@ std::vector<std::vector<double>> computeMutationDistribution(
         logAttachmentScore += newScore;
         // cout << newScore << endl;
       }
+
       // cout << "\n mut score: " << logAttachmentScore << endl;
       logAttachmentScores.back().push_back(logAttachmentScore - wbc_penalty);
 
@@ -1028,6 +1030,7 @@ std::vector<std::vector<double>> computePairwiseDistanceOfLeavesGivenTree(
     const std::vector<std::vector<int>> &mutatedReadCounts,
     const std::vector<std::vector<int>> &totalReadCounts,
     const std::vector<bool> &wbcStatus, int nSamplingEvents) {
+
   std::string tree = treeData["Tree"];
 
   // Now need to split the string into single numbers and turn them into
diff --git a/experiments/assessing_cluster_clonality/workflow/scripts/simulateCTCclusters.R b/experiments/assessing_cluster_clonality/workflow/scripts/simulateCTCclusters.R
index e9c3a2c..f502b5a 100644
--- a/experiments/assessing_cluster_clonality/workflow/scripts/simulateCTCclusters.R
+++ b/experiments/assessing_cluster_clonality/workflow/scripts/simulateCTCclusters.R
@@ -6,6 +6,7 @@ library(boot)
 source("functions.R")
 library("optparse")
 
+
 ############
 # Config
 ############
@@ -35,7 +36,7 @@ parser <- add_option(parser, c("-s", "--simulation-cluster-size"),
 )
 parser <- add_option(parser, c("-o", "--output-folder"),
   type = "character",
-  default = "~/Documents/projects/CTC_backup/simulations/simulation2", help = ""
+  default = "~/Documents/projects/CTC_backup/simulations/simulation3", help = ""
 )
 parser <- add_option(parser, c("-m", "--monoclonal"),
   type = "logical",
@@ -511,6 +512,7 @@ simulate_oligoclonals <- function(input, output_directory, number_of_cells, samp
     return(sum(x != y))
   }
 
+  it <- 0
   while (TRUE) { # I sample until I get a cluster where at least two cells are distinct
     # Sample cells to merge
     cells_to_merge <-
@@ -534,6 +536,10 @@ simulate_oligoclonals <- function(input, output_directory, number_of_cells, samp
     if (sum_of_distances > 0) {
       break
     }
+    it <- 1 + it
+    if (it > 100) {
+      break
+    }
   }
 
 
@@ -596,7 +602,7 @@ simulate_oligoclonals <- function(input, output_directory, number_of_cells, samp
   )
 
   write_delim(
-    x = description_data,
+    x = description_data_output_format,
     file = file.path(
       output_directory,
       paste(input$sampleName, output_label, sep = "_"),
@@ -767,7 +773,7 @@ simulateCTCclusters <- function(
 # for (tree in c("Br11", "Br16_AC_max2", "Br16_AC_max3", "Br16_AC_max4", "Br16_B_max1", "Br16_B_max2", "Br16_B_max3", "Br16_B_max4", "Br16_C_max1", "Br16_C_max2", "Br16_C_max3", "Br23", "Br26", "Br30", "Br37", "Br38", "Br39", "Br44", "Br45", "Br46", "Br53", "Br57", "Brx50", "Lu2", "Lu7", "Ov8", "Pr6", "Pr9")) {}
 
 
-cluster_size_vector <- c(0, 4, 3, 2, 2, 2, 2, 2, 2)
+cluster_size_vector <- c(0, 3, 3, 3, 3, 3, 3, 3, 3)
 
 
 print(paste("Running simulation for", tree_name))

From 6225918f8a2cc210ed296dbce33f3dac46ef78e1 Mon Sep 17 00:00:00 2001
From: JohannesGawron <johannes.gawron@bsse.ethz.ch>
Date: Thu, 9 Jan 2025 16:57:29 +0100
Subject: [PATCH 2/5] code refactoring

---
 .pre-commit-config.yaml                       |   1 +
 .../workflow/resources/functions.R            | 573 +++++++++---------
 2 files changed, 289 insertions(+), 285 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index f8a041a..495d4ac 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -7,6 +7,7 @@ repos:
     rev: v0.4.2
     hooks:
       - id: style-files
+        dry: on
       - id: parsable-R  
       - id: lintr
         args: [--warn_only]
diff --git a/experiments/assessing_cluster_clonality/workflow/resources/functions.R b/experiments/assessing_cluster_clonality/workflow/resources/functions.R
index 12df533..785a4df 100755
--- a/experiments/assessing_cluster_clonality/workflow/resources/functions.R
+++ b/experiments/assessing_cluster_clonality/workflow/resources/functions.R
@@ -5,10 +5,8 @@
 library(Rcpp)
 library(tidyverse)
 
-
-
 sourceCpp("../../workflow/resources/mutations_placement.cpp")
-
+util::globalVariables(computePairwiseDistanceOfLeavesGivenTree)
 
 
 #' Takes a list of mutations and outputs which one of these is a driver.
@@ -28,17 +26,19 @@ sourceCpp("../../workflow/resources/mutations_placement.cpp")
 #' @export
 #'
 #' @examples
-IsDriver <- function(mutations, annotations) {
-  annotated_mutations <- annotations %>%
-    filter(annotations$"#CHROM" == as.character(mutations[1]) &
-      annotations$POS == as.numeric(mutations[2]))
+is_driver <- function(mutations, annotations) {
+  annotated_mutations <- annotations |>
+    filter(
+      annotations$"#CHROM" == as.character(mutations[1]) &
+        annotations$POS == as.numeric(mutations[2])
+    )
 
-  check <- annotated_mutations %>%
-    select(c(
+  check <- annotated_mutations |>
+    dplyr::select(c(
       "CGI-Oncogenic Summary", "CGI-Oncogenic Prediction",
       "CGI-External oncogenic annotation"
     )) %in%
-    c("oncogenic (predicted)", "driver (oncodriveMUT)") %>%
+    c("oncogenic (predicted)", "driver (oncodriveMUT)") |>
     sum()
 
   driver <- FALSE
@@ -56,20 +56,20 @@ IsDriver <- function(mutations, annotations) {
 #'
 #' @param leaf1 integer-valued index of first leaf
 #' @param leaf2 integer-valued index of second leaf
-#' @param postSampling loaded list of tibbles tibble containing the posterior
+#' @param post_sampling loaded list of tibbles tibble containing the posterior
 #'                     Sampling
-#' @param treeName character string: Name of the tree for the output plot
-#' @param nCells integer-valued total number of cells in the dataset
-#' @param nMutations integer-valued total number of mutations in the dataset
-#' @param nClusters integer-valued total number of clusters in the dataset
-#' @param alleleCount integer vector of numbers of alleles per clusters
-#' @param ClusterID integer vector of cluster IDs
-#' @param mutatedReadCounts list of integer-valued vectors indicating the number
-#' of mutated read per mutation (list index) and sample (vector index)
-#' @param totalReadCounts list of integer-valued vectors indicating the total
+#' @param tree_name character string: Name of the tree for the output plot
+#' @param n_cells integer-valued total number of cells in the dataset
+#' @param n_mutations integer-valued total number of mutations in the dataset
+#' @param n_clusters integer-valued total number of clusters in the dataset
+#' @param allele_count integer vector of numbers of alleles per clusters
+#' @param cluster_id integer vector of cluster IDs
+#' @param mutated_read_counts list of integer-valued vectors indicating the
+#' number of mutated read per mutation (list index) and sample (vector index)
+#' @param total_read_counts list of integer-valued vectors indicating the total
 #' number of reads per mutation (list index) and sample (vector index)
-#' @param wbcStatus boolean vector of length nCells indicating for each cell if
-#' it is a white blood cell (TRUE) or not (FALSE)
+#' @param wbc_status boolean vector of length n_cells indicating for each cell
+#' if it is a white blood cell (TRUE) or not (FALSE)
 #'
 #' @return splittingFraction: The fraction of sampling events for which the pair
 #'         of cells
@@ -78,126 +78,130 @@ IsDriver <- function(mutations, annotations) {
 #' @export
 #'
 #' @examples
-produce_Distance_Posterior <- function(leaf1, leaf2, postSampling, treeName,
-                                       nCells, nMutations, nClusters,
-                                       alleleCount, ClusterID,
-                                       mutatedReadCounts, totalReadCounts,
-                                       wbcStatus, nSamplingEvents = 20,
-                                       clusterName = "") {
+produce_distance_posterior <- function(leaf1, leaf2, post_sampling, tree_name,
+                                       n_cells, n_mutations, n_clusters,
+                                       allele_count, cluster_id,
+                                       mutated_read_counts, total_read_counts,
+                                       wbc_status, n_sampling_events = 20,
+                                       cluster_name = "") {
   ## For each row in the posterior Sampling file, the distance of two leaves is
   ## computed
 
   print("Computing the posterior distribution")
 
-  distance_statistics <- parallel::mclapply(postSampling,
+  distance_statistics <- parallel::mclapply(post_sampling,
     FUN = computePairwiseDistanceOfLeavesGivenTree, leaf1, leaf2,
-    nCells, nMutations, nClusters, alleleCount,
-    ClusterID, mutatedReadCounts, totalReadCounts, wbcStatus,
-    nSamplingEvents
+    n_cells, n_mutations, n_clusters, allele_count,
+    cluster_id, mutated_read_counts, total_read_counts, wbc_status,
+    n_sampling_events
   )
 
 
-  dist_histogram <- parallel::mclapply(distance_statistics,
-    FUN = function(input_list_elements) {
-      return(input_list_elements[1])
-    }
-  ) %>%
-    unlist()
-
-  totalNumberOfSplits <- parallel::mclapply(distance_statistics,
+  no_splits <- parallel::mclapply(distance_statistics,
     FUN = function(input_list_elements) {
       return(input_list_elements[2])
     }
-  ) %>%
-    unlist() %>%
+  ) |>
+    unlist() |>
     sum()
 
-  StatisticsOfMutationPlacement <- parallel::mclapply(distance_statistics,
+  mutation_placement_stats <- parallel::mclapply(distance_statistics,
     FUN =
       function(input_list_elements) {
         return(input_list_elements[3])
       }
-  ) %>%
+  ) |>
     unlist()
 
 
-  totalNumberOfSamplingEvents <- nSamplingEvents * length(postSampling)
+  no_sampling_events <- n_sampling_events * length(post_sampling)
 
 
   data <-
-    data.frame(StatisticsOfMutationPlacement = StatisticsOfMutationPlacement)
+    data.frame(
+      mutation_placement_stats = mutation_placement_stats
+    )
 
 
-  sum(is.na(data$StatisticsOfMutationPlacement))
-  class(data$StatisticsOfMutationPlacement)
+  sum(is.na(data$mutation_placement_stats))
+  class(data$mutation_placement_stats)
 
 
-  ggplot(data = data, aes(x = StatisticsOfMutationPlacement, y = 1)) +
-    geom_point()
+  ggplot2::ggplot(
+    data = data, ggplot2::aes(x = mutation_placement_stats, y = 1)
+  ) +
+    ggplot2::geom_point()
 
 
   tryCatch(
     expr = {
       histo <-
-        ggplot(data, aes(x = StatisticsOfMutationPlacement)) +
-        geom_histogram(
+        ggplot2::ggplot(
+          data, ggplot2::aes(x = mutation_placement_stats)
+        ) +
+        ggplot2::geom_histogram(
           bins = 10, fill = "skyblue", color = "skyblue", alpha = 0.7
         ) +
-        xlab("Splitting score") +
-        ylab("total count") +
-        ggtitle("Posterior sampling of branching probabilites") +
-        geom_vline(
-          xintercept = mean(StatisticsOfMutationPlacement),
+        ggplot2::xlab("Splitting score") +
+        ggplot2::ylab("total count") +
+        ggplot2::ggtitle("Posterior sampling of branching probabilites") +
+        ggplot2::geom_vline(
+          xintercept = mean(mutation_placement_stats),
           color = "blue", linetype = "dashed", linewidth = 1
         ) +
-        labs(subtitle = sprintf("Tree %s - %s", treeName, clusterName)) +
-        theme_minimal() +
-        theme(
-          plot.title = element_text(size = 20, face = "bold"),
-          axis.title.x = element_text(size = 18),
-          axis.title.y = element_text(size = 18),
-          plot.subtitle = element_text(size = 18),
-          axis.text = element_text(size = 16)
+        ggplot2::labs(
+          subtitle = sprintf("Tree %s - %s", tree_name, cluster_name)
+        ) +
+        ggplot2::theme_minimal() +
+        ggplot2::theme(
+          plot.title = ggplot2::element_text(size = 20, face = "bold"),
+          axis.title.x = ggplot2::element_text(size = 18),
+          axis.title.y = ggplot2::element_text(size = 18),
+          plot.subtitle = ggplot2::element_text(size = 18),
+          axis.text = ggplot2::element_text(size = 16)
         )
-      hist_data <- ggplot_build(histo)$data[[1]]
+      hist_data <- ggplot2::ggplot_build(histo)$data[[1]]
       max_y <- max(hist_data$count)
-      histo <- histo + annotate("text",
-        x = mean(StatisticsOfMutationPlacement) + 0.08,
+      histo <- histo + ggplot2::annotate("text",
+        x = mean(mutation_placement_stats) + 0.08,
         y = 0.9 * max_y, label = "mean", color = "blue",
         size = 7
       )
       print(histo)
     },
     error = function(e) {
-      histo <- ggplot(data, aes(x = log(StatisticsOfMutationPlacement))) +
-        geom_histogram(
+      histo <- ggplot2::ggplot(
+        data, ggplot2::aes(x = log(mutation_placement_stats))
+      ) +
+        ggplot2::geom_histogram(
           bins = 10, fill = "skyblue", color = "skyblue",
           alpha = 0.7
         ) +
-        xlab("log(Splitting Score") +
-        ylab("total count") +
-        ggtitle("Posterior sampling of branching probabilites - Logarithmic
-                Scale") +
-        geom_vline(
-          xintercept = log(mean(StatisticsOfMutationPlacement)),
+        ggplot2::xlab("log(Splitting Score") +
+        ggplot2::ylab("total count") +
+        ggplot2::ggtitle(
+          "Posterior sampling of branching probabilites - Logarithmic Scale"
+        ) +
+        ggplot2::geom_vline(
+          xintercept = log(mean(mutation_placement_stats)),
           color = "blue", linetype = "dashed", linewidth = 1
         ) +
-        labs(
-          subtitle = sprintf("Tree %s - %s", treeName, clusterName),
+        ggplot2::labs(
+          subtitle = sprintf("Tree %s - %s", tree_name, cluster_name),
           caption = "mean indicated by dashed red line"
         ) +
-        theme_minimal() +
-        theme(
-          plot.title = element_text(size = 20, face = "bold"),
-          axis.title.x = element_text(size = 18),
-          axis.title.y = element_text(size = 18),
-          plot.subtitle = element_text(size = 18),
-          axis.text = element_text(size = 16)
+        ggplot2::theme_minimal() +
+        ggplot2::theme(
+          plot.title = ggplot2::element_text(size = 20, face = "bold"),
+          axis.title.x = ggplot2::element_text(size = 18),
+          axis.title.y = ggplot2::element_text(size = 18),
+          plot.subtitle = ggplot2::element_text(size = 18),
+          axis.text = ggplot2::element_text(size = 16)
         )
-      hist_data <- ggplot_build(histo)$data[[1]]
+      hist_data <- ggplot2::ggplot_build(histo)$data[[1]]
       max_y <- max(hist_data$count)
-      histo <- histo + annotate("text",
-        x = log(mean(StatisticsOfMutationPlacement)) +
+      histo <- histo + ggplot2::annotate("text",
+        x = log(mean(mutation_placement_stats)) +
           0.08, y = 0.9 * max_y, label = "log(mean)",
         color = "blue", size = 7
       )
@@ -210,8 +214,8 @@ produce_Distance_Posterior <- function(leaf1, leaf2, postSampling, treeName,
 
   return(list(
     splittingFraction =
-      totalNumberOfSplits / totalNumberOfSamplingEvents,
-    branchingStatistics = StatisticsOfMutationPlacement
+      no_splits / no_sampling_events,
+    branchingStatistics = mutation_placement_stats
   ))
 }
 
@@ -222,26 +226,26 @@ produce_Distance_Posterior <- function(leaf1, leaf2, postSampling, treeName,
 #' those which have been physically split. For each pair of tumour cells from
 #' the same CTC cluster, the distnace postior is computed.
 #'
-#' @param sampleDescription A data frame with the description of each sample.
+#' @param sample_description A data frame with the description of each sample.
 #' Expects the following columns:
 #' Cluster: numeric vector indicating the cluster identity. Physically separated
 #' clusters usually have different cluster identities, but this is not
 #' necessary.
-#' @param postSampling The loaded posterior sampling table.
-#' @param treeName A string with the name of the tree that is output to the
+#' @param post_sampling The loaded posterior sampling table.
+#' @param tree_name A string with the name of the tree that is output to the
 #' plots.
-#' @param nCells The total number of cells in the experiment.
-#' @param nMutations The total number of mutations in the experiment.
-#' @param nClusters The total number of clusters in the experiment.
-#' @param alleleCount A numeric vector which indicates the number of alleles in
+#' @param n_cells The total number of cells in the experiment.
+#' @param n_mutations The total number of mutations in the experiment.
+#' @param n_clusters The total number of clusters in the experiment.
+#' @param allele_count A numeric vector which indicates the number of alleles in
 #' each of the clusters.
-#' @param mutatedReadCounts A tibble containing the mutated reads. Rows are
+#' @param mutated_read_counts A tibble containing the mutated reads. Rows are
 #' mutations and columns are samples (clusters).
-#' @param totalReadCounts A tibble containing the total read counts.
-#' @param nMutationSamplingEvents The number of mutation that should be sampled
-#' per tree.
-#' @param nTreeSamplingEvents The number of trees that should be sampled.
-#' @param cellPairSelection An optional parameter that takes a list of
+#' @param total_read_counts A tibble containing the total read counts.
+#' @param n_mutation_sampling_events The number of mutation that should be
+#' sampled per tree.
+#' @param n_tree_sampling_events The number of trees that should be sampled.
+#' @param cell_pair_selection An optional parameter that takes a list of
 #' pairs of strings-valued names of cells that should be analysed (the names as
 #' in the samples_nodeDescription.tsv file).
 #' It can also take a character vector, in which case the entries should be the
@@ -251,114 +255,118 @@ produce_Distance_Posterior <- function(leaf1, leaf2, postSampling, treeName,
 #' of trees for which they split
 #' aggregatedBranchingProbabilities: a vector of aggregated probabilities for
 #' all considered pairs of leaves and all sampled trees. At the moment only
-#' implement if  cellPairSelection
+#' implement if  cell_pair_selection
 #' parameter is passed to the function.
 #' @export
 #'
 #' @examples
-computeClusterSplits <- function(sampleDescription, postSampling, treeName,
-                                 nCells, nMutations, nClusters, alleleCount,
-                                 mutatedReadCounts, totalReadCounts,
-                                 nMutationSamplingEvents = 1000,
-                                 nTreeSamplingEvents = 500,
-                                 cellPairSelection = NA) {
-  desired_values <- sample(1:length(postSampling),
-    size = nTreeSamplingEvents,
+compute_cluster_splits <- function(sample_description, post_sampling, tree_name,
+                                   n_cells, n_mutations, n_clusters,
+                                   allele_count, mutated_read_counts,
+                                   total_read_counts,
+                                   n_mutation_sampling_events = 1000,
+                                   n_tree_sampling_events = 500,
+                                   cell_pair_selection = NA) {
+  desired_values <- sample(seq_along(post_sampling),
+    size = n_tree_sampling_events,
     replace = FALSE
-  ) %>% sort()
+  ) |> sort()
 
-  postSampling <- postSampling[desired_values]
-  splittingProbs <- matrix(0, nrow = 0, ncol = 2) %>% as.data.frame()
-  colnames(splittingProbs) <- c("Cluster", "Splitting_probability")
-  aggregatedProbabilities <- vector()
-  if (class(cellPairSelection) == "list") {
+  post_sampling <- post_sampling[desired_values]
+  splitting_probs <- matrix(0, nrow = 0, ncol = 2) |> as.data.frame()
+  colnames(splitting_probs) <- c("Cluster", "Splitting_probability")
+  aggregated_probabilities <- vector()
+  if (class(cell_pair_selection) == "list") {
     counter <- 1
     system.time(
-      for (it in cellPairSelection) {
+      for (it in cell_pair_selection) {
         print(it)
-        leaf1 <- which(sampleDescription$ClusterName == it[1]) - 1
-        leaf2 <- which(sampleDescription$ClusterName == it[2]) - 1
-
-        print(paste(paste("Computing genomic distances of leaves:", leaf1,
-          sep = " "
-        ), leaf2, sep = " "))
-        posterior <- produce_Distance_Posterior(leaf1, leaf2, postSampling,
-          treeName, nCells, nMutations,
-          nClusters, alleleCount,
-          sampleDescription$Cluster,
-          mutatedReadCounts,
-          totalReadCounts,
-          sampleDescription$WBC,
-          nSamplingEvents =
-            nMutationSamplingEvents
+        leaf1 <- which(sample_description$ClusterName == it[1]) - 1
+        leaf2 <- which(sample_description$ClusterName == it[2]) - 1
+
+        print(
+          paste(
+            "Computing genomic distances of leaves:", leaf1, leaf2,
+            sep = " "
+          )
+        )
+        posterior <- produce_distance_posterior(leaf1, leaf2, post_sampling,
+          tree_name, n_cells, n_mutations,
+          n_clusters, allele_count,
+          sample_description$Cluster,
+          mutated_read_counts,
+          total_read_counts,
+          sample_description$WBC,
+          n_sampling_events =
+            n_mutation_sampling_events
         )
-        splittingProbs <- rbind(
-          splittingProbs,
+        splitting_probs <- rbind(
+          splitting_probs,
           data.frame(
             Cluster = as.character(counter),
             Splitting_probability =
               posterior$splittingFraction
           )
         )
-        aggregatedProbabilities <- c(
-          aggregatedProbabilities,
+        aggregated_probabilities <- c(
+          aggregated_probabilities,
           posterior$branchingStatistics
         )
         counter <- counter + 1
       }
     )
-  } else if (class(cellPairSelection) == "character") {
-    CTCclusters <- unique(cellPairSelection)
-    CTCclusters <- CTCclusters[!(CTCclusters %in% c("ghostwhite", "gray93"))]
-    print(CTCclusters)
+  } else if (class(cell_pair_selection) == "character") {
+    ctc_clusters <- unique(cell_pair_selection)
+    ctc_clusters <- ctc_clusters[!(ctc_clusters %in% c("ghostwhite", "gray93"))]
+    print(ctc_clusters)
 
     system.time(
-      for (it in CTCclusters) {
-        cellsInCluster <- which(sampleDescription$color == it) - 1
+      for (it in ctc_clusters) {
+        cells_in_cluster <- which(sample_description$color == it) - 1
         ## Make sure array indication is compatible with cpp
         cluster_done <- 0
-        for (i in cellsInCluster) {
+        for (i in cells_in_cluster) {
           if (cluster_done == 1) {
             cluster_done <- 0
             break
           }
-          if (sampleDescription$WBC[i + 1] == 1) next
-          j <- cellsInCluster[1]
+          if (sample_description$WBC[i + 1] == 1) next
+          j <- cells_in_cluster[1]
           while (j < i) {
             if (cluster_done == 1) {
               break
             }
-            if (sampleDescription$WBC[j + 1] == 1) {
+            if (sample_description$WBC[j + 1] == 1) {
               j <- j + 1
               next
             }
-            print(paste(paste("Computing genomic distances of leaves:", i,
-              sep = " "
-            ), j, sep = " "))
-
-            posterior <- produce_Distance_Posterior(i, j, postSampling,
-              treeName, nCells,
-              nMutations, nClusters,
-              alleleCount,
-              sampleDescription$Cluster,
-              mutatedReadCounts,
-              totalReadCounts,
-              sampleDescription$WBC,
-              nSamplingEvents =
-                nMutationSamplingEvents,
-              clusterName = it
+            print(
+              paste("Computing genomic distances of leaves:", i, j, sep = " ")
             )
 
-            splittingProbs <- rbind(
-              splittingProbs,
+            posterior <- produce_distance_posterior(i, j, post_sampling,
+              tree_name, n_cells,
+              n_mutations, n_clusters,
+              allele_count,
+              sample_description$Cluster,
+              mutated_read_counts,
+              total_read_counts,
+              sample_description$WBC,
+              n_sampling_events =
+                n_mutation_sampling_events,
+              cluster_name = it
+            )
+
+            splitting_probs <- rbind(
+              splitting_probs,
               data.frame(
                 Cluster = it,
                 Splitting_probability =
                   posterior$splittingFraction
               )
             )
-            aggregatedProbabilities <- c(
-              aggregatedProbabilities,
+            aggregated_probabilities <- c(
+              aggregated_probabilities,
               posterior$branchingStatistics
             )
             j <- j + 1
@@ -368,46 +376,39 @@ computeClusterSplits <- function(sampleDescription, postSampling, treeName,
       }
     )
   } else {
-    CTCclusters <- unique(sampleDescription$color)
-    CTCclusters <- CTCclusters[!(CTCclusters %in% c("ghostwhite", "gray93"))]
+    ctc_clusters <- unique(sample_description$color)
+    ctc_clusters <- ctc_clusters[!(ctc_clusters %in% c("ghostwhite", "gray93"))]
     system.time(
-      for (it in CTCclusters) {
-        cellsInCluster <- which(sampleDescription$color %in% it) - 1
+      for (it in ctc_clusters) {
+        cells_in_cluster <- which(sample_description$color %in% it) - 1
         ## Make sure array indication is compatible with cpp
-        # cluster_done <- 0
-        for (i in cellsInCluster) {
-          # if(cluster_done == 1){
-          #  cluster_done <- 0
-          #  break
-          # }
-          if (sampleDescription$WBC[i + 1] == 1) next
-          j <- cellsInCluster[1]
+
+        for (i in cells_in_cluster) {
+          if (sample_description$WBC[i + 1] == 1) next
+          j <- cells_in_cluster[1]
           while (j < i) {
-            # if(cluster_done == 1){
-            #  break
-            # }
-            if (sampleDescription$WBC[j + 1] == 1) {
+            if (sample_description$WBC[j + 1] == 1) {
               j <- j + 1
               next
             }
-            print(paste(paste("Computing genomic distances of leaves:", i,
-              sep = " "
-            ), j, sep = " "))
-            posterior <- produce_Distance_Posterior(i, j, postSampling,
-              treeName, nCells,
-              nMutations, nClusters,
-              alleleCount,
-              sampleDescription$Cluster,
-              mutatedReadCounts,
-              totalReadCounts,
-              sampleDescription$WBC,
-              nSamplingEvents =
-                nMutationSamplingEvents,
-              clusterName = it
+            print(
+              paste("Computing genomic distances of leaves:", i, j, sep = " ")
+            )
+            posterior <- produce_distance_posterior(i, j, post_sampling,
+              tree_name, n_cells,
+              n_mutations, n_clusters,
+              allele_count,
+              sample_description$Cluster,
+              mutated_read_counts,
+              total_read_counts,
+              sample_description$WBC,
+              n_sampling_events =
+                n_mutation_sampling_events,
+              cluster_name = it
             )
             print("Posterior computed")
-            splittingProbs <- rbind(
-              splittingProbs,
+            splitting_probs <- rbind(
+              splitting_probs,
               data.frame(
                 Cluster = it,
                 Splitting_probability =
@@ -415,7 +416,6 @@ computeClusterSplits <- function(sampleDescription, postSampling, treeName,
               )
             )
             j <- j + 1
-            # cluster_done <- 1
           }
         }
       }
@@ -425,8 +425,8 @@ computeClusterSplits <- function(sampleDescription, postSampling, treeName,
 
 
   return(list(
-    splittingProbs = splittingProbs,
-    aggregatedBranchingProbabilities = aggregatedProbabilities
+    splitting_probs = splitting_probs,
+    aggregatedBranchingProbabilities = aggregated_probabilities
   ))
 }
 
@@ -435,17 +435,17 @@ computeClusterSplits <- function(sampleDescription, postSampling, treeName,
 
 #' Loads all necessary data for the CTC-project.
 #' Specifically it return a named list as follows:
-#' postSampling: Loads the posterior sampling tsv as a list of named vectors
+#' post_sampling: Loads the posterior sampling tsv as a list of named vectors
 #' with the following columns: the (unnormalised) LogScore, estimated sequencing
 #' error rate, the estimated dropout rate, logTau and the Tree in parent vector
 #' format meaning that the i'th entry of the vector is te parent node of the
 #' entry i.
 #' Nodes are counted from zero and the root is length(Tree)
 #'
-#' @param inputFolder The total number of CTC-clusters
-#' @param treeName
+#' @param input_folder The total number of CTC-clusters
+#' @param tree_name
 #'
-#' @return postSampling: Loads the posterior sampling tsv as a list of named
+#' @return post_sampling: Loads the posterior sampling tsv as a list of named
 #' vectors with the following columns: the (unnormalised) LogScore, estimated
 #' sequencing error rate, the estimated dropout rate, logTau and the Tree in
 #' parent vector format meaning that the i'th entry of the vector is the parent
@@ -453,60 +453,60 @@ computeClusterSplits <- function(sampleDescription, postSampling, treeName,
 #' @export
 #'
 #' @examples
-load_data <- function(inputFolder, treeName) {
+load_data <- function(input_folder, tree_name) {
   ## Define paths
 
-  posteriorSamplingFile <- sprintf(
-    "%s/%s/%s_postSampling.tsv", inputFolder,
-    treeName, treeName
+  posterior_sampling_file <- sprintf(
+    "%s/%s/%s_postSampling.tsv", input_folder,
+    tree_name, tree_name
   )
 
-  countFile <- sprintf("%s/%s/%s.txt", inputFolder, treeName, treeName)
-  descriptionFile <- sprintf(
+  count_file <- sprintf("%s/%s/%s.txt", input_folder, tree_name, tree_name)
+  description_file <- sprintf(
     "%s/%s/%s_samples_nodeDescription.tsv",
-    inputFolder, treeName, treeName
+    input_folder, tree_name, tree_name
   )
 
 
   ## Load data
 
-  postSampling <- read_delim(posteriorSamplingFile,
+  post_sampling <- readr::read_delim(posterior_sampling_file,
     delim = "\t", col_names = c(
       "LogScore", "SequencingErrorRate",
       "DropoutRate", "LogTau", "Tree"
     )
   )
-  postSampling <- split(postSampling, seq(nrow(postSampling)))
+  post_sampling <- split(post_sampling, seq_len(nrow(post_sampling)))
 
 
-  counts <- read_delim(countFile,
+  counts <- readr::read_delim(count_file,
     delim = "\t", col_names = FALSE
   )
-  description <- read_delim(descriptionFile,
+  description <- readr::read_delim(description_file,
     delim = "\t", col_names = c(
       "Cluster", "CellCount", "TCs", "WBCs",
       "Description"
     )
   )
-  nCells <- sum(description$CellCount)
-  nClusters <- nrow(description)
-  nMutations <- nrow(counts)
-  alleleCount <- description$CellCount * 2
+  n_cells <- sum(description$CellCount)
+  n_clusters <- nrow(description)
+  n_mutations <- nrow(counts)
+  allele_count <- description$CellCount * 2
 
 
-  description <- description %>%
-    mutate(color = regmatches(Description, regexpr(
+  description <- description |>
+    dplyr::mutate(color = regmatches(rlang::.data$Description, regexpr(
       "color=([a-zA-Z]+[0-9]*)",
-      Description
-    )) %>%
-      substr(start = 7, stop = (nchar(.))))
+      rlang::.data$Description
+    ))) |>
+    (\(data) substr(start = 7, stop = (nchar(data))))()
 
 
 
-  ClusterID <- vector()
-  for (i in 1:nClusters) {
-    ClusterID <- c(
-      ClusterID,
+  cluster_id <- vector()
+  for (i in seq_len(n_clusters)) {
+    cluster_id <- c(
+      cluster_id,
       rep.int(
         i - 1,
         description$CellCount[i]
@@ -518,49 +518,49 @@ load_data <- function(inputFolder, treeName) {
 
   ## Pull apart the count file into counts for mutated read and total counts
   ## respectively
-  mutatedReadCounts <- matrix(0, nrow = nMutations, ncol = 0)
-  for (j in 1:nClusters) {
-    mutatedReadCounts <- cbind(mutatedReadCounts, counts[, 4 + 2 * j])
+  mutated_read_counts <- matrix(0, nrow = n_mutations, ncol = 0)
+  for (j in seq_len(n_clusters)) {
+    mutated_read_counts <- cbind(mutated_read_counts, counts[, 4 + 2 * j])
   }
 
-  totalReadCounts <- matrix(0, nrow = nMutations, ncol = 0)
-  for (j in 1:nClusters) {
-    totalReadCounts <- cbind(totalReadCounts, counts[, 4 + 2 * j - 1])
+  total_read_counts <- matrix(0, nrow = n_mutations, ncol = 0)
+  for (j in seq_len(n_clusters)) {
+    total_read_counts <- cbind(total_read_counts, counts[, 4 + 2 * j - 1])
   }
 
 
-  wildtypeReadCounts <- totalReadCounts - mutatedReadCounts
+  wildtype_read_counts <- total_read_counts - mutated_read_counts
 
 
-  mutatedReadCounts <- mutatedReadCounts %>%
-    t() %>%
-    as.data.frame() %>%
+  mutated_read_counts <- mutated_read_counts |>
+    t() |>
+    as.data.frame() |>
     as.list()
-  wildtypeReadCounts <- wildtypeReadCounts %>%
-    t() %>%
-    as.data.frame() %>%
+  wildtype_read_counts <- wildtype_read_counts |>
+    t() |>
+    as.data.frame() |>
     as.list()
-  totalReadCounts <- totalReadCounts %>%
-    t() %>%
-    as.data.frame() %>%
+  total_read_counts <- total_read_counts |>
+    t() |>
+    as.data.frame() |>
     as.list()
 
 
-  mutationDescription <- counts[, 1:4]
+  mutation_description <- counts[, 1:4]
 
   ## wbc status indicates which of the cells is a white blood cells and which
   ## one isn't.
   ## So far, the cells are arbitrary, and I will assign the fist cells from a
   ## cluster to be WBCs.
-  wbcStatus <- rep(0, nCells)
+  wbc_status <- rep(0, n_cells)
 
-  for (i in 1:nClusters) {
+  for (i in seq_len(n_clusters)) {
     j <- 1
     while (j <= description$WBCs[i]) { # Iterating over the number of White
       # blood cells of a cluster
-      wbcStatus[which(ClusterID == i - 1)[1] + j - 1] <- 1 # and identifying the
-      # first cell
-      # that belongs to a cluster and counting from then on
+      wbc_status[which(cluster_id == i - 1)[1] + j - 1] <- 1
+      # and identifying the first cell that belongs to a cluster and counting
+      # from then on.
       ## Note: The cluster IDs are counted from zero!
       j <- j + 1
     }
@@ -569,30 +569,31 @@ load_data <- function(inputFolder, treeName) {
 
 
   sample_description <- data.frame(
-    Cluster = ClusterID,
-    ClusterName = description$Cluster[ClusterID + 1],
-    WBC = wbcStatus,
-    color = description$color[ClusterID + 1]
+    Cluster = cluster_id,
+    ClusterName = description$Cluster[cluster_id + 1],
+    WBC = wbc_status,
+    color = description$color[cluster_id + 1]
   )
 
-  sample_description <- sample_description %>%
-    mutate(single_cell = !(duplicated(Cluster)) &
-      !(duplicated(Cluster, fromLast = TRUE)))
+  sample_description <- sample_description |>
+    dplyr::mutate(
+      single_cell =
+        !(duplicated(rlang::.data$Cluster)) &
+          !(duplicated(rlang::.data$Cluster, fromLast = TRUE))
+    )
 
 
 
 
   return(list(
-    "postSampling" = postSampling, "nClusters" = nClusters,
-    "clusterID" = ClusterID, "nCells" = nCells,
-    "nMutations" = nMutations, "nClusters" = nClusters,
-    "alleleCount" = alleleCount,
-    "mutatedReadCounts" = mutatedReadCounts,
-    "totalReadCounts" = totalReadCounts, "wbcStatus" = wbcStatus,
+    "post_sampling" = post_sampling, "n_clusters" = n_clusters,
+    "cluster_id" = cluster_id, "n_cells" = n_cells,
+    "n_mutations" = n_mutations, "allele_count" = allele_count,
+    "mutated_read_counts" = mutated_read_counts,
+    "total_read_counts" = total_read_counts, "wbc_status" = wbc_status,
     "sample_description" = sample_description,
-    "mutationDescription" = mutationDescription,
-    #              "annotations" = annotations,
-    "sampleName" = treeName, "directory" = inputFolder
+    "mutation_description" = mutation_description,
+    "sampleName" = tree_name, "directory" = input_folder
   ))
 }
 
@@ -609,8 +610,8 @@ load_data <- function(inputFolder, treeName) {
 #' distances.
 #' As the distance the Hamming distance is chosen.
 #'
-#' @param inputFolder
-#' @param treeName
+#' @param input_folder
+#' @param tree_name
 #'
 #' @return
 #' monoclonal_pairs: A list of pairs of cell names that are similar to each
@@ -623,28 +624,28 @@ load_data <- function(inputFolder, treeName) {
 #' @export
 #'
 #' @examples
-load_monoclonal_pairs <- function(inputFolder, treeName, cutoff = "") {
+load_monoclonal_pairs <- function(input_folder, tree_name, cutoff = "") {
   data_file <- sprintf(
-    "%s/%s/%s_genotypes.ped", inputFolder, treeName,
-    treeName
+    "%s/%s/%s_genotypes.ped", input_folder, tree_name,
+    tree_name
   )
 
-  data <- read_delim(data_file, delim = "\t", col_names = FALSE)
+  data <- readr::read_delim(data_file, delim = "\t", col_names = FALSE)
 
-  data2 <- data %>% select(!2:6)
+  data2 <- data |> dplyr::select(!2:6)
 
   distance_matrix <- matrix(0, nrow = nrow(data2), ncol = nrow(data2))
 
 
-  for (i in 1:nrow(data2)) {
+  for (i in seq_len(nrow(data2))) {
     j <- 1
     while (j < i) {
-      row_i <- data2 %>%
-        select(!1) %>%
-        slice(i)
-      row_j <- data2 %>%
-        select(!1) %>%
-        slice(j)
+      row_i <- data2 |>
+        dplyr::select(!1) |>
+        dplyr::slice(i)
+      row_j <- data2 |>
+        dplyr::select(!1) |>
+        dplyr::slice(j)
 
       distance_matrix[i, j] <- sum(!(row_i == row_j))
       j <- j + 1
@@ -670,9 +671,11 @@ load_monoclonal_pairs <- function(inputFolder, treeName, cutoff = "") {
   print(monoclonal_candidate_cutoff)
 
   plot(
-    ggplot(data.frame(x = distance_vector), aes(x = x)) +
-      geom_histogram(binwidth = 2) +
-      geom_vline(
+    ggplot2::ggplot(
+      data.frame(x = distance_vector), ggplot2::aes(x = rlang::.data$x)
+    ) +
+      ggplot2::geom_histogram(binwidth = 2) +
+      ggplot2::geom_vline(
         xintercept = monoclonal_candidate_cutoff, linetype = "dashed",
         color = "red"
       )

From 43c37f9f60b8fc1c21b59a2d456f5e5c7e1eb871 Mon Sep 17 00:00:00 2001
From: JohannesGawron <johannes.gawron@bsse.ethz.ch>
Date: Thu, 9 Jan 2025 17:00:04 +0100
Subject: [PATCH 3/5] integrate new data points into barcode analysis

---
 ...g_the_clonality_of_barcoded_CTC_clusters.R | 93 +++++--------------
 1 file changed, 25 insertions(+), 68 deletions(-)

diff --git a/experiments/barcoding_experiment/Inferring_the_clonality_of_barcoded_CTC_clusters.R b/experiments/barcoding_experiment/Inferring_the_clonality_of_barcoded_CTC_clusters.R
index 6ddfc0e..45ac5ce 100644
--- a/experiments/barcoding_experiment/Inferring_the_clonality_of_barcoded_CTC_clusters.R
+++ b/experiments/barcoding_experiment/Inferring_the_clonality_of_barcoded_CTC_clusters.R
@@ -6,6 +6,7 @@
 
 # Load libraries
 library(data.table)
+library(readxl)
 library(stringr)
 library(dplyr)
 library(tidyr)
@@ -48,7 +49,7 @@ for (file in files) {
 # Create empty data frame to store summary information for all samples
 summary_df <- data.frame( # Sample basename
   basename = character(),
-  # Number of barcodes accumulating 90% of total counts
+  # Number of barcodes accumulating 90% of total aligned counts
   num_rows_accumulating_nine = integer(),
   # Proportion of total counts for most abundant barcode
   prop_col_1 = numeric(),
@@ -63,14 +64,6 @@ for (i in seq_along(df_list)) {
   cumprop_col <- df[, "cumprop_col"]
   num_rows_accumulating_nine <- sum(cumprop_col < 0.9) + 1
   basename <- names(df_list)[i]
-  # Extract corresponding primary tumor complexity from basename
-  pt_complexity <- switch(substr(basename, 1, 4),
-    "10k_" = 10000,
-    "50k_" = 50000,
-    "100_" = 100,
-    "1000" = 1000,
-    NA
-  ) # if no match, assign NA
   prop_col_1 <- df$prop_col[1]
   prop_col_2 <- df$prop_col[2]
   summary_row <- data.frame(
@@ -78,91 +71,56 @@ for (i in seq_along(df_list)) {
     num_rows_accumulating_nine = num_rows_accumulating_nine,
     prop_col_1 = prop_col_1,
     prop_col_2 = prop_col_2,
-    value = value,
     stringsAsFactors = FALSE
   )
   summary_df <- rbind(summary_df, summary_row)
 }
 
-# Extract the number of cells per CTC cluster from the sample name and add to summary_df
-summary_df <- summary_df %>%
-  mutate(cluster_size = case_when(
-    grepl("_0_", basename) ~ "0",
-    grepl("_1_", basename) ~ "1",
-    grepl("_2_", basename) ~ "2",
-    grepl("_3_", basename) ~ "3",
-    grepl("_4_", basename) ~ "4",
-    grepl("_5_", basename) ~ "5",
-    grepl("_6_", basename) ~ "6",
-    grepl("_7_", basename) ~ "7",
-    grepl("_8_", basename) ~ "8",
-    grepl("_9_", basename) ~ "9",
-    grepl("_10_", basename) ~ "10",
-    grepl("_10plus_", basename) ~ "11",
-    grepl("_11_", basename) ~ "11",
-    grepl("_12_", basename) ~ "12",
-    grepl("_13_", basename) ~ "13",
-    grepl("_14_", basename) ~ "14",
-    grepl("_20_", basename) ~ "20",
-    grepl("_25_", basename) ~ "25",
-    TRUE ~ NA_character_
-  ))
+# Read-in metadata containing information on CTC cell count per cluster, primary tumor barcode diversity and mouse/tumor IDs
+metadata <- read_excel("metadata.xlsx")
 
+# Merge summary dataframe and metadata by CTC cluster basename
+merged_df <- merge(summary_df, metadata, by = "basename", all = TRUE)
 
-# Assign CTC clusters into categories "0" (negative controls), "2" or "3+" based on the cell number
-summary_df <- summary_df %>%
+# Assign CTC clusters into categories "0" (negative controls), "2" or "3+" based on the CTC counts
+merged_df <- merged_df %>%
   mutate(cluster_category = case_when(
-    grepl("_0_", basename) ~ "0",
-    grepl("_2_", basename) ~ "2",
-    grepl("_2_|_3_|_4_|_5_|_6_|_7_|_8_|_9_|_10_|_10plus_|_11_|_12_|_13_|_14_|_20_|_25_", basename) ~ "3+",
-    TRUE ~ NA_character_
-  ))
-
-# Add a column with the corresponding tumor ID
-summary_df <- summary_df %>%
-  mutate(tumor_id = case_when(
-    grepl("910", basename) ~ "910",
-    grepl("905", basename) ~ "905",
-    grepl("904", basename) ~ "904",
-    grepl("903", basename) ~ "903",
-    grepl("902", basename) ~ "902",
-    grepl("141", basename) ~ "141",
-    grepl("140", basename) ~ "140",
-    TRUE ~ NA_character_
+    cluster_size == 0 ~ "0",
+    cluster_size == 2 ~ "2",
+    TRUE ~ "3+"
   ))
 
-
 # Quality filtering of CTC cluster samples
-summary_df$cluster_size <- as.numeric(summary_df$cluster_size)
-summary_df_filter <- summary_df[summary_df$num_rows_accumulating_nine <= summary_df$cluster_size, ]
+merged_df$cluster_size <- as.numeric(merged_df$cluster_size)
+merged_df_filter <- merged_df[merged_df$num_rows_accumulating_nine <= merged_df$cluster_size, ]
 
 # Assign CTC cluster samples mono- or oligoclonal based on the dominance of the most abundant barcode, taking into account the cell number
-summary_df_filter$clonality <- ifelse(summary_df_filter$prop_col_2 / summary_df_filter$prop_col_1 < 1 / summary_df_filter$cluster_size,
+merged_df_filter$clonality <- ifelse(merged_df_filter$prop_col_2 / merged_df_filter$prop_col_1 < 1 / merged_df_filter$cluster_size,
   "mono",
   "oligo"
 )
 
 # Return fraction of oligoclonal CTC clusters across all samples
-nrow(summary_df_filter[summary_df_filter$clonality == "oligo", ]) / nrow(summary_df_filter)
+nrow(merged_df_filter[merged_df_filter$clonality == "oligo", ]) / nrow(merged_df_filter)
 
 # Assign CTC cluster samples a complexity value of "Low", "Medium" or "High", based on corresponding primary tumor barcode complexity
-summary_df_filter <- summary_df_filter %>%
-  mutate(complexity = ifelse(value %in% c(100, 1000), "Low",
-    ifelse(value == 10000, "Medium", "High")
+merged_df_filter <- merged_df_filter %>%
+  mutate(complexity = ifelse(pt_complexity %in% c(100, 1000), "Low",
+    ifelse(pt_complexity == 10000, "Medium", "High")
   ))
 
 
 # Perform Cochran-Armitage test for trend
-# Define matrix with counts for oligo- and monoclonal CTC clusters across complexities (as inferred from summary_df_filter)
-x <- matrix(c(16, 133, 21, 52, 40, 14), byrow = TRUE, ncol = 2)
+# Define matrix with counts for oligo- and monoclonal CTC clusters across complexities (as inferred from merged_df_filter)
+x <- matrix(c(16, 133, 57, 88, 90, 42), byrow = TRUE, ncol = 2)
 CochranArmitageTest(x, alternative = "one.sided")
 
 
 # Fig. 2b
 # Generate a data.frame with counts for mono- and oligoclonal CTC clusters within each complexity level using PieDonut
 low <- data.frame(Clonality = c("Monoclonal", "Oligoclonal"), n = c(133, 16))
-medium <- data.frame(Clonality = c("Monoclonal", "Oligoclonal"), n = c(52, 21))
-high <- data.frame(Clonality = c("Monoclonal", "Oligoclonal"), n = c(14, 40))
+medium <- data.frame(Clonality = c("Monoclonal", "Oligoclonal"), n = c(88, 57))
+high <- data.frame(Clonality = c("Monoclonal", "Oligoclonal"), n = c(42, 90))
 
 
 # Generate a donut plot illustrating mono- and oligoclonal CTC cluster counts for each complexity level
@@ -172,7 +130,7 @@ PieDonut(high, aes(Clonality, count = n), showPieName = FALSE, donutLabelSize =
 
 
 # Create a data frame specifying for each tumor sample the proportion of oligoclonal CTC clusters in categories "2" and "3+"
-combined_summary <- summary_df_filter %>%
+combined_summary <- merged_df_filter %>%
   group_by(tumor_id, cluster_category, complexity) %>%
   summarise(
     n = n(),
@@ -182,7 +140,7 @@ combined_summary <- summary_df_filter %>%
   ungroup()
 
 # Only keep samples with counts in both categories ("2" and "3+")
-combined_summary <- combined_summary[c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13), ]
+combined_summary <- combined_summary[-c(21), ]
 
 # Calculate the absolute number of mono- and oligoconal CTC clusters per category
 combined_summary$oligo <- combined_summary$n * combined_summary$prop_oligo
@@ -205,7 +163,6 @@ contingency_table <- matrix(
 # Perform Fisher's Exact Test for "2" vs. "3+"
 fisher.test(contingency_table)
 
-
 # Generate plot for Fig. 2c
 complexity_colors <- c("Low" = "#9fc8c8", "Medium" = "#54a1a1", "High" = "#1f6f6f")
 ggplot(combined_summary, aes(x = as.character(cluster_category), y = prop_oligo, color = complexity)) +
@@ -214,4 +171,4 @@ ggplot(combined_summary, aes(x = as.character(cluster_category), y = prop_oligo,
   theme_classic() +
   theme(axis.title = element_text(size = 0), legend.title = element_text(size = 0), legend.text = element_text(size = 0), axis.text = element_text(size = 0)) + # Populate labels in Adobe Illustrator
   scale_color_manual(values = complexity_colors) +
-  ylim(0, 1)
+  ylim(0, 1)
\ No newline at end of file

From c30b80b824c25c7b248923e5ade017fe9c548dd2 Mon Sep 17 00:00:00 2001
From: JohannesGawron <johannes.gawron@bsse.ethz.ch>
Date: Thu, 9 Jan 2025 17:32:00 +0100
Subject: [PATCH 4/5] adding file to compute p value for extended data figure
 7a

---
 .../extended_data_figure_7a_test.py           | 232 ++++++++++++++++++
 1 file changed, 232 insertions(+)
 create mode 100644 experiments/barcoding_experiment/extended_data_figure_7a_test.py

diff --git a/experiments/barcoding_experiment/extended_data_figure_7a_test.py b/experiments/barcoding_experiment/extended_data_figure_7a_test.py
new file mode 100644
index 0000000..cf1e4e9
--- /dev/null
+++ b/experiments/barcoding_experiment/extended_data_figure_7a_test.py
@@ -0,0 +1,232 @@
+from pathlib import Path
+import logging
+
+import numpy as np
+import pandas as pd
+import math
+from matplotlib import pyplot as plt
+from scipy.stats import combine_pvalues
+
+# Generate a list of barcode counts files
+
+logging.basicConfig(level=logging.INFO)
+
+
+def load_cluster_data(path, pattern):
+    files = [str(file) for file in Path(path).glob('*.csv') if pattern in str(file)]
+
+    # Create empty dictionary to store data frames
+    df_dict = {}
+    cluster_sizes = []
+    # Loop through all sample barcode counts files
+    for idx,file in enumerate(files):
+        basename = Path(file).stem
+        pattern = next((p for p in ["910", "905", "904", "903", "902", "141", "140"] if p in basename), None)
+        cluster_size = int(basename.split('_')[1])
+        
+        try:
+            # Load counts into data frame
+            # Use pd.read_csv with chunksize for faster processing
+            df = pd.read_csv(file, sep='\t', header=None, dtype={0: 'str'})
+            logging.info(f'File {file} loaded')
+        except Exception as e:
+            logging.error(f"Error reading {file} - skipping\n")
+            logging.error(e)
+            continue
+        df.columns = ['Barcode ID', '1', 'Barcode Count', '3']
+             
+
+
+        # Sort by decreasing barcode counts
+        df_sorted = df.sort_values(by=df.columns[2], ascending=False)
+        df_sorted['Mouse ID'] = pattern
+        df['Cluster Size'] = cluster_size
+        df['CTC Cluster ID'] = id ### The choice of cluster ID is completely arbitrary and only helps to distinguish between different clusters
+        # Calculate fraction of total counts for each Barcode
+        df_sorted['prop_col'] = df_sorted.iloc[:, 2] / df_sorted.iloc[:, 2].sum()
+        
+        # Calculate cumulative barcode proportions
+        df_sorted['cumprop_col'] = df_sorted['prop_col'].cumsum()
+        
+        # Find the largest row number for which cumprop_col < 0.9
+        
+        n_clones = len(df_sorted[df_sorted['cumprop_col'] < 0.9].index) + 1
+        df_sorted['Clone present in cluster'] = False
+        df_sorted.loc[df_sorted.index[:n_clones], 'Clone present in cluster'] = True
+
+        # do a quality check to exclude all clusters for which the number of clones exceeds the number of cells in the cluster
+
+
+        if not df_sorted['Clone present in cluster'].sum() > cluster_size:
+            df_dict[basename] = df_sorted
+            cluster_sizes.append(cluster_size)
+            logging.info(f'File {file} added to dictionary')
+
+    return df_dict, list(set(cluster_sizes))
+
+
+def load_primary_data(path):
+    file_list = ["combined_140_order_filter_merge.rds.csv", "combined_141_order_filter_merge.rds.csv", "combined_902_order_filter_merge.rds.csv", "combined_903_order_filter_merge.rds.csv", "combined_904_order_filter_merge.rds.csv", "combined_905_order_filter_merge.rds.csv", "combined_910_order_filter_merge.rds.csv"]
+    primary_dict = {}
+    for file in file_list:
+        mouse_ID = file.split('_')[1]
+        primary_dict[mouse_ID] = pd.read_csv(Path(path) / file, sep=',', header=0)
+    
+        primary_dict[mouse_ID]['observations'] = primary_dict[mouse_ID]['observations'].astype(str)
+    return primary_dict
+
+
+def preprocess_primary_data(primary_data):
+    # Determine the cutoff for each mouse_ID
+
+    for mouse_ID, data in primary_data.items():
+        cutoff = data['prop_av'].quantile(0.01)
+        data_sorted = data.sort_values(by="prop_av", ascending=False)
+        
+        # Calculate cumulative barcode proportions
+        data_sorted['cumulative_proportions'] = data_sorted['prop_av'].cumsum()
+        
+        # Find the largest row number for which cumprop_col < 0.9
+        
+        n_clones = len(data_sorted[data_sorted['prop_av'] > cutoff].index)+1
+        data_sorted = data_sorted.iloc[:n_clones,:]
+        primary_data.update({mouse_ID: data_sorted})
+
+    return primary_data
+
+    # Calculate fraction of total counts for each Barcode
+
+
+def merge_data(files, primary_data):
+    for mouse_ID, data in primary_data.items():
+        for cluster_ID, cluster_data in files.items():
+            if cluster_data['Mouse ID'].iloc[0] == mouse_ID:
+                # Filter out barcode IDs in cluster_data that are not in primary_data
+                
+                cluster_data = cluster_data[cluster_data['Barcode ID'].isin(data['observations'])]
+
+                # Check for barcode IDs with "Clone present in cluster" True but not in observations
+                missing_barcodes = cluster_data[(cluster_data['Clone present in cluster'] == True) & (~cluster_data['Barcode ID'].isin(data['observations']))]
+                if not missing_barcodes.empty:
+                    logging.warning(f"Mouse ID {mouse_ID}, Cluster ID {cluster_ID} has barcodes with 'Clone present in cluster' True but not in observations: {missing_barcodes['Barcode ID'].tolist()}")
+
+                # Merge the prop_av values from primary_data into cluster_data
+                
+                cluster_data = cluster_data.merge(data[['observations', 'prop_av']], left_on='Barcode ID', right_on='observations', how='left')
+
+                cluster_data.drop(columns=['observations'], inplace=True)
+ 
+                files.update({cluster_ID: cluster_data})
+    return files
+
+
+def compute_G_score(clones_in_cluster, n_cells):
+    return 2*clones_in_cluster["prop_av"].map(lambda x: np.log(1/(1-(1-x) ** n_cells))).sum()
+
+
+
+def simulate_G_scores(proportions, n_cells, n_simulations):
+            G_scores = []
+            probabilities = proportions/np.sum(proportions)
+
+            for _ in range(n_simulations):
+                simulated_counts = np.random.multinomial(n_cells, probabilities)
+                simulated_data = pd.DataFrame({
+                    'simulated_counts': simulated_counts,
+                    'prop_av': proportions
+                })
+                simulated_data['Clone present in cluster'] = simulated_data['simulated_counts'].apply(lambda x: 1 if x > 0 else 0)
+                simulated_data = simulated_data[simulated_data['simulated_counts'] > 0]
+                G_score = compute_G_score(simulated_data, n_cells)
+                G_scores.append(G_score)
+            
+            plt.hist(G_scores, bins=60, edgecolor='k', alpha=0.7)
+            plt.xlabel('G Score')
+            plt.ylabel('Frequency')
+            plt.title('Histogram of Simulated G Scores')
+            plt.show()
+            
+            return G_scores
+
+
+def compute_test(all_clones, n_cells_in_cluster, simulations = None):
+
+    resolution_of_simulation = 10000
+    if simulations is None:
+        G_scores = simulate_G_scores(all_clones['prop_av'], n_cells_in_cluster, resolution_of_simulation)
+    else:
+        G_scores = simulations
+    clones_in_cluster = all_clones[all_clones['Clone present in cluster'] == True]
+    G = compute_G_score(clones_in_cluster, n_cells_in_cluster)
+    p_value = sum(G_scores >= G)/resolution_of_simulation
+    return(p_value)
+    #p_value = clones_in_cluster["prop_av"].prod()* math.factorial(n_cells_in_cluster)/math.factorial(clones_in_cluster.shape[0])
+
+
+
+
+if __name__ == '__main__':
+    path = '/Users/jgawron/Documents/projects/CTC_backup/validation_experiment/barcode_data/cluster_data'
+    primary_data_path = '/Users/jgawron/Downloads'
+    primary_data = load_primary_data(primary_data_path)
+    primary_data = preprocess_primary_data(primary_data)
+
+    # Create empty dictionary to store data frames
+    p_values = []
+    mouse_models = []
+    number_of_simulations = 10000
+    for pattern in ["910", "905", "904", "903", "902", "141", "140"]:
+        files, cluster_sizes = load_cluster_data(path, pattern)
+
+        merged_data = merge_data(files, primary_data)
+
+        first_dataset = next(iter(merged_data.values()))
+
+        print(cluster_sizes)
+        simulated_G_scores = pd.DataFrame(np.zeros((number_of_simulations, len(cluster_sizes))), columns=[str(cluster_size) for cluster_size in cluster_sizes])
+        
+        for cell_number in cluster_sizes:
+            if simulated_G_scores[str(cell_number)].sum() == 0:
+                logging.info(f"Simulating null distribution for cluster size:  {cell_number}")
+                simulated_G_scores[str(cell_number)] = simulate_G_scores(first_dataset['prop_av'], cell_number, number_of_simulations)
+
+        for cluster_id, cluster_data in merged_data.items():
+            n_cells = cluster_id.split('_')[1]
+            p_values.append(compute_test(cluster_data, int(n_cells), simulated_G_scores[n_cells])+1e-16)
+            mouse_models.append(pattern)
+            logging.info(f"P value for cluster {cluster_id}: {p_values[-1]}")
+
+    p_value_summary = pd.DataFrame({'P value': p_values, 'Mouse Model': mouse_models})
+    logging.info(p_value_summary)
+    p_value_summary.to_csv('/Users/jgawron/Documents/projects/CTC_backup/validation_experiment/barcode_data/cluster_data/p_values.csv')
+
+    # Combine p-values using Fisher's method
+    combined_p_value = combine_pvalues(p_values, method='fisher')[1]
+    print(f'Combined p-value: {combined_p_value}')
+    plt.hist(p_values, bins=30, edgecolor='k', alpha=0.7)
+    plt.xlabel('P values')
+    plt.ylabel('Frequency')
+    plt.title('Histogram of p_values')
+    plt.show()
+    # Plot the p_values stratified by Mouse model
+    plt.figure(figsize=(10, 6))
+    for mouse_model in p_value_summary['Mouse Model'].unique():
+        subset = p_value_summary[p_value_summary['Mouse Model'] == mouse_model]
+        plt.hist(subset['P value'], bins=30, alpha=0.5, label=f'Mouse Model {mouse_model}')
+
+    plt.xlabel('P values')
+    plt.ylabel('Frequency')
+    plt.title('Histogram of P values stratified by Mouse Model')
+    plt.legend()
+    plt.show()
+
+    """
+    for cluster_id, cluster_data in merged_data.items():
+        for idx in range(15):
+            simulate_G_scores(cluster_data['prop_av'], idx, 1000)
+        n_cells = int(cluster_id.split('_')[1])
+        clones_in_cluster = cluster_data[cluster_data['Clone present in cluster'] == True]
+        G_score = compute_G_score(clones_in_cluster, n_cells)
+        print(f'Cluster ID: {cluster_id}, G_score: {G_score}')
+        simulate_G_scores(cluster_data['prop_av'], n_cells, 100000)
+    """
\ No newline at end of file

From 47d3d04a1c10285e99cfdd0280512997566d6f2b Mon Sep 17 00:00:00 2001
From: JohannesGawron <johannes.gawron@bsse.ethz.ch>
Date: Thu, 9 Jan 2025 17:35:33 +0100
Subject: [PATCH 5/5] bugfix

---
 .pre-commit-config.yaml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 495d4ac..f8a041a 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -7,7 +7,6 @@ repos:
     rev: v0.4.2
     hooks:
       - id: style-files
-        dry: on
       - id: parsable-R  
       - id: lintr
         args: [--warn_only]