From c546d0a4f3f09bfd8de6b767436a7110cebd637b Mon Sep 17 00:00:00 2001 From: JohannesGawron Date: Thu, 9 Jan 2025 11:41:08 +0100 Subject: [PATCH 1/5] bugfix --- .../workflow/resources/functions.R | 8 +++++++- .../workflow/resources/mutations_placement.cpp | 5 ++++- .../workflow/scripts/simulateCTCclusters.R | 12 +++++++++--- 3 files changed, 20 insertions(+), 5 deletions(-) diff --git a/experiments/assessing_cluster_clonality/workflow/resources/functions.R b/experiments/assessing_cluster_clonality/workflow/resources/functions.R index d92dfea..12df533 100755 --- a/experiments/assessing_cluster_clonality/workflow/resources/functions.R +++ b/experiments/assessing_cluster_clonality/workflow/resources/functions.R @@ -275,6 +275,7 @@ computeClusterSplits <- function(sampleDescription, postSampling, treeName, counter <- 1 system.time( for (it in cellPairSelection) { + print(it) leaf1 <- which(sampleDescription$ClusterName == it[1]) - 1 leaf2 <- which(sampleDescription$ClusterName == it[2]) - 1 @@ -309,9 +310,11 @@ computeClusterSplits <- function(sampleDescription, postSampling, treeName, } else if (class(cellPairSelection) == "character") { CTCclusters <- unique(cellPairSelection) CTCclusters <- CTCclusters[!(CTCclusters %in% c("ghostwhite", "gray93"))] + print(CTCclusters) + system.time( for (it in CTCclusters) { - cellsInCluster <- which(sampleDescription$color %in% it) - 1 + cellsInCluster <- which(sampleDescription$color == it) - 1 ## Make sure array indication is compatible with cpp cluster_done <- 0 for (i in cellsInCluster) { @@ -332,6 +335,7 @@ computeClusterSplits <- function(sampleDescription, postSampling, treeName, print(paste(paste("Computing genomic distances of leaves:", i, sep = " " ), j, sep = " ")) + posterior <- produce_Distance_Posterior(i, j, postSampling, treeName, nCells, nMutations, nClusters, @@ -344,6 +348,7 @@ computeClusterSplits <- function(sampleDescription, postSampling, treeName, nMutationSamplingEvents, clusterName = it ) + splittingProbs <- rbind( splittingProbs, data.frame( @@ -400,6 +405,7 @@ computeClusterSplits <- function(sampleDescription, postSampling, treeName, nMutationSamplingEvents, clusterName = it ) + print("Posterior computed") splittingProbs <- rbind( splittingProbs, data.frame( diff --git a/experiments/assessing_cluster_clonality/workflow/resources/mutations_placement.cpp b/experiments/assessing_cluster_clonality/workflow/resources/mutations_placement.cpp index 485f900..6754274 100644 --- a/experiments/assessing_cluster_clonality/workflow/resources/mutations_placement.cpp +++ b/experiments/assessing_cluster_clonality/workflow/resources/mutations_placement.cpp @@ -519,6 +519,7 @@ std::vector> computeMutationDistribution( // double logScore = 0.0; std::vector> logAttachmentScores; + logAttachmentScores.reserve(n + 1); for (int mut = 0; mut < n; mut++) { // compute score separately for each mutation @@ -563,9 +564,9 @@ std::vector> computeMutationDistribution( for (int att = 0; att < (2 * m) - 1; att++) { // cout << "wbcBelowCount.at(" << att << ") = " << wbcBelowCount.at(att) // << endl; - double logAttachmentScore = 0.0; double wbc_penalty = chi_wbc * wbcBelowCount.at(att); + for (int sample = 0; sample < sampleCount; sample++) { // for each sample // cout << "sample " << sample << endl; int expCount = expVarAlleleCount[sample][att]; @@ -589,6 +590,7 @@ std::vector> computeMutationDistribution( logAttachmentScore += newScore; // cout << newScore << endl; } + // cout << "\n mut score: " << logAttachmentScore << endl; logAttachmentScores.back().push_back(logAttachmentScore - wbc_penalty); @@ -1028,6 +1030,7 @@ std::vector> computePairwiseDistanceOfLeavesGivenTree( const std::vector> &mutatedReadCounts, const std::vector> &totalReadCounts, const std::vector &wbcStatus, int nSamplingEvents) { + std::string tree = treeData["Tree"]; // Now need to split the string into single numbers and turn them into diff --git a/experiments/assessing_cluster_clonality/workflow/scripts/simulateCTCclusters.R b/experiments/assessing_cluster_clonality/workflow/scripts/simulateCTCclusters.R index e9c3a2c..f502b5a 100644 --- a/experiments/assessing_cluster_clonality/workflow/scripts/simulateCTCclusters.R +++ b/experiments/assessing_cluster_clonality/workflow/scripts/simulateCTCclusters.R @@ -6,6 +6,7 @@ library(boot) source("functions.R") library("optparse") + ############ # Config ############ @@ -35,7 +36,7 @@ parser <- add_option(parser, c("-s", "--simulation-cluster-size"), ) parser <- add_option(parser, c("-o", "--output-folder"), type = "character", - default = "~/Documents/projects/CTC_backup/simulations/simulation2", help = "" + default = "~/Documents/projects/CTC_backup/simulations/simulation3", help = "" ) parser <- add_option(parser, c("-m", "--monoclonal"), type = "logical", @@ -511,6 +512,7 @@ simulate_oligoclonals <- function(input, output_directory, number_of_cells, samp return(sum(x != y)) } + it <- 0 while (TRUE) { # I sample until I get a cluster where at least two cells are distinct # Sample cells to merge cells_to_merge <- @@ -534,6 +536,10 @@ simulate_oligoclonals <- function(input, output_directory, number_of_cells, samp if (sum_of_distances > 0) { break } + it <- 1 + it + if (it > 100) { + break + } } @@ -596,7 +602,7 @@ simulate_oligoclonals <- function(input, output_directory, number_of_cells, samp ) write_delim( - x = description_data, + x = description_data_output_format, file = file.path( output_directory, paste(input$sampleName, output_label, sep = "_"), @@ -767,7 +773,7 @@ simulateCTCclusters <- function( # for (tree in c("Br11", "Br16_AC_max2", "Br16_AC_max3", "Br16_AC_max4", "Br16_B_max1", "Br16_B_max2", "Br16_B_max3", "Br16_B_max4", "Br16_C_max1", "Br16_C_max2", "Br16_C_max3", "Br23", "Br26", "Br30", "Br37", "Br38", "Br39", "Br44", "Br45", "Br46", "Br53", "Br57", "Brx50", "Lu2", "Lu7", "Ov8", "Pr6", "Pr9")) {} -cluster_size_vector <- c(0, 4, 3, 2, 2, 2, 2, 2, 2) +cluster_size_vector <- c(0, 3, 3, 3, 3, 3, 3, 3, 3) print(paste("Running simulation for", tree_name)) From 6225918f8a2cc210ed296dbce33f3dac46ef78e1 Mon Sep 17 00:00:00 2001 From: JohannesGawron Date: Thu, 9 Jan 2025 16:57:29 +0100 Subject: [PATCH 2/5] code refactoring --- .pre-commit-config.yaml | 1 + .../workflow/resources/functions.R | 573 +++++++++--------- 2 files changed, 289 insertions(+), 285 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f8a041a..495d4ac 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -7,6 +7,7 @@ repos: rev: v0.4.2 hooks: - id: style-files + dry: on - id: parsable-R - id: lintr args: [--warn_only] diff --git a/experiments/assessing_cluster_clonality/workflow/resources/functions.R b/experiments/assessing_cluster_clonality/workflow/resources/functions.R index 12df533..785a4df 100755 --- a/experiments/assessing_cluster_clonality/workflow/resources/functions.R +++ b/experiments/assessing_cluster_clonality/workflow/resources/functions.R @@ -5,10 +5,8 @@ library(Rcpp) library(tidyverse) - - sourceCpp("../../workflow/resources/mutations_placement.cpp") - +util::globalVariables(computePairwiseDistanceOfLeavesGivenTree) #' Takes a list of mutations and outputs which one of these is a driver. @@ -28,17 +26,19 @@ sourceCpp("../../workflow/resources/mutations_placement.cpp") #' @export #' #' @examples -IsDriver <- function(mutations, annotations) { - annotated_mutations <- annotations %>% - filter(annotations$"#CHROM" == as.character(mutations[1]) & - annotations$POS == as.numeric(mutations[2])) +is_driver <- function(mutations, annotations) { + annotated_mutations <- annotations |> + filter( + annotations$"#CHROM" == as.character(mutations[1]) & + annotations$POS == as.numeric(mutations[2]) + ) - check <- annotated_mutations %>% - select(c( + check <- annotated_mutations |> + dplyr::select(c( "CGI-Oncogenic Summary", "CGI-Oncogenic Prediction", "CGI-External oncogenic annotation" )) %in% - c("oncogenic (predicted)", "driver (oncodriveMUT)") %>% + c("oncogenic (predicted)", "driver (oncodriveMUT)") |> sum() driver <- FALSE @@ -56,20 +56,20 @@ IsDriver <- function(mutations, annotations) { #' #' @param leaf1 integer-valued index of first leaf #' @param leaf2 integer-valued index of second leaf -#' @param postSampling loaded list of tibbles tibble containing the posterior +#' @param post_sampling loaded list of tibbles tibble containing the posterior #' Sampling -#' @param treeName character string: Name of the tree for the output plot -#' @param nCells integer-valued total number of cells in the dataset -#' @param nMutations integer-valued total number of mutations in the dataset -#' @param nClusters integer-valued total number of clusters in the dataset -#' @param alleleCount integer vector of numbers of alleles per clusters -#' @param ClusterID integer vector of cluster IDs -#' @param mutatedReadCounts list of integer-valued vectors indicating the number -#' of mutated read per mutation (list index) and sample (vector index) -#' @param totalReadCounts list of integer-valued vectors indicating the total +#' @param tree_name character string: Name of the tree for the output plot +#' @param n_cells integer-valued total number of cells in the dataset +#' @param n_mutations integer-valued total number of mutations in the dataset +#' @param n_clusters integer-valued total number of clusters in the dataset +#' @param allele_count integer vector of numbers of alleles per clusters +#' @param cluster_id integer vector of cluster IDs +#' @param mutated_read_counts list of integer-valued vectors indicating the +#' number of mutated read per mutation (list index) and sample (vector index) +#' @param total_read_counts list of integer-valued vectors indicating the total #' number of reads per mutation (list index) and sample (vector index) -#' @param wbcStatus boolean vector of length nCells indicating for each cell if -#' it is a white blood cell (TRUE) or not (FALSE) +#' @param wbc_status boolean vector of length n_cells indicating for each cell +#' if it is a white blood cell (TRUE) or not (FALSE) #' #' @return splittingFraction: The fraction of sampling events for which the pair #' of cells @@ -78,126 +78,130 @@ IsDriver <- function(mutations, annotations) { #' @export #' #' @examples -produce_Distance_Posterior <- function(leaf1, leaf2, postSampling, treeName, - nCells, nMutations, nClusters, - alleleCount, ClusterID, - mutatedReadCounts, totalReadCounts, - wbcStatus, nSamplingEvents = 20, - clusterName = "") { +produce_distance_posterior <- function(leaf1, leaf2, post_sampling, tree_name, + n_cells, n_mutations, n_clusters, + allele_count, cluster_id, + mutated_read_counts, total_read_counts, + wbc_status, n_sampling_events = 20, + cluster_name = "") { ## For each row in the posterior Sampling file, the distance of two leaves is ## computed print("Computing the posterior distribution") - distance_statistics <- parallel::mclapply(postSampling, + distance_statistics <- parallel::mclapply(post_sampling, FUN = computePairwiseDistanceOfLeavesGivenTree, leaf1, leaf2, - nCells, nMutations, nClusters, alleleCount, - ClusterID, mutatedReadCounts, totalReadCounts, wbcStatus, - nSamplingEvents + n_cells, n_mutations, n_clusters, allele_count, + cluster_id, mutated_read_counts, total_read_counts, wbc_status, + n_sampling_events ) - dist_histogram <- parallel::mclapply(distance_statistics, - FUN = function(input_list_elements) { - return(input_list_elements[1]) - } - ) %>% - unlist() - - totalNumberOfSplits <- parallel::mclapply(distance_statistics, + no_splits <- parallel::mclapply(distance_statistics, FUN = function(input_list_elements) { return(input_list_elements[2]) } - ) %>% - unlist() %>% + ) |> + unlist() |> sum() - StatisticsOfMutationPlacement <- parallel::mclapply(distance_statistics, + mutation_placement_stats <- parallel::mclapply(distance_statistics, FUN = function(input_list_elements) { return(input_list_elements[3]) } - ) %>% + ) |> unlist() - totalNumberOfSamplingEvents <- nSamplingEvents * length(postSampling) + no_sampling_events <- n_sampling_events * length(post_sampling) data <- - data.frame(StatisticsOfMutationPlacement = StatisticsOfMutationPlacement) + data.frame( + mutation_placement_stats = mutation_placement_stats + ) - sum(is.na(data$StatisticsOfMutationPlacement)) - class(data$StatisticsOfMutationPlacement) + sum(is.na(data$mutation_placement_stats)) + class(data$mutation_placement_stats) - ggplot(data = data, aes(x = StatisticsOfMutationPlacement, y = 1)) + - geom_point() + ggplot2::ggplot( + data = data, ggplot2::aes(x = mutation_placement_stats, y = 1) + ) + + ggplot2::geom_point() tryCatch( expr = { histo <- - ggplot(data, aes(x = StatisticsOfMutationPlacement)) + - geom_histogram( + ggplot2::ggplot( + data, ggplot2::aes(x = mutation_placement_stats) + ) + + ggplot2::geom_histogram( bins = 10, fill = "skyblue", color = "skyblue", alpha = 0.7 ) + - xlab("Splitting score") + - ylab("total count") + - ggtitle("Posterior sampling of branching probabilites") + - geom_vline( - xintercept = mean(StatisticsOfMutationPlacement), + ggplot2::xlab("Splitting score") + + ggplot2::ylab("total count") + + ggplot2::ggtitle("Posterior sampling of branching probabilites") + + ggplot2::geom_vline( + xintercept = mean(mutation_placement_stats), color = "blue", linetype = "dashed", linewidth = 1 ) + - labs(subtitle = sprintf("Tree %s - %s", treeName, clusterName)) + - theme_minimal() + - theme( - plot.title = element_text(size = 20, face = "bold"), - axis.title.x = element_text(size = 18), - axis.title.y = element_text(size = 18), - plot.subtitle = element_text(size = 18), - axis.text = element_text(size = 16) + ggplot2::labs( + subtitle = sprintf("Tree %s - %s", tree_name, cluster_name) + ) + + ggplot2::theme_minimal() + + ggplot2::theme( + plot.title = ggplot2::element_text(size = 20, face = "bold"), + axis.title.x = ggplot2::element_text(size = 18), + axis.title.y = ggplot2::element_text(size = 18), + plot.subtitle = ggplot2::element_text(size = 18), + axis.text = ggplot2::element_text(size = 16) ) - hist_data <- ggplot_build(histo)$data[[1]] + hist_data <- ggplot2::ggplot_build(histo)$data[[1]] max_y <- max(hist_data$count) - histo <- histo + annotate("text", - x = mean(StatisticsOfMutationPlacement) + 0.08, + histo <- histo + ggplot2::annotate("text", + x = mean(mutation_placement_stats) + 0.08, y = 0.9 * max_y, label = "mean", color = "blue", size = 7 ) print(histo) }, error = function(e) { - histo <- ggplot(data, aes(x = log(StatisticsOfMutationPlacement))) + - geom_histogram( + histo <- ggplot2::ggplot( + data, ggplot2::aes(x = log(mutation_placement_stats)) + ) + + ggplot2::geom_histogram( bins = 10, fill = "skyblue", color = "skyblue", alpha = 0.7 ) + - xlab("log(Splitting Score") + - ylab("total count") + - ggtitle("Posterior sampling of branching probabilites - Logarithmic - Scale") + - geom_vline( - xintercept = log(mean(StatisticsOfMutationPlacement)), + ggplot2::xlab("log(Splitting Score") + + ggplot2::ylab("total count") + + ggplot2::ggtitle( + "Posterior sampling of branching probabilites - Logarithmic Scale" + ) + + ggplot2::geom_vline( + xintercept = log(mean(mutation_placement_stats)), color = "blue", linetype = "dashed", linewidth = 1 ) + - labs( - subtitle = sprintf("Tree %s - %s", treeName, clusterName), + ggplot2::labs( + subtitle = sprintf("Tree %s - %s", tree_name, cluster_name), caption = "mean indicated by dashed red line" ) + - theme_minimal() + - theme( - plot.title = element_text(size = 20, face = "bold"), - axis.title.x = element_text(size = 18), - axis.title.y = element_text(size = 18), - plot.subtitle = element_text(size = 18), - axis.text = element_text(size = 16) + ggplot2::theme_minimal() + + ggplot2::theme( + plot.title = ggplot2::element_text(size = 20, face = "bold"), + axis.title.x = ggplot2::element_text(size = 18), + axis.title.y = ggplot2::element_text(size = 18), + plot.subtitle = ggplot2::element_text(size = 18), + axis.text = ggplot2::element_text(size = 16) ) - hist_data <- ggplot_build(histo)$data[[1]] + hist_data <- ggplot2::ggplot_build(histo)$data[[1]] max_y <- max(hist_data$count) - histo <- histo + annotate("text", - x = log(mean(StatisticsOfMutationPlacement)) + + histo <- histo + ggplot2::annotate("text", + x = log(mean(mutation_placement_stats)) + 0.08, y = 0.9 * max_y, label = "log(mean)", color = "blue", size = 7 ) @@ -210,8 +214,8 @@ produce_Distance_Posterior <- function(leaf1, leaf2, postSampling, treeName, return(list( splittingFraction = - totalNumberOfSplits / totalNumberOfSamplingEvents, - branchingStatistics = StatisticsOfMutationPlacement + no_splits / no_sampling_events, + branchingStatistics = mutation_placement_stats )) } @@ -222,26 +226,26 @@ produce_Distance_Posterior <- function(leaf1, leaf2, postSampling, treeName, #' those which have been physically split. For each pair of tumour cells from #' the same CTC cluster, the distnace postior is computed. #' -#' @param sampleDescription A data frame with the description of each sample. +#' @param sample_description A data frame with the description of each sample. #' Expects the following columns: #' Cluster: numeric vector indicating the cluster identity. Physically separated #' clusters usually have different cluster identities, but this is not #' necessary. -#' @param postSampling The loaded posterior sampling table. -#' @param treeName A string with the name of the tree that is output to the +#' @param post_sampling The loaded posterior sampling table. +#' @param tree_name A string with the name of the tree that is output to the #' plots. -#' @param nCells The total number of cells in the experiment. -#' @param nMutations The total number of mutations in the experiment. -#' @param nClusters The total number of clusters in the experiment. -#' @param alleleCount A numeric vector which indicates the number of alleles in +#' @param n_cells The total number of cells in the experiment. +#' @param n_mutations The total number of mutations in the experiment. +#' @param n_clusters The total number of clusters in the experiment. +#' @param allele_count A numeric vector which indicates the number of alleles in #' each of the clusters. -#' @param mutatedReadCounts A tibble containing the mutated reads. Rows are +#' @param mutated_read_counts A tibble containing the mutated reads. Rows are #' mutations and columns are samples (clusters). -#' @param totalReadCounts A tibble containing the total read counts. -#' @param nMutationSamplingEvents The number of mutation that should be sampled -#' per tree. -#' @param nTreeSamplingEvents The number of trees that should be sampled. -#' @param cellPairSelection An optional parameter that takes a list of +#' @param total_read_counts A tibble containing the total read counts. +#' @param n_mutation_sampling_events The number of mutation that should be +#' sampled per tree. +#' @param n_tree_sampling_events The number of trees that should be sampled. +#' @param cell_pair_selection An optional parameter that takes a list of #' pairs of strings-valued names of cells that should be analysed (the names as #' in the samples_nodeDescription.tsv file). #' It can also take a character vector, in which case the entries should be the @@ -251,114 +255,118 @@ produce_Distance_Posterior <- function(leaf1, leaf2, postSampling, treeName, #' of trees for which they split #' aggregatedBranchingProbabilities: a vector of aggregated probabilities for #' all considered pairs of leaves and all sampled trees. At the moment only -#' implement if cellPairSelection +#' implement if cell_pair_selection #' parameter is passed to the function. #' @export #' #' @examples -computeClusterSplits <- function(sampleDescription, postSampling, treeName, - nCells, nMutations, nClusters, alleleCount, - mutatedReadCounts, totalReadCounts, - nMutationSamplingEvents = 1000, - nTreeSamplingEvents = 500, - cellPairSelection = NA) { - desired_values <- sample(1:length(postSampling), - size = nTreeSamplingEvents, +compute_cluster_splits <- function(sample_description, post_sampling, tree_name, + n_cells, n_mutations, n_clusters, + allele_count, mutated_read_counts, + total_read_counts, + n_mutation_sampling_events = 1000, + n_tree_sampling_events = 500, + cell_pair_selection = NA) { + desired_values <- sample(seq_along(post_sampling), + size = n_tree_sampling_events, replace = FALSE - ) %>% sort() + ) |> sort() - postSampling <- postSampling[desired_values] - splittingProbs <- matrix(0, nrow = 0, ncol = 2) %>% as.data.frame() - colnames(splittingProbs) <- c("Cluster", "Splitting_probability") - aggregatedProbabilities <- vector() - if (class(cellPairSelection) == "list") { + post_sampling <- post_sampling[desired_values] + splitting_probs <- matrix(0, nrow = 0, ncol = 2) |> as.data.frame() + colnames(splitting_probs) <- c("Cluster", "Splitting_probability") + aggregated_probabilities <- vector() + if (class(cell_pair_selection) == "list") { counter <- 1 system.time( - for (it in cellPairSelection) { + for (it in cell_pair_selection) { print(it) - leaf1 <- which(sampleDescription$ClusterName == it[1]) - 1 - leaf2 <- which(sampleDescription$ClusterName == it[2]) - 1 - - print(paste(paste("Computing genomic distances of leaves:", leaf1, - sep = " " - ), leaf2, sep = " ")) - posterior <- produce_Distance_Posterior(leaf1, leaf2, postSampling, - treeName, nCells, nMutations, - nClusters, alleleCount, - sampleDescription$Cluster, - mutatedReadCounts, - totalReadCounts, - sampleDescription$WBC, - nSamplingEvents = - nMutationSamplingEvents + leaf1 <- which(sample_description$ClusterName == it[1]) - 1 + leaf2 <- which(sample_description$ClusterName == it[2]) - 1 + + print( + paste( + "Computing genomic distances of leaves:", leaf1, leaf2, + sep = " " + ) + ) + posterior <- produce_distance_posterior(leaf1, leaf2, post_sampling, + tree_name, n_cells, n_mutations, + n_clusters, allele_count, + sample_description$Cluster, + mutated_read_counts, + total_read_counts, + sample_description$WBC, + n_sampling_events = + n_mutation_sampling_events ) - splittingProbs <- rbind( - splittingProbs, + splitting_probs <- rbind( + splitting_probs, data.frame( Cluster = as.character(counter), Splitting_probability = posterior$splittingFraction ) ) - aggregatedProbabilities <- c( - aggregatedProbabilities, + aggregated_probabilities <- c( + aggregated_probabilities, posterior$branchingStatistics ) counter <- counter + 1 } ) - } else if (class(cellPairSelection) == "character") { - CTCclusters <- unique(cellPairSelection) - CTCclusters <- CTCclusters[!(CTCclusters %in% c("ghostwhite", "gray93"))] - print(CTCclusters) + } else if (class(cell_pair_selection) == "character") { + ctc_clusters <- unique(cell_pair_selection) + ctc_clusters <- ctc_clusters[!(ctc_clusters %in% c("ghostwhite", "gray93"))] + print(ctc_clusters) system.time( - for (it in CTCclusters) { - cellsInCluster <- which(sampleDescription$color == it) - 1 + for (it in ctc_clusters) { + cells_in_cluster <- which(sample_description$color == it) - 1 ## Make sure array indication is compatible with cpp cluster_done <- 0 - for (i in cellsInCluster) { + for (i in cells_in_cluster) { if (cluster_done == 1) { cluster_done <- 0 break } - if (sampleDescription$WBC[i + 1] == 1) next - j <- cellsInCluster[1] + if (sample_description$WBC[i + 1] == 1) next + j <- cells_in_cluster[1] while (j < i) { if (cluster_done == 1) { break } - if (sampleDescription$WBC[j + 1] == 1) { + if (sample_description$WBC[j + 1] == 1) { j <- j + 1 next } - print(paste(paste("Computing genomic distances of leaves:", i, - sep = " " - ), j, sep = " ")) - - posterior <- produce_Distance_Posterior(i, j, postSampling, - treeName, nCells, - nMutations, nClusters, - alleleCount, - sampleDescription$Cluster, - mutatedReadCounts, - totalReadCounts, - sampleDescription$WBC, - nSamplingEvents = - nMutationSamplingEvents, - clusterName = it + print( + paste("Computing genomic distances of leaves:", i, j, sep = " ") ) - splittingProbs <- rbind( - splittingProbs, + posterior <- produce_distance_posterior(i, j, post_sampling, + tree_name, n_cells, + n_mutations, n_clusters, + allele_count, + sample_description$Cluster, + mutated_read_counts, + total_read_counts, + sample_description$WBC, + n_sampling_events = + n_mutation_sampling_events, + cluster_name = it + ) + + splitting_probs <- rbind( + splitting_probs, data.frame( Cluster = it, Splitting_probability = posterior$splittingFraction ) ) - aggregatedProbabilities <- c( - aggregatedProbabilities, + aggregated_probabilities <- c( + aggregated_probabilities, posterior$branchingStatistics ) j <- j + 1 @@ -368,46 +376,39 @@ computeClusterSplits <- function(sampleDescription, postSampling, treeName, } ) } else { - CTCclusters <- unique(sampleDescription$color) - CTCclusters <- CTCclusters[!(CTCclusters %in% c("ghostwhite", "gray93"))] + ctc_clusters <- unique(sample_description$color) + ctc_clusters <- ctc_clusters[!(ctc_clusters %in% c("ghostwhite", "gray93"))] system.time( - for (it in CTCclusters) { - cellsInCluster <- which(sampleDescription$color %in% it) - 1 + for (it in ctc_clusters) { + cells_in_cluster <- which(sample_description$color %in% it) - 1 ## Make sure array indication is compatible with cpp - # cluster_done <- 0 - for (i in cellsInCluster) { - # if(cluster_done == 1){ - # cluster_done <- 0 - # break - # } - if (sampleDescription$WBC[i + 1] == 1) next - j <- cellsInCluster[1] + + for (i in cells_in_cluster) { + if (sample_description$WBC[i + 1] == 1) next + j <- cells_in_cluster[1] while (j < i) { - # if(cluster_done == 1){ - # break - # } - if (sampleDescription$WBC[j + 1] == 1) { + if (sample_description$WBC[j + 1] == 1) { j <- j + 1 next } - print(paste(paste("Computing genomic distances of leaves:", i, - sep = " " - ), j, sep = " ")) - posterior <- produce_Distance_Posterior(i, j, postSampling, - treeName, nCells, - nMutations, nClusters, - alleleCount, - sampleDescription$Cluster, - mutatedReadCounts, - totalReadCounts, - sampleDescription$WBC, - nSamplingEvents = - nMutationSamplingEvents, - clusterName = it + print( + paste("Computing genomic distances of leaves:", i, j, sep = " ") + ) + posterior <- produce_distance_posterior(i, j, post_sampling, + tree_name, n_cells, + n_mutations, n_clusters, + allele_count, + sample_description$Cluster, + mutated_read_counts, + total_read_counts, + sample_description$WBC, + n_sampling_events = + n_mutation_sampling_events, + cluster_name = it ) print("Posterior computed") - splittingProbs <- rbind( - splittingProbs, + splitting_probs <- rbind( + splitting_probs, data.frame( Cluster = it, Splitting_probability = @@ -415,7 +416,6 @@ computeClusterSplits <- function(sampleDescription, postSampling, treeName, ) ) j <- j + 1 - # cluster_done <- 1 } } } @@ -425,8 +425,8 @@ computeClusterSplits <- function(sampleDescription, postSampling, treeName, return(list( - splittingProbs = splittingProbs, - aggregatedBranchingProbabilities = aggregatedProbabilities + splitting_probs = splitting_probs, + aggregatedBranchingProbabilities = aggregated_probabilities )) } @@ -435,17 +435,17 @@ computeClusterSplits <- function(sampleDescription, postSampling, treeName, #' Loads all necessary data for the CTC-project. #' Specifically it return a named list as follows: -#' postSampling: Loads the posterior sampling tsv as a list of named vectors +#' post_sampling: Loads the posterior sampling tsv as a list of named vectors #' with the following columns: the (unnormalised) LogScore, estimated sequencing #' error rate, the estimated dropout rate, logTau and the Tree in parent vector #' format meaning that the i'th entry of the vector is te parent node of the #' entry i. #' Nodes are counted from zero and the root is length(Tree) #' -#' @param inputFolder The total number of CTC-clusters -#' @param treeName +#' @param input_folder The total number of CTC-clusters +#' @param tree_name #' -#' @return postSampling: Loads the posterior sampling tsv as a list of named +#' @return post_sampling: Loads the posterior sampling tsv as a list of named #' vectors with the following columns: the (unnormalised) LogScore, estimated #' sequencing error rate, the estimated dropout rate, logTau and the Tree in #' parent vector format meaning that the i'th entry of the vector is the parent @@ -453,60 +453,60 @@ computeClusterSplits <- function(sampleDescription, postSampling, treeName, #' @export #' #' @examples -load_data <- function(inputFolder, treeName) { +load_data <- function(input_folder, tree_name) { ## Define paths - posteriorSamplingFile <- sprintf( - "%s/%s/%s_postSampling.tsv", inputFolder, - treeName, treeName + posterior_sampling_file <- sprintf( + "%s/%s/%s_postSampling.tsv", input_folder, + tree_name, tree_name ) - countFile <- sprintf("%s/%s/%s.txt", inputFolder, treeName, treeName) - descriptionFile <- sprintf( + count_file <- sprintf("%s/%s/%s.txt", input_folder, tree_name, tree_name) + description_file <- sprintf( "%s/%s/%s_samples_nodeDescription.tsv", - inputFolder, treeName, treeName + input_folder, tree_name, tree_name ) ## Load data - postSampling <- read_delim(posteriorSamplingFile, + post_sampling <- readr::read_delim(posterior_sampling_file, delim = "\t", col_names = c( "LogScore", "SequencingErrorRate", "DropoutRate", "LogTau", "Tree" ) ) - postSampling <- split(postSampling, seq(nrow(postSampling))) + post_sampling <- split(post_sampling, seq_len(nrow(post_sampling))) - counts <- read_delim(countFile, + counts <- readr::read_delim(count_file, delim = "\t", col_names = FALSE ) - description <- read_delim(descriptionFile, + description <- readr::read_delim(description_file, delim = "\t", col_names = c( "Cluster", "CellCount", "TCs", "WBCs", "Description" ) ) - nCells <- sum(description$CellCount) - nClusters <- nrow(description) - nMutations <- nrow(counts) - alleleCount <- description$CellCount * 2 + n_cells <- sum(description$CellCount) + n_clusters <- nrow(description) + n_mutations <- nrow(counts) + allele_count <- description$CellCount * 2 - description <- description %>% - mutate(color = regmatches(Description, regexpr( + description <- description |> + dplyr::mutate(color = regmatches(rlang::.data$Description, regexpr( "color=([a-zA-Z]+[0-9]*)", - Description - )) %>% - substr(start = 7, stop = (nchar(.)))) + rlang::.data$Description + ))) |> + (\(data) substr(start = 7, stop = (nchar(data))))() - ClusterID <- vector() - for (i in 1:nClusters) { - ClusterID <- c( - ClusterID, + cluster_id <- vector() + for (i in seq_len(n_clusters)) { + cluster_id <- c( + cluster_id, rep.int( i - 1, description$CellCount[i] @@ -518,49 +518,49 @@ load_data <- function(inputFolder, treeName) { ## Pull apart the count file into counts for mutated read and total counts ## respectively - mutatedReadCounts <- matrix(0, nrow = nMutations, ncol = 0) - for (j in 1:nClusters) { - mutatedReadCounts <- cbind(mutatedReadCounts, counts[, 4 + 2 * j]) + mutated_read_counts <- matrix(0, nrow = n_mutations, ncol = 0) + for (j in seq_len(n_clusters)) { + mutated_read_counts <- cbind(mutated_read_counts, counts[, 4 + 2 * j]) } - totalReadCounts <- matrix(0, nrow = nMutations, ncol = 0) - for (j in 1:nClusters) { - totalReadCounts <- cbind(totalReadCounts, counts[, 4 + 2 * j - 1]) + total_read_counts <- matrix(0, nrow = n_mutations, ncol = 0) + for (j in seq_len(n_clusters)) { + total_read_counts <- cbind(total_read_counts, counts[, 4 + 2 * j - 1]) } - wildtypeReadCounts <- totalReadCounts - mutatedReadCounts + wildtype_read_counts <- total_read_counts - mutated_read_counts - mutatedReadCounts <- mutatedReadCounts %>% - t() %>% - as.data.frame() %>% + mutated_read_counts <- mutated_read_counts |> + t() |> + as.data.frame() |> as.list() - wildtypeReadCounts <- wildtypeReadCounts %>% - t() %>% - as.data.frame() %>% + wildtype_read_counts <- wildtype_read_counts |> + t() |> + as.data.frame() |> as.list() - totalReadCounts <- totalReadCounts %>% - t() %>% - as.data.frame() %>% + total_read_counts <- total_read_counts |> + t() |> + as.data.frame() |> as.list() - mutationDescription <- counts[, 1:4] + mutation_description <- counts[, 1:4] ## wbc status indicates which of the cells is a white blood cells and which ## one isn't. ## So far, the cells are arbitrary, and I will assign the fist cells from a ## cluster to be WBCs. - wbcStatus <- rep(0, nCells) + wbc_status <- rep(0, n_cells) - for (i in 1:nClusters) { + for (i in seq_len(n_clusters)) { j <- 1 while (j <= description$WBCs[i]) { # Iterating over the number of White # blood cells of a cluster - wbcStatus[which(ClusterID == i - 1)[1] + j - 1] <- 1 # and identifying the - # first cell - # that belongs to a cluster and counting from then on + wbc_status[which(cluster_id == i - 1)[1] + j - 1] <- 1 + # and identifying the first cell that belongs to a cluster and counting + # from then on. ## Note: The cluster IDs are counted from zero! j <- j + 1 } @@ -569,30 +569,31 @@ load_data <- function(inputFolder, treeName) { sample_description <- data.frame( - Cluster = ClusterID, - ClusterName = description$Cluster[ClusterID + 1], - WBC = wbcStatus, - color = description$color[ClusterID + 1] + Cluster = cluster_id, + ClusterName = description$Cluster[cluster_id + 1], + WBC = wbc_status, + color = description$color[cluster_id + 1] ) - sample_description <- sample_description %>% - mutate(single_cell = !(duplicated(Cluster)) & - !(duplicated(Cluster, fromLast = TRUE))) + sample_description <- sample_description |> + dplyr::mutate( + single_cell = + !(duplicated(rlang::.data$Cluster)) & + !(duplicated(rlang::.data$Cluster, fromLast = TRUE)) + ) return(list( - "postSampling" = postSampling, "nClusters" = nClusters, - "clusterID" = ClusterID, "nCells" = nCells, - "nMutations" = nMutations, "nClusters" = nClusters, - "alleleCount" = alleleCount, - "mutatedReadCounts" = mutatedReadCounts, - "totalReadCounts" = totalReadCounts, "wbcStatus" = wbcStatus, + "post_sampling" = post_sampling, "n_clusters" = n_clusters, + "cluster_id" = cluster_id, "n_cells" = n_cells, + "n_mutations" = n_mutations, "allele_count" = allele_count, + "mutated_read_counts" = mutated_read_counts, + "total_read_counts" = total_read_counts, "wbc_status" = wbc_status, "sample_description" = sample_description, - "mutationDescription" = mutationDescription, - # "annotations" = annotations, - "sampleName" = treeName, "directory" = inputFolder + "mutation_description" = mutation_description, + "sampleName" = tree_name, "directory" = input_folder )) } @@ -609,8 +610,8 @@ load_data <- function(inputFolder, treeName) { #' distances. #' As the distance the Hamming distance is chosen. #' -#' @param inputFolder -#' @param treeName +#' @param input_folder +#' @param tree_name #' #' @return #' monoclonal_pairs: A list of pairs of cell names that are similar to each @@ -623,28 +624,28 @@ load_data <- function(inputFolder, treeName) { #' @export #' #' @examples -load_monoclonal_pairs <- function(inputFolder, treeName, cutoff = "") { +load_monoclonal_pairs <- function(input_folder, tree_name, cutoff = "") { data_file <- sprintf( - "%s/%s/%s_genotypes.ped", inputFolder, treeName, - treeName + "%s/%s/%s_genotypes.ped", input_folder, tree_name, + tree_name ) - data <- read_delim(data_file, delim = "\t", col_names = FALSE) + data <- readr::read_delim(data_file, delim = "\t", col_names = FALSE) - data2 <- data %>% select(!2:6) + data2 <- data |> dplyr::select(!2:6) distance_matrix <- matrix(0, nrow = nrow(data2), ncol = nrow(data2)) - for (i in 1:nrow(data2)) { + for (i in seq_len(nrow(data2))) { j <- 1 while (j < i) { - row_i <- data2 %>% - select(!1) %>% - slice(i) - row_j <- data2 %>% - select(!1) %>% - slice(j) + row_i <- data2 |> + dplyr::select(!1) |> + dplyr::slice(i) + row_j <- data2 |> + dplyr::select(!1) |> + dplyr::slice(j) distance_matrix[i, j] <- sum(!(row_i == row_j)) j <- j + 1 @@ -670,9 +671,11 @@ load_monoclonal_pairs <- function(inputFolder, treeName, cutoff = "") { print(monoclonal_candidate_cutoff) plot( - ggplot(data.frame(x = distance_vector), aes(x = x)) + - geom_histogram(binwidth = 2) + - geom_vline( + ggplot2::ggplot( + data.frame(x = distance_vector), ggplot2::aes(x = rlang::.data$x) + ) + + ggplot2::geom_histogram(binwidth = 2) + + ggplot2::geom_vline( xintercept = monoclonal_candidate_cutoff, linetype = "dashed", color = "red" ) From 43c37f9f60b8fc1c21b59a2d456f5e5c7e1eb871 Mon Sep 17 00:00:00 2001 From: JohannesGawron Date: Thu, 9 Jan 2025 17:00:04 +0100 Subject: [PATCH 3/5] integrate new data points into barcode analysis --- ...g_the_clonality_of_barcoded_CTC_clusters.R | 93 +++++-------------- 1 file changed, 25 insertions(+), 68 deletions(-) diff --git a/experiments/barcoding_experiment/Inferring_the_clonality_of_barcoded_CTC_clusters.R b/experiments/barcoding_experiment/Inferring_the_clonality_of_barcoded_CTC_clusters.R index 6ddfc0e..45ac5ce 100644 --- a/experiments/barcoding_experiment/Inferring_the_clonality_of_barcoded_CTC_clusters.R +++ b/experiments/barcoding_experiment/Inferring_the_clonality_of_barcoded_CTC_clusters.R @@ -6,6 +6,7 @@ # Load libraries library(data.table) +library(readxl) library(stringr) library(dplyr) library(tidyr) @@ -48,7 +49,7 @@ for (file in files) { # Create empty data frame to store summary information for all samples summary_df <- data.frame( # Sample basename basename = character(), - # Number of barcodes accumulating 90% of total counts + # Number of barcodes accumulating 90% of total aligned counts num_rows_accumulating_nine = integer(), # Proportion of total counts for most abundant barcode prop_col_1 = numeric(), @@ -63,14 +64,6 @@ for (i in seq_along(df_list)) { cumprop_col <- df[, "cumprop_col"] num_rows_accumulating_nine <- sum(cumprop_col < 0.9) + 1 basename <- names(df_list)[i] - # Extract corresponding primary tumor complexity from basename - pt_complexity <- switch(substr(basename, 1, 4), - "10k_" = 10000, - "50k_" = 50000, - "100_" = 100, - "1000" = 1000, - NA - ) # if no match, assign NA prop_col_1 <- df$prop_col[1] prop_col_2 <- df$prop_col[2] summary_row <- data.frame( @@ -78,91 +71,56 @@ for (i in seq_along(df_list)) { num_rows_accumulating_nine = num_rows_accumulating_nine, prop_col_1 = prop_col_1, prop_col_2 = prop_col_2, - value = value, stringsAsFactors = FALSE ) summary_df <- rbind(summary_df, summary_row) } -# Extract the number of cells per CTC cluster from the sample name and add to summary_df -summary_df <- summary_df %>% - mutate(cluster_size = case_when( - grepl("_0_", basename) ~ "0", - grepl("_1_", basename) ~ "1", - grepl("_2_", basename) ~ "2", - grepl("_3_", basename) ~ "3", - grepl("_4_", basename) ~ "4", - grepl("_5_", basename) ~ "5", - grepl("_6_", basename) ~ "6", - grepl("_7_", basename) ~ "7", - grepl("_8_", basename) ~ "8", - grepl("_9_", basename) ~ "9", - grepl("_10_", basename) ~ "10", - grepl("_10plus_", basename) ~ "11", - grepl("_11_", basename) ~ "11", - grepl("_12_", basename) ~ "12", - grepl("_13_", basename) ~ "13", - grepl("_14_", basename) ~ "14", - grepl("_20_", basename) ~ "20", - grepl("_25_", basename) ~ "25", - TRUE ~ NA_character_ - )) +# Read-in metadata containing information on CTC cell count per cluster, primary tumor barcode diversity and mouse/tumor IDs +metadata <- read_excel("metadata.xlsx") +# Merge summary dataframe and metadata by CTC cluster basename +merged_df <- merge(summary_df, metadata, by = "basename", all = TRUE) -# Assign CTC clusters into categories "0" (negative controls), "2" or "3+" based on the cell number -summary_df <- summary_df %>% +# Assign CTC clusters into categories "0" (negative controls), "2" or "3+" based on the CTC counts +merged_df <- merged_df %>% mutate(cluster_category = case_when( - grepl("_0_", basename) ~ "0", - grepl("_2_", basename) ~ "2", - grepl("_2_|_3_|_4_|_5_|_6_|_7_|_8_|_9_|_10_|_10plus_|_11_|_12_|_13_|_14_|_20_|_25_", basename) ~ "3+", - TRUE ~ NA_character_ - )) - -# Add a column with the corresponding tumor ID -summary_df <- summary_df %>% - mutate(tumor_id = case_when( - grepl("910", basename) ~ "910", - grepl("905", basename) ~ "905", - grepl("904", basename) ~ "904", - grepl("903", basename) ~ "903", - grepl("902", basename) ~ "902", - grepl("141", basename) ~ "141", - grepl("140", basename) ~ "140", - TRUE ~ NA_character_ + cluster_size == 0 ~ "0", + cluster_size == 2 ~ "2", + TRUE ~ "3+" )) - # Quality filtering of CTC cluster samples -summary_df$cluster_size <- as.numeric(summary_df$cluster_size) -summary_df_filter <- summary_df[summary_df$num_rows_accumulating_nine <= summary_df$cluster_size, ] +merged_df$cluster_size <- as.numeric(merged_df$cluster_size) +merged_df_filter <- merged_df[merged_df$num_rows_accumulating_nine <= merged_df$cluster_size, ] # Assign CTC cluster samples mono- or oligoclonal based on the dominance of the most abundant barcode, taking into account the cell number -summary_df_filter$clonality <- ifelse(summary_df_filter$prop_col_2 / summary_df_filter$prop_col_1 < 1 / summary_df_filter$cluster_size, +merged_df_filter$clonality <- ifelse(merged_df_filter$prop_col_2 / merged_df_filter$prop_col_1 < 1 / merged_df_filter$cluster_size, "mono", "oligo" ) # Return fraction of oligoclonal CTC clusters across all samples -nrow(summary_df_filter[summary_df_filter$clonality == "oligo", ]) / nrow(summary_df_filter) +nrow(merged_df_filter[merged_df_filter$clonality == "oligo", ]) / nrow(merged_df_filter) # Assign CTC cluster samples a complexity value of "Low", "Medium" or "High", based on corresponding primary tumor barcode complexity -summary_df_filter <- summary_df_filter %>% - mutate(complexity = ifelse(value %in% c(100, 1000), "Low", - ifelse(value == 10000, "Medium", "High") +merged_df_filter <- merged_df_filter %>% + mutate(complexity = ifelse(pt_complexity %in% c(100, 1000), "Low", + ifelse(pt_complexity == 10000, "Medium", "High") )) # Perform Cochran-Armitage test for trend -# Define matrix with counts for oligo- and monoclonal CTC clusters across complexities (as inferred from summary_df_filter) -x <- matrix(c(16, 133, 21, 52, 40, 14), byrow = TRUE, ncol = 2) +# Define matrix with counts for oligo- and monoclonal CTC clusters across complexities (as inferred from merged_df_filter) +x <- matrix(c(16, 133, 57, 88, 90, 42), byrow = TRUE, ncol = 2) CochranArmitageTest(x, alternative = "one.sided") # Fig. 2b # Generate a data.frame with counts for mono- and oligoclonal CTC clusters within each complexity level using PieDonut low <- data.frame(Clonality = c("Monoclonal", "Oligoclonal"), n = c(133, 16)) -medium <- data.frame(Clonality = c("Monoclonal", "Oligoclonal"), n = c(52, 21)) -high <- data.frame(Clonality = c("Monoclonal", "Oligoclonal"), n = c(14, 40)) +medium <- data.frame(Clonality = c("Monoclonal", "Oligoclonal"), n = c(88, 57)) +high <- data.frame(Clonality = c("Monoclonal", "Oligoclonal"), n = c(42, 90)) # Generate a donut plot illustrating mono- and oligoclonal CTC cluster counts for each complexity level @@ -172,7 +130,7 @@ PieDonut(high, aes(Clonality, count = n), showPieName = FALSE, donutLabelSize = # Create a data frame specifying for each tumor sample the proportion of oligoclonal CTC clusters in categories "2" and "3+" -combined_summary <- summary_df_filter %>% +combined_summary <- merged_df_filter %>% group_by(tumor_id, cluster_category, complexity) %>% summarise( n = n(), @@ -182,7 +140,7 @@ combined_summary <- summary_df_filter %>% ungroup() # Only keep samples with counts in both categories ("2" and "3+") -combined_summary <- combined_summary[c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13), ] +combined_summary <- combined_summary[-c(21), ] # Calculate the absolute number of mono- and oligoconal CTC clusters per category combined_summary$oligo <- combined_summary$n * combined_summary$prop_oligo @@ -205,7 +163,6 @@ contingency_table <- matrix( # Perform Fisher's Exact Test for "2" vs. "3+" fisher.test(contingency_table) - # Generate plot for Fig. 2c complexity_colors <- c("Low" = "#9fc8c8", "Medium" = "#54a1a1", "High" = "#1f6f6f") ggplot(combined_summary, aes(x = as.character(cluster_category), y = prop_oligo, color = complexity)) + @@ -214,4 +171,4 @@ ggplot(combined_summary, aes(x = as.character(cluster_category), y = prop_oligo, theme_classic() + theme(axis.title = element_text(size = 0), legend.title = element_text(size = 0), legend.text = element_text(size = 0), axis.text = element_text(size = 0)) + # Populate labels in Adobe Illustrator scale_color_manual(values = complexity_colors) + - ylim(0, 1) + ylim(0, 1) \ No newline at end of file From c30b80b824c25c7b248923e5ade017fe9c548dd2 Mon Sep 17 00:00:00 2001 From: JohannesGawron Date: Thu, 9 Jan 2025 17:32:00 +0100 Subject: [PATCH 4/5] adding file to compute p value for extended data figure 7a --- .../extended_data_figure_7a_test.py | 232 ++++++++++++++++++ 1 file changed, 232 insertions(+) create mode 100644 experiments/barcoding_experiment/extended_data_figure_7a_test.py diff --git a/experiments/barcoding_experiment/extended_data_figure_7a_test.py b/experiments/barcoding_experiment/extended_data_figure_7a_test.py new file mode 100644 index 0000000..cf1e4e9 --- /dev/null +++ b/experiments/barcoding_experiment/extended_data_figure_7a_test.py @@ -0,0 +1,232 @@ +from pathlib import Path +import logging + +import numpy as np +import pandas as pd +import math +from matplotlib import pyplot as plt +from scipy.stats import combine_pvalues + +# Generate a list of barcode counts files + +logging.basicConfig(level=logging.INFO) + + +def load_cluster_data(path, pattern): + files = [str(file) for file in Path(path).glob('*.csv') if pattern in str(file)] + + # Create empty dictionary to store data frames + df_dict = {} + cluster_sizes = [] + # Loop through all sample barcode counts files + for idx,file in enumerate(files): + basename = Path(file).stem + pattern = next((p for p in ["910", "905", "904", "903", "902", "141", "140"] if p in basename), None) + cluster_size = int(basename.split('_')[1]) + + try: + # Load counts into data frame + # Use pd.read_csv with chunksize for faster processing + df = pd.read_csv(file, sep='\t', header=None, dtype={0: 'str'}) + logging.info(f'File {file} loaded') + except Exception as e: + logging.error(f"Error reading {file} - skipping\n") + logging.error(e) + continue + df.columns = ['Barcode ID', '1', 'Barcode Count', '3'] + + + + # Sort by decreasing barcode counts + df_sorted = df.sort_values(by=df.columns[2], ascending=False) + df_sorted['Mouse ID'] = pattern + df['Cluster Size'] = cluster_size + df['CTC Cluster ID'] = id ### The choice of cluster ID is completely arbitrary and only helps to distinguish between different clusters + # Calculate fraction of total counts for each Barcode + df_sorted['prop_col'] = df_sorted.iloc[:, 2] / df_sorted.iloc[:, 2].sum() + + # Calculate cumulative barcode proportions + df_sorted['cumprop_col'] = df_sorted['prop_col'].cumsum() + + # Find the largest row number for which cumprop_col < 0.9 + + n_clones = len(df_sorted[df_sorted['cumprop_col'] < 0.9].index) + 1 + df_sorted['Clone present in cluster'] = False + df_sorted.loc[df_sorted.index[:n_clones], 'Clone present in cluster'] = True + + # do a quality check to exclude all clusters for which the number of clones exceeds the number of cells in the cluster + + + if not df_sorted['Clone present in cluster'].sum() > cluster_size: + df_dict[basename] = df_sorted + cluster_sizes.append(cluster_size) + logging.info(f'File {file} added to dictionary') + + return df_dict, list(set(cluster_sizes)) + + +def load_primary_data(path): + file_list = ["combined_140_order_filter_merge.rds.csv", "combined_141_order_filter_merge.rds.csv", "combined_902_order_filter_merge.rds.csv", "combined_903_order_filter_merge.rds.csv", "combined_904_order_filter_merge.rds.csv", "combined_905_order_filter_merge.rds.csv", "combined_910_order_filter_merge.rds.csv"] + primary_dict = {} + for file in file_list: + mouse_ID = file.split('_')[1] + primary_dict[mouse_ID] = pd.read_csv(Path(path) / file, sep=',', header=0) + + primary_dict[mouse_ID]['observations'] = primary_dict[mouse_ID]['observations'].astype(str) + return primary_dict + + +def preprocess_primary_data(primary_data): + # Determine the cutoff for each mouse_ID + + for mouse_ID, data in primary_data.items(): + cutoff = data['prop_av'].quantile(0.01) + data_sorted = data.sort_values(by="prop_av", ascending=False) + + # Calculate cumulative barcode proportions + data_sorted['cumulative_proportions'] = data_sorted['prop_av'].cumsum() + + # Find the largest row number for which cumprop_col < 0.9 + + n_clones = len(data_sorted[data_sorted['prop_av'] > cutoff].index)+1 + data_sorted = data_sorted.iloc[:n_clones,:] + primary_data.update({mouse_ID: data_sorted}) + + return primary_data + + # Calculate fraction of total counts for each Barcode + + +def merge_data(files, primary_data): + for mouse_ID, data in primary_data.items(): + for cluster_ID, cluster_data in files.items(): + if cluster_data['Mouse ID'].iloc[0] == mouse_ID: + # Filter out barcode IDs in cluster_data that are not in primary_data + + cluster_data = cluster_data[cluster_data['Barcode ID'].isin(data['observations'])] + + # Check for barcode IDs with "Clone present in cluster" True but not in observations + missing_barcodes = cluster_data[(cluster_data['Clone present in cluster'] == True) & (~cluster_data['Barcode ID'].isin(data['observations']))] + if not missing_barcodes.empty: + logging.warning(f"Mouse ID {mouse_ID}, Cluster ID {cluster_ID} has barcodes with 'Clone present in cluster' True but not in observations: {missing_barcodes['Barcode ID'].tolist()}") + + # Merge the prop_av values from primary_data into cluster_data + + cluster_data = cluster_data.merge(data[['observations', 'prop_av']], left_on='Barcode ID', right_on='observations', how='left') + + cluster_data.drop(columns=['observations'], inplace=True) + + files.update({cluster_ID: cluster_data}) + return files + + +def compute_G_score(clones_in_cluster, n_cells): + return 2*clones_in_cluster["prop_av"].map(lambda x: np.log(1/(1-(1-x) ** n_cells))).sum() + + + +def simulate_G_scores(proportions, n_cells, n_simulations): + G_scores = [] + probabilities = proportions/np.sum(proportions) + + for _ in range(n_simulations): + simulated_counts = np.random.multinomial(n_cells, probabilities) + simulated_data = pd.DataFrame({ + 'simulated_counts': simulated_counts, + 'prop_av': proportions + }) + simulated_data['Clone present in cluster'] = simulated_data['simulated_counts'].apply(lambda x: 1 if x > 0 else 0) + simulated_data = simulated_data[simulated_data['simulated_counts'] > 0] + G_score = compute_G_score(simulated_data, n_cells) + G_scores.append(G_score) + + plt.hist(G_scores, bins=60, edgecolor='k', alpha=0.7) + plt.xlabel('G Score') + plt.ylabel('Frequency') + plt.title('Histogram of Simulated G Scores') + plt.show() + + return G_scores + + +def compute_test(all_clones, n_cells_in_cluster, simulations = None): + + resolution_of_simulation = 10000 + if simulations is None: + G_scores = simulate_G_scores(all_clones['prop_av'], n_cells_in_cluster, resolution_of_simulation) + else: + G_scores = simulations + clones_in_cluster = all_clones[all_clones['Clone present in cluster'] == True] + G = compute_G_score(clones_in_cluster, n_cells_in_cluster) + p_value = sum(G_scores >= G)/resolution_of_simulation + return(p_value) + #p_value = clones_in_cluster["prop_av"].prod()* math.factorial(n_cells_in_cluster)/math.factorial(clones_in_cluster.shape[0]) + + + + +if __name__ == '__main__': + path = '/Users/jgawron/Documents/projects/CTC_backup/validation_experiment/barcode_data/cluster_data' + primary_data_path = '/Users/jgawron/Downloads' + primary_data = load_primary_data(primary_data_path) + primary_data = preprocess_primary_data(primary_data) + + # Create empty dictionary to store data frames + p_values = [] + mouse_models = [] + number_of_simulations = 10000 + for pattern in ["910", "905", "904", "903", "902", "141", "140"]: + files, cluster_sizes = load_cluster_data(path, pattern) + + merged_data = merge_data(files, primary_data) + + first_dataset = next(iter(merged_data.values())) + + print(cluster_sizes) + simulated_G_scores = pd.DataFrame(np.zeros((number_of_simulations, len(cluster_sizes))), columns=[str(cluster_size) for cluster_size in cluster_sizes]) + + for cell_number in cluster_sizes: + if simulated_G_scores[str(cell_number)].sum() == 0: + logging.info(f"Simulating null distribution for cluster size: {cell_number}") + simulated_G_scores[str(cell_number)] = simulate_G_scores(first_dataset['prop_av'], cell_number, number_of_simulations) + + for cluster_id, cluster_data in merged_data.items(): + n_cells = cluster_id.split('_')[1] + p_values.append(compute_test(cluster_data, int(n_cells), simulated_G_scores[n_cells])+1e-16) + mouse_models.append(pattern) + logging.info(f"P value for cluster {cluster_id}: {p_values[-1]}") + + p_value_summary = pd.DataFrame({'P value': p_values, 'Mouse Model': mouse_models}) + logging.info(p_value_summary) + p_value_summary.to_csv('/Users/jgawron/Documents/projects/CTC_backup/validation_experiment/barcode_data/cluster_data/p_values.csv') + + # Combine p-values using Fisher's method + combined_p_value = combine_pvalues(p_values, method='fisher')[1] + print(f'Combined p-value: {combined_p_value}') + plt.hist(p_values, bins=30, edgecolor='k', alpha=0.7) + plt.xlabel('P values') + plt.ylabel('Frequency') + plt.title('Histogram of p_values') + plt.show() + # Plot the p_values stratified by Mouse model + plt.figure(figsize=(10, 6)) + for mouse_model in p_value_summary['Mouse Model'].unique(): + subset = p_value_summary[p_value_summary['Mouse Model'] == mouse_model] + plt.hist(subset['P value'], bins=30, alpha=0.5, label=f'Mouse Model {mouse_model}') + + plt.xlabel('P values') + plt.ylabel('Frequency') + plt.title('Histogram of P values stratified by Mouse Model') + plt.legend() + plt.show() + + """ + for cluster_id, cluster_data in merged_data.items(): + for idx in range(15): + simulate_G_scores(cluster_data['prop_av'], idx, 1000) + n_cells = int(cluster_id.split('_')[1]) + clones_in_cluster = cluster_data[cluster_data['Clone present in cluster'] == True] + G_score = compute_G_score(clones_in_cluster, n_cells) + print(f'Cluster ID: {cluster_id}, G_score: {G_score}') + simulate_G_scores(cluster_data['prop_av'], n_cells, 100000) + """ \ No newline at end of file From 47d3d04a1c10285e99cfdd0280512997566d6f2b Mon Sep 17 00:00:00 2001 From: JohannesGawron Date: Thu, 9 Jan 2025 17:35:33 +0100 Subject: [PATCH 5/5] bugfix --- .pre-commit-config.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 495d4ac..f8a041a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -7,7 +7,6 @@ repos: rev: v0.4.2 hooks: - id: style-files - dry: on - id: parsable-R - id: lintr args: [--warn_only]