From 2d1251a46a0710c9879e4818ecd8f4125ce6bfc6 Mon Sep 17 00:00:00 2001 From: JohannesGawron Date: Tue, 10 Dec 2024 13:09:54 +0100 Subject: [PATCH] minor adaptations --- .../workflow/Snakefile | 1 - .../workflow/envs/R.yml | 4 + .../workflow/resources/annotateVariants.R | 4 +- .../workflow/scripts/createInputSummary.R | 20 +- .../workflow/scripts/simulateCTCclusters.R | 597 +++++++++++++++--- 5 files changed, 509 insertions(+), 117 deletions(-) diff --git a/experiments/assessing_cluster_clonality/workflow/Snakefile b/experiments/assessing_cluster_clonality/workflow/Snakefile index 18d4761..cbde19e 100644 --- a/experiments/assessing_cluster_clonality/workflow/Snakefile +++ b/experiments/assessing_cluster_clonality/workflow/Snakefile @@ -17,7 +17,6 @@ MARKDOWNS = PROJECT_DIR / "data" / "markdowns" ######Rules###### -include: "rules/common.smk" include: "rules/base.smk" diff --git a/experiments/assessing_cluster_clonality/workflow/envs/R.yml b/experiments/assessing_cluster_clonality/workflow/envs/R.yml index c6715dc..927420c 100644 --- a/experiments/assessing_cluster_clonality/workflow/envs/R.yml +++ b/experiments/assessing_cluster_clonality/workflow/envs/R.yml @@ -9,3 +9,7 @@ dependencies: - r-tidyverse>=2.0 - pandoc>=3.1 - r-heatmaply>=1.5 + - r-optparse>=1.7 + - r-viridis>=0.6 + - r-vgam>=1.1 + - r-pscl>=1.5 diff --git a/experiments/assessing_cluster_clonality/workflow/resources/annotateVariants.R b/experiments/assessing_cluster_clonality/workflow/resources/annotateVariants.R index c77849e..2f87b94 100755 --- a/experiments/assessing_cluster_clonality/workflow/resources/annotateVariants.R +++ b/experiments/assessing_cluster_clonality/workflow/resources/annotateVariants.R @@ -11,12 +11,12 @@ annotate_variants <- function(sampleName, inputFolder, variantList) { # Read VCF file to extract column names - file <- file.path(inputFolder, "filtered", "vcf_files_annotated", paste0(sampleName, ".ann.vcf")) + file <- file.path(inputFolder, "annotations", paste0(sampleName, 
".ann.vcf")) lines <- readLines(file, warn = FALSE) vcf_names <- strsplit(lines[grep("^#CHROM", lines)], "\t")[[1]] # Read VCF file into a data frame - vcf <- read.table(file.path(inputFolder, "filtered", "vcf_files_annotated", paste0(sampleName, ".ann.vcf")), + vcf <- read.table(file.path(inputFolder, "annotations", paste0(sampleName, ".ann.vcf")), comment.char = "#", sep = "\t", header = FALSE, col.names = vcf_names ) colnames(vcf)[1] <- "#CHROM" diff --git a/experiments/assessing_cluster_clonality/workflow/scripts/createInputSummary.R b/experiments/assessing_cluster_clonality/workflow/scripts/createInputSummary.R index d496c50..e668609 100644 --- a/experiments/assessing_cluster_clonality/workflow/scripts/createInputSummary.R +++ b/experiments/assessing_cluster_clonality/workflow/scripts/createInputSummary.R @@ -2,7 +2,7 @@ source("../resources/functions.R") library("optparse") parser <- OptionParser() -parser <- add_option(parser, c("-i", "--input-file"), +parser <- add_option(parser, c("-i", "--input-folder"), type = "character", default = "~/Documents/projects/CTC_backup/input_folder", help = "Path to the folder containing all input files" ) @@ -10,22 +10,23 @@ parser <- add_option(parser, c("-n", "--name-of-tree"), type = "character", default = "Br23", help = "Name of the tree for which to simulate CTC-clusters" ) -args <- parse_args(parser, args = c("--input-file", "--name-of-tree")) +args <- parse_args(parser) -inputFolder <- dirname(args$"input-file") -treeName <- args$name_of_tree +input_folder <- args$"input-folder" +tree_name <- args$"name-of-tree" -# inputFolder <- "~/Documents/projects/CTC_backup/input_folder" -# treeName <- "Br23" +# input_folder <- "~/Documents/projects/CTC_backup/input_folder" +# tree_name <- "Br23" -input <- load_data(inputFolder, treeName) -allClusterSizes <- input$sample_description %>% +input <- load_data(input_folder, tree_name) + +all_cluster_sizes <- input$sample_description %>% filter(WBC == 0 & color != "gray93") %>% 
group_by(color) %>% filter(n() > 1) %>% @@ -33,4 +34,5 @@ allClusterSizes <- input$sample_description %>% dplyr::select("cluster_size") %>% unique() -write_csv(allClusterSizes, file.path(inputFolder, treeName, paste(treeName, "clusterSizes.csv", sep = "_"))) +write_csv(all_cluster_sizes, file.path(input_folder, tree_name, paste(tree_name, "clusterSizes.csv", sep = "_"))) +print("Success.") \ No newline at end of file diff --git a/experiments/assessing_cluster_clonality/workflow/scripts/simulateCTCclusters.R b/experiments/assessing_cluster_clonality/workflow/scripts/simulateCTCclusters.R index 2b420e6..e9c3a2c 100644 --- a/experiments/assessing_cluster_clonality/workflow/scripts/simulateCTCclusters.R +++ b/experiments/assessing_cluster_clonality/workflow/scripts/simulateCTCclusters.R @@ -1,17 +1,27 @@ -source("functions.R") library(viridis) library(VGAM) library(pscl) library(MASS) library(boot) +source("functions.R") library("optparse") ############ # Config ############ + +color_palette <- + list( + "orchid", "orchid1", "orchid2", "orchid3", "orchid4", "darkorchid", + "darkorchid1", "darkorchid2", "darkorchid3", "darkorchid4", "purple", + "purple1", "purple2", "purple3", "purple4" + ) + + + parser <- OptionParser() -parser <- add_option(parser, c("-i", "--input-file"), +parser <- add_option(parser, c("-i", "--input-folder"), type = "character", default = "~/Documents/projects/CTC_backup/input_folder", help = "Path to the folder containing all input files" ) @@ -20,27 +30,33 @@ parser <- add_option(parser, c("-n", "--name-of-tree"), default = "Br23", help = "Name of the tree for which to simulate CTC-clusters" ) parser <- add_option(parser, c("-s", "--simulation-cluster-size"), - type = "character", + type = "numeric", default = "2", help = "Number of cells in the simulated clusters" ) parser <- add_option(parser, c("-o", "--output-folder"), type = "character", default = "~/Documents/projects/CTC_backup/simulations/simulation2", help = "" ) -args <- 
parse_args(parser, args = c("--input-folder", "--name-of-tree", "--simulation-cluster-size", "--output-folder")) - - -inputFolder <- dirname(args$input - file) -treeName <- args$name_of_tree -clusterSize <- args$simulation - cluster - size -outputFolder <- args$output - folder +parser <- add_option(parser, c("-m", "--monoclonal"), + type = "logical", + default = TRUE, help = "" +) -# inputFolder <- "~/Documents/projects/CTC_backup/input_folder" -# treeName <- "Br23" +args <- parse_args(parser) -input <- load_data(inputFolder, treeName) +input_folder <- args$"input-folder" +tree_name <- args$"name-of-tree" +cluster_size <- args$"simulation-cluster-size" +output_folder <- args$"output-folder" +monoclonal <- args$monoclonal +# input_folder <- "~/Documents/projects/CTC_backup/input_folder" +# tree_name <- "Br23" +print(input_folder) +print(tree_name) +input <- load_data(input_folder, tree_name) +print("Input data successfully loaded.") # # ############ @@ -48,7 +64,7 @@ input <- load_data(inputFolder, treeName) # ############ # # -# # input <- load_data(inputFolder, treeName) +# # input <- load_data(input_folder, tree_name) # # totalReadCounts <- input$totalReadCounts # # sampleDescription <- input$sample_description # @@ -112,14 +128,16 @@ input <- load_data(inputFolder, treeName) -#' Fits a zero inflated negative binomial distribution to the total read count data. +#' Fits a zero inflated negative binomial distribution +#' to the total read count data. #' #' @param input The loaded dataset -#' @param zeroInfl If this boolean value is FALSE, then a negative binomial is fit to the data +#' @param zeroInfl If this boolean value is FALSE, +#' then a negative binomial is fit to the data #' #' -#' @return The parameters of the distribution. If zeroInfl is false, then the zero probability -#' is set to 0. +#' @return The parameters of the distribution. If zeroInfl is false, +#' then the zero probability is set to 0. 
#' @export #' #' @examples @@ -130,10 +148,18 @@ fitReadCountDistribution <- function(input, zeroInfl = TRUE) { if (zeroInfl == TRUE) { fit <- zeroinfl(totalReadCountVector ~ 1, dist = "negbin") - return(list(zeroProb = inv.logit(summary(fit)$coefficients$zero[1]), theta = exp(summary(fit)$coefficients$count[2, 1]), expValue = exp(summary(fit)$coefficients$count[1, 1]))) + return( + list( + zeroProb = inv.logit(summary(fit)$coefficients$zero[1]), + theta = exp(summary(fit)$coefficients$count[2, 1]), + expValue = exp(summary(fit)$coefficients$count[1, 1]) + ) + ) } else { fit <- glm.nb(totalReadCountVector ~ 1) - return(list(zeroProb = 0, theta = summary(fit)$theta, expValue = exp(coef(fit)))) + return( + list(zeroProb = 0, theta = summary(fit)$theta, expValue = exp(coef(fit))) + ) } } @@ -167,37 +193,42 @@ fitReadCountDistribution <- function(input, zeroInfl = TRUE) { #' @export #' #' @examples -simulateReads <- function(nWildtypeAlleles, nMutatedAlleles, dropoutRate, errorRate, readCountFit) { - # draw from a binomial model to simulate dropouts - nWildtypeAlleles <- rbinom(1, size = nWildtypeAlleles, prob = (1 - dropoutRate)) - nMutatedAlleles <- rbinom(1, size = nMutatedAlleles, prob = (1 - dropoutRate)) +simulateReads <- + function( + nWildtypeAlleles, nMutatedAlleles, dropoutRate, errorRate, readCountFit) { + # draw from a binomial model to simulate dropouts + nWildtypeAlleles <- rbinom(1, size = nWildtypeAlleles, prob = (1 - dropoutRate)) + nMutatedAlleles <- rbinom(1, size = nMutatedAlleles, prob = (1 - dropoutRate)) - # draw from a negative-binomial to simulate the total read count - isZero <- rbinom(1, size = 1, p = readCountFit$zeroProb) - if (isZero == TRUE) { - nReads <- 0 - } else { - nReads <- rnegbin(1, mu = readCountFit$expValue, theta = readCountFit$theta) - } + # draw from a negative-binomial to simulate the total read count + isZero <- rbinom(1, size = 1, p = readCountFit$zeroProb) + if (isZero == TRUE) { + nReads <- 0 + } else { + nReads <- 
rnegbin(1, mu = readCountFit$expValue, theta = readCountFit$theta) + } - # draw from a beta-binomial to simulate overdispersion through multiple- - # displacement amplification - nWildtypeReads <- rbetabinom.ab(n = 1, size = nReads, shape1 = nWildtypeAlleles, shape2 = nMutatedAlleles) + # draw from a beta-binomial to simulate overdispersion through multiple- + # displacement amplification + nWildtypeReads <- + rbetabinom.ab( + n = 1, size = nReads, shape1 = nWildtypeAlleles, shape2 = nMutatedAlleles + ) - nMutatedReads <- nReads - nWildtypeReads + nMutatedReads <- nReads - nWildtypeReads - # randomly flip the genotypes of reads with a certain error rate - falsePositives <- rbinom(1, size = nReads - nMutatedReads, prob = errorRate) - falseNegatives <- rbinom(1, size = nMutatedReads, prob = errorRate) + # randomly flip the genotypes of reads with a certain error rate + falsePositives <- rbinom(1, size = nReads - nMutatedReads, prob = errorRate) + falseNegatives <- rbinom(1, size = nMutatedReads, prob = errorRate) - nMutatedReads <- nMutatedReads + falsePositives - falseNegatives + nMutatedReads <- nMutatedReads + falsePositives - falseNegatives - return(list(read_counts = c(nReads, nMutatedReads))) -} + return(list(read_counts = c(nReads, nMutatedReads))) + } @@ -208,7 +239,8 @@ simulateReads <- function(nWildtypeAlleles, nMutatedAlleles, dropoutRate, errorR #' Calls genotypes of single cells based on the CTC-SCITE algorithm #' -#' @param nTreeSamplingEvents number of sampled trees. Appricimated postserior gets better the higher this number is. +#' @param nTreeSamplingEvents number of sampled trees. Approximated posterior +#' gets better the higher this number is. #' @param input The loaded data. 
#' #' @return returns a data frame in long format that gives the genotype and the @@ -216,7 +248,7 @@ simulateReads <- function(nWildtypeAlleles, nMutatedAlleles, dropoutRate, errorR #' @export #' #' @examples -getGenotypeMatrix <- function(nTreeSamplingEvents = 1000, input) { +call_genotypes <- function(nTreeSamplingEvents = 1000, input) { postSampling <- input$postSampling nCells <- input$nCells nMutations <- input$nMutations @@ -227,7 +259,12 @@ getGenotypeMatrix <- function(nTreeSamplingEvents = 1000, input) { totalReadCounts <- input$totalReadCounts - desired_values <- sample(1:length(postSampling), size = nTreeSamplingEvents, replace = FALSE) %>% sort() + desired_values <- + sample( + 1:length(postSampling), + size = nTreeSamplingEvents, replace = FALSE + ) %>% + sort() postSampling <- postSampling[desired_values] postSamplingTrees <- lapply(postSampling, FUN = function(entry) { return(entry$Tree) @@ -254,7 +291,10 @@ getGenotypeMatrix <- function(nTreeSamplingEvents = 1000, input) { geom_tile(aes(fill = Posterior)) + scale_fill_viridis() - genotypes$WBC <- input$sample_description$WBC[(genotypes$Sample %>% substr(start = 2, stop = nchar(.)) %>% as.numeric())] + genotypes$WBC <- + input$sample_description$WBC[ + (genotypes$Sample %>% substr(start = 2, stop = nchar(.)) %>% as.numeric()) + ] genotypes %>% mutate(WBC = as.factor(WBC)) %>% @@ -264,11 +304,23 @@ getGenotypeMatrix <- function(nTreeSamplingEvents = 1000, input) { - genotypes <- genotypes %>% mutate(Mutation = as.numeric(Mutation), Genotype = as.integer(Posterior > 0.5)) + genotypes <- + genotypes %>% + mutate( + Mutation = as.numeric(Mutation), Genotype = as.integer(Posterior > 0.5) + ) genotypes %>% - filter(Sample %in% - paste0("X", which(input$sample_description$single_cell == TRUE & input$sample_description$WBC == FALSE))) %>% + filter( + Sample %in% + paste0( + "X", + which( + input$sample_description$single_cell == TRUE & + input$sample_description$WBC == FALSE + ) + ) + ) %>% 
ggplot(aes(Mutation, Sample)) + geom_tile(aes(fill = Genotype)) + scale_fill_viridis() @@ -277,6 +329,292 @@ getGenotypeMatrix <- function(nTreeSamplingEvents = 1000, input) { } +#' Samples a specified number of genotypes +#' +#' @param input the generic input data from the CTC-SCITE tree sampling +#' @param genotypes called genotypes for each cell in long format +#' @param sampling_size a single number that indicates how many cells to sample +#' +#' @return a character vector with the IDs of the sampled cells +#' @export +#' +#' @examples +sample_genotypes <- function(input, genotypes, sampling_size) { + cellIDs <- paste0("X", 1:nrow(input$sample_description)) + + if (sampling_size > length(unique(genotypes$Sample))) { + cells <- + sample( + size = length(unique(genotypes$Sample)), x = cellIDs, replace = FALSE + ) + stop("You want to sample more genotypes than can be provided") + } else { + cells <- sample(size = sampling_size, x = cellIDs, replace = FALSE) + } + return(cells) +} + + + +#' Appends the simulated data to the original data and writes to new files +#' +#' @param output_directory The directory to write the output to +#' @param input The generic tree sampling data +#' @param output_label a name for the simulated dataset files, e.g. 
the number +#' of clusters +#' @param simulated_sample_description The lines for the sample description file +#' adding the simulated CTC clusters +#' @param genotypes_output_format The read counts for the simulated data +#' +#' @return No return, but writes the files to disk +#' @export +#' +#' @examples +create_simulated_output <- + function(output_directory, input, output_label, simulated_sample_description, + genotypes_output_format) { + print("Writing output files") + + dir.create( + file.path( + output_directory, paste(input$sampleName, output_label, sep = "_") + ), + recursive = TRUE + ) + description_data <- + read_delim( + file.path( + input$directory, + input$sampleName, + paste0(input$sampleName, "_samples_nodeDescription.tsv") + ), + delim = "\t", + col_names = FALSE, + quote = "none" + ) + colnames(description_data) <- + c("sample_name", "total_number_cells", "tumor_cells", "WBCs", "description") + + description_data <- + rbind(description_data, simulated_sample_description) + write_delim( + x = description_data, + file = file.path( + output_directory, + paste(input$sampleName, output_label, sep = "_"), + paste0( + input$sampleName, "_", + output_label, + "_samples_nodeDescription.tsv" + ) + ), + delim = "\t", + col_names = FALSE, + quote = "none", + escape = "none" + ) + + read_data <- + read_delim( + file.path( + input$directory, + input$sampleName, + paste0(input$sampleName, ".txt") + ), + delim = "\t", + col_names = FALSE, + escape_backslash = TRUE + ) + + read_data <- cbind(read_data, genotypes_output_format) + + write_delim( + x = read_data, + file = file.path( + output_directory, + paste(input$sampleName, output_label, sep = "_"), + paste0(input$sampleName, "_", output_label, ".txt") + ), + delim = "\t", + col_names = FALSE, + quote = "none", + escape = "none" + ) + } + + +#' Create input files for the CTC SCITE algorithm with +#' one simulated oligoclonal cluster +#' +#' @param input The generic posterior sampling from CTC-SCITE +#' @param 
number_of_cells The size of the output CTC-cluster +#' @param output_directory The directory to write the output to +#' +#' @return +#' @export +#' +#' @examples +simulate_oligoclonals <- function(input, output_directory, number_of_cells, sampling_size = 100) { + read_data <- + read_delim( + file.path( + input$directory, + input$sampleName, + paste0(input$sampleName, ".txt") + ), + delim = "\t", + col_names = FALSE, + escape_backslash = TRUE + ) + + description_data <- + read_delim( + file.path( + input$directory, + input$sampleName, + paste0(input$sampleName, "_samples_nodeDescription.tsv") + ), + delim = "\t", + col_names = FALSE, + quote = "none" + ) + colnames(description_data) <- + c("sample_name", "total_number_cells", "tumor_cells", "WBCs", "description") + + + + + cluster_identity_of_cells <- c() + idx <- 1 + for (cell_count in description_data$total_number_cells) { + for (i in 1:cell_count) { + cluster_identity_of_cells <- c(cluster_identity_of_cells, idx) + } + idx <- idx + 1 + } + + + + genotypes <- call_genotypes(nTreeSamplingEvents = sampling_size, input = input) + genotypes_wide <- + genotypes %>% + dplyr::select(c(Sample, Genotype, Mutation)) %>% + pivot_wider(names_from = "Sample", values_from = Genotype) + + rownames(genotypes_wide) <- genotypes_wide$Mutation + genotypes_wide <- genotypes_wide[, 2:ncol(genotypes_wide)] + + + single_cell_indices <- which(description_data$tumor_cells == 1 & description_data$total_number_cells == 1) + + hamming_distance <- function(x, y) { + return(sum(x != y)) + } + + while (TRUE) { # I sample until I get a cluster where at least two cells are distinct + # Sample cells to merge + cells_to_merge <- + sample(single_cell_indices, number_of_cells, replace = FALSE) + cell_identity <- which(cluster_identity_of_cells %in% cells_to_merge) + + ## Check if all cells have the same genotype: + ## If not, the cluster is oligoclonal and we are fine. 
+ + sum_of_distances <- 0 + for (j in 1:length(cell_identity)) { + for (k in 1:length(cell_identity)) { + sum_of_distances <- + sum_of_distances + + hamming_distance( + genotypes_wide[, cell_identity[j]], + genotypes_wide[, cell_identity[k]] + ) + } + } + if (sum_of_distances > 0) { + break + } + } + + + aggregated_data_ref <- rep(0, dim(read_data)[1]) + aggregated_data_alt <- rep(0, dim(read_data)[1]) + + for (cell in cells_to_merge) { + aggregated_data_ref <- aggregated_data_ref + read_data[, 4 + 2 * cell - 1] + aggregated_data_alt <- aggregated_data_alt + read_data[, 4 + 2 * cell] + } + + columns_to_remove <- c(4 + 2 * cells_to_merge - 1, 4 + 2 * cells_to_merge) + + read_data <- cbind(read_data, aggregated_data_ref, aggregated_data_alt) + read_data <- read_data[, -columns_to_remove] + + output_label <- paste(cells_to_merge, collapse = "_") + + newSample <- data.frame( + sample_name = paste(input$sampleName, "sim", output_label, sep = "_"), + total_number_cells = number_of_cells, tumor_cells = number_of_cells, + WBCs = 0, + description = + paste0( + "[color=", color_palette[[1]], + ',label="', input$sampleName, "_sim", + '",fillcolor=', + color_palette[[1]], + ',image="../CTC-cluster-icons/cluster_', + number_of_cells, + '-0.png"]' + ) + ) + + description_data_output_format <- + rbind(description_data, newSample) + description_data_output_format <- + description_data_output_format[-cells_to_merge, ] + + + + dir.create( + file.path( + output_directory, paste(input$sampleName, output_label, sep = "_") + ), + recursive = TRUE + ) + + write_delim( + x = read_data, + file = file.path( + output_directory, + paste(input$sampleName, output_label, sep = "_"), + paste0(input$sampleName, "_", output_label, ".txt") + ), + delim = "\t", + col_names = FALSE, + quote = "none", + escape = "none" + ) + + write_delim( + x = description_data, + file = file.path( + output_directory, + paste(input$sampleName, output_label, sep = "_"), + paste0( + input$sampleName, "_", + 
output_label, + "_samples_nodeDescription.tsv" + ) + ), + delim = "\t", + col_names = FALSE, + quote = "none", + escape = "none" + ) +} + + + #' Creates the input dataset for CTC-SCITE run with simulated CTC-clusters. #' @@ -293,12 +631,13 @@ getGenotypeMatrix <- function(nTreeSamplingEvents = 1000, input) { #' #' #' @param samplingSize number of trees to determine the genotype of individual cells. -#' To be passed to getGenotypeMatrix. -#' @param clusterSizeVector A number that indicates the cluster complexity to by simulated +#' To be passed to call_genotypes +#' @param cluster_size_vector A number that indicates the cluster complexity to be simulated #' (i.e. the number of cells in the cluster) #' @param input the loaded dataset #' @param output_directory Directory to write the simulated input files for #' the CTC-SCITE run to. +#' @param output_label The number of cells in the simulated cluster #' @param dropoutRate The dropout rate to assume for the simulation #' @param errorRate The error rate to assume for the simulation #' @param seed Set a seed for reproducibility @@ -310,88 +649,115 @@ getGenotypeMatrix <- function(nTreeSamplingEvents = 1000, input) { #' @export #' #' @examples -simulateCTCclusters <- function(samplingSize, clusterSizeVector, input, output_directory, output_label, dropoutRate = 0.3, errorRate = 0.001, seed = 123, zeroInflated = TRUE) { +simulateCTCclusters <- function( + samplingSize, + cluster_size_vector, + input, + output_directory, + output_label, + dropoutRate = 0.3, + errorRate = 0.001, + seed = 123, + zeroInflated = TRUE) { set.seed(seed) - color_palette <- list("orchid", "orchid1", "orchid2", "orchid3", "orchid4", "darkorchid", "darkorchid1", "darkorchid2", "darkorchid3", "darkorchid4", "purple", "purple1", "purple2", "purple3", "purple4") + # color_palette <- + # list( + # "orchid", "orchid1", "orchid2", "orchid3", "orchid4", "darkorchid", + # "darkorchid1", "darkorchid2", "darkorchid3", "darkorchid4", "purple", + # "purple1", 
"purple2", "purple3", "purple4" + # ) - fit <- fitReadCountDistribution(input, zeroInfl = zeroInflated) - print("Calling genotypes") - # Output data frame in long format. This is essentially a cell x mutation genotype matrix. - # This represents to pool of genotypes from which I can now sample for the simulation. - - genotypes <- getGenotypeMatrix(nTreeSamplingEvents = samplingSize, input = input) + # Output data frame in long format. + # This is essentially a cell x mutation genotype matrix. + # This represents to pool of genotypes from which + # I can now sample for the simulation. + genotypes <- + call_genotypes(nTreeSamplingEvents = samplingSize, input = input) - genotypesOutputFormat <- data.frame(matrix(0, nrow = input$nMutations, ncol = 0)) - sampleDescriptionOutputFormat <- data.frame(matrix(0, nrow = 0, ncol = 5)) - colnames(sampleDescriptionOutputFormat) <- c("sample_name", "total_number_cells", "tumor_cells", "WBCs", "description") + fit <- fitReadCountDistribution(input, zeroInfl = zeroInflated) - ## Sample as many genotypes as there should be simulated clusters. 
- cellIDs <- paste0("X", 1:nrow(input$sample_description)) + cells <- + sample_genotypes( + input = input, + genotypes = genotypes, + sampling_size = sum(cluster_size_vector) + ) - if (sum(clusterSizeVector) > length(unique(genotypes$Sample))) { - cells <- sample(size = length(unique(genotypes$Sample)), x = cellIDs, replace = FALSE) - stop("You want to sample more genotypes than can be provided") - } else { - cells <- sample(size = sum(clusterSizeVector), x = cellIDs, replace = FALSE) - } + genotypes_output_format <- + data.frame(matrix(0, nrow = input$nMutations, ncol = 0)) + sample_description_output_format <- data.frame(matrix(0, nrow = 0, ncol = 5)) + colnames(sample_description_output_format) <- + c("sample_name", "total_number_cells", "tumor_cells", "WBCs", "description") iterator <- 0 - # iterating over the size of the clusters to be simulated - for (clusterSize in 1:length(clusterSizeVector)) { - # iterating over the number of clusters of the same size to be simulated. Here not a for loop, to avoid backwards counting in R. + for (size_of_cluster in 1:length(cluster_size_vector)) { + # iterating over the number of clusters of the same size to be simulated. + # Here not a for loop, to avoid backwards counting in R. 
clustersBySize <- 1 - while (clustersBySize <= clusterSizeVector[clusterSize]) { + while (clustersBySize <= cluster_size_vector[size_of_cluster]) { print(paste("Simulating CTC cluster ", iterator)) - print(paste("Number of cells: ", clusterSize)) + print(paste("Number of cells: ", size_of_cluster)) genotype <- genotypes %>% filter(Sample == cells[clustersBySize]) %>% arrange(Mutation) genotype <- pull(genotype, Genotype) - nMutatedAlleles <- clusterSize * genotype - nAllelesTotal <- clusterSize * rep(2, length(genotype)) + nMutatedAlleles <- size_of_cluster * genotype + nAllelesTotal <- size_of_cluster * rep(2, length(genotype)) nWildtypeAlleles <- nAllelesTotal - nMutatedAlleles - data <- data.frame(nWildtypeAlleles = nWildtypeAlleles, nMutatedAlleles = nMutatedAlleles) + data <- data.frame( + nWildtypeAlleles = nWildtypeAlleles, nMutatedAlleles = nMutatedAlleles + ) print("Starting simulation of read counts") reads <- apply(data, FUN = function(x) { - return(simulateReads(x[1], x[2], dropoutRate, errorRate, fit)$read_counts) + return( + simulateReads(x[1], x[2], dropoutRate, errorRate, fit)$read_counts + ) }, MARGIN = 1) %>% t() - genotypesOutputFormat <- cbind(genotypesOutputFormat, reads) + genotypes_output_format <- cbind(genotypes_output_format, reads) print("Done") newSample <- data.frame( sample_name = paste0(input$sampleName, "_sim", iterator), - total_number_cells = clusterSize, tumor_cells = clusterSize, + total_number_cells = size_of_cluster, tumor_cells = size_of_cluster, WBCs = 0, description = - paste0("[color=", color_palette[[iterator + 1]], ',label="', input$sampleName, "_sim", iterator, '",fillcolor=', color_palette[[iterator + 1]], ',image="../CTC-cluster-icons/cluster_', clusterSize, '-0.png"]') + paste0( + "[color=", color_palette[[iterator + 1]], + ',label="', input$sampleName, "_sim", + iterator, + '",fillcolor=', + color_palette[[iterator + 1]], + ',image="../CTC-cluster-icons/cluster_', + size_of_cluster, + '-0.png"]' + ) ) - 
sampleDescriptionOutputFormat <- rbind(sampleDescriptionOutputFormat, newSample) + sample_description_output_format <- + rbind(sample_description_output_format, newSample) iterator <- iterator + 1 clustersBySize <- clustersBySize + 1 } - } - print("Writing output files") - - dir.create(file.path(output_directory, paste(input$sampleName, output_label, sep = "_")), recursive = TRUE) - description_data <- read_delim(file.path(input$directory, input$sampleName, paste0(input$sampleName, "_samples_nodeDescription.tsv")), delim = "\t", col_names = FALSE, quote = "none") - colnames(description_data) <- c("sample_name", "total_number_cells", "tumor_cells", "WBCs", "description") - description_data <- rbind(description_data, sampleDescriptionOutputFormat) - write_delim(x = description_data, file = file.path(output_directory, paste(input$sampleName, output_label, sep = "_"), paste0(input$sampleName, "_", output_label, "_samples_nodeDescription.tsv")), delim = "\t", col_names = FALSE, quote = "none", escape = "none") - - read_data <- read_delim(file.path(input$directory, input$sampleName, paste0(input$sampleName, ".txt")), delim = "\t", col_names = FALSE, escape_backslash = TRUE) - read_data <- cbind(read_data, genotypesOutputFormat) - write_delim(x = read_data, file = file.path(output_directory, paste(input$sampleName, output_label, sep = "_"), paste0(input$sampleName, "_", output_label, ".txt")), delim = "\t", col_names = FALSE, quote = "none", escape = "none") + if (cluster_size_vector[size_of_cluster] > 0) { + create_simulated_output( + output_directory, + input, + size_of_cluster, + sample_description_output_format, + genotypes_output_format + ) + } + } } @@ -399,19 +765,40 @@ simulateCTCclusters <- function(samplingSize, clusterSizeVector, input, output_d # for (tree in c("Br11", "Br16_AC_max2", "Br16_AC_max3", "Br16_AC_max4", "Br16_B_max1", "Br16_B_max2", "Br16_B_max3", "Br16_B_max4", "Br16_C_max1", "Br16_C_max2", "Br16_C_max3", "Br23", "Br26", "Br30", "Br37", "Br38", 
"Br39", "Br44", "Br45", "Br46", "Br53", "Br57", "Brx50", "Lu2", "Lu7", "Ov8", "Pr6", "Pr9")) {} -clusterSizeVector <- c(0, 4, 3, 2, 2, 2, 2, 2, 2) -keep <- rep(0, length(clusterSizeVector)) -keep[clusterSize] <- 1 -clusterSizeVector[keep == 0] <- 0 +cluster_size_vector <- c(0, 4, 3, 2, 2, 2, 2, 2, 2) -print(paste("Running simulation for", tree)) +print(paste("Running simulation for", tree_name)) -simulateCTCclusters( - samplingSize = 100, clusterSizeVector = clusterSizeVector, input, - output_directory = outputFolder, output_label = output_label, - dropoutRate = 0.35, errorRate = 0.0015, seed = 124, - zeroInflated = TRUE -) +all_cluster_sizes <- input$sample_description %>% + filter(WBC == 0 & color != "gray93") %>% + group_by(color) %>% + filter(n() > 1) %>% + summarize(cluster_size = n()) %>% + dplyr::select("cluster_size") %>% + unique() + + +if (monoclonal == TRUE) { + keep <- rep(0, length(cluster_size_vector)) + keep[cluster_size] <- 1 + cluster_size_vector[keep == 0] <- 0 + print("Simulating monoclonal clusters.") + simulateCTCclusters( + samplingSize = 100, cluster_size_vector = cluster_size_vector, input, + output_directory = output_folder, output_label = output_label, + dropoutRate = 0.35, errorRate = 0.0015, seed = 124, + zeroInflated = TRUE + ) +} else { + for (idx in 1:nrow(all_cluster_sizes)) { + cluster_size <- all_cluster_sizes$cluster_size[idx] + print("Simulating oligoclonal clusters.") + for (idx2 in 1:cluster_size_vector[cluster_size]) { + simulate_oligoclonals(input, output_folder, cluster_size, sampling_size = 100) + } + print("Success.") + } +}