diff --git a/experiments/barcoding_experiment/Inferring_the_clonality_of_barcoded_CTC_clusters.R b/experiments/barcoding_experiment/Inferring_the_clonality_of_barcoded_CTC_clusters.R index 050acb7..6ddfc0e 100644 --- a/experiments/barcoding_experiment/Inferring_the_clonality_of_barcoded_CTC_clusters.R +++ b/experiments/barcoding_experiment/Inferring_the_clonality_of_barcoded_CTC_clusters.R @@ -165,10 +165,10 @@ medium <- data.frame(Clonality = c("Monoclonal", "Oligoclonal"), n = c(52, 21)) high <- data.frame(Clonality = c("Monoclonal", "Oligoclonal"), n = c(14, 40)) -# Generate a donut plot illustrating mono- and oligoclonal CTC cluster counts for each complexity level -PieDonut(low, aes(Clonality, count=n), showPieName = FALSE, donutLabelSize = 0, labelpositionThreshold = 0.01, explode = 1, r0 = 0.5, r1 = 0.95, pieLabelSize = 0, showRatioDonut = FALSE) -PieDonut(medium, aes(Clonality, count=n), showPieName = FALSE, donutLabelSize = 0, labelpositionThreshold = 0.01, explode = 1, r0 = 0.5, r1 = 0.95, pieLabelSize = 0, showRatioDonut = FALSE) -PieDonut(high, aes(Clonality, count=n), showPieName = FALSE, donutLabelSize = 0, labelpositionThreshold = 0.01, explode = 1, r0 = 0.5, r1 = 0.95, pieLabelSize = 0, showRatioDonut = FALSE) +# Generate a donut plot illustrating mono- and oligoclonal CTC cluster counts for each complexity level +PieDonut(low, aes(Clonality, count = n), showPieName = FALSE, donutLabelSize = 0, labelpositionThreshold = 0.01, explode = 1, r0 = 0.5, r1 = 0.95, pieLabelSize = 0, showRatioDonut = FALSE) +PieDonut(medium, aes(Clonality, count = n), showPieName = FALSE, donutLabelSize = 0, labelpositionThreshold = 0.01, explode = 1, r0 = 0.5, r1 = 0.95, pieLabelSize = 0, showRatioDonut = FALSE) +PieDonut(high, aes(Clonality, count = n), showPieName = FALSE, donutLabelSize = 0, labelpositionThreshold = 0.01, explode = 1, r0 = 0.5, r1 = 0.95, pieLabelSize = 0, showRatioDonut = FALSE) # Create a data frame specifying for each tumor sample the proportion of oligoclonal CTC clusters in categories "2" and "3+" diff --git a/experiments/barcoding_experiment/extended_data_figure_7a.R b/experiments/barcoding_experiment/extended_data_figure_7a.R index 71df106..90fd92f 100644 --- a/experiments/barcoding_experiment/extended_data_figure_7a.R +++ b/experiments/barcoding_experiment/extended_data_figure_7a.R @@ -1,77 +1,148 @@ -library(tidyverse) +library(ggplot2) +library(boot) +library(smoothr) -data <- readRDS("combi_df.rds") -View(data) +setwd("~/Downloads") +tables <- list() +filelist <- c("combined_140_order_filter_merge.rds", "combined_141_order_filter_merge.rds", "combined_902_order_filter_merge.rds", "combined_903_order_filter_merge.rds", "combined_904_order_filter_merge.rds", "combined_905_order_filter_merge.rds", "combined_910_order_filter_merge.rds") +for (file in filelist) { + tables[[file]] <- readRDS(file) + tables[[file]]$observations <- paste0(file, tables[[file]]$observations) + first_row_with_nonzero_counts <- tables[[file]][tables[[file]]$freq != 0, ][1, ] + total_counts <- first_row_with_nonzero_counts$counts / first_row_with_nonzero_counts$freq + tables[[file]]$total_counts <- total_counts + cutoff <- quantile(tables[[file]]$prop_av, prob = 0.01) + tables[[file]] <- tables[[file]] %>% + filter(prop_av > cutoff) %>% + filter(prop_av != 0) +} -data <- data %>% mutate(counts / freq, complexity = as.numeric(complexity)) -summary(data) +# Merge all tables +merged_table <- do.call(rbind, tables) +unique(merged_table$total_counts) -data2 <- data %>% - group_by(observations) %>% - slice(rep(1:n(), counts / freq)) %>% - mutate(present = row_number() <= first(counts)) %>% - ungroup() +merged_table <- merged_table %>% mutate(complexity = as.numeric(complexity)) -fit <- glm(present ~ log(prop_av), data = data2, family = binomial(link = "log")) +summary(merged_table) -summary(fit) -confint(fit) -coef(fit) -data %>% - ggplot(aes(y = log(freq), x = log(prop_av))) + - geom_point() + - labs(x = "log(tumorVAF)", y = "log(fraction among CTC clusters)") + - geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2]) +merged_table2 <- merged_table %>% + group_by(observations) %>% + slice(rep(1:n(), total_counts)) %>% + mutate(present = as.integer(row_number() <= first(counts))) %>% + ungroup() -data %>% - ggplot(aes(y = log(ratio), x = log(prop_av))) + +fit <- + glm( + present ~ log(prop_av), + data = merged_table2, family = binomial(link = "logit") + ) +fit1 <- + glm( + present ~ prop_av, + data = merged_table2, family = binomial(link = "logit") + ) +fit2 <- + glm( + present ~ log(prop_av), + data = merged_table2, family = binomial(link = "log") + ) +fit3 <- + glm(present ~ logit(prop_av), + data = merged_table2, family = binomial(link = "logit") + ) +fit4 <- + glm(present ~ logit(prop_av), + data = merged_table2, family = binomial(link = "log") + ) + +AIC(fit) +AIC(fit1) +AIC(fit2) +AIC(fit3) +AIC(fit4) + +### fit3 and fit4 have approximately the same AIC, but fit 3 transforms the x +## coordinates to a well defined range, so I will use fit3. +fit_linear_model <- lm(freq ~ log(prop_av), data = merged_table) +AIC(fit_linear_model) + +merged_table %>% + ggplot(aes(y = freq, x = prop_av)) + geom_point() + - geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2] - 1) + - labs(x = "log(VAF in primary tumor)", y = "log(fold-change in prevalence)") + - theme_minimal() + labs(x = "tumor VAF", y = "fraction among CTC clusters") + + geom_smooth(method = "lm", formula = y ~ x) +merged_table %>% + ggplot(aes(y = freq, x = log(prop_av))) + + geom_point() + + labs(x = "tumor VAF", y = "fraction among CTC clusters") + + geom_smooth(method = "lm") +merged_table %>% + ggplot(aes(y = logit(freq), x = logit(prop_av))) + + geom_point() + + labs(x = "logit(tumor VAF)", y = "logit(fraction among CTC clusters)") + + geom_smooth(method = "lm", formula = y ~ x) -pdf("~/Desktop/clonal_prevalence.pdf", width = "3.6", height = "3.6") -par(bty = "l") -plot( - x = log(data$prop_av), - y = log(data$ratio), - cex = 0.8, - pch = 19, - col = "#81A9A9", - xlab = "log( VAF in primary tumor )", - ylab = "log( fold-change in prevalence in CTC clusters )" -) -abline(b = coef(fit)[2] - 1, a = coef(fit)[1], col = "#3C8181", lwd = 2.2) -text(x = max(log(data$prop_av)) - 5, y = max(log(data$ratio)) - 3.5, labels = sprintf("slope = %f***", coef(fit)[2] - 1), pos = 4, col = "black") -dev.off() -summary(fit) -#### The p-value is computed manually. Compare (Regression, Modelle, Methoden -## und Anwendungen, second edition, page 204f; https://www.uni-goettingen.de/de/551357.html) +fit_logit_logit_linear <- + lm(asin(sqrt(freq)) ~ asin(sqrt(prop_av)), data = merged_table) +plot(fit_logit_logit_linear) +primary_VAF <- seq(0.01, 0.55, 0.01) -### The coefficient scaled by the standard error and then squared is -## approximately xi-square (1) distributed with one degree of freedom. +predicted <- predict(fit4, + newdata = data.frame(prop_av = primary_VAF), + type = "response" +) -### Also note that the number of data points 1798 is large enough (>50) for a -## xi-square approximation to work, according to rule of thumb. +color <- "#3C8181" -pseudo_t_value <- as.numeric(coef(fit)["log(prop_av)"] - 1) / as.numeric(summary(fit)$coefficients["log(prop_av)", "Std. Error"]) +ggplot(merged_table2, aes(x = prop_av, y = present)) + + geom_jitter(width = 0, height = 0.1, color = color) + + geom_smooth( + method = "glm", + method.args = list(family = "binomial"), formula = y ~ logit(x), color = color + ) + + labs( + x = "Clonal frequency in primary tumor", + y = "P(barcode is present in CTC cluster)" + ) + + geom_abline(linetype = "dotted", slope = 1, intercept = 0) -wald_statistics <- pseudo_t_value^2 ### xi^2_1 distributed -p.value <- 1 - pchisq(wald_statistics, 1) -print(p.value) -cor(log(data$prop_av), log(data$ratio), method = "pearson") -### The p-value is smaller that measurable by machine precision, so I suggest to report -### the smallest p-value that the generic glm summary can output: +merged_table %>% + ggplot(aes(y = freq, x = prop_av)) + + geom_point() + + labs(x = "tumor VAF", y = "fraction among CTC clusters") + + geom_line(data = data.frame(prop_av = primary_VAF, freq = predicted)) + +means <- rep(NA, 100) +for (i in 1:25) { + window_start <- (i - 1) / 50 + window_end <- i / 50 + means[i] <- merged_table2 %>% + dplyr::filter(prop_av > window_start) %>% + dplyr::filter(prop_av <= window_end) %>% + pull(present) %>% + mean() +} + +ggplot(data.frame(y = means, x = 0:99), aes(x = x, y = y)) + + geom_point() + +ggsave( + "~/Desktop/clonal_prevalence2.pdf", + width = 3.6, height = 3.6, units = "in" +) -### p-value < 2.2e-16 +cor(x = merged_table2$prop_av, y = merged_table2$present, method = "spearman") +cor.test( + x = merged_table2$prop_av, y = merged_table2$present, method = "spearman" +) diff --git a/experiments/barcoding_experiment/extended_data_figure_7b.R b/experiments/barcoding_experiment/extended_data_figure_7b.R index 8761154..96ebd23 100644 --- a/experiments/barcoding_experiment/extended_data_figure_7b.R +++ b/experiments/barcoding_experiment/extended_data_figure_7b.R @@ -1,3 +1,6 @@ +## Author: Johannes Gawron +## Colors and labels in the final figure may deviate through manual manipulation in Adobe Illustrator + library(MCMCprecision) library(ggplot2) library(tidyverse) @@ -54,17 +57,13 @@ for (mouse_model in mouse_models) { print(files) summary_temp <- summary_df_filter_combined[paste0(summary_df_filter_combined$basename, ".csv") %in% files, ] - summary_df_filter_combined <- summary_temp %>% - mutate(group = ifelse(value %in% c(100, 1000), "100-1000", - ifelse(value == 10000, "10000", "50000") - )) - total_number_of_clusters_by_size <- summary_df_filter_combined %>% + total_number_of_clusters_by_size <- summary_temp %>% group_by(cluster_size) %>% summarize(all_clusters = n()) - number_of_monoclonal_cluster_by_size <- summary_df_filter_combined %>% + number_of_monoclonal_cluster_by_size <- summary_temp %>% filter(clonality == "mono") %>% group_by(cluster_size) %>% summarize(monoclonal_clusters = n()) @@ -161,50 +160,9 @@ for (mouse_model in mouse_models) { frequency_table <- mono_clusters / all_clusters -rownames(all_clusters) <- 2:25 -rownames(mono_clusters) <- 2:25 rownames(frequency_table) <- 2:25 rownames(p_values_table) <- 2:25 rownames(fold_change_table) <- 2:25 -rownames(expected_values_null_table) <- 2:25 - - - -expected_values_null_table[is.na(expected_values_null_table)] <- 0 - -expected_values_null_per_sample <- expected_values_null_table %>% apply(MARGIN = 2, FUN = sum) - -expected_proportion_null_per_sample <- expected_values_null_per_sample / all_clusters %>% apply(MARGIN = 2, FUN = sum) - -frequencies <- mono_clusters %>% apply(MARGIN = 2, FUN = sum) / all_clusters %>% apply(MARGIN = 2, FUN = sum) - - -annotations <- ifelse(p_values_columns < 0.001, "***", - ifelse(p_values_columns < 0.01, "**", - ifelse(p_values_columns < 0.05, "*", "ns") - ) -) -annotations <- paste0(mono_clusters %>% apply(MARGIN = 2, FUN = sum), "/", all_clusters %>% apply(MARGIN = 2, FUN = sum), "\n", annotations) -annotation_data <- data.frame(SampleID = rownames(data), annotations = annotations, frequencies = frequencies, expected = expected_proportion_null_per_sample) - -annotation_data$SampleID <- factor(annotation_data$SampleID, levels = annotation_data$SampleID[c(5, 6, 2, 1, 3, 4, 7)]) - - -ggplot(annotation_data, aes(x = SampleID, y = frequencies)) + - ylim(0, 1.1) + - geom_bar(stat = "identity", fill = "#598F8F", alpha = 0.8, width = 0.8, position = position_dodge(width = 0.6)) + - geom_text( - data = annotation_data, aes(x = SampleID, y = frequencies, label = annotations), - position = position_dodge(width = 0.6), vjust = -0.5, size = 4 - ) + - labs(x = "Sample ID", y = "Proportion of monoclonal Clusters") + - theme_classic() + - scale_x_discrete(limits = levels(annotation_data$SampleID)) + - theme( - axis.text = element_text(size = 14), - axis.title = element_text(size = 14) - ) + - geom_point(aes(y = expected), shape = 23, fill = "black", size = 4)