Commit

Merge pull request #5 from JohannesGawron/main

bugfix in extended_data_figure_7b.R and redoing extended_data_figure_7a.R

JohannesGawron authored Aug 14, 2024
2 parents 17b3080 + 6876c30 commit 7901b6b
Showing 3 changed files with 133 additions and 104 deletions.
@@ -165,10 +165,10 @@ medium <- data.frame(Clonality = c("Monoclonal", "Oligoclonal"), n = c(52, 21))
high <- data.frame(Clonality = c("Monoclonal", "Oligoclonal"), n = c(14, 40))


# Generate a donut plot illustrating mono- and oligoclonal CTC cluster counts for each complexity level
PieDonut(low, aes(Clonality, count=n), showPieName = FALSE, donutLabelSize = 0, labelpositionThreshold = 0.01, explode = 1, r0 = 0.5, r1 = 0.95, pieLabelSize = 0, showRatioDonut = FALSE)
PieDonut(medium, aes(Clonality, count=n), showPieName = FALSE, donutLabelSize = 0, labelpositionThreshold = 0.01, explode = 1, r0 = 0.5, r1 = 0.95, pieLabelSize = 0, showRatioDonut = FALSE)
PieDonut(high, aes(Clonality, count=n), showPieName = FALSE, donutLabelSize = 0, labelpositionThreshold = 0.01, explode = 1, r0 = 0.5, r1 = 0.95, pieLabelSize = 0, showRatioDonut = FALSE)
# Generate a donut plot illustrating mono- and oligoclonal CTC cluster counts for each complexity level
PieDonut(low, aes(Clonality, count = n), showPieName = FALSE, donutLabelSize = 0, labelpositionThreshold = 0.01, explode = 1, r0 = 0.5, r1 = 0.95, pieLabelSize = 0, showRatioDonut = FALSE)
PieDonut(medium, aes(Clonality, count = n), showPieName = FALSE, donutLabelSize = 0, labelpositionThreshold = 0.01, explode = 1, r0 = 0.5, r1 = 0.95, pieLabelSize = 0, showRatioDonut = FALSE)
PieDonut(high, aes(Clonality, count = n), showPieName = FALSE, donutLabelSize = 0, labelpositionThreshold = 0.01, explode = 1, r0 = 0.5, r1 = 0.95, pieLabelSize = 0, showRatioDonut = FALSE)
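## A minimal alternative sketch, assuming PieDonut is webr::PieDonut and that the
## `low` data frame defined above is available: the same donut layout can be drawn
## with plain ggplot2 (swap in `medium` or `high` for the other complexity levels).
library(ggplot2)
ggplot(low, aes(x = 2, y = n, fill = Clonality)) +
  geom_col(width = 1, color = "white") +
  coord_polar(theta = "y") +
  xlim(0.5, 2.5) + # extending the x-range below the bar opens the donut hole
  theme_void()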


# Create a data frame specifying for each tumor sample the proportion of oligoclonal CTC clusters in categories "2" and "3+"
177 changes: 124 additions & 53 deletions experiments/barcoding_experiment/extended_data_figure_7a.R
@@ -1,77 +1,148 @@
library(tidyverse)
library(ggplot2)
library(boot)
library(smoothr)

data <- readRDS("combi_df.rds")
View(data)
setwd("~/Downloads")

tables <- list()
filelist <- c("combined_140_order_filter_merge.rds", "combined_141_order_filter_merge.rds", "combined_902_order_filter_merge.rds", "combined_903_order_filter_merge.rds", "combined_904_order_filter_merge.rds", "combined_905_order_filter_merge.rds", "combined_910_order_filter_merge.rds")
for (file in filelist) {
  tables[[file]] <- readRDS(file)
  # Tag observations with their source file so they stay unique after merging
  tables[[file]]$observations <- paste0(file, tables[[file]]$observations)
  # Recover the per-file total count from any row with freq != 0,
  # since freq = counts / total_counts
  first_row_with_nonzero_counts <- tables[[file]][tables[[file]]$freq != 0, ][1, ]
  total_counts <- first_row_with_nonzero_counts$counts / first_row_with_nonzero_counts$freq
  tables[[file]]$total_counts <- total_counts
  # Drop observations with zero or bottom-1%-quantile tumor VAF (prop_av)
  cutoff <- quantile(tables[[file]]$prop_av, prob = 0.01)
  tables[[file]] <- tables[[file]] %>%
    filter(prop_av > cutoff) %>%
    filter(prop_av != 0)
}

data <- data %>% mutate(counts / freq, complexity = as.numeric(complexity))

summary(data)
# Merge all tables
merged_table <- do.call(rbind, tables)
unique(merged_table$total_counts)

data2 <- data %>%
  group_by(observations) %>%
  slice(rep(1:n(), counts / freq)) %>%
  mutate(present = row_number() <= first(counts)) %>%
  ungroup()
merged_table <- merged_table %>% mutate(complexity = as.numeric(complexity))

fit <- glm(present ~ log(prop_av), data = data2, family = binomial(link = "log"))
summary(merged_table)

summary(fit)

confint(fit)
coef(fit)

data %>%
  ggplot(aes(y = log(freq), x = log(prop_av))) +
  geom_point() +
  labs(x = "log(tumorVAF)", y = "log(fraction among CTC clusters)") +
  geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2])
# Expand the per-observation counts into binary outcomes for logistic regression:
# total_counts rows per observation, the first `counts` of which are marked present
merged_table2 <- merged_table %>%
  group_by(observations) %>%
  slice(rep(1:n(), total_counts)) %>%
  mutate(present = as.integer(row_number() <= first(counts))) %>%
  ungroup()
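## A minimal sanity-check sketch, assuming merged_table2 as constructed above: per
## observation, the expanded rows should add back up to counts and total_counts.
merged_table2 %>%
  group_by(observations) %>%
  summarize(n_present = sum(present), n_total = n(), .groups = "drop") %>%
  head()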


data %>%
ggplot(aes(y = log(ratio), x = log(prop_av))) +
fit <-
  glm(
    present ~ log(prop_av),
    data = merged_table2, family = binomial(link = "logit")
  )
fit1 <-
  glm(
    present ~ prop_av,
    data = merged_table2, family = binomial(link = "logit")
  )
fit2 <-
  glm(
    present ~ log(prop_av),
    data = merged_table2, family = binomial(link = "log")
  )
fit3 <-
  glm(present ~ logit(prop_av),
    data = merged_table2, family = binomial(link = "logit")
  )
fit4 <-
  glm(present ~ logit(prop_av),
    data = merged_table2, family = binomial(link = "log")
  )

AIC(fit)
AIC(fit1)
AIC(fit2)
AIC(fit3)
AIC(fit4)

### fit3 and fit4 have approximately the same AIC, but fit3 transforms the x
## coordinates to a well-defined range, so I will use fit3.
fit_linear_model <- lm(freq ~ log(prop_av), data = merged_table)
AIC(fit_linear_model)
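## A sketch for collecting the model comparison in one place, assuming the fits
## above were all created. Note that fit_linear_model is fitted to a different
## response (freq on merged_table), so its AIC is not directly comparable to the
## binomial fits on merged_table2.
sapply(
  list(
    logit_link_log_x = fit, logit_link_raw_x = fit1, log_link_log_x = fit2,
    logit_link_logit_x = fit3, log_link_logit_x = fit4,
    linear_model_log_x = fit_linear_model
  ),
  AIC
)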

merged_table %>%
ggplot(aes(y = freq, x = prop_av)) +
geom_point() +
geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2] - 1) +
labs(x = "log(VAF in primary tumor)", y = "log(fold-change in prevalence)") +
theme_minimal()
labs(x = "tumor VAF", y = "fraction among CTC clusters") +
geom_smooth(method = "lm", formula = y ~ x)

merged_table %>%
  ggplot(aes(y = freq, x = log(prop_av))) +
  geom_point() +
  labs(x = "log(tumor VAF)", y = "fraction among CTC clusters") +
  geom_smooth(method = "lm")


merged_table %>%
  ggplot(aes(y = logit(freq), x = logit(prop_av))) +
  geom_point() +
  labs(x = "logit(tumor VAF)", y = "logit(fraction among CTC clusters)") +
  geom_smooth(method = "lm", formula = y ~ x)

pdf("~/Desktop/clonal_prevalence.pdf", width = "3.6", height = "3.6")
par(bty = "l")
plot(
x = log(data$prop_av),
y = log(data$ratio),
cex = 0.8,
pch = 19,
col = "#81A9A9",
xlab = "log( VAF in primary tumor )",
ylab = "log( fold-change in prevalence in CTC clusters )"
)
abline(b = coef(fit)[2] - 1, a = coef(fit)[1], col = "#3C8181", lwd = 2.2)
text(x = max(log(data$prop_av)) - 5, y = max(log(data$ratio)) - 3.5, labels = sprintf("slope = %f***", coef(fit)[2] - 1), pos = 4, col = "black")
dev.off()

summary(fit)

#### The p-value is computed manually. Compare Regression: Modelle, Methoden
## und Anwendungen, second edition, p. 204f (https://www.uni-goettingen.de/de/551357.html).
fit_logit_logit_linear <-
  lm(asin(sqrt(freq)) ~ asin(sqrt(prop_av)), data = merged_table) # arcsine-square-root transform
plot(fit_logit_logit_linear)
primary_VAF <- seq(0.01, 0.55, 0.01)

### The coefficient, scaled by its standard error and then squared, is
## approximately chi-square distributed with one degree of freedom.
predicted <- predict(fit4,
  newdata = data.frame(prop_av = primary_VAF),
  type = "response"
)

### Also note that the number of data points (1798) is large enough (> 50) for the
## chi-square approximation to hold, according to the usual rule of thumb.
color <- "#3C8181"

pseudo_t_value <- as.numeric(coef(fit)["log(prop_av)"] - 1) /
  as.numeric(summary(fit)$coefficients["log(prop_av)", "Std. Error"])
ggplot(merged_table2, aes(x = prop_av, y = present)) +
  geom_jitter(width = 0, height = 0.1, color = color) +
  geom_smooth(
    method = "glm",
    method.args = list(family = "binomial"), formula = y ~ logit(x), color = color
  ) +
  labs(
    x = "Clonal frequency in primary tumor",
    y = "P(barcode is present in CTC cluster)"
  ) +
  geom_abline(linetype = "dotted", slope = 1, intercept = 0)

wald_statistics <- pseudo_t_value^2 ### chi^2_1 distributed under the null hypothesis

p.value <- 1 - pchisq(wald_statistics, 1)
print(p.value)
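## A sketch of a numerically safer tail computation: for very large Wald statistics,
## 1 - pchisq() loses precision and returns exactly 0, whereas the upper tail can be
## requested directly.
pchisq(wald_statistics, df = 1, lower.tail = FALSE)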
cor(log(data$prop_av), log(data$ratio), method = "pearson")
### The p-value is smaller than machine precision can resolve, so I suggest reporting
### the smallest p-value that the generic glm summary can output:
merged_table %>%
  ggplot(aes(y = freq, x = prop_av)) +
  geom_point() +
  labs(x = "tumor VAF", y = "fraction among CTC clusters") +
  geom_line(data = data.frame(prop_av = primary_VAF, freq = predicted))

# Empirical P(present) in non-overlapping windows of width 0.02 over prop_av in (0, 0.5]
means <- rep(NA_real_, 25)
for (i in 1:25) {
  window_start <- (i - 1) / 50
  window_end <- i / 50
  means[i] <- merged_table2 %>%
    dplyr::filter(prop_av > window_start) %>%
    dplyr::filter(prop_av <= window_end) %>%
    pull(present) %>%
    mean()
}

ggplot(data.frame(y = means, x = (1:25 - 0.5) / 50), aes(x = x, y = y)) +
  geom_point()
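## An equivalent sketch of the binned means using cut() instead of the manual loop,
## assuming merged_table2 with columns prop_av and present:
merged_table2 %>%
  mutate(bin = cut(prop_av, breaks = seq(0, 0.5, by = 1 / 50))) %>%
  filter(!is.na(bin)) %>%
  group_by(bin) %>%
  summarize(mean_present = mean(present), .groups = "drop")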

ggsave(
  "~/Desktop/clonal_prevalence2.pdf",
  width = 3.6, height = 3.6, units = "in"
)

### p-value < 2.2e-16
cor(x = merged_table2$prop_av, y = merged_table2$present, method = "spearman")
cor.test(
  x = merged_table2$prop_av, y = merged_table2$present, method = "spearman"
)
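## A small sketch for reporting the Spearman test programmatically: keep the htest
## object and pull out the estimate and p-value.
spearman_test <- cor.test(
  x = merged_table2$prop_av, y = merged_table2$present, method = "spearman"
)
spearman_test$estimate # Spearman's rho
spearman_test$p.value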
52 changes: 5 additions & 47 deletions experiments/barcoding_experiment/extended_data_figure_7b.R
@@ -1,3 +1,6 @@
## Author: Johannes Gawron
## Colors and labels in the final figure may deviate through manual manipulation in Adobe Illustrator

library(MCMCprecision)
library(ggplot2)
library(tidyverse)
@@ -54,17 +57,13 @@ for (mouse_model in mouse_models) {
print(files)
summary_temp <- summary_df_filter_combined[paste0(summary_df_filter_combined$basename, ".csv") %in% files, ]

summary_df_filter_combined <- summary_temp %>%
  mutate(group = ifelse(value %in% c(100, 1000), "100-1000",
    ifelse(value == 10000, "10000", "50000")
  ))

total_number_of_clusters_by_size <- summary_df_filter_combined %>%
total_number_of_clusters_by_size <- summary_temp %>%
  group_by(cluster_size) %>%
  summarize(all_clusters = n())


number_of_monoclonal_cluster_by_size <- summary_df_filter_combined %>%
number_of_monoclonal_cluster_by_size <- summary_temp %>%
  filter(clonality == "mono") %>%
  group_by(cluster_size) %>%
  summarize(monoclonal_clusters = n())
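## A sketch of the per-size monoclonal fraction obtained by joining the two
## summaries above; cluster sizes without any monoclonal clusters get a count of 0.
## (Illustrative only; the script below derives the frequencies from the
## mono_clusters and all_clusters tables.)
left_join(
  total_number_of_clusters_by_size, number_of_monoclonal_cluster_by_size,
  by = "cluster_size"
) %>%
  mutate(
    monoclonal_fraction = tidyr::replace_na(monoclonal_clusters, 0) / all_clusters
  )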
@@ -161,50 +160,9 @@ for (mouse_model in mouse_models) {


frequency_table <- mono_clusters / all_clusters
rownames(all_clusters) <- 2:25
rownames(mono_clusters) <- 2:25
rownames(frequency_table) <- 2:25
rownames(p_values_table) <- 2:25
rownames(fold_change_table) <- 2:25
rownames(expected_values_null_table) <- 2:25



expected_values_null_table[is.na(expected_values_null_table)] <- 0

expected_values_null_per_sample <- expected_values_null_table %>% apply(MARGIN = 2, FUN = sum)

expected_proportion_null_per_sample <- expected_values_null_per_sample / all_clusters %>% apply(MARGIN = 2, FUN = sum)

frequencies <- mono_clusters %>% apply(MARGIN = 2, FUN = sum) / all_clusters %>% apply(MARGIN = 2, FUN = sum)


annotations <- ifelse(p_values_columns < 0.001, "***",
  ifelse(p_values_columns < 0.01, "**",
    ifelse(p_values_columns < 0.05, "*", "ns")
  )
)
annotations <- paste0(
  mono_clusters %>% apply(MARGIN = 2, FUN = sum), "/",
  all_clusters %>% apply(MARGIN = 2, FUN = sum), "\n", annotations
)
annotation_data <- data.frame(
  SampleID = rownames(data), annotations = annotations,
  frequencies = frequencies, expected = expected_proportion_null_per_sample
)

annotation_data$SampleID <- factor(annotation_data$SampleID, levels = annotation_data$SampleID[c(5, 6, 2, 1, 3, 4, 7)])


ggplot(annotation_data, aes(x = SampleID, y = frequencies)) +
  ylim(0, 1.1) +
  geom_bar(stat = "identity", fill = "#598F8F", alpha = 0.8, width = 0.8, position = position_dodge(width = 0.6)) +
  geom_text(
    data = annotation_data, aes(x = SampleID, y = frequencies, label = annotations),
    position = position_dodge(width = 0.6), vjust = -0.5, size = 4
  ) +
  labs(x = "Sample ID", y = "Proportion of monoclonal Clusters") +
  theme_classic() +
  scale_x_discrete(limits = levels(annotation_data$SampleID)) +
  theme(
    axis.text = element_text(size = 14),
    axis.title = element_text(size = 14)
  ) +
  geom_point(aes(y = expected), shape = 23, fill = "black", size = 4)



