-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDGE_edgeR_Pipeline_inProgress.R
111 lines (85 loc) · 3.15 KB
/
DGE_edgeR_Pipeline_inProgress.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#--- set working directory
setwd("/mnt/DATA/LairdLab/ernesto/kob2022_ePGCtoEGC")
#---- load packages
{
library(dplyr)
library(DESeq2)
library(edgeR)
library(stringr)
library(ggplot2)
library(ggrepel)
library(GEOquery)
library(tidyr)
library(EnhancedVolcano)
library(org.Hs.eg.db)
library(clusterProfiler)
library(enrichplot)
}
counts_data <- read.table(file = "data/bulk/GSE174465_Normalized_counts.txt",
sep = "\t", header = T, stringsAsFactors = FALSE)
sample_info <- metadata %>%
select(1,8,44,45,46) %>%
rename_with(.cols = 1, ~"sample_id") %>%
rename_with(.cols = 2, ~"group") %>%
rename_with(.cols = 3, ~"clone") %>%
rename_with(.cols = 4, ~"cell_info") %>%
rename_with(.cols = 5, ~"sex") %>%
mutate(group = gsub("human ", "", group))
head(sample_info)
head(dat)
counts_data <- dat
rownames(counts_data) <- counts_data$X
counts_data$X <- NULL
head(counts_data)
if (!all(colnames(counts_data) %in% sample_info$sample_id)) {
stop("The sample names in the counts data do not match the sample information.")
}
# Create a DGEList object from the counts data
dge <- DGEList(counts = counts_data)
# Assign group information to the DGEList object
dge$samples$group <- factor(sample_info$group)
# Estimate common dispersion
dge <- estimateCommonDisp(dge)
# Estimate tagwise dispersion
dge <- estimateTagwiseDisp(dge)
# Perform the exact test for differential expression
et <- exactTest(dge, pair=cbind("PGCLC", "EG")) # Replace "group1" and "group2" with your specific group names
# Extract and sort the results by FDR (False Discovery Rate)
degs <- topTags(et, n = Inf, p.value = 0.05, adjust.method = "BH")
# Save the differentially expressed genes to a file
write.csv(degs, file = "bulk_analysis/results/differentially_expressed_genes_edgeR.csv")
--------
# Load required libraries
library(pheatmap)
# Filter the top N most variable genes for the heatmap (adjust N as desired)
N <- 100
most_variable_genes <- head(order(rowVars(dge$counts), decreasing = TRUE), n = N)
# Extract the log-CPM values of the top N most variable genes
log_cpm <- cpm(dge, log = TRUE)
log_cpm_top_genes <- log_cpm[most_variable_genes, ]
# Create the heatmap
pheatmap(log_cpm_top_genes,
scale = "row", # Scale genes (rows) to have mean zero and standard deviation one
cluster_rows = TRUE,
cluster_cols = TRUE,
show_rownames = FALSE,
show_colnames = TRUE)
# Load required libraries
library(ggplot2)
# Extract the results from the edgeR output
results <- et$table
results$Gene <- rownames(results)
results$FDR <- p.adjust(results$PValue, method = "BH")
# Calculate log2 fold change and -log10 adjusted p-value
results$log2FoldChange <- results$logFC
results$minusLog10AdjustedPvalue <- -log10(results$FDR)
# Create a volcano plot with a significance threshold of FDR < 0.05
threshold <- 0.05
ggplot(results, aes(x = log2FoldChange, y = minusLog10AdjustedPvalue)) +
geom_point(aes(color = FDR < threshold), alpha = 0.8, size = 2) +
scale_color_manual(values = c("grey", "red")) +
theme_minimal() +
theme(legend.position = "none") +
xlab("log2(Fold Change)") +
ylab("-log10(FDR)") +
ggtitle("Volcano Plot")