-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcancer_analysis_orig.R
104 lines (92 loc) · 3.83 KB
/
cancer_analysis_orig.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
library("vsn")
directory <- "/Users/jenny/Desktop/Cancer Project/Cancer Project/data"
outdir <- "Users/jenny/Desktop/Cancer Project/Cancer Project/output"
directory
list <-list.files(directory)
list
sampleFiles <- grep("normal",list.files(directory),value=TRUE)
head(sampleFiles)
sampleCondition <- sub("(.*normal).*","\\1",sampleFiles)
sampleTable <- data.frame(sampleName = sampleFiles,
fileName = sampleFiles,
condition = sampleCondition)
sampleTable$condition <- factor(sampleTable$condition)
library("DESeq2")
ddsHTSeq <- DESeqDataSetFromHTSeqCount(sampleTable = sampleTable,
directory = directory,
design= ~ condition)
ddsHTSeq
#filter to keep only rows that have 10 reads total. before: 60483 x 519 after: 53996 x 519
keep <- rowSums(counts(ddsHTSeq)) >= 10
ddsHTSeq <- ddsHTSeq[keep,]
head(ddsHTSeq,10)
#for visualization or clustering – it might be useful to work with transformed versions of the count data
#if you have many samples (e.g. 100s), the rlog function might take too long, and so the vst function will be a faster choice
vsd <- vst(ddsHTSeq, blind=FALSE)
#plot the standard deviation of the transformed data, across samples
#against the mean, using the variance stabilizing transformation
#the vertical axis in such plots is the square root of the variance over all samples, so including the variance due to the experimental conditions
sdplot <-meanSdPlot(assay(vsd))
library("ggplot2")
sdplot$gg <- sdplot$gg +
ggtitle(label="Mean SD of transformed read counts") +
ylab("standard deviation")
print(sdplot$gg)
#saving plot not allowed on rstudio
#savePlot(filename = "Mean SD", type = "png",)
#data quality assessment
#heatmap of count matrix
#library("pheatmap")
#library(grid)
#select <- order(rowMeans(counts(ddsHTSeq,normalized=TRUE)),
# decreasing=TRUE)[1:20]
#rownames(df) <- colnames(ddsHTSeq)
#heatmap<-pheatmap(assay(vsd)[select,], cluster_rows=FALSE, show_rownames=FALSE,
# cluster_cols=FALSE, annotation_col=df)
#print(heatmap)
#str(df)
#str(select)
#PCA Plot
pcaData<-plotPCA(vsd, intgroup=c("condition"), returnData=TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color=condition, shape=condition)) +
geom_point(size=3) +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance")) +
coord_fixed()
#to further check outliers
boxplot(log10(assays(ddsHTSeq$condition=="normal")[["cooks"]]), range=0, las=2)
#level represents the control group or which level you want to compare against
ddsHTSeq$condition <- factor(ddsHTSeq$condition, levels = c("normal","notnormal"))
#differential expression testing
ddsHTSeq <- DESeq(ddsHTSeq)
res <- results(ddsHTSeq)
res
write.csv(res, file = "output/DE_Results.csv")
#Log fold change shrinkage for visualization and ranking
resultsNames(ddsHTSeq)
resLFC <- lfcShrink(ddsHTSeq, coef="condition_notnormal_vs_normal", type="apeglm")
write.csv(resLFC, file = "output/DE_Results_lfc.cs")
#order our results table by the smallest p value:
resOrdered <- resLFC[order(resLFC$padj),]
resOrdered
write.csv(resOrdered, file = "output/DE_Results_ordered.csv")
#By default the argument alpha is set to 0.1
res05 <- results(ddsHTSeq, alpha=0.05)
sink("output/log.txt")
print(summary(res05))
sink()
#read counts per gene
#You can select the gene to plot by rowname or by numeric index
#this sample gets the gene which had the smallest p value
plotCounts(ddsHTSeq, gene=which.min(resOrdered$padj), intgroup="condition")
resOrdered
head(resOrdered,10)
#get top 10 genes
colnames(resOrdered)
goi <- rownames(resOrdered)[1:10]
goi
for(i in 1:10) {
d <- plotCounts(ddsHTSeq, gene=goi[i], intgroup="condition", returnData = )
}
plotCounts(ddsHTSeq, gene=which.min(res$padj), intgroup="condition")