-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtsne.rna.r
101 lines (68 loc) · 2.41 KB
/
tsne.rna.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
library(data.table)
setwd("~/SCRATCH/tsne/rna")
source("../tsne.util.r")
load("rna.count.rda")
# variable name "count"
# replace sample names to barcode
meta = fread("../meta/rna.meta.txt")
# aliquot selection
aliquot2 = SelectAliquot(aliquot, my.type = c("01", "03"))
m = match(aliquot2, colnames(count))
count = count[, m]
# QC
# rnaseq data do not have NA. Because this is accross multiple project, zero.ratio set to 0.99
count2 = CompletenessFilter(count, feature.missing = 1, sample.mising = 1, zero.ratio = 0.99, dup.removal = T)
# low count expression filter
low.expr = apply(count2, 1, function(x) sum(x <= 1) / ncol(count2))
count2 = count2[which(low.expr < 0.99, ), ]
save(count2, file="count2.rda")
# make log(tpm)
total = apply(count2, 2, sum)
count2[which(count2 == 0)] = 1
tpm2 = t(t(count2)/total)
tpm2 = log2(tpm2)
# pca
ptm <- proc.time()
pca2 = pca(tpm2)
proc.time() - ptm
save(pca2, file="TCGA.rna.filter1.allgene.pca.rda")
# t-SNE
library(Rtsne)
ptm <- proc.time()
tsne2 = tsne(pca2$x, permute=100, use=50)
proc.time() - ptm
save(tsne2, file="TCGA.rna.allgene.tsne.rda")
aliquot = fread("disease_aliquot.txt")
meta$aliquot = meta$barcode
pdata2 = annotate.tsne(tsne2, meta)
pdata2 = pdata2[, c("aliquot", "feature_1", "feature_2", "disease", "center")]
colnames(pdata2)[4] = "project"
write.table(pdata2, "TCGA.rna.allgene.pdata.txt", col.names=T, row.names=F, sep="\t", quote=F)
png("TCGA.rna.allgene.pdata.perm100.png", width=1200, height=800)
p = ggplot(pdata2, aes(x=feature_1, y=feature_2, col=project, shape = project)) +
geom_point(size = 2) +
scale_shape_manual(values=c(rep(c(0, 2, 8, 16, 17), 6), 0, 2, 8) )
p
dev.off()
#####
low expr
m = apply(tpm2, 1, mean)
w = which(m > 1e-6)
tpm3 = tpm2[w, ]
pca3 = pca(tpm3)
save(pca3, file="TCGA.rna.filter1.lowexpr.pca.rda")
# t-SNE
library(Rtsne)
tsne3 = tsne(pca3$x, permute=100, use=50)
save(tsne3, file="TCGA.rna.lowexpr.tsne.rda")
meta$aliquot = meta$barcode
pdata3 = annotate.tsne(tsne3, meta)
pdata3 = pdata3[, c("aliquot", "feature_1", "feature_2", "disease", "center")]
colnames(pdata3)[4] = "project"
write.table(pdata3, "TCGA.rna.lowexpr.pdata.txt", col.names=T, row.names=F, sep="\t", quote=F)
png("TCGA.rna.lowexpr.pdata.perm100.png", width=1200, height=800)
p = ggplot(pdata3, aes(x=feature_1, y=feature_2, col=project, shape = project)) +
geom_point(size = 2) +
scale_shape_manual(values=c(rep(c(0, 2, 8, 16, 17), 6), 0, 2, 8) )
p
dev.off()