-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcnv.collection.r
154 lines (123 loc) · 4.88 KB
/
cnv.collection.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
# get.hg38.segmean.r
# This is a modifed version GetGeneLevelCNA.r from
# https://github.com/ZhenyuZ/eqtl/blob/master/cnv/GetGeneLevelCNA.r
#
# Input GDC segmentation data, window size and calculate segment mean
# in genomic regions every window.size bases
# split chromosome by window.size to create region
# input "chromosome" contains two columns in order: chr and len
# window.size is any interger > 1
make.region = function(chromosomes, window.size) {
chromosome$nseg = ceiling(chromosome$len / window.size)
region = data.frame(chr = unlist(apply(chromosome, 1, function(x) rep(x[1], x[3]))),
start = unlist(apply(chromosome, 1, function(x) (seq(as.numeric(x[3]))-1)*window.size + 1)),
end = unlist(apply(chromosome, 1, function(x) c((seq(as.numeric(x[3]))-1)*window.size, as.numeric(x[2]))[-1]))
)
region$name = with(region, paste0(chr, ":", start, "-", end))
return(region)
}
# read segmentation file and convert to GRanges object (only autosomes)
seg2gr = function(file){
seg = read.table(file, h=T, stringsAsFactors=F)
# load into granges object
gr = with(seg[seg$Chromosome %in% 1:22, ], GRanges( seqnames = paste0("chr", Chromosome),
ranges = IRanges(start = Start, end = End),
Segment_Mean = Segment_Mean))
return(gr)
}
# calculate mean of segment_mean in gr by region
# region is a data frame containing the following 4 columns in order: chr, start, end, name
get.segmean = function(gr, region) {
window.mean = NULL
for (chr in chromosome$chr) {
my.gr = gr[seqnames(gr) == chr]
my.region = region[region$chr == chr, ]
window.mean = c(window.mean, apply(my.region, 1,
function(x){y = restrict(my.gr, start = as.numeric(x[2]), end = as.numeric(x[3]));
weighted.mean(y$Segment_Mean, width(y))}) )
}
names(window.mean) = region$name
return(window.mean)
}
# calculate extremum of segment_mean in gr by region
# region is a data frame containing the following 4 columns in order: chr, start, end, name
get.extremum = function(gr, region) {
window.extremum = NULL
for (chr in chromosome$chr) {
my.gr = gr[seqnames(gr) == chr]
my.region = region[region$chr == chr, ]
window.extremum = c(window.extremum, unlist(apply(my.region, 1,
function(x){y = restrict(my.gr, start = as.numeric(x[2]), end = as.numeric(x[3]))$Segment_Mean;
if(length(y) > 0) {y[which.max(abs(y))]} else { NA }})))
}
names(window.extremum) = region$name
return(window.extremum)
}
library(data.table)
library(GenomicRanges)
# load library and init
options(stringsAsFactors=F)
options(scipen=999)
# read file list
nocnv.files = fread("~/SCRATCH/cbs/data/all/nocnv", h=F)$V1
n = length(nocnv.files )
# collect data into list
segs = array(list(), n)
for (i in 1:n) {
if(i %% 100 == 0) {
cat(i, "out of", n, "done\n")
}
f = nocnv.files[i]
segs[[i]] = seg2gr(paste0("~/SCRATCH/cbs/data/all/", f))
}
# annotate aliquot names
meta = fread("../meta/snp6.sample.txt", h=T)
meta$core.name = sapply(meta$filename, function(x) strsplit(x, "\\.")[[1]][1])
m = match(sapply(nocnv.files, function(x) strsplit(x, "\\.")[[1]][1]), meta$core.name)
meta = meta[m, ]
names(segs) = meta$aliquot_barcode
save(segs, file = "TCGA.nocnv.segment.rda")
# reconstruct autosomal chromosome information for GRCh38
chromosome = data.frame(chr=paste0("chr", 1:22), len=c(248956422, 242193529, 198295559, 190214555, 181538259,
170805979, 159345973, 145138636, 138394717, 133797422, 135086622, 133275309, 114364328, 107043718,
101991189, 90338345, 83257441, 80373285, 58617616, 64444167, 46709983, 50818468))
# subset by window.size and get extreme cnvs
window.size = 5e6
region = make.region(chromosome, window.size)
extremum = matrix(0, nrow(region), n)
for (i in 1:n) {
if(i %% 100 == 0) {
cat(i, "out of", n, "done\n")
}
gr = segs[[i]]
extremum[, i] = get.extremum(gr, region)
}
library(data.table)
cnv = fread("TCGA.cnv.window_1M.txt", skip=1)
region = cnv$V1
cnv = as.matrix(cnv[, 2:ncol(cnv), with=F])
header = readLines("TCGA.cnv.window_1M.txt", n=1)
aliquot = strsplit(header, "\t")[[1]]
colnames(cnv) = aliquot
rownames(cnv) = region
save(cnv, file="TCGA.cnv.1M.matrix.rda")
# aliquot selection
meta = fread("../meta/snp6.sample.txt", h=T)
aliquot2 = SelectAliquot(aliquot, my.type = c("01", "03"))
cnv = cnv[ , match(aliquot2, aliquot)]
meta = meta[match(aliquot2, meta$aliquot_barcode), ]
# cut off
cnv2 = cnv
cutoff = 0.1
cnv2[which(is.na(cnv2))] = 0
cnv2[which(cnv2 < -cutoff)] = -1
cnv2[which(cnv2 > cutoff)] = 1
cnv2[which(cnv2 < cutoff & cnv > -cutoff)] = 0
cnv2 = CompletenessFilter(cnv2, feature.missing = 1, sample.mising = 1, dup.removal = T)
w = duplicated(cnv2)
cnv2 = cnv2[-w, ]
library(Rtsne)
pca2 = pca(cnv2)
x = pca2$x
w = duplicated(x)
tsne2 = tsne(x, permute=1, use=200)