-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmethylation maps.r
285 lines (249 loc) · 11.8 KB
/
methylation maps.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
############## METHYLATION MAPS OF SCAFFOLDS #############
rm(list=ls(all=TRUE)) #empty memory
Sys.setenv(http_proxy="http://proxy.unicaen.fr:3128/") #proxy settings for R internet access
##########
setwd("D:/BOULOT/RECHERCHE/MeDIPseq/analyse_MeDIP/ElizabethCrowell/Bouinages_Guillaume/methylmaps/bedfiles")
getwd()
### Preparing data
## Chargement des données, originales dans Elizabeth Data\BAMtoBED
message("loading data...")
#bed sample files
#df<-read.table("troc2s.bed", header=F, sep="\t")
#head(df)
# elimination des lignes ne correspondant pas au scaffold à mapper pour GenomeCoverage de BEDtools #
#df1<-df[df[,1]=="scaffold22",]
#head(df1)
#write.table(df1,"troc2s_scaffold22.bed", quote=F, col.names=F, row.names=F, sep="\t")
#The data *sample*_scaffold22.bed were processed using BEDtools GeneomeCoverage on Galaxy because my Laptop doesn't have a linux distribution installed...
#(bedtools_genomecoveragebed/2.22.0) Output type hist
#Specify max depth 0
#Genome file 95: Scaffold22100bpintervals
#Treat split/spliced BAM or BED12 entries as distinct BED intervals when computing coverage. False
#Calculate coverage based on both strands combined
#Report the depth at each genome position with 1-based coordinates True
#Report the depth at each genome position with 0-based coordinatess False
#Calculate coverage of 5’ positions False
#Calculate coverage of 3’ positions False
# files were renamed *sample*_covsca22 and stored in the folder 'Buinages_Guillaume/methylmaps/Bedfiles/covsca22 files
#loading 1-base sample maps
#something like fileslist<-list.files("covsca22 files/") could have been smarter...
message("loading 1-base sample maps...")
ovo1<-read.table("covsca22 files/ovo1_covsca22.txt", header=F, sep="\t")
ovo2<-read.table("covsca22 files/ovo2_covsca22.txt", header=F, sep="\t")
ovo3<-read.table("covsca22 files/ovo3_covsca22.txt", header=F, sep="\t")
x281<-read.table("covsca22 files/28cell1_covsca22.txt", header=F, sep="\t")
x282<-read.table("covsca22 files/28cell2_covsca22.txt", header=F, sep="\t")
x283<-read.table("covsca22 files/28cell3_covsca22.txt", header=F, sep="\t")
mor1<-read.table("covsca22 files/morula1_covsca22.txt", header=F, sep="\t")
mor2<-read.table("covsca22 files/morula2_covsca22.txt", header=F, sep="\t")
bla2<-read.table("covsca22 files/blastula2_covsca22.txt", header=F, sep="\t")
bla3<-read.table("covsca22 files/blastula3_covsca22.txt", header=F, sep="\t")
gas1<-read.table("covsca22 files/gastrula1_covsca22.txt", header=F, sep="\t")
gas2<-read.table("covsca22 files/gastrula2_covsca22.txt", header=F, sep="\t")
gas3<-read.table("covsca22 files/gastrula3_covsca22.txt", header=F, sep="\t")
tro1<-read.table("covsca22 files/troc1_covsca22.txt", header=F, sep="\t")
tro2<-read.table("covsca22 files/troc2_covsca22.txt", header=F, sep="\t")
dla1<-read.table("covsca22 files/dlarv1_covsca22.txt", header=F, sep="\t")
dla2<-read.table("covsca22 files/dlarv2_covsca22.txt", header=F, sep="\t")
dla3<-read.table("covsca22 files/dlarv3_covsca22.txt", header=F, sep="\t")
spa1<-read.table("covsca22 files/spat1_covsca22.txt", header=F, sep="\t")
spa2<-read.table("covsca22 files/spat2_covsca22.txt", header=F, sep="\t")
spa3<-read.table("covsca22 files/spat3_covsca22.txt", header=F, sep="\t")
list<-list(ovo1=ovo1,ovo2=ovo2,ovo3=ovo3,x281=x281,x282=x282,x283=x283,mor1=mor1,mor2=mor2,bla2=bla2,bla3=bla3,gas1=gas1,gas2=gas2,gas3=gas3,tro1=tro1,tro2=tro2,dla1=dla1,dla2=dla2,dla3=dla3,spa1=spa1,spa2=spa2,spa3=spa3) # make a list with sample data tables
# loading nb reads per sample for normalization
read.number<-read.table("D:/BOULOT/RECHERCHE/MeDIPseq/analyse_MeDIP/ElizabethCrowell/Bouinages_Guillaume/Guillaume_SourceFiles/readnumber.txt", header=T, sep="\t") # loads the nb_reads file
rownames(read.number)<-c("ovo1","ovo2","ovo3","x281","x282","x283","mor1","mor2","bla2","bla3","gas1","gas2","gas3","tro1","tro2","dla1","dla2","dla3","spa1","spa2","spa3") #gives a rowname corresponding to the sample name in the list
### formatting scaffold 22 physical methylation map per sample
for (i in 1:length(list))
{
colnames(list[[i]])<-c("chrom","base","cov") # gives column names to the objects in the list
list[[i]]<-data.frame(list[[i]]$chrom,list[[i]]$base,((list[[i]]$cov/read.number[i,2])*1000)) # creates a data frame containing the base position and the normalised coverage per million read number
colnames(list[[i]])<-c("chrom","base","rpm") # gives normalized list objects the column names
list[[i]]<-list[[i]][1:(1000*(as.integer(nrow(list[[i]])/1000))),] # removes rows beyond the greatest 1000 multiple
list[[i]]$levels<-dfcut<-cut(list[[i]]$base,as.integer(nrow(list[[i]])/10000)) # adds a 'levels' column and cuts the base vector into intervals of 10kb
list[[i]]<-aggregate(list[[i]]$rpm, by=list(list[[i]]$levels),sum) # sums the read number (eg methylation) per interval
colnames(list[[i]])<-c("levels","rpm") # gives normalized list objects the column names
list[[i]]$plotpos<-c(1:nrow(list[[i]])) # converts each interval into a position on the x axis ('plotpos') for plotting
}
### compute mean per stage Ovo, 28cells, I, spat and create data frame containing all the data for plotting
ovo<-data.frame(list[["ovo1"]]$plotpos, (list[["ovo1"]]$rpm )) #+ list[["ovo2"]]$rpm + list[["ovo3"]]$rpm)/3)
colnames(ovo)<-c("plotpos","rpm.ovo")
x28<-data.frame(list[["x281"]]$plotpos, (list[["x281"]]$rpm + list[["x282"]]$rpm + list[["x283"]]$rpm)/3)
colnames(x28)<-c("plotpos","rpm.x28")
I<-data.frame(list[["mor1"]]$plotpos, ( ((list[["mor1"]]$rpm + list[["mor2"]]$rpm)/2 +
(list[["bla2"]]$rpm + list[["bla3"]]$rpm)/2 +
(list[["gas1"]]$rpm + list[["gas2"]]$rpm + list[["gas3"]]$rpm)/3 +
(list[["tro1"]]$rpm + list[["tro2"]]$rpm) /2 +
(list[["dla1"]]$rpm + list[["dla2"]]$rpm + list[["dla3"]]$rpm)/3)
/5))
colnames(I)<-c("plotpos","rpm.I")
spa<-data.frame(list[["spa1"]]$plotpos, (list[["spa1"]]$rpm + list[["spa2"]]$rpm + list[["spa3"]]$rpm)/3)
colnames(spa)<-c("plotpos","rpm.spa")
stages<-list(ovo,x28,I,spa)
map<-Reduce(function(...) merge(..., all=T), stages) # builds a unique data table with the data
map[,2:5][map[,2:5]<=2]<-0 # Gives a 0 value for a sum of rpm <2 (considered background)
#write.table(map, "map.txt")
### PLOT
par(plot(map$rpm.spa~map$plotpos,
type="s",
xlim=c(0,nrow(map)),
ylim=c(0,max(map$rpm.spa)),
col="blue",
axes=F,
ylab="methylation level",
xlab=""),
new=T)
x <- seq_along(map$rpm.spa)
y2 <- rep(map$rpm.spa, each=2)
y2 <- y2[-length(y2)]
x2 <- rep(x, each=2)[-1]
x3 <- c(min(x2), x2, max(x2))
y3 <- c(0, y2, 0)
par(polygon(x3, y3, border=NA, col=rgb(0,0,1,0.5)), new=T)
par(lines(x2, y2), new=T)
par(plot(map$rpm.I~map$plotpos, type="s",xlim=c(0,nrow(map)),ylim=c(0,max(map$rpm.spa)),col="orange", axes=F, xlab="",ylab=""), new=T)
x <- seq_along(map$rpm.I)
y2 <- rep(map$rpm.I, each=2)
y2 <- y2[-length(y2)]
x2 <- rep(x, each=2)[-1]
x3 <- c(min(x2), x2, max(x2))
y3 <- c(0, y2, 0)
par(polygon(x3, y3, border=NA, col=rgb(1,1,0,0.5)),new=T)
par(lines(x2, y2), new=T)
par(plot(map$rpm.x28~map$plotpos, type="s",xlim=c(0,nrow(map)),ylim=c(0,max(map$rpm.spa)),col="red",axes=F, xlab="",ylab=""), new=T)
x <- seq_along(map$rpm.x28)
y2 <- rep(map$rpm.x28, each=2)
y2 <- y2[-length(y2)]
x2 <- rep(x, each=2)[-1]
x3 <- c(min(x2), x2, max(x2))
y3 <- c(0, y2, 0)
par(polygon(x3, y3, border=NA, col=rgb(1,0,0,0.5)),new=T)
par(lines(x2, y2),new=T)
par(plot(map$rpm.ovo~map$plotpos, type="s",xlim=c(0,nrow(map)),ylim=c(0,max(map$rpm.spa)),col="black",axes=F, xlab="",ylab=""), new=T)
x <- seq_along(map$rpm.ovo)
y2 <- rep(map$rpm.ovo, each=2)
y2 <- y2[-length(y2)]
x2 <- rep(x, each=2)[-1]
x3 <- c(min(x2), x2, max(x2))
y3 <- c(0, y2, 0)
par(polygon(x3, y3, border=NA, col=rgb(0,1,0,0.5)), new=T)
par(lines(x2, y2),new=T)
par(segments(0,0,196.45),new=T)
### Gene map on scaffold 22
##load data
genemap<-read.table("D:/BOULOT/RECHERCHE/MeDIPseq/analyse_MeDIP/ElizabethCrowell/Bouinages_Guillaume/methylmaps/sca22_genemap.txt", header=T, sep="\t")
## create dataframe for plotting
gmap<-as.data.frame(seq(1:1964558)); colnames(gmap)<-c("base")
#builds 'list' containing vectors of the bases in scaffold 22 bases that are within a gene
list<-list() #creates an empty list
length(list)<-nrow(genemap)
for (i in (1:nrow(genemap)))
{
list[[i]]<-c(genemap[i,2]:genemap[i,3]) #
}
ingene<-as.data.frame(unique(unlist(list))) # keeps unique values
gmap$ingene<-numeric(length(gmap$"base")) # creates a 'ingene' column in gmap giving a "0" value to each base
gmap$ingene[which(gmap$"base" %in% ingene[,1])]<-1 # attributes a "1" value for bases within genes
#plot
x<-gmap$base
y<-gmap$ingene
df<-data.frame(x,y)
par(mfrow=c(2,1))
plot(y~x,
xlim=c(0,nrow(df)),
ylim=c(0,max(df$y)),
type="s",
col="black",
axes=F,
xlab="",
ylab="Scaffold 22"
)
segments(0,0,nrow(df),0)
segments(0,max(df$y),nrow(df),max(df$y))
segments(0,0,0,max(df$y))
segments(nrow(df),0,nrow(df),max(df$y))
x <- seq_along(df$y)
y2 <- rep(df$y, each=2)
y2 <- y2[-length(y2)]
x2 <- rep(x, each=2)[-1]
x3 <- c(min(x2), x2, max(x2))
y3 <- c(0, y2, 0)
par(polygon(x3, y3, border=NA, col="black"), new=T)
par(lines(x2, y2),new=T)
############TESTS################
t<-data.frame(c("a","b","c","d"),c(1,2,3,4))
u<-data.frame(c("a","b","c","d"),c(11,22,33,44))
v<-data.frame(c("a","b","c","d"),c(111,222,333,444))
w<-data.frame(c("t","u","v"),c(1,10,100))
list<-list(t=t,u=u,v=v)
for (i in 1:length(list))
{
colnames(list[[i]])<-c("pos","cov")
list[[i]]<-data.frame(list[[i]]$pos,(list[[i]]$cov/w[i,2])*10)
colnames(list[[i]])<-c("pos","rpm")
}
x<-c(1:1042)
y<-rnorm(1042)
df<-data.frame(x,y)
colnames(df)<-c("x","y") #genere des données
df<-df[1:1000*(as.integer(nrow(df)/1000)),] #enlève les dernieres lignes au delà du plus grand multiple de 1000
df$levels<-dfcut<-cut(df$x,as.integer(nrow(df)/100)) #ajoute une colonne 'levels' qui coupe le vecteur x en intervalles de 100
df$sum<-
df$mean<-ave(df$y,df$levels,FUN=mean) #computes mean by interval
df<-df[!duplicated(df$levels),3:4] #keeps only one line per interval and its mean
df$plotpos<-c(1:nrow(df)) # converts each interval into a position on the x axis
#plot
x<-df$plotpos
y<-df$mean
plot(x,y)
polygon(c(x,x[length(x)]), c(y, y[1]), col='red')
ggplot(dt) + stat_density(aes(x=value,color=site),geom="line",position="dodge")+
geom_ribbon(data=subset(gg,site=="site1" & x>q1),
aes(x=x,ymax=y),ymin=0,fill="red", alpha=0.5)+
geom_ribbon(data=subset(gg,site=="site2" & x<q2),
aes(x=x,ymax=y),ymin=0,fill="blue", alpha=0.5)
#gene map
#interval vector building
x<-c(3,7,12)
y<-c(4,10,13)
df<-data.frame(x,y)
z<-as.data.frame(seq(1:2000000))
list<-list()
length(list)<-nrow(df)
for (i in (1:nrow(df)))
{
list[[i]]<-c(df[i,1]:df[i,2])
}
ingene<-unique(unlist(list))
for (i in 1:nrow(z))
{
if (i %in% ingene) z$ingene[i]<-1
else z$ingene[i]<-0
}
# gene map plot
x<-seq(1:10)
y<-c(0,0,1,0,1,1,1,0,1,0)
df<-data.frame(x,y)
plot(y~x,xlim=c(0,nrow(df)),ylim=c(0,max(df$y)), type="s", col="black", axes=F, xlab="", ylab="")
abline(h=c(0,1),v=c(0,nrow(df)))
x <- seq_along(df$y)
y2 <- rep(df$y, each=2)
y2 <- y2[-length(y2)]
x2 <- rep(x, each=2)[-1]
x3 <- c(min(x2), x2, max(x2))
y3 <- c(0, y2, 0)
par(polygon(x3, y3, border=NA, col=rgb(0.25,0.25,0.25,1)), new=T)
par(lines(x2, y2),new=T)
x<-c(3,4,7,10,12,13)
y<-c(1,0,1,0,1,0)
df<-data.frame(x,y)
plot(x,y,xlim=c(0,21),ylim=c(0,1), type="s", col="black", axes=F, xlab="", ylab="")
abline(h=c(0,1),v=c(0,nrow(20)))
x <- seq_along(df$y)
y2 <- rep(df$y, each=2)
y2 <- y2[-length(y2)]
x2 <- rep(x, each=2)[-1]
x3 <- c(min(x2), x2, max(x2))
y3 <- c(0, y2, 0)
par(polygon(x3, y3, border=NA, col=rgb(0.25,0.25,0.25,1)), new=T)
par(lines(x2, y2),new=T)