-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmerge_origins.R
executable file
·245 lines (202 loc) · 9.38 KB
/
merge_origins.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
# A script to merge replication origin data from multiple datasets.
# Set paths
begPath <- "/Users/User/Research/DNArep";
wkDir <- paste(begPath, "/Data", sep="");
# Functions
source(paste(begPath, '/Code/functions_data_processing.R', sep=""));
# READ IN DATA ------------------------------------------------------------
# Read in skeletal replication origin data from OriDB
skeleton <- read.table(paste(wkDir, "/oriDB_origins_S_cer.txt", sep=""), header=TRUE, sep="\t", comment.char="", quote="");
# Read in yabuki_raghu.txt to get time of replication data
Trep <- read.table(paste(wkDir, "/yabuki_raghu.txt", sep=""), header=TRUE, sep="\t", comment.char="", quote="");
# Read in Whitehouse efficiency data
whitehouse <- read.table(paste(wkDir, "/whitehouse/whitehouse.txt", sep=""), header=TRUE, sep="\t", comment.char="", quote="");
# Read in Chen efficiency data
chen <- read.table(paste(wkDir, "/chen.txt", sep=""), header=TRUE, sep="\t", comment.char="", quote="");
# VERSION 1.0 -------------------------------------------------------------
# Aggregate data
# Initialize variables
dirty_data <- c();
# Give unique ID to each origin
ID <- seq(1:nrow(skeleton));
skeleton <- cbind(ID, skeleton);
# Rename columns to clarify datasets that were aggregated
colnames(Trep) <- paste("Trep", colnames(Trep), sep="_");
colnames(whitehouse) <- paste("whitehouse", colnames(whitehouse), sep="_");
colnames(chen) <- paste("chen", colnames(chen), sep="_");
# Add OriCenter to OriDB data
center <- ceiling(((skeleton$end - skeleton$start)/2) + skeleton$start);
skeleton <- cbind(skeleton[1:2], center, skeleton[3:ncol(skeleton)]);
# Format chromosome numbers in Chen data
chen_chr <- c();
for(i in 1:nrow(chen)){
if(chen$chen_ChrName[i] == "chrI") {
chen_chr[i] <- 1;
} else if (chen$chen_ChrName[i] == "chrII") {
chen_chr[i] <- 2;
} else if (chen$chen_ChrName[i] == "chrIII") {
chen_chr[i] <- 3;
} else if (chen$chen_ChrName[i] == "chrIV") {
chen_chr[i] <- 4;
} else if (chen$chen_ChrName[i] == "chrV") {
chen_chr[i] <- 5;
} else if (chen$chen_ChrName[i] == "chrVI") {
chen_chr[i] <- 6;
} else if (chen$chen_ChrName[i] == "chrVII") {
chen_chr[i] <- 7;
} else if (chen$chen_ChrName[i] == "chrVIII") {
chen_chr[i] <- 8;
} else if (chen$chen_ChrName[i] == "chrIX") {
chen_chr[i] <- 9;
} else if (chen$chen_ChrName[i] == "chrX") {
chen_chr[i] <- 10;
} else if (chen$chen_ChrName[i] == "chrXI") {
chen_chr[i] <- 11;
} else if (chen$chen_ChrName[i] == "chrXII") {
chen_chr[i] <- 12;
} else if (chen$chen_ChrName[i] == "chrXIII") {
chen_chr[i] <- 13;
} else if (chen$chen_ChrName[i] == "chrXIV") {
chen_chr[i] <- 14;
} else if (chen$chen_ChrName[i] == "chrXV") {
chen_chr[i] <- 15;
} else if (chen$chen_ChrName[i] == "chrXVI") {
chen_chr[i] <- 16;
}
}
chen <- cbind(chen_chr, chen);
# Since not all origins have names, mapping by bp position requires breaking up data by chromosomes.
totalChrom <- 16;
for (chrom in c(1:totalChrom)) {
skeleton_block <- skeleton[which(skeleton$chr == chrom),];
# STEP 1 ------------------------------------------------------------------
# Merge yabuki raghu trep data by starting position
Trep_block <- Trep[which(Trep$Trep_chr == chrom),];
# Need to match to closest bp from OriDB
nearest_bps <- sapply(Trep_block$Trep_start, function(x) which.min(abs(x-skeleton_block$start)));
# Add matching bp column for merging by bp
start <- skeleton_block$start[nearest_bps];
Trep_match <- cbind(start, Trep_block);
# Check matching start locations for duplicates and remove duplicates with greatest origin difference
duplicates <- which(duplicated(Trep_match$start));
if(length(duplicates) != 0) {
duplicates <- c((duplicates[1]-1), duplicates);
keep <- which.min(abs(Trep_match[duplicates, "start"] - Trep_match[duplicates, "Trep_start"]));
remove <- duplicates[-keep];
Trep_noDups <- Trep_match[-remove, ];
} else {
Trep_noDups <- Trep_match;
}
# Remove any data with bp difference greater than 6000
mismatches <- which(abs(Trep_noDups$start - Trep_noDups$Trep_start) > 6000);
if (length(mismatches) != 0) {
Trep_clean <- Trep_noDups[-mismatches,];
} else {
Trep_clean <- Trep_noDups;
}
data_1_block <- merge.with.order(skeleton_block, Trep_clean, by="start", all.x = TRUE, sort=F ,keep_order = 1);
# Merge whitehouse data by starting position
whitehouse_block <- whitehouse[which(whitehouse$whitehouse_chr == chrom),];
# Need to match to closest bp from OriDB
nearest_bps <- sapply(whitehouse_block$whitehouse_WT_A_bp, function(x) which.min(abs(x-skeleton_block$center)));
# Add matching bp column for merging by bp
center <- skeleton_block$center[nearest_bps];
whitehouse_match <- cbind(center, whitehouse_block);
# Check matching start locations for duplicates and remove duplicates with greatest origin difference
duplicates <- which(duplicated(whitehouse_match$center));
if(length(duplicates) != 0) {
duplicates <- c((duplicates[1]-1), duplicates);
keep <- which.min(abs(whitehouse_match[duplicates, "center"] - whitehouse_match[duplicates, "whitehouse_WT_A_bp"]));
remove <- duplicates[-keep];
whitehouse_noDups <- whitehouse_match[-remove, ];
} else {
whitehouse_noDups <- whitehouse_match;
}
# Remove any data with bp difference greater than 6000
mismatches <- which(abs(whitehouse_noDups$center - whitehouse_noDups$whitehouse_WT_A_bp) > 6000);
if (length(mismatches) != 0) {
whitehouse_clean <- whitehouse_noDups[-mismatches,];
} else {
whitehouse_clean <- whitehouse_noDups;
}
# Merge data
data_2_block <- merge.with.order(data_1_block, whitehouse_clean, by = "center", all.x=TRUE, sort=F, keep_order = 1);
# Merge chen data by center
chen_block <- chen[which(chen$chen_chr == chrom),];
# Need to match to closest bp from OriDB
nearest_bps <- sapply(chen_block$chen_OriCent, function(x) which.min(abs(x-skeleton_block$center)));
# Add matching bp column for merging by bp
center <- skeleton_block$center[nearest_bps];
chen_match <- cbind(center, chen_block);
# Check matching start locations for duplicates and remove duplicates with greatest origin difference
duplicates <- which(duplicated(chen_match$center));
if(length(duplicates) != 0) {
duplicates <- c((duplicates[1]-1), duplicates);
keep <- which.min(abs(chen_match[duplicates, "center"] - chen_match[duplicates, "chen_OriCent"]));
remove <- duplicates[-keep];
chen_noDups <- chen_match[-remove, ];
} else {
chen_noDups <- chen_match;
}
# Remove any data with bp difference greater than 6000
mismatches <- which(abs(chen_noDups$center - chen_noDups$chen_OriCent) > 6000);
if (length(mismatches) != 0) {
chen_clean <- chen_noDups[-mismatches,];
} else {
chen_clean <- chen_noDups;
}
# Merge data
data_3_block <- merge.with.order(data_2_block, chen_clean, by = "center", all.x=TRUE, sort=F, keep_order = 1);
# Bind the block data for each chromosome
dirty_data <- rbind(dirty_data, data_3_block);
}
# STEP 2 ------------------------------------------------------------------
# Clean the data
# Rearrange order of for ID, chr, start, center, end columns
clean_data <- dirty_data
colnames(clean_data) [1] <- "ID";
clean_data[1] <- dirty_data$ID;
colnames(clean_data)[2] <- "chr";
clean_data[2] <- dirty_data$chr;
colnames(clean_data)[3] <- "start";
clean_data[3] <- dirty_data$start;
colnames(clean_data)[4] <- "center";
clean_data[4] <- dirty_data$center;
colnames(clean_data)[5] <- "end";
clean_data[5] <- dirty_data$end
# Check matching names (a similar check was performed for chen and Trep resulting in zero name discrepancies)
# We show there exists name discrepancies in whitehouse data which will be manually adjusted for
for (i in 1:nrow(clean_data)) {
if(is.na(clean_data$whitehouse_Name[i])) {
#print(clean_data$whitehouse_Name[i]);
}
else if (as.character(clean_data$whitehouse_Name[i]) == as.character(clean_data$name[i])) {
#print(as.character(clean_data$whitehouse_Name[i]));
}
else {
#print(as.character(clean_data$whitehouse_Name[i]));
}
}
# Adjust whitehouse data for names: ARS434, ARS1022, ARS1218, ARS1628
clean_data[189, 20:28] <- clean_data[188, 20:28];
clean_data[188, 20:28] <- NA;
clean_data[454, 20:28] <- clean_data[453, 20:28];
clean_data[453, 20:28] <- NA;
clean_data[538, 20:28] <- clean_data[537, 20:28];
clean_data[537, 20:28] <- NA;
clean_data[820, 20:28] <- clean_data[821, 20:28];
clean_data[821, 20:28] <- NA;
# Output the aggregate table
write.table(clean_data, paste(wkDir, "/ori_data_1.0.txt", sep=""), sep='\t', row.names=F);
# Run bp check to see if centers of origins greatly differ between datasets.
# Check bp difference between Ori and whitehouse_A experiment
bp_diff <- abs(clean_data$center - clean_data$whitehouse_WT_A_bp);
mean_whitehouseA_diff <- mean(bp_diff, na.rm=TRUE);
median_whitehouseA_diff <- median(bp_diff, na.rm=TRUE);
std_whitehouseA_diff <- sd(bp_diff, na.rm=TRUE);
max_whitehouseA_diff <- max(bp_diff, na.rm=TRUE);
bp_difference <- na.omit(bp_diff);
hist(bp_difference, breaks = 80);
bp_distance <- abs(clean_data$whitehouse_WT_A_bp - clean_data$center);
OEM <- clean_data$whitehouse_WT_A_efficiency;
plot(bp_distance, OEM);