# intSiteLogic.R
getTrimmedSeqs <- function(qualityThreshold, badQuality, qualityWindow, primer,
ltrbit, largeLTRFrag, linker, linker_common, mingDNA,
read1, read2, alias, vectorSeq){
##### Load libraries #####
library("hiReadsProcessor")
library("ShortRead")
stats <- data.frame()
message(alias)
workingDir <- alias
suppressWarnings(dir.create(workingDir, recursive=TRUE))
setwd(workingDir)
stats.bore <- data.frame(sample=alias)
message("\t read data and make quality heatmaps")
filenames <- list(read1, read2)
reads <- lapply(filenames, function(x) {
sapply(x, readFastq)
})
stats.bore$Reads.l.beforeTrim <- sum(sapply(reads[[1]], length))
stats.bore$Reads.p.beforeTrim <- sum(sapply(reads[[2]], length))
r <- lapply(reads, function(x){
  seqs <- x[[1]]
  #initialize empty results so the list() below is defined even when no
  #reads survive quality trimming
  trimmedSeqs <- DNAStringSet()
  trimmedqSeqs <- BStringSet()
  if(length(seqs) > 0){
#remove anything after 5 bases under Q30 in 10bp window
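    #(ShortRead::trimTailw positional arguments: badQuality maps to k, the
    # number of failing bases that triggers trimming; qualityThreshold maps to
    # a, the quality-character cutoff; round(qualityWindow/2) maps to
    # halfwidth, giving a sliding window of 2*halfwidth+1 bases)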
trimmed <- trimTailw(seqs, badQuality, qualityThreshold,
round(qualityWindow/2))
    #get rid of anything that lost more than half its bases (keep >65nt)
    trimmed <- trimmed[width(trimmed) > 65]
if(length(trimmed) > 0){
trimmedSeqs <- sread(trimmed)
trimmedqSeqs <- quality(quality(trimmed))
names(trimmedSeqs) <- names(trimmedqSeqs) <-
sapply(sub("(.+) .+","\\1",ShortRead::id(trimmed)),
function(z){paste0(alias, "%", strsplit(z, "-")[[1]][2])})
}
}
list(trimmedSeqs, trimmedqSeqs)
})
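#Worked example of the renaming above (the read id is hypothetical): for a
#fastq id "xxxx-13556 1:N:0", everything after the space is dropped, the token
#after the '-' is kept, and the read is renamed "<alias>%13556"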
reads <- sapply(r, "[[", 1)
qualities <- sapply(r, "[[", 2)
#this is needed for primerID quality scores later on
R1Quality <- qualities[[1]]
stats.bore$Reads.p.afterTrim <- length(reads[[2]])
stats.bore$Reads.l.afterTrim <- length(reads[[1]])
print(stats.bore)
message("\t trim adaptors")
#'.p' suffix signifies the 'primer' side of the amplicon (i.e. read2)
#'.l' suffix signifies the 'linker' side of the amplicon (i.e. read1)
res.p <- pairwiseAlignSeqs(reads[[2]], patternSeq=primer,
qualityThreshold=1, doRC=F)
reads.p <- reads[[2]]
if(length(res.p) > 0){
reads.p <- trimSeqs(reads[[2]], res.p, side='left', offBy=1)
}
stats.bore$primed <- length(reads.p)
res.ltr <- pairwiseAlignSeqs(reads.p, patternSeq=ltrbit,
qualityThreshold=1, doRC=F)
if(length(res.ltr) > 0 ){
reads.p <- trimSeqs(reads.p, res.ltr, side='left', offBy=1)
}
stats.bore$LTRed <- length(reads.p)
if(grepl("N", linker)){
res <- primerIDAlignSeqs(subjectSeqs=reads[[1]], patternSeq=linker,
doAnchored=T, qualityThreshold1=1,
qualityThreshold2=1, doRC=F)
res.l <- res[["hits"]]
res.pID <- res[["primerIDs"]]
}else{
res.l <- pairwiseAlignSeqs(subjectSeqs=reads[[1]], patternSeq=linker,
qualityThreshold=.95, doRC=F, side="middle")
start(res.l) <- 1
}
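#A linker containing Ns carries a random primerID (e.g. a hypothetical
#"GACnnnnnnnnnnnnTCC"), so primerIDAlignSeqs() recovers both the linker hits
#and the primerID positions; a fixed linker is found with pairwiseAlignSeqs()
#and its start is reset to 1 so the entire 5' end through the linker is trimmed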
reads.l <- reads[[1]]
if(length(res.l) > 0 ){
reads.l <- trimSeqs(reads[[1]], res.l, side='left', offBy=1)
if(grepl("N", linker)){ #i.e. contains a primerID
R1Quality <- R1Quality[match(names(res.pID), names(R1Quality))]
primerIDs <- trimSeqs(reads[[1]], res.pID, side="middle")
primerIDQuality <- subseq(R1Quality, start=start(res.pID),
end=end(res.pID))
primerIDData <- list(primerIDs, primerIDQuality)
save(primerIDData, file="primerIDData.RData")
}
}
stats.bore$linkered <- length(reads.l)
print(stats.bore)
## check whether reads ran through the entire amplicon by looking for the
## opposite side's adaptor
message("\t trim opposite side adaptors")
res.p <- NULL
tryCatch(res.p <- pairwiseAlignSeqs(reads.p, linker_common,
qualityThreshold=.55, side='middle',
doRC=F),
error=function(e){print(paste0("Caught ERROR in intSiteLogic: ",
e$message))})
if(!is.null(res.p)){
  #if we see the common linker sequence, discard it and everything after it
end(res.p) <- width(reads.p[names(res.p)]) + 1
if(length(res.p) > 0 ){
reads.p <- c(reads.p[!names(reads.p) %in% names(res.p)],
trimSeqs(reads.p, res.p, side='right', offBy=1))
}
}
res.l <- NULL
tryCatch(res.l <- pairwiseAlignSeqs(reads.l, largeLTRFrag,
qualityThreshold=.55, side='middle', doRC=F),
error=function(e){print(paste0("Caught ERROR in intSiteLogic: ",
e$message))})
if(!is.null(res.l)){
end(res.l) <- width(reads.l[names(res.l)]) + 1
if(length(res.l) > 0 ){
reads.l <- c(reads.l[!names(reads.l) %in% names(res.l)],
trimSeqs(reads.l, res.l, side='right', offBy=1))
}
}
reads.p <- subset(reads.p, width(reads.p) > mingDNA)
reads.l <- subset(reads.l, width(reads.l) > mingDNA)
stats.bore$reads.p_afterTrim <- length(reads.p)
stats.bore$reads.l_afterTrim <- length(reads.l)
message("\t trim vector")
#we do this last so that we don't have to worry about partial vector
#alignments caused by incomplete trimming of long reads
#the working directory is the individual sample dir, but vectorSeq is given
#relative to the run directory
oldWD <- getwd()
setwd("..")
Vector <- readDNAStringSet(vectorSeq)
setwd(oldWD)
blatParameters <- c(minIdentity=70, minScore=5, stepSize=3,
tileSize=8, repMatch=112312, dots=1000,
q="dna", t="dna", out="psl")
findAndRemoveVector <- function(reads, Vector, blatParameters, minLength=10){
hits.v <- read.psl(blatSeqs(query=reads, subject=Vector,
blatParameters=blatParameters, parallel=F),
bestScoring=F)
#collapse instances where a single read has multiple vector alignments
hits.v <- reduce(GRanges(seqnames=hits.v$qName, IRanges(hits.v$qStart,
hits.v$qEnd)),
min.gapwidth=1200)
names(hits.v) <- as.character(seqnames(hits.v))
hits.v <- hits.v[start(hits.v)<=5 & width(hits.v)>minLength]
reads[!names(reads) %in% names(hits.v)]
}
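#findAndRemoveVector drops a read whenever it has a (merged) vector alignment
#that begins within the first 5nt and spans more than minLength bases; a
#minimal usage sketch (arguments as defined above):
# reads.clean <- findAndRemoveVector(reads.p, Vector, blatParameters)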
tryCatch(reads.p <- findAndRemoveVector(reads.p, Vector,
blatParameters=blatParameters),
error=function(e){print(paste0("Caught ERROR in intSiteLogic: ",
e$message))})
tryCatch(reads.l <- findAndRemoveVector(reads.l, Vector,
blatParameters=blatParameters),
error=function(e){print(paste0("Caught ERROR in intSiteLogic: ",
e$message))})
stats.bore$reads.p_afterVTrim <- length(reads.p)
stats.bore$reads.l_afterVTrim <- length(reads.l)
toload <- intersect(names(reads.p), names(reads.l))
stats.bore$reads.lLength <- mean(width(reads.l))
stats.bore$reads.pLength <- mean(width(reads.p))
stats.bore$curated <- length(toload)
print(stats.bore)
stats <- rbind(stats, stats.bore)
#this could probably be cleaner with sapplys
reads.p <- reads.p[toload]
reads.l <- reads.l[toload]
#dereplicate seqs for faster alignments
#this is re-expanded at the beginning of callSeqs
reads.p.u <- unique(reads.p)
reads.l.u <- unique(reads.l)
names(reads.p.u) <- seq_along(reads.p.u)
names(reads.l.u) <- seq_along(reads.l.u)
keys <- data.frame("R2"=match(reads.p, reads.p.u),
"R1"=match(reads.l, reads.l.u), "names"=toload)
save(keys, file="keys.RData")
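#Worked example (hypothetical values): if read "sample1%13556" dereplicates
#to unique sequence 7 on the R2 side and 3 on the R1 side, its keys row is
#R2=7, R1=3, names="sample1%13556"; processAlignments later joins BLAT hits
#on the numbered unique sequences back to read names through this table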
if(length(toload) > 0){
  #cap the number of reads per thread - we care about speed rather than the
  #number of processes
chunks.p <- split(seq_along(reads.p.u), ceiling(seq_along(reads.p.u)/30000))
  for(i in seq_along(chunks.p)){
writeXStringSet(reads.p.u[chunks.p[[i]]], file=paste0("R2-", i, ".fa"),
append=TRUE)
}
chunks.l <- split(seq_along(reads.l.u), ceiling(seq_along(reads.l.u)/30000))
  for(i in seq_along(chunks.l)){
writeXStringSet(reads.l.u[chunks.l[[i]]], file=paste0("R1-", i, ".fa"),
append=TRUE)
}
save(stats, file="stats.RData")
alias #return 'value' which ultimately gets saved as trimStatus.RData
}else{
stop("error - no curated reads")
}
}
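#Illustrative (not executed) call; all sequences and file names below are
#hypothetical placeholders, with real values coming from the run's processing
#parameters and sample information files:
# status <- getTrimmedSeqs(qualityThreshold='?', badQuality=5, qualityWindow=10,
#                          primer="<primer seq>", ltrbit="<LTR bit seq>",
#                          largeLTRFrag="<long LTR seq>", linker="<linker seq>",
#                          linker_common="<common linker seq>", mingDNA=30,
#                          read1="sample1_R1.fastq.gz", read2="sample1_R2.fastq.gz",
#                          alias="sample1", vectorSeq="vector.fa")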
processAlignments <- function(workingDir, minPercentIdentity, maxAlignStart, maxLength, refGenome){
##### Load libraries #####
library("hiReadsProcessor")
library("GenomicRanges")
codeDir <- get(load("codeDir.RData"))
source(paste0(codeDir, "/programFlow.R"))#for get_reference_genome function
setwd(workingDir)
dereplicateSites <- function(uniqueReads){
  #do the dereplication, but lose the original coordinates
sites.reduced <- reduce(flank(uniqueReads, -5, both=TRUE), with.revmap=T)
sites.reduced$counts <- sapply(sites.reduced$revmap, length)
#order the unique sites as described by revmap
dereplicatedSites <- uniqueReads[unlist(sites.reduced$revmap)]
#if no sites are present, skip this step - keep doing the rest to provide a
#similar output to a successful dereplication
if(length(uniqueReads)>0){
    #split the unique sites as described by revmap
    #(sites.reduced$counts was derived from revmap above)
    dereplicatedSites <- split(dereplicatedSites,
                               Rle(values=seq_along(sites.reduced),
                                   lengths=sites.reduced$counts))
}
#do the standardization - this will pick a single starting position and
#choose the longest fragment as ending position
dereplicatedSites <- unlist(reduce(dereplicatedSites, min.gapwidth=5))
mcols(dereplicatedSites) <- mcols(sites.reduced)
dereplicatedSites
}
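#Worked example (hypothetical coordinates): reads starting at chr1:+:100,
#chr1:+:102 and chr1:+:500 form two clusters (the 10nt windows around the
#first two starts overlap); the first cluster comes back as a single range
#with counts==2 extending to the longer of its two fragment ends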
standardizeSites <- function(unstandardizedSites){
if(length(unstandardizedSites)>0){
dereplicated <- dereplicateSites(unstandardizedSites)
dereplicated$dereplicatedSiteID <- seq(length(dereplicated))
#expand, keeping the newly standardized starts
standardized <- unname(dereplicated[rep(dereplicated$dereplicatedSiteID, dereplicated$counts)])
#order the original object to match
unstandardizedSites <- unstandardizedSites[unlist(dereplicated$revmap)]
#graft over the widths and metadata
trueBreakpoints <- start(flank(unstandardizedSites, -1, start=F))
standardizedStarts <- start(flank(standardized, -1, start=T))
standardized <- GRanges(seqnames=seqnames(standardized),
ranges=IRanges(start=pmin(standardizedStarts, trueBreakpoints),
end=pmax(standardizedStarts, trueBreakpoints)),
strand=strand(standardized),
seqinfo=seqinfo(unstandardizedSites))
mcols(standardized) <- mcols(unstandardizedSites)
standardized
  } else {
unstandardizedSites
}
}
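#Worked example (hypothetical coordinates): three '+' strand reads starting
#at 100, 101 and 103 with breakpoints 180, 200 and 190 all receive the
#cluster's standardized start while keeping their own breakpoints, so three
#ranges with identical starts and ends 180/200/190 are returned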
#' clean up alignments and prepare for int site calling
#'
#' @param algns df with: 21 columns from BLAT (psl)
#' @param from is "R1" or "R2"
#' @return Granges object
processBLATData <- function(algns, from){
stopifnot(from == "R1" | from == "R2")
algns$from <- from
algns <- merge(algns, keys[c(from, "names")], by.x="qName", by.y=from)
algns.gr <- GRanges(seqnames=Rle(algns$tName),
ranges=IRanges(start=algns$tStart, end=algns$tEnd),
strand=Rle(algns$strand),
seqinfo=seqinfo(get_reference_genome(refGenome)))
names(algns.gr) <- algns[,"names"]
mcols(algns.gr) <- algns[,c("matches", "qStart", "qSize", "tBaseInsert", "from")]
rm(algns)
algns.gr
}
load("keys.RData")
hits.R2 <- processBLATData(read.psl(system("ls R2*.fa.psl.gz", intern=T), bestScoring=F, removeFile=F), "R2")
save(hits.R2, file="hits.R2.RData")
hits.R1 <- processBLATData(read.psl(system("ls R1*.fa.psl.gz", intern=T), bestScoring=F, removeFile=F), "R1")
save(hits.R1, file="hits.R1.RData")
load("stats.RData")
#no more '.p' or '.l' nomenclature here
#we're combining alignments from both sides of the read
allAlignments <- append(hits.R1, hits.R2)
stopifnot(!any(strand(allAlignments)=="*"))
#'*' strands should be impossible at this point
readsAligning <- length(unique(names(allAlignments)))
stats <- cbind(stats, readsAligning)
allAlignments$percIdent <- 100 * allAlignments$matches/allAlignments$qSize
#doing this first subset speeds up the next steps
allAlignments <- subset(allAlignments,
allAlignments$percIdent >= minPercentIdentity
& allAlignments$qStart <= maxAlignStart
& allAlignments$tBaseInsert <= 5)
#even if a single block spans the vast majority of the qSize, it's NOT ok to
#accept the alignment: it would give a spurious integration site/breakpoint
#a few dozen nt away from the real answer, which won't be caught by our
#collapsing algorithms
readsWithGoodAlgnmts <- length(unique(names(allAlignments)))
stats <- cbind(stats, readsWithGoodAlgnmts)
#allAlignments becomes a GRangesList keyed by read name (split below);
#later steps keep only concordant pairs
allAlignments <- split(allAlignments, names(allAlignments))
#need alignments where R1 is on one strand and R2 is on the opposite
numStrands <- unname(table(strand(allAlignments)))
#just a quick pre-filter to reduce amount of work in future steps
allAlignments <- subset(allAlignments, numStrands[,1]>=1 & numStrands[,2]>=1)
######## REDUCE ALIGNMENTS INTO POTENTIAL SITES AT THE READ-LEVEL ###########
#the private GenomicRanges methods are used so that we can maintain the revmap
#doing sapply(split(allAlignments, names(allAlignments)), reduce, min.gapwidth=maxLength) is incredibly slow
#might be faster with dplyr or data.table?
pairedAlignments <- GenomicRanges:::deconstructGRLintoGR(flank(allAlignments, -1, start=T))
pairedAlignments <- reduce(pairedAlignments, min.gapwidth=maxLength, with.revmap=TRUE, ignore.strand=TRUE)
pairedAlignments <- GenomicRanges:::reconstructGRLfromGR(pairedAlignments, flank(allAlignments, -1, start=T))
pairedAlignments <- unlist(pairedAlignments)
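#Worked example (hypothetical coordinates): a read whose R2 alignment starts
#at chr1:5000(+) and whose R1 alignment starts at chr1:5450(-) has its two
#single-base 'start' ranges within maxLength of each other, so they reduce
#into one candidate site whose revmap records the supporting alignments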
#strand can be '*' here because a proper pair has one '+' and one '-' alignment
#names are no longer a unique identifier
#candidate sites are either a unique site, a member of a multihit cluster,
#or a member of a chimera
pairedAlignments$pairingID <- seq_along(pairedAlignments)
########## IDENTIFY PROPERLY-PAIRED READS ##########
#properly-paired reads will have one representative each from R1 and R2
#there can be multiple candidate sites per read
alignmentsPerPairing <- sapply(pairedAlignments$revmap, length)
allAlignments <- unlist(allAlignments, use.names=FALSE)
alignmentSources <- split(allAlignments$from[unlist(pairedAlignments$revmap)],
as.vector(Rle(pairedAlignments$pairingID, alignmentsPerPairing)))
# alignmentSources have values of only "R1" and "R2"
R1Counts <- sapply(alignmentSources, function(x){sum(x=="R1")})
R2Counts <- sapply(alignmentSources, function(x){sum(x=="R2")})
oneEach <- R1Counts==1 & R2Counts==1
sitesFrom2Alignments <- pairedAlignments[alignmentsPerPairing==2]
properlyPairedAlignments <- sitesFrom2Alignments[oneEach[sitesFrom2Alignments$pairingID]]
#these may still include multihits
#assign strand to be whatever was seen on the LTR read (i.e. R2) in allAlignments
allPairedSingleAlignments <- allAlignments[unlist(properlyPairedAlignments$revmap)]
#guaranteed to have R1 and R2 at this point
R1s <- allPairedSingleAlignments[allPairedSingleAlignments$from=="R1"]
R2s <- allPairedSingleAlignments[allPairedSingleAlignments$from=="R2"]
strand(properlyPairedAlignments) <- strand(R2s)
#need to kick out properlyPairedAlignments that are 'outward facing';
#these have union widths far greater than their reduction widths
unionWidths <- width(punion(R1s, R2s, ignore.strand=T, fill.gap=T))
properlyPairedAlignments <- properlyPairedAlignments[abs(unionWidths-width(properlyPairedAlignments)) < 5]
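#Worked example (hypothetical coordinates): for a concordant pair with R2 at
#chr1:1000-1100(+) and R1 at chr1:1400-1500(-), the start-to-start reduction
#(1000-1500) and the filled union of the full alignments (1000-1500) agree;
#flipping both strands moves the starts to 1100 and 1400, so the reduction
#(width 301) differs from the union (width 501) by >=5 and the pair is dropped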
numProperlyPairedAlignments <- length(unique(names(properlyPairedAlignments)))
stats <- cbind(stats, numProperlyPairedAlignments)
########## IDENTIFY MULTIPLY-PAIRED READS (multihits) ##########
properlyPairedAlignments$sampleName <- sapply(strsplit(names(properlyPairedAlignments), "%"), "[[", 1)
properlyPairedAlignments$ID <- sapply(strsplit(names(properlyPairedAlignments), "%"), "[[", 2)
multihitNames <- unique(names(properlyPairedAlignments[duplicated(properlyPairedAlignments$ID)]))
unclusteredMultihits <- subset(properlyPairedAlignments, names(properlyPairedAlignments) %in% multihitNames)
unclusteredMultihits <- standardizeSites(unclusteredMultihits) #not sure if this is required anymore
clusteredMultihitPositions <- GRangesList()
clusteredMultihitLengths <- list()
if(length(unclusteredMultihits) > 0){
library("igraph")
multihits.split <- split(unclusteredMultihits, unclusteredMultihits$ID)
  #the median of an even-length set can be a half-integer, hence round()
  multihits.medians <- round(median(width(multihits.split)))
  multihits.split <- flank(multihits.split, -1, start=T) #from here on only the start positions matter
overlaps <- findOverlaps(multihits.split, multihits.split, maxgap=5)
edgelist <- matrix(c(queryHits(overlaps), subjectHits(overlaps)), ncol=2)
clusteredMultihitData <- clusters(graph.edgelist(edgelist, directed=F))
clusteredMultihitNames <- split(names(multihits.split), clusteredMultihitData$membership)
clusteredMultihitPositions <- GRangesList(lapply(clusteredMultihitNames, function(x){
unname(granges(unique(unlist(multihits.split[x]))))
}))
clusteredMultihitLengths <- lapply(clusteredMultihitNames, function(x){
data.frame(table(multihits.medians[x]))
})
}
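#The graph above links two multihit reads whenever any of their candidate
#start positions fall within 5nt of each other; the connected components
#returned by igraph::clusters then define the multihit clusters, pulling
#reads that share any position (and all their other positions) together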
stopifnot(length(clusteredMultihitPositions)==length(clusteredMultihitLengths))
multihitData <- list(unclusteredMultihits, clusteredMultihitPositions, clusteredMultihitLengths)
names(multihitData) <- c("unclusteredMultihits", "clusteredMultihitPositions", "clusteredMultihitLengths")
save(multihitData, file="multihitData.RData")
#making new variable multihitReads so that the naming in stats is nice
multihitReads <- length(multihitNames) #multihitNames is already unique
stats <- cbind(stats, multihitReads)
########## IDENTIFY UNIQUELY-PAIRED READS (real sites) ##########
allSites <- properlyPairedAlignments[!properlyPairedAlignments$ID %in% unclusteredMultihits$ID]
allSites <- standardizeSites(allSites)
sites.final <- dereplicateSites(allSites)
if(length(sites.final)>0){
sites.final$sampleName <- allSites[1]$sampleName
sites.final$posid <- paste0(as.character(seqnames(sites.final)),
as.character(strand(sites.final)),
start(flank(sites.final, width=-1, start=TRUE)))
}
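#posid encodes the standardized site as seqname, strand, position, e.g. a
#(hypothetical) '+' strand site at position 51234567 on chr7 becomes
#"chr7+51234567"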
save(sites.final, file="sites.final.RData")
save(allSites, file="allSites.RData")
########## IDENTIFY IMPROPERLY-PAIRED READS (chimeras) ##########
singletonAlignments <- pairedAlignments[alignmentsPerPairing==1]
strand(singletonAlignments) <- strand(allAlignments[unlist(singletonAlignments$revmap)])
readCounts <- table(names(singletonAlignments))
chimeras <- subset(singletonAlignments, names(singletonAlignments) %in%
                   names(subset(readCounts, readCounts==2))) #should be >=?
#exclude reads that were already assigned as properly-paired
chimeras <- chimeras[!names(chimeras) %in% names(properlyPairedAlignments)]
chimeras <- split(chimeras, names(chimeras))
chimeras <- subset(chimeras,
sapply(chimeras, function(x){sum(R1Counts[x$pairingID]) ==
sum(R2Counts[x$pairingID])}))
dereplicatedChimeras <- dereplicateSites(unlist(chimeras, use.names=FALSE))
chimera <- length(dereplicatedChimeras)
stats <- cbind(stats, chimera)
chimeraData <- list("chimeras"=chimeras, "dereplicatedChimeras"=dereplicatedChimeras)
save(chimeraData, file="chimeraData.RData")
save(stats, file="stats.RData")
}