-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathaddGeneListAnnotationToVcf.R
75 lines (64 loc) · 2.41 KB
/
addGeneListAnnotationToVcf.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#!/usr/bin/env Rscript
# parse gene list
# usage: ./addGeneListAnnotationToVcf.pl
suppressPackageStartupMessages(library("optparse"));
suppressPackageStartupMessages(library("GenomicRanges"));
suppressPackageStartupMessages(library("rtracklayer"));
suppressPackageStartupMessages(library("VariantAnnotation"));
options(warn = -1, error = quote({ traceback(2); q('no', status = 1) }))
optList <- list(
make_option("--genome", default = 'hg19', help = "genome build [default %default]"),
make_option("--geneBed", default = NULL, type = "character", action = "store", help ="input gene bed (required)"),
make_option("--name", default = 'genelist', type = "character", action = "store", help ="annotation name (default %default)"),
make_option("--outFile", default = NULL, type = "character", action = "store", help ="output file (required)"))
parser <- OptionParser(usage = "%prog [options] [vcf file]", option_list = optList);
arguments <- parse_args(parser, positional_arguments = T);
opt <- arguments$options;
if (length(arguments$args) < 1) {
cat("Need input vcf file\n\n")
print_help(parser);
stop();
} else if (is.null(opt$geneBed)) {
cat("Need input gene bed\n\n")
print_help(parser);
stop();
} else if (is.null(opt$outFile)) {
cat("Need output file name\n\n")
print_help(parser);
stop();
}
fn <- arguments$args[1];
outfn <- opt$outFile
null <- suppressWarnings(file.remove(outfn))
out <- file(outfn, open = 'a')
cat('Reading target bed ... ')
genes <- import(opt$geneBed)
cat('done\n')
cat('Indexing vcf ... ')
temp <- tempfile()
zipped <- bgzip(fn, temp)
idx <- indexTabix(temp, "vcf")
cat('done\n')
tab <- TabixFile(zipped, idx, yieldSize = 2000)
open(tab)
cat('Processing vcf by chunk\n')
i <- 1
while(nrow(vcf <- readVcf(tab, genome = opt$genome))) {
# replace header
newInfo <- DataFrame(Number = 0, Type = "Flag", Description = paste(opt$name, ": variant is in gene list", sep = ''), row.names = opt$name)
info(header(vcf)) <- rbind(info(header(vcf)), newInfo)
ol <- findOverlaps(rowData(vcf), genes, select = 'first')
info(vcf)[,opt$name] <- !is.na(ol)
cat(paste('Chunk', i, "\n"))
cat("Appending vcf chunk to", opt$outFile, "... ")
writeVcf(vcf, out)
cat("done\n")
i <- i + 1
}
close(tab)
if (i == 1) {
cat("No entries, creating empty vcf file\n")
vcf <- readVcf(fn, genome = opt$genome)
writeVcf(vcf, out)
}
close(out)