From f7ef8b914be2f19f67fa387b022713bbc563bcdb Mon Sep 17 00:00:00 2001 From: "naumenko.sa" Date: Thu, 17 Jan 2019 15:45:50 -0500 Subject: [PATCH] modified: README.md modified: cheo.R modified: cre.gnomad_scores.R modified: cre.panel_filter.R --- README.md | 4 +- cheo.R | 363 ++++++++++++++++++++------------------------ cre.gnomad_scores.R | 37 ++--- cre.panel_filter.R | 8 +- 4 files changed, 191 insertions(+), 221 deletions(-) diff --git a/README.md b/README.md index 32fc68f..0ea6142 100644 --- a/README.md +++ b/README.md @@ -149,14 +149,16 @@ vcf.platypus.getNV.sh ${family}-platypus-annotated-decomposed.vcf.gz * bcbio.array.pbs * bcbio.pbs * bcbio.prepare_families.sh -* cre.prepare_bcbio_run.sh * bcbio.rename_old_names.sh * bcl2fastq.sh +* cheo.R - mostly Venn diagrams to compare pipeline validations + some coverage analysis +* cre.prepare_bcbio_run.sh * cre.bam.validate.sh * cre.bcbio.upgrade.sh * cre.coverage.bamstats05.sh - calculate coverage * cre.fixit.sh - fixes sample names * cre.gemini_load.sh loads vep-annotated vcf to gemini db. +* cre.gnomad_scores.R - download and parse gnomad scores. * cre.kinship.R - to plot relatedness (kinship) diagram for a group of samples. Sometimes helps to detect and solve mislabelling. * cre.package.sh * cre.rtg.validate.sh - validates NA12878 calls vs genome in a bottle callset with RTG and a bed file diff --git a/cheo.R b/cheo.R index b438a97..71c68c8 100644 --- a/cheo.R +++ b/cheo.R @@ -1,200 +1,177 @@ -wgs_report_test = function() -{ - setwd("~/Desktop/work/wgs_report/") - test = read.csv("180_79585.sv.csv") -} - - - #https://cran.r-project.org/web/packages/VennDiagram/VennDiagram.pdf -pipeline_comparisons = function() -{ +pipeline_comparisons <- function(){ library("VennDiagram") - venn.plot=draw.pairwise.venn(945+4958,132+4958,4958, - c("BCBIO","Jacek"),fill=c("blue","red"),lty="blank", + venn.plot <- draw.pairwise.venn(945+4958,132+4958,4958, + c("BCBIO","Jacek"), fill=c("blue","red"), lty="blank", cex = 2, cat.cex=2, cat.just = list(c(-1,-1),c(1,1)), ext.length = 0.3, ext.line.lwd=2, ext.text = F, main="Sdf" ) - venn.plot=draw.pairwise.venn(46807953, 54328695, 44991414, - c("Nimblegen.capture","Agilent"), - fill=c("blue","red"),cex=2,cat.cex=2) + venn.plot <- draw.pairwise.venn(46807953, 54328695, 44991414, + c("Nimblegen.capture", "Agilent"), + fill = c("blue","red"), cex = 2, cat.cex = 2) - venn.plot3=draw.triple.venn(54328695,46807953,65824553, - 39517373,43777891,44991414, + venn.plot3 <- draw.triple.venn(54328695, 46807953, 65824553, + 39517373, 43777891, 44991414, 37679751, - c("Agilent","Nimblegen.capture","Nimblegen.empirical") + c("Agilent", "Nimblegen.capture", "Nimblegen.empirical") ) - venn.plot3=draw.triple.venn(1,2,3,12,23,13,123, - c("Agilent","Nimblegen.capture","Nimblegen.empirical") + venn.plot3 <- draw.triple.venn(1, 2, 3, 12, 23, 13, 123, + c("Agilent", "Nimblegen.capture", "Nimblegen.empirical") ) #example - for (pipeline in c("genap","bcbio","jacek","cpipe")) - { - for (sample in c("S1","S2","S4","S5","S6","S7","S8")) - { - agilent=unlist(read.table(paste0("S1.",pipeline,".agilent.omim.variants.indels"))) - nimblegen=unlist(read.table(paste0("S1.",pipeline,".nimblegen.omim.variants.indels"))) + for (pipeline in c("genap", "bcbio", "jacek", "cpipe")){ + for (sample in c("S1", "S2", "S4", "S5", "S6", "S7", "S8")){ + agilent <- unlist(read.table(paste0("S1.", pipeline, ".agilent.omim.variants.indels"))) + nimblegen <- unlist(read.table(paste0("S1.", pipeline, ".nimblegen.omim.variants.indels"))) #illumina=unlist(read.table(paste0("S1.",pipeline,".illumina.txt"))) - x=list(agilent,nimblegen) - names=list("Agilent","Nimblegen") - png(paste0(sample,".",pipeline,".png")) - grid.draw(venn.diagram(x,NULL,category.names=names)) + x <- list(agilent, nimblegen) + names <- list("Agilent", "Nimblegen") + png(paste0(sample, ".", pipeline, ".png")) + grid.draw(venn.diagram(x, NULL, category.names = names)) dev.off() } } - for (platform in c("agilent","nimblegen")) - { - for (sample in c("S1","S2","S4","S5","S6","S7","S8")) - { - - #sample="S3" - #platform="illumina" - jacek=unlist(read.table(paste0(sample,".jacek.",platform,".omim.variants.indels"))) - cpipe=unlist(read.table(paste0(sample,".cpipe.",platform,".omim.variants.indels"))) - bcbio=unlist(read.table(paste0(sample,".bcbio.",platform,".omim.variants.indels"))) - genap=unlist(read.table(paste0(sample,".genap.",platform,".omim.variants.indels"))) + for (platform in c("agilent", "nimblegen")){ + for (sample in c("S1", "S2", "S4", "S5", "S6", "S7", "S8")){ + #sample="S3" + #platform="illumina" + jacek <- unlist(read.table(paste0(sample, ".jacek.", platform, ".omim.variants.indels"))) + cpipe <- unlist(read.table(paste0(sample, ".cpipe.", platform, ".omim.variants.indels"))) + bcbio <- unlist(read.table(paste0(sample, ".bcbio.", platform, ".omim.variants.indels"))) + genap <- unlist(read.table(paste0(sample, ".genap.", platform, ".omim.variants.indels"))) - x=list(jacek,cpipe,bcbio,genap) - names=list("jacek","cpipe","bcbio","genap") + x <- list(jacek, cpipe, bcbio, genap) + names <- list("jacek", "cpipe", "bcbio", "genap") - png(paste0(sample,".",platform,".png")) - grid.draw(venn.diagram(x,NULL,category.names=names)) + png(paste0(sample, ".", platform, ".png")) + grid.draw(venn.diagram(x, NULL, category.names = names)) dev.off() } } } -intersect_bed_files = function() -{ +intersect_bed_files <- function(){ # http://davetang.org/muse/2013/01/02/iranges-and-genomicranges/ - source("http://bioconductor.org/biocLite.R") - biocLite("GenomicRanges") - library("GenomicRanges") - setwd("coverage/nimblegen") - capture=read.table("nimblegen.capture",header=F) -colnames(capture)=c('chr','start','end') -capture.bed = with(capture, GRanges(chr, IRanges(start+1, end))) - -empirical=read.table("nimblegen.empirical",header=F) -colnames(empirical)=c('chr','start','end') -empirical.bed = with(capture, GRanges(chr, IRanges(start+1, end))) - -omim=read.table("omim.orphanet.goodnames.v2.bed",header=F) -colnames(empirical)=c('chr','start','end') -omim.bed = with(capture, GRanges(chr, IRanges(start+1, end))) - -bed.intersect = intersect(omim.bed,capture.bed) - - -#bed files -setwd("venn_diagrams/bed_intersection/") -omim=unlist(read.table("omim.orphanet.goodnames.v2.bed.exons")) -agilent=unlist(read.table("omim_vs_agilent.50percent.wo.exons")) -nimblegen=unlist(read.table("omim_vs_nimblegen.capture.50percent.wo.exons")) -illumina=unlist(read.table("omim_vs_illumina.50percent.wo.exons")) - -x=list(omim,agilent,nimblegen,illumina) -names=list("omim","agilent","nimblegen","illumina") - -grid.draw(venn.diagram(x,NULL,category.names=names)) - + source("http://bioconductor.org/biocLite.R") + biocLite("GenomicRanges") + library("GenomicRanges") + setwd("coverage/nimblegen") + capture <- read.table("nimblegen.capture", header = F) + colnames(capture) <- c("chr", "start", "end") + capture.bed <- with(capture, GRanges(chr, IRanges(start+1, end))) + + empirical <- read.table("nimblegen.empirical", header = F) + colnames(empirical) <- c("chr", "start", "end") + empirical.bed <- with(capture, GRanges(chr, IRanges(start+1, end))) + + omim <- read.table("omim.orphanet.goodnames.v2.bed", header = F) + colnames(empirical) <- c("chr", "start", "end") + omim.bed <- with(capture, GRanges(chr, IRanges(start+1, end))) + + bed.intersect <- intersect(omim.bed, capture.bed) + + #bed files + setwd("venn_diagrams/bed_intersection/") + omim <- unlist(read.table("omim.orphanet.goodnames.v2.bed.exons")) + agilent <- unlist(read.table("omim_vs_agilent.50percent.wo.exons")) + nimblegen <- unlist(read.table("omim_vs_nimblegen.capture.50percent.wo.exons")) + illumina <- unlist(read.table("omim_vs_illumina.50percent.wo.exons")) + + x <- list(omim,agilent,nimblegen,illumina) + names <- list("omim","agilent","nimblegen","illumina") + + grid.draw(venn.diagram(x, NULL, category.names = names)) } -variants_parameter = function() -{ - type="snps" - type="indels" - for (platform in c("agilent","nimblegen")) - { - for (sample in c("S1","S2","S4","S5","S6","S7","S8")) - { - sample="S1" - platform="agilent" - cpipe.hom=read.table(paste0(sample,".cpipe.",platform,".omim.",type,".hom.AD")) - cpipe.het=read.table(paste0(sample,".cpipe.",platform,".omim.",type,".het.AD")) +variants_parameter <- function(){ + type <- "snps" + type <- "indels" + for (platform in c("agilent", "nimblegen")){ + for (sample in c("S1", "S2", "S4", "S5", "S6", "S7", "S8")){ + sample <- "S1" + platform <- "agilent" + cpipe.hom <- read.table(paste0(sample, ".cpipe.", platform, ".omim.", type, ".hom.AD")) + cpipe.het <- read.table(paste0(sample, ".cpipe.", platform, ".omim.", type, ".het.AD")) - bcbio.hom=read.table(paste0(sample,".bcbio.",platform,".omim.",type,".hom.AD")) - bcbio.het=read.table(paste0(sample,".bcbio.",platform,".omim.",type,".het.AD")) + bcbio.hom <- read.table(paste0(sample, ".bcbio.", platform, ".omim.", type, ".hom.AD")) + bcbio.het <- read.table(paste0(sample, ".bcbio.", platform, ".omim.", type, ".het.AD")) - genap.hom=read.table(paste0(sample,".genap.",platform,".omim.",type,".hom.AD")) - genap.het=read.table(paste0(sample,".genap.",platform,".omim.",type,".het.AD")) + genap.hom <- read.table(paste0(sample, ".genap.", platform, ".omim.", type, ".hom.AD")) + genap.het <- read.table(paste0(sample, ".genap.", platform, ".omim.", type, ".het.AD")) - genap.only = read.table("S1.genap.only.recode.vcf.AD") - v=c(genap.hom,genap.het,genap.only) - names=c("genap.hom","genap.het","genap.only") + genap.only <- read.table("S1.genap.only.recode.vcf.AD") + v <- c(genap.hom, genap.het, genap.only) + names <- c("genap.hom", "genap.het", "genap.only") #indels - v=c(cpipe.het,bcbio.het,genap.het) - names=c("cpipe.het","bcbio.het","genap.het") + v <- c(cpipe.het, bcbio.het, genap.het) + names <- c("cpipe.het", "bcbio.het", "genap.het") - png(paste0(sample,".",platform,".",type,".png"),width=1000) - boxplot(v,names=names) + png(paste0(sample,".", platform, ".", type, ".png"), width = 1000) + boxplot(v, names = names) dev.off() } } - setwd("~/cluster/dorin_test") - b1=read.table("UNIQUE_to_BCBIO_1496461.vcf.AD") - b2=read.table("UNIQUE_to_BCBIO_1496462.vcf.AD") - b3=read.table("UNIQUE_to_BCBIO_1496463.vcf.AD") - g1=read.table("UNIQUE_to_DNASEQ_1496461.vcf.AD") - g2=read.table("UNIQUE_to_DNASEQ_1496462.vcf.AD") - g3=read.table("UNIQUE_to_DNASEQ_1496463.vcf.AD") + setwd("~/cluster/dorin_test") + b1 <- read.table("UNIQUE_to_BCBIO_1496461.vcf.AD") + b2 <- read.table("UNIQUE_to_BCBIO_1496462.vcf.AD") + b3 <- read.table("UNIQUE_to_BCBIO_1496463.vcf.AD") + g1 <- read.table("UNIQUE_to_DNASEQ_1496461.vcf.AD") + g2 <- read.table("UNIQUE_to_DNASEQ_1496462.vcf.AD") + g3 <- read.table("UNIQUE_to_DNASEQ_1496463.vcf.AD") - v=c(b1,b2,b3,g1,g2,g3) - names=c("b1","b2","b3","g1","g2","g3") + v <- c(b1, b2, b3, g1, g2, g3) + names <- c("b1", "b2", "b3", "g1", "g2", "g3") - png(paste0(sample,".",platform,".",type,".png"),width=1000) - boxplot(v,names=names) + png(paste0(sample, ".", platform, ".", type, ".png"), width = 1000) + boxplot(v, names = names) } # title = "cheo.omim_genes.coverage" # coverage.gene_panel(title) # plots coverage for every gene for all samples in samples.txt, each sample should have sample.coverage - output of # bam.coverage.bamstats05.sh -coverage.gene_panel = function (title) -{ +coverage.gene_panel <- function(title){ #test - title = "test" + title <- "test" setwd("~/Desktop/work") - files = list.files(".","\\.coverage$") + files <- list.files(".", "\\.coverage$") #samples = unlist(read.table("samples.txt", stringsAsFactors=F)) - coverage = read.delim(files[1],header=T,stringsAsFactors = F) - coverage = coverage[,c("gene","avg")] - colnames(coverage)[2]=files[1] - - for (file in tail(files,-1)) - { - sample_coverage = read.delim(file,header=T,stringsAsFactors = F) - sample_coverage = sample_coverage[,c("gene","avg")] - colnames(sample_coverage)[2]=file - coverage = cbind(coverage,sample_coverage[2]) + coverage <- read.delim(files[1], header = T, stringsAsFactors = F) + coverage <- coverage[,c("gene", "avg")] + colnames(coverage)[2] <- files[1] + + for (file in tail(files,-1)){ + sample_coverage <- read.delim(file,header=T,stringsAsFactors = F) + sample_coverage <- sample_coverage[,c("gene", "avg")] + colnames(sample_coverage)[2] <- file + coverage <- cbind(coverage,sample_coverage[2]) } - row.names(coverage) = coverage$gene - coverage$gene=NULL + row.names(coverage) <- coverage$gene + coverage$gene <- NULL - n_genes = nrow(coverage) + n_genes <- nrow(coverage) - for(i in seq(1,ceiling(n_genes/100))) - { - start_index = (i-1)*100+1 - end_index = i*100 + for(i in seq(1, ceiling(n_genes/100))){ + start_index <- (i-1)*100+1 + end_index <- i*100 - if (end_index > n_genes) - end_index = n_genes + if (end_index > n_genes) end_index <- n_genes - png(paste0(title,".part",i,".png"),res=300,width=5000,height=2000) - boxplot(t(coverage[start_index:end_index,]),las=2,cex.axis=0.8, - main=paste0("Coverage in ",length(files)," samples of NextSeq for ",title," gene panel,part ",i)) + png(paste0(title, ".part", i, ".png"), res = 300, width = 5000, height = 2000) + boxplot(t(coverage[start_index:end_index,]), las = 2, cex.axis = 0.8, + main = paste0("Coverage in ", length(files), " samples of NextSeq for ", + title, " gene panel,part ", i)) dev.off() } } @@ -204,99 +181,89 @@ coverage.gene_panel = function (title) # CA0246.coverage 17268 # CH0317.coverage 15833 # GM15262.coverage 14435 -coverage.all_genes = function () -{ - title="Coverage in project 913 across all protein coding genes,no outliers" +coverage.all_genes <- function (){ + title <- "Coverage in project 913 across all protein coding genes,no outliers" library("matrixStats") setwd("~/Desktop/work") - samples = unlist(read.table("samples.txt", stringsAsFactors=F)) + samples <- unlist(read.table("samples.txt", stringsAsFactors = F)) #hopefully 1st sample has most genes - coverage = read.delim(paste0(samples[1],".coverage"),header=T,stringsAsFactors = F) - coverage = coverage[,c("gene","mean")] - colnames(coverage)[2]=samples[1] - row.names(coverage)=coverage$gene - coverage$gene=NULL + coverage <- read.delim(paste0(samples[1], ".coverage"), header = T, stringsAsFactors = F) + coverage <- coverage[,c("gene","mean")] + colnames(coverage)[2] <- samples[1] + row.names(coverage) <- coverage$gene + coverage$gene <- NULL - for (sample in tail(samples,-1)) - { - sample_coverage = read.delim(paste0(sample,".coverage"),header=T,stringsAsFactors = F) - sample_coverage = sample_coverage[,c("gene","mean")] - colnames(sample_coverage)[2]=sample - coverage = merge(coverage,sample_coverage,by.x="row.names",by.y="gene",all.x=T) - row.names(coverage) = coverage$Row.names - coverage$Row.names=NULL + for (sample in tail(samples,-1)){ + sample_coverage <- read.delim(paste0(sample,".coverage"),header=T,stringsAsFactors = F) + sample_coverage <- sample_coverage[,c("gene","mean")] + colnames(sample_coverage)[2] <- sample + coverage <- merge(coverage, sample_coverage, by.x = "row.names", by.y = "gene", all.x = T) + row.names(coverage) <- coverage$Row.names + coverage$Row.names <- NULL } - coverage[is.na(coverage)]=0 - coverage$Mean = rowMeans(coverage) - png("coverage.png",res=300,width=5000,height=2000) - boxplot(coverage, - las=1, - cex.axis=0.6, - main=title,outline = F) + coverage[is.na(coverage)] <- 0 + coverage$Mean <- rowMeans(coverage) + png("coverage.png", res = 300, width = 5000, height = 2000) + boxplot(coverage, las = 1, cex.axis = 0.6, + main = title, outline = F) dev.off() - - meds=rbind(colnames(coverage),colMedians(as.matrix(coverage))) - write.table(meds,"medians.txt",col.names = F, quote = F,row.names = F) + meds <- rbind(colnames(coverage), colMedians(as.matrix(coverage))) + write.table(meds, "medians.txt", col.names = F, quote = F, row.names = F) } #%of bases covered more than 10x -coverage.percent_more_than10x = function() -{ +coverage.percent_more_than10x <- function(){ setwd("~/Desktop/work") - samples = unlist(read.table("samples.txt", stringsAsFactors=F)) + samples <- unlist(read.table("samples.txt", stringsAsFactors = F)) - for (sample in samples) - { - coverage = read.delim(paste0(sample,".coverage"),header=T,stringsAsFactors = F) - total_len = sum(coverage$length) + for (sample in samples){ + coverage <- read.delim(paste0(sample, ".coverage"), header = T, stringsAsFactors = F) + total_len <- sum(coverage$length) - coverage_10x = coverage[coverage$mean>10,] - len_10x = sum(coverage_10x$length) + coverage_10x <- coverage[coverage$mean > 10,] + len_10x <- sum(coverage_10x$length) - print(paste0(sample," ",len_10x/total_len)) + print(paste0(sample, " ", len_10x/total_len)) } } -omim_table_manipulation = function() -{ +omim_table_manipulation <- function(){ setwd("~/Desktop/reference_tables/OMIM_2017-04-13/") mimTitles.percent <- read.delim2("mimTitles.percent.txt", comment.char="#") genemap2 <- read.delim("genemap2.txt", comment.char="#") - mimTitles.percent = merge(mimTitles.percent,genemap2, by.x="Mim.Number",by.y="Mim.Number",all.x=T,all.y=F) - write.csv(mimTitles.percent,"mimTitles.percent.location.csv",row.names = F) + mimTitles.percent <- merge(mimTitles.percent, genemap2, by.x = "Mim.Number", + by.y = "Mim.Number", all.x = T, all.y = F) + write.csv(mimTitles.percent, "mimTitles.percent.location.csv", row.names = F) } -read_length_distribution = function(family) -{ +read_length_distribution <- function(family){ - family_data = read.delim(paste0(family,".tsv"), stringsAsFactors=F) - samples = unique(family_data$sample) + family_data <- read.delim(paste0(family,".tsv"), stringsAsFactors = F) + samples <- unique(family_data$sample) - for (sample in samples) - { - tmp = subset(family_data,sample==sample) - tmp$sample=NULL - print(paste0(sample," ",round(sum(tmp$Length*tmp$Count) / sum(tmp$Count))),quote=F) + for (sample in samples){ + tmp <- subset(family_data, sample == sample) + tmp$sample <- NULL + print(paste0(sample, " ", round(sum(tmp$Length*tmp$Count) / sum(tmp$Count))), quote = F) } } -read_lengths = function() -{ +read_lengths <- function(){ setwd("~/Desktop/project_cheo/2017-04-12_read_lengths/") - families = unlist(read.table("projects.txt", stringsAsFactors=F)) - for (family in families) - { + families <- unlist(read.table("projects.txt", stringsAsFactors = F)) + for (family in families){ read_length_distribution(family) } - read_lengths <- read.csv("read_lengths.txt", sep="", stringsAsFactors=FALSE) - read_lengths$id=NULL + read_lengths <- read.csv("read_lengths.txt", sep="", stringsAsFactors = F) + read_lengths$id <- NULL - png("read_lengths_all_samples.png",width=2000) - barplot(read_lengths$average_read_length, names.arg = read_lengths$sample,main = "Average read lengths for NextSeq samples is 134", - las=2) + png("read_lengths_all_samples.png", width = 2000) + barplot(read_lengths$average_read_length, names.arg = read_lengths$sample, + main = "Average read lengths for NextSeq samples is 134", + las = 2) dev.off() } diff --git a/cre.gnomad_scores.R b/cre.gnomad_scores.R index 87bd6bb..160b98d 100644 --- a/cre.gnomad_scores.R +++ b/cre.gnomad_scores.R @@ -14,37 +14,38 @@ # git clone https://github.com/naumenko-sa/bioscripts source("~/bioscripts/genes.R") -library("R.utils") +library(R.utils) print("Working hard ...") print("Downloading constraints.txt.bgz") -gnomad_scores_url = "https://storage.googleapis.com/gnomad-public/release/2.1/ht/constraint/constraint.txt.bgz" -download.file(gnomad_scores_url,"gnomad_scores.txt.bgz") -gunzip("gnomad_scores.txt.bgz","gnomad_scores.txt") -gnomad_scores = read.delim("gnomad_scores.txt", stringsAsFactors=F) -gnomad_scores = gnomad_scores[,c("gene","transcript","canonical","oe_lof","oe_mis","pLI","pRec","pNull")] -gnomad_scores = gnomad_scores[gnomad_scores$canonical == "true",] +gnomad_scores_url <- "https://storage.googleapis.com/gnomad-public/release/2.1/ht/constraint/constraint.txt.bgz" +download.file(gnomad_scores_url, "gnomad_scores.txt.bgz") +gunzip("gnomad_scores.txt.bgz", "gnomad_scores.txt") +gnomad_scores <- read.delim("gnomad_scores.txt", stringsAsFactors = F) +gnomad_scores <- gnomad_scores[,c("gene","transcript","canonical","oe_lof","oe_mis","pLI","pRec","pNull")] +gnomad_scores <- gnomad_scores[gnomad_scores$canonical == "true",] #still has a few duplicates #gnomad_scores[duplicated(gnomad_scores$gene),] -gnomad_scores = gnomad_scores[!duplicated(gnomad_scores$gene),] +gnomad_scores <- gnomad_scores[!duplicated(gnomad_scores$gene),] -mart = init_mart_human() +mart <- init_mart_human() get_protein_coding_genes(mart) -genes_transcripts = read.csv("genes.transcripts.csv", stringsAsFactors = F) -gnomad_scores = merge(gnomad_scores,genes_transcripts,by.x="transcript",by.y="Ensembl_transcript_id",all.x=T,all.y=F) +genes_transcripts <- read.csv("genes.transcripts.csv", stringsAsFactors = F) +gnomad_scores <- merge(gnomad_scores, genes_transcripts, by.x = "transcript", + by.y = "Ensembl_transcript_id", all.x = T, all.y = F) #some genes are absent in grch37 #gnomad_scores[is.na(gnomad_scores$Ensembl_gene_id),] +gnomad_scores <- gnomad_scores[!is.na(gnomad_scores$Ensembl_gene_id),] +gnomad_scores <- gnomad_scores[,c("Ensembl_gene_id","oe_lof","oe_mis","pLI","pRec","pNull")] -gnomad_scores = gnomad_scores[!is.na(gnomad_scores$Ensembl_gene_id),] -gnomad_scores = gnomad_scores[,c("Ensembl_gene_id","oe_lof","oe_mis","pLI","pRec","pNull")] +colnames(gnomad_scores) <- c("Ensembl_gene_id", "Gnomad_oe_lof_score", "Gnomad_oe_mis_score", + "Exac_pli_score", "Exac_prec_score", "Exac_pnull_score") +write.csv(gnomad_scores, "gnomad_scores.csv", row.names = F) -colnames(gnomad_scores) = c("Ensembl_gene_id","Gnomad_oe_lof_score","Gnomad_oe_mis_score","Exac_pli_score","Exac_prec_score","Exac_pnull_score") -write.csv(gnomad_scores,"gnomad_scores.csv",row.names = F) - -file.copy("gnomad_scores.csv","~/cre/data",overwrite=T) +file.copy("gnomad_scores.csv", "~/cre/data", overwrite = T) file.remove("genes.transcripts.csv") file.remove("gnomad_scores.txt") -file.remove("gnomad_scores.csv") +file.remove("gnomad_scores.csv") \ No newline at end of file diff --git a/cre.panel_filter.R b/cre.panel_filter.R index 0d29d69..4b09429 100644 --- a/cre.panel_filter.R +++ b/cre.panel_filter.R @@ -31,15 +31,15 @@ filter_variants_genomics_england_panel = function(variant_report.csv,panel.tsv,o variants.ens = variants[variants$Ensembl_gene_id %in% panel$PanelAPP.EnsemblId.GRch37,] variants.gene = variants[variants$Gene %in% panel$PanelAPP.Gene.Symbol,] - variants = unique(sort(rbind(variants.ens,variants.gene))) + variants = unique(rbind(variants.ens,variants.gene)) write.csv(variants,output.csv,row.names=F) } args = commandArgs(trailingOnly = T) -if (args[4] == "genomics_england") +if (is.null(args[4])) { - filter_variants_genomics_england_panel(args[1],args[2],args[3]) -}else{ filter_variants(args[1],args[2],args[3]) +}else{ + filter_variants_genomics_england_panel(args[1],args[2],args[3]) } \ No newline at end of file