From f7ef8b914be2f19f67fa387b022713bbc563bcdb Mon Sep 17 00:00:00 2001
From: "naumenko.sa" <evolgenomicslab@gmail.com>
Date: Thu, 17 Jan 2019 15:45:50 -0500
Subject: [PATCH] 	modified:   README.md 	modified:   cheo.R 
 modified:   cre.gnomad_scores.R 	modified:   cre.panel_filter.R

---
 README.md           |   4 +-
 cheo.R              | 363 ++++++++++++++++++++------------------------
 cre.gnomad_scores.R |  37 ++---
 cre.panel_filter.R  |   8 +-
 4 files changed, 191 insertions(+), 221 deletions(-)

diff --git a/README.md b/README.md
index 32fc68f..0ea6142 100644
--- a/README.md
+++ b/README.md
@@ -149,14 +149,16 @@ vcf.platypus.getNV.sh ${family}-platypus-annotated-decomposed.vcf.gz
 * bcbio.array.pbs
 * bcbio.pbs
 * bcbio.prepare_families.sh
-* cre.prepare_bcbio_run.sh
 * bcbio.rename_old_names.sh
 * bcl2fastq.sh
+* cheo.R - mostly Venn diagrams to compare pipeline validations + some coverage analysis
+* cre.prepare_bcbio_run.sh
 * cre.bam.validate.sh
 * cre.bcbio.upgrade.sh
 * cre.coverage.bamstats05.sh - calculate coverage
 * cre.fixit.sh - fixes sample names
 * cre.gemini_load.sh loads vep-annotated vcf to gemini db.
+* cre.gnomad_scores.R - download and parse gnomad scores.
 * cre.kinship.R - to plot relatedness (kinship) diagram for a group of samples. Sometimes helps to detect and solve mislabelling.
 * cre.package.sh
 * cre.rtg.validate.sh - validates NA12878 calls vs genome in a bottle callset with RTG and a bed file
diff --git a/cheo.R b/cheo.R
index b438a97..71c68c8 100644
--- a/cheo.R
+++ b/cheo.R
@@ -1,200 +1,177 @@
-wgs_report_test = function()
-{
-    setwd("~/Desktop/work/wgs_report/")
-    test = read.csv("180_79585.sv.csv")
-}
-
-
-
 #https://cran.r-project.org/web/packages/VennDiagram/VennDiagram.pdf
-pipeline_comparisons = function()
-{
+pipeline_comparisons <- function(){
   
     library("VennDiagram")
-    venn.plot=draw.pairwise.venn(945+4958,132+4958,4958,
-              c("BCBIO","Jacek"),fill=c("blue","red"),lty="blank",
+    venn.plot <- draw.pairwise.venn(945+4958,132+4958,4958,
+                c("BCBIO","Jacek"), fill=c("blue","red"), lty="blank",
               cex = 2, cat.cex=2, cat.just = list(c(-1,-1),c(1,1)),
               ext.length = 0.3,  ext.line.lwd=2,
               ext.text = F, main="Sdf"
               )
-    venn.plot=draw.pairwise.venn(46807953, 54328695, 44991414,
-                             c("Nimblegen.capture","Agilent"),
-                             fill=c("blue","red"),cex=2,cat.cex=2)
+    venn.plot <- draw.pairwise.venn(46807953, 54328695, 44991414,
+                             c("Nimblegen.capture", "Agilent"),
+                             fill = c("blue","red"), cex = 2, cat.cex = 2)
 
-    venn.plot3=draw.triple.venn(54328695,46807953,65824553,
-                            39517373,43777891,44991414,
+    venn.plot3 <- draw.triple.venn(54328695, 46807953, 65824553,
+                            39517373, 43777891, 44991414,
                             37679751,
-                            c("Agilent","Nimblegen.capture","Nimblegen.empirical")
+                            c("Agilent", "Nimblegen.capture", "Nimblegen.empirical")
     )
 
-    venn.plot3=draw.triple.venn(1,2,3,12,23,13,123,
-      c("Agilent","Nimblegen.capture","Nimblegen.empirical")
+    venn.plot3 <- draw.triple.venn(1, 2, 3, 12, 23, 13, 123,
+                                   c("Agilent", "Nimblegen.capture", "Nimblegen.empirical")
     )
 
     #example
-    for (pipeline in c("genap","bcbio","jacek","cpipe"))
-    {
-        for (sample in c("S1","S2","S4","S5","S6","S7","S8"))
-        {
-            agilent=unlist(read.table(paste0("S1.",pipeline,".agilent.omim.variants.indels")))
-            nimblegen=unlist(read.table(paste0("S1.",pipeline,".nimblegen.omim.variants.indels")))
+    for (pipeline in c("genap", "bcbio", "jacek", "cpipe")){
+        for (sample in c("S1", "S2", "S4", "S5", "S6", "S7", "S8")){
+            agilent <- unlist(read.table(paste0("S1.", pipeline, ".agilent.omim.variants.indels")))
+            nimblegen <- unlist(read.table(paste0("S1.", pipeline, ".nimblegen.omim.variants.indels")))
             #illumina=unlist(read.table(paste0("S1.",pipeline,".illumina.txt")))
 
-            x=list(agilent,nimblegen)
-            names=list("Agilent","Nimblegen")
-            png(paste0(sample,".",pipeline,".png"))
-            grid.draw(venn.diagram(x,NULL,category.names=names))
+            x <- list(agilent, nimblegen)
+            names <- list("Agilent", "Nimblegen")
+            png(paste0(sample, ".", pipeline, ".png"))
+            grid.draw(venn.diagram(x, NULL, category.names = names))
             dev.off()
         }
     }
 
-    for (platform in c("agilent","nimblegen"))
-    {
-        for (sample in c("S1","S2","S4","S5","S6","S7","S8"))
-        {
-    
-        #sample="S3"
-        #platform="illumina"
-            jacek=unlist(read.table(paste0(sample,".jacek.",platform,".omim.variants.indels")))
-            cpipe=unlist(read.table(paste0(sample,".cpipe.",platform,".omim.variants.indels")))
-            bcbio=unlist(read.table(paste0(sample,".bcbio.",platform,".omim.variants.indels")))
-            genap=unlist(read.table(paste0(sample,".genap.",platform,".omim.variants.indels")))
+    for (platform in c("agilent", "nimblegen")){
+        for (sample in c("S1", "S2", "S4", "S5", "S6", "S7", "S8")){
+            #sample="S3"
+            #platform="illumina"
+            jacek <- unlist(read.table(paste0(sample, ".jacek.", platform, ".omim.variants.indels")))
+            cpipe <- unlist(read.table(paste0(sample, ".cpipe.", platform, ".omim.variants.indels")))
+            bcbio <- unlist(read.table(paste0(sample, ".bcbio.", platform, ".omim.variants.indels")))
+            genap <- unlist(read.table(paste0(sample, ".genap.", platform, ".omim.variants.indels")))
 
-            x=list(jacek,cpipe,bcbio,genap)
-            names=list("jacek","cpipe","bcbio","genap")
+            x <- list(jacek, cpipe, bcbio, genap)
+            names <- list("jacek", "cpipe", "bcbio", "genap")
 
-            png(paste0(sample,".",platform,".png"))
-            grid.draw(venn.diagram(x,NULL,category.names=names))
+            png(paste0(sample, ".", platform, ".png"))
+            grid.draw(venn.diagram(x, NULL, category.names = names))
             dev.off()
         }
     }
 
 }
 
-intersect_bed_files = function()
-{
+intersect_bed_files <- function(){
     # http://davetang.org/muse/2013/01/02/iranges-and-genomicranges/
-  source("http://bioconductor.org/biocLite.R")
-  biocLite("GenomicRanges")
-  library("GenomicRanges")
-  setwd("coverage/nimblegen")
-  capture=read.table("nimblegen.capture",header=F)
-colnames(capture)=c('chr','start','end')
-capture.bed = with(capture, GRanges(chr, IRanges(start+1, end)))
-
-empirical=read.table("nimblegen.empirical",header=F)
-colnames(empirical)=c('chr','start','end')
-empirical.bed = with(capture, GRanges(chr, IRanges(start+1, end)))
-
-omim=read.table("omim.orphanet.goodnames.v2.bed",header=F)
-colnames(empirical)=c('chr','start','end')
-omim.bed = with(capture, GRanges(chr, IRanges(start+1, end)))
-
-bed.intersect = intersect(omim.bed,capture.bed)
-
-
-#bed files
-setwd("venn_diagrams/bed_intersection/")
-omim=unlist(read.table("omim.orphanet.goodnames.v2.bed.exons"))
-agilent=unlist(read.table("omim_vs_agilent.50percent.wo.exons"))
-nimblegen=unlist(read.table("omim_vs_nimblegen.capture.50percent.wo.exons"))
-illumina=unlist(read.table("omim_vs_illumina.50percent.wo.exons"))
-
-x=list(omim,agilent,nimblegen,illumina)
-names=list("omim","agilent","nimblegen","illumina")
-
-grid.draw(venn.diagram(x,NULL,category.names=names))
-
+    source("http://bioconductor.org/biocLite.R")
+    biocLite("GenomicRanges")
+    library("GenomicRanges")
+    setwd("coverage/nimblegen")
+    capture <- read.table("nimblegen.capture", header = F)
+    colnames(capture) <- c("chr", "start", "end")
+    capture.bed <- with(capture, GRanges(chr, IRanges(start+1, end)))
+
+    empirical <- read.table("nimblegen.empirical", header = F)
+    colnames(empirical) <- c("chr", "start", "end")
+    empirical.bed <- with(capture, GRanges(chr, IRanges(start+1, end)))
+
+    omim <- read.table("omim.orphanet.goodnames.v2.bed", header = F)
+    colnames(empirical) <- c("chr", "start", "end")
+    omim.bed <- with(capture, GRanges(chr, IRanges(start+1, end)))
+
+    bed.intersect <- intersect(omim.bed, capture.bed)
+
+    #bed files
+    setwd("venn_diagrams/bed_intersection/")
+    omim <- unlist(read.table("omim.orphanet.goodnames.v2.bed.exons"))
+    agilent <- unlist(read.table("omim_vs_agilent.50percent.wo.exons"))
+    nimblegen <- unlist(read.table("omim_vs_nimblegen.capture.50percent.wo.exons"))
+    illumina <- unlist(read.table("omim_vs_illumina.50percent.wo.exons"))
+
+    x <- list(omim,agilent,nimblegen,illumina)
+    names <- list("omim","agilent","nimblegen","illumina")
+
+    grid.draw(venn.diagram(x, NULL, category.names = names))
 }
 
 
-variants_parameter = function()
-{
-    type="snps"
-    type="indels"
-    for (platform in c("agilent","nimblegen"))
-    {
-        for (sample in c("S1","S2","S4","S5","S6","S7","S8"))
-        {
-            sample="S1"
-            platform="agilent"
-            cpipe.hom=read.table(paste0(sample,".cpipe.",platform,".omim.",type,".hom.AD"))
-            cpipe.het=read.table(paste0(sample,".cpipe.",platform,".omim.",type,".het.AD"))
+variants_parameter <- function(){
+    type <- "snps"
+    type <- "indels"
+    for (platform in c("agilent", "nimblegen")){
+        for (sample in c("S1", "S2", "S4", "S5", "S6", "S7", "S8")){
+            sample <- "S1"
+            platform <- "agilent"
+            cpipe.hom <- read.table(paste0(sample, ".cpipe.", platform, ".omim.", type, ".hom.AD"))
+            cpipe.het <- read.table(paste0(sample, ".cpipe.", platform, ".omim.", type, ".het.AD"))
     
-            bcbio.hom=read.table(paste0(sample,".bcbio.",platform,".omim.",type,".hom.AD"))
-            bcbio.het=read.table(paste0(sample,".bcbio.",platform,".omim.",type,".het.AD"))
+            bcbio.hom <- read.table(paste0(sample, ".bcbio.", platform, ".omim.", type, ".hom.AD"))
+            bcbio.het <- read.table(paste0(sample, ".bcbio.", platform, ".omim.", type, ".het.AD"))
     
-            genap.hom=read.table(paste0(sample,".genap.",platform,".omim.",type,".hom.AD"))
-            genap.het=read.table(paste0(sample,".genap.",platform,".omim.",type,".het.AD"))
+            genap.hom <- read.table(paste0(sample, ".genap.", platform, ".omim.", type, ".hom.AD"))
+            genap.het <- read.table(paste0(sample, ".genap.", platform, ".omim.", type, ".het.AD"))
 
-            genap.only = read.table("S1.genap.only.recode.vcf.AD")
-            v=c(genap.hom,genap.het,genap.only)
-            names=c("genap.hom","genap.het","genap.only")
+            genap.only <- read.table("S1.genap.only.recode.vcf.AD")
+            v <- c(genap.hom, genap.het, genap.only)
+            names <- c("genap.hom", "genap.het", "genap.only")
     
         #indels
-        v=c(cpipe.het,bcbio.het,genap.het)
-        names=c("cpipe.het","bcbio.het","genap.het")
+        v <- c(cpipe.het, bcbio.het, genap.het)
+        names <- c("cpipe.het", "bcbio.het", "genap.het")
     
-        png(paste0(sample,".",platform,".",type,".png"),width=1000)
-        boxplot(v,names=names)
+        png(paste0(sample,".", platform, ".", type, ".png"), width = 1000)
+        boxplot(v, names = names)
         dev.off()
       }
     }
 
-      setwd("~/cluster/dorin_test")
-      b1=read.table("UNIQUE_to_BCBIO_1496461.vcf.AD")
-      b2=read.table("UNIQUE_to_BCBIO_1496462.vcf.AD")
-      b3=read.table("UNIQUE_to_BCBIO_1496463.vcf.AD")
-      g1=read.table("UNIQUE_to_DNASEQ_1496461.vcf.AD")
-      g2=read.table("UNIQUE_to_DNASEQ_1496462.vcf.AD")
-      g3=read.table("UNIQUE_to_DNASEQ_1496463.vcf.AD")
+    setwd("~/cluster/dorin_test")
+    b1 <- read.table("UNIQUE_to_BCBIO_1496461.vcf.AD")
+    b2 <- read.table("UNIQUE_to_BCBIO_1496462.vcf.AD")
+    b3 <- read.table("UNIQUE_to_BCBIO_1496463.vcf.AD")
+    g1 <- read.table("UNIQUE_to_DNASEQ_1496461.vcf.AD")
+    g2 <- read.table("UNIQUE_to_DNASEQ_1496462.vcf.AD")
+    g3 <- read.table("UNIQUE_to_DNASEQ_1496463.vcf.AD")
 
-      v=c(b1,b2,b3,g1,g2,g3)
-      names=c("b1","b2","b3","g1","g2","g3")
+    v <- c(b1, b2, b3, g1, g2, g3)
+    names <- c("b1", "b2", "b3", "g1", "g2", "g3")
 
-      png(paste0(sample,".",platform,".",type,".png"),width=1000)
-      boxplot(v,names=names)
+    png(paste0(sample, ".", platform, ".", type, ".png"), width = 1000)
+    boxplot(v, names = names)
 }
 
 # title = "cheo.omim_genes.coverage"
 # coverage.gene_panel(title)
 # plots coverage for every gene for all samples in samples.txt, each sample should have sample.coverage - output of
 # bam.coverage.bamstats05.sh
-coverage.gene_panel = function (title)
-{
+coverage.gene_panel <- function(title){
     #test
-    title = "test"
+    title <- "test"
     setwd("~/Desktop/work")
-    files = list.files(".","\\.coverage$")
+    files <- list.files(".", "\\.coverage$")
     #samples = unlist(read.table("samples.txt", stringsAsFactors=F))
     
-    coverage = read.delim(files[1],header=T,stringsAsFactors = F)
-    coverage = coverage[,c("gene","avg")]
-    colnames(coverage)[2]=files[1]
-
-    for (file in tail(files,-1))
-    {
-        sample_coverage = read.delim(file,header=T,stringsAsFactors = F)
-        sample_coverage = sample_coverage[,c("gene","avg")]
-        colnames(sample_coverage)[2]=file
-        coverage = cbind(coverage,sample_coverage[2])
+    coverage <- read.delim(files[1], header = T, stringsAsFactors = F)
+    coverage <- coverage[,c("gene", "avg")]
+    colnames(coverage)[2] <- files[1]
+
+    for (file in tail(files,-1)){
+        sample_coverage <- read.delim(file,header=T,stringsAsFactors = F)
+        sample_coverage <- sample_coverage[,c("gene", "avg")]
+        colnames(sample_coverage)[2] <- file
+        coverage <- cbind(coverage,sample_coverage[2])
     }
-    row.names(coverage) = coverage$gene
-    coverage$gene=NULL
+    row.names(coverage) <- coverage$gene
+    coverage$gene <- NULL
     
-    n_genes = nrow(coverage)
+    n_genes <- nrow(coverage)
     
-    for(i in seq(1,ceiling(n_genes/100)))
-    {
-         start_index = (i-1)*100+1
-         end_index = i*100
+    for(i in seq(1, ceiling(n_genes/100))){
+         start_index <- (i-1)*100+1
+         end_index <- i*100
          
-         if (end_index > n_genes)
-           end_index = n_genes
+         if (end_index > n_genes) end_index <-  n_genes
          
-         png(paste0(title,".part",i,".png"),res=300,width=5000,height=2000)
-         boxplot(t(coverage[start_index:end_index,]),las=2,cex.axis=0.8,
-                 main=paste0("Coverage in ",length(files)," samples of NextSeq for ",title," gene panel,part ",i))
+         png(paste0(title, ".part", i, ".png"), res = 300, width = 5000, height = 2000)
+         boxplot(t(coverage[start_index:end_index,]), las = 2, cex.axis = 0.8,
+                 main = paste0("Coverage in ", length(files), " samples of NextSeq for ", 
+                               title, " gene panel,part ", i))
          dev.off()
     }
 }
@@ -204,99 +181,89 @@ coverage.gene_panel = function (title)
 # CA0246.coverage 17268
 # CH0317.coverage 15833
 # GM15262.coverage 14435
-coverage.all_genes = function ()
-{
-    title="Coverage in project 913 across all protein coding genes,no outliers"
+coverage.all_genes <- function (){
+    title <- "Coverage in project 913 across all protein coding genes,no outliers"
     library("matrixStats")
     setwd("~/Desktop/work")
-    samples = unlist(read.table("samples.txt", stringsAsFactors=F))
+    samples <- unlist(read.table("samples.txt", stringsAsFactors = F))
   
     #hopefully 1st sample has most genes
-    coverage = read.delim(paste0(samples[1],".coverage"),header=T,stringsAsFactors = F)
-    coverage = coverage[,c("gene","mean")]
-    colnames(coverage)[2]=samples[1]
-    row.names(coverage)=coverage$gene
-    coverage$gene=NULL
+    coverage <- read.delim(paste0(samples[1], ".coverage"), header = T, stringsAsFactors = F)
+    coverage <- coverage[,c("gene","mean")]
+    colnames(coverage)[2] <- samples[1]
+    row.names(coverage) <- coverage$gene
+    coverage$gene <- NULL
   
-    for (sample in tail(samples,-1))
-    {
-        sample_coverage = read.delim(paste0(sample,".coverage"),header=T,stringsAsFactors = F)
-        sample_coverage = sample_coverage[,c("gene","mean")]
-        colnames(sample_coverage)[2]=sample
-        coverage = merge(coverage,sample_coverage,by.x="row.names",by.y="gene",all.x=T)
-        row.names(coverage) = coverage$Row.names
-        coverage$Row.names=NULL
+    for (sample in tail(samples,-1)){
+        sample_coverage <- read.delim(paste0(sample,".coverage"),header=T,stringsAsFactors = F)
+        sample_coverage <- sample_coverage[,c("gene","mean")]
+        colnames(sample_coverage)[2] <- sample
+        coverage <- merge(coverage, sample_coverage, by.x = "row.names", by.y = "gene", all.x = T)
+        row.names(coverage) <- coverage$Row.names
+        coverage$Row.names <- NULL
     }
   
-    coverage[is.na(coverage)]=0  
-    coverage$Mean = rowMeans(coverage)
-    png("coverage.png",res=300,width=5000,height=2000)
-    boxplot(coverage,
-            las=1,
-            cex.axis=0.6,
-            main=title,outline = F)
+    coverage[is.na(coverage)] <- 0  
+    coverage$Mean <- rowMeans(coverage)
+    png("coverage.png", res = 300, width = 5000, height = 2000)
+    boxplot(coverage, las = 1, cex.axis = 0.6,
+            main = title, outline = F)
     dev.off()
     
-    
-    meds=rbind(colnames(coverage),colMedians(as.matrix(coverage)))
-    write.table(meds,"medians.txt",col.names = F, quote = F,row.names = F)
+    meds <- rbind(colnames(coverage), colMedians(as.matrix(coverage)))
+    write.table(meds, "medians.txt", col.names = F, quote = F, row.names = F)
 }
 
 #%of bases covered more than 10x
-coverage.percent_more_than10x = function()
-{
+coverage.percent_more_than10x <- function(){
     setwd("~/Desktop/work")
-    samples = unlist(read.table("samples.txt", stringsAsFactors=F))
+    samples <- unlist(read.table("samples.txt", stringsAsFactors = F))
   
-    for (sample in samples)
-    {
-        coverage = read.delim(paste0(sample,".coverage"),header=T,stringsAsFactors = F)
-        total_len = sum(coverage$length)
+    for (sample in samples){
+        coverage <- read.delim(paste0(sample, ".coverage"), header = T, stringsAsFactors = F)
+        total_len <- sum(coverage$length)
     
-        coverage_10x = coverage[coverage$mean>10,]
-        len_10x = sum(coverage_10x$length)
+        coverage_10x <- coverage[coverage$mean > 10,]
+        len_10x <- sum(coverage_10x$length)
     
-        print(paste0(sample," ",len_10x/total_len))
+        print(paste0(sample, " ", len_10x/total_len))
     }
 }
 
-omim_table_manipulation = function()
-{
+omim_table_manipulation <- function(){
     setwd("~/Desktop/reference_tables/OMIM_2017-04-13/")
     mimTitles.percent <- read.delim2("mimTitles.percent.txt", comment.char="#")    
     genemap2 <- read.delim("genemap2.txt", comment.char="#")
-    mimTitles.percent = merge(mimTitles.percent,genemap2, by.x="Mim.Number",by.y="Mim.Number",all.x=T,all.y=F)
-    write.csv(mimTitles.percent,"mimTitles.percent.location.csv",row.names = F)
+    mimTitles.percent <- merge(mimTitles.percent, genemap2, by.x = "Mim.Number", 
+                               by.y = "Mim.Number", all.x = T, all.y = F)
+    write.csv(mimTitles.percent, "mimTitles.percent.location.csv", row.names = F)
 }
 
-read_length_distribution = function(family)
-{
+read_length_distribution <- function(family){
     
-    family_data = read.delim(paste0(family,".tsv"), stringsAsFactors=F)
-    samples = unique(family_data$sample)
+    family_data <- read.delim(paste0(family,".tsv"), stringsAsFactors = F)
+    samples <- unique(family_data$sample)
     
-    for (sample in samples)
-    {
-        tmp = subset(family_data,sample==sample)
-        tmp$sample=NULL
-        print(paste0(sample," ",round(sum(tmp$Length*tmp$Count) / sum(tmp$Count))),quote=F)
+    for (sample in samples){
+        tmp <- subset(family_data, sample == sample)
+        tmp$sample <- NULL
+        print(paste0(sample, " ", round(sum(tmp$Length*tmp$Count) / sum(tmp$Count))), quote = F)
     }
 }
 
-read_lengths = function()
-{
+read_lengths <- function(){
     setwd("~/Desktop/project_cheo/2017-04-12_read_lengths/")
-    families = unlist(read.table("projects.txt", stringsAsFactors=F))
-    for (family in families)
-    {
+    families <- unlist(read.table("projects.txt", stringsAsFactors = F))
+    for (family in families){
       read_length_distribution(family)
     }
 
-    read_lengths <- read.csv("read_lengths.txt", sep="", stringsAsFactors=FALSE)
-    read_lengths$id=NULL
+    read_lengths <- read.csv("read_lengths.txt", sep="", stringsAsFactors = F)
+    read_lengths$id <- NULL
 
-    png("read_lengths_all_samples.png",width=2000)
-    barplot(read_lengths$average_read_length, names.arg = read_lengths$sample,main = "Average read lengths for NextSeq samples is 134",
-        las=2)
+    png("read_lengths_all_samples.png", width = 2000)
+    barplot(read_lengths$average_read_length, names.arg = read_lengths$sample,
+            main = "Average read lengths for NextSeq samples is 134",
+            las = 2)
     dev.off()
 }
diff --git a/cre.gnomad_scores.R b/cre.gnomad_scores.R
index 87bd6bb..160b98d 100644
--- a/cre.gnomad_scores.R
+++ b/cre.gnomad_scores.R
@@ -14,37 +14,38 @@
 # git clone https://github.com/naumenko-sa/bioscripts
 
 source("~/bioscripts/genes.R")
-library("R.utils")
+library(R.utils)
 
 print("Working hard ...")
 print("Downloading constraints.txt.bgz")
-gnomad_scores_url = "https://storage.googleapis.com/gnomad-public/release/2.1/ht/constraint/constraint.txt.bgz"
-download.file(gnomad_scores_url,"gnomad_scores.txt.bgz")
-gunzip("gnomad_scores.txt.bgz","gnomad_scores.txt")
-gnomad_scores = read.delim("gnomad_scores.txt", stringsAsFactors=F)
-gnomad_scores = gnomad_scores[,c("gene","transcript","canonical","oe_lof","oe_mis","pLI","pRec","pNull")]
-gnomad_scores = gnomad_scores[gnomad_scores$canonical == "true",]
+gnomad_scores_url <- "https://storage.googleapis.com/gnomad-public/release/2.1/ht/constraint/constraint.txt.bgz"
+download.file(gnomad_scores_url, "gnomad_scores.txt.bgz")
+gunzip("gnomad_scores.txt.bgz", "gnomad_scores.txt")
+gnomad_scores <- read.delim("gnomad_scores.txt", stringsAsFactors = F)
+gnomad_scores <- gnomad_scores[,c("gene","transcript","canonical","oe_lof","oe_mis","pLI","pRec","pNull")]
+gnomad_scores <- gnomad_scores[gnomad_scores$canonical == "true",]
 
 #still has a few duplicates
 #gnomad_scores[duplicated(gnomad_scores$gene),]
-gnomad_scores = gnomad_scores[!duplicated(gnomad_scores$gene),]
+gnomad_scores <- gnomad_scores[!duplicated(gnomad_scores$gene),]
 
-mart = init_mart_human()
+mart <- init_mart_human()
 get_protein_coding_genes(mart)
 
-genes_transcripts = read.csv("genes.transcripts.csv", stringsAsFactors = F)
-gnomad_scores = merge(gnomad_scores,genes_transcripts,by.x="transcript",by.y="Ensembl_transcript_id",all.x=T,all.y=F)
+genes_transcripts <- read.csv("genes.transcripts.csv", stringsAsFactors = F)
+gnomad_scores <- merge(gnomad_scores, genes_transcripts, by.x = "transcript",
+                       by.y = "Ensembl_transcript_id", all.x = T, all.y = F)
 
 #some genes are absent in grch37
 #gnomad_scores[is.na(gnomad_scores$Ensembl_gene_id),]
+gnomad_scores <- gnomad_scores[!is.na(gnomad_scores$Ensembl_gene_id),]
+gnomad_scores <- gnomad_scores[,c("Ensembl_gene_id","oe_lof","oe_mis","pLI","pRec","pNull")]
 
-gnomad_scores = gnomad_scores[!is.na(gnomad_scores$Ensembl_gene_id),]
-gnomad_scores = gnomad_scores[,c("Ensembl_gene_id","oe_lof","oe_mis","pLI","pRec","pNull")]
+colnames(gnomad_scores) <- c("Ensembl_gene_id", "Gnomad_oe_lof_score", "Gnomad_oe_mis_score",
+                             "Exac_pli_score", "Exac_prec_score", "Exac_pnull_score")
+write.csv(gnomad_scores, "gnomad_scores.csv", row.names = F)
 
-colnames(gnomad_scores) = c("Ensembl_gene_id","Gnomad_oe_lof_score","Gnomad_oe_mis_score","Exac_pli_score","Exac_prec_score","Exac_pnull_score")
-write.csv(gnomad_scores,"gnomad_scores.csv",row.names = F)
-
-file.copy("gnomad_scores.csv","~/cre/data",overwrite=T)
+file.copy("gnomad_scores.csv", "~/cre/data", overwrite = T)
 file.remove("genes.transcripts.csv")
 file.remove("gnomad_scores.txt")
-file.remove("gnomad_scores.csv")
+file.remove("gnomad_scores.csv")
\ No newline at end of file
diff --git a/cre.panel_filter.R b/cre.panel_filter.R
index 0d29d69..4b09429 100644
--- a/cre.panel_filter.R
+++ b/cre.panel_filter.R
@@ -31,15 +31,15 @@ filter_variants_genomics_england_panel = function(variant_report.csv,panel.tsv,o
     variants.ens = variants[variants$Ensembl_gene_id %in% panel$PanelAPP.EnsemblId.GRch37,]
     variants.gene = variants[variants$Gene %in% panel$PanelAPP.Gene.Symbol,]
     
-    variants = unique(sort(rbind(variants.ens,variants.gene)))
+    variants = unique(rbind(variants.ens,variants.gene))
     write.csv(variants,output.csv,row.names=F)
 }
 
 args = commandArgs(trailingOnly = T)
 
-if (args[4] == "genomics_england")
+if (is.null(args[4]))
 {
-    filter_variants_genomics_england_panel(args[1],args[2],args[3])
-}else{
     filter_variants(args[1],args[2],args[3])
+}else{
+    filter_variants_genomics_england_panel(args[1],args[2],args[3])
 }
\ No newline at end of file