diff --git a/.Rbuildignore b/.Rbuildignore index 91114bf2..ad939b3b 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -1,2 +1,9 @@ ^.*\.Rproj$ ^\.Rproj\.user$ +README.md +ELMER.example.tar.xz +^appveyor\.yml$ +^\.travis\.yml$ +^codecov\.yml$ +^doc$ +^Meta$ diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 00000000..8ec7d3cd --- /dev/null +++ b/.gitattributes @@ -0,0 +1,2 @@ +supplemental/ filter=lfs diff=lfs merge=lfs -text +supplemental/BRCA_unsupervised.html filter=lfs diff=lfs merge=lfs -text diff --git a/.gitignore b/.gitignore index 807ea251..ea6a75bd 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,7 @@ .Rproj.user .Rhistory .RData +doc +Meta +/doc/ +/Meta/ diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 00000000..78b6cdac --- /dev/null +++ b/.travis.yml @@ -0,0 +1,111 @@ +# R for travis: see documentation at https://docs.travis-ci.com/user/languages/r + +language: r +r: + - bioc-release +sudo: true +cache: packages +bioc_required: true +bioc_use_devel: true +warnings_are_errors: false + +r_check_args: '--ignore-vignettes --no-examples' + +os: + - linux + - osx +osx_image: xcode9 + +addons: + apt: + packages: + - libgdal-dev + - libgdal1-dev + - libproj-dev + - libudunits2-dev + brew_packages: + - udunits + +before_install: + - if [ ${TRAVIS_OS_NAME} == linux ]; then sudo add-apt-repository ppa:ubuntugis/ubuntugis-unstable --yes; fi + - if [ ${TRAVIS_OS_NAME} == linux ]; then sudo apt-get --yes --force-yes update -qq; fi + - if [ ${TRAVIS_OS_NAME} == linux ]; then sudo apt-get install --yes libudunits2-dev libproj-dev libgeos-dev libgdal-dev; fi + + +r_binary_packages: + - withr + - knitr + - crayon + - roxygen2 + - testthat + - rex + - R6 + - whisker + - jsonlite + - httr + - RCurl + - bitops + - DBI + - getopt + - rjson + - snow + - RSQLite + - base64enc + - fastmatch + - XML + - igraph + + +bioc_packages: + - BiocStyle + - GenomicRanges + - Biobase + - affy + - EDASeq + - edgeR + - biomaRt + - IRanges + - supraHex + - S4Vectors + - ComplexHeatmap + - SummarizedExperiment + - BiocGenerics + - GenomicFeatures + - TxDb.Hsapiens.UCSC.hg19.knownGene + - limma + - genefilter + - ConsensusClusterPlus + - pathview + - clusterProfiler + - BiocCheck + - fgsea + - GenomicInteractions + - GO.db + - DO.db + - org.Hs.eg.db + +r_github_packages: + - tiagochst/ELMER.data + - hfang-bristol/dnet + - BioinformaticsFMRP/TCGAbiolinks + +install: + - echo 'GITHUB_PAT=15f150aedeaf7dea7ba59ffacb54243b66c10618' > .Renviron + +script: + - | + travis_wait R CMD build . + travis_wait 60 R CMD check ELMER*tar.gz + +after_script: + - ls -lah + - FILE=$(ls -1t *.tar.gz | head -n 1) + - Rscript -e "library(BiocCheck); BiocCheck(\"${FILE}\") + +r_build_args: --no-resave-data + +notifications: + slack: junkdnalab:KV8Dr7etoqqDQPsORdaZLOz2 + email: false + on_success: never # default: change + on_failure: always # default: always \ No newline at end of file diff --git a/DESCRIPTION b/DESCRIPTION index 0b3e6a37..1a438dba 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,95 @@ Package: ELMER -Title: What the Package Does (one line, title case) -Version: 0.0.0.9000 -Authors@R: "Lijing Yao [aut, cre]" -Description: What the package does (one paragraph) -Depends: R (>= 3.1.1) -License: What license is it under? +Title: + Inferring Regulatory Element Landscapes and Transcription Factor + Networks Using Cancer Methylomes +Version: 2.21.0 +Authors@R: c( person("Tiago","Chedraoui Silva", role = c("aut","cre"), email = "tiagochst@gmail.com"), + person("Lijing", "Yao", role = "aut", email = "lijingya@usc.edu"), + person("Simon","Coetzee", role = c("aut"), email = "Simon.Coetzee@cshs.org"), + person("Nicole", "Gull", role = "ctb"), + person("Hui", "Shen", role = "ctb"), + person("Peter", "Laird", role = "ctb"), + person("Peggy", "Farnham", role = "aut"), + person("Dechen", "Li", role = "ctb"), + person("Benjamin", "Berman", role = "aut") + ) +Maintainer: + Tiago Chedraoui Silva +Description: ELMER is designed to use DNA methylation and gene expression from a + large number of samples to infere regulatory element landscape and transcription + factor network in primary tissue. +Depends: + R (>= 3.4.0), + ELMER.data (>= 2.9.3) +License: GPL-3 LazyData: true +VignetteBuilder: knitr +Imports: + GenomicRanges, + ggplot2, + reshape, + grid, + grDevices, + graphics, + methods, + parallel, + stats, + utils, + IRanges, + GenomeInfoDb, + S4Vectors, + GenomicFeatures, + TCGAbiolinks (>= 2.23.7), + plyr, + Matrix, + dplyr, + Gviz, + ComplexHeatmap, + circlize, + MultiAssayExperiment, + SummarizedExperiment, + biomaRt, + doParallel, + downloader, + ggrepel, + lattice, + magrittr, + readr, + scales, + rvest, + xml2, + plotly, + gridExtra, + rmarkdown, + stringr, + tibble, + tidyr, + progress, + purrr, + reshape2, + ggpubr, + rtracklayer, + DelayedArray +Suggests: + BiocStyle, + AnnotationHub, + ExperimentHub, + knitr, + testthat, + data.table, + DT, + GenomicInteractions, + webshot, + R.utils, + covr, + sesameData +biocViews: + DNAMethylation, + GeneExpression, + MotifAnnotation, + Software, + GeneRegulation, + Transcription, + Network +Encoding: UTF-8 +RoxygenNote: 7.2.3 diff --git a/ELMER.Rproj b/ELMER.Rproj index eaa6b818..93f86b1c 100644 --- a/ELMER.Rproj +++ b/ELMER.Rproj @@ -9,10 +9,10 @@ UseSpacesForTab: Yes NumSpacesForTab: 2 Encoding: UTF-8 -RnwWeave: Sweave +RnwWeave: knitr LaTeX: pdfLaTeX BuildType: Package PackageUseDevtools: Yes PackageInstallArgs: --no-multiarch --with-keep.source -PackageRoxygenize: rd,collate,namespace +PackageRoxygenize: rd diff --git a/NAMESPACE b/NAMESPACE index d43a2bdc..3110fbb1 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,8 +1,195 @@ -# Generated by roxygen2 (4.1.0): do not edit by hand +# Generated by roxygen2: do not edit by hand -export(Distal) -export(Proximal) -export(RandomLoci) -export(ReadBed) -export(ReadGFF) -export(WriteBed) +export(GetNearGenes) +export(TCGA.pipe) +export(TF.rank.plot) +export(TFsurvival.plot) +export(addDistNearestTSS) +export(calculateEnrichement) +export(createBigWigDNAmetArray) +export(createIGVtrack) +export(createMAE) +export(createMotifRelevantTfs) +export(createTSVTemplates) +export(get.TFs) +export(get.diff.meth) +export(get.enriched.motif) +export(get.feature.probe) +export(get.pair) +export(get.permu) +export(getExp) +export(getExpSamples) +export(getGeneID) +export(getMet) +export(getMetSamples) +export(getRandomPairs) +export(getRegionNearGenes) +export(getSymbol) +export(getTCGA) +export(getTFBindingSites) +export(getTFtargets) +export(getTSS) +export(heatmapGene) +export(heatmapPairs) +export(metBoxPlot) +export(motif.enrichment.plot) +export(preAssociationProbeFiltering) +export(promoterMeth) +export(render_report) +export(scatter.plot) +export(schematic.plot) +import(ComplexHeatmap) +import(ELMER.data) +import(GenomeInfoDb) +import(Gviz) +import(circlize) +import(ggplot2) +import(lattice) +importFrom(DelayedArray,rowMins) +importFrom(GenomeInfoDb,Seqinfo) +importFrom(GenomicFeatures,transcripts) +importFrom(GenomicRanges,"strand<-") +importFrom(GenomicRanges,GRanges) +importFrom(GenomicRanges,distance) +importFrom(GenomicRanges,distanceToNearest) +importFrom(GenomicRanges,findOverlaps) +importFrom(GenomicRanges,follow) +importFrom(GenomicRanges,makeGRangesFromDataFrame) +importFrom(GenomicRanges,nearest) +importFrom(GenomicRanges,precede) +importFrom(GenomicRanges,promoters) +importFrom(GenomicRanges,resize) +importFrom(GenomicRanges,seqnames) +importFrom(GenomicRanges,strand) +importFrom(Gviz,IdeogramTrack) +importFrom(IRanges,IRanges) +importFrom(IRanges,ranges) +importFrom(IRanges,subsetByOverlaps) +importFrom(Matrix,colMeans) +importFrom(Matrix,colSums) +importFrom(MultiAssayExperiment,"experiments<-") +importFrom(MultiAssayExperiment,MultiAssayExperiment) +importFrom(MultiAssayExperiment,colData) +importFrom(MultiAssayExperiment,experiments) +importFrom(MultiAssayExperiment,metadata) +importFrom(MultiAssayExperiment,sampleMap) +importFrom(S4Vectors,DataFrame) +importFrom(S4Vectors,metadata) +importFrom(S4Vectors,queryHits) +importFrom(S4Vectors,subjectHits) +importFrom(S4Vectors,values) +importFrom(SummarizedExperiment,"assay<-") +importFrom(SummarizedExperiment,"colData<-") +importFrom(SummarizedExperiment,SummarizedExperiment) +importFrom(SummarizedExperiment,assay) +importFrom(SummarizedExperiment,makeSummarizedExperimentFromDataFrame) +importFrom(SummarizedExperiment,rowRanges) +importFrom(TCGAbiolinks,GDCdownload) +importFrom(TCGAbiolinks,GDCprepare) +importFrom(TCGAbiolinks,GDCquery) +importFrom(TCGAbiolinks,GDCquery_clinic) +importFrom(TCGAbiolinks,TCGAVisualize_volcano) +importFrom(TCGAbiolinks,TCGAanalyze_survival) +importFrom(TCGAbiolinks,colDataPrepare) +importFrom(biomaRt,getBM) +importFrom(biomaRt,listDatasets) +importFrom(biomaRt,useEnsembl) +importFrom(biomaRt,useMart) +importFrom(doParallel,registerDoParallel) +importFrom(downloader,download) +importFrom(dplyr,"%>%") +importFrom(dplyr,as_data_frame) +importFrom(dplyr,do) +importFrom(dplyr,filter) +importFrom(dplyr,full_join) +importFrom(dplyr,group_by) +importFrom(dplyr,group_by_) +importFrom(dplyr,left_join) +importFrom(dplyr,mutate) +importFrom(dplyr,pull) +importFrom(dplyr,select) +importFrom(dplyr,slice) +importFrom(dplyr,top_n) +importFrom(ggplot2,aes) +importFrom(ggplot2,annotation_custom) +importFrom(ggplot2,coord_flip) +importFrom(ggplot2,geom_abline) +importFrom(ggplot2,geom_errorbar) +importFrom(ggplot2,geom_point) +importFrom(ggplot2,geom_text) +importFrom(ggplot2,geom_vline) +importFrom(ggplot2,ggplot) +importFrom(ggplot2,ggplot_build) +importFrom(ggplot2,ggplot_gtable) +importFrom(ggplot2,ggsave) +importFrom(ggplot2,position_jitter) +importFrom(ggplot2,scale_color_manual) +importFrom(ggplot2,unit) +importFrom(ggpubr,ggscatter) +importFrom(ggpubr,stat_cor) +importFrom(ggrepel,geom_text_repel) +importFrom(grDevices,col2rgb) +importFrom(grDevices,dev.off) +importFrom(grDevices,pdf) +importFrom(grDevices,png) +importFrom(grDevices,rainbow) +importFrom(graphics,plot) +importFrom(grid,gpar) +importFrom(grid,grobWidth) +importFrom(grid,textGrob) +importFrom(grid,unit.c) +importFrom(gridExtra,arrangeGrob) +importFrom(gridExtra,grid.arrange) +importFrom(lattice,bwplot) +importFrom(magrittr,"%>%") +importFrom(magrittr,add) +importFrom(magrittr,divide_by) +importFrom(magrittr,multiply_by) +importFrom(methods,as) +importFrom(methods,is) +importFrom(parallel,detectCores) +importFrom(plotly,layout) +importFrom(plotly,plot_ly) +importFrom(plyr,.) +importFrom(plyr,a_ply) +importFrom(plyr,adply) +importFrom(plyr,alply) +importFrom(plyr,ddply) +importFrom(plyr,ldply) +importFrom(progress,progress_bar) +importFrom(purrr,reduce) +importFrom(readr,read_csv) +importFrom(readr,read_tsv) +importFrom(readr,write_csv) +importFrom(readr,write_tsv) +importFrom(reshape,melt.data.frame) +importFrom(reshape2,melt) +importFrom(rmarkdown,render) +importFrom(rtracklayer,export.wig) +importFrom(rvest,"%>%") +importFrom(rvest,html_table) +importFrom(scales,scientific) +importFrom(stats,coef) +importFrom(stats,cor.test) +importFrom(stats,dist) +importFrom(stats,fisher.test) +importFrom(stats,hclust) +importFrom(stats,lm) +importFrom(stats,na.omit) +importFrom(stats,p.adjust) +importFrom(stats,sd) +importFrom(stats,t.test) +importFrom(stats,wilcox.test) +importFrom(stringr,str_c) +importFrom(stringr,str_split) +importFrom(tibble,as_tibble) +importFrom(tidyr,gather) +importFrom(tidyr,separate_rows) +importFrom(tidyr,unnest) +importFrom(utils,data) +importFrom(utils,read.csv) +importFrom(utils,setTxtProgressBar) +importFrom(utils,txtProgressBar) +importFrom(utils,write.csv) +importFrom(utils,write.table) +importFrom(xml2,read_html) diff --git a/NEWS.md b/NEWS.md new file mode 100644 index 00000000..4245914e --- /dev/null +++ b/NEWS.md @@ -0,0 +1,17 @@ +ELMER 2.11.0 +============== + +Changes: +* Minor bug fixes to support R 4.0 + +ELMER 2.5.7 +============== + +Changes: + +* The article describing the new version was published in Bioinformatics (http://dx.doi.org/10.1093/bioinformatics/bty902) + + + + + diff --git a/R/ELMER.R b/R/ELMER.R new file mode 100644 index 00000000..d9f5cde3 --- /dev/null +++ b/R/ELMER.R @@ -0,0 +1,7 @@ +#' @title ELMER (Enhancer Linking by Methylation/Expression Relationships) +#' @description ELMER is designed to use DNA methylation and gene expression from a +#' large number of samples to infere regulatory element landscape and transcription +#' factor network in primary tissue. +#' @docType package +#' @name ELMER +NULL \ No newline at end of file diff --git a/R/FetchTCGA.R b/R/FetchTCGA.R new file mode 100644 index 00000000..9352ba8a --- /dev/null +++ b/R/FetchTCGA.R @@ -0,0 +1,242 @@ +#' getTCGA to download DNA methylation, RNA expression and clinic data for all samples of certain cancer type from TCGA. +#' @description +#' getTCGA is a function to download DNA methylation, RNA expression and clinic data for all +#' samples of certain cancer type from TCGA website. And downloaded data will be transform +#' to matrixes or data frame for further analysis. +#' @param disease A character specifies the disease to download in TCGA such as BLCA +#' @param Meth A logic if TRUE HM450K DNA methylation data will download. +#' @param RNA A logic if TRUE RNA-seq Hiseq-V2 from TCGA level 3 will be download. +#' @param Clinic A logic if TRUE clinic data will be download for that disease. +#' @param genome Data aligned against which genome of reference. Options: "hg19", "hg38" (default) +#' @param basedir A path shows where the data will be stored. +#' @return Download DNA methylation (HM450K)/RNAseq(HiseqV2)/Clinic data for +#' the specified disease from TCGA. +#' @usage getTCGA(disease, Meth=TRUE, RNA=TRUE, Clinic=TRUE, basedir="./Data", genome = "hg38") +#' @export +#' @examples +#' getTCGA( +#' disease = "BRCA", +#' Meth = FALSE, +#' RNA = FALSE, +#' Clinic = TRUE, +#' basedir = tempdir(), +#' genome = "hg19" +#') +getTCGA <- function( + disease, + Meth = TRUE, + RNA = TRUE, + Clinic = TRUE, + basedir = "./Data", + genome = "hg38" +){ + + if(missing(disease)) stop("disease need to be specified.") + + if(Meth){ + print.header("Downloading DNA methylation", "subsection") + test.meth <- tryCatch({ + get450K(disease, basedir, genome = genome) + }, error = function(err){ + return("error") + }) + } + + if(RNA){ + print.header("Downloading RNA", "subsection") + test.rna <- tryCatch({ + getRNAseq(disease, basedir, genome = genome) + }, error = function(err){ + return("error") + }) + } + + if(Clinic){ + print.header("Downloading Clinic", "subsection") + test.clinic <- tryCatch({ + getClinic(disease, basedir) + }, error = function(err){ + return("error") + }) + } + + if(Meth && test.meth == "error") + warning( + sprintf("Failed to download DNA methylation data. Possible possibility: + 1. No 450K DNA methylation data for %s patients; + 2. Download error.",disease)) + if(RNA && test.rna == "error") + warning( + sprintf("Failed to download RNA-seq data. Possible possibility: + 1. No RNA-seq data for %s patients; + 2. Download error.",disease)) + if(Clinic && test.clinic == "error") + warning( + sprintf("Failed to download clinic data. Possible possibility: + 1. No clinical data for %s patients; + 2. Download error.", disease)) +} + +#' getRNAseq to download all RNAseq data for a certain cancer type from TCGA. +#' @description getRNAseq is a function to download RNAseq data for all samples of a certain cancer type from TCGA +#' @param disease A character specifies disease in TCGA such as BLCA +#' @param basedir Download all RNA seq level 3 data for the specified disease. +#' @param genome Data aligned against which genome of reference. Options: "hg19", "hg38" (default) +#' @usage getRNAseq(disease, basedir = "./Data", genome = "hg38") +#' @return Download all RNA seq level 3 data for the specified disease. +#' @importFrom TCGAbiolinks GDCdownload GDCquery GDCprepare +getRNAseq <- function( + disease, + basedir="./Data", + genome = "hg38" +) { + disease <- tolower(disease) + diseasedir <- file.path(basedir, toupper(disease)) + dir.raw <- file.path(diseasedir,"Raw") + dir.rna <- file.path(dir.raw,"RNA") + if (!file.exists(dir.rna)) dir.create(dir.rna,recursive = TRUE, showWarnings = FALSE) + + fout <- sprintf("%s/%s_RNA_%s.rda",diseasedir,toupper(disease), genome) + if(!file.exists(fout)){ + + if (genome == "hg38"){ + message("Downloading STAR - Counts (hg38)") + query <- GDCquery( + project = paste0("TCGA-",toupper(disease)), + data.category = "Transcriptome Profiling", + data.type = "Gene Expression Quantification", + workflow.type = "STAR - Counts" + ) + } else { + message("Downloading RSEM - normalized results data (hg19)") + query <- GDCquery( + project = paste0("TCGA-",toupper(disease)), + data.category = "Gene expression", + data.type = "Gene expression quantification", + platform = "Illumina HiSeq", + file.type = "normalized_results", + experimental.strategy = "RNA-Seq", + legacy = TRUE + ) + } + tryCatch({ + GDCdownload(query, directory = dir.rna, files.per.chunk = 200) + }, error = function(e) { + GDCdownload(query, directory = dir.rna, files.per.chunk = 50) + }) + + # Preparing to save output if it does not exists + rna <- GDCprepare( + query, + directory = dir.rna, + save = FALSE, + remove.files.prepared = FALSE, + summarizedExperiment = TRUE + ) + if(genome == "hg19"){ + rownames(rna) <- values(rna)$ensembl_gene_id + } + message(paste0("Saving Gene Expression to: ", fout)) + save(rna,file=fout) + } else { + message(paste("Gene Expression object already exists:", fout)) + } + return("OK") +} + +#' get450K to download HM40K DNA methylation data for certain cancer types from TCGA website. +#' @description +#' get450K is a function to download latest version of HM450K DNA methylation +#' for all samples of certain cancer types from GDC website. +#' @param disease A character specifies the disease to download from TCGA such as BLCA +#' @param basedir A path. Shows where the data will be stored. +#' @param filter For each probe, the percentage of NA among the all the samples +#' should smaller than filter. +#' @param genome Data aligned against which genome of reference. Options: "hg19", "hg38" (default) +#' @return Download all DNA methylation from HM450K level 3 data for +#' the specified disease. +#' @importFrom TCGAbiolinks GDCquery GDCdownload GDCprepare +#' @usage get450K(disease, basedir="./Data",filter=0.2, genome = "hg38") +get450K <- function( + disease, + basedir = "./Data", + filter = 0.2, + genome = "hg38" +){ + + disease <- tolower(disease) + diseasedir <- file.path(basedir, toupper(disease)) + dir.raw <- file.path(diseasedir,"Raw") + dir.meth <- file.path(dir.raw,"Meth") + if (!file.exists(dir.meth)) dir.create(dir.meth,recursive = TRUE, showWarnings = FALSE) + + fout <- sprintf("%s/%s_meth_%s.rda",diseasedir,toupper(disease),genome) + if(!file.exists(fout)){ + + if (genome == "hg38") { + query <- GDCquery( + project = paste0("TCGA-",toupper(disease)), + data.category = "DNA Methylation", + data.type = "Methylation Beta Value", + platform = "Illumina Human Methylation 450" + ) + } else { + query <- GDCquery( + project = paste0("TCGA-",toupper(disease)), + data.category = "DNA methylation", + legacy = TRUE, + platform = "Illumina Human Methylation 450" + ) + } + tryCatch({ + GDCdownload(query,directory = dir.meth, files.per.chunk = 5) + }, error = function(e) { + GDCdownload(query,directory = dir.meth, method = "client") + }) + message("Preparing data") + met <- GDCprepare( + query = query, + directory = dir.meth, + save = TRUE, + save.filename = sprintf("%s/%s_meth_%s_no_filter.rda",diseasedir,toupper(disease),genome), + remove.files.prepared = TRUE, + summarizedExperiment = TRUE + ) + + # Remove probes that has more than 20% of its values as NA + met <- met[rowMeans(is.na(assay(met))) < filter,] + message(paste0("Saving DNA methylation to: ", fout)) + save(met,file = fout) + } else { + message(paste("DNA methylation object already exists:", fout)) + } + return("OK") +} + +#' getClinic to download clinic data for certain cancer types from TCGA website. +#' @description +#' getClinic is a function to download latest version of clinic data for all samples of certain cancer types from TCGA website. +#' @param disease A character specifies the disease to download from TCGA such as BLCA +#' @param basedir A path shows where the data will be stored. +#' @importFrom TCGAbiolinks GDCquery_clinic +#' @return Download all clinic information for the specified disease. +getClinic <- function( + disease, + basedir="./Data" +) { + disease <- tolower(disease) + diseasedir <- file.path(basedir, toupper(disease)) + dir.raw <- file.path(diseasedir,"Raw") + dir.clinic <- file.path(dir.raw,"Clinic") + if(!file.exists(dir.clinic)) dir.create(dir.clinic,recursive = TRUE, showWarnings = FALSE) + + Clinic <- GDCquery_clinic(project = paste0("TCGA-",toupper(disease))) + save(Clinic,file=sprintf("%s/%s_clinic.rda",diseasedir,toupper(disease))) + return("OK") +} + +print.header <- function(text, type ="section"){ + message(paste(rep("-",nchar(text) + 3),collapse = "")) + message(paste(ifelse(type == "section","*","**"),text)) + message(paste(rep("-",nchar(text) + 3),collapse = "")) +} diff --git a/R/GetNearbyGenes.R b/R/GetNearbyGenes.R index ecf1d9c8..b7cc2e9c 100644 --- a/R/GetNearbyGenes.R +++ b/R/GetNearbyGenes.R @@ -1,63 +1,89 @@ -#' Collect nearby gene for one locus. -#' @param Target A charactor which is name of TRange or one of rownames of TBed. -#' @param Gene A GRange object contains coordinates of promoters for human genome. -#' @param SampleSize A number determine how many gene will be collected from each side of target (number shoule be even). -#' @param Tbed A bed format data.frame object contains coordinate of targets. -#' @param TRange A GRange object contains coordinate of targets. -#' @return A data frame of nearby genes and information: genes' IDs, genes' symbols, distance with target and side to which the gene locate to the target. - -.NearGenes <- function (Target=NULL,Gene=NULL,SampleSize=20,TBed=NULL,TRange=NULL){ +# NearGenes +# @param Target A charactor which is name of TRange or one of rownames of TBed. +# @param Gene A GRange object contains coordinates of promoters for human genome. +# @param geneNum A number determine how many gene will be collected from each +# side of target (number shoule be even). +# @param TRange A GRange object contains coordinate of targets. +# @return A data frame of nearby genes and information: genes' IDs, genes' symbols, +# distance with target and side to which the gene locate to the target. +#'@importFrom GenomicRanges strand<- +NearGenes <- function (Target = NULL, + Gene = NULL, + geneNum = 20, + TRange = NULL){ + # Algorithm: + # 1) get the follow gene (overlapping genes are diconsidered) to be the first in L1 (index variable) + # probe + # ------- O ------ + # | | || | | + # ------- || ------ + # follow gene precede gene + # 2) Sort genes (by start) + # 3.1) Get 9 genes before index (L1) + # If we only have l genes (l < 9) due to end of genomic region, get the l ones and get more 10-l to the right + # 3.2) Get 10 after index (L1) + # If we only have r genes (r < 10) due to end of genomic region, get the r ones and get more 10-r to the left + # Where 10 is genum/2 + Gene$GENEID <- Gene$ensembl_gene_id + if("external_gene_name" %in% colnames(S4Vectors::mcols(Gene))){ + Gene$SYMBOL <- Gene$external_gene_name + } else if("external_gene_id" %in% colnames(S4Vectors::mcols(Gene))){ + Gene$SYMBOL <- Gene$external_gene_id + } else { + stop("No gene symbol column found (expected external_gene_id or external_gene_name") + } if(is.null(Gene) | is.null(Target)){ stop ("Target and Genes should both be defined") } - source("/export/uec-gs1/laird/users/lijingya/software/scripts/R/ReadFile.R") - require("GenomicRanges") - message(Target) - # form the Gene GRange - # Gene <- GRanges(seqnames = TBed[Target,1], ranges = IRanges (start =as.numeric(TBed[Target,2]), end = as.numeric(TBed[Target,3])), strand=TBed[Target,6]) if(is.null(TRange)){ - if(is.null(TBed)){ - stop( "Either TBed or TRange must be defined") - } - regionInfo <- GRanges(seqnames = TBed[Target,1], ranges = IRanges (start =as.numeric(TBed[Target,2]), end = as.numeric(TBed[Target,3]))) + stop( "TRange must be defined") }else{ - regionInfo <- TRange[TRange$name %in% Target,] + # Just to be sure we have only one probe. To be removed ? + regionInfo <- TRange[names(TRange) %in% Target] } - GeneIDs <-c() + GeneIDs <- c() Distances <- c() strand(Gene) <- "*" + # We will get only genes in the same same chromossome Gene <- Gene[as.character(seqnames(Gene)) %in% as.character(seqnames(regionInfo))] if(length(Gene)==0){ warning(paste0(Target," don't have any nearby gene in the given gene list.")) Final <- NA - }else{ + } else { Gene <- sort(Gene) - # Gene_subset <- Gene[as.character(seqnames(Gene)) %in% as.character(seqnames(regionInfo))] index <- follow(regionInfo,Gene) #left side - Leftlimit <- SampleSize/2 - Rightlimit <- SampleSize/2 - Left <- index + Leftlimit <- geneNum/2 + Rightlimit <- geneNum/2 n <- 1 - if(index==1){ + if(is.na(index)){ + index<- 0 + Leftlimit <- 0 + Left <- c() + }else if(index==1){ + Left <- index Leftlimit <- length(Left) }else{ + Left <- index while(length(Left) < Leftlimit){ - if(!as.character(Gene$GENEID[index-n])%in%as.character(Gene$GENEID[Left])) Left <- c((index-n),Left) - if((index-n)==1){ - Leftlimit <- length(Left) - } - n <- n+1 + # If the gene is not in the list already add it, otherwise go to the next + if(!as.character(Gene$GENEID[index-n]) %in% as.character(Gene$GENEID[Left])) Left <- c((index-n),Left) + + # Is it the first gene? If so there is nothing in the left anymore + if((index-n)==1) Leftlimit <- length(Left) + n <- n + 1 } } Right <- c() n <- 1 - if(index==length(Gene) || all(unique(Gene$GENEID[(index+1):length(Gene)]) %in% as.character(Gene$GENEID[index]))){ + if(index==length(Gene) || + all(unique(Gene$GENEID[(index+1):length(Gene)]) %in% as.character(Gene$GENEID[index]))){ Rightlimit <- length(Right) }else{ while(length(Right) < Rightlimit){ - if(!as.character(Gene$GENEID[index+n])%in% as.character(Gene$GENEID[c(Right,Left)])) Right <- c(Right,(index+n)) + if(!as.character(Gene$GENEID[index+n])%in% as.character(Gene$GENEID[c(Right,Left)])) + Right <- c(Right,(index+n)) if(index+n==length(Gene)){ Rightlimit <- length(Right) @@ -67,67 +93,499 @@ } } - if(Rightlimit < SampleSize/2){ + if(Rightlimit < geneNum/2){ n <- 1 if(Left[1]-n > 0){ - while((length(Left)+length(Right)) < SampleSize){ - if(!as.character(Gene$GENEID[Left[1]-n])%in%as.character(Gene$GENEID[c(Left,Right)])) Left <- c((Left[1]-n),Left) + while((length(Left)+length(Right)) < geneNum){ + if(!as.character(Gene$GENEID[Left[1]-n])%in%as.character(Gene$GENEID[c(Left,Right)])) + Left <- c((Left[1]-n),Left) n <- n+1 } } } - if(Leftlimit < SampleSize/2){ + if(Leftlimit < geneNum/2){ n <- 1 m <- length(Right) if(Right[m]+n < length(Gene)+1) - while((length(Left)+length(Right)) < SampleSize){ - if(!as.character(Gene$GENEID[Right[m]+n])%in%as.character(Gene$GENEID[c(Left,Right)])) Right <- c(Right,(Right[m]+n)) + while((length(Left)+length(Right)) < geneNum){ + if(!as.character(Gene$GENEID[Right[m]+n])%in%as.character(Gene$GENEID[c(Left,Right)])) + Right <- c(Right,(Right[m]+n)) n <- n+1 } } - print(Left) - print(Right) Whole <- c(Left,Right) GeneIDs <- Gene$GENEID[Whole] Symbols <- Gene$SYMBOL[Whole] - Distances <- distance(Gene[Whole],regionInfo) + Distances <- suppressWarnings(distance(Gene[Whole],regionInfo)) if(Rightlimit < 1){ Sides <- paste0("L",length(Left):1) - }else if( Leftlimit < 1){ + } else if( Leftlimit < 1){ Sides <- paste0("R",1:length(Right)) - }else{ + } else{ Sides <- c(paste0("L",length(Left):1),paste0("R",1:length(Right))) } - Final <- cbind(Target=rep(Target,length(GeneIDs)),GeneID=GeneIDs,Symbol=Symbols,Distance=Distances, Sides=Sides) - names(Final) <- NULL + + Final <- data.frame(Target=rep(Target,length(GeneIDs)),GeneID=GeneIDs, + Symbol=Symbols,Distance=Distances, Side=Sides, + stringsAsFactors = FALSE) + Final <- Final[order(Final$Side,Final$Distance),] } return(Final) } -#' Collect nearby gene for one locus. -#' @param Gene A GRange object contains coordinates of promoters for human genome. -#' @param SampleSize A number determine how many gene will be collected from each side of target (number shoule be even) Default to 20. -#' @param Tbed A bed format data.frame object contains coordinate of a list targets. -#' @param TRange A GRange object contains coordinate of a list targets. -#' @param cores A number to specific how many cores to use to compute. Default to detectCores()/2. -#' @return A data frame of nearby genes and information: genes' IDs, genes' symbols, distance with target and side to which the gene locate to the target. - -GetNearGenes <- function(SampleSize=20,Gene=NULL,TBed=NULL,TRange=NULL,cores=NULL){ - require("snow") - if(is.null(cores)) cores <- detectCores()/2 - cl <- makeCluster(cores,type="SOCK") - # options('mc.cores'=detectCores()/2) +#' GetNearGenes to collect nearby genes for one locus. +#' @description +#' GetNearGenes is a function to collect equal number of gene on each side of one locus. +#' It can receite either multi Assay Experiment with both DNA methylation and gene Expression matrix +#' and the names of probes to select nearby genes, or it can receive two granges objects TRange and geneAnnot. +#' @param data A multi Assay Experiment with both DNA methylation and gene Expression objects +#' @param probes Name of probes to get nearby genes (it should be rownames of the DNA methylation +#' object in the data argument object) +#' @param geneAnnot A GRange object or Summarized Experiment object that contains coordinates of promoters for +#' human genome. +#' @param numFlankingGenes A number determines how many gene will be collected totally. +#' Then the number devided by 2 is the number of genes collected from +#' each side of targets (number shoule be even) Default to 20. +#' @param TRange A GRange object or Summarized Experiment object that contains coordinates of a list of targets loci. +#' @return A data frame of nearby genes and information: genes' IDs, genes' symbols, +#' distance with target and side to which the gene locate to the target. +#' @export +#' @importFrom GenomicRanges strand follow distance +#' @importFrom plyr alply +#' @importFrom doParallel registerDoParallel +#' @importFrom SummarizedExperiment rowRanges +#' @references +#' Yao, Lijing, et al. "Inferring regulatory element landscapes and transcription +#' factor networks from cancer methylomes." Genome biology 16.1 (2015): 1. +#' @examples +#' geneAnnot <- getTSS(genome = "hg38") +#' probe <- GenomicRanges::GRanges(seqnames = c("chr1","chr2"), +#' range=IRanges::IRanges(start = c(16058489,236417627), end= c(16058489,236417627)), +#' name= c("cg18108049","cg17125141")) +#' names(probe) <- c("cg18108049","cg17125141") +#' NearbyGenes <- GetNearGenes(numFlankingGenes = 20,geneAnnot=geneAnnot,TRange=probe) +GetNearGenes <- function(data = NULL, + probes = NULL, + geneAnnot = NULL, + TRange = NULL, + numFlankingGenes = 20){ + message("Searching for the ", numFlankingGenes, " near genes") + if(!is.null(data)){ + if(is.null(probes)) stop("Please set the probes argument (names of probes to select nearby genes)") + TRange <- subset(getMet(data), rownames(getMet(data)) %in% probes) + geneAnnot <- getExp(data) + } if(is.null(TRange)){ - if(is.null(TBed)){ - stop( "Either TBed or TRange must be defined") + stop("TRange must be defined") + } + tssAnnot <- NULL + if(is.null(geneAnnot)){ + if("genome" %in% names(metadata(data))){ + genome <- metadata(data)$genome + tssAnnot <- getTSS(genome = genome) + geneAnnot <- get.GRCh(genome = genome,as.granges = TRUE) } - rownames(TBed) <- TBed[,4] - out <- parSapplyLB(cl,rownames(TBed),.NearGenes,SampleSize=SampleSize,Gene=Gene,TBed=TBed,simplify=FALSE) - }else{ - out <- parSapplyLB(cl,as.character(TRange$name),.NearGenes,SampleSize=SampleSize,Gene=Gene,TRange=TRange,simplify=FALSE) } - stopCluster(cl) - return(out) + + if(class(TRange) == class(as(SummarizedExperiment(),"RangedSummarizedExperiment"))){ + TRange <- rowRanges(TRange) + } + if(class(geneAnnot) == class(as(SummarizedExperiment(),"RangedSummarizedExperiment"))){ + geneAnnot <- rowRanges(geneAnnot) + } + + if(is.null(names(TRange))) { + if(is.null(TRange$name)) stop("No probe names found in TRange") + names(TRange) <- TRange$name + } + + NearGenes <- + getRegionNearGenes( + numFlankingGenes = numFlankingGenes, + geneAnnot = geneAnnot, + tssAnnot = tssAnnot, + TRange = TRange + ) + return(NearGenes) } + +#' @title Calculate the distance between probe and gene TSS +#' @description Calculate the distance between probe and gene TSS +#' @param data A multi Assay Experiment with both DNA methylation and gene Expression objects +#' @param NearGenes A list or a data frame with the pairs gene probes +#' @param cores Number fo cores to be used. Deafult: 1 +#' @param met.platform DNA methyaltion platform to retrieve data from: EPIC or 450K (default) +#' @param genome Which genome build will be used: hg38 (default) or hg19. +#' @export +#' @examples +#' \dontrun{ +#' data <- ELMER:::getdata("elmer.data.example") +#' NearbyGenes <- GetNearGenes( +#' data = data, +#' probes = c("cg15924102", "cg24741609"), +#' numFlankingGenes = 20 +#' ) +#' NearbyGenes <- addDistNearestTSS(data = data, NearGenes = NearbyGenes) +#' } +addDistNearestTSS <- function( + data, + NearGenes, + genome, + met.platform, + cores = 1 +) { + + if(missing(NearGenes)) stop("Please set NearGenes argument") + + # used to recover TSS information + if(missing(data) & missing(genome)) { + stop("Please set data argument or genome arguments") + } + + # For a given probe/region and gene find nearest TSS distance + if(!missing(data)){ + tss <- getTSS(metadata(data)$genome) + } else { + tss <- getTSS(genome = genome) + } + + message("Update the distance to gene to distance to the nearest TSS of the gene") + + # If our input has the probe names we will have to recover the probe metadata to map. + region <- FALSE + if(!missing(data)){ + met <- rowRanges(getMet(data)) + } else if(!missing(met.platform)){ + met <- getInfiniumAnnotation(plat = met.platform, genome = genome) + } else { + region <- TRUE + met <- NearGenes %>% tidyr::separate("Target", c("seqnames","start","end"), + sep = ":|-", remove = FALSE, + convert = FALSE, extra = "warn", fill = "warn") %>% + makeGRangesFromDataFrame(keep.extra.columns = TRUE) + } + + NearGenes <- calcDistNearestTSS(links = NearGenes,TRange = met,tssAnnot = tss) + + return(NearGenes) +} + +#' @title Calculate distance from region to nearest TSS +#' @description +#' Idea +#' For a given region R linked to X genes G +#' merge R with nearest TSS for G (multiple) +#' this will increse nb of lines +#' i.e R1 - G1 - TSS1 - DIST1 +#' R1 - G1 - TSS2 - DIST2 +#' To vectorize the code: +#' make a granges from left and onde from right +#' and find distance +#' collapse the results keeping min distance for equals values +#' @param links Links to calculate the distance +#' @param TRange Genomic coordinates for Tartget region +#' @param tssAnnot TSS annotation +#' @importFrom dplyr slice left_join group_by_ +#' @examples +#' \dontrun{ +#' data <- ELMER:::getdata("elmer.data.example") +#' NearbyGenes <- GetNearGenes( +#' data = data, +#' probes = c("cg15924102", "cg24741609"), +#' numFlankingGenes = 20 +#' ) +#' +#' NearbyGenes <- ELMER:::calcDistNearestTSS( +#' links = NearbyGenes, +#' tssAnnot = getTSS(genome = "hg38"), +#' TRange = rowRanges(getMet(data)) +#' ) +#' } +#' @author Tiago C. Silva +calcDistNearestTSS <- function( + links, + TRange, + tssAnnot +){ + + message("calculating Distance to nearest TSS") + if(!is(tssAnnot,"GenomicRanges")){ + stop("tssAnnot is not a GenomicRanges") + } + + if(!is(TRange,"GenomicRanges")){ + stop("tssAnnot is not a GenomicRanges") + } + + if(!"ID" %in% colnames(values(TRange))){ + TRange$ID <- names(TRange) + } + + if(!"ensembl_gene_id" %in% colnames(links)){ + colnames(links)[grep("GeneID", colnames(links))] <- "ensembl_gene_id" + } + + merged <- dplyr::left_join( + links, + suppressWarnings(tibble::as_tibble(tssAnnot)), + by = c("ensembl_gene_id") + ) + + merged <- dplyr::left_join( + merged, + suppressWarnings(tibble::as_tibble(TRange)), + by = c("ID") + ) + + # In case a gene was removed from newer versions and not mapped + merged <- merged[!is.na(merged$transcription_start_site),] + + left <- makeGRangesFromDataFrame( + merged, + start.field = "transcription_start_site", + end.field = "transcription_start_site", + seqnames.field = "seqnames.x", + strand.field = "strand.x", + ignore.strand = FALSE + ) + + right <- makeGRangesFromDataFrame( + merged, + start.field = "start.y", + end.field = "end.y", + strand.field = "strand.y", + seqnames.field = "seqnames.y", + ignore.strand = FALSE + ) + + merged$DistanceTSS <- distance(left,right,ignore.strand = TRUE) + merged <- unique(merged[,c("ID","ensembl_gene_id","DistanceTSS")]) + + ret <- merged %>% + dplyr::group_by(.data$ID,.data$ensembl_gene_id) %>% + dplyr::slice(which.min(DistanceTSS)) + + #ret <- ret[match(links %>% tidyr::unite(ID,Target,GeneID) %>% pull(ID), + # ret %>% tidyr::unite(ID,Target,GeneID) %>% pull(ID)),] + links <- dplyr::full_join(links,ret) + colnames(links)[1:3] <- c("ID","GeneID","Symbol") + return(links) +} + +#' @title Identifies nearest genes to a region +#' @description +#' Auxiliary function for GetNearGenes +#' This will get the closest genes (n=numFlankingGenes) for a target region (TRange) +#' based on a genome of refenrece gene annotation (geneAnnot). If the +#' transcript level annotation (tssAnnot) is provided the Distance will be updated to +#' the distance to the nearest TSS. +#' @param geneAnnot A GRange object contains gene coordinates of for human genome. +#' @param tssAnnot A GRange object contains tss coordinates of for human genome. +#' @param numFlankingGenes A number determine how many gene will be collected from each +# side of target (number shoule be even). +#' @param TRange A GRange object contains coordinate of targets. +#' @return A data frame of nearby genes and information: genes' IDs, genes' symbols, +# distance with target and side to which the gene locate to the target. +#' @examples +#' geneAnnot <- ELMER:::get.GRCh("hg38",as.granges = TRUE) +#' tssAnnot <- getTSS(genome = "hg38") +#' probe <- GenomicRanges::GRanges(seqnames = c("chr1","chr2"), +#' range=IRanges::IRanges(start = c(16058489,236417627), end= c(16058489,236417627)), +#' name= c("cg18108049","cg17125141")) +#' names(probe) <- c("cg18108049","cg17125141") +#' NearbyGenes <- getRegionNearGenes(numFlankingGenes = 20, +#' geneAnnot = geneAnnot, +#' TRange = probe, +#' tssAnnot = tssAnnot) +#' @importFrom GenomicRanges nearest precede follow +#' @importFrom tibble as_tibble +#' @importFrom dplyr group_by do +#' @importFrom progress progress_bar +#' @author +#' Tiago C Silva (maintainer: tiagochst@usp.br) +#' @export +getRegionNearGenes <- function(TRange = NULL, + numFlankingGenes = 20, + geneAnnot = NULL, + tssAnnot = NULL){ + + + pb <- progress::progress_bar$new(total = numFlankingGenes * 2) + + TRange$ID <- names(TRange) + + # We will consider the input at a gene level only + if(! "ensembl_gene_id" %in% colnames(S4Vectors::mcols(geneAnnot))){ + message("geneAnnot needs the following column ensembl_gene_id") + } + geneAnnot <- geneAnnot[!duplicated(geneAnnot$ensembl_gene_id)] + + + # Optimized version + # Idea: vectorize search + # 1) For all regions, get nearest gene + # 2) check follow and overlapping genes recursively + # 3) check precede and overlapping genes recursively + # 4) map the positions based on min distance (L1) + # The input data has to be at gene level and not transcript which would broke + # some of the optimizations for which we remove the genes already evaluated + all <- 1:length(TRange) + nearest.idx <- + nearest(TRange, + geneAnnot, + select = "all", + ignore.strand = TRUE) + idx <- suppressWarnings(tibble::as_tibble(nearest.idx)) + evaluating <- idx$queryHits + suppressWarnings({ + ret <- + cbind( + suppressWarnings(tibble::as_tibble(geneAnnot[idx$subjectHits])), + tibble::tibble( + "ID" = names(TRange)[idx$queryHits], + "Distance" = distance(TRange[idx$queryHits], + geneAnnot[idx$subjectHits], select = "all", ignore.strand = TRUE) * + ifelse(start(TRange[evaluating]) < start(geneAnnot[idx$subjectHits]), 1,-1) + + ) + ) + }) + for (i in 1:(numFlankingGenes)) { + idx <- + unique(rbind( + suppressWarnings(tibble::as_tibble( + findOverlaps( + geneAnnot[idx$subjectHits], + geneAnnot, + ignore.strand = TRUE, + type = "any", + select = "all" + ) + )), + suppressWarnings(tibble::as_tibble( + precede( + geneAnnot[idx$subjectHits], + geneAnnot, + select = "all", + ignore.strand = TRUE + ) + )) + )) + idx$evaluating <- evaluating[idx$queryHits] + # remove same target gene and probe if counted twice + idx <- idx[!duplicated(idx[, 2:3]), ] + + # todo remove already evaluated previously (we don't wanna do it again) + idx <- + idx[!paste0(geneAnnot[idx$subjectHits]$ensembl_gene_id, names(TRange)[idx$evaluating]) %in% paste0(ret$ensembl_gene_id, ret$ID), ] + evaluating <- evaluating[idx$queryHits] + ret <- + rbind(ret, # keep old results + cbind( + suppressWarnings( + tibble::as_tibble(geneAnnot[idx$subjectHits])), + tibble::tibble( + "ID" = names(TRange)[evaluating], + "Distance" = ifelse(start(TRange[evaluating]) < start(geneAnnot[idx$subjectHits]), 1,-1) * + distance(TRange[evaluating], + geneAnnot[idx$subjectHits], select = "all", + ignore.strand = TRUE) + ) + )) + pb$tick() + } + ret <- ret[!duplicated(ret[,c("ensembl_gene_id","ID")]),] + + idx <- suppressWarnings(tibble::as_tibble(nearest.idx)) + evaluating <- idx$queryHits + for (i in 1:(numFlankingGenes)) { + idx <- + unique(rbind( + suppressWarnings(tibble::as_tibble( + findOverlaps( + geneAnnot[idx$subjectHits], + geneAnnot, + ignore.strand = TRUE, + type = "any", + select = "all" + ) + )), + suppressWarnings(tibble::as_tibble( + follow( + geneAnnot[idx$subjectHits], + geneAnnot, + select = "all", + ignore.strand = TRUE + ) + )) + )) + idx$evaluating <- evaluating[idx$queryHits] + idx <- idx[!duplicated(idx[, 2:3]), ] + idx <- + idx[!paste0(geneAnnot[idx$subjectHits]$ensembl_gene_id, names(TRange)[idx$evaluating]) %in% paste0(ret$ensembl_gene_id, ret$ID), ] + evaluating <- evaluating[idx$queryHits] + ret <- + rbind(ret, cbind( + suppressWarnings(tibble::as_tibble(geneAnnot[idx$subjectHits])), + suppressWarnings(tibble::tibble( + "ID" = names(TRange)[evaluating], + "Distance" = ifelse(start(TRange[evaluating]) < start(geneAnnot[idx$subjectHits]), 1,-1) * + distance(TRange[evaluating], + geneAnnot[idx$subjectHits], select = "all",ignore.strand = TRUE) + )) + )) + pb$tick() + } + ret <- ret[!duplicated(ret[,c("ensembl_gene_id","ID")]),] + ret <- ret[order(ret$Distance),] + + ret <- ret[, c("ID", + "ensembl_gene_id", + grep("external_gene_", colnames(ret), value = TRUE), + "Distance")] + + message("Identifying gene position for each probe") + f <- function(x) { + center <- which(abs(x$Distance) == min(abs(x$Distance)))[1] + pos <- setdiff(-center:(nrow(x) - center), 0) + x$Side <- ifelse(pos > 0, paste0("R", abs(pos)), paste0("L", abs(pos))) + out <- x %>% dplyr::filter(x$Side %in% c(paste0("R", 1:(numFlankingGenes / 2)), + paste0("L", 1:(numFlankingGenes / 2)) + )) + if (nrow(out) < numFlankingGenes) { + if (paste0("R", floor(numFlankingGenes / 2)) %in% out$Side) { + cts <- length(grep("L", sort(x$Side), value = TRUE)) + out <- x %>% dplyr::filter(Side %in% c(paste0("R", 1:(numFlankingGenes - cts)), + grep("L", sort(out$Side), value = TRUE))) + } else { + cts <- length(grep("R", sort(x$Side), value = TRUE)) + out <- x %>% dplyr::filter(x$Side %in% + c(paste0("L", 1:(numFlankingGenes - cts)), + grep("R", sort(out$Side), value = TRUE)) + ) + } + } + out <- out[order(out$Distance), ] + return(out) + } + ret <- ret %>% group_by(ID) %>% do(f(.)) + + if (!is.null(tssAnnot)) { + message("Calculating distance to nearest TSS") + ret <- calcDistNearestTSS( + links = ret, + TRange = TRange, + tssAnnot = tssAnnot + ) + } + + if(any(grepl("external_gene_", colnames(ret)))){ + colnames(ret)[1:3] <- c("ID", "GeneID", "Symbol") + } else { + colnames(ret)[1:2] <- c("ID", "GeneID") + } + pb$terminate() + return(ret) +} diff --git a/R/Heatmap.Func.R b/R/Heatmap.Func.R deleted file mode 100755 index e860f733..00000000 --- a/R/Heatmap.Func.R +++ /dev/null @@ -1,773 +0,0 @@ -## common colors -jet.colors <- - colorRampPalette(c("#00007F", "blue", "#007FFF", "cyan", - "#7FFF7F", "yellow", "#FF7F00", "red", "#7F0000")) -redGreen <- colorRampPalette(c("green","black","red")) - -#'cluster functions -#'@param x A matrix -#'@param Rowv A boolean determines if the row dendrogram should be computed and reordered. -#'@param Colv A boolean determines if the column dendrogram should be computed and reordered. -#'@param distfun function used to compute the distance (dissimilarity) between both rows and columns. Defaults to dist. -#'@param distMethod A character to specify method for computing distance. Default to euclidean. See detail for other methods. -#'@param hclustfun function used to compute the hierarchical clustering when Rowv or Colv are not dendrograms. Defaults to fastcluster::hclust. Should take as argument a result of distfun and return an object to which as.dendrogram can be applied. -#'@param hclustMethod A character to specify method for computing clustering. Default to complete -#'@param Distance.row A vector of distance value for rows. If Distance.row was specified, distance calculation step will be skiped for rows. -#'@param Distance.col A vector of distance value for columns. If Distance.col was specified, distance calculation step will be skiped for columns. -#'@return A list contains: x the original matrix; rowInd order of row after clustering; ddr dendrograms for rows; colInd order of columns after clustering; ddc dendrograms for columns. -#'@details distMethod euclidean: Usual square distance between the two vectors (2 norm).maximum: Maximum distance between two components of x and y (supremum norm).manhattan: Absolute distance between the two vectors (1 norm). canberra: sum(|x_i - y_i| / |x_i + y_i|). Terms with zero numerator and denominator are omitted from the sum and treated as if the values were missing. This is intended for non-negative values (e.g. counts): taking the absolute value of the denominator is a 1998 R modification to avoid negative distances.binary: (aka asymmetric binary): The vectors are regarded as binary bits, so non-zero elements are ‘on’ and zero elements are ‘off’. The distance is the proportion of bits in which only one is on amongst those in which at least one is on. minkowski: The p norm, the pth root of the sum of the pth powers of the differences of the components. -#'@details hclustMethod: the agglomeration method to be used. This should be (an unambiguous abbreviation of) one of "ward", "single", "complete", "average", "mcquitty", "median" or "centroid" -cluster.main <- function(x, Rowv = TRUE, Colv = TRUE, distfun = dist, distMethod="euclidean", hclustfun = fastcluster::hclust, hclustMethod="complete", - Distance.row=NULL, Distance.col=NULL){ - ##use faster cluster package - library(fastcluster) - if (length(di <- dim(x)) != 2 || !is.numeric(x)) - stop("'x' must be a numeric matrix") - nr <- di[1] - nc <- di[2] - if (nr <= 1 || nc <= 1) - stop("'x' must have at least 2 rows and 2 columns") - if (Rowv){ - if(is.null(Distance.row)){ - Distance.row <- distfun(x,method=distMethod) - } - hcr <- hclustfun(Distance.row,method=hclustMethod) - ddr <- as.dendrogram(hcr) - if (nr != length(rowInd <- order.dendrogram(ddr))){ - stop("row dendrogram ordering gave index of wrong length") - } - }else{ - rowInd <- 1:nr - ddr <- NULL - } - - if(Colv){ - if(is.null(Distance.col)){ - Distance.col <- distfun(t(x),method=distMethod) - } - hcc <- hclustfun(Distance.col,method=hclustMethod) - ddc <- as.dendrogram(hcc) - if (nc != length(colInd <- order.dendrogram(ddc))){ - stop("col dendrogram ordering gave index of wrong length") - } - }else{ - colInd <- 1:nc - ddc <- NULL - } - -# x <- x[rowInd, colInd] - out <- list(x=x,rowInd = rowInd, ddr=ddr,colInd = colInd,ddc=ddc,hcc=hcc) - return(out) -} - - -#' output heatmap -#' @param x output from cluster.main. -#' @param margins a character vector of variable names to compute margins for. -#' @param labRow a character vector of labels for rows of matrix in x. -#' @param labCol a chracter vector of lables for columns of matrix in x. -#' @param nonlab a boolean to determine no labels for rows and column. -#' @param nonlab.row a boolean to determine no labels for rows. -#' @param nonlab.row a boolean to determine no labels for columns. -#' @param ... parameters for image function. -#' @return A heatmap -heatmap.main <- function(x,margins=c(5,0.5,0.5,5), labRow=NULL, labCol=NULL, nonlab=F,nonlab.row=F,nonlab.col=F, xlab=NULL, ylab=NULL, col=heat.colors(225), zlim=NULL, cexRow = 0.2 +1/log10(nr), cexCol = 0.2 + 1/log10(nc)){ - if (length(di <- dim(x$x)) != 2 || !is.numeric(x$x)) stop("'x' must be a numeric matrix") - nr <- di[1] - nc <- di[2] - if (nr <= 1 || nc <= 1) stop("'x' must have at least 2 rows and 2 columns") -#labcol, labrow, - if (is.null(labRow)) { - if (is.null(rownames(x$x))) { - labRow <- (1:nr)[x$rowInd] - }else{ - labRow <-rownames(x$x)[x$rowInd] - } - }else{ - labRow <- labRow[x$rowInd] - } - if (is.null(labCol)){ - if (is.null(colnames(x$x))){ - labCol <- (1:nc)[x$colInd] - }else{ - labCol <-colnames(x$x)[x$colInd] - } - }else{ - labCol <- labCol[x$colInd] - } - if(nonlab){ - labRow<- NULL - labCol <- NULL - } - x$x <- x$x[x$rowInd, x$colInd] - par(mar=margins) - if (is.null(zlim)){ - image(1:nc, 1:nr, t(x$x), xlim = 0.5 + c(0, nc), ylim = 0.5 + c(0, nr), axes = FALSE, xlab = "", ylab = "", col = col) - }else{ - image(1:nc, 1:nr, t(x$x), zlim=zlim, xlim = 0.5 + c(0, nc), ylim = 0.5 + c(0, nr), axes = FALSE, xlab = "", ylab = "", col = col) - } - - if(!(nonlab.col|nonlab)){ - axis(1, 1:nc, labels = labCol, las = 2, line = -0.5, tick = 0, cex.axis = cexCol) - } - - if (!is.null(xlab)){ - mtext(xlab, side = 1, line = margins[1] - 1.25) - } - - if (!(nonlab.row|nonlab)){ - axis(2, 1:nr, labels = labRow, las = 2, line = -0.5, tick = 0, cex.axis = cexRow) - } - - - if (!is.null(ylab)){ - mtext(ylab, side = 4, line = margins[2] - 1.25) - } -} - -#' making side bars for heatmap -#' @param x A matrix which is same order of the original matrix before cluster. See details -#' @param side A character which are either 'colside' or 'rowside' specifying where the side bar locates. -#' @param order A vector of number specifying order of side bars. -#' @param ... parameters for image function. -#' @return a side bar for heatmap -#' @details x must be the matrix. If it is colbars, the row number of col bars should be the same as the col number of the matrix. If it is rowbars, the row number of row bars should be the same as the row number of the matrix -side.bars <- function(x,side="colside",order=NULL,margins=c(5,0.5,0.5,5),lab=NULL,col=heat.colors(225),zlim=NULL,cexlab = 0.2 +1/log10(nr)){ - if (length(di <- dim(x)) != 2 || !is.numeric(x)) stop("'x' must be a numeric matrix") - nr <- di[1] - nc <- di[2] - names <- colnames(x) - par(mar=margins) - if(side == "colside"){ - cbar <- x[order,] - cbar=matrix(cbar, nrow=nr) - if (is.null(zlim)){ - image(cbar, col = col, axes = FALSE) - }else{ - image(cbar, col = col, zlim=zlim, axes = FALSE) - } - - if (is.null(lab)) { - if(nc==1){ - axis(2, 0 , colnames(cbar), las=2, tick=FALSE,cex.axis=cexlab) - }else{ - axis(2, 0:(dim(cbar)[2]-1)/(dim(cbar)[2]-1) , names, las=2, tick=FALSE,cex.axis=cexlab) - } - - }else{ - if(nc==1){ - axis(2, 0, lab, las=2, tick=FALSE,cex.axis=cexlab) - }else{ - axis(2, 0:(dim(cbar)[2]-1)/ (dim(cbar)[2]-1), lab, las=2, tick=FALSE,cex.axis=cexlab) - } - } - } - if(side == "rowside"){ - rbar <- x[order,] - rbar<- matrix(rbar, nrow=nr) - if (is.null(zlim)){ - image(t(rbar), col = col, axes = FALSE) - }else{ - image(t(rbar), col = col, zlim=zlim, axes = FALSE) - } - - if (is.null(lab)) { - if(nc==1){ - axis(1, 0 , colnames(rbar), las=2, tick=FALSE,cex.axis=cexlab) - }else{ - axis(1, 0:(dim(rbar)[2]-1)/ (dim(rbar)[2]-1) , names, las=2, tick=FALSE,cex.axis=cexlab) - } - - }else{ - if(nc==1){ - axis(1, 0, lab, las=2, tick=FALSE,cex.axis=cexlab) - }else{ - axis(1, 0:(dim(rbar)[2]-1)/ (dim(rbar)[2]-1), lab, las=2, tick=FALSE,cex.axis=cexlab) - } - } - } - box("plot",col="black") - #add boader - if(nc>1){ - - for(i in 1:nc-1){ - if(side=="colside"){ - abline(h=par("usr")[3]+i*(par("usr")[4]-par("usr")[3])/nc,col="black") - }else{ - abline(v=par("usr")[1]+i*(par("usr")[2]-par("usr")[1])/nc,col="black") - } - - } - } -} - - -#' making side bars for heatmap using segment to solve the problem in image that line will very tiny when a lot of samples. -#' @param x A matrix which is same order of the original matrix before cluster. See details -#' @param side A character which are either 'colside' or 'rowside' specifying where the side bar locates. -#' @param order A vector of number specifying order of side bars. -#' @param ... parameters for image function. -#' @return a side bar for heatmap -#' @details x must be the matrix. If it is colbars, the row number of col bars should be the same as the col number of the matrix. If it is rowbars, the row number of row bars should be the same as the row number of the matrix -side.bars2 <- function(x,side="colside",order=NULL,margins=c(5,0.5,0.5,5),lab=NULL,zlim=NULL,cexlab = 0.2 +1/log10(nr)){ - if (length(di <- dim(x)) != 2 || !is.numeric(x)) stop("'x' must be a numeric matrix") - nr <- di[1] - nc <- di[2] - names <- colnames(x) - par(mar=margins) - if(side == "colside"){ - cbar <- x[order,] - cbar=matrix(cbar, nrow=nr) - image(cbar, col = "white", axes = FALSE) - dis <- par("usr")[4]-par("usr")[3] - box("plot",col="black") - #add boader - if(nc>1){ - for(i in 1:nc){ - abline(h=par("usr")[3]+i*dis/nc,col="black") - for(nn in which(cbar[,i]==1)){ - segments(nn/nr,par("usr")[3]+(i-1)*dis/nc,nn/nr,par("usr")[3]+i*dis/nc) - } - } - }else{ - for(nn in which(cbar==1)){ - segments(nn/nr,par("usr")[3],nn/nr,par("usr")[3]+dis/nc) - } - } - if (!is.null(lab)) { - names <- labs - } - if(nc==1){ - axis(2, 0 , colnames(cbar), las=2, tick=FALSE,cex.axis=cexlab) - }else{ - axis(2, 0:(dim(cbar)[2]-1)/(dim(cbar)[2]-1) , names, las=2, tick=FALSE,cex.axis=cexlab) - } - } - if(side == "rowside"){ - rbar <- x[order,] - rbar<- matrix(rbar, nrow=nr) - image(t(rbar), col = "white", axes = FALSE) - dis <- par("usr")[2]-par("usr")[1] - box("plot",col="black") - #add boader - if(nc>1){ - for(i in 1:nc){ - abline(v=par("usr")[1]+i*dis/nc,col="black") - for(nn in which(rbar[,i]==1)){ - segments(par("usr")[1]+(i-1)*dis/nc,nn/nr,par("usr")[1]+i*dis/nc,nn/nr,) - } - } - }else{ - for(nn in which(rbar==1)){ - segments(par("usr")[1],nn/nr,par("usr")[1]+dis/nc,nn/nr) - } - } - if (!is.null(lab)) { - names <- labs - } - if(nc==1){ - axis(1, 0 , colnames(rbar), las=2, tick=FALSE,cex.axis=cexlab) - }else{ - axis(1, 0:(dim(rbar)[2]-1)/ (dim(rbar)[2]-1) , names, las=2, tick=FALSE,cex.axis=cexlab) - } - } - -} - - - -#' keyplot -#' @param x A matrix. -#' @param col A vector of colors to define colors. -#' @param breaks a vector of number to define how many label in x axis. -#' @param extremes a vector of number to define the edge value of the key. Default is NULL. -#' @param ... parameters for image. -#' @return A keyplot. -keyplot <- function(x,col,breaks,extremes=NULL,texts,margin=c(1,1,1,1),cex.axis=par("cex.axis")){ - #x axis lable position: axis(at=..) - scale01 <- function(x, low = min(x), high = max(x)) { - x <- (x - low)/(high - low) - x - } - - #break is to set up the x label . - if (missing(breaks) || is.null(breaks) || length(breaks) < 1) { - if (missing(col) || is.function(col)) - breaks <- 16 - else breaks <- length(col) + 1 - } - - if (is.null(extremes)){ - breaks <- seq(min(x, na.rm = TRUE), max(x, na.rm = TRUE),length = breaks) - #get the minimul and maximal data - min.raw <- min(x, na.rm = TRUE) - max.raw <- max(x, na.rm = TRUE) - }else{ - breaks <- seq(extremes[1], extremes[2], length = breaks) - min.raw <- extremes[1] - max.raw <- extremes[2] - } - par(mar=margin) - # color sequential data - z <- seq(min.raw, max.raw, length = length(col)) - image(z = matrix(z, ncol = 1), col = col, breaks = breaks, - xaxt = "n", yaxt = "n") - #add the x axis label - lv <- pretty(breaks) - xv <- scale01(as.numeric(lv), min.raw, max.raw) - axis(1, at = xv, labels = lv,cex.axis=cex.axis) - - #add label - if(!missing(texts)){ - mtext(side=1,texts,line=2) - }else{ - mtext(side=1,"Value",line=2) - } - title("Color Key",cex.main=2) -} - - -#' Plot the dendro tree. -#' @param Rdend dendrograms for rows. -#' @param Cdend dendrograms for columns. -#' @param title The main title (on top). -#' @param cex.title A numerical value giving the amount by which plotting title text and symbols should be magnified relative to the default. This starts as 1 when a device is opened, and is reset when the layout is changed. -#' @return a graph of dendrograms tree. -dendro.plot <- function(Rdend=NULL, Cdend=NULL,margin=c(0.5,0.5,0.5,0.5),title=NULL,cex.title){ - par(mar = margin) - if (!is.null(Rdend)) { - plot(Rdend, horiz = TRUE, axes = FALSE, yaxs = "i", leaflab = "none") - } - if (!is.null(Cdend)){ - plot(Cdend, axes = FALSE, xaxs = "i", leaflab = "none") - } - if(!is.null(title)){ - title(main=title,cex=cex.title) - } -} - -mat=rbind(c(NA,NA,7),c(6,NA,5),c(NA,NA,4),c(3,2,1)) - - -#add more information to the figure -heat.info <- function(main=NULL,cex=1.5,mars=c(4,4,4,4)){ - plot.new() - par(mar=mars) - text(par("usr")[2],par("usr")[4], main, cex = cex) -} - -#' add lengend -#' @param Labels A vector of characters -#' @param cols A vector of colors for each characters in Labels -#' @param ... parameters in image function. -#' @return A legend -AddLegend <- function(Labels=NULL,cols=NULL,margins=c(1,1,1,1),lab.las=2,cexlab=0.2 +1/log10(length(cols))){ - par(mar=margins) - if(is.null(Labels)) Labels <- 1:length(cols) - z <- 1:length(cols) - image(matrix(z,ncol=1), col = cols, axes = FALSE) - box("plot",col="black") - labposition <- c() - for(i in 1:length(cols)-1){ - abline(v=par("usr")[1]+i*(par("usr")[2]-par("usr")[1])/length(cols),col="black") - labposition <- c(labposition,i*(par("usr")[2]-par("usr")[1])/length(cols)) - } - axis(1, labposition, Labels, las=lab.las, tick=FALSE,cex.axis=cexlab) -} - - -#--------------color function------------------------------- -#brewer.pal(n, name) :makes the color palettes from ColorBrewer available as R palettes. -#display.brewer.pal(n, name): displays the selected palette in a graphics window. -#display.brewer.all(n=NULL, type="all", select=NULL, exact.n=TRUE): displays the a few palettes simultanueously in a graphics window. -#There are 3 types of palettes, sequential, diverging, and qualitative. -#1. Sequential palettes are suited to ordered data that progress from low to high. Lightness steps dominate the look of these schemes, with light colors for low data values to dark colors for high data values. -#2. Diverging palettes put equal emphasis on mid-range critical values and extremes at both ends of the data range. The critical class or break in the middle of the legend is emphasized with light colors and low and high extremes are emphasized with dark colors that have contrasting hues. -#3. Qualitative palettes do not imply magnitude differences between legend classes, and hues are used to create the primary visual differences between classes. Qualitative schemes are best suited to representing nominal or categorical data. - -#The sequential palettes names are -#Blues BuGn BuPu GnBu Greens Greys Oranges OrRd PuBu PuBuGn PuRd Purples RdPu Reds YlGn YlGnBu YlOrBr YlOrRd - -#All the sequential palettes are available in variations from 3 different values up to 9 different values. - -#The diverging palettes are -#BrBG PiYG PRGn PuOr RdBu RdGy RdYlBu RdYlGn Spectral - -#All the diverging palettes are available in variations from 3 different values up to 11 different values. - -#For qualitative palettes, the lowest number of distinct values available always is 3, but the largest number is different for different palettes. It is given together with the palette names in the following table. - -#Accent 8 -#Dark2 8 -#Paired 12 -#Pastel1 9 -#Pastel2 8 -#Set1 9 -#Set2 8 -#Set3 12 - -# GenerateColor <- function(n,name){ -# if (!require("RColorBrewer")) { -# install.packages("RColorBrewer") -# library(RColorBrewer) -# } -# cols <- colorRampPalette(brewer.pal(n,name)) -# return(cols) -# } -# -# Color2Num <- function(x){ -# num <- unique(x) -# Numbers <- rep(0,length(x)) -# count <- 0 -# for(i in num){ -# Numbers[x %in% i] <-count -# count <- count+1 -# } -# return(Numbers) -# } -# -# -# #-------------convert number to color ----------------------- -# #x is vector -# SetcolorNum <- function (x,TotalNum=255, nameBrewer,Add=0,extremes=NULL){ -# cols<- GenerateColor(9,nameBrewer) -# col <- cols(TotalNum) -# breaks <- length(col) + 1 -# if (is.null(extremes)) -# breaks <- seq(min(x, na.rm = TRUE), max(x, na.rm = TRUE),length = breaks) -# else breaks <- seq(extremes[1], extremes[2], length = breaks) -# Cuts <- cut(x,breaks=breaks,labels=FALSE) -# Cuts <- Add+Cuts -# return(Cuts) -# } - - -#' Normalization to 0 to 1 -#' @param x A matrix. -#' @param col A boolean to determine normalize by column or not. -#' @param row A boolean to determine normalize by row or not. -#' @param na.rm A boolean to determine to remove na number or not. -#' @return A normalized matrix. -Normalize <- function (x,col=FALSE,row=FALSE,na.rm=FALSE){ - if(col){ - ColMax <- apply(x,2,max,na.rm=na.rm) - ColMin <- apply(x,2,min,na.rm=na.rm) - x <- t((t(x)-ColMin)/ColMax) - } - if(row){ - RowMax <- apply(x,1,max,na.rm=na.rm) - RowMin <- apply(x,1,min,na.rm=na.rm) - x <- (x-RowMin)/RowMax - } - - return(x) -} - -#' Normalization based on mean -#' @param x A matrix. -#' @param col A boolean to determine normalize by column or not. -#' @param row A boolean to determine normalize by row or not. -#' @param na.rm A boolean to determine to remove na number or not. -#' @return A normalized matrix. -NormalizeMean <- function (x,col=FALSE,row=FALSE,na.rm=FALSE){ - - if(col){ - Mean <- colMeans(x,na.rm=na.rm) - SD <- apply(x,2,sd,na.rm=na.rm) - x <- t((t(x)-Mean)/SD) - x[,SD==0] <- 0 - } - if(row){ - Mean <- rowMeans(x,na.rm=na.rm) - SD <- apply(x,1,sd,na.rm=na.rm) - x <- (x-Mean)/SD - x[SD==0,] <- 0 - } - - return(x) -} - -#' Normalization based on median -#' @param x A matrix. -#' @param col A boolean to determine normalize by column or not. -#' @param row A boolean to determine normalize by row or not. -#' @param na.rm A boolean to determine to remove na number or not. -#' @return A normalized matrix. -NormalizeMedian <- function (x,col=FALSE,row=FALSE,na.rm=FALSE){ - if(col){ - Median <- apply(x,2,median,na.rm=na.rm) - x <- t((t(x)-Median)) - } - if(row){ - Median <- apply(x,1,median,na.rm=na.rm) - x <- x-Median - } - - return(x) -} - -#' binary data -#' @param x A matrix. -#' @param Break A value to binarize the data. -#' @param Break2 A value to cut value to 3 categories. -#' @return A binarized matrix. -Binary <- function(x,Break=0.3,Break2=NULL){ - if(!is.numeric(x)) stop("x need to be numeric") - change <- x - if(is.null(Break2)){ - change[x > Break] <- 1 - change[x < Break | x== Break] <- 0 - }else{ - change[x < Break | x== Break] <- 0 - change[x> Break & x < Break2] <- NA - change[x > Break2 | x== Break2] <-1 - } - - return(change) -} - -# -# ##make multisidebars------------------------------------------------------- -# MultiSide.bars <- function(data=list(),side="colside",order=NULL,margins=c(5,0.5,0.5,5),col=NULL,zlim=NULL,cexlab = 0.2 +1/log10(nn)){ -# nn <- length(data) -# if (is.null(order)) stop ("order should not be NULL") -# if(is.null(zlim)) zlim <- list() -# if(is.null(col)){ -# Col <- NULL -# col <- list() -# }else{ -# Col <- TRUE -# } -# for (i in names(data)){ -# print(i) -# if(is.null(zlim)) zlim[[i]]<- NULL -# Bar.name <- i -# One <- data[[i]] -# if(is.numeric(One)){ -# if(is.null(Col)) col[[i]] <- jet.colors(255) -# side.bars(matrix(One,ncol=1),side=side,order=order,margins=margins,col=col[[i]],lab=Bar.name,cexlab=cexlab,zlim=zlim[[i]]) -# keyplot (One,col=col[[i]],margin=c(1,1,0.5,1),extremes=zlim[[i]]) -# }else if(is.factor(One)){ -# if(is.null(Col)) col[[i]] <- 1:length(unique(One)) -# side.bars(matrix(as.numeric(One),ncol=1),side=side,order=order,margins=margins,col=col[[i]][sort(unique(as.numeric(One)))],lab=Bar.name,cexlab=cexlab) -# AddLengend (levels(One)[sort(unique(as.numeric(One)))],cols=col[[i]][sort(unique(as.numeric(One)))],margins=c(1,1,0.5,1),cexlab=cexlab,lab.las=1) -# }else if(is.character(One)){ -# if(is.null(Col)) col[[i]] <- 1:length(unique(One)) -# One <- factor(One) -# side.bars(matrix(as.numeric(One),ncol=1),side=side,order=order,margins=margins,col=col[[i]][sort(unique(as.numeric(One)))],lab=Bar.name,cexlab=cexlab) -# AddLengend (levels(One)[sort(unique(as.numeric(One)))],cols=col[[i]][sort(unique(as.numeric(One)))],margins=c(1,1,0.5,1),cexlab=cexlab,lab.las=1) -# } -# } -# } - - - -#' lable linear regression formula -#' @param df A data.frame object contains two variables: dependent variable (Dep) and explanation variable (Exp). -#' @return a linear regression formula -lm_eqn = function(df){ - m = lm(Dep ~ Exp, df); - eq <- substitute(italic(y) == a + (b) %.% italic(x)*"\n"~~italic(r)^2~"="~r2, - list(a = format(coef(m)[1], digits = 2), - b = format(coef(m)[2], digits = 2), - r2 = format(summary(m)$r.squared, digits = 3))) - as.character(as.expression(eq)); -} - - - -##PeakToVenn -#Peaks :List the GRange format objects. -#... : the findOverlap options. -#the overlap parts belongs to the peaks set before. -#' Making peak sets venn diagram -#' @param Peaks A list of Peak sets. -#' @param ... parameters from VennDiagram package. -#' @return A venn diagram of peaks. -PeakToVenn <- function(Peaks,...){ - library(VennDiagram) - if(length(Peaks)==2){ - P1 <- Peaks[[1]] - P2 <- Peaks[[2]] - values(P1) <- NULL - values(P2) <- NULL - Over <- findOverlaps(P1,P2) - A <- 1: length(P1) - OverNum <- length(unique(queryHits(Over))) - B_unique <- length(P2)-length(unique(subjectHits(Over))) - B <- (length(P1)-OverNum+1):(length(P1)-OverNum+B_unique) - tmp <- list(A,B) - names(tmp) <- names(Peaks) - venn.plot <- venn.diagram( - x = tmp, - filename = NULL, - fill = c("cornflowerblue", "darkorchid1"), - alpha = 0.75, - label.col = "black", - fontfamily = "serif", - fontface = "bold", - cat.col = c("cornflowerblue", "darkorchid1"), - cat.fontfamily = "serif", - cat.fontface = "bold", - cat.dist = c(0.03, 0.03), - cat.pos = c(-20, 14), - ... - ); - }else if (length(Peaks)==3){ - for (i in 1:length(Peaks)){ - values(Peaks[[i]]) <- NULL - } - ## n1: 1 unique, n2: 2 unique, n3: 3 unique - ##n123_1: in n123 what is the number of 1. n123_2 ...of 2, n123_3 ...of 3 - ##n12_1: in n12 what is the number of 1. n12_2 ...of 2 - ##n13_1: in n13 what is the number of 1. n13_3 ...of 3 - ##n23_2: in n23 what is the number of 2. n23_3 ...of 3 - ##n1 - over1 <- findOverlaps(Peaks[[1]],c(Peaks[[2]],Peaks[[3]])) - n1 <- length(Peaks[[1]]) - length(unique(queryHits(over1))) - ##n2 - over1 <- findOverlaps(Peaks[[2]],c(Peaks[[1]],Peaks[[3]])) - n2 <- length(Peaks[[2]])-length(unique(queryHits(over1))) - ##n3 - over1 <- findOverlaps(Peaks[[3]],c(Peaks[[2]],Peaks[[1]])) - n3 <- length(Peaks[[3]])-length(unique(queryHits(over1))) - ##n123 - tmpPeak <- Peaks[[1]][unique(queryHits(findOverlaps(Peaks[[1]],Peaks[[2]])))] - n123_1 <- length(unique(queryHits(findOverlaps(tmpPeak,Peaks[[3]])))) - tmpPeak <- Peaks[[2]][unique(queryHits(findOverlaps(Peaks[[2]],Peaks[[1]])))] - n123_2 <- length(unique(queryHits(findOverlaps(tmpPeak,Peaks[[3]])))) - tmpPeak <- Peaks[[3]][unique(queryHits(findOverlaps(Peaks[[3]],Peaks[[1]])))] - n123_3 <- length(unique(queryHits(findOverlaps(tmpPeak,Peaks[[2]])))) - #n12 - n12_1 <- length(unique(queryHits(findOverlaps(Peaks[[1]],Peaks[[2]]))))-n123_1 - n12_2 <- length(unique(queryHits(findOverlaps(Peaks[[2]],Peaks[[1]]))))-n123_2 - #n13 - n13_1 <- length(unique(queryHits(findOverlaps(Peaks[[1]],Peaks[[3]]))))-n123_1 - n13_3 <- length(unique(queryHits(findOverlaps(Peaks[[3]],Peaks[[1]]))))-n123_3 - #n23 - n23_2 <- length(unique(queryHits(findOverlaps(Peaks[[2]],Peaks[[3]]))))-n123_2 - n23_3 <- length(unique(queryHits(findOverlaps(Peaks[[3]],Peaks[[2]]))))-n123_3 - tmp <- list() - tmp[[1]] <- 1:length(Peaks[[1]]) - tmp[[2]] <- c(1:n123_1, ##common sets - (n123_1+1):(n123_1+n12_1), #n12_1 - (length(Peaks[[1]])+1):(length(Peaks[[1]])+n2+n23_2)) ##unique to 2 - tmp[[3]] <- c(1:n123_1, - (n123_1+n12_1+1):(n123_1+n12_1+n13_1), ##n13_1 - (length(Peaks[[1]])+1):(length(Peaks[[1]])+n23_2), #n23_2 - (max(tmp[[2]])+1):(max(tmp[[2]])+n3)) - names(tmp) <- names(Peaks) - - subtile <- sprintf("1:%s,2:%s,3:%s\nn1:%d,n2:%d,n3%d,n12_1:%d,n12_2:%d,n13_1:%d,\nn13_3:%d,n23_2:%d,n23_3:%d,n123_1:%d,n123_2:%d,n123_3:%d",names(tmp)[1],names(tmp[2]),names(tmp)[3], - n1,n2,n3,n12_1, n12_2,n13_1,n13_3,n23_2,n23_3,n123_1,n123_2,n123_3) - venn.plot <- venn.diagram( - x = tmp, - filename = NULL, - col = "transparent", - fill = c("red", "blue", "green"), - alpha = 0.5, - label.col = c("darkred", "white", "darkblue", "white", - "white", "white", "darkgreen"), - fontfamily = "serif", - fontface = "bold", - cat.default.pos = "text", - cat.col = c("darkred", "darkblue", "darkgreen"), - cat.fontfamily = "serif", - cat.dist = c(0.06, 0.06, 0.03), - cat.pos = 0, - sub=subtile, - ... - ); - } - return (venn.plot) -} - -# convert chr matrix to color --------------------------------------------- -##x is matrix with charatcter. - -ChrToColor <- function(x,cols=NULL,simple=FALSE){ - if(class(x) %in% "data.frame") x <- as.matrix(x) - Levels <- sort(unique(as.vector(x))) - if(any(is.na(as.vector(x)))) Levels <- c("missing",Levels) - if(is.null(cols)) cols <- 1:length(Levels) - out <- mat.or.vec(nr=nrow(x),nc=ncol(x)) - rownames(out) <- rownames(x) - colnames(out) <- colnames(x) - if(simple){ - for(i in 1:length(Levels)){ - if(Levels[i] %in% "missing") out[is.na(x)] <- cols[i] - out[x %in% Levels[i]] <- cols[i] - } - colnames(out) <- colnames(x) - rownames(out) <- rownames(x) - return(out) - }else{ - for(i in 1:length(Levels)){ - if(Levels[i] %in% "missing") out[is.na(x)] <- cols[i] - out[x %in% Levels[i]] <- i - } - colnames(out) <- colnames(x) - rownames(out) <- rownames(x) - tmp <- list(x=out,cols=cols,levels=Levels) - return(tmp) - } -} - - - -# ## smooth DNA methylation matrix -# Smooth <- function(x, bin=10, cores=6){ -# nr <- dim(x)[1] -# nc <- dim(x)[2] -# start <- 1 -# end <- nc-bin -# cl <- makeCluster(cores,"SOCK") -# out <- parSapplyLB(cl, start:end, function(x, data.matrix, bin){sub.matrix <- data.matrix[,x:(x+bin-1)] -# out <- rowMeans(sub.matrix,na.rm=T) -# return(out)}, data.matrix=x, bin=bin, simplify=F) -# stopCluster(cl) -# out <- do.call(cbind,out) -# return(out) -# } -# -# # bin DNA methylation matrix -# Bin <- function(x, size, cores=6){ -# nr <- dim(x)[1] -# nc <- dim(x)[2] -# isInt <- function(n) { -# return (ceiling(n) == n); -# } -# if(!isInt(size/2)) size <- 1 + size -# if(isInt(nc/2)){ -# center = nc/2+1 -# k = floor((nc-center-size/2+1)/size) -# Breakpoints <- as.vector(unlist(apply(matrix(0:k,ncol=1), 1,function(x,center,size){ upstream <- center+size/2-1+x*size -# downstream <- center - size/2 - x*size -# out <- c(upstream, downstream) -# return(out)}, center=center, size=size))) -# if(nc- center - size/2 + 1 - k*size > size/2) Breakpoints <- c(1,nc) -# Breakpoints <- sort(Breakpoints) -# Sites <- Breakpoints -# Sites[Sites < center] <- Sites[Sites < center] - center+1 -# Sites[Sites > center] <- Sites[Sites > center] - center -# }else{ -# center = ceiling(nc/2) -# k = floor((nc-center-size/2)/size) -# Breakpoints <- as.vector(unlist(apply(matrix(0:k,ncol=1), 1,function(x,center,size){ upstream <- center+size/2+x*size -# downstream <- center - size/2 - x*size -# out <- c(upstream, downstream) -# return(out)}, center=center, size=size))) -# if(nc- center - size/2 - k*size > size/2) Breakpoints <- c(1,nc) -# Breakpoints <- sort(Breakpoints) -# Sites <- Breakpoints-center -# } -# Sites <- sapply(1:(length(Sites)-1), function(x){ out <- mean(Sites[c(x,x+1)]) -# return(out)}) -# cl <- makeCluster(cores,"SOCK") -# out <- parSapplyLB(cl, 1:(length(Breakpoints)-1), function(x, data.matrix, Breakpoints){sub.matrix <- data.matrix[,Breakpoints[x]:(Breakpoints[x+1]-1)] -# out <- rowMeans(sub.matrix,na.rm=T) -# return(out)}, data.matrix=x, Breakpoints=Breakpoints, simplify=F) -# stopCluster(cl) -# out <- list(matrix=do.call(cbind,out), sites=Sites) -# return(out) -# } \ No newline at end of file diff --git a/R/Main_function.R b/R/Main_function.R new file mode 100644 index 00000000..88014d92 --- /dev/null +++ b/R/Main_function.R @@ -0,0 +1,1904 @@ +#' @title get.feature.probe to select probes within promoter regions or distal regions. +#' @description +#' get.feature.probe is a function to select the probes falling into +#' distal feature regions or promoter regions. +#' @importFrom GenomicRanges promoters +#' @description This function selects the probes on HM450K that either overlap +#' distal biofeatures or TSS promoter. +#' @param promoter A logical.If TRUE, function will ouput the promoter probes. +#' If FALSE, function will ouput the distal probes overlaping with features. The +#' default is FALSE. +#' @param met.platform DNA methyaltion platform to retrieve data from: EPIC or 450K (default) +#' @param genome Which genome build will be used: hg38 (default) or hg19. +#' @param feature A GRange object containing biofeature coordinate such as +#' enhancer coordinates. +#' If NULL only distal probes (2Kbp away from TSS will be selected) +#' feature option is only usable when promoter option is FALSE. +#' @param TSS A GRange object contains the transcription start sites. When promoter is FALSE, Union.TSS +#' in \pkg{ELMER.data} will be used for default. When promoter is TRUE, UCSC gene TSS will +#' be used as default (see detail). User can specify their own preference TSS annotation. +#' @param TSS.range A list specify how to define promoter regions. +#' Default is upstream =2000bp and downstream=2000bp. +#' @param rm.chr A vector of chromosome need to be remove from probes such as chrX chrY or chrM +#' @return A GRange object containing probes that satisfy selecting critiria. +#' @export +#' @importFrom S4Vectors queryHits subjectHits +#' @details +#' In order to get real distal probes, we use more comprehensive annotated TSS by both +#' GENCODE and UCSC. However, to get probes within promoter regions need more +#' accurate annotated TSS such as UCSC. Therefore, there are different settings for +#' promoter and distal probe selection. But user can specify their own favorable +#' TSS annotation. Then there won't be any difference between promoter and distal +#' probe selection. +#' @return A GRanges object contains the coordinate of probes which locate +#' within promoter regions or distal feature regions such as union enhancer from REMC and FANTOM5. +#' @usage get.feature.probe( +#' feature, +#' TSS, +#' TSS.range = list(upstream = 2000, downstream = 2000), +#' promoter = FALSE, rm.chr = NULL +#' ) +#' @examples +#' # get distal enhancer probe +#' \dontrun{ +#' Probe <- get.feature.probe() +#' } +#' # get promoter probes +#' \dontrun{ +#' Probe <- get.feature.probe(promoter=FALSE) +#' } +#' # get distal enhancer probe remove chrX chrY +#' Probe2 <- get.feature.probe(rm.chr=c("chrX", "chrY")) +get.feature.probe <- function( + feature = NULL, + TSS, + genome = "hg38", + met.platform = "450K", + TSS.range = list(upstream = 2000, downstream = 2000), + promoter = FALSE, + rm.chr = NULL +){ + + probe <- getInfiniumAnnotation(plat = toupper(met.platform),genome = genome) + # We will remove the rs probes, as they should not be used in the analysis + probe <- probe[!grepl("rs",names(probe)),] + probe <- probe[!probe$MASK_general,] # remove masked probes + if(!is.null(rm.chr)) probe <- probe[!as.character(seqnames(probe)) %in% rm.chr] + + if(missing(TSS)){ + # The function getTSS gets the transcription coordinantes from Ensemble (GENCODE) + TSS <- getTSS(genome = genome) + } + suppressWarnings({ + promoters <- promoters(TSS, + upstream = TSS.range[["upstream"]], + downstream = TSS.range[["downstream"]]) + }) + + if(!promoter){ + probe <- probe[setdiff(1:length(probe),unique(queryHits(findOverlaps(probe,promoters,ignore.strand=TRUE))))] + + + if(is.null(feature)) { + message("Returning distal probes: ", length(probe)) + return(probe) + } + if(is(feature,"GRanges")) { + probe <- probe[unique(queryHits(findOverlaps(probe,feature)))] + message("Returning distal probes overlapping with features: ", length(probe)) + + } else { + stop("feature is not GRanges object.") + } + } else { + probe <- probe[unique(queryHits(findOverlaps(probe,promoters,ignore.strand=TRUE)))] + message("Returning promoter probes: ", length(probe)) + } + return(probe) +} + + + +#' @title Identify hypo/hyper-methylated CpG sites between two groups (i.e. normal vs tumor samples, treated vs untreated). +#' @description +#' get.diff.meth applys one-way t-test to identify the CpG sites that are significantly +#' hypo/hyper-methyalated using proportional samples (defined by minSubgroupFrac option) from group 1 +#' and group 2. The P values will be adjusted by Benjamini-Hochberg method. +#' Option pvalue and sig.dif will be the criteria (cutoff) for selecting significant +#' differentially methylated CpG sites. +#' If save is TURE, two getMethdiff.XX.csv files will be generated (see detail). +#' @param data A multiAssayExperiment with DNA methylation and Gene Expression data. +#' See \code{\link{createMAE}} function. +#' @param group.col A column defining the groups of the sample. You can view the +#' available columns using: colnames(MultiAssayExperiment::colData(data)). +#' @param group1 A group from group.col. ELMER will run group1 vs group2. +#' That means, if direction is hyper, get probes +#' hypermethylated in group 1 compared to group 2. +#' @param group2 A group from group.col. ELMER will run group1 vs group2. +#' That means, if direction is hyper, get probes +#' hypermethylated in group 1 compared to group 2. +#' @param diff.dir A character can be "hypo", "hyper" or "both", showing differential +#' methylation direction. It can be "hypo" which is only selecting hypomethylated probes (one tailed test); +#' "hyper" which is only selecting hypermethylated probes (one tailed test); +#' or "both" which are probes differenly methylated (two tailed test). +#' @param cores A interger which defines the number of cores to be used in parallel +#' process. Default is 1: no parallel process. +#' @param mode A character. Can be "unsupervised" or "supervised". If "supervised", the +#' minSubgroupFrac argument will be set to 1 to use all samples from both groups to find the +#' differently methylated regions. The supervised mode should be used when all samples from both +#' groups are considered homogenous (i.e. treated vs untreated, molecular subtype A vs molecular subtype B), +#' while unsupervised mode should be used when there is at least one group with heterogenous samples +#' (i.e tumor samples). +#' @param minSubgroupFrac A number ranging from 0 to 1, +#' specifying the fraction of extreme samples from group 1 and group 2 +#' that are used to identify the differential DNA methylation. +#' The default is 0.2 because we typically want to be able to detect a specific +#' (possibly unknown) molecular subtype among tumor; these subtypes often make up only +#' a minority of samples, and 20\% was chosen as a lower bound for the purposes of statistical power. +#' If you are using pre-defined group labels, such as treated replicates vs. untreated replicated, +#' use a value of 1.0 (Supervised mode) +#' @param pvalue A number specifies the significant P value (adjusted P value by BH) +#' threshold Limit for selecting significant hypo/hyper-methylated probes. Default is 0.01 +#' If pvalue is smaller than pvalue than it is considered significant. +#' @param sig.dif A number specifies the smallest DNA methylation difference as a cutoff for +#' selecting significant hypo/hyper-methylated probes. Default is 0.3. +#' @param dir.out A path specify the directory for outputs. Default is is current directory. +#' @param test Statistical test to be used. Options: t.test (DEFAULT), wilcox.test +#' @param save A logic. When TRUE, two getMethdiff.XX.csv files will be generated (see detail) +#' @param min.samples Minimun number of samples to use in the analysis. Default 5. +#' If you have 10 samples in one group, minSubgroupFrac is 0.2 this will give 2 samples +#' in the lower quintile, but then 5 will be used. +#' @details +#' save: +#' When save is TRUE, function will generate two XX.csv files.The first one is named +#' getMethdiff.hypo.probes.csv (or getMethdiff.hyper.probes.csv depends on diff.dir). +#' The first file contains all statistic results for each probe. Based on this +#' file, user can change different P value or sig.dir cutoff to select the significant results +#' without redo the analysis. The second file is named getMethdiff.hypo.probes.significant.csv +#' (or getMethdiff.hyper.probes.significant.csv depends on diff.dir). This file contains +#' statistic results for the probes that pass the significant criteria (P value and sig.dir). +#' When save is FALSE, a data frame R object will be generate which contains the same +#' information with the second file. +#' @return Statistics for all probes and significant hypo or hyper-methylated probes. +#' @export +#' @importFrom readr write_csv +#' @importFrom plyr adply +#' @importFrom stats p.adjust +#' @importFrom MultiAssayExperiment colData +#' @importFrom TCGAbiolinks TCGAVisualize_volcano +#' @references +#' Yao, Lijing, et al. "Inferring regulatory element landscapes and transcription +#' factor networks from cancer methylomes." Genome biology 16.1 (2015): 1. +#' @examples +#' data <- ELMER:::getdata("elmer.data.example") +#' Hypo.probe <- get.diff.meth(data, +#' diff.dir="hypo", +#' group.col = "definition", +#' group1 = "Primary solid Tumor", +#' group2 = "Solid Tissue Normal", +#' sig.dif = 0.1) # get hypomethylated probes +#' Hyper.probe <- get.diff.meth(data, +#' diff.dir="hyper", +#' group.col = "definition", +#' sig.dif = 0.1) # get hypomethylated probes +get.diff.meth <- function(data, + diff.dir = "hypo", + cores = 1, + mode = "unsupervised", + minSubgroupFrac = 0.2, + pvalue = 0.01, + group.col, + min.samples = 5, + group1, + group2, + test = t.test, + sig.dif = 0.3, + dir.out = "./", + save = TRUE){ + + if(is.null(getMet(data))) + stop("Cannot identify differential DNA methylation region without DNA methylation data.") + if(nrow(colData(data))==0){ + stop("Sample information data to do differential analysis.") + } else if (missing(group.col)){ + stop("Please colData.col should be specified, labeling two group of sample for comparison. See colnames(colData(data)) for possibilities") + } else if (!group.col %in% colnames(colData(data))){ + stop("Group column not found in phenotypic data and meta-data of the object. See values with colData(data)") + } else if (missing(group1) | missing(group2)) { + if(length(unique(colData(data)[,group.col])) < 2){ + stop("Group column should have at least 2 distinct group labels for comparison.") + } else if (length(unique(colData(data)[,group.col])) > 2){ + stop("Please your object must have only two groups. We found more than two and this might impact the next analysis steps.") + } else { + # TO be changed + groups <- colData(data)[,group.col] + group1 <- unique(groups)[1] + group2 <- unique(groups)[2] + message(paste0("Group 1: ", group1, "\nGroup 2: ", group2)) + } + } else if(!group1 %in% unique(colData(data)[,group.col])){ + stop(group1," not found in ", group.col) + } else if(!group2 %in% unique(colData(data)[,group.col])){ + stop(group2," not found in ", group.col) + } + if(!diff.dir %in% c("hypo","hyper","both")) stop("diff.dir optiosn are hypo, hyper or both") + if(diff.dir %in% c("both")) diff.dir <- NA + + + parallel <- FALSE + if (cores > 1){ + if (cores > detectCores()) cores <- detectCores() + registerDoParallel(cores) + parallel = TRUE + } + Top.m <- ifelse(diff.dir == "hyper",TRUE,FALSE) + if(is.na(Top.m) & minSubgroupFrac < 1) { + message("Two tailed test should be performed with all samples") + minSubgroupFrac <- 1 + } + if(mode == "supervised" & minSubgroupFrac < 1) { + message("Supervised mode will use all samples from boths groups. Setting argument minSubgroupFrac to 1") + minSubgroupFrac <- 1 + } + + counts <- plyr::count(MultiAssayExperiment::colData(data)[,group.col]) + message(paste0("ELMER will search for probes ", ifelse(is.na(diff.dir),"differently ",diff.dir), + "methylated in group ", + group1, " (n:",subset(counts,counts$x == group1)$freq,")", + " compared to ", + group2, " (n:",subset(counts,counts$x == group2)$freq,")")) + + message(paste0("ooo Arguments ooo")) + message(paste0("o Number of probes: ",nrow(getMet(data)))) + message(paste0("o Beta value difference cut-off: ",sig.dif)) + message(paste0("o FDR cut-off: ", pvalue)) + message(paste0("o Mode: ", mode)) + message(paste0("o % of samples per group in each comparison: ", minSubgroupFrac)) + message(paste0("o Min number of samples per group in each comparison: ", min.samples)) + message(paste0("o Nb of samples group1 in each comparison: ", + ifelse(round(subset(counts,counts$x == group1)$freq * minSubgroupFrac) > min.samples, + round(subset(counts,counts$x == group1)$freq * minSubgroupFrac), + min(min.samples,subset(counts,counts$x == group1)$freq)))) + message(paste0("o Nb of samples group2 in each comparison: ", + ifelse(round(subset(counts,counts$x == group2)$freq * minSubgroupFrac) > min.samples, + round(subset(counts,counts$x == group2)$freq * minSubgroupFrac), + min(min.samples,subset(counts,counts$x == group2)$freq)))) + message(paste0("Output direction: ", dir.out)) + message(paste0("ooooooooooooooooo")) + + + groups.info <- colData(data)[getMetSamples(data),group.col] + met <- assay(getMet(data)) + probes <- rownames(met) + out <- alply(.data = met, .margins = 1, + .fun = function(x) { + Stat.diff.meth(percentage = minSubgroupFrac, + meth = x, + min.samples = min.samples, + groups = groups.info, + group1 = group1, + test = test, + group2 = group2, + Top.m = Top.m)}, + .progress = "time", + .parallel = parallel, + .paropts = list(.errorhandling = 'pass') + ) + out <- do.call(rbind,out) + out <- as.data.frame(out,stringsAsFactors = FALSE) + out$probe <- probes + diffCol <- paste0(gsub("[[:punct:]]| ", ".", group1),"_Minus_",gsub("[[:punct:]]| ", ".", group2)) + out$adjust.p <- p.adjust(as.numeric(out$PP),method = "BH") + out <- out[,c("probe","PP","MeanDiff","adjust.p")] + colnames(out) <- c("probe","pvalue", diffCol, "adjust.p") + rownames(out) <- out$probe + + if(save){ + message("Saving results") + dir.create(dir.out,showWarnings = FALSE, recursive = TRUE) + ylab <- ifelse(is.na(diff.dir), + " (FDR corrected P-values) [two tailed test]", + " (FDR corrected P-values) [one tailed test]" + ) + TCGAVisualize_volcano( + x = as.data.frame(out)[,grep("Minus",colnames(out),value = T)], + y = out$adjust.p, + title = paste0("Volcano plot - Probes ", + ifelse(is.na(diff.dir),"differently ",diff.dir), + "methylated in ", group1, " vs ", group2,"\n"), + filename = sprintf("%s/volcanoPlot.probes.%s.png",dir.out, ifelse(is.na(diff.dir),"two_tailed",diff.dir)), + label = c("Not Significant", + paste0("Hypermethylated in ",group1), + paste0("Hypomethylated in ",group1)), + ylab = bquote(-Log[10] ~ .(ylab)), + xlab = expression(paste( + "DNA Methylation difference (",beta,"-values)") + ), + x.cut = sig.dif, + y.cut = pvalue + ) + write_csv( + x = out, + file = sprintf("%s/getMethdiff.%s.probes.csv",dir.out,ifelse(is.na(diff.dir),"both",diff.dir)) + ) + write_csv( + x = out[out$adjust.p < pvalue & abs(out[,diffCol]) > sig.dif & !is.na(out$adjust.p),], + file = sprintf("%s/getMethdiff.%s.probes.significant.csv",dir.out,ifelse(is.na(diff.dir),"both",diff.dir)) + ) + + + } + + result <- out[out$adjust.p < pvalue & abs(out[,diffCol]) > sig.dif & !is.na(out$adjust.p),] + if(nrow(result) == 0 ) { + message("No relevant probes found") + } else { + message(paste0("Number of relevant probes found: ", nrow(result))) + } + return(result) +} + +## TCGA pipe don't specify dir.out +#' get.pair to predict enhancer-gene linkages. +#' @description +#' get.pair is a function to predict enhancer-gene linkages using associations between +#' DNA methylation at enhancer CpG sites and expression of 20 nearby genes of the CpG sites +#' (see reference). Two files will be saved if save is true: getPair.XX.all.pairs.statistic.csv +#' and getPair.XX.pairs.significant.csv (see detail). +#' @usage +#' get.pair(data, +#' nearGenes, +#' minSubgroupFrac = 0.4, +#' permu.size = 10000, +#' permu.dir = NULL, +#' raw.pvalue = 0.001, +#' Pe = 0.001, +#' mode = "unsupervised", +#' diff.dir = NULL, +#' dir.out = "./", +#' diffExp = FALSE, +#' group.col, +#' group1 = NULL, +#' group2 = NULL, +#' cores = 1, +#' correlation = "negative", +#' filter.probes = TRUE, +#' filter.portion = 0.3, +#' filter.percentage = 0.05, +#' label = NULL, +#' addDistNearestTSS = FALSE, +#' save = TRUE) +#' @param data A multiAssayExperiment with DNA methylation and Gene Expression data. +#' See \code{\link{createMAE}} function. +#' @param nearGenes Can be either a list containing output of GetNearGenes +#' function or path of rda file containing output of GetNearGenes function. +#' @param cores A interger which defines number of core to be used in parallel process. +#' Default is 1: don't use parallel process. +#' @param minSubgroupFrac A number ranging from 0 to 1, specifying the fraction of +#' extreme samples that define group U (unmethylated) and group M (methylated), +#' which are used to link probes to genes. +#' The default is 0.4 (the lowest quintile of samples is the U group and the highest quintile samples is the M group) +#' because we typically want to be able to detect a specific (possibly unknown) molecular subtype among tumor; +#' these subtypes often make up only a minority of samples, and 20\% was chosen as a lower bound for the purposes of statistical power. +#' If you are using pre-defined group labels, such as treated replicates vs. untreated replicated, use a value of 1.0 (Supervised mode). +#' @param permu.size A number specify the times of permuation used in the unsupervised mode. Default is 10000. +#' @param permu.dir A path where the output of permutation will be. +#' @param raw.pvalue A number specify the raw p-value cutoff for defining significant pairs. +#' Default is 0.001. It will select the significant P value cutoff before calculating the empirical p-values. +#' @param Pe A number specify the empirical p-value cutoff for defining significant pairs. +#' Default is 0.001 +#' @param filter.probes Should filter probes by selecting only probes that have at least +#' a certain number of samples below and above a certain cut-off. +#' See \code{\link{preAssociationProbeFiltering}} function. +#' @param filter.portion A number specify the cut point to define binary methylation level for probe loci. +#' Default is 0.3. When beta value is above 0.3, the probe is methylated and +#' vice versa. For one probe, the percentage of methylated and unmethylated samples +#' should be above filter.percentage value. +#' Only used if filter.probes is TRUE. See \code{\link{preAssociationProbeFiltering}} function. +#' @param filter.percentage Minimun percentage of samples to be considered in methylated and unmethylated +#' for the filter.portion option. Default 5\%. Only used if filter.probes is TRUE. +#' See \code{\link{preAssociationProbeFiltering}} function. +#' @param diffExp A logic. Default is FALSE. If TRUE, t test will be applied to +#' test whether putative target gene are differentially expressed between two groups. +#' @param group.col A column defining the groups of the sample. You can view the +#' available columns using: colnames(MultiAssayExperiment::colData(data)). +#' @param group1 A group from group.col. ELMER will run group1 vs group2. +#' That means, if direction is hyper, get probes +#' hypermethylated in group 1 compared to group 2. +#' @param group2 A group from group.col. ELMER will run group1 vs group2. +#' That means, if direction is hyper, get probes +#' hypermethylated in group 1 compared to group 2. +#' @param mode A character. Can be "unsupervised" or "supervised". If unsupervised is set +#' the U (unmethylated) and M (methylated) groups will be selected +#' among all samples based on methylation of each probe. +#' Otherwise U group and M group will set as the samples of group1 or group2 as described below: +#' If diff.dir is "hypo, U will be the group 1 and M the group2. +#' If diff.dir is "hyper" M group will be the group1 and U the group2. +#' @param correlation Type of correlation to evaluate (negative or positive). +#' Negative (default) checks if hypomethylated region has a upregulated target gene. +#' Positive checks if region hypermethylated has a upregulated target gene. +#' @param diff.dir A character can be "hypo" or "hyper", showing differential +#' methylation direction in group 1. It can be "hypo" which means the probes are hypomethylated in group1; +#' "hyper" which means the probes are hypermethylated in group1; +#' This argument is used only when mode is supervised nad +#' it should be the same value from get.diff.meth function. +#' @param dir.out A path specify the directory for outputs. Default is current directory +#' @param label A character labels the outputs. +#' @param save Two files will be saved if save is true: getPair.XX.all.pairs.statistic.csv +#' and getPair.XX.pairs.significant.csv (see detail). +#' @param addDistNearestTSS Calculated distance to the nearest TSS instead of gene distance. +#' Having to calculate the distance to nearest TSS will take some time. +#' @return Statistics for all pairs and significant pairs +#' @export +#' @author +#' Lijing Yao (creator: lijingya@usc.edu) +#' Tiago C Silva (maintainer: tiagochst@usp.br) +#' @references +#' Yao, Lijing, et al. "Inferring regulatory element landscapes and transcription +#' factor networks from cancer methylomes." Genome biology 16.1 (2015): 1. +#' @examples +#' data <- ELMER:::getdata("elmer.data.example") +#' nearGenes <- GetNearGenes(TRange=getMet(data)[c("cg00329272","cg10097755"),], +#' geneAnnot=getExp(data)) +#' Hypo.pair <- get.pair(data=data, +#' nearGenes=nearGenes, +#' permu.size=5, +#' group.col = "definition", +#' group1 = "Primary solid Tumor", +#' group2 = "Solid Tissue Normal", +#' raw.pvalue = 0.2, +#' Pe = 0.2, +#' dir.out="./", +#' label= "hypo") +#' +#' Hypo.pair <- get.pair(data = data, +#' nearGenes = nearGenes, +#' permu.size = 5, +#' raw.pvalue = 0.2, +#' Pe = 0.2, +#' dir.out = "./", +#' diffExp = TRUE, +#' group.col = "definition", +#' group1 = "Primary solid Tumor", +#' group2 = "Solid Tissue Normal", +#' label = "hypo") +get.pair <- function(data, + nearGenes, + minSubgroupFrac = 0.4, + permu.size = 10000, + permu.dir = NULL, + raw.pvalue = 0.001, + Pe = 0.001, + mode = "unsupervised", + diff.dir = NULL, + dir.out = "./", + diffExp = FALSE, + group.col, + group1 = NULL, + group2 = NULL, + cores = 1, + correlation = "negative", + filter.probes = TRUE, + filter.portion = 0.3, + filter.percentage = 0.05, + label = NULL, + addDistNearestTSS = FALSE, + save = TRUE){ + + if(is.character(nearGenes)){ + nearGenes <- get(load(nearGenes)) + } + + # if(!all(c("ID", "GeneID", "Symbol" ) %in% colnames(nearGenes))) + # stop("nearGenes does not have one of the expected columns: ID, GeneID, Symbol") + + if(diffExp & missing(group.col)) + stop("Please set group.col argument to test whether putative target gene are differentially expressed between two groups.") + + if(missing(group.col)) stop("Please set group.col argument") + if(missing(group1)) stop("Please set group1 argument") + if(missing(group2)) stop("Please set group2 argument") + data <- data[,colData(data)[,group.col] %in% c(group1, group2)] + + # Supervised groups + unmethylated <- methylated <- NULL + if(mode == "supervised"){ + if(is.null(diff.dir)) stop("For supervised mode please set diff.dir argument (same from the get.diff.meth)") + if(diff.dir == "hypo"){ + message("Using pre-defined groups. U (unmethylated): ",group1,", M (methylated): ", group2) + unmethylated <- which(colData(data)[,group.col] == group1) + methylated <- which(colData(data)[,group.col] == group2) + } else { + message("Using pre-defined groups. U (unmethylated): ",group2,", M (methylated): ", group1) + unmethylated <- which(colData(data)[,group.col] == group2) + methylated <- which(colData(data)[,group.col] == group1) + } + } else { + message("Selecting U (unmethylated) and M (methylated) groups. Each groups has ", minSubgroupFrac * 50,"% of samples") + } + # Paralellization code + parallel <- FALSE + if (cores > 1){ + if (cores > detectCores()) cores <- detectCores() + registerDoParallel(cores) + parallel = TRUE + } + + if(filter.probes) data <- preAssociationProbeFiltering(data, K = filter.portion, percentage = filter.percentage) + + met <- assay(getMet(data)) + # Probes that were removed from the last steps cannot be verified + nearGenes <- nearGenes[nearGenes$ID %in% rownames(met),] + + if(nrow(nearGenes) == 0) { + message("No probes passed the preAssociationProbeFiltering filter") + return(NULL) + } + exp <- assay(getExp(data)) + message("Calculating Pp (probe - gene) for all nearby genes") + Probe.gene <- adply(.data = unique(nearGenes$ID), + .margins = 1, + .fun = function(x) { + Stat.nonpara( + Probe = x, + Meths = met[x,], + methy = methylated, + unmethy = unmethylated, + NearGenes = as.data.frame(nearGenes), + correlation = correlation, + Top = minSubgroupFrac/2, # Each group will have half of the samples + Exps = exp + ) + }, + .progress = "time", + .parallel = parallel, + .id = NULL, + .paropts = list(.errorhandling = 'pass') + ) + + rownames(Probe.gene) <- paste0(Probe.gene$Probe,".",Probe.gene$GeneID) + Probe.gene <- Probe.gene[!is.na(Probe.gene$Raw.p),] + + if(save) { + dir.create(dir.out, showWarnings = FALSE) + file <- sprintf("%s/getPair.%s.all.pairs.statistic.csv",dir.out, ifelse(is.null(label),"",label)) + write_csv(Probe.gene,file = file) + message(paste("File created:", file)) + } + + Probe.gene <- Probe.gene[Probe.gene$Raw.p < raw.pvalue,] + Probe.gene <- Probe.gene[order(Probe.gene$Raw.p),] + selected <- Probe.gene + if(nrow(selected) == 0) { + message(paste("No significant pairs were found for pvalue =", raw.pvalue)) + return(selected) + } + + + # Probe.gene$logRaw.p <- -log10(Probe.gene$Raw.p) + if(mode == "unsupervised"){ + GeneID <- unique(Probe.gene[,"GeneID"]) + message(paste("Calculating Pr (random probe - gene). Permutating ", permu.size, "probes for", length(GeneID), "genes")) + # get permutation + permu <- get.permu(data, + geneID = GeneID, + percentage = minSubgroupFrac / 2, + rm.probes = unique(nearGenes$ID), + methy = methylated, + unmethy = unmethylated, + correlation = correlation, + permu.size = permu.size, + permu.dir = permu.dir, + cores = cores) + # Get empirical p-value + Probe.gene.Pe <- Get.Pvalue.p(Probe.gene,permu) + + if(save) write_csv(Probe.gene.Pe, + file = sprintf("%s/getPair.%s.pairs.statistic.with.empirical.pvalue.csv",dir.out, + ifelse(is.null(label),"",label))) + # Pe will always be 1 for the supervised mode. As the test exp(U) > exp(M) will always be doing the same comparison. + selected <- Probe.gene.Pe[Probe.gene.Pe$Pe < Pe & !is.na(Probe.gene.Pe$Pe),] + } else { + Probe.gene$FDR <- p.adjust(Probe.gene$Raw.p,method = "BH") + if(save) write_csv(Probe.gene, + file=sprintf("%s/getPair.%s.pairs.statistic.with.empirical.pvalue.csv",dir.out, + ifelse(is.null(label),"",label))) + selected <- Probe.gene[Probe.gene$FDR < Pe,] + } + + # Change distance from gene to nearest TSS + if(addDistNearestTSS) { + selected$Distance <- NULL + selected <- addDistNearestTSS(data, NearGenes = selected) + } + if(diffExp){ + message("Calculating differential expression between two groups") + Exp <- assay(getExp(data)[unique(selected$GeneID),]) + groups <- unique(colData(data)[,group.col]) + prefix <- paste(gsub("[[:punct:]]| ", ".", groups),collapse = ".vs.") + log.col <- paste0("log2FC_",prefix) + diff.col <- paste0(prefix,".diff.pvalue") + idx1 <- colData(data)[,group.col] == groups[1] + idx2 <- colData(data)[,group.col] == groups[2] + out <- adply(.data = split(Exp,rownames(Exp)), .margins = 1, + .fun = function(x) { + test <- t.test(x = x[idx1],y = x[idx2]) + out <- data.frame("log2FC" = test$estimate[1] - test$estimate[2], + "diff.pvalue" = test$p.value) + }, + .progress = "time", + .parallel = parallel, + .id = "GeneID", + .paropts = list(.errorhandling = 'pass') + ) + add <- out[match(selected$GeneID, out$GeneID),c("log2FC","diff.pvalue")] + colnames(add) <- c(log.col,diff.col) + selected <- cbind(selected, add) + } + if(save) write_csv(selected,file = sprintf("%s/getPair.%s.pairs.significant.csv",dir.out, ifelse(is.null(label),"",label))) + invisible(gc()) + return(selected) +} + +### permutation +#permu.size can be all which mean all the usable probes. +#' get.permu to generate permutation results for calculation of empirical P values for +#' each enhancer-gene linkage. +#' @description +#' get.permu is a function to use the same statistic model to calculate random enhancer-gene +#' pairs. Based on the permutation value, empirical P value can be calculated for the +#' real enhancer-gene pair (see reference). +#' @usage +#' get.permu(data, +#' geneID, +#' methy = NULL, +#' unmethy = NULL, +#' percentage = 0.2, +#' rm.probes = NULL, +#' correlation = "negative", +#' permu.size = 10000, +#' permu.dir = NULL, +#' cores = 1) +#' @param data A multiAssayExperiment with DNA methylation and Gene Expression data. See \code{\link{createMAE}} function. +#' @param geneID A vector lists the genes' ID. +#' @param rm.probes A vector lists the probes name. +#' @param correlation Type of correlation to identify. Default is negative: look for hypomethylation and increase target expression. +#' @param cores A interger which defines number of core to be used in parallel process. +#' Default is 1: don't use parallel process. +#' @param percentage A number ranges from 0 to 1 specifying the percentage of samples of group 1 and group 2 +#' groups used to link probes to genes. Default is 0.2. +#' @param permu.size A number specify the times of permuation. Default is 10000. +#' @param permu.dir A path where the output of permuation will be. +#' @param methy Index of M (methylated) group. +#' @param unmethy Index of U (unmethylated) group. +#' @return Permutations +#' @importFrom plyr alply +#' @importFrom doParallel registerDoParallel +#' @author +#' Lijing Yao (creator: lijingya@usc.edu) +#' Tiago C Silva (maintainer: tiagochst@usp.br) +#' @references +#' Yao, Lijing, et al. "Inferring regulatory element landscapes and transcription +#' factor networks from cancer methylomes." Genome biology 16.1 (2015): 1. +#' @note +#' Permutation is the most time consuming step. It is recommended to use multiple +#' cores for this step. Default permutation time is 1000 which may need 12 hrs by 4 cores. +#' However 10,000 permutations is recommended to get high confidence results. But it may cost 2 days. +#' @export +#' @examples +#' data <- ELMER:::getdata("elmer.data.example") +#' permu <-get.permu(data = data, +#' geneID=rownames(getExp(data)), +#' rm.probes=c("cg00329272","cg10097755"), +#' permu.size=5) +get.permu <- function(data, + geneID, + methy = NULL, + unmethy = NULL, + percentage = 0.2, + rm.probes = NULL, + correlation = "negative", + permu.size = 10000, + permu.dir = NULL, + cores = 1){ + + ## get usable probes + usable.probes <- names(getMet(data)) + usable.probes <- usable.probes[!usable.probes %in% rm.probes] + if(length(usable.probes) < permu.size) + stop(sprintf("There is no enough usable probes (%s) to perform %s time permutation, + set a smaller permu.size.",length(usable.probes),permu.size)) + if(!is.numeric(permu.size)) permu.size <- length(usable.probes) + + # Desire for reproducible results + set.seed(200) + probes.permu <- sample(usable.probes, size = permu.size, replace = FALSE) + + parallel <- FALSE + if (cores > 1){ + if (cores > detectCores()) cores <- detectCores() + registerDoParallel(cores) + parallel = TRUE + } + + + # We have two cases to consider: + # 1) Permutation was not done before + # 2) It was done before + # 2.a) We have more probes to evaluate + # 2.b) We have more genes to evaluate + # 2.c) More genes and more probes + # 2.d) No more genes or probes + # For 1) just do for all genes and probes + # For 2 a-c do it for new probes, then do for new genes for all probes + # For 2.d just subset + permu <- NULL + tmp.probes <- probes.permu + tmp.genes <- geneID + missing.genes <- NULL + # Check if it isCase 2: Permutation already done + file <- file.path(permu.dir,"permu.rda") + if (!is.null(permu.dir)) { + if (file.exists(file)) { + temp.space <- new.env() + permu.file <- get(load(file, temp.space), temp.space) + rm(temp.space) + # Does the probe really exists ? + permu.file <- permu.file[,colnames(permu.file) %in% rownames(getMet(data))] + tmp.probes <- probes.permu[!probes.permu %in% colnames(permu.file)] + if(!all(geneID %in% rownames(permu.file))) { + tmp.genes <- rownames(permu.file) + missing.genes <- geneID[!geneID %in% tmp.genes] + } + } + } + permu.meth <- assay(getMet(data)[tmp.probes,,drop=FALSE] ) + exp.data <- assay(getExp(data)) + + # Should Exps=exp.data[geneID,] to improve performance ? + # in that case For a second run we will need to look if gene is in the matrix and also probe + if(length(tmp.probes) > 0) { + exps <- exp.data[tmp.genes,,drop=FALSE] + permu <- alply(.data = tmp.probes, .margins = 1, + .fun = function(x) { + Stat.nonpara.permu( + Probe = x, + Meths = permu.meth[x,], + Gene = tmp.genes, + methy = methy, + correlation = correlation, + unmethy = unmethy, + Top = percentage, + Exps = exps)}, + .progress = "time", + .parallel = parallel, + .paropts = list(.errorhandling = 'pass') + ) + + permu <- sapply(permu, + function(x,geneID){ + x <- x[match(geneID,x[,1]),2] + }, + geneID=tmp.genes,simplify=FALSE) + + permu <- do.call(cbind,permu) + rownames(permu) <- tmp.genes + colnames(permu) <- tmp.probes + } + + if(!is.null(permu) & length(file) > 0) { + if(file.exists(file)){ + # Put genes in the same order before rbind it + permu.file <- permu.file[match(rownames(permu),rownames(permu.file)),,drop=FALSE] + permu <- cbind(permu, permu.file) + } + } else if(is.null(permu) & length(file) > 0) { + permu <- permu.file + } + + # For the missing genes calculate for all probes + if(length(missing.genes) > 0) { + # Get all probes + permu.meth <- assay(getMet(data)[colnames(permu),] ) + exps <- exp.data[missing.genes,,drop=FALSE] + permu.genes <- alply(.data = colnames(permu), .margins = 1, + .fun = function(x) { + Stat.nonpara.permu( + Probe = x, + Meths = permu.meth[x,], + Gene = missing.genes, + Top = percentage, + methy = methy, + unmethy = unmethy, + Exps = exps)}, + .progress = "time", + .parallel = parallel, + .paropts = list(.errorhandling = 'pass') + ) + + permu.genes <- sapply(permu.genes, + function(x,geneID){ + x <- x[match(geneID,x[,1]),2] + }, + geneID=missing.genes,simplify=FALSE) + + permu.genes <- do.call(cbind,permu.genes) + rownames(permu.genes) <- missing.genes + colnames(permu.genes) <- colnames(permu) + # Adding new genes + # Make sure probes are in the same order + permu.genes <- permu.genes[,match(colnames(permu.genes),colnames(permu))] + permu <- rbind(permu,permu.genes) + } + + if(!is.null(permu.dir) & !is.null(permu)) { + dir.create(permu.dir, showWarnings = FALSE, recursive = TRUE) + save(permu,file = file.path(permu.dir,"permu.rda"), compress = "xz") + } + permu <- permu[geneID,probes.permu, drop = FALSE] + return(permu) +} + +#'promoterMeth +#' @title +#' promoterMeth to calculate associations of gene expression with DNA methylation +#' at promoter regions +#' @description +#' promoterMeth is a function to calculate associations of gene expression with DNA methylation +#' at promoter regions. +#' @usage +#' promoterMeth(data, sig.pvalue = 0.01, minSubgroupFrac = 0.4, +#' upstream = 200, downstream = 2000, save = TRUE, cores = 1) +#'@param data A Multi Assay Experiment object with DNA methylation and +#' gene expression Summarized Experiment objects +#'@param sig.pvalue A number specifies significant cutoff for gene silenced by promoter +#' methylation. Default is 0.01. P value is raw P value without adjustment. +#' @param minSubgroupFrac A number ranging from 0 to 1 +#' specifying the percentage of samples used to create the groups U (unmethylated) +#' and M (methylated) used to link probes to genes. +#' Default is 0.4 (lowest quintile of all samples will be in the +#' U group and the highest quintile of all samples in the M group). +#' @param upstream Number of bp upstream of TSS to consider as promoter region +#' @param downstream Number of bp downstream of TSS to consider as promoter region +#'@param cores Number of cores to be used in paralellization. Default 1 (no paralellization) +#' @param save A logic. If it is true, the result will be saved. +#' @importFrom GenomicRanges promoters +#' @importFrom utils write.csv +#' @return A data frame contains genes whose expression significantly anti-correlated +#' with promoter methylation. +#' @examples +#' \dontrun{ +#' data(elmer.data.example.promoter) +#' Gene.promoter <- promoterMeth(mae.promoter) +#' } +#' @export +promoterMeth <- function(data, + sig.pvalue = 0.01, + minSubgroupFrac = 0.4, + upstream = 200, + downstream = 2000, + save = TRUE, + cores = 1){ + # stop("For the moment, this function was depreciated.") + parallel <- FALSE + if (cores > 1){ + if (cores > detectCores()) cores <- detectCores() + registerDoParallel(cores) + parallel = TRUE + } + + message("Calculating associations of gene expression with DNA methylation at promoter regions") + + # get +-2KB around TSS + TSS_2K <- promoters(rowRanges(getExp(data)), upstream = upstream, downstream = downstream) + + # get probes overlapping those promoter regions + probes <- rowRanges(getMet(data)) + overlap <- findOverlaps(probes, TSS_2K) + + # make data frame with probe and gene + df <- data.frame( + Probe = as.character(names(probes)[queryHits(overlap)]), + GeneID = TSS_2K$ensembl_gene_id[subjectHits(overlap)], + stringsAsFactors = FALSE + ) + + # no results ? + if(nrow(df)==0){ + out <- data.frame(GeneID=c(), Symbol=c(), Raw.p= c()) + } else { + df <- unique(df) + ProbeInTSS <- split(df$Probe,df$GeneID) + + message("Calculating average DNA methylation for probes near the same TSS") + + # calculate average methylation of promoter (if promoter has several probes) + met <- assay(getMet(data)) + Gene.promoter <- lapply( + ProbeInTSS, + function(x, METH){ + meth <- METH[x,] + if(length(x)>1){ + meth <- colMeans(meth,na.rm=TRUE) + } + return(meth) + }, + METH = met + ) + + Gene.promoter <- do.call(rbind, Gene.promoter) + + ## make fake NearGene + Fake <- data.frame( + Symbol = values(getExp(data))[values(getExp(data))$ensembl_gene_id %in% rownames(Gene.promoter),"external_gene_name"], + GeneID = values(getExp(data))[values(getExp(data))$ensembl_gene_id %in% rownames(Gene.promoter),"ensembl_gene_id"], + Distance = 1, + Side = 1, + stringsAsFactors = FALSE + ) + Fake <- split(Fake, Fake$GeneID) + exps <- assay(getExp(data)) + + message("Calculating Pp (probe - gene) for all nearby genes") + out <- adply( + .data = rownames(Gene.promoter), .margins = 1, + .fun = function(x) { + Stat.nonpara( + Probe = x, + Meths = Gene.promoter[x,,drop = FALSE], + NearGenes = as.data.frame(Fake), + Top = minSubgroupFrac/2, + Exps = exps + ) + }, + .progress = "time", + .parallel = parallel, + .id = NULL, + .paropts = list(.errorhandling = 'pass') + ) + + out <- out[,c("GeneID","Symbol","Raw.p")] + + if(save) { + write.csv( + x = out, + file = "Genes_all_anticorrelated_promoter_methylation.csv", + row.names = FALSE + ) + } + out <- out[out$Raw.p < sig.pvalue & !is.na(out$Raw.p),] + } + if(nrow(out) == 0) message("No assossiation was found") + + if(save) { + write.csv( + x = out, + file = "Genes_significant_anticorrelated_promoter_methylation.csv", + row.names = FALSE + ) + } + return(out) +} +#' get.enriched.motif to identify the overrepresented motifs in a set of probes (HM450K) regions. +#' @description +#' get.enriched.motif is a function make use of Probes.motif data from \pkg{ELMER.data} +#' package to calculate the motif enrichment Odds Ratio and 95\% confidence interval for +#' a given set of probes using fisher test function, after performing the Fisher's exact test, +#' the results for all transcription factors are corrected for multiple testing with the Benjamini-Hochberg procedure. +#' If save is TURE, two output files will be saved: +#' getMotif.XX.enriched.motifs.rda and getMotif.XX.motif.enrichment.csv (see detail). +#' @usage +#' get.enriched.motif(data, probes.motif, probes, min.motif.quality = "DS", +#' background.probes, pvalue = 0.05, lower.OR = 1.1, min.incidence = 10, +#' dir.out = "./", label = NULL, save = TRUE, plot.title="") +#' @param data A multi Assay Experiment from \code{\link{createMAE}} function. +#' If set and probes.motif/background probes are missing this will be used to get +#' this other two arguments correctly. This argument is not require, you can set probes.motif and +#' the backaground.probes manually. +#' @param probes.motif A matrix contains motifs occurrence within probes regions. Probes.motif in +#' \pkg{ELMER.data} will be used if probes.motif is missing (detail see Probes.motif.hg19.450K in ELMER.data). +#' @param probes A vector lists the name of probes to define the set of probes in which motif enrichment +#' OR and confidence interval will be calculated. +#' @param background.probes A vector lists name of probes which are considered as +#' background for motif.enrichment calculation (see detail). +#' @param lower.OR A number specifies the smallest lower boundary of 95\% confidence interval for Odds Ratio. +#' The motif with higher lower boudnary of 95\% confidence interval for Odds Ratio than the number +#' are the significantly enriched motifs (detail see reference). +#' @param min.incidence A non-negative integer specifies the minimum incidence of motif in the given probes set. +#' 10 is default. +#' @param pvalue FDR P-value cut off (default 0.05) +#' @param min.motif.quality Minimum motif quality score to consider. +#' Possible valules: A, B, C , D, AS (A and S), BS (A, B and S), CS (A, B , C and S), DS (all - default) +#' Description: Each PWM has a quality rating from A to D where +#' A represents motifs with the highest confidence, and D motifs only weakly describe the pattern with a +#' limited applications for quantitative analyses. +#' Special S quality marks the single-box motifs (secondary motif). +#' Source: http://hocomoco.autosome.ru/help#description_quality_score +#' More information: \url{http://nar.oxfordjournals.org/content/44/D1/D116.full#sec-8} +#' @param dir.out A path. Specifies the directory for outputs. Default is current directory +#' @param label A character. Labels the outputs such as "hypo", "hyper" +#' @param save If save is TURE, two files will be saved: getMotif.XX.enriched.motifs.rda and +#' getMotif.XX.motif.enrichment.csv (see detail). +#' @param plot.title Plot title. Default: no title. +#' @return A list contains enriched motifs with the probes regions harboring the motif. +#' @export +#' @details +#' background.probes: +#' For enhancer study, it is better to use probes within distal enhancer probes as +#' background.probes. For promoter study, it is better to use probes within promoter +#' regions as background.probes. Because enhancer and promoter have different CG content +#' and harbors different clusters of TFs motif. +#' +#' save: +#' if save is TRUE, two files will be save on the disk. The first file is +#' getMotif.XX.motif.enrichment.csv (XX depends on option label). This file reports +#' the Odds Ratio and 95\% confidence interval for these Odds Ratios which pass the +#' significant cutoff (lower.OR and min.incidence). The second file is +#' getMotif.XX.enriched.motifs.rda (XX depends on option lable). This file contains +#' a list R object with enriched motifs as name and probes containing the enriched +#' motif as contents. This object will be used in \code{\link{get.TFs}} function. +#' if save is FALSE, the function will return a R object which is the same with second file. +#'@return A list (R object) with enriched motifs as name and probes containing the enriched +#' motif as contents. And hypo.motif.enrichment.pdf plot will be generated. +#' @author +#' Lijing Yao (creator: lijingya@usc.edu) +#' @importFrom magrittr divide_by multiply_by %>% add +#' @importFrom plyr alply +#' @importFrom utils data read.csv +#' @importFrom S4Vectors metadata +#' @importFrom stats fisher.test cor.test +#' @importFrom dplyr filter +#' @importFrom Matrix colMeans colSums +#' @importFrom IRanges ranges +#' @import ELMER.data +#' @references +#' Yao, Lijing, et al. "Inferring regulatory element landscapes and transcription +#' factor networks from cancer methylomes." Genome biology 16.1 (2015): 1. +#' @examples +#' probes <- c("cg00329272","cg10097755","cg08928189", "cg17153775","cg21156590", +#' "cg19749688","cg12590404","cg24517858","cg00329272","cg09010107", +#' "cg15386853", "cg10097755", "cg09247779","cg09181054","cg19371916") +#' data <- tryCatch(ELMER:::getdata("elmer.data.example"), error = function(e) { +#' message(e) +#' data(elmer.data.example, envir = environment()) +#' }) +#' bg <- rownames(getMet(data)) +#' data(Probes.motif.hg38.450K,package = "ELMER.data") +#' enriched.motif <- get.enriched.motif( +#' probes.motif = Probes.motif.hg38.450K, +#' probes = probes, +#' background.probes = bg, +#' pvalue = 1, +#' min.incidence = 2, +#' label = "hypo" +#' ) +#' # If the MAE is set, the background and the probes.motif will +#' # be automatically set +#' enriched.motif <- get.enriched.motif( +#' data = data, +#' min.motif.quality = "DS", +#' probes=probes, +#' pvalue = 1, +#' min.incidence=2, +#' label="hypo" +#' ) +get.enriched.motif <- function( + data, + probes.motif, + probes, + min.motif.quality = "DS", + background.probes, + pvalue = 0.05, + lower.OR = 1.1, + min.incidence = 10, + dir.out="./", + label = NULL, + save = TRUE, + plot.title = "" +){ + # create output directory if it does not exists + dir.create(dir.out,showWarnings = FALSE, recursive = TRUE) + + if(missing(probes.motif)){ + if(missing(data)) stop("Please set probes.motif argument. See ELMER data") + file <- paste0("Probes.motif.",metadata(data)$genome,".",metadata(data)$met.platform) + message("Loading object: ",file) + if(file == "Probes.motif.hg38.450K") probes.motif <- getdata("Probes.motif.hg38.450K") + if(file == "Probes.motif.hg19.450K") probes.motif <- getdata("Probes.motif.hg19.450K") + if(file == "Probes.motif.hg38.EPIC") probes.motif <- getdata("Probes.motif.hg38.EPIC") + if(file == "Probes.motif.hg19.EPIC") probes.motif <- getdata("Probes.motif.hg19.EPIC") + } + all.probes.TF <- probes.motif + ## here need to be add motif search part. + if(missing(probes)) stop("probes option should be specified.") + if(missing(background.probes)){ + if(!missing(data)) { + background.probes <- as.character(names(getMet(data))) + } else if(file.exists(sprintf("%s/probeInfo_feature_distal.rda",dir.out))){ + background.probes <- get(load(sprintf("%s/probeInfo_feature_distal.rda",dir.out))) + background.probes <- as.character(names(background.probes)) + } else { + message("backaground.probes argument is missing. We will use all probes as background, ", + "but for enhancer study, it is better to use probes within distal enhancer probes as background.probes.") + background.probes <- rownames(all.probes.TF) + } + } + probes <- unique(probes) # A probe should not be considered more than one time + if(length(probes) < min.incidence) stop("Number of unique prober is smaller than the min.incidence required") + background.probes <- background.probes[background.probes %in% rownames(all.probes.TF)] + bg.probes.TF <- all.probes.TF[background.probes,] + bg.Probes.TF.percent <- Matrix::colMeans(bg.probes.TF) # This is equal to: c/(c+d) + + ## load probes for enriched motif ---------------------------------------------- + probes.TF <- all.probes.TF[rownames(all.probes.TF) %in% probes,] + probes.TF.num <- Matrix::colSums(probes.TF, na.rm = TRUE) + # a is the number of probes within the selected probe set that contain one or more motif occurrences; + # b is the number of probes within the selected probe set that do not contain a motif occurrence; + # c and d are the same counts within the entire enhancer probe set (background) + # lower boundary of 95% conf idence interval = exp (ln OR - SD) + a <- Matrix::colSums(probes.TF) + b <- nrow(probes.TF) - Matrix::colSums(probes.TF) + c <- Matrix::colSums(bg.probes.TF ) + d <- nrow(bg.probes.TF) - Matrix::colSums(bg.probes.TF) + fisher <- plyr::adply(seq_len(length(a)),.margins = 1, .fun = function(i) { + x <- fisher.test(matrix(c(a[i],b[i],c[i],d[i]),nrow = 2,ncol = 2)) + ret <- data.frame(x$conf.int[1],x$conf.int[2],x$estimate,x$p.value) + colnames(ret) <- c("lowerOR","upperOR","OR","p.value") + ret + },.id = NULL, + .progress = "time") + rownames(fisher) <- names(a) + Summary <- data.frame( + motif = names(a), + NumOfProbes = probes.TF.num, + PercentageOfProbes = probes.TF.num/length(probes), + fisher, + FDR = p.adjust(fisher$p.value,method = "BH"), + stringsAsFactors = FALSE + ) + hocomoco <- getHocomocoTable() + family.class <- hocomoco[,c("Model",grep("family",colnames(hocomoco),value = T))] + Summary <- merge(Summary,family.class, by.x = "motif",by.y = "Model") + Summary <- Summary[order(Summary$lowerOR, decreasing = TRUE),] + + if(save) { + write_csv( + x = Summary, + file = sprintf("%s/getMotif.%s.motif.enrichment.csv", + dir.out,ifelse(is.null(label),"",label)) + ) + } + + ## enriched motif and probes + en.motifs <- as.character(Summary[Summary$lowerOR > lower.OR & Summary$NumOfProbes > min.incidence & Summary$FDR <= pvalue,"motif"]) + + # Subset by quality + print.header("Filtering motifs based on quality", "subsection") + message("Number of enriched motifs with quality:") + message("-----------") + for(q in c("A","B","C","D","S")) message(paste0(" => ",q,": ", length(grep(paste0("\\.",q),en.motifs)))) + message("-----------") + + en.motifs <- grep(paste0("\\.[A-",toupper(min.motif.quality),"]"), en.motifs, value = T) + message("Considering only motifs with quality from A up to ", min.motif.quality,": ",length(en.motifs)," motifs are enriched.") + enriched.motif <- alply( + en.motifs, + function(x, probes.TF) { + rownames(probes.TF[probes.TF[,x] == 1, x, drop = FALSE]) + }, + probes.TF = probes.TF,.margins = 1, .dims = FALSE + ) + attributes(enriched.motif) <- NULL + names(enriched.motif) <- en.motifs + + if(save) { + save(enriched.motif, + file = sprintf( + "%s/getMotif.%s.enriched.motifs.rda", + dir.out, + ifelse(is.null(label),"",label)) + ) + } + + ## make plot + if(length(enriched.motif) > 1){ + suppressWarnings({ + P <- motif.enrichment.plot( + motif.enrichment = filter(Summary,grepl(paste0("\\.[A-",toupper(min.motif.quality),"]"), Summary$motif)), + significant = list(NumOfProbes = min.incidence, lowerOR = lower.OR), + dir.out = dir.out, + label = paste0(ifelse(is.null(label),"",label),".quality.A-",toupper(min.motif.quality)), + save = TRUE + ) + P <- motif.enrichment.plot( + motif.enrichment = filter(Summary,grepl(paste0("\\.[A-",toupper(min.motif.quality),"]"), Summary$motif)), + significant = list(lowerOR = lower.OR), + dir.out = dir.out, + summary = TRUE, + label = paste0(label,".quality.A-",toupper(min.motif.quality),"_with_summary"), + title = plot.title, + save = TRUE + ) + }) + } + ## add information to siginificant pairs + sig.pair.file <- sprintf("%s/getPair.%s.pairs.significant.csv",dir.out, ifelse(is.null(label),"",label)) + if(file.exists(sig.pair.file)){ + print.header("Adding enriched motifs to significant pairs file") + sig.Pairs <- readr::read_csv(file = sig.pair.file, col_types = readr::cols()) + sig.Pairs <- sig.Pairs[sig.Pairs$Probe %in% rownames(probes.TF),] + if(all(unique(sig.Pairs$Probe) %in% rownames(probes.TF))){ + motif.Info <- sapply(sig.Pairs$Probe, + function(x, probes.TF,en.motifs){ + TFs <- names(probes.TF[x,probes.TF[x,] == 1]) + non.en.motif <- paste(setdiff(TFs,en.motifs),collapse = ";") + en.motif <- paste(intersect(TFs,en.motifs), collapse = ";") + out <- data.frame( + non_enriched_motifs = non.en.motif, + enriched_motifs = en.motif, + stringsAsFactors = FALSE + ) + return(out) + }, + probes.TF = probes.TF, + en.motifs = en.motifs, + simplify = FALSE) + motif.Info <- do.call(rbind,motif.Info) + sig.Pairs <- cbind(sig.Pairs, motif.Info) + + if(!missing(data)){ + message("Adding coordinates for probes and genes from the provided data") + met.coord <- as.data.frame(ranges(getMet(data))) + met.coord$width <- NULL + genes.coord <- as.data.frame(ranges(getExp(data))) + genes.coord$width <- NULL + colnames(met.coord) <- paste0("probe_",colnames(met.coord)) + colnames(met.coord)[3] <- "Probe" + colnames(genes.coord) <- paste0("gene_",colnames(met.coord)) + colnames(genes.coord)[3] <- "GeneID" + sig.Pairs <- left_join(sig.Pairs,genes.coord) %>% left_join(met.coord) + } + + out.file <- sprintf("%s/getPair.%s.pairs.significant.withmotif.csv",dir.out, ifelse(is.null(label),"",label)) + message("Saving file: ", out.file) + write_csv(sig.Pairs, file = out.file) + } + } + + return(enriched.motif) +} + +#' get.TFs to identify regulatory TFs. +#' @description +#' get.TFs is a function to identify regulatory TFs based on motif analysis and association analysis +#' between the probes containing a particular motif and expression of all known TFs. If save is true, +#' two files will be saved: getTF.XX.significant.TFs.with.motif.summary.csv and getTF.hypo.TFs.with.motif.pvalue.rda (see detail). +#' @usage +#' get.TFs(data, +#' enriched.motif, +#' TFs, +#' group.col, +#' group1, +#' group2, +#' mode = "unsupervised", +#' correlation = "negative", +#' diff.dir = NULL, +#' motif.relevant.TFs, +#' minSubgroupFrac = 0.4, +#' dir.out = "./", +#' label = NULL, +#' save.plots = FALSE, +#' cores = 1, +#' topTFper = 0.05, +#' save = TRUE) +#' @param data A multiAssayExperiment with DNA methylation and Gene Expression data. See \code{\link{createMAE}} function. +#' @param enriched.motif A list containing output of get.enriched.motif function or a path of XX.rda file containing output of get.enriched.motif function. +#' @param TFs A data.frame containing TF GeneID and Symbol or a path of XX.csv file containing TF GeneID and Symbol. +#' If missing, human.TF list will be used (human.TF data in ELMER.data). +#' For detail information, refer the reference paper. +#' @param motif.relevant.TFs A list containing motif as names and relavent TFs as contents +#' for each list element or a path of XX.rda file containing a list as above. +#' If missing, motif.relavent.TFs will be used (motif.relavent.TFs data in ELMER.data). +#' For detail information, refer the reference paper. +#' @param minSubgroupFrac A number ranging from 0 to 1 +#' specifying the percentage of samples used to create the groups U (unmethylated) +#' and M (methylated) used to link probes to TF expression. +#' Default is 0.4 (lowest quintile of all samples will be in the +#' U group and the highest quintile of all samples in the M group). +#' @param cores A interger which defines the number of cores to be used in parallel process. Default is 1: no parallel process. +#' @param dir.out A path specifies the directory for outputs of get.pair function. Default is current directory +#' @param label A character labels the outputs. +#' @param save A logic. If save is ture, two files will be saved: getTF.XX.significant.TFs.with.motif.summary.csv and +#' getTF.hypo.TFs.with.motif.pvalue.rda (see detail). If save is false, a data frame contains the same content with the first file. +#' @param group.col A column defining the groups of the sample. You can view the +#' available columns using: colnames(MultiAssayExperiment::colData(data)). +#' @param group1 A group from group.col. ELMER will run group1 vs group2. +#' That means, if direction is hyper, get probes +#' hypermethylated in group 1 compared to group 2. +#' @param group2 A group from group.col. ELMER will run group1 vs group2. +#' That means, if direction is hyper, get probes +#' hypermethylated in group 1 compared to group 2. +#' @param mode A character. Can be "unsupervised" or "supervised". If unsupervised is set +#' the U (unmethylated) and M (methylated) groups will be selected +#' among all samples based on methylation of each probe. +#' Otherwise U group and M group will set as the samples of group1 or group2 as described below: +#' If diff.dir is "hypo, U will be the group 1 and M the group2. +#' If diff.dir is "hyper" M group will be the group1 and U the group2. +#' @param diff.dir A character can be "hypo" or "hyper", showing differential +#' methylation direction in group 1. It can be "hypo" which means the probes are hypomethylated in group1; +#' "hyper" which means the probes are hypermethylated in group1; +#' This argument is used only when mode is supervised nad +#' it should be the same value from get.diff.meth function. +#' @param topTFper Top ranked TF to be retrieved (default "0.05" - 5 percent) +#' @param correlation Type of correlation to evaluate (negative or positive). +#' Negative checks if hypomethylated is upregulated. Positive if hypermethylated is upregulated. +#' @param save.plots Create TF ranking plots ? +#' @export +#' @details +#' save: If save is ture, two files will be saved. The first file is getTF.XX.significant.TFs.with.motif.summary.csv (XX depends on option lable). +#' This file contain the regulatory TF significantly associate with average DNA methylation at particular motif sites. +#' The second file is getTF.hypo.TFs.with.motif.pvalue.rda (XX depends on option label). +#' This file contains a matrix storing the statistic results for significant associations between TFs (row) and average DNA methylation at motifs (column). +#' If save is false, a data frame which contains the same content with the first file will be reported. +#' @importFrom plyr ldply adply +#' @importFrom doParallel registerDoParallel +#' @importFrom stats na.omit +#' @importFrom parallel detectCores +#' @return +#' Potential responsible TFs will be reported in a dataframe with 4 columns: +#' \itemize{ +#' \item motif: the names of motif. +#' \item top.potential.TF.family: the highest ranking upstream TFs which are known recognized the motif. First item in potential.TFs.family +#' \item top.potential.TF.subfamily: the highest ranking upstream TFs which are known recognized the motif. First item in potential.TFs.subfamily +#' \item potential.TFs.family: TFs which are within top 5\% list and are known recognized the motif (considering family classification). +#' \item potential.TFs.subfamily: TFs which are within top 5\% list and are known recognized the motif (considering subfamily classification). +#' \item top_5percent: all TFs which are within top 5\% list. +#' } +#' @author +#' Lijing Yao (creator: lijingya@usc.edu) +#' Tiago C Silva (maintainer: tiagochst@usp.br) +#' @references +#' Yao, Lijing, et al. "Inferring regulatory element landscapes and transcription +#' factor networks from cancer methylomes." Genome biology 16.1 (2015): 1. +#' @examples +#' data <- tryCatch( +#' ELMER:::getdata("elmer.data.example"), +#' error = function(e) { +#' message(e) +#' data(elmer.data.example, envir = environment()) +#' }) +#' enriched.motif <- list( +#' "P53_HUMAN.H11MO.1.A"= c( +#' "cg00329272", "cg10097755", "cg08928189", +#' "cg17153775", "cg21156590", "cg19749688", "cg12590404", +#' "cg24517858", "cg00329272", "cg09010107", "cg15386853", +#' "cg10097755", "cg09247779", "cg09181054" +#' ) +#' ) +#' TF <- get.TFs( +#' data, +#' enriched.motif, +#' group.col = "definition", +#' group1 = "Primary solid Tumor", +#' group2 = "Solid Tissue Normal", +#' TFs = data.frame( +#' external_gene_name=c("TP53","TP63","TP73"), +#' ensembl_gene_id= c( +#' "ENSG00000141510", +#' "ENSG00000073282", +#' "ENSG00000078900" +#' ), +#' stringsAsFactors = FALSE +#' ), +#' label = "hypo" +#' ) +#' # This case will use Uniprot dabase to get list of Trasncription factors +#' TF <- get.TFs( +#' data, +#' group.col = "definition", +#' group1 = "Primary solid Tumor", +#' group2 = "Solid Tissue Normal", +#' enriched.motif, +#' label = "hypo" +#') +get.TFs <- function( + data, + enriched.motif, + TFs, + group.col, + group1, + group2, + mode = "unsupervised", + correlation = "negative", + diff.dir = NULL, + motif.relevant.TFs, + minSubgroupFrac = 0.4, + dir.out = "./", + label = NULL, + save.plots = FALSE, + cores = 1, + topTFper = 0.05, + save = TRUE +){ + if(missing(data)){ + stop("data argument is empty") + } + + if(missing(enriched.motif)){ + stop("enriched.motif is empty.") + } else if(is.character(enriched.motif)) { + enriched.motif <- get(load(enriched.motif)) # The data is in the one and only variable + } else if(!is.list(enriched.motif)) { + stop("enriched.motif option should be a list object.") + } + + if(length(enriched.motif) == 0) { + message("No enriched motifs were found in the last step") + return(NULL) + } + + if(missing(group.col)) stop("Please set group.col argument") + if(missing(group1)) stop("Please set group1 argument") + if(missing(group2)) stop("Please set group2 argument") + data <- data[,colData(data)[,group.col] %in% c(group1, group2)] + + # Supervised groups + unmethylated <- methylated <- NULL + if(mode == "supervised"){ + if(is.null(diff.dir)) stop("For supervised mode please set diff.dir argument (same from the get.diff.meth)") + if(diff.dir == "hypo"){ + message("Using pre-defined groups. U (unmethylated): ",group1,", M (methylated): ", group2) + unmethylated <- which(colData(data)[,group.col] == group1) + methylated <- which(colData(data)[,group.col] == group2) + } else { + message("Using pre-defined groups. U (unmethylated): ",group2,", M (methylated): ", group1) + unmethylated <- which(colData(data)[,group.col] == group2) + methylated <- which(colData(data)[,group.col] == group1) + } + } else { + message("Selecting U (unmethylated) and M (methylated) groups. Each groups has ", minSubgroupFrac * 50,"% of samples") + } + + if(missing(TFs)){ + # Here we will make some assumptions: + # TFs has a column Symbol + # data has the field external_gene_name which should be created by + # createMultAssayExperiment function + # external_gene_name is the default for hg38 in biomart + # external_gene_id is the default for hg19 in biomart + TFs <- getTF() + } else if(is.character(TFs)){ + TFs <- read_csv(TFs) + } + + if(missing(motif.relevant.TFs)){ + message("Accessing TF families from TFClass database to indentify known potential TF") + TF.family <- createMotifRelevantTfs() + TF.subfamily <- createMotifRelevantTfs("subfamily") + } else if(is.character(motif.relevant.TFs)){ + TF.family <- get(load(motif.relevant.TFs)) # The data is in the one and only variable + TF.subfamily <- TF.family + } + + parallel <- FALSE + if (cores > 1){ + if (cores > detectCores()) cores <- detectCores() + registerDoParallel(cores) + parallel = TRUE + } + dir.create(dir.out,showWarnings = FALSE, recursive = TRUE) + # This will calculate the average methylation at all motif-adjacent probes + message("Calculating the average methylation at all motif-adjacent probes ") + + motif.meth <- ldply( + enriched.motif, + function(x,meth){ + if(length(x)<2) { + return(meth[x,]) + } else { + return(colMeans(meth[x,],na.rm = TRUE)) + }}, meth = assay(getMet(data))[unique(unlist(enriched.motif)),,drop = FALSE], + .progress = "time", + .parallel = parallel, + .id = "rownames", + .paropts=list(.errorhandling='pass') + ) + rownames(motif.meth) <- motif.meth$rownames + motif.meth$rownames <- NULL + + # motif.meth matrix + # - rows: average methylation at all motif-adjacent probes (rownames will be the motif) + # - cols: each patient + + # rownames are ensemble gene id + TFs <- TFs[TFs$ensembl_gene_id %in% rownames(getExp(data)),] + gene <- TFs$ensembl_gene_id + gene.name <- TFs$external_gene_name # For plotting purposes + + # Definition: + # M group: 20% of samples with the highest average methylation at all motif-adjacent probes + # U group: 20% of samples with the lowest + + # The Mann-Whitney U test was used to test + # the null hypothesis that overall gene expression in group M was greater or equal + # than that in group U. + message("Performing Mann-Whitney U test") + + # For each motif (x) split the Meths object into U and M and evaluate the expression + # of all TF Exps (obj) + exps <- assay(getExp(data))[gene,] + TF.meth.cor <- alply( + .data = names(enriched.motif), .margins = 1, + .fun = function(x) { + Stat.nonpara.permu( + Probe = x, + Meths = motif.meth[x,] %>% as.numeric(), + Gene = gene, + correlation = correlation, + unmethy = unmethylated, + methy = methylated, + Top = minSubgroupFrac/2, + Exps = exps + )}, + .progress = "time", + .parallel = parallel, + .paropts = list(.errorhandling = 'pass') + ) + + # We are going to make a multiple hypothesis correction + TF.meth.cor <- lapply(TF.meth.cor, function(x){return(p.adjust(x$Raw.p,method = "BH"))}) + TF.meth.cor <- do.call(cbind,TF.meth.cor) + ## check row and col names + rownames(TF.meth.cor) <- gene.name + colnames(TF.meth.cor) <- names(enriched.motif) + TF.meth.cor <- na.omit(TF.meth.cor) + + # TF.meth.cor matrix with corrected p-value (Pr) + # - rows: TFs + # - cols: motifs + # lower corrected p-value means that TF expression in M group is lower than in + # U group. That means, the Unmethylated with more TF expression + # have a higher correlation. + + message("Finding potential TF and known potential TF") + + # For each motif evaluate TF + cor.summary <- adply( + colnames(TF.meth.cor), + function(x, TF.meth.cor, motif.relavent.TFs.family,motif.relavent.TFs.subfamily){ + cor <- rownames(TF.meth.cor)[sort(TF.meth.cor[,x],index.return = TRUE)$ix] + top <- cor[1:floor(topTFper * nrow(TF.meth.cor))] + if (any(top %in% motif.relavent.TFs.family[[x]])) { + potential.TF.family <- top[top %in% motif.relavent.TFs.family[[x]]] + } else { + potential.TF.family <- NA + } + if (any(top %in% motif.relavent.TFs.subfamily[[x]])) { + potential.TF.subfamily <- top[top %in% motif.relavent.TFs.subfamily[[x]]] + } else { + potential.TF.subfamily <- NA + } + out <- data.frame( + "motif" = x, + "top.potential.TF.family" = ifelse(!is.na(potential.TF.family[1]),potential.TF.family[1],NA), + "top.potential.TF.subfamily" = ifelse(!is.na(potential.TF.subfamily[1]),potential.TF.subfamily[1],NA), + "potential.TF.family" = ifelse(!any(sapply(potential.TF.family,is.na)),paste(potential.TF.family, collapse = ";"),NA), + "potential.TF.subfamily" = ifelse(!any(sapply(potential.TF.subfamily,is.na)),paste(potential.TF.subfamily, collapse = ";"),NA), + "top_5percent_TFs" = paste(top,collapse = ";"), + stringsAsFactors = FALSE + ) + return(out) + }, + TF.meth.cor = TF.meth.cor, + motif.relavent.TFs.family = TF.family, + motif.relavent.TFs.subfamily = TF.subfamily, + .progress = "time", + .parallel = parallel, + .margins = 1, + .id = NULL, + .paropts = list(.errorhandling = 'pass') + ) + rownames(cor.summary) <- cor.summary$motif + + if(save){ + save( + TF.meth.cor, + file = sprintf("%s/getTF.%s.TFs.with.motif.pvalue.rda",dir.out=dir.out, label=ifelse(is.null(label),"",label)) + ) + write_csv( + x = cor.summary, + file = sprintf( + "%s/getTF.%s.significant.TFs.with.motif.summary.csv", + dir.out = dir.out, label=ifelse(is.null(label),"",label) + ) + ) + } + + if(save.plots){ + print.header("Creating plots", "subsection") + message("TF rank plot highlighting TF in the same family (folder: ", sprintf("%s/TFrankPlot",dir.out),")") + dir.create(sprintf("%s/TFrankPlot",dir.out), showWarnings = FALSE, recursive = TRUE) + TF.rank.plot( + motif.pvalue = TF.meth.cor, + motif = colnames(TF.meth.cor), + dir.out = sprintf("%s/TFrankPlot",dir.out), + cores = cores, + save = TRUE + ) + } + return(cor.summary) +} + +#' @title Get TF target genes +#' @description This function uses ELMER analysis +#' results and summarizes the possible genes targets for each TF +#' @param pairs Output of get.pairs function: dataframe or file path +#' @param TF.result Output get.TF function: dataframe or file path +#' @param enriched.motif List of probes for each enriched motif: list of file path. +#' The file created by ELMER is getMotif...enriched.motifs.rda +#' @param dmc.analysis DMC results file or data frame +#' @param dir.out A path specifies the directory for outputs of get.pair function. Default is current directory +#' @param label A character labels the outputs. +#' @param cores Number of cores to be used in parallel +#' @param classification use family or subfamily classification to consider potential TF +#' @param mae A multiAssayExperiment outputed from createMAE function +#' @param save A logic. If save is true, a files will be saved: getTFtarget.XX..csv +#' If save is false, only a data frame contains the same content with the first file. +#' @export +#' @examples +#' pairs <- data.frame(Probe = c("cg26992600","cg26992800","cg26992900"), +#' Symbol = c("KEAP1","DSP","ATP86")) +#' enriched.motif <- list("FOXD3_HUMAN.H11MO.0.D"= c("cg26992800","cg26992900")) +#' TF.result <- data.frame(motif = c("FOXD3_HUMAN.H11MO.0.D"), +#' potential.TF.family = c("TP63;TP73")) +#' getTFtargets(pairs,enriched.motif,TF.result) +#' +#' \dontrun{ +#' getTFtargets("../LUAD_LUSC_analysis_hg38/hyper/getPair.hyper.pairs.significant.csv", +#' enriched.motif = "../LUAD_analysis_hg38/hyper/getMotif.hyper.enriched.motifs.rda", +#' TF.result = "../LUAD_analysis_hg38/hyper/getTF.hyper.significant.TFs.with.motif.summary.csv") +#' } +getTFtargets <- function( + pairs, + enriched.motif, + TF.result, + dmc.analysis, + mae, + save = TRUE, + dir.out = "./", + classification = "family", + cores = 1, + label = NULL +){ + if(is.character(pairs)) pairs <- readr::read_csv(pairs, col_types = readr::cols()) + if(is.character(enriched.motif)) load(enriched.motif) + if(is.character(TF.result)) TF.result <- readr::read_csv(TF.result, col_types = readr::cols()) + + # 1 - For each enriched motif we will select the known TF that binds + # to the region (using TF.table input) + # 2 - For each enriched region get the probes (using motif.probes input) + # 3 - the associates target genes (using the pairs input) + df.all <- NULL + for(m in TF.result$motif){ + targets <- as.character(pairs[pairs$Probe %in% enriched.motif[[m]],]$Symbol) + if(classification == "family"){ + x <- TF.result[TF.result$motif == m,,drop = FALSE]$potential.TF.family + } else { + x <- TF.result[TF.result$motif == m,,drop = FALSE]$potential.TF.subfamily + } + + if(is.na(x)) next + + x <- strsplit(as.character(x),";") %>% unlist + df <- expand.grid(x,targets) + colnames(df) <- c("TF","Target") + df.all <- rbind(df.all,df) + } + if(is.null(df.all)) return(NULL) + df.all <- df.all[!duplicated(df.all),,drop = FALSE] + df.all <- df.all[order(df.all$TF),,drop = FALSE] + + if(save) readr::write_csv( + x = df.all, + file = sprintf( + "%s/getTFtargets.%s.%s.csv", + dir.out=dir.out, + label=ifelse(is.null(label),"",label), + classification = classification + ) + ) + + if(!missing(dmc.analysis)) { + if(is.character(dmc.analysis)) dmc.analysis <- readr::read_csv(dmc.analysis, col_types = readr::cols()) + colnames(dmc.analysis) <- paste0("DMC_analysis_",colnames(dmc.analysis)) + colnames(dmc.analysis)[1] <- "Probe" + pairs <- merge(pairs,dmc.analysis,by = "Probe") + } + if(!missing(mae)){ + if(is.character(mae)) mae <- get(load(mae)) + # add genomic info to pairs + metadata <- as.data.frame(rowRanges(getMet(mae)[unique(pairs$Probe),])) + colnames(metadata) <- paste0("probe_", colnames(metadata)) + metadata$Probe <- rownames(metadata) + pairs <- merge(pairs, metadata, by = "Probe") + metadata <- as.data.frame(rowRanges(getExp(mae)[unique(pairs$GeneID),])) + pairs <- merge(pairs, metadata, by.x = "GeneID", by.y = "ensembl_gene_id") + + pairs$TF <- NA + + # to make it faster we will change the name of the enriched motifs to the mr TF binding to it + if(classification == "family"){ + names(enriched.motif) <- TF[which(TF$motif == names(enriched.motif)),]$potential.TF.family + } else { + names(enriched.motif) <- TF[which(TF$motif == names(enriched.motif)),]$potential.TF.subfamily + } + + # remove enriched motifs without any MR TFs + enriched.motif <- enriched.motif[!is.na(names(enriched.motif))] + + + parallel <- FALSE + if (cores > 1){ + if (cores > detectCores()) cores <- detectCores() + registerDoParallel(cores) + parallel = TRUE + } + + # For each paired probe get the enriched motifs in which it appears + aux <- plyr::adply(unique(pairs$Probe),1, function(x) { + y <- paste(unique(names(enriched.motif)[grep(x,enriched.motif)]),collapse = ";") + y <- paste( + unique(na.omit(unlist( + stringr::str_split(y,";") + ))), + collapse = ";") + return(y) + },.progress = "time", + .parallel = parallel, + .paropts = list(.errorhandling = 'pass')) + aux$X1 <- unique(pairs$Probe) + + # For each enriched motif the the potencial TF family members + pairs[,"TF"] <- aux[match(pairs$Probe,aux$X1),]$V1 + if(save) readr::write_csv( + x = pairs, + file = sprintf( + "%s/getTFtargets_genomic_coordinates.%s.%s.csv", + dir.out=dir.out, + label=ifelse(is.null(label),"",label), + classification = classification + ) + ) + } + return(df.all) +} + +maphg38tohg19 <- function( + file, + TF, + dir.out = "./", + label = NULL +){ + + if (!requireNamespace("rtracklayer", quietly = TRUE)) { + stop("rtracklayer package is needed for this function to work. Please install it.", + call. = FALSE) + } + + if (!requireNamespace("R.utils", quietly = TRUE)) { + stop("R.utils package is needed for this function to work. Please install it.", + call. = FALSE) + } + + chain.file <- "http://hgdownload.cse.ucsc.edu/gbdb/hg38/liftOver/hg38ToHg19.over.chain.gz" + if(!file.exists(gsub(".gz","",basename(chain.file)))){ + downloader::download(chain.file,basename(chain.file)) + R.utils::gunzip(basename(chain.file)) + } + ch <- rtracklayer::import.chain(gsub(".gz","",basename(chain.file))) + + ret <- readr::read_csv(file) + if(!missing(TF)){ + ret <- ret[grep(TF,ret$TF),] + if(nrow(ret) == 0) { + message("No targets for that TF") + return(NULL) + } + label <- paste0(label,"_",TF) + } + + gr <- makeGRangesFromDataFrame( + ret, seqnames.field = "probe_seqnames", + start.field = "probe_start", + end.field = "probe_end", + strand.field = "probe_strand" + ) + gr$Probe <- ret$Probe + gr <- unique(gr) + x <-data.frame(unlist(rtracklayer::liftOver(gr,ch))) + colnames(x) <- paste0("probe_hg19_",colnames(x)) + ret[,grep("probe_",colnames(ret))] <- NULL + ret.hg19 <- merge(ret,x, by.x = "Probe",by.y = "probe_hg19_Probe") + + # remap gene + gr <- makeGRangesFromDataFrame( + ret, + seqnames.field = "seqnames", + start.field = "start", + end.field = "end", + strand.field = "strand" + ) + gr$GeneID <- ret$GeneID + gr <- unique(gr) + x <-data.frame(unlist(rtracklayer::liftOver(gr,ch))) + colnames(x) <- paste0("gene_hg19_",colnames(x)) + ret.hg19[,c("seqnames","start","end","strand")] <- NULL + ret.hg19 <- merge(ret.hg19,x, by.x = "GeneID",by.y = "gene_hg19_GeneID") + readr::write_csv( + ret.hg19, + file = gsub("genomic_coordinates","genomic_coordinates_mapped_to_hg19",file) + ) +} + + +#' @title Make MR TF binary table +#' @description This function uses ELMER analysis +#' results and summarizes the MR TF identified in each analysis +#' @param files Output of get.pairs function: dataframe or file path +#' @param path Directory path with the ELMER results. Files with the following pattern +#' will be selected TF.*with.motif.summary.csv. +#' @param classification Consider subfamily or family classifications +#' @param top Get only the top potential (default) or all potentials +#' @importFrom readr read_csv +summarizeTF <- function( + files = NULL, + path = NULL, + classification = "family", + top = FALSE +){ + + top <- ifelse(top, "top.", "") + col <- ifelse( + classification == "family", + paste0(top, "potential.TF.family"), + paste0(top, "potential.TF.subfamily") + ) + + if(!is.null(path)) { + files <- dir( + path = path, + pattern = "TF.*with.motif.summary.csv", + recursive = T, + full.names = TRUE + ) + } + aux <- list() + for(f in files){ + TF <- readr::read_csv(f,col_types = readr::cols()) + aux[[f]] <- sort(na.omit(unique(unlist(stringr::str_split(TF[[col]],";"))))) + } + TF <- sort(unique(unlist(unique(aux)))) + if(length(TF) == 0) return(NULL) + df <- data.frame(TF) + + for(f in files){ + TF <- readr::read_csv(f,col_types = readr::cols()) + df$analysis <- NA + df$analysis[df$TF %in% sort(na.omit(unique(unlist(stringr::str_split(TF[[col]],";")))))] <- "x" + colnames(df)[which(colnames(df) == "analysis")] <- paste0(basename(dirname(f)), " in ", basename(dirname(dirname(f)))) + } + + return(df) +} + +getTopFamily <- function( + motif, + TF.meth.cor, + n = 3 +){ + TF.family <- createMotifRelevantTfs() + TF <- stringr::str_trim(unlist(stringr::str_split(TF.family[[motif]],";"))) + TF <- TF[TF %in% rownames(TF.meth.cor)] + topfamily <- names(sort(TF.meth.cor[TF,motif]))[1:n] + return(topfamily) +} diff --git a/R/ReadFile.R b/R/ReadFile.R deleted file mode 100755 index 65ebc48f..00000000 --- a/R/ReadFile.R +++ /dev/null @@ -1,185 +0,0 @@ -#' Read a bed file. -#' @param x A path of bed file (characters) -#' @param strand A boolean to specific strands. If true, strand column will be filled as input. If false, strand column will be filled "*"" -#' @param skip A number to specify how many lines should be removed from bed file. -#' @param cols Specify the column to read from bed file. -#' @param seqLength Specify custmer seqLength parameter in GRange function -#' @return GRange object containning bed file information. -#' @export - - - -ReadBed <- function(x,strand=FALSE,skip=0,cols=NULL,seqLength=NULL){ - require(GenomicRanges) - if(is.null(cols)){ - x <- read.table(x,stringsAsFactors=FALSE,sep = "\t",skip=skip) - }else{ - cols <- paste(cols,collapse=",") - cmd <- sprintf("cut -f%s %s",cols,x) - x <- read.table(pipe(cmd),stringsAsFactors=FALSE,sep = "\t",skip=skip) - } - - x[,1] <- sub("chrx","chrX",x[,1]) - x[,1] <- sub("chry","chrY",x[,1]) - x[,1] <- sub("chrm","chrM",x[,1]) - if(dim(x)[2] >5){ - x[,6] <- sub("\\.","*",x[,6]) - } - x[,1] <- sub(" ","",x[,1]) - if(strand){ - Bed<-GRanges( x[,1], IRanges(x[,2]+1, x[,3]) ,strand=x[,6]) - if(ncol(x)>6){ - values(Bed) <- data.frame(name=x[,4],score = x[,5],x[,setdiff(seq_len(ncol(x)),c(1:6))]) - } - }else{ - Bed<-GRanges( x[,1], IRanges(x[,2], x[,3]) ) - if(ncol(x)==4){ - values(Bed) <- data.frame(name=x[,4]) - }else if(ncol(x)==5){ - values(Bed) <- data.frame(name=x[,4],score = x[,5]) - }else if(ncol(x)>=6){ - values(Bed) <- data.frame(name=x[,4],score = x[,5],x[,setdiff(seq_len(ncol(x)),c(1:6))]) - } - } - if(!is.null(seqLength)){ - seqlengths(Bed) <- sequenceLen[seqlevels(Bed),2] - } - - return(Bed) -} - -#' Read a GFF file. -#' @param x A path of GFF file (characters) -#' @param strand A boolean to specific strands. If true, strand column will be filled as input. If false, strand column will be filled "*"" -#' @param skip A number to specify how many lines should be removed from bed file. -#' @return GRange object containning GFF file information. -#' @export -ReadGFF <- function(x,strand=FALSE,skip=0 ){ - require(GenomicRanges) - x <- read.table(x,stringsAsFactors=FALSE,sep = "\t",skip=skip) - x[,1] <- sub("chrx","chrX",x[,1]) - x[,1] <- sub("chry","chrY",x[,1]) - x[,1] <- sub("chrm","chrM",x[,1]) - x[,7] <- sub("\\.","*",x[,7]) - x[,1] <- sub(" ","",x[,1]) - if(strand){ - GFF<-GRanges( x[,1], IRanges(x[,4], x[,5]) ,strand=x[,7]) - values(GFF) <- data.frame(Source=x[,2],feature=x[,3],score = x[,6],frame = x[,8],name = x[,9],x[,setdiff(seq_len(ncol(x)),c(1:9))]) - }else{ - GFF<-GRanges( x[,1], IRanges(x[,4], x[,5]) ) - values(GFF) <- data.frame(Source=x[,2],feature=x[,3],score = x[,6],frame = x[,8],name = x[,9],x[,setdiff(seq_len(ncol(x)),c(1:9))]) - } - seqlengths(GFF) <- sequenceLen[seqlevels(GFF),2] - return(GFF) -} - -sequenceLen <- data.frame(chr=c("chr1","chr1_gl000191_random","chr1_gl000192_random","chr2","chr3","chr4","chr4_gl000193_random","chr4_gl000194_random","chr5", - "chr6","chr7","chr7_gl000195_random", "chr8","chr8_gl000196_random","chr8_gl000197_random", - "chr9","chr9_gl000198_random","chr9_gl000199_random","chr9_gl000200_random","chr9_gl000201_random", - "chr10","chr11","chr11_gl000202_random","chr12","chr13","chr14","chr15","chr16","chr17","chr17_gl000203_random","chr17_gl000204_random", - "chr17_gl000205_random", "chr17_gl000206_random","chr18","chr18_gl000207_random","chr19","chr19_gl000208_random","chr19_gl000209_random", - "chr20", "chr21", "chr21_gl000210_random", "chr22","chrX","chrY","chrM", "chrUn_gl000211","chrUn_gl000212","chrUn_gl000213", - "chrUn_gl000214","chrUn_gl000215","chrUn_gl000216","chrUn_gl000217","chrUn_gl000218","chrUn_gl000219","chrUn_gl000220", - "chrUn_gl000221","chrUn_gl000222","chrUn_gl000223","chrUn_gl000224","chrUn_gl000225", "chrUn_gl000226", "chrUn_gl000227", - "chrUn_gl000228","chrUn_gl000229","chrUn_gl000230","chrUn_gl000231","chrUn_gl000232","chrUn_gl000233","chrUn_gl000234", - "chrUn_gl000235","chrUn_gl000236","chrUn_gl000237","chrUn_gl000238","chrUn_gl000239","chrUn_gl000240", - "chrUn_gl000241", "chrUn_gl000242", "chrUn_gl000243","chrUn_gl000244","chrUn_gl000245","chrUn_gl000246","chrUn_gl000247", - "chrUn_gl000248","chrUn_gl000249" ), - Length=c(249250621,106433,547496,243199373,198022430,191154276, 189789,191469,180915260,171115067,159138663,182896,146364022, - 38914,37175,141213431,90085,169874, 187035, 36148,135534747,135006516,40103,133851895,115169878,107349540,102531392, - 90354753,81195210,37498,81310, 174588,41001,78077248, 4262, 59128983,92689,159169,63025520,48129895,27682,51304566, - 155270560,59373566,16569,166566,186858,164239,137718,172545,172294,172149,161147,179198,161802,155397,186861, 180455, - 179693,211173,15008, 128374,129120,19913,43691,27386,40652,45941, 40531,34474,41934, 45867, 39939,33824,41933,42152, - 43523,43341,39929,36651,38154,36422,39786,38502 - )) -rownames(sequenceLen) <- sequenceLen$chr - -#' Write a bed file from GRange object. -#' @param x GRange object -#' @param save if save is false, function will return a bed format data.frame. if save is true, fn parameter need to be specific and it output bed file in the path you specified in fn. -#' @param fn A name of bed file you want to output. -#' @return A data.frame bed object or save output bed file. -#' @export -WriteBed <-function(x,save=T,fn=NULL){ - require(GenomicRanges) - x <- as.data.frame(x,row.names=NULL) - x$element=NULL - if(ncol(x) >5){ - if(ncol(x)>=7){ - out <- x[,c(1,2,3,6,4,5,7:ncol(x))] - }else{ - out <- x[,c(1,2,3,6,4,5)] - } - }else{ - Names <- paste0(x[,1],":",x[,2],"-",x[,3]) - out <- cbind(x[,c(1,2,3)],Names,x[,4],x[,5]) - } - rownames(out) <- NULL - if(save){ - if(is.null(fn)) fn <- deparse(substitute(x)) - write.table(out,file=fn,row.names=F,col.names=F,quot=F,sep="\t") - }else{ - return(out) - } -} - -#' Generate random loci of genome. -#' @param SampleSize A number of random loci you want to generate. -#' @param exclusion The chromosome you want to exclude such as chrX chrY. -#' @param regionWidth The width of each random loci. -#' @return GRange object. -#' @export -RandomLoci <- function(SampleSize=NULL,exclusion=NULL,regionWidth=0){ - require(GenomicRanges) - chr <- paste0("chr",c(1:22,"X","Y")) - if(!is.null(exclusion)) chr <- chr[!chr %in% exclusion] - LengthSum <- sum(sequenceLen[chr,2]) - positions <- sample(seq_len(LengthSum),SampleSize) - out <- c() - for(i in seq_len(SampleSize)){ - count <- 1 - start <- positions[i] - while((start- sequenceLen[chr[count],2]) > 0){ - start <- start- sequenceLen[chr[count],2] - count <- count+1 - } - chrsample <- chr[count] - end <- start +regionWidth - if(end > sequenceLen[chrsample,2]){ - end <- sequenceLen[chrsample,2] - start <- end - regionWidth - } - out <- rbind(out,c(chrsample,start,end)) - } - out <- GRanges( out[,1], IRanges(as.numeric(out[,2]), as.numeric(out[,3]))) - seqlengths(out) <- sequenceLen[seqlevels(out),2] - return(out) -} - -#' Generate random loci of genome. -#' @param x GRange object which you want to identify distal elements from. -#' @param TSS.range GRange object which contain promoter infomation -#' @param ignore.strand A boolean which to specific if ignore strand or not. -#' @return GRange object which contains elements from x that doesn't overlap with TSS.range. -#' @export -Distal <- function(x,TSS.range,ignore.strand=F){ - library(GenomicRanges) - overlap <- findOverlaps(x,TSS.range,ignore.strand=ignore.strand) - distal.index <- !1:length(x) %in% queryHits(overlap) - distal <- x[distal.index] - return(distal) -} - -#' Generate random loci of genome. -#' @param x GRange object which you want to identify distal elements from. -#' @param TSS.range GRange object which contain promoter infomation -#' @param ignore.strand A boolean which to specific if ignore strand or not. -#' @return GRange object which contains elements from x that overlap with TSS.range. -#' @export -Proximal <- function(x,TSS.range,ignore.strand=F){ - library(GenomicRanges) - overlap <- findOverlaps(x,TSS.range,ignore.strand=ignore.strand) - proximal.index <- 1:length(x) %in% queryHits(overlap) - proximal <- x[proximal.index] - return(proximal) -} \ No newline at end of file diff --git a/R/Scatter.plot.R b/R/Scatter.plot.R new file mode 100644 index 00000000..32178a1f --- /dev/null +++ b/R/Scatter.plot.R @@ -0,0 +1,359 @@ +#' scatter.plot to plot scatter plots between gene expression and DNA methylation. +#' @description +#' scatter.plot is a function to plot various scatter plots between gene expression and +#' DNA methylation. When byPair is specified, scatter plot for individual probe-gene pairs +#' will be generated. When byProbe is specified, scatter plots for one probes with nearby +#' 20 gene pairs will be generated. When byTF is specified, scatter plot for TF expression +#' and average DNA methylation at certain motif sites will be generated. +#' @importFrom ggplot2 ggsave +#' @usage +#' scatter.plot(data, +#' byPair = list(probe = c(), gene = c()), +#' byProbe = list(probe = c(), numFlankingGenes = 20), +#' byTF = list(TF = c(), probe = c()), +#' category = NULL, +#' ylim = NULL, +#' dots.size = 0.9, +#' correlation = FALSE, +#' width = 7, +#' height = 6, +#' dir.out = "./", +#' save = TRUE, ...) +#' @param data A multiAssayExperiment with DNA methylation and Gene Expression data. +#' See \code{\link{createMAE}} function. +#' @param byPair A list: byPair =list(probe=c(),gene=c()); probe contains a vector +#'of probes' name and gene contains a vector of gene ID. The length of probe +#'should be the same with length of gene. Output see numFlankingGenes +#'@param byProbe A list byProbe =list(probe=c(), geneNum=20); probe contains +#'a vector of probes'name and geneNum specify the number of gene near the probes +#'will ploted. 20 is default for numFlankingGenes Output see detail. +#'@param byTF A list byTF =list(TF=c(), probe=c()); TF contains a vector of TF's +#'symbol and probe contains the a vector of probes' name. Output see detail. +#'@param category A vector labels subtype of samples or a character which is the +#'column name in the colData(data) in the multiAssayExperiment object. Once specified, samples +#'will label different color. The color can be customized by using color.value. +#'@param dir.out A path specify the directory to which the figures will be saved. +#'Current directory is default. +#'@param ylim y-axis limit i.e. c(0,25) +#'@param dots.size Control dots size +#'@param save A logic. If true, figure will be saved to dir.out. +#'@param height PDF height +#'@param width PDF width +#'@param correlation Add pearson correlation values to the plot +#'@param ... color.value, lm_line in scatter function +#'@details byPair The output will be scatter plot for individual pairs. +#'@details byProbe The output will be scatter plot for the probe and nearby genes. +#'@details byTF The output will be scatter plot for the TFs and the average +#'DNA methylation at the probes set specified in byTF list. +#'@return Scatter plots. +#'@importFrom MultiAssayExperiment sampleMap +#'@export +#'@author Lijing Yao (maintainer: lijingya@usc.edu) +#'@examples +#' data <- ELMER:::getdata("elmer.data.example") +#' scatter.plot(data, +#' byProbe=list(probe=c("cg19403323"),numFlankingGenes=20), +#' category="definition", save=FALSE) +#' scatter.plot(data,byProbe=list(probe=c("cg19403323"),numFlankingGenes=20), +#' category="definition", save=TRUE) ## save to pdf +#' # b. generate one probe-gene pair +#' scatter.plot(data,byPair=list(probe=c("cg19403323"),gene=c("ENSG00000143322")), +#' category="definition", save=FALSE,lm_line=TRUE) +scatter.plot <- function(data, + byPair = list(probe = c(), + gene = c()), + byProbe = list(probe = c(), + numFlankingGenes = 20), + byTF = list(TF = c(), + probe = c()), + category = NULL, + ylim = NULL, + dots.size = 0.9, + correlation = FALSE, + width = 7, + height = 6, + dir.out = "./", + save = TRUE, + ...){ + + dir.create(dir.out,recursive = TRUE,showWarnings = FALSE) + + simpleCap <- function(x) { + if(is.na(x)) return("NA") + s <- x + paste(toupper(substring(s, first = 1, last = 1)), tolower(substring(s, 2)), + sep = "", collapse = " ") + } + if(missing(data)) stop("A data object should be included.") + + if(!is.null(category) && length(category)==1) { + + if(! category %in% colnames(colData(data))){ + stop("category not found in the phenotypic data (colData(data)) ") + } + if(is.null(category)) stop("Please, set category argument") + legend.title <- simpleCap(category) + samples <- sampleMap(data)[sampleMap(data)$assay == "DNA methylation","primary"] + category <- colData(data)[samples,category] + if(!"color.value" %in% names(list(...))) category <- sapply(category, simpleCap) + } + + if(length(byPair$probe) != 0){ + + if(length(byPair$probe) != length(byPair$gene)) + stop("In pairs, the length of probes should be the same with the length of genes.") + + pb <- txtProgressBar(min = 0, max = length(byPair$gene), + title = "creating images", + style = 3, initial = 0, char = "=") + + for(i in 1:length(byPair$probe)){ + setTxtProgressBar(pb, i) + probe <- byPair$probe[i] + gene <- byPair$gene[i] + symbol <- getSymbol(data,geneID = gene) + P <- scatter(meth = assay(getMet(data)[probe,]), + exp = assay(getExp(data)[gene,] ), + category = category, + ylim = ylim, + dots.size = dots.size, + legend.title = legend.title, + correlation = correlation, + xlab = sprintf("DNA methylation at %s",probe), + ylab = sprintf("%s gene expression",symbol), + title = sprintf("%s_%s",probe,symbol), + ...) + if(save) { + filename <- sprintf("%s/%s_%s_bypair.pdf", dir.out, probe, symbol) + ggsave(filename = filename, + plot = P, + useDingbats = FALSE, + width = width, + height = height) + } + } + close(pb) + + } + if(length(byProbe$probe) != 0){ + nearGenes <- GetNearGenes(data = data, + probes = byProbe$probe, + numFlankingGenes = byProbe$numFlankingGenes) + for(i in byProbe$probe){ + probe <- i + gene <- nearGenes %>% filter(nearGenes$ID == i) %>% pull('GeneID') + symbol <- getSymbol(data,geneID = gene) + exp <- assay(getExp(data)[gene,]) + meth <- assay(getMet(data)[byProbe$probe,]) + rownames(exp) <- symbol + P <- scatter(meth = meth, + exp = exp, + ylim = ylim, + category = category, + dots.size = dots.size, + legend.title = legend.title, + xlab = sprintf("DNA methylation at %s", probe), + ylab = sprintf("Gene expression"), + title = sprintf("%s nearby %s genes", probe, byProbe$numFlankingGenes), + ...) + if(save) ggsave(filename = sprintf("%s/%s_byprobe.pdf", dir.out, probe), + plot = P, + useDingbats = FALSE, + width = width, + height = height) + } + } + + if(length(byTF$TF) != 0){ + probes <- byTF$probe[byTF$probe %in% rownames(assay(getMet(data)))] + meth <- colMeans(assay(getMet(data)[probes,]),na.rm = TRUE) + gene <- getGeneID(data,symbol = byTF$TF) + + # Our input might not be mapped, we need to verify it + found <- NULL + if(any(is.na(gene))){ + found <- !is.na(gene) + message("Gene not found: ", byTF$TF[!found]) + gene <- na.omit(gene) # rm the one not found + } + + exp <- assay(getExp(data)[gene,]) + + if(nrow(exp) > 0){ + if(!is.null(found)) { + rownames(exp) <- byTF$TF[found] + } else { + rownames(exp) <- byTF$TF + } + } + + P <- scatter(meth = meth, + exp = exp, + ylim = ylim, + category = category, + dots.size = dots.size, + correlation = correlation, + legend.title = legend.title, + xlab = "Avg DNA methylation", + ylab = sprintf("TF expression"), + title = "TF vs avg DNA methylation", + ...) + + if(save) ggsave(filename = sprintf("%s/%s_byTF.pdf",dir.out,paste(byTF$TF,collapse = "_")), + plot = P, + useDingbats = FALSE, + width = max(6, 3*(length(byTF$TF) %% 5)), + height = max(4, 3 * ceiling(length(byTF$TF) / 5))) + } + return(P) +} + + +#'scatter +#'@importFrom reshape melt.data.frame +#' @importFrom scales scientific +#'@import ggplot2 +#'@param meth A vector of number. +#'@param exp A vector of number or matrix with sample in column and gene in rows. +#'@param category A vector of sample labels. +#'@param legend.title Plot legend title +#'@param xlab A character specify the title of x axis. +#'@param ylab A character specify the title of y axis. +#'@param ylim y-axis limit i.e. c(0,25) +#'@param dots.size Control dots size +#'@param title A character specify the figure title. +#'@param correlation Show spearman correlation values +#'@param correlation.text.size Correlation values +#'@param color.value A vector specify the color of each category, such as +#color.value=c("Experiment"="red","Control"="darkgreen") +#'@param lm_line A logic. If it is TRUE, regression line will be added to the graph. +#'@return A ggplot figure object +scatter <- function( + meth, + exp, + legend.title = "Legend", + category = NULL, + xlab = NULL, + ylab = NULL, + ylim = NULL, + dots.size = 0.9, + title = NULL, + correlation = FALSE, + correlation.text.size = 3, + color.value = NULL, + lm_line = FALSE +){ + + if(is.null(category)) category <- rep(1,length(meth)) + + if(!is.vector(exp)){ + exp <- as.data.frame(t(exp)) + GeneID <- colnames(exp) + exp$meth <- as.vector(meth) + exp$category <- category + df <- melt.data.frame(exp, measure.vars = GeneID) + df$category <- factor(df$category) + + P <- ggplot(df, aes_string(x = 'meth', y = 'value', color = 'category')) + + geom_point(size = dots.size) + + facet_wrap(facets = ~ variable, ncol = 5) + + scale_x_continuous(limits = c(0,1), breaks = c(0, 0.25, 0.5, 0.75, 1)) + + theme_bw() + + theme( + panel.grid.major = element_blank(), + legend.position = "bottom", + legend.key = element_rect(colour = 'white'), + axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5) + ) + + labs(x = xlab, y = ylab, title = title) + + scale_colour_discrete(name = legend.title) + + guides( + colour = guide_legend( + override.aes = list(size = 4), + title.position = "top", + nrow = ceiling(sum(stringr::str_length(unique(category))) / 100), + title.hjust = 0.5 + ) + ) + + if(!is.null(color.value)) { + P <- P + scale_colour_manual(values = color.value) + } + + if(!is.null(ylim)) { + P <- P + coord_cartesian(ylim = ylim) + } + + if(lm_line) { + P <- P + geom_smooth(method = "lm", se = TRUE, color = "black", formula = y ~ x,data = df) + } + + if(correlation && length(GeneID)> 1){ + message("Sorry no option to add correlation with more than one gene") + } + if(correlation && length(GeneID) == 1){ + + cor <- cor.test( + x = as.numeric(meth), + y = as.numeric(exp[,GeneID,drop = TRUE]), + exact = FALSE, + method = c("pearson") + ) + corval <- round(cor$estimate,digits = 2) + pvalue <- scientific(cor$p.value, digits = 3) + title <- paste0(title, "\n","Spearman Cor: ", corval," / P-value: ", pvalue) + P <- P + labs(title = title) + P <- P + annotate( + "text", + x = 0.01, + y = ifelse(is.null(ylim),max(as.numeric(exp[,GeneID])) + 1, max(ylim) - 1), + hjust = 0.0, + size = correlation.text.size, + label = bquote(italic(rho)~":"~.(corval)~"/P-value: "~.(pvalue)) + ) + #print(paste0(title, "\n","Rho: ", corval," / P-value: ", cor$p.value)) + } + } else { + df <- data.frame(meth = meth,exp = exp,category = factor(category)) + if(length(unique(df$category)) == 1){ + P <- ggplot(df, aes_string(x = 'meth', y = 'exp')) + } else { + P <- ggplot(df, aes_string(x = 'meth', y = 'exp', color = 'category')) + } + P <- P + geom_point() + + scale_x_continuous(limits = c(0,1), breaks = c(0, 0.25, 0.5, 0.75, 1))+ + theme_bw()+ + theme(panel.grid.major = element_blank(), + panel.grid.minor = element_blank(), + legend.position="bottom", + legend.key = element_rect(colour = 'white'), + axis.text.x = element_text(angle = 90, hjust = 1, vjust=0.5))+ + labs(x = xlab, + y = ylab, + title = title) + + scale_colour_discrete(name = legend.title)+ + guides(colour = guide_legend(override.aes = list(size=4), + title.position="top", + nrow = ceiling(sum(stringr::str_length(unique(data$group)))/100), + title.hjust = 0.5)) + + scale_fill_discrete(guide = FALSE) + + guides(fill=FALSE) + + if(lm_line){ + # P <- P+ geom_text(aes(x =0.8 , y = max(exp)-0.5, label = lm_eqn(df)), + #parse = TRUE,colour = "black")+ + P <- P + geom_smooth( + method = "lm", + span = 1, + se = TRUE, + color = "black", + formula = y ~ x, + data = df + ) + } + + + } + return(P) +} + diff --git a/R/Schematic.plot.R b/R/Schematic.plot.R new file mode 100644 index 00000000..a1da2730 --- /dev/null +++ b/R/Schematic.plot.R @@ -0,0 +1,459 @@ +#' schematic.plot to plot schematic plots showing the locations of genes and probes. +#' @description +#' schematic.plot is a function to plot schematic plots showing the locations of genes and probes. +#' @usage +#' schematic.plot(data, +#' group.col = NULL, +#' group1 = NULL, +#' group2 = NULL, +#' pair, +#' byProbe, +#' byGeneID, +#' byCoordinate=list(chr=c(), start=c(), end=c()), +#' statehub.tracks, +#' dir.out="./", +#' save=TRUE,...) +#' @importFrom GenomicRanges GRanges findOverlaps +#' @importFrom IRanges IRanges +#'@param data A Multi Assay Experiment object with DNA methylation and +#' gene expression Summarized Experiment objects +#' @param pair A data frame with three columns: Probe, Gene ID (Ensemble gene ID) +#' and Pe (empirical p-value). This is the ouput of get.pair function. +#' @param group.col A column defining the groups of the sample. You can view the +#' available columns using: colnames(MultiAssayExperiment::colData(data)). +#' @param group1 A group from group.col. ELMER will run group1 vs group2. +#' That means, if direction is hyper, get probes +#' hypermethylated in group 1 compared to group 2. +#' @param group2 A group from group.col. ELMER will run group1 vs group2. +#' That means, if direction is hyper, get probes +#' hypermethylated in group 1 compared to group 2.#' @param byProbe A vector of probe names. +#' @param byGeneID A vector of gene ID +#' @param byProbe A vector of probe names +#' @param byCoordinate A list contains chr, start and end. +#'byCoordinate=list(chr=c(),start=c(),end=c()). +#' @param ... Parameters for GetNearGenes +#' @param dir.out A path specify the directory for outputs. Default is current directory +#' @param statehub.tracks Relative path to a statehub track. +#' @param save A logic. If true, figures will be saved to dir.out. +#' @details +#' byProbes: +#' When a vector of probes' name are provided, +#' function will produce schematic plots for each individual probes. +#' The schematic plot contains probe, nearby 20 (or the number of gene user specified.) +#' genes and the significantly linked gene to the probe. +#' +#' byGene: +#' When a vector of gene ID are provided, function will produce schematic plots +#' for each individual genes. The schematic plot contains the gene and all the +#' significantly linked probes. +#' +#' byCoordinate: +#' When a genomic coordinate is provided, function will +#' produce a schematic plot for this coordinate. The schematic plot contains +#' all genes and significantly linked probes in the range and the significant links. +#' @export +#' @import Gviz lattice +#' @importFrom utils setTxtProgressBar txtProgressBar +#' @importFrom methods as is +#' @examples +#' data <- ELMER:::getdata("elmer.data.example") +#' pair <- data.frame(Probe = c("cg19403323","cg19403323", "cg26403223"), +#' GeneID = c("ENSG00000196878", "ENSG00000009790", "ENSG00000009790" ), +#' Symbol = c("TRAF3IP3","LAMB3","LAMB3"), +#' Raw.p =c(0.001,0.00001,0.001), +#' Pe = c(0.001,0.00001,0.001)) +#' schematic.plot(data, +#' group.col = "definition", +#' group1 = "Primary solid Tumor", +#' group2 = "Solid Tissue Normal", +#' pair = pair, +#' byProbe = "cg19403323") +#' schematic.plot(data, +#' group.col = "definition", +#' group1 = "Primary solid Tumor", +#' group2 = "Solid Tissue Normal", +#' pair = pair, +#' byGeneID = "ENSG00000009790") +#' +#' schematic.plot(data, +#' group.col = "definition", +#' group1 = "Primary solid Tumor", +#' group2 = "Solid Tissue Normal", +#' pair = pair, +#' byCoordinate = list(chr="chr1", start = 209000000, end = 209960000)) +#' \dontrun{ +#' schematic.plot(data, +#' group.col = "definition", +#' group1 = "Primary solid Tumor", +#' group2 = "Solid Tissue Normal", +#' pair = pair, +#' byProbe = "cg19403323", +#' statehub.tracks = "hg38/ENCODE/mcf-7.16mark.segmentation.bed") +#' } +schematic.plot <- function( + data, + group.col = NULL, + group1 = NULL, + group2 = NULL, + pair, + byProbe, + byGeneID, + byCoordinate=list(chr=c(), start=c(), end=c()), + statehub.tracks = NULL, + dir.out="./", + save = TRUE, + ... +){ + # Begin of new schematic plot + # For a probe get nearby genes + args <- list(...) + params <- args[names(args) %in% c("numFlankingGenes","cores")] + + extra.tracks <- args[grepl("track",names(args))] + + suppressWarnings({ + if(!missing(byProbe)){ + nearGenes <- do.call( + GetNearGenes, + c(list( + TRange = rowRanges(getMet(data))[byProbe,], + geneAnnot = rowRanges(getExp(data))), + params + ) + ) + pb <- txtProgressBar( + min = 0, max = length(byProbe), title = "creating images", + style = 3, initial = 0, char = "=" + ) + progress <- 0 + for(probe in byProbe){ + progress <- progress + 1 + setTxtProgressBar(pb, progress) + significant <- pair[pair$Probe == probe,] + gene.gr <- rowRanges(getExp(data))[nearGenes[nearGenes$ID == probe,]$GeneID,] + probe.gr <- rowRanges(getMet(data))[unique(nearGenes[nearGenes$ID == probe,]$ID),] + schematic( + data = data, + gene.gr = gene.gr, + probe.gr = probe.gr, + significant = significant, + label = sprintf("%s/%s.schematic.byProbe",dir.out,probe), + statehub.tracks = statehub.tracks, + save = save, + group.col = group.col, + group1 = group1, + group2 = group2, + extra.tracks = extra.tracks + ) + } + close(pb) + } + if(!missing(byGeneID)){ + pb <- txtProgressBar( + min = 0, max = length(byGeneID), title = "creating images", + style = 3, initial = 0, char = "=" + ) + progress <- 0 + for(gene in byGeneID){ + progress <- progress + 1 + setTxtProgressBar(pb, progress) + significant <- pair[pair$GeneID==gene,] + if(nrow(significant) == 0) { + warning(paste0("Gene ", gene, " is not in pair list. We cannot plot it.")) + next + } + gene.gr <- rowRanges(getExp(data))[gene,] + probe.gr <- rowRanges(getMet(data))[significant$Probe,] + schematic( + data = data, + gene.gr, + probe.gr, + significant, + label = sprintf("%s/%s.schematic.byGene",dir.out,gene), + save = save, + statehub.tracks = statehub.tracks, + group.col = group.col, + group1 = group1, + group2 = group2, + extra.tracks = extra.tracks + ) + } + close(pb) + } + + if(length(byCoordinate$chr) != 0){ + for(i in 1:length(byCoordinate$chr)){ + coordinate <- GRanges( + seqnames = byCoordinate$chr[i], + ranges = IRanges(byCoordinate$start[i],byCoordinate$end[i]) + ) + probe.gr <- rowRanges(getMet(data))[unique(pair$Probe),] + probe.gr <- probe.gr[queryHits(findOverlaps(probe.gr, coordinate)),] + if(length(probe.gr) == 0) stop("No probes in that region") + gene.gr <- rowRanges(getExp(data))[queryHits(findOverlaps(rowRanges(getExp(data)), coordinate)),] + if (length(gene.gr) == 0) stop("No genes in that region") + significant <- pair[pair$GeneID %in% names(gene.gr) & pair$Probe %in% names(probe.gr),] + + schematic( + data = data, + gene.gr, + probe.gr, + significant, + label = sprintf( + "%s/%s_%s_%s.schematic.byCoordinate", + dir.out,byCoordinate$chr[i],byCoordinate$start[i], + byCoordinate$end[i] + ), + save = save, + statehub.tracks = statehub.tracks, + group.col = group.col, + group1 = group1, + group2 = group2, + extra.tracks = extra.tracks + ) + } + } + }) +} + +#' @importFrom grDevices rainbow pdf dev.off +#' @importFrom GenomicRanges seqnames +#' @importFrom MultiAssayExperiment metadata +#' @importFrom Gviz IdeogramTrack +#' @importFrom lattice bwplot +schematic <- function( + data, + gene.gr, + probe.gr, + significant, + label, + save=TRUE, + statehub.tracks = NULL, + group.col = NULL, + group1 = NULL, + group2 = NULL, + extra.tracks = NULL +){ + options(ucscChromosomeNames = FALSE) + + chr <- as.character(seqnames(probe.gr)) + + idxTrack <- Gviz::IdeogramTrack(genome = metadata(data)$genome, chromosome = chr) + axTrack <- GenomeAxisTrack() + + # We will find which is the significant pairs of genes + fill <- rep("blue", length(values(gene.gr)$ensembl_gene_id)) + + for(i in seq_len(length(unique(significant$Probe)))) { + fill[values(gene.gr)$ensembl_gene_id %in% significant[significant$Probe %in% unique(significant$Probe)[i],]$GeneID] <- "red" + } + genetrack <- GeneRegionTrack(gene.gr, + name = "Gene", + fill = fill, + symbol = values(gene.gr)$external_gene_name, + shape = "arrow") + + wrap_strings <- function(vector_of_strings,width){ + as.character( + sapply(vector_of_strings, + FUN = function(x){paste(strwrap(x,width), collapse="\n")} + ) + ) + } + details <- function(identifier, ...) { + d <- data.frame( + signal = assay(getMet(data))[identifier, ], + group = wrap_strings(colData(data)[,group.col],30) + ) + + print( + lattice::bwplot( + signal~group, + data = d, + xlab = identifier, + ylab = 'DNA methylation levels', + horizontal = FALSE, + panel = function(..., box.ratio) { + panel.violin(..., + col = "lightblue", + varwidth = FALSE, + box.ratio = box.ratio) + panel.bwplot(..., + col='black', + cex=0.8, + pch='|', + fill='gray', + box.ratio = .1) + }, + par.settings = list(box.rectangle=list(col='black'), + plot.symbol = list(pch='.', cex = 0.1)), + scales = list(x=list(rot=45, cex=0.5))), + #densityplot(~signal, group = group, data = d, auto.key = TRUE, + # main = list(label = identifier, cex = 0.7), + # scales = list(draw = FALSE, x = list(draw = TRUE)), + # ylab = "", xlab = ""), + newpage = FALSE, + prefix = "plot") + } + interactions.track <- c() + if(nrow(significant) > 0 ) { + if (!requireNamespace("GenomicInteractions", quietly = TRUE)) { + stop("GenomicInteractions package is needed for this function to work. Please install it.", + call. = FALSE) + } + genes.plot <- gene.gr[match(significant$GeneID,names(gene.gr))] + genes.plot <- SummarizedExperiment::resize(genes.plot,width = 1) + interactions <- GenomicInteractions::GenomicInteractions( + genes.plot, + probe.gr[match(significant$Probe,names(probe.gr))], + experiment_name="Putative pair genes ", + description="this is a test", counts=-log10(significant$Raw.p) + ) + interactions.track <- GenomicInteractions::InteractionTrack( + name = "Putative pair genes\n (-log10 raw p-value)", + interactions, + chromosome = unique(chr) + ) + displayPars(interactions.track) = list( + col.interactions = "red", + col.anchors.fill = "transparent", + col.anchors.line = "transparent", + interaction.dimension = "height", + interaction.measure = "counts", + anchor.height = 0.1 + ) + } + probe.col <- "black" + # StateHub tracks + state.tracks <- c() + if(!is.null(statehub.tracks)){ + base <- "http://s3-us-west-2.amazonaws.com/statehub-trackhub/tracks/5813b67f46e0fb06b493ceb0/" + for(state in statehub.tracks){ + message("Adding stateHub track: ", state) + bed <- paste0(base,state) + if(!file.exists(basename(bed))) downloader::download(bed,basename(bed)) + if (!requireNamespace("rtracklayer", quietly = TRUE)) { + stop("rtracklayer package is needed for this function to work. Please install it.", + call. = FALSE) + } + state.chr <- rtracklayer::import.bed(basename(bed)) + state.chr <- state.chr[seqnames(state.chr) == chr] + #state.chr <- state[seqnames(state) == chr & + # start(state) >= min(start(gene.gr) , start(probe.gr) ) & + # end(state) <= max(end(gene.gr) , end(probe.gr) )] + + tracks <- plyr::alply(unique(state.chr$name), 1, function(x){ + aux <- state.chr[state.chr$name == x] + AnnotationTrack( + aux,name = paste0(state.chr@trackLine@name, "\n",x), + stacking = "dense", + col = NULL, + col.line = NULL, + shape = "box", + fill = unique(aux$itemRgb) + ) + }) + } + #all.states <- AnnotationTrack(state.chr,fill = state.chr$itemRgb, stacking = "squish") + state.tracks <- c(state.tracks,tracks) + message("Probes overlapping") + print(IRanges::subsetByOverlaps(state.chr,probe.gr)$name) + } + if(save) { + pdf( + paste0(label,".pdf"), + height = max(5, 5 + rep(2,!is.null(group.col)), floor(length(state.tracks)/2 + 5 + rep(2,!is.null(group.col)))) + ) + } + + if(!is.null(group.col)){ + + if(!is.null(group1) & !is.null(group1)) + data <- data[,colData(data)[,group.col] %in% c( group1, group2)] + + deTrack <- AnnotationTrack( + range = probe.gr, + genome = metadata(data)$genome, + showId = FALSE, + groupAnnotation = "group", + chromosome = chr, + fill = probe.col, + detailsConnector.col="grey", + detailsBorder.col="grey", + col.line=probe.col, + detailsBorder.lwd=0, + detailsConnector.pch = NA, + id = names(probe.gr), + name = "Probe details", + stacking = "squish", + fun = details + ) + plotTracks(c(list(idxTrack, + axTrack, + deTrack), + interactions.track, + list(genetrack), + extra.tracks, + state.tracks), + background.title = "darkblue", + detailsBorder.col = "white", + from = min(start(gene.gr) , start(probe.gr), end(gene.gr) , end(probe.gr)), + to = max(start(gene.gr) , start(probe.gr), end(gene.gr) , end(probe.gr)), + sizes=c(1, + 1, + 8, + rep(2,length(interactions.track)),3, + rep(2,length(extra.tracks)), + rep(0.5,length(state.tracks))), + extend.right = 10000, + extend.left = 100000, + details.ratio = 1, + details.size = 0.9, + baseline=0, + innerMargin=0, + col = NULL, + #fontsize = 8, + showBandId = TRUE, cex.bands = 0.5, + title.width = 2, + cex.title = 0.5, + rotation.title=360, + geneSymbols=TRUE + ) + } else { + atrack <- AnnotationTrack( + probe.gr, name = "Probes", + genome = metadata(data)$genome, + chromosome = chr + ) + plotTracks(c(list(idxTrack, axTrack), + interactions.track,list(atrack , genetrack), + extra.tracks,state.tracks), + background.title = "darkblue", + from = min(start(gene.gr) , start(probe.gr), end(gene.gr) , end(probe.gr)), + to = max(start(gene.gr) , start(probe.gr), end(gene.gr) , end(probe.gr)), + extend.right = 10000, + extend.left = 100000, + baseline=0, innerMargin=0, + showBandId = TRUE, + cex.bands = 0.5, + detailsBorder.col = "white", + sizes = c(0.5, + 0.5, + rep(2,length(interactions.track)),1, + 3,rep(2,length(extra.tracks)), + rep(0.5,length(state.tracks)) + ), + details.ratio = 1, + #fontsize = 8, + rotation.title=360, + title.width = 2, + details.size = 0.8, + col = NULL, + cex.title = 0.5, + geneSymbols=TRUE) + } + if(save) { + message("Saving as: ", label) + dev.off() + } +} diff --git a/R/Schematic_plot.R b/R/Schematic_plot.R deleted file mode 100755 index 80dfeebf..00000000 --- a/R/Schematic_plot.R +++ /dev/null @@ -1,97 +0,0 @@ -###schematic figure funciton -#scale # per 1 unit -library(grid) -#' Generate random loci of genome. -#' @param target.range A GRange object showing target coordinate. -#' @param Gene.range A GRange oject contains coordinate of a list of genes to show on figure. -#' @param special A list. Symbols is specific the gene that you want to emphasize and colors to specific the color to each gene you want to emphasize. -#' @param target.col A color for target -#' @param save Specific save the graph object or not -#' @return A graph object or save as a pdf. - -schematic <- function(target.range,Gene.range,special=list(Symbols=c(),colors=c()),target.col,fn,save=T){ - tmp <- Gene.range - strand(tmp) <- "*" - tmp <- sort(tmp) - position <- follow(target.range,tmp) - n <- 1 - Righttmp <- tmp[(position+1)] - while(position+n+1 < length(tmp)+1){ - if(!as.character(tmp$SYMBOL[position+n+1]) %in% as.character(Righttmp$SYMBOL)){ - Righttmp <- c(Righttmp,tmp[position+n+1]) - } - n <- n+1 - } - if(length(as.character(Righttmp$SYMBOL)) < 20){ - n <-0 - while(as.character(tmp$SYMBOL[position-n]) %in% as.character(Righttmp$SYMBOL)){ - n <- n+1 - } - Lefttmp <- tmp[position-n] - while(position-n >= 1){ - if(!as.character(tmp$SYMBOL[position-n]) %in% as.character(Lefttmp$SYMBOL)){ - Lefttmp <- c(tmp[position-n],Lefttmp) - } - n <- n+1 - } - Unique <- c(Lefttmp,Righttmp) - }else{ - Unique <- Righttmp - } - - if(length(Righttmp)==20){ - Range <- distance(target,Unique[length(Unique)]) - targetD <- 0.25 - }else{ - Range <- distance(Unique[1],Unique[length(Unique)]) - targetD <- 0.25 + 0.7* distance(Unique[1],target.range)/Range - } - Distance <- 0.25 + 0.7* unlist(apply(matrix(1:length(Unique),ncol=1),1, function(x){ distance(Unique[1],Unique[x])}))/Range - strand <- unlist(apply(matrix(as.character(Unique$SYMBOL),ncol=1),1,function(x){subset <- Gene.range[Gene.range$SYMBOL %in% x] - return(unique(as.character(strand(subset)))[1])})) - - df <- data.frame(Symbols=as.character(Unique$SYMBOL), - x1=Distance, - strand = strand, - colors=rep("black",length(Unique)), - label= rep("No",length(Unique)), - stringsAsFactors = F) - - df$colors[df$Symbols %in% special$Symbols] <- special$colors - df$label[df$Symbols %in% special$Symbols] <- "Yes" - if(save) pdf(paste0(fn,".pdf")) - vp <- viewport(h=5, w=6) - grid.lines(c(0.2,1),c(0.3,0.3),gp=gpar(lwd=3)) - grid.text(fn,0.1,0.3,gp=gpar(lwd=3)) - grid.rect(targetD,0.3,gp=gpar(col=target.col,fill=target.col),width=0.015,height=0.01) - .plot.arrow <- function(x){ - start <- as.numeric(x["x1"]) - if(x["strand"] == "-"){ - add <- -0.025 - if(x["label"] == "Yes"){ - grid.curve(x1=start, y1=0.3, x2=start+add*2, y2=0.35, arrow=arrow(length = unit(0.02,"npc")),shape=0,gp=gpar(col=x["colors"],lwd=3)) - grid.lines(x=c(start,start+0.05), - y=c(0.36,0.5),gp=gpar(lwd=2)) - grid.text(x["Symbols"],start+0.05,0.52,gp=gpar(lwd=3)) - }else{ - grid.curve(x1=start, y1=0.3, x2=start+add, y2=0.325, arrow=arrow(length = unit(0.01,"npc")),shape=0,gp=gpar(col=x["colors"],lwd=3)) - } - }else{ - add <- 0.025 - if(x["label"] == "Yes"){ - grid.curve(x1=start, y1=0.3, x2=start+add*2, y2=0.35, arrow=arrow(length = unit(0.02,"npc")),shape=0,gp=gpar(col=x["colors"],lwd=3),curvature=-1) - grid.lines(x=c(start,start+0.05), - y=c(0.36,0.5),gp=gpar(lwd=2)) - grid.text(x["Symbols"],start+0.05,0.52,gp=gpar(lwd=3)) - }else{ - grid.curve(x1=start, y1=0.3, x2=start+add, y2=0.325, arrow=arrow(length = unit(0.01,"npc")),shape=0,gp=gpar(col=x["colors"],lwd=3),curvature=-1) - } - } - - } - for(i in 1:nrow(df)){ - .plot.arrow(as.matrix(df)[i,]) - } - if(save) dev.off() -} - diff --git a/R/Small.R b/R/Small.R new file mode 100644 index 00000000..bd451079 --- /dev/null +++ b/R/Small.R @@ -0,0 +1,1247 @@ +#' @title Construct a Multi Assay Experiment for ELMER analysis +#' @description +#' This function will receive a gene expression and DNA methylation data objects +#' and create a Multi Assay Experiment. +#' @param met A Summarized Experiment with one assay containing beta-values, +#' a matrix or path of rda file only containing the data. +#' @param exp A Summarized Experiment with one assay, or +#' a matrix or path of rda file only containing the data. Rownames should be +#' either Ensembl gene id (ensembl_gene_id) or gene symbol (external_gene_name) +#' @param genome Which is the default genome to make gene information. Options hg19 and hg38 +#' @param colData A DataFrame or data.frame of the phenotype data for all participants. Must have column primary (sample ID). +#' @param sampleMap A DataFrame or data.frame of the matching samples and colnames +#' of the gene expression and DNA methylation matrix. This should be used if your matrix +#' have different columns names. +#' This object must have following columns: +#' assay ("DNA methylation" and "Gene expression"), primary (sample ID) and colname (names of the columns of the matrix). +#' @param linearize.exp Take log2(exp + 1) in order to linearize relation between methylation and expression +#' @param met.platform DNA methylation platform "450K" or "EPIC" +#' @param TCGA A logical. FALSE indicate data is not from TCGA (FALSE is default). +#' TRUE indicates data is from TCGA and sample section will automatically filled in. +#' @param filter.probes A GRanges object contains the coordinate of probes which locate +#' within promoter regions or distal feature regions such as union enhancer from REMC and FANTOM5. +#' See \code{\link{get.feature.probe}} function. +#' @param filter.genes List of genes ensemble ids to filter from object +#' @param save If TRUE, MAE object will be saved into a file named as the argument save.file if this was set, otherwise as mae_genome_met.platform.rda. +#' @param save.filename Name of the rda file to save the object (must end in .rda) +#' @param met.na.cut Define the percentage of NA that the line should have to remove the probes +#' for humanmethylation platforms. +#' @return A MultiAssayExperiment object +#' @export +#' @importFrom MultiAssayExperiment MultiAssayExperiment +#' @importFrom SummarizedExperiment SummarizedExperiment makeSummarizedExperimentFromDataFrame assay assay<- +#' @examples +#' # NON TCGA example: matrices has different column names +#' gene.exp <- S4Vectors::DataFrame( +#' sample1.exp = c("ENSG00000141510"=2.3,"ENSG00000171862"=5.4), +#' sample2.exp = c("ENSG00000141510"=1.6,"ENSG00000171862"=2.3) +#' ) +#' +#' dna.met <- S4Vectors::DataFrame( +#' sample1.met = c("cg14324200"=0.5,"cg23867494"=0.1), +#' sample2.met = c("cg14324200"=0.3,"cg23867494"=0.9) +#' ) +#' sample.info <- S4Vectors::DataFrame( +#' primary = c("sample1","sample2"), +#' sample.type = c("Normal", "Tumor") +#' ) +#' +#' sampleMap <- S4Vectors::DataFrame( +#' assay = c("Gene expression","DNA methylation","Gene expression","DNA methylation"), +#' primary = c("sample1","sample1","sample2","sample2"), +#' colname = c("sample1.exp","sample1.met","sample2.exp","sample2.met") +#' ) +#' +#' mae <- createMAE( +#' exp = gene.exp, +#' met = dna.met, +#' sampleMap = sampleMap, +#' met.platform ="450K", +#' colData = sample.info, +#' genome = "hg38" +#' ) +#' +#' # You can also use sample Mapping and Sample information tables from a tsv file +#' # You can use the createTSVTemplates function to create the tsv files +#' readr::write_tsv(as.data.frame(sampleMap), path = "sampleMap.tsv") +#' readr::write_tsv(as.data.frame(sample.info), path = "sample.info.tsv") +#' +#' mae <- createMAE( +#' exp = gene.exp, +#' met = dna.met, +#' sampleMap = "sampleMap.tsv", +#' met.platform ="450K", +#' colData = "sample.info.tsv", +#' genome = "hg38" +#' ) +#' +#' # NON TCGA example: matrices has same column names +#' gene.exp <- S4Vectors::DataFrame(sample1 = c("ENSG00000141510"=2.3,"ENSG00000171862"=5.4), +#' sample2 = c("ENSG00000141510"=1.6,"ENSG00000171862"=2.3)) +#' dna.met <- S4Vectors::DataFrame(sample1 = c("cg14324200"=0.5,"cg23867494"=0.1), +#' sample2= c("cg14324200"=0.3,"cg23867494"=0.9)) +#' sample.info <- S4Vectors::DataFrame(primary = c("sample1","sample2"), +#' sample.type = c("Normal", "Tumor")) +#' sampleMap <- S4Vectors::DataFrame( +#' assay = c("Gene expression","DNA methylation","Gene expression","DNA methylation"), +#' primary = c("sample1","sample1","sample2","sample2"), +#' colname = c("sample1","sample1","sample2","sample2") +#' ) +#' mae <- createMAE( +#' exp = gene.exp, +#' met = dna.met, +#' sampleMap = sampleMap, +#' met.platform ="450K", +#' colData = sample.info, +#' genome = "hg38" +#' ) +#' +#' \dontrun{ +#' # TCGA example using TCGAbiolinks +#' # Testing creating MultyAssayExperiment object +#' # Load library +#' library(TCGAbiolinks) +#' library(SummarizedExperiment) +#' +#' samples <- c( +#' "TCGA-BA-4074", "TCGA-BA-4075", "TCGA-BA-4077", "TCGA-BA-5149", +#' "TCGA-UF-A7JK", "TCGA-UF-A7JS", "TCGA-UF-A7JT", "TCGA-UF-A7JV" +#' ) +#' +#' #1) Get gene expression matrix +#' query.exp <- GDCquery( +#' project = "TCGA-HNSC", +#' data.category = "Transcriptome Profiling", +#' data.type = "Gene Expression Quantification", +#' workflow.type = "STAR - Counts", +#' barcode = samples +#' ) +#' +#' GDCdownload(query.exp) +#' exp.hg38 <- GDCprepare(query = query.exp) +#' +#' # Aligned against Hg19 +#' query.exp.hg19 <- GDCquery( +#' project = "TCGA-HNSC", +#' data.category = "Gene expression", +#' data.type = "Gene expression quantification", +#' platform = "Illumina HiSeq", +#' file.type = "normalized_results", +#' experimental.strategy = "RNA-Seq", +#' barcode = samples, +#' legacy = TRUE +#' ) +#' GDCdownload(query.exp.hg19) +#' exp.hg19 <- GDCprepare(query.exp.hg19) +#' +#' # Our object needs to have emsembl gene id as rownames +#' rownames(exp.hg19) <- values(exp.hg19)$ensembl_gene_id +#' +#' # DNA Methylation +#' query.met <- GDCquery( +#' project = "TCGA-HNSC", +#' legacy = FALSE, +#' data.category = "DNA Methylation", +#' data.type = "Methylation Beta Value", +#' barcode = samples, +#' platform = "Illumina Human Methylation 450" +#' ) +#' +#' GDCdownload(query.met) +#' met <- GDCprepare(query = query.met) +#' +#' distal.enhancer <- get.feature.probe(genome = "hg19",met.platform = "450k") +#' +#' # Consisering it is TCGA and SE +#' mae.hg19 <- createMAE( +#' exp = exp.hg19, +#' met = met, +#' TCGA = TRUE, +#' genome = "hg19", +#' filter.probes = distal.enhancer +#' ) +#' values(getExp(mae.hg19)) +#' +#' mae.hg38 <- createMAE( +#' exp = exp.hg38, met = met, +#' TCGA = TRUE, genome = "hg38", +#' filter.probes = distal.enhancer +#' ) +#' values(getExp(mae.hg38)) +#' +#' # Consisering it is TCGA and not SE +#' mae.hg19.test <- createMAE( +#' exp = assay(exp.hg19), met = assay(met), +#' TCGA = TRUE, genome = "hg19", +#' filter.probes = distal.enhancer +#' ) +#' +#' mae.hg38 <- createMAE( +#' exp = assay(exp.hg38), met = assay(met), +#' TCGA = TRUE, genome = "hg38", +#' filter.probes = distal.enhancer +#' ) +#' values(getExp(mae.hg38)) +#' +#' # Consisering it is not TCGA and SE +#' # DNA methylation and gene expression Objects should have same sample names in columns +#' not.tcga.exp <- exp.hg19 +#' colnames(not.tcga.exp) <- substr(colnames(not.tcga.exp),1,15) +#' not.tcga.met <- met +#' colnames(not.tcga.met) <- substr(colnames(not.tcga.met),1,15) +#' +#' phenotype.data <- data.frame(row.names = colnames(not.tcga.exp), +#' primary = colnames(not.tcga.exp), +#' samples = colnames(not.tcga.exp), +#' group = c(rep("group1",4),rep("group2",4))) +#' distal.enhancer <- get.feature.probe(genome = "hg19",met.platform = "450k") +#' mae.hg19 <- createMAE(exp = not.tcga.exp, +#' met = not.tcga.met, +#' TCGA = FALSE, +#' filter.probes = distal.enhancer, +#' genome = "hg19", +#' colData = phenotype.data) +#' } +#' createMAE +createMAE <- function ( + exp, + met, + colData, + sampleMap, + linearize.exp = FALSE, + filter.probes = NULL, + met.na.cut = 0.2, + filter.genes = NULL, + met.platform = "450K", + genome = NULL, + save = TRUE, + save.filename, + TCGA = FALSE +) { + + if(missing(genome)) stop("Please specify the genome (hg38, hg19)") + + + # Check if input are path to rda files + if(is.character(exp)) exp <- get(load(exp)) + if(is.character(met)) met <- get(load(met)) + + if(is(exp,"RangedSummarizedExperiment")){ + + + if(length(SummarizedExperiment::assays(exp)) > 1) { + if("tpm_unstrand" %in% names(SummarizedExperiment::assays(exp))){ + message("Multiple assay found. Selecting tpm_unstrand") + exp <- SummarizedExperiment::assays(exp)$tpm_unstrand + } else { + stop("RNA-seq object has more than one assay. Please make sure it is only") + } + } + } + + suppressMessages({ + + if(!missing(colData)) { + if(is.character(colData)) { + colData <- as.data.frame(read_tsv(colData)) + } + if (!"primary" %in% colnames(colData)) stop("No primary column in colData input") + rownames(colData) <- colData$primary + } + if(!missing(sampleMap)) { + if(is.character(sampleMap)) sampleMap <- read_tsv(sampleMap) + if (!all(c("assay","colname","primary") %in% colnames(sampleMap))) + stop("All assay, primary and colname columns should be in sampleMap input") + } + }) + + # Expression data must have the ensembl_gene_id (Ensemble ID) and external_gene_name (Gene Symbol) + required.cols <- c("external_gene_name", "ensembl_gene_id") + # If my input is a data frame we will need to add metadata information for the ELMER analysis steps + if(!is(exp,"RangedSummarizedExperiment")){ + exp <- makeSummarizedExperimentFromGeneMatrix(exp, genome) + } + # Add this here ? + if(linearize.exp) assay(exp) <- log2(assay(exp) + 1) + + if(!is(met,"RangedSummarizedExperiment")){ + met <- makeSummarizedExperimentFromDNAMethylation( + met = met, genome = genome, met.platform = met.platform + ) + } + met <- met[rowMeans(is.na(assay(met))) < met.na.cut, ] + + # Select the regions from DNA methylation that overlaps enhancer. + if(!is.null(filter.probes)){ + if(is.character(filter.probes)){ + filter.probes <- get(load(filter.probes)) + } + } + if(!is.null(filter.probes) & !is.null(met)){ + met <- met[rownames(met) %in% names(filter.probes),] + } + if(!is.null(filter.genes) & !is.null(exp)){ + exp <- exp[rownames(exp) %in% names(filter.genes),] + } + + # We will need to check if the fields that we need exists. + # Otherwise we will need to create them + if(!is(exp,"RangedSummarizedExperiment")){ + required.cols <- required.cols[!required.cols %in% colnames(values(exp))] + if(length(required.cols) > 0) { + gene.info <- get.GRCh(genome) + colnames(gene.info)[grep("external_gene", colnames(gene.info))] <- "external_gene_name" + if(all(grepl("ENSG",rownames(exp)))) { + extra <- as.data.frame(gene.info[match(rownames(exp),gene.info$ensembl_gene_id),required.cols]) + colnames(extra) <- required.cols + values(exp) <- cbind(values(exp),extra) + } else { + stop("Please the gene expression matrix should receive ENSEMBLE IDs") + } + } + } + if(TCGA){ + message("Checking if samples have both DNA methylation and Gene expression and if they are in the same order...") + # If it is not TCGA we will assure the sample has both DNA methylation and gene expression + ID <- intersect(substr(colnames(met),1,16), substr(colnames(exp),1,16)) + + # Get only samples with both DNA methylation and Gene expression + met <- met[,match(ID,substr(colnames(met),1,16))] + exp <- exp[,match(ID,substr(colnames(exp),1,16))] + stopifnot(all(substr(colnames(exp),1,16) == substr(colnames(met),1,16))) + stopifnot(ncol(exp) == ncol(met)) + + # Get clinical information + if(missing(colData)) { + colData <- TCGAbiolinks::colDataPrepare(colnames(met)) + # This will keep the same strategy the old ELMER version used: + # Every type of tumor samples (starts with T) will be set to tumor and + # every type of normal samples (starts with N) will be set to normal + # See : https://github.com/lijingya/ELMER/blob/3e050462aa41c8f542530ccddc8fa607207faf88/R/Small.R#L8-L48 + colData$TN <- NA + colData[grep("^N",colData$shortLetterCode),"TN"] <- "Normal" + colData[grep("^T",colData$shortLetterCode),"TN"] <- "Tumor" + + colData$barcode <- NULL + colData <- colData[!duplicated(colData),] + rownames(colData) <- colData$sample + } + if(missing(sampleMap)) { + sampleMap <- DataFrame(assay = c(rep("DNA methylation", length(colnames(met))), rep("Gene expression", length(colnames(exp)))), + primary = substr(c(colnames(met),colnames(exp)),1,16), + colname = c(colnames(met),colnames(exp))) + } + + message("Creating MultiAssayExperiment") + mae <- MultiAssayExperiment( + experiments=list("DNA methylation" = met,"Gene expression" = exp), + colData = colData, + sampleMap = sampleMap, + metadata = list(TCGA= TRUE, genome = genome, met.platform = met.platform ) + ) + } else { + + if(missing(colData)){ + message <- paste( + "Please set colData argument. A data frame with samples", + "information. All rownames should be colnames of DNA", + "methylation and gene expression. An example is showed", + "in MultiAssayExperiment documentation", + "(access it with ?MultiAssayExperiment)" + ) + stop(message) + } + + if(missing(sampleMap)){ + # Check that we have the same number of samples + message("Removing samples not found in both DNA methylation and gene expression (we are considering the names of the gene expression and DNA methylation columns to be the same) ") + ID <- intersect(colnames(met), colnames(exp)) + met <- met[,match(ID,colnames(met))] + exp <- exp[,match(ID,colnames(exp))] + + if(!all(colnames(exp) == colnames(met))) + stop("Error DNA methylation matrix and gene expression matrix are not in the same order") + + colData <- colData[match(ID,rownames(colData)),,drop = FALSE] + sampleMap <- DataFrame( + assay = c(rep("DNA methylation", length(colnames(met))), + rep("Gene expression", length(colnames(exp)))), + primary = c(colnames(met),colnames(exp)), + colname = c(colnames(met),colnames(exp)) + ) + mae <- MultiAssayExperiment( + experiments = list("DNA methylation" = met,"Gene expression" = exp), + colData = colData, + sampleMap = sampleMap, + metadata = list(TCGA = FALSE, genome = genome, met.platform = met.platform ) + ) + } else { + # Check that we have the same number of samples + if(!all(c("primary","colname") %in% colnames(sampleMap))) + stop("sampleMap should have the following columns: primary (sample ID) and colname(DNA methylation and gene expression sample [same as the colnames of the matrix])") + #if(!any(rownames(colData) %in% sampleMap$primary)) + # stop("colData row names should be mapped to sampleMap primary column ") + # Find which samples are DNA methylation and gene expression + sampleMap.met <- sampleMap[sampleMap$assay %in% "DNA methylation",,drop = FALSE] + sampleMap.exp <- sampleMap[sampleMap$assay %in% "Gene expression",,drop = FALSE] + + # Which ones have both DNA methylation and gene expression ? + commun.samples <- intersect(sampleMap.met$primary,sampleMap.exp$primary) + + # Remove the one that does not have both data + sampleMap.met <- sampleMap.met[match(sampleMap.met$primary,commun.samples),,drop = FALSE] + sampleMap.exp <- sampleMap.exp[match(sampleMap.exp$primary,commun.samples),,drop = FALSE] + + # Ordering samples to be matched + met <- met[,sampleMap.met$colname,drop = FALSE] + exp <- exp[,sampleMap.exp$colname,drop = FALSE] + + if(!all(sampleMap.met$primary == sampleMap.exp$primary)) + stop("Error DNA methylation matrix and gene expression matrix are not in the same order") + + colData <- colData[match(commun.samples,colData$primary),,drop = FALSE] + sampleMap <- DataFrame( + assay = c(rep("DNA methylation", length(colnames(met))), + rep("Gene expression", length(colnames(exp)))), + primary = commun.samples, + colname = c(colnames(met),colnames(exp)) + ) + mae <- MultiAssayExperiment( + experiments=list("DNA methylation" = met, + "Gene expression" = exp), + colData = colData, + sampleMap = sampleMap, + metadata = list(TCGA=FALSE, genome = genome, met.platform = met.platform ) + ) + } + } + if(save) { + if(missing(save.filename)) save.filename <- paste0("mae_",genome,"_",met.platform,".rda") + save(mae, file = save.filename,compress = "xz") + message("MAE saved as: ", save.filename) + } + return(mae) +} + +makeSummarizedExperimentFromGeneMatrix <- function(exp, genome = genome){ + message("=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-") + message("Creating a SummarizedExperiment from gene expression input") + gene.info <- get.GRCh(genome) + gene.info$chromosome_name <- paste0("chr",gene.info$chromosome_name) + colnames(gene.info)[grep("external_gene", colnames(gene.info))] <- "external_gene_name" + gene.info$strand[gene.info$strand == 1] <- "+" + gene.info$strand[gene.info$strand == -1] <- "-" + exp <- as.data.frame(exp) + required.cols <- c("external_gene_name", "ensembl_gene_id") + + if(all(grepl("ENSG",rownames(exp)))) { + exp$ensembl_gene_id <- gsub("\\.[0-9]*$","",rownames(exp)) + aux <- merge(exp, gene.info, by = "ensembl_gene_id", sort = FALSE) + aux <- aux[!duplicated(aux$ensembl_gene_id),] + rownames(aux) <- aux$ensembl_gene_id + aux[,grep("entrezgene",colnames(aux))] <- NULL + exp <- makeSummarizedExperimentFromDataFrame( + aux[,!grepl("external_gene_name|ensembl_gene_id|entrezgene",colnames(aux))], + start.field = "start_position", + end.field = c("end_position") + ) + extra <- as.data.frame(gene.info[match(rownames(exp),gene.info$ensembl_gene_id),required.cols]) + colnames(extra) <- required.cols + values(exp) <- cbind(values(exp),extra) + } else { + stop("Please the gene expression matrix should receive ENSEMBLE IDs (ENSG)") + } + return(exp) +} + +#' @importFrom downloader download +#' @importFrom S4Vectors DataFrame +makeSummarizedExperimentFromDNAMethylation <- function(met, genome, met.platform) { + message("=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-") + message("Creating a SummarizedExperiment from DNA methylation input") + + # Instead of looking on the size, it is better to set it as a argument as the annotation is different + annotation <- getInfiniumAnnotation(plat = met.platform, genome = genome) + + rowRanges <- annotation[names(annotation) %in% rownames(met),,drop=FALSE] + + # Remove masked probes, besed on the annotation + rowRanges <- rowRanges[!rowRanges$MASK_general] + + colData <- DataFrame(samples = colnames(met)) + met <- met[rownames(met) %in% names(rowRanges),,drop = FALSE] + met <- met[names(rowRanges),,drop = FALSE] + assay <- data.matrix(met) + met <- SummarizedExperiment( + assays = assay, + rowRanges = rowRanges, + colData = colData + ) + return(met) +} + + +# getInfiniumAnnotation <- function(plat = "450K", genome = "hg38"){ +# if(tolower(genome) == "hg19" & toupper(plat) == "450K" ) return(sesameData::sesameDataGet("HM450.hg19.manifest")) +# if(tolower(genome) == "hg19" & toupper(plat) == "EPIC" ) return(sesameData::sesameDataGet("EPIC.hg19.manifest")) +# if(tolower(genome) == "hg38" & toupper(plat) == "450K" ) return(sesameData::sesameDataGet("HM450.hg38.manifest")) +# if(tolower(genome) == "hg38" & toupper(plat) == "EPIC" ) return(sesameData::sesameDataGet("EPIC.hg19.manifest")) +# } + +#' @title Get DNA methylation array metadata from SesameData +#' @noRd +#' @importFrom stringr str_c +getInfiniumAnnotation <- function( + genome = c("hg38","hg19"), + plat = c("450K","EPIC") +){ + genome <- match.arg(genome) + arrayType <- match.arg(plat) + + check_package("sesameData") + check_package("ExperimentHub") + check_package("AnnotationHub") + + message("Accessing DNAm annotation from sesame package for: ", genome, " - ",arrayType) + manifest <- str_c( + ifelse(arrayType == "450K","HM450","EPIC"), + ".", + genome, + ".manifest" + ) + ehub <- ExperimentHub::ExperimentHub() + query <- AnnotationHub::query(ehub, c("sesameData",manifest)) + query <- query[query$title == manifest,] + ah_id <- query$ah_id[query$rdatadateadded == max(as.Date(query$rdatadateadded))] + ehub[[ah_id]] +} + +#' @title register cores +#' @param package Package name +#' @noRd +check_package <- function(package){ + if (!requireNamespace(package, quietly = TRUE)) { + stop(package, " package is needed for this function to work. Please install it.", + call. = FALSE) + } +} + +getdata <- function(...) +{ + e <- new.env() + name <- data(..., package = "ELMER.data",envir = e)[1] + e[[ls(envir = e)[1]]] +} + +#' Create examples files for Sample mapping and information used in createMAE function +#' @description +#' This function will receive the DNA methylation and gene expression matrix and will create +#' some examples of table for the argument colData and sampleMap used in ceeateMae function. +#' @param met DNA methylation matrix or Summarized Experiment +#' @param exp Gene expression matrix or Summarized Experiment +#' @examples +#' gene.exp <- S4Vectors::DataFrame(sample1.exp = c("ENSG00000141510"=2.3,"ENSG00000171862"=5.4), +#' sample2.exp = c("ENSG00000141510"=1.6,"ENSG00000171862"=2.3)) +#' dna.met <- S4Vectors::DataFrame(sample1.met = c("cg14324200"=0.5,"cg23867494"=0.1), +#' sample2.met = c("cg14324200"=0.3,"cg23867494"=0.9)) +#' createTSVTemplates(met = dna.met, exp = gene.exp) +#' @importFrom readr write_tsv +#' @export +createTSVTemplates <- function(met, exp) { + assay <- c(rep("DNA methylation", ncol(met)), + rep("Gene expression", ncol(exp))) + primary <- rep("SampleX", ncol(met) + ncol(exp)) + colname <- c(colnames(met),colnames(exp)) + sampleMap <- data.frame(assay,primary,colname) + message("===== Sample mapping example file ======") + message("Saving example file as elmer_example_sample_mapping.tsv.") + message("Please, fill primary column correctly") + write_tsv(sampleMap,path = "elmer_example_sample_mapping.tsv") + + colData <- data.frame(primary = paste0("sample",1:ncol(met)), group = rep("To be filled",ncol(met))) + message("===== Sample information example file ======") + message("Saving example file as elmer_example_sample_metadata.tsv.") + message("Please, fill primary column correctly, also you can add new columns as the example group column.") + write_tsv(colData,path = "elmer_example_sample_metadata.tsv") +} + +# splitmatix +# @param x A matrix +# @param by A character specify if split the matix by row or column. +# @return A list each of which is the value of each row/column in the matrix. +splitmatrix <- function(x,by="row") { + if(by %in% "row"){ + out <- split(x, rownames(x)) + }else if (by %in% "col"){ + out <- split(x, colnames(x)) + } + return(out) +} + + +#' lable linear regression formula +#' @param df A data.frame object contains two variables: dependent +#' variable (Dep) and explanation variable (Exp). +#' @param Dep A character specify dependent variable. The first column +#' will be dependent variable as default. +#' @param Exp A character specify explanation variable. The second column +#' will be explanation variable as default. +#' @return A linear regression formula +#' @importFrom stats coef lm +lm_eqn = function(df,Dep,Exp){ + if(missing(Dep)) Dep <- colnames(df)[1] + if(missing(Exp)) Exp <- colnames(df)[2] + m = lm(df[,Dep] ~ df[,Exp]); + eq <- substitute(italic(y) == a + (b) %.% italic(x)*"\n"~~italic(r)^2~"="~r2, + list(a = format(coef(m)[1], digits = 2), + b = format(coef(m)[2], digits = 2), + r2 = format(summary(m)$r.squared, digits = 3))) + as.character(as.expression(eq)); +} + + + +#' getTSS to fetch GENCODE gene annotation (transcripts level) from Bioconductor package biomaRt +#' If upstream and downstream are specified in TSS list, promoter regions of GENCODE gene will be generated. +#' @description +#' getTSS to fetch GENCODE gene annotation (transcripts level) from Bioconductor package biomaRt +#' If upstream and downstream are specified in TSS list, promoter regions of GENCODE gene will be generated. +#' @param TSS A list. Contains upstream and downstream like TSS=list(upstream, downstream). +#' When upstream and downstream is specified, coordinates of promoter regions with gene annotation will be generated. +#' @param genome Which genome build will be used: hg38 (default) or hg19. +#' @return GENCODE gene annotation if TSS is not specified. Coordinates of GENCODE gene promoter regions if TSS is specified. +#' @examples +#' # get GENCODE gene annotation (transcripts level) +#' \dontrun{ +#' getTSS <- getTSS() +#' getTSS <- getTSS(genome.build = "hg38", TSS=list(upstream=1000, downstream=1000)) +#' } +#' @export +#' @author Lijing Yao (maintainer: lijingya@usc.edu) +#' @import GenomeInfoDb +#' @importFrom GenomicFeatures transcripts +#' @importFrom GenomicRanges makeGRangesFromDataFrame +#' @importFrom biomaRt useEnsembl +getTSS <- function(genome = "hg38", + TSS = list(upstream = NULL, downstream = NULL)){ + + if (tolower(genome) == "hg38") { + tss <- getdata("Human_genes__GRCh38_p12__tss") + } else { + tss <- getdata("Human_genes__GRCh37_p13__tss") + } + + # tries <- 0L + # msg <- character() + # while (tries < 3L) { + # tss <- tryCatch({ + # host <- ifelse(genome == "hg19", "grch37.ensembl.org","www.ensembl.org") + # message("Accessing ", host, " to get TSS information") + # + # ensembl <- tryCatch({ + # useEnsembl("ensembl", dataset = "hsapiens_gene_ensembl", host = host) + # }, error = function(e) { + # message(e) + # for(mirror in c("asia","useast","uswest")){ + # x <- useEnsembl("ensembl", + # dataset = "hsapiens_gene_ensembl", + # mirror = mirror, + # host = host) + # if(class(x) == "Mart") { + # return(x) + # } + # } + # return(NULL) + # }) + # + # if(is.null(host)) { + # message("Problems accessing ensembl database") + # return(NULL) + # } + # attributes <- c("chromosome_name", + # "start_position", + # "end_position", "strand", + # "ensembl_gene_id", + # "transcription_start_site", + # "transcript_start", + # "ensembl_transcript_id", + # "transcript_end", + # "external_gene_name") + # chrom <- c(1:22, "X", "Y","M","*") + # db.datasets <- listDatasets(ensembl) + # description <- db.datasets[db.datasets$dataset=="hsapiens_gene_ensembl",]$description + # message(paste0("Downloading transcripts information from ", ensembl@host, ". Using: ", description)) + # + # filename <- paste0(gsub("[[:punct:]]| ", "_",description),"_tss.rda") + # if(!file.exists(filename)) { + # tss <- getBM(attributes = attributes, filters = c("chromosome_name"), values = list(chrom), mart = ensembl) + # tss <- tss[!duplicated(tss$ensembl_transcript_id),] + # save(tss, file = filename, compress = "xz") + # } else { + # message("Loading from disk") + # tss <- get(load(filename)) + # } + tss$chromosome_name <- paste0("chr", tss$chromosome_name) + tss$strand[tss$strand == 1] <- "+" + tss$strand[tss$strand == -1] <- "-" + tss <- makeGRangesFromDataFrame(tss, + strand.field = "strand", + start.field = "transcript_start", + end.field = "transcript_end", + keep.extra.columns = TRUE) + + if (!is.null(TSS$upstream) & !is.null(TSS$downstream)) + tss <- promoters(tss, upstream = TSS$upstream, downstream = TSS$downstream) + # tss + # }, error = function(e) { + # msg <<- conditionMessage(e) + # tries <<- tries + 1L + # }) + # if(!is.null(tss)) break + #} + #if (tries == 3L) stop("failed to get URL after 3 tries:", "\n error: ", msg) + + return(tss) +} + +#' @title Get human TF list from the UNiprot database +#' @description This function gets the last version of human TF list from the UNiprot database +#' @importFrom readr read_tsv +#' @return A data frame with the ensemble gene id. +getTF <- function(){ + print.header("Downloading TF list from Lambert, Samuel A., et al. The human transcription factors. Cell 172.4 (2018): 650-665.","subsection") + # human.TF <- readr::read_table("http://humantfs.ccbr.utoronto.ca/download/v_1.01/TFs_Ensembl_v_1.01.txt",col_names = F) + # colnames(human.TF) <- "ensembl_gene_id" + human.TF <- getdata("human.TF") + return(human.TF) +} + +#' @importFrom biomaRt getBM useMart listDatasets +get.GRCh <- function(genome = "hg19", genes = NULL, as.granges = FALSE) { + + if (tolower(genome) == "hg38") { + gene.location <- getdata("Human_genes__GRCh38_p12") + } else { + gene.location <- getdata("Human_genes__GRCh37_p13") + } + + if (!is.null(genes)) + gene.location <- gene.location[match(genes,gene.location$entrezgene),] + + if (as.granges) { + gene.location$strand[gene.location$strand == 1] <- "+" + gene.location$strand[gene.location$strand == -1] <- "-" + gene.location$chromosome_name <- paste0("chr", gene.location$chromosome_name) + gene.location <- makeGRangesFromDataFrame(gene.location, + seqnames.field = "chromosome_name", + start.field = "start_position", + end.field = "end_position", + keep.extra.columns = TRUE) + } + return(gene.location) +} + + + +#' @title Get family of transcription factors +#' @description +#' This will output a list each TF motif and TFs that binding the motis. Multiple TFs may +#' recognize a same motif such as TF family. +#' The association between each motif famil and transcription factor was created using the +#' (HOCOMOCO)[https://hocomoco11.autosome.org/human/mono?full=true] which TF structural families +#' was created according to TFClass [@wingender2014tfclass] +#' This data is stored as a list whose elements +#' are motifs and contents for each element are TFs which recognize the same motif that +#' is the name of the element. This data is used in function get.TFs in \pkg{ELMER} +#' to identify the real regulator TF whose motif is enriched in a given set of probes +#' and expression associate with average DNA methylation of these motif sites. +#' @importFrom rvest html_table %>% +#' @importFrom xml2 read_html +#' @export +#' @param classification Select if we will use Family classification or sub-family +#' @return A list of TFs and its family members +createMotifRelevantTfs <- function(classification = "family"){ + message("Retrieving TFClass ", classification," classification from ELMER.data.") + if(classification == "family") motif.relevant.TFs <- getdata("TF.family") + if(classification == "subfamily") motif.relevant.TFs <- getdata("TF.subfamily") + return(motif.relevant.TFs) +} + +#' @title Filtering probes +#' @description +#' This function has some filters to the DNA methylation data +#' in each it selects probes to avoid correlations due to non-cancer +#' contamination and for additional stringency. +#' \itemize{ +#' \item Filter 1: We usually call locus unmethylated when the methylation value < 0.3 and methylated when the methylation value > 0.3. +#' Therefore Meth_B is the percentage of methylation value > K. +#' Basically, this step will make sure we have at least a percentage of beta values lesser than K and n percentage of beta values greater K. +#' For example, if percentage is 5\%, the number of samples 100 and K = 0.3, +#' this filter will select probes that we have at least 5 (5\% of 100\%) samples have beta values > 0.3 and at least 5 samples have beta values < 0.3. +#' This filter is importante as true promoters and enhancers usually have a pretty low value (of course purity can screw that up). +#' we often see lots of PMD probes across the genome with intermediate values like 0.4. +#' Choosing a value of 0.3 will certainly give some false negatives, but not compared to the number of false positives we thought we might get without this filter. +#' } +#' @references +#' Yao, Lijing, et al. "Inferring regulatory element landscapes and transcription +#' factor networks from cancer methylomes." Genome biology 16.1 (2015): 1. +#' Method section (Linking enhancer probes with methylation changes to target genes with expression changes). +#' @param data A MultiAssayExperiment with a DNA methylation martrix or a DNA methylation matrix +#' @param K Cut off to consider probes as methylated or unmethylated. Default: 0.3 +#' @param percentage The percentage of samples we should have at least considered as methylated and unmethylated +#' @return An object with the same class, but with the probes removed. +#' @importFrom MultiAssayExperiment experiments<- +#' @export +#' @examples +#' random.probe <- runif(100, 0, 1) +#' bias_l.probe <- runif(100, 0, 0.3) +#' bias_g.probe <- runif(100, 0.3, 1) +#' met <- rbind(random.probe,bias_l.probe,bias_g.probe) +#' met <- preAssociationProbeFiltering(data = met, K = 0.3, percentage = 0.05) +#' met <- rbind(random.probe,random.probe,random.probe) +#' met <- preAssociationProbeFiltering(met, K = 0.3, percentage = 0.05) +#' data <- ELMER:::getdata("elmer.data.example") # Get data from ELMER.data +#' data <- preAssociationProbeFiltering(data, K = 0.3, percentage = 0.05) +#' +#' cg24741609 <- runif(100, 0, 1) +#' cg17468663 <- runif(100, 0, 0.3) +#' cg14036402 <- runif(100, 0.3, 1) +#' met <- rbind(cg24741609,cg14036402,cg17468663) +#' colnames(met) <- paste("sample",1:100) +#' exp <- met +#' rownames(exp) <- c("ENSG00000141510","ENSG00000171862","ENSG00000171863") +#' sample.info <- S4Vectors::DataFrame(primary = paste("sample",1:100), +#' sample.type = rep(c("Normal", "Tumor"),50)) +#' rownames(sample.info) <- colnames(exp) +#' mae <- createMAE(exp = exp, met = met, colData = sample.info, genome = "hg38") +#' mae <- preAssociationProbeFiltering(mae, K = 0.3, percentage = 0.05) +preAssociationProbeFiltering <- function(data, K = 0.3, percentage = 0.05){ + print.header("Filtering probes", type = "section") + message("For more information see function preAssociationProbeFiltering") + + if(is(data,"MultiAssayExperiment")) { + met <- assay(getMet(data)) + } else { + met <- data + } + # In percentage how many probes are bigger than K ? + Meth_B <- rowMeans(met > K, na.rm = TRUE) + # We should have at least 5% methylation value < K or at least 5% methylation value > K + keep <- Meth_B < (1 - percentage) & Meth_B > percentage + keep[is.na(keep)] <- FALSE + message("Making sure we have at least ", percentage * 100, "% of beta values lesser than ", K," and ", + percentage * 100, "% of beta values greater ",K,".") + if(length(keep) - sum(keep) != 0) { + message("Removing ", length(keep) - sum(keep), " probes out of ", length(keep)) + } else { + message("There were no probes to be removed") + } + if(is(data,"MultiAssayExperiment")) { + experiments(data)["DNA methylation"][[1]] <- experiments(data)["DNA methylation"][[1]][keep,] + } else { + data <- data[keep,,drop = FALSE] + } + return(data) +} + +#' @importFrom xml2 read_html +#' @importFrom rvest html_table +getHocomocoTable <- function(){ + hocomoco <- tryCatch({ + hocomoco <- "https://hocomoco11.autosome.org/human/mono?full=true" %>% read_html() %>% html_table() + hocomoco <- hocomoco[[1]] + }, error = function(e) { + getdata("hocomoco.table") + }) + + TF.family <- createMotifRelevantTfs() + TF.subfamily <- createMotifRelevantTfs("subfamily") + + x <- do.call(rbind.data.frame,lapply(TF.family, function(x) paste(x,collapse = ";"))) + x$Model <- names(TF.family) + colnames(x) <- c("TF.family.member","Model") + hocomoco <- merge(hocomoco,x, by = "Model") + x <- do.call(rbind.data.frame,lapply(TF.subfamily, function(x) paste(x,collapse = ";"))) + x$Model <- names(TF.subfamily) + colnames(x) <- c("TF.subfamily.member","Model") + hocomoco <- merge(hocomoco,x, by = "Model") + return(hocomoco) +} + +#' @title Get random pairs +#' @description +#' This function will receive a pair gene probes and will return a +#' random object with the following pattern, if a probe is linked to R1 and L3 genes +#' the random pairs will be a random probes (a distal probe not in the input pairs) +#' also linked to its R1 and L3 gene. +#' @param pairs A data frame with probe, gene and side information. See example below. +#' @param met.platform DNA methyaltion platform to retrieve data from: EPIC or 450K (default) +#' @param genome Which genome build will be used: hg38 (default) or hg19. +#' @param cores A interger which defines the number of cores to be used in parallel +#' process. Default is 1: no parallel process. +#' @return A data frame with the random linkages +#' @export +#' @importFrom dplyr pull filter +#' @importFrom TCGAbiolinks colDataPrepare +#' @examples +#' \dontrun{ +#' data <- ELMER:::getdata("elmer.data.example") +#' nearGenes <- GetNearGenes(TRange=getMet(data)[c("cg00329272","cg10097755"),], +#' geneAnnot=getExp(data)) +#' +#' pair <- get.pair(data = data, +#' group.col = "definition", +#' group1 = "Primary solid Tumor", +#' group2 = "Solid Tissue Normal", +#' mode = "supervised", +#' diff.dir = "hypo", +#' nearGenes = nearGenes, +#' permu.size = 5, +#' raw.pvalue = 0.001, +#' Pe = 0.2, +#' dir.out="./", +#' permu.dir = "permu_test", +#' label = "hypo") +#' } +#' pair <- data.frame(Probe = rep("cg00329272",3), +#' GeneID = c("ENSG00000116213","ENSG00000130762","ENSG00000149527"), +#' Sides = c("R5","R2","L4")) +#' getRandomPairs(pair) +getRandomPairs <- function(pairs, + genome = "hg38", + met.platform = "450K", + cores = 1) { + + if(missing(pairs)) stop("Please set pairs argument") + if(is.data.frame(pairs)) pairs <- as.data.frame(pairs) + + # Rename column + if("Side" %in% colnames(pairs)) colnames(pairs)[grep("Side",colnames(pairs))] <- "Sides" + if(!"Sides" %in% colnames(pairs)) stop("No column Sides in the object") + + if("Target" %in% colnames(pairs)) colnames(pairs)[grep("Target",colnames(pairs))] <- "Probe" + if("ID" %in% colnames(pairs)) colnames(pairs)[colnames(pairs) == "ID"] <- "Probe" + + parallel <- FALSE + if (cores > 1){ + if (cores > detectCores()) cores <- detectCores() + registerDoParallel(cores) + parallel = TRUE + } + + # Get Probe information + met.info <- getInfiniumAnnotation(plat = met.platform, genome = genome) + probes.ranges <- as.data.frame(met.info)[,1:5] + colnames(probes.ranges) <- paste0("probe_",colnames(probes.ranges)) + + # Get distal probes not in the pairs + distal.probe <- get.feature.probe( + genome = genome, + met.platform = met.platform, + feature = NULL + ) # get distal probes + distal.probe <- distal.probe[!names(distal.probe) %in% pairs$Probe,] # Select probes were not used + + nb.pairs <- nrow(pairs) + nb.probes <- length(unique(pairs$Probe)) + # get gene information + genes <- getTSS(genome = genome) + genes$ensembl_transcript_id <- NULL + genes <- genes[!duplicated(genes$ensembl_gene_id)] + + df.random <- NULL + near.genes.linked <- NULL + near.genes.df <- NULL + # We will get the double of random probes, because some will not be used in case it does not matches the position + # Example: real probe + gene R10 and random probe does not have R10. Discart and get next random + not.matched <- 1:nb.probes + numFlankingGenes <- 20 + while(length(not.matched) > 0){ + random.probes <- distal.probe[sample(1:length(distal.probe), length(not.matched)),] + near.genes <- GetNearGenes(TRange = random.probes, + geneAnnot = genes, + numFlankingGenes = numFlankingGenes) + + near.genes.linked <- plyr::alply(1:length(not.matched), + .margins = 1, + .fun = function(x){ + side <- pairs %>% + filter(pairs$Probe == unique(pairs$Probe)[not.matched[x]]) %>% pull('Sides') + ret <- near.genes[near.genes$ID == unique(near.genes$ID)[x] & near.genes$Side %in% side,] + if(!all(side %in% ret$Side)) return(NULL) + ret + },.progress = "time", .parallel = parallel) + + not.matched <- not.matched[grep("NULL",near.genes.linked)] + if(length(not.matched) > 0){ + aux <- pairs %>% filter(pairs$Probe == unique(pairs$Probe)[not.matched[1]]) %>% pull('Sides') + numFlankingGenes <- max(as.numeric(gsub("L|R","",aux))) * 2 + } + near.genes.df <- rbind(near.genes.df,data.table::rbindlist(near.genes.linked)) + distal.probe <- distal.probe[!names(distal.probe) %in% near.genes.df$Target,] # Remove probes already used + } + colnames(near.genes.df)[1] <- "Probe" + + # Add probe metadata to output + probes.ranges$Probe <- rownames(probes.ranges) + near.genes.df <- merge(near.genes.df, probes.ranges, by = "Probe") + return(near.genes.df) +} + + +# Reading homer output. For each reagion (rows) +# homer will try to find if a given motif was found in it. +# This will read this homer file and create a sparce matrix +# in which 1 means the motif was found and 0 not found. +# this is used to calculate the motif enrichement compared +# to a background signal. +getMatrix <- function(filename) { + motifs <- readr::read_tsv(filename) + # From 1 to 21 we have annotations + matrix <- Matrix::Matrix(0, nrow = nrow(motifs), ncol = ncol(motifs) - 21 ,sparse = TRUE) + colnames(matrix) <- gsub(" Distance From Peak\\(sequence,strand,conservation\\)","",colnames(motifs)[-c(1:21)]) + rownames(matrix) <- motifs$PeakID + matrix[!is.na(motifs[,-c(1:21)])] <- 1 + matrix <- as(matrix, "nsparseMatrix") + return(matrix) +} + +#' @title Calculate motif Erichment +#' @description Calculates fisher exact test +#' @param foreground A nsparseMatrix object in each 1 means the motif is found in a region, 0 not. +#' @param background A nsparseMatrix object in each 1 means the motif is found in a region, 0 not. +#' @export +#' @examples +#' foreground <- Matrix::Matrix(sample(0:1,size = 100,replace = TRUE), +#' nrow = 10, ncol = 10,sparse = TRUE) +#' rownames(foreground) <- paste0("region",1:10) +#' colnames(foreground) <- paste0("motif",1:10) +#' background <- Matrix::Matrix(sample(0:1,size = 100,replace = TRUE), +#' nrow = 10, ncol = 10,sparse = TRUE) +#' rownames(background) <- paste0("region",1:10) +#' colnames(background) <- paste0("motif",1:10) +#' calculateEnrichement(foreground,background) +calculateEnrichement <- function(foreground, + background){ + if(missing(foreground)) stop("foreground argument is missing") + if(missing(background)) stop("background argument is missing") + + # a is the number of probes within the selected probe set that contain one or more motif occurrences; + # b is the number of probes within the selected probe set that do not contain a motif occurrence; + # c and d are the same counts within the entire enhancer probe set (background) + # lower boundary of 95% conf idence interval = exp (ln OR - SD) + a <- Matrix::colSums(foreground) + b <- nrow(foreground) - Matrix::colSums(foreground) + c <- Matrix::colSums(background) + d <- nrow(background) - Matrix::colSums(background) + fisher <- plyr::adply(seq_len(length(a)),.margins = 1, .fun = function(i) { + x <- fisher.test(matrix(c(a[i],b[i],c[i],d[i]),nrow = 2,ncol = 2)) + ret <- data.frame(x$conf.int[1],x$conf.int[2],x$estimate,x$p.value) + colnames(ret) <- c("lowerOR","upperOR","OR","p.value") + ret + },.id = NULL,.progress = "text") + rownames(fisher) <- names(a) + Summary <- data.frame(motif = names(a), + NumOfRegions = Matrix::colSums(foreground, na.rm=TRUE), + fisher, + FDR = p.adjust(fisher$p.value,method = "BH"), + stringsAsFactors = FALSE) + Summary <- Summary[order(-Summary$lowerOR),] + return(Summary) +} + + +#' @title Use Hocomoco motif and homer to identify motifs in a given region +#' @param regions A GRanges object. Names will be used as the identifier. +#' @param output.filename Final file name +#' @param region.size If NULL the motif will be mapped to the region. If set a window around its center will be considered. +#' For example if region.size is 500, then +-250bp round it will be searched. +#' @param cores A interger which defines the number of cores to be used in parallel +#' process. Default is 1: no parallel process. +#' @param genome Homer genome (hg38, hg19) +#' @param nstep Number of regions to evaluate in homer, the bigger, more memory it will use at each step. +#' @description +#' To find for each probe the know motif we will use HOMER software (http://homer.salk.edu/homer/). +#' Homer and genome should be installed before this function is executed +#' Step: +#' 1 - get DNA methylation probes annotation with the regions +#' 2 - Make a bed file from it +#' 3 - Execute section: Finding Instance of Specific Motifs +#' from http://homer.salk.edu/homer/ngs/peakMotifs.html to the HOCOMOCO TF motifs +#' Also, As HOMER is using more RAM than the available we will split the files in to 100k probes. +#' Obs: for each probe we create a winddow of 500 bp (-size 500) around it. +#' This might lead to false positives, but will not have false negatives. +#' The false posives will be removed latter with some statistical tests. +#' @importFrom utils write.table +#' @examples +#' \dontrun{ +#' # use the center of the region and +-250bp around it +#' gr0 <- GRanges(Rle(c("chr2", "chr2", "chr1", "chr3"), +#' c(1, 3, 2, 4) +#' ), +#' IRanges(1:10, width=10:1) +#' ) +#' names(gr0) <- paste0("ID",c(1:10)) +#' findMotifRegion(regions = gr0, region.size = 500, genome = "hg38", cores = 1) +#' +#' # use the region size itself +#' gr1 <- GRanges(Rle(c("chr2", "chr2", "chr1", "chr3"), c(1, 3, 2, 4)), +#' IRanges(1:10, width=sample(200:1000,10))) +#' names(gr1) <- paste0("ID",c(1:10)) +#' findMotifRegion(regions = gr0, genome = "hg38", cores = 1) +#' } +findMotifRegion <- function(regions, + output.filename = "mapped_motifs_regions.txt", + region.size = NULL, + genome = "hg38", + nstep = 10000, + cores = 1){ + + if(!is(regions, "GRanges")) stop("Regions must be a Genomic Ranges object") + # get all hocomoco 11 motifs + TFBS.motif <- "http://hocomoco11.autosome.org/final_bundle/hocomoco11/full/HUMAN/mono/HOCOMOCOv11_full_HUMAN_mono_homer_format_0.0001.motif" + if(!file.exists(basename(TFBS.motif))) downloader::download(TFBS.motif,basename(TFBS.motif)) + + + if(is.null(names(regions))){ + names(regions) <- tibble::as_tibble(regions) %>% tidyr::unite(col = "names","seqnames","start","end") %>% pull(names) + } + df <- data.frame(seqnames = seqnames(regions), + starts = as.integer(start(regions)), + ends = end(regions), + names = names(regions), + scores = c(rep(".", length(regions))), + strands = strand(regions)) + n <- nrow(df) + step <- ifelse(n > nstep, nstep, n) + + if(!file.exists(output.filename)){ + pb <- txtProgressBar(max = floor(n/step), style = 3) + + for(j in 0:floor(n/step)){ + setTxtProgressBar(pb, j) + # STEP 2 + file.aux <- paste0(gsub(".txt","",output.filename),"_",j,".bed") + if(!file.exists(gsub(".bed",".txt",file.aux))){ + end <- ifelse(((j + 1) * step) > n, n,((j + 1) * step)) + write.table(df[((j * step) + 1):end,], file = file.aux, col.names = F, quote = F,row.names = F,sep = "\t") + + # STEP 3 use -mscore to get scores + # we need to check if annotatePeaks.pl is available! + cmd <- paste("annotatePeaks.pl" ,file.aux, genome, "-m", basename(TFBS.motif), + ifelse(is.null(region.size),"",paste0("-size ", region.size)), + "-cpu", cores, + ">", gsub(".bed",".txt",file.aux)) + error <- system(cmd) + if(error == 127) { + unlink(file.aux) + unlink(gsub(".bed",".txt",file.aux)) + stop(paste0("annotatePeaks.pl had an error. Please check homer install.", + "\nIf already installed be sure R can see it.", + "You can change PATH evn variable with ", + "\nSys.setenv(PATH=paste(Sys.getenv('PATH'), '/the/bin/folder/of/bedtools', sep=':'))") + ) + } + } + } + close(pb) + } + # We will merge the results from each file into one + peaks <- NULL + pb <- txtProgressBar(max = floor(n/step), style = 3) + for(j in 0:floor(n/step)){ + setTxtProgressBar(pb, j) + f <- paste0(gsub(".txt","",output.filename),"_",j,".txt") + if(file.exists(f)){ + aux <- readr::read_tsv(f) + colnames(aux)[1] <- "PeakID" + if(is.null(peaks)) { + peaks <- aux + } else { + peaks <- rbind(peaks, aux) + } + } + gc() + } + close(pb) + print(paste0("Writing file: ",output.filename)) + if(!is.null(peaks)) { + readr::write_tsv(peaks, path = output.filename,col_names = TRUE) + } + print("DONE!") +} + + +#' @title Get MR TF binding regions infered by ELMER +#' @description Saves a bed file with the unmethylated probes (+-250bp) regions that was infered +#' to be bound by a given TF +#' @param tf TF name +#' @param results.dir path to the directory with the results +#' (i.e. analysis/unsupervised/definition-Primary.solid.Tumor_vs_Solid.Tissue.Normal/hypo/) +#' @param genome Human genome (hg38, hg19) +#' @param met.platform DNA Methylation Array platform (EPIC, 450K) +#' @importFrom readr read_csv +#' @importFrom dplyr %>% mutate +#' @importFrom tidyr unnest +#' @importFrom GenomicRanges resize +#' @examples +#' \dontrun{ +#' getTFBindingSites("HNF1A", +#' results.dir = "analysis/unsupervised/group-Tumor_vs_Normal/hypo/") +#' } +#' @export +getTFBindingSites <- function(tf = NULL, + results.dir = NULL, + genome = "hg38", + met.platform = "450K"){ + + if(is.null(tf)) stop("Please set a tf to be searched") + + tf.file <- dir(path = results.dir, + pattern = "*.significant.TFs.with.motif.summary.csv", + recursive = T, + full.names = T, + all.files = T) + if(length(tf.file) == 0) stop("No TF results file found") + tf.tab <- readr::read_csv(tf.file,col_types = readr::cols()) %>% na.omit + + pair.file <- dir(path = results.dir, + pattern = "*.pairs.significant.withmotif.csv", + recursive = T, + full.names = T, + all.files = T) + if(length(pair.file) == 0) stop("No pair results file found") + pair.tab <- readr::read_csv(pair.file,col_types = readr::cols()) + + + # for each enriched motif find the one with the TF in the classification (family of subfamily within the 5%) + for(classification in c("family","subfamily")){ + if(classification == "family"){ + tf.tab <- tf.tab %>% + mutate(tf.target = strsplit(as.character(tf.tab$potential.TF.family), ";")) %>% + unnest(cols = "tf.target") + } else { + tf.tab <- tf.tab %>% + mutate(tf.target = strsplit(as.character(tf.tab$potential.TF.subfamily), ";")) %>% + unnest(cols = "tf.target") + } + + if( !tf %in% tf.tab$tf.target) stop("TF not found") + motif <- tf.tab %>% filter(tf.tab$tf.target == tf) %>% pull('motif') + + # For each enriched motif find in each paired probes it appers + pair.tab <- pair.tab %>% + mutate(motif.target = strsplit(as.character(pair.tab$enriched_motifs), ";")) %>% + unnest(cols = "motif.target") + probes <- pair.tab %>% filter(pair.tab$motif.target %in% motif) %>% pull('Probe') %>% unique + + # for each probe get region and write bed file + metadata <- getInfiniumAnnotation(plat = met.platform, genome = genome) + metadata <- metadata[probes,] + metadata <- resize(metadata, width = 500, fix = 'center') + file.out <- file.path(results.dir, paste0(tf, "_",classification,".bed")) + message("Saving as ", file.out) + rtracklayer::export.bed(metadata,con = file.out) + } + +} diff --git a/R/SmallFunctions.R b/R/SmallFunctions.R deleted file mode 100755 index 6d52ae55..00000000 --- a/R/SmallFunctions.R +++ /dev/null @@ -1,707 +0,0 @@ -# #extract probe bed file: need to write code for this later. out put a bed format file. Next step can direct load bed file as Granges. -# -# #2. Form matrix for feature and probe overlap -# # 1 means overlap, 0 means no overlap -# # target_region and bioFeature_fl need to be bed file format if it is character. If it is already loaded file, it need to be Grange format -# -# -# -# -# feature_overlap <- function (target_region, bioFeature_fl,target.size=c(0,0),bio.size=c(400,400),col.names=NULL, row.names=NULL){ # bioFeature_fl : list of pathes of a banch of files that will otherlap with target region. -# require(GenomicRanges) -# n_biofeature <- length (bioFeature_fl) -# -# #####load target region information -# if (mode(target_region)== "character") RegionInfo <- ReadBed(target_region) -# start(RegionInfo) <- start(RegionInfo)-target.size[1] -# end(RegionInfo) <- end(RegionInfo)+target.size[2] -# #####load biofeature information and form biofeature matrix -# -# biofeature_matrix = c() -# colName <- c() -# for (i in 1:n_biofeature){ -# if (any(mode(bioFeature_fl)== "character")){ -# if(file.info(bioFeature_fl[i])$size==0){ -# print (paste0("Error: no lines available in input",bioFeature_fl[i])) #read.table can't read 0 size input. -# }else{ -# file_regionINFO = ReadBed(bioFeature_fl[i]) -# start(file_regionINFO) <- start(file_regionINFO)-bio.size[1] -# end(file_regionINFO) <- end(file_regionINFO)+bio.size[2] -# overlap_vector = RegionInfo %over% file_regionINFO -# biofeature_num = sum(file_regionINFO %over% RegionInfo) -# biofeature_matrix = cbind(biofeature_matrix,overlap_vector) -# print(paste0("reading and overlap ", bioFeature_fl[i])) -# bn <- basename(bioFeature_fl[i]) -# bn<-sub(".bed","",bn) -# colName <- c(colName,bn) -# } -# }else{ -# start(file_regionINFO) <- start(file_regionINFO)-bio.size[1] -# end(file_regionINFO) <- end(file_regionINFO)+bio.size[2] -# overlap_vector = RegionInfo %over% file_regionINFO -# biofeature_num = sum(file_regionINFO %over% RegionInfo) -# biofeature_matrix = cbind(biofeature_matrix,overlap_vector) -# print(paste0("reading and overlap ", bioFeature_fl[i])) -# colName <- c(colName,paste0(bioFeature_fl[i],"_",biofeature_num)) -# } -# -# } -# -# biofeature_matrix[biofeature_matrix==FALSE]<- 0 -# biofeature_matrix[biofeature_matrix==TRUE]<- 1 -# -# ####merge the target region with the biofeature matrix -# -# colnames(biofeature_matrix) <- colName -# rownames(biofeature_matrix) <- RegionInfo$name -# final_file = as.data.frame (final_file) # it is better to use dataframe, it has dim. the list don't have dim -# return (final_file) -# } - -# -# #------------------------------MergeBioFeature function---------------------- -# #main is the main manifest -# #custom is list of data which will be merged into main manifest -# #by is specify the column to use to merge the files, default is the row.names. -# MergeBioFeature <- function(main, custom=list(),by.main="row.names",by.custom=c()){ -# if(!is.data.frame(main)){ -# main = as.data.frame (main) -# } -# -# for (i in 1:length(custom)){ -# oldn<-nrow(main) -# main<-merge(main,custom[[i]],by.x=by.main,by.y=by.custom[i],sort=TRUE) -# if(by.main == "row.names" && by.custom[i]=="row.names"&& i < length(custom)){ -# rownames(main)<-main$Row.names # I don't know why merge doesn't do this by default -# main$Row.names<-NULL -# } -# print(sprintf("Merge of main file (%d rows) and extra cols (%d rows) yielded %d rows\n",oldn,nrow(custom[[i]]),nrow(main))) -# } -# -# return(main) -# } - - -#-------------------------------repliAnalysis function(from Ben)--------------------- - -repliAnalysis<-function() { - - repliFns <- dir(dir1,pattern="Rep1.bed$",full.names=TRUE) - print(paste0("Found ",length(repliFns)," repli files")) - - round <- 1 - - colName <- c("id") # record the column names - out <- c() # output - - for (repliFn in repliFns) - { - - bn <- basename(repliFn) - bn<-sub("HM450-wgEncodeUwRepliSeq","",bn) - bn<-sub(".bed","",bn) - bn<-sub("WaveSignalRep1","",bn) - colName <- c(colName,bn) - - repli<-read.table(repliFn,sep="\t") - if(round==1){ - id <- repli[,4] - out <- as.character(id) - }else{ - id.new <- repli[,4] - stopifnot(length(id)==length(id.new) && all(as.character(id)==as.character(id.new))) - } - print(paste0("RepliAnalysis saw file: ",repliFn)) - out<-cbind(out, repli[,5]) - print(paste0("output has ",ncol(out)-1," repli cell type: ",bn)) - round <- round + 1 - } - - colnames(out)<-colName - return(out) -} - -#-----------------------combineMethysets function---------------------------- -#ProbesToUse is probes names vector to indicate which should to use -# CancerTypeToDo indices are from the array of tumor types -#SampleToUse : if it is null , it will output all sampletypes, if it is TN, it means Tumor and tissue normal sample. -combineMethysets <- function(CancerTypeToDo=NULL,Probes=NULL, SampleToUse=NULL,panCan=panCan) -{ - load(file11) - if(is.null(Probes)) { - Probes <- read.table(pipe(sprintf("cut -f4 %s",file2.2)),colClasses = "character")[,1] - } - if(!is.null(CancerTypeToDo)){ - if(all(CancerTypeToDo %in% c("BLCA","BRCA","COAD","GBM","HNSC","KIRC","LAML","LUAD","LUSC","READ","UCEC"))){ - CancerTypeToDo <- c(1:11)[c("BLCA","BRCA","COAD","GBM","HNSC","KIRC","LAML","LUAD","LUSC","READ","UCEC") %in% CancerTypeToDo] - } - subAnnot <- subAnnot[subAnnot$CT %in% CancerTypeToDo,] - } - if (SampleToUse=="TN"){ - subAnnot <- subAnnot[subAnnot$TN.cat %in% "Tumor" |subAnnot$TN.cat %in% "Normal",] - }else if(SampleToUse=="T"){ - subAnnot <- subAnnot[subAnnot$TN.cat %in% "Tumor",] - }else if(SampleToUse=="N"){ - subAnnot <- subAnnot[subAnnot$TN.cat %in% "Normal",] - } - betas <- panCan[Probes,subAnnot$sampleNames] - final <-list(betas=betas,Info=subAnnot) - rm(subAnnot) - return(final) -} - -#---------------return sample type function-------------------- ------- -# Returns type: tumor, bloodNormal, tissueNormal -#according to the sampletypes -# Returns type: tumor, bloodNormal, tissueNormal -tcgaSampleType <- function(x) -{ - # "Code","Definition","Short Letter Code" - # "01","Primary solid Tumor","TP" - # "02","Recurrent Solid Tumor","TR" - # "03","Primary Blood Derived Cancer - Peripheral Blood","TB" - # "04","Recurrent Blood Derived Cancer - Bone Marrow","TRBM" - # "05","Additional - New Primary","TAP" - # "06","Metastatic","TM" - # "07","Additional Metastatic","TAM" - # "08","Human Tumor Original Cells","THOC" - # "09","Primary Blood Derived Cancer - Bone Marrow","TBM" - # "10","Blood Derived Normal","NB" - # "11","Solid Tissue Normal","NT" - # "12","Buccal Cell Normal","NBC" - # "13","EBV Immortalized Normal","NEBV" - # "14","Bone Marrow Normal","NBM" - # "20","Control Analyte","CELLC" - # "40","Recurrent Blood Derived Cancer - Peripheral Blood","TRB" - # "50","Cell Lines","CELL" - # "60","Primary Xenograft Tissue","XP" - # "61","Cell Line Derived Xenograft Tissue","XCL" - - - code<-substring(x,14,15) - if (code < 10) - { - out <- "tumor" - } - - else if ((code == 10) || (code == 11) || (code == 13) || (code == 14)) - { - out <- "tissueNormal" - } - else if (code==20) - { - out <- "Control" - } - else if (code==40) - { - out <- "TRB" - } - else if (code==50) - { - out <- "cell_line" - } - else if (code==60) - { - out <- "XP" - } - else if (code==61) - { - out <- "XCL" - } - else - { - out <- "other" - } - - return(out) - -} - - - -normalTissues<-function(x) -{ - #return(grep("Normal",tcga$Pheno$histology,ignore.case=TRUE)) - a<-apply(matrix(x,nrow=1),1,tcgaSampleType) - return(a == "Normal") -} - -tumorTissues<-function(x) -{ - #return(grep("Carcinoma",tcga$Pheno$histology,ignore.case=TRUE)) - a<-apply(matrix(x,nrow=1),1,tcgaSampleType) - return(a == "Tumor") -} - -analysisSamples<-function(x) -{ - return(normalTissues(x) | tumorTissues(x)) -} - -##--------------standarlize ID-------------------------------------------------- -#standardize IDs -standardizeTcgaId<-function(tcgaId) -{ - # This only differentiates tumors (-0) from normals (-1). Why are some "control analytes" (-20) - out<-substring(tcgaId,1,15) - return(out) -} - -standardizeTcgaIds<-function(ids) -{ - #print(paste0("Length of ids=",length(ids))) - require("snow") - pthreads <- detectCores()-3 - cl <- makeCluster(pthreads,type="SOCK") - out <- parSapplyLB(cl,as.character(ids),standardizeTcgaId) - stopCluster(cl) - names(out) <- NULL - return(out) -} - -##------------mutation Info---------------------------------------------------------- -#SampleSet is the sample barcode with proper order -GetMutation<-function(mutation=file6,SampleSet=NULL,MutNames=NULL) -{ - require(snow) - newenv <- new.env() - load(mutation, envir=newenv) - Mut <- get(ls(newenv)[1],envir=newenv) # The data is in the one and only variable - Mut$Tumor_Sample_Barcode <- standardizeTcgaIds(Mut$Tumor_Sample_Barcode) - Mut$Matched_Norm_Sample_Barcode <- standardizeTcgaIds(Mut$Matched_Norm_Sample_Barcode) - Mut <- Mut[,c(1,16,9,10)] - SampelOverlap <- Mut$Tumor_Sample_Barcode %in% SampleSet - print (paste0("The number of the smaple that has mutation data is ",sum(SampleSet %in% Mut$Tumor_Sample_Barcode))) - if(is.null(MutNames)){ - MutNames <- names(sort(table(Mut[ Mut$Tumor_Sample_Barcode %in% SampleSet,1])[table(Mut[ Mut$Tumor_Sample_Barcode %in% SampleSet,1])>0])) - }else{ - MutNames <- MutNames - } - - # Form the matrix of the mutation - pthreads <- detectCores()/2 - cl <- makeCluster(pthreads,type="SOCK") - out <- parLapplyLB(cl,MutNames,Mutvector,SampleSet,Mut) - stopCluster(cl) - out <- do.call(cbind,out) - colnames(out) <- paste0(MutNames,"mut") - rownames(out) <- SampleSet - return(out); -} - -Mutvector <- function(MutName,SampleSet,Mut){ - tmp <- Mut[as.character(Mut$Hugo_Symbol) %in% MutName,] - out<- as.character(tmp$Variant_Classification[match(SampleSet,tmp$Tumor_Sample_Barcode)]) - return(out) -} - - -##-----------RNA-seq----------------------------------------------------------------- - -#Gene make rowname separat ------------------------------------- -GeneIDName <- function(x){ - tmp<-strsplit(rownames(x),"\\|") - GeneID<-unlist(lapply(tmp,function(x) x[2])) - GeneID <- paste0("ID",GeneID) - row.names(x) <- GeneID - return(x) -} - - -##-------------Get gene symbol through ID------------------------------- -.GENEID2Symbol <- function(x,revert=FALSE){ - load("/export/uec-gs1/laird/users/lijingya/data/methylation/TCGA/array/450K/INFO/UCSC.hg19.knownGene_TSS_gene.rda") - TxDbTSS$GENEID <- paste0("ID",TxDbTSS$GENEID) - Pairs <- unique(data.frame(GENEID=TxDbTSS$GENEID,Symbol= TxDbTSS$SYMBOL)) - Pairs[,1] <- as.character(Pairs[,1]) - Pairs[,2] <- as.character(Pairs[,2]) - if(revert){ - out <- Pairs[match(x,Pairs$Symbol), "GENEID"] - out <- x[is.na(out)] - }else{ - out <- Pairs[match(x,Pairs$GENEID), "Symbol"] - out[is.na(out)] <- x[is.na(out)] - } - return(out) -} - -##-------------make TxDBTSS file------------------------------------- -MakeTxDbTSS <- function(){ - library(Homo.sapiens) - keytypes(Homo.sapiens) - txs <- transcriptsBy(Homo.sapiens, 'gene', col=c('GENEID','SYMBOL')) - txs <- unlist(txs) - TxDBTSS <- promoters(txs,0,0) - save(TxDBTSS,file="/export/uec-gs1/laird/users/lijingya/data/methylation/TCGA/array/450K/INFO/UCSC.hg19.knownGene_TSS_gene.rda") - #in pipeline it should be storage to somewhere. -} - - -# ##------------z score calculation------------------------------------ -# # reduce memory occupy I should remove Exps and Meths. -# #By: "gene" or "probe" which must be one. -# .Stat.zscore <- function(Probe,Gene,K,Top=NULL){ -# if(!(length(Gene)==1 & length(Probe)==1)) {stop("Number of Gene ID or Probe should be 1")} -# Exp <- as.matrix(Exps[Gene,])[1,] -# Meth <- Meths[Probe,] -# Meth_B <- Binary(Meth,Break=K) -# zscore <- c() -# if(sum(Exp)==0){ -# zscore <- NA -# }else{ -# df <- data.frame(Exp=Exp,Meth=Meth,Meth_B=Meth_B) -# rownames(df) <- names(Meth) -# unmethyPercent <- sum(df$Meth_B==0,na.rm=T)/dim(df)[1] -# methyPercent <- sum(df$Meth_B==1,na.rm=T)/dim(df)[1] -# if(unmethyPercent < 0.05 | methyPercent < 0.05){ -# zscore <- NA -# }else{ -# unmethy <- rownames(df[order(df$Meth),])[1:round(nrow(df)*Top)] -# methy <- rownames(df[order(df$Meth,decreasing=T),])[1:round(nrow(df)*Top)] -# zscore <- (mean(df[unmethy,"Exp"],na.rm=T)-mean(df[methy,"Exp"],na.rm=T))/sd(df[methy,"Exp"],na.rm=T) -# } -# out <- c(Gene, Probe,zscore) -# } -# return(out) -# } -# -# .Stat.zscore2 <- function(Probe,Gene,K,Top=NULL){ -# if(! length(Probe)==1) {stop("Number of Probe should be 1")} -# Exp <- as.matrix(Exps[Gene,]) -# Meth <- Meths[Probe,] -# Meth_B <- Binary(Meth,Break=K) -# zscore <- c() -# if(sum(Exp)==0){ -# zscore <- NA -# }else{ -# unmethyPercent <- sum(Meth_B==0,na.rm=T)/length(Meth_B) -# methyPercent <- sum(Meth_B==1,na.rm=T)/length(Meth_B) -# if(unmethyPercent < 0.05 | methyPercent < 0.05){ -# zscore <- NA -# }else{ -# unmethy <- names(Meth[order(Meth)])[1:round(length(Meth)*Top)] -# methy <- names(Meth[order(Meth,decreasing=T)])[1:round(length(Meth)*Top)] -# zscore <- apply(Exp,1,function(x) {(mean(x[unmethy],na.rm=T)-mean(x[methy],na.rm=T))/sd(x[methy],na.rm=T)}) -# } -# out <- cbind(Probe=rep(Probe,length(Gene)),Gene,zscore) -# } -# return(out) -# } -# -# .Stat.zscore3 <- function(Probe,NearGenes,K,Top=NULL){ -# if(! length(Probe)==1) {stop("Number of Probe should be 1")} -# Gene <- NearGenes[[Probe]][,2] -# Exp <- as.matrix(Exps[Gene,]) -# Meth <- Meths[Probe,] -# Meth_B <- Binary(Meth,Break=K) -# zscore <- c() -# if(sum(Exp)==0){ -# zscore <- NA -# }else{ -# unmethyPercent <- sum(Meth_B==0,na.rm=T)/length(Meth_B) -# methyPercent <- sum(Meth_B==1,na.rm=T)/length(Meth_B) -# if(unmethyPercent < 0.05 | methyPercent < 0.05){ -# zscore <- NA -# }else{ -# unmethy <- names(Meth[order(Meth)])[1:round(length(Meth)*Top)] -# methy <- names(Meth[order(Meth,decreasing=T)])[1:round(length(Meth)*Top)] -# zscore <- apply(Exp,1,function(x) {(mean(x[unmethy],na.rm=T)-mean(x[methy],na.rm=T))/sd(x[methy],na.rm=T)}) -# } -# out <- cbind(Probe=rep(Probe,length(Gene)),Gene,zscore) -# } -# return(out) -# } -# -# -# -# -# .Stat.corr2 <- function(Probe,Gene,K,Top=NULL,method="pearson"){ -# if(! length(Probe)==1) {stop("Number of Probe should be 1")} -# Exp <- as.matrix(Exps[Gene,]) -# Meth <- Meths[Probe,] -# Meth_B <- Binary(Meth,Break=K) -# corr <- c() -# if(sum(Exp)==0){ -# corr <- NA -# }else{ -# unmethyPercent <- sum(Meth_B==0,na.rm=T)/length(Meth_B) -# methyPercent <- sum(Meth_B==1,na.rm=T)/length(Meth_B) -# if(unmethyPercent < 0.05 | methyPercent < 0.05){ -# corr <- NA -# }else{ -# corr <- apply(Exp,1,function(x,Meth) {cor(x,Meth,use="complete.obs",method = method)},Meth=Meth) -# } -# out <- cbind(Probe=rep(Probe,length(Gene)),Gene,corr) -# } -# return(out) -# } -# -# .Stat.corr3 <- function(Probe,NearGenes,K,Top=NULL,method="pearson"){ -# if(! length(Probe)==1) {stop("Number of Probe should be 1")} -# Gene <- NearGenes[[Probe]][,2] -# Exp <- as.matrix(Exps[Gene,]) -# Meth <- Meths[Probe,] -# Meth_B <- Binary(Meth,Break=K) -# corr <- c() -# if(sum(Exp)==0){ -# corr <- NA -# }else{ -# unmethyPercent <- sum(Meth_B==0,na.rm=T)/length(Meth_B) -# methyPercent <- sum(Meth_B==1,na.rm=T)/length(Meth_B) -# if(unmethyPercent < 0.05 | methyPercent < 0.05){ -# corr <- NA -# }else{ -# corr <- apply(Exp,1,function(x,Meth) {cor(x,Meth,use="complete.obs",method = method)},Meth=Meth) -# } -# out <- cbind(Probe=rep(Probe,length(Gene)),Gene,corr) -# } -# return(out) -# } - - -#---probes name index------------------------------------ -# calculate Pvalue -Get.Pvalue <- function(zscore.Matrix,permu){ - .Pvalue <- function(x,permu,target.Matrix){ - zscore <- target.Matrix[x,3] - Gene <- target.Matrix[x,"Gene"] - if(is.na(zscore)){ - out <- NA - print("NA") - }else{ - out <- sum(permu[Gene,] > zscore | permu[Gene,] == zscore,na.rm=T)/sum(!is.na(permu[Gene,])) - } -# else if(zscore <= 0){ -# out <- sum(permu[Gene,] < zscore | permu[Gene,] == zscore, na.rm=T)/length(permu[Gene,]) - return(out) - } - Pvalue <- apply(matrix(1:nrow(zscore.Matrix),ncol=1),1,.Pvalue,target.Matrix=zscore.Matrix,permu=permu) - Pvalue <- unlist(Pvalue) - Output <- cbind(zscore.Matrix,Pvalue) - return(Output) -} - - - -Get.Pvalue.c <- function(zscore.Matrix,permu){ - .Pvalue <- function(x,permu,target.Matrix){ - zscore <- target.Matrix[x,3] - Gene <- target.Matrix[x,"Gene"] - if(is.na(zscore)){ - out <- NA - #print("NA") - }else{ - out <- sum(permu[Gene,] < zscore | permu[Gene,] == zscore,na.rm=T)/sum(!is.na(permu[Gene,])) - } - # else if(zscore <= 0){ - # out <- sum(permu[Gene,] < zscore | permu[Gene,] == zscore, na.rm=T)/length(permu[Gene,]) - return(out) - } - Pvalue <- apply(matrix(1:nrow(zscore.Matrix),ncol=1),1,.Pvalue,target.Matrix=zscore.Matrix,permu=permu) - Pvalue <- unlist(Pvalue) - Output <- cbind(zscore.Matrix,Pvalue) - return(Output) -} -z.test = function(a, mu, var){ - zeta = (mean(a) - mu) / (sqrt(var / length(a))) - return(zeta) -} - -CNV_forgene <- function(wd,CT,Type){ - files <- dir(wd) - CT <- toupper(CT) - if(Type %in% "thresholded"){ - File <- paste0(wd,"/",CT,".gistic.all_thresholded.by_genes") - }else{ - File <- paste0(wd,"/",CT,".gistic.all_data_by_genes") - } - CNV_Gene <- read.table(File,header= T,sep="\t",stringsAsFactors=F) - colnames(CNV_Gene) <- gsub("\\.", "-", colnames(CNV_Gene)) - rownames(CNV_Gene) <- CNV_Gene[,1] - CNV_Gene <- CNV_Gene[,-c(1:3)] - return(CNV_Gene) -} - -##CordinateToRange -CordinateToRange <- function(Cordinate,core=detectCores()/2){ - options('mc.cores'=core) - chr <- unlist(mclapply(Cordinate,function(x){strsplit(x,"\\:")[[1]][1]})) - position <- sub("chr.*:","",Cordinate) - startAend <- matrix(unlist(mclapply(position,function(x){strsplit(x,"\\-")})),ncol=2,byrow=T) - class(startAend) <- "numeric" - Bed<-GRanges(chr, IRanges(startAend[,1], startAend[,2]) ) - return(Bed) -} - - -#---------------------------------------lsos Function--------- -# improved list of objects -.ls.objects <- function (pos = 1, pattern, order.by, - decreasing=FALSE, head=FALSE, n=5) { - napply <- function(names, fn) sapply(names, function(x) - fn(get(x, pos = pos))) - names <- ls(pos = pos, pattern = pattern) - obj.class <- napply(names, function(x) as.character(class(x))[1]) - obj.mode <- napply(names, mode) - obj.type <- ifelse(is.na(obj.class), obj.mode, obj.class) - obj.prettysize <- napply(names, function(x) { - capture.output(print(object.size(x), units = "auto")) }) - obj.size <- napply(names, object.size) - obj.dim <- t(napply(names, function(x) - as.numeric(dim(x))[1:2])) - vec <- is.na(obj.dim)[, 1] & (obj.type != "function") - obj.dim[vec, 1] <- napply(names, length)[vec] - out <- data.frame(obj.type, obj.size, obj.prettysize, obj.dim) - names(out) <- c("Type", "Size", "PrettySize", "Rows", "Columns") - if (!missing(order.by)) - out <- out[order(out[[order.by]], decreasing=decreasing), ] - if (head) - out <- head(out, n) - out -} - -# shorthand -lsos <- function(..., n=10) { - .ls.objects(..., order.by="Size", decreasing=TRUE, head=TRUE, n=n) -} - -MakeHeatmap <- function(sub.probes,METH,CT,K=0.3,title=NULL){ - # set colors -------------------------------------------------------------- - jet.colors <- - colorRampPalette(c("#00007F", "blue", "#007FFF", "cyan", - "#7FFF7F", "yellow", "#FF7F00", "red", "#7F0000")) - redGreen <- colorRampPalette(c("green","black","red")) - library(RColorBrewer) - GreyCol <- colorRampPalette(brewer.pal(9,"Greys")) - # load data --------------------------------------------------------------- - ## load REMC barcode - #CT specific barcode - REMC_Barcode<- c("GI","GI","BRST","BRN","THYM","KID","BLD","LNG","LNG","GI","OVRY") - - REMC_ID <- read.csv("/export/uec-gs1/laird/users/lijingya/data/REMC/FinalFiles_Mar2014/barcode.csv",stringsAsFactors=F) - relativeID <- unique(unlist(mclapply(REMC_Barcode[CT],function(x){ REMC_ID$Epigenome.Mnemonic[grepl( x, REMC_ID$Epigenome.Mnemonic)]}))) - ESC <- REMC_ID[REMC_ID$GROUP %in% "ESC" | REMC_ID$Epigenome.Mnemonic %in% relativeID,] - # load probe info - tmp <- as.matrix(read.csv("/export/uec-gs1/laird/users/lijingya/analysis/methylation/TCGA/450Kmethyl/Results/2014-04-23_RMEC_Phantom5/Probes_Phantom_REMC_TSS_CGI_info_hg19.csv",nrow=1,header=F))[1,] - - Cols <- paste(c(1,2,3,909:920,which(tmp %in% c("E119",ESC$NEW.EID))),collapse=',') - Probes <- read.csv( - pipe(sprintf('cut -f %s -d "," /export/uec-gs1/laird/users/lijingya/analysis/methylation/TCGA/450Kmethyl/Results/2014-04-23_RMEC_Phantom5/Probes_Phantom_REMC_TSS_CGI_info_hg19.csv',Cols)), - stringsAsFactors=F) - Probes <- Probes[rowSums(Probes[,c("EnhG1","EnhG2","EnhA1","EnhA2")])>0,] - ESC <- ESC[ESC$NEW.EID %in% colnames(Probes),] - Probes <- Probes[match(sub.probes,Probes$Probes),] - - ## read buffy coat data - Buffy <- read.table("/home/rcf-40/lijingya/lijingya/data/methylation/TCGA/array/450K/INFO/1074_Buffy\ Coat\ corrected\ beta\ values.txt",skip=5,stringsAsFactors=F,header=F) - Buffy <- as.matrix(Buffy) - rownames(Buffy)<- Buffy[,1] - Buffy <- Buffy[,-1] - class(Buffy) <- "numeric" - Buffy <- Buffy[sub.probes,] - - ## fimo motif - ##fimo motif search - load("/export/uec-gs1/laird/users/lijingya/analysis/methylation/TCGA/450Kmethyl/Results/2014-05-06-BCRAexample/exploreClustering/Probesall.TF.matrix.rda") - - ## REMC WGBS - load("/export/uec-gs1/laird/users/lijingya/data/methylation/TCGA/array/450K/INFO/Probes_REMC_WGBS.rda") - - #load lower.bound - load(sprintf("../CT_%d_low.assign_SigHypo.rda",CT)) - - sub.METH <- METH$betas[sub.probes,] - sub.Info <- METH$Info - sub.METH.normal <- sub.METH[,sub.Info$TN.cat %in% "Normal"] - sub.METH.tumor <- sub.METH[,sub.Info$TN.cat %in% "Tumor"] - sub.Info <- sub.Info[sub.Info$TN.cat %in% "Tumor",] - sub.normal <- Binary(sub.METH.normal,Break=K) - sub.tumor <- Binary(sub.METH.tumor,Break=K) - system.time(cluster_T <- cluster.main(sub.tumor,Colv=T,Rowv=T,hclustMethod="ward",distMethod="binary")) - cluster_N <- cluster.main(sub.normal,Colv=T,Rowv=F,hclustMethod="ward") - cluster_T$x <- sub.METH.tumor - cluster_N$x <- sub.METH.normal - cluster_N$rowInd <- cluster_T$rowInd - ## - sub.CGI <- Probes[match(sub.probes,Probes$Probes),"CGI"] - sub.buffy <- Buffy[rownames(sub.METH),] - sub.segmentation <- Probes[match(sub.probes,Probes$Probes),ESC$NEW.EID] - tmp <- ChrToColor(sub.segmentation) - Leve <- tmp$levels - sub.segmentation <- tmp$x - sub.REMC.WGBS <- Probes.REMC.WGBS[rownames(sub.METH),] - colnames(sub.REMC.WGBS) <- REMC_ID[match(colnames(sub.REMC.WGBS),REMC_ID$NEW.EID),3] - sub.TF.matrix <- Probes.TF[sub.probes,] - sub.enrich <- unlist(mclapply(1:ncol(sub.TF.matrix),function(x) {binom.test(colSums(sub.TF.matrix)[x], nrow(sub.TF.matrix), p = colMeans(Probes.TF)[x],alternative = "greater")$p.value},mc.cores=6)) - sub.TF.matrix <- sub.TF.matrix[,names(sort(-log10(sub.enrich),decreasing=T))[1:20]] - sub.probes <- rownames(sub.METH.tumor)[unlist(apply(sub.METH.tumor,1,function(x){ all(!is.na(x))}))] - Mean.TF.sample <- t(sub.METH.tumor[sub.probes,]) %*% sub.TF.matrix[sub.probes,] / matrix(rep(colSums(sub.TF.matrix[sub.probes,]),ncol(sub.METH.tumor)),nrow=ncol(sub.METH.tumor),byrow=T) - TF.cluster <- cluster.main(sub.TF.matrix,Colv=T,Rowv=F,hclustMethod="ward") - sub.TF.matrix <- sub.TF.matrix[,TF.cluster$colInd] - ImpInfo <- c("VHLmeth" ,"CDKN2Ameth" ,"BRCAmeth", "MLH1meth","VHL", "BRCA1","BRCA2", "CDKN2A","TP53","ARID1A","MLL2","SETD2","MLL3","CTCF", "KDM6A","MLH1","ATRX","CREBBP","EP300","DNMT3A", - "TET2","TET1","IDH1","IDH2","KDM3A","KDM5C","MLL4","MLL","ASXL1","EZH2","MECOM","DAXX","IDH1.R132","IDH2.R172","IDH2.R140","BRAFV600E","KRAS") - sub.ImpInfo <- sub.Info[,ImpInfo] - sub.ImpInfo <- do.call(cbind,mclapply(colnames(sub.ImpInfo),function(x,sub.ImpInfo){ as.numeric(sub.ImpInfo[,x])},sub.ImpInfo=sub.ImpInfo,mc.cores=6)) - sub.ImpInfo[is.na(sub.ImpInfo)] <- 0 - colnames(sub.ImpInfo) <- ImpInfo - if(paste(CT,collapse="_") %in% "2"){ - sub.pam <- as.numeric(sub.Info$pam50) - sub.pam[is.na(sub.pam)] <- -1 - } - if(length(CT) > 1){ - sub.pam <- sub.Info$CT - } - png(paste0("CT",paste(CT,collapse="_"),"_",title,"_probes",dim(sub.tumor)[1],".png"), width = 1600, height = 1500) - if(paste(CT,collapse="_") %in%"2" | length(CT) > 1){ - layout(matrix(data=c(0,16,17,17,17,17,0, - 10,9,0,0,0,0,0, - 8,7,0,0,0,0,0, - 6,5,14,14,14,14,0, - 4,3,0,0,0,0,0, - 2,1,11,12,13,15,18, - 0,19,0,0,0,0,0, - 0,20,0,0,0,0,0), nrow=8,ncol=7,byrow=TRUE),widths=c(5*(ncol(sub.normal)/ncol(sub.tumor)),5,0.2,0.4,2,2.8,2.8),heights=c(0.2,0.2,0.2,0.2,0.2,4,1.8,1.8)) - }else{ - layout(matrix(data=c(10,9,0,0,0,0,0, - 8,7,0,0,0,0,0, - 6,5,14,14,14,14,0, - 4,3,0,0,0,0,0, - 2,1,11,12,13,15,16, - 0,17,0,0,0,0,0, - 0,18,0,0,0,0,0), nrow=7,ncol=7,byrow=TRUE),widths=c(5*(ncol(sub.normal)/ncol(sub.tumor)),5,0.2,0.4,2,2.8,2.8),heights=c(0.2,0.2,0.2,0.2,4,1.8,1.8)) - } - - - heatmap.main(cluster_T,col=jet.colors(255),margin=c(1,1,0.5,1),cexRow = 1.5, cexCol = 2, - zlim=c(0,1),nonlab=T) - heatmap.main(cluster_N,col=jet.colors(255),margin=c(1,1,0.5,0.5),cexRow = 1.5, cexCol = 2,nonlab=T,zlim=c(0,1)) - MultiSide.bars(data=list(lower.bound=lower.bound, cluster=factor(cutree(cluster_T$hcc,k=5)), Batch=factor(sub.Info$TCGA.BATCH),Purity=sub.Info$ABSOLUTE.purity), - side="colside",order=cluster_T$colInd, margins=c(0.2,1,0.2,1), - col=list(cluster=c("grey","white","black","pink","purple"), - Batch=1:length(unique(sub.Info$TCGA.BATCH)), - Purity=GreyCol(255), - lower.bound = jet.colors(255)), - zlim=list(lower.bound=c(0,1),Purity=c(0,1))) - side.bars(as.matrix(sub.buffy),side="rowside",order=cluster_T$rowInd,margins=c(1,0.5,0.5,0.2),col=jet.colors(255),zlim=c(0,1),cex=1) - side.bars2(as.matrix(sub.CGI,ncol=1),side="rowside",order=cluster_T$rowInd,margins=c(1,0.5,0.5,0.2),cex=1) - side.bars(sub.segmentation,side="rowside",order=cluster_T$rowInd,margins=c(1,0.5,0.5,0.2),col=c("lightcyan","chocolate","yellow","purple","darkgreen","green","darkorange"),cex=1.5) - AddLengend (Leve,cols=c("lightcyan","chocolate","yellow","purple","darkgreen","green","darkorange"),margins=c(3,1,0.5,1),cexlab=2,lab.las=1) - side.bars(as.matrix(sub.REMC.WGBS),side="rowside",order=cluster_T$rowInd,margins=c(1,0.5,0.5,0.2),col=jet.colors(255),zlim=c(0,1),cex=1.5) - if(paste(CT,collapse="_") %in% "2") { - side.bars(matrix(sub.pam,ncol=1),side="colside",order=cluster_T$colInd,margins=c(0.2,1,0.2,1),col=c(1:length(unique(sub.Info$pam50))),cex=1) - AddLengend (c("missing",levels(sub.Info$pam50)),cols=c(1:length(unique(sub.pam))),margins=c(3,1,0.5,1),cexlab=2,lab.las=1) - }else if( length(CT) > 1){ - side.bars(matrix(sub.pam,ncol=1),side="colside",order=cluster_T$colInd,margins=c(0.2,1,0.2,1),col=c(1:length(unique(sub.pam))),cex=1) - AddLengend (c(paste0(CT,unique(sub.pam))),cols=c(1:length(unique(sub.pam))),margins=c(3,1,0.5,1),cexlab=2,lab.las=1) - } - side.bars2(as.matrix(sub.TF.matrix),side="rowside",order=cluster_T$rowInd,margins=c(1,0.5,0.5,0.2),cex=1.5) - side.bars(as.matrix(Mean.TF.sample),side="colside",order=cluster_T$colInd,margins=c(0.2,1,0.2,1),col=jet.colors(255),zlim=c(0,1),cex=1.5) - side.bars(as.matrix(sub.ImpInfo),side="colside",order=cluster_T$colInd,margins=c(0.2,1,0.2,1),col=c("lightcyan","white","black"),cex=1.5) - dev.off() - rm(cluster_T,cluster_N,sub.segmentation,sub.TF.matrix,Mean.TF.sample,sub.ImpInfo,sub.CGI,sub.REMC.WGBS,TF.cluster,sub.buffy,tmp,sub.probes) - gc() -} - - - - diff --git a/R/StatisticTest.R b/R/StatisticTest.R index 35ba6909..3943c078 100644 --- a/R/StatisticTest.R +++ b/R/StatisticTest.R @@ -1,178 +1,219 @@ -# Stat for computing hypo- and hypermethylated probes -##ID: the rownames of Exps -##TN: the two group factors -##Exps: the data matrix. Rows is the ID and cols are sample which divided into two groups. -##cutoff: filter out the small difference. -#' Stat for computing hypo- and hypermethylated probes -#' @param Probe A character which is name of probes on the array. -#' @param TN A vector of characters either 'Tumor' or 'Normal' which represent samples from Tumor or Normal. -#' @param test A test method used for the statistic. -#' @param Tumor.per A number determines the percentage of Tumor samples will be used in the test. -#' @param Normal.per A number determines the percentage of Normal samples will be used in the test. -#' @param hyper A character either NULL,FALSE, TRUE. NULL indicate two way test to identify hypo/hypermethylated probes. FALSE indicate to identify hypomethylated probes. TRUE indicates to identify hypermethylated probes. -#' @return test results. -.Stat <- function(Probe,TN,Meth,test=t.test,Tumor.per=NULL,Normal.per=NULL,hyper=NULL){ - out <- c() - Meth <- Meth[Probe,] - if(!is.na(Tumor.per)){ - if(Top.m){ - tumor.tmp <- sort(Meth[TN %in% "Tumor"],decreasing=T) - normal.tmp <- sort(Meth[TN %in% "Normal"],decreasing=T) - }else{ - tumor.tmp <- sort(Meth[TN %in% "Tumor"]) - normal.tmp <- sort(Meth[TN %in% "Normal"]) - } - if(round(length(normal.tmp)*Normal.per)< 5){ - if(length(normal.tmp) < 5) { - Normal.number <- length(normal.tmp) - }else{ - Normal.number <- 5 - message(sprintf("%s percentage of normal sample is less than 5. Set number of normal samples as 5",Normal.per)) - } - }else{ - Normal.number <- round(length(normal.tmp)*Normal.per) - } - tumor.tmp <- tumor.tmp[1:round(length(tumor.tmp)*Tumor.per)] - normal.tmp <- normal.tmp[1:Normal.number] - Meth <- c(normal.tmp,tumor.tmp) - TN <- c(rep("Normal",length(normal.tmp)),rep("Tumor",length(tumor.tmp))) +## get differential methylated probes------------------------- +#' Stat.diff.meth +#' @param meth A matrix contain DNA methylation data. +#' @param groups A vector of category of samples. +#' @param group1 Group 1 label in groups vector +#' @param group2 Group 2 label in groups vector +#' @param test A function specify which statistic test will be used. +#' @param percentage A number specify the percentage of normal and tumor +#' samples used in the test. +#' @param Top.m A logic. If to identify hypomethylated probe Top.m should be FALSE. +#' hypermethylated probe is TRUE. +#' @param min.samples Minimun number of samples to use in the analysis. Default 5. +#' If you have 10 samples in one group, percentage is 0.2 this will give 2 samples +#' in the lower quintile, but then 5 will be used. +#' @importFrom stats sd t.test wilcox.test +#' @return Statistic test results to identify differentially methylated probes. +Stat.diff.meth <- function( + meth, + groups, + group1, + group2, + test = t.test, + min.samples = 5, + percentage = 0.2, + Top.m = NULL +){ + + if(percentage < 1){ + g1 <- meth[groups %in% group1] + g2 <- meth[groups %in% group2] + group1.nb <- ifelse(round(length(g1) * percentage) < min.samples, min(min.samples,length(g1)), round(length(g1) * percentage)) + group2.nb <- ifelse(round(length(g2) * percentage) < min.samples, min(min.samples,length(g2)), round(length(g2) * percentage)) + + group1.tmp <- sort(g1, decreasing = Top.m) + group2.tmp <- sort(g2, decreasing = Top.m) + + group1.tmp <- group1.tmp[1:group1.nb] + group2.tmp <- group2.tmp[1:group2.nb] + } else { + group1.tmp <- meth[groups %in% group1] + group2.tmp <- meth[groups %in% group2] } - ##this is to remove the situation that the normal or tumor are all NA (only one is value) - Meth_split <- split(Meth,TN) - Meth_split <- unlist(lapply(Meth_split,function(x){!is.na(sd(x,na.rm=T))})) - if(sd(Meth,na.rm=T)>0 & all(Meth_split)){ - if(!is.null(hyper)){ - alternative <- ifelse(hyper,"less","greater") - }else{ + + if(sd(meth,na.rm=TRUE) > 0 & !all(is.na(group1.tmp)) & !all(is.na(group2.tmp))){ + if(!is.na(Top.m)){ + alternative <- ifelse(Top.m,"greater","less") + } else { alternative <- "two.sided" } - df <- data.frame(Meth=Meth,TN=factor(TN)) - TT <- test(Meth~TN,df,alternative=alternative) - MeanDiff <- TT$estimate[2]-TT$estimate[1] - PP <- TT$p.value - out <- rbind(out,c(Probe,PP,MeanDiff)) - }else{ - out <- rbind(out,c(Probe,NA,NA)) + # If hyper (top. TRUE alternative greater) group 1 > group 2 + # If hypo (top. FALSE alternative greater) group 1 < group 2 + out <- tryCatch({ + TT <- test(x = group1.tmp, y = group2.tmp, alternative = alternative, conf.int = TRUE) + MeanDiff <- ifelse(length(TT$estimate) == 2, TT$estimate[1]-TT$estimate[2],TT$estimate) + PP <- TT$p.value + data.frame(PP=PP,MeanDiff=MeanDiff, stringsAsFactors = FALSE) + }, error = function(e) { + data.frame(PP=NA,MeanDiff=NA,stringsAsFactors = FALSE) + }) + } else{ + out <- data.frame(PP=NA,MeanDiff=NA,stringsAsFactors = FALSE) } return(out) } - - - - -#---different statistic test---------------------------- -##Probes must be one probes -##Gene can be one gene or multiple gene. -#' U test (non parameter test) for permutation. This is one probe vs multiple gene which is good for computing permutation for each probe. +#'Stat.nonpara.permu #' @param Probe A character of name of Probe in array. #' @param Gene A vector of gene ID. -#' @param K A number determines the methylated groups and unmethylated groups. #' @param Top A number determines the percentage of top methylated/unmethylated samples. +#' Only used if unmethy and methy are not set. +#' @param correlation Type of correlation to evaluate (negative or positive). +#' Negative (default) checks if hypomethylated region has a upregulated target gene. +#' Positive checks if region hypermethylated has a upregulated target gene. #' @param Meths A matrix contains methylation for each probe (row) and each sample (column). #' @param Exps A matrix contains Expression for each gene (row) and each sample (column). +#' @param methy Index of M (methylated) group. +#' @param unmethy Index of U (unmethylated) group. #' @return U test results - -.Stat.nonpara.permu <- function(Probe,Gene,K,Top=NULL,Meths=Meths,Exps=Exps){ - if(! length(Probe)==1) {stop("Number of Probe should be 1")} - Exp <- as.matrix(Exps[Gene,]) - Meth <- Meths[Probe,] - Meth_B <- Binary(Meth,Break=K) - test.p <- c() - if(sum(Exp)==0){ - test.p <- NA - }else{ - unmethyPercent <- sum(Meth_B==0,na.rm=T)/length(Meth_B) - methyPercent <- sum(Meth_B==1,na.rm=T)/length(Meth_B) - if(unmethyPercent < 0.05 | methyPercent < 0.05){ - test.p <- NA - }else{ - unmethy <- order(Meth)[1:round(length(Meth)*Top)] - methy <- order(Meth,decreasing=T)[1:round(length(Meth)*Top)] - Fa <- factor(rep(NA,length(Meth)),levels=c(-1,1)) - Fa[unmethy] <- -1 - Fa[methy] <- 1 - if(length(Gene)<2){ - test.p <- wilcox.test(Exp~Fa,alternative = "greater",exact=F)$p.value - }else{ - cl <- makeCluster(13,"SOCK") - # test.p <- unlist(mclapply(1:nrow(Exp),function(x,Factor) { exp = Exp[x,] - # wilcox.test(exp~Factor,alternative = "greater",exact=F)$p.value},Factor=Fa,mc.cores=10)) - test.p <- unlist(parSapplyLB(cl,1:nrow(Exp),function(x,Factor) { exp = Exp[x,] - wilcox.test(exp~Factor,alternative = "greater",exact=F)$p.value},Factor=Fa,simplify=F)) - stopCluster(cl) - - } - } - if(length(Gene) < 2){ - out <- c(Probe,Gene,test.p) - }else{ - out <- cbind(Probe=rep(Probe,length(Gene)),Gene,test.p) - } - - } - return(out) +Stat.nonpara.permu <- function( + Probe, + Gene, + Top = 0.2, + correlation = "negative", + unmethy = NULL, + methy = NULL, + Meths = Meths, + Exps = Exps +){ + + + if(is.null(methy) & is.null(unmethy)){ + idx <- order(Meths) + nb <- round(length(Meths) * Top) + unmethy <- head(idx, n = nb) + methy <- tail(idx, n = nb) + } + test.p <- unlist( + lapply( + splitmatrix(Exps), + function(x) { + tryCatch({ + wilcox.test( + x[unmethy], + x[methy], + alternative = ifelse(correlation == "negative","greater","less"), + exact = FALSE + )$p.value + }, error = function(e){ + NA + }) + } + + )) + + test.p <- data.frame( + GeneID = Gene, + Raw.p = test.p[match(Gene, names(test.p))], + stringsAsFactors = FALSE + ) + + return(test.p) } -#' U test (non parameter test) for permutation. This is one probe vs nearby gene which is good for computing each probes for nearby genes. +#' U test (non parameter test) for permutation. This is one probe vs nearby gene +#' which is good for computing each probes for nearby genes. #' @param Probe A character of name of Probe in array. #' @param NearGenes A list of nearby gene for each probe which is output of GetNearGenes function. -#' @param K A number determines the methylated groups and unmethylated groups. -#' @param Top A number determines the percentage of top methylated/unmethylated samples. +#' @param Top A number determines the percentage of top methylated/unmethylated samples. +#' Only used if unmethy and methy are not set. +#' @param correlation Type of correlation to evaluate (negative or positive). +#' Negative (default) checks if hypomethylated region has a upregulated target gene. +#' Positive checks if region hypermethylated has a upregulated target gene. #' @param Meths A matrix contains methylation for each probe (row) and each sample (column). #' @param Exps A matrix contains Expression for each gene (row) and each sample (column). +#' @param methy Index of M (methylated) group. +#' @param unmethy Index of U (unmethylated) group. +#' @importFrom stats wilcox.test #' @return U test results -.Stat.nonpara <- function(Probe,NearGenes,K,Top=NULL,Meths=Meths,Exps=Exps){ - source("/export/uec-gs1/laird/users/lijingya/software/scripts/R/Heatmap.Func.R") - if(! length(Probe)==1) {stop("Number of Probe should be 1")} - Gene <- NearGenes[[Probe]][,2] - Exp <- as.matrix(Exps[Gene,]) - Meth <- Meths[Probe,] - Meth_B <- Binary(Meth,Break=K) - test.p <- c() - if(sum(Exp)==0){ - test.p <- NA - }else{ - unmethyPercent <- sum(Meth_B==0,na.rm=T)/length(Meth_B) - methyPercent <- sum(Meth_B==1,na.rm=T)/length(Meth_B) - if(unmethyPercent < 0.05 | methyPercent < 0.05){ - test.p <- NA - }else{ - unmethy <- order(Meth)[1:round(length(Meth)*Top)] - methy <- order(Meth,decreasing=T)[1:round(length(Meth)*Top)] - Fa <- factor(rep(NA,length(Meth)),levels=c(-1,1)) - Fa[unmethy] <- -1 - Fa[methy] <- 1 - test.p <- apply(Exp,1,function(x,Factor) {wilcox.test(x~Factor,alternative = "greater",exact=F)$p.value},Factor=Fa) - } - out <- cbind(Probe=rep(Probe,length(Gene)),Gene,test.p) +Stat.nonpara <- function(Probe, + NearGenes, + Top = NULL, + correlation = "negative", + unmethy = NULL, + methy = NULL, + Meths = Meths, + Exps = Exps){ + if(!length(Probe)==1) stop("Number of Probe should be 1") + + NearGenes.set <- NearGenes[NearGenes$ID == Probe,] + Gene <- NearGenes.set[,2] + Exp <- Exps[Gene,,drop = FALSE] + Meth <- Meths + if(is.null(methy) & is.null(unmethy)){ + idx <- order(Meth) + nb <- round(length(Meth) * Top) + unmethy <- head(idx, n = nb) + methy <- tail(idx, n = nb) } + # Here we will test if the Expression of the unmethylated group is higher than the exptression of the methylated group + test.p <- unlist(lapply(splitmatrix(Exp), + function(x) { + tryCatch({ + wilcox.test(x[unmethy], + x[methy], + alternative = ifelse(correlation == "negative","greater","less"), + exact = FALSE)$p.value}, + error = function(x){ + NA + }) + })) + + if(length(Gene)==1){ + Raw.p <- test.p + } else { + Raw.p <- test.p[match(Gene, names(test.p))] + } + + # In case Symbol is not in the input file + if(!"Symbol" %in% colnames(NearGenes.set)) NearGenes.set$Symbol <- NA + + out <- data.frame(Probe = rep(Probe,length(Gene)), + GeneID = Gene, + Symbol = NearGenes.set$Symbol, + Distance = NearGenes.set$Distance, + Sides = NearGenes.set$Side, + Raw.p = Raw.p, + stringsAsFactors = FALSE) + return(out) } + #' Calculate empirical Pvalue -#' @param zscore.Matrix A data.frame of raw pvalue from U test. Output from .Stat.nonpara +#' @param U.matrix A data.frame of raw pvalue from U test. Output from .Stat.nonpara #' @param permu data frame of permutation. Output from .Stat.nonpara.permu #' @return A data frame with empirical Pvalue. -Get.Pvalue.p <- function(zscore.Matrix,permu){ - .Pvalue <- function(x,permu,target.Matrix){ - zscore <- target.Matrix[x,4] - Gene <- target.Matrix[x,"Gene"] - if(is.na(zscore)){ +Get.Pvalue.p <- function(U.matrix,permu){ + .Pvalue <- function(x,permu){ + Raw.p <- as.numeric(x["Raw.p"]) + Gene <- as.character(x["GeneID"]) + if(is.na(Raw.p)){ out <- NA - #print("NA") - }else{ - out <- (sum(permu[Gene,] > zscore | permu[Gene,] == zscore,na.rm=T)+1)/(sum(!is.na(permu[Gene,])) + 1) - } - # else if(zscore <= 0){ - # out <- sum(permu[Gene,] < zscore | permu[Gene,] == zscore, na.rm=T)/length(permu[Gene,]) + } else { + # num( Pp <= Pr) + 1 + # Pe = --------------------- + # x + 1 + # Pp = pvalue probe (Raw.p) + # Pr = pvalue random probe (permu matrix) + # We have to consider that floating Point Numbers are Inaccurate + out <- (sum(permu[as.character(Gene),] - Raw.p < 10^-100, na.rm=TRUE) + 1) / (sum(!is.na(permu[Gene,])) + 1) + } return(out) } - cl <- makeCluster(5,"SOCK") - Pvalue <- parSapplyLB(cl,1:nrow(zscore.Matrix),.Pvalue,target.Matrix=zscore.Matrix,permu=permu,simplify = F) - stopCluster(cl) - Pvalue <- unlist(Pvalue) - Output <- cbind(zscore.Matrix,Pvalue) - return(Output) + message("Calculate empirical P value.\n") + Pvalue <- unlist(apply(U.matrix,1,.Pvalue,permu=permu)) + U.matrix$Pe <- Pvalue + return(U.matrix) } diff --git a/R/TCGA_pipe.R b/R/TCGA_pipe.R new file mode 100644 index 00000000..8115f239 --- /dev/null +++ b/R/TCGA_pipe.R @@ -0,0 +1,602 @@ +#' ELMER analysis pipeline for TCGA data. +#' @description +#' ELMER analysis pipeline for TCGA data. This pipeline combine every steps of \pkg{ELMER} +#' analyses: get.feature.probe, get.diff.meth, get.pair, get.permu, get.enriched.motif and get.TFs. +#' Every steps' results are saved. +#' @param disease TCGA short form disease name such as COAD +#' @param genome Data aligned against which genome of reference. Options: "hg19", "hg38" (default) +#' @param analysis A vector of characters listing the analysis need to be done. +#' Analysis can be "download","distal.probes","diffMeth","pair","motif","TF.search". +#' Default is "all" meaning all the analysis will be processed. +#' @param group.col A column defining the groups of the sample. You can view the +#' available columns using: colnames(MultiAssayExperiment::colData(data)). +#' @param group1 A group from group.col. ELMER will run group1 vs group2. +#' That means, if direction is hyper, get probes +#' hypermethylated in group 1 compared to group 2. +#' @param group2 A group from group.col. ELMER will run group1 vs group2. +#' That means, if direction is hyper, get probes +#' hypermethylated in group 1 compared to group 2. +#' @param wd A path shows working dirctory. Default is "./" +#' @param genes List of genes for which mutations will be verified. +#' A column in the MAE with the name of the gene +#' will be created with two groups WT (tumor samples without mutation), MUT (tumor samples w/ mutation), +#' NA (not tumor samples) +#' @param mode This option will automatically set the percentage of samples to be used in the analysis. +#' Options: "supervised" (use 100\% of samples) or "unsupervised" (use 20\% of samples). +#' @param cores A interger which defines number of core to be used in parallel process. +#' Default is 1: don't use parallel process. +#' @param diff.dir A character can be "hypo" or "hyper", showing dirction DNA methylation changes. +#' If it is "hypo", get.diff.meth function will identify all significantly hypomethylated +#' CpG sites; If "hyper", get.diff.meth function will identify all significantly hypermethylated +#' CpG sites +#' @param mutant_variant_classification List of TCGA variant classification from MAF files to consider a samples +#' mutant. Only used when argument gene is set. +#' @param Data A path shows the folder containing DNA methylation, expression and clinic data +#' @param ... A list of parameters for functions: GetNearGenes, get.feature.probe, +#' get.diff.meth, get.pair +#' @return Different analysis results. +#' @export +#' @importFrom SummarizedExperiment colData<- +#' @importFrom stringr str_split +#' @examples +#' data <- ELMER:::getdata("elmer.data.example") +#' TCGA.pipe( +#' disease = "LUSC", +#' data = data, +#' analysis = c("diffMeth","pair", "motif","TF.search"), +#' mode = "supervised", +#' group.col = "definition", +#' group1 = "Primary solid Tumor", +#' group2 = "Solid Tissue Normal", +#' diff.dir = c("hypo"), +#' dir.out = "pipe", +#' sig.dif = 0.0001, +#' pvalue = 1.0, +#' min.incidence = 0, +#' lower.OR = 0.0 +#' ) +#' \dontrun{ +#' distal.probe <- TCGA.pipe(disease = "LUSC", analysis="distal.enhancer", wd="~/") +#' TCGA.pipe(disease = "LUSC",analysis = "all", genome = "hg19", cores = 1, permu.size=300, Pe=0.01) +#' projects <- TCGAbiolinks:::getGDCprojects()$project_id +#' projects <- gsub("TCGA-","",projects[grepl('^TCGA',projects,perl=TRUE)]) +#' for(proj in projects) TCGA.pipe(disease = proj,analysis = "download") +#' plyr::alply(sort(projects),1,function(proj) { +#' tryCatch({ +#' print(proj); +#' TCGA.pipe(disease = proj,analysis = c("createMAE"))}) +#' }, .progress = "text") +#' plyr::alply(sort(projects),1,function(proj) { +#' tryCatch({ +#' print(proj); +#' TCGA.pipe(disease = proj, +#' analysis = c("diffMeth","pair", "motif","TF.search"))}) +#' }, .progress = "text") +#' +#' # Evaluation mutation +#' TCGA.pipe(disease = "LUSC",analysis = "createMAE",gene = "NFE2L2") +#' TCGA.pipe( +#' disease = "LUSC",analysis = c("diffMeth","pair", "motif","TF.search"), +#' mode = "supervised", +#' group.col = "NFE2L2", group1 = "Mutant", group2 = "WT", +#' diff.dir = c("hypo"), +#' dir.out = "LUSC_NFE2L2_MutvsWT" +#' ) +#' } +TCGA.pipe <- function( + disease, + genome = "hg38", + analysis = "all", + wd = getwd(), + cores = 1, + mode = "unsupervised", + Data = NULL, + diff.dir = "hypo", + genes = NULL, + mutant_variant_classification = c( + "Frame_Shift_Del", + "Frame_Shift_Ins", + "Missense_Mutation", + "Nonsense_Mutation", + "Splice_Site", + "In_Frame_Del", + "In_Frame_Ins", + "Translation_Start_Site", + "Nonstop_Mutation" + ), + group.col = "TN", + group1 = "Tumor", + group2 = "Normal", + ... +){ + + if (missing(disease)) + stop("Disease should be specified.\nDisease short name (such as LAML) + please check https://gdc-portal.nci.nih.gov") + + available.analysis <- c( + "download","distal.probes", + "createMAE","diffMeth","pair", + "motif","TF.search","report","all" + ) + # Replace all by all the other values + if("all" %in% analysis[1] ) analysis <- grep("all", available.analysis, value = TRUE, invert = TRUE) + + if(any(!tolower(analysis) %in% tolower(analysis))) + stop( + paste0( + "Availbale options for analysis argument are: ", + paste(c("",available.analysis), collapse = "\n=> ") + ) + ) + + disease <- toupper(disease) + + # Download + if("download" %in% tolower(analysis)){ + print.header("Download data") + if(is.null(Data)) Data <- sprintf("%s/Data/%s",wd,disease) + params <- c() + params$disease <- disease + params$basedir <- sprintf("%s/Data",wd) + params$genome <- genome + do.call(getTCGA,params) + analysis <- analysis[!analysis %in% "download"] + } + + #--------------------------------------------------------------- + dir.out.root <- sprintf("%s/Result/%s",wd,disease) + if(!file.exists(dir.out.root)) dir.create(dir.out.root, recursive = TRUE, showWarnings = FALSE) + args <- list(...) + + if(any(sapply(analysis, function(x) tolower(x) %in% tolower(c("diffMeth","pair","motif","TF.search","report"))))) { + if(!mode %in% c("supervised","unsupervised")){ + stop("Set mode arugment to supervised or unsupervised") + } + if(mode %in% c("supervised")) { + minSubgroupFrac <- 1 + message("=> ", mode, " was selected: using all samples") + } else { + minSubgroupFrac <- 0.2 + message("=> ", mode, " was selected: using ", minSubgroupFrac, " samples") + } + dir.out <- sprintf("%s/Result/%s/%s_%s_vs_%s/%s",wd,disease,group.col,group1,group2,diff.dir) + message("=> Analysis results wil be save in: ", dir.out) + if(!file.exists(dir.out)) dir.create(dir.out, recursive = TRUE, showWarnings = FALSE) + } + #----------------------------------------------------- + ## select distal enhancer probes + if(tolower("distal.probes") %in% tolower(analysis)){ + print.header("Select distal probes") + params <- args[names(args) %in% c("TSS", "TSS.range","rm.chr")] + params <- c(params,list("genome" = genome, "feature"= NULL)) + probeInfo <- do.call(get.feature.probe,params) + save(probeInfo,file = sprintf("%s/probeInfo_distal_%s.rda",dir.out.root,genome)) + if(length(analysis) == 1){ + return(probeInfo) + } else { + analysis <- analysis[!analysis %in% "distal.probes"] + invisible(gc()) + } + } + + if(tolower("createMAE") %in% tolower(analysis)){ + print.header("Creating Multi Assay Experiment") + file <- sprintf("%s/%s_mae_%s.rda",dir.out.root,disease,genome) + if(!file.exists(file)) { + sample.type <- c(group1,group2) + + if(is.null(Data)) Data <- sprintf("%s/Data/%s",wd,disease) + meth.file <- sprintf("%s/%s_meth_%s.rda",Data,disease,genome) + if(is.null(Data)) Data <- sprintf("%s/Data/%s",wd,disease) + exp.file <- sprintf("%s/%s_RNA_%s.rda",Data,disease,genome) + + ## get distal probe info + distal.probe <- sprintf("%s/probeInfo_distal_%s.rda",dir.out.root,genome) + if(!file.exists(distal.probe)){ + params <- args[names(args) %in% c("TSS","TSS.range","rm.chr")] + params <- c(params,list("genome" = genome, "feature"= NULL)) + distal.probe <- suppressWarnings(do.call(get.feature.probe,params)) + } + + mae <- createMAE( + met = meth.file, + exp = exp.file, + filter.probes = distal.probe, + genome = genome, + met.platform = "450K", + save = FALSE, + linearize.exp = TRUE, + TCGA = TRUE + ) + # Remove FFPE samples + if("is_ffpe" %in% colnames(colData(mae))) mae <- mae[,!mae$is_ffpe] + + # if user set genes argument label Mutant WT will be added to mae + if(!is.null(genes)) mae <- addMutCol(mae, disease, genes, mutant_variant_classification) + } else { + message("File already exists: ", file) + mae <- get(load(file)) + if(!is.null(genes)) mae <- addMutCol(mae, disease, genes, mutant_variant_classification) + } + save(mae,file = file) + message("File saved as: ", file) + readr::write_tsv( + x = as.data.frame(colData(mae)), + path = sprintf("%s/%s_samples_info_%s.tsv",dir.out.root,disease,genome) + ) + } + + # Creates a record of the analysis and arguments called + if(any(tolower(c("diffMeth","pair", "motif","TF.search")) %in% tolower(analysis))){ + createSummaryDocument( + analysis = analysis, + argument.values = args, + mae.path = sprintf("%s/%s_mae_%s.rda",dir.out.root,disease,genome), + genome = genome, + mode = mode, + direction = diff.dir, + group.col = group.col, + group1 = group1, + group2 = group2, + results.path = dir.out + ) + } + # get differential DNA methylation + if(tolower("diffMeth") %in% tolower(analysis)){ + print.header("Get differential DNA methylation loci") + if(!"data" %in% names(args)){ + mae.file <- sprintf("%s/%s_mae_%s.rda",dir.out.root,disease,genome) + if(!file.exists(mae.file)){ + message("MAE not found, please run pipe with createMAE or all options") + return(NULL) + } + load(mae.file) + } else { + mae <- args$data + } + params <- args[names(args) %in% c("pvalue","sig.dif")] + params <- c(params,list(diff.dir = diff.dir, + dir.out = dir.out, + cores = cores, + minSubgroupFrac = minSubgroupFrac)) + diff.meth <- tryCatch({ + diff.meth <- do.call( + get.diff.meth, + c(params, + list(data = mae, + group.col = group.col, + group1 = group1, + group2 = group2) + ) + ) + diff.meth + }, error = function(e) { + message(e) + return(NULL) + }) + if(is.null(diff.meth)) return(NULL) + + if(length(analysis) == 1) return(diff.meth) + } + + # predict pair + if("pair" %in% tolower(analysis)){ + print.header("Predict pairs") + if(!"data" %in% names(args)){ + mae.file <- sprintf("%s/%s_mae_%s.rda",dir.out.root,disease,genome) + if(!file.exists(mae.file)){ + message("MAE not found, please run pipe with createMAE or all options") + return(NULL) + } + load(mae.file) + } else { + mae <- args$data + } + + Sig.probes <- read_csv(sprintf("%s/getMethdiff.%s.probes.significant.csv", + dir.out,diff.dir))[,1,drop = T] + if(length(Sig.probes) == 0) { + message("No significant probes were found") + return(NULL) + } + + message("Get nearby genes") + file <- sprintf("%s/getPair.%s.pairs.significant.csv", dir.out, diff.dir) + + nearGenes.file <- args[names(args) %in% "nearGenes"] + if(length(nearGenes.file) == 0){ + nearGenes.file <- sprintf("%s/%s.probes_nearGenes.rda",dir.out,diff.dir) + params <- args[names(args) %in% c("numFlankingGenes")] + nearGenes <- do.call(GetNearGenes, + c(list(data = mae, + probes = Sig.probes), + params)) + save(nearGenes,file=nearGenes.file) + message("File saved: ", nearGenes.file) + } else { + nearGenes.file <- nearGenes.file[["nearGenes"]] + } + + # calculation + message(sprintf("Identify putative probe-gene pair for %smethylated probes",diff.dir)) + + # get pair + permu.dir <- paste0(dir.out,"/permu") + params <- args[names(args) %in% c("percentage","permu.size","Pe","raw.pvalue","diffExp","group.col")] + SigPair <- do.call(get.pair, + c(list(data = mae, + nearGenes = nearGenes.file, + permu.dir = permu.dir, + group.col = group.col, + group1 = group1, + mode = mode, + diff.dir = diff.dir, + minSubgroupFrac = min(1,minSubgroupFrac * 2), + group2 = group2, + dir.out = dir.out, + cores = cores, + label = diff.dir), + params)) + + # message("==== Promoter analysis ====") + # message("calculate associations of gene expression with DNA methylation at promoter regions") + # message("Fetching promoter regions") + # file <- sprintf("%s/%s_mae_promoter_%s.rda",dir.out.root,disease, genome) + # + # if(!file.exists(file)) { + # ## promoter methylation correlation. + # # get promoter + # suppressWarnings({ + # promoter.probe <- get.feature.probe(promoter=TRUE, genome = genome, + # TSS.range=list(upstream = 200, downstream = 2000)) + # }) + # + # if(is.null(Data)) Data <- sprintf("%s/Data/%s",wd,disease) + # meth.file <- sprintf("%s/%s_meth_%s.rda",Data,disease, genome) + # if(is.null(Data)) Data <- sprintf("%s/Data/%s",wd,disease) + # exp.file <- sprintf("%s/%s_RNA_%s.rda",Data,disease, genome) + # + # mae.promoter <- createMAE(met = meth.file, + # exp = exp.file, + # filter.probes = promoter.probe, + # genome = genome, + # met.platform = "450K", + # linearize.exp = TRUE, + # save = FALSE, + # TCGA = TRUE) + # if(!all(sample.type %in% colData(mae)[,group.col])){ + # message("There are no samples for both groups") + # return(NULL) + # } + # mae.promoter <- mae.promoter[,colData(mae.promoter)[,group.col] %in% sample.type] + # save(mae.promoter,file = file) + # } else { + # mae.promoter <- get(load(file)) + # } + # params <- args[names(args) %in% "percentage"] + # Promoter.meth <- do.call(promoterMeth, c(list(data=mae.promoter, sig.pvalue=0.01, save=FALSE), + # params)) + # write.csv(Promoter.meth, + # file = sprintf("%s/promoter.%s.analysis.csv", dir.out, diff.dir), + # row.names=FALSE) + # add <- SigPair[match(SigPair$GeneID, Promoter.meth$GeneID),"Raw.p"] + # SigPair <- cbind(SigPair, GSbPM.pvalue = add) + if(is.null(SigPair)) { + message("No significant pair probe genes found") + return(NULL) + } + write.csv(SigPair, + file = sprintf("%s/getPair.%s.pairs.significant.csv", dir.out, diff.dir), + row.names=FALSE) + + if(length(analysis) == 1) return(SigPair) + } + + # search enriched motif + if("motif" %in% tolower(analysis)){ + print.header("Motif search") + + if(!"data" %in% names(args)){ + mae.file <- sprintf("%s/%s_mae_%s.rda",dir.out.root,disease,genome) + if(!file.exists(mae.file)){ + message("MAE not found, please run pipe with createMAE or all options") + return(NULL) + } + load(mae.file) + } else { + mae <- args$data + } + + message(sprintf("Identify enriched motif for %smethylated probes",diff.dir)) + if(file.exists(sprintf("%s/getPair.%s.pairs.significant.csv",dir.out, diff.dir))){ + Sig.probes <- readr::read_csv(sprintf("%s/getPair.%s.pairs.significant.csv", dir.out, diff.dir), + col_names = TRUE, + col_types = c("cccicdd")) + Sig.probes <- unique(Sig.probes$Probe) + if(length(unique(Sig.probes)) < 10) { + message ("No significants pairs were found in the previous step") + return(NULL) + } + + } else { + message(sprintf("%s/%s.pairs.significant.csv file doesn't exist",dir.out, diff.dir)) + return(NULL) + } + params <- args[names(args) %in% c("background.probes","lower.OR","min.incidence","pvalue")] + + newenv <- new.env() + if(genome == "hg19") data("Probes.motif.hg19.450K", package = "ELMER.data", envir = newenv) + if(genome == "hg38") data("Probes.motif.hg38.450K", package = "ELMER.data", envir = newenv) + probes.motif <- get(ls(newenv)[1],envir=newenv) + + enriched.motif <- do.call(get.enriched.motif, + c(list(data = mae, + probes.motif = probes.motif, + probes = Sig.probes, + dir.out = dir.out, + label = diff.dir, + plot.title = paste0("OR for paired probes ", + diff.dir, " methylated in ", + group1, " vs ",group2, "(group: ",group.col,")")), + params)) + + if(length(analysis) == 1) return(enriched.motif) + } + + #search responsible TFs + if(tolower("TF.search") %in% tolower(analysis)){ + print.header("Search responsible TFs") + ## load mae + if(!"data" %in% names(args)){ + mae.file <- sprintf("%s/%s_mae_%s.rda",dir.out.root,disease,genome) + if(!file.exists(mae.file)){ + message("MAE not found, please run pipe with createMAE or all options") + return(NULL) + } + load(mae.file) + } else { + mae <- args$data + } + #construct RNA seq data + print.header(sprintf("Identify regulatory TF for enriched motif in %smethylated probes", + diff.dir), "subsection") + enriched.motif <- args[names(args) %in% "enriched.motif"] + if(length(enriched.motif) == 0){ + enriched.motif <- sprintf("%s/getMotif.%s.enriched.motifs.rda", dir.out, diff.dir) + } + + params <- args[names(args) %in% c("TFs", "motif.relavent.TFs","percentage")] + TFs <- do.call(get.TFs, + c(list( + data = mae, + group.col = group.col, + group1 = group1, + group2 = group2, + mode = mode, + diff.dir = diff.dir, + minSubgroupFrac = min(1,minSubgroupFrac * 2), + enriched.motif = enriched.motif, + dir.out = dir.out, + cores = cores, + label = diff.dir), + params)) + if(length(analysis) == 1) return(TFs) + } + + if(tolower("report") %in% tolower(analysis)){ + summary.file <- file.path(dir.out,"TCGA.pipe_records.txt") + lines <- readLines(summary.file) + genome <- trimws(unlist(stringr::str_split(lines[grep("genome",lines)[1]], ":"))[2]) + group1 <- trimws(unlist(stringr::str_split(lines[grep("group1",lines)[1]], ":"))[2]) + group2 <- trimws(unlist(stringr::str_split(lines[grep("group2",lines)[1]], ":"))[2]) + group.col <- trimws(unlist(stringr::str_split(lines[grep("group.col",lines)[1]], ":"))[2]) + results.path <- trimws(unlist(stringr::str_split(lines[grep("results.path",lines)[1]], ":"))[2]) + direction <- trimws(unlist(stringr::str_split(lines[grep("direction",lines)[1]], ":"))[2]) + mae.path <- trimws(unlist(stringr::str_split(lines[grep("mae.path",lines)[1]], ":"))[2]) + render_report( + genome = genome, + mode = mode, + title = paste0(disease, " report"), + minSubgroupFrac = ifelse(mode =="supervised",1,0.2), + mae = mae.path, + direction = direction, + group.col = group.col, + group1 = group1, + group2 = group2, + dir.out = results.path, + out_file = file.path(results.path,paste0(disease,"_report.html"))) + } + +} + +#' Adds mutation information to MAE +#' @param data MAE object +#' @param disease TCGA disease (LUSC, GBM, etc) +#' @param genes list of genes to add information +#' @param mutant_variant_classification List of mutant_variant_classification that will be +#' consider a sample mutant or not. +#' @examples +#' \dontrun{ +#' data <- ELMER:::getdata("elmer.data.example") # Get data from ELMER.data +#' data <- ELMER:::addMutCol(data, "LUSC","TP53") +#' } +addMutCol <- function( + data, + disease, + genes, + mutant_variant_classification = c( + "Frame_Shift_Del", + "Frame_Shift_Ins", + "Missense_Mutation", + "Nonsense_Mutation", + "Splice_Site", + "In_Frame_Del", + "In_Frame_Ins", + "Translation_Start_Site", + "Nonstop_Mutation" + ) +){ + maf <- TCGAbiolinks::GDCquery_Maf(disease, pipeline = "mutect2") + for(gene in genes) { + if(gene %in% maf$Hugo_Symbol) { + message("Adding information for gene: ", gene) + aux <- maf %>% filter(Hugo_Symbol == gene) # Select only mutation on that gene + idx <- unique(unlist(sapply(mutant_variant_classification,function(x) grep(x,aux$Variant_Classification, ignore.case = TRUE)))) + aux <- aux[idx,] + mutant.samples <- substr(aux$Tumor_Sample_Barcode,1,16) + colData(data)[,gene] <- "Normal" + colData(data)[colData(data)$TN == "Tumor", gene] <- "WT" + colData(data)[colData(data)$TN == "Tumor" & + colData(data)$sample %in% mutant.samples,gene] <- "Mutant" + message("The column ", gene, " was create in the MAE object") + print(plyr::count(colData(data)[,gene])) + } else { + message("No mutation found for: ", gene) + } + } + return(data) +} + +#' @title Create summary document for TCGA.pipe function +#' @description This function will create a text file with the +#' date of the last run, which aanalysis were performed, the values of +#' the arguments so the user can keep track +#' @param analysis Which analysis were performed +#' @param argument.values Other argument values changed +#' @param mode Mode "supervised" or "unsupervised" used in the analysis +#' @param genome Genome of reference hg38 and hg19 +#' @param mae.path Where mae is stored +#' @param direction Hypo or hyper direction +#' @param group.col Group col +#' @param group1 Group 1 +#' @param group2 Group 2 +#' @param results.path Path where the results were saved +createSummaryDocument <- function(analysis = "all", + argument.values = "defaults", + genome = NULL, + mae.path = NULL, + mode = NULL, + direction = NULL, + group.col = NULL, + group1 = NULL, + group2 = NULL, + results.path = NULL){ + message("Recording analysis information into: ",file.path(results.path,"TCGA.pipe_records.txt")) + df <- paste0("oooooooooooooooooooooooooooooooooooo\n", + "o date: ",Sys.time(),"\n", + "o analysis: ", paste(analysis, collapse = ","), "\n", + "o genome: ", ifelse(is.null(genome),"",genome), "\n", + "o mae.path: ", ifelse(is.null(mae.path),"",mae.path), "\n", + "o direction: ", ifelse(is.null(direction),"",direction), "\n", + "o mode: ", ifelse(is.null(mode),"",mode), "\n", + "o group.col: ", ifelse(is.null(group.col),"",group.col), "\n", + "o group1: ", ifelse(is.null(group1),"",group1), "\n", + "o group2: ", ifelse(is.null(group2),"",group2), "\n", + "o results.path: ", ifelse(is.null(results.path),"",results.path), "\n" + #"o argument.values: ",paste(paste0(names(argument.values),"=",as.character(argument.values)), collapse = ",") + ) + fileConn <- file(file.path(results.path,"TCGA.pipe_records.txt"),open = "w") + write(df, fileConn, append=TRUE) + close(fileConn) +} diff --git a/R/TFsurvival.plot.R b/R/TFsurvival.plot.R new file mode 100644 index 00000000..277fed8a --- /dev/null +++ b/R/TFsurvival.plot.R @@ -0,0 +1,56 @@ +#' Creates survival plot of based on the expression of a TF +#' @description +#' This function will create a survival plot for the samples with higher, midium, low expression +#' of a given transcription factor. +#' By defau;t samples with higher expression are the top 30% and the lower expression the bottom 30%. +#' @param data A multi assay Experiment with clinical data in the phenotypic data matrix +#' containing the following columns: vital_status, days_to_last_follow_up and days_to_death. Default from GDC and TCGAbiolinks +#' @param TF A gene symbol +#' @param xlim Limit x axis showed in plot +#' @param percentage A number ranges from 0 to 1 specifying the percentage of samples in the +#' higher and lower expression groups. Default is 0.3 +#' @param save Save plot as PDF +#' @importFrom TCGAbiolinks TCGAanalyze_survival +#' @export +TFsurvival.plot <- function(data, + TF, + xlim = NULL, + percentage = 0.3, + save = TRUE){ + if(!all(c("vital_status", "days_to_last_follow_up","days_to_death") %in% colnames(colData(data)))){ + message("colData must have the following columns: vital_status,days_to_last_follow_up, days_to_death") + return(NULL) + } + # For the transcription factor, gets it getGeneID + gene <- getGeneID(data,symbol=TF) + # Get the expression values for the genes. + # (getExp is a ELMER function) + exp <- as.vector(assay(getExp(data)[gene,])) + names(exp) <- colnames(getExp(data)) + exp <- sort(exp) + + # Get the names of the 30% patients with lower expression + lower <- names(head(exp, n = ceiling(length(exp) * percentage))) + + # Get the names of the 30% patients with higher expression + higher <- names(tail(exp, n = ceiling(length(exp) * percentage))) + + df <- colData(data) + # Create the labels for each sample + df$tf_groups <- "medium" + low.idx <- sampleMap(data)[sampleMap(data)$colname %in% lower,"primary"] + df[low.idx,"tf_groups"] <- "low" + high.idx <- sampleMap(data)[sampleMap(data)$colname %in% higher,"primary"] + df[high.idx,"tf_groups"] <- "high" + + filename <- NULL + if(save) filename <- paste0(TF,"_survival.pdf") + # Use TCGAbiolinks to create the survival curve + TCGAanalyze_survival(df, + "tf_groups", + legend=paste0(TF," Exp level"), + filename = filename, + xlim = xlim, + conf.int = FALSE, + risk.table = FALSE) +} diff --git a/R/methodAccess.R b/R/methodAccess.R new file mode 100644 index 00000000..67811b0f --- /dev/null +++ b/R/methodAccess.R @@ -0,0 +1,79 @@ +#' @title Get DNA methylation object from MAE +#' @description Get DNA methylation object from MAE +#' @param data A multiAssayExperiment with DNA methylation and Gene Expression data. See \code{\link{createMAE}} function. +#' @importFrom MultiAssayExperiment experiments +#' @export +getMet <- function(data) { + return(experiments(data)[["DNA methylation"]]) +} + +#' @title Get DNA methylation object samples from MAE +#' @description Get DNA methylation object samples from MAE +#' @param data A multiAssayExperiment with DNA methylation and Gene Expression data. See \code{\link{createMAE}} function. +#' @importFrom MultiAssayExperiment sampleMap +#' @export +getMetSamples <- function(data){ + return(sampleMap(data)[sampleMap(data)$assay == "DNA methylation","primary"]) +} + +#' @title Get Gene expression object samples from MAE +#' @description Get Gene expression object samples from MAE +#' @param data A multiAssayExperiment with DNA methylation and Gene Expression data. See \code{\link{createMAE}} function. +#' @importFrom MultiAssayExperiment sampleMap +#' @export +getExpSamples <- function(data){ + return(sampleMap(data)[sampleMap(data)$assay == "Gene expression","primary"]) +} + +#' @title Get Gene expression object from MAE +#' @description Get Gene expression object from MAE +#' @param data A multiAssayExperiment with DNA methylation and Gene Expression data. See \code{\link{createMAE}} function. +#' @importFrom MultiAssayExperiment experiments +#' @export +getExp <- function(data) { + return(experiments(data)[["Gene expression"]]) +} + +# Check input +checkData <- function(data){ + if(class(data) == class(MultiAssayExperiment())) { + if(! "DNA methylation" %in% names(data) | + !"Gene expression" %in% names(data)) + stop("Please the input should be a MultiAssayExperiment with both DNA methylation and Gene expression matrix. See function createMultiAssayExperiment") + } else { + stop("Please the input should be a MultiAssayExperiment with both DNA methylation and Gene expression matrix. See function createMultiAssayExperiment") + } + # We will need to ensure createMultiAssayExperiment add those fields if they don't exists + if(!"external_gene_name" %in% colnames(values(data))) stop("Please the input should be a Gene Expression data must have external_gene_name column") + if(!"ensembl_gene_id" %in% colnames(values(data))) stop("Please the input should be a Gene Expression data must have external_gene_name column") +} + +#'getSymbol to report gene symbol from id +#' @param data A multiAssayExperiment with DNA methylation and Gene Expression data. See \code{\link{createMAE}} function. +#' @param geneID A character which is the ensembl_gene_id +#' @return The gene symbol for input genes. +#' @export +#' @examples +#' data <- ELMER:::getdata("elmer.data.example") +#' getSymbol(data, geneID="ENSG00000143067") +getSymbol <- function(data,geneID){ + gene <- unique(values(getExp(data))[,c("ensembl_gene_id","external_gene_name")]) + gene <- gene[match(geneID,gene$ensembl_gene_id),"external_gene_name"] + return(gene) +} + +#'getGeneID to report gene id from symbol +#'@importFrom S4Vectors values +#' @param data A multiAssayExperiment with DNA methylation and Gene Expression data. See \code{\link{createMAE}} function. +#'@param symbol A vector of characters which are gene symbols +#'@return The gene ID for these gene symbols +#'@export +#'@examples +#' data <- ELMER:::getdata("elmer.data.example") +#' getGeneID(data, symbol="ZNF697") +getGeneID <- function(data,symbol){ + gene <- unique(values(getExp(data))[,c("ensembl_gene_id","external_gene_name")]) + gene <- gene[match(symbol,gene$external_gene_name),"ensembl_gene_id"] + return(gene) +} + diff --git a/R/motif.TF.Plots.R b/R/motif.TF.Plots.R new file mode 100644 index 00000000..a0e4a34e --- /dev/null +++ b/R/motif.TF.Plots.R @@ -0,0 +1,374 @@ +#' motif.enrichment.plot to plot bar plots showing motif enrichment ORs and 95\% confidence interval for ORs +#' @description +#' motif.enrichment.plot to plot bar plots showing motif enrichment ORs and +#' 95\% confidence interval for ORs. Option motif.enrichment can be a data frame +#' generated by \code{\link{get.enriched.motif}} or a path of XX.csv saved by the +#' same function. +#' @param motif.enrichment A data frame or a file path of get.enriched.motif output +#'motif.enrichment.csv file. +#' @param significant A list to select subset of motif. Default is NULL. +#' @param dir.out A path specify the directory to which the figures will be saved. +#'Current directory is default. +#' @param save A logic. If true (default), figure will be saved to dir.out. +#' @param label A character. Labels the outputs figure. +#' @param title Plot title. Default: no title +#' @param width Plot width +#' @param height Plot height. If NULL a default value will be calculated +#' @param summary Create a summary table along with the plot, it is necessary +#'to add two new columns to object (NumOfProbes and PercentageOfProbes) +#' @return A figure shows the enrichment level for selected motifs. +#' @details motif.enrichment If input data.frame object, it should contain "motif", +#' "OR", "lowerOR", "upperOR" columns. motif specifies name of motif; +#' OR specifies Odds Ratio, lowerOR specifies lower boundary of OR (95%) ; +#' upperOR specifies upper boundary of OR(95%). +#' @details significant A list used to select subset of motif.enrichment by the +#'cutoff of OR, lowerOR, upperOR. significant=list(OR=1). More than one cutoff +#'can be specified such as significant = list(OR=1, lowerOR=1,upperOR=4) +#' @importFrom ggplot2 aes ggplot geom_point geom_errorbar coord_flip geom_abline +#' @usage +#' motif.enrichment.plot(motif.enrichment, +#' significant = NULL, +#' dir.out ="./", +#' save = TRUE, +#' label = NULL, +#' title = NULL, +#' width = 10, +#' height = NULL, +#' summary = FALSE) +#' @author +#' Lijing Yao (creator: lijingya@usc.edu) +#' @references +#' Yao, Lijing, et al. "Inferring regulatory element landscapes and transcription +#' factor networks from cancer methylomes." Genome biology 16.1 (2015): 1. +#' @export +#' @importFrom grid gpar +#' @importFrom gridExtra grid.arrange arrangeGrob +#' @examples +#' motif.enrichment <- data.frame(motif = c("TP53","NR3C1","E2F1","EBF1","RFX5","ZNF143", "CTCF"), +#' OR = c(19.33,4.83,1, 4.18, 3.67,3.03,2.49), +#' lowerOR = c(10,3,1.09,1.9,1.5,1.9, 0.82), +#' upperOR = c(23,5,3,7,6,5,5), +#' stringsAsFactors = FALSE) +#' motif.enrichment.plot(motif.enrichment = motif.enrichment, +#' significant = list(OR = 3), +#' label = "hypo", save = FALSE) +#' motif.enrichment.plot(motif.enrichment = motif.enrichment, +#' significant = list(OR = 3), +#' label = "hypo", +#' title = "OR for paired probes hypomethylated in Mutant vs WT", +#' save = FALSE) +#' motif.enrichment <- data.frame(motif = c("TP53","NR3C1","E2F1","EBF1","RFX5","ZNF143", "CTCF"), +#' OR = c(19.33,4.83,1, 4.18, 3.67,3.03,2.49), +#' lowerOR = c(10,3,1.09,1.9,1.5,1.5, 0.82), +#' upperOR = c(23,5,3,7,6,5,5), +#' NumOfProbes = c(23,5,3,7,6,5,5), +#' PercentageOfProbes = c(0.23,0.05,0.03,0.07,0.06,0.05,0.05), +#' stringsAsFactors=FALSE) +#' motif.enrichment.plot(motif.enrichment = motif.enrichment, +#' significant = list(OR = 3), +#' label = "hypo", save = FALSE) +#' motif.enrichment.plot(motif.enrichment = motif.enrichment, +#' significant = list(OR = 3), +#' label = "hypo", +#' summary = TRUE, +#' title = "OR for paired probes hypomethylated in Mutant vs WT", +#' save = TRUE) +motif.enrichment.plot <- function(motif.enrichment, + significant = NULL, + dir.out ="./", + save = TRUE, + label = NULL, + title = NULL, + width = 10, + height = NULL, + summary = FALSE){ + if(missing(motif.enrichment)) stop("motif.enrichment is missing.") + if(is.character(motif.enrichment)){ + motif.enrichment <- read.csv(motif.enrichment, stringsAsFactors=FALSE) + } + if(!is.null(significant)){ + for(i in names(significant)){ + motif.enrichment <- motif.enrichment[motif.enrichment[,i] > significant[[i]],] + } + } + if(nrow(motif.enrichment) == 0) return(NULL) + motif.enrichment <- motif.enrichment[order(motif.enrichment$lowerOR,decreasing = TRUE),] + + if(summary){ + or.col <- paste0(round(motif.enrichment$OR,digits = 2), + " (", round(motif.enrichment$lowerOR,digits = 2),"-", + round(motif.enrichment$upperOR,digits = 2),")") + nb.idx <- grep("NumOf",colnames(motif.enrichment)) + if("PercentageOfProbes" %in% colnames(motif.enrichment)){ + probe.col <- paste0(motif.enrichment[,nb.idx,drop=T], + " (", round(100 * motif.enrichment$PercentageOfProbes, digits = 2),"%)") + } else { + probe.col <- paste0(motif.enrichment[,nb.idx]) + } + lab <- data.frame(x = factor(c("",as.character(motif.enrichment$motif)), + levels = rev(c("",as.character(motif.enrichment$motif)))), + y = rep(c(1,2,3),each=length(motif.enrichment$motif) + 1), + z = c("Motif",gsub("_HUMAN.H11MO.*","",as.character(motif.enrichment$motif)), + "Odds ratio \n (95% CI)", + or.col, + ifelse("PercentageOfProbes" %in% colnames(motif.enrichment), "# probes \n(% of paired)"," # regions"), + probe.col) + ) + + data_table <- ggplot(lab, aes_string(x = 'y', y = 'x', label = format('z', nsmall = 1))) + + theme_minimal() + + geom_text(size = 3.5, hjust=0, vjust=0.5) + + geom_hline(aes(yintercept=c(nrow(motif.enrichment) - 0.65))) + + labs(x="",y="") + + coord_cartesian(xlim=c(1,3.8)) + + theme(panel.grid.major = element_blank(), + panel.grid.minor.x = element_blank(),panel.background =element_blank(), + legend.position = "none", + panel.border = element_blank(), + axis.text.x = element_text(colour="white"),#element_blank(), + axis.text.y = element_blank(), + axis.ticks = element_line(colour="white"),#element_blank(), + plot.margin = unit(c(0,0,0,0), "lines")) + + motif.enrichment$motif <- factor(motif.enrichment$motif, + levels=as.character(motif.enrichment$motif[nrow(motif.enrichment):1])) + limits <- aes_string(ymax = 'upperOR', ymin = 'lowerOR') + motif.enrichment$probes <- NULL + motif.enrichment <- rbind(motif.enrichment,NA) + P <- ggplot(motif.enrichment, aes_string(x='motif', y='OR')) + + geom_point() + + geom_errorbar(limits, width=0.3) + + coord_flip() + + geom_abline(intercept = 1, slope = 0, linetype = "3313")+ + theme_bw() + + theme(panel.grid.major = element_blank()) + + xlab("") + ylab("Odds Ratio") + + ggtitle(label = NULL, subtitle = NULL) + + scale_y_continuous(breaks=c(1,pretty(motif.enrichment$OR, n = 5))) + + theme(axis.text.y = element_blank(), + axis.ticks = element_line(colour="white"),#element_blank(), + plot.margin = unit(c(0,0,0,0), "lines")) + + suppressWarnings({ + P <- arrangeGrob(data_table, P, ncol=2, + widths = c(2,2), + heights = c(0.95,0.05), + top = title) + + }) + } else { + + motif.enrichment$motif <- factor(motif.enrichment$motif, + levels = as.character(motif.enrichment$motif[nrow(motif.enrichment):1])) + limits <- aes_string(ymax = 'upperOR', ymin = 'lowerOR') + P <- ggplot(motif.enrichment, aes_string(x = 'motif', y = 'OR')) + + geom_point() + + geom_errorbar(limits, width = 0.3) + + coord_flip() + + geom_abline(intercept = 1, slope = 0, linetype = "3313")+ + theme_bw() + + theme(panel.grid.major = element_blank()) + + xlab("Motifs") + ylab("Odds Ratio") + + ggtitle(label = title, subtitle = NULL) + + scale_y_continuous(breaks = c(1,pretty(motif.enrichment$OR, n = 5))) + } + if(save) { + dir.create(dir.out, recursive = TRUE, showWarnings = FALSE) + ggsave(filename = sprintf("%s/%s.motif.enrichment.pdf",dir.out,label), + useDingbats = FALSE, + plot = P, + dpi = 320, + width = width, + limitsize = FALSE, + height = ifelse(is.null(height), 3 * round(nrow(motif.enrichment)/8),height)) + return() + } + if(summary) grid.arrange(P) + return(P) +} + + + +#' TF.rank.plot to plot the scores (-log10(P value)) which assess the correlation between +#' TF expression and average DNA methylation at motif sites. +#' @description +#' TF.rank.plot is a function to plot the scores (-log10(P value)) which assess the +#' correlation between TF expression and average DNA methylation at motif sites. The the motif +#' relevant TF and top3 TFs will be labeled in a different color. +#'@importFrom ggplot2 scale_color_manual geom_vline geom_text position_jitter +#'@importFrom ggplot2 annotation_custom unit ggplot_gtable ggplot_build +#'@importFrom ggrepel geom_text_repel +#'@param motif.pvalue A matrix or a path specifying location of "XXX.with.motif.pvalue.rda" +#'which is output of getTF. +#'@param motif A vector of characters specify the motif to plot +#'@param TF.label A list shows the label for each motif. If TF.label is not specified, +#'the motif relevant TF and top3 TF will be labeled. +#'@param dir.out A path specify the directory to which the figures will be saved. +#'Current directory is default. +#'@param save A logic. If true (default), figure will be saved to dir.out +#'@param title Tite title (the motif will still be added to the title) +#'@param cores A interger which defines the number of cores to be used in parallel process. +#'Default is 1: no parallel process. +#'@return A plot shows the score (-log(P value)) of association between TF +#'expression and DNA methylation at sites of a certain motif. +#'@export +#' @author Lijing Yao (maintainer: lijingya@usc.edu) +#' @importFrom graphics plot +#' @importFrom ggplot2 ggsave +#' @importFrom plyr alply +#'@examples +#' library(ELMER) +#' data <- tryCatch(ELMER:::getdata("elmer.data.example"), error = function(e) { +#' message(e) +#' data(elmer.data.example, envir = environment()) +#' }) +#' enriched.motif <- list("P53_HUMAN.H11MO.0.A"= c("cg00329272", "cg10097755", "cg08928189", +#' "cg17153775", "cg21156590", "cg19749688", "cg12590404", +#' "cg24517858", "cg00329272", "cg09010107", "cg15386853", +#' "cg10097755", "cg09247779", "cg09181054")) +#' TF <- get.TFs(data, +#' enriched.motif, +#' group.col = "definition", +#' group1 = "Primary solid Tumor", +#' group2 = "Solid Tissue Normal", +#' TFs = data.frame( +#' external_gene_name=c("TP53","TP63","TP73"), +#' ensembl_gene_id= c("ENSG00000141510", +#' "ENSG00000073282", +#' "ENSG00000078900"), +#' stringsAsFactors = FALSE), +#' label="hypo") +#' TF.meth.cor <- get(load("getTF.hypo.TFs.with.motif.pvalue.rda")) +#' TF.rank.plot(motif.pvalue=TF.meth.cor, +#' motif="P53_HUMAN.H11MO.0.A", +#' TF.label=createMotifRelevantTfs("subfamily")["P53_HUMAN.H11MO.0.A"], +#' save=TRUE) +#' TF.rank.plot(motif.pvalue=TF.meth.cor, +#' motif="P53_HUMAN.H11MO.0.A", +#' save=TRUE) +#' # Same as above +#' TF.rank.plot(motif.pvalue=TF.meth.cor, +#' motif="P53_HUMAN.H11MO.0.A", +#' dir.out = "TFplots", +#' TF.label=createMotifRelevantTfs("family")["P53_HUMAN.H11MO.0.A"], +#' save=TRUE) +TF.rank.plot <- function(motif.pvalue, + motif, + title = NULL, + TF.label = NULL, + dir.out = "./", + save = TRUE, + cores = 1){ + if(missing(motif.pvalue)) stop("motif.pvalue should be specified.") + if(missing(motif)) stop("Please specify which motif you want to plot.") + if(!all(motif %in% colnames(motif.pvalue))) { + print(knitr::kable(sort(colnames(motif.pvalue)), col.names = "motifs")) + stop("One of the motifs does not match. Select from the list above") + } + TF.sublabel <- NULL + if(is.character(motif.pvalue)) { + motif.pvalue <- get(load(motif.pvalue)) # The data is in the one and only variable + } + if(is.null(TF.label)){ + motif.relavent.TFs <- createMotifRelevantTfs() + TF.label <- motif.relavent.TFs[motif] + motif.relavent.TFs <- createMotifRelevantTfs("subfamily") + TF.sublabel <- motif.relavent.TFs[motif] + specify <- "No" + } else { + specify <- "Yes" + } + significant <- floor(0.05 * nrow(motif.pvalue)) + motif.pvalue <- -log10(motif.pvalue) + + parallel <- FALSE + if (cores > 1){ + if (cores > detectCores()) cores <- detectCores() + registerDoParallel(cores) + parallel = TRUE + } + + Plots <- alply(motif, 1, function(i){ + df <- data.frame(pvalue = motif.pvalue[,i], + Gene = rownames(motif.pvalue), + stringAsFactors = FALSE) + df <- df[order(df$pvalue, decreasing = TRUE),] + df$rank <- 1:nrow(df) + + df$label <- "None" + df$label[df$rank %in% 1:3] <- "Top 3" + + + # TF in the family + if(!is.null(TF.label)){ + TF.family <- TF.label[[i]] + df$label[df$Gene %in% TF.family] <- "Same family" + } + if(!is.null(TF.sublabel)){ + # TF in the subfamily + TF.subfamily <- TF.sublabel[[i]] + df$label[df$Gene %in% TF.subfamily] <- "Same subfamily" + } + df.label <- data.frame(pvalue = df$pvalue[df$label %in% c("Same family","Same subfamily","Top 3")], + text = as.character(df$Gene[df$label %in% c("Same family","Same subfamily","Top 3")]), + x = which(df$label %in% c("Same family","Same subfamily","Top 3")), + stringsAsFactors = FALSE) + + highlight.top3 <- df[df$label %in% c("Top 3"),] + highlight.family <- df[df$label %in% c("Same family"),] + highlight.subfamily <- df[df$label %in% c("Same subfamily"),] + df$label <- factor(df$label, levels = c("None","Same family","Same subfamily","Top 3")) + P <- ggplot(df, aes_string(x = 'rank', + y = 'pvalue', + color = 'label')) + + scale_color_manual(name = "TF classification",values = c("None" = "black", + "Top 3" = "red", + "Same family" = "orange", + "Same subfamily" = "lightblue")) + + geom_vline(xintercept = significant, linetype = "3313") + + geom_point() + + theme_bw() + + theme(panel.grid.major = element_blank(), + panel.grid.minor = element_blank())+ + theme(legend.position="top") + + labs(x = "Rank", + y ="-log10(corrected P-value)", + title = ifelse(is.null(title),paste0("Motif: ",gsub("_HUMAN.H11MO.*","",i)), + paste0(title, " (", gsub("_HUMAN.H11MO.*","",i),")"))) + + geom_point(data = highlight.top3, aes_string(x = 'rank', y = 'pvalue')) + + geom_point(data = highlight.family, aes_string(x = 'rank', y = 'pvalue')) + + geom_point(data = highlight.subfamily, aes_string(x = 'rank', y = 'pvalue')) + + df$Gene <- as.character(df$Gene) + df$Gene[df$label %in% "None"] <- "" + P <- P + geom_text_repel( + data = df, + aes_string(label = 'Gene'), + min.segment.length = unit(0.0, "lines"), + size = 3, + segment.alpha = 0.5, + segment.color = "gray", + nudge_x = 10, + show.legend = FALSE, + fontface = 'bold', color = 'black', + box.padding = unit(0.8, "lines"), + point.padding = unit(1.0, "lines") + ) + + if(save){ + dir.create(dir.out, showWarnings = FALSE,recursive = TRUE) + file <- sprintf("%s/%s.TFrankPlot.pdf", dir.out, i) + message("Saving plot as: ", file) + ggsave(P, filename = file, height = 8, width = 10) + } + P + },.progress = "time",.parallel = parallel) + + names(Plots) <- motif + + if(!save) return(Plots) +} + + + + diff --git a/R/plots.R b/R/plots.R new file mode 100644 index 00000000..ab068f88 --- /dev/null +++ b/R/plots.R @@ -0,0 +1,1219 @@ +#' scatter.plot to plot scatter plots between gene expression and DNA methylation. +#' @description +#' scatter.plot is a function to plot various scatter plots between gene expression and +#' DNA methylation. When byPair is specified, scatter plot for individual probe-gene pairs +#' will be generated. When byProbe is specified, scatter plots for one probes with nearby +#' 20 gene pairs will be generated. When byTF is specified, scatter plot for TF expression +#' and average DNA methylation at certain motif sites will be generated. +#' @param data A multiAssayExperiment with DNA methylation and Gene Expression data. +#' See \code{\link{createMAE}} function. +#' @param group.col A column defining the groups of the sample. You can view the +#' available columns using: colnames(MultiAssayExperiment::colData(data)). +#' @param group1 A group from group.col. ELMER will run group1 vs group2. +#' That means, if direction is hyper, get probes +#' hypermethylated in group 1 compared to group 2. +#' @param group2 A group from group.col. ELMER will run group1 vs group2. +#' That means, if direction is hyper, get probes +#' hypermethylated in group 1 compared to group 2. +#' @param diff.dir A character can be "hypo" or "hyper", showing differential +#' methylation dirction. It can be "hypo" which is only selecting hypomethylated probes; +#' "hyper" which is only selecting hypermethylated probes; +#' @param minSubgroupFrac A number ranges from 0 to 1 specifying the percentage of samples +#' from group1 and group2 that are used to identify the differential methylation. +#' Default is 0.2 because we did not expect all cases to be from a single molecular +#' subtype.But, If you are working with molecular subtypes please set it to 1. +#' @param min.samples Minimun number of samples to use in the analysis. Default 5. +#' If you have 10 samples in one group, percentage is 0.2 this will give 2 samples +#' in the lower quintile, but then 5 will be used. +#' @param title plot title +#' @param save Save plot as PNG +#' @param filename File names (.png) to save the file (i.e. "plot.png") +#' @param legend.col legend title +#' @param probe Character with probe name (i.e. "cg24517858") +#' @return Box plot +#' @importFrom plotly plot_ly layout +#' @importFrom dplyr top_n filter select %>% +#' @export +#' @author Tiago Chedraoui Silva (tiagochst at gmail.com) +#' @examples +#' \dontrun{ +#' data <- ELMER:::getdata("elmer.data.example") +#' group.col <- "subtype_Expression.Subtype" +#' group1 <- "classical" +#' group2 <- "secretory" +#' metBoxPlot(data, +#' group.col = group.col, +#' group1 = group1, +#' group2 = group2, +#' probe ="cg17898069", +#' minSubgroupFrac = 0.2, +#' diff.dir = "hypo") +#'} +metBoxPlot <- function(data, + group.col, + group1, + group2, + probe, + min.samples = 5, + minSubgroupFrac = 0.2, + diff.dir = "hypo", + legend.col = NULL, + title = NULL, + filename = NULL, + save = TRUE) { + if(missing(data)) stop("Please set data argument") + if(missing(group.col)) stop("Please set group.col argument") + if(missing(group1)) stop("Please set group1 argument") + if(missing(group2)) stop("Please set group2 argument") + if(missing(probe)) stop("Please set probe argument") + + if(is.null(filename)){ + if(is.null(legend.col)) filename <- paste0(group.col,"_",probe) + if(!is.null(legend.col)) filename <- paste0(group.col,"_",probe,"_",legend.col) + filename <- paste0(gsub("\\.","_",filename),".png") + } + + if(is.null(legend.col)) { + aux <- data.frame("group" = colData(data)[,group.col], + "DNA methylation beta value" = assay(getMet(data))[probe,]) %>% + filter(group %in% c(group1,group2)) %>% droplevels + pos <- 2 + showlegend <- FALSE + } else { + aux <- data.frame("group" = colData(data)[,group.col], + "legend" = colData(data)[,legend.col], + "DNA methylation beta value" = assay(getMet(data))[probe,]) %>% + filter(group %in% c(group1,group2)) %>% droplevels + if(any(is.na(aux$legend))) aux$legend[is.na(aux$legend)] <- "NA" + pos <- 3 + showlegend <- TRUE + } + if(diff.dir == "hyper") { + val.used <- rbind(aux %>% filter(group %in% group2) %>% top_n(min(min.samples,ceiling(sum(aux$group == group2) * .2))) %>% select(pos), + aux %>% filter(group %in% group1) %>% top_n(min(min.samples,ceiling(sum(aux$group == group1) * .2))) %>% select(pos)) + } else { + val.used <- rbind(aux %>% filter(group %in% group2) %>% top_n(-min(min.samples,ceiling(sum(aux$group == group2) * .2))) %>% select(pos), + aux %>% filter(group %in% group1) %>% top_n(-min(min.samples,ceiling(sum(aux$group == group1) * .2))) %>% select(pos)) + } + aux$used <- " (100%)" + aux.used <- aux[aux$DNA.methylation.beta.value %in% val.used$DNA.methylation.beta.value,] + aux.used$used <- paste0(" (", minSubgroupFrac * 100, "%",")") + aux <- rbind(aux,aux.used) + aux$group <- paste0(aux$group, aux$used) + if(is.null(title)) title <- paste0("Boxplot DNA methylation (probe ", probe,")") + if(is.null(legend.col)) { + legend.col <- "used" + legend.title <- "" + } else { + legend.title <- legend.col + legend.col <- "legend" + } + + suppressWarnings({ + p <- plot_ly(data = aux, + x = ~group, + y = ~DNA.methylation.beta.value, + color = ~eval(as.name(paste(legend.col))), + type = "box", boxpoints = "all", jitter = 0.3, + pointpos = -1.8) %>% + plotly::add_annotations( text=legend.title, xref="paper", yref="paper", + x=1.02, xanchor="left", + y=0.8, yanchor="bottom", # Same y as legend below + legendtitle=TRUE, showarrow=FALSE ) %>% + layout(title = title, boxmode = "group", xaxis = list(title = ""), + legend=list(y=0.8, yanchor="top" ), + font = list(size = 12), + showlegend = showlegend, + margin = list( + l = 100, + r = 300, + b = 100, + t = 100, + pad = 4 + )) + if(save){ + if (!requireNamespace("webshot", quietly = TRUE)) { + stop("webshot package is needed for this function to work. Please install it and run webshot::install_phantomjs()", + call. = FALSE) + } + plotly::export(p, file = filename, vwidth = 992 , vheight = 744 ) + message("Saved as ", filename) + } else { + return(p) + } + }) +} + +#' Heatmap of pairs gene and probes anti-correlated +#' @description +#' Heatmp plot of pairs gene and probes anti-correlated +#' @param data A MultiAssayExperiment with a DNA methylation SummarizedExperiment (all probes) and a gene Expression SummarizedExperiment. +#' @param group.col A column from the sample matrix from the MultiAssayExperiment object. Accessed with colData(mae) +#' @param group1 A group from group.col. ELMER will run group1 vs group2. +#' That means, if direction is hyper, get probes +#' hypermethylated in group 1 compared to group 2. +#' @param group2 A group from group.col. ELMER will run group1 vs group2. +#' That means, if direction is hyper, get probes +#' hypermethylated in group 1 compared to group 2. +#' @param subset Subset MAE object to keep only groups compared ? +#' @param pairs List of probe and pair genes +#' @param annotation.col A vector of columns from the sample matrix from the MultiAssayExperiment object. Accessed with colData(mae) +#' to be added as annotation to the heatmap. +#' @param met.metadata A vector of metdatada columns available in the DNA methylation GRanges to should be added to the heatmap. +#' @param exp.metadata A vector of metdatada columns available in the Gene expression GRanges to should be added to the heatmap. +#' @param width Figure width +#' @param height Figure height +#' @param filename File names (.pdf) to save the file (i.e. "plot.pdf"). If NULL return plot. +#' @param cluster.within.groups Cluster columns based on the groups +#' @param plot.distNearestTSS Plot track with distNearestTSS ? +#' @return A heatmap +#' @import ComplexHeatmap circlize +#' @importFrom stats hclust dist +#' @importFrom grid unit.c grobWidth textGrob +#' @importFrom plyr ddply . +#' @importFrom GenomicRanges distanceToNearest +#' @importFrom grDevices png +#' @export +#' @author Tiago Chedraoui Silva (tiagochst at gmail.com) +#' @examples +#' \dontrun{ +#' data <- ELMER:::getdata("elmer.data.example") +#' group.col <- "subtype_Expression.Subtype" +#' group1 <- "classical" +#' group2 <- "secretory" +#' pairs <- data.frame(Probe = c("cg15924102","cg19403323", "cg22396959"), +#' GeneID = c("ENSG00000196878", "ENSG00000009790", "ENSG00000009790" ), +#' Symbol = c("TRAF3IP3","LAMB3","LAMB3"), +#' Distance = c(6017,168499,0), +#' Raw.p = c(0.001,0.00001,0.001), +#' Pe = c(0.001,0.00001,0.001)) +#' heatmapPairs( +#' data = data, group.col = group.col, +#' group1 = group1, group2 = group2, +#' annotation.col = c("ethnicity","vital_status","age_at_diagnosis"), +#' pairs, filename = "heatmap.pdf", +#' height = 4, width = 11 +#' ) +#' } +heatmapPairs <- function( + data, + group.col, + group1, + group2, + pairs, + subset = FALSE, + cluster.within.groups = TRUE, + plot.distNearestTSS = FALSE, + annotation.col = NULL, + met.metadata = NULL, + exp.metadata = NULL, + width = 10, + height = 7, + filename = NULL +) { + + if(missing(data)) stop("Please set data argument") + if(missing(group.col)) stop("Please set group.col argument") + if(missing(pairs)) stop("Please set probe argument") + + if((!"distNearestTSS" %in% colnames(pairs)) & plot.distNearestTSS) { + # For a given probe and gene find nearest TSS + pairs <- addDistNearestTSS(data, pairs) + } + if(!missing(group1) & subset){ + message("Subsetting") + data <- data[,colData(data)[,group.col] %in% c(group1, group2)] + } + + # Remove pairs to be ploted if not found in the object + pairs <- pairs[pairs$Probe %in% rownames(getMet(data)),] + pairs <- pairs[pairs$GeneID %in% rownames(getExp(data)),] + + meth <- assay(getMet(data))[pairs$Probe,] + exp <- assay(getExp(data))[pairs$GeneID,] + + order <- NULL + cluster_columns <- TRUE + if(cluster.within.groups){ + message("Ordering groups") + cluster_columns <- FALSE + order <- unlist(plyr::alply(as.character(unique(colData(data)[,group.col])), 1 , function(x) { + if(is.na(x)){ + idx <- which(is.na(colData(data)[,group.col])) + } else { + idx <- which(colData(data)[,group.col] == x) + } + aux <- na.omit(meth[,idx]) + order <- t(aux) %>% dist %>% hclust(method = "average") + as.numeric(idx[order$order]) + } + )) + } + + + + # Create color + colors <- c("#6495ED", "#8B2323", "#458B74", "#7AC5CD", + "#4F4F4F", "#473C8B", "#00F5FF", "#CD6889", + "#B3EE3A", "#7B68EE", "#CDAF95", "#0F0F0F", "#FF7F00", + "#00008B", "#5F9EA0", "#F0FFFF", "#8B6969", "#9FB6CD", "#D02090", + "#FFFF00", "#104E8B", "#B22222", "#B3EE3A", "#FF4500", "#4F94CD", + "#40E0D0", "#F5FFFA", "#8B3A62", "#FF3030", "#FFFFFF", + "#191970","#BC8F8F","#778899","#2F4F4F", + "#FFE4E1", "#F5F5DC") + l <- length(unique(colData(data)[,c(group.col)])) + l.all <- l + col <- colors[1:l] + names(col) <- unique(colData(data)[,c(group.col)]) + names(col)[is.na(names(col))] <- "NA" + col.list <- list() + col.list[[group.col]] <- col + + for(i in annotation.col){ + l <- length(unique(colData(data)[,c(i)])) + if(l == 1) next + p <- l/length(na.omit(colData(data)[,c(i)])) + + # Is the variable non numeric ? + # i.e. entry might be purity lelels as character, while they represent a number + if(any(is.na(as.numeric(na.omit(colData(data)[,c(i)]))))){ + col.idx <- (l.all + 1):(l.all + l) %% (length(colors) + 1) + if(0 %in% col.idx) col.idx <- col.idx + 1 + col <- colors[col.idx] + l.all <- l.all + l + n <- unique(colData(data)[,c(i)]) + n[is.na(n)] <- "NA" + names(col) <- n + col.list[[i]] <- col + } else { + message("Considering variable ", i, " as numeric") + suppressWarnings({ + nb <- as.numeric(colData(data)[,c(i)]) + colData(data)[,c(i)] <- nb + if(!all(na.omit(nb) >=0)){ + col <- circlize::colorRamp2( + c(min(nb,na.rm = T), + (max(nb,na.rm = T) + min(nb,na.rm = T))/2, + max(nb,na.rm = T)), c(colors[(l.all+1)],"white", colors[(l.all + 2)]) + ) + l.all <- l.all + 2 + } else { + col.list[[i]] <- col + col <- circlize::colorRamp2( + c(min(nb,na.rm = T), + max(nb,na.rm = T)), + c("white", colors[(l.all+1):(l.all + 1)]) + ) + l.all <- l.all + 1 + } + col.list[[i]] <- col + + }) + } + } + # Annotation track + ha = HeatmapAnnotation( + df = colData(data)[,c(group.col,annotation.col),drop = F], + col = col.list, + show_annotation_name = TRUE, + border = TRUE, + annotation_name_side = "left", + annotation_name_gp = gpar(fontsize = 6) + ) + ha2 = HeatmapAnnotation( + df = colData(data)[,c(group.col,annotation.col),drop = F], + show_legend = F, + show_annotation_name = F, + border = TRUE, + col = col.list + ) + ht_list <- Heatmap( + meth, + name = "DNA methylation beta level", + col = colorRamp2(c(0, 0.5, 1), c("darkblue", "white", "gold")), + column_names_gp = gpar(fontsize = 8), + show_column_names = FALSE, + column_order = order, + show_row_names = TRUE, + use_raster = TRUE, + raster_device = c("png"), + raster_quality = 2, + cluster_columns = cluster_columns, + cluster_rows = TRUE, + row_names_gp = gpar(fontsize = 2), + top_annotation = ha, + column_title = "DNA methylation", + column_title_gp = gpar(fontsize = 10), + row_title_gp = gpar(fontsize = 10) + ) + + if(!is.null(met.metadata)) { + for(i in met.metadata) + ht_list <- ht_list + + Heatmap( + values(getMet(data)[pairs$Probe,])[i], + name = i, + use_raster = TRUE, + raster_device = c("png"), + raster_quality = 2, + width = unit(5, "mm"), + column_title = "", + show_column_names = F + ) + } + + ht_list <- ht_list + + Heatmap( + t(apply(exp, 1, scale)), + name = "Gene Expression (z-score)", + col = colorRamp2(c(-2, 0, 2), c("blue", "white", "red")), + top_annotation = ha2, + show_row_names = FALSE, + use_raster = TRUE, + raster_device = c("png"), + raster_quality = 2, + column_order = order, + cluster_columns = cluster_columns, + column_names_gp = gpar(fontsize = 8), + show_column_names = FALSE, + column_title = "Gene expression", + column_title_gp = gpar(fontsize = 10) + ) + + if(!is.null(exp.metadata)) { + for(i in exp.metadata) + ht_list <- ht_list + + Heatmap( + values(getExp(data)[pairs$Probe,])[i], + name = i, + use_raster = TRUE, + raster_device = c("png"), + raster_quality = 2, + width = unit(5, "mm"), + column_title = "", + row_title = paste0(nrow(pairs), "pairs of probes and genes"), + show_column_names = FALSE + ) + } + if(plot.distNearestTSS){ + ht_list <- ht_list + + Heatmap( + log10(pairs$distNearestTSS + 1), + name = "log10(distNearestTSS + 1)", + width = unit(5, "mm"), + column_title = "", + show_column_names = FALSE, + use_raster = TRUE, + raster_device = c("png"), + raster_quality = 2, + col = colorRamp2(c(0, 8), c("white", "orange")), + heatmap_legend_param = list( + at = log10(1 + c(0, 10, 100, 1000, 10000, 100000, 1000000,10000000,100000000)), + labels = c("0", "10bp", "100bp", "1kb", "10kb", "100kb", "1mb","10mb","100mb")) + ) + } + ht_list <- ht_list + + ht_global_opt( + legend_title_gp = gpar(fontsize = 10, fontface = "bold"), + legend_labels_gp = gpar(fontsize = 10) + ) + if(is.null(filename)) return(ht_list) + padding = unit.c( + unit(2, "mm"), grobWidth(textGrob(paste(rep("a",max(nchar(c(group.col,annotation.col)))/1.5), collapse = ""))) - unit(1, "cm"), + unit(c(2, 2), "mm") + ) + if(grepl("\\.pdf",filename)) { + message("Saving as PDF") + pdf(filename, width = width, height = height) + } + if(grepl("\\.png",filename)) { + message("Saving as PNG") + if(width < 100) width <- 1000 + if(height < 100) height <- 1000 + png(filename, width = width, height = height) + } + draw( + ht_list, + #padding = padding, + newpage = TRUE, + column_title = "Correspondence between probe DNA methylation and distal gene expression", + column_title_gp = gpar(fontsize = 12, fontface = "bold"), + annotation_legend_side = "right", + merge_legend = TRUE + ) + dev.off() +} + + + +#' @title Heatmap for correlation between probes DNA methylation and a single gene expression. +#' @description +#' This heatmap will sort samples by their gene expression and show the DNA methylation levels of the paired probes to that gene. +#' If no pairs are given, nearest probes will be selected. +#' To use this function you MAE object (input data) will need all probes and not only the distal ones. +#' This plot can be used to evaluate promoter, and intro, exons regions and closer distal probes of a gene to verify if their +#' DNA methylation level is affecting the gene expression +#' @param data A MultiAssayExperiment with a DNA methylation SummarizedExperiment (all probes) and a gene Expression SummarizedExperiment. +#' @param group.col A column from the sample matrix from the MultiAssayExperiment object. Accessed with colData(mae) +#' @param group1 A group from group.col. ELMER will run group1 vs group2. +#' That means, if direction is hyper, get probes +#' hypermethylated in group 1 compared to group 2. +#' @param group2 A group from group.col. ELMER will run group1 vs group2. +#' That means, if direction is hyper, get probes +#' hypermethylated in group 1 compared to group 2. +#' @param pairs List of probe and pair genes +#' @param GeneSymbol Gene Symbol +#' @param annotation.col A vector of columns from the sample matrix from the MultiAssayExperiment object. Accessed with colData(mae) +#' to be added as annotation to the heatmap +#' @param met.metadata A vector of metdatada columns available in the DNA methylation GRanges to should be added to the heatmap. +#' @param exp.metadata A vector of metdatada columns available in the Gene expression GRanges to should be added to the heatmap. +#' @param scatter.plot Plot scatter plots +#' @param correlation.method Correlation method: Pearson or sperman +#' @param correlation.table save table with spearman correlation analysis ? +#' @param numFlankingGenes numFlankingGenes to plot. +#' @param filter.by.probe.annotation Filter probes to plot based on probes annotation +#' @param dir.out Where to save the plots +#' @param width Figure width +#' @param height Figure height +#' @param scatter.plot.width Scatter plot width +#' @param scatter.plot.height Scatter plot height +#' @param filename File names (.pdf) to save the file (i.e. "plot.pdf"). If NULL return plot. +#' @return A heatmap +#' @import ComplexHeatmap circlize +#' @importFrom stats hclust dist +#' @importFrom grid unit.c grobWidth textGrob +#' @importFrom plyr ddply . +#' @importFrom GenomicRanges distanceToNearest +#' @importFrom grDevices png +#' @importFrom ggpubr ggscatter stat_cor +#' @importFrom IRanges subsetByOverlaps +#' @importFrom reshape2 melt +#' @export +#' @author Tiago Chedraoui Silva (tiagochst at gmail.com) +#' @examples +#' \dontrun{ +#' data <- ELMER:::getdata("elmer.data.example") +#' group.col <- "subtype_Expression.Subtype" +#' group1 <- "classical" +#' group2 <- "secretory" +#' pairs <- data.frame(ID = c("cg15924102","cg19403323", "cg22396959"), +#' GeneID = c("ENSG00000196878", "ENSG00000009790", "ENSG00000009790" ), +#' Symbol = c("TRAF3IP3","LAMB3","LAMB3"), +#' Side = c("R1","L1","R3"), +#' Distance = c(6017,168499,0), +#' stringsAsFactors = FALSE) +#' heatmapGene(data = data, +#' group.col = group.col, +#' group1 = group1, +#' group2 = group2, +#' pairs = pairs, +#' GeneSymbol = "LAMB3", +#' height = 5, +#' annotation.col = c("ethnicity","vital_status"), +#' filename = "heatmap.pdf") +#' \dontrun{ +#' heatmapGene(data = data, +#' group.col = group.col, +#' group1 = group1, +#' group2 = group2, +#' GeneSymbol = "ACP6", +#' annotation.col = c("ethnicity","vital_status"), +#' filename = "heatmap_closer_probes.pdf") +#' } +#'} +heatmapGene <- function(data, + group.col, + group1, + group2, + pairs, + GeneSymbol, + scatter.plot = FALSE, + correlation.method = "pearson", + correlation.table = FALSE, + annotation.col = NULL, + met.metadata = NULL, + exp.metadata = NULL, + dir.out = ".", + filter.by.probe.annotation = TRUE, + numFlankingGenes = 10, + width = 10, + height = 10, + scatter.plot.width = 10, + scatter.plot.height = 10, + filename = NULL) { + + if(missing(data)) stop("Please set data argument") + if(missing(group.col)) stop("Please set group.col argument") + if(missing(GeneSymbol)) stop("Please set GeneSymbol argument") + dir.create(dir.out,showWarnings = FALSE, recursive = TRUE) + + # Probe and gene info + probes.info <- rowRanges(getMet(data)) # 450K and hg38 + gene.info <- rowRanges(getExp(data)) + gene.location <- gene.info[gene.info$external_gene_name == GeneSymbol,] + print(gene.location) + if(length(gene.location) == 0) { + message("Gene not found: ", GeneSymbol) + return(NULL) + } + + # If pairs are missing we will select the probes that are closer to the gene + if(missing(pairs)) { + + # Get closer genes + selected.gene.gr <- gene.info[gene.info$external_gene_name == GeneSymbol,] + gene.follow <- gene.info[follow(selected.gene.gr,gene.info,ignore.strand=T),]$external_gene_name %>% as.character + gene.precede <- gene.info[precede(selected.gene.gr,gene.info,ignore.strand=T),]$external_gene_name %>% as.character + gene.gr <- gene.info[gene.info$external_gene_name %in% c(gene.follow,gene.precede,GeneSymbol),] + + + # Get the regions of the 2 nearest genes, we will get all probes on those regions + regions <- data.frame( + chrom = unique(as.character(seqnames(gene.gr))), + start = min(start(gene.gr)), + end = max(end(gene.gr))) %>% + makeGRangesFromDataFrame + p <- names(sort(subsetByOverlaps(probes.info, regions, ignore.strand = TRUE))) + if(length(p) == 0) stop("No probes close to the gene were found") + + pairs <- ELMER::GetNearGenes(probes = p, + data = data, + numFlankingGenes = numFlankingGenes) + pairs <- pairs[pairs$Symbol %in% GeneSymbol,] + pairs <- addDistNearestTSS(data, pairs) %>% na.omit + p <- rev(names(sort(probes.info[p], ignore.strand=TRUE))) + pairs <- pairs[match(p,pairs$ID),] %>% na.omit + + if(filter.by.probe.annotation){ + + if (!requireNamespace("sesameData", quietly = TRUE)) { + stop("sesameData is needed. Please install it.", + call. = FALSE) + } + + + metadata <- sesameData::sesameDataGet("HM450.hg19.manifest") + probes.annotation <- names(metadata[grep(GeneSymbol,metadata$gene_HGNC),]) + pairs <- pairs[pairs$ID %in% probes.annotation,] + } + + if(correlation.table == T){ + if(correlation.method == "spearman"){ + corretlation.tab <- plyr::adply(pairs$ID,1,function(x){ + met <- assay(getMet(data)[x,]) + exp <- assay(getExp(data)[gene.location$ensembl_gene_id,]) + ret <- cor.test(x = met[which(!is.na(met))], + y = exp[which(!is.na(met))], + method = "spearman", + exact = TRUE) + tibble::tibble("rho" = ret$estimate, "p.value" = ret$p.value) + },.id = NULL) + corretlation.tab$FDR <- p.adjust(corretlation.tab$p.value, method = "fdr") + file.name.table <- paste0(dir.out,"spearman_correlation_",GeneSymbol,"_vs_near_probes.tsv") + message("Saving spearman correlation table as: ", file.name.table) + write_tsv(cbind(as.data.frame(pairs)[,c(1:3,6)],corretlation.tab),path = file.name.table) + } else if(correlation.method == "pearson"){ + corretlation.tab <- plyr::adply(pairs$ID,1,function(x){ + met <- assay(getMet(data)[x,]) + exp <- assay(getExp(data)[gene.location$ensembl_gene_id,]) + ret <- cor.test(x = met[which(!is.na(met))], + y = exp[which(!is.na(met))], + method = "pearson", + exact = TRUE) + tibble::tibble("cor" = ret$estimate, "p.value" = ret$p.value) + },.id = NULL) + corretlation.tab$FDR <- p.adjust(corretlation.tab$p.value, method = "fdr") + file.name.table <- paste0(dir.out,"/pearson_correlation_",GeneSymbol,"_vs_near_probes.tsv") + message("Saving pearson correlation table as: ", file.name.table) + write_tsv(cbind(as.data.frame(pairs)[,c(1:3,6)],corretlation.tab),path = file.name.table) + } + } + # Save probe gene scatter plot + if(!isFALSE(scatter.plot)){ + probes <- unique(pairs$ID) + gene <- gene.location$ensembl_gene_id + met <- melt(t(assay(getMet(data)[probes,]))) + met$Var1 <- substr(met$Var1,1,16) + met <- merge(met,colData(data)[,c("sample",group.col)], by.x = "Var1",by.y = "sample") + exp <- melt(t(assay(getExp(data)[gene,]))) + exp$Var1 <- substr(exp$Var1,1,16) + df <- merge(met,exp,by = "Var1") + colnames(df) <- c("Patient","probe","Methylation",group.col,"Gene","Expression") + p <- ggscatter(as.data.frame(df), + x = "Methylation", + y = "Expression", + size = 0.7, + #palette = "uchicago", + facet.by = "probe", + color = group.col + #shape = 21, size = 3, # Points color, shape and size + #add = "reg.line", # Add regressin line + #add.params = list(color = "blue", fill = "lightgray"), # Customize reg. line + #conf.int = TRUE, # Add confidence interval + #cor.coef = TRUE, # Add correlation coefficient. see ?stat_cor + #cor.coeff.args = list(method = "pearson", label.x = 0, label.sep = "\n") + ) + stat_cor(method = correlation.method, label.x = 0) + + labs(x = expression(paste("DNA methylation - ",beta, "-value")), + y = expression(paste("Gene Expression - ",Log[2],"(FPKM + 1)"))) + + geom_smooth(method = "lm") + guides(colour = guide_legend(override.aes = list(size=5))) + file.name <- paste0(dir.out,"/scatter_plot_pearson_correlation_",GeneSymbol,"_vs_near_probes.pdf") + ggsave(file.name,plot = p, + width = scatter.plot.width, + height = scatter.plot.height) + + for(p in pairs$ID){ + scatter.plot(data = data, + byPair = list(probe = p, + gene = gene.location$ensembl_gene_id), + category = group.col, + dir.out = dir.out, + width = 5, + height = 4, + save = TRUE, + correlation = TRUE, + lm_line = TRUE) + } + } + } + if(!GeneSymbol %in% unique(pairs$Symbol)) stop("GeneID not in the pairs") + pairs <- pairs[pairs$Symbol == GeneSymbol,] + + if(!"distNearestTSS" %in% colnames(pairs)) { + # For a given probe and gene find nearest TSS + pairs <- addDistNearestTSS(data, pairs) + } + if(!(missing(group1) & missing(group2))){ + data <- data[,colData(data)[,group.col] %in% c(group1, group2)] + } + strand.factor <- ifelse(as.data.frame(gene.location)$strand == "+",1,-1) + pairs$DistanceTSSwithSignal <- pairs$DistanceTSS * ifelse(grepl("R",pairs$Side),1,-1) * strand.factor + + meth <- assay(getMet(data))[pairs$ID,,drop = FALSE] + rownames(meth) <- paste0(pairs$ID, " (Dist.TSS ",pairs$DistanceTSSwithSignal,")") + exp <- assay(getExp(data))[unique(pairs$GeneID),,drop = FALSE] + + # Ordering the heatmap + # Split data into the two groups and sort samples by the expression of the gene + if(!(missing(group1) & missing(group2))){ + idx1 <- which(colData(data)[,group.col] == group1) + aux <- na.omit(exp[,idx1]) + dist1 <- sort(aux, index.return = T)$ix + + idx2 <- which(colData(data)[,group.col] == group2) + aux <- na.omit(exp[,idx2]) + dist2 <- sort(aux, index.return = T)$ix + + order <- c(idx1[dist1],idx2[dist2]) + } else { + aux <- na.omit(exp) + dist1 <- sort(aux, index.return = T)$ix + order <- dist1 + } + # Create color + colors <- c("#6495ED", "#22b315", "#458B74", "#ffe300", + "#ff0000", "#473C8B", "#00F5FF", "#CD6889", + "#B3EE3A", "#7B68EE", "#CDAF95", "#0F0F0F", "#FF7F00", + "#00008B", "#5F9EA0", "#F0FFFF", "#8B6969", "#9FB6CD", "#D02090", + "#FFFF00", "#104E8B", "#B22222", "#B3EE3A", "#FF4500", "#4F94CD", + "#40E0D0", "#F5FFFA", "#8B3A62", "#FF3030", "#FFFFFF") + l <- length(unique(colData(data)[,c(group.col)])) + l.all <- l + col <- colors[1:l] + names(col) <- unique(colData(data)[,c(group.col)]) + col.list <- list() + col.list[[group.col]] <- col + + for(i in annotation.col){ + l <- length(unique(colData(data)[,c(i)])) + if(l < 10){ + if(l.all + l <= length(colors)) { + col <- colors[(l.all+1):(l.all + l)] + l.all <- l.all + l + } else { + col <- colors[c((l.all+1):length(colors),1 + (l.all + l)%%length(colors))] + l.all <- (l.all + l)%%30 + } + n <- unique(colData(data)[,c(i)]) + n[is.na(n)] <- "NA" + names(col) <- n + col.list[[i]] <- col + } else { + message("Considering variable ", i, " as numeric") + suppressWarnings({ + nb <- as.numeric(colData(data)[,c(i)]) + colData(data)[,c(i)] <- nb + if(!all(na.omit(nb) >= 0)){ + col <- circlize::colorRamp2(c(min(nb,na.rm = T), + (max(nb,na.rm = T) + min(nb,na.rm = T))/2, + max(nb,na.rm = T)), c(colors[(l.all+1)],"white", colors[(l.all + 2)])) + l.all <- l.all + 2 + } else { + col.list[[i]] <- col + col <- circlize::colorRamp2(c(min(nb,na.rm = T),max(nb,na.rm = T)), c("white", colors[(l.all+1):(l.all + 1)])) + l.all <- l.all + 1 + } + col.list[[i]] <- col + + }) + } + } + + # Annotation track + ha = HeatmapAnnotation(df = colData(data)[,c(group.col,annotation.col),drop = F], + col = col.list, + show_annotation_name = TRUE, + annotation_name_side = "left", + annotation_height = unit(c(rep(0.5,length(annotation.col) + 1), 3), "cm"), + GeneExpression = anno_points(as.numeric(exp), size = unit(0.5, "mm"),axis = T, axis_side ="right"), + annotation_name_gp = gpar(fontsize = 6)) + + bottom_annotation_height = unit(3, "cm") + + ha2 = HeatmapAnnotation(df = colData(data)[,c(group.col,annotation.col),drop = FALSE], + show_legend = FALSE, + col = col.list) + ht_list = + Heatmap(meth, + name = "DNA methylation level", + col = colorRamp2(c(0, 0.5, 1), c("darkblue", "white", "gold")), + column_names_gp = gpar(fontsize = 5), + show_column_names = FALSE, + column_order = order, + show_row_names = TRUE, + cluster_columns = FALSE, + row_names_side = "left", + cluster_rows = FALSE, + row_names_gp = gpar(fontsize = 8), + top_annotation = ha, + column_title_gp = gpar(fontsize = 10), + row_title_gp = gpar(fontsize = 10)) + + if(!is.null(met.metadata)) { + for(i in met.metadata) + ht_list <- ht_list + + Heatmap(values(getMet(data)[pairs$ID,])[i], + name = i, + width = unit(5, "mm"), + column_title = "", + show_column_names = FALSE + ) + } + if(!is.null(exp.metadata)) { + for(i in exp.metadata) + ht_list <- ht_list + + Heatmap(values(getExp(data)[pairs$ID,])[i], + name = i, + width = unit(5, "mm"), + column_title = "", + show_column_names = F + ) + } + + ht_list <- ht_list + + ht_opt(legend_title_gp = gpar(fontsize = 10, fontface = "bold"), + legend_labels_gp = gpar(fontsize = 10)) + if(is.null(filename)) return(ht_list) + padding = unit.c(unit(2, "mm"), grobWidth(textGrob(paste(rep("a",max(nchar(c(group.col,annotation.col)))/1.15), collapse = ""))) - unit(1, "cm"), + unit(c(2, 2), "mm")) + if(grepl("\\.pdf",filename)) { + message("Saving as PDF: ",sprintf("%s/%s",dir.out,filename)) + pdf(sprintf("%s/%s",dir.out,filename), width = width, height = height) + } + if(grepl("\\.png",filename)) { + message("Saving as PNG: ", sprintf("%s/%s",dir.out,filename)) + if(width < 100) width <- 1000 + if(height < 100) height <- 1000 + png(sprintf("%s/%s",dir.out,filename), width = width, height = height) + } + draw(ht_list, padding = padding, newpage = TRUE, + column_title = paste0("Correspondence between probe DNA methylation and ", GeneSymbol," expression"), + column_title_gp = gpar(fontsize = 12, fontface = "bold"), + annotation_legend_side = "right", + heatmap_legend_side = "right") + dev.off() +} + + +#' @title Create a junction track for IGV visualization of interection +#' @description +#' Create a junction track for IGV visualization of interection +#' @param pairs A data frame output from getPairs function +#' @param filename Filename (".bed") +#' @param met.platform DNA methyaltion platform to retrieve data from: EPIC or 450K (default) +#' @param genome Which genome build will be used: hg38 (default) or hg19. +#' @param color.track A color for the track (i.e blue, red,#272E6A) +#' @param gene.symbol Filter pairs to a single gene. +#' @param track.name Track name +#' @param all.tss A logical. If TRUE it will link probes to all TSS of a gene (transcript level), if FALSE +#' it will link to the promoter region of a gene (gene level). +#' @importFrom dplyr %>% +#' @importFrom grDevices col2rgb +#' @importFrom utils write.table +#' @export +#' @author Tiago Chedraoui Silva (tiagochst at gmail.com) +#' @examples +#' \dontrun{ +#' data <- ELMER:::getdata("elmer.data.example") +#' nearGenes <-GetNearGenes(TRange=getMet(data)[c("cg00329272","cg10097755"),], +#' geneAnnot=getExp(data)) +#' Hypo.pair <- get.pair(data=data, +#' nearGenes=nearGenes, +#' permu.size=5, +#' group.col = "definition", +#' group1 = "Primary solid Tumor", +#' group2 = "Solid Tissue Normal", +#' raw.pvalue = 0.2, +#' Pe = 0.2, +#' dir.out="./", +#' label= "hypo") +#' createIGVtrack(Hypo.pair,met.platform = "450K", genome = "hg38") +#' } +createIGVtrack <- function(pairs, + met.platform = "450K", + genome = "hg38", + filename = "ELMER_interactions.bed", + color.track = "black", + track.name = "junctions", + gene.symbol = NULL, + all.tss = TRUE){ + if(all.tss){ + tss <- getTSS(genome = genome) + tss <- tibble::as.tibble(tss) + } else { + tss <- get.GRCh(genome = genome,as.granges = TRUE) + tss <- tibble::as.tibble(promoters(tss,upstream = 0,downstream = 0)) + tss$transcription_start_site <- tss$start + } + + if(!is.null(gene.symbol)) { + if(!gene.symbol %in% pairs$Symbol) stop("Gene link with that gene symbol") + pairs <- pairs[pairs$Symbol == gene.symbol,] + } + met.metadata <- getInfiniumAnnotation(plat = met.platform,genome = genome) + met.metadata <- as.data.frame(met.metadata,row.names = names(met.metadata)) + met.metadata$Probe <- rownames(met.metadata) + pairs <- merge(pairs, tss, by.x = "GeneID", by.y = "ensembl_gene_id",all.x = TRUE) %>% + merge(met.metadata, by = "Probe", all.x = TRUE) + + pairs$ID <- paste0(pairs$Probe, ".", pairs$Symbol) + pairs$geneCordinates <- paste0(0,",",pairs$width.x) + pairs$strand <- "*" + pairs$Raw.p <- as.integer(4) + pairs$RGB <- paste(col2rgb(color.track)[,1],collapse = ",") + pairs$block_counts <- 2 + pairs$block_sizes <- "0,0" + # [seqname] [start] [end] [id] [score] [strand] + # [thickStart] [thickEnd] [r,g,b] [block_count] [block_sizes] [block_locations] + + pairs <- pairs[,c("seqnames.y", # [seqname] + "start.y", # [start] # probe + "transcription_start_site", # [end] + "ID", # ID + "Raw.p", # Depth + "strand", # Strand + "start.x", # thickStart + "end.x", # thickEnd + "RGB", # color + "block_counts", # block_count + "block_sizes", # block_sizes + "block_counts")] # block_locations + pairs <- na.omit(pairs) + header <- paste0('track name="',track.name,'" graphType=junctions') + unlink(filename,force = TRUE) + cat(header,file = filename,sep="\n",append = TRUE) + readr::write_delim(pairs, path = filename, append = TRUE) +} + +#' @title Create a bigwig file for IGV visualization of DNA methylation data (Array) +#' @description +#' Create a bigwig for IGV visualization of DNA methylation data (Array) +#' @param data A matrix +#' @param genome Which genome build will be used: hg38 (default) or hg19. +#' @param met.platform DNA methyaltion platform to retrieve data from: EPIC or 450K (default) +#' @param track.names Provide a list of track names (.bw) otherwise the deault is the will be {samples}.bw +#' @param dir Which directory files will be saved +#' @importFrom plyr a_ply +#' @importFrom rtracklayer export.wig +#' @importFrom GenomeInfoDb Seqinfo +#' @export +#' @author Tiago Chedraoui Silva (tiagochst at gmail.com) +#' @examples +#' \dontrun{ +#' data <- assay(getMet(ELMER:::getdata("elmer.data.example"))) +#' createBigWigDNAmetArray(data = data, met.platform = "450K", genome = "hg38") +#' } +createBigWigDNAmetArray <- function(data = NULL, + genome = "hg38", + met.platform = "450K", + track.names = NULL, + dir = "IGV_tracks"){ + + # where we will save the several tracks + dir.create(dir,recursive = TRUE, showWarnings = FALSE) + + # get genomic information for array + message("Preparing array metadata") + metadata <- getInfiniumAnnotation(plat = met.platform,genome = genome) + metadata <- metadata[!metadata$MASK_general,] + values(metadata) <- NULL + metadata$score <- NA + strand(metadata) <- "*" + metadata <- keepStandardChromosomes(metadata, pruning.mode="coarse") + seqinfo(metadata) <- Seqinfo(genome=genome) %>% keepStandardChromosomes + + message("Creating bigwig tracks") + # for each samples create the track + plyr::a_ply(colnames(data),1,.fun = function(sample){ + idx <- which(sample == colnames(data)) + metadata <- metadata[names(metadata) %in% rownames(data),] + data <- data[rownames(data) %in% names(metadata),] + met <- data[,sample] + metadata$score <- met[match(names(metadata),rownames(data))] + metadata <- metadata[!is.na(metadata$score),] + if(!is.null(track.names)) { + filename <- file.path(dir,track.names[idx]) + } else { + filename <- file.path(dir,paste0(sample,".bw")) + } + message("\nSaving: ", filename) + rtracklayer::export.bw(object = metadata,con = filename) + },.progress = "time") +} + + + + +#' @title Creating matrix for MR TF heatmap +#' @description Code used to create matrix for MR TF heatmap +#' @param dir Vector ofr directory with results +#' @param classification Consider family or subfamily +#' @param top Consider only top 1 within each (sub)family +#' @examples +#' \dontrun{ +#' elmer.results <- dirname( +#' dir(path = "analysis", +#' pattern = "*.hypo.pairs.significant.csv", +#' recursive = T, +#' full.names = T, +#' all.files = T)) +#' tabs <- get.tabs(dir = elmer.results, classification = "subfamily") +#' } +get.tabs <- function(dir, classification = "family", top = TRUE){ + tab <- get.tab(dir,classification,top = top) + tab.pval <- get.tab.pval(dir,classification,tab,top = top) + tab.or <- get.tab.or(dir,classification,tab,top = top) + tf.or.table <- get.tf.or.table(dir,classification,tab,top = top) + return(list("tab" = tab, + "tab.pval" = tab.pval, + "tab.or" = tab.or, + "tf.or.table" = tf.or.table)) +} + +#' @title summarize MR TF as a binary table with 1 if TF +#' was found in the analysis, 0 if not +#' @param dir Directory with ELMER results +#' @param classification Which columns to retrieve family or subfamily +#' @param top Consider only top 1 within each (sub)family +#' @importFrom dplyr full_join as_data_frame +#' @importFrom purrr reduce +#' @examples +#' \dontrun{ +#' dir.create("out") +#' dir.create("out2") +#' data <- tryCatch( +#' ELMER:::getdata("elmer.data.example"), +#' error = function(e) { +#' message(e) +#' data(elmer.data.example, envir = environment()) +#' }) +#' enriched.motif <- list("P53_HUMAN.H11MO.1.A"= c("cg00329272", "cg10097755", "cg08928189", +#' "cg17153775", "cg21156590", "cg19749688", "cg12590404", +#' "cg24517858", "cg00329272", "cg09010107", "cg15386853", +#' "cg10097755", "cg09247779", "cg09181054")) +#' TF <- get.TFs(data, +#' enriched.motif, +#' group.col = "definition", +#' group1 = "Primary solid Tumor", +#' group2 = "Solid Tissue Normal", +#' TFs = data.frame( +#' external_gene_name=c("TP53","TP63","TP73"), +#' ensembl_gene_id= c("ENSG00000141510", +#' "ENSG00000073282", +#' "ENSG00000078900"), +#' stringsAsFactors = FALSE), +#' dir.out = "out", +#' label="hypo") +#' TF <- get.TFs(data, +#' enriched.motif, +#' group.col = "definition", +#' group1 = "Primary solid Tumor", +#' group2 = "Solid Tissue Normal", +#' TFs = data.frame( +#' external_gene_name=c("TP53","TP63","TP73"), +#' ensembl_gene_id= c("ENSG00000141510", +#' "ENSG00000073282", +#' "ENSG00000078900"), +#' stringsAsFactors = FALSE), +#' dir.out = "out2", +#' label="hypo") +#' ta.family <- get.tab(dir = c("out","out2"),classification = "family") +#' ta.subfamily <- get.tab(dir = c("out","out2"),classification = "subfamily") +#' unlink("out") +#' unlink("out2") +#' } +get.tab <- function(dir,classification, top = TRUE){ + message("o Creating TF binary matrix") + tab <- lapply(dir, + function(x) { + ret <- summarizeTF(path = x, + classification = classification, + top = top) + if(is.null(ret)) return(NULL) + colnames(ret)[2] <- x + return(ret) + }) + tab <- tab[which(unlist(lapply(tab,function(x){!is.null(x)})))] + + if(length(tab) == 0) { + message("No MR TF for classification ", classification) + return(NULL) + } + + tab <- purrr::reduce(tab,dplyr::full_join) + + tab[tab == "x"] <- 1 + tab[is.na(tab)] <- 0 + rownames(tab) <- tab$TF + tab$TF <- NULL + for(i in 1:ncol(tab)){ + tab[,i] <- as.numeric(tab[,i]) + } + tab <- tab[rowSums(tab) > 0,,drop = FALSE] + return(tab) +} + +#' @importFrom tidyr separate_rows gather +get.tab.or <- function(dir, classification, tab, top = TRUE){ + col <- ifelse(classification == "family","potential.TF.family", "potential.TF.subfamily") + if(top) col <- paste0("top.",col) + message("o Creating TF OR matrix") + # For each of those analysis get the enriched motifs and the MR TFs. + tab.or <- plyr::adply(dir, + .margins = 1, + function(path){ + TF <- readr::read_csv(dir(path = path, pattern = ".significant.TFs.with.motif.summary.csv", + recursive = T,full.names = T),col_types = readr::cols()) + motif <- readr::read_csv(dir(path = path, pattern = ".motif.enrichment.csv", + recursive = T,full.names = T),col_types = readr::cols()) + z <- tidyr::separate_rows(TF, col, convert = TRUE,sep = ";") %>% dplyr::full_join(motif) + z <- z[order(-z$OR),] # Drecreasing order of OR + OR <- z[match(rownames(tab),z[[col]]),] %>% pull(OR) + OR + },.id = NULL + ) + tab.or <- t(tab.or) + rownames(tab.or) <- rownames(tab) + colnames(tab.or) <- colnames(tab) + return(tab.or) +} + +#' @importFrom tidyr separate_rows gather +get.tab.pval <- function(dir,classification,tab, top = TRUE){ + col <- ifelse(classification == "family","potential.TF.family", "potential.TF.subfamily") + if(top) col <- paste0("top.",col) + + message("o Creating TF FDR matrix") + # For each of those analysis get the correlation pvalue of MR TFs exp vs Avg DNA met. of inferred TFBS. + tab.pval <- plyr::adply(dir, + .margins = 1, + function(path){ + TF <- readr::read_csv(dir(path = path, + pattern = ".significant.TFs.with.motif.summary.csv", + recursive = T,full.names = T),col_types = readr::cols()) + motif <- readr::read_csv(dir(path = path, + pattern = ".motif.enrichment.csv", + recursive = T,full.names = T),col_types = readr::cols()) + # For each TF breaks into lines (P53;P63) will create two lines. Then merge with motif to get OR value + z <- tidyr::separate_rows(TF, col, convert = TRUE,sep = ";") %>% dplyr::full_join(motif) + z <- z[order(-z$OR),] # Drecreasing order of OR + colnames(z)[grep(paste0("^",col),colnames(z))] <- "TF" + motif <- z[match(rownames(tab),z$TF),] %>% pull(motif) # get higher OR for that TF binding + TF.meth.cor <- get(load(dir(path = path, pattern = ".TFs.with.motif.pvalue.rda", recursive = T, full.names = T))) + TF.meth.cor <- cbind(TF.meth.cor,TF = rownames(TF.meth.cor)) + TF.meth.cor <- dplyr::as_data_frame(TF.meth.cor) + # Create table TF, motif, FDR + TF.meth.cor <- TF.meth.cor %>% tidyr::gather(key = motif, value = FDR, -TF) + # get FDR for TF and motif + FDR <- TF.meth.cor[match(paste0(rownames(tab),motif),paste0(TF.meth.cor$TF,TF.meth.cor$motif)),] %>% dplyr::pull(FDR) + as.numeric(FDR) + },.id = NULL + ) + + tab.pval <- t(tab.pval) + rownames(tab.pval) <- rownames(tab) + colnames(tab.pval) <- colnames(tab) + return(tab.pval) +} + + +#' @importFrom DelayedArray rowMins +get.top.tf.by.pval <- function(tab.pval,top = 5){ + labels <- c() + for(i in 1:ncol(tab.pval)){ + labels <- sort( + unique( + c(labels, + rownames(tab.pval)[head(sort(DelayedArray::rowMins(tab.pval[,i,drop = F],na.rm = T), + index.return = TRUE, + decreasing = F)$ix, + n = top)] + ) + ) + ) + } + + return(labels) +} + + +get.tf.or.table <-function(dir,classification,tab, top = TRUE){ + col <- ifelse(classification == "family","potential.TF.family", "potential.TF.subfamily") + if(top) col <- paste0("top.",col) + tf.or.table <- plyr::adply(dir, + .margins = 1, + function(path){ + TF <- readr::read_csv(dir(path = path, + pattern = ".significant.TFs.with.motif.summary.csv", + recursive = T,full.names = T), + col_types = readr::cols()) + motif <- readr::read_csv(dir(path = path, + pattern = ".motif.enrichment.csv", + recursive = T,full.names = T), + col_types = readr::cols()) + + # For each TF breaks into lines (P53;P63) will create two lines. Then merge with motif to get OR value + z <- tidyr::separate_rows(TF, col, convert = TRUE,sep = ";") %>% dplyr::full_join(motif) + z <- z[order(-z$OR),] # Drecreasing order of OR + colnames(z)[grep(paste0("^",col),colnames(z))] <- "TF" + motif <- z[match(rownames(tab),z$TF),] %>% pull(motif) # get higher OR for that TF binding + TF.meth.cor <- get(load(dir(path = path, + pattern = ".TFs.with.motif.pvalue.rda", + recursive = T, full.names = T)) + ) + TF.meth.cor <- cbind(TF.meth.cor,TF = rownames(TF.meth.cor)) + TF.meth.cor <- dplyr::as_data_frame(TF.meth.cor) + # Create table TF, motif, FDR + TF.meth.cor <- TF.meth.cor %>% tidyr::gather(key = "motif", value = "FDR", -TF) + TF.meth.cor$FDR <- as.numeric(TF.meth.cor$FDR) + TF.meth.cor <- TF.meth.cor[order(TF.meth.cor$FDR,decreasing = F),] + TF.meth.cor <- TF.meth.cor[ + match(paste0(rownames(tab),motif), + paste0(TF.meth.cor$TF,TF.meth.cor$motif)),] %>% + na.omit + TF.meth.cor$analysis <- path + TF.meth.cor + },.id = NULL + ) + tf.or.table <- tf.or.table[order(tf.or.table$FDR,decreasing = F),] + return(tf.or.table) +} \ No newline at end of file diff --git a/R/render_report.R b/R/render_report.R new file mode 100644 index 00000000..6c1da947 --- /dev/null +++ b/R/render_report.R @@ -0,0 +1,92 @@ +#' @title Build report for TCGA.pipe function +#' @description Build HTML report +#' @param title HTML report title +#' @param mae.file Absolute path to the mae used in the analysis (.rda or .rds) +#' @param group.col Group col +#' @param group1 Group 1 +#' @param group2 Group 2 +#' @param direction direction used in the analysis +#' @param dir.out Absolute path to folder with results. dir.out used in the analysis +#' @param genome Genome of reference used in the analysis +#' @param mode mode used in the analysis +#' @param minSubgroupFrac minSubgroupFrac used in the analysis +#' @param minMetdiff minMetdiff used in the analysis +#' @param metfdr metfdr used in the analysis +#' @param permu permu used in the analysis +#' @param rawpval rawpval used in the analysis +#' @param pe pe used in the analysis +#' @param nprobes nprobes used in the analysis +#' @param lower.OR lower.OR used in the analysis +#' @param out_file Output file name (i.e report.html) +#' @param funcivar Include funcivar analysis? +#' @export +#' @importFrom rmarkdown render +#' @examples +#' \dontrun{ +#' render_report( +#' group.col = "TN", +#' group1 = "Tumor", +#' group2 = "Normal", +#' dir.out = "~/paper_elmer/Result/BRCA/TN_Tumor_vs_Normal/hypo/", +#' direction = "hypo", +#' mae.file = "~/paper_elmer/Result/BRCA/BRCA_mae_hg38.rda" +#') +#' } +render_report <- function( + title = "Report", + mae.file, + group.col, + group1, + group2, + direction, + dir.out, + genome = "hg38", + mode = "supervised", + minSubgroupFrac = 0.2, + minMetdiff = 0.3, + metfdr = 0.01, + permu = 10000, + rawpval = 0.01, + pe = 0.01, + nprobes = 10, + lower.OR = 1.1, + out_file = file.path(getwd(),"report.html"), + funcivar = FALSE +) { + if(missing(dir.out)) stop("Please, set dir.out value") + if(missing(mae.file)) stop("Please, set mae.file value") + if(missing(group.col)) stop("Please, set mae value") + if(missing(group1)) stop("Please, set mae value") + if(missing(group2)) stop("Please, set mae value") + template <- system.file("rmd", "template.Rmd", package="ELMER") + + message("Compiling report") + parameters <- list( + title = title, + genome = genome, + mode = mode, + minSubgroupFrac = minSubgroupFrac, + minMetdiff = minMetdiff, + metfdr = metfdr, + permu = permu, + rawpval = rawpval, + pe = pe, + nprobes = nprobes, + lower.OR = lower.OR, + groupCol = group.col, + mae.file = mae.file, + group1 = group1, + group2 = group2, + direction = direction, + dir.out = dir.out, + funcivar = funcivar + ) + message("Saving report: ", out_file) + rmarkdown::render( + template, + intermediates_dir = dirname(out_file), + output_file = out_file, + params = parameters + ) + invisible(TRUE) +} \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 00000000..b6c538bc --- /dev/null +++ b/README.md @@ -0,0 +1,31 @@ +[![DOI](https://zenodo.org/badge/71802271.svg)](https://zenodo.org/badge/latestdoi/71802271) + + + +### An R/Bioconductor Tool Inferring Regulatory Element Landscapes and Transcription Factor Networks Using Methylomes + +#### Citing ELMER + + +Tiago C Silva, Simon G Coetzee, Nicole Gull, Lijing Yao, Dennis J Hazelett, Houtan Noushmehr, De-Chen Lin, Benjamin P Berman; ELMER v.2: an R/Bioconductor package to reconstruct gene regulatory networks from DNA methylation and transcriptome profiles, Bioinformatics, , bty902, https://doi.org/10.1093/bioinformatics/bty902 + +#### Main article +https://doi.org/10.1093/bioinformatics/bty902 + + +#### Installing and loading ELMER +To obtain a copy of ELMER, you will need to install devtools and ELMER.data which contains essential data for running ELMER package + +```r +install.packages(devtools) +library(devtools); +devtools::install_github("tiagochst/ELMER.data"); +devtools::install_github("tiagochst/ELMER"); +``` + +#### Documentation +* [ELMER documentation](https://bioconductor.org/packages/devel/bioc/vignettes/ELMER/inst/doc/index.html) +* [ELMER.data documentation](https://tiagochst.github.io/ELMER.data/index.nb.html) +* [Paper supplemental files](https://tiagochst.github.io/ELMER_supplemental/) + + diff --git a/appveyor.yml b/appveyor.yml new file mode 100644 index 00000000..1d393a49 --- /dev/null +++ b/appveyor.yml @@ -0,0 +1,90 @@ +# DO NOT CHANGE the "init" and "install" sections below + +# Download script file from GitHub +init: + ps: | + $ErrorActionPreference = "Stop" + Invoke-WebRequest http://raw.github.com/krlmlr/r-appveyor/master/scripts/appveyor-tool.ps1 -OutFile "..\appveyor-tool.ps1" + Import-Module '..\appveyor-tool.ps1' + +install: + ps: Bootstrap + +cache: + - C:\RLibrary + + +environment: + matrix: + - R_VERSION: release + R_ARCH: x64 + GCC_PATH: mingw_64 + USE_RTOOLS: true + WARNINGS_ARE_ERRORS: 0 + DOWNLOAD_FILE_METHOD: wininet + R_REMOTES_NO_ERRORS_FROM_WARNINGS: TRUE + +build_script: + - travis-tool.sh install_bioc BiocVersion + - travis-tool.sh install_deps + - travis-tool.sh install_bioc_deps + - travis-tool.sh install_bioc ALL + - travis-tool.sh install_bioc GenomeInfoDbData + - travis-tool.sh install_bioc BiocStyle + - travis-tool.sh install_bioc GenomicRanges + - travis-tool.sh install_bioc Biobase + - travis-tool.sh install_bioc maftools + - travis-tool.sh install_bioc affy + - travis-tool.sh install_bioc EDASeq + - travis-tool.sh install_bioc edgeR + - travis-tool.sh install_bioc biomaRt + - travis-tool.sh install_bioc IRanges + - travis-tool.sh install_bioc supraHex + - travis-tool.sh install_bioc S4Vectors + - travis-tool.sh install_bioc ComplexHeatmap + - travis-tool.sh install_bioc SummarizedExperiment + - travis-tool.sh install_bioc DO.db BiocGenerics + - travis-tool.sh install_bioc GenomicFeatures + - travis-tool.sh install_bioc TxDb.Hsapiens.UCSC.hg19.knownGene + - travis-tool.sh install_bioc limma + - travis-tool.sh install_bioc genefilter + - travis-tool.sh install_bioc ConsensusClusterPlus + - travis-tool.sh install_bioc pathview + - travis-tool.sh install_bioc clusterProfiler + - travis-tool.sh install_bioc GenomicInteractions + - travis-tool.sh install_github BioinformaticsFMRP/TCGAbiolinks tiagochst/ELMER.data + +test_script: + - travis-tool.sh run_tests + +on_failure: + - 7z a failure.zip *.Rcheck\* + - appveyor PushArtifact failure.zip + + +artifacts: + - path: '*.Rcheck\**\*.log' + name: Logs + + - path: '*.Rcheck\**\*.out' + name: Logs + + - path: '*.Rcheck\**\*.fail' + name: Logs + + - path: '*.Rcheck\**\*.Rout' + name: Logs + + - path: '\*_*.tar.gz' + name: Bits + + - path: '\*_*.zip' + name: Bits + + +notifications: + - provider: Slack + incoming_webhook: + secure: 7UV/4W8G1XsjPOnS7evgJipqeUPQtfYnnKRov2i6hSKyMAyec8OCi0SGnI/vQxa7131I0tCwQL9mSKN9eJP4QLR6t2MOnNpuvKKFoMriyrE= + on_build_success: false + on_build_failure: true diff --git a/inst/CITATION b/inst/CITATION new file mode 100644 index 00000000..73cff32f --- /dev/null +++ b/inst/CITATION @@ -0,0 +1,17 @@ +citEntry(entry="article", + title = "ELMER v.2: An R/Bioconductor package to reconstruct gene regulatory networks from DNA methylation and transcriptome profiles", + author = personList( + as.person("Tiago Chedraoui Silva"), + as.person("Simon G Coetzee"), + as.person("Nicole Gull"), + as.person("Lijing Yao"), + as.person("Dennis J Hazelett"), + as.person("Houtan Noushmehr"), + as.person("De-Chen Lin"), + as.person("Benjamin P Berman")), + year = 2018, + journal = "Bioinformatics", + doi = "10.1093/bioinformatics/bty902", + textVersion = + "Silva, Tiago C and Coetzee, Simon G and Gull, Nicole and Yao, Lijing and Hazelett, Dennis J and Noushmehr, Houtan and Lin, De-Chen and Berman, Benjamin P. ELMER v.2: An R/Bioconductor package to reconstruct gene regulatory networks from DNA methylation and transcriptome profiles" + ) diff --git a/inst/rmd/style.css b/inst/rmd/style.css new file mode 100644 index 00000000..c189001d --- /dev/null +++ b/inst/rmd/style.css @@ -0,0 +1,4 @@ +.main-container { + max-width: inherit !important; + margin-left: 1px; +} diff --git a/inst/rmd/tabset-dropdown.html b/inst/rmd/tabset-dropdown.html new file mode 100644 index 00000000..59017958 --- /dev/null +++ b/inst/rmd/tabset-dropdown.html @@ -0,0 +1,63 @@ + + + \ No newline at end of file diff --git a/inst/rmd/template.Rmd b/inst/rmd/template.Rmd new file mode 100644 index 00000000..8ddc9d8e --- /dev/null +++ b/inst/rmd/template.Rmd @@ -0,0 +1,439 @@ +--- +title: "`r sprintf('%s', params$title)`" +author: "`r sprintf('%s', params$author)`" +date: "`r Sys.Date()`" +params: + title: Report + author: + genome: hg38 + mode: supervised + minSubgroupFrac: 20% + minMetdiff: 0.3 + metfdr: 0.01 + permu: 10000 + rawpval: 0.01 + pe: 0.01 + nprobes: 10 + lower.OR: 1.1 + groupCol: + mae.file: + group1: + group2: + direction: + dir.out: + funcivar: FALSE +output: + html_document: + df_print: paged + highlight: tango + css: style.css + number_sections: yes + toc: yes + toc_float: + collapsed: yes + toc_depth: 3 + includes: + after_body: tabset-dropdown.html +editor_options: + chunk_output_type: inline +--- + +```{r, warning=FALSE,message=FALSE, echo=FALSE} +library(DT) +library(ELMER) +library(knitr) +library(ComplexHeatmap) +library(ggplot2) +if(params$funcivar) library(funciVar) +library(GenomicRanges) +library(dplyr) +``` + +```{r, echo=FALSE, warning=FALSE,message=FALSE, cols.print=20} +file <- params$mae.file +if(grepl("\\.rds",file,ignore.case = T)){ + mae <- readRDS(file) +} else { + mae <- get(load(file)) +} +``` + + +```{r, include=FALSE} +opts_knit$set(progress = FALSE, verbose = FALSE, fig.align='center') +opts_chunk$set(warning=FALSE, message=FALSE, echo=FALSE) +## this function is basically creating chunks within chunks, and then +## I use results='asis' so that the html image code is rendered +kexpand <- function(ht, cap) { + cat(knit(text = knit_expand(text = sprintf("```{r %s, fig.height=%s, fig.cap='%s'}\n.motif.plot\n```", cap, ht, cap) + ))) +} +kexpand.plot <- function(ht = 10, cap,width = 15) { + cat(knit(text = knit_expand(text = + sprintf("```{r %s, fig.height=%s, fig.width = %s, fig.cap='%s'}\n ##### %s\n plot(.pl)\n```", cap, ht, width,cap,cap) + ))) +} + +kexpand.df <- function(cap) { + cat(knit(text = knit_expand(text = + sprintf("```{r %s}\n.df\n```",cap) + ))) +} +kexpand.dt <- function(cap,format=NULL) { + cat(knit( + text = knit_expand(text = sprintf("```{r %s}\n + .df <- DT::datatable(.df, + filter = 'top', + class = 'cell-border stripe', + rownames = FALSE, + extensions = 'Buttons', + options = list(scrollX=TRUE, + dom = 'Bfrtip', + buttons = c('copy', 'csv', 'excel', 'pdf', 'print',I('colvis'))))\nif(!is.null(format)){.df <- DT::formatSignif(.df,format,3)}\n.df\n```",cap) + ))) +} + +kexpand.plotHeatmap <- function(ht = 10, cap, width = 10) { + cat(knit(text = knit_expand(text = + sprintf("```{r %s, fig.height=%s, fig.width = %s,fig.cap='%s'}\n ##### %s\n draw(.pl, newpage = TRUE, + column_title = 'Correspondence between probe DNA methylation and distal gene expression', + column_title_gp = gpar(fontsize = 12, fontface = 'bold'), + annotation_legend_side = 'bottom')\n```", cap, ht, width, cap,cap) + ))) +} +``` + +```{r, include=FALSE} +# Texts for file +text_top_TF_tbl <- function(){ + cat("\nThe table below shows for a given enriched motif, **the top potential TF** (best ranked TF based on the p-value) belonging to the same family or subfamily (TFClass classification) of the TF motif. The columns with a prefix p-value shows how significant is the anti-correlation of the average DNA methylation level of probes with the given motif and the TF expression.\n") +} +text_TF_tbl <- function(type = "family"){ + cat(paste0("\nThe table below shows for a given enriched motif, all potential TF belonging to **the same ", + type, + "** (TFClass classification) of the TF motif. The columns with a prefix p-value shows how significant is the anti-correlation of the average DNA methylation level of probes with the given motif and the TF expression.\n")) +} +text_TF_plot <- function(){ + cat("\nThe plot below shows for a given enriched motif the ranking of p-values showing how significant is the anti-correlation of the average DNA methylation level of probes with the given motif and the TF expression. TFs in the same family and subfamily of the given TF motif are highlighted. Also, the top 3 TFs (lowest p-values) are highlighted.\n") +} +text_TF_scatter <- function(){ + cat("\nThe plot below shows for a given enriched motif the average DNA methylation level of probes with the signature for the given motif vs the TF expression. Each dot is a sample.\n") +} +text_funcivar <- function(){ + cat("\n The plot below was produced with funciVar tool (https://github.com/Simon-Coetzee/funcivar), which calculate overlaps and enrichment between + genomic variants and genomic features or segmentations. The segmentations used were retrieved from (http://statehub.org/).\n") +} +``` + + +```{r, include=FALSE} +# Load DNA methylation platform 450K manifest (hg38) and select only probes paired +genome <- params$genome +# Load DNA methylation platform 450K manifest (hg38) and select only probes paired +distal.probes <- ELMER::get.feature.probe(feature = NULL,genome = genome, met.platform = "450K") + +if(params$funcivar){ + esegs.file <- paste0("esegs",genome,".rda") + if(!file.exists(esegs.file)) { + # Download state for breast cancer cell line (mcf-7) + base <- paste0("http://s3-us-west-2.amazonaws.com/statehub-trackhub/tracks/5813b67f46e0fb06b493ceb0/",genome,"/ENCODE/") + # download tracks (search used: "encode hg38 h3k27ac h3k4me1 h3k4me3 ctcf") + state <- c("mcf-7.16mark.segmentation.bed", + "bipolar_spindle_neuron.8mark.segmentation.bed", + "cardiac_muscle_cell.9mark.segmentation.bed", + "cd14-positive_monocyte.9mark.segmentation.bed", + "dohh2.8mark.segmentation.bed", + "fibroblast_of_dermis.8mark.segmentation.bed", + "fibroblast_of_lung.13mark.segmentation.bed", + "gm12878.12mark.segmentation.bed", + "hct116.12mark.segmentation.bed", + "hela-s3.13mark.segmentation.bed", + "hepatocyte.9mark.segmentation.bed", + "induced_pluripotent_stem_cell.7mark.segmentation.bed", + "k562.19mark.segmentation.bed", + "mcf-7.16mark.segmentation.bed", + "neutrophil.8mark.segmentation.bed") + + bed <- paste0(base,state) + dir.create("state_tracks", showWarnings = FALSE) + for( i in bed) { + if(!file.exists(file.path("state_tracks",basename(i)))) { + tryCatch({downloader::download(i,file.path("state_tracks",basename(i)))},error = function(e){}) + } + } + esegs <- GetSegmentations(files = dir("state_tracks",full.names = T)) %>% unlist + save(esegs, file = esegs.file) + } else { + load(esegs.file) + } +} +``` +# Summary + +## Groups + +```{r, echo=FALSE, warning=FALSE,message=FALSE, cols.print=20} +plyr::count(SummarizedExperiment::colData(mae)[,params$groupCol]) +``` + +## Arguments + +```{r, echo=FALSE, warning=FALSE,message=FALSE , cols.print=20} +arg <- data.frame("Argument" = c("Genome of reference", + "Mode", + "All: minSubgroupFrac", + "DNA methylation differences: min mean difference", + "DNA methylation differences: p-value adj cut-off", + "Pairs correlation: # permutations", + "Pairs correlation: raw p-value cut-off ", + "Pairs correlation: empirical p-values cut-off", + "Enrichement motif: minimun # probes (enriched motif)", + "Enrichement motif:lower.OR"), + "Value" = c(params$genome, + params$mode, + params$minSubgroupFrac, + params$minMetdiff, + params$metfdr, + params$permu, + params$rawpval, + params$pe, + params$nprobes, + params$lower.OR) +) +arg +``` + +## Summary results + +```{r, echo=FALSE, warning=FALSE,message=FALSE , cols.print=20} +root <- params$dir.out +direction <- params$direction +group1 <- params$group1 +group2 <- params$group2 +group.col <- params$groupCol + +suppressWarnings({ + summary <- data.frame( + nrow(readr::read_csv(paste0(root, "/getMethdiff.",direction,".probes.significant.csv"),col_types = readr::cols())), + nrow(readr::read_csv(paste0(root,"/getPair.",direction, ".pairs.significant.csv"),col_types = readr::cols())), + length(get(load(paste0(root, "/getMotif.",direction,".enriched.motifs.rda")))) + ) +}) +colnames(summary) <- c("Sig. diff probes","Sig. pairs","Enriched motifs") +summary$Analysis <- + paste0("Probes ",direction, "methylated in ", + group1, " vs ", + group2) +summary <- summary[,c("Analysis", "Sig. diff probes","Sig. pairs","Enriched motifs")] +summary +``` + + +```{r ,echo=FALSE, message=FALSE, warning=FALSE,results="asis",fig.height=8,fig.width=8} +g1 <- params$group1 +g2 <- params$group2 +group.col <- params$groupCol +dir <- params$direction +if(params$title == "ELMER") { + cat("\n#", g1, " vs ", g2,"\n") +} else { + cat("\n#",params$title,"\n") +} +cat("\n## Probes", paste0(dir,"methylated in ", g1, " vs ", g2,"\n")) +p <- readr::read_csv(paste0(root,"/getMethdiff.",dir,".probes.csv"),col_types = readr::cols()) +.pl <- TCGAbiolinks:::TCGAVisualize_volcano( + x = as.data.frame(p)[,grep("Minus",colnames(p),value = T)], + y = p$adjust.p, + title = paste0("Volcano plot - Probes ", + dir, " methylated in ", + g1, " vs ", g2,"\n"), + filename = NULL, + label = c("Not Significant", + paste0("Hypermethylated in ",g1), + paste0("Hypomethylated in ",g1)), + ylab = expression(paste(-Log[10], + " (FDR corrected P-values) [one tailed test]")), + xlab = expression(paste( + "DNA Methylation difference (",beta,"-values)") + ), + x.cut = params$minMetdiff, + y.cut = params$metfdr +) +kexpand.plot(5, paste0("Volcano plot - Probes ",dir,"methylated in ", g1, " vs ", g2)) + +file <- paste0(root,"/getPair.",dir, ".pairs.significant.csv") +if(file.exists(file)) { + cat("\n### Significant anti-correlated pairs of gene-probes\n") + suppressWarnings({ + .df <- readr::read_csv(paste0(root,"/getPair.",dir, ".pairs.significant.csv"),col_types = readr::cols()) + .df <- .df[order(.df$Raw.p),] + }) + kexpand.df(paste0(root,"/getPair.",dir, ".pairs.significant.csv")) + .pl <- heatmapPairs( + data = mae, + group.col = group.col, + group1 = g1, + group2 = g2, + pairs = .df, + filename = NULL + ) + + cap <- paste0("Heatmap: hypomethylated paired probes") + kexpand.plotHeatmap(ht = 6,cap,width = 10) + + if(params$funcivar){ + cat("\n#### Statehub: Chromatin state evaluation\n") + text_funcivar() + paired.probes <- unique(.df$Probe) + paired.probes <- distal.probes[names(distal.probes) %in% paired.probes] + enrichmet <- CalculateEnrichment( + variants = list(bg = distal.probes, fg = paired.probes), + features = esegs, + feature.type = "segmentations", + prior = c(a=0.5, b=0.5) + ) + + .pl <- PlotEnrichment(variant.enrichment = enrichmet, + value = "difference", + block1 = "state", + color.by = "sample", + ncol = 6) + cap <- paste0("Funcivar: ", paste0(dir,"methylated paired probes. Produced with funciVar tool (https://github.com/Simon-Coetzee/funcivar) and statehub.org data.")) + kexpand.plot(10,cap, 15) + + funcivar.code <- data.frame("Abbreviation" = c("AR","EAR","EWR","EPR","PAR","PWR","PPR","PPWR","CTCF","TRS","HET","SCR"), + "Chromatin state" = c("Active region", + "active enhancer", + "Weak Enhancer", + "poised enhancer", + "active promoter", + "Weak Promoter", + "poised promoter", + "Weak Poised Promoter", + "architectural complex", + "transcribed", + "heterochromatin", + "Polycomb Repressed Silenced.")) + .df <- funcivar.code + kexpand.df("funcivar abbreviations") + } + + file <- paste0(root,"/getMotif.",dir,".motif.enrichment.csv") + if(file.exists(file)) { + cat("\n### Motif enrichment analysis\n") + motifs <- readr::read_csv(file,col_types = readr::cols()) + .df <- motifs + kexpand.dt(paste0(root,"/getMotif.",dir,".motif.enrichment.csv")) + + motifs.enriched <- get(load(paste0(root, "/getMotif.",dir,".enriched.motifs.rda"))) + if(length(names(motifs.enriched)) > 0) { + .motif.plot <- motif.enrichment.plot( + motif.enrichment = motifs, + summary = FALSE, + title = paste0("Probes ",dir,"methylated in ", g1, " vs ", g2), + significant = list(lowerOR = params$lower.OR, NumOfProbes = params$nprobes), + save = FALSE + ) + kexpand(max(4,round(sum(motifs$lowerOR > params$lower.OR)/5)), paste0("Probes ",dir,"methylated in ", g1, " vs ", g2)) + + cat("\n### TF analysis\n") + TF <- readr::read_csv(paste0(root,"/getTF.",dir,".significant.TFs.with.motif.summary.csv"),col_types = readr::cols()) + load(paste0(root,"/getTF.",dir,".TFs.with.motif.pvalue.rda")) + #.df <- TF[na.omit(match(motifs$motif,TF$motif)),] + #kexpand.dt(paste0(root,g,"/",dir,"/getTF.",dir,".TFs.with.motif.pvalue.rda")) + + # For each enriched motif with a Family member create a plot + topmotifs <- motifs[motifs$motif %in% names(enriched.motif),"motif"] + .df <- merge(TF,motifs, by = "motif") + pval <- reshape::melt(TF.meth.cor) + + cat("\n\n#### Top potential TF\n\n") + text_top_TF_tbl() + colnames(pval) <- c("top.potential.TF.family","motif","pvalue.TF.family") + top.family <- merge(.df,pval)[,c("motif","OR","top.potential.TF.family","pvalue.TF.family")] + + colnames(pval) <- c("top.potential.TF.subfamily","motif","pvalue.TF.subfamily") + top.subfamily <- merge(.df,pval)[,c("motif","OR","top.potential.TF.subfamily","pvalue.TF.subfamily")] + .df <- merge(top.family,top.subfamily,all = TRUE) + .df <- .df[order(-.df$pvalue.TF.subfamily),] + + kexpand.dt(paste0(root,"/getTF.",dir,"aux"),c("pvalue.TF.subfamily",'pvalue.TF.family',"OR")) + + .df.aux <- merge(TF,motifs, by = "motif") + for(i in c("potential.TF.family","potential.TF.subfamily")){ + title <- paste0("\n\n#### ",gsub("\\."," ",i),"\n") + cat(title) + if(i == "potential.TF.family") { + text_TF_tbl() + aux <- tidyr::unnest(.df.aux,potential.TF.family=strsplit(potential.TF.family, ";")) + } else if(i == "potential.TF.subfamily") { + text_TF_tbl("subfamily") + aux <- tidyr::unnest(.df.aux,potential.TF.subfamily=strsplit(potential.TF.subfamily, ";")) + } + colnames(pval) <- c(i,"motif","pvalue") + .df <- merge(aux,pval)[,c("motif","OR",i,"pvalue")] + .df <- .df[order(-.df$pvalue),] + kexpand.dt(paste0(root,"/",dir,"getTF.",dir,"aux",i),c('pvalue',"OR")) + } + + sink('/dev/null') + x <- TF.rank.plot( + motif.pvalue = TF.meth.cor, + motif = topmotifs$motif, + title = paste0("Probes ",dir,"methylated in ", g1, " vs ", g2), + save = FALSE + ) + sink() + + cat("\n\n#### TF plots\n") + cat("\n##### TF plots {.tabset .tabset-dropdown}\n") + text_TF_plot() + for(i in names(x)) { + cat("\n######",gsub("_HUMAN.H11MO.*","",i)," \n") + .pl <- x[[i]] + kexpand.plot(6,paste0(gsub("_HUMAN.H11MO.*","",i)," - Probes ",dir,"methylated in ", g1, " vs ", g2),6) + } + + .df <- TF[na.omit(match(motifs$motif,TF$motif)),] + cat("\n##### Scatter plot {.tabset .tabset-dropdown}\n") + text_TF_scatter() + for(i in 1:nrow(.df)){ + text <- "" + if(!is.na(.df[i,"top.potential.TF.family"])) text <- paste0(.df[i,"top.potential.TF.family"]," and ") + cat("\n######",paste0(text, + "top 3 expression vs avg DNA methylation of paired enriched probes for ", + gsub("_HUMAN.H11MO.*","",.df[i,]$motif)," \n")) + top3 <- unlist(stringr::str_split(subset(TF,TF$motif == .df[i,]$motif)$top_5percent_TFs,";"))[1:3] + .pl <- scatter.plot( + data = mae, + byTF = list(TF = c(.df[i,"top.potential.TF.family"],.df[i,"top.potential.TF.subfamily"],top3), + probe = motifs.enriched[[.df[i,]$motif]]), + category = group.col, + save = FALSE, + lm_line = TRUE + ) + + kexpand.plot(6, + paste0(text, "top3 TF expression vs avg DNA methylation of paired enriched probes for ", + .df[i,]$motif," - Probes ",dir,"methylated in ", g1, " vs ", g2 + ) + ) + } + } + } +} + +``` + +# Complete code + +## ELMER analysis +```{R} +# ```{r, code=readLines(params$code), eval=FALSE} +# ``` +``` + +# Session Info +```{r, echo=FALSE, warning=FALSE,message=FALSE, cols.print=20} +sessionInfo() +``` \ No newline at end of file diff --git a/man/AddLegend.Rd b/man/AddLegend.Rd deleted file mode 100644 index 1355ebc1..00000000 --- a/man/AddLegend.Rd +++ /dev/null @@ -1,23 +0,0 @@ -% Generated by roxygen2 (4.1.0): do not edit by hand -% Please edit documentation in R/Heatmap.Func.R -\name{AddLegend} -\alias{AddLegend} -\title{add lengend} -\usage{ -AddLegend(Labels = NULL, cols = NULL, margins = c(1, 1, 1, 1), - lab.las = 2, cexlab = 0.2 + 1/log10(length(cols))) -} -\arguments{ -\item{Labels}{A vector of characters} - -\item{cols}{A vector of colors for each characters in Labels} - -\item{...}{parameters in image function.} -} -\value{ -A legend -} -\description{ -add lengend -} - diff --git a/man/Binary.Rd b/man/Binary.Rd deleted file mode 100644 index 0d1e2b31..00000000 --- a/man/Binary.Rd +++ /dev/null @@ -1,22 +0,0 @@ -% Generated by roxygen2 (4.1.0): do not edit by hand -% Please edit documentation in R/Heatmap.Func.R -\name{Binary} -\alias{Binary} -\title{binary data} -\usage{ -Binary(x, Break = 0.3, Break2 = NULL) -} -\arguments{ -\item{x}{A matrix.} - -\item{Break}{A value to binarize the data.} - -\item{Break2}{A value to cut value to 3 categories.} -} -\value{ -A binarized matrix. -} -\description{ -binary data -} - diff --git a/man/Distal.Rd b/man/Distal.Rd deleted file mode 100644 index 0db08c10..00000000 --- a/man/Distal.Rd +++ /dev/null @@ -1,22 +0,0 @@ -% Generated by roxygen2 (4.1.0): do not edit by hand -% Please edit documentation in R/ReadFile.R -\name{Distal} -\alias{Distal} -\title{Generate random loci of genome.} -\usage{ -Distal(x, TSS.range, ignore.strand = F) -} -\arguments{ -\item{x}{GRange object which you want to identify distal elements from.} - -\item{TSS.range}{GRange object which contain promoter infomation} - -\item{ignore.strand}{A boolean which to specific if ignore strand or not.} -} -\value{ -GRange object which contains elements from x that doesn't overlap with TSS.range. -} -\description{ -Generate random loci of genome. -} - diff --git a/man/ELMER.Rd b/man/ELMER.Rd new file mode 100644 index 00000000..3aabcfb1 --- /dev/null +++ b/man/ELMER.Rd @@ -0,0 +1,11 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ELMER.R +\docType{package} +\name{ELMER} +\alias{ELMER} +\title{ELMER (Enhancer Linking by Methylation/Expression Relationships)} +\description{ +ELMER is designed to use DNA methylation and gene expression from a +large number of samples to infere regulatory element landscape and transcription +factor network in primary tissue. +} diff --git a/man/Get.Pvalue.p.Rd b/man/Get.Pvalue.p.Rd index da5936e9..58868280 100644 --- a/man/Get.Pvalue.p.Rd +++ b/man/Get.Pvalue.p.Rd @@ -1,13 +1,13 @@ -% Generated by roxygen2 (4.1.0): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/StatisticTest.R \name{Get.Pvalue.p} \alias{Get.Pvalue.p} \title{Calculate empirical Pvalue} \usage{ -Get.Pvalue.p(zscore.Matrix, permu) +Get.Pvalue.p(U.matrix, permu) } \arguments{ -\item{zscore.Matrix}{A data.frame of raw pvalue from U test. Output from .Stat.nonpara} +\item{U.matrix}{A data.frame of raw pvalue from U test. Output from .Stat.nonpara} \item{permu}{data frame of permutation. Output from .Stat.nonpara.permu} } @@ -17,4 +17,3 @@ A data frame with empirical Pvalue. \description{ Calculate empirical Pvalue } - diff --git a/man/GetNearGenes.Rd b/man/GetNearGenes.Rd index 97c56d87..6b32b093 100644 --- a/man/GetNearGenes.Rd +++ b/man/GetNearGenes.Rd @@ -1,27 +1,50 @@ -% Generated by roxygen2 (4.1.0): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/GetNearbyGenes.R \name{GetNearGenes} \alias{GetNearGenes} -\title{Collect nearby gene for one locus.} +\title{GetNearGenes to collect nearby genes for one locus.} \usage{ -GetNearGenes(SampleSize = 20, Gene = NULL, TBed = NULL, TRange = NULL, - cores = NULL) +GetNearGenes( + data = NULL, + probes = NULL, + geneAnnot = NULL, + TRange = NULL, + numFlankingGenes = 20 +) } \arguments{ -\item{SampleSize}{A number determine how many gene will be collected from each side of target (number shoule be even) Default to 20.} +\item{data}{A multi Assay Experiment with both DNA methylation and gene Expression objects} -\item{Gene}{A GRange object contains coordinates of promoters for human genome.} +\item{probes}{Name of probes to get nearby genes (it should be rownames of the DNA methylation +object in the data argument object)} -\item{TRange}{A GRange object contains coordinate of a list targets.} +\item{geneAnnot}{A GRange object or Summarized Experiment object that contains coordinates of promoters for +human genome.} -\item{cores}{A number to specific how many cores to use to compute. Default to detectCores()/2.} +\item{TRange}{A GRange object or Summarized Experiment object that contains coordinates of a list of targets loci.} -\item{Tbed}{A bed format data.frame object contains coordinate of a list targets.} +\item{numFlankingGenes}{A number determines how many gene will be collected totally. +Then the number devided by 2 is the number of genes collected from +each side of targets (number shoule be even) Default to 20.} } \value{ -A data frame of nearby genes and information: genes' IDs, genes' symbols, distance with target and side to which the gene locate to the target. +A data frame of nearby genes and information: genes' IDs, genes' symbols, +distance with target and side to which the gene locate to the target. } \description{ -Collect nearby gene for one locus. +GetNearGenes is a function to collect equal number of gene on each side of one locus. +It can receite either multi Assay Experiment with both DNA methylation and gene Expression matrix +and the names of probes to select nearby genes, or it can receive two granges objects TRange and geneAnnot. +} +\examples{ +geneAnnot <- getTSS(genome = "hg38") +probe <- GenomicRanges::GRanges(seqnames = c("chr1","chr2"), +range=IRanges::IRanges(start = c(16058489,236417627), end= c(16058489,236417627)), +name= c("cg18108049","cg17125141")) +names(probe) <- c("cg18108049","cg17125141") +NearbyGenes <- GetNearGenes(numFlankingGenes = 20,geneAnnot=geneAnnot,TRange=probe) +} +\references{ +Yao, Lijing, et al. "Inferring regulatory element landscapes and transcription +factor networks from cancer methylomes." Genome biology 16.1 (2015): 1. } - diff --git a/man/Normalize.Rd b/man/Normalize.Rd deleted file mode 100644 index 6c857330..00000000 --- a/man/Normalize.Rd +++ /dev/null @@ -1,24 +0,0 @@ -% Generated by roxygen2 (4.1.0): do not edit by hand -% Please edit documentation in R/Heatmap.Func.R -\name{Normalize} -\alias{Normalize} -\title{Normalization to 0 to 1} -\usage{ -Normalize(x, col = FALSE, row = FALSE, na.rm = FALSE) -} -\arguments{ -\item{x}{A matrix.} - -\item{col}{A boolean to determine normalize by column or not.} - -\item{row}{A boolean to determine normalize by row or not.} - -\item{na.rm}{A boolean to determine to remove na number or not.} -} -\value{ -A normalized matrix. -} -\description{ -Normalization to 0 to 1 -} - diff --git a/man/NormalizeMean.Rd b/man/NormalizeMean.Rd deleted file mode 100644 index 23b8f3f1..00000000 --- a/man/NormalizeMean.Rd +++ /dev/null @@ -1,24 +0,0 @@ -% Generated by roxygen2 (4.1.0): do not edit by hand -% Please edit documentation in R/Heatmap.Func.R -\name{NormalizeMean} -\alias{NormalizeMean} -\title{Normalization based on mean} -\usage{ -NormalizeMean(x, col = FALSE, row = FALSE, na.rm = FALSE) -} -\arguments{ -\item{x}{A matrix.} - -\item{col}{A boolean to determine normalize by column or not.} - -\item{row}{A boolean to determine normalize by row or not.} - -\item{na.rm}{A boolean to determine to remove na number or not.} -} -\value{ -A normalized matrix. -} -\description{ -Normalization based on mean -} - diff --git a/man/NormalizeMedian.Rd b/man/NormalizeMedian.Rd deleted file mode 100644 index ff5ccbc5..00000000 --- a/man/NormalizeMedian.Rd +++ /dev/null @@ -1,24 +0,0 @@ -% Generated by roxygen2 (4.1.0): do not edit by hand -% Please edit documentation in R/Heatmap.Func.R -\name{NormalizeMedian} -\alias{NormalizeMedian} -\title{Normalization based on median} -\usage{ -NormalizeMedian(x, col = FALSE, row = FALSE, na.rm = FALSE) -} -\arguments{ -\item{x}{A matrix.} - -\item{col}{A boolean to determine normalize by column or not.} - -\item{row}{A boolean to determine normalize by row or not.} - -\item{na.rm}{A boolean to determine to remove na number or not.} -} -\value{ -A normalized matrix. -} -\description{ -Normalization based on median -} - diff --git a/man/PeakToVenn.Rd b/man/PeakToVenn.Rd deleted file mode 100644 index d80ad473..00000000 --- a/man/PeakToVenn.Rd +++ /dev/null @@ -1,20 +0,0 @@ -% Generated by roxygen2 (4.1.0): do not edit by hand -% Please edit documentation in R/Heatmap.Func.R -\name{PeakToVenn} -\alias{PeakToVenn} -\title{Making peak sets venn diagram} -\usage{ -PeakToVenn(Peaks, ...) -} -\arguments{ -\item{Peaks}{A list of Peak sets.} - -\item{...}{parameters from VennDiagram package.} -} -\value{ -A venn diagram of peaks. -} -\description{ -Making peak sets venn diagram -} - diff --git a/man/Proximal.Rd b/man/Proximal.Rd deleted file mode 100644 index e4f6bc79..00000000 --- a/man/Proximal.Rd +++ /dev/null @@ -1,22 +0,0 @@ -% Generated by roxygen2 (4.1.0): do not edit by hand -% Please edit documentation in R/ReadFile.R -\name{Proximal} -\alias{Proximal} -\title{Generate random loci of genome.} -\usage{ -Proximal(x, TSS.range, ignore.strand = F) -} -\arguments{ -\item{x}{GRange object which you want to identify distal elements from.} - -\item{TSS.range}{GRange object which contain promoter infomation} - -\item{ignore.strand}{A boolean which to specific if ignore strand or not.} -} -\value{ -GRange object which contains elements from x that overlap with TSS.range. -} -\description{ -Generate random loci of genome. -} - diff --git a/man/RandomLoci.Rd b/man/RandomLoci.Rd deleted file mode 100644 index 656a99c1..00000000 --- a/man/RandomLoci.Rd +++ /dev/null @@ -1,22 +0,0 @@ -% Generated by roxygen2 (4.1.0): do not edit by hand -% Please edit documentation in R/ReadFile.R -\name{RandomLoci} -\alias{RandomLoci} -\title{Generate random loci of genome.} -\usage{ -RandomLoci(SampleSize = NULL, exclusion = NULL, regionWidth = 0) -} -\arguments{ -\item{SampleSize}{A number of random loci you want to generate.} - -\item{exclusion}{The chromosome you want to exclude such as chrX chrY.} - -\item{regionWidth}{The width of each random loci.} -} -\value{ -GRange object. -} -\description{ -Generate random loci of genome. -} - diff --git a/man/ReadBed.Rd b/man/ReadBed.Rd deleted file mode 100644 index 9b6d686a..00000000 --- a/man/ReadBed.Rd +++ /dev/null @@ -1,26 +0,0 @@ -% Generated by roxygen2 (4.1.0): do not edit by hand -% Please edit documentation in R/ReadFile.R -\name{ReadBed} -\alias{ReadBed} -\title{Read a bed file.} -\usage{ -ReadBed(x, strand = FALSE, skip = 0, cols = NULL, seqLength = NULL) -} -\arguments{ -\item{x}{A path of bed file (characters)} - -\item{strand}{A boolean to specific strands. If true, strand column will be filled as input. If false, strand column will be filled "*""} - -\item{skip}{A number to specify how many lines should be removed from bed file.} - -\item{cols}{Specify the column to read from bed file.} - -\item{seqLength}{Specify custmer seqLength parameter in GRange function} -} -\value{ -GRange object containning bed file information. -} -\description{ -Read a bed file. -} - diff --git a/man/ReadGFF.Rd b/man/ReadGFF.Rd deleted file mode 100644 index 2ca611b0..00000000 --- a/man/ReadGFF.Rd +++ /dev/null @@ -1,22 +0,0 @@ -% Generated by roxygen2 (4.1.0): do not edit by hand -% Please edit documentation in R/ReadFile.R -\name{ReadGFF} -\alias{ReadGFF} -\title{Read a GFF file.} -\usage{ -ReadGFF(x, strand = FALSE, skip = 0) -} -\arguments{ -\item{x}{A path of GFF file (characters)} - -\item{strand}{A boolean to specific strands. If true, strand column will be filled as input. If false, strand column will be filled "*""} - -\item{skip}{A number to specify how many lines should be removed from bed file.} -} -\value{ -GRange object containning GFF file information. -} -\description{ -Read a GFF file. -} - diff --git a/man/Stat.diff.meth.Rd b/man/Stat.diff.meth.Rd new file mode 100644 index 00000000..f5ae257c --- /dev/null +++ b/man/Stat.diff.meth.Rd @@ -0,0 +1,44 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/StatisticTest.R +\name{Stat.diff.meth} +\alias{Stat.diff.meth} +\title{Stat.diff.meth} +\usage{ +Stat.diff.meth( + meth, + groups, + group1, + group2, + test = t.test, + min.samples = 5, + percentage = 0.2, + Top.m = NULL +) +} +\arguments{ +\item{meth}{A matrix contain DNA methylation data.} + +\item{groups}{A vector of category of samples.} + +\item{group1}{Group 1 label in groups vector} + +\item{group2}{Group 2 label in groups vector} + +\item{test}{A function specify which statistic test will be used.} + +\item{min.samples}{Minimun number of samples to use in the analysis. Default 5. +If you have 10 samples in one group, percentage is 0.2 this will give 2 samples +in the lower quintile, but then 5 will be used.} + +\item{percentage}{A number specify the percentage of normal and tumor +samples used in the test.} + +\item{Top.m}{A logic. If to identify hypomethylated probe Top.m should be FALSE. +hypermethylated probe is TRUE.} +} +\value{ +Statistic test results to identify differentially methylated probes. +} +\description{ +Stat.diff.meth +} diff --git a/man/Stat.nonpara.Rd b/man/Stat.nonpara.Rd new file mode 100644 index 00000000..4d9467be --- /dev/null +++ b/man/Stat.nonpara.Rd @@ -0,0 +1,45 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/StatisticTest.R +\name{Stat.nonpara} +\alias{Stat.nonpara} +\title{U test (non parameter test) for permutation. This is one probe vs nearby gene +which is good for computing each probes for nearby genes.} +\usage{ +Stat.nonpara( + Probe, + NearGenes, + Top = NULL, + correlation = "negative", + unmethy = NULL, + methy = NULL, + Meths = Meths, + Exps = Exps +) +} +\arguments{ +\item{Probe}{A character of name of Probe in array.} + +\item{NearGenes}{A list of nearby gene for each probe which is output of GetNearGenes function.} + +\item{Top}{A number determines the percentage of top methylated/unmethylated samples. +Only used if unmethy and methy are not set.} + +\item{correlation}{Type of correlation to evaluate (negative or positive). +Negative (default) checks if hypomethylated region has a upregulated target gene. +Positive checks if region hypermethylated has a upregulated target gene.} + +\item{unmethy}{Index of U (unmethylated) group.} + +\item{methy}{Index of M (methylated) group.} + +\item{Meths}{A matrix contains methylation for each probe (row) and each sample (column).} + +\item{Exps}{A matrix contains Expression for each gene (row) and each sample (column).} +} +\value{ +U test results +} +\description{ +U test (non parameter test) for permutation. This is one probe vs nearby gene +which is good for computing each probes for nearby genes. +} diff --git a/man/Stat.nonpara.permu.Rd b/man/Stat.nonpara.permu.Rd new file mode 100644 index 00000000..278ce63f --- /dev/null +++ b/man/Stat.nonpara.permu.Rd @@ -0,0 +1,43 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/StatisticTest.R +\name{Stat.nonpara.permu} +\alias{Stat.nonpara.permu} +\title{Stat.nonpara.permu} +\usage{ +Stat.nonpara.permu( + Probe, + Gene, + Top = 0.2, + correlation = "negative", + unmethy = NULL, + methy = NULL, + Meths = Meths, + Exps = Exps +) +} +\arguments{ +\item{Probe}{A character of name of Probe in array.} + +\item{Gene}{A vector of gene ID.} + +\item{Top}{A number determines the percentage of top methylated/unmethylated samples. +Only used if unmethy and methy are not set.} + +\item{correlation}{Type of correlation to evaluate (negative or positive). +Negative (default) checks if hypomethylated region has a upregulated target gene. +Positive checks if region hypermethylated has a upregulated target gene.} + +\item{unmethy}{Index of U (unmethylated) group.} + +\item{methy}{Index of M (methylated) group.} + +\item{Meths}{A matrix contains methylation for each probe (row) and each sample (column).} + +\item{Exps}{A matrix contains Expression for each gene (row) and each sample (column).} +} +\value{ +U test results +} +\description{ +Stat.nonpara.permu +} diff --git a/man/TCGA.pipe.Rd b/man/TCGA.pipe.Rd new file mode 100644 index 00000000..89b2fbbf --- /dev/null +++ b/man/TCGA.pipe.Rd @@ -0,0 +1,125 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/TCGA_pipe.R +\name{TCGA.pipe} +\alias{TCGA.pipe} +\title{ELMER analysis pipeline for TCGA data.} +\usage{ +TCGA.pipe( + disease, + genome = "hg38", + analysis = "all", + wd = getwd(), + cores = 1, + mode = "unsupervised", + Data = NULL, + diff.dir = "hypo", + genes = NULL, + mutant_variant_classification = c("Frame_Shift_Del", "Frame_Shift_Ins", + "Missense_Mutation", "Nonsense_Mutation", "Splice_Site", "In_Frame_Del", + "In_Frame_Ins", "Translation_Start_Site", "Nonstop_Mutation"), + group.col = "TN", + group1 = "Tumor", + group2 = "Normal", + ... +) +} +\arguments{ +\item{disease}{TCGA short form disease name such as COAD} + +\item{genome}{Data aligned against which genome of reference. Options: "hg19", "hg38" (default)} + +\item{analysis}{A vector of characters listing the analysis need to be done. +Analysis can be "download","distal.probes","diffMeth","pair","motif","TF.search". +Default is "all" meaning all the analysis will be processed.} + +\item{wd}{A path shows working dirctory. Default is "./"} + +\item{cores}{A interger which defines number of core to be used in parallel process. +Default is 1: don't use parallel process.} + +\item{mode}{This option will automatically set the percentage of samples to be used in the analysis. +Options: "supervised" (use 100\% of samples) or "unsupervised" (use 20\% of samples).} + +\item{Data}{A path shows the folder containing DNA methylation, expression and clinic data} + +\item{diff.dir}{A character can be "hypo" or "hyper", showing dirction DNA methylation changes. +If it is "hypo", get.diff.meth function will identify all significantly hypomethylated +CpG sites; If "hyper", get.diff.meth function will identify all significantly hypermethylated +CpG sites} + +\item{genes}{List of genes for which mutations will be verified. +A column in the MAE with the name of the gene +will be created with two groups WT (tumor samples without mutation), MUT (tumor samples w/ mutation), +NA (not tumor samples)} + +\item{mutant_variant_classification}{List of TCGA variant classification from MAF files to consider a samples +mutant. Only used when argument gene is set.} + +\item{group.col}{A column defining the groups of the sample. You can view the +available columns using: colnames(MultiAssayExperiment::colData(data)).} + +\item{group1}{A group from group.col. ELMER will run group1 vs group2. +That means, if direction is hyper, get probes +hypermethylated in group 1 compared to group 2.} + +\item{group2}{A group from group.col. ELMER will run group1 vs group2. +That means, if direction is hyper, get probes +hypermethylated in group 1 compared to group 2.} + +\item{...}{A list of parameters for functions: GetNearGenes, get.feature.probe, +get.diff.meth, get.pair} +} +\value{ +Different analysis results. +} +\description{ +ELMER analysis pipeline for TCGA data. This pipeline combine every steps of \pkg{ELMER} +analyses: get.feature.probe, get.diff.meth, get.pair, get.permu, get.enriched.motif and get.TFs. +Every steps' results are saved. +} +\examples{ + data <- ELMER:::getdata("elmer.data.example") + TCGA.pipe( + disease = "LUSC", + data = data, + analysis = c("diffMeth","pair", "motif","TF.search"), + mode = "supervised", + group.col = "definition", + group1 = "Primary solid Tumor", + group2 = "Solid Tissue Normal", + diff.dir = c("hypo"), + dir.out = "pipe", + sig.dif = 0.0001, + pvalue = 1.0, + min.incidence = 0, + lower.OR = 0.0 + ) +\dontrun{ + distal.probe <- TCGA.pipe(disease = "LUSC", analysis="distal.enhancer", wd="~/") + TCGA.pipe(disease = "LUSC",analysis = "all", genome = "hg19", cores = 1, permu.size=300, Pe=0.01) + projects <- TCGAbiolinks:::getGDCprojects()$project_id + projects <- gsub("TCGA-","",projects[grepl('^TCGA',projects,perl=TRUE)]) + for(proj in projects) TCGA.pipe(disease = proj,analysis = "download") + plyr::alply(sort(projects),1,function(proj) { + tryCatch({ + print(proj); + TCGA.pipe(disease = proj,analysis = c("createMAE"))}) + }, .progress = "text") + plyr::alply(sort(projects),1,function(proj) { + tryCatch({ + print(proj); + TCGA.pipe(disease = proj, + analysis = c("diffMeth","pair", "motif","TF.search"))}) + }, .progress = "text") + + # Evaluation mutation + TCGA.pipe(disease = "LUSC",analysis = "createMAE",gene = "NFE2L2") + TCGA.pipe( + disease = "LUSC",analysis = c("diffMeth","pair", "motif","TF.search"), + mode = "supervised", + group.col = "NFE2L2", group1 = "Mutant", group2 = "WT", + diff.dir = c("hypo"), + dir.out = "LUSC_NFE2L2_MutvsWT" + ) +} +} diff --git a/man/TF.rank.plot.Rd b/man/TF.rank.plot.Rd new file mode 100644 index 00000000..d172b196 --- /dev/null +++ b/man/TF.rank.plot.Rd @@ -0,0 +1,85 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/motif.TF.Plots.R +\name{TF.rank.plot} +\alias{TF.rank.plot} +\title{TF.rank.plot to plot the scores (-log10(P value)) which assess the correlation between +TF expression and average DNA methylation at motif sites.} +\usage{ +TF.rank.plot( + motif.pvalue, + motif, + title = NULL, + TF.label = NULL, + dir.out = "./", + save = TRUE, + cores = 1 +) +} +\arguments{ +\item{motif.pvalue}{A matrix or a path specifying location of "XXX.with.motif.pvalue.rda" +which is output of getTF.} + +\item{motif}{A vector of characters specify the motif to plot} + +\item{title}{Tite title (the motif will still be added to the title)} + +\item{TF.label}{A list shows the label for each motif. If TF.label is not specified, +the motif relevant TF and top3 TF will be labeled.} + +\item{dir.out}{A path specify the directory to which the figures will be saved. +Current directory is default.} + +\item{save}{A logic. If true (default), figure will be saved to dir.out} + +\item{cores}{A interger which defines the number of cores to be used in parallel process. +Default is 1: no parallel process.} +} +\value{ +A plot shows the score (-log(P value)) of association between TF +expression and DNA methylation at sites of a certain motif. +} +\description{ +TF.rank.plot is a function to plot the scores (-log10(P value)) which assess the +correlation between TF expression and average DNA methylation at motif sites. The the motif +relevant TF and top3 TFs will be labeled in a different color. +} +\examples{ +library(ELMER) +data <- tryCatch(ELMER:::getdata("elmer.data.example"), error = function(e) { + message(e) + data(elmer.data.example, envir = environment()) +}) +enriched.motif <- list("P53_HUMAN.H11MO.0.A"= c("cg00329272", "cg10097755", "cg08928189", + "cg17153775", "cg21156590", "cg19749688", "cg12590404", + "cg24517858", "cg00329272", "cg09010107", "cg15386853", + "cg10097755", "cg09247779", "cg09181054")) +TF <- get.TFs(data, + enriched.motif, + group.col = "definition", + group1 = "Primary solid Tumor", + group2 = "Solid Tissue Normal", + TFs = data.frame( + external_gene_name=c("TP53","TP63","TP73"), + ensembl_gene_id= c("ENSG00000141510", + "ENSG00000073282", + "ENSG00000078900"), + stringsAsFactors = FALSE), + label="hypo") +TF.meth.cor <- get(load("getTF.hypo.TFs.with.motif.pvalue.rda")) +TF.rank.plot(motif.pvalue=TF.meth.cor, + motif="P53_HUMAN.H11MO.0.A", + TF.label=createMotifRelevantTfs("subfamily")["P53_HUMAN.H11MO.0.A"], + save=TRUE) +TF.rank.plot(motif.pvalue=TF.meth.cor, + motif="P53_HUMAN.H11MO.0.A", + save=TRUE) +# Same as above +TF.rank.plot(motif.pvalue=TF.meth.cor, + motif="P53_HUMAN.H11MO.0.A", + dir.out = "TFplots", + TF.label=createMotifRelevantTfs("family")["P53_HUMAN.H11MO.0.A"], + save=TRUE) +} +\author{ +Lijing Yao (maintainer: lijingya@usc.edu) +} diff --git a/man/TFsurvival.plot.Rd b/man/TFsurvival.plot.Rd new file mode 100644 index 00000000..c664eb89 --- /dev/null +++ b/man/TFsurvival.plot.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/TFsurvival.plot.R +\name{TFsurvival.plot} +\alias{TFsurvival.plot} +\title{Creates survival plot of based on the expression of a TF} +\usage{ +TFsurvival.plot(data, TF, xlim = NULL, percentage = 0.3, save = TRUE) +} +\arguments{ +\item{data}{A multi assay Experiment with clinical data in the phenotypic data matrix +containing the following columns: vital_status, days_to_last_follow_up and days_to_death. Default from GDC and TCGAbiolinks} + +\item{TF}{A gene symbol} + +\item{xlim}{Limit x axis showed in plot} + +\item{percentage}{A number ranges from 0 to 1 specifying the percentage of samples in the +higher and lower expression groups. Default is 0.3} + +\item{save}{Save plot as PDF} +} +\description{ +This function will create a survival plot for the samples with higher, midium, low expression +of a given transcription factor. +By defau;t samples with higher expression are the top 30% and the lower expression the bottom 30%. +} diff --git a/man/WriteBed.Rd b/man/WriteBed.Rd deleted file mode 100644 index 30587a4d..00000000 --- a/man/WriteBed.Rd +++ /dev/null @@ -1,22 +0,0 @@ -% Generated by roxygen2 (4.1.0): do not edit by hand -% Please edit documentation in R/ReadFile.R -\name{WriteBed} -\alias{WriteBed} -\title{Write a bed file from GRange object.} -\usage{ -WriteBed(x, save = T, fn = NULL) -} -\arguments{ -\item{x}{GRange object} - -\item{save}{if save is false, function will return a bed format data.frame. if save is true, fn parameter need to be specific and it output bed file in the path you specified in fn.} - -\item{fn}{A name of bed file you want to output.} -} -\value{ -A data.frame bed object or save output bed file. -} -\description{ -Write a bed file from GRange object. -} - diff --git a/man/addDistNearestTSS.Rd b/man/addDistNearestTSS.Rd new file mode 100644 index 00000000..a98193f8 --- /dev/null +++ b/man/addDistNearestTSS.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/GetNearbyGenes.R +\name{addDistNearestTSS} +\alias{addDistNearestTSS} +\title{Calculate the distance between probe and gene TSS} +\usage{ +addDistNearestTSS(data, NearGenes, genome, met.platform, cores = 1) +} +\arguments{ +\item{data}{A multi Assay Experiment with both DNA methylation and gene Expression objects} + +\item{NearGenes}{A list or a data frame with the pairs gene probes} + +\item{genome}{Which genome build will be used: hg38 (default) or hg19.} + +\item{met.platform}{DNA methyaltion platform to retrieve data from: EPIC or 450K (default)} + +\item{cores}{Number fo cores to be used. Deafult: 1} +} +\description{ +Calculate the distance between probe and gene TSS +} +\examples{ +\dontrun{ + data <- ELMER:::getdata("elmer.data.example") + NearbyGenes <- GetNearGenes( + data = data, + probes = c("cg15924102", "cg24741609"), + numFlankingGenes = 20 + ) + NearbyGenes <- addDistNearestTSS(data = data, NearGenes = NearbyGenes) +} +} diff --git a/man/addMutCol.Rd b/man/addMutCol.Rd new file mode 100644 index 00000000..0177c4f7 --- /dev/null +++ b/man/addMutCol.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/TCGA_pipe.R +\name{addMutCol} +\alias{addMutCol} +\title{Adds mutation information to MAE} +\usage{ +addMutCol( + data, + disease, + genes, + mutant_variant_classification = c("Frame_Shift_Del", "Frame_Shift_Ins", + "Missense_Mutation", "Nonsense_Mutation", "Splice_Site", "In_Frame_Del", + "In_Frame_Ins", "Translation_Start_Site", "Nonstop_Mutation") +) +} +\arguments{ +\item{data}{MAE object} + +\item{disease}{TCGA disease (LUSC, GBM, etc)} + +\item{genes}{list of genes to add information} + +\item{mutant_variant_classification}{List of mutant_variant_classification that will be +consider a sample mutant or not.} +} +\description{ +Adds mutation information to MAE +} +\examples{ +\dontrun{ + data <- ELMER:::getdata("elmer.data.example") # Get data from ELMER.data + data <- ELMER:::addMutCol(data, "LUSC","TP53") +} +} diff --git a/man/calcDistNearestTSS.Rd b/man/calcDistNearestTSS.Rd new file mode 100644 index 00000000..e08dbabd --- /dev/null +++ b/man/calcDistNearestTSS.Rd @@ -0,0 +1,46 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/GetNearbyGenes.R +\name{calcDistNearestTSS} +\alias{calcDistNearestTSS} +\title{Calculate distance from region to nearest TSS} +\usage{ +calcDistNearestTSS(links, TRange, tssAnnot) +} +\arguments{ +\item{links}{Links to calculate the distance} + +\item{TRange}{Genomic coordinates for Tartget region} + +\item{tssAnnot}{TSS annotation} +} +\description{ +Idea +For a given region R linked to X genes G +merge R with nearest TSS for G (multiple) +this will increse nb of lines +i.e R1 - G1 - TSS1 - DIST1 + R1 - G1 - TSS2 - DIST2 + To vectorize the code: +make a granges from left and onde from right +and find distance +collapse the results keeping min distance for equals values +} +\examples{ +\dontrun{ + data <- ELMER:::getdata("elmer.data.example") + NearbyGenes <- GetNearGenes( + data = data, + probes = c("cg15924102", "cg24741609"), + numFlankingGenes = 20 + ) + + NearbyGenes <- ELMER:::calcDistNearestTSS( + links = NearbyGenes, + tssAnnot = getTSS(genome = "hg38"), + TRange = rowRanges(getMet(data)) + ) +} +} +\author{ +Tiago C. Silva +} diff --git a/man/calculateEnrichement.Rd b/man/calculateEnrichement.Rd new file mode 100644 index 00000000..b9c578ee --- /dev/null +++ b/man/calculateEnrichement.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Small.R +\name{calculateEnrichement} +\alias{calculateEnrichement} +\title{Calculate motif Erichment} +\usage{ +calculateEnrichement(foreground, background) +} +\arguments{ +\item{foreground}{A nsparseMatrix object in each 1 means the motif is found in a region, 0 not.} + +\item{background}{A nsparseMatrix object in each 1 means the motif is found in a region, 0 not.} +} +\description{ +Calculates fisher exact test +} +\examples{ +foreground <- Matrix::Matrix(sample(0:1,size = 100,replace = TRUE), + nrow = 10, ncol = 10,sparse = TRUE) +rownames(foreground) <- paste0("region",1:10) +colnames(foreground) <- paste0("motif",1:10) +background <- Matrix::Matrix(sample(0:1,size = 100,replace = TRUE), + nrow = 10, ncol = 10,sparse = TRUE) +rownames(background) <- paste0("region",1:10) +colnames(background) <- paste0("motif",1:10) +calculateEnrichement(foreground,background) +} diff --git a/man/cluster.main.Rd b/man/cluster.main.Rd deleted file mode 100644 index e568cb1c..00000000 --- a/man/cluster.main.Rd +++ /dev/null @@ -1,39 +0,0 @@ -% Generated by roxygen2 (4.1.0): do not edit by hand -% Please edit documentation in R/Heatmap.Func.R -\name{cluster.main} -\alias{cluster.main} -\title{cluster functions} -\usage{ -cluster.main(x, Rowv = TRUE, Colv = TRUE, distfun = dist, - distMethod = "euclidean", hclustfun = fastcluster::hclust, - hclustMethod = "complete", Distance.row = NULL, Distance.col = NULL) -} -\arguments{ -\item{x}{A matrix} - -\item{Rowv}{A boolean determines if the row dendrogram should be computed and reordered.} - -\item{Colv}{A boolean determines if the column dendrogram should be computed and reordered.} - -\item{distfun}{function used to compute the distance (dissimilarity) between both rows and columns. Defaults to dist.} - -\item{distMethod}{A character to specify method for computing distance. Default to euclidean. See detail for other methods.} - -\item{hclustfun}{function used to compute the hierarchical clustering when Rowv or Colv are not dendrograms. Defaults to fastcluster::hclust. Should take as argument a result of distfun and return an object to which as.dendrogram can be applied.} - -\item{hclustMethod}{A character to specify method for computing clustering. Default to complete} - -\item{Distance.row}{A vector of distance value for rows. If Distance.row was specified, distance calculation step will be skiped for rows.} - -\item{Distance.col}{A vector of distance value for columns. If Distance.col was specified, distance calculation step will be skiped for columns.} -} -\value{ -A list contains: x the original matrix; rowInd order of row after clustering; ddr dendrograms for rows; colInd order of columns after clustering; ddc dendrograms for columns. -} -\description{ -cluster functions -} -\details{ -distMethod euclidean: Usual square distance between the two vectors (2 norm).maximum: Maximum distance between two components of x and y (supremum norm).manhattan: Absolute distance between the two vectors (1 norm). canberra: sum(|x_i - y_i| / |x_i + y_i|). Terms with zero numerator and denominator are omitted from the sum and treated as if the values were missing. This is intended for non-negative values (e.g. counts): taking the absolute value of the denominator is a 1998 R modification to avoid negative distances.binary: (aka asymmetric binary): The vectors are regarded as binary bits, so non-zero elements are ‘on’ and zero elements are ‘off’. The distance is the proportion of bits in which only one is on amongst those in which at least one is on. minkowski: The p norm, the pth root of the sum of the pth powers of the differences of the components. -} - diff --git a/man/createBigWigDNAmetArray.Rd b/man/createBigWigDNAmetArray.Rd new file mode 100644 index 00000000..270a8ec4 --- /dev/null +++ b/man/createBigWigDNAmetArray.Rd @@ -0,0 +1,37 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plots.R +\name{createBigWigDNAmetArray} +\alias{createBigWigDNAmetArray} +\title{Create a bigwig file for IGV visualization of DNA methylation data (Array)} +\usage{ +createBigWigDNAmetArray( + data = NULL, + genome = "hg38", + met.platform = "450K", + track.names = NULL, + dir = "IGV_tracks" +) +} +\arguments{ +\item{data}{A matrix} + +\item{genome}{Which genome build will be used: hg38 (default) or hg19.} + +\item{met.platform}{DNA methyaltion platform to retrieve data from: EPIC or 450K (default)} + +\item{track.names}{Provide a list of track names (.bw) otherwise the deault is the will be {samples}.bw} + +\item{dir}{Which directory files will be saved} +} +\description{ +Create a bigwig for IGV visualization of DNA methylation data (Array) +} +\examples{ + \dontrun{ + data <- assay(getMet(ELMER:::getdata("elmer.data.example"))) + createBigWigDNAmetArray(data = data, met.platform = "450K", genome = "hg38") + } +} +\author{ +Tiago Chedraoui Silva (tiagochst at gmail.com) +} diff --git a/man/createIGVtrack.Rd b/man/createIGVtrack.Rd new file mode 100644 index 00000000..8b99cf78 --- /dev/null +++ b/man/createIGVtrack.Rd @@ -0,0 +1,59 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plots.R +\name{createIGVtrack} +\alias{createIGVtrack} +\title{Create a junction track for IGV visualization of interection} +\usage{ +createIGVtrack( + pairs, + met.platform = "450K", + genome = "hg38", + filename = "ELMER_interactions.bed", + color.track = "black", + track.name = "junctions", + gene.symbol = NULL, + all.tss = TRUE +) +} +\arguments{ +\item{pairs}{A data frame output from getPairs function} + +\item{met.platform}{DNA methyaltion platform to retrieve data from: EPIC or 450K (default)} + +\item{genome}{Which genome build will be used: hg38 (default) or hg19.} + +\item{filename}{Filename (".bed")} + +\item{color.track}{A color for the track (i.e blue, red,#272E6A)} + +\item{track.name}{Track name} + +\item{gene.symbol}{Filter pairs to a single gene.} + +\item{all.tss}{A logical. If TRUE it will link probes to all TSS of a gene (transcript level), if FALSE +it will link to the promoter region of a gene (gene level).} +} +\description{ +Create a junction track for IGV visualization of interection +} +\examples{ + \dontrun{ +data <- ELMER:::getdata("elmer.data.example") +nearGenes <-GetNearGenes(TRange=getMet(data)[c("cg00329272","cg10097755"),], + geneAnnot=getExp(data)) +Hypo.pair <- get.pair(data=data, + nearGenes=nearGenes, + permu.size=5, + group.col = "definition", + group1 = "Primary solid Tumor", + group2 = "Solid Tissue Normal", + raw.pvalue = 0.2, + Pe = 0.2, + dir.out="./", + label= "hypo") + createIGVtrack(Hypo.pair,met.platform = "450K", genome = "hg38") + } +} +\author{ +Tiago Chedraoui Silva (tiagochst at gmail.com) +} diff --git a/man/createMAE.Rd b/man/createMAE.Rd new file mode 100644 index 00000000..2313a9ae --- /dev/null +++ b/man/createMAE.Rd @@ -0,0 +1,241 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Small.R +\name{createMAE} +\alias{createMAE} +\title{Construct a Multi Assay Experiment for ELMER analysis} +\usage{ +createMAE( + exp, + met, + colData, + sampleMap, + linearize.exp = FALSE, + filter.probes = NULL, + met.na.cut = 0.2, + filter.genes = NULL, + met.platform = "450K", + genome = NULL, + save = TRUE, + save.filename, + TCGA = FALSE +) +} +\arguments{ +\item{exp}{A Summarized Experiment with one assay, or +a matrix or path of rda file only containing the data. Rownames should be +either Ensembl gene id (ensembl_gene_id) or gene symbol (external_gene_name)} + +\item{met}{A Summarized Experiment with one assay containing beta-values, +a matrix or path of rda file only containing the data.} + +\item{colData}{A DataFrame or data.frame of the phenotype data for all participants. Must have column primary (sample ID).} + +\item{sampleMap}{A DataFrame or data.frame of the matching samples and colnames +of the gene expression and DNA methylation matrix. This should be used if your matrix +have different columns names. +This object must have following columns: +assay ("DNA methylation" and "Gene expression"), primary (sample ID) and colname (names of the columns of the matrix).} + +\item{linearize.exp}{Take log2(exp + 1) in order to linearize relation between methylation and expression} + +\item{filter.probes}{A GRanges object contains the coordinate of probes which locate +within promoter regions or distal feature regions such as union enhancer from REMC and FANTOM5. +See \code{\link{get.feature.probe}} function.} + +\item{met.na.cut}{Define the percentage of NA that the line should have to remove the probes +for humanmethylation platforms.} + +\item{filter.genes}{List of genes ensemble ids to filter from object} + +\item{met.platform}{DNA methylation platform "450K" or "EPIC"} + +\item{genome}{Which is the default genome to make gene information. Options hg19 and hg38} + +\item{save}{If TRUE, MAE object will be saved into a file named as the argument save.file if this was set, otherwise as mae_genome_met.platform.rda.} + +\item{save.filename}{Name of the rda file to save the object (must end in .rda)} + +\item{TCGA}{A logical. FALSE indicate data is not from TCGA (FALSE is default). +TRUE indicates data is from TCGA and sample section will automatically filled in.} +} +\value{ +A MultiAssayExperiment object +} +\description{ +This function will receive a gene expression and DNA methylation data objects +and create a Multi Assay Experiment. +} +\examples{ +# NON TCGA example: matrices has different column names +gene.exp <- S4Vectors::DataFrame( + sample1.exp = c("ENSG00000141510"=2.3,"ENSG00000171862"=5.4), + sample2.exp = c("ENSG00000141510"=1.6,"ENSG00000171862"=2.3) +) + +dna.met <- S4Vectors::DataFrame( + sample1.met = c("cg14324200"=0.5,"cg23867494"=0.1), + sample2.met = c("cg14324200"=0.3,"cg23867494"=0.9) +) +sample.info <- S4Vectors::DataFrame( + primary = c("sample1","sample2"), + sample.type = c("Normal", "Tumor") + ) + +sampleMap <- S4Vectors::DataFrame( + assay = c("Gene expression","DNA methylation","Gene expression","DNA methylation"), + primary = c("sample1","sample1","sample2","sample2"), + colname = c("sample1.exp","sample1.met","sample2.exp","sample2.met") +) + +mae <- createMAE( + exp = gene.exp, + met = dna.met, + sampleMap = sampleMap, + met.platform ="450K", + colData = sample.info, + genome = "hg38" +) + +# You can also use sample Mapping and Sample information tables from a tsv file +# You can use the createTSVTemplates function to create the tsv files +readr::write_tsv(as.data.frame(sampleMap), path = "sampleMap.tsv") +readr::write_tsv(as.data.frame(sample.info), path = "sample.info.tsv") + +mae <- createMAE( + exp = gene.exp, + met = dna.met, + sampleMap = "sampleMap.tsv", + met.platform ="450K", + colData = "sample.info.tsv", + genome = "hg38" +) + +# NON TCGA example: matrices has same column names +gene.exp <- S4Vectors::DataFrame(sample1 = c("ENSG00000141510"=2.3,"ENSG00000171862"=5.4), + sample2 = c("ENSG00000141510"=1.6,"ENSG00000171862"=2.3)) +dna.met <- S4Vectors::DataFrame(sample1 = c("cg14324200"=0.5,"cg23867494"=0.1), + sample2= c("cg14324200"=0.3,"cg23867494"=0.9)) +sample.info <- S4Vectors::DataFrame(primary = c("sample1","sample2"), + sample.type = c("Normal", "Tumor")) +sampleMap <- S4Vectors::DataFrame( + assay = c("Gene expression","DNA methylation","Gene expression","DNA methylation"), + primary = c("sample1","sample1","sample2","sample2"), + colname = c("sample1","sample1","sample2","sample2") +) +mae <- createMAE( + exp = gene.exp, + met = dna.met, + sampleMap = sampleMap, + met.platform ="450K", + colData = sample.info, + genome = "hg38" +) + +\dontrun{ + # TCGA example using TCGAbiolinks + # Testing creating MultyAssayExperiment object + # Load library + library(TCGAbiolinks) + library(SummarizedExperiment) + + samples <- c( + "TCGA-BA-4074", "TCGA-BA-4075", "TCGA-BA-4077", "TCGA-BA-5149", + "TCGA-UF-A7JK", "TCGA-UF-A7JS", "TCGA-UF-A7JT", "TCGA-UF-A7JV" + ) + + #1) Get gene expression matrix + query.exp <- GDCquery( + project = "TCGA-HNSC", + data.category = "Transcriptome Profiling", + data.type = "Gene Expression Quantification", + workflow.type = "STAR - Counts", + barcode = samples + ) + + GDCdownload(query.exp) + exp.hg38 <- GDCprepare(query = query.exp) + + # Aligned against Hg19 + query.exp.hg19 <- GDCquery( + project = "TCGA-HNSC", + data.category = "Gene expression", + data.type = "Gene expression quantification", + platform = "Illumina HiSeq", + file.type = "normalized_results", + experimental.strategy = "RNA-Seq", + barcode = samples, + legacy = TRUE + ) + GDCdownload(query.exp.hg19) + exp.hg19 <- GDCprepare(query.exp.hg19) + + # Our object needs to have emsembl gene id as rownames + rownames(exp.hg19) <- values(exp.hg19)$ensembl_gene_id + + # DNA Methylation + query.met <- GDCquery( + project = "TCGA-HNSC", + legacy = FALSE, + data.category = "DNA Methylation", + data.type = "Methylation Beta Value", + barcode = samples, + platform = "Illumina Human Methylation 450" + ) + + GDCdownload(query.met) + met <- GDCprepare(query = query.met) + + distal.enhancer <- get.feature.probe(genome = "hg19",met.platform = "450k") + + # Consisering it is TCGA and SE + mae.hg19 <- createMAE( + exp = exp.hg19, + met = met, + TCGA = TRUE, + genome = "hg19", + filter.probes = distal.enhancer + ) + values(getExp(mae.hg19)) + + mae.hg38 <- createMAE( + exp = exp.hg38, met = met, + TCGA = TRUE, genome = "hg38", + filter.probes = distal.enhancer + ) + values(getExp(mae.hg38)) + + # Consisering it is TCGA and not SE + mae.hg19.test <- createMAE( + exp = assay(exp.hg19), met = assay(met), + TCGA = TRUE, genome = "hg19", + filter.probes = distal.enhancer + ) + + mae.hg38 <- createMAE( + exp = assay(exp.hg38), met = assay(met), + TCGA = TRUE, genome = "hg38", + filter.probes = distal.enhancer + ) + values(getExp(mae.hg38)) + + # Consisering it is not TCGA and SE + # DNA methylation and gene expression Objects should have same sample names in columns + not.tcga.exp <- exp.hg19 + colnames(not.tcga.exp) <- substr(colnames(not.tcga.exp),1,15) + not.tcga.met <- met + colnames(not.tcga.met) <- substr(colnames(not.tcga.met),1,15) + + phenotype.data <- data.frame(row.names = colnames(not.tcga.exp), + primary = colnames(not.tcga.exp), + samples = colnames(not.tcga.exp), + group = c(rep("group1",4),rep("group2",4))) + distal.enhancer <- get.feature.probe(genome = "hg19",met.platform = "450k") + mae.hg19 <- createMAE(exp = not.tcga.exp, + met = not.tcga.met, + TCGA = FALSE, + filter.probes = distal.enhancer, + genome = "hg19", + colData = phenotype.data) +} +createMAE +} diff --git a/man/createMotifRelevantTfs.Rd b/man/createMotifRelevantTfs.Rd new file mode 100644 index 00000000..489a0c44 --- /dev/null +++ b/man/createMotifRelevantTfs.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Small.R +\name{createMotifRelevantTfs} +\alias{createMotifRelevantTfs} +\title{Get family of transcription factors} +\usage{ +createMotifRelevantTfs(classification = "family") +} +\arguments{ +\item{classification}{Select if we will use Family classification or sub-family} +} +\value{ +A list of TFs and its family members +} +\description{ +This will output a list each TF motif and TFs that binding the motis. Multiple TFs may +recognize a same motif such as TF family. +The association between each motif famil and transcription factor was created using the +(HOCOMOCO)[https://hocomoco11.autosome.org/human/mono?full=true] which TF structural families +was created according to TFClass [@wingender2014tfclass] +This data is stored as a list whose elements +are motifs and contents for each element are TFs which recognize the same motif that +is the name of the element. This data is used in function get.TFs in \pkg{ELMER} +to identify the real regulator TF whose motif is enriched in a given set of probes +and expression associate with average DNA methylation of these motif sites. +} diff --git a/man/createSummaryDocument.Rd b/man/createSummaryDocument.Rd new file mode 100644 index 00000000..3eb9c17d --- /dev/null +++ b/man/createSummaryDocument.Rd @@ -0,0 +1,45 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/TCGA_pipe.R +\name{createSummaryDocument} +\alias{createSummaryDocument} +\title{Create summary document for TCGA.pipe function} +\usage{ +createSummaryDocument( + analysis = "all", + argument.values = "defaults", + genome = NULL, + mae.path = NULL, + mode = NULL, + direction = NULL, + group.col = NULL, + group1 = NULL, + group2 = NULL, + results.path = NULL +) +} +\arguments{ +\item{analysis}{Which analysis were performed} + +\item{argument.values}{Other argument values changed} + +\item{genome}{Genome of reference hg38 and hg19} + +\item{mae.path}{Where mae is stored} + +\item{mode}{Mode "supervised" or "unsupervised" used in the analysis} + +\item{direction}{Hypo or hyper direction} + +\item{group.col}{Group col} + +\item{group1}{Group 1} + +\item{group2}{Group 2} + +\item{results.path}{Path where the results were saved} +} +\description{ +This function will create a text file with the +date of the last run, which aanalysis were performed, the values of +the arguments so the user can keep track +} diff --git a/man/createTSVTemplates.Rd b/man/createTSVTemplates.Rd new file mode 100644 index 00000000..9f6298bf --- /dev/null +++ b/man/createTSVTemplates.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Small.R +\name{createTSVTemplates} +\alias{createTSVTemplates} +\title{Create examples files for Sample mapping and information used in createMAE function} +\usage{ +createTSVTemplates(met, exp) +} +\arguments{ +\item{met}{DNA methylation matrix or Summarized Experiment} + +\item{exp}{Gene expression matrix or Summarized Experiment} +} +\description{ +This function will receive the DNA methylation and gene expression matrix and will create +some examples of table for the argument colData and sampleMap used in ceeateMae function. +} +\examples{ +gene.exp <- S4Vectors::DataFrame(sample1.exp = c("ENSG00000141510"=2.3,"ENSG00000171862"=5.4), + sample2.exp = c("ENSG00000141510"=1.6,"ENSG00000171862"=2.3)) +dna.met <- S4Vectors::DataFrame(sample1.met = c("cg14324200"=0.5,"cg23867494"=0.1), + sample2.met = c("cg14324200"=0.3,"cg23867494"=0.9)) +createTSVTemplates(met = dna.met, exp = gene.exp) +} diff --git a/man/dendro.plot.Rd b/man/dendro.plot.Rd deleted file mode 100644 index f8ba6f43..00000000 --- a/man/dendro.plot.Rd +++ /dev/null @@ -1,25 +0,0 @@ -% Generated by roxygen2 (4.1.0): do not edit by hand -% Please edit documentation in R/Heatmap.Func.R -\name{dendro.plot} -\alias{dendro.plot} -\title{Plot the dendro tree.} -\usage{ -dendro.plot(Rdend = NULL, Cdend = NULL, margin = c(0.5, 0.5, 0.5, 0.5), - title = NULL, cex.title) -} -\arguments{ -\item{Rdend}{dendrograms for rows.} - -\item{Cdend}{dendrograms for columns.} - -\item{title}{The main title (on top).} - -\item{cex.title}{A numerical value giving the amount by which plotting title text and symbols should be magnified relative to the default. This starts as 1 when a device is opened, and is reset when the layout is changed.} -} -\value{ -a graph of dendrograms tree. -} -\description{ -Plot the dendro tree. -} - diff --git a/man/findMotifRegion.Rd b/man/findMotifRegion.Rd new file mode 100644 index 00000000..2a68656d --- /dev/null +++ b/man/findMotifRegion.Rd @@ -0,0 +1,61 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Small.R +\name{findMotifRegion} +\alias{findMotifRegion} +\title{Use Hocomoco motif and homer to identify motifs in a given region} +\usage{ +findMotifRegion( + regions, + output.filename = "mapped_motifs_regions.txt", + region.size = NULL, + genome = "hg38", + nstep = 10000, + cores = 1 +) +} +\arguments{ +\item{regions}{A GRanges object. Names will be used as the identifier.} + +\item{output.filename}{Final file name} + +\item{region.size}{If NULL the motif will be mapped to the region. If set a window around its center will be considered. +For example if region.size is 500, then +-250bp round it will be searched.} + +\item{genome}{Homer genome (hg38, hg19)} + +\item{nstep}{Number of regions to evaluate in homer, the bigger, more memory it will use at each step.} + +\item{cores}{A interger which defines the number of cores to be used in parallel +process. Default is 1: no parallel process.} +} +\description{ +To find for each probe the know motif we will use HOMER software (http://homer.salk.edu/homer/). +Homer and genome should be installed before this function is executed +Step: +1 - get DNA methylation probes annotation with the regions +2 - Make a bed file from it +3 - Execute section: Finding Instance of Specific Motifs +from http://homer.salk.edu/homer/ngs/peakMotifs.html to the HOCOMOCO TF motifs +Also, As HOMER is using more RAM than the available we will split the files in to 100k probes. +Obs: for each probe we create a winddow of 500 bp (-size 500) around it. +This might lead to false positives, but will not have false negatives. +The false posives will be removed latter with some statistical tests. +} +\examples{ +\dontrun{ + # use the center of the region and +-250bp around it + gr0 <- GRanges(Rle(c("chr2", "chr2", "chr1", "chr3"), + c(1, 3, 2, 4) + ), + IRanges(1:10, width=10:1) + ) + names(gr0) <- paste0("ID",c(1:10)) + findMotifRegion(regions = gr0, region.size = 500, genome = "hg38", cores = 1) + + # use the region size itself + gr1 <- GRanges(Rle(c("chr2", "chr2", "chr1", "chr3"), c(1, 3, 2, 4)), + IRanges(1:10, width=sample(200:1000,10))) + names(gr1) <- paste0("ID",c(1:10)) + findMotifRegion(regions = gr0, genome = "hg38", cores = 1) +} +} diff --git a/man/get.TFs.Rd b/man/get.TFs.Rd new file mode 100644 index 00000000..7b988cce --- /dev/null +++ b/man/get.TFs.Rd @@ -0,0 +1,157 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Main_function.R +\name{get.TFs} +\alias{get.TFs} +\title{get.TFs to identify regulatory TFs.} +\usage{ +get.TFs(data, + enriched.motif, + TFs, + group.col, + group1, + group2, + mode = "unsupervised", + correlation = "negative", + diff.dir = NULL, + motif.relevant.TFs, + minSubgroupFrac = 0.4, + dir.out = "./", + label = NULL, + save.plots = FALSE, + cores = 1, + topTFper = 0.05, + save = TRUE) +} +\arguments{ +\item{data}{A multiAssayExperiment with DNA methylation and Gene Expression data. See \code{\link{createMAE}} function.} + +\item{enriched.motif}{A list containing output of get.enriched.motif function or a path of XX.rda file containing output of get.enriched.motif function.} + +\item{TFs}{A data.frame containing TF GeneID and Symbol or a path of XX.csv file containing TF GeneID and Symbol. +If missing, human.TF list will be used (human.TF data in ELMER.data). +For detail information, refer the reference paper.} + +\item{group.col}{A column defining the groups of the sample. You can view the +available columns using: colnames(MultiAssayExperiment::colData(data)).} + +\item{group1}{A group from group.col. ELMER will run group1 vs group2. +That means, if direction is hyper, get probes +hypermethylated in group 1 compared to group 2.} + +\item{group2}{A group from group.col. ELMER will run group1 vs group2. +That means, if direction is hyper, get probes +hypermethylated in group 1 compared to group 2.} + +\item{mode}{A character. Can be "unsupervised" or "supervised". If unsupervised is set +the U (unmethylated) and M (methylated) groups will be selected +among all samples based on methylation of each probe. +Otherwise U group and M group will set as the samples of group1 or group2 as described below: +If diff.dir is "hypo, U will be the group 1 and M the group2. +If diff.dir is "hyper" M group will be the group1 and U the group2.} + +\item{correlation}{Type of correlation to evaluate (negative or positive). +Negative checks if hypomethylated is upregulated. Positive if hypermethylated is upregulated.} + +\item{diff.dir}{A character can be "hypo" or "hyper", showing differential +methylation direction in group 1. It can be "hypo" which means the probes are hypomethylated in group1; +"hyper" which means the probes are hypermethylated in group1; +This argument is used only when mode is supervised nad +it should be the same value from get.diff.meth function.} + +\item{motif.relevant.TFs}{A list containing motif as names and relavent TFs as contents + for each list element or a path of XX.rda file containing a list as above. +If missing, motif.relavent.TFs will be used (motif.relavent.TFs data in ELMER.data). +For detail information, refer the reference paper.} + +\item{minSubgroupFrac}{A number ranging from 0 to 1 +specifying the percentage of samples used to create the groups U (unmethylated) +and M (methylated) used to link probes to TF expression. +Default is 0.4 (lowest quintile of all samples will be in the +U group and the highest quintile of all samples in the M group).} + +\item{dir.out}{A path specifies the directory for outputs of get.pair function. Default is current directory} + +\item{label}{A character labels the outputs.} + +\item{save.plots}{Create TF ranking plots ?} + +\item{cores}{A interger which defines the number of cores to be used in parallel process. Default is 1: no parallel process.} + +\item{topTFper}{Top ranked TF to be retrieved (default "0.05" - 5 percent)} + +\item{save}{A logic. If save is ture, two files will be saved: getTF.XX.significant.TFs.with.motif.summary.csv and +getTF.hypo.TFs.with.motif.pvalue.rda (see detail). If save is false, a data frame contains the same content with the first file.} +} +\value{ +Potential responsible TFs will be reported in a dataframe with 4 columns: + \itemize{ + \item motif: the names of motif. + \item top.potential.TF.family: the highest ranking upstream TFs which are known recognized the motif. First item in potential.TFs.family + \item top.potential.TF.subfamily: the highest ranking upstream TFs which are known recognized the motif. First item in potential.TFs.subfamily + \item potential.TFs.family: TFs which are within top 5\% list and are known recognized the motif (considering family classification). + \item potential.TFs.subfamily: TFs which are within top 5\% list and are known recognized the motif (considering subfamily classification). + \item top_5percent: all TFs which are within top 5\% list. + } +} +\description{ +get.TFs is a function to identify regulatory TFs based on motif analysis and association analysis +between the probes containing a particular motif and expression of all known TFs. If save is true, +two files will be saved: getTF.XX.significant.TFs.with.motif.summary.csv and getTF.hypo.TFs.with.motif.pvalue.rda (see detail). +} +\details{ +save: If save is ture, two files will be saved. The first file is getTF.XX.significant.TFs.with.motif.summary.csv (XX depends on option lable). +This file contain the regulatory TF significantly associate with average DNA methylation at particular motif sites. +The second file is getTF.hypo.TFs.with.motif.pvalue.rda (XX depends on option label). +This file contains a matrix storing the statistic results for significant associations between TFs (row) and average DNA methylation at motifs (column). +If save is false, a data frame which contains the same content with the first file will be reported. +} +\examples{ +data <- tryCatch( + ELMER:::getdata("elmer.data.example"), + error = function(e) { + message(e) + data(elmer.data.example, envir = environment()) + }) +enriched.motif <- list( + "P53_HUMAN.H11MO.1.A"= c( + "cg00329272", "cg10097755", "cg08928189", + "cg17153775", "cg21156590", "cg19749688", "cg12590404", + "cg24517858", "cg00329272", "cg09010107", "cg15386853", + "cg10097755", "cg09247779", "cg09181054" + ) +) +TF <- get.TFs( + data, + enriched.motif, + group.col = "definition", + group1 = "Primary solid Tumor", + group2 = "Solid Tissue Normal", + TFs = data.frame( + external_gene_name=c("TP53","TP63","TP73"), + ensembl_gene_id= c( + "ENSG00000141510", + "ENSG00000073282", + "ENSG00000078900" + ), + stringsAsFactors = FALSE + ), + label = "hypo" +) +# This case will use Uniprot dabase to get list of Trasncription factors +TF <- get.TFs( + data, + group.col = "definition", + group1 = "Primary solid Tumor", + group2 = "Solid Tissue Normal", + enriched.motif, + label = "hypo" +) +} +\references{ +Yao, Lijing, et al. "Inferring regulatory element landscapes and transcription +factor networks from cancer methylomes." Genome biology 16.1 (2015): 1. +} +\author{ +Lijing Yao (creator: lijingya@usc.edu) +Tiago C Silva (maintainer: tiagochst@usp.br) +} diff --git a/man/get.diff.meth.Rd b/man/get.diff.meth.Rd new file mode 100644 index 00000000..7b3dba7c --- /dev/null +++ b/man/get.diff.meth.Rd @@ -0,0 +1,119 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Main_function.R +\name{get.diff.meth} +\alias{get.diff.meth} +\title{Identify hypo/hyper-methylated CpG sites between two groups (i.e. normal vs tumor samples, treated vs untreated).} +\usage{ +get.diff.meth( + data, + diff.dir = "hypo", + cores = 1, + mode = "unsupervised", + minSubgroupFrac = 0.2, + pvalue = 0.01, + group.col, + min.samples = 5, + group1, + group2, + test = t.test, + sig.dif = 0.3, + dir.out = "./", + save = TRUE +) +} +\arguments{ +\item{data}{A multiAssayExperiment with DNA methylation and Gene Expression data. +See \code{\link{createMAE}} function.} + +\item{diff.dir}{A character can be "hypo", "hyper" or "both", showing differential +methylation direction. It can be "hypo" which is only selecting hypomethylated probes (one tailed test); +"hyper" which is only selecting hypermethylated probes (one tailed test); +or "both" which are probes differenly methylated (two tailed test).} + +\item{cores}{A interger which defines the number of cores to be used in parallel +process. Default is 1: no parallel process.} + +\item{mode}{A character. Can be "unsupervised" or "supervised". If "supervised", the +minSubgroupFrac argument will be set to 1 to use all samples from both groups to find the +differently methylated regions. The supervised mode should be used when all samples from both +groups are considered homogenous (i.e. treated vs untreated, molecular subtype A vs molecular subtype B), +while unsupervised mode should be used when there is at least one group with heterogenous samples +(i.e tumor samples).} + +\item{minSubgroupFrac}{A number ranging from 0 to 1, +specifying the fraction of extreme samples from group 1 and group 2 +that are used to identify the differential DNA methylation. +The default is 0.2 because we typically want to be able to detect a specific +(possibly unknown) molecular subtype among tumor; these subtypes often make up only +a minority of samples, and 20\% was chosen as a lower bound for the purposes of statistical power. +If you are using pre-defined group labels, such as treated replicates vs. untreated replicated, +use a value of 1.0 (Supervised mode)} + +\item{pvalue}{A number specifies the significant P value (adjusted P value by BH) +threshold Limit for selecting significant hypo/hyper-methylated probes. Default is 0.01 +If pvalue is smaller than pvalue than it is considered significant.} + +\item{group.col}{A column defining the groups of the sample. You can view the +available columns using: colnames(MultiAssayExperiment::colData(data)).} + +\item{min.samples}{Minimun number of samples to use in the analysis. Default 5. +If you have 10 samples in one group, minSubgroupFrac is 0.2 this will give 2 samples +in the lower quintile, but then 5 will be used.} + +\item{group1}{A group from group.col. ELMER will run group1 vs group2. +That means, if direction is hyper, get probes +hypermethylated in group 1 compared to group 2.} + +\item{group2}{A group from group.col. ELMER will run group1 vs group2. +That means, if direction is hyper, get probes +hypermethylated in group 1 compared to group 2.} + +\item{test}{Statistical test to be used. Options: t.test (DEFAULT), wilcox.test} + +\item{sig.dif}{A number specifies the smallest DNA methylation difference as a cutoff for +selecting significant hypo/hyper-methylated probes. Default is 0.3.} + +\item{dir.out}{A path specify the directory for outputs. Default is is current directory.} + +\item{save}{A logic. When TRUE, two getMethdiff.XX.csv files will be generated (see detail)} +} +\value{ +Statistics for all probes and significant hypo or hyper-methylated probes. +} +\description{ +get.diff.meth applys one-way t-test to identify the CpG sites that are significantly +hypo/hyper-methyalated using proportional samples (defined by minSubgroupFrac option) from group 1 +and group 2. The P values will be adjusted by Benjamini-Hochberg method. +Option pvalue and sig.dif will be the criteria (cutoff) for selecting significant +differentially methylated CpG sites. +If save is TURE, two getMethdiff.XX.csv files will be generated (see detail). +} +\details{ +save: + When save is TRUE, function will generate two XX.csv files.The first one is named + getMethdiff.hypo.probes.csv (or getMethdiff.hyper.probes.csv depends on diff.dir). + The first file contains all statistic results for each probe. Based on this + file, user can change different P value or sig.dir cutoff to select the significant results + without redo the analysis. The second file is named getMethdiff.hypo.probes.significant.csv + (or getMethdiff.hyper.probes.significant.csv depends on diff.dir). This file contains + statistic results for the probes that pass the significant criteria (P value and sig.dir). + When save is FALSE, a data frame R object will be generate which contains the same + information with the second file. +} +\examples{ +data <- ELMER:::getdata("elmer.data.example") +Hypo.probe <- get.diff.meth(data, + diff.dir="hypo", + group.col = "definition", + group1 = "Primary solid Tumor", + group2 = "Solid Tissue Normal", + sig.dif = 0.1) # get hypomethylated probes +Hyper.probe <- get.diff.meth(data, + diff.dir="hyper", + group.col = "definition", + sig.dif = 0.1) # get hypomethylated probes +} +\references{ +Yao, Lijing, et al. "Inferring regulatory element landscapes and transcription +factor networks from cancer methylomes." Genome biology 16.1 (2015): 1. +} diff --git a/man/get.enriched.motif.Rd b/man/get.enriched.motif.Rd new file mode 100644 index 00000000..a1ab14ae --- /dev/null +++ b/man/get.enriched.motif.Rd @@ -0,0 +1,119 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Main_function.R +\name{get.enriched.motif} +\alias{get.enriched.motif} +\title{get.enriched.motif to identify the overrepresented motifs in a set of probes (HM450K) regions.} +\usage{ +get.enriched.motif(data, probes.motif, probes, min.motif.quality = "DS", + background.probes, pvalue = 0.05, lower.OR = 1.1, min.incidence = 10, + dir.out = "./", label = NULL, save = TRUE, plot.title="") +} +\arguments{ +\item{data}{A multi Assay Experiment from \code{\link{createMAE}} function. +If set and probes.motif/background probes are missing this will be used to get +this other two arguments correctly. This argument is not require, you can set probes.motif and +the backaground.probes manually.} + +\item{probes.motif}{A matrix contains motifs occurrence within probes regions. Probes.motif in +\pkg{ELMER.data} will be used if probes.motif is missing (detail see Probes.motif.hg19.450K in ELMER.data).} + +\item{probes}{A vector lists the name of probes to define the set of probes in which motif enrichment +OR and confidence interval will be calculated.} + +\item{min.motif.quality}{Minimum motif quality score to consider. +Possible valules: A, B, C , D, AS (A and S), BS (A, B and S), CS (A, B , C and S), DS (all - default) +Description: Each PWM has a quality rating from A to D where +A represents motifs with the highest confidence, and D motifs only weakly describe the pattern with a +limited applications for quantitative analyses. +Special S quality marks the single-box motifs (secondary motif). +Source: http://hocomoco.autosome.ru/help#description_quality_score +More information: \url{http://nar.oxfordjournals.org/content/44/D1/D116.full#sec-8}} + +\item{background.probes}{A vector lists name of probes which are considered as +background for motif.enrichment calculation (see detail).} + +\item{pvalue}{FDR P-value cut off (default 0.05)} + +\item{lower.OR}{A number specifies the smallest lower boundary of 95\% confidence interval for Odds Ratio. +The motif with higher lower boudnary of 95\% confidence interval for Odds Ratio than the number +are the significantly enriched motifs (detail see reference).} + +\item{min.incidence}{A non-negative integer specifies the minimum incidence of motif in the given probes set. +10 is default.} + +\item{dir.out}{A path. Specifies the directory for outputs. Default is current directory} + +\item{label}{A character. Labels the outputs such as "hypo", "hyper"} + +\item{save}{If save is TURE, two files will be saved: getMotif.XX.enriched.motifs.rda and +getMotif.XX.motif.enrichment.csv (see detail).} + +\item{plot.title}{Plot title. Default: no title.} +} +\value{ +A list contains enriched motifs with the probes regions harboring the motif. + +A list (R object) with enriched motifs as name and probes containing the enriched + motif as contents. And hypo.motif.enrichment.pdf plot will be generated. +} +\description{ +get.enriched.motif is a function make use of Probes.motif data from \pkg{ELMER.data} +package to calculate the motif enrichment Odds Ratio and 95\% confidence interval for +a given set of probes using fisher test function, after performing the Fisher's exact test, +the results for all transcription factors are corrected for multiple testing with the Benjamini-Hochberg procedure. +If save is TURE, two output files will be saved: +getMotif.XX.enriched.motifs.rda and getMotif.XX.motif.enrichment.csv (see detail). +} +\details{ +background.probes: + For enhancer study, it is better to use probes within distal enhancer probes as + background.probes. For promoter study, it is better to use probes within promoter + regions as background.probes. Because enhancer and promoter have different CG content + and harbors different clusters of TFs motif. + + save: + if save is TRUE, two files will be save on the disk. The first file is + getMotif.XX.motif.enrichment.csv (XX depends on option label). This file reports + the Odds Ratio and 95\% confidence interval for these Odds Ratios which pass the + significant cutoff (lower.OR and min.incidence). The second file is + getMotif.XX.enriched.motifs.rda (XX depends on option lable). This file contains + a list R object with enriched motifs as name and probes containing the enriched + motif as contents. This object will be used in \code{\link{get.TFs}} function. + if save is FALSE, the function will return a R object which is the same with second file. +} +\examples{ +probes <- c("cg00329272","cg10097755","cg08928189", "cg17153775","cg21156590", +"cg19749688","cg12590404","cg24517858","cg00329272","cg09010107", +"cg15386853", "cg10097755", "cg09247779","cg09181054","cg19371916") + data <- tryCatch(ELMER:::getdata("elmer.data.example"), error = function(e) { + message(e) + data(elmer.data.example, envir = environment()) + }) +bg <- rownames(getMet(data)) +data(Probes.motif.hg38.450K,package = "ELMER.data") +enriched.motif <- get.enriched.motif( + probes.motif = Probes.motif.hg38.450K, + probes = probes, + background.probes = bg, + pvalue = 1, + min.incidence = 2, + label = "hypo" +) +# If the MAE is set, the background and the probes.motif will +# be automatically set +enriched.motif <- get.enriched.motif( + data = data, + min.motif.quality = "DS", + probes=probes, + pvalue = 1, + min.incidence=2, + label="hypo" +) +} +\references{ +Yao, Lijing, et al. "Inferring regulatory element landscapes and transcription +factor networks from cancer methylomes." Genome biology 16.1 (2015): 1. +} +\author{ +Lijing Yao (creator: lijingya@usc.edu) +} diff --git a/man/get.feature.probe.Rd b/man/get.feature.probe.Rd new file mode 100644 index 00000000..bb9fc32a --- /dev/null +++ b/man/get.feature.probe.Rd @@ -0,0 +1,77 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Main_function.R +\name{get.feature.probe} +\alias{get.feature.probe} +\title{get.feature.probe to select probes within promoter regions or distal regions.} +\usage{ +get.feature.probe( + feature = NULL, + TSS, + genome = "hg38", + met.platform = "450K", + TSS.range = list(upstream = 2000, downstream = 2000), + promoter = FALSE, + rm.chr = NULL +) +} +\arguments{ +\item{feature}{A GRange object containing biofeature coordinate such as +enhancer coordinates. +If NULL only distal probes (2Kbp away from TSS will be selected) +feature option is only usable when promoter option is FALSE.} + +\item{TSS}{A GRange object contains the transcription start sites. When promoter is FALSE, Union.TSS +in \pkg{ELMER.data} will be used for default. When promoter is TRUE, UCSC gene TSS will +be used as default (see detail). User can specify their own preference TSS annotation.} + +\item{genome}{Which genome build will be used: hg38 (default) or hg19.} + +\item{met.platform}{DNA methyaltion platform to retrieve data from: EPIC or 450K (default)} + +\item{TSS.range}{A list specify how to define promoter regions. +Default is upstream =2000bp and downstream=2000bp.} + +\item{promoter}{A logical.If TRUE, function will ouput the promoter probes. +If FALSE, function will ouput the distal probes overlaping with features. The +default is FALSE.} + +\item{rm.chr}{A vector of chromosome need to be remove from probes such as chrX chrY or chrM} +} +\value{ +A GRange object containing probes that satisfy selecting critiria. +} +\description{ +get.feature.probe is a function to select the probes falling into +distal feature regions or promoter regions. + +This function selects the probes on HM450K that either overlap +distal biofeatures or TSS promoter. +} +\details{ +In order to get real distal probes, we use more comprehensive annotated TSS by both + GENCODE and UCSC. However, to get probes within promoter regions need more + accurate annotated TSS such as UCSC. Therefore, there are different settings for + promoter and distal probe selection. But user can specify their own favorable + TSS annotation. Then there won't be any difference between promoter and distal + probe selection. + @return A GRanges object contains the coordinate of probes which locate + within promoter regions or distal feature regions such as union enhancer from REMC and FANTOM5. + @usage get.feature.probe( + feature, + TSS, + TSS.range = list(upstream = 2000, downstream = 2000), + promoter = FALSE, rm.chr = NULL + ) +} +\examples{ +# get distal enhancer probe +\dontrun{ +Probe <- get.feature.probe() +} +# get promoter probes +\dontrun{ +Probe <- get.feature.probe(promoter=FALSE) +} +# get distal enhancer probe remove chrX chrY +Probe2 <- get.feature.probe(rm.chr=c("chrX", "chrY")) +} diff --git a/man/get.pair.Rd b/man/get.pair.Rd new file mode 100644 index 00000000..6bd6a1c7 --- /dev/null +++ b/man/get.pair.Rd @@ -0,0 +1,156 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Main_function.R +\name{get.pair} +\alias{get.pair} +\title{get.pair to predict enhancer-gene linkages.} +\usage{ +get.pair(data, + nearGenes, + minSubgroupFrac = 0.4, + permu.size = 10000, + permu.dir = NULL, + raw.pvalue = 0.001, + Pe = 0.001, + mode = "unsupervised", + diff.dir = NULL, + dir.out = "./", + diffExp = FALSE, + group.col, + group1 = NULL, + group2 = NULL, + cores = 1, + correlation = "negative", + filter.probes = TRUE, + filter.portion = 0.3, + filter.percentage = 0.05, + label = NULL, + addDistNearestTSS = FALSE, + save = TRUE) +} +\arguments{ +\item{data}{A multiAssayExperiment with DNA methylation and Gene Expression data. +See \code{\link{createMAE}} function.} + +\item{nearGenes}{Can be either a list containing output of GetNearGenes +function or path of rda file containing output of GetNearGenes function.} + +\item{minSubgroupFrac}{A number ranging from 0 to 1, specifying the fraction of +extreme samples that define group U (unmethylated) and group M (methylated), +which are used to link probes to genes. +The default is 0.4 (the lowest quintile of samples is the U group and the highest quintile samples is the M group) +because we typically want to be able to detect a specific (possibly unknown) molecular subtype among tumor; +these subtypes often make up only a minority of samples, and 20\% was chosen as a lower bound for the purposes of statistical power. +If you are using pre-defined group labels, such as treated replicates vs. untreated replicated, use a value of 1.0 (Supervised mode).} + +\item{permu.size}{A number specify the times of permuation used in the unsupervised mode. Default is 10000.} + +\item{permu.dir}{A path where the output of permutation will be.} + +\item{raw.pvalue}{A number specify the raw p-value cutoff for defining significant pairs. +Default is 0.001. It will select the significant P value cutoff before calculating the empirical p-values.} + +\item{Pe}{A number specify the empirical p-value cutoff for defining significant pairs. +Default is 0.001} + +\item{mode}{A character. Can be "unsupervised" or "supervised". If unsupervised is set +the U (unmethylated) and M (methylated) groups will be selected +among all samples based on methylation of each probe. +Otherwise U group and M group will set as the samples of group1 or group2 as described below: +If diff.dir is "hypo, U will be the group 1 and M the group2. +If diff.dir is "hyper" M group will be the group1 and U the group2.} + +\item{diff.dir}{A character can be "hypo" or "hyper", showing differential +methylation direction in group 1. It can be "hypo" which means the probes are hypomethylated in group1; +"hyper" which means the probes are hypermethylated in group1; +This argument is used only when mode is supervised nad +it should be the same value from get.diff.meth function.} + +\item{dir.out}{A path specify the directory for outputs. Default is current directory} + +\item{diffExp}{A logic. Default is FALSE. If TRUE, t test will be applied to +test whether putative target gene are differentially expressed between two groups.} + +\item{group.col}{A column defining the groups of the sample. You can view the +available columns using: colnames(MultiAssayExperiment::colData(data)).} + +\item{group1}{A group from group.col. ELMER will run group1 vs group2. +That means, if direction is hyper, get probes +hypermethylated in group 1 compared to group 2.} + +\item{group2}{A group from group.col. ELMER will run group1 vs group2. +That means, if direction is hyper, get probes +hypermethylated in group 1 compared to group 2.} + +\item{cores}{A interger which defines number of core to be used in parallel process. +Default is 1: don't use parallel process.} + +\item{correlation}{Type of correlation to evaluate (negative or positive). +Negative (default) checks if hypomethylated region has a upregulated target gene. +Positive checks if region hypermethylated has a upregulated target gene.} + +\item{filter.probes}{Should filter probes by selecting only probes that have at least +a certain number of samples below and above a certain cut-off. +See \code{\link{preAssociationProbeFiltering}} function.} + +\item{filter.portion}{A number specify the cut point to define binary methylation level for probe loci. +Default is 0.3. When beta value is above 0.3, the probe is methylated and +vice versa. For one probe, the percentage of methylated and unmethylated samples +should be above filter.percentage value. +Only used if filter.probes is TRUE. See \code{\link{preAssociationProbeFiltering}} function.} + +\item{filter.percentage}{Minimun percentage of samples to be considered in methylated and unmethylated +for the filter.portion option. Default 5\%. Only used if filter.probes is TRUE. + See \code{\link{preAssociationProbeFiltering}} function.} + +\item{label}{A character labels the outputs.} + +\item{addDistNearestTSS}{Calculated distance to the nearest TSS instead of gene distance. +Having to calculate the distance to nearest TSS will take some time.} + +\item{save}{Two files will be saved if save is true: getPair.XX.all.pairs.statistic.csv +and getPair.XX.pairs.significant.csv (see detail).} +} +\value{ +Statistics for all pairs and significant pairs +} +\description{ +get.pair is a function to predict enhancer-gene linkages using associations between +DNA methylation at enhancer CpG sites and expression of 20 nearby genes of the CpG sites +(see reference). Two files will be saved if save is true: getPair.XX.all.pairs.statistic.csv +and getPair.XX.pairs.significant.csv (see detail). +} +\examples{ +data <- ELMER:::getdata("elmer.data.example") +nearGenes <- GetNearGenes(TRange=getMet(data)[c("cg00329272","cg10097755"),], + geneAnnot=getExp(data)) +Hypo.pair <- get.pair(data=data, + nearGenes=nearGenes, + permu.size=5, + group.col = "definition", + group1 = "Primary solid Tumor", + group2 = "Solid Tissue Normal", + raw.pvalue = 0.2, + Pe = 0.2, + dir.out="./", + label= "hypo") + +Hypo.pair <- get.pair(data = data, + nearGenes = nearGenes, + permu.size = 5, + raw.pvalue = 0.2, + Pe = 0.2, + dir.out = "./", + diffExp = TRUE, + group.col = "definition", + group1 = "Primary solid Tumor", + group2 = "Solid Tissue Normal", + label = "hypo") +} +\references{ +Yao, Lijing, et al. "Inferring regulatory element landscapes and transcription +factor networks from cancer methylomes." Genome biology 16.1 (2015): 1. +} +\author{ +Lijing Yao (creator: lijingya@usc.edu) +Tiago C Silva (maintainer: tiagochst@usp.br) +} diff --git a/man/get.permu.Rd b/man/get.permu.Rd new file mode 100644 index 00000000..55f5e8fe --- /dev/null +++ b/man/get.permu.Rd @@ -0,0 +1,69 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Main_function.R +\name{get.permu} +\alias{get.permu} +\title{get.permu to generate permutation results for calculation of empirical P values for +each enhancer-gene linkage.} +\usage{ +get.permu(data, + geneID, + methy = NULL, + unmethy = NULL, + percentage = 0.2, + rm.probes = NULL, + correlation = "negative", + permu.size = 10000, + permu.dir = NULL, + cores = 1) +} +\arguments{ +\item{data}{A multiAssayExperiment with DNA methylation and Gene Expression data. See \code{\link{createMAE}} function.} + +\item{geneID}{A vector lists the genes' ID.} + +\item{methy}{Index of M (methylated) group.} + +\item{unmethy}{Index of U (unmethylated) group.} + +\item{percentage}{A number ranges from 0 to 1 specifying the percentage of samples of group 1 and group 2 +groups used to link probes to genes. Default is 0.2.} + +\item{rm.probes}{A vector lists the probes name.} + +\item{correlation}{Type of correlation to identify. Default is negative: look for hypomethylation and increase target expression.} + +\item{permu.size}{A number specify the times of permuation. Default is 10000.} + +\item{permu.dir}{A path where the output of permuation will be.} + +\item{cores}{A interger which defines number of core to be used in parallel process. +Default is 1: don't use parallel process.} +} +\value{ +Permutations +} +\description{ +get.permu is a function to use the same statistic model to calculate random enhancer-gene +pairs. Based on the permutation value, empirical P value can be calculated for the +real enhancer-gene pair (see reference). +} +\note{ +Permutation is the most time consuming step. It is recommended to use multiple +cores for this step. Default permutation time is 1000 which may need 12 hrs by 4 cores. +However 10,000 permutations is recommended to get high confidence results. But it may cost 2 days. +} +\examples{ +data <- ELMER:::getdata("elmer.data.example") +permu <-get.permu(data = data, + geneID=rownames(getExp(data)), + rm.probes=c("cg00329272","cg10097755"), + permu.size=5) +} +\references{ +Yao, Lijing, et al. "Inferring regulatory element landscapes and transcription +factor networks from cancer methylomes." Genome biology 16.1 (2015): 1. +} +\author{ +Lijing Yao (creator: lijingya@usc.edu) +Tiago C Silva (maintainer: tiagochst@usp.br) +} diff --git a/man/get.tab.Rd b/man/get.tab.Rd new file mode 100644 index 00000000..4e7817ad --- /dev/null +++ b/man/get.tab.Rd @@ -0,0 +1,66 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plots.R +\name{get.tab} +\alias{get.tab} +\title{summarize MR TF as a binary table with 1 if TF +was found in the analysis, 0 if not} +\usage{ +get.tab(dir, classification, top = TRUE) +} +\arguments{ +\item{dir}{Directory with ELMER results} + +\item{classification}{Which columns to retrieve family or subfamily} + +\item{top}{Consider only top 1 within each (sub)family} +} +\description{ +summarize MR TF as a binary table with 1 if TF +was found in the analysis, 0 if not +} +\examples{ +\dontrun{ +dir.create("out") +dir.create("out2") +data <- tryCatch( + ELMER:::getdata("elmer.data.example"), + error = function(e) { + message(e) + data(elmer.data.example, envir = environment()) + }) +enriched.motif <- list("P53_HUMAN.H11MO.1.A"= c("cg00329272", "cg10097755", "cg08928189", + "cg17153775", "cg21156590", "cg19749688", "cg12590404", + "cg24517858", "cg00329272", "cg09010107", "cg15386853", + "cg10097755", "cg09247779", "cg09181054")) +TF <- get.TFs(data, + enriched.motif, + group.col = "definition", + group1 = "Primary solid Tumor", + group2 = "Solid Tissue Normal", + TFs = data.frame( + external_gene_name=c("TP53","TP63","TP73"), + ensembl_gene_id= c("ENSG00000141510", + "ENSG00000073282", + "ENSG00000078900"), + stringsAsFactors = FALSE), + dir.out = "out", + label="hypo") +TF <- get.TFs(data, + enriched.motif, + group.col = "definition", + group1 = "Primary solid Tumor", + group2 = "Solid Tissue Normal", + TFs = data.frame( + external_gene_name=c("TP53","TP63","TP73"), + ensembl_gene_id= c("ENSG00000141510", + "ENSG00000073282", + "ENSG00000078900"), + stringsAsFactors = FALSE), + dir.out = "out2", + label="hypo") + ta.family <- get.tab(dir = c("out","out2"),classification = "family") + ta.subfamily <- get.tab(dir = c("out","out2"),classification = "subfamily") + unlink("out") + unlink("out2") +} +} diff --git a/man/get.tabs.Rd b/man/get.tabs.Rd new file mode 100644 index 00000000..86b68ff7 --- /dev/null +++ b/man/get.tabs.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plots.R +\name{get.tabs} +\alias{get.tabs} +\title{Creating matrix for MR TF heatmap} +\usage{ +get.tabs(dir, classification = "family", top = TRUE) +} +\arguments{ +\item{dir}{Vector ofr directory with results} + +\item{classification}{Consider family or subfamily} + +\item{top}{Consider only top 1 within each (sub)family} +} +\description{ +Code used to create matrix for MR TF heatmap +} +\examples{ +\dontrun{ +elmer.results <- dirname( +dir(path = "analysis", + pattern = "*.hypo.pairs.significant.csv", + recursive = T, + full.names = T, + all.files = T)) +tabs <- get.tabs(dir = elmer.results, classification = "subfamily") +} +} diff --git a/man/get450K.Rd b/man/get450K.Rd new file mode 100644 index 00000000..0d90bc32 --- /dev/null +++ b/man/get450K.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FetchTCGA.R +\name{get450K} +\alias{get450K} +\title{get450K to download HM40K DNA methylation data for certain cancer types from TCGA website. + @description + get450K is a function to download latest version of HM450K DNA methylation + for all samples of certain cancer types from GDC website.} +\usage{ +get450K(disease, basedir="./Data",filter=0.2, genome = "hg38") +} +\arguments{ +\item{disease}{A character specifies the disease to download from TCGA such as BLCA} + +\item{basedir}{A path. Shows where the data will be stored.} + +\item{filter}{For each probe, the percentage of NA among the all the samples +should smaller than filter.} + +\item{genome}{Data aligned against which genome of reference. Options: "hg19", "hg38" (default)} +} +\value{ +Download all DNA methylation from HM450K level 3 data for + the specified disease. +} +\description{ +get450K to download HM40K DNA methylation data for certain cancer types from TCGA website. + @description + get450K is a function to download latest version of HM450K DNA methylation + for all samples of certain cancer types from GDC website. +} diff --git a/man/getClinic.Rd b/man/getClinic.Rd new file mode 100644 index 00000000..63057e59 --- /dev/null +++ b/man/getClinic.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FetchTCGA.R +\name{getClinic} +\alias{getClinic} +\title{getClinic to download clinic data for certain cancer types from TCGA website.} +\usage{ +getClinic(disease, basedir = "./Data") +} +\arguments{ +\item{disease}{A character specifies the disease to download from TCGA such as BLCA} + +\item{basedir}{A path shows where the data will be stored.} +} +\value{ +Download all clinic information for the specified disease. +} +\description{ +getClinic is a function to download latest version of clinic data for all samples of certain cancer types from TCGA website. +} diff --git a/man/getExp.Rd b/man/getExp.Rd new file mode 100644 index 00000000..fd0fa180 --- /dev/null +++ b/man/getExp.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/methodAccess.R +\name{getExp} +\alias{getExp} +\title{Get Gene expression object from MAE} +\usage{ +getExp(data) +} +\arguments{ +\item{data}{A multiAssayExperiment with DNA methylation and Gene Expression data. See \code{\link{createMAE}} function.} +} +\description{ +Get Gene expression object from MAE +} diff --git a/man/getExpSamples.Rd b/man/getExpSamples.Rd new file mode 100644 index 00000000..bcf886ac --- /dev/null +++ b/man/getExpSamples.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/methodAccess.R +\name{getExpSamples} +\alias{getExpSamples} +\title{Get Gene expression object samples from MAE} +\usage{ +getExpSamples(data) +} +\arguments{ +\item{data}{A multiAssayExperiment with DNA methylation and Gene Expression data. See \code{\link{createMAE}} function.} +} +\description{ +Get Gene expression object samples from MAE +} diff --git a/man/getGeneID.Rd b/man/getGeneID.Rd new file mode 100644 index 00000000..4462c360 --- /dev/null +++ b/man/getGeneID.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/methodAccess.R +\name{getGeneID} +\alias{getGeneID} +\title{getGeneID to report gene id from symbol} +\usage{ +getGeneID(data, symbol) +} +\arguments{ +\item{data}{A multiAssayExperiment with DNA methylation and Gene Expression data. See \code{\link{createMAE}} function.} + +\item{symbol}{A vector of characters which are gene symbols} +} +\value{ +The gene ID for these gene symbols +} +\description{ +getGeneID to report gene id from symbol +} +\examples{ +data <- ELMER:::getdata("elmer.data.example") +getGeneID(data, symbol="ZNF697") +} diff --git a/man/getMet.Rd b/man/getMet.Rd new file mode 100644 index 00000000..819369a5 --- /dev/null +++ b/man/getMet.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/methodAccess.R +\name{getMet} +\alias{getMet} +\title{Get DNA methylation object from MAE} +\usage{ +getMet(data) +} +\arguments{ +\item{data}{A multiAssayExperiment with DNA methylation and Gene Expression data. See \code{\link{createMAE}} function.} +} +\description{ +Get DNA methylation object from MAE +} diff --git a/man/getMetSamples.Rd b/man/getMetSamples.Rd new file mode 100644 index 00000000..a66d5e71 --- /dev/null +++ b/man/getMetSamples.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/methodAccess.R +\name{getMetSamples} +\alias{getMetSamples} +\title{Get DNA methylation object samples from MAE} +\usage{ +getMetSamples(data) +} +\arguments{ +\item{data}{A multiAssayExperiment with DNA methylation and Gene Expression data. See \code{\link{createMAE}} function.} +} +\description{ +Get DNA methylation object samples from MAE +} diff --git a/man/getRNAseq.Rd b/man/getRNAseq.Rd new file mode 100644 index 00000000..7b03e6b7 --- /dev/null +++ b/man/getRNAseq.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FetchTCGA.R +\name{getRNAseq} +\alias{getRNAseq} +\title{getRNAseq to download all RNAseq data for a certain cancer type from TCGA.} +\usage{ +getRNAseq(disease, basedir = "./Data", genome = "hg38") +} +\arguments{ +\item{disease}{A character specifies disease in TCGA such as BLCA} + +\item{basedir}{Download all RNA seq level 3 data for the specified disease.} + +\item{genome}{Data aligned against which genome of reference. Options: "hg19", "hg38" (default)} +} +\value{ +Download all RNA seq level 3 data for the specified disease. +} +\description{ +getRNAseq is a function to download RNAseq data for all samples of a certain cancer type from TCGA +} diff --git a/man/getRandomPairs.Rd b/man/getRandomPairs.Rd new file mode 100644 index 00000000..63d97478 --- /dev/null +++ b/man/getRandomPairs.Rd @@ -0,0 +1,52 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Small.R +\name{getRandomPairs} +\alias{getRandomPairs} +\title{Get random pairs} +\usage{ +getRandomPairs(pairs, genome = "hg38", met.platform = "450K", cores = 1) +} +\arguments{ +\item{pairs}{A data frame with probe, gene and side information. See example below.} + +\item{genome}{Which genome build will be used: hg38 (default) or hg19.} + +\item{met.platform}{DNA methyaltion platform to retrieve data from: EPIC or 450K (default)} + +\item{cores}{A interger which defines the number of cores to be used in parallel +process. Default is 1: no parallel process.} +} +\value{ +A data frame with the random linkages +} +\description{ +This function will receive a pair gene probes and will return a +random object with the following pattern, if a probe is linked to R1 and L3 genes +the random pairs will be a random probes (a distal probe not in the input pairs) +also linked to its R1 and L3 gene. +} +\examples{ +\dontrun{ + data <- ELMER:::getdata("elmer.data.example") + nearGenes <- GetNearGenes(TRange=getMet(data)[c("cg00329272","cg10097755"),], + geneAnnot=getExp(data)) + + pair <- get.pair(data = data, + group.col = "definition", + group1 = "Primary solid Tumor", + group2 = "Solid Tissue Normal", + mode = "supervised", + diff.dir = "hypo", + nearGenes = nearGenes, + permu.size = 5, + raw.pvalue = 0.001, + Pe = 0.2, + dir.out="./", + permu.dir = "permu_test", + label = "hypo") +} + pair <- data.frame(Probe = rep("cg00329272",3), + GeneID = c("ENSG00000116213","ENSG00000130762","ENSG00000149527"), + Sides = c("R5","R2","L4")) + getRandomPairs(pair) +} diff --git a/man/getRegionNearGenes.Rd b/man/getRegionNearGenes.Rd new file mode 100644 index 00000000..c518084b --- /dev/null +++ b/man/getRegionNearGenes.Rd @@ -0,0 +1,47 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/GetNearbyGenes.R +\name{getRegionNearGenes} +\alias{getRegionNearGenes} +\title{Identifies nearest genes to a region} +\usage{ +getRegionNearGenes( + TRange = NULL, + numFlankingGenes = 20, + geneAnnot = NULL, + tssAnnot = NULL +) +} +\arguments{ +\item{TRange}{A GRange object contains coordinate of targets.} + +\item{numFlankingGenes}{A number determine how many gene will be collected from each} + +\item{geneAnnot}{A GRange object contains gene coordinates of for human genome.} + +\item{tssAnnot}{A GRange object contains tss coordinates of for human genome.} +} +\value{ +A data frame of nearby genes and information: genes' IDs, genes' symbols, +} +\description{ +Auxiliary function for GetNearGenes + This will get the closest genes (n=numFlankingGenes) for a target region (TRange) + based on a genome of refenrece gene annotation (geneAnnot). If the + transcript level annotation (tssAnnot) is provided the Distance will be updated to + the distance to the nearest TSS. +} +\examples{ +geneAnnot <- ELMER:::get.GRCh("hg38",as.granges = TRUE) +tssAnnot <- getTSS(genome = "hg38") +probe <- GenomicRanges::GRanges(seqnames = c("chr1","chr2"), +range=IRanges::IRanges(start = c(16058489,236417627), end= c(16058489,236417627)), +name= c("cg18108049","cg17125141")) +names(probe) <- c("cg18108049","cg17125141") +NearbyGenes <- getRegionNearGenes(numFlankingGenes = 20, + geneAnnot = geneAnnot, + TRange = probe, + tssAnnot = tssAnnot) +} +\author{ +Tiago C Silva (maintainer: tiagochst@usp.br) +} diff --git a/man/getSymbol.Rd b/man/getSymbol.Rd new file mode 100644 index 00000000..0cb7ed5a --- /dev/null +++ b/man/getSymbol.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/methodAccess.R +\name{getSymbol} +\alias{getSymbol} +\title{getSymbol to report gene symbol from id} +\usage{ +getSymbol(data, geneID) +} +\arguments{ +\item{data}{A multiAssayExperiment with DNA methylation and Gene Expression data. See \code{\link{createMAE}} function.} + +\item{geneID}{A character which is the ensembl_gene_id} +} +\value{ +The gene symbol for input genes. +} +\description{ +getSymbol to report gene symbol from id +} +\examples{ +data <- ELMER:::getdata("elmer.data.example") +getSymbol(data, geneID="ENSG00000143067") +} diff --git a/man/getTCGA.Rd b/man/getTCGA.Rd new file mode 100644 index 00000000..41bee271 --- /dev/null +++ b/man/getTCGA.Rd @@ -0,0 +1,40 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FetchTCGA.R +\name{getTCGA} +\alias{getTCGA} +\title{getTCGA to download DNA methylation, RNA expression and clinic data for all samples of certain cancer type from TCGA.} +\usage{ +getTCGA(disease, Meth=TRUE, RNA=TRUE, Clinic=TRUE, basedir="./Data", genome = "hg38") +} +\arguments{ +\item{disease}{A character specifies the disease to download in TCGA such as BLCA} + +\item{Meth}{A logic if TRUE HM450K DNA methylation data will download.} + +\item{RNA}{A logic if TRUE RNA-seq Hiseq-V2 from TCGA level 3 will be download.} + +\item{Clinic}{A logic if TRUE clinic data will be download for that disease.} + +\item{basedir}{A path shows where the data will be stored.} + +\item{genome}{Data aligned against which genome of reference. Options: "hg19", "hg38" (default)} +} +\value{ +Download DNA methylation (HM450K)/RNAseq(HiseqV2)/Clinic data for +the specified disease from TCGA. +} +\description{ +getTCGA is a function to download DNA methylation, RNA expression and clinic data for all +samples of certain cancer type from TCGA website. And downloaded data will be transform +to matrixes or data frame for further analysis. +} +\examples{ +getTCGA( + disease = "BRCA", + Meth = FALSE, + RNA = FALSE, + Clinic = TRUE, + basedir = tempdir(), + genome = "hg19" +) +} diff --git a/man/getTF.Rd b/man/getTF.Rd new file mode 100644 index 00000000..9256e3d1 --- /dev/null +++ b/man/getTF.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Small.R +\name{getTF} +\alias{getTF} +\title{Get human TF list from the UNiprot database} +\usage{ +getTF() +} +\value{ +A data frame with the ensemble gene id. +} +\description{ +This function gets the last version of human TF list from the UNiprot database +} diff --git a/man/getTFBindingSites.Rd b/man/getTFBindingSites.Rd new file mode 100644 index 00000000..e8bf9e46 --- /dev/null +++ b/man/getTFBindingSites.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Small.R +\name{getTFBindingSites} +\alias{getTFBindingSites} +\title{Get MR TF binding regions infered by ELMER} +\usage{ +getTFBindingSites( + tf = NULL, + results.dir = NULL, + genome = "hg38", + met.platform = "450K" +) +} +\arguments{ +\item{tf}{TF name} + +\item{results.dir}{path to the directory with the results +(i.e. analysis/unsupervised/definition-Primary.solid.Tumor_vs_Solid.Tissue.Normal/hypo/)} + +\item{genome}{Human genome (hg38, hg19)} + +\item{met.platform}{DNA Methylation Array platform (EPIC, 450K)} +} +\description{ +Saves a bed file with the unmethylated probes (+-250bp) regions that was infered +to be bound by a given TF +} +\examples{ +\dontrun{ + getTFBindingSites("HNF1A", + results.dir = "analysis/unsupervised/group-Tumor_vs_Normal/hypo/") +} +} diff --git a/man/getTFtargets.Rd b/man/getTFtargets.Rd new file mode 100644 index 00000000..b23e0f5e --- /dev/null +++ b/man/getTFtargets.Rd @@ -0,0 +1,60 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Main_function.R +\name{getTFtargets} +\alias{getTFtargets} +\title{Get TF target genes} +\usage{ +getTFtargets( + pairs, + enriched.motif, + TF.result, + dmc.analysis, + mae, + save = TRUE, + dir.out = "./", + classification = "family", + cores = 1, + label = NULL +) +} +\arguments{ +\item{pairs}{Output of get.pairs function: dataframe or file path} + +\item{enriched.motif}{List of probes for each enriched motif: list of file path. +The file created by ELMER is getMotif...enriched.motifs.rda} + +\item{TF.result}{Output get.TF function: dataframe or file path} + +\item{dmc.analysis}{DMC results file or data frame} + +\item{mae}{A multiAssayExperiment outputed from createMAE function} + +\item{save}{A logic. If save is true, a files will be saved: getTFtarget.XX..csv +If save is false, only a data frame contains the same content with the first file.} + +\item{dir.out}{A path specifies the directory for outputs of get.pair function. Default is current directory} + +\item{classification}{use family or subfamily classification to consider potential TF} + +\item{cores}{Number of cores to be used in parallel} + +\item{label}{A character labels the outputs.} +} +\description{ +This function uses ELMER analysis +results and summarizes the possible genes targets for each TF +} +\examples{ +pairs <- data.frame(Probe = c("cg26992600","cg26992800","cg26992900"), + Symbol = c("KEAP1","DSP","ATP86")) +enriched.motif <- list("FOXD3_HUMAN.H11MO.0.D"= c("cg26992800","cg26992900")) +TF.result <- data.frame(motif = c("FOXD3_HUMAN.H11MO.0.D"), + potential.TF.family = c("TP63;TP73")) +getTFtargets(pairs,enriched.motif,TF.result) + +\dontrun{ +getTFtargets("../LUAD_LUSC_analysis_hg38/hyper/getPair.hyper.pairs.significant.csv", +enriched.motif = "../LUAD_analysis_hg38/hyper/getMotif.hyper.enriched.motifs.rda", +TF.result = "../LUAD_analysis_hg38/hyper/getTF.hyper.significant.TFs.with.motif.summary.csv") +} +} diff --git a/man/getTSS.Rd b/man/getTSS.Rd new file mode 100644 index 00000000..eee612f2 --- /dev/null +++ b/man/getTSS.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Small.R +\name{getTSS} +\alias{getTSS} +\title{getTSS to fetch GENCODE gene annotation (transcripts level) from Bioconductor package biomaRt +If upstream and downstream are specified in TSS list, promoter regions of GENCODE gene will be generated.} +\usage{ +getTSS(genome = "hg38", TSS = list(upstream = NULL, downstream = NULL)) +} +\arguments{ +\item{genome}{Which genome build will be used: hg38 (default) or hg19.} + +\item{TSS}{A list. Contains upstream and downstream like TSS=list(upstream, downstream). +When upstream and downstream is specified, coordinates of promoter regions with gene annotation will be generated.} +} +\value{ +GENCODE gene annotation if TSS is not specified. Coordinates of GENCODE gene promoter regions if TSS is specified. +} +\description{ +getTSS to fetch GENCODE gene annotation (transcripts level) from Bioconductor package biomaRt +If upstream and downstream are specified in TSS list, promoter regions of GENCODE gene will be generated. +} +\examples{ +# get GENCODE gene annotation (transcripts level) +\dontrun{ + getTSS <- getTSS() + getTSS <- getTSS(genome.build = "hg38", TSS=list(upstream=1000, downstream=1000)) +} +} +\author{ +Lijing Yao (maintainer: lijingya@usc.edu) +} diff --git a/man/heatmap.main.Rd b/man/heatmap.main.Rd deleted file mode 100644 index 51fcfb72..00000000 --- a/man/heatmap.main.Rd +++ /dev/null @@ -1,35 +0,0 @@ -% Generated by roxygen2 (4.1.0): do not edit by hand -% Please edit documentation in R/Heatmap.Func.R -\name{heatmap.main} -\alias{heatmap.main} -\title{output heatmap} -\usage{ -heatmap.main(x, margins = c(5, 0.5, 0.5, 5), labRow = NULL, labCol = NULL, - nonlab = F, nonlab.row = F, nonlab.col = F, xlab = NULL, - ylab = NULL, col = heat.colors(225), zlim = NULL, cexRow = 0.2 + - 1/log10(nr), cexCol = 0.2 + 1/log10(nc)) -} -\arguments{ -\item{x}{output from cluster.main.} - -\item{margins}{a character vector of variable names to compute margins for.} - -\item{labRow}{a character vector of labels for rows of matrix in x.} - -\item{labCol}{a chracter vector of lables for columns of matrix in x.} - -\item{nonlab}{a boolean to determine no labels for rows and column.} - -\item{nonlab.row}{a boolean to determine no labels for rows.} - -\item{nonlab.row}{a boolean to determine no labels for columns.} - -\item{...}{parameters for image function.} -} -\value{ -A heatmap -} -\description{ -output heatmap -} - diff --git a/man/heatmapGene.Rd b/man/heatmapGene.Rd new file mode 100644 index 00000000..a5a19ecc --- /dev/null +++ b/man/heatmapGene.Rd @@ -0,0 +1,120 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plots.R +\name{heatmapGene} +\alias{heatmapGene} +\title{Heatmap for correlation between probes DNA methylation and a single gene expression.} +\usage{ +heatmapGene( + data, + group.col, + group1, + group2, + pairs, + GeneSymbol, + scatter.plot = FALSE, + correlation.method = "pearson", + correlation.table = FALSE, + annotation.col = NULL, + met.metadata = NULL, + exp.metadata = NULL, + dir.out = ".", + filter.by.probe.annotation = TRUE, + numFlankingGenes = 10, + width = 10, + height = 10, + scatter.plot.width = 10, + scatter.plot.height = 10, + filename = NULL +) +} +\arguments{ +\item{data}{A MultiAssayExperiment with a DNA methylation SummarizedExperiment (all probes) and a gene Expression SummarizedExperiment.} + +\item{group.col}{A column from the sample matrix from the MultiAssayExperiment object. Accessed with colData(mae)} + +\item{group1}{A group from group.col. ELMER will run group1 vs group2. +That means, if direction is hyper, get probes +hypermethylated in group 1 compared to group 2.} + +\item{group2}{A group from group.col. ELMER will run group1 vs group2. +That means, if direction is hyper, get probes +hypermethylated in group 1 compared to group 2.} + +\item{pairs}{List of probe and pair genes} + +\item{GeneSymbol}{Gene Symbol} + +\item{scatter.plot}{Plot scatter plots} + +\item{correlation.method}{Correlation method: Pearson or sperman} + +\item{correlation.table}{save table with spearman correlation analysis ?} + +\item{annotation.col}{A vector of columns from the sample matrix from the MultiAssayExperiment object. Accessed with colData(mae) +to be added as annotation to the heatmap} + +\item{met.metadata}{A vector of metdatada columns available in the DNA methylation GRanges to should be added to the heatmap.} + +\item{exp.metadata}{A vector of metdatada columns available in the Gene expression GRanges to should be added to the heatmap.} + +\item{dir.out}{Where to save the plots} + +\item{filter.by.probe.annotation}{Filter probes to plot based on probes annotation} + +\item{numFlankingGenes}{numFlankingGenes to plot.} + +\item{width}{Figure width} + +\item{height}{Figure height} + +\item{scatter.plot.width}{Scatter plot width} + +\item{scatter.plot.height}{Scatter plot height} + +\item{filename}{File names (.pdf) to save the file (i.e. "plot.pdf"). If NULL return plot.} +} +\value{ +A heatmap +} +\description{ +This heatmap will sort samples by their gene expression and show the DNA methylation levels of the paired probes to that gene. +If no pairs are given, nearest probes will be selected. +To use this function you MAE object (input data) will need all probes and not only the distal ones. +This plot can be used to evaluate promoter, and intro, exons regions and closer distal probes of a gene to verify if their +DNA methylation level is affecting the gene expression +} +\examples{ +\dontrun{ + data <- ELMER:::getdata("elmer.data.example") + group.col <- "subtype_Expression.Subtype" + group1 <- "classical" + group2 <- "secretory" + pairs <- data.frame(ID = c("cg15924102","cg19403323", "cg22396959"), + GeneID = c("ENSG00000196878", "ENSG00000009790", "ENSG00000009790" ), + Symbol = c("TRAF3IP3","LAMB3","LAMB3"), + Side = c("R1","L1","R3"), + Distance = c(6017,168499,0), + stringsAsFactors = FALSE) + heatmapGene(data = data, + group.col = group.col, + group1 = group1, + group2 = group2, + pairs = pairs, + GeneSymbol = "LAMB3", + height = 5, + annotation.col = c("ethnicity","vital_status"), + filename = "heatmap.pdf") + \dontrun{ + heatmapGene(data = data, + group.col = group.col, + group1 = group1, + group2 = group2, + GeneSymbol = "ACP6", + annotation.col = c("ethnicity","vital_status"), + filename = "heatmap_closer_probes.pdf") + } +} +} +\author{ +Tiago Chedraoui Silva (tiagochst at gmail.com) +} diff --git a/man/heatmapPairs.Rd b/man/heatmapPairs.Rd new file mode 100644 index 00000000..308a1ddc --- /dev/null +++ b/man/heatmapPairs.Rd @@ -0,0 +1,87 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plots.R +\name{heatmapPairs} +\alias{heatmapPairs} +\title{Heatmap of pairs gene and probes anti-correlated} +\usage{ +heatmapPairs( + data, + group.col, + group1, + group2, + pairs, + subset = FALSE, + cluster.within.groups = TRUE, + plot.distNearestTSS = FALSE, + annotation.col = NULL, + met.metadata = NULL, + exp.metadata = NULL, + width = 10, + height = 7, + filename = NULL +) +} +\arguments{ +\item{data}{A MultiAssayExperiment with a DNA methylation SummarizedExperiment (all probes) and a gene Expression SummarizedExperiment.} + +\item{group.col}{A column from the sample matrix from the MultiAssayExperiment object. Accessed with colData(mae)} + +\item{group1}{A group from group.col. ELMER will run group1 vs group2. +That means, if direction is hyper, get probes +hypermethylated in group 1 compared to group 2.} + +\item{group2}{A group from group.col. ELMER will run group1 vs group2. +That means, if direction is hyper, get probes +hypermethylated in group 1 compared to group 2.} + +\item{pairs}{List of probe and pair genes} + +\item{subset}{Subset MAE object to keep only groups compared ?} + +\item{cluster.within.groups}{Cluster columns based on the groups} + +\item{plot.distNearestTSS}{Plot track with distNearestTSS ?} + +\item{annotation.col}{A vector of columns from the sample matrix from the MultiAssayExperiment object. Accessed with colData(mae) +to be added as annotation to the heatmap.} + +\item{met.metadata}{A vector of metdatada columns available in the DNA methylation GRanges to should be added to the heatmap.} + +\item{exp.metadata}{A vector of metdatada columns available in the Gene expression GRanges to should be added to the heatmap.} + +\item{width}{Figure width} + +\item{height}{Figure height} + +\item{filename}{File names (.pdf) to save the file (i.e. "plot.pdf"). If NULL return plot.} +} +\value{ +A heatmap +} +\description{ +Heatmp plot of pairs gene and probes anti-correlated +} +\examples{ +\dontrun{ + data <- ELMER:::getdata("elmer.data.example") + group.col <- "subtype_Expression.Subtype" + group1 <- "classical" + group2 <- "secretory" + pairs <- data.frame(Probe = c("cg15924102","cg19403323", "cg22396959"), + GeneID = c("ENSG00000196878", "ENSG00000009790", "ENSG00000009790" ), + Symbol = c("TRAF3IP3","LAMB3","LAMB3"), + Distance = c(6017,168499,0), + Raw.p = c(0.001,0.00001,0.001), + Pe = c(0.001,0.00001,0.001)) + heatmapPairs( + data = data, group.col = group.col, + group1 = group1, group2 = group2, + annotation.col = c("ethnicity","vital_status","age_at_diagnosis"), + pairs, filename = "heatmap.pdf", + height = 4, width = 11 + ) + } +} +\author{ +Tiago Chedraoui Silva (tiagochst at gmail.com) +} diff --git a/man/keyplot.Rd b/man/keyplot.Rd deleted file mode 100644 index fdb2607a..00000000 --- a/man/keyplot.Rd +++ /dev/null @@ -1,27 +0,0 @@ -% Generated by roxygen2 (4.1.0): do not edit by hand -% Please edit documentation in R/Heatmap.Func.R -\name{keyplot} -\alias{keyplot} -\title{keyplot} -\usage{ -keyplot(x, col, breaks, extremes = NULL, texts, margin = c(1, 1, 1, 1), - cex.axis = par("cex.axis")) -} -\arguments{ -\item{x}{A matrix.} - -\item{col}{A vector of colors to define colors.} - -\item{breaks}{a vector of number to define how many label in x axis.} - -\item{extremes}{a vector of number to define the edge value of the key. Default is NULL.} - -\item{...}{parameters for image.} -} -\value{ -A keyplot. -} -\description{ -keyplot -} - diff --git a/man/lm_eqn.Rd b/man/lm_eqn.Rd index 0b0c0ba7..41dc39b9 100644 --- a/man/lm_eqn.Rd +++ b/man/lm_eqn.Rd @@ -1,18 +1,24 @@ -% Generated by roxygen2 (4.1.0): do not edit by hand -% Please edit documentation in R/Heatmap.Func.R +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Small.R \name{lm_eqn} \alias{lm_eqn} \title{lable linear regression formula} \usage{ -lm_eqn(df) +lm_eqn(df, Dep, Exp) } \arguments{ -\item{df}{A data.frame object contains two variables: dependent variable (Dep) and explanation variable (Exp).} +\item{df}{A data.frame object contains two variables: dependent +variable (Dep) and explanation variable (Exp).} + +\item{Dep}{A character specify dependent variable. The first column +will be dependent variable as default.} + +\item{Exp}{A character specify explanation variable. The second column +will be explanation variable as default.} } \value{ -a linear regression formula +A linear regression formula } \description{ lable linear regression formula } - diff --git a/man/metBoxPlot.Rd b/man/metBoxPlot.Rd new file mode 100644 index 00000000..560d95a4 --- /dev/null +++ b/man/metBoxPlot.Rd @@ -0,0 +1,87 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plots.R +\name{metBoxPlot} +\alias{metBoxPlot} +\title{scatter.plot to plot scatter plots between gene expression and DNA methylation.} +\usage{ +metBoxPlot( + data, + group.col, + group1, + group2, + probe, + min.samples = 5, + minSubgroupFrac = 0.2, + diff.dir = "hypo", + legend.col = NULL, + title = NULL, + filename = NULL, + save = TRUE +) +} +\arguments{ +\item{data}{A multiAssayExperiment with DNA methylation and Gene Expression data. +See \code{\link{createMAE}} function.} + +\item{group.col}{A column defining the groups of the sample. You can view the +available columns using: colnames(MultiAssayExperiment::colData(data)).} + +\item{group1}{A group from group.col. ELMER will run group1 vs group2. +That means, if direction is hyper, get probes +hypermethylated in group 1 compared to group 2.} + +\item{group2}{A group from group.col. ELMER will run group1 vs group2. +That means, if direction is hyper, get probes +hypermethylated in group 1 compared to group 2.} + +\item{probe}{Character with probe name (i.e. "cg24517858")} + +\item{min.samples}{Minimun number of samples to use in the analysis. Default 5. +If you have 10 samples in one group, percentage is 0.2 this will give 2 samples +in the lower quintile, but then 5 will be used.} + +\item{minSubgroupFrac}{A number ranges from 0 to 1 specifying the percentage of samples +from group1 and group2 that are used to identify the differential methylation. +Default is 0.2 because we did not expect all cases to be from a single molecular +subtype.But, If you are working with molecular subtypes please set it to 1.} + +\item{diff.dir}{A character can be "hypo" or "hyper", showing differential + methylation dirction. It can be "hypo" which is only selecting hypomethylated probes; +"hyper" which is only selecting hypermethylated probes;} + +\item{legend.col}{legend title} + +\item{title}{plot title} + +\item{filename}{File names (.png) to save the file (i.e. "plot.png")} + +\item{save}{Save plot as PNG} +} +\value{ +Box plot +} +\description{ +scatter.plot is a function to plot various scatter plots between gene expression and +DNA methylation. When byPair is specified, scatter plot for individual probe-gene pairs +will be generated. When byProbe is specified, scatter plots for one probes with nearby +20 gene pairs will be generated. When byTF is specified, scatter plot for TF expression +and average DNA methylation at certain motif sites will be generated. +} +\examples{ +\dontrun{ + data <- ELMER:::getdata("elmer.data.example") + group.col <- "subtype_Expression.Subtype" + group1 <- "classical" + group2 <- "secretory" + metBoxPlot(data, + group.col = group.col, + group1 = group1, + group2 = group2, + probe ="cg17898069", + minSubgroupFrac = 0.2, + diff.dir = "hypo") +} +} +\author{ +Tiago Chedraoui Silva (tiagochst at gmail.com) +} diff --git a/man/motif.enrichment.plot.Rd b/man/motif.enrichment.plot.Rd new file mode 100644 index 00000000..2662ccfd --- /dev/null +++ b/man/motif.enrichment.plot.Rd @@ -0,0 +1,95 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/motif.TF.Plots.R +\name{motif.enrichment.plot} +\alias{motif.enrichment.plot} +\title{motif.enrichment.plot to plot bar plots showing motif enrichment ORs and 95\% confidence interval for ORs} +\usage{ +motif.enrichment.plot(motif.enrichment, + significant = NULL, + dir.out ="./", + save = TRUE, + label = NULL, + title = NULL, + width = 10, + height = NULL, + summary = FALSE) +} +\arguments{ +\item{motif.enrichment}{A data frame or a file path of get.enriched.motif output +motif.enrichment.csv file.} + +\item{significant}{A list to select subset of motif. Default is NULL.} + +\item{dir.out}{A path specify the directory to which the figures will be saved. +Current directory is default.} + +\item{save}{A logic. If true (default), figure will be saved to dir.out.} + +\item{label}{A character. Labels the outputs figure.} + +\item{title}{Plot title. Default: no title} + +\item{width}{Plot width} + +\item{height}{Plot height. If NULL a default value will be calculated} + +\item{summary}{Create a summary table along with the plot, it is necessary +to add two new columns to object (NumOfProbes and PercentageOfProbes)} +} +\value{ +A figure shows the enrichment level for selected motifs. +} +\description{ +motif.enrichment.plot to plot bar plots showing motif enrichment ORs and +95\% confidence interval for ORs. Option motif.enrichment can be a data frame +generated by \code{\link{get.enriched.motif}} or a path of XX.csv saved by the +same function. +} +\details{ +motif.enrichment If input data.frame object, it should contain "motif", +"OR", "lowerOR", "upperOR" columns. motif specifies name of motif; +OR specifies Odds Ratio, lowerOR specifies lower boundary of OR (95%) ; +upperOR specifies upper boundary of OR(95%). + +significant A list used to select subset of motif.enrichment by the +cutoff of OR, lowerOR, upperOR. significant=list(OR=1). More than one cutoff +can be specified such as significant = list(OR=1, lowerOR=1,upperOR=4) +} +\examples{ +motif.enrichment <- data.frame(motif = c("TP53","NR3C1","E2F1","EBF1","RFX5","ZNF143", "CTCF"), + OR = c(19.33,4.83,1, 4.18, 3.67,3.03,2.49), + lowerOR = c(10,3,1.09,1.9,1.5,1.9, 0.82), + upperOR = c(23,5,3,7,6,5,5), + stringsAsFactors = FALSE) +motif.enrichment.plot(motif.enrichment = motif.enrichment, + significant = list(OR = 3), + label = "hypo", save = FALSE) +motif.enrichment.plot(motif.enrichment = motif.enrichment, + significant = list(OR = 3), + label = "hypo", + title = "OR for paired probes hypomethylated in Mutant vs WT", + save = FALSE) +motif.enrichment <- data.frame(motif = c("TP53","NR3C1","E2F1","EBF1","RFX5","ZNF143", "CTCF"), + OR = c(19.33,4.83,1, 4.18, 3.67,3.03,2.49), + lowerOR = c(10,3,1.09,1.9,1.5,1.5, 0.82), + upperOR = c(23,5,3,7,6,5,5), + NumOfProbes = c(23,5,3,7,6,5,5), + PercentageOfProbes = c(0.23,0.05,0.03,0.07,0.06,0.05,0.05), + stringsAsFactors=FALSE) +motif.enrichment.plot(motif.enrichment = motif.enrichment, + significant = list(OR = 3), + label = "hypo", save = FALSE) +motif.enrichment.plot(motif.enrichment = motif.enrichment, + significant = list(OR = 3), + label = "hypo", + summary = TRUE, + title = "OR for paired probes hypomethylated in Mutant vs WT", + save = TRUE) +} +\references{ +Yao, Lijing, et al. "Inferring regulatory element landscapes and transcription +factor networks from cancer methylomes." Genome biology 16.1 (2015): 1. +} +\author{ +Lijing Yao (creator: lijingya@usc.edu) +} diff --git a/man/preAssociationProbeFiltering.Rd b/man/preAssociationProbeFiltering.Rd new file mode 100644 index 00000000..9235c368 --- /dev/null +++ b/man/preAssociationProbeFiltering.Rd @@ -0,0 +1,62 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Small.R +\name{preAssociationProbeFiltering} +\alias{preAssociationProbeFiltering} +\title{Filtering probes} +\usage{ +preAssociationProbeFiltering(data, K = 0.3, percentage = 0.05) +} +\arguments{ +\item{data}{A MultiAssayExperiment with a DNA methylation martrix or a DNA methylation matrix} + +\item{K}{Cut off to consider probes as methylated or unmethylated. Default: 0.3} + +\item{percentage}{The percentage of samples we should have at least considered as methylated and unmethylated} +} +\value{ +An object with the same class, but with the probes removed. +} +\description{ +This function has some filters to the DNA methylation data +in each it selects probes to avoid correlations due to non-cancer +contamination and for additional stringency. + \itemize{ +\item Filter 1: We usually call locus unmethylated when the methylation value < 0.3 and methylated when the methylation value > 0.3. + Therefore Meth_B is the percentage of methylation value > K. + Basically, this step will make sure we have at least a percentage of beta values lesser than K and n percentage of beta values greater K. + For example, if percentage is 5\%, the number of samples 100 and K = 0.3, + this filter will select probes that we have at least 5 (5\% of 100\%) samples have beta values > 0.3 and at least 5 samples have beta values < 0.3. + This filter is importante as true promoters and enhancers usually have a pretty low value (of course purity can screw that up). + we often see lots of PMD probes across the genome with intermediate values like 0.4. + Choosing a value of 0.3 will certainly give some false negatives, but not compared to the number of false positives we thought we might get without this filter. +} +} +\examples{ + random.probe <- runif(100, 0, 1) + bias_l.probe <- runif(100, 0, 0.3) + bias_g.probe <- runif(100, 0.3, 1) + met <- rbind(random.probe,bias_l.probe,bias_g.probe) + met <- preAssociationProbeFiltering(data = met, K = 0.3, percentage = 0.05) + met <- rbind(random.probe,random.probe,random.probe) + met <- preAssociationProbeFiltering(met, K = 0.3, percentage = 0.05) + data <- ELMER:::getdata("elmer.data.example") # Get data from ELMER.data + data <- preAssociationProbeFiltering(data, K = 0.3, percentage = 0.05) + + cg24741609 <- runif(100, 0, 1) + cg17468663 <- runif(100, 0, 0.3) + cg14036402 <- runif(100, 0.3, 1) + met <- rbind(cg24741609,cg14036402,cg17468663) + colnames(met) <- paste("sample",1:100) + exp <- met + rownames(exp) <- c("ENSG00000141510","ENSG00000171862","ENSG00000171863") + sample.info <- S4Vectors::DataFrame(primary = paste("sample",1:100), + sample.type = rep(c("Normal", "Tumor"),50)) + rownames(sample.info) <- colnames(exp) + mae <- createMAE(exp = exp, met = met, colData = sample.info, genome = "hg38") + mae <- preAssociationProbeFiltering(mae, K = 0.3, percentage = 0.05) +} +\references{ +Yao, Lijing, et al. "Inferring regulatory element landscapes and transcription +factor networks from cancer methylomes." Genome biology 16.1 (2015): 1. +Method section (Linking enhancer probes with methylation changes to target genes with expression changes). +} diff --git a/man/promoterMeth.Rd b/man/promoterMeth.Rd new file mode 100644 index 00000000..d7c978ee --- /dev/null +++ b/man/promoterMeth.Rd @@ -0,0 +1,48 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Main_function.R +\name{promoterMeth} +\alias{promoterMeth} +\title{promoterMeth to calculate associations of gene expression with DNA methylation +at promoter regions} +\usage{ +promoterMeth(data, sig.pvalue = 0.01, minSubgroupFrac = 0.4, + upstream = 200, downstream = 2000, save = TRUE, cores = 1) +} +\arguments{ +\item{data}{A Multi Assay Experiment object with DNA methylation and +gene expression Summarized Experiment objects} + +\item{sig.pvalue}{A number specifies significant cutoff for gene silenced by promoter +methylation. Default is 0.01. P value is raw P value without adjustment.} + +\item{minSubgroupFrac}{A number ranging from 0 to 1 +specifying the percentage of samples used to create the groups U (unmethylated) +and M (methylated) used to link probes to genes. +Default is 0.4 (lowest quintile of all samples will be in the +U group and the highest quintile of all samples in the M group).} + +\item{upstream}{Number of bp upstream of TSS to consider as promoter region} + +\item{downstream}{Number of bp downstream of TSS to consider as promoter region} + +\item{save}{A logic. If it is true, the result will be saved.} + +\item{cores}{Number of cores to be used in paralellization. Default 1 (no paralellization)} +} +\value{ +A data frame contains genes whose expression significantly anti-correlated +with promoter methylation. +} +\description{ +promoterMeth is a function to calculate associations of gene expression with DNA methylation +at promoter regions. +} +\details{ +promoterMeth +} +\examples{ +\dontrun{ + data(elmer.data.example.promoter) + Gene.promoter <- promoterMeth(mae.promoter) +} +} diff --git a/man/render_report.Rd b/man/render_report.Rd new file mode 100644 index 00000000..76566c48 --- /dev/null +++ b/man/render_report.Rd @@ -0,0 +1,82 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/render_report.R +\name{render_report} +\alias{render_report} +\title{Build report for TCGA.pipe function} +\usage{ +render_report( + title = "Report", + mae.file, + group.col, + group1, + group2, + direction, + dir.out, + genome = "hg38", + mode = "supervised", + minSubgroupFrac = 0.2, + minMetdiff = 0.3, + metfdr = 0.01, + permu = 10000, + rawpval = 0.01, + pe = 0.01, + nprobes = 10, + lower.OR = 1.1, + out_file = file.path(getwd(), "report.html"), + funcivar = FALSE +) +} +\arguments{ +\item{title}{HTML report title} + +\item{mae.file}{Absolute path to the mae used in the analysis (.rda or .rds)} + +\item{group.col}{Group col} + +\item{group1}{Group 1} + +\item{group2}{Group 2} + +\item{direction}{direction used in the analysis} + +\item{dir.out}{Absolute path to folder with results. dir.out used in the analysis} + +\item{genome}{Genome of reference used in the analysis} + +\item{mode}{mode used in the analysis} + +\item{minSubgroupFrac}{minSubgroupFrac used in the analysis} + +\item{minMetdiff}{minMetdiff used in the analysis} + +\item{metfdr}{metfdr used in the analysis} + +\item{permu}{permu used in the analysis} + +\item{rawpval}{rawpval used in the analysis} + +\item{pe}{pe used in the analysis} + +\item{nprobes}{nprobes used in the analysis} + +\item{lower.OR}{lower.OR used in the analysis} + +\item{out_file}{Output file name (i.e report.html)} + +\item{funcivar}{Include funcivar analysis?} +} +\description{ +Build HTML report +} +\examples{ +\dontrun{ +render_report( + group.col = "TN", + group1 = "Tumor", + group2 = "Normal", + dir.out = "~/paper_elmer/Result/BRCA/TN_Tumor_vs_Normal/hypo/", + direction = "hypo", + mae.file = "~/paper_elmer/Result/BRCA/BRCA_mae_hg38.rda" +) +} +} diff --git a/man/scatter.Rd b/man/scatter.Rd new file mode 100644 index 00000000..edfe8d28 --- /dev/null +++ b/man/scatter.Rd @@ -0,0 +1,55 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Scatter.plot.R +\name{scatter} +\alias{scatter} +\title{scatter} +\usage{ +scatter( + meth, + exp, + legend.title = "Legend", + category = NULL, + xlab = NULL, + ylab = NULL, + ylim = NULL, + dots.size = 0.9, + title = NULL, + correlation = FALSE, + correlation.text.size = 3, + color.value = NULL, + lm_line = FALSE +) +} +\arguments{ +\item{meth}{A vector of number.} + +\item{exp}{A vector of number or matrix with sample in column and gene in rows.} + +\item{legend.title}{Plot legend title} + +\item{category}{A vector of sample labels.} + +\item{xlab}{A character specify the title of x axis.} + +\item{ylab}{A character specify the title of y axis.} + +\item{ylim}{y-axis limit i.e. c(0,25)} + +\item{dots.size}{Control dots size} + +\item{title}{A character specify the figure title.} + +\item{correlation}{Show spearman correlation values} + +\item{correlation.text.size}{Correlation values} + +\item{color.value}{A vector specify the color of each category, such as} + +\item{lm_line}{A logic. If it is TRUE, regression line will be added to the graph.} +} +\value{ +A ggplot figure object +} +\description{ +scatter +} diff --git a/man/scatter.plot.Rd b/man/scatter.plot.Rd new file mode 100644 index 00000000..23a9d396 --- /dev/null +++ b/man/scatter.plot.Rd @@ -0,0 +1,87 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Scatter.plot.R +\name{scatter.plot} +\alias{scatter.plot} +\title{scatter.plot to plot scatter plots between gene expression and DNA methylation.} +\usage{ +scatter.plot(data, + byPair = list(probe = c(), gene = c()), + byProbe = list(probe = c(), numFlankingGenes = 20), + byTF = list(TF = c(), probe = c()), + category = NULL, + ylim = NULL, + dots.size = 0.9, + correlation = FALSE, + width = 7, + height = 6, + dir.out = "./", + save = TRUE, ...) +} +\arguments{ +\item{data}{A multiAssayExperiment with DNA methylation and Gene Expression data. +See \code{\link{createMAE}} function.} + +\item{byPair}{A list: byPair =list(probe=c(),gene=c()); probe contains a vector +of probes' name and gene contains a vector of gene ID. The length of probe +should be the same with length of gene. Output see numFlankingGenes} + +\item{byProbe}{A list byProbe =list(probe=c(), geneNum=20); probe contains +a vector of probes'name and geneNum specify the number of gene near the probes +will ploted. 20 is default for numFlankingGenes Output see detail.} + +\item{byTF}{A list byTF =list(TF=c(), probe=c()); TF contains a vector of TF's +symbol and probe contains the a vector of probes' name. Output see detail.} + +\item{category}{A vector labels subtype of samples or a character which is the +column name in the colData(data) in the multiAssayExperiment object. Once specified, samples +will label different color. The color can be customized by using color.value.} + +\item{ylim}{y-axis limit i.e. c(0,25)} + +\item{dots.size}{Control dots size} + +\item{correlation}{Add pearson correlation values to the plot} + +\item{width}{PDF width} + +\item{height}{PDF height} + +\item{dir.out}{A path specify the directory to which the figures will be saved. +Current directory is default.} + +\item{save}{A logic. If true, figure will be saved to dir.out.} + +\item{...}{color.value, lm_line in scatter function} +} +\value{ +Scatter plots. +} +\description{ +scatter.plot is a function to plot various scatter plots between gene expression and +DNA methylation. When byPair is specified, scatter plot for individual probe-gene pairs +will be generated. When byProbe is specified, scatter plots for one probes with nearby +20 gene pairs will be generated. When byTF is specified, scatter plot for TF expression +and average DNA methylation at certain motif sites will be generated. +} +\details{ +byPair The output will be scatter plot for individual pairs. + +byProbe The output will be scatter plot for the probe and nearby genes. + +byTF The output will be scatter plot for the TFs and the average +DNA methylation at the probes set specified in byTF list. +} +\examples{ +data <- ELMER:::getdata("elmer.data.example") +scatter.plot(data, + byProbe=list(probe=c("cg19403323"),numFlankingGenes=20), + category="definition", save=FALSE) +scatter.plot(data,byProbe=list(probe=c("cg19403323"),numFlankingGenes=20), + category="definition", save=TRUE) ## save to pdf +# b. generate one probe-gene pair +scatter.plot(data,byPair=list(probe=c("cg19403323"),gene=c("ENSG00000143322")), + category="definition", save=FALSE,lm_line=TRUE) +} +\author{ +Lijing Yao (maintainer: lijingya@usc.edu) +} diff --git a/man/schematic.Rd b/man/schematic.Rd deleted file mode 100644 index daae9ba1..00000000 --- a/man/schematic.Rd +++ /dev/null @@ -1,27 +0,0 @@ -% Generated by roxygen2 (4.1.0): do not edit by hand -% Please edit documentation in R/Schematic_plot.R -\name{schematic} -\alias{schematic} -\title{Generate random loci of genome.} -\usage{ -schematic(target.range, Gene.range, special = list(Symbols = c(), colors = - c()), target.col, fn, save = T) -} -\arguments{ -\item{target.range}{A GRange object showing target coordinate.} - -\item{Gene.range}{A GRange oject contains coordinate of a list of genes to show on figure.} - -\item{special}{A list. Symbols is specific the gene that you want to emphasize and colors to specific the color to each gene you want to emphasize.} - -\item{target.col}{A color for target} - -\item{save}{Specific save the graph object or not} -} -\value{ -A graph object or save as a pdf. -} -\description{ -Generate random loci of genome. -} - diff --git a/man/schematic.plot.Rd b/man/schematic.plot.Rd new file mode 100644 index 00000000..0d105b43 --- /dev/null +++ b/man/schematic.plot.Rd @@ -0,0 +1,107 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Schematic.plot.R +\name{schematic.plot} +\alias{schematic.plot} +\title{schematic.plot to plot schematic plots showing the locations of genes and probes.} +\usage{ +schematic.plot(data, + group.col = NULL, + group1 = NULL, + group2 = NULL, + pair, + byProbe, + byGeneID, + byCoordinate=list(chr=c(), start=c(), end=c()), + statehub.tracks, + dir.out="./", + save=TRUE,...) +} +\arguments{ +\item{data}{A Multi Assay Experiment object with DNA methylation and +gene expression Summarized Experiment objects} + +\item{group.col}{A column defining the groups of the sample. You can view the +available columns using: colnames(MultiAssayExperiment::colData(data)).} + +\item{group1}{A group from group.col. ELMER will run group1 vs group2. +That means, if direction is hyper, get probes +hypermethylated in group 1 compared to group 2.} + +\item{group2}{A group from group.col. ELMER will run group1 vs group2. +That means, if direction is hyper, get probes +hypermethylated in group 1 compared to group 2.#' @param byProbe A vector of probe names.} + +\item{pair}{A data frame with three columns: Probe, Gene ID (Ensemble gene ID) +and Pe (empirical p-value). This is the ouput of get.pair function.} + +\item{byProbe}{A vector of probe names} + +\item{byGeneID}{A vector of gene ID} + +\item{byCoordinate}{A list contains chr, start and end. +byCoordinate=list(chr=c(),start=c(),end=c()).} + +\item{statehub.tracks}{Relative path to a statehub track.} + +\item{dir.out}{A path specify the directory for outputs. Default is current directory} + +\item{save}{A logic. If true, figures will be saved to dir.out.} + +\item{...}{Parameters for GetNearGenes} +} +\description{ +schematic.plot is a function to plot schematic plots showing the locations of genes and probes. +} +\details{ +byProbes: + When a vector of probes' name are provided, + function will produce schematic plots for each individual probes. + The schematic plot contains probe, nearby 20 (or the number of gene user specified.) + genes and the significantly linked gene to the probe. + +byGene: + When a vector of gene ID are provided, function will produce schematic plots + for each individual genes. The schematic plot contains the gene and all the + significantly linked probes. + +byCoordinate: + When a genomic coordinate is provided, function will + produce a schematic plot for this coordinate. The schematic plot contains + all genes and significantly linked probes in the range and the significant links. +} +\examples{ +data <- ELMER:::getdata("elmer.data.example") +pair <- data.frame(Probe = c("cg19403323","cg19403323", "cg26403223"), + GeneID = c("ENSG00000196878", "ENSG00000009790", "ENSG00000009790" ), + Symbol = c("TRAF3IP3","LAMB3","LAMB3"), + Raw.p =c(0.001,0.00001,0.001), + Pe = c(0.001,0.00001,0.001)) +schematic.plot(data, + group.col = "definition", + group1 = "Primary solid Tumor", + group2 = "Solid Tissue Normal", + pair = pair, + byProbe = "cg19403323") +schematic.plot(data, + group.col = "definition", + group1 = "Primary solid Tumor", + group2 = "Solid Tissue Normal", + pair = pair, + byGeneID = "ENSG00000009790") + +schematic.plot(data, + group.col = "definition", + group1 = "Primary solid Tumor", + group2 = "Solid Tissue Normal", + pair = pair, + byCoordinate = list(chr="chr1", start = 209000000, end = 209960000)) +\dontrun{ + schematic.plot(data, + group.col = "definition", + group1 = "Primary solid Tumor", + group2 = "Solid Tissue Normal", + pair = pair, + byProbe = "cg19403323", + statehub.tracks = "hg38/ENCODE/mcf-7.16mark.segmentation.bed") +} +} diff --git a/man/side.bars.Rd b/man/side.bars.Rd deleted file mode 100644 index fb53e316..00000000 --- a/man/side.bars.Rd +++ /dev/null @@ -1,29 +0,0 @@ -% Generated by roxygen2 (4.1.0): do not edit by hand -% Please edit documentation in R/Heatmap.Func.R -\name{side.bars} -\alias{side.bars} -\title{making side bars for heatmap} -\usage{ -side.bars(x, side = "colside", order = NULL, margins = c(5, 0.5, 0.5, 5), - lab = NULL, col = heat.colors(225), zlim = NULL, cexlab = 0.2 + - 1/log10(nr)) -} -\arguments{ -\item{x}{A matrix which is same order of the original matrix before cluster. See details} - -\item{side}{A character which are either 'colside' or 'rowside' specifying where the side bar locates.} - -\item{order}{A vector of number specifying order of side bars.} - -\item{...}{parameters for image function.} -} -\value{ -a side bar for heatmap -} -\description{ -making side bars for heatmap -} -\details{ -x must be the matrix. If it is colbars, the row number of col bars should be the same as the col number of the matrix. If it is rowbars, the row number of row bars should be the same as the row number of the matrix -} - diff --git a/man/side.bars2.Rd b/man/side.bars2.Rd deleted file mode 100644 index de03ca74..00000000 --- a/man/side.bars2.Rd +++ /dev/null @@ -1,28 +0,0 @@ -% Generated by roxygen2 (4.1.0): do not edit by hand -% Please edit documentation in R/Heatmap.Func.R -\name{side.bars2} -\alias{side.bars2} -\title{making side bars for heatmap using segment to solve the problem in image that line will very tiny when a lot of samples.} -\usage{ -side.bars2(x, side = "colside", order = NULL, margins = c(5, 0.5, 0.5, 5), - lab = NULL, zlim = NULL, cexlab = 0.2 + 1/log10(nr)) -} -\arguments{ -\item{x}{A matrix which is same order of the original matrix before cluster. See details} - -\item{side}{A character which are either 'colside' or 'rowside' specifying where the side bar locates.} - -\item{order}{A vector of number specifying order of side bars.} - -\item{...}{parameters for image function.} -} -\value{ -a side bar for heatmap -} -\description{ -making side bars for heatmap using segment to solve the problem in image that line will very tiny when a lot of samples. -} -\details{ -x must be the matrix. If it is colbars, the row number of col bars should be the same as the col number of the matrix. If it is rowbars, the row number of row bars should be the same as the row number of the matrix -} - diff --git a/man/summarizeTF.Rd b/man/summarizeTF.Rd new file mode 100644 index 00000000..6519fd11 --- /dev/null +++ b/man/summarizeTF.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Main_function.R +\name{summarizeTF} +\alias{summarizeTF} +\title{Make MR TF binary table} +\usage{ +summarizeTF(files = NULL, path = NULL, classification = "family", top = FALSE) +} +\arguments{ +\item{files}{Output of get.pairs function: dataframe or file path} + +\item{path}{Directory path with the ELMER results. Files with the following pattern +will be selected TF.*with.motif.summary.csv.} + +\item{classification}{Consider subfamily or family classifications} + +\item{top}{Get only the top potential (default) or all potentials} +} +\description{ +This function uses ELMER analysis +results and summarizes the MR TF identified in each analysis +} diff --git a/tests/testthat.R b/tests/testthat.R new file mode 100644 index 00000000..c1a2b7b1 --- /dev/null +++ b/tests/testthat.R @@ -0,0 +1,4 @@ +library(testthat) +library(ELMER) + +test_check("ELMER") \ No newline at end of file diff --git a/tests/testthat/test-createMAE.R b/tests/testthat/test-createMAE.R new file mode 100644 index 00000000..df643bb5 --- /dev/null +++ b/tests/testthat/test-createMAE.R @@ -0,0 +1,213 @@ +context("Testing Multi Assay Experiment creation") +# +# test_that("The creation of a using matrices and no TCGA data with equal colnames in DNA methylation and Gene Expression", { +# # NON TCGA example: matrices has diffetrent column names +# gene.exp <- DataFrame(sample1 = c("ENSG00000141510"=2.3,"ENSG00000171862"=5.4), +# sample2 = c("ENSG00000141510"=1.6,"ENSG00000171862"=2.3) +# ) +# dna.met <- DataFrame(sample1 = c("cg14324200"=0.5,"cg23867494"=0.1), +# sample2 = c("cg14324200"=0.3,"cg23867494"=0.9) +# ) +# sample.info <- DataFrame(sample.type = c("Normal", "Tumor")) +# rownames(sample.info) <- colnames(gene.exp) +# +# suppressMessages({ +# mae <- createMAE(exp = gene.exp, met = dna.met, colData = sample.info, genome = "hg38") +# }) +# expect_equal(metadata(mae)$genome,"hg38") +# expect_false(metadata(mae)$TCGA) +# expect_equal(dim(getExp(mae)),c(2,2)) +# expect_equal(dim(getMet(mae)),c(2,2)) +# expect_equal(assay(getMet(mae)),as.matrix(dna.met)) +# expect_equal(assay(getExp(mae)),as.matrix(gene.exp)) +# expect_equal(colData(mae),sample.info) +# expect_true(all(sampleMap(mae)$assay %in% c("DNA methylation","Gene expression"))) +# }) +# +# test_that("The creation of a using matrices and no TCGA data with different colnames in DNA methylation and Gene Expression", { +# # NON TCGA example: matrices has diffetrent column names +# gene.exp <- DataFrame(sample1.exp = c("ENSG00000141510"=2.3,"ENSG00000171862"=5.4), +# sample2.exp = c("ENSG00000141510"=1.6,"ENSG00000171862"=2.3) +# ) +# dna.met <- DataFrame(sample1.met = c("cg14324200"=0.5,"cg23867494"=0.1), +# sample2.met = c("cg14324200"=0.3,"cg23867494"=0.9) +# ) +# sample.info <- DataFrame(sample.type = c("Normal", "Tumor")) +# rownames(sample.info) <- c("sample1","sample2") +# sampleMap <- DataFrame(primary = c("sample1","sample1","sample2","sample2"), +# colname = c("sample1.exp","sample1.met","sample2.exp","sample2.met")) +# +# suppressMessages({ +# mae <- createMAE(exp = gene.exp, met = dna.met, +# sampleMap = sampleMap, colData = sample.info, genome = "hg19") +# }) +# expect_equal(metadata(mae)$genome,"hg19") +# expect_false(metadata(mae)$TCGA) +# expect_equal(dim(getExp(mae)),c(2,2)) +# expect_equal(dim(getMet(mae)),c(2,2)) +# expect_equal(assay(getMet(mae)),as.matrix(dna.met)) +# expect_equal(assay(getExp(mae)),as.matrix(gene.exp)) +# expect_equal(colData(mae),sample.info) +# expect_true(all(sampleMap(mae)$assay %in% c("DNA methylation","Gene expression"))) +# expect_true(all(c("external_gene_name","ensembl_gene_id") %in% colnames(values(getExp(mae))))) +# }) +# +# test_that("The creation of a using Summarized Experiment objects and TCGA data", { +# # NON TCGA example: matrices has diffetrent column names +# gene.exp <- DataFrame(sample1.exp = c("ENSG00000141510"=2.3,"ENSG00000171862"=5.4), +# sample2.exp = c("ENSG00000141510"=1.6,"ENSG00000171862"=2.3) +# ) +# dna.met <- DataFrame(sample1.met = c("cg14324200"=0.5,"cg23867494"=0.1), +# sample2.met = c("cg14324200"=0.3,"cg23867494"=0.9) +# ) +# sample.info <- DataFrame(sample.type = c("Normal", "Tumor")) +# rownames(sample.info) <- c("sample1","sample2") +# sampleMap <- DataFrame(primary = c("sample1","sample1","sample2","sample2"), +# colname = c("sample1.exp","sample1.met","sample2.exp","sample2.met")) +# +# suppressMessages({ +# mae <- createMAE(exp = gene.exp, met = dna.met, +# sampleMap = sampleMap, colData = sample.info, genome = "hg19") +# }) +# expect_equal(metadata(mae)$genome,"hg19") +# expect_false(metadata(mae)$TCGA) +# expect_equal(dim(getExp(mae)),c(2,2)) +# expect_equal(dim(getMet(mae)),c(2,2)) +# expect_equal(assay(getMet(mae)),as.matrix(dna.met)) +# expect_equal(assay(getExp(mae)),as.matrix(gene.exp)) +# expect_equal(colData(mae),sample.info) +# expect_true(all(sampleMap(mae)$assay %in% c("DNA methylation","Gene expression"))) +# expect_true(all(c("external_gene_name","ensembl_gene_id") %in% colnames(values(getExp(mae))))) +# +# }) +# +# test_that("The creation of a using Summarized Experiment objects and TCGA data", { +# +# # TCGA example using TCGAbiolinks +# # Testing creating MultyAssayExperiment object +# # Load library +# # Consisering it is TCGA and SE +# data <- ELMER:::getdata("elmer.data.example") # Get data from ELMER.data +# +# suppressMessages({ +# mae <- createMAE(exp = getExp(data), +# met = getMet(data), +# TCGA = TRUE, genome = "hg19") +# }) +# expect_equal(metadata(mae)$genome,"hg19") +# expect_true(metadata(mae)$TCGA) +# expect_true(all(sampleMap(mae)$assay %in% c("DNA methylation","Gene expression"))) +# expect_true(all(c("external_gene_name","ensembl_gene_id") %in% colnames(values(getExp(mae))))) +# expect_equal(dim(getMet(mae)),c(101,234)) +# expect_equal(dim(getExp(mae)),c(1026,234)) +# +# +# suppressMessages({ +# mae <- createMAE(exp = getExp(data), +# met = getMet(data), +# TCGA = TRUE, +# genome = "hg38") +# }) +# expect_equal(metadata(mae)$genome,"hg38") +# expect_true(metadata(mae)$TCGA) +# expect_true(all(sampleMap(mae)$assay %in% c("DNA methylation","Gene expression"))) +# expect_true(all(c("external_gene_name","ensembl_gene_id") %in% colnames(values(getExp(mae))))) +# expect_equal(dim(getMet(mae)),c(101,234)) +# expect_equal(dim(getExp(mae)),c(1026,234)) +# +# # Consisering it is TCGA and not SE +# +# suppressMessages({ +# mae <- createMAE(exp = assay(getExp(data)), met = assay(getMet(data)), +# TCGA = TRUE, genome = "hg19") +# }) +# expect_equal(metadata(mae)$genome,"hg19") +# +# suppressMessages({ +# mae <- createMAE(exp = assay(getExp(data)), +# met = assay(getMet(data)), +# TCGA = TRUE, genome = "hg38") +# }) +# expect_equal(metadata(mae)$genome,"hg38") +# +# # Consisering it is not TCGA and SE +# # DNA methylation and gene expression Objects should have same sample names in columns +# not.tcga.exp <- assay(getExp(data)) +# colnames(not.tcga.exp) <- substr(colnames(not.tcga.exp),1,15) +# not.tcga.met <- assay(getMet(data)) +# colnames(not.tcga.met) <- substr(colnames(not.tcga.met),1,15) +# +# phenotype.data <- data.frame(row.names = colnames(not.tcga.exp), +# samples = colnames(not.tcga.exp), +# group = c(rep("group1", length(colnames(not.tcga.exp)) / 2 ), +# rep("group2", length(colnames(not.tcga.exp)) /2 ))) +# +# suppressMessages({ +# mae <- createMAE(exp = not.tcga.exp, met = not.tcga.met, +# TCGA = FALSE, genome = "hg19", colData = phenotype.data) +# }) +# +# }) + +# +# test_that("Number of probes in MAE matches the distal probes", { +# library(TCGAbiolinks) +# library(dplyr) +# gcimp.samples <- TCGAquery_subtype("lgg") %>% dplyr::filter(base::grepl("G-CIMP",Supervised.DNA.Methylation.Cluster,ignore.case = T)) +# #----------------------------------- +# # 2 - Get data +# # ---------------------------------- +# #----------------------------------- +# # 2.1 - DNA Methylation +# # ---------------------------------- +# query <- GDCquery(project = "TCGA-LGG", +# data.category = "DNA Methylation", +# platform = "Illumina Human Methylation 450", +# barcode = gcimp.samples$patient[1:3]) +# GDCdownload(query) +# met <- GDCprepare(query, save = FALSE) +# #----------------------------------- +# # 2 - Expression +# # ---------------------------------- +# query <- GDCquery(project = "TCGA-LGG", +# data.category = "Transcriptome Profiling", +# data.type = "Gene Expression Quantification", +# workflow.type = "HTSeq - FPKM-UQ", +# barcode = gcimp.samples$patient[1:3]) +# GDCdownload(query) +# exp <- GDCprepare(query, save = FALSE) +# for(genome in c("hg38","hg19")){ +# distal.probe <- get.feature.probe(genome = genome, met.platform = "450K") +# mae <- createMAE(exp = exp, +# met = met, +# genome = genome, +# met.platform = "450K", +# linearize.exp = TRUE, +# met.na.cut = 1.1, +# filter.probes = distal.probe, +# TCGA = TRUE) +# +# expect_equal(length(distal.probe),nrow(getMet(mae))) +# expect_equal(metadata(mae)$genome,genome) +# expect_true("TN" %in% colnames(colData(mae))) +# } +# unlink("GDCdata",recursive = TRUE, force = TRUE) +# }) +# +# test_that("Adding mutation column is working", { +# data <- tryCatch(ELMER:::getdata("elmer.data.example"), error = function(e) { +# message(e) +# data(elmer.data.example, envir = environment()) +# }) +# suppressMessages({ +# data <- createMAE(exp = getExp(data), +# met = getMet(data), +# TCGA = TRUE, +# genome = "hg38") +# }) +# data <- addMutCol(data, "LUSC","LDHD") +# expect_true("LDHD" %in% colnames(colData(data))) +# expect_true("WT" %in% colData(data)$LDHD) +# expect_true("Mutant" %in% colData(data)$LDHD) +# expect_true("Normal" %in% colData(data)$LDHD) +# }) diff --git a/tests/testthat/test-data.R b/tests/testthat/test-data.R new file mode 100644 index 00000000..e6dd9055 --- /dev/null +++ b/tests/testthat/test-data.R @@ -0,0 +1,26 @@ +context("Checking if data imported is correctly created") + +test_that("Family of TF is correctly created from HOCOMOCO (TFClass database)", { + # Create list of TF in the same family + family <- createMotifRelevantTfs() + # TP53 TP63 ad TP73 are in the same family + expect_true(all(c("TP53", "TP63", "TP73") %in% family$P53_HUMAN.H11MO.0.A)) + expect_true(all(c("TP53", "TP63", "TP73") %in% family$P63_HUMAN.H11MO.0.A)) + expect_true(all(c("TP53", "TP63", "TP73") %in% family$P73_HUMAN.H11MO.0.A)) + expect_equal(length(family$P53_HUMAN.H11MO.0.A), 3) + expect_equal(length(family$P63_HUMAN.H11MO.0.A), 3) + expect_equal(length(family$P73_HUMAN.H11MO.0.A), 3) +}) + +test_that("Get list of human TF from ELMER.data", { + tf <- getTF() + expect_true(all(c("TP53", "TP63", "TP73") %in% tf$external_gene_name)) +}) + +test_that("Mapping from entrez gene ID to emsemble gene ID is right", { + genes <- c("100887754","100873766","100874231") + mapping <- get.GRCh(genome = "hg19",genes) + expect_equal(mapping[mapping$entrezgene == genes[1],]$ensembl_gene_id,"ENSG00000243300" ) + expect_equal(mapping[mapping$entrezgene == genes[2],]$ensembl_gene_id,"ENSG00000252952" ) + expect_equal(mapping[mapping$entrezgene == genes[3],]$ensembl_gene_id,"ENSG00000227213" ) +}) \ No newline at end of file diff --git a/tests/testthat/test-diffmet.R b/tests/testthat/test-diffmet.R new file mode 100644 index 00000000..5e46a597 --- /dev/null +++ b/tests/testthat/test-diffmet.R @@ -0,0 +1,151 @@ +context("Testing get.diff.meth") + +test_that("The directions should change if we change the groups", { + data <- ELMER:::getdata("elmer.data.example") + Hypo.probe.1 <- get.diff.meth(data, + minSubgroupFrac = 1, + diff.dir="hypo", + group.col = "definition", + group1 = "Primary solid Tumor", + group2 = "Solid Tissue Normal", + sig.dif = 0.1) # get hypomethylated probes + Hyper.probe.1 <- get.diff.meth(data, + minSubgroupFrac = 1, + diff.dir="hyper", + group.col = "definition", + group1 = "Solid Tissue Normal", + group2 = "Primary solid Tumor", + sig.dif = 0.1) # get hypomethylated probes + expect_equal(Hyper.probe.1$probe,Hypo.probe.1$probe) + expect_true(min(Hypo.probe.1$adjust.p) >= 0) + expect_true(max(Hypo.probe.1$adjust.p) <= 1) + expect_true(min(Hypo.probe.1$pvalue) >= 0) + expect_true(max(Hypo.probe.1$pvalue) <= 1) + + expect_equal(round(Hyper.probe.1[,3],digits = 2),-round(Hypo.probe.1[,3],digits = 2)) + + mean.tp <- rowMeans(assay(getMet(data)[Hyper.probe.1$probe[1],colData(data)[sampleMap(data)[sampleMap(data)$assay == "DNA methylation","primary"],"definition"] == "Primary solid Tumor"])) + mean.nt <- rowMeans(assay(getMet(data)[Hyper.probe.1$probe[1],colData(data)[sampleMap(data)[sampleMap(data)$assay == "DNA methylation","primary"],"definition"] == "Solid Tissue Normal"])) + expect_equal(Hyper.probe.1[1,3] > 0, (mean.nt - mean.tp)[[1]] > 0) + + mean.tp <- rowMeans(assay(getMet(data)[Hypo.probe.1$probe[1],colData(data)[sampleMap(data)[sampleMap(data)$assay == "DNA methylation","primary"],"definition"] == "Primary solid Tumor"])) + mean.nt <- rowMeans(assay(getMet(data)[Hypo.probe.1$probe[1],colData(data)[sampleMap(data)[sampleMap(data)$assay == "DNA methylation","primary"],"definition"] == "Solid Tissue Normal"])) + expect_equal(Hypo.probe.1[1,3] < 0, (mean.tp - mean.nt )[[1]] < 0) + + Hyper.probe.2 <- get.diff.meth(data, + minSubgroupFrac = 1, + diff.dir="hyper", + group.col = "definition", + group1 = "Primary solid Tumor", + group2 = "Solid Tissue Normal", + sig.dif = 0.1) # get hypomethylated probes + Hypo.probe.2 <- get.diff.meth(data, + minSubgroupFrac = 1, + diff.dir="hypo", + group.col = "definition", + group1 = "Solid Tissue Normal", + group2 = "Primary solid Tumor", + sig.dif = 0.1) # get hypomethylated probes + expect_equal(Hyper.probe.2$probe,Hypo.probe.2$probe) + expect_equal(round(Hyper.probe.2[,3],digits = 2),-round(Hypo.probe.2[,3],digits = 2)) + + mean.tp <- rowMeans(assay(getMet(data)[Hyper.probe.2$probe[1],colData(data)[sampleMap(data)[sampleMap(data)$assay == "DNA methylation","primary"],"definition"] == "Primary solid Tumor"])) + mean.nt <- rowMeans(assay(getMet(data)[Hyper.probe.2$probe[1],colData(data)[sampleMap(data)[sampleMap(data)$assay == "DNA methylation","primary"],"definition"] == "Solid Tissue Normal"])) + expect_equal(Hyper.probe.2[1,3] > 0, (mean.tp - mean.nt)[[1]] > 0) + + mean.tp <- rowMeans(assay(getMet(data)[Hypo.probe.2$probe[1],colData(data)[sampleMap(data)[sampleMap(data)$assay == "DNA methylation","primary"],"definition"] == "Primary solid Tumor"])) + mean.nt <- rowMeans(assay(getMet(data)[Hypo.probe.1$probe[1],colData(data)[sampleMap(data)[sampleMap(data)$assay == "DNA methylation","primary"],"definition"] == "Solid Tissue Normal"])) + expect_equal(Hypo.probe.2[1,3] < 0, (mean.nt-mean.tp )[[1]] < 0) + +}) + +test_that("The test argument can be changed", { + data <- ELMER:::getdata("elmer.data.example") + + suppressMessages({ + Hypo.probe.1 <- get.diff.meth(data, + minSubgroupFrac = 1, + diff.dir="hypo", + test = t.test, + group.col = "definition", + group1 = "Primary solid Tumor", + group2 = "Solid Tissue Normal", + sig.dif = 0.1) # get hypomethylated probes + Hyper.probe.1 <- get.diff.meth(data, + minSubgroupFrac = 1, + diff.dir="hyper", + test = t.test, + group.col = "definition", + group1 = "Solid Tissue Normal", + group2 = "Primary solid Tumor", + sig.dif = 0.1) # get hypomethylated probes + }) + expect_equal(Hyper.probe.1$probe,Hypo.probe.1$probe) + expect_equal(round(Hyper.probe.1[,3],digits = 2),-round(Hypo.probe.1[,3],digits = 2)) + + mean.tp <- rowMeans(assay(getMet(data)[Hyper.probe.1$probe[1],colData(data)[sampleMap(data)[sampleMap(data)$assay == "DNA methylation","primary"],"definition"] == "Primary solid Tumor"])) + mean.nt <- rowMeans(assay(getMet(data)[Hyper.probe.1$probe[1],colData(data)[sampleMap(data)[sampleMap(data)$assay == "DNA methylation","primary"],"definition"] == "Solid Tissue Normal"])) + expect_equal(Hyper.probe.1[1,3] > 0, (mean.nt - mean.tp)[[1]] > 0) + + mean.tp <- rowMeans(assay(getMet(data)[Hypo.probe.1$probe[1],colData(data)[sampleMap(data)[sampleMap(data)$assay == "DNA methylation","primary"],"definition"] == "Primary solid Tumor"])) + mean.nt <- rowMeans(assay(getMet(data)[Hypo.probe.1$probe[1],colData(data)[sampleMap(data)[sampleMap(data)$assay == "DNA methylation","primary"],"definition"] == "Solid Tissue Normal"])) + expect_equal(Hypo.probe.1[1,3] < 0, (mean.tp - mean.nt )[[1]] < 0) + + + suppressMessages({ + Hyper.probe.2 <- get.diff.meth(data, + minSubgroupFrac = 1, + diff.dir="hyper", + group.col = "definition", + test = t.test, + group1 = "Primary solid Tumor", + group2 = "Solid Tissue Normal", + sig.dif = 0.1) # get hypomethylated probes + Hypo.probe.2 <- get.diff.meth(data, + minSubgroupFrac = 1, + diff.dir="hypo", + test = t.test, + group.col = "definition", + group1 = "Solid Tissue Normal", + group2 = "Primary solid Tumor", + sig.dif = 0.1) # get hypomethylated probes + }) + expect_equal(Hyper.probe.2$probe,Hypo.probe.2$probe) + expect_equal(round(Hyper.probe.2[,3],digits = 2),-round(Hypo.probe.2[,3],digits = 2)) + + mean.tp <- rowMeans(assay(getMet(data)[Hyper.probe.2$probe[1],colData(data)[sampleMap(data)[sampleMap(data)$assay == "DNA methylation","primary"],"definition"] == "Primary solid Tumor"])) + mean.nt <- rowMeans(assay(getMet(data)[Hyper.probe.2$probe[1],colData(data)[sampleMap(data)[sampleMap(data)$assay == "DNA methylation","primary"],"definition"] == "Solid Tissue Normal"])) + expect_equal(Hyper.probe.2[1,3] > 0, (mean.tp - mean.nt)[[1]] > 0) + + mean.tp <- rowMeans(assay(getMet(data)[Hypo.probe.2$probe[1],colData(data)[sampleMap(data)[sampleMap(data)$assay == "DNA methylation","primary"],"definition"] == "Primary solid Tumor"])) + mean.nt <- rowMeans(assay(getMet(data)[Hypo.probe.1$probe[1],colData(data)[sampleMap(data)[sampleMap(data)$assay == "DNA methylation","primary"],"definition"] == "Solid Tissue Normal"])) + expect_equal(Hypo.probe.2[1,3] < 0, (mean.nt-mean.tp )[[1]] < 0) +}) + +test_that("It threats correclty NAs and thrseholds", { + data <- ELMER:::getdata("elmer.data.example") + + suppressMessages({ + diff.probes <- get.diff.meth(data, + minSubgroupFrac = 1, + diff.dir="hypo", + test = t.test, + group.col = "definition", + group1 = "Primary solid Tumor", + group2 = "Solid Tissue Normal", + sig.dif = 0.1, + pvalue = 0.0) # get hypomethylated probes + expect_true(nrow(diff.probes) == 0) + diff.probes <- get.diff.meth(data, + minSubgroupFrac = 1, + diff.dir="hypo", + test = t.test, + group.col = "definition", + group1 = "Primary solid Tumor", + group2 = "Solid Tissue Normal", + sig.dif = 0.0, + pvalue = 1.1) # get hypomethylated probes + }) + expect_equal(nrow(diff.probes), nrow(diff.probes)) +}) + diff --git a/tests/testthat/test-get.enriched.motifs.R b/tests/testthat/test-get.enriched.motifs.R new file mode 100644 index 00000000..b3ca9726 --- /dev/null +++ b/tests/testthat/test-get.enriched.motifs.R @@ -0,0 +1,113 @@ +context("Check get enriched motif function") + +test_that("get enriched motif function returns the expected result", { + data <- ELMER:::getdata("elmer.data.example") + bg <- rownames(getMet(data)) + probes <- bg[1:20] + + # In this case MAFG_HUMAN.H11MO.0.A is the enriched motif + # 1) SP2_HUMAN.H11MO.0.A, has the motif for all probes, but as it has for all bg probes + # it will be conisered false positve + # 2) Has 1 for all the probes and 0 for all background (our best case) + # 3) Has 0 for all cases (should not be selected) + Probes.motif <- data.frame("SP2_HUMAN.H11MO.0.A" = rep(1, length(bg)), + "MAFG_HUMAN.H11MO.0.A" = c(rep(1, length(bg)/2),rep(0, length(bg)/2 + 1)), + "NR2E1_HUMAN.H11MO.0.D" = rep(0, length(bg)), + "TBX15_HUMAN.H11MO.0.D" = rep(0, length(bg))) + rownames(Probes.motif) <- bg + Probes.motif[probes,4] <- 1 + Probes.motif[1,] <- c(0,0,1,0) # The case before will give an execption + + suppressMessages({ + enriched.motif <- get.enriched.motif(probes.motif=Probes.motif, + probes=probes, + pvalue = 1, + lower.OR = 0.1, + min.motif.quality = "D", + background.probes = bg, + label="hypo") + }) + # In this case MAFG_HUMAN.H11MO.0.A is the enriched motif + # 1) SP2_HUMAN.H11MO.0.A, has the motif for all probes, but as it has for all bg probes + # it will be conisered false positve + # 2) Has 1 for all the probes and 0 for all background (our best case) + # 3) Has 0 for all cases (should not be selected) + Probes.motif <- data.frame("SP2_HUMAN.H11MO.0.A" = rep(1, length(bg)), + "MAFG_HUMAN.H11MO.0.A" = c(rep(1, length(bg)/2),rep(0, length(bg)/2 + 1)), + "NR2E1_HUMAN.H11MO.0.D" = rep(0, length(bg))) + rownames(Probes.motif) <- bg + Probes.motif[1,] <- c(0,0,1) # The case before will give an execption + + + suppressMessages({ + enriched.motif <- get.enriched.motif(probes.motif=Probes.motif, + probes = probes, + pvalue = 1, + min.motif.quality = "D", + background.probes = bg, + label="hypo") + }) + expect_equal(names(enriched.motif), "MAFG_HUMAN.H11MO.0.A") + expect_true(all(enriched.motif[[1]] %in% probes)) +}) + +test_that("min.incidence works", { + data <- ELMER:::getdata("elmer.data.example") + bg <- rownames(getMet(data)) + probes <- bg[1:20] + + # In this case MAFG_HUMAN.H11MO.0.A is the enriched motif + # 1) SP2_HUMAN.H11MO.0.A, has the motif for all probes, but as it has for all bg probes + # it will be conisered false positve + # 2) Has 1 for all the probes and 0 for all background (our best case) + # 3) Has 0 for all cases (should not be selected) + Probes.motif <- data.frame("SP2_HUMAN.H11MO.0.A" = rep(1, length(bg)), + "MAFG_HUMAN.H11MO.0.A" = c(rep(1, length(bg)/2),rep(0, length(bg)/2 + 1)), + "NR2E1_HUMAN.H11MO.0.D" = rep(0, length(bg)), + "TBX15_HUMAN.H11MO.0.D" = rep(0, length(bg))) + rownames(Probes.motif) <- bg + Probes.motif[probes,4] <- 1 + Probes.motif[1,] <- c(0,0,1,0) # The case before will give an execption + + suppressMessages({ + + enriched.motif <- get.enriched.motif(probes.motif=Probes.motif, + probes=probes, + min.incidence = 20, + lower.OR = 0.1, + pvalue = 1, + min.motif.quality = "D", + background.probes = bg, + label="hypo") + }) + expect_true(length(enriched.motif) == 0) + + suppressMessages({ + + enriched.motif <- get.enriched.motif(probes.motif=Probes.motif, + probes=probes, + min.incidence = 0, + lower.OR = 0.0, + pvalue = 1, + min.motif.quality = "D", + background.probes = bg, + label="hypo") + }) + expect_true(length(enriched.motif) == ncol(Probes.motif)) + + suppressMessages({ + enriched.motif <- get.enriched.motif(probes.motif=Probes.motif, + probes=probes, + min.incidence = 0, + lower.OR = 0.0, + pvalue = 1, + min.motif.quality = "B", + background.probes = bg, + label="hypo") + }) + expect_true(length(enriched.motif) == 2) + unlink("hypo.quality*",recursive = TRUE, force = TRUE) + unlink("hypo.all*",recursive = TRUE, force = TRUE) + unlink("getMotif.hypo.enriched.motifs.rda",recursive = TRUE, force = TRUE) + unlink("getMotif.hypo.motif.enrichment.csv",recursive = TRUE, force = TRUE) +}) \ No newline at end of file diff --git a/tests/testthat/test-get.pair.R b/tests/testthat/test-get.pair.R new file mode 100644 index 00000000..110c17e5 --- /dev/null +++ b/tests/testthat/test-get.pair.R @@ -0,0 +1,206 @@ +context("Checking get pair function") +test_that("Supervised mode works", { + + data <- ELMER:::getdata("elmer.data.example") + nearGenes <- GetNearGenes(TRange = getMet(data)[c("cg00329272","cg10097755"),], + geneAnnot = getExp(data)) + Hypo.pair <- get.pair(data = data, + group.col = "definition", + group1 = "Primary solid Tumor", + group2 = "Solid Tissue Normal", + mode = "supervised", + diff.dir = "hypo", + nearGenes = nearGenes, + permu.size = 5, + raw.pvalue = 0.001, + Pe = 0.2, + dir.out="./", + permu.dir = "permu_test", + label = "hypo") + # Group 1 is the unmethylated group its expression has to be higher + exp.group1 <- rowMeans(assay(getExp(data)[Hypo.pair$GeneID,colData(data)$definition == "Primary solid Tumor"])) + exp.group2 <- rowMeans(assay(getExp(data)[Hypo.pair$GeneID,colData(data)$definition == "Solid Tissue Normal"])) + expect_true(all(exp.group1 > exp.group2)) + + unlink("permu_test",recursive = TRUE, force = TRUE) + Hyper.pair <- get.pair(data = data, + group.col = "definition", + group1 = "Primary solid Tumor", + group2 = "Solid Tissue Normal", + mode = "supervised", + diff.dir = "hyper", + nearGenes = nearGenes, + permu.size = 5, + raw.pvalue = 0.001, + Pe = 0.2, + dir.out="./", + permu.dir = "permu_test", + label = "hyper") + # Group 2 is the unmethylated group its expression has to be higher + exp.group1 <- rowMeans(assay(getExp(data)[Hyper.pair$GeneID,colData(data)$definition == "Primary solid Tumor"])) + exp.group2 <- rowMeans(assay(getExp(data)[Hyper.pair$GeneID,colData(data)$definition == "Solid Tissue Normal"])) + expect_true(all(exp.group1 < exp.group2)) + + + unlink("permu_test",recursive = TRUE, force = TRUE) +}) +test_that("Function uses correctly the permu.dir", { + data <- ELMER:::getdata("elmer.data.example") + nearGenes <- GetNearGenes(TRange = getMet(data)[c("cg00329272","cg10097755"),], + geneAnnot = getExp(data)) + Hypo.pair <- get.pair(data = data, + group.col = "definition", + group1 = "Primary solid Tumor", + group2 = "Solid Tissue Normal", + nearGenes = nearGenes, + permu.size = 5, + raw.pvalue = 0.1, + Pe = 0.2, + dir.out="./", + permu.dir = "permu_test", + label = "hypo") + # Folder was crreated correcly + expect_true(file.exists("permu_test/permu.rda")) + expect_true(ncol(get(load("permu_test/permu.rda"))) == 5) + # Result correctly use a gene from the nearGene list + expect_true(all(Hypo.pair[Hypo.pair$Probe %in% "cg00329272" ,]$GeneID %in% nearGenes[nearGenes$ID == "cg00329272",]$GeneID)) + expect_true(all(Hypo.pair$Pe < 0.2)) + expect_true(min(Hypo.pair$Pe) >= 0) + expect_true(max(Hypo.pair$Pe) <= 1) + expect_true(min(Hypo.pair$Raw.p) >= 0) + expect_true(max(Hypo.pair$Raw.p) <= 1) + + # If we add one more probe the value should be saved + Hypo.pair <- get.pair(data = data, + nearGenes = nearGenes, + permu.size = 6, + group.col = "definition", + group1 = "Primary solid Tumor", + group2 = "Solid Tissue Normal", + raw.pvalue = 0.001, + Pe = 0.2, + dir.out="./", + permu.dir = "permu_test", + label = "hypo") + # Folder was crreated correcly + expect_true(file.exists("permu_test/permu.rda")) + expect_true(ncol(get(load("permu_test/permu.rda"))) == 6) + + # If we reduce the number of probes + Hypo.pair <- get.pair(data = data, + nearGenes = nearGenes, + group.col = "definition", + group1 = "Primary solid Tumor", + group2 = "Solid Tissue Normal", + permu.size=5, + raw.pvalue = 0.001, + Pe = 0.001, + dir.out="./", + permu.dir = "permu_test", + label= "hypo") + # raw.pvalue filter is working + expect_true(nrow(Hypo.pair) == 0) + + # If we add new genes + nearGenes <- GetNearGenes(TRange = getMet(data)[c("cg00329272","cg10097755","cg22396959"),], + geneAnnot = getExp(data)) + Hypo.pair <- get.pair(data = data, + group.col = "definition", + group1 = "Primary solid Tumor", + group2 = "Solid Tissue Normal", + nearGenes=nearGenes, + permu.size=7, + raw.pvalue = 0.001, + Pe = 0.2, + dir.out="./", + permu.dir = "permu_test", + label= "hypo") + + # If we add new genes and new probes + nearGenes <- GetNearGenes(TRange=rowRanges(getMet(data)[c("cg19403323", + "cg17468663", + "cg00329272", + "cg14036402", + "cg10097755", + "cg22396959"),]), + geneAnnot=rowRanges(getExp(data))) + Hypo.pair <- get.pair(data=data, + group.col = "definition", + group1 = "Primary solid Tumor", + group2 = "Solid Tissue Normal", + nearGenes=nearGenes, + permu.size=10, + raw.pvalue = 0.01, + Pe = 0.2, + dir.out="./", + permu.dir = "permu_test", + label= "hypo") +}) + +#test_that("Gene expression is calculated", { + #data <- ELMER:::getdata("elmer.data.example") + #nearGenes <- GetNearGenes(TRange = getMet(data)[c("cg00329272","cg10097755"),], + # geneAnnot = getExp(data)) + #Hypo.pair <- get.pair(data = data, + # nearGenes = nearGenes, + # permu.size = 5, + # raw.pvalue = 0.001, + # Pe = 0.2, + # diffExp = TRUE, + # group.col = "definition", + # group1 = "Primary solid Tumor", + # group2 = "Solid Tissue Normal", + # dir.out="./", + # permu.dir = "permu_test", + # label = "hypo") + #expect_true(any(grepl("log2FC", colnames(Hypo.pair)))) + #expect_true(any(grepl("pvalue", colnames(Hypo.pair)))) +#}) + + +test_that("Test calculation of Pe (empirical raw.pvalue) from Raw-pvalue is working", { + + # If my raw-raw.pvalue is smaller than for other probes my Pe should be small + # If my raw-raw.pvalue is higher than for other probes my Pe should be higher + # Case 1 (ENSG00000157916):smaller + # Case 2 (ENSG00000149527) intermediarie + # Case 3 (ENSG00000116213) higher + U.matrix <- data.frame("GeneID" = c("ENSG00000157916","ENSG00000149527","ENSG00000116213"), + "Raw.p" = c(0.001, 0.01, 0.1)) + permu <- data.frame("cg13480549" = c(0.1,0.1,0.01), + "cg15128801"= c(0.1,0.001,0.01), + "cg22396959"= c(0.1,0.001,0.01), + "cg13918150"= c(0.1,0.001,0.01), + "cg26403223"= c(0.1,0.1,0.01), + row.names = c("ENSG00000157916","ENSG00000149527","ENSG00000116213") + ) + Pe <- Get.Pvalue.p(U.matrix = U.matrix, permu = permu) + expect_true(Pe[Pe$GeneID == "ENSG00000157916","Pe"] == min(Pe$Pe)) + expect_true(Pe[Pe$GeneID == "ENSG00000149527","Pe"] < max(Pe$Pe) & Pe[Pe$GeneID== "ENSG00000149527","Pe"] > min(Pe$Pe)) + expect_true(Pe[Pe$GeneID == "ENSG00000116213","Pe"] == max(Pe$Pe)) +}) +test_that("Random probe selection is the same for every run", { + probes <- paste0("cg",000000:450000) + set.seed(200); probes.permu <- sample(probes, size = 10000, replace = FALSE) + set.seed(200); probes.permu.rep <- sample(probes, size = 10000, replace = FALSE) + expect_true(all(probes.permu == probes.permu.rep)) + + data <- ELMER:::getdata("elmer.data.example") + permu <- get.permu(data = data, + permu.dir = "test_permu_1", + geneID = rownames(getExp(data))[1], + rm.probes = c("cg00329272","cg10097755"), + permu.size = 51) + permu <- get.permu(data = data, + permu.dir = "test_permu_2", + geneID = rownames(getExp(data))[1], + rm.probes = c("cg00329272","cg10097755"), + permu.size = 51) + probes.permu <- colnames(get(load("test_permu_1/permu.rda"))) + probes.permu.rep <- colnames(get(load("test_permu_2/permu.rda"))) + expect_true(all(probes.permu == probes.permu.rep)) + unlink("test_permu_2",recursive = TRUE, force = TRUE) + unlink("test_permu_1",recursive = TRUE, force = TRUE) + unlink("permu_test",recursive = TRUE, force = TRUE) + unlink("getPair.hypo*",recursive = TRUE, force = TRUE) +}) diff --git a/tests/testthat/test-getRandomPair.R b/tests/testthat/test-getRandomPair.R new file mode 100644 index 00000000..1559ee0c --- /dev/null +++ b/tests/testthat/test-getRandomPair.R @@ -0,0 +1,34 @@ +context("Checking getRandomPair function") +library(plyr) +library(dplyr) +library(data.table) + +test_that("Links are as expected", { + + data <- ELMER:::getdata("elmer.data.example") + links <- GetNearGenes( + TRange = rowRanges(getMet(data)), + geneAnnot = rowRanges(getExp(data)) + ) + links <- links[sample(1:nrow(links),250),] # get 250 random links + random.pairs <- getRandomPairs(links) + + random.pairs %>% + group_by(Probe) %>% + summarize(col1=paste(sort(Side),collapse = ",")) %>% + data.frame() -> sig.pairs.links + + links %>% + group_by(ID) %>% + summarize(col1=paste(sort(Side),collapse = ",")) %>% + data.frame() -> random.pairs.links + + # Same nb of probes ? + expect_true(length(unique(links$ID)) == length(unique(random.pairs$Probe))) + + # Same number of position links + expect_true(all(table(links$Side) == table(random.pairs$Side))) + + # same links per probe + expect_true(all(table(random.pairs.links$col1) == table(sig.pairs.links$col1))) +}) diff --git a/tests/testthat/test-getTF.R b/tests/testthat/test-getTF.R new file mode 100644 index 00000000..07f34db0 --- /dev/null +++ b/tests/testthat/test-getTF.R @@ -0,0 +1,303 @@ +context("Get TF") + +test_that("Correclty shows TF if top 5 TFs cotinas any member of the motif TF family", { + data <- ELMER:::getdata("elmer.data.example") + enriched.motif <- list( + "P53_HUMAN.H11MO.0.A" = c("cg00329272", "cg10097755", + "cg08928189", "cg17153775", + "cg21156590", "cg19749688", + "cg12590404", "cg24517858", + "cg00329272", "cg09010107", + "cg15386853", "cg10097755", + "cg09247779", "cg09181054") + ) + suppressMessages({ + TF <- get.TFs(data = data, + enriched.motif = enriched.motif, + TFs = data.frame( + external_gene_name = c( + "TP53", + "TP63", + "TP73", + "DLX6", + "DMRT1" + ), + ensembl_gene_id = c( + "ENSG00000141510", + "ENSG00000073282", + "ENSG00000078900", + "ENSG00000006377", + "ENSG00000137090"), + stringsAsFactors = FALSE + ), + group.col = "shortLetterCode", + group1 = "TP", + group2 = "NT", + label = "hypo" + ) + }) + + tf.family <- createMotifRelevantTfs() + expect_true(TF$potential.TF.family %in% tf.family$P53_HUMAN.H11MO.0.A) + expect_true(TF$top.potential.TF.family %in% TF$top_5percent) + expect_true(TF$top.potential.TF.family %in% TF$potential.TF.family) + expect_true(is.na(TF$top.potential.TF.subfamily)) + expect_true(TF$potential.TF.family %in% TF$top_5percent) + +}) + +test_that("Correclty shows TF if top5 TFs cotinas any member of the motif TF family", { + data <- ELMER:::getdata("elmer.data.example") + enriched.motif <- list("P53_HUMAN.H11MO.0.A" = c("cg00329272", "cg10097755", + "cg08928189", "cg17153775", + "cg21156590", "cg19749688", + "cg12590404", "cg24517858", + "cg00329272", "cg09010107", + "cg15386853", "cg10097755", + "cg09247779", "cg09181054")) + suppressMessages({ + TF <- get.TFs(data, + enriched.motif, + TFs=data.frame(external_gene_name=c("TP53", + "TP63", + "TP73", + "DLX6", + "DMRT1" + ), + ensembl_gene_id= c("ENSG00000141510", + "ENSG00000073282", + "ENSG00000078900", + "ENSG00000006377", + "ENSG00000137090"), + stringsAsFactors = FALSE), + mode = "supervised", + diff.dir = "hypo", + group.col = "shortLetterCode", + group1 = "TP", + group2 = "NT", + label="hypo") + }) + + tf.family <- createMotifRelevantTfs() + expect_true(TF$potential.TF.family %in% tf.family$P53_HUMAN.H11MO.0.A) + expect_true(TF$top.potential.TF.family %in% TF$top_5percent) + expect_true(TF$top.potential.TF.family %in% TF$potential.TF.family) + expect_true(is.na(TF$top.potential.TF.subfamily)) + expect_true(TF$potential.TF.family %in% TF$top_5percent) + + TF <- get.TFs(data, + enriched.motif, + TFs=data.frame(external_gene_name=c("TP53", + "TP63", + "TP73", + "DLX6", + "DMRT1" + ), + ensembl_gene_id= c("ENSG00000141510", + "ENSG00000073282", + "ENSG00000078900", + "ENSG00000006377", + "ENSG00000137090"), + stringsAsFactors = FALSE), + mode = "supervised", + diff.dir = "hyper", + group.col = "shortLetterCode", + group1 = "TP", + group2 = "NT", + label="hyper") + pval.sig <- get(load("getTF.hypo.TFs.with.motif.pvalue.rda")) + pval.insig <- get(load("getTF.hyper.TFs.with.motif.pvalue.rda")) + expect_true(all(-log10(pval.sig[,1]) > -log10(pval.insig[,1]))) +}) + +test_that("Shows NA if top5 TFs does not include any member of the motif TF family", { + data <- ELMER:::getdata("elmer.data.example") + enriched.motif <- list("P53_HUMAN.H11MO.0.A" = c("cg00329272", "cg10097755", + "cg08928189", "cg17153775", + "cg21156590", "cg19749688", + "cg12590404", "cg24517858", + "cg00329272", "cg09010107", + "cg15386853", "cg10097755", + "cg09247779", "cg09181054")) + suppressMessages({ + TF <- get.TFs(data, enriched.motif, label = "hypo", + group.col = "shortLetterCode", + group1 = "TP", + group2 = "NT") + }) + + tf.family <- createMotifRelevantTfs() + human.tf <- getTF() + # Check if top5 has 5% elements that TF from the object + expect_equal(floor(sum(human.tf$ensembl_gene_id %in% rownames(getExp(data))) * 0.05), + length(unlist(strsplit(as.character(TF$top_5percent),";")))) + if(!TF$top_5percent %in% tf.family$P53_HUMAN.H10MO.B){ + expect_true(is.na(TF$top.potential.TF.family)) + expect_true(is.na(TF$potential.TF.family)) + expect_true(is.na(TF$top.potential.TF.subfamily)) + expect_true(is.na(TF$potential.TF.subfamily)) + } +}) + +test_that("Test if the results is right", { + + # We will create the data where whe have 3 cases: + # 1) no changes in expression + # 2) Unmethylated group has lower TF expression + # 3) Unmethylated group has a higher TF expression + # 4) Unmethylated group has highest TF expression + # The case 4 is the potential TF + + # We have the 3 cases for 6 patients + exp <- t( + data.frame( + "ENSG00000141510" = c(1,1,1,1,1,1), # No change in expression + "ENSG00000073282" = c(0,0,0,1,1,1), # Change in the other direction + "ENSG00000135776" = c(0.2,0.4,0.6,0.8,0.9,1), # raw p-value should be higher than the best case + "ENSG00000078900" = c(1,1,1,0,0,0)) + ) # Should be true + colnames(exp) <- c(as.character(1:6)) + exp <- makeSummarizedExperimentFromGeneMatrix(exp = exp, genome = "hg19") + + # First 3 patients are Unmethylated + met <- t(data.frame("cg00329272" = c(0,0,0,1,1,1))) + colnames(met) <- c(as.character(1:6)) + met <- makeSummarizedExperimentFromDNAMethylation(met = met, met.platform = "450K", genome = "hg19") + + colData <- data.frame( + sample = as.character(1:6), + group = c(rep("g1",3),rep("g2",3)), + primary = as.character(1:6) + ) + # Create datas + data <- createMAE(exp = exp,met = met, genome = "hg19", colData = colData) + + enriched.motif <- list("P53_HUMAN.H11MO.0.A" = c("cg00329272")) + + + suppressMessages({ + TF <- get.TFs(data = data, + enriched.motif = enriched.motif, + group.col = "group", + group1 = "g1", + group2 = "g2", + mode = "supervised", + diff.dir = "hypo", + TFs = data.frame( + external_gene_name=c("TP53", "TP63","TP73","ABCB10"), + ensembl_gene_id= c( + "ENSG00000141510", + "ENSG00000073282", + "ENSG00000078900", + "ENSG00000135776" + ), + stringsAsFactors = FALSE + ), + label = "hypo") + }) + + expect_true(TF$potential.TF.family == "TP73") + expect_true(TF$top.potential.TF.family == "TP73") + expect_true(TF$top_5percent == "TP73") + + # Changing percentage to 50% (split in half: 3 samples as methylated and 3 as unmethylated) + # Will give us the same result + + suppressMessages({ + TF <- get.TFs(data, + enriched.motif, + minSubgroupFrac = 0.5, + group.col = "group", + group1 = "g1", + group2 = "g2", + TFs = data.frame( + external_gene_name=c("TP53", "TP63","TP73","ABCB10"), + ensembl_gene_id= c("ENSG00000141510", + "ENSG00000073282", + "ENSG00000078900", + "ENSG00000135776"), + stringsAsFactors = FALSE + ), + label = "hypo") + }) + + expect_true(TF$potential.TF.family == "TP73") + expect_true(TF$top.potential.TF.family == "TP73") + expect_true(TF$top_5percent == "TP73") + + + # Changing the order should give the right gene + exp <- t(data.frame("ENSG00000078900" = c(1,1,1,1,1,1), # No change in expression + "ENSG00000073282" = c(0,0,0,1,1,1), # Change in the other direction + "ENSG00000135776" = c(0.2,0.4,0.6,0.8,0.9,1), # raw p-value should be higher than the best case + "ENSG00000141510" = c(1,1,1,0,0,0))) # Should be true + colnames(exp) <- c(as.character(1:6)) + exp <- makeSummarizedExperimentFromGeneMatrix(exp, genome = "hg19") + + # First 3 patients are Unmethylated + met <- t(data.frame("cg00329272" = c(0,0,0,1,1,1))) + colnames(met) <- c(as.character(1:6)) + met <- makeSummarizedExperimentFromDNAMethylation(met, met.platform = "450K", genome = "hg19") + + colData <- data.frame( + sample = as.character(1:6), + group = c(rep("g2",3),rep("g1",3)), + primary = as.character(1:6) + ) + # Create datas + data <- createMAE(exp,met, genome = "hg19", colData = colData) + + enriched.motif <- list("P53_HUMAN.H11MO.0.A" = c("cg00329272")) + + suppressMessages({ + TF <- get.TFs(data, + enriched.motif, + group.col = "group", + group1 = "g1", + group2 = "g2", + TFs = data.frame(external_gene_name=c("TP53", "TP63","TP73","ABCB10"), + ensembl_gene_id= c("ENSG00000141510", + "ENSG00000073282", + "ENSG00000078900", + "ENSG00000135776"), + stringsAsFactors = FALSE), + label = "hypo") + }) + + expect_true(TF$potential.TF.family == "TP53") + expect_true(TF$top.potential.TF.family == "TP53") + expect_true(TF$top_5percent == "TP53") + + # Changing percentage to 50% (split in half: 3 samples as methylated and 3 as unmethylated) + # Will give us the same result + + suppressMessages({ + TF <- get.TFs(data, + enriched.motif, + minSubgroupFrac = 0.5, + group.col = "group", + group1 = "g1", + group2 = "g2", + save.plots = TRUE, + TFs = data.frame(external_gene_name=c("TP53", "TP63","TP73","ABCB10"), + ensembl_gene_id= c("ENSG00000141510", + "ENSG00000073282", + "ENSG00000078900", + "ENSG00000135776"), + stringsAsFactors = FALSE), + label = "hypo") + }) + expect_true(TF$potential.TF.family == "TP53") + expect_true(TF$top.potential.TF.family == "TP53") + expect_true(TF$top_5percent == "TP53") +}) + +test_that("It creates a PDF with the TF ranking plot", { + expect_true(file.exists("TFrankPlot/P53_HUMAN.H11MO.0.A.TFrankPlot.pdf")) + unlink("TFrankPlot",recursive = TRUE, force = TRUE) + unlink("getTF.hypo.significant.TFs.with.motif.summary.csv",recursive = TRUE, force = TRUE) + unlink("subfamily.motif.relevant.TFs.rda",recursive = TRUE, force = TRUE) + unlink("HumanTF.rda",recursive = TRUE, force = TRUE) + unlink("family.motif.relevant.TFs.rda",recursive = TRUE, force = TRUE) + unlink("getTF.hypo.TFs.with.motif.pvalue.rda",recursive = TRUE, force = TRUE) +}) diff --git a/tests/testthat/test-neargenes.R b/tests/testthat/test-neargenes.R new file mode 100644 index 00000000..755c1463 --- /dev/null +++ b/tests/testthat/test-neargenes.R @@ -0,0 +1,81 @@ +context("Get GetNearGenes") + +test_that("It maps correctly to hg38", { + tssAnnot <- getTSS(genome = "hg38") + geneAnnot <- tssAnnot + + probe <- getInfiniumAnnotation(plat = "450K",genome = "hg19")["cg18108049"] + + NearbyGenes <- getRegionNearGenes( + TRange = probe, + geneAnnot = geneAnnot, + numFlankingGenes = 4 + ) + + expect_equal(NearbyGenes[NearbyGenes$Side == "L2",]$GeneID , "ENSG00000184908") + expect_equal(NearbyGenes[NearbyGenes$Side == "L1",]$GeneID, "ENSG00000185519") + expect_equal(NearbyGenes[NearbyGenes$Side == "L2",]$Symbol, "CLCNKB") + expect_equal(NearbyGenes[NearbyGenes$Side == "L1",]$Symbol, "FAM131C") + expect_equal(NearbyGenes[NearbyGenes$Side == "R1",]$GeneID, "ENSG00000142627") + expect_equal(NearbyGenes[NearbyGenes$Side == "R1",]$Symbol, "EPHA2") + + NearbyGenes <- GetNearGenes( + numFlankingGenes = 4, + geneAnnot = geneAnnot, + TRange = probe + ) + expect_equal(NearbyGenes[NearbyGenes$Side == "L2",]$GeneID, "ENSG00000184908") + expect_equal(NearbyGenes[NearbyGenes$Side == "L1",]$GeneID, "ENSG00000185519") + expect_equal(NearbyGenes[NearbyGenes$Side == "L2",]$Symbol, "CLCNKB") + expect_equal(NearbyGenes[NearbyGenes$Side == "L1",]$Symbol, "FAM131C") + expect_equal(NearbyGenes[NearbyGenes$Side == "R1",]$GeneID, "ENSG00000142627") + expect_equal(NearbyGenes[NearbyGenes$Side == "R1",]$Symbol, "EPHA2") +}) + + +test_that("It maps correctly to hg38 if more than one region", { + tssAnnot <- getTSS(genome = "hg38") + geneAnnot <- tssAnnot + probe <- getInfiniumAnnotation( + plat = "450K",genome = "hg19" + )[c("cg18108049","cg14008030","cg00381604","cg15254640","cg08417382")] + + NearbyGenes <- getRegionNearGenes( + TRange = probe, + geneAnnot = geneAnnot, + numFlankingGenes = 4 + ) + + expect_equal(NearbyGenes[NearbyGenes$Side == "L2" & NearbyGenes$ID == "cg18108049",]$GeneID , "ENSG00000184908") + expect_equal(NearbyGenes[NearbyGenes$Side == "L1" & NearbyGenes$ID == "cg18108049",]$GeneID, "ENSG00000185519") + expect_equal(NearbyGenes[NearbyGenes$Side == "L2" & NearbyGenes$ID == "cg18108049",]$Symbol, "CLCNKB") + expect_equal(NearbyGenes[NearbyGenes$Side == "L1" & NearbyGenes$ID == "cg18108049",]$Symbol, "FAM131C") + expect_equal(NearbyGenes[NearbyGenes$Side == "R1" & NearbyGenes$ID == "cg18108049",]$GeneID, "ENSG00000142627") + expect_equal(NearbyGenes[NearbyGenes$Side == "R1" & NearbyGenes$ID == "cg18108049",]$Symbol, "EPHA2") + + NearbyGenes <- GetNearGenes(numFlankingGenes = 20, + geneAnnot = geneAnnot, + TRange = probe) + expect_equal(NearbyGenes[NearbyGenes$Side == "L2" & NearbyGenes$ID == "cg18108049",]$GeneID, "ENSG00000184908") + expect_equal(NearbyGenes[NearbyGenes$Side == "L1" & NearbyGenes$ID == "cg18108049",]$GeneID, "ENSG00000185519") + expect_equal(NearbyGenes[NearbyGenes$Side == "L2" & NearbyGenes$ID == "cg18108049",]$Symbol, "CLCNKB") + expect_equal(NearbyGenes[NearbyGenes$Side == "L1" & NearbyGenes$ID == "cg18108049",]$Symbol, "FAM131C") + expect_equal(NearbyGenes[NearbyGenes$Side == "R1" & NearbyGenes$ID == "cg18108049",]$GeneID, "ENSG00000142627") + expect_equal(NearbyGenes[NearbyGenes$Side == "R1" & NearbyGenes$ID == "cg18108049",]$Symbol, "EPHA2") +}) + +test_that("It maps correctly to hg19", { + tssAnnot <- getTSS(genome = "hg19") + geneAnnot <- tssAnnot + probe <- getInfiniumAnnotation(plat = "450K",genome = "hg19")["cg18108049"] + + # chr1:16010827:16062808 + # chr1:16058489:16058489 + NearbyGenes <- getRegionNearGenes( + TRange = probe, + geneAnnot = geneAnnot, + numFlankingGenes = 30 + ) + + expect_true(all(c("SLC25A34","RSC1A1","AGMAT","DDI2","PLEKHM2") %in% NearbyGenes$Symbol)) +}) \ No newline at end of file diff --git a/tests/testthat/test-pipe.R b/tests/testthat/test-pipe.R new file mode 100644 index 00000000..75a61ffc --- /dev/null +++ b/tests/testthat/test-pipe.R @@ -0,0 +1,21 @@ +context("Testing TCGA.pipe") + +test_that("TCGA.pipe is working", { + data <- ELMER:::getdata("elmer.data.example") + TCGA.pipe(disease = "LUSC", + data = data, + analysis = c("diffMeth","pair", "motif","TF.search"), + mode = "supervised", + group.col = "definition", + group1 = "Primary solid Tumor", + group2 = "Solid Tissue Normal", + diff.dir = c("hypo"), + dir.out = "pipe", + sig.dif = 0.0001, + pvalue = 1.0, + min.incidence = 0, + lower.OR = 0.0) + +}) + + diff --git a/vignettes/_site.yml b/vignettes/_site.yml new file mode 100644 index 00000000..b5ce31bd --- /dev/null +++ b/vignettes/_site.yml @@ -0,0 +1,95 @@ +name: "ELMER" +navbar: + title: "ELMER" + href: https://www.bioconductor.org/help/course-materials/2017/BioC2017/ + left: + - text: "Introduction" + icon: fa-home + href: index.html + - text: "Data input" + icon: fa-database + href: input.html + - text: "Analysis" + icon: fa-flask + menu: + - text: "Analysis steps" + - text: "1 - Creating MAE with distal probes (data input)" + href: analysis_data_input.html + - text: "2 - Identifying differentially methylated probes" + href: analysis_diff_meth.html + - text: "3 - Identifying putative probe-gene pairs" + href: analysis_get_pair.html + - text: "4 - Motif enrichment analysis on the selected probes" + href: analysis_motif_enrichment.html + - text: "5 - Identifying regulatory TFs" + href: analysis_regulatory_tf.html + - text: "---------" + - text: "Compact version with TCGA data" + - text: "TCGA pipe" + icon: fa-flask + href: pipe.html + - text: "Plots" + icon: fa-photo + menu: + - text: "Plot functions" + - text: "1 - Scatter plots" + href: plots_scatter.html + - text: "2 - Schematic plots" + href: plots_schematic.html + - text: "3 - Motif enrichment plots" + href: plots_motif_enrichment.html + - text: "4 - Regulatory TF plots" + href: plots_TF.html + - text: "5 - Heatmap plots" + href: plots_heatmap.html + - text: "Graphical User interface" + icon: fa-hand-pointer-o + href: analysis_gui.html + - text: "Use case" + icon: fa-code + href: usecase.html + right: + - icon: fa-book fa-lg + menu: + - text: "ELMER package" + - text: "Github" + href: https://github.com/tiagochst/ELMER + - text: "Bioconductor" + href: http://bioconductor.org/packages/devel/bioc/html/ELMER.html + - text: "---------" + - text: "ELMER.data package" + - text: "Github" + href: https://github.com/tiagochst/ELMER + - text: "Bioconductor" + href: http://bioconductor.org/packages/devel/bioc/html/ELMER.html + - text: "---------" + - text: "Bioc2017 workshop" + - text: "Github ELMER/TCGAbiolinks" + href: https://github.com/BioinformaticsFMRP/Bioc2017.TCGAbiolinks.ELMER + - text: "---------" + - text: "TCGAbiolinks package" + - text: "Github" + href: https://github.com/BioinformaticsFMRP/TCGAbiolinks + - text: "Bioconductor" + href: http://bioconductor.org/packages/devel/bioc/html/TCGAbiolinks.html + - icon: fa-info-circle fa-lg + menu: + - text: "Cedars-sinai" + - icon: fa-twitter fa-lg + text: "center4bfg" + href: https://twitter.com/center4bfg + - icon: fa-home fa-lg + text: "center4bfg.org" + href: https://center4bfg.org + - text: "University of São Paulo (USP)" + - icon: fa-home fa-lg + text: "fmrp.usp.br" + href: http://www.fmrp.usp.br/?lang=en + - text: "---------" + - text: "Fundings" + - text: "FAPESP (16/10436-9)" + href: http://www.bv.fapesp.br/en/pesquisa/?sort=-data_inicio&q2=%28instituicao%3A%22Cedars-Sinai+Medical+Center%22%29+AND+%28%28bolsa_en_exact%3A%22Scholarships+abroad%22+AND+situacao_en_exact%3A%22Ongoing%22%29%29 + - text: "NCI ITCR program (1U01CA184826)" + href: https://itcr.nci.nih.gov/ + - text: "Genomic Data Analysis Network NIH/NCI (1U24CA210969)" + href: https://projectreporter.nih.gov/project_info_description.cfm?aid=9210719&icde=31197242&ddparam=&ddvalue=&ddsub=&cr=1&csb=default&cs=ASC diff --git a/vignettes/analysis_data_input.Rmd b/vignettes/analysis_data_input.Rmd new file mode 100644 index 00000000..fd24c4a1 --- /dev/null +++ b/vignettes/analysis_data_input.Rmd @@ -0,0 +1,147 @@ +--- +title: "3.1 - Data input - Creating MAE object" +output: + html_document: + self_contained: true + number_sections: no + theme: flatly + highlight: tango + mathjax: null + toc: true + toc_float: true + toc_depth: 2 + css: style.css + +bibliography: bibliography.bib +vignette: > + %\VignetteIndexEntry{"3.1 - Data input - Creating MAE object"} + %\VignetteEngine{knitr::rmarkdown} + \usepackage[utf8]{inputenc} +--- + +
+ +# Illustration of ELMER analysis steps + +The example data set (`GeneExp`,`Meth`) is a subset of chromosome 1 data from TCGA LUSC and it is available with the ELMER package. + +```{r, echo = FALSE,hide=TRUE, message=FALSE, warning=FALSE} +library(ELMER) +library(DT) +library(dplyr) +dir.create("result",showWarnings = FALSE) +library(BiocStyle) +``` + + +ELMER analysis have 5 main steps which are shown in the next sections individually. +And later the function `TCGA.pipe`, which is a pipeline combining all 5 steps and producing all results and figures, is presented. + +# Preparing data input + +## Selection of probes within biofeatures + +This step is to select HM450K/EPIC probes, which locate far from TSS (at least 2Kb away) +These probes are called distal probes. + +Be default, this comprehensive list of TSS annotated by ENSEMBL database, +which is programatically accessed using `r BiocStyle::Biocpkg("biomaRt")` to get its last version, +will be used to select distal probes. But user can use their +own TSS annotation or add features such as H3K27ac ChIP-seq in a certain cell line, to select probes overlapping thoses features regions. + + +```{r, message=FALSE} +# get distal probes that are 2kb away from TSS on chromosome 1 +distal.probes <- get.feature.probe( + genome = "hg19", + met.platform = "450K", + rm.chr = paste0("chr",c(2:22,"X","Y")) +) +``` + +## Creation of a MAE object +```{r,eval=TRUE, message=FALSE} +library(MultiAssayExperiment) +library(ELMER.data) +data(LUSC_RNA_refined,package = "ELMER.data") +data(LUSC_meth_refined,package = "ELMER.data") +GeneExp[1:5,1:5] +Meth[1:5,1:5] +mae <- createMAE( + exp = GeneExp, + met = Meth, + save = TRUE, + linearize.exp = TRUE, + save.filename = "mae.rda", + filter.probes = distal.probes, + met.platform = "450K", + genome = "hg19", + TCGA = TRUE +) +as.data.frame(colData(mae)[1:5,]) %>% datatable(options = list(scrollX = TRUE)) +as.data.frame(sampleMap(mae)[1:5,]) %>% datatable(options = list(scrollX = TRUE)) +as.data.frame(assay(getMet(mae)[1:5,1:5])) %>% datatable(options = list(scrollX = TRUE)) +as.data.frame(assay(getMet(mae)[1:5,1:5])) %>% datatable(options = list(scrollX = TRUE)) +``` + +## Using non-TCGA data + +In case you are using non-TCGA data there are two matrices to be inputed, colData with the samples metadata and sampleMap, mapping for each column of the gene expression and DNA methylation matrices to samples. An simple example is below if the columns of the matrices have the same name. + +```{r,eval=FALSE, message=FALSE} +library(ELMER) +# example input +met <- matrix(rep(0,15),ncol = 5) +colnames(met) <- c( + "Sample1", + "Sample2", + "Sample3", + "Sample4", + "Sample5" +) +rownames(met) <- c("cg26928153","cg16269199","cg13869341") + +exp <- matrix(rep(0,15),ncol = 5) +colnames(exp) <- c( + "Sample1", + "Sample2", + "Sample3", + "Sample4", + "Sample5" +) +rownames(exp) <- c("ENSG00000073282","ENSG00000078900","ENSG00000141510") + + +assay <- c( + rep("DNA methylation", ncol(met)), + rep("Gene expression", ncol(exp)) +) +primary <- c(colnames(met),colnames(exp)) +colname <- c(colnames(met),colnames(exp)) +sampleMap <- data.frame(assay,primary,colname) + +distal.probes <- get.feature.probe( + genome = "hg19", + met.platform = "EPIC" +) + +colData <- data.frame(sample = colnames(met)) +rownames(colData) <- colnames(met) + +mae <- createMAE( + exp = exp, + met = met, + save = TRUE, + filter.probes = distal.probes, + colData = colData, + sampleMap = sampleMap, + linearize.exp = TRUE, + save.filename = "mae.rda", + met.platform = "EPIC", + genome = "hg19", + TCGA = FALSE +) +``` + + +# Bibliography \ No newline at end of file diff --git a/vignettes/analysis_diff_meth.Rmd b/vignettes/analysis_diff_meth.Rmd new file mode 100644 index 00000000..3d2f72a2 --- /dev/null +++ b/vignettes/analysis_diff_meth.Rmd @@ -0,0 +1,119 @@ +--- +title: "3.2 - Identifying differentially methylated probes" +output: + html_document: + self_contained: true + number_sections: no + theme: flatly + highlight: tango + mathjax: null + toc: true + toc_float: true + toc_depth: 2 + css: style.css + +bibliography: bibliography.bib +vignette: > + %\VignetteIndexEntry{"3.2 - Identifying differentially methylated probes"} + %\VignetteEngine{knitr::rmarkdown} + \usepackage[utf8]{inputenc} +--- + + +```{r, echo = FALSE,hide=TRUE, message=FALSE, warning=FALSE} +library(ELMER.data) +library(ELMER) +library(DT) +library(dplyr) +library(BiocStyle) +``` + +
+ +# Identifying differentially methylated probes + +The first step is the identification of differentially methylated CpGs (DMCs) carried out by function `get.diff.meth`. + +In the `Supervised` mode, we compare the DNA methylation level of each distal CpG for all samples in Group 1 compared to all samples Group 2, using an unpaired one-tailed t-test. In the `Unsupervised` mode, the samples of each group (Group 1 and Group 2) are ranked by their DNA methylation beta values for the given probe, and those samples in the lower quintile (20\% samples with the lowest methylation levels) of each group are used to identify if the probe is hypomethylated in Group 1 compared to Group 2. The reverse applies for the identification of hypermethylated probes. It is important to highlight that in the `Unsupervised` mode, each probe selected may be based on a different subset the samples, and thus probe sets from multiple molecular subtypes may be represented. In the `Supervised` mode, all tests are based on the same set of samples. + +The 20\% is a parameter to the `diff.meth` function called `minSubgroupFrac`. For the unsupervised analysis, this is set to 20\% as in Yao et al. [@yao2015inferring], because we wanted to be able to detect a specific molecular subtype among samples; these subtypes often make up only a minority of samples, and 20\% was chosen as a lower bound for the purposes of statistical power (high enough sample numbers to yield t-test p-values that could overcome multiple hypotheses corrections, yet low enough to be able to capture changes in individual molecular subtypes occurring in 20\% or more of the cases.) This number can be set as an input to the `diff.meth` function and should be tuned based on sample sizes in individual studies. +In the `Supervised` mode, where the comparison groups are implicit in the sample set and labeled, the `minSubgroupFrac` parameter is set to 100\%. An example would be a cell culture experiment with 5 replicates of the untreated cell line, and another 5 replicates that include an experimental treatment. + + +To identify hypomethylated DMCs, a one-tailed t-test is used to rule out the null hypothesis: $\mu_{group1} \geq \mu_{group2}$, where $\mu_{group1}$ is the mean methylation within the lowest group 1 quintile (or another percentile as specified by the `minSubgroupFrac` parameter) and $\mu_{group2}$ is the mean within the lowest group 2 quintile. Raw p-values are adjusted for multiple hypothesis testing using the Benjamini-Hochberg method, and probes are selected when they had adjusted p-value less than $0.01$ (which can be configured using the `pvalue` parameter). For additional stringency, probes are only selected if the methylation difference: $\Delta = \mu_{group1} - \mu_{group2}$ was greater than $0.3$. The same method is used to identify hypermethylated DMCs, except we use the *upper* quintile, and the opposite tail in the t-test is chosen. + + +![Source: Yao, Lijing, et al. "Inferring regulatory element landscapes and transcription factor networks from cancer methylomes." Genome biology 16.1 (2015): 105.](figures/paper_diff_meth.jpg) [@yao2015inferring,@yao2015demystifying] + + +# Function arguments + +
+
Main get.diff.meth arguments
+
+| Argument | Description | +|------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| data | A `multiAssayExperiment` with DNA methylation and Gene Expression data. See `createMAE` function. | +| diff.dir | A character can be "hypo", "hyper" or "both", showing differential methylation dirction. It can be "hypo" which is only selecting hypomethylated probes (one tailed test); "hyper" which is only selecting hypermethylated probes (one tailed test); or "both" which are probes differenly methylated (two tailed test). | +| minSubgroupFrac | A number ranging from 0 to 1,specifying the fraction of extreme samples from group 1 and group 2 that are used to identify the differential DNA methylation. The default is 0.2 because we typically want to be able to detect a specific (possibly unknown) molecular subtype among tumor; these subtypes often make up only a minority of samples, and 20\% was chosen as a lower bound for the purposes of statistical power. If you are using pre-defined group labels, such as treated replicates vs. untreated replicated, use a value of 1.0 (***Supervised*** mode) | +| pvalue | A number specifies the significant P value (adjusted P value by BH) cutoff for selecting significant hypo/hyper-methylated probes. Default is 0.01 | +| group.col | A column defining the groups of the sample. You can view the available columns using: `colnames(MultiAssayExperiment::colData(data))`. | +| group1 | A group from group.col. ELMER will run group1 vs group2. That means, if direction is hyper, get probes hypermethylated in group 1 compared to group 2. | +| group2 | A group from group.col. ELMER will run group1 vs group2. That means, if direction is hyper, get probes hypermethylated in group 1 compared to group 2. | +| sig.dif | A number specifies the smallest DNA methylation difference as a cutoff for selecting significant hypo/hyper-methylated probes. Default is 0.3. | +
+
+ +# Example of use +```{r,eval=TRUE, message=FALSE, warning = FALSE, results = "hide"} +mae <- get(load("mae.rda")) +sig.diff <- get.diff.meth( + data = mae, + group.col = "definition", + group1 = "Primary solid Tumor", + group2 = "Solid Tissue Normal", + minSubgroupFrac = 0.2, # if supervised mode set to 1 + sig.dif = 0.3, + diff.dir = "hypo", # Search for hypomethylated probes in group 1 + cores = 1, + dir.out ="result", + pvalue = 0.01 +) +``` + +```{r,eval=TRUE, message=FALSE, warning = FALSE} +head(sig.diff) %>% datatable(options = list(scrollX = TRUE)) +# get.diff.meth automatically save output files. +# - getMethdiff.hypo.probes.csv contains statistics for all the probes. +# - getMethdiff.hypo.probes.significant.csv contains only the significant probes which +# is the same with sig.diff +# - a volcano plot with the diff mean and significance levels +dir(path = "result", pattern = "getMethdiff") +``` + + +```{r,eval=TRUE, message=FALSE, warning = FALSE,echo=FALSE} +group1 <- "Primary solid Tumor" +group2 <- "Solid Tissue Normal" +out <- readr::read_csv(dir(path = "result", pattern = "getMethdiff.hypo.probes.csv",full.names = TRUE)) +TCGAbiolinks:::TCGAVisualize_volcano( + x = as.data.frame(out)[,grep("Minus",colnames(out),value = T)], + y = out$adjust.p, + title = paste0("Volcano plot - Probes ", + "hypomethylated in ", group1, " vs ", group2,"\n"), + filename = NULL, + label = c("Not Significant", + paste0("Hypermethylated in ",group1), + paste0("Hypomethylated in ",group1)), + ylab = expression(paste(-Log[10], + " (FDR corrected P-values) [one tailed test]")), + xlab = expression(paste( + "DNA Methylation difference (",beta,"-values)") + ), + x.cut = 0.3, + y.cut = 0.01 +) +``` + + +# Bibliography diff --git a/vignettes/analysis_get_pair.Rmd b/vignettes/analysis_get_pair.Rmd new file mode 100644 index 00000000..6da240cc --- /dev/null +++ b/vignettes/analysis_get_pair.Rmd @@ -0,0 +1,152 @@ +--- +title: "3.3 - Identifying putative probe-gene pairs" +output: + html_document: + self_contained: true + number_sections: no + theme: flatly + highlight: tango + mathjax: null + toc: true + toc_float: true + toc_depth: 2 + css: style.css + +bibliography: bibliography.bib +vignette: > + %\VignetteIndexEntry{"3.3 - Identifying putative probe-gene pairs"} + %\VignetteEngine{knitr::rmarkdown} + \usepackage[utf8]{inputenc} +--- + +```{r, echo = FALSE,hide=TRUE, message=FALSE, warning=FALSE} +library(ELMER.data) +library(ELMER) +library(DT) +library(dplyr) +library(BiocStyle) +``` + +
+ +# Identifying putative probe-gene pairs + +## Introduction +This step is links distal probes with methylation changes to target genes with expression +changes and report the putative target gene for selected probes. This is carried out +by function `get.pair`. + + +## Description +For each differentially methylated distal probe (DMC), the closest 10 upstream genes and the closest 10 downstream genes are tested for inverse correlation between methylation of the probe and expression of the gene, which is the same basic strategy employed in ELMER version 1. However, we now import all gene annotations programmatically using the Biomart [@durinck2005biomart] package. This allows easy extensibility to use any annotations desired (our default uses Ensembl annotations). + +This step also differs between the `Supervised` and `Unsupervised` modes. In the `Unsupervised` mode, as in ELMER 1.0, for each probe-gene pair, the samples (all samples from both groups) are divided into two groups: the $M$ group, which consist of the upper methylation quintile (the 20\%of samples with the highest methylation at the enhancer probe), and the $U$ group, +which consists of the lowest methylation quintile (the 20\% of samples with the +lowest methylation). In the new `Supervised` mode, the $U$ and $M$ groups are defined strictly by sample group labels, and all samples in each group are used. +For each differentially methylated distal probe (DMC), the closest 10 upstream +genes and the closest 10 downstream genes are tested for inverse correlation between +methylation of the probe and expression of the gene (the number 10 can be changed using the `numFlankingGenes` parameter). To select these genes, +the probe-gene distance is defined as the distance from the probe to the transcription +start site specified by the ENSEMBL gene level annotations [@yates2015ensembl] accessed via +the R/Bioconductor package biomaRt [@durinck2009mapping,@durinck2005biomart]. By choosing a constant number of genes to test for each probe, our goal is to avoid systematic false positives for probes in gene rich regions. This is especially important given the highly non-uniform gene density of mammalian genomes. + +Thus, exactly 20 statistical tests were performed for each probe, as follows. + +For each candidate probe-gene pair, +the Mann-Whitney U test is used to test the null hypothesis that overall gene +expression in group $M$ is greater than or equal than that in group $U$. +This non-parametric test was used in order to minimize the effects +of expression outliers, which can occur across a very wide dynamic range. +In the `unsupervised mode` for each probe-gene pair tested, the raw p-value $P_r$ is corrected for multiple +hypothesis using a permutation approach as follows. +The gene in the pair is held constant, and `x` random methylation probes are +chosen to perform the same one-tailed U test, generating a set of `x` permutation +p-values $P_p$. We chose the x random probes only from among those that were +"distal" (farther than $2kb$ from an annotated transcription start site), in order +to draw these null-model probes from the same set as the probe being tested [@sham2014statistical]. +An empirical p-value $P_e$ value was calculated using the following formula +(which introduces a pseudo-count of 1): + +\begin{equation} +P_e = \frac{num(P_p \leq P_r)+ 1}{x+1} +\end{equation} + +In the `unsupervised mode` for each probe-gene pair tested, the raw p-value $P_r$ is corrected for multiple +hypothesis using Benjamini-Hochberg Procedure. + +Notice that in the `Supervised` mode, no additional filtering is necessary to ensure that the $M$ and $U$ group segregate by sample group labels. The two sample groups are segregated by definition, since these probes were selected for their differential methylation, with the same directionality, between the two groups. + + +![Source: Yao, Lijing, et al. "Inferring regulatory element landscapes and transcription factor networks from cancer methylomes." Genome biology 16.1 (2015): 105.](figures/paper_get.pairs.jpg) [@yao2015inferring] + + +# Function arguments + +
+
Main get.pair arguments
+
+| Argument | Description | +|-------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| data | A multiAssayExperiment with DNA methylation and Gene Expression data. See createMAE function. | +| nearGenes | Can be either a list containing output of GetNearGenes function or path of rda file containing output of `GetNearGenes` function. | +| minSubgroupFrac | A number ranging from 0 to 1.0 specifying the percentage of samples used to create groups U (unmethylated) and M (methylated) used to link probes to genes. Default is 0.4 (lowest quintile samples will be in the U group and the highest quintile samples in the M group). | +| permu.size | A number specify the times of permuation. Default is 10000. | +| raw.pvalue | A number specify the raw p-value cutoff for defining signficant pairs. Default is 0.05. It will select the significant P value cutoff before calculating the empirical p-values. | +| Pe | A number specify the empirical p-value cutoff for defining signficant pairs. Default is 0.001. | +| group.col | A column defining the groups of the sample. You can view the available columns using: `colnames(MultiAssayExperiment::colData(data))`. | +| group1 | A group from group.col. | +| group2 | A group from group.col. | +| mode |A character. Can be "unsupervised" or "supervised". If unsupervised is set the U (unmethylated) and M (methylated) groups will be selected among all samples based on methylation of each probe. Otherwise U group and M group will set as the samples of group1 or group2 as described below: If diff.dir is "hypo, U will be the group 1 and M the group2. If diff.dir is "hyper" M group will be the group1 and U the group2.| +| diff.dir | A character can be "hypo" or "hyper", showing differential methylation dirction in group 1. It can be "hypo" which means the probes are hypomethylated in group1; "hyper" which means the probes are hypermethylated in group1; This argument is used only when mode is supervised nad it should be the same value from get.diff.meth function. | +| filter.probes | Should filter probes by selecting only probes that have at least a certain number of samples below and above a certain cut-off. See `preAssociationProbeFiltering` function. | +| filter.portion | A number specify the cut point to define binary methlation level for probe loci. Default is 0.3. When beta value is above 0.3, the probe is methylated and vice versa. For one probe, the percentage of methylated and unmethylated samples should be above filter.percentage value. Only used if filter.probes is TRUE. See preAssociationProbeFiltering function. | +| filter.percentage | Minimum percentage of samples to be considered in methylated and unmethylated for the filter.portion option. Default 5%. Only used if filter.probes is TRUE. See preAssociationProbeFiltering function. | +
+
+ +# Example of use +```{r, eval = TRUE, message = FALSE, warning = FALSE, results = "hide"} +# Load results from previous sections +mae <- get(load("mae.rda")) +sig.diff <- read.csv("result/getMethdiff.hypo.probes.significant.csv") + +nearGenes <- GetNearGenes( + data = mae, + probes = sig.diff$probe, + numFlankingGenes = 20 +) # 10 upstream and 10 dowstream genes + +Hypo.pair <- get.pair( + data = mae, + group.col = "definition", + group1 = "Primary solid Tumor", + group2 = "Solid Tissue Normal", + nearGenes = nearGenes, + mode = "unsupervised", + permu.dir = "result/permu", + permu.size = 100, # Please set to 100000 to get significant results + raw.pvalue = 0.05, + Pe = 0.01, # Please set to 0.001 to get significant results + filter.probes = TRUE, # See preAssociationProbeFiltering function + filter.percentage = 0.05, + filter.portion = 0.3, + dir.out = "result", + cores = 1, + label = "hypo" +) +``` + +Observation: The distance column in the nearGenes object and in thable getPair.hypo.all.pairs.statistic.csv are the distance to the gene. +To update, to the distance to the nearest TSS please use the function `addDistNearestTSS`. +This function was not used default due to time requirements to run for all probes and all their 20 nearest genes, but it is ran for the significant pairs. + +```{r, eval = TRUE, message = FALSE, warning = FALSE} +Hypo.pair %>% datatable(options = list(scrollX = TRUE)) +# get.pair automatically save output files. +# getPair.hypo.all.pairs.statistic.csv contains statistics for all the probe-gene pairs. +# getPair.hypo.pairs.significant.csv contains only the significant probes which is +# same with Hypo.pair. +dir(path = "result", pattern = "getPair") +``` + +# Bibliography diff --git a/vignettes/analysis_gui.Rmd b/vignettes/analysis_gui.Rmd new file mode 100644 index 00000000..1dfeb92f --- /dev/null +++ b/vignettes/analysis_gui.Rmd @@ -0,0 +1,305 @@ +--- +title: "5 - Integrative analysis workshop with TCGAbiolinks and ELMER - Analysis GUI" +output: + html_document: + self_contained: true + number_sections: no + theme: flatly + highlight: tango + mathjax: null + toc: true + toc_float: true + toc_depth: 2 + css: style.css +bibliography: bibliography.bib +vignette: > + %\VignetteIndexEntry{5 - Integrative analysis workshop with TCGAbiolinks and ELMER - Analysis GUI} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, echo = FALSE,hide=TRUE, message=FALSE,warning = FALSE} +library(ELMER.data) +library(ELMER) +``` + +
+ +# Introduction + +In this section, we will perform the same analysis performed using ELMER, +but instead of doing it programmatically we will use TCGAbiolinksGUI [@Silva147496]. + +First we will launch the TCGAbiolinksGUI. +```{r gui, eval=FALSE, message=FALSE,warning=F} +library(TCGAbiolinksGUI) +TCGAbiolinksGUI() +``` + +# Downloading data + +Please download this two objects: + +* [Gene expression Summarized Experiment](https://github.com/BioinformaticsFMRP/Bioc2017.TCGAbiolinks.ELMER/raw/master/data/lusc.exp.rda) +* [DNA methylation Summarized Experiment](https://github.com/BioinformaticsFMRP/Bioc2017.TCGAbiolinks.ELMER/raw/master/data/lusc.met.rda) + +# Analysis + + +## Create MultiAssayExperiment object + +
+
+To create the MultiAssayExperiment object go to `Integrative analysis/ELMER/Create input data`. +
+
+ +![](figures/elmer_data_menu.png) + +
+
+
+Select the DNA methylation object previously downloaded. +
+
+ +![](figures/elmer_data_dnamet_select.png) +![](figures/elmer_data_dnamet.png) + + +
+
+
+Select the gene expression object previously downloaded. +
+
+ + +![](figures/elmer_data_exp_select.png) +![](figures/elmer_data_exp.png) + + +
+
+
+Fill the field `Save as:` and click on Create MAE object. +
+
+ +![](figures/elmer_data_final.jpg) + + +
+
+
+The object will be created. +
+
+ +![](figures/elmer_data_saved.png) + +
+ +## Perform analysis + + +
+
+
+To perform ELMER analysis go to `Integrative analysis/ELMER/Analysis`. +
+
+ +![](figures/elmer_analysis_menu.png) + +
+
+
+Select the MAE data created in the previous section. +
+
+ +![](figures/elmer_analysis_data.png) + + +
+
+
+Select the groups that will be analysed: Primary solid Tumor and Solid Tissue Normal. +
+
+ +![](figures/elmer_analysis_groups.png) + +
+
+
+We will identify probes that are hypomethylated in Primary solid Tumor compared to Solid Tissue Normal. +
+
+ +![](figures/elmer_analysis_diffmeth.png) + +
+
+
+For the significant differently methylated probes identified before we will correlated +with the 20 nearest genes. Change the value of the field `Number of permutations` to `100`, +`Raw P-value cut-off` to `0.05` and `Empirical P value cut-off` to `0.01`. +
+
+ + +![](figures/elmer_analysis_pair.png) + + +
+
+
+There will be no changes in the step 3. +
+
+ +![](figures/elmer_analysis_motif.png) + +
+
+
+There will be no changes in the step 4. +
+
+ +![](figures/elmer_analysis_TF.png) + +
+
+
+Click on `Run the analysis`. +
+
+ + +![](figures/elmer_analysis_final.png) + + +
+
+
+If the analysis identified significant regulatory TF the results will be saved into an R object. +
+
+ +![](figures/elmer_analysis_message.png) + +
+ +## Visualize results + + +
+
+
+To visualize the results go to `Integrative analysis/ELMER/Visualize results`. +
+
+ +![](figures/elmer_visualize_results.png) + + + +
+
+
+Click on `Select results` and select the object created on the previous section. +
+
+ +![](figures/elmer_visualize_select1.png) +![](figures/elmer_visualize_select2.png) + + +
+
+
+Or the avarage DNA methylation levels of probes of a Motif vs the expression of a +TF. +
+
+ +![](figures/elmer_visualize_plot_scatter_byTF.jpg) + + +
+
+
+For each enriched motif you can verify the ranking of sigificances between the correlation +of DNA methylation level on the significant paired probes with that motif vs the TF expression (for all human TF). +
+
+ +![](figures/elmer_visualize_tfraningplot.jpg) + + +
+
+
+The enrichement of each motif can be visualized. +
+
+ +![](figures/elemer_visualize_motif_enrichment.jpg) + +
+
+
+You can take a look for a gene which was the probe linked. +
+
+ +![](figures/elemer_visualize_schematic_gene.jpg) + +
+
+
+You can see the plot and its neraby genes. +
+
+ +![](figures/elemer_visualize_schematic_probe.jpg) + +
+
+
+It is possible to visualize the table with the significant differently methylated probes. +
+
+ +![](figures/elmer_visualize_table_sigprobes.jpg) + + +
+
+
+It is possible to visualize the table with the enriched motifs. +
+
+ +![](figures/elmer_visualize_table_enriched_motif.jpg) + + +
+
+
+It is possible to visualize the table with the candidates regulatory TF. +
+
+ +![](figures/elmer_visualize_table_tf.jpg) + +# Session Info + +```{r sessioninfo, eval=TRUE} +sessionInfo() +``` + +# Bibliography diff --git a/vignettes/analysis_motif_enrichment.Rmd b/vignettes/analysis_motif_enrichment.Rmd new file mode 100644 index 00000000..3b741360 --- /dev/null +++ b/vignettes/analysis_motif_enrichment.Rmd @@ -0,0 +1,99 @@ +--- +title: "3.4 - Motif enrichment analysis on the selected probes" +output: + html_document: + self_contained: true + number_sections: no + theme: flatly + highlight: tango + mathjax: default + toc: true + toc_float: true + toc_depth: 2 + css: style.css + +bibliography: bibliography.bib +vignette: > + %\VignetteIndexEntry{"3.4 - Motif enrichment analysis on the selected probes"} + %\VignetteEngine{knitr::rmarkdown} + \usepackage[utf8]{inputenc} +--- + + +```{r, echo = FALSE,hide=TRUE, message=FALSE, warning=FALSE} +library(ELMER.data) +library(ELMER) +library(DT) +library(dplyr) +library(BiocStyle) +``` + +
+ +# Motif enrichment analysis on the selected probes + +## Introduction + +This step is to identify enriched motif in a set of probes which is carried out by +function `get.enriched.motif`. + +## Description + +In order to identify enriched motifs and potential upstream regulatory TFs, all probes with occurring in significant probe-gene pairs are combined for motif enrichment analysis. HOMER (Hypergeometric Optimization of Motif EnRichment) [@heinz2010simple] is used to find motif occurrences in a $\pm 250bp$ region around each probe, using HOCOMOCO (HOmo sapiens COmprehensive MOdel COllection) v11 [@kulakovskiy2016hocomoco]. Transcription factor (TF) binding models are available at https://hocomoco.autosome.org/downloads. HOCOMOCO is the most comprehensive TFBS database and is consistently updated, marking an improvement over ELMER version 1. + +For each probe set tested (i.e. the set of all probes occurring in significant probe-gene pairs), we quantify enrichments using Fisher's exact test (where $a$ is the number of probes within the selected probe set that contains one or more motif occurrences; $b$ is the number of probes within the selected probe set that do not contain a motif occurrence; $c$ and $d$ are the same counts within +the entire array probe set drawn from the same set of distal-only probes using the same definition as the primary analysis) and multiple testing correction with the Benjamini-Hochberg procedure [@fisher]. + +A probe set was considered significantly enriched +for a particular motif if the 95\% confidence interval of the Odds Ratio was greater than $1.1$ (specified by option `lower.OR`, $1.1$ is default), the motif +occurred at least 10 times (specified by option `min.incidence`, $10$ is default) in +the probe set and $FDR < 0.05$. + + +# Function arguments +
+
Main get.pair arguments
+
+| Argument | Description | +|-------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| data | A multi Assay Experiment from createMAE function. If set and probes.motif/background probes are missing this will be used to get this other two arguments correctly. This argument is not require, you can set probes.motif and the backaground.probes manually. | +| probes | A vector lists the name of probes to define the set of probes in which motif enrichment OR and confidence interval will be calculated. | +| lower.OR | A number specifies the smallest lower boundary of 95% confidence interval for Odds Ratio. The motif with higher lower boudnary of 95% confidence interval for Odds Ratio than the number are the significantly enriched motifs (detail see reference). | +| min.incidence | A non-negative integer specifies the minimum incidence of motif in the given probes set. 10 is default. | +
+
+ +# Example of use +```{r,eval=TRUE, message=FALSE, warning = FALSE,results = "hide"} +# Load results from previous sections +mae <- get(load("mae.rda")) +sig.diff <- read.csv("result/getMethdiff.hypo.probes.significant.csv") +pair <- read.csv("result/getPair.hypo.pairs.significant.csv") +head(pair) # significantly hypomethylated probes with putative target genes + +# Identify enriched motif for significantly hypomethylated probes which +# have putative target genes. +enriched.motif <- get.enriched.motif( + data = mae, + probes = pair$Probe, + dir.out = "result", + label = "hypo", + min.incidence = 10, + lower.OR = 1.1 +) +``` + +```{r,eval=TRUE, message=FALSE, warning = FALSE} +names(enriched.motif) # enriched motifs +head(enriched.motif[names(enriched.motif)[1]]) ## probes in the given set that have the first motif. + +# get.enriched.motif automatically save output files. +# getMotif.hypo.enriched.motifs.rda contains enriched motifs and the probes with the motif. +# getMotif.hypo.motif.enrichment.csv contains summary of enriched motifs. +dir(path = "result", pattern = "getMotif") + +# motif enrichment figure will be automatically generated. +dir(path = "result", pattern = "motif.enrichment.pdf") +``` + +# Bibliography diff --git a/vignettes/analysis_regulatory_tf.Rmd b/vignettes/analysis_regulatory_tf.Rmd new file mode 100644 index 00000000..6154dbf7 --- /dev/null +++ b/vignettes/analysis_regulatory_tf.Rmd @@ -0,0 +1,119 @@ +--- +title: "3.5 - Identifying regulatory TFs" +output: + html_document: + self_contained: true + number_sections: no + theme: flatly + highlight: tango + mathjax: null + toc: true + toc_float: true + toc_depth: 2 + css: style.css + +bibliography: bibliography.bib +vignette: > + %\VignetteIndexEntry{"3.5 - Identifying regulatory TFs"} + %\VignetteEngine{knitr::rmarkdown} + \usepackage[utf8]{inputenc} +--- + + +```{r, echo = FALSE,hide=TRUE, message=FALSE, warning=FALSE} +library(ELMER.data) +library(ELMER) +library(DT) +library(dplyr) +library(BiocStyle) +``` +
+ +# Identification of master regulator TFs + + +## Introduction +This step identifies regulatory TF whose expression associates with TF binding motif +DNA methylation and it is carried out by function `get.TFs`. + +## Description + +When a group of enhancers is coordinately altered in a specific sample subset, this is often the result of an altered upstream **master regulator** transcription factor in the gene regulatory network. ELMER identifies master regulator TFs corresponding to each of the TF binding motifs enriched from the previous analysis step. +For each enriched motif, ELMER takes the mean DNA methylation of all distal probes (in significant probe-gene pairs) that contain that motif occurrence (within a $\pm 250bp$ region) and compares this mean DNA methylation to the expression of each gene annotated as a human TF. + + +In the `Unsupervised` mode, a statistical test is performed for each motif-TF pair, as follows. All samples +are divided into two groups: the $M$ group, which consists +of the 20\% of samples with the highest average methylation at all motif-adjacent +probes, and the $U$ group, which consisted of the 20\% of samples with the lowest +methylation. This step is performed by the `get.TFs` function, which takes `minSubgroupFrac` as an input parameter, again with a default of 20\%. +For each candidate motif-TF pair, the Mann-Whitney U test is used to test +the null hypothesis that overall gene expression in group $M$ is greater or equal +than that in group $U$. This non-parametric test was used in order to minimize the +effects of expression outliers, which can occur across a very wide dynamic range. +For each motif tested, this results in a raw p-value ($P_r$) for each of the human TFs. + +The new `Supervised` mode uses the same approach as described for the identification of putative target gene(s) step. The $U$ and $M$ groups are one of the the label group of samples and the `minSubgroupFrac` parameter is set to 100\% to use all samples from both groups in the statistical test. This also can result in greater statistical power when using the `Supervised` mode. + + +Finally, all TFs were ranked by the $-log_{10}(P_{r})$, and those falling within the top 5\% of +this ranking were considered candidate upstream regulators. +By default, the top 3 most anti-correlated TFs, and all TF classified by TFClass database in the same (sub)family are highlighted. + + +![Source: Yao, Lijing, et al. "Inferring regulatory element landscapes and transcription factor networks from cancer methylomes." Genome biology 16.1 (2015): 105.](figures/paper_get_pairs.jpg) [@yao2015inferring,@yao2015demystifying] + + +# Function arguments +
+
Main get.pair arguments
+
+| Argument | Description | +|--------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| data | A multiAssayExperiment with DNA methylation and Gene Expression data. See `createMAE` function. | +| enriched.motif | A list containing output of get.enriched.motif function or a path of XX.rda file containing output of get.enriched.motif function. | +| group.col | A column defining the groups of the sample. You can view the available columns using: `colnames(MultiAssayExperiment::colData(data))`. +| group1 | A group from group.col. | +| group2 | A group from group.col. | +| minSubgroupFrac | A number ranging from 0 to 1 specifying the percentage of samples used to create the groups U (unmethylated) and M (methylated) used to link probes to TF expression. Default is 0.4 (lowest quintile of all samples will be in the U group and the highest quintile of all samples in the M group). | +| mode | A character. Can be "unsupervised" or "supervised". If unsupervised is set the U (unmethylated) and M (methylated) groups will be selected among all samples based on methylation of each probe. Otherwise U group and M group will set as the samples of group1 or group2 as described below: If diff.dir is "hypo, U will be the group 1 and M the group2. If diff.dir is "hyper" M group will be the group1 and U the group2. | +| diff.dir | A character can be "hypo" or "hyper", showing differential methylation dirction in group 1. It can be "hypo" which means the probes are hypomethylated in group1; "hyper" which means the probes are hypermethylated in group1; This argument is used only when mode is supervised nad it should be the same value from get.diff.meth function. | +
+
+ + +# Example of use +```{r,eval=TRUE, message=FALSE, warning = FALSE, results = "hide"} +# Load results from previous sections +mae <- get(load("mae.rda")) +load("result/getMotif.hypo.enriched.motifs.rda") +``` + +```{r,eval=TRUE, message=FALSE, warning = FALSE, results = "hide"} +## identify regulatory TF for the enriched motifs +TF <- get.TFs( + data = mae, + group.col = "definition", + group1 = "Primary solid Tumor", + group2 = "Solid Tissue Normal", + mode = "unsupervised", + enriched.motif = enriched.motif, + dir.out = "result", + cores = 1, + label = "hypo", + save.plots = TRUE +) +``` + +```{r,eval=TRUE, message=FALSE, warning = FALSE} +# get.TFs automatically save output files. +# getTF.hypo.TFs.with.motif.pvalue.rda contains statistics for all TF with average +# DNA methylation at sites with the enriched motif. +# getTF.hypo.significant.TFs.with.motif.summary.csv contains only the significant probes. +dir(path = "result", pattern = "getTF") + +# TF ranking plot based on statistics will be automatically generated. +dir(path = "result/TFrankPlot/", pattern = "pdf") +``` + +# Bibliography diff --git a/vignettes/bibliography.bib b/vignettes/bibliography.bib new file mode 100644 index 00000000..406282b3 --- /dev/null +++ b/vignettes/bibliography.bib @@ -0,0 +1,156 @@ +@article{yao2015inferring, + title={Inferring regulatory element landscapes and transcription factor networks from cancer methylomes}, + author={Yao, Lijing and others}, + journal={Genome biology}, + volume={16}, + number={1}, + pages={105}, + year={2015}, + publisher={BioMed Central} +} + +@article{fisher, + title={On the interpretation of $\chi$ 2 from contingency tables, and the calculation of P}, + author={Fisher, Ronald A}, + journal={Journal of the Royal Statistical Society}, + volume={85}, + number={1}, + pages={87--94}, + year={1922}, + publisher={JSTOR} +} + + +@article{kulakovskiy2016hocomoco, + title={HOCOMOCO: expansion and enhancement of the collection of transcription factor binding sites models}, + author={Kulakovskiy, Ivan V and others}, + journal={Nucleic acids research}, + volume={44}, + number={D1}, + pages={D116--D125}, + year={2016}, + publisher={Oxford Univ Press} +} +@article{zhou2016comprehensive, + title={Comprehensive characterization, annotation and innovative use of Infinium DNA methylation BeadChip probes}, + author={Zhou, Wanding and Laird, Peter W and Shen, Hui}, + journal={Nucleic Acids Research}, + pages={gkw967}, + year={2016}, + publisher={Oxford Univ Press} +} +@article{grossman2016toward, + title={Toward a shared vision for cancer genomic data}, + author={Grossman, Robert L and others}, + journal={New England Journal of Medicine}, + volume={375}, + number={12}, + pages={1109--1112}, + year={2016}, + publisher={Mass Medical Soc} +} + + @Article{mae2017, + title = {Software For The Integration Of Multi-Omics Experiments In Bioconductor}, + author = {Marcel Ramos and others}, + journal = {Cancer Research}, + year = {2017}, + volume = {77(21); e39-42}, + } + + +@article{heinz2010simple, + title={Simple combinations of lineage-determining transcription factors prime cis-regulatory elements required for macrophage and B cell identities}, + author={Heinz, Sven and others}, + journal={Molecular cell}, + volume={38}, + number={4}, + pages={576--589}, + year={2010}, + publisher={Elsevier} +} + +@article {Silva147496, + author = {Silva, Tiago C. and Colaprico, Antonio and Olsen, Catharina and Bontempi, Gianluca and Ceccarelli, Michele and Berman, Benjamin P. and Noushmehr, Houtan}, + title = {TCGAbiolinksGUI: A graphical user interface to analyze cancer molecular and clinical data}, + year = {2017}, + doi = {10.1101/147496}, + publisher = {Cold Spring Harbor Labs Journals}, + URL = {http://www.biorxiv.org/content/early/2017/06/08/147496}, + eprint = {http://www.biorxiv.org/content/early/2017/06/08/147496.full.pdf}, + journal = {bioRxiv} +} + +@article{TCGAbiolinks, +author = {Colaprico, Antonio and Silva, Tiago C. and Olsen, Catharina and Garofano, Luciano and Cava, Claudia and Garolini, Davide and Sabedot, Thais S. and Malta, Tathiane M. and Pagnotta, Stefano M. and Castiglioni, Isabella and Ceccarelli, Michele and Bontempi, Gianluca and Noushmehr, Houtan}, +title = {TCGAbiolinks: an R/Bioconductor package for integrative analysis of TCGA data}, +volume = {44}, +number = {8}, +pages = {e71}, +year = {2016}, +doi = {10.1093/nar/gkv1507}, +URL = {http://nar.oxfordjournals.org/content/44/8/e71.abstract}, +eprint = {http://nar.oxfordjournals.org/content/44/8/e71.full.pdf+html}, +journal = {Nucleic Acids Research} +} + + +@article{zhou2016comprehensive, + title={Comprehensive characterization, annotation and innovative use of Infinium DNA methylation BeadChip probes}, + author={Zhou, Wanding and others}, + journal={Nucleic Acids Research}, + pages={gkw967}, + year={2016}, + publisher={Oxford Univ Press} +} +@article{durinck2009mapping, + title={Mapping identifiers for the integration of genomic datasets with the R/Bioconductor package biomaRt}, + author={Durinck, Steffen and others}, + journal={Nature protocols}, + volume={4}, + number={8}, + pages={1184--1191}, + year={2009}, + publisher={Nature Publishing Group} +} + @article{yates2015ensembl, + title={Ensembl 2016}, + author={Yates, Andrew and others}, + journal={Nucleic acids research}, + pages={gkv1157}, + year={2015}, + publisher={Oxford Univ Press} +} + +@article{yao2015demystifying, + title={Demystifying the secret mission of enhancers: linking distal regulatory elements to target genes}, + author={Yao, Lijing and Berman, Benjamin P and Farnham, Peggy J}, + journal={Critical reviews in biochemistry and molecular biology}, + volume={50}, + number={6}, + pages={550--573}, + year={2015}, + publisher={Taylor \& Francis} +} + +@article{durinck2005biomart, + title={BioMart and Bioconductor: a powerful link between biological databases and microarray data analysis}, + author={Durinck, Steffen and others}, + journal={Bioinformatics}, + volume={21}, + number={16}, + pages={3439--3440}, + year={2005}, + publisher={Oxford Univ Press} +} + +@article{sham2014statistical, + title={Statistical power and significance testing in large-scale genetic studies}, + author={Sham, Pak C and Purcell, Shaun M}, + journal={Nature reviews. Genetics}, + volume={15}, + number={5}, + pages={335}, + year={2014}, + publisher={Nature Publishing Group} +} diff --git a/vignettes/figures/elemer_visualize_motif_enrichment.jpg b/vignettes/figures/elemer_visualize_motif_enrichment.jpg new file mode 100644 index 00000000..b9323c3d Binary files /dev/null and b/vignettes/figures/elemer_visualize_motif_enrichment.jpg differ diff --git a/vignettes/figures/elemer_visualize_schematic_gene.jpg b/vignettes/figures/elemer_visualize_schematic_gene.jpg new file mode 100644 index 00000000..9a82805d Binary files /dev/null and b/vignettes/figures/elemer_visualize_schematic_gene.jpg differ diff --git a/vignettes/figures/elemer_visualize_schematic_probe.jpg b/vignettes/figures/elemer_visualize_schematic_probe.jpg new file mode 100644 index 00000000..2ed8e50e Binary files /dev/null and b/vignettes/figures/elemer_visualize_schematic_probe.jpg differ diff --git a/vignettes/figures/elmer_analysis_TF.png b/vignettes/figures/elmer_analysis_TF.png new file mode 100644 index 00000000..942209f6 Binary files /dev/null and b/vignettes/figures/elmer_analysis_TF.png differ diff --git a/vignettes/figures/elmer_analysis_data.png b/vignettes/figures/elmer_analysis_data.png new file mode 100644 index 00000000..1719e1ea Binary files /dev/null and b/vignettes/figures/elmer_analysis_data.png differ diff --git a/vignettes/figures/elmer_analysis_diffmeth.png b/vignettes/figures/elmer_analysis_diffmeth.png new file mode 100644 index 00000000..c05dfd11 Binary files /dev/null and b/vignettes/figures/elmer_analysis_diffmeth.png differ diff --git a/vignettes/figures/elmer_analysis_final.png b/vignettes/figures/elmer_analysis_final.png new file mode 100644 index 00000000..0a692f5e Binary files /dev/null and b/vignettes/figures/elmer_analysis_final.png differ diff --git a/vignettes/figures/elmer_analysis_groups.png b/vignettes/figures/elmer_analysis_groups.png new file mode 100644 index 00000000..44441610 Binary files /dev/null and b/vignettes/figures/elmer_analysis_groups.png differ diff --git a/vignettes/figures/elmer_analysis_menu.png b/vignettes/figures/elmer_analysis_menu.png new file mode 100644 index 00000000..9fbde0f0 Binary files /dev/null and b/vignettes/figures/elmer_analysis_menu.png differ diff --git a/vignettes/figures/elmer_analysis_message.png b/vignettes/figures/elmer_analysis_message.png new file mode 100644 index 00000000..cea43d26 Binary files /dev/null and b/vignettes/figures/elmer_analysis_message.png differ diff --git a/vignettes/figures/elmer_analysis_motif.png b/vignettes/figures/elmer_analysis_motif.png new file mode 100644 index 00000000..39c91cfa Binary files /dev/null and b/vignettes/figures/elmer_analysis_motif.png differ diff --git a/vignettes/figures/elmer_analysis_pair.png b/vignettes/figures/elmer_analysis_pair.png new file mode 100644 index 00000000..cd77e2ce Binary files /dev/null and b/vignettes/figures/elmer_analysis_pair.png differ diff --git a/vignettes/figures/elmer_data_dnamet.png b/vignettes/figures/elmer_data_dnamet.png new file mode 100644 index 00000000..000b9401 Binary files /dev/null and b/vignettes/figures/elmer_data_dnamet.png differ diff --git a/vignettes/figures/elmer_data_dnamet_select.png b/vignettes/figures/elmer_data_dnamet_select.png new file mode 100644 index 00000000..3f608852 Binary files /dev/null and b/vignettes/figures/elmer_data_dnamet_select.png differ diff --git a/vignettes/figures/elmer_data_exp.png b/vignettes/figures/elmer_data_exp.png new file mode 100644 index 00000000..3ea299d7 Binary files /dev/null and b/vignettes/figures/elmer_data_exp.png differ diff --git a/vignettes/figures/elmer_data_exp_select.png b/vignettes/figures/elmer_data_exp_select.png new file mode 100644 index 00000000..306a9c6e Binary files /dev/null and b/vignettes/figures/elmer_data_exp_select.png differ diff --git a/vignettes/figures/elmer_data_final.jpg b/vignettes/figures/elmer_data_final.jpg new file mode 100644 index 00000000..86efbdb9 Binary files /dev/null and b/vignettes/figures/elmer_data_final.jpg differ diff --git a/vignettes/figures/elmer_data_menu.png b/vignettes/figures/elmer_data_menu.png new file mode 100644 index 00000000..ceb5e72a Binary files /dev/null and b/vignettes/figures/elmer_data_menu.png differ diff --git a/vignettes/figures/elmer_data_saved.png b/vignettes/figures/elmer_data_saved.png new file mode 100644 index 00000000..dd7fa572 Binary files /dev/null and b/vignettes/figures/elmer_data_saved.png differ diff --git a/vignettes/figures/elmer_visualize_plot_scatter_byTF.jpg b/vignettes/figures/elmer_visualize_plot_scatter_byTF.jpg new file mode 100644 index 00000000..388f6574 Binary files /dev/null and b/vignettes/figures/elmer_visualize_plot_scatter_byTF.jpg differ diff --git a/vignettes/figures/elmer_visualize_results.png b/vignettes/figures/elmer_visualize_results.png new file mode 100644 index 00000000..7bf3fccd Binary files /dev/null and b/vignettes/figures/elmer_visualize_results.png differ diff --git a/vignettes/figures/elmer_visualize_select1.png b/vignettes/figures/elmer_visualize_select1.png new file mode 100644 index 00000000..5aa0bbf7 Binary files /dev/null and b/vignettes/figures/elmer_visualize_select1.png differ diff --git a/vignettes/figures/elmer_visualize_select2.png b/vignettes/figures/elmer_visualize_select2.png new file mode 100644 index 00000000..f57d8b7e Binary files /dev/null and b/vignettes/figures/elmer_visualize_select2.png differ diff --git a/vignettes/figures/elmer_visualize_table_enriched_motif.jpg b/vignettes/figures/elmer_visualize_table_enriched_motif.jpg new file mode 100644 index 00000000..427bcb8e Binary files /dev/null and b/vignettes/figures/elmer_visualize_table_enriched_motif.jpg differ diff --git a/vignettes/figures/elmer_visualize_table_sigprobes.jpg b/vignettes/figures/elmer_visualize_table_sigprobes.jpg new file mode 100644 index 00000000..834110cd Binary files /dev/null and b/vignettes/figures/elmer_visualize_table_sigprobes.jpg differ diff --git a/vignettes/figures/elmer_visualize_table_tf.jpg b/vignettes/figures/elmer_visualize_table_tf.jpg new file mode 100644 index 00000000..92933dd9 Binary files /dev/null and b/vignettes/figures/elmer_visualize_table_tf.jpg differ diff --git a/vignettes/figures/elmer_visualize_tfraningplot.jpg b/vignettes/figures/elmer_visualize_tfraningplot.jpg new file mode 100644 index 00000000..8142601a Binary files /dev/null and b/vignettes/figures/elmer_visualize_tfraningplot.jpg differ diff --git a/vignettes/figures/gui_dnamet_prepare.png b/vignettes/figures/gui_dnamet_prepare.png new file mode 100644 index 00000000..6266c548 Binary files /dev/null and b/vignettes/figures/gui_dnamet_prepare.png differ diff --git a/vignettes/figures/gui_dnamet_prepare_completed.png b/vignettes/figures/gui_dnamet_prepare_completed.png new file mode 100644 index 00000000..6e47b8df Binary files /dev/null and b/vignettes/figures/gui_dnamet_prepare_completed.png differ diff --git a/vignettes/figures/gui_exp_prepare.png b/vignettes/figures/gui_exp_prepare.png new file mode 100644 index 00000000..ac07e8bb Binary files /dev/null and b/vignettes/figures/gui_exp_prepare.png differ diff --git a/vignettes/figures/gui_exp_prepare_completed.png b/vignettes/figures/gui_exp_prepare_completed.png new file mode 100644 index 00000000..2fe7d47b Binary files /dev/null and b/vignettes/figures/gui_exp_prepare_completed.png differ diff --git a/vignettes/figures/gui_exp_query_results.jpg b/vignettes/figures/gui_exp_query_results.jpg new file mode 100644 index 00000000..850c225e Binary files /dev/null and b/vignettes/figures/gui_exp_query_results.jpg differ diff --git a/vignettes/figures/gui_exp_results.png b/vignettes/figures/gui_exp_results.png new file mode 100644 index 00000000..a644d24c Binary files /dev/null and b/vignettes/figures/gui_exp_results.png differ diff --git a/vignettes/figures/gui_menu.png b/vignettes/figures/gui_menu.png new file mode 100644 index 00000000..0d74d4f0 Binary files /dev/null and b/vignettes/figures/gui_menu.png differ diff --git a/vignettes/figures/gui_se_select.png b/vignettes/figures/gui_se_select.png new file mode 100644 index 00000000..c9336d96 Binary files /dev/null and b/vignettes/figures/gui_se_select.png differ diff --git a/vignettes/figures/gui_visualize_se.png b/vignettes/figures/gui_visualize_se.png new file mode 100644 index 00000000..c2a47bc6 Binary files /dev/null and b/vignettes/figures/gui_visualize_se.png differ diff --git a/vignettes/figures/paper_diff_meth.jpg b/vignettes/figures/paper_diff_meth.jpg new file mode 100644 index 00000000..13a54994 Binary files /dev/null and b/vignettes/figures/paper_diff_meth.jpg differ diff --git a/vignettes/figures/paper_get.pairs.jpg b/vignettes/figures/paper_get.pairs.jpg new file mode 100644 index 00000000..eb5d7ea4 Binary files /dev/null and b/vignettes/figures/paper_get.pairs.jpg differ diff --git a/vignettes/figures/paper_get_pairs.jpg b/vignettes/figures/paper_get_pairs.jpg new file mode 100644 index 00000000..2f9b010b Binary files /dev/null and b/vignettes/figures/paper_get_pairs.jpg differ diff --git a/vignettes/figures/workflow.png b/vignettes/figures/workflow.png new file mode 100644 index 00000000..f6735b0f Binary files /dev/null and b/vignettes/figures/workflow.png differ diff --git a/vignettes/hexELMER.png b/vignettes/hexELMER.png new file mode 100644 index 00000000..3f58725d Binary files /dev/null and b/vignettes/hexELMER.png differ diff --git a/vignettes/index.Rmd b/vignettes/index.Rmd new file mode 100644 index 00000000..07728ebf --- /dev/null +++ b/vignettes/index.Rmd @@ -0,0 +1,131 @@ +--- +title: "ELMER v.2: An R/Bioconductor package to reconstruct gene regulatory networks from DNA methylation and transcriptome profiles" +author: "Tiago Chedraoui Silva [aut], Lijing Yao [aut], Simon Coetzee [aut], Nicole Gull [ctb], Houtan Noushmehr [ctb], Dennis J. Hazelett [ctb], Peggy Farnham [aut], Hui Shen [ctb], Peter Laird [ctb], De-Chen Lin[ctb], Benjamin P. Berman [aut]" +date: "`r Sys.Date()`" +output: + html_document: + self_contained: true + number_sections: no + theme: flatly + highlight: tango + mathjax: null + toc: true + toc_float: true + toc_depth: 2 + css: style.css +fontsize: 11pt +bibliography: bibliography.bib +vignette: > + %\VignetteIndexEntry{"1 - ELMER v.2: An R/Bioconductor package to reconstruct gene regulatory networks from DNA methylation and transcriptome profiles"} + %\VignetteEngine{knitr::rmarkdown} + \usepackage[utf8]{inputenc} +--- + + + +# Introduction + +DNA methylation can be used to identify functional changes at transcriptional enhancers and other cis-regulatory modules (CRMs) in tumors and other primary disease tissues. Our R/Bioconductor package `r BiocStyle::Biocpkg("ELMER")` (Enhancer Linking by Methylation/Expression Relationships) provides a systematic approach that reconstructs gene regulatory networks (GRNs) by combining methylation and gene expression data derived from the same set of samples. `r BiocStyle::Biocpkg("ELMER")` uses methylation changes at CRMs as the central hub of these networks, using correlation analysis to associate them with both upstream master regulator (MR) transcription factors and downstream target genes. + +This package can be easily applied to TCGA public available cancer data sets and custom DNA methylation and gene expression data sets. + +ELMER analyses have 5 main steps: + + 1. Identify distal probes on HM450K or EPIC arrays. + 2. Identify distal probes with significantly different DNA methylation level + between two groups + 3. Identify putative target genes for differentially methylated distal probes. + 4. Identify enriched motifs for the distalprobes which are significantly + differentially methylated and linked to putative target gene. + 5. Identify regulatory TFs whose expression associate with DNA methylation at enriched motifs. + +# Package workflow + +The package workflow is showed in the figure below: + +![ELMER workflow: ELMER receives as input a DNA methylation object, a gene expression object (both can be either a matrix or a SummarizedExperiment object) and a Genomic Ranges (GRanges) object with distal probes to be used as a filter which can be retrieved using the `get.feature.probe` function. The function `createMAE` will create a Multi Assay Experiment object keeping only samples that have both DNA methylation and gene expression data. Genes will be mapped to genomic position and annotated using ENSEMBL database, while for probes it will add annotation from (http://zwdzwd.github.io/InfiniumAnnotation). This MAE object will be used as input to the next analysis functions. First, it identifies differentially methylated probes followed by the identification of their nearest genes (10 upstream and 10 downstream) through the `get.diff.meth` and `GetNearGenes` functions respectively. For each probe, it will verify if any of the nearby genes were affected by its change in the DNA methylation level and a list of gene and probes pairs will be outputted from `get.pair` function. For the probes in those pairs, it will search for enriched regulatory Transcription Factors motifs with the `get.enriched.motif` function. Finally, the enriched motifs will be correlated with the level of the transcription factor through the `get.TFs` function. In the figure green Boxes represent user input data, blue boxes represent output object, orange boxes represent auxiliary pre-computed data and gray boxes are functions. ](figures/workflow.png) + +# Main differences between ELMER v2 vs ELMER v1 + +## Summary table + +| | ELMER Version 1 | ELMER Version 2 | +|--------------------------------|:----------------------------------------------|:--------------------------------------------------------------------| +| Primary data structure | mee object (custom data structure) | MAE object (Bioconductor data structure) | +| Auxiliary data | Manually created | Programmatically created | +| Number of human TFs | 1,982 | 1,639 (curated list from Lambert, Samuel A., et al.) | +| Number of TF motifs | 91 | 771 (HOCOMOCO v11 database) | +| TF classification | 78 families | 82 families and 331 subfamilies \newline(TFClass database, HOCOMOCO) | +| Analysis performed | Normal vs tumor samples | Group 1 vs group 2 | +| Statistical grouping | **Unsupervised** only | **Unsupervised** or **supervised** using labeled groups | +| TCGA data source | The Cancer Genome Atlas (TCGA) (not available) | The NCI's Genomic Data Commons (GDC) | +| Genome of reference | GRCh37 (hg19) | GRCh37 (hg19)/GRCh38 (hg38) | +| DNA methylation platforms | HM450 | EPIC and HM450 | +| Graphical User Interface (GUI) | None | TCGAbiolinksGUI | +| Automatic report | None | HTML summarizing results | +| Annotations | None | StateHub | + +## Supervised vs Unsupervised mode + +In ELMER v2 we introduce a new concept, the algorithm `mode` that can be either `supervised` or `unsupervised`. +In the unsupervised mode (described in ELMER v1), it is assumed that one of the two groups is a heterogeneous mix of different (sometimes unknown) molecular phenotypes. For instance, in the example of Breast Cancer, normal breast tissues (Group A) are relatively homogenous, whereas Breast tumors fall into multiple molecular subtypes. + +The assumption of the Unsupervised mode is that methylation changes may be restricted to a subset of one or more molecular subtypes, and thus only be present in a fraction of the samples in the test group. For instance, methylation changes related to estrogen signaling may only be present in LuminalA or LuminalB subtypes. + +When this structure is unknown, the Unsupervised mode is the appropriate model, since it only requires changes in a subset of samples (by default, 20%). In contrast, in the Supervised mode, it is assumed that each group represents a more homogenous molecular phenotype, and thus we compare all samples in Group A vs. all samples in Group B. This can be used in the case of direct comparison of tumor subtypes (i.e. Luminal vs. Basal-like tumors), but can also be used in numerous other situations, including sorted cells of different types, or treated vs. untreated samples in perturbation experiments. + +# Installing and loading ELMER + +To install this package from github (development version), start R and enter: + +```{r, eval = FALSE} +devtools::install_github(repo = "tiagochst/ELMER.data") +devtools::install_github(repo = "tiagochst/ELMER") +``` + +To install this package from Bioconductor start R and enter: + +```{r, eval = FALSE} +if (!requireNamespace("BiocManager", quietly=TRUE)) + install.packages("BiocManager") +BiocManager::install("ELMER") +``` + +Then, to load ELMER enter: + +```{r, fig.height=6,echo=FALSE, message=FALSE, warning=FALSE, include=TRUE} +library(ELMER, quietly = TRUE) +``` + +# Citing this work + +If you used ELMER package or its results, please cite: + +* Yao, L., Shen, H., Laird, P. W., Farnham, P. J., & Berman, B. P. "Inferring regulatory element landscapes and transcription factor networks from cancer methylomes." Genome Biol 16 (2015): 105. +* Yao, Lijing, Benjamin P. Berman, and Peggy J. Farnham. "Demystifying the secret mission of enhancers: linking distal regulatory elements to target genes." Critical reviews in biochemistry and molecular biology 50.6 (2015): 550-573. +* Tiago C Silva, Simon G Coetzee, Nicole Gull, Lijing Yao, Dennis J Hazelett, Houtan Noushmehr, De-Chen Lin, Benjamin P Berman; ELMER v.2: An R/Bioconductor package to reconstruct gene regulatory networks from DNA methylation and transcriptome profiles, Bioinformatics, , bty902, https://doi.org/10.1093/bioinformatics/bty902 + +If you get TCGA data using `getTCGA` function, please cite TCGAbiolinks package: + +* Colaprico A, Silva TC, Olsen C, Garofano L, Cava C, Garolini D, Sabedot T, Malta TM, Pagnotta SM, Castiglioni I, Ceccarelli M, Bontempi G and Noushmehr H. "TCGAbiolinks: an R/Bioconductor package for integrative analysis of TCGA data." Nucleic acids research (2015): gkv1507. +* Silva, TC, A Colaprico, C Olsen, F D’Angelo, G Bontempi, M Ceccarelli, and H Noushmehr. 2016. “TCGA Workflow: Analyze Cancer Genomics and Epigenomics Data Using Bioconductor Packages [Version 2; Referees: 1 Approved, 1 Approved with Reservations].” F1000Research 5 (1542). doi:10.12688/f1000research.8923.2. + +* Grossman, Robert L., et al. "Toward a shared vision for cancer genomic data." New England Journal of Medicine 375.12 (2016): 1109-1112. + +If you get use the Graphical user interface, please cite `TCGAbiolinksGUI` package: + +* Silva, Tiago C. and Colaprico, Antonio and Olsen, Catharina and Bontempi, Gianluca and Ceccarelli, Michele and Berman, Benjamin P. and Noushmehr, Houtan. "TCGAbiolinksGUI: A graphical user interface to analyze cancer molecular and clinical data" (bioRxiv 147496; doi: https://doi.org/10.1101/147496) + +# Bugs and questions + +If you have questions, wants to report a bug, please use our github repository: http://www.github.com/tiagochst/ELMER + +# Paper supplemental material + +TCGA-BRCA reports (paper supplemental material) can be found at https://tiagochst.github.io/ELMER_supplemental/ + +# Session Info + +```{r sessioninfo, eval=TRUE} +sessionInfo() +``` diff --git a/vignettes/input.Rmd b/vignettes/input.Rmd new file mode 100644 index 00000000..0d9ab202 --- /dev/null +++ b/vignettes/input.Rmd @@ -0,0 +1,296 @@ +--- +title: "2 - Introduction: Input data" +output: + html_document: + self_contained: true + number_sections: no + theme: flatly + highlight: tango + mathjax: default + toc: true + toc_float: true + toc_depth: 2 + css: style.css + +bibliography: bibliography.bib +vignette: > + %\VignetteIndexEntry{"2 - Introduction: Input data"} + %\VignetteEngine{knitr::rmarkdown} + \usepackage[utf8]{inputenc} +--- + +```{r, fig.height=6,echo=FALSE, message=FALSE, warning=FALSE, include=TRUE} +library(ELMER.data) +library(ELMER) +data(elmer.data.example) +data(LUSC_meth_refined) +data(LUSC_RNA_refined) +library(DT) +``` + +
+ +# Input data + +A Multi Assay Experiment object [@mae2017] is the input for all main functions of `r BiocStyle::Biocpkg("ELMER")` and can be generated by `createMAE` function. + +To perform `r BiocStyle::Biocpkg("ELMER")` analyses, the [Multi Assay Experiment](https://bioconductor.org/packages/release/bioc/html/MultiAssayExperiment.html) needs: + +- a DNA methylation matrix or SummarizedExperiment object from HM450K or EPIC platform for multiple samples; +- a gene expression matrix or SummarizedExperiment object for the same samples; +- a matrix mapping DNA methylation samples to gene expression samples +- a matrix with samples metadata (i.e. clinical data, molecular subtype information). + +If TCGA data are used, the the last two matrices will be automatically generated. +Based on the genome of reference selected, metadata for the DNA methylation probes, such as genomic coordinates, will be added from [Wanding Zhou annotation](http://zwdzwd.github.io/InfiniumAnnotation) [@zhou2016comprehensive]; +and metadata for gene annotation will be added from ensemble database [@yates2015ensembl] using [biomaRt](http://bioconductor.org/packages/biomaRt/) +[@durinck2009mapping]. + + +## DNA methylation data + +DNA methylation data feeding to `r BiocStyle::Biocpkg("ELMER")` should be a matrix of DNA methylation +beta ($\beta$) value for samples (column) and probes (row) processed from row HM450K +array data. If TCGA data is used, processed data from GDC website will be downloaded +and automatically transformed to the matrix by `r BiocStyle::Biocpkg("ELMER")`. The processed TCGA +DNA methylation data were calculated as $\frac{M}{(M+U)}$, where M represents the methylated +allele intensity and U the unmethylated allele intensity. Beta values range from 0 to 1, +reflecting the fraction of methylated alleles at each CpG in the each tumor; beta values +close to 0 indicates low levels of DNA methylation and beta values close to 1 +indicates high levels of DNA methylation. + +If user have raw HM450K data, these data can be processed by `r BiocStyle::Biocpkg("Methylumi")` +or `r BiocStyle::Biocpkg("minfi")` generating DNA methylation beta ($\beta$) value for each CpG site +and multiple samples. The `getBeta` function in `r BiocStyle::Biocpkg("minfi")` can be used to generate a matrix +of DNA methylation beta ($\beta$) value to feed in `r BiocStyle::Biocpkg("ELMER")`. And we recommend to +save this matrix as `meth.rda` since `createMAE` +can read in files by specifying their path which will help to reduce memory usage. + +```{r} +# Example of DNA methylation data input +datatable(Meth[1:10, 1:10], + options = list(scrollX = TRUE, keys = TRUE, pageLength = 5), + rownames = TRUE) +``` + +## Gene expression data + +Gene expresion data feeding to `r BiocStyle::Biocpkg("ELMER")` should be a matrix of gene expression +values for samples (column) and genes (row). Gene expression value can be generated +from different platforms: array or RNA-seq. The row data should be processed by other +software to get gene or transcript level gene expression calls such as mapping by +[tophat](https://ccb.jhu.edu/software/tophat/index.shtml), +calling expression value by [cufflink](https://github.com/cole-trapnell-lab/cufflinks), +[RSEM](https://github.com/deweylab/RSEM) or +[GenomeStudio](http://www.illumina.com/techniques/microarrays/array-data-analysis-experimental-design/genomestudio.html) for expression array. +It is recommended to normalize expression data making gene expression +comparable across samples such as quantile normalization. User can refer TCGA RNA-seq +analysis pipeline to do generate comparable gene expression data. Then transform the +gene expression values from each sample to the matrix for feeding into `r BiocStyle::Biocpkg("ELMER")`. +If users want to use TCGA data, `r BiocStyle::Biocpkg("ELMER")` has functions to download the +RNA-Seq Quantification data (HTSeq-FPKM-UQ) from GDC website and transform the data to the matrix for feeding +into `r BiocStyle::Biocpkg("ELMER")`. It is recommended to save this matrix as `RNA.rda` since `createMAE` +can read in files by specifying the path of files which will help to reduce memory usage. + +```{r} +# Example of Gene expression data input +datatable(GeneExp[1:10, 1:2], + options = list(scrollX = TRUE, keys = TRUE, pageLength = 5), + rownames = TRUE) +``` + + +## Sample information + +Sample information should be stored as a data.frame object containing sample ID, +group labels (control and experiment). Sample ID and groups labels are required. +Other information for each sample can be added to this data.frame object. +When TCGA data were used, samples information will be automatically generated by +`createMAE` function by specifying option `TCGA=TRUE`. A columns name `TN` will +create the groups Tumor and Normal using the following samples to each group: + +Tumor samples are: + +* Primary solid Tumor +* Recurrent Solid Tumor +* Primary Blood Derived Cancer - Peripheral Blood +* Recurrent Blood Derived Cancer - Bone Marrow +* Additional - New Primary +* Metastatic +* Additional Metastatic +* Human Tumor Original Cells +* Primary Blood Derived Cancer - Bone Marrow + +Normal samples: + +* Blood Derived Normal +* Solid Tissue Normal +* Buccal Cell Normal +* EBV Immortalized Normal +* Bone Marrow Normal + +```{r, message=FALSE} +library(MultiAssayExperiment) +data <- createMAE( + exp = GeneExp, + met = Meth, + met.platform = "450K", + genome = "hg19", + save = FALSE, + TCGA = TRUE +) +data +as.data.frame(colData(data)[,c("patient","definition","TN")]) %>% + datatable(options = list(scrollX = TRUE,pageLength = 5)) + +# Adding sample information for non TCGA samples +# You should have two objects with one for DNA methylation and +# one for gene expression. They should have the same number of samples and the names of the +# sample in the gene expression object and in hte DNA methylation matrix +# should be the same +not.tcga.exp <- GeneExp # 234 samples +colnames(not.tcga.exp) <- substr(colnames(not.tcga.exp),1,15) +not.tcga.met <- Meth # 268 samples +colnames(not.tcga.met) <- substr(colnames(not.tcga.met),1,15) +# Number of samples in both objects (234) +table(colnames(not.tcga.met) %in% colnames(not.tcga.exp)) + +# Our sample information must have as row names the samples information +phenotype.data <- data.frame(row.names = colnames(not.tcga.exp), + primary = colnames(not.tcga.exp), + group = c(rep("group1", ncol(GeneExp)/2), + rep("group2", ncol(GeneExp)/2))) + +data.hg19 <- createMAE(exp = not.tcga.exp, + met = not.tcga.met, + TCGA = FALSE, + met.platform = "450K", + genome = "hg19", + colData = phenotype.data) +data.hg19 + +# The samples that does not have data for both DNA methylation and Gene exprssion will be removed even for the phenotype data +phenotype.data <- data.frame(row.names = colnames(not.tcga.met), + primary = colnames(not.tcga.met), + group = c(rep("group1", ncol(Meth)/4), + rep("group2", ncol(Meth)/4), + rep("group3", ncol(Meth)/4), + rep("group4", ncol(Meth)/4))) + +data.hg38 <- createMAE(exp = not.tcga.exp, + met = not.tcga.met, + TCGA = FALSE, + save = FALSE, + met.platform = "450K", + genome = "hg38", + colData = phenotype.data) +data.hg38 +as.data.frame(colData(data.hg38)[1:20,]) %>% + datatable(options = list(scrollX = TRUE,pageLength = 5)) +``` + +## Probe information + +Probe information is stored as a GRanges object containing the coordinates +of each probe on the DNA methylation array and names of each probe. +The default probe information is fetching from [Wanding Zhou annotation](http://zwdzwd.github.io/InfiniumAnnotation) [@zhou2016comprehensive] + +```{r, message=FALSE} +library(SummarizedExperiment, quietly = TRUE) +rowRanges(getMet(data))[1:3,1:8] +``` + +## Gene information + +Gene information is stored as a GRanges object containing coordinates of +each gene, gene id, gene symbol and gene isoform id. The default gene information +is the ensembl gene annotation fetched from `r BiocStyle::Biocpkg("biomaRt")` by `r BiocStyle::Biocpkg("ELMER")` +function. + +```{r} +rowRanges(getExp(data)) +``` + +# Multi Assay Experiment object + +A Multi Assay Experiment object from the `r BiocStyle::Biocpkg("MultiAssayExperiment")` +package is the input for multiple main functions of `r BiocStyle::Biocpkg("ELMER")`. +It contains the above components and making a Multi Assay Experiment object by `createMAE` function will keep each +component consistent with each other. For example, althougth DNA methylation and gene +expression matrixes have different rows (probe for DNA methylation and gene id for gene +expression), the column (samples) order should be same in the two matrixes. The `createMAE` +function will keep them consistent when it generates the Multi Assay Experiment object. + + +```{r, message=FALSE} +data <- createMAE(exp = GeneExp, + met = Meth, + genome = "hg19", + save = FALSE, + met.platform = "450K", + TCGA = TRUE) + +# For TGCA data 1-12 represents the patient and 1-15 represents the sample ID (i.e. primary solid tumor samples ) +all(substr(colnames(getExp(data)),1,15) == substr(colnames(getMet(data)),1,15)) + +# See sample information for data +as.data.frame(colData(data)) %>% datatable(options = list(scrollX = TRUE)) + +# See sample names for each experiment +as.data.frame(sampleMap(data)) %>% datatable(options = list(scrollX = TRUE)) +``` + +You can also use your own data and annotations to create Multi Assay Experiment object. + +```{r, message=FALSE} +# NON TCGA example: matrices has different column names +gene.exp <- S4Vectors::DataFrame(sample1.exp = c("ENSG00000141510"=2.3,"ENSG00000171862"=5.4), + sample2.exp = c("ENSG00000141510"=1.6,"ENSG00000171862"=2.3)) +dna.met <- S4Vectors::DataFrame(sample1.met = c("cg14324200"=0.5,"cg23867494"=0.1), + sample2.met = c("cg14324200"=0.3,"cg23867494"=0.9)) +sample.info <- S4Vectors::DataFrame(primary = c("sample1","sample2"), + sample.type = c("Normal", "Tumor")) +sampleMap <- S4Vectors::DataFrame( + assay = c("Gene expression","DNA methylation","Gene expression","DNA methylation"), + primary = c("sample1","sample1","sample2","sample2"), + colname = c("sample1.exp","sample1.met","sample2.exp","sample2.met")) +mae <- createMAE(exp = gene.exp, + met = dna.met, + sampleMap = sampleMap, + met.platform ="450K", + colData = sample.info, + genome = "hg38") +# You can also use sample Mapping and Sample information tables from a tsv file +# You can use the createTSVTemplates function to create the tsv files +readr::write_tsv(as.data.frame(sampleMap), path = "sampleMap.tsv") +readr::write_tsv(as.data.frame(sample.info), path = "sample.info.tsv") +mae <- createMAE(exp = gene.exp, + met = dna.met, + sampleMap = "sampleMap.tsv", + met.platform ="450K", + colData = "sample.info.tsv", + genome = "hg38") +mae + +# NON TCGA example: matrices has same column names +gene.exp <- S4Vectors::DataFrame(sample1 = c("ENSG00000141510"=2.3,"ENSG00000171862"=5.4), + sample2 = c("ENSG00000141510"=1.6,"ENSG00000171862"=2.3)) +dna.met <- S4Vectors::DataFrame(sample1 = c("cg14324200"=0.5,"cg23867494"=0.1), + sample2= c("cg14324200"=0.3,"cg23867494"=0.9)) +sample.info <- S4Vectors::DataFrame(primary = c("sample1","sample2"), + sample.type = c("Normal", "Tumor")) +sampleMap <- S4Vectors::DataFrame( + assay = c("Gene expression","DNA methylation","Gene expression","DNA methylation"), + primary = c("sample1","sample1","sample2","sample2"), + colname = c("sample1","sample1","sample2","sample2") +) +mae <- createMAE(exp = gene.exp, + met = dna.met, + sampleMap = sampleMap, + met.platform ="450K", + colData = sample.info, + genome = "hg38") +mae +``` + +# Bibliography \ No newline at end of file diff --git a/vignettes/pipe.Rmd b/vignettes/pipe.Rmd new file mode 100644 index 00000000..efeaf689 --- /dev/null +++ b/vignettes/pipe.Rmd @@ -0,0 +1,141 @@ +--- +title: "3.6 - TCGA.pipe: Running ELMER for TCGA data in a compact way" +output: + html_document: + self_contained: true + number_sections: no + theme: flatly + highlight: tango + mathjax: null + toc: true + toc_float: true + toc_depth: 2 + css: style.css + +bibliography: bibliography.bib +vignette: > + %\VignetteIndexEntry{"3.6 - TCGA.pipe: Running ELMER for TCGA data in a compact way"} + %\VignetteEngine{knitr::rmarkdown} + \usepackage[utf8]{inputenc} +--- +
+ +# TCGA.pipe: Running ELMER for TCGA data in a compact way + +`TCGA.pipe` is a function for easily downloading TCGA data from GDC using TCGAbiolinks package [@TCGAbiolinks] +and performing all the analyses in ELMER. For illustration purpose, we skip the downloading step. +The user can use the `getTCGA` function to download TCGA data or +use `TCGA.pipe` by including "download" in the analysis option. + +The following command will do distal DNA methylation analysis and predict putative target genes, motif analysis and identify regulatory transcription factors. + +```{r, fig.height = 6, eval = FALSE} +TCGA.pipe("LUSC", + wd = "./ELMER.example", + cores = parallel::detectCores()/2, + mode = "unsupervised" + permu.size = 300, + Pe = 0.01, + analysis = c("distal.probes","diffMeth","pair","motif","TF.search"), + diff.dir = "hypo", + rm.chr = paste0("chr",c("X","Y"))) +``` + +
+
TCGA.pipe: Mode argument
+
+ +In this new version we added the argument `mode` in the `TCGA.pipe` function. +This will automatically set the `minSubgroupFrac` to the following values: + +Modes available: + +- `unsupervised`: + * Use 20% of each group to identify differently methylated regions (`minSubgroupFrac` = 0.2 in `get.diff.meth`) + * Use 40% of all samples to create Unmethytlated (U) and Methylated (M) groups in the other steps (the lowest quintile of samples is the U group and the highest quintile samples is the M group) (`minSubgroupFrac` = 0.4 in `get.pairs` and `get.TFs` functions) +- `supervised`: + * Use all samples in all functions and set Unmethytlated (U) and Methylated (M) one of the group selected in the analysis. + +The `unsupervised` mode should be used when want to be able to detect a specific (possibly unknown) molecular subtype among tumor; +these subtypes often make up only a minority of samples, and 20\% was chosen as a lower bound for the purposes of statistical power. +If you are using pre-defined group labels, such as treated replicates vs. untreated replicated, use `supervised` mode (all samples), + +For more information please read the analysis section of the vignette. +
+
+ +# Using mutation data to identify groups + +We add in `TCGA.pipe` function (download step) the option to identify mutant samples to perform WT vs Mutant analysis. +It will download open [MAF file](https://docs.gdc.cancer.gov/Data/File_Formats/MAF_Format/) +from GDC database [@grossman2016toward], select a gene and identify the which are the mutant samples based on the following classification: +(it can be changed using the atgument `mutant_variant_classification`). + +
+
Mutations classification
+
+| Argument | Description | +|------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| Frame_Shift_Del | Mutant | +| Frame_Shift_Ins | Mutant | +| Missense_Mutation | Mutant | +| Nonsense_Mutation | Mutant | +| Splice_Site | Mutant | +| In_Frame_Del | Mutant | +| In_Frame_Ins | Mutant | +| Translation_Start_Site | Mutant | +| Nonstop_Mutation | Mutant | +| Silent | WT | +|3'UTR| WT | +|5'UTR| WT | +|3'Flank| WT | +|5'Flank| WT | +|IGR1 (intergenic region)| WT | +|Intron| WT | +|RNA| WT | +|Target_region| WT | +
+
+ +The arguments to be used are below: + +
+
`TCGA.pipe` mutation arguments
+
+| Argument | Description | +|------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| genes | List of genes for which mutations will be verified. A column in the MAE with the name of the gene will be created with two groups WT (tumor samples without mutation), MUT (tumor samples w/ mutation), NA (not tumor samples)| +| mutant_variant_classification | List of GDC variant classification from MAF files to consider a samples mutant. Only used when argument gene is set.| +| group.col | A column defining the groups of the sample. You can view the available columns using: colnames(MultiAssayExperiment::colData(data)).| +| group1 | A group from group.col. ELMER will run group1 vs group2. That means, if direction is hyper, get probes hypermethylated in group 1 compared to group 2.| +| group2 | A group from group.col. ELMER will run group1 vs group2. That means, if direction is hyper, get probes hypermethylated in group 1 compared to group 2.| +
+
+ +Here is an example we TCGA-LUSC data is downloaded and we will compare TP53 Mutant vs +TP53 WT samples. + + +```{r, fig.height = 6, eval = FALSE} +TCGA.pipe("LUSC", + wd = "./ELMER.example", + cores = parallel::detectCores()/2, + mode = "supervised" + genes = "TP53", + group.col = "TP53", + group1 = "Mutant", + group2 = "WT", + permu.size = 300, + Pe = 0.01, + analysis = c("download","diffMeth","pair","motif","TF.search"), + diff.dir = "hypo", + rm.chr = paste0("chr",c("X","Y"))) +``` + +# Session Info + +```{r sessioninfo, eval=TRUE} +sessionInfo() +``` + +# Bibliography diff --git a/vignettes/plots_TF.Rmd b/vignettes/plots_TF.Rmd new file mode 100644 index 00000000..f01716df --- /dev/null +++ b/vignettes/plots_TF.Rmd @@ -0,0 +1,50 @@ +--- +title: "4.4 - Regulatory TF plots" +output: + html_document: + self_contained: true + number_sections: no + theme: flatly + highlight: tango + mathjax: null + toc: true + toc_float: true + toc_depth: 2 + css: style.css + +bibliography: bibliography.bib +vignette: > + %\VignetteIndexEntry{"4.4 - Regulatory TF plots"} + %\VignetteEngine{knitr::rmarkdown} + \usepackage[utf8]{inputenc} +--- + +```{r, echo = FALSE,hide=TRUE, message=FALSE, warning=FALSE} +library(ELMER.data) +library(ELMER) +library(DT) +library(dplyr) +library(BiocStyle) +``` +
+ +# TF ranking plot + +For a given enriched motif, all human TF are ranked by the statistical $-log_{10}(P-value)$ assessing the anti-correlation level of candidate Master Regulator TF expression with average DNA methylation level for sites with the given motif. As a result, the most anti-correlated TFs will be ranked in the first positions. By default, the top 3 most anti-correlated TFs and all TF classified by TFClass database in the same (sub)family are highlighted with colors blue, red and orange, respectively. + + +## TF ranking plot: family classification + +Shown are TF ranking plots based on the score ($-log_{10}(P value))$ of association between TF expression and DNA methylation of an enriched motif in the LUSC cancer type. The dashed line indicates the boundary of the top 5% association score. The top 3 associated TFs and the TF family members=(dots in red) that are associated with that specific motif are labeled in the plot + +```{r,eval=TRUE,fig.cap=" TF ranking plot: For a given enriched motif, all human TF are ranked by the statistical $-log_{10}(P-value)$ assessing the anti-correlation level of candidate Master Regulator TF expression with average DNA methylation level for sites with the given motif. As a result, the most anti-correlated TFs will be ranked in the first positions. By default, the top 3 most anti-correlated TFs, and all TF classified by TFClass database in the same family and subfamily are highlighted with colors blue, red and orange, respectively."} +load("result/getTF.hypo.TFs.with.motif.pvalue.rda") +motif <- colnames(TF.meth.cor)[1] +TF.rank.plot( + motif.pvalue = TF.meth.cor, + motif = motif, + save = FALSE +) +``` + + diff --git a/vignettes/plots_heatmap.Rmd b/vignettes/plots_heatmap.Rmd new file mode 100644 index 00000000..33409f5b --- /dev/null +++ b/vignettes/plots_heatmap.Rmd @@ -0,0 +1,56 @@ +--- +title: "4.5 - Heatmap plots" +output: + html_document: + self_contained: true + number_sections: no + theme: flatly + highlight: tango + mathjax: null + toc: true + toc_float: true + toc_depth: 2 + css: style.css + +bibliography: bibliography.bib +vignette: > + %\VignetteIndexEntry{"4.5 - Heatmap plots"} + %\VignetteEngine{knitr::rmarkdown} + \usepackage[utf8]{inputenc} +--- + +```{r, echo = FALSE,hide=TRUE, message=FALSE, warning=FALSE} +library(ELMER.data) +library(ELMER) +library(DT) +library(dplyr) +library(BiocStyle) +``` +
+ +# Loading required data for plot +```{r,eval=TRUE, message=FALSE, warning = FALSE, results = "hide"} +# Load results from previous sections +mae <- get(load("mae.rda")) +``` + +# Heatmap + +Generate a heatmap of paired probes. + +```{r results='hide', echo=TRUE,eval=F, message=FALSE, warning=FALSE, fig.width=10, fig.height=5, fig.cap="Heatmap of paired pairs."} + +pair <- read.csv("result/getPair.hypo.pairs.significant.csv") + +heatmapPairs(data = mae, + group.col = "definition", + group1 = "Primary solid Tumor", + annotation.col = c("years_smoked","gender"), + group2 = "Solid Tissue Normal", + pairs = pair, + filename = NULL) + +``` + + + diff --git a/vignettes/plots_motif_enrichment.Rmd b/vignettes/plots_motif_enrichment.Rmd new file mode 100644 index 00000000..5971b24c --- /dev/null +++ b/vignettes/plots_motif_enrichment.Rmd @@ -0,0 +1,48 @@ +--- +title: "4.3 - Motif enrichment plots" +output: + html_document: + self_contained: true + number_sections: no + theme: flatly + highlight: tango + mathjax: null + toc: FALSE + toc_float: FALSE + toc_depth: 2 + css: style.css + +bibliography: bibliography.bib +vignette: > + %\VignetteIndexEntry{"4.3 - Motif enrichment plots"} + %\VignetteEngine{knitr::rmarkdown} + \usepackage[utf8]{inputenc} +--- + + +```{r, echo = FALSE,hide=TRUE, message=FALSE, warning=FALSE} +library(ELMER) +library(DT) +library(dplyr) +library(BiocStyle) +``` +
+ +# Motif enrichment plot +Motif enrichment plot shows the enrichment levels for the selected motifs. + +The plot shows the Odds Ratio (x axis) for the selected motifs with OR above 1.3 and lower boundary of OR above 1.1. The range shows the 95% confidence interval for each Odds Ratio +```{r results='hide', eval=TRUE, fig.height=6,fig.cap="The plot shows the Odds Ratio (x axis) for the selected motifs with OR above 1.3 and lower boundary of OR above 1.3. The range shows the 95% confidence interval for each Odds Ratio."} +motif.enrichment.plot(motif.enrichment = "result/getMotif.hypo.motif.enrichment.csv", + significant = list(OR = 1.5,lowerOR = 1.3), + label = "hypo", + save = FALSE) +``` + +```{r results='hide', eval=TRUE, fig.height=10,fig.cap="The plot shows the Odds Ratio (x axis) for the selected motifs with OR above 1.3 and lower boundary of OR above 1.3. The range shows the 95% confidence interval for each Odds Ratio."} +motif.enrichment.plot(motif.enrichment = "result/getMotif.hypo.motif.enrichment.csv", + significant = list(OR = 1.5,lowerOR = 1.3), + label = "hypo", + summary = TRUE, + save = FALSE) +``` diff --git a/vignettes/plots_scatter.Rmd b/vignettes/plots_scatter.Rmd new file mode 100644 index 00000000..116481c1 --- /dev/null +++ b/vignettes/plots_scatter.Rmd @@ -0,0 +1,78 @@ +--- +title: "4.1 - Scatter plots" +output: + html_document: + self_contained: true + number_sections: no + theme: flatly + highlight: tango + mathjax: null + toc: true + toc_float: true + toc_depth: 2 + css: style.css + +bibliography: bibliography.bib +vignette: > + %\VignetteIndexEntry{"4.1 - Scatter plots"} + %\VignetteEngine{knitr::rmarkdown} + \usepackage[utf8]{inputenc} +--- + +
+ +```{r, echo = FALSE,hide=TRUE, message=FALSE, warning=FALSE} +library(ELMER.data) +library(ELMER) +library(DT) +library(dplyr) +library(BiocStyle) +``` +# Loading required data for plot +```{r,eval=TRUE, message=FALSE, warning = FALSE, results = "hide"} +# Load results from previous sections +mae <- get(load("mae.rda")) +``` + +# Scatter plots + +## Scatter plot of one probe and its nearby genes + +Generate scatter plots for one probes' nearby 20 gene expression vs DNA methylation at this probe. + +Each scatter plot shows the methylation level of an example probe cg19403323 in all LUSC samples plotted against the expression of one of 20 adjacent genes. +```{r results='hide', echo=TRUE, message=FALSE, warning=FALSE, fig.height=5, fig.cap="Each scatter plot shows the methylation level of an example probe cg19403323 in all LUSC samples plotted against the expression of one of 20 adjacent genes."} +scatter.plot(data = mae, + byProbe = list(probe = c("cg19403323"), numFlankingGenes = 20), + category = "definition", + lm = TRUE, # Draw linear regression curve + save = FALSE) +``` + +## Scatter plot of one pair +Generate a scatter plot for one probe-gene pair. Figure \ref{fig:figure2} + +Scatter plot shows the methylation level of an example probe cg19403323 in all LUSC samples plotted against the expression of the putative target gene SYT14. +```{r results='hide',eval=TRUE, fig.cap="Scatter plot shows the methylation level of an example probe cg19403323 in all LUSC samples plotted against the expression of the putative target gene SYT14."} +scatter.plot(data = mae, + byPair = list(probe = c("cg19403323"), gene = c("ENSG00000143469")), + category = "definition", save = TRUE, lm_line = TRUE) +``` + +## TF expression vs. average DNA methylation +Generate scatter plot for TF expression vs average DNA methylation of the sites +with certain motif. + +Each scatter plot shows the average methylation level of sites with the TP53 motif in all LUSC samples plotted against the expression of the transcription factor TP53, TP63, TP73 respectively. + +```{r,eval=TRUE, warning=FALSE, fig.cap="Each scatter plot shows the average methylation level of sites with the first enriched motif in all LUSC samples plotted against the expression of the transcription factor TP53, SOX2 respectively."} +load("result/getMotif.hypo.enriched.motifs.rda") +names(enriched.motif)[1] +scatter.plot(data = mae, + byTF = list(TF = c("TP53","SOX2"), + probe = enriched.motif[[names(enriched.motif)[1]]]), + category = "definition", + save = TRUE, + lm_line = TRUE) +``` + diff --git a/vignettes/plots_schematic.Rmd b/vignettes/plots_schematic.Rmd new file mode 100644 index 00000000..35de0dd0 --- /dev/null +++ b/vignettes/plots_schematic.Rmd @@ -0,0 +1,65 @@ +--- +title: "4.2 - Schematic plots" +output: + html_document: + self_contained: true + number_sections: no + theme: flatly + highlight: tango + mathjax: null + toc: true + toc_float: true + toc_depth: 2 + css: style.css + +bibliography: bibliography.bib +vignette: > + %\VignetteIndexEntry{"4.2 - Schematic plots"} + %\VignetteEngine{knitr::rmarkdown} + \usepackage[utf8]{inputenc} +--- + +```{r, echo = FALSE,hide=TRUE, message=FALSE, warning=FALSE} +library(ELMER) +library(DT) +library(dplyr) +library(BiocStyle) +``` +
+ +# Loading required data for plot +```{r,eval=TRUE, message=FALSE, warning = FALSE, results = "hide"} +# Load results from previous sections +mae <- get(load("mae.rda")) +pair <- read.csv("result/getPair.hypo.pairs.significant.csv") +``` + +# Schematic plot +Schematic plot shows a brief view of linkages between genes and probes. + +## Nearby Genes +Generate schematic plot for one probe with 20 nearby genes and label the gene significantly linked with the probe in red. + +```{r results='hide', eval=TRUE,fig.height=5, fig.cap="The schematic plot shows probe colored in blue and the location of nearby 20 genes. The genes significantly linked to the probe were shown in red.", message=FALSE, warning=FALSE} +schematic.plot( + pair = pair, + data = mae, + group.col = "definition", + byProbe = pair$Probe[1], + save = FALSE +) +``` + +## Nearby Probes +Generate schematic plot for one gene with the probes which the gene is significantly +linked to. + +```{r results='hide', eval=TRUE, fig.width=6, fig.height=10, fig.cap="The schematic plot shows the gene colored in red and all blue colored probes, which are significantly linked to the expression of this gene."} +schematic.plot( + pair = pair, + data = mae, + group.col = "definition", + byGene = pair$GeneID[1], + save = FALSE +) +``` diff --git a/vignettes/style.css b/vignettes/style.css new file mode 100644 index 00000000..233d51ed --- /dev/null +++ b/vignettes/style.css @@ -0,0 +1,85 @@ +h1.title { + font-size: 28px; +} +body{ + font-family: Helvetica; + font-size: 10pt; + text-align: justify +} +/* Headers */ +h1,h2,h3,h4,h5,h6{ + font-size: 20pt; +} + +h4.author{ + font-size: 10pt; +} +h4.date{ + font-size: 10pt; +} +.html-widget { + margin-bottom: 1em; +} +h1 .header-section-number::after { + content: "."; +} +.list-group-item.active, .list-group-item.active:hover, .list-group-item.active:focus { + background-color: #336699; +} + +.navbar-default { + background-color: #336699; + border-bottom-color: #00CD00; +} + +.navbar-default .navbar-nav>li>a { + color: #ffffff; +} + +.navbar-default .navbar-nav>.active>a, .navbar-default .navbar-nav>.active>a:hover, .navbar-default .navbar-nav>.active>a:focus { + color: #ffffff; + /* background-color: #336699;*/ + background-color: #00CD00; +} + +.navbar-default .navbar-nav>li>a:hover, .navbar-default .navbar-nav>li>a:focus { + color: #00CD00; + background-color: transparent; +} + +.navbar-default .navbar-brand { + color: #00CD00; + background-color: transparent; +} + +.navbar-default .navbar-brand:hover, .navbar-default .navbar-brand:focus { + color: #00CD00; + background-color: transparent; +} + + +.navbar { + border-width: 3px; +} + +a:hover, a:focus { + color: #00CD00; + text-decoration: underline; +} + +a { + color: #00CD00; + text-decoration: none; +} + +.navbar-default .navbar-nav>li>a:hover, .navbar-default .navbar-nav>li>a:focus { + color: #00CD00; + background-color: #1a242f; +} + + +th { + background-color: #336699; + color: white; +} +tr:nth-child(even) {background-color: #f2f2f2;} diff --git a/vignettes/usecase.Rmd b/vignettes/usecase.Rmd new file mode 100644 index 00000000..39e92e3b --- /dev/null +++ b/vignettes/usecase.Rmd @@ -0,0 +1,444 @@ +--- +title: "11 - ELMER: Use case" +output: + html_document: + self_contained: true + number_sections: no + theme: flatly + highlight: tango + mathjax: null + toc: true + toc_float: true + toc_depth: 2 + css: style.css +fontsize: 11pt +bibliography: bibliography.bib +vignette: > + %\VignetteIndexEntry{"11 - ELMER: Use case"} + %\VignetteEngine{knitr::rmarkdown} + \usepackage[utf8]{inputenc} +--- + +
+ +# Introduction + +This section contains the complete ELMER code for the following analysis: + +* Vignette example +* BRCA Supervised analysis +* BRCA Unsupervised analysis + +# Vignette example + +Below is the complete code that was explained in the other sections. + +```{r, eval=FALSE} +library(MultiAssayExperiment) +library(ELMER.data) +library(ELMER) +# get distal probes that are 2kb away from TSS on chromosome 1 +distal.probes <- get.feature.probe( + genome = "hg19", + met.platform = "450K", + rm.chr = paste0("chr",c(2:22,"X","Y")) +) +data(LUSC_RNA_refined,package = "ELMER.data") # GeneExp +data(LUSC_meth_refined,package = "ELMER.data") # Meth + +mae <- createMAE( + exp = GeneExp, + met = Meth, + save = TRUE, + linearize.exp = TRUE, + save.filename = "mae.rda", + filter.probes = distal.probes, + met.platform = "450K", + genome = "hg19", + TCGA = TRUE +) + +group.col <- "definition" +group1 <- "Primary solid Tumor" +group2 <- "Solid Tissue Normal" +dir.out <- "result" +diff.dir <- "hypo" # Search for hypomethylated probes in group 1 + +sig.diff <- get.diff.meth( + data = mae, + group.col = group.col, + group1 = group1, + group2 = group2, + minSubgroupFrac = 0.2, + sig.dif = 0.3, + diff.dir = diff.dir, + cores = 1, + dir.out = dir.out, + pvalue = 0.01 +) + + +nearGenes <- GetNearGenes( + data = mae, + probes = sig.diff$probe, + numFlankingGenes = 20 +) # 10 upstream and 10 dowstream genes + +pair <- get.pair( + data = mae, + group.col = group.col, + group1 = group1, + mode = "unsupervised", + group2 = group2, + nearGenes = nearGenes, + diff.dir = diff.dir, + minSubgroupFrac = 0.4, # % of samples to use in to create groups U/M + permu.dir = file.path(dir.out,"permu"), + permu.size = 100, # Please set to 100000 to get significant results + raw.pvalue = 0.05, + Pe = 0.01, # Please set to 0.001 to get significant results + filter.probes = TRUE, # See preAssociationProbeFiltering function + filter.percentage = 0.05, + filter.portion = 0.3, + dir.out = dir.out, + cores = 1, + label = diff.dir +) + +# Identify enriched motif for significantly hypomethylated probes which +# have putative target genes. +enriched.motif <- get.enriched.motif( + data = mae, + probes = pair$Probe, + dir.out = dir.out, + label = diff.dir, + min.incidence = 10, + lower.OR = 1.1 +) + +TF <- get.TFs( + data = mae, + mode = "unsupervised", + group.col = group.col, + group1 = group1, + group2 = group2, + enriched.motif = enriched.motif, + dir.out = dir.out, + cores = 1, + label = diff.dir +) + + +``` + +# BRCA Unsupervised analysis + +```{r, eval=FALSE} +library(stringr) +library(TCGAbiolinks) +library(dplyr) +library(ELMER) +library(MultiAssayExperiment) +library(parallel) +library(readr) +dir.create("~/paper_elmer/",showWarnings = FALSE) +setwd("~/paper_elmer/") + +file <- "mae_BRCA_hg38_450K_no_ffpe.rda" +if(file.exists(file)) { + mae <- get(load(file)) +} else { + getTCGA( + disease = "BRCA", # TCGA disease abbreviation (BRCA,BLCA,GBM, LGG, etc) + basedir = "DATA", # Where data will be downloaded + genome = "hg38" + ) # Genome of refenrece "hg38" or "hg19" + + distal.probes <- get.feature.probe( + feature = NULL, + genome = "hg38", + met.platform = "450K" + ) + + + mae <- createMAE( + exp = "~/paper_elmer/Data/BRCA/BRCA_RNA_hg38.rda", + met = "~/paper_elmer/Data/BRCA/BRCA_meth_hg38.rda", + met.platform = "450K", + genome = "hg38", + linearize.exp = TRUE, + filter.probes = distal.probes, + met.na.cut = 0.2, + save = FALSE, + TCGA = TRUE + ) + # Remove FFPE samples from the analysis + mae <- mae[,!mae$is_ffpe] + + # Get molecular subytpe information from cell paper and more metadata (purity etc...) + # https://doi.org/10.1016/j.cell.2015.09.033 + file <- "http://ars.els-cdn.com/content/image/1-s2.0-S0092867415011952-mmc2.xlsx" + downloader::download(file, basename(file)) + subtypes <- readxl::read_excel(basename(file), skip = 2) + + subtypes$sample <- substr(subtypes$Methylation,1,16) + meta.data <- merge(colData(mae),subtypes,by = "sample",all.x = T) + meta.data <- meta.data[match(colData(mae)$sample,meta.data$sample),] + meta.data <- S4Vectors::DataFrame(meta.data) + rownames(meta.data) <- meta.data$sample + stopifnot(all(meta.data$patient == colData(mae)$patient)) + colData(mae) <- meta.data + save(mae, file = "mae_BRCA_hg38_450K_no_ffpe.rda") +} +dir.out <- "BRCA_unsupervised_hg38/hypo" +cores <- 10 +diff.probes <- get.diff.meth( + data = mae, + group.col = "definition", + group1 = "Primary solid Tumor", + group2 = "Solid Tissue Normal", + diff.dir = "hypo", # Get probes hypometh. in group 1 + cores = cores, + minSubgroupFrac = 0.2, # % group samples used. + pvalue = 0.01, + sig.dif = 0.3, + dir.out = dir.out, + save = TRUE +) + +# For each differently methylated probes we will get the +# 20 nearby genes (10 downstream and 10 upstream) +nearGenes <- GetNearGenes( + data = mae, + probes = diff.probes$probe, + numFlankingGenes = 20 +) + +# This step is the most time consuming. Depending on the size of the groups +# and the number of probes found previously it migh take hours +Hypo.pair <- get.pair( + data = mae, + nearGenes = nearGenes, + group.col = "definition", + group1 = "Primary solid Tumor", + group2 = "Solid Tissue Normal", + permu.dir = paste0(dir.out,"/permu"), + permu.size = 10000, + mode = "unsupervised", + minSubgroupFrac = 0.4, # 40% of samples to create U and M + raw.pvalue = 0.001, + Pe = 0.001, + filter.probes = TRUE, + filter.percentage = 0.05, + filter.portion = 0.3, + dir.out = dir.out, + cores = cores, + label = "hypo" +) +# Number of pairs: 2950 + + +enriched.motif <- get.enriched.motif( + data = mae, + min.motif.quality = "DS", + probes = unique(Hypo.pair$Probe), + dir.out = dir.out, + label = "hypo", + min.incidence = 10, + lower.OR = 1.1 +) +TF <- get.TFs( + data = mae, + group.col = "definition", + group1 = "Primary solid Tumor", + group2 = "Solid Tissue Normal", + minSubgroupFrac = 0.4, # Set to 1 if supervised mode + enriched.motif = enriched.motif, + dir.out = dir.out, + cores = cores, + label = "hypo" +) + +``` + +# BRCA Supervised analysis + +```{r, eval=FALSE} +library(stringr) +library(TCGAbiolinks) +library(dplyr) +library(ELMER) +library(MultiAssayExperiment) +library(parallel) +library(readr) +#----------------------------------- +# 1 - Samples +# ---------------------------------- +dir.create("~/paper_elmer/",showWarnings = FALSE) +setwd("~/paper_elmer/") + +file <- "mae_BRCA_hg38_450K_no_ffpe.rda" +if(file.exists(file)) { + mae <- get(load(file)) +} else { + getTCGA( + disease = "BRCA", # TCGA disease abbreviation (BRCA,BLCA,GBM, LGG, etc) + basedir = "DATA", # Where data will be downloaded + genome = "hg38" + ) # Genome of refenrece "hg38" or "hg19" + + distal.probes <- get.feature.probe( + feature = NULL, + genome = "hg38", + met.platform = "450K" + ) + + mae <- createMAE( + exp = "DATA/BRCA/BRCA_RNA_hg38.rda", + met = "DATA/BRCA/BRCA_meth_hg38.rda", + met.platform = "450K", + genome = "hg38", + linearize.exp = TRUE, + filter.probes = distal.probes, + met.na.cut = 0.2, + save = FALSE, + TCGA = TRUE + ) + # Remove FFPE samples from the analysis + mae <- mae[,!mae$is_ffpe] + + # Get molecular subytpe information from cell paper and more metadata (purity etc...) + # https://doi.org/10.1016/j.cell.2015.09.033 + file <- "http://ars.els-cdn.com/content/image/1-s2.0-S0092867415011952-mmc2.xlsx" + downloader::download(file, basename(file)) + subtypes <- readxl::read_excel(basename(file), skip = 2) + + subtypes$sample <- substr(subtypes$Methylation,1,16) + meta.data <- merge(colData(mae),subtypes,by = "sample",all.x = T) + meta.data <- meta.data[match(colData(mae)$sample,meta.data$sample),] + meta.data <- S4Vectors::DataFrame(meta.data) + rownames(meta.data) <- meta.data$sample + stopifnot(all(meta.data$patient == colData(mae)$patient)) + colData(mae) <- meta.data + save(mae, file = "mae_BRCA_hg38_450K_no_ffpe.rda") +} + +cores <- 6 +direction <- c( "hypo","hyper") +genome <- "hg38" +group.col <- "PAM50" +groups <- t(combn(na.omit(unique(colData(mae)[,group.col])),2)) +for(g in 1:nrow(groups)) { + group1 <- groups[g,1] + group2 <- groups[g,2] + for (j in direction){ + tryCatch({ + message("Analysing probes ",j, "methylated in ", group1, " vs ", group2) + dir.out <- paste0("BRCA_supervised_",genome,"/",group1,"_",group2,"/",j) + dir.create(dir.out, recursive = TRUE) + #-------------------------------------- + # STEP 3: Analysis | + #-------------------------------------- + # Step 3.1: Get diff methylated probes | + #-------------------------------------- + Sig.probes <- get.diff.meth( + data = mae, + group.col = group.col, + group1 = group1, + group2 = group2, + sig.dif = 0.3, + minSubgroupFrac = 1, + cores = cores, + dir.out = dir.out, + diff.dir = j, + pvalue = 0.01 + ) + if(nrow(Sig.probes) == 0) next + #------------------------------------------------------------- + # Step 3.2: Identify significant probe-gene pairs | + #------------------------------------------------------------- + # Collect nearby 20 genes for Sig.probes + nearGenes <- GetNearGenes( + data = mae, + probe = Sig.probes$probe + ) + + pair <- get.pair( + data = mae, + nearGenes = nearGenes, + group.col = group.col, + group1 = group1, + group2 = group2, + permu.dir = paste0(dir.out,"/permu"), + dir.out = dir.out, + mode = "supervised", + diff.dir = j, + cores = cores, + label = j, + permu.size = 10000, + raw.pvalue = 0.001 + ) + + Sig.probes.paired <- readr::read_csv( + paste0(dir.out, + "/getPair.",j, + ".pairs.significant.csv") + )[,1, drop = TRUE] + + + #------------------------------------------------------------- + # Step 3.3: Motif enrichment analysis on the selected probes | + #------------------------------------------------------------- + if(length(Sig.probes.paired) > 0 ){ + #------------------------------------------------------------- + # Step 3.3: Motif enrichment analysis on the selected probes | + #------------------------------------------------------------- + enriched.motif <- get.enriched.motif( + probes = Sig.probes.paired, + dir.out = dir.out, + data = mae, + label = j, + plot.title = paste0("BRCA: OR for paired probes ", + j, "methylated in ", + group1, " vs ",group2) + ) + motif.enrichment <- readr::read_csv( + paste0(dir.out, + "/getMotif.",j, + ".motif.enrichment.csv") + ) + if(length(enriched.motif) > 0){ + #------------------------------------------------------------- + # Step 3.4: Identifying regulatory TFs | + #------------------------------------------------------------- + print("get.TFs") + + TF <- get.TFs( + data = mae, + enriched.motif = enriched.motif, + dir.out = dir.out, + mode = "supervised", + group.col = group.col, + group1 = group1, + diff.dir = j, + group2 = group2, + cores = cores, + label = j + ) + TF.meth.cor <- get( + load(paste0(dir.out, "/getTF.",j, ".TFs.with.motif.pvalue.rda")) + ) + save( + mae, TF, enriched.motif, Sig.probes.paired, + pair, nearGenes, Sig.probes, motif.enrichment, + TF.meth.cor, + file = paste0(dir.out,"/ELMER_results_",j,".rda") + ) + } + } + }, error = function(e){ + message(e) + }) + } +} +```