From a2669cd62a39d8fabc1a39ce9142ee64f8a56525 Mon Sep 17 00:00:00 2001 From: "hpages@fhcrc.org" Date: Sat, 7 Mar 2015 01:55:35 +0000 Subject: [PATCH] 2 tweaks to makeGRangesFromDataFrame(): - Always ignore "width" column in the input (even when keep.extra.columns=TRUE). - If 'seqinfo' is not supplied, order the seqlevels according to the output of GenomeInfoDb::rankSeqlevels(). git-svn-id: https://hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/GenomicRanges@100356 bc3139a8-67e5-0310-9ffc-ced21a209358 --- DESCRIPTION | 2 +- R/makeGRangesFromDataFrame.R | 68 ++++++++++++++++++++++++++------- man/makeGRangesFromDataFrame.Rd | 29 ++++++++------ 3 files changed, 73 insertions(+), 26 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 583db13..031301d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -7,7 +7,7 @@ Description: The ability to efficiently represent and manipulate genomic intervals. Specialized containers for representing and manipulating short alignments against a reference genome are defined in the GenomicAlignments package. -Version: 1.19.44 +Version: 1.19.45 Author: P. Aboyoun, H. Pages and M. Lawrence Maintainer: Bioconductor Package Maintainer biocViews: Genetics, Infrastructure, Sequencing, Annotation, Coverage, diff --git a/R/makeGRangesFromDataFrame.R b/R/makeGRangesFromDataFrame.R index 7c03db0..5af88ad 100644 --- a/R/makeGRangesFromDataFrame.R +++ b/R/makeGRangesFromDataFrame.R @@ -62,6 +62,20 @@ stop("cannnot determine start/end columns") } +.find_width_col <- function(df_colnames, width.field, prefix) +{ + idx <- which(df_colnames %in% paste0(prefix, width.field)) + if (length(idx) == 0L) + idx <- which(df_colnames %in% width.field) + if (length(idx) == 0L) + return(NA_integer_) + if (length(idx) >= 2L) { + warning("cannnot determine width column unambiguously") + return(idx[[1L]]) + } + idx +} + .find_seqnames_col <- function(df_colnames, seqnames.field, prefix) { idx <- which(df_colnames %in% paste0(prefix, seqnames.field)) @@ -88,8 +102,9 @@ idx } -### Returns an integer vector of length 4 with names "seqnames", "start", -### "end", and "strand". +### Returns a named integer vector of length 5. Names are: seqnames, start, +### end, width, and strand. The values must be valid column numbers, except +### for the width and strand elements that can also be NAs. .find_GRanges_cols <- function(df_colnames, seqnames.field=c("seqnames", "seqname", "chromosome", "chrom", @@ -110,6 +125,9 @@ start.field0, end.field0) prefix <- start_end_cols[[2L]] + ## Name of "width" field is not under user control for now (until we need + ## need that). + width_col <- .find_width_col(df_colnames0, "width", prefix) seqnames_col <- .find_seqnames_col(df_colnames0, seqnames.field0, prefix) @@ -121,7 +139,8 @@ strand.field0, prefix) } - c(seqnames=seqnames_col, start_end_cols[[1L]], strand=strand_col) + c(seqnames=seqnames_col, start_end_cols[[1L]], width=width_col, + strand=strand_col) } ### 'df' must be a data.frame or DataFrame object. @@ -155,8 +174,10 @@ makeGRangesFromDataFrame <- function(df, strand.field=strand.field, ignore.strand=ignore.strand) - ## Prepare the GRanges components. + ## Prepare 'ans_seqnames'. ans_seqnames <- df[[granges_cols[["seqnames"]]]] + + ## Prepare 'ans_ranges'. ans_start <- df[[granges_cols[["start"]]]] ans_end <- df[[granges_cols[["end"]]]] if (!is.numeric(ans_start) || !is.numeric(ans_end)) @@ -165,32 +186,51 @@ makeGRangesFromDataFrame <- function(df, "must be numeric") if (starts.in.df.are.0based) ans_start <- ans_start + 1L - ans_ranges <- IRanges(ans_start, ans_end) + ans_names <- rownames(df) + if (identical(ans_names, as.character(seq_len(nrow(df))))) + ans_names <- NULL + ans_ranges <- IRanges(ans_start, ans_end, names=ans_names) + + ## Prepare 'ans_strand'. if (is.na(granges_cols[["strand"]]) || ignore.strand) { ans_strand <- "*" } else { ans_strand <- df[[granges_cols[["strand"]]]] } + + ## Prepare 'ans_mcols'. if (keep.extra.columns) { drop_idx <- c(granges_cols[["seqnames"]], granges_cols[["start"]], granges_cols[["end"]]) + if (!is.na(granges_cols[["width"]])) + drop_idx <- c(drop_idx, granges_cols[["width"]]) if (!is.na(granges_cols[["strand"]])) drop_idx <- c(drop_idx, granges_cols[["strand"]]) ans_mcols <- df[-drop_idx] } else { ans_mcols <- NULL } - ans_names <- rownames(df) - if (identical(as.character(seq_len(nrow(df))), ans_names)) - ans_names <- NULL - ## Make the GRanges object and return it. - ans <- GRanges(ans_seqnames, ans_ranges, strand=ans_strand, - ans_mcols, seqinfo=ans_seqinfo) - if (!is.null(ans_names)) - names(ans) <- ans_names - ans + ## Prepare 'ans_seqinfo'. + if (is.null(ans_seqinfo)) { + ## Only if 'ans_seqnames' is a factor-Rle, we preserve the seqlevels + ## in the order they are in 'levels(ans_seqnames)'. Otherwise, we + ## order them according to rankSeqlevels(). + seqlevels <- levels(ans_seqnames) + if (is.null(seqlevels)) { + seqlevels <- unique(ans_seqnames) + if (!is.character(seqlevels)) + seqlevels <- as.character(seqlevels) + } + if (!(is(ans_seqnames, "Rle") && is.factor(runValue(ans_seqnames)))) + seqlevels[rankSeqlevels(seqlevels)] <- seqlevels + ans_seqinfo <- Seqinfo(seqlevels) + } + + ## Make and return the GRanges object. + GRanges(ans_seqnames, ans_ranges, strand=ans_strand, + ans_mcols, seqinfo=ans_seqinfo) } setAs("data.frame", "GRanges", diff --git a/man/makeGRangesFromDataFrame.Rd b/man/makeGRangesFromDataFrame.Rd index b1c3b95..bf5f938 100644 --- a/man/makeGRangesFromDataFrame.Rd +++ b/man/makeGRangesFromDataFrame.Rd @@ -39,14 +39,15 @@ makeGRangesFromDataFrame(df, } \item{keep.extra.columns}{ \code{TRUE} or \code{FALSE} (the default). - If \code{TRUE}, then the columns in \code{df} that are not used to form - the genomic ranges returned in the \link{GRanges} object will be stored - in it as metadata columns. Otherwise, they will be ignored. + If \code{TRUE}, the columns in \code{df} that are not used to form + the genomic ranges of the returned \link{GRanges} object are then + returned as metadata columns on the object. Otherwise, they are ignored. + If \code{df} has a \code{width} column, then it's always ignored. } \item{ignore.strand}{ \code{TRUE} or \code{FALSE} (the default). If \code{TRUE}, then the strand of the returned \link{GRanges} object - will be set to \code{"*"}. + is set to \code{"*"}. } \item{seqinfo}{ Either \code{NULL}, or a \link{Seqinfo} object, or a character vector @@ -60,30 +61,30 @@ makeGRangesFromDataFrame(df, that contains the chromosome name (a.k.a. sequence name) associated with each genomic range. Only the first name in \code{seqnames.field} that is found - in \code{colnames(df)} will be used. + in \code{colnames(df)} is used. If no one is found, then an error is raised. } \item{start.field}{ A character vector of recognized names for the column in \code{df} that contains the start positions of the genomic ranges. Only the first name in \code{start.field} that is found - in \code{colnames(df)} will be used. + in \code{colnames(df)} is used. If no one is found, then an error is raised. } \item{end.field}{ A character vector of recognized names for the column in \code{df} that contains the end positions of the genomic ranges. Only the first name in \code{start.field} that is found - in \code{colnames(df)} will be used. + in \code{colnames(df)} is used. If no one is found, then an error is raised. } \item{strand.field}{ A character vector of recognized names for the column in \code{df} that contains the strand associated with each genomic range. Only the first name in \code{strand.field} that is found - in \code{colnames(df)} will be used. + in \code{colnames(df)} is used. If no one is found or if \code{ignore.strand} is \code{TRUE}, - then the strand of the returned \link{GRanges} object will be + then the strand of the returned \link{GRanges} object is set to \code{"*"}. } \item{starts.in.df.are.0based}{ @@ -103,7 +104,11 @@ makeGRangesFromDataFrame(df, If the \code{seqinfo} argument was supplied, the returned object will have exactly the seqlevels specified in \code{seqinfo} and in the same - order. + order. Otherwise, the seqlevels are ordered according to the output of + the \code{\link[GenomeInfoDb]{rankSeqlevels}} function (except if + \code{df} contains the seqnames in the form of a factor-Rle, in which + case the levels of the factor-Rle become the seqlevels of the returned + object and with no re-ordering). If \code{df} has non-automatic row names (i.e. \code{rownames(df)} is not \code{NULL} and is not \code{seq_len(nrow(df))}), then they will be @@ -124,7 +129,9 @@ makeGRangesFromDataFrame(df, \itemize{ \item \link{GRanges} objects. - \item \link{Seqinfo} objects. + \item \link[GenomeInfoDb]{Seqinfo} objects and the + \code{\link[GenomeInfoDb]{rankSeqlevels}} function in the + \pkg{GenomeInfoDb} package. \item The \code{\link{makeGRangesListFromFeatureFragments}} function for making a \link{GRangesList} object from a list of fragmented