2 tweaks to makeGRangesFromDataFrame():

- Always ignore "width" column in the input (even when keep.extra.columns=TRUE). - If 'seqinfo' is not supplied, order the seqlevels according to the output of GenomeInfoDb::rankSeqlevels(). git-svn-id: https://hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/GenomicRanges@100356 bc3139a8-67e5-0310-9ffc-ced21a209358
RamsinghLab · Mar 7, 2015 · a2669cd · a2669cd
1 parent 260f4b6
commit a2669cd
Show file tree

Hide file tree

Showing 3 changed files with 73 additions and 26 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -7,7 +7,7 @@ Description: The ability to efficiently represent and manipulate genomic
 	intervals. Specialized containers for representing and manipulating
 	short alignments against a reference genome are defined in the
 	GenomicAlignments package.
-Version: 1.19.44
+Version: 1.19.45
 Author: P. Aboyoun, H. Pages and M. Lawrence
 Maintainer: Bioconductor Package Maintainer <[email protected]>
 biocViews: Genetics, Infrastructure, Sequencing, Annotation, Coverage,

diff --git a/R/makeGRangesFromDataFrame.R b/R/makeGRangesFromDataFrame.R
@@ -62,6 +62,20 @@
     stop("cannnot determine start/end columns")
 }
 
+.find_width_col <- function(df_colnames, width.field, prefix)
+{
+    idx <- which(df_colnames %in% paste0(prefix, width.field))
+    if (length(idx) == 0L)
+        idx <- which(df_colnames %in% width.field)
+    if (length(idx) == 0L)
+        return(NA_integer_)
+    if (length(idx) >= 2L) {
+        warning("cannnot determine width column unambiguously")
+        return(idx[[1L]])
+    }
+    idx
+}
+
 .find_seqnames_col <- function(df_colnames, seqnames.field, prefix)
 {
     idx <- which(df_colnames %in% paste0(prefix, seqnames.field))
@@ -88,8 +102,9 @@
     idx
 }
 
-### Returns an integer vector of length 4 with names "seqnames", "start",
-### "end", and "strand".
+### Returns a named integer vector of length 5. Names are: seqnames, start,
+### end, width, and strand. The values must be valid column numbers, except
+### for the width and strand elements that can also be NAs.
 .find_GRanges_cols <- function(df_colnames,
                                seqnames.field=c("seqnames", "seqname",
                                                 "chromosome", "chrom",
@@ -110,6 +125,9 @@
                                            start.field0,
                                            end.field0)
     prefix <- start_end_cols[[2L]]
+    ## Name of "width" field is not under user control for now (until we need
+    ## need that).
+    width_col <- .find_width_col(df_colnames0, "width", prefix)
     seqnames_col <- .find_seqnames_col(df_colnames0,
                                        seqnames.field0,
                                        prefix)
@@ -121,7 +139,8 @@
                                        strand.field0,
                                        prefix)
     }
-    c(seqnames=seqnames_col, start_end_cols[[1L]], strand=strand_col)
+    c(seqnames=seqnames_col, start_end_cols[[1L]], width=width_col,
+      strand=strand_col)
 }
 
 ### 'df' must be a data.frame or DataFrame object.
@@ -155,8 +174,10 @@ makeGRangesFromDataFrame <- function(df,
                                        strand.field=strand.field,
                                        ignore.strand=ignore.strand)
 
-    ## Prepare the GRanges components.
+    ## Prepare 'ans_seqnames'.
     ans_seqnames <- df[[granges_cols[["seqnames"]]]]
+
+    ## Prepare 'ans_ranges'.
     ans_start <- df[[granges_cols[["start"]]]]
     ans_end <- df[[granges_cols[["end"]]]]
     if (!is.numeric(ans_start) || !is.numeric(ans_end))
@@ -165,32 +186,51 @@ makeGRangesFromDataFrame <- function(df,
              "must be numeric")
     if (starts.in.df.are.0based)
         ans_start <- ans_start + 1L
-    ans_ranges <- IRanges(ans_start, ans_end)
+    ans_names <- rownames(df)
+    if (identical(ans_names, as.character(seq_len(nrow(df)))))
+        ans_names <- NULL
+    ans_ranges <- IRanges(ans_start, ans_end, names=ans_names)
+
+    ## Prepare 'ans_strand'.
     if (is.na(granges_cols[["strand"]]) || ignore.strand) {
         ans_strand <- "*"
     } else {
         ans_strand <- df[[granges_cols[["strand"]]]]
     }
+
+    ## Prepare 'ans_mcols'.
     if (keep.extra.columns) {
         drop_idx <- c(granges_cols[["seqnames"]],
                       granges_cols[["start"]],
                       granges_cols[["end"]])
+        if (!is.na(granges_cols[["width"]]))
+            drop_idx <- c(drop_idx, granges_cols[["width"]])
         if (!is.na(granges_cols[["strand"]]))
             drop_idx <- c(drop_idx, granges_cols[["strand"]])
         ans_mcols <- df[-drop_idx]
     } else {
         ans_mcols <- NULL
     }
-    ans_names <- rownames(df)
-    if (identical(as.character(seq_len(nrow(df))), ans_names))
-        ans_names <- NULL
 
-    ## Make the GRanges object and return it.
-    ans <- GRanges(ans_seqnames, ans_ranges, strand=ans_strand,
-                   ans_mcols, seqinfo=ans_seqinfo)
-    if (!is.null(ans_names))
-        names(ans) <- ans_names
-    ans
+    ## Prepare 'ans_seqinfo'.
+    if (is.null(ans_seqinfo)) {
+        ## Only if 'ans_seqnames' is a factor-Rle, we preserve the seqlevels
+        ## in the order they are in 'levels(ans_seqnames)'. Otherwise, we
+        ## order them according to rankSeqlevels().
+        seqlevels <- levels(ans_seqnames)
+        if (is.null(seqlevels)) {
+            seqlevels <- unique(ans_seqnames)
+            if (!is.character(seqlevels))
+                seqlevels <- as.character(seqlevels)
+        }
+        if (!(is(ans_seqnames, "Rle") && is.factor(runValue(ans_seqnames))))
+            seqlevels[rankSeqlevels(seqlevels)] <- seqlevels
+        ans_seqinfo <- Seqinfo(seqlevels)
+    }
+
+    ## Make and return the GRanges object.
+    GRanges(ans_seqnames, ans_ranges, strand=ans_strand,
+            ans_mcols, seqinfo=ans_seqinfo)
 }
 
 setAs("data.frame", "GRanges",

diff --git a/man/makeGRangesFromDataFrame.Rd b/man/makeGRangesFromDataFrame.Rd
@@ -39,14 +39,15 @@ makeGRangesFromDataFrame(df,
   }
   \item{keep.extra.columns}{
     \code{TRUE} or \code{FALSE} (the default).
-    If \code{TRUE}, then the columns in \code{df} that are not used to form
-    the genomic ranges returned in the \link{GRanges} object will be stored
-    in it as metadata columns. Otherwise, they will be ignored.
+    If \code{TRUE}, the columns in \code{df} that are not used to form
+    the genomic ranges of the returned \link{GRanges} object are then
+    returned as metadata columns on the object. Otherwise, they are ignored.
+    If \code{df} has a \code{width} column, then it's always ignored.
   }
   \item{ignore.strand}{
     \code{TRUE} or \code{FALSE} (the default).
     If \code{TRUE}, then the strand of the returned \link{GRanges} object
-    will be set to \code{"*"}.
+    is set to \code{"*"}.
   }
   \item{seqinfo}{
     Either \code{NULL}, or a \link{Seqinfo} object, or a character vector
@@ -60,30 +61,30 @@ makeGRangesFromDataFrame(df,
     that contains the chromosome name (a.k.a. sequence name) associated
     with each genomic range.
     Only the first name in \code{seqnames.field} that is found
-    in \code{colnames(df)} will be used.
+    in \code{colnames(df)} is used.
     If no one is found, then an error is raised.
   }
   \item{start.field}{
     A character vector of recognized names for the column in \code{df}
     that contains the start positions of the genomic ranges.
     Only the first name in \code{start.field} that is found
-    in \code{colnames(df)} will be used.
+    in \code{colnames(df)} is used.
     If no one is found, then an error is raised.
   }
   \item{end.field}{
     A character vector of recognized names for the column in \code{df}
     that contains the end positions of the genomic ranges.
     Only the first name in \code{start.field} that is found
-    in \code{colnames(df)} will be used.
+    in \code{colnames(df)} is used.
     If no one is found, then an error is raised.
   }
   \item{strand.field}{
     A character vector of recognized names for the column in \code{df}
     that contains the strand associated with each genomic range.
     Only the first name in \code{strand.field} that is found
-    in \code{colnames(df)} will be used.
+    in \code{colnames(df)} is used.
     If no one is found or if \code{ignore.strand} is \code{TRUE},
-    then the strand of the returned \link{GRanges} object will be
+    then the strand of the returned \link{GRanges} object is
     set to \code{"*"}.
   }
   \item{starts.in.df.are.0based}{
@@ -103,7 +104,11 @@ makeGRangesFromDataFrame(df,
 
   If the \code{seqinfo} argument was supplied, the returned object will
   have exactly the seqlevels specified in \code{seqinfo} and in the same
-  order.
+  order. Otherwise, the seqlevels are ordered according to the output of
+  the \code{\link[GenomeInfoDb]{rankSeqlevels}} function (except if
+  \code{df} contains the seqnames in the form of a factor-Rle, in which
+  case the levels of the factor-Rle become the seqlevels of the returned
+  object and with no re-ordering).
 
   If \code{df} has non-automatic row names (i.e. \code{rownames(df)} is
   not \code{NULL} and is not \code{seq_len(nrow(df))}), then they will be
@@ -124,7 +129,9 @@ makeGRangesFromDataFrame(df,
   \itemize{
     \item \link{GRanges} objects.
 
-    \item \link{Seqinfo} objects.
+    \item \link[GenomeInfoDb]{Seqinfo} objects and the
+          \code{\link[GenomeInfoDb]{rankSeqlevels}} function in the
+          \pkg{GenomeInfoDb} package.
 
     \item The \code{\link{makeGRangesListFromFeatureFragments}} function
           for making a \link{GRangesList} object from a list of fragmented