Skip to content

Commit

Permalink
2 tweaks to makeGRangesFromDataFrame():
Browse files Browse the repository at this point in the history
- Always ignore "width" column in the input (even when keep.extra.columns=TRUE).
- If 'seqinfo' is not supplied, order the seqlevels according to the output of
  GenomeInfoDb::rankSeqlevels().


git-svn-id: https://hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/GenomicRanges@100356 bc3139a8-67e5-0310-9ffc-ced21a209358
  • Loading branch information
[email protected] committed Mar 7, 2015
1 parent 260f4b6 commit a2669cd
Show file tree
Hide file tree
Showing 3 changed files with 73 additions and 26 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ Description: The ability to efficiently represent and manipulate genomic
intervals. Specialized containers for representing and manipulating
short alignments against a reference genome are defined in the
GenomicAlignments package.
Version: 1.19.44
Version: 1.19.45
Author: P. Aboyoun, H. Pages and M. Lawrence
Maintainer: Bioconductor Package Maintainer <[email protected]>
biocViews: Genetics, Infrastructure, Sequencing, Annotation, Coverage,
Expand Down
68 changes: 54 additions & 14 deletions R/makeGRangesFromDataFrame.R
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,20 @@
stop("cannnot determine start/end columns")
}

.find_width_col <- function(df_colnames, width.field, prefix)
{
idx <- which(df_colnames %in% paste0(prefix, width.field))
if (length(idx) == 0L)
idx <- which(df_colnames %in% width.field)
if (length(idx) == 0L)
return(NA_integer_)
if (length(idx) >= 2L) {
warning("cannnot determine width column unambiguously")
return(idx[[1L]])
}
idx
}

.find_seqnames_col <- function(df_colnames, seqnames.field, prefix)
{
idx <- which(df_colnames %in% paste0(prefix, seqnames.field))
Expand All @@ -88,8 +102,9 @@
idx
}

### Returns an integer vector of length 4 with names "seqnames", "start",
### "end", and "strand".
### Returns a named integer vector of length 5. Names are: seqnames, start,
### end, width, and strand. The values must be valid column numbers, except
### for the width and strand elements that can also be NAs.
.find_GRanges_cols <- function(df_colnames,
seqnames.field=c("seqnames", "seqname",
"chromosome", "chrom",
Expand All @@ -110,6 +125,9 @@
start.field0,
end.field0)
prefix <- start_end_cols[[2L]]
## Name of "width" field is not under user control for now (until we need
## need that).
width_col <- .find_width_col(df_colnames0, "width", prefix)
seqnames_col <- .find_seqnames_col(df_colnames0,
seqnames.field0,
prefix)
Expand All @@ -121,7 +139,8 @@
strand.field0,
prefix)
}
c(seqnames=seqnames_col, start_end_cols[[1L]], strand=strand_col)
c(seqnames=seqnames_col, start_end_cols[[1L]], width=width_col,
strand=strand_col)
}

### 'df' must be a data.frame or DataFrame object.
Expand Down Expand Up @@ -155,8 +174,10 @@ makeGRangesFromDataFrame <- function(df,
strand.field=strand.field,
ignore.strand=ignore.strand)

## Prepare the GRanges components.
## Prepare 'ans_seqnames'.
ans_seqnames <- df[[granges_cols[["seqnames"]]]]

## Prepare 'ans_ranges'.
ans_start <- df[[granges_cols[["start"]]]]
ans_end <- df[[granges_cols[["end"]]]]
if (!is.numeric(ans_start) || !is.numeric(ans_end))
Expand All @@ -165,32 +186,51 @@ makeGRangesFromDataFrame <- function(df,
"must be numeric")
if (starts.in.df.are.0based)
ans_start <- ans_start + 1L
ans_ranges <- IRanges(ans_start, ans_end)
ans_names <- rownames(df)
if (identical(ans_names, as.character(seq_len(nrow(df)))))
ans_names <- NULL
ans_ranges <- IRanges(ans_start, ans_end, names=ans_names)

## Prepare 'ans_strand'.
if (is.na(granges_cols[["strand"]]) || ignore.strand) {
ans_strand <- "*"
} else {
ans_strand <- df[[granges_cols[["strand"]]]]
}

## Prepare 'ans_mcols'.
if (keep.extra.columns) {
drop_idx <- c(granges_cols[["seqnames"]],
granges_cols[["start"]],
granges_cols[["end"]])
if (!is.na(granges_cols[["width"]]))
drop_idx <- c(drop_idx, granges_cols[["width"]])
if (!is.na(granges_cols[["strand"]]))
drop_idx <- c(drop_idx, granges_cols[["strand"]])
ans_mcols <- df[-drop_idx]
} else {
ans_mcols <- NULL
}
ans_names <- rownames(df)
if (identical(as.character(seq_len(nrow(df))), ans_names))
ans_names <- NULL

## Make the GRanges object and return it.
ans <- GRanges(ans_seqnames, ans_ranges, strand=ans_strand,
ans_mcols, seqinfo=ans_seqinfo)
if (!is.null(ans_names))
names(ans) <- ans_names
ans
## Prepare 'ans_seqinfo'.
if (is.null(ans_seqinfo)) {
## Only if 'ans_seqnames' is a factor-Rle, we preserve the seqlevels
## in the order they are in 'levels(ans_seqnames)'. Otherwise, we
## order them according to rankSeqlevels().
seqlevels <- levels(ans_seqnames)
if (is.null(seqlevels)) {
seqlevels <- unique(ans_seqnames)
if (!is.character(seqlevels))
seqlevels <- as.character(seqlevels)
}
if (!(is(ans_seqnames, "Rle") && is.factor(runValue(ans_seqnames))))
seqlevels[rankSeqlevels(seqlevels)] <- seqlevels
ans_seqinfo <- Seqinfo(seqlevels)
}

## Make and return the GRanges object.
GRanges(ans_seqnames, ans_ranges, strand=ans_strand,
ans_mcols, seqinfo=ans_seqinfo)
}

setAs("data.frame", "GRanges",
Expand Down
29 changes: 18 additions & 11 deletions man/makeGRangesFromDataFrame.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -39,14 +39,15 @@ makeGRangesFromDataFrame(df,
}
\item{keep.extra.columns}{
\code{TRUE} or \code{FALSE} (the default).
If \code{TRUE}, then the columns in \code{df} that are not used to form
the genomic ranges returned in the \link{GRanges} object will be stored
in it as metadata columns. Otherwise, they will be ignored.
If \code{TRUE}, the columns in \code{df} that are not used to form
the genomic ranges of the returned \link{GRanges} object are then
returned as metadata columns on the object. Otherwise, they are ignored.
If \code{df} has a \code{width} column, then it's always ignored.
}
\item{ignore.strand}{
\code{TRUE} or \code{FALSE} (the default).
If \code{TRUE}, then the strand of the returned \link{GRanges} object
will be set to \code{"*"}.
is set to \code{"*"}.
}
\item{seqinfo}{
Either \code{NULL}, or a \link{Seqinfo} object, or a character vector
Expand All @@ -60,30 +61,30 @@ makeGRangesFromDataFrame(df,
that contains the chromosome name (a.k.a. sequence name) associated
with each genomic range.
Only the first name in \code{seqnames.field} that is found
in \code{colnames(df)} will be used.
in \code{colnames(df)} is used.
If no one is found, then an error is raised.
}
\item{start.field}{
A character vector of recognized names for the column in \code{df}
that contains the start positions of the genomic ranges.
Only the first name in \code{start.field} that is found
in \code{colnames(df)} will be used.
in \code{colnames(df)} is used.
If no one is found, then an error is raised.
}
\item{end.field}{
A character vector of recognized names for the column in \code{df}
that contains the end positions of the genomic ranges.
Only the first name in \code{start.field} that is found
in \code{colnames(df)} will be used.
in \code{colnames(df)} is used.
If no one is found, then an error is raised.
}
\item{strand.field}{
A character vector of recognized names for the column in \code{df}
that contains the strand associated with each genomic range.
Only the first name in \code{strand.field} that is found
in \code{colnames(df)} will be used.
in \code{colnames(df)} is used.
If no one is found or if \code{ignore.strand} is \code{TRUE},
then the strand of the returned \link{GRanges} object will be
then the strand of the returned \link{GRanges} object is
set to \code{"*"}.
}
\item{starts.in.df.are.0based}{
Expand All @@ -103,7 +104,11 @@ makeGRangesFromDataFrame(df,
If the \code{seqinfo} argument was supplied, the returned object will
have exactly the seqlevels specified in \code{seqinfo} and in the same
order.
order. Otherwise, the seqlevels are ordered according to the output of
the \code{\link[GenomeInfoDb]{rankSeqlevels}} function (except if
\code{df} contains the seqnames in the form of a factor-Rle, in which
case the levels of the factor-Rle become the seqlevels of the returned
object and with no re-ordering).
If \code{df} has non-automatic row names (i.e. \code{rownames(df)} is
not \code{NULL} and is not \code{seq_len(nrow(df))}), then they will be
Expand All @@ -124,7 +129,9 @@ makeGRangesFromDataFrame(df,
\itemize{
\item \link{GRanges} objects.
\item \link{Seqinfo} objects.
\item \link[GenomeInfoDb]{Seqinfo} objects and the
\code{\link[GenomeInfoDb]{rankSeqlevels}} function in the
\pkg{GenomeInfoDb} package.
\item The \code{\link{makeGRangesListFromFeatureFragments}} function
for making a \link{GRangesList} object from a list of fragmented
Expand Down

0 comments on commit a2669cd

Please sign in to comment.