Skip to content

Commit

Permalink
improve error handling in relativeRanges()
Browse files Browse the repository at this point in the history
  • Loading branch information
[email protected] committed Feb 12, 2015
1 parent dacdd91 commit b8bda85
Show file tree
Hide file tree
Showing 2 changed files with 69 additions and 32 deletions.
38 changes: 23 additions & 15 deletions R/absoluteRanges.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,20 +11,20 @@
normarg_seqlengths <- function(seqlengths)
{
if (!is.numeric(seqlengths))
stop("'seqlengths' must be a non-empty numeric vector")
stop(wmsg("'seqlengths' must be a non-empty numeric vector"))
if (length(seqlengths) == 0L)
return(setNames(integer(0), character(0)))
seqlengths_names <- names(seqlengths)
if (is.null(seqlengths_names))
stop("'seqlengths' must be named")
stop(wmsg("'seqlengths' must be named"))
if (any(seqlengths_names %in% c(NA_character_, "")))
stop("'seqlengths' has names that are NA or the empty string")
stop(wmsg("'seqlengths' has names that are NA or the empty string"))
if (any(duplicated(seqlengths_names)))
stop("'seqlengths' has duplicated names")
stop(wmsg("'seqlengths' has duplicated names"))
if (!is.integer(seqlengths))
seqlengths <- setNames(as.integer(seqlengths), seqlengths_names)
if (any(seqlengths < 0L, na.rm=TRUE))
stop("'seqlengths' contains negative values")
stop(wmsg("'seqlengths' contains negative values"))
seqlengths
}

Expand Down Expand Up @@ -61,7 +61,7 @@ isSmallGenome <- function(seqlengths)
absoluteRanges <- function(x)
{
if (!is(x, "GenomicRanges"))
stop("'x' must be a GenomicRanges object")
stop(wmsg("'x' must be a GenomicRanges object"))
x_seqlengths <- seqlengths(x)
if (!isTRUE(isSmallGenome(x_seqlengths)))
stop(wmsg("the total length of the underlying sequences is too big ",
Expand All @@ -87,7 +87,7 @@ absoluteRanges <- function(x)
relativeRanges <- function(x, seqlengths)
{
if (!is(x, "Ranges"))
stop("'x' must be a Ranges object")
stop(wmsg("'x' must be a Ranges object"))
if (is.numeric(seqlengths)) {
ans_seqlengths <- normarg_seqlengths(seqlengths)
ans_seqinfo <- Seqinfo(seqnames=names(ans_seqlengths),
Expand All @@ -104,15 +104,23 @@ relativeRanges <- function(x, seqlengths)
stop(wmsg("the total length of the sequences specified ",
"thru 'seqlengths' is too big or couldn't be ",
"computed (because some lengths are NA)"))
offsets <- c(0L, cumsum(unname(ans_seqlengths)[-length(ans_seqlengths)]))
chrom_starts <- offsets + 1L
start2chrom <- findInterval(start(x), chrom_starts)
end2chrom <- findInterval(end(x), chrom_starts)
if (!identical(start2chrom, end2chrom))
stop(wmsg("Some ranges in 'x' are crossing sequence boundaries. ",
offsets <- c(0L, cumsum(unname(ans_seqlengths)))

## Map each range in 'x' to a sequence in the genome.
ticks <- offsets + 1L
start2seqid <- findInterval(start(x), ticks)
end2seqid <- findInterval(end(x), ticks)
if (!identical(start2seqid, end2seqid))
stop(wmsg("Some ranges in 'x' cannot be mapped to a sequence in the ",
"genome because they cross sequence boundaries. ",
"Cannot convert them into relative ranges."))
ans_ranges <- shift(x, shift=-offsets[start2chrom])
ans_seqnames <- names(ans_seqlengths)[start2chrom]
if (any(start2seqid < 1L) || any(start2seqid > length(ans_seqlengths)))
stop(wmsg("Some ranges in 'x' cannot be mapped to a sequence in the ",
"genome because they are outside the boundaries of the ",
"genome. Cannot convert them into relative ranges."))

ans_ranges <- shift(x, shift=-offsets[start2seqid])
ans_seqnames <- names(ans_seqlengths)[start2seqid]
GRanges(ans_seqnames, ans_ranges, seqinfo=ans_seqinfo)
}

63 changes: 46 additions & 17 deletions man/absoluteRanges.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -46,13 +46,19 @@ isSmallGenome(seqlengths)
}

\details{
Because ranges in Bioconductor are using integer vectors for their
internal representation, \code{absoluteRanges} and \code{relativeRanges}
cannot be used on genomes for which the total sequence length is >
\code{.Machine$integer.max}. Utility function \code{isSmallGenome} is
provided as a mean for the user to check upfront whether the size of a
genome is <= \code{.Machine$integer.max}, and thus compatible with
\code{absoluteRanges} and \code{relativeRanges}.
Because \code{absoluteRanges} returns the \emph{absolute} ranges in an
\link[IRanges]{IRanges} object, and because an \link[IRanges]{IRanges}
object cannot hold ranges with an end > \code{.Machine$integer.max}
(i.e. >= 2^31 on most platforms), \code{absoluteRanges} cannot be used
if the size of the underlying genome (i.e. the total length of the
sequences in it) is > \code{.Machine$integer.max}. Utility function
\code{isSmallGenome} is provided as a mean for the user to check
upfront whether the genome is \emph{small} (i.e. its size is <=
\code{.Machine$integer.max}) or not, and thus compatible with
\code{absoluteRanges} or not.

\code{relativeRanges} applies the same restriction by looking at the
\code{seqlengths} argument.
}

\value{
Expand Down Expand Up @@ -106,27 +112,50 @@ stopifnot(all(gr == gr2))
## ON REAL DATA
## ---------------------------------------------------------------------

## With a "small" genome

library(TxDb.Dmelanogaster.UCSC.dm3.ensGene)
txdb <- TxDb.Dmelanogaster.UCSC.dm3.ensGene
isSmallGenome(txdb)
ex <- exons(txdb)
ex

ar <- absoluteRanges(ex)
ar
isSmallGenome(ex)

ex2 <- relativeRanges(ar, seqlengths(ex))
ex2 # original strand is not restored
strand(ex2) <- strand(ex) # set it back
stopifnot(all(ex == ex2))
## Note that because isSmallGenome() can return NA (see Value section
## above), its result should always be wrapped inside isTRUE() when
## used in an if statement:
if (isTRUE(isSmallGenome(ex))) {
ar <- absoluteRanges(ex)
ar

ex2 <- relativeRanges(ar, seqlengths(ex))
ex2 # original strand is not restored

## Sanity check:
strand(ex2) <- strand(ex) # restore the strand
stopifnot(all(ex == ex2))
}

## With a "big" genome (but we can reduce it)

library(TxDb.Hsapiens.UCSC.hg19.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
isSmallGenome(txdb)
if (interactive()) {
ex <- exons(txdb)
ex <- exons(txdb)
isSmallGenome(ex)
\dontrun{
absoluteRanges(ex) # error!
}

## However, if we are only interested in some chromosomes, we might
## still be able to use absoluteRanges():
seqlevels(ex, force=TRUE) <- paste0("chr", 1:10)
isSmallGenome(ex) # TRUE!
ar <- absoluteRanges(ex)
ex2 <- relativeRanges(ar, seqlengths(ex))

## Sanity check:
strand(ex2) <- strand(ex)
stopifnot(all(ex == ex2))
}

\keyword{manip}

0 comments on commit b8bda85

Please sign in to comment.