diff --git a/R/absoluteRanges.R b/R/absoluteRanges.R index 73f903a..d04302d 100644 --- a/R/absoluteRanges.R +++ b/R/absoluteRanges.R @@ -11,20 +11,20 @@ normarg_seqlengths <- function(seqlengths) { if (!is.numeric(seqlengths)) - stop("'seqlengths' must be a non-empty numeric vector") + stop(wmsg("'seqlengths' must be a non-empty numeric vector")) if (length(seqlengths) == 0L) return(setNames(integer(0), character(0))) seqlengths_names <- names(seqlengths) if (is.null(seqlengths_names)) - stop("'seqlengths' must be named") + stop(wmsg("'seqlengths' must be named")) if (any(seqlengths_names %in% c(NA_character_, ""))) - stop("'seqlengths' has names that are NA or the empty string") + stop(wmsg("'seqlengths' has names that are NA or the empty string")) if (any(duplicated(seqlengths_names))) - stop("'seqlengths' has duplicated names") + stop(wmsg("'seqlengths' has duplicated names")) if (!is.integer(seqlengths)) seqlengths <- setNames(as.integer(seqlengths), seqlengths_names) if (any(seqlengths < 0L, na.rm=TRUE)) - stop("'seqlengths' contains negative values") + stop(wmsg("'seqlengths' contains negative values")) seqlengths } @@ -61,7 +61,7 @@ isSmallGenome <- function(seqlengths) absoluteRanges <- function(x) { if (!is(x, "GenomicRanges")) - stop("'x' must be a GenomicRanges object") + stop(wmsg("'x' must be a GenomicRanges object")) x_seqlengths <- seqlengths(x) if (!isTRUE(isSmallGenome(x_seqlengths))) stop(wmsg("the total length of the underlying sequences is too big ", @@ -87,7 +87,7 @@ absoluteRanges <- function(x) relativeRanges <- function(x, seqlengths) { if (!is(x, "Ranges")) - stop("'x' must be a Ranges object") + stop(wmsg("'x' must be a Ranges object")) if (is.numeric(seqlengths)) { ans_seqlengths <- normarg_seqlengths(seqlengths) ans_seqinfo <- Seqinfo(seqnames=names(ans_seqlengths), @@ -104,15 +104,23 @@ relativeRanges <- function(x, seqlengths) stop(wmsg("the total length of the sequences specified ", "thru 'seqlengths' is too big or couldn't be ", "computed (because some lengths are NA)")) - offsets <- c(0L, cumsum(unname(ans_seqlengths)[-length(ans_seqlengths)])) - chrom_starts <- offsets + 1L - start2chrom <- findInterval(start(x), chrom_starts) - end2chrom <- findInterval(end(x), chrom_starts) - if (!identical(start2chrom, end2chrom)) - stop(wmsg("Some ranges in 'x' are crossing sequence boundaries. ", + offsets <- c(0L, cumsum(unname(ans_seqlengths))) + + ## Map each range in 'x' to a sequence in the genome. + ticks <- offsets + 1L + start2seqid <- findInterval(start(x), ticks) + end2seqid <- findInterval(end(x), ticks) + if (!identical(start2seqid, end2seqid)) + stop(wmsg("Some ranges in 'x' cannot be mapped to a sequence in the ", + "genome because they cross sequence boundaries. ", "Cannot convert them into relative ranges.")) - ans_ranges <- shift(x, shift=-offsets[start2chrom]) - ans_seqnames <- names(ans_seqlengths)[start2chrom] + if (any(start2seqid < 1L) || any(start2seqid > length(ans_seqlengths))) + stop(wmsg("Some ranges in 'x' cannot be mapped to a sequence in the ", + "genome because they are outside the boundaries of the ", + "genome. Cannot convert them into relative ranges.")) + + ans_ranges <- shift(x, shift=-offsets[start2seqid]) + ans_seqnames <- names(ans_seqlengths)[start2seqid] GRanges(ans_seqnames, ans_ranges, seqinfo=ans_seqinfo) } diff --git a/man/absoluteRanges.Rd b/man/absoluteRanges.Rd index 375e8d3..6a4621d 100644 --- a/man/absoluteRanges.Rd +++ b/man/absoluteRanges.Rd @@ -46,13 +46,19 @@ isSmallGenome(seqlengths) } \details{ - Because ranges in Bioconductor are using integer vectors for their - internal representation, \code{absoluteRanges} and \code{relativeRanges} - cannot be used on genomes for which the total sequence length is > - \code{.Machine$integer.max}. Utility function \code{isSmallGenome} is - provided as a mean for the user to check upfront whether the size of a - genome is <= \code{.Machine$integer.max}, and thus compatible with - \code{absoluteRanges} and \code{relativeRanges}. + Because \code{absoluteRanges} returns the \emph{absolute} ranges in an + \link[IRanges]{IRanges} object, and because an \link[IRanges]{IRanges} + object cannot hold ranges with an end > \code{.Machine$integer.max} + (i.e. >= 2^31 on most platforms), \code{absoluteRanges} cannot be used + if the size of the underlying genome (i.e. the total length of the + sequences in it) is > \code{.Machine$integer.max}. Utility function + \code{isSmallGenome} is provided as a mean for the user to check + upfront whether the genome is \emph{small} (i.e. its size is <= + \code{.Machine$integer.max}) or not, and thus compatible with + \code{absoluteRanges} or not. + + \code{relativeRanges} applies the same restriction by looking at the + \code{seqlengths} argument. } \value{ @@ -106,27 +112,50 @@ stopifnot(all(gr == gr2)) ## ON REAL DATA ## --------------------------------------------------------------------- +## With a "small" genome + library(TxDb.Dmelanogaster.UCSC.dm3.ensGene) txdb <- TxDb.Dmelanogaster.UCSC.dm3.ensGene -isSmallGenome(txdb) ex <- exons(txdb) ex -ar <- absoluteRanges(ex) -ar +isSmallGenome(ex) -ex2 <- relativeRanges(ar, seqlengths(ex)) -ex2 # original strand is not restored -strand(ex2) <- strand(ex) # set it back -stopifnot(all(ex == ex2)) +## Note that because isSmallGenome() can return NA (see Value section +## above), its result should always be wrapped inside isTRUE() when +## used in an if statement: +if (isTRUE(isSmallGenome(ex))) { + ar <- absoluteRanges(ex) + ar + + ex2 <- relativeRanges(ar, seqlengths(ex)) + ex2 # original strand is not restored + + ## Sanity check: + strand(ex2) <- strand(ex) # restore the strand + stopifnot(all(ex == ex2)) +} + +## With a "big" genome (but we can reduce it) library(TxDb.Hsapiens.UCSC.hg19.knownGene) txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene -isSmallGenome(txdb) -if (interactive()) { - ex <- exons(txdb) +ex <- exons(txdb) +isSmallGenome(ex) +\dontrun{ absoluteRanges(ex) # error! } + +## However, if we are only interested in some chromosomes, we might +## still be able to use absoluteRanges(): +seqlevels(ex, force=TRUE) <- paste0("chr", 1:10) +isSmallGenome(ex) # TRUE! +ar <- absoluteRanges(ex) +ex2 <- relativeRanges(ar, seqlengths(ex)) + +## Sanity check: +strand(ex2) <- strand(ex) +stopifnot(all(ex == ex2)) } \keyword{manip}