From 16625ee04b1d18fe96500bca72bb65c81c045b2c Mon Sep 17 00:00:00 2001 From: LTLA Date: Thu, 13 Jun 2019 22:03:50 -0700 Subject: [PATCH] More cache-efficient version of sumCountsAcrossFeatures. --- R/sumCountsAcrossFeatures.R | 20 ++++++---- src/init.cpp | 2 +- src/scater.h | 2 +- src/sum_counts.cpp | 74 ++++++++++++++++++------------------- 4 files changed, 49 insertions(+), 49 deletions(-) diff --git a/R/sumCountsAcrossFeatures.R b/R/sumCountsAcrossFeatures.R index 2dcc5eb0..b8ac0e02 100644 --- a/R/sumCountsAcrossFeatures.R +++ b/R/sumCountsAcrossFeatures.R @@ -8,6 +8,7 @@ #' @param BPPARAM A \linkS4class{BiocParallelParam} object specifying whether summation should be parallelized. #' #' @return A count matrix where counts for all features in the same set are summed together within each cell. +#' Rows are ordered according to \code{levels(ids)}. #' #' @details #' This function provides a convenient method for aggregating counts across multiple rows for each cell. @@ -47,21 +48,24 @@ sumCountsAcrossFeatures <- function(object, ids, exprs_values="counts", BPPARAM= object <- assay(object, exprs_values, withDimnames=FALSE) } - by_set <- split(seq_along(ids) - 1L, ids) + ids <- as.factor(ids) + ans <- integer(length(ids)) + ans <- as.integer(ids) - 1L + assignments <- .assign_jobs_to_workers(ncol(object), BPPARAM) - out_list <- bpmapply(start=assignments$start, end=assignments$end, FUN=.sum_across_rows_internal, - MoreArgs=list(by_set=by_set, mat=object), BPPARAM=BPPARAM, SIMPLIFY=FALSE, USE.NAMES=FALSE) + out_list <- bpmapply(start=assignments$start, end=assignments$end, + FUN=.sum_across_rows_internal, + MoreArgs=list(by_set=ans, ntotal=nlevels(ids), mat=object), + BPPARAM=BPPARAM, SIMPLIFY=FALSE, USE.NAMES=FALSE) out <- do.call(cbind, out_list) colnames(out) <- colnames(object) - rownames(out) <- names(by_set) + rownames(out) <- levels(ids) out } -.sum_across_rows_internal <- function(mat, by_set, start, end) +.sum_across_rows_internal <- function(mat, by_set, ntotal, start, end) # Internal function to drag along the namespace. { - .Call(cxx_sum_row_counts, mat, by_set, start, end) + .Call(cxx_sum_row_counts, mat, by_set, ntotal, start, end) } - - diff --git a/src/init.cpp b/src/init.cpp index 8357587b..3005e708 100644 --- a/src/init.cpp +++ b/src/init.cpp @@ -13,7 +13,7 @@ static const R_CallMethodDef all_call_entries[] = { REGISTER(combined_qc, 7), REGISTER(top_cumprop, 2), - REGISTER(sum_row_counts, 4), + REGISTER(sum_row_counts, 5), REGISTER(sum_col_counts, 4), REGISTER(row_above, 4), diff --git a/src/scater.h b/src/scater.h index 17ab9baf..aac31a5c 100644 --- a/src/scater.h +++ b/src/scater.h @@ -17,7 +17,7 @@ SEXP combined_qc(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); SEXP top_cumprop(SEXP, SEXP); -SEXP sum_row_counts(SEXP, SEXP, SEXP, SEXP); +SEXP sum_row_counts(SEXP, SEXP, SEXP, SEXP, SEXP); SEXP sum_col_counts(SEXP, SEXP, SEXP, SEXP); diff --git a/src/sum_counts.cpp b/src/sum_counts.cpp index 9cb9426e..a3e6de5b 100644 --- a/src/sum_counts.cpp +++ b/src/sum_counts.cpp @@ -8,25 +8,17 @@ #include #include -std::vector create_summation(Rcpp::List Sumset) { - const size_t nsummations=Sumset.size(); - std::vector summation_set(Sumset.size()); - for (size_t fx=0; fx -Rcpp::RObject sum_row_counts_internal(Rcpp::RObject input, const std::vector& summable_set, size_t start_index, size_t end_index) { +Rcpp::RObject sum_row_counts_internal(Rcpp::RObject input, const Rcpp::IntegerVector& summable_set, + size_t nsummations, size_t start_index, size_t end_index) +{ auto mat=beachmat::create_matrix(input); const size_t ncells=mat->get_ncol(); - const size_t nsummations=summable_set.size(); typename M::vector holder_out(nsummations); if (end_index > ncells) { throw std::runtime_error("end index out of range"); @@ -44,41 +36,38 @@ Rcpp::RObject sum_row_counts_internal(Rcpp::RObject input, const std::vectorset_col(c - start_index, holder_out.begin()); + std::fill(holder_out.begin(), holder_out.end(), 0); } return out->yield(); } -SEXP sum_row_counts (SEXP counts, SEXP sumset, SEXP job_start, SEXP job_end) { +SEXP sum_row_counts (SEXP counts, SEXP sumset, SEXP nsums, SEXP job_start, SEXP job_end) { BEGIN_RCPP - auto summation_set=create_summation(sumset); - // Determining the type and amount of work to do. size_t start_index=check_integer_scalar(job_start, "start index"); size_t end_index=check_integer_scalar(job_end, "end index"); - if (end_index < start_index) { throw std::runtime_error("start index is less than end index"); } - const size_t ncells=end_index - start_index; + if (end_index < start_index) { + throw std::runtime_error("start index is less than end index"); + } + const size_t nsummations=check_integer_scalar(nsums, "number of sets"); auto mattype=beachmat::find_sexp_type(counts); if (mattype==INTSXP) { return sum_row_counts_internal(counts, summation_set, start_index, end_index); + beachmat::integer_output>(counts, sumset, nsummations, start_index, end_index); } else if (mattype==REALSXP) { return sum_row_counts_internal(counts, summation_set, start_index, end_index); + beachmat::numeric_output>(counts, sumset, nsummations, start_index, end_index); } else { throw std::runtime_error("unacceptable matrix type"); @@ -91,7 +80,9 @@ SEXP sum_row_counts (SEXP counts, SEXP sumset, SEXP job_start, SEXP job_end) { ************************/ template -Rcpp::RObject sum_col_counts_internal(Rcpp::RObject input, const std::vector& summable_set, size_t start_index, size_t end_index) { +Rcpp::RObject sum_col_counts_internal(Rcpp::RObject input, const Rcpp::List& summable_set, + size_t start_index, size_t end_index) +{ auto mat=beachmat::create_matrix(input); const size_t ncells=mat->get_ncol(); const size_t ngenes=mat->get_nrow(); @@ -104,10 +95,17 @@ Rcpp::RObject sum_col_counts_internal(Rcpp::RObject input, const std::vector(njobs, summable_set.size(), OPARAM); - - size_t i_summed=0; - for (const auto& curset : summable_set) { + const size_t nsummations=summable_set.size(); + auto out=beachmat::create_output(njobs, nsummations, OPARAM); + + /* We do the rather unintuitive approach of running across each set + * and summing columns, rather than running across columns and adding + * them to the different sets as we go along. This approach has the + * benefit of avoiding the need to hold intermediate sums, which may + * be costly to re-extract depending on the backend used in output_param. + */ + for (size_t i_summed=0; i_summed!=summable_set.size(); ++i_summed) { + Rcpp::IntegerVector curset=summable_set[i_summed]; std::fill(holder_out.begin()+start_index, holder_out.begin()+end_index, 0); for (auto cell : curset) { @@ -126,10 +124,9 @@ Rcpp::RObject sum_col_counts_internal(Rcpp::RObject input, const std::vectorset_col(i_summed, holder_out.begin()+start_index); - ++i_summed; } return out->yield(); } @@ -137,25 +134,24 @@ Rcpp::RObject sum_col_counts_internal(Rcpp::RObject input, const std::vector(counts, summation_set, start_index, end_index); + beachmat::integer_output>(counts, sumset, start_index, end_index); } else if (mattype==REALSXP) { return sum_col_counts_internal(counts, summation_set, start_index, end_index); + beachmat::numeric_output>(counts, sumset, start_index, end_index); } else { throw std::runtime_error("unacceptable matrix type"); } END_RCPP } -