Skip to content

Commit

Permalink
More cache-efficient version of sumCountsAcrossFeatures.
Browse files Browse the repository at this point in the history
  • Loading branch information
LTLA committed Jun 14, 2019
1 parent d2e08f4 commit 16625ee
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 49 deletions.
20 changes: 12 additions & 8 deletions R/sumCountsAcrossFeatures.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#' @param BPPARAM A \linkS4class{BiocParallelParam} object specifying whether summation should be parallelized.
#'
#' @return A count matrix where counts for all features in the same set are summed together within each cell.
#' Rows are ordered according to \code{levels(ids)}.
#'
#' @details
#' This function provides a convenient method for aggregating counts across multiple rows for each cell.
Expand Down Expand Up @@ -47,21 +48,24 @@ sumCountsAcrossFeatures <- function(object, ids, exprs_values="counts", BPPARAM=
object <- assay(object, exprs_values, withDimnames=FALSE)
}

by_set <- split(seq_along(ids) - 1L, ids)
ids <- as.factor(ids)
ans <- integer(length(ids))
ans <- as.integer(ids) - 1L

assignments <- .assign_jobs_to_workers(ncol(object), BPPARAM)
out_list <- bpmapply(start=assignments$start, end=assignments$end, FUN=.sum_across_rows_internal,
MoreArgs=list(by_set=by_set, mat=object), BPPARAM=BPPARAM, SIMPLIFY=FALSE, USE.NAMES=FALSE)
out_list <- bpmapply(start=assignments$start, end=assignments$end,
FUN=.sum_across_rows_internal,
MoreArgs=list(by_set=ans, ntotal=nlevels(ids), mat=object),
BPPARAM=BPPARAM, SIMPLIFY=FALSE, USE.NAMES=FALSE)

out <- do.call(cbind, out_list)
colnames(out) <- colnames(object)
rownames(out) <- names(by_set)
rownames(out) <- levels(ids)
out
}

.sum_across_rows_internal <- function(mat, by_set, start, end)
.sum_across_rows_internal <- function(mat, by_set, ntotal, start, end)
# Internal function to drag along the namespace.
{
.Call(cxx_sum_row_counts, mat, by_set, start, end)
.Call(cxx_sum_row_counts, mat, by_set, ntotal, start, end)
}


2 changes: 1 addition & 1 deletion src/init.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ static const R_CallMethodDef all_call_entries[] = {
REGISTER(combined_qc, 7),
REGISTER(top_cumprop, 2),

REGISTER(sum_row_counts, 4),
REGISTER(sum_row_counts, 5),
REGISTER(sum_col_counts, 4),

REGISTER(row_above, 4),
Expand Down
2 changes: 1 addition & 1 deletion src/scater.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ SEXP combined_qc(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
SEXP top_cumprop(SEXP, SEXP);


SEXP sum_row_counts(SEXP, SEXP, SEXP, SEXP);
SEXP sum_row_counts(SEXP, SEXP, SEXP, SEXP, SEXP);

SEXP sum_col_counts(SEXP, SEXP, SEXP, SEXP);

Expand Down
74 changes: 35 additions & 39 deletions src/sum_counts.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,25 +8,17 @@
#include <stdexcept>
#include <vector>

std::vector<Rcpp::IntegerVector> create_summation(Rcpp::List Sumset) {
const size_t nsummations=Sumset.size();
std::vector<Rcpp::IntegerVector> summation_set(Sumset.size());
for (size_t fx=0; fx<nsummations; ++fx) {
summation_set[fx]=Sumset[fx];
}
return summation_set;
}

/***************************
* Summing across features *
***************************/

template <class M, class O>
Rcpp::RObject sum_row_counts_internal(Rcpp::RObject input, const std::vector<Rcpp::IntegerVector>& summable_set, size_t start_index, size_t end_index) {
Rcpp::RObject sum_row_counts_internal(Rcpp::RObject input, const Rcpp::IntegerVector& summable_set,
size_t nsummations, size_t start_index, size_t end_index)
{
auto mat=beachmat::create_matrix<M>(input);
const size_t ncells=mat->get_ncol();

const size_t nsummations=summable_set.size();
typename M::vector holder_out(nsummations);
if (end_index > ncells) {
throw std::runtime_error("end index out of range");
Expand All @@ -44,41 +36,38 @@ Rcpp::RObject sum_row_counts_internal(Rcpp::RObject input, const std::vector<Rcp
for (size_t c=start_index; c<end_index; ++c) {
col_holder.fill(c);
auto it=col_holder.get_values();
auto ssIt=summable_set.begin();

for (auto& curval : holder_out) {
const auto& curset=(*ssIt);
curval=0;
for (auto feat : curset) {
curval += *(it + feat);
for (auto s : summable_set) {
if (s!=NA_INTEGER) {
holder_out[s]+=*it;
}
++ssIt;
++it;
}

out->set_col(c - start_index, holder_out.begin());
std::fill(holder_out.begin(), holder_out.end(), 0);
}
return out->yield();
}

SEXP sum_row_counts (SEXP counts, SEXP sumset, SEXP job_start, SEXP job_end) {
SEXP sum_row_counts (SEXP counts, SEXP sumset, SEXP nsums, SEXP job_start, SEXP job_end) {
BEGIN_RCPP

auto summation_set=create_summation(sumset);

// Determining the type and amount of work to do.
size_t start_index=check_integer_scalar(job_start, "start index");
size_t end_index=check_integer_scalar(job_end, "end index");
if (end_index < start_index) { throw std::runtime_error("start index is less than end index"); }
const size_t ncells=end_index - start_index;
if (end_index < start_index) {
throw std::runtime_error("start index is less than end index");
}
const size_t nsummations=check_integer_scalar(nsums, "number of sets");

auto mattype=beachmat::find_sexp_type(counts);
if (mattype==INTSXP) {
return sum_row_counts_internal<beachmat::integer_matrix,
beachmat::integer_output>(counts, summation_set, start_index, end_index);
beachmat::integer_output>(counts, sumset, nsummations, start_index, end_index);

} else if (mattype==REALSXP) {
return sum_row_counts_internal<beachmat::numeric_matrix,
beachmat::numeric_output>(counts, summation_set, start_index, end_index);
beachmat::numeric_output>(counts, sumset, nsummations, start_index, end_index);

} else {
throw std::runtime_error("unacceptable matrix type");
Expand All @@ -91,7 +80,9 @@ SEXP sum_row_counts (SEXP counts, SEXP sumset, SEXP job_start, SEXP job_end) {
************************/

template <class M, class O>
Rcpp::RObject sum_col_counts_internal(Rcpp::RObject input, const std::vector<Rcpp::IntegerVector>& summable_set, size_t start_index, size_t end_index) {
Rcpp::RObject sum_col_counts_internal(Rcpp::RObject input, const Rcpp::List& summable_set,
size_t start_index, size_t end_index)
{
auto mat=beachmat::create_matrix<M>(input);
const size_t ncells=mat->get_ncol();
const size_t ngenes=mat->get_nrow();
Expand All @@ -104,10 +95,17 @@ Rcpp::RObject sum_col_counts_internal(Rcpp::RObject input, const std::vector<Rcp

beachmat::output_param OPARAM(mat.get());
const size_t njobs=end_index - start_index;
auto out=beachmat::create_output<O>(njobs, summable_set.size(), OPARAM);

size_t i_summed=0;
for (const auto& curset : summable_set) {
const size_t nsummations=summable_set.size();
auto out=beachmat::create_output<O>(njobs, nsummations, OPARAM);

/* We do the rather unintuitive approach of running across each set
* and summing columns, rather than running across columns and adding
* them to the different sets as we go along. This approach has the
* benefit of avoiding the need to hold intermediate sums, which may
* be costly to re-extract depending on the backend used in output_param.
*/
for (size_t i_summed=0; i_summed!=summable_set.size(); ++i_summed) {
Rcpp::IntegerVector curset=summable_set[i_summed];
std::fill(holder_out.begin()+start_index, holder_out.begin()+end_index, 0);

for (auto cell : curset) {
Expand All @@ -126,36 +124,34 @@ Rcpp::RObject sum_col_counts_internal(Rcpp::RObject input, const std::vector<Rcp
holder_out[i] += *val;
}
}
}
}

out->set_col(i_summed, holder_out.begin()+start_index);
++i_summed;
}
return out->yield();
}

SEXP sum_col_counts (SEXP counts, SEXP sumset, SEXP job_start, SEXP job_end) {
BEGIN_RCPP

auto summation_set=create_summation(sumset);

// Determining the type and amount of work to do.
size_t start_index=check_integer_scalar(job_start, "start index");
size_t end_index=check_integer_scalar(job_end, "end index");
if (end_index < start_index) { throw std::runtime_error("start index is less than end index"); }
if (end_index < start_index) {
throw std::runtime_error("start index is less than end index");
}

auto mattype=beachmat::find_sexp_type(counts);
if (mattype==INTSXP) {
return sum_col_counts_internal<beachmat::integer_matrix,
beachmat::integer_output>(counts, summation_set, start_index, end_index);
beachmat::integer_output>(counts, sumset, start_index, end_index);

} else if (mattype==REALSXP) {
return sum_col_counts_internal<beachmat::numeric_matrix,
beachmat::numeric_output>(counts, summation_set, start_index, end_index);
beachmat::numeric_output>(counts, sumset, start_index, end_index);

} else {
throw std::runtime_error("unacceptable matrix type");
}
END_RCPP
}

0 comments on commit 16625ee

Please sign in to comment.