diff --git a/R/RcppExports.R b/R/RcppExports.R index b4d14975..26d50894 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -49,6 +49,10 @@ random_impl <- function(genome, length, n, seed = 0L) { .Call(`_valr_random_impl`, genome, length, n, seed) } +read_gtf_impl <- function(filename) { + .Call(`_valr_read_gtf_impl`, filename) +} + shuffle_impl <- function(df, incl, within = FALSE, max_tries = 1000L, seed = 0L) { .Call(`_valr_shuffle_impl`, df, incl, within, max_tries, seed) } diff --git a/R/read_bed.r b/R/read_bed.r index e9d27856..3bfcc749 100644 --- a/R/read_bed.r +++ b/R/read_bed.r @@ -194,38 +194,19 @@ read_bigwig <- function(path, ...) { #' Import and convert a GTF/GFF file into a valr compatible bed tbl format #' -#' @description -#' `r lifecycle::badge("deprecated")` -#' #' This function will output a tibble with the #' required chrom, start, and end columns, as well as other columns depending #' on content in GTF/GFF file. #' -#' @param path path to gtf or gff file -#' @param zero_based if TRUE, convert to zero based +#' Based on the GTF specification at #' -#' @examples +#' @param path path to gtf file #' -#' \dontrun{ +#' @examples #' gtf <- read_gtf(valr_example("hg19.gencode.gtf.gz")) #' head(gtf) -#' } #' #' @export -read_gtf <- function(path, zero_based = TRUE) { - lifecycle::deprecate_stop( - when = "0.8.3", - what = "read_gtf()", - details = c( - x = paste0( - "read_gtf() was removed because rtracklayer does not pass ", - "CRAN AddressSantizer checks of the UCSC C-library code vendored ", - "in rtracklayer." - ), - i = paste0( - "convert GTF to BED, and then `read_bed()`, ", - "or use `rtracklayer::import()` then `gr_to_bed()`." - ) - ) - ) +read_gtf <- function(path) { + as_tibble(read_gtf_impl(path)) } diff --git a/man/read_gtf.Rd b/man/read_gtf.Rd index 72786151..378af9eb 100644 --- a/man/read_gtf.Rd +++ b/man/read_gtf.Rd @@ -4,25 +4,21 @@ \alias{read_gtf} \title{Import and convert a GTF/GFF file into a valr compatible bed tbl format} \usage{ -read_gtf(path, zero_based = TRUE) +read_gtf(path, ...) } \arguments{ -\item{path}{path to gtf or gff file} - -\item{zero_based}{if TRUE, convert to zero based} +\item{path}{path to gtf file} } \description{ -\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} - This function will output a tibble with the required chrom, start, and end columns, as well as other columns depending on content in GTF/GFF file. } +\details{ +Based on the GTF specification at \url{https://useast.ensembl.org/info/website/upload/gff.html} +} \examples{ - -\dontrun{ gtf <- read_gtf(valr_example("hg19.gencode.gtf.gz")) head(gtf) -} } diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 6122e492..0d43f690 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -180,6 +180,17 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// read_gtf_impl +DataFrame read_gtf_impl(std::string filename); +RcppExport SEXP _valr_read_gtf_impl(SEXP filenameSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< std::string >::type filename(filenameSEXP); + rcpp_result_gen = Rcpp::wrap(read_gtf_impl(filename)); + return rcpp_result_gen; +END_RCPP +} // shuffle_impl DataFrame shuffle_impl(DataFrame df, DataFrame incl, bool within, int max_tries, int seed); RcppExport SEXP _valr_shuffle_impl(SEXP dfSEXP, SEXP inclSEXP, SEXP withinSEXP, SEXP max_triesSEXP, SEXP seedSEXP) { diff --git a/src/init.c b/src/init.c index 11fd9038..1682ecf8 100644 --- a/src/init.c +++ b/src/init.c @@ -3,46 +3,48 @@ #include // for NULL #include -/* FIXME: - Check these declarations against the C/Fortran source code. -*/ +/* FIXME: + Check these declarations against the C/Fortran source code. + */ /* .Call calls */ -extern SEXP _valr_bed12toexons_impl(void *); -extern SEXP _valr_closest_impl(void *, void *, void *, void *, void *, void *); -extern SEXP _valr_complement_impl(void *, void *); -extern SEXP _valr_coverage_impl(void *, void *, void *, void *); -extern SEXP _valr_dist_impl(void *, void *, void *, void *, void *); -extern SEXP _valr_flank_impl(void *, void *, void *, void *, void *, void *, void *, void *); -extern SEXP _valr_gcoverage_impl(void *, void *); -extern SEXP _valr_intersect_impl(void *, void *, void *, void *, void *, void *, void *); -extern SEXP _valr_makewindows_impl(void *, void *, void *, void *, void *); -extern SEXP _valr_merge_impl(void *, void *, void *); -extern SEXP _valr_partition_impl(void *, void *); -extern SEXP _valr_random_impl(void *, void *, void *, void *); -extern SEXP _valr_shuffle_impl(void *, void *, void *, void *, void *); -extern SEXP _valr_subtract_impl(void *, void *, void *, void *); +extern SEXP _valr_bed12toexons_impl(SEXP); +extern SEXP _valr_closest_impl(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); +extern SEXP _valr_complement_impl(SEXP, SEXP); +extern SEXP _valr_coverage_impl(SEXP, SEXP, SEXP, SEXP); +extern SEXP _valr_dist_impl(SEXP, SEXP, SEXP, SEXP, SEXP); +extern SEXP _valr_flank_impl(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); +extern SEXP _valr_gcoverage_impl(SEXP, SEXP); +extern SEXP _valr_intersect_impl(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); +extern SEXP _valr_makewindows_impl(SEXP, SEXP, SEXP, SEXP, SEXP); +extern SEXP _valr_merge_impl(SEXP, SEXP, SEXP); +extern SEXP _valr_partition_impl(SEXP, SEXP); +extern SEXP _valr_random_impl(SEXP, SEXP, SEXP, SEXP); +extern SEXP _valr_read_gtf_impl(SEXP); +extern SEXP _valr_shuffle_impl(SEXP, SEXP, SEXP, SEXP, SEXP); +extern SEXP _valr_subtract_impl(SEXP, SEXP, SEXP, SEXP); static const R_CallMethodDef CallEntries[] = { - {"_valr_bed12toexons_impl", (DL_FUNC) &_valr_bed12toexons_impl, 1}, - {"_valr_closest_impl", (DL_FUNC) &_valr_closest_impl, 6}, - {"_valr_complement_impl", (DL_FUNC) &_valr_complement_impl, 2}, - {"_valr_coverage_impl", (DL_FUNC) &_valr_coverage_impl, 4}, - {"_valr_dist_impl", (DL_FUNC) &_valr_dist_impl, 5}, - {"_valr_flank_impl", (DL_FUNC) &_valr_flank_impl, 8}, - {"_valr_gcoverage_impl", (DL_FUNC) &_valr_gcoverage_impl, 2}, - {"_valr_intersect_impl", (DL_FUNC) &_valr_intersect_impl, 7}, - {"_valr_makewindows_impl", (DL_FUNC) &_valr_makewindows_impl, 5}, - {"_valr_merge_impl", (DL_FUNC) &_valr_merge_impl, 3}, - {"_valr_partition_impl", (DL_FUNC) &_valr_partition_impl, 2}, - {"_valr_random_impl", (DL_FUNC) &_valr_random_impl, 4}, - {"_valr_shuffle_impl", (DL_FUNC) &_valr_shuffle_impl, 5}, - {"_valr_subtract_impl", (DL_FUNC) &_valr_subtract_impl, 4}, - {NULL, NULL, 0} + {"_valr_bed12toexons_impl", (DL_FUNC) &_valr_bed12toexons_impl, 1}, + {"_valr_closest_impl", (DL_FUNC) &_valr_closest_impl, 6}, + {"_valr_complement_impl", (DL_FUNC) &_valr_complement_impl, 2}, + {"_valr_coverage_impl", (DL_FUNC) &_valr_coverage_impl, 4}, + {"_valr_dist_impl", (DL_FUNC) &_valr_dist_impl, 5}, + {"_valr_flank_impl", (DL_FUNC) &_valr_flank_impl, 8}, + {"_valr_gcoverage_impl", (DL_FUNC) &_valr_gcoverage_impl, 2}, + {"_valr_intersect_impl", (DL_FUNC) &_valr_intersect_impl, 7}, + {"_valr_makewindows_impl", (DL_FUNC) &_valr_makewindows_impl, 5}, + {"_valr_merge_impl", (DL_FUNC) &_valr_merge_impl, 3}, + {"_valr_partition_impl", (DL_FUNC) &_valr_partition_impl, 2}, + {"_valr_random_impl", (DL_FUNC) &_valr_random_impl, 4}, + {"_valr_read_gtf_impl", (DL_FUNC) &_valr_read_gtf_impl, 1}, + {"_valr_shuffle_impl", (DL_FUNC) &_valr_shuffle_impl, 5}, + {"_valr_subtract_impl", (DL_FUNC) &_valr_subtract_impl, 4}, + {NULL, NULL, 0} }; void R_init_valr(DllInfo *dll) { - R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); - R_useDynamicSymbols(dll, FALSE); + R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); + R_useDynamicSymbols(dll, FALSE); } diff --git a/src/read_gtf.cpp b/src/read_gtf.cpp new file mode 100644 index 00000000..b3fe099c --- /dev/null +++ b/src/read_gtf.cpp @@ -0,0 +1,147 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace Rcpp; + +std::istream* open_file(std::string filename, std::ifstream& file_stream, gzFile& gz_file, bool& is_gzip) { + if (filename.size() > 3 && filename.substr(filename.size() - 3) == ".gz") { + is_gzip = true; + gz_file = gzopen(filename.c_str(), "r"); + if (!gz_file) stop("Unable to open gzip file: " + filename); + return nullptr; // Placeholder for gzip stream + } else { + is_gzip = false; + file_stream.open(filename); + if (!file_stream.is_open()) stop("Unable to open file: " + filename); + return &file_stream; + } +} + +std::string gz_getline(gzFile gz_file) { + char buffer[4096]; + std::string line; + while (gzgets(gz_file, buffer, sizeof(buffer))) { + line += buffer; + if (!line.empty() && line.back() == '\n') { + line.pop_back(); + break; + } + } + return line; +} + +// [[Rcpp::export]] +DataFrame read_gtf_impl(std::string filename) { + // Cols for the GTF file + std::vector seqname, source, feature; + std::vector start, end; + std::vector score; + std::vector strand, frame; + + // For parsing attrs + std::set all_keys; + std::vector> row_attrs; + + std::ifstream file_stream; + gzFile gz_file = nullptr; + bool is_gzip; + std::istream* input_stream = open_file(filename, file_stream, gz_file, is_gzip); + + std::string line; + while (true) { + if (is_gzip) { + line = gz_getline(gz_file); + if (line.empty() && gzeof(gz_file)) break; + } else { + if (!std::getline(*input_stream, line)) break; + } + + if (line.empty() || line[0] == '#') continue; // Skip comments + + std::istringstream linestream(line); + std::string seq, src, feat, attr; + int st, en; + double sc; + char str, frm; + + linestream >> seq >> src >> feat >> st >> en >> sc >> str >> frm; + std::getline(linestream, attr); + + // Trim leading spaces from attrs + attr.erase(0, attr.find_first_not_of(" \t")); + + // Populate basic cols + seqname.push_back(seq); + source.push_back(src); + feature.push_back(feat); + start.push_back(st); + end.push_back(en); + score.push_back(sc); + strand.push_back(str); + frame.push_back(frm); + + // Parse attrs into a map for this row + std::unordered_map attr_map; + std::istringstream attr_stream(attr); + std::string key_value_pair; + while (std::getline(attr_stream, key_value_pair, ';')) { + size_t eq_pos = key_value_pair.find(' '); + if (eq_pos == std::string::npos) continue; + + std::string key = key_value_pair.substr(0, eq_pos); + std::string value = key_value_pair.substr(eq_pos + 1); + + // Trim spaces and quotes + key.erase(0, key.find_first_not_of(" \t")); + key.erase(key.find_last_not_of(" \t") + 1); + + value.erase(0, value.find_first_not_of(" \t\"")); + value.erase(value.find_last_not_of(" \t\"") + 1); + + attr_map[key] = value; + all_keys.insert(key); + } + row_attrs.push_back(attr_map); + } + + if (is_gzip) gzclose(gz_file); + if (file_stream.is_open()) file_stream.close(); + + // Initialize attr cols + std::unordered_map> attr_cols; + for (const auto &key : all_keys) { + attr_cols[key] = std::vector(seqname.size(), std::string()); + } + + // Populate attr cols + for (size_t i = 0; i < row_attrs.size(); ++i) { + for (const auto &pair : row_attrs[i]) { + attr_cols[pair.first][i] = pair.second; + } + } + + // Create the DataFrame + List df = List::create( + Named("seqname") = seqname, + Named("source") = source, + Named("feature") = feature, + Named("start") = start, + Named("end") = end, + Named("score") = score, + Named("strand") = strand, + Named("frame") = frame + ); + + // Add attr cols as named entries + for (const auto &col : attr_cols) { + df[col.first] = col.second; + } + + return DataFrame(df); +} diff --git a/tests/testthat/test_read_bed.r b/tests/testthat/test_read_bed.r index 1fb30164..0011123a 100644 --- a/tests/testthat/test_read_bed.r +++ b/tests/testthat/test_read_bed.r @@ -51,6 +51,6 @@ test_that("read bigwig", { test_that("read gtf", { - expect_error(read_gtf(gtf_path)) - # expect_equal(ncol(x), 26) + x <- read_gtf(gtf_path) + expect_equal(ncol(x), 26) })