Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement read_gtf() #428

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,10 @@ random_impl <- function(genome, length, n, seed = 0L) {
.Call(`_valr_random_impl`, genome, length, n, seed)
}

read_gtf_impl <- function(filename) {
.Call(`_valr_read_gtf_impl`, filename)
}

shuffle_impl <- function(df, incl, within = FALSE, max_tries = 1000L, seed = 0L) {
.Call(`_valr_shuffle_impl`, df, incl, within, max_tries, seed)
}
Expand Down
29 changes: 5 additions & 24 deletions R/read_bed.r
Original file line number Diff line number Diff line change
Expand Up @@ -194,38 +194,19 @@ read_bigwig <- function(path, ...) {

#' Import and convert a GTF/GFF file into a valr compatible bed tbl format
#'
#' @description
#' `r lifecycle::badge("deprecated")`
#'
#' This function will output a tibble with the
#' required chrom, start, and end columns, as well as other columns depending
#' on content in GTF/GFF file.
#'
#' @param path path to gtf or gff file
#' @param zero_based if TRUE, convert to zero based
#' Based on the GTF specification at <https://useast.ensembl.org/info/website/upload/gff.html>
#'
#' @examples
#' @param path path to gtf file
#'
#' \dontrun{
#' @examples
#' gtf <- read_gtf(valr_example("hg19.gencode.gtf.gz"))
#' head(gtf)
#' }
#'
#' @export
read_gtf <- function(path, zero_based = TRUE) {
lifecycle::deprecate_stop(
when = "0.8.3",
what = "read_gtf()",
details = c(
x = paste0(
"read_gtf() was removed because rtracklayer does not pass ",
"CRAN AddressSantizer checks of the UCSC C-library code vendored ",
"in rtracklayer."
),
i = paste0(
"convert GTF to BED, and then `read_bed()`, ",
"or use `rtracklayer::import()` then `gr_to_bed()`."
)
)
)
read_gtf <- function(path) {
as_tibble(read_gtf_impl(path))
}
14 changes: 5 additions & 9 deletions man/read_gtf.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

11 changes: 11 additions & 0 deletions src/RcppExports.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,17 @@ BEGIN_RCPP
return rcpp_result_gen;
END_RCPP
}
// read_gtf_impl
DataFrame read_gtf_impl(std::string filename);
RcppExport SEXP _valr_read_gtf_impl(SEXP filenameSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< std::string >::type filename(filenameSEXP);
rcpp_result_gen = Rcpp::wrap(read_gtf_impl(filename));
return rcpp_result_gen;
END_RCPP
}
// shuffle_impl
DataFrame shuffle_impl(DataFrame df, DataFrame incl, bool within, int max_tries, int seed);
RcppExport SEXP _valr_shuffle_impl(SEXP dfSEXP, SEXP inclSEXP, SEXP withinSEXP, SEXP max_triesSEXP, SEXP seedSEXP) {
Expand Down
70 changes: 36 additions & 34 deletions src/init.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,46 +3,48 @@
#include <stdlib.h> // for NULL
#include <R_ext/Rdynload.h>

/* FIXME:
Check these declarations against the C/Fortran source code.
*/
/* FIXME:
Check these declarations against the C/Fortran source code.
*/

/* .Call calls */
extern SEXP _valr_bed12toexons_impl(void *);
extern SEXP _valr_closest_impl(void *, void *, void *, void *, void *, void *);
extern SEXP _valr_complement_impl(void *, void *);
extern SEXP _valr_coverage_impl(void *, void *, void *, void *);
extern SEXP _valr_dist_impl(void *, void *, void *, void *, void *);
extern SEXP _valr_flank_impl(void *, void *, void *, void *, void *, void *, void *, void *);
extern SEXP _valr_gcoverage_impl(void *, void *);
extern SEXP _valr_intersect_impl(void *, void *, void *, void *, void *, void *, void *);
extern SEXP _valr_makewindows_impl(void *, void *, void *, void *, void *);
extern SEXP _valr_merge_impl(void *, void *, void *);
extern SEXP _valr_partition_impl(void *, void *);
extern SEXP _valr_random_impl(void *, void *, void *, void *);
extern SEXP _valr_shuffle_impl(void *, void *, void *, void *, void *);
extern SEXP _valr_subtract_impl(void *, void *, void *, void *);
extern SEXP _valr_bed12toexons_impl(SEXP);
extern SEXP _valr_closest_impl(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
extern SEXP _valr_complement_impl(SEXP, SEXP);
extern SEXP _valr_coverage_impl(SEXP, SEXP, SEXP, SEXP);
extern SEXP _valr_dist_impl(SEXP, SEXP, SEXP, SEXP, SEXP);
extern SEXP _valr_flank_impl(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
extern SEXP _valr_gcoverage_impl(SEXP, SEXP);
extern SEXP _valr_intersect_impl(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
extern SEXP _valr_makewindows_impl(SEXP, SEXP, SEXP, SEXP, SEXP);
extern SEXP _valr_merge_impl(SEXP, SEXP, SEXP);
extern SEXP _valr_partition_impl(SEXP, SEXP);
extern SEXP _valr_random_impl(SEXP, SEXP, SEXP, SEXP);
extern SEXP _valr_read_gtf_impl(SEXP);
extern SEXP _valr_shuffle_impl(SEXP, SEXP, SEXP, SEXP, SEXP);
extern SEXP _valr_subtract_impl(SEXP, SEXP, SEXP, SEXP);

static const R_CallMethodDef CallEntries[] = {
{"_valr_bed12toexons_impl", (DL_FUNC) &_valr_bed12toexons_impl, 1},
{"_valr_closest_impl", (DL_FUNC) &_valr_closest_impl, 6},
{"_valr_complement_impl", (DL_FUNC) &_valr_complement_impl, 2},
{"_valr_coverage_impl", (DL_FUNC) &_valr_coverage_impl, 4},
{"_valr_dist_impl", (DL_FUNC) &_valr_dist_impl, 5},
{"_valr_flank_impl", (DL_FUNC) &_valr_flank_impl, 8},
{"_valr_gcoverage_impl", (DL_FUNC) &_valr_gcoverage_impl, 2},
{"_valr_intersect_impl", (DL_FUNC) &_valr_intersect_impl, 7},
{"_valr_makewindows_impl", (DL_FUNC) &_valr_makewindows_impl, 5},
{"_valr_merge_impl", (DL_FUNC) &_valr_merge_impl, 3},
{"_valr_partition_impl", (DL_FUNC) &_valr_partition_impl, 2},
{"_valr_random_impl", (DL_FUNC) &_valr_random_impl, 4},
{"_valr_shuffle_impl", (DL_FUNC) &_valr_shuffle_impl, 5},
{"_valr_subtract_impl", (DL_FUNC) &_valr_subtract_impl, 4},
{NULL, NULL, 0}
{"_valr_bed12toexons_impl", (DL_FUNC) &_valr_bed12toexons_impl, 1},
{"_valr_closest_impl", (DL_FUNC) &_valr_closest_impl, 6},
{"_valr_complement_impl", (DL_FUNC) &_valr_complement_impl, 2},
{"_valr_coverage_impl", (DL_FUNC) &_valr_coverage_impl, 4},
{"_valr_dist_impl", (DL_FUNC) &_valr_dist_impl, 5},
{"_valr_flank_impl", (DL_FUNC) &_valr_flank_impl, 8},
{"_valr_gcoverage_impl", (DL_FUNC) &_valr_gcoverage_impl, 2},
{"_valr_intersect_impl", (DL_FUNC) &_valr_intersect_impl, 7},
{"_valr_makewindows_impl", (DL_FUNC) &_valr_makewindows_impl, 5},
{"_valr_merge_impl", (DL_FUNC) &_valr_merge_impl, 3},
{"_valr_partition_impl", (DL_FUNC) &_valr_partition_impl, 2},
{"_valr_random_impl", (DL_FUNC) &_valr_random_impl, 4},
{"_valr_read_gtf_impl", (DL_FUNC) &_valr_read_gtf_impl, 1},
{"_valr_shuffle_impl", (DL_FUNC) &_valr_shuffle_impl, 5},
{"_valr_subtract_impl", (DL_FUNC) &_valr_subtract_impl, 4},
{NULL, NULL, 0}
};

void R_init_valr(DllInfo *dll)
{
R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
R_useDynamicSymbols(dll, FALSE);
R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
R_useDynamicSymbols(dll, FALSE);
}
147 changes: 147 additions & 0 deletions src/read_gtf.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
#include <Rcpp.h>
#include <fstream>
#include <sstream>
#include <unordered_map>
#include <vector>
#include <string>
#include <set>
#include <zlib.h>

using namespace Rcpp;

std::istream* open_file(std::string filename, std::ifstream& file_stream, gzFile& gz_file, bool& is_gzip) {
if (filename.size() > 3 && filename.substr(filename.size() - 3) == ".gz") {
is_gzip = true;
gz_file = gzopen(filename.c_str(), "r");
if (!gz_file) stop("Unable to open gzip file: " + filename);
return nullptr; // Placeholder for gzip stream
} else {
is_gzip = false;
file_stream.open(filename);
if (!file_stream.is_open()) stop("Unable to open file: " + filename);
return &file_stream;
}
}

std::string gz_getline(gzFile gz_file) {
char buffer[4096];
std::string line;
while (gzgets(gz_file, buffer, sizeof(buffer))) {
line += buffer;
if (!line.empty() && line.back() == '\n') {
line.pop_back();
break;
}
}
return line;
}

// [[Rcpp::export]]
DataFrame read_gtf_impl(std::string filename) {
// Cols for the GTF file
std::vector<std::string> seqname, source, feature;
std::vector<int> start, end;
std::vector<double> score;
std::vector<char> strand, frame;

// For parsing attrs
std::set<std::string> all_keys;
std::vector<std::unordered_map<std::string, std::string>> row_attrs;

std::ifstream file_stream;
gzFile gz_file = nullptr;
bool is_gzip;
std::istream* input_stream = open_file(filename, file_stream, gz_file, is_gzip);

std::string line;
while (true) {
if (is_gzip) {
line = gz_getline(gz_file);
if (line.empty() && gzeof(gz_file)) break;
} else {
if (!std::getline(*input_stream, line)) break;
}

if (line.empty() || line[0] == '#') continue; // Skip comments

std::istringstream linestream(line);
std::string seq, src, feat, attr;
int st, en;
double sc;
char str, frm;

linestream >> seq >> src >> feat >> st >> en >> sc >> str >> frm;
std::getline(linestream, attr);

// Trim leading spaces from attrs
attr.erase(0, attr.find_first_not_of(" \t"));

// Populate basic cols
seqname.push_back(seq);
source.push_back(src);
feature.push_back(feat);
start.push_back(st);
end.push_back(en);
score.push_back(sc);
strand.push_back(str);
frame.push_back(frm);

// Parse attrs into a map for this row
std::unordered_map<std::string, std::string> attr_map;
std::istringstream attr_stream(attr);
std::string key_value_pair;
while (std::getline(attr_stream, key_value_pair, ';')) {
size_t eq_pos = key_value_pair.find(' ');
if (eq_pos == std::string::npos) continue;

std::string key = key_value_pair.substr(0, eq_pos);
std::string value = key_value_pair.substr(eq_pos + 1);

// Trim spaces and quotes
key.erase(0, key.find_first_not_of(" \t"));
key.erase(key.find_last_not_of(" \t") + 1);

value.erase(0, value.find_first_not_of(" \t\""));
value.erase(value.find_last_not_of(" \t\"") + 1);

attr_map[key] = value;
all_keys.insert(key);
}
row_attrs.push_back(attr_map);
}

if (is_gzip) gzclose(gz_file);
if (file_stream.is_open()) file_stream.close();

// Initialize attr cols
std::unordered_map<std::string, std::vector<std::string>> attr_cols;
for (const auto &key : all_keys) {
attr_cols[key] = std::vector<std::string>(seqname.size(), std::string());
}

// Populate attr cols
for (size_t i = 0; i < row_attrs.size(); ++i) {
for (const auto &pair : row_attrs[i]) {
attr_cols[pair.first][i] = pair.second;
}
}

// Create the DataFrame
List df = List::create(
Named("seqname") = seqname,
Named("source") = source,
Named("feature") = feature,
Named("start") = start,
Named("end") = end,
Named("score") = score,
Named("strand") = strand,
Named("frame") = frame
);

// Add attr cols as named entries
for (const auto &col : attr_cols) {
df[col.first] = col.second;
}

return DataFrame(df);
}
4 changes: 2 additions & 2 deletions tests/testthat/test_read_bed.r
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,6 @@ test_that("read bigwig", {


test_that("read gtf", {
expect_error(read_gtf(gtf_path))
# expect_equal(ncol(x), 26)
x <- read_gtf(gtf_path)
expect_equal(ncol(x), 26)
})
Loading