From dac8f29cf77c2d75ea5047a582344594cb5c549e Mon Sep 17 00:00:00 2001 From: vasudeva8 Date: Fri, 20 Dec 2024 21:02:26 +0000 Subject: [PATCH] Add bcf_sr_add_hreader() interface This is like bcf_sr_add_reader but the caller supplies an existing open htsFile instead of a filename. Fixes #1862 --- htslib/synced_bcf_reader.h | 22 ++++++++++++++- synced_bcf_reader.c | 58 ++++++++++++++++++++++++++++++++------ test/test-bcf-sr.c | 46 ++++++++++++++++++++++++++---- test/test.pl | 29 +++++++++++++++++++ 4 files changed, 139 insertions(+), 16 deletions(-) diff --git a/htslib/synced_bcf_reader.h b/htslib/synced_bcf_reader.h index fb42877c9..a81ac371d 100644 --- a/htslib/synced_bcf_reader.h +++ b/htslib/synced_bcf_reader.h @@ -1,7 +1,7 @@ /// @file htslib/synced_bcf_reader.h /// Stream through multiple VCF files. /* - Copyright (C) 2012-2017, 2019-2024 Genome Research Ltd. + Copyright (C) 2012-2017, 2019-2025 Genome Research Ltd. Author: Petr Danecek @@ -233,10 +233,30 @@ void bcf_sr_destroy_threads(bcf_srs_t *files); * * See also the bcf_srs_t data structure for parameters controlling * the reader's logic. + * Invokes bcf_sr_add_hreader with opened file */ HTSLIB_EXPORT int bcf_sr_add_reader(bcf_srs_t *readers, const char *fname); +/** + * bcf_sr_add_hreader() - open new reader using htsfile + * @readers: holder of the open readers + * @file_ptr: htsfile already opened + * @autoclose: close file along with reader or not, 1 - close, 0 - do not close + * @idxname: index file name for file in @file_ptr + * + * Returns 1 if the call succeeded, or 0 on error. + * + * See also the bcf_srs_t data structure for parameters controlling + * the reader's logic. + * If idxname is NULL, uses file_ptr->fn to find index file. + * With idxname as NULL, index file must be present along with the file with + * default name + */ +HTSLIB_EXPORT +int bcf_sr_add_hreader(bcf_srs_t *readers, htsFile *file_ptr, int autoclose, + const char *idxname); + HTSLIB_EXPORT void bcf_sr_remove_reader(bcf_srs_t *files, int i); diff --git a/synced_bcf_reader.c b/synced_bcf_reader.c index ac687ed03..fb177d279 100644 --- a/synced_bcf_reader.c +++ b/synced_bcf_reader.c @@ -1,6 +1,6 @@ /* synced_bcf_reader.c -- stream through multiple VCF files. - Copyright (C) 2012-2023 Genome Research Ltd. + Copyright (C) 2012-2023, 2025 Genome Research Ltd. Author: Petr Danecek @@ -69,6 +69,7 @@ typedef struct { sr_sort_t sort; int regions_overlap, targets_overlap; + int *closefile; // close htsfile with sync reader close or not } aux_t; @@ -251,6 +252,9 @@ void bcf_sr_destroy_threads(bcf_srs_t *files) { int bcf_sr_add_reader(bcf_srs_t *files, const char *fname) { char fmode[5]; + int ret = 0; + const char *idxname = NULL; + strcpy(fmode, "r"); vcf_open_mode(fmode+1, fname, NULL); htsFile* file_ptr = hts_open(fname, fmode); @@ -258,6 +262,22 @@ int bcf_sr_add_reader(bcf_srs_t *files, const char *fname) files->errnum = open_failed; return 0; } + //get idx name and pass to add_hreader + idxname = strstr(fname, HTS_IDX_DELIM); + idxname += idxname ? sizeof(HTS_IDX_DELIM) - 1 : 0; + if (!(ret = bcf_sr_add_hreader(files, file_ptr, 1, idxname))) { + hts_close(file_ptr); //failed, close the file + } + return ret; +} + +int bcf_sr_add_hreader(bcf_srs_t *files, htsFile *file_ptr, int autoclose, const char *idxname) +{ + aux_t *auxdata = NULL; + if ( ! file_ptr ) { + files->errnum = open_failed; + return 0; + } files->has_line = (int*) realloc(files->has_line, sizeof(int)*(files->nreaders+1)); files->has_line[files->nreaders] = 0; @@ -274,7 +294,7 @@ int bcf_sr_add_reader(bcf_srs_t *files, const char *fname) BGZF *bgzf = hts_get_bgzfp(reader->file); if ( bgzf && bgzf_check_EOF(bgzf) == 0 ) { files->errnum = no_eof; - hts_log_warning("No BGZF EOF marker; file '%s' may be truncated", fname); + hts_log_warning("No BGZF EOF marker; file '%s' may be truncated", file_ptr->fn); } if (files->p) bgzf_thread_pool(bgzf, files->p->pool, files->p->qsize); @@ -290,7 +310,7 @@ int bcf_sr_add_reader(bcf_srs_t *files, const char *fname) return 0; } - reader->tbx_idx = tbx_index_load(fname); + reader->tbx_idx = tbx_index_load2(file_ptr->fn, idxname); if ( !reader->tbx_idx ) { files->errnum = idx_load_failed; @@ -309,7 +329,7 @@ int bcf_sr_add_reader(bcf_srs_t *files, const char *fname) reader->header = bcf_hdr_read(reader->file); - reader->bcf_idx = bcf_index_load(fname); + reader->bcf_idx = bcf_index_load2(file_ptr->fn, idxname); if ( !reader->bcf_idx ) { files->errnum = idx_load_failed; @@ -362,7 +382,7 @@ int bcf_sr_add_reader(bcf_srs_t *files, const char *fname) return 0; } - reader->fname = strdup(fname); + reader->fname = strdup(file_ptr->fn); if ( files->apply_filters ) reader->filter_ids = init_filters(reader->header, files->apply_filters, &reader->nfilter_ids); @@ -413,6 +433,18 @@ int bcf_sr_add_reader(bcf_srs_t *files, const char *fname) } } + if ((auxdata = BCF_SR_AUX(files))) { + //store closure status for htsfile + int *tmp = realloc(auxdata->closefile, sizeof(int) * files->nreaders); + if (!tmp) { + hts_log_error("Failed to allocate memory"); + return 0; + } + tmp[files->nreaders - 1] = autoclose; + auxdata->closefile = tmp; + } + + return 1; } @@ -426,13 +458,15 @@ bcf_srs_t *bcf_sr_init(void) return files; } -static void bcf_sr_destroy1(bcf_sr_t *reader) +static void bcf_sr_destroy1(bcf_sr_t *reader, int closefile) { free(reader->fname); if ( reader->tbx_idx ) tbx_destroy(reader->tbx_idx); if ( reader->bcf_idx ) hts_idx_destroy(reader->bcf_idx); bcf_hdr_destroy(reader->header); - hts_close(reader->file); + if (closefile) { + hts_close(reader->file); + } if ( reader->itr ) tbx_itr_destroy(reader->itr); int j; for (j=0; jmbuffer; j++) @@ -445,8 +479,10 @@ static void bcf_sr_destroy1(bcf_sr_t *reader) void bcf_sr_destroy(bcf_srs_t *files) { int i; + int *autoclose = BCF_SR_AUX(files)->closefile; + for (i=0; inreaders; i++) - bcf_sr_destroy1(&files->readers[i]); + bcf_sr_destroy1(&files->readers[i], autoclose[i]); free(files->has_line); free(files->readers); for (i=0; in_smpl; i++) free(files->samples[i]); @@ -456,6 +492,7 @@ void bcf_sr_destroy(bcf_srs_t *files) if (files->tmps.m) free(files->tmps.s); if (files->n_threads) bcf_sr_destroy_threads(files); bcf_sr_sort_destroy(&BCF_SR_AUX(files)->sort); + free(autoclose); free(files->aux); free(files); } @@ -463,12 +500,15 @@ void bcf_sr_destroy(bcf_srs_t *files) void bcf_sr_remove_reader(bcf_srs_t *files, int i) { assert( !files->samples ); // not ready for this yet + int *autoclose = BCF_SR_AUX(files)->closefile; + bcf_sr_sort_remove_reader(files, &BCF_SR_AUX(files)->sort, i); - bcf_sr_destroy1(&files->readers[i]); + bcf_sr_destroy1(&files->readers[i], autoclose[i]); if ( i+1 < files->nreaders ) { memmove(&files->readers[i], &files->readers[i+1], (files->nreaders-i-1)*sizeof(bcf_sr_t)); memmove(&files->has_line[i], &files->has_line[i+1], (files->nreaders-i-1)*sizeof(int)); + memmove(&autoclose[i], &autoclose[i+1], (files->nreaders-i-1)*sizeof(int)); } files->nreaders--; } diff --git a/test/test-bcf-sr.c b/test/test-bcf-sr.c index 0fb59905c..77c53465f 100644 --- a/test/test-bcf-sr.c +++ b/test/test-bcf-sr.c @@ -1,5 +1,5 @@ /* - Copyright (C) 2017, 2020, 2023 Genome Research Ltd. + Copyright (C) 2017, 2020, 2023, 2025 Genome Research Ltd. Author: Petr Danecek @@ -54,7 +54,7 @@ error(const char *format, ...) void HTS_NORETURN usage(int exit_code) { fprintf(stderr, "Usage: test-bcf-sr [OPTIONS] vcf-list.txt\n"); - fprintf(stderr, " test-bcf-sr [OPTIONS] -args file1.bcf [...]\n"); + fprintf(stderr, " test-bcf-sr [OPTIONS] --args file1.bcf [...]\n"); fprintf(stderr, "Options:\n"); fprintf(stderr, " --args pass filenames directly in argument list\n"); fprintf(stderr, " --no-index allow streaming\n"); @@ -63,6 +63,7 @@ void HTS_NORETURN usage(int exit_code) fprintf(stderr, " -p, --pair logic: snps,indels,both,snps+ref,indels+ref,both+ref,exact,some,all\n"); fprintf(stderr, " -r, --regions comma-separated list of regions\n"); fprintf(stderr, " -t, --targets comma-separated list of targets\n"); + fprintf(stderr, " -u, --usefptr use hfile pointer interface on reader addition\n"); fprintf(stderr, "\n"); exit(exit_code); } @@ -133,13 +134,15 @@ int main(int argc, char *argv[]) {"targets",required_argument,NULL,'t'}, {"no-index",no_argument,NULL,1000}, {"args",no_argument,NULL,1001}, + {"usefptr",no_argument,NULL,'u'}, {NULL,0,NULL,0} }; - int c, pair = 0, use_index = 1, use_fofn = 1; + int c, pair = 0, use_index = 1, use_fofn = 1, usefptr = 0; enum htsExactFormat out_fmt = text_format; // for original pos + alleles const char *out_fn = NULL, *regions = NULL, *targets = NULL; - while ((c = getopt_long(argc, argv, "o:O:p:r:t:h", loptions, NULL)) >= 0) + htsFile **htsfp = NULL; + while ((c = getopt_long(argc, argv, "o:O:p:r:t:hu", loptions, NULL)) >= 0) { switch (c) { @@ -179,6 +182,9 @@ int main(int argc, char *argv[]) case 1001: use_fofn = 0; break; + case 'u': + usefptr = 1; //use htsfile interface instead of fname i/f + break; case 'h': usage(EXIT_SUCCESS); default: usage(EXIT_FAILURE); @@ -218,8 +224,32 @@ int main(int argc, char *argv[]) error("Failed to set targets\n"); } - for (i=0; ierrnum)); + if (usefptr && !(htsfp = malloc(sizeof(htsFile*) * nvcf))) { + error("Failed to allocate memory\n"); + } + + for (i=0; ierrnum)); + } + } else { //use htsfile i/f + if (!(htsfp[i] = hts_open(vcfs[i], "r"))) { + error("Failed to open %s: %s\n", vcfs[i], + bcf_sr_strerror(sr->errnum)); + } + /*with name, index can be anywhere, named as anything + w/o name it has to be along with file with default naming*/ + + const char *idxname = strstr(vcfs[i], HTS_IDX_DELIM); + idxname += idxname ? sizeof(HTS_IDX_DELIM) - 1 : 0; + if ( !bcf_sr_add_hreader(sr, htsfp[i], 1, idxname) ) { + error("Failed to add reader %s: %s\n", vcfs[i], + bcf_sr_strerror(sr->errnum)); + } + } + } if (!sr->readers || sr->nreaders < 1) error("No readers set, even though one was added\n"); @@ -264,6 +294,10 @@ int main(int argc, char *argv[]) free(vcfs[i]); free(vcfs); } + if (usefptr) { + //files are closed along with sr destroy + free(htsfp); + } return 0; } diff --git a/test/test.pl b/test/test.pl index a286195c3..f1b2cf60b 100755 --- a/test/test.pl +++ b/test/test.pl @@ -56,6 +56,7 @@ run_test('test_bcf_sr_sort',$opts); run_test('test_bcf_sr_no_index',$opts); run_test('test_bcf_sr_range', $opts); +run_test('test_bcf_sr_hreader', $opts); run_test('test_command',$opts,cmd=>'test-bcf-translate -',out=>'test-bcf-translate.out'); run_test('test_convert_padded_header',$opts); run_test('test_rebgzip',$opts); @@ -1345,6 +1346,34 @@ sub test_bcf_sr_range { } } +sub test_bcf_sr_hreader { + #uses input file from test_bcf_sr_sort / test-bcf-sr.pl + #invokes bcf sync reader with hread method + my ($opts, %args) = @_; + my $test = "test_bcf_sr_hreader"; + my $fail = 0; + my $cmd = "$$opts{path}/test-bcf-sr -p all $$opts{tmp}/list.txt -o $$opts{tmp}/file.out"; + my $cmd_header = "$$opts{path}/test-bcf-sr -p all $$opts{tmp}/list.txt -o $$opts{tmp}/filenew.out -u"; + my $cmd_diff = "diff $$opts{tmp}/filenew.out $$opts{tmp}/file.out"; + + my ($ret, $out) = _cmd($cmd); + if ($ret != 0) { + failed($opts, $test, "Failed to create reference output\n"); + return; + } + ($ret, $out) = _cmd($cmd_header); + if ($ret != 0) { + failed($opts, $test, "Failed to create output\n"); + return; + } + ($ret, $out) = _cmd($cmd_diff); + if ($ret != 0) { + failed($opts, $test, "Output differs to reference output\n"); + return; + } + passed($opts, $test); +} + sub test_command { my ($opts, %args) = @_;