diff --git a/htslib/kstring.h b/htslib/kstring.h index ebb2f9363..dec807ac7 100644 --- a/htslib/kstring.h +++ b/htslib/kstring.h @@ -1,7 +1,7 @@ /* The MIT License Copyright (C) 2011 by Attractive Chaos - Copyright (C) 2013-2014, 2016, 2018-2020, 2022, 2024 Genome Research Ltd. + Copyright (C) 2013-2014, 2016, 2018-2020, 2022, 2024-2025 Genome Research Ltd. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the @@ -449,6 +449,57 @@ static inline int *ksplit(kstring_t *s, int delimiter, int *n) return offsets; } +/** + * kinsert_char - inserts a char to kstring + * @param c - char to insert + * @param pos - position at which to insert, starting from 0 + * @param s - pointer to output string + * Returns 0 on success and -1 on failure + * 0 for pos inserts at start and length of current string as pos appends at + * the end. + */ +static inline int kinsert_char(char c, size_t pos, kstring_t *s) +{ + if (!s || pos > s->l) { + return EOF; + } + if (ks_resize(s, s->l + 2) < 0) { + return EOF; + } + memmove(s->s + pos + 1, s->s + pos, s->l - pos); + s->s[pos] = c; + s->s[++s->l] = 0; + return 0; +} + +/** + * kinsert_str - inserts a null terminated string to kstring + * @param str - string to insert + * @param pos - position at which to insert, starting from 0 + * @param s - pointer to output string + * Returns 0 on success and -1 on failure + * 0 for pos inserts at start and length of current string as pos appends at + * the end. empty string makes no update. + */ +static inline int kinsert_str(const char *str, size_t pos, kstring_t *s) +{ + size_t len = 0; + if (!s || pos > s->l || !str) { + return EOF; + } + if (!(len = strlen(str))) { + return 0; + } + if (ks_resize(s, s->l + len + 1) < 0) { + return EOF; + } + memmove(s->s + pos + len, s->s + pos, s->l - pos); + memcpy(s->s + pos, str, len); + s->l += len; + s->s[s->l] = '\0'; + return 0; +} + #ifdef HTSLIB_SSIZE_T #undef HTSLIB_SSIZE_T #undef ssize_t diff --git a/htslib/vcf.h b/htslib/vcf.h index 9a36cab05..f705e0652 100644 --- a/htslib/vcf.h +++ b/htslib/vcf.h @@ -2,7 +2,7 @@ /// High-level VCF/BCF variant calling file operations. /* Copyright (C) 2012, 2013 Broad Institute. - Copyright (C) 2012-2020, 2022-2023 Genome Research Ltd. + Copyright (C) 2012-2020, 2022-2025 Genome Research Ltd. Author: Heng Li @@ -1501,31 +1501,25 @@ static inline int bcf_float_is_vector_end(float f) return u.i==bcf_float_vector_end ? 1 : 0; } + +/** + * bcf_format_gt_v2 - formats GT information on a string + * @param hdr - bcf header, to get version + * @param fmt - pointer to bcf format data + * @param isample - position of interested sample in data + * @param str - pointer to output string + * Returns 0 on success and -1 on failure + * This method is preferred over bcf_format_gt as this supports vcf4.4 and + * prefixed phasing. Explicit / prefixed phasing for 1st allele is used only + * when it is a must to correctly express phasing. + */ +HTSLIB_EXPORT +int bcf_format_gt_v2(const bcf_hdr_t *hdr, bcf_fmt_t *fmt, int isample, + kstring_t *str) HTS_RESULT_USED; + static inline int bcf_format_gt(bcf_fmt_t *fmt, int isample, kstring_t *str) { - uint32_t e = 0; - #define BRANCH(type_t, convert, missing, vector_end) { \ - uint8_t *ptr = fmt->p + isample*fmt->size; \ - int i; \ - for (i=0; in; i++, ptr += sizeof(type_t)) \ - { \ - type_t val = convert(ptr); \ - if ( val == vector_end ) break; \ - if ( i ) e |= kputc("/|"[val&1], str) < 0; \ - if ( !(val>>1) ) e |= kputc('.', str) < 0; \ - else e |= kputw((val>>1) - 1, str) < 0; \ - } \ - if (i == 0) e |= kputc('.', str) < 0; \ - } - switch (fmt->type) { - case BCF_BT_INT8: BRANCH(int8_t, le_to_i8, bcf_int8_missing, bcf_int8_vector_end); break; - case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, bcf_int16_missing, bcf_int16_vector_end); break; - case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, bcf_int32_missing, bcf_int32_vector_end); break; - case BCF_BT_NULL: e |= kputc('.', str) < 0; break; - default: hts_log_error("Unexpected type %d", fmt->type); return -2; - } - #undef BRANCH - return e == 0 ? 0 : -1; + return bcf_format_gt_v2(NULL, fmt, isample, str); } static inline int bcf_enc_size(kstring_t *s, int size, int type) diff --git a/test/test.pl b/test/test.pl index a286195c3..4042840dc 100755 --- a/test/test.pl +++ b/test/test.pl @@ -53,6 +53,7 @@ run_test('test_bcf2vcf',$opts); run_test('test_vcf_sweep',$opts,out=>'test-vcf-sweep.out'); run_test('test_vcf_various',$opts); +run_test('test_vcf_44', $opts); run_test('test_bcf_sr_sort',$opts); run_test('test_bcf_sr_no_index',$opts); run_test('test_bcf_sr_range', $opts); @@ -1159,6 +1160,15 @@ sub test_vcf_various cmd => "$$opts{path}/test_view $$opts{path}/modhdr.vcf.gz chr22:1-2"); } +sub test_vcf_44 +{ + my ($opts, %args) = @_; + + # vcf4.4 with implicit and explicit phasing info combinations + test_cmd($opts, %args, out => "vcf44_1.expected", + cmd => "$$opts{bin}/htsfile -c $$opts{path}/vcf44_1.vcf"); +} + sub write_multiblock_bgzf { my ($name, $frags) = @_; diff --git a/test/test_kstring.c b/test/test_kstring.c index 8b6188b6e..cfa96fd8c 100644 --- a/test/test_kstring.c +++ b/test/test_kstring.c @@ -1,6 +1,6 @@ /* test_kstring.c -- kstring unit tests - Copyright (C) 2018, 2020, 2024 Genome Research Ltd. + Copyright (C) 2018, 2020, 2024-2025 Genome Research Ltd. Author: Rob Davies @@ -451,6 +451,134 @@ static int test_kgetline2(void) { return EXIT_SUCCESS; } +static int test_kinsertchar(void) { + kstring_t t = KS_INITIALIZE, res = KS_INITIALIZE; + int i = 0; + struct data { + int pos; + const char *val; + }; + + struct data tdata[] = { { -1, ""}, {0, "X0123"}, {1, "0X123"}, {2, "01X23"}, + {3, "012X3"}, {4, "0123X"}, {5, ""} }; + + for (i = -1; i < 6; ++i) { + kstring_t s = KS_INITIALIZE; + kputs("0123", &s); + if (kinsert_char('X', i, &s) < 0) { + if ( i < 0 || i > 4) { ks_free(&s); continue; } //expected failures + fprintf(stderr, "kinsert_char failed\n"); + ks_free(&s); + return -1; + } + if (s.s[s.l] != '\0') { + fprintf(stderr, "No NUL termination on string from kinsert_char\n"); + ks_free(&s); + return -1; + } + if (memcmp(s.s, tdata[i + 1].val, s.l + 1)) { + fprintf(stderr, "kinsert_char comparison failed\n"); + ks_free(&s); + return -1; + } + ks_free(&s); + } + //realloc checks + for (i = 0; i < 7; ++i) { + kputc('A' + i, &res); + if (kinsert_char('A' + i, t.l, &t) < 0) { + fprintf(stderr, "kinsert_char failed in realloc\n"); + ks_free(&res); ks_free(&t); + return -1; + } + if (t.s[t.l] != '\0') { + fprintf(stderr, "No NUL termination on string from kinsert_char in realloc\n"); + ks_free(&res); ks_free(&t); + return -1; + } + if (memcmp(t.s, res.s, res.l+1)) { + fprintf(stderr, "kinsert_char realloc comparison failed in realloc\n"); + ks_free(&res); ks_free(&t); + return -1; + } + } + ks_free(&t); + ks_free(&res); + return 0; +} + +static int test_kinsertstr(void) { + kstring_t t = KS_INITIALIZE, res = KS_INITIALIZE; + int i = 0; + struct data { + int pos; + const char *val; + }; + + struct data tdata[] = { { -1, ""}, {0, "XYZ0123"}, {1, "0XYZ123"}, + {2, "01XYZ23"}, {3, "012XYZ3"}, {4, "0123XYZ"}, {5, ""} }; + + for (i = -1; i < 6; ++i) { + kstring_t s = KS_INITIALIZE; + kputs("0123", &s); + if (kinsert_str("XYZ", i, &s) < 0) { + if ( i < 0 || i > 4) { ks_free(&s); continue; } //expected failures + fprintf(stderr, "kinsert_str failed\n"); + return -1; + } + if (s.s[s.l] != '\0') { + fprintf(stderr, "No NUL termination on string from kinsert_str\n"); + return -1; + } + if (memcmp(s.s, tdata[i + 1].val, s.l + 1)) { + fprintf(stderr, "kinsert_str comparison failed\n"); + return -1; + } + ks_free(&s); + } + //realloc checks + for (i = 0; i < 15; ++i) { + kstring_t val = KS_INITIALIZE; + ksprintf(&val, "%c", 'A' + i); + kputs(val.s, &res); + if (kinsert_str(val.s, t.l, &t) < 0) { + ks_free(&val); + fprintf(stderr, "kinsert_str failed in realloc\n"); + return -1; + } + if (t.s[t.l] != '\0') { + ks_free(&val); ks_free(&res); + fprintf(stderr, "No NUL termination on string from kinsert_str in realloc\n"); + return -1; + } + if (memcmp(t.s, res.s, res.l+1)) { + ks_free(&val); ks_free(&res); + fprintf(stderr, "kinsert_str realloc comparison failed in realloc\n"); + return -1; + } + ks_free(&val); + } + //empty strings + ks_free(&t); + if (kinsert_str("", 1, &t)) { //expected + if (kinsert_str("", 0, &t) || t.l != 0) { + fprintf(stderr, "kinsert_str empty insertion failed\n"); + return -1; + } + } else { + fprintf(stderr, "kinsert_str empty ins to invalid pos succeeded\n"); + return -1; + } + i = res.l; + if (kinsert_str("", 1, &res) || i != res.l) { + fprintf(stderr, "kinsert_str empty ins to valid pos failed\n"); + ks_free(&res); + return -1; + } + ks_free(&res); + return 0; +} + int main(int argc, char **argv) { int opt, res = EXIT_SUCCESS; int64_t start = 0; @@ -500,5 +628,11 @@ int main(int argc, char **argv) { if (!test || strcmp(test, "kgetline2") == 0) if (test_kgetline2() != 0) res = EXIT_FAILURE; + if (!test || strcmp(test, "kinsertchar") == 0) + if (test_kinsertchar() != 0) res = EXIT_FAILURE; + + if (!test || strcmp(test, "kinsertstr") == 0) + if (test_kinsertstr() != 0) res = EXIT_FAILURE; + return res; } diff --git a/test/vcf44_1.expected b/test/vcf44_1.expected new file mode 100644 index 000000000..b35b98412 --- /dev/null +++ b/test/vcf44_1.expected @@ -0,0 +1,35 @@ +##fileformat=VCFv4.4 +##FILTER= +##contig= +##reference=file://test +##FORMAT= +##failue="test file on explicit and implicit phasing markers in 4.4" +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG00096 HG00097 +1 61462 rs56992750 T A 100 PASS . GT 0|0|1 0/1 +1 61480 rs56992751 T A 100 PASS . GT 0 /0|1 +1 61481 rs56992752 T A 100 PASS . GT /0 |0/1 +1 61482 rs56992752 T A 100 PASS . GT /0 /1 +1 61483 rs56992752 T A 100 PASS . GT 0 1 +1 61484 rs56992752 T A 100 PASS . GT 0 /1 +1 61485 rs56992752 T A 100 PASS . GT 0 1 +1 61486 rs56992752 T A 100 PASS . GT 0 1 +1 61487 rs56992752 T A 100 PASS . GT 0 1 +1 61488 rs56992752 T A 100 PASS . GT 0 /1 +1 61489 rs56992752 T A 100 PASS . GT /0 1 +1 61490 rs56992752 T A 100 PASS . GT /0 1 +1 61491 rs56992752 T A 100 PASS . GT /0 /1 +1 61492 rs56992752 T A 100 PASS . GT /0|0 1/0 +1 61493 rs56992752 T A 100 PASS . GT 0|0 |1/0 +1 61494 rs56992752 T A 100 PASS . GT /0|0 1/0 +1 61495 rs56992752 T A 100 PASS . GT 0|0 |1/0 +1 61496 rs56992752 T A 100 PASS . GT . . +1 61497 rs56992752 T A 100 PASS . GT . |. +1 61498 rs56992752 T A 100 PASS . GT ./1 .|1 +1 61499 rs56992752 T A 100 PASS . GT ./1 .|1 +1 61500 rs56992752 T A 100 PASS . GT |./1 /.|1 +1 61501 rs56992752 T A 100 PASS . GT 1/. 1|. +1 61502 rs56992752 T A 100 PASS . GT 1/. /1|. +1 61503 rs56992752 T A 100 PASS . GT |1/. 1|. +1 61504 rs56992752 T A 100 PASS . GT ./. .|. +1 61505 rs56992752 T A 100 PASS . GT ./. .|. +1 61506 rs56992752 T A 100 PASS . GT |./. /.|. diff --git a/test/vcf44_1.vcf b/test/vcf44_1.vcf new file mode 100644 index 000000000..b4a1cbb34 --- /dev/null +++ b/test/vcf44_1.vcf @@ -0,0 +1,34 @@ +##fileformat=VCFv4.4 +##contig= +##reference=file://test +##FORMAT= +##failue="test file on explicit and implicit phasing markers in 4.4" +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG00096 HG00097 +1 61462 rs56992750 T A 100 PASS . GT 0|0|1 0/1 +1 61480 rs56992751 T A 100 PASS . GT 0 /0|1 +1 61481 rs56992752 T A 100 PASS . GT /0 |0/1 +1 61482 rs56992752 T A 100 PASS . GT /0 /1 +1 61483 rs56992752 T A 100 PASS . GT 0 1 +1 61484 rs56992752 T A 100 PASS . GT 0 /1 +1 61485 rs56992752 T A 100 PASS . GT 0 |1 +1 61486 rs56992752 T A 100 PASS . GT |0 1 +1 61487 rs56992752 T A 100 PASS . GT |0 |1 +1 61488 rs56992752 T A 100 PASS . GT |0 /1 +1 61489 rs56992752 T A 100 PASS . GT /0 1 +1 61490 rs56992752 T A 100 PASS . GT /0 |1 +1 61491 rs56992752 T A 100 PASS . GT /0 /1 +1 61492 rs56992752 T A 100 PASS . GT /0|0 /1/0 +1 61493 rs56992752 T A 100 PASS . GT |0|0 |1/0 +1 61494 rs56992752 T A 100 PASS . GT /0|0 1/0 +1 61495 rs56992752 T A 100 PASS . GT 0|0 |1/0 +1 61496 rs56992752 T A 100 PASS . GT . . +1 61497 rs56992752 T A 100 PASS . GT /. |. +1 61498 rs56992752 T A 100 PASS . GT ./1 .|1 +1 61499 rs56992752 T A 100 PASS . GT /./1 |.|1 +1 61500 rs56992752 T A 100 PASS . GT |./1 /.|1 +1 61501 rs56992752 T A 100 PASS . GT 1/. 1|. +1 61502 rs56992752 T A 100 PASS . GT /1/. /1|. +1 61503 rs56992752 T A 100 PASS . GT |1/. |1|. +1 61504 rs56992752 T A 100 PASS . GT ./. .|. +1 61505 rs56992752 T A 100 PASS . GT /./. |.|. +1 61506 rs56992752 T A 100 PASS . GT |./. /.|. \ No newline at end of file diff --git a/vcf.c b/vcf.c index f22f4450d..6f8d89dcb 100644 --- a/vcf.c +++ b/vcf.c @@ -1,7 +1,7 @@ /* vcf.c -- VCF/BCF API functions. Copyright (C) 2012, 2013 Broad Institute. - Copyright (C) 2012-2024 Genome Research Ltd. + Copyright (C) 2012-2025 Genome Research Ltd. Portions copyright (C) 2014 Intel Corporation. Author: Heng Li @@ -116,6 +116,7 @@ typedef struct vdict_t dict; // bcf_hdr_t.dict[0] vdict_t dictionary which keeps bcf_idinfo_t for BCF_HL_FLT,BCF_HL_INFO,BCF_HL_FMT hdict_t *gen; // hdict_t dictionary which keeps bcf_hrec_t* pointers for generic and structured fields size_t *key_len;// length of h->id[BCF_DT_ID] strings + int version; //cached version } bcf_hdr_aux_t; @@ -124,6 +125,69 @@ static inline bcf_hdr_aux_t *get_hdr_aux(const bcf_hdr_t *hdr) return (bcf_hdr_aux_t *)hdr->dict[0]; } +//version macros +#define VCF_DEF 4002000 +#define VCF44 4004000 + +#define VCF_MAJOR_VER(x) ( (x) / 10000 / 100 ) +#define VCF_MINOR_VER(x) ( ((x) % 1000000) / 1000 ) + +/** + * bcf_get_version - get the version as int + * @param hdr - bcf header, to get version + * @param verstr- version string, which is already available + * Returns version on success and default version on failure + * version = major * 100 * 10000 + minor * 1000 + */ +static int bcf_get_version(const bcf_hdr_t *hdr, const char *verstr) +{ + const char *version = NULL, vcf[] = "VCFv"; + char *major = NULL, *minor = NULL; + int ver = -1; + long tmp = 0; + bcf_hdr_aux_t *aux = NULL; + + if (!hdr && !verstr) { //invalid input + goto fail; + } + + if (hdr) { + if ((aux = get_hdr_aux(hdr)) && aux->version != 0) { //use cached version + return aux->version; + } + //get from header + version = bcf_hdr_get_version(hdr); + } else { + //get from version string + version = verstr; + } + if (!(major = strstr(version, vcf))) { //bad format + goto fail; + } + major += sizeof(vcf) - 1; + if (!(minor = strchr(major, '.'))) { //bad format + goto fail; + } + tmp = strtol(major, NULL, 10); + if ((!tmp && errno == EINVAL) || + ((tmp == LONG_MIN || tmp == LONG_MAX) && errno == ERANGE)) { //failed + goto fail; + } + ver = tmp * 100 * 10000; + tmp = strtol(++minor, NULL, 10); + if ((!tmp && errno == EINVAL) || + ((tmp == LONG_MIN || tmp == LONG_MAX) && errno == ERANGE)) { //failed + goto fail; + } + ver += tmp * 1000; + return ver; + +fail: + hts_log_warning("Couldn't get VCF version, considering as %d.%d", + VCF_MAJOR_VER(VCF_DEF), VCF_MINOR_VER(VCF_DEF)); + return VCF_DEF; +} + static char *find_chrom_header_line(char *s) { char *nl; @@ -985,7 +1049,6 @@ static void bcf_hdr_remove_from_hdict(bcf_hdr_t *hdr, bcf_hrec_t *hrec) int bcf_hdr_update_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec, const bcf_hrec_t *tmp) { - // currently only for bcf_hdr_set_version assert( hrec->type==BCF_HL_GEN ); int ret; khint_t k; @@ -1014,6 +1077,11 @@ int bcf_hdr_update_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec, const bcf_hrec_t *tmp) free(hrec->value); hrec->value = strdup(tmp->value); if ( !hrec->value ) return -1; + + if (!strcmp(hrec->key,"fileformat")) { + //update version + get_hdr_aux(hdr)->version = bcf_get_version(NULL, hrec->value); + } return 0; } @@ -1037,7 +1105,6 @@ int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) bcf_hrec_destroy(hrec); return 0; } - // Is one of the generic fields and already present? if ( ksprintf(&str, "##%s=%s", hrec->key,hrec->value) < 0 ) { @@ -1052,6 +1119,9 @@ int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) free(str.s); return 0; } + if (!strcmp(hrec->key, "fileformat")) { + aux->version = bcf_get_version(NULL, hrec->value); + } } int i; @@ -1387,6 +1457,8 @@ int bcf_hdr_set_version(bcf_hdr_t *hdr, const char *version) if ( ksprintf(&str,"##fileformat=%s", version) < 0 ) return -1; hrec = bcf_hdr_parse_line(hdr, str.s, &len); free(str.s); + + get_hdr_aux(hdr)->version = bcf_get_version(NULL, hrec->value); } else { @@ -1420,6 +1492,7 @@ bcf_hdr_t *bcf_hdr_init(const char *mode) if ( (aux->gen = kh_init(hdict))==NULL ) { free(aux); goto fail; } aux->key_len = NULL; aux->dict = *((vdict_t*)h->dict[0]); + aux->version = 0; free(h->dict[0]); h->dict[0] = aux; @@ -1428,6 +1501,7 @@ bcf_hdr_t *bcf_hdr_init(const char *mode) bcf_hdr_append(h, "##fileformat=VCFv4.2"); // The filter PASS must appear first in the dictionary bcf_hdr_append(h, "##FILTER="); + aux->version = VCF_DEF; } return h; @@ -3061,8 +3135,10 @@ static int vcf_parse_format_fill5(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, const char *t = q + 1; int m = 0; // m: sample id const int nsamples = bcf_hdr_nsamples(h); - const char *end = s->s + s->l; + + int ver = bcf_get_version(h, NULL); + while ( tis_gt) { // Genotypes. - // ([|/])+... where is [0-9]+ or ".". + //([/|])?)([|/])+... where is [0-9]+ or ".". int32_t is_phased = 0; uint32_t *x = (uint32_t*)(z->buf + z->size * (size_t)m); uint32_t unreadable = 0; uint32_t max = 0; - int overflow = 0; + int overflow = 0, ploidy = 0, anyunphased = 0, \ + phasingprfx = 0, unknown1 = 0; + + /* with prefixed phasing, it is explicitly given for 1st one + with non-prefixed, set based on ploidy and phasing of other + alleles. */ + if (ver >= VCF44 && (*t == '|' || *t == '/')) { + // cache prefix and phasing status + is_phased = *t++ == '|'; + phasingprfx = 1; + } + for (l = 0;; ++t) { + ploidy++; if (*t == '.') { ++t, x[l++] = is_phased; + if (l==1) { //for 1st allele only + unknown1 = 1; + } } else { const char *tt = t; uint32_t val; @@ -3125,9 +3216,21 @@ static int vcf_parse_format_fill5(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, if (max < val) max = val; x[l++] = (val + 1) << 1 | is_phased; } + anyunphased |= (ploidy != 1) && !is_phased; is_phased = (*t == '|'); if (*t != '|' && *t != '/') break; } + if (ver >= VCF44 && !phasingprfx) { + /* no explicit phasing for 1st allele, set based on + other alleles and ploidy */ + if (ploidy == 1) { //implicitly phased + if (!unknown1) { + x[0] |= 1; + } + } else { //set by other unphased alleles + x[0] |= (anyunphased)? 0 : 1; + } + } // Possibly check max against v->n_allele instead? if (overflow || max > (INT32_MAX >> 1) - 1) { hts_log_error("Couldn't read GT data: value too large at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1); @@ -4129,7 +4232,7 @@ int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s) uint8_t *ptr = (uint8_t *)v->indiv.s; int gt_i = -1; bcf_fmt_t *fmt = v->d.fmt; - int first = 1; + int first = 1, ret = 0; int fmt_packed = !(v->unpacked & BCF_UN_FMT); if (fmt_packed) { @@ -4165,6 +4268,8 @@ int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s) if (!id || !id->key) { hts_log_error("Invalid BCF, the FORMAT tag id=%d at %s:%"PRIhts_pos" not present in the header", z->id, bcf_seqname_safe(h, v), v->pos+1); errno = EINVAL; + if (fmt_packed) + free(fmt); return -1; } @@ -4187,7 +4292,13 @@ int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s) if (!first) kputc_(':', s); first = 0; if (gt_i == i) { - bcf_format_gt(f,j,s); + if ((ret = bcf_format_gt_v2(h, f,j,s)) < 0) { + hts_log_error("Failed to format GT value for sample %d, returned %d", i, ret); + errno = EINVAL; + if (fmt_packed) + free(fmt); + return -1; + } break; } else if (f->n == 1) @@ -5950,3 +6061,77 @@ const char *bcf_strerror(int errorcode, char *buffer, size_t maxbuffer) { return buffer; } +/** + * bcf_format_gt_v2 - formats GT information on a string + * @param hdr - bcf header, to get version + * @param fmt - pointer to bcf format data + * @param isample - position of interested sample in data + * @param str - pointer to output string + * Returns 0 on success and -1 on failure + * This method is preferred over bcf_format_gt as this supports vcf4.4 and + * prefixed phasing. Explicit / prefixed phasing for 1st allele is used only + * when it is a must to correctly express phasing. + * correctly express phasing. + */ +int bcf_format_gt_v2(const bcf_hdr_t *hdr, bcf_fmt_t *fmt, int isample, kstring_t *str) +{ + uint32_t e = 0; + int ploidy = 1, anyunphased = 0; + int32_t val0 = 0; + size_t pos = str ? str->l : 0; + + #define BRANCH(type_t, convert, missing, vector_end) { \ + uint8_t *ptr = fmt->p + isample*fmt->size; \ + int i; \ + for (i=0; in; i++, ptr += sizeof(type_t)) \ + { \ + type_t val = convert(ptr); \ + if ( val == vector_end ) break; \ + if (!i) { val0 = val; } \ + if (i) { \ + e |= kputc("/|"[val & 1], str) < 0; \ + anyunphased |= !(val & 1); \ + } \ + if (!(val >> 1)) e |= kputc('.', str) < 0; \ + else e |= kputw((val >> 1) - 1, str) < 0; \ + } \ + if (i == 0) e |= kputc('.', str) < 0; \ + ploidy = i; \ + } + switch (fmt->type) { + case BCF_BT_INT8: BRANCH(int8_t, le_to_i8, bcf_int8_missing, + bcf_int8_vector_end); break; + case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, bcf_int16_missing, + bcf_int16_vector_end); break; + case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, bcf_int32_missing, + bcf_int32_vector_end); break; + case BCF_BT_NULL: e |= kputc('.', str) < 0; break; + default: hts_log_error("Unexpected type %d", fmt->type); return -2; + } + #undef BRANCH + + if (hdr && get_hdr_aux(hdr)->version >= VCF44) { + //output which supports prefixed phasing + + /* update 1st allele's phasing if required and append rest to it. + use prefixed phasing only when it is a must. i.e. without which the + inferred value will be incorrect */ + if (val0 & 1) { + /* 1st one is phased, if ploidy is > 1 and an unphased allele exists + need to specify explicitly */ + e |= (ploidy > 1 && anyunphased) ? + (kinsert_char('|', pos, str) < 0) : + (ploidy <= 1 && !((val0 >> 1)) ? //|. needs explicit o/p + (kinsert_char('|', pos, str) < 0) : + 0); + } else { + /* 1st allele is unphased, if ploidy is = 1 or allele is '.' or + ploidy > 1 and no other unphased allele exist, need to specify + explicitly */ + e |= ((ploidy <= 1 && val0 != 0) || (ploidy > 1 && !anyunphased)) ? + (kinsert_char('/', pos, str) < 0) : + 0; + } + } + return e == 0 ? 0 : -1; +}