From 624f554a2ad186f63f67e56fcdce15012489cbc0 Mon Sep 17 00:00:00 2001 From: Nuno Agostinho Date: Tue, 21 Nov 2023 11:52:03 +0000 Subject: [PATCH 1/7] Parse CNV:TR alleles from VCF --- modules/Bio/EnsEMBL/VEP/Parser/VCF.pm | 62 +++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/modules/Bio/EnsEMBL/VEP/Parser/VCF.pm b/modules/Bio/EnsEMBL/VEP/Parser/VCF.pm index 678df3d9e..3fcefe789 100644 --- a/modules/Bio/EnsEMBL/VEP/Parser/VCF.pm +++ b/modules/Bio/EnsEMBL/VEP/Parser/VCF.pm @@ -433,6 +433,68 @@ sub create_StructuralVariationFeatures { } $alt = $ref . "/$alt" unless $alt =~ /^\.|\.$/; } + ## parse tandem repeats based on VCF INFO fields + elsif ($so_term =~ /tandem/ ) { + # warn that CIRUC and CIRB INFO fields are ignored + $self->warning_msg("CIRUC and CIRB INFO fields are ignored when calculating alternative alleles in tandem repeats") + if $info->{CIRUC} or $info->{CIRB}; + + if ($info->{RUS}) { + # RN : Total number of repeat sequences in this allele + # RUS : Repeat unit sequence of the corresponding repeat sequence + # RUC : Repeat unit count of corresponding repeat sequence + # RB : Total number of bases in the corresponding repeat sequence + my @RN = $info->{RN} ? + split(/,/, $info->{RN}) : + # if RN is missing, assume 1 repeat for each CNV:TR allele + (1) x ( () = $type =~ /CNV:TR/g ); + my @RUS = split /,/, $info->{RUS}; + my @RUC = split /,/, ($info->{RUC} || ''); # undefined if no RUC + my @RB = split /,/, ($info->{RB} || ''); # undefined if no RB + + my $is_RUC_defined = scalar @RUC; + + # consider tandem repeats as non-SVs to properly calculate consequences + # based on alternative allele sequence + my $is_sv = 0; + + # build sequence of tandem repeat based on INFO fields + my @repeats; + for my $records (@RN) { + my $repeat; + next if $records eq '.'; # ignore missing values + for ( 1 .. $records ) { + my $seq = shift(@RUS); + $seq = 'N' if $seq eq '.'; # missing sequence + my $num = $is_RUC_defined ? shift(@RUC) : shift(@RB) / length($seq); + $repeat .= $seq x $num; + } + + # keep tandem repeats as SVs if larger than max_sv_size + $is_sv = 1 if length($repeat) > ($self->param('max_sv_size') || 5000); + + # prepend reference allele + push @repeats, $ref . $repeat; + } + $alt = $ref.'/'.join('/', @repeats); + + if (!$is_sv) { + # convert to VariationFeature + my $vf = Bio::EnsEMBL::Variation::VariationFeature->new_fast({ + start => $start, + end => $start + length($ref) - 1, + allele_string => $alt, + strand => 1, + map_weight => 1, + adaptor => $self->get_adaptor('variation', 'VariationFeature'), + variation_name => @$ids ? $ids->[0] : undef, + chr => $chr, + _line => $record, + }); + return $self->post_process_vfs([$vf]); + } + } + } # check for imprecise breakpoints my ($min_start, $max_start, $min_end, $max_end) = ( From 65581774b59556415e954808e409562aa0a68661 Mon Sep 17 00:00:00 2001 From: Nuno Agostinho Date: Wed, 22 Nov 2023 10:18:28 +0000 Subject: [PATCH 2/7] Return as tandem_repeat --- modules/Bio/EnsEMBL/VEP/Parser.pm | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/modules/Bio/EnsEMBL/VEP/Parser.pm b/modules/Bio/EnsEMBL/VEP/Parser.pm index 2d175372f..fdc308234 100755 --- a/modules/Bio/EnsEMBL/VEP/Parser.pm +++ b/modules/Bio/EnsEMBL/VEP/Parser.pm @@ -674,17 +674,19 @@ sub get_SO_term { $subtype = $element if grep /^$element$/i, @mobile_elements; } $abbrev .= '_' . $subtype; - } elsif ($type =~ /DUP:TANDEM|CNV:TR/i) { - # including , + } elsif ($type =~ /DUP:TANDEM/i) { $abbrev = "TDUP"; - } elsif ($type =~ /CNV/i) { - # including , - $abbrev = "CNV"; + } elsif ($type =~ /CNV:TR/i) { + # including , + $abbrev = "TREP"; } elsif ($type =~ /CN=?[0-9]/i) { # ALT: "", ",," "" => SVTYPE: DEL, CNV, DUP $abbrev = "CNV"; $abbrev = "DEL" if $type =~ /^?$/; $abbrev = "DUP" if $type =~ /^?$/; + } elsif ($type =~ /CNV/i) { + # including , + $abbrev = "CNV"; } elsif ($type =~ /[\[\]]|^\.|\.$/) { $abbrev = "BND"; } elsif ($type =~ /^\<|\>$/) { @@ -710,6 +712,7 @@ sub get_SO_term { DEL_LINE1 => 'LINE1_deletion', DEL_SVA => 'SVA_deletion', + TREP => 'tandem_repeat', TDUP => 'tandem_duplication', DUP => 'duplication', CNV => 'copy_number_variation', From e62d7b8c2195fbf63cdd4e94e84a5db20e892fa0 Mon Sep 17 00:00:00 2001 From: Nuno Agostinho Date: Wed, 22 Nov 2023 10:19:14 +0000 Subject: [PATCH 3/7] Test if bigger than max_sv_size --- modules/Bio/EnsEMBL/VEP/Parser/VCF.pm | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/modules/Bio/EnsEMBL/VEP/Parser/VCF.pm b/modules/Bio/EnsEMBL/VEP/Parser/VCF.pm index 3fcefe789..13031a11d 100644 --- a/modules/Bio/EnsEMBL/VEP/Parser/VCF.pm +++ b/modules/Bio/EnsEMBL/VEP/Parser/VCF.pm @@ -439,7 +439,7 @@ sub create_StructuralVariationFeatures { $self->warning_msg("CIRUC and CIRB INFO fields are ignored when calculating alternative alleles in tandem repeats") if $info->{CIRUC} or $info->{CIRB}; - if ($info->{RUS}) { + if ($info->{RUS} && ($info->{RUC} || $info->{RB}) ) { # RN : Total number of repeat sequences in this allele # RUS : Repeat unit sequence of the corresponding repeat sequence # RUC : Repeat unit count of corresponding repeat sequence @@ -451,12 +451,10 @@ sub create_StructuralVariationFeatures { my @RUS = split /,/, $info->{RUS}; my @RUC = split /,/, ($info->{RUC} || ''); # undefined if no RUC my @RB = split /,/, ($info->{RB} || ''); # undefined if no RB - my $is_RUC_defined = scalar @RUC; - # consider tandem repeats as non-SVs to properly calculate consequences - # based on alternative allele sequence - my $is_sv = 0; + my $max_sv_size = $self->param('max_sv_size') || 5000; + my $is_oversized = 0; # build sequence of tandem repeat based on INFO fields my @repeats; @@ -467,19 +465,23 @@ sub create_StructuralVariationFeatures { my $seq = shift(@RUS); $seq = 'N' if $seq eq '.'; # missing sequence my $num = $is_RUC_defined ? shift(@RUC) : shift(@RB) / length($seq); + + # avoid calculating really large repeats + $is_oversized = length($seq) * $num > $max_sv_size; + last if $is_oversized; $repeat .= $seq x $num; } - # keep tandem repeats as SVs if larger than max_sv_size - $is_sv = 1 if length($repeat) > ($self->param('max_sv_size') || 5000); - + last if $is_oversized; # prepend reference allele push @repeats, $ref . $repeat; } - $alt = $ref.'/'.join('/', @repeats); + # avoid storing alternative allele for tandem repeats with large repeats + $alt = $ref.'/'.join('/', @repeats) unless $is_oversized; - if (!$is_sv) { - # convert to VariationFeature + if (!$is_oversized) { + # convert tandem repeats to VariationFeature in order to properly + # calculate consequences based on alternative allele sequence my $vf = Bio::EnsEMBL::Variation::VariationFeature->new_fast({ start => $start, end => $start + length($ref) - 1, From 6fcd5e5e3306662a58a32a52469db39497ec56ce Mon Sep 17 00:00:00 2001 From: Nuno Agostinho Date: Wed, 22 Nov 2023 10:19:28 +0000 Subject: [PATCH 4/7] Add unit tests --- t/Parser_VCF.t | 93 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) diff --git a/t/Parser_VCF.t b/t/Parser_VCF.t index 56e290d13..d669b8411 100755 --- a/t/Parser_VCF.t +++ b/t/Parser_VCF.t @@ -878,6 +878,99 @@ is_deeply($bnd5_vf, bless( { 'Bio::EnsEMBL::Variation::StructuralVariationFeature' ) , 'StructuralVariationFeature - BND with unsupported INFO/END field'); +## test tandem repeats + +sub parse_variant { + my $var = shift; + my $vf = Bio::EnsEMBL::VEP::Parser::VCF->new({ + config => Bio::EnsEMBL::VEP::Config->new({%$base_testing_cfg, gp => 1, warning_file => 'STDERR'}), + file => $test_cfg->create_input_file($var), + valid_chromosomes => [1,2] + })->next(); + + delete($vf->{adaptor}); + delete($vf->{_line}); + + return $vf; +} + +my $tandem = parse_variant([qw(2 68914092 tr0 T , . PASS .)]); +is_deeply($tandem, bless( { + 'chr' => '2', + 'strand' => '1', + 'variation_name' => 'tr0', + 'class_SO_term' => 'tandem_repeat', + 'allele_string' => '/', + 'start' => 68914093, + 'inner_start' => 68914093, + 'outer_start' => 68914093, + 'end' => 68914093, + 'inner_end' => 68914093, + 'outer_end' => 68914093, + 'seq_region_start' => 68914093, + 'seq_region_end' => 68914093 + }, + 'Bio::EnsEMBL::Variation::StructuralVariationFeature' ) , + 'StructuralVariationFeature - generic tandem repeat'); + +my $tandem_RUC = parse_variant([qw(2 68914092 tr0 T , . PASS RUS=CAT,GT,CA;RUC=2,5,4;RN=2,1)]); +is_deeply($tandem_RUC, bless( { + 'chr' => '2', + 'strand' => '1', + 'variation_name' => 'tr0', + 'allele_string' => 'T/T' . 'CAT' x 2 . 'GT' x 5 . '/T' . 'CA' x 4, + 'start' => 68914093, + 'end' => 68914093, + 'seq_region_start' => 68914093, + 'seq_region_end' => 68914093, + 'map_weight' => 1 + }, + 'Bio::EnsEMBL::Variation::VariationFeature' ) , + 'VariationFeature - tandem repeat using RUC'); + +my $tandem_RB = parse_variant([qw(2 68914092 tr0 T , . PASS RUS=CAT,GT,CA;RB=6,10,8;RN=2,1)]); +is_deeply($tandem_RUC, $tandem_RB, 'VariationFeature - tandem repeat using RB'); + +$tandem = parse_variant([qw(2 68914092 tr0 T , . PASS RUS=CAT,.,CA;RUC=2,5,4;RN=2,1)]); +is_deeply($tandem, bless( { + 'chr' => '2', + 'strand' => '1', + 'variation_name' => 'tr0', + 'allele_string' => 'T/T' . 'CAT' x 2 . 'N' x 5 . '/T' . 'CA' x 4, + 'start' => 68914093, + 'end' => 68914093, + 'seq_region_start' => 68914093, + 'seq_region_end' => 68914093, + 'map_weight' => 1 + }, + 'Bio::EnsEMBL::Variation::VariationFeature' ) , + 'VariationFeature - tandem repeat with missing sequence'); + +my $tandem_RN_0 = parse_variant([qw(2 68914092 tr0 T ,, . PASS RUS=CAT,GT,CA;RB=6,10,8)]); +my $tandem_RN_1 = parse_variant([qw(2 68914092 tr0 T ,, . PASS RUS=CAT,GT,CA;RB=6,10,8;RN=1,1,1)]); + +is_deeply($tandem_RN_0, $tandem_RN_1, + 'VariationFeature - tandem repeat omitting RN'); + +$tandem = parse_variant([qw(2 68914092 tr0 T , . PASS RUS=CAT,.,CA;RUC=5,10000001,4;RN=2,1)]); +is_deeply($tandem, bless( { + 'chr' => '2', + 'strand' => '1', + 'variation_name' => 'tr0', + 'class_SO_term' => 'tandem_repeat', + 'allele_string' => '/', + 'start' => 68914093, + 'end' => 68914093, + 'outer_start' => 68914093, + 'outer_end' => 68914093, + 'inner_start' => 68914093, + 'inner_end' => 68914093, + 'seq_region_start' => 68914093, + 'seq_region_end' => 68914093 + }, + 'Bio::EnsEMBL::Variation::StructuralVariationFeature' ) , + 'StructuralVariationFeature - tandem repeat is too large'); + ## OTHER TESTS ############## From 7a3598f5e24ffe4a287531c777faad54ab1f3aa1 Mon Sep 17 00:00:00 2001 From: Nuno Agostinho Date: Wed, 13 Dec 2023 10:15:16 +0000 Subject: [PATCH 5/7] Get proper reference sequence for tandem repeats --- modules/Bio/EnsEMBL/VEP/Parser/VCF.pm | 141 +++++++++++++++----------- 1 file changed, 82 insertions(+), 59 deletions(-) diff --git a/modules/Bio/EnsEMBL/VEP/Parser/VCF.pm b/modules/Bio/EnsEMBL/VEP/Parser/VCF.pm index 13031a11d..98d173672 100644 --- a/modules/Bio/EnsEMBL/VEP/Parser/VCF.pm +++ b/modules/Bio/EnsEMBL/VEP/Parser/VCF.pm @@ -376,6 +376,59 @@ sub create_VariationFeatures { } +sub _expand_tandem_repeat_allele_string { + my ($self, $type, $info, $ref) = @_; + + if ($info->{RUS} && ($info->{RUC} || $info->{RB}) ) { + # warn that CIRUC and CIRB INFO fields are ignored + $self->warning_msg("CIRUC and CIRB INFO fields are ignored when calculating alternative alleles in tandem repeats") + if $info->{CIRUC} or $info->{CIRB}; + + # RN : Total number of repeat sequences in this allele + # RUS : Repeat unit sequence of the corresponding repeat sequence + # RUC : Repeat unit count of corresponding repeat sequence + # RB : Total number of bases in the corresponding repeat sequence + my @RN = $info->{RN} ? + split(/,/, $info->{RN}) : + # if RN is missing, assume 1 repeat for each CNV:TR allele + (1) x ( () = $type =~ /CNV:TR/g ); + my @RUS = split /,/, $info->{RUS}; + my @RUC = split /,/, ($info->{RUC} || ''); # undefined if no RUC + my @RB = split /,/, ($info->{RB} || ''); # undefined if no RB + my $is_RUC_defined = scalar @RUC; + + my $max_sv_size = $self->param('max_sv_size') || 5000; + my $is_oversized = 0; + + # build sequence of tandem repeat based on INFO fields + my @repeats; + for my $records (@RN) { + my $repeat; + next if $records eq '.'; # ignore missing values + for ( 1 .. $records ) { + my $seq = shift(@RUS); + $seq = 'N' if $seq eq '.'; # missing sequence + my $num = $is_RUC_defined ? shift(@RUC) : shift(@RB) / length($seq); + + # avoid calculating really large repeats + $is_oversized = length($seq) * $num > $max_sv_size; + last if $is_oversized; + $repeat .= $seq x $num; + } + + last if $is_oversized; + # prepend reference allele + push @repeats, $repeat; + } + + # avoid storing alternative allele for tandem repeats with large repeats + return $ref.'/'.join('/', @repeats) unless $is_oversized; + } + return undef; +} + + + =head2 create_StructuralVariationFeatures Example : $vfs = $parser->create_StructuralVariationFeatures(); @@ -435,66 +488,36 @@ sub create_StructuralVariationFeatures { } ## parse tandem repeats based on VCF INFO fields elsif ($so_term =~ /tandem/ ) { - # warn that CIRUC and CIRB INFO fields are ignored - $self->warning_msg("CIRUC and CIRB INFO fields are ignored when calculating alternative alleles in tandem repeats") - if $info->{CIRUC} or $info->{CIRB}; - - if ($info->{RUS} && ($info->{RUC} || $info->{RB}) ) { - # RN : Total number of repeat sequences in this allele - # RUS : Repeat unit sequence of the corresponding repeat sequence - # RUC : Repeat unit count of corresponding repeat sequence - # RB : Total number of bases in the corresponding repeat sequence - my @RN = $info->{RN} ? - split(/,/, $info->{RN}) : - # if RN is missing, assume 1 repeat for each CNV:TR allele - (1) x ( () = $type =~ /CNV:TR/g ); - my @RUS = split /,/, $info->{RUS}; - my @RUC = split /,/, ($info->{RUC} || ''); # undefined if no RUC - my @RB = split /,/, ($info->{RB} || ''); # undefined if no RB - my $is_RUC_defined = scalar @RUC; - - my $max_sv_size = $self->param('max_sv_size') || 5000; - my $is_oversized = 0; - - # build sequence of tandem repeat based on INFO fields - my @repeats; - for my $records (@RN) { - my $repeat; - next if $records eq '.'; # ignore missing values - for ( 1 .. $records ) { - my $seq = shift(@RUS); - $seq = 'N' if $seq eq '.'; # missing sequence - my $num = $is_RUC_defined ? shift(@RUC) : shift(@RB) / length($seq); - - # avoid calculating really large repeats - $is_oversized = length($seq) * $num > $max_sv_size; - last if $is_oversized; - $repeat .= $seq x $num; - } + # get reference allele and allele_string from tandem repeats + my $allele_string; + my $slice_ref = $self->get_slice($chr); + if ($slice_ref) { + $slice_ref = $slice_ref->sub_Slice($start, $end, 1); + $ref = $slice_ref->seq if defined $slice_ref and defined $slice_ref->seq; + $allele_string = $self->_expand_tandem_repeat_allele_string($type, $info, $ref) if defined $ref; + } elsif (!defined($self->param('fasta')) && $self->param('offline')) { + $self->warning_msg( + "could not fetch sequence for tandem repeats; " . + "consequence calculation for tandem repeats is less precise when using --offline without --fasta\n"); + } else { + $self->warning_msg("could not fetch sequence for tandem repeat"); + } - last if $is_oversized; - # prepend reference allele - push @repeats, $ref . $repeat; - } - # avoid storing alternative allele for tandem repeats with large repeats - $alt = $ref.'/'.join('/', @repeats) unless $is_oversized; - - if (!$is_oversized) { - # convert tandem repeats to VariationFeature in order to properly - # calculate consequences based on alternative allele sequence - my $vf = Bio::EnsEMBL::Variation::VariationFeature->new_fast({ - start => $start, - end => $start + length($ref) - 1, - allele_string => $alt, - strand => 1, - map_weight => 1, - adaptor => $self->get_adaptor('variation', 'VariationFeature'), - variation_name => @$ids ? $ids->[0] : undef, - chr => $chr, - _line => $record, - }); - return $self->post_process_vfs([$vf]); - } + if (defined $allele_string) { + # convert tandem repeats to VariationFeature in order to properly + # calculate consequences based on alternative allele sequence + my $vf = Bio::EnsEMBL::Variation::VariationFeature->new_fast({ + start => $start, + end => $end, + allele_string => $allele_string, + strand => 1, + map_weight => 1, + adaptor => $self->get_adaptor('variation', 'VariationFeature'), + variation_name => @$ids ? $ids->[0] : undef, + chr => $chr, + _line => $record, + }); + return $self->post_process_vfs([$vf]); } } From d99dc66e053d53a85cdada36090a55c9a8d6ac5a Mon Sep 17 00:00:00 2001 From: Nuno Agostinho Date: Wed, 13 Dec 2023 11:25:16 +0000 Subject: [PATCH 6/7] Get reference length from SVLEN --- modules/Bio/EnsEMBL/VEP/Parser/VCF.pm | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/modules/Bio/EnsEMBL/VEP/Parser/VCF.pm b/modules/Bio/EnsEMBL/VEP/Parser/VCF.pm index 98d173672..f4ca5217f 100644 --- a/modules/Bio/EnsEMBL/VEP/Parser/VCF.pm +++ b/modules/Bio/EnsEMBL/VEP/Parser/VCF.pm @@ -78,6 +78,8 @@ use Bio::EnsEMBL::Utils::Scalar qw(assert_ref); use Bio::EnsEMBL::Utils::Exception qw(throw warning); use Bio::EnsEMBL::IO::Parser::VCF4; +use List::Util qw(max); +use List::MoreUtils qw(uniq); =head2 new @@ -428,7 +430,6 @@ sub _expand_tandem_repeat_allele_string { } - =head2 create_StructuralVariationFeatures Example : $vfs = $parser->create_StructuralVariationFeatures(); @@ -488,6 +489,17 @@ sub create_StructuralVariationFeatures { } ## parse tandem repeats based on VCF INFO fields elsif ($so_term =~ /tandem/ ) { + # validate SVLEN + if (defined $info->{SVLEN}) { + my @svlen = uniq(split(/,/, $info->{SVLEN})); + # warn if there are different references per alternative allele + $self->warning_msg( + "found tandem repeats with different references per alternative allele: " . + "SVLEN=" . $info->{SVLEN} . "; only using reference with largest size" + ) if scalar @svlen > 1; + $end = $start + max(@svlen) - 1; + } + # get reference allele and allele_string from tandem repeats my $allele_string; my $slice_ref = $self->get_slice($chr); From 69896352090c65a3c62935daa3bf7abbd83158c6 Mon Sep 17 00:00:00 2001 From: Nuno Agostinho Date: Wed, 13 Dec 2023 11:25:31 +0000 Subject: [PATCH 7/7] Improve unit tests --- t/Parser_VCF.t | 104 ++++++++++++++++++++++++++++++------------------- 1 file changed, 65 insertions(+), 39 deletions(-) diff --git a/t/Parser_VCF.t b/t/Parser_VCF.t index d669b8411..0db8be28c 100755 --- a/t/Parser_VCF.t +++ b/t/Parser_VCF.t @@ -883,9 +883,12 @@ is_deeply($bnd5_vf, bless( { sub parse_variant { my $var = shift; my $vf = Bio::EnsEMBL::VEP::Parser::VCF->new({ - config => Bio::EnsEMBL::VEP::Config->new({%$base_testing_cfg, gp => 1, warning_file => 'STDERR'}), + config => Bio::EnsEMBL::VEP::Config->new({ + %$base_testing_cfg, gp => 1, warning_file => 'STDERR', + fasta => $test_cfg->{fasta} + }), file => $test_cfg->create_input_file($var), - valid_chromosomes => [1,2] + valid_chromosomes => [21] })->next(); delete($vf->{adaptor}); @@ -894,79 +897,102 @@ sub parse_variant { return $vf; } -my $tandem = parse_variant([qw(2 68914092 tr0 T , . PASS .)]); +my $tandem = parse_variant([qw(21 25587759 tr0 T , . PASS END=25587769)]); is_deeply($tandem, bless( { - 'chr' => '2', + 'chr' => '21', 'strand' => '1', 'variation_name' => 'tr0', 'class_SO_term' => 'tandem_repeat', 'allele_string' => '/', - 'start' => 68914093, - 'inner_start' => 68914093, - 'outer_start' => 68914093, - 'end' => 68914093, - 'inner_end' => 68914093, - 'outer_end' => 68914093, - 'seq_region_start' => 68914093, - 'seq_region_end' => 68914093 + 'start' => 25587760, + 'inner_start' => 25587760, + 'outer_start' => 25587760, + 'end' => 25587769, + 'inner_end' => 25587769, + 'outer_end' => 25587769, + 'seq_region_start' => 25587760, + 'seq_region_end' => 25587769 }, 'Bio::EnsEMBL::Variation::StructuralVariationFeature' ) , 'StructuralVariationFeature - generic tandem repeat'); -my $tandem_RUC = parse_variant([qw(2 68914092 tr0 T , . PASS RUS=CAT,GT,CA;RUC=2,5,4;RN=2,1)]); +my $tandem_RUC = parse_variant([qw(21 25587759 tr0 T , . PASS END=25587769;RUS=CAT,GT,CA;RUC=2,5,4;RN=2,1)]); is_deeply($tandem_RUC, bless( { - 'chr' => '2', + 'chr' => '21', 'strand' => '1', 'variation_name' => 'tr0', - 'allele_string' => 'T/T' . 'CAT' x 2 . 'GT' x 5 . '/T' . 'CA' x 4, - 'start' => 68914093, - 'end' => 68914093, - 'seq_region_start' => 68914093, - 'seq_region_end' => 68914093, + 'allele_string' => 'AGTAAATAGA/' . 'CAT' x 2 . 'GT' x 5 . '/' . 'CA' x 4, + 'start' => 25587760, + 'end' => 25587769, + 'seq_region_start' => 25587760, + 'seq_region_end' => 25587769, 'map_weight' => 1 }, 'Bio::EnsEMBL::Variation::VariationFeature' ) , 'VariationFeature - tandem repeat using RUC'); -my $tandem_RB = parse_variant([qw(2 68914092 tr0 T , . PASS RUS=CAT,GT,CA;RB=6,10,8;RN=2,1)]); +my $tandem_RB = parse_variant([qw(21 25587759 tr0 T , . PASS END=25587769;RUS=CAT,GT,CA;RB=6,10,8;RN=2,1)]); is_deeply($tandem_RUC, $tandem_RB, 'VariationFeature - tandem repeat using RB'); -$tandem = parse_variant([qw(2 68914092 tr0 T , . PASS RUS=CAT,.,CA;RUC=2,5,4;RN=2,1)]); +$tandem = parse_variant([qw(21 25587759 tr0 T , . PASS END=25587769;SVLEN=10,10;RUS=CAT,.,CA;RUC=2,5,4;RN=2,1)]); is_deeply($tandem, bless( { - 'chr' => '2', + 'chr' => '21', 'strand' => '1', 'variation_name' => 'tr0', - 'allele_string' => 'T/T' . 'CAT' x 2 . 'N' x 5 . '/T' . 'CA' x 4, - 'start' => 68914093, - 'end' => 68914093, - 'seq_region_start' => 68914093, - 'seq_region_end' => 68914093, + 'allele_string' => 'AGTAAATAGA/' . 'CAT' x 2 . 'N' x 5 . '/' . 'CA' x 4, + 'start' => 25587760, + 'end' => 25587769, + 'seq_region_start' => 25587760, + 'seq_region_end' => 25587769, 'map_weight' => 1 }, 'Bio::EnsEMBL::Variation::VariationFeature' ) , 'VariationFeature - tandem repeat with missing sequence'); -my $tandem_RN_0 = parse_variant([qw(2 68914092 tr0 T ,, . PASS RUS=CAT,GT,CA;RB=6,10,8)]); -my $tandem_RN_1 = parse_variant([qw(2 68914092 tr0 T ,, . PASS RUS=CAT,GT,CA;RB=6,10,8;RN=1,1,1)]); +my $tandem_svlen_1 = parse_variant([qw(21 25587759 tr0 T , . PASS SVLEN=10,10;RUS=CAT,.,CA;RUC=2,5,4;RN=2,1)]); +is_deeply($tandem, $tandem_svlen_1, + 'VariationFeature - tandem repeat with missing END but with SVLEN'); + +my $tandem_svlen_2 = parse_variant([qw(21 25587759 tr0 T , . PASS SVLEN=5,10;RUS=CAT,.,CA;RUC=2,5,4;RN=2,1)]); +is_deeply($tandem_svlen_1, $tandem_svlen_2, + 'VariationFeature - tandem repeat with missing END and multiple non-unique SVLEN'); + +$tandem = parse_variant([qw(21 25587759 tr0 T , . PASS RUS=CAT,.,CA;RUC=2,5,4;RN=2,1)]); +is_deeply($tandem, bless( { + 'chr' => '21', + 'strand' => '1', + 'variation_name' => 'tr0', + 'allele_string' => 'A/' . 'CAT' x 2 . 'N' x 5 . '/' . 'CA' x 4, + 'start' => 25587760, + 'end' => 25587760, + 'seq_region_start' => 25587760, + 'seq_region_end' => 25587760, + 'map_weight' => 1 + }, + 'Bio::EnsEMBL::Variation::VariationFeature' ) , + 'VariationFeature - tandem repeat with missing END and SVLEN'); + +my $tandem_RN_0 = parse_variant([qw(21 25587759 tr0 T ,, . PASS END=25587769;RUS=CAT,GT,CA;RB=6,10,8)]); +my $tandem_RN_1 = parse_variant([qw(21 25587759 tr0 T ,, . PASS END=25587769;RUS=CAT,GT,CA;RB=6,10,8;RN=1,1,1)]); is_deeply($tandem_RN_0, $tandem_RN_1, 'VariationFeature - tandem repeat omitting RN'); -$tandem = parse_variant([qw(2 68914092 tr0 T , . PASS RUS=CAT,.,CA;RUC=5,10000001,4;RN=2,1)]); +$tandem = parse_variant([qw(21 25587759 tr0 T , . PASS END=25587769;RUS=CAT,.,CA;RUC=5,10000001,4;RN=2,1)]); is_deeply($tandem, bless( { - 'chr' => '2', + 'chr' => '21', 'strand' => '1', 'variation_name' => 'tr0', 'class_SO_term' => 'tandem_repeat', 'allele_string' => '/', - 'start' => 68914093, - 'end' => 68914093, - 'outer_start' => 68914093, - 'outer_end' => 68914093, - 'inner_start' => 68914093, - 'inner_end' => 68914093, - 'seq_region_start' => 68914093, - 'seq_region_end' => 68914093 + 'start' => 25587760, + 'end' => 25587769, + 'outer_start' => 25587760, + 'outer_end' => 25587769, + 'inner_start' => 25587760, + 'inner_end' => 25587769, + 'seq_region_start' => 25587760, + 'seq_region_end' => 25587769 }, 'Bio::EnsEMBL::Variation::StructuralVariationFeature' ) , 'StructuralVariationFeature - tandem repeat is too large');