Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Parse CNV:TR alleles from VCF #1561

Merged
merged 7 commits into from
Dec 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 8 additions & 5 deletions modules/Bio/EnsEMBL/VEP/Parser.pm
Original file line number Diff line number Diff line change
Expand Up @@ -674,17 +674,19 @@ sub get_SO_term {
$subtype = $element if grep /^$element$/i, @mobile_elements;
}
$abbrev .= '_' . $subtype;
} elsif ($type =~ /DUP:TANDEM|CNV:TR/i) {
# including <CNV:TR>,<CNV:TR>
} elsif ($type =~ /DUP:TANDEM/i) {
$abbrev = "TDUP";
} elsif ($type =~ /CNV/i) {
# including <CNV>,<CNV>
$abbrev = "CNV";
} elsif ($type =~ /CNV:TR/i) {
# including <CNV:TR>,<CNV:TR>
$abbrev = "TREP";
} elsif ($type =~ /CN=?[0-9]/i) {
# ALT: "<CN0>", "<CN0>,<CN2>,<CN3>" "<CN2>" => SVTYPE: DEL, CNV, DUP
$abbrev = "CNV";
$abbrev = "DEL" if $type =~ /^<?CN=?0>?$/;
$abbrev = "DUP" if $type =~ /^<?CN=?2>?$/;
} elsif ($type =~ /CNV/i) {
# including <CNV>,<CNV>
$abbrev = "CNV";
} elsif ($type =~ /[\[\]]|^\.|\.$/) {
$abbrev = "BND";
} elsif ($type =~ /^\<|\>$/) {
Expand All @@ -710,6 +712,7 @@ sub get_SO_term {
DEL_LINE1 => 'LINE1_deletion',
DEL_SVA => 'SVA_deletion',

TREP => 'tandem_repeat',
TDUP => 'tandem_duplication',
DUP => 'duplication',
CNV => 'copy_number_variation',
Expand Down
99 changes: 99 additions & 0 deletions modules/Bio/EnsEMBL/VEP/Parser/VCF.pm
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,8 @@ use Bio::EnsEMBL::Utils::Scalar qw(assert_ref);
use Bio::EnsEMBL::Utils::Exception qw(throw warning);
use Bio::EnsEMBL::IO::Parser::VCF4;

use List::Util qw(max);
use List::MoreUtils qw(uniq);

=head2 new

Expand Down Expand Up @@ -376,6 +378,58 @@ sub create_VariationFeatures {
}


sub _expand_tandem_repeat_allele_string {
my ($self, $type, $info, $ref) = @_;

if ($info->{RUS} && ($info->{RUC} || $info->{RB}) ) {
# warn that CIRUC and CIRB INFO fields are ignored
$self->warning_msg("CIRUC and CIRB INFO fields are ignored when calculating alternative alleles in tandem repeats")
if $info->{CIRUC} or $info->{CIRB};

# RN : Total number of repeat sequences in this allele
# RUS : Repeat unit sequence of the corresponding repeat sequence
# RUC : Repeat unit count of corresponding repeat sequence
# RB : Total number of bases in the corresponding repeat sequence
my @RN = $info->{RN} ?
split(/,/, $info->{RN}) :
# if RN is missing, assume 1 repeat for each CNV:TR allele
(1) x ( () = $type =~ /CNV:TR/g );
my @RUS = split /,/, $info->{RUS};
my @RUC = split /,/, ($info->{RUC} || ''); # undefined if no RUC
my @RB = split /,/, ($info->{RB} || ''); # undefined if no RB
my $is_RUC_defined = scalar @RUC;

my $max_sv_size = $self->param('max_sv_size') || 5000;
my $is_oversized = 0;

# build sequence of tandem repeat based on INFO fields
my @repeats;
for my $records (@RN) {
my $repeat;
next if $records eq '.'; # ignore missing values
for ( 1 .. $records ) {
my $seq = shift(@RUS);
$seq = 'N' if $seq eq '.'; # missing sequence
my $num = $is_RUC_defined ? shift(@RUC) : shift(@RB) / length($seq);

# avoid calculating really large repeats
$is_oversized = length($seq) * $num > $max_sv_size;
last if $is_oversized;
$repeat .= $seq x $num;
}

last if $is_oversized;
# prepend reference allele
push @repeats, $repeat;
}

# avoid storing alternative allele for tandem repeats with large repeats
return $ref.'/'.join('/', @repeats) unless $is_oversized;
}
return undef;
}


=head2 create_StructuralVariationFeatures

Example : $vfs = $parser->create_StructuralVariationFeatures();
Expand Down Expand Up @@ -433,6 +487,51 @@ sub create_StructuralVariationFeatures {
}
$alt = $ref . "/$alt" unless $alt =~ /^\.|\.$/;
}
## parse tandem repeats based on VCF INFO fields
elsif ($so_term =~ /tandem/ ) {
# validate SVLEN
if (defined $info->{SVLEN}) {
my @svlen = uniq(split(/,/, $info->{SVLEN}));
# warn if there are different references per alternative allele
$self->warning_msg(
"found tandem repeats with different references per alternative allele: " .
"SVLEN=" . $info->{SVLEN} . "; only using reference with largest size"
) if scalar @svlen > 1;
$end = $start + max(@svlen) - 1;
nakib103 marked this conversation as resolved.
Show resolved Hide resolved
}

# get reference allele and allele_string from tandem repeats
my $allele_string;
my $slice_ref = $self->get_slice($chr);
if ($slice_ref) {
$slice_ref = $slice_ref->sub_Slice($start, $end, 1);
$ref = $slice_ref->seq if defined $slice_ref and defined $slice_ref->seq;
$allele_string = $self->_expand_tandem_repeat_allele_string($type, $info, $ref) if defined $ref;
} elsif (!defined($self->param('fasta')) && $self->param('offline')) {
$self->warning_msg(
"could not fetch sequence for tandem repeats; " .
"consequence calculation for tandem repeats is less precise when using --offline without --fasta\n");
} else {
$self->warning_msg("could not fetch sequence for tandem repeat");
}

if (defined $allele_string) {
# convert tandem repeats to VariationFeature in order to properly
# calculate consequences based on alternative allele sequence
my $vf = Bio::EnsEMBL::Variation::VariationFeature->new_fast({
start => $start,
end => $end,
allele_string => $allele_string,
strand => 1,
map_weight => 1,
adaptor => $self->get_adaptor('variation', 'VariationFeature'),
variation_name => @$ids ? $ids->[0] : undef,
chr => $chr,
_line => $record,
});
return $self->post_process_vfs([$vf]);
}
}

# check for imprecise breakpoints
my ($min_start, $max_start, $min_end, $max_end) = (
Expand Down
119 changes: 119 additions & 0 deletions t/Parser_VCF.t
Original file line number Diff line number Diff line change
Expand Up @@ -878,6 +878,125 @@ is_deeply($bnd5_vf, bless( {
'Bio::EnsEMBL::Variation::StructuralVariationFeature' ) ,
'StructuralVariationFeature - BND with unsupported INFO/END field');

## test tandem repeats

sub parse_variant {
my $var = shift;
my $vf = Bio::EnsEMBL::VEP::Parser::VCF->new({
config => Bio::EnsEMBL::VEP::Config->new({
%$base_testing_cfg, gp => 1, warning_file => 'STDERR',
fasta => $test_cfg->{fasta}
}),
file => $test_cfg->create_input_file($var),
valid_chromosomes => [21]
})->next();

delete($vf->{adaptor});
delete($vf->{_line});

return $vf;
}

my $tandem = parse_variant([qw(21 25587759 tr0 T <CNV:TR>,<CNV:TR> . PASS END=25587769)]);
is_deeply($tandem, bless( {
'chr' => '21',
'strand' => '1',
'variation_name' => 'tr0',
'class_SO_term' => 'tandem_repeat',
'allele_string' => '<CNV:TR>/<CNV:TR>',
'start' => 25587760,
'inner_start' => 25587760,
'outer_start' => 25587760,
'end' => 25587769,
'inner_end' => 25587769,
'outer_end' => 25587769,
'seq_region_start' => 25587760,
'seq_region_end' => 25587769
},
'Bio::EnsEMBL::Variation::StructuralVariationFeature' ) ,
'StructuralVariationFeature - generic tandem repeat');

my $tandem_RUC = parse_variant([qw(21 25587759 tr0 T <CNV:TR>,<CNV:TR> . PASS END=25587769;RUS=CAT,GT,CA;RUC=2,5,4;RN=2,1)]);
is_deeply($tandem_RUC, bless( {
'chr' => '21',
'strand' => '1',
'variation_name' => 'tr0',
'allele_string' => 'AGTAAATAGA/' . 'CAT' x 2 . 'GT' x 5 . '/' . 'CA' x 4,
'start' => 25587760,
'end' => 25587769,
'seq_region_start' => 25587760,
'seq_region_end' => 25587769,
'map_weight' => 1
},
'Bio::EnsEMBL::Variation::VariationFeature' ) ,
'VariationFeature - tandem repeat using RUC');

my $tandem_RB = parse_variant([qw(21 25587759 tr0 T <CNV:TR>,<CNV:TR> . PASS END=25587769;RUS=CAT,GT,CA;RB=6,10,8;RN=2,1)]);
is_deeply($tandem_RUC, $tandem_RB, 'VariationFeature - tandem repeat using RB');

$tandem = parse_variant([qw(21 25587759 tr0 T <CNV:TR>,<CNV:TR> . PASS END=25587769;SVLEN=10,10;RUS=CAT,.,CA;RUC=2,5,4;RN=2,1)]);
is_deeply($tandem, bless( {
'chr' => '21',
'strand' => '1',
'variation_name' => 'tr0',
'allele_string' => 'AGTAAATAGA/' . 'CAT' x 2 . 'N' x 5 . '/' . 'CA' x 4,
'start' => 25587760,
'end' => 25587769,
'seq_region_start' => 25587760,
'seq_region_end' => 25587769,
'map_weight' => 1
},
'Bio::EnsEMBL::Variation::VariationFeature' ) ,
'VariationFeature - tandem repeat with missing sequence');

my $tandem_svlen_1 = parse_variant([qw(21 25587759 tr0 T <CNV:TR>,<CNV:TR> . PASS SVLEN=10,10;RUS=CAT,.,CA;RUC=2,5,4;RN=2,1)]);
is_deeply($tandem, $tandem_svlen_1,
'VariationFeature - tandem repeat with missing END but with SVLEN');

my $tandem_svlen_2 = parse_variant([qw(21 25587759 tr0 T <CNV:TR>,<CNV:TR> . PASS SVLEN=5,10;RUS=CAT,.,CA;RUC=2,5,4;RN=2,1)]);
is_deeply($tandem_svlen_1, $tandem_svlen_2,
'VariationFeature - tandem repeat with missing END and multiple non-unique SVLEN');

$tandem = parse_variant([qw(21 25587759 tr0 T <CNV:TR>,<CNV:TR> . PASS RUS=CAT,.,CA;RUC=2,5,4;RN=2,1)]);
is_deeply($tandem, bless( {
'chr' => '21',
'strand' => '1',
'variation_name' => 'tr0',
'allele_string' => 'A/' . 'CAT' x 2 . 'N' x 5 . '/' . 'CA' x 4,
'start' => 25587760,
'end' => 25587760,
'seq_region_start' => 25587760,
'seq_region_end' => 25587760,
'map_weight' => 1
},
'Bio::EnsEMBL::Variation::VariationFeature' ) ,
'VariationFeature - tandem repeat with missing END and SVLEN');

my $tandem_RN_0 = parse_variant([qw(21 25587759 tr0 T <CNV:TR>,<CNV:TR>,<CNV:TR> . PASS END=25587769;RUS=CAT,GT,CA;RB=6,10,8)]);
my $tandem_RN_1 = parse_variant([qw(21 25587759 tr0 T <CNV:TR>,<CNV:TR>,<CNV:TR> . PASS END=25587769;RUS=CAT,GT,CA;RB=6,10,8;RN=1,1,1)]);

is_deeply($tandem_RN_0, $tandem_RN_1,
'VariationFeature - tandem repeat omitting RN');

$tandem = parse_variant([qw(21 25587759 tr0 T <CNV:TR>,<CNV:TR> . PASS END=25587769;RUS=CAT,.,CA;RUC=5,10000001,4;RN=2,1)]);
is_deeply($tandem, bless( {
'chr' => '21',
'strand' => '1',
'variation_name' => 'tr0',
'class_SO_term' => 'tandem_repeat',
'allele_string' => '<CNV:TR>/<CNV:TR>',
'start' => 25587760,
'end' => 25587769,
'outer_start' => 25587760,
'outer_end' => 25587769,
'inner_start' => 25587760,
'inner_end' => 25587769,
'seq_region_start' => 25587760,
'seq_region_end' => 25587769
},
'Bio::EnsEMBL::Variation::StructuralVariationFeature' ) ,
'StructuralVariationFeature - tandem repeat is too large');

## OTHER TESTS
##############

Expand Down