diff --git a/.gitignore b/.gitignore index afb2da3..ea10eab 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,5 @@ /setup.log /perl/perltidy.LOG /perl/bin/cpanm +/perl/MYMETA.json +/perl/MYMETA.yml diff --git a/README.md b/README.md index 0ca5ba9..1d5f319 100644 --- a/README.md +++ b/README.md @@ -1,35 +1,3 @@ -LICENCE -======= -Copyright (c) 2014 Genome Research Ltd. - -Author: Cancer Genome Project cgpit@sanger.ac.uk - -This file is part of cgpBattenberg. - -cgpBattenberg is free software: you can redistribute it and/or modify it under -the terms of the GNU Affero General Public License as published by the Free -Software Foundation; either version 3 of the License, or (at your option) any -later version. - -This program is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more -details. - -You should have received a copy of the GNU Affero General Public License -along with this program. If not, see . - -1. The usage of a range of years within a copyright statement contained within -this distribution should be interpreted as being equivalent to a list of years -including the first and last year specified and all consecutive years between -them. For example, a copyright statement that reads 'Copyright (c) 2005, 2007- -2009, 2011-2012' should be interpreted as being identical to a statement that -reads 'Copyright (c) 2005, 2007, 2008, 2009, 2011, 2012' and a copyright -statement that reads "Copyright (c) 2005-2012' should be interpreted as being -identical to a statement that reads 'Copyright (c) 2005, 2006, 2007, 2008, -2009, 2010, 2011, 2012'." - - cgpBattenberg ============= @@ -37,10 +5,11 @@ An installation helper, perl wrapper and the R program Battenberg which detects ## Installation -Please install the following before attempting to run ``setup.sh `` +Please install the following before attempting to run ``setup.sh [X/lib/perl:Y/lib/perl]`` -1. [PCAP-core](https://github.com/ICGC-TCGA-PanCancer/PCAP-core/releases) -2. [alleleCount](https://github.com/cancerit/alleleCount/releases) +1. [PCAP-core v2.1.3+](https://github.com/ICGC-TCGA-PanCancer/PCAP-core/releases) +2. [alleleCount v3.0.1+](https://github.com/cancerit/alleleCount/releases) +3. [cgpVcf v2.0.1+](https://github.com/cancerit/cgpVcf/releases) All of the items listed here use the same install method. @@ -69,3 +38,35 @@ For the most up to date usage instructions for the wrapper code please see the c +---- + +LICENCE +======= +Copyright (c) 2014-2016 Genome Research Ltd. + +Author: Cancer Genome Project cgpit@sanger.ac.uk + +This file is part of cgpBattenberg. + +cgpBattenberg is free software: you can redistribute it and/or modify it under +the terms of the GNU Affero General Public License as published by the Free +Software Foundation; either version 3 of the License, or (at your option) any +later version. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more +details. + +You should have received a copy of the GNU Affero General Public License +along with this program. If not, see . + +1. The usage of a range of years within a copyright statement contained within +this distribution should be interpreted as being equivalent to a list of years +including the first and last year specified and all consecutive years between +them. For example, a copyright statement that reads 'Copyright (c) 2005, 2007- +2009, 2011-2012' should be interpreted as being identical to a statement that +reads 'Copyright (c) 2005, 2007, 2008, 2009, 2011, 2012' and a copyright +statement that reads "Copyright (c) 2005-2012' should be interpreted as being +identical to a statement that reads 'Copyright (c) 2005, 2006, 2007, 2008, +2009, 2010, 2011, 2012'." \ No newline at end of file diff --git a/perl/MANIFEST b/perl/MANIFEST index b3bd1c7..cae50ef 100644 --- a/perl/MANIFEST +++ b/perl/MANIFEST @@ -26,5 +26,6 @@ share/battenberg/RunBAFLogR.R share/battenberg/RunGetHaplotypedBAFs.R share/battenberg/RunImpute.R share/battenberg/segmentBAFphased.R +share/gender/GRCh37d5_Y.loci t/1_pm_compile.t t/2_pl_compile.t diff --git a/perl/MYMETA.json b/perl/MYMETA.json deleted file mode 100644 index d42bfaa..0000000 --- a/perl/MYMETA.json +++ /dev/null @@ -1,42 +0,0 @@ -{ - "abstract" : "unknown", - "author" : [ - "unknown" - ], - "dynamic_config" : 0, - "generated_by" : "ExtUtils::MakeMaker version 6.68, CPAN::Meta::Converter version 2.142690", - "license" : [ - "agpl_3" - ], - "meta-spec" : { - "url" : "http://search.cpan.org/perldoc?CPAN::Meta::Spec", - "version" : "2" - }, - "name" : "cgpBattenberg", - "no_index" : { - "directory" : [ - "t", - "inc" - ] - }, - "prereqs" : { - "build" : { - "requires" : { - "ExtUtils::MakeMaker" : "0" - } - }, - "configure" : { - "requires" : { - "ExtUtils::MakeMaker" : "0" - } - }, - "runtime" : { - "requires" : { - "Const::Fast" : "0.014", - "Try::Tiny" : "0.22" - } - } - }, - "release_status" : "stable", - "version" : "v1.4.0" -} diff --git a/perl/MYMETA.yml b/perl/MYMETA.yml deleted file mode 100644 index cfb41c9..0000000 --- a/perl/MYMETA.yml +++ /dev/null @@ -1,23 +0,0 @@ ---- -abstract: unknown -author: - - unknown -build_requires: - ExtUtils::MakeMaker: '0' -configure_requires: - ExtUtils::MakeMaker: '0' -dynamic_config: 0 -generated_by: 'ExtUtils::MakeMaker version 6.68, CPAN::Meta::Converter version 2.142690' -license: open_source -meta-spec: - url: http://module-build.sourceforge.net/META-spec-v1.4.html - version: '1.4' -name: cgpBattenberg -no_index: - directory: - - t - - inc -requires: - Const::Fast: '0.014' - Try::Tiny: '0.22' -version: v1.4.0 diff --git a/perl/Makefile.PL b/perl/Makefile.PL index 42276ac..f43d1ac 100755 --- a/perl/Makefile.PL +++ b/perl/Makefile.PL @@ -37,6 +37,7 @@ WriteMakefile( PREREQ_PM => { 'Const::Fast' => 0.014, 'Try::Tiny' => 0.22, + 'Bio::DB::HTS' => 2.0, } ); diff --git a/perl/bin/battenberg.pl b/perl/bin/battenberg.pl index 270e933..527f308 100755 --- a/perl/bin/battenberg.pl +++ b/perl/bin/battenberg.pl @@ -1,7 +1,7 @@ #!/usr/bin/perl ##########LICENCE########## -# Copyright (c) 2014,2015 Genome Research Ltd. +# Copyright (c) 2014-2016 Genome Research Ltd. # # Author: Cancer Genome Project cgpit@sanger.ac.uk # @@ -45,6 +45,7 @@ segmentphased fitcn subclones finalise ); const my @VALID_PROTOCOLS => qw( WGS WXS RNA ); +const my @VALID_GENDERS => qw(XX XY L); const my $DEFAULT_ALLELE_COUNT_MBQ => 20; const my $DEFAULT_PLATFORM_GAMMA=>1; @@ -135,7 +136,6 @@ sub setup { 'p|process=s' => \$opts{'process'}, 'u|thousand-genomes-loc=s' => \$opts{'1kgenloc'}, 'r|reference=s' => \$opts{'reference'}, - 's|is-male' => \$opts{'is_male'}, 'e|impute-info=s' => \$opts{'impute_info'}, 'c|prob-loci=s' => \$opts{'prob_loci'}, 'g|logs=s' => \$opts{'lgs'}, @@ -157,11 +157,13 @@ sub setup { 'pr|protocol=s' => \$opts{'protocol'}, 'pl|platform=s' => \$opts{'platform'}, 'a|allele-counts=s' => \$opts{'allele-counts'}, + 'ge|gender=s' => \$opts{'gender'}, + 'gl|genderloci=s' => \$opts{'genderloci'}, 'j|jobs' => \$opts{'jobs'}, ) or pod2usage(2); - pod2usage(-message => Sanger::CGP::Battenberg::license, -verbose => 0) if(defined $opts{'h'}); - pod2usage(-message => Sanger::CGP::Battenberg::license, -verbose => 2) if(defined $opts{'m'}); + pod2usage(-verbose => 0, -exitval => 0) if(defined $opts{'h'}); + pod2usage(-verbose => 2, -exitval => 0) if(defined $opts{'m'}); # then check for no args: my $defined; @@ -205,6 +207,16 @@ sub setup { delete $opts{'process'} unless(defined $opts{'process'}); delete $opts{'index'} unless(defined $opts{'index'}); + if(defined $opts{'gender'}){ + pod2usage(-message => 'unknown gender value: '.$opts{'gender'}, -verbose => 1) unless(first {$_ eq $opts{'gender'}} @VALID_GENDERS); + if($opts{'gender'} eq 'L') { + die "ERROR: Gender of XY/XX must be supplied when 'allele-counts' defined\n" if(defined $opts{'allele-counts'}); + $opts{'gender'} = Sanger::CGP::Battenberg::Implement::determine_gender(\%opts); + } + } else { + pod2usage(-message => 'gender not set', -verbose => 1); + } + if(exists $opts{'protocol'} && defined $opts{'protocol'}) { my $bad_prot = 1; $bad_prot = 0 if(first { $_ eq $opts{'protocol'} } @VALID_PROTOCOLS); @@ -269,7 +281,7 @@ sub setup { make_path($logs) unless(-d $logs); $opts{'logs'} = $logs; - if(exists($opts{'is_male'}) && defined($opts{'is_male'})){ + if($opts{'gender'} eq 'XY'){ $opts{'is_male'} = 'TRUE'; }else{ $opts{'is_male'} = 'FALSE'; @@ -291,6 +303,7 @@ sub setup { $opts{'protocol'} = $DEFAULT_PROTOCOL if(!exists($opts{'protocol'}) || !defined($opts{'protocol'})); $opts{'platform'} = $DEFAULT_PLATFORM if(!exists($opts{'platform'}) || !defined($opts{'platform'})); + return \%opts; } @@ -312,7 +325,7 @@ =head1 SYNOPSIS - when '-a' defined sample name -normbam -nb Path to normal bam file - when '-a' defined sample name - -is-male -s Flag, if the sample is male + -gender -ge Gender, XX, XY or L (see -gl) -impute-info -e Location of the impute info file -thousand-genomes-loc -u Location of the directory containing 1k genomes data -ignore-contigs-file -ig File containing contigs to ignore @@ -336,6 +349,8 @@ =head1 SYNOPSIS -assembly -ra Reference assembly [] -protocol -pr Sequencing protocol [WGS] -platform -pl Sequencing platfrom [ILLUMINA] + -genderloci -gl List of gender loci, required when '-ge L' [share/gender/GRCh37d5_Y.loci] + - these are loci that will not present at all in a female sample Optional system related parameters: -threads -t Number of threads allowed on this machine (default 1) diff --git a/perl/bin/battenberg_CN_to_VCF.pl b/perl/bin/battenberg_CN_to_VCF.pl index a375999..e612d19 100755 --- a/perl/bin/battenberg_CN_to_VCF.pl +++ b/perl/bin/battenberg_CN_to_VCF.pl @@ -1,7 +1,7 @@ #!/usr/bin/perl ##########LICENCE########## -# Copyright (c) 2014 Genome Research Ltd. +# Copyright (c) 2014-2016 Genome Research Ltd. # # Author: David Jones # @@ -34,7 +34,7 @@ BEGIN use Getopt::Long; use Pod::Usage qw(pod2usage); -use Bio::DB::Sam; +use Bio::DB::HTS; use Try::Tiny; use PCAP::Cli; @@ -69,6 +69,7 @@ BEGIN my $record_converter = new Sanger::CGP::Vcf::VCFCNConverter( -contigs => [values %$contigs] ); + $record_converter->extended_cn(1); my ($input_loc,$output_loc,$IN_FH,$OUT_FH); try{ @@ -105,12 +106,19 @@ BEGIN #Iterate through input and create a record for each. - my $fai = Bio::DB::Sam::Fai->load($reference); + my $fai = Bio::DB::HTS::Fai->load($reference); while(<$IN_FH>){ my $line = $_; chomp($line); next if($line =~ m/^\s+chr/); - my ($seg_no,$chr,$start,$end,$blank1,$blank2,$blank3,$blank4,$mt_cn_tot,$mt_cn_min,undef) = split('\s+',$line); + my ($seg_no, $chr, $start, $end, $mt_cn_tot, $mt_cn_min, $mt_frac, $mt_cn_tot_sec, $mt_cn_min_sec, $mt_frac_sec) = (split('\s+',$line))[0,1,2,3,8,9,10,11,12,13]; + + my $extended = { 'mt_fcf' => $mt_frac eq 'NA' ? '.' : $mt_frac, + 'mt_tcs' => $mt_cn_tot_sec eq 'NA' ? '.' : $mt_cn_tot_sec, + 'mt_mcs' => $mt_cn_min_sec eq 'NA' ? '.' : $mt_cn_min_sec, + 'mt_fcs' => $mt_frac_sec eq 'NA' ? '.' : $mt_frac_sec, + }; + my $wt_cn_tot = 2; my $wt_cn_min = 1; $start--; # all symbolic ALTs require preceeding base padding @@ -125,7 +133,7 @@ BEGIN && defined($mt_cn_min)); my $start_allele = $fai->fetch("$chr:$start-$start"); - print $OUT_FH $record_converter->generate_record($chr,$start,$end,$start_allele,$wt_cn_tot,$wt_cn_min,$mt_cn_tot,$mt_cn_min); + print $OUT_FH $record_converter->generate_record($chr,$start,$end,$start_allele,$wt_cn_tot,$wt_cn_min,$mt_cn_tot,$mt_cn_min, $extended); } }catch{ @@ -172,7 +180,7 @@ sub parse_samples { $param_mod = 'w'; } if(defined $opts->{'sb'.$param_mod}) { - $sam = Bio::DB::Sam->new(-bam => $opts->{'sb'.$param_mod}, -fasta => $reference); + $sam = Bio::DB::HTS->new(-bam => $opts->{'sb'.$param_mod}, -fasta => $reference); $samp_ref = Sanger::CGP::Vcf::BamUtil->parse_samples($sam->header->text, $opts->{$param_mod.'ss'}, $opts->{$param_mod.'sq'}, @@ -249,7 +257,6 @@ sub setup{ PCAP::Cli::file_for_reading('r', $opts{'r'}); # required: direct input - pod2usage(-message => "\nERROR: rs|reference-species must be defined.\n", -verbose => 1, -output => \*STDERR) unless($opts{'rs'}); pod2usage(-message => "\nERROR: msq|sample-sequencing-protocol-mut must be defined.\n", -verbose => 1, -output => \*STDERR) if(exists $opts{'msq'} && ! defined $opts{'msq'}); pod2usage(-message => "\nERROR: wsq|sample-sequencing-protocol-norm must be defined.\n", -verbose => 1, -output => \*STDERR) if(exists $opts{'wsq'} && ! defined $opts{'wsq'}); diff --git a/perl/bin/download_generate_bberg_ref_files.pl b/perl/bin/download_generate_bberg_ref_files.pl index ea8e1c4..7e811a9 100755 --- a/perl/bin/download_generate_bberg_ref_files.pl +++ b/perl/bin/download_generate_bberg_ref_files.pl @@ -21,11 +21,8 @@ # along with this program. If not, see . ##########LICENCE########## -BEGIN { - use Cwd qw(abs_path); - use File::Basename; - push (@INC,dirname(abs_path($0)).'/../lib'); -}; +use FindBin; +use lib "$FindBin::Bin/../lib"; use strict; use warnings FATAL => 'all'; @@ -283,7 +280,7 @@ sub download_unpack_files{ my ($opts) = @_; my $impute_download = $opts->{'u'}.sprintf($IMPUTE_TGZ_PATTERN,$DOWNLOAD_VERSION); my $imputetgz = File::Spec->catfile($opts->{'tmp'}, sprintf($IMPUTE_TGZ_PATTERN,$DOWNLOAD_VERSION)); - download_file($impute_download,$imputetgz) unless(-e $imputetgz.'.dl_success'); + download_file($impute_download,$imputetgz,$opts->{'c'}) unless(-e $imputetgz.'.dl_success'); unpack_file($imputetgz,$opts->{'tmp'},'tgz'); return; } @@ -296,16 +293,20 @@ sub unpack_file{ } sub download_file{ - my ($url,$file) = @_; - my $dirname = dirname($file); - $File::Fetch::BLACKLIST = [qw(lwp)]; - $url =~ s/^https/http/; - my $ff = File::Fetch->new(uri => $url); - my $local = $ff->fetch(to => $dirname) or croak $ff->error; - # if the filename isn't what we expect move it - unless($local eq $file) { - move($local, $file) or croak $!; - } + my ($url,$file,$use_curl) = @_; + if ($use_curl) { + my $output = `curl --location $url > $file`; + } else { + my $dirname = dirname($file); + $File::Fetch::BLACKLIST = [qw(lwp)]; + $url =~ s/^https/http/; + my $ff = File::Fetch->new(uri => $url); + my $local = $ff->fetch(to => $dirname) or croak $ff->error; + # if the filename isn't what we expect move it + unless($local eq $file) { + move($local, $file) or croak $!; + } + } # touch a file to show the archive is complete and not partial to prevent re-dl (3.7GB) if this bit is successful # likely use case would be running out of disk space during unpacking intermediate space required is ~16GB my $success_file = $file.'.dl_success'; @@ -323,6 +324,7 @@ sub setup{ 'h|help' => \$opts{'h'}, 'm|man' => \$opts{'m'}, 'v|version' => \$opts{'v'}, + 'c|use-curl' => \$opts{'c'}, 'o|out-dir=s' => \$opts{'o'}, 'd|download-version=s' => \$opts{'d'}, 'u|url=s' => \$opts{'u'}, @@ -338,6 +340,9 @@ sub setup{ pod2usage(-verbose => 2) if(defined $opts{'m'}); pod2usage(-msg => "\nERROR: Invalid inputs. Must provide o|out-dir.\n", -verbose => 1, -output => \*STDERR) if(!exists $opts{'o'} || !defined $opts{'o'}); + # make outdir absolute + $opts{'o'} = File::Spec->rel2abs( $opts{'o'} ); + #Ensure download and other directories exist, if not create it. my $tmpdir = File::Spec->catdir($opts{'o'}, 'tmp'); make_path($tmpdir) unless(-d $tmpdir); @@ -375,6 +380,7 @@ =head1 SYNOPSIS Optional parameters: -url -u URL to download battenberg impute reference data from [see docs for default url] + -use-curl -c Use curl for the download -download-version -d Download version (part of download name) default: [v3] Other: diff --git a/perl/docs.tar.gz b/perl/docs.tar.gz index 170653b..2f0fb0e 100644 Binary files a/perl/docs.tar.gz and b/perl/docs.tar.gz differ diff --git a/perl/lib/Sanger/CGP/Battenberg.pm b/perl/lib/Sanger/CGP/Battenberg.pm index 42ed63a..580031c 100644 --- a/perl/lib/Sanger/CGP/Battenberg.pm +++ b/perl/lib/Sanger/CGP/Battenberg.pm @@ -1,5 +1,5 @@ ##########LICENCE########## -# Copyright (c) 2014 Genome Research Ltd. +# Copyright (c) 2014-2016 Genome Research Ltd. # # Author: Cancer Genome Project cgpit@sanger.ac.uk # @@ -25,35 +25,9 @@ use strict; use Const::Fast qw(const); use base 'Exporter'; -our $VERSION = '1.4.0'; +our $VERSION = '1.5.0'; our @EXPORT = qw($VERSION); -const my $LICENSE => -'##########LICENCE########## -# Copyright (c) 2014,2015 Genome Research Ltd. -# -# Author: Cancer Genome Project cgpit@sanger.ac.uk -# -# This file is part of cgpBattenberg. -# -# cgpBattenberg is free software: you can redistribute it and/or modify it under -# the terms of the GNU Affero General Public License as published by the Free -# Software Foundation; either version 3 of the License, or (at your option) any -# later version. -# -# This program is distributed in the hope that it will be useful, but WITHOUT -# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more -# details. -# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . -##########LICENCE##########'; - -sub license { - return sprintf $LICENSE, $VERSION; -} - 1; __END__ @@ -62,14 +36,3 @@ __END__ Sanger::CGP::Battenberg - Base class to house version and generic functions. -=head2 Methods - -=over 4 - -=item license - - my $brief_license = Sanger::CGP::Battenberg::licence; - -Output the brief license text for use in help messages. - -=back diff --git a/perl/lib/Sanger/CGP/Battenberg/Implement.pm b/perl/lib/Sanger/CGP/Battenberg/Implement.pm index da5dd26..c643063 100644 --- a/perl/lib/Sanger/CGP/Battenberg/Implement.pm +++ b/perl/lib/Sanger/CGP/Battenberg/Implement.pm @@ -114,6 +114,8 @@ const my $IMPUTE_EXE => q{impute2}; const my $IMPUTE_INPUT => q{%s_impute_input_chr%d_*K.*}; const my $IMPUTE_OUTPUT => q{%s_impute_output_chr%d.txt}; +const my $ALLELE_COUNT_PARA => ' -b %s -o %s -l %s '; + const my @BATTENBERG_RESULT_FILES => qw( %s_Tumor.png %s_Germline.png @@ -141,8 +143,8 @@ sub prepare { else { $options->{'tumbam'} = File::Spec->rel2abs($options->{'tumbam'}); $options->{'normbam'} = File::Spec->rel2abs($options->{'normbam'}); - $options->{'tumour_name'} = (PCAP::Bam::sample_name($options->{'tumbam'}))[0]; - $options->{'normal_name'} = (PCAP::Bam::sample_name($options->{'normbam'}))[0]; + $options->{'tumour_name'} = (PCAP::Bam::sample_name($options->{'tumbam'}, 1))[0]; + $options->{'normal_name'} = (PCAP::Bam::sample_name($options->{'normbam'}, 1))[0]; } $options->{'mod_path'} = get_mod_path(); $options->{'bat_path'} = File::Spec->catdir($options->{'mod_path'}, 'battenberg'); @@ -631,7 +633,9 @@ sub battenberg_finalise{ for(my $i=1; $i<=$options->{'job_count'}; $i++){ my $file = File::Spec->catfile($tmp,sprintf($IMPUTE_INPUT,$options->{'tumour_name'},$i)); next if($options->{'is_male'} && $file =~ m/chr(X|23)/ && ! -e $file); #Skip if male and X results aren't present, this is allowed... - unlink glob $file; + foreach my $file_to_del (glob $file){ + unlink($file_to_del) or print "Could not unlink $file_to_del"; + } } PCAP::Threaded::touch_success(File::Spec->catdir($tmp, 'progress'), @{['cleanup_impute',0]}); } @@ -673,11 +677,15 @@ sub battenberg_finalise{ #Tumour png my $tumour = File::Spec->catfile($tmp,sprintf($TUMOUR_PNG,$tumour_name,$tumour_dot)); my $tumour_copy = File::Spec->catfile($outdir,sprintf($TUMOUR_CORRECTED_PNG,$tumour_name)); - _copy_file($tumour,$tumour_copy); + if (-e $tumour) { + _copy_file($tumour,$tumour_copy); + } #Normal png my $normal = File::Spec->catfile($tmp,sprintf($NORMAL_PNG,$tumour_name,$tumour_dot)); my $normal_copy = File::Spec->catfile($outdir,sprintf($NORMAL_CORRECTED_PNG,$tumour_name)); - _copy_file($normal,$normal_copy); + if (-e $normal) { + _copy_file($normal,$normal_copy); + } tmp_to_outdir($tmp, $outdir, $tumour_name, $SUNRISE_PNG, $COPY_NO_PNG, $NON_ROUNDED_PNG, $ALT_COPY_NO_ROUNDED_PNG, @@ -695,6 +703,47 @@ sub battenberg_finalise{ return PCAP::Threaded::touch_success(File::Spec->catdir($tmp, 'progress'),0); } +sub determine_gender { + my $options = shift; + my $gender_loci; + if(defined $options->{'genderloci'}) { + $gender_loci = $options->{'genderloci'}; + } + else { + my $mod_path = dirname(abs_path($0)).'/../share'; + $mod_path = module_dir('Sanger::CGP::Battenberg::Implement') unless(-e File::Spec->catdir($mod_path, 'gender')); + + my $gender_path = File::Spec->catdir($mod_path, 'gender'); + $gender_loci = File::Spec->catfile($gender_path,'GRCh37d5_Y.loci'); + } + + my $command = _which('alleleCounter'); + $command .= sprintf $ALLELE_COUNT_PARA, $options->{'normbam'}, File::Spec->catfile($options->{'tmp'}, 'normal_gender.tsv'), $gender_loci; + $command .= '-m '.$options->{'mbq'} if exists $options->{'mbq'}; + system($command); + my $norm_gender = _parse_gender_results(File::Spec->catfile($options->{'tmp'}, 'normal_gender.tsv')); + return $norm_gender; +} + +sub _parse_gender_results { + my $file = shift @_; + my $gender = 'XX'; + open my $fh, '<', $file; + while(my $line = <$fh>) { + next if($line =~ m/^#/); + chomp $line; + #CHR POS Count_A Count_C Count_G Count_T Good_depth + my ($chr, $pos, $a, $c, $g, $t, $depth) = split /\t/, $line; + # all we really care about is the depth + if($depth > 5) { + $gender = 'XY'; + last; # presence of ANY male loci in normal is sufficient, we shouldn't be using this to check for 'matchedness' + } + } + close $fh; + return $gender; +} + sub tmp_to_outdir { my ($tmp, $outdir, $tumour_name, @formats) = @_; for my $format(@formats) { @@ -829,8 +878,10 @@ sub _zip_and_tar_fileset{ $file = File::Spec->catfile($location,sprintf($filepattern,$sample_name,$i+1)); next if($options->{'is_male'} && $file =~ m/chr(X|23)/ && ! -e $file); #Skip if male and X results aren't present, this is allowed... } - PCAP::Cli::file_for_reading('file for tar.gz',$file); - push(@files,$file); + if (-e $file) { + PCAP::Cli::file_for_reading('file for tar.gz',$file); + push(@files,$file); + } } my $tarball = _targzFileSet($options,\@files,$tar,$dir); @@ -839,7 +890,7 @@ sub _zip_and_tar_fileset{ die("Error: Failed to copy file $tarball to $copied_loc.") if(!copy($tarball,$copied_loc)); push(@files,$tarball); foreach my $file_to_del (@files){ - unlink($file_to_del); + unlink($file_to_del) or print "Could not unlink $file_to_del"; } return; } @@ -927,6 +978,7 @@ sub read_contigs_from_file_with_ignore{ my $line = $_; chomp($line); my ($con,undef) = split(/\s+/,$line); + $con =~ s/chr//; # handle hg19, removing chr prefix my $match=0; foreach my $ign(@$ignore_contigs){ if("$ign" eq "$con"){ diff --git a/perl/share/battenberg/FitCopyNumber.R b/perl/share/battenberg/FitCopyNumber.R index 909f542..1d4545e 100755 --- a/perl/share/battenberg/FitCopyNumber.R +++ b/perl/share/battenberg/FitCopyNumber.R @@ -60,14 +60,6 @@ raw.logR.data = read.table(paste(start.file,"mutantLogR.tab",sep=""),sep="\t",he raw.BAF.data = raw.BAF.data[!is.na(raw.BAF.data[,3]),] raw.logR.data = raw.logR.data[!is.na(raw.logR.data[,3]),] -#chromosome names are sometimes 'chr1', etc. -if(length(grep("chr",raw.BAF.data[1,1]))>0){ - raw.BAF.data[,1] = gsub("chr","",raw.BAF.data[,1]) -} -if(length(grep("chr",raw.logR.data[1,1]))>0){ - raw.logR.data[,1] = gsub("chr","",raw.logR.data[,1]) -} - BAF.data = NULL logR.data = NULL segmented.logR.data = NULL @@ -80,6 +72,11 @@ for(chr in chr.names){ print(chr.segmented.BAF.data[1:3,]) indices = match(chr.segmented.BAF.data[,2],chr.BAF.data$Position ) + # Check if there are matches, if not skip the remainer of the loop + if (sum(is.na(indices))==length(indices) | length(indices)==0) { + next + } + #130313 chr.segmented.BAF.data = chr.segmented.BAF.data[!is.na(indices),] matched.segmented.BAF.data = rbind(matched.segmented.BAF.data, chr.segmented.BAF.data) @@ -89,10 +86,11 @@ for(chr in chr.names){ indices = match(chr.segmented.BAF.data[,2],chr.logR.data$Position) logR.data = rbind(logR.data, chr.logR.data[indices[!is.na(indices)],]) chr.segmented.logR.data = chr.logR.data[indices[!is.na(indices)],] + if(length(chr.segmented.BAF.data[,5])==0){next} segs = make_seg_lr(chr.segmented.BAF.data[,5]) cum.segs = c(0,cumsum(segs)) for(s in 1:length(segs)){ - chr.segmented.logR.data[(cum.segs[s]+1):cum.segs[s+1],3] = mean(chr.segmented.logR.data[(cum.segs[s]+1):cum.segs[s+1],3]) + chr.segmented.logR.data[(cum.segs[s]+1):cum.segs[s+1],3] = mean(chr.segmented.logR.data[(cum.segs[s]+1):cum.segs[s+1],3], na.rm=T) } segmented.logR.data = rbind(segmented.logR.data,chr.segmented.logR.data) #print(paste(nrow(matched.segmented.BAF.data),nrow(segmented.logR.data),nrow(logR.data),sep=",")) @@ -138,8 +136,4 @@ is.ref.better = out$is.ref.better rho_psi_output = data.frame(rho = c(ascat_optimum_pair$rho,ascat_optimum_pair_fraction_of_genome$rho,ascat_optimum_pair_ref_seg$rho),psi = c(ascat_optimum_pair$psi,ascat_optimum_pair_fraction_of_genome$psi,ascat_optimum_pair_ref_seg$psi), ploidy = c(ascat_optimum_pair$ploidy,ascat_optimum_pair_fraction_of_genome$ploidy,ascat_optimum_pair_ref_seg$ploidy), distance = c(NA,out$distance_without_ref,out$distance), is.best = c(NA,!is.ref.better,is.ref.better),row.names=c("ASCAT","FRAC_GENOME","REF_SEG")) write.table(rho_psi_output,paste(start.file,"rho_and_psi.txt",sep=""),quote=F,sep="\t") -# Create user friendly cellularity and ploidy output file -cellularity_ploidy_output = data.frame(cellularity = c(ascat_optimum_pair_fraction_of_genome$rho), ploidy = c(ascat_optimum_pair_fraction_of_genome$ploidy), psi = c(ascat_optimum_pair_fraction_of_genome$psi)) -write.table(cellularity_ploidy_output, paste(start.file,"cellularity_ploidy.txt",sep=""), quote=F, sep="\t", row.names=F) - q(save="no") diff --git a/perl/share/battenberg/PlotHaplotypedData.R b/perl/share/battenberg/PlotHaplotypedData.R index bb0c13b..af761b2 100755 --- a/perl/share/battenberg/PlotHaplotypedData.R +++ b/perl/share/battenberg/PlotHaplotypedData.R @@ -30,7 +30,8 @@ impute.info = read.table(imputeInfoFile,header=F,row.names=NULL,sep="\t",strings chr_names=unique(impute.info[,1]) chr_name = chr_names[chr] -imageFileName = paste(samplename,"_chr",chr_name,"_heterozygousData.png",sep="") +imageFileName = paste(samplename,"_",ifelse(grepl("chr", chr_name), "", "chr"), + chr_name,"_heterozygousData.png",sep="") mut_data<-read.table(mutFile,sep="\t",header=T) diff --git a/perl/share/battenberg/RunBAFLogR.R b/perl/share/battenberg/RunBAFLogR.R index 62cefc0..4373ea6 100755 --- a/perl/share/battenberg/RunBAFLogR.R +++ b/perl/share/battenberg/RunBAFLogR.R @@ -36,7 +36,9 @@ getBAFsAndLogRs(imputeInfoFile,inputFile,normalInputFile,paste(outputDir,"normal # This is a workaround such that BB doesnt have to be rerun across a lot of samples. The problem is that when ASCAT reads in a file it changes two things: If the samplename starts with a number it places an X in front of the name and underscores are replaced by dots. ascat.loadData was extended to take the real samplenames and place those in the header of the read in data files. However that changes the other underscores and dots thing as well and that changes filenames. We therefore place the dots in the samplenames such that ASCAT will produce the file with dots in it. A complete fix will be added at a later date. samplename = gsub("\\-", "\\.", samplename) -ascat.bc = ascat.loadData(paste(outputDir,"mutantLogR.tab",sep=""),paste(outputDir,"mutantBAF.tab",sep=""),paste(outputDir,"normalLogR.tab",sep=""),paste(outputDir,"normalBAF.tab",sep=""), samplenames=c(samplename)) +impute.info = read.table(imputeInfoFile,header=F,row.names=NULL,sep="\t",stringsAsFactors=F) +chr_names=unique(impute.info[,1]) +ascat.bc = ascat.loadData(paste(outputDir,"mutantLogR.tab",sep=""),paste(outputDir,"mutantBAF.tab",sep=""),paste(outputDir,"normalLogR.tab",sep=""),paste(outputDir,"normalBAF.tab",sep=""), samplenames=c(samplename), chrs=chr_names) ascat.plotRawData(ascat.bc, parentDir=outputDir) q(save="no") diff --git a/perl/share/battenberg/RunImpute.R b/perl/share/battenberg/RunImpute.R index ae82e5f..48667ea 100755 --- a/perl/share/battenberg/RunImpute.R +++ b/perl/share/battenberg/RunImpute.R @@ -41,8 +41,9 @@ for(r in 1:nrow(inpute.info)){ boundaries = c(boundaries,inpute.info[r,6]) } boundaries=boundaries + # Startpoint is boundaries[b]+1 to make sure windows do not overlap and therefore SNPs cannot occur more than once for(b in 1:(length(boundaries)-1)){ - cmd = paste(impute.executable," -m ",inpute.info[r,3]," -h ",inpute.info[r,4]," -l ",inpute.info[r,2]," -g ",inFileStart,chr,".txt -int ",boundaries[b]," ",boundaries[b+1]," -Ne 20000 -o ",outFileStart,chr,"_",boundaries[b]/1000,"K_",boundaries[b+1]/1000,"K.txt -phase",sep="") + cmd = paste(impute.executable," -m ",inpute.info[r,3]," -h ",inpute.info[r,4]," -l ",inpute.info[r,2]," -g ",inFileStart,chr,".txt -int ",boundaries[b]+1," ",boundaries[b+1]," -Ne 20000 -o ",outFileStart,chr,"_",boundaries[b]/1000,"K_",boundaries[b+1]/1000,"K.txt -phase",sep="") system(cmd, wait=T) } } diff --git a/perl/share/battenberg/ascat.R b/perl/share/battenberg/ascat.R index 718dc60..3ba1ea4 100755 --- a/perl/share/battenberg/ascat.R +++ b/perl/share/battenberg/ascat.R @@ -1715,6 +1715,8 @@ run_clonal_ASCAT = function(lrr, baf, lrrsegmented, bafsegmented, chromosomes, s library(RColorBrewer) s = get_segment_info(lrrsegmented,segBAF.table) + # Make sure no segment of length 1 remains - TODO: this should not occur and needs to be prevented upstream + s = s[s[,3] > 1,] dist_matrix_info <- create_distance_matrix_clonal( s, dist_choice, gamma_param, read_depth, siglevel_BAF, maxdist_BAF, siglevel_LogR, maxdist_LogR, uninformative_BAF_threshold, new_bounds)# kjd 10-2-2013 d = dist_matrix_info$distance_matrix # kjd 10-2-2013 diff --git a/perl/share/battenberg/callSubclones.R b/perl/share/battenberg/callSubclones.R index fe57237..ca54c6c 100755 --- a/perl/share/battenberg/callSubclones.R +++ b/perl/share/battenberg/callSubclones.R @@ -164,7 +164,7 @@ for (i in 1:length(BAFlevels)) { whichclosestlevel.test = which.min(abs(test.levels-l)) #270713 - problem caused by segments with constant BAF (usually 1 or 2) - if(sd(BAFke)==0){ + if(is.na(sd(BAFke)) || sd(BAFke)==0){ pval[i]=0 }else{ #pval[i] = t.test(BAFke,alternative="two.sided",mu=levels[whichclosestlevel])$p.value @@ -248,6 +248,21 @@ for (i in 1:length(BAFlevels)) { write.table(subcloneres,paste(start.file,"_subclones.txt",sep=""),quote=F,col.names=NA,row.names=T,sep="\t") +# Recalculate the ploidy based on the actual fit +subcloneres = as.data.frame(subcloneres, stringsAsFactors=F) +subcloneres[,2:ncol(subcloneres)] = sapply(2:ncol(subcloneres), function(x) { as.numeric(as.character(subcloneres[,x])) }) +seg_length = floor((subcloneres$endpos-subcloneres$startpos)/1000) +is_subclonal_maj = abs(subcloneres$nMaj1_A - subcloneres$nMaj2_A) > 0 +is_subclonal_min = abs(subcloneres$nMin1_A - subcloneres$nMin2_A) > 0 +is_subclonal_maj[is.na(is_subclonal_maj)] = F +is_subclonal_min[is.na(is_subclonal_min)] = F +segment_states_min = subcloneres$nMin1_A * ifelse(is_subclonal_min, subcloneres$frac1_A, 1) + ifelse(is_subclonal_min, subcloneres$nMin2_A, 0) * ifelse(is_subclonal_min, subcloneres$frac2_A, 0) +segment_states_maj = subcloneres$nMaj1_A * ifelse(is_subclonal_maj, subcloneres$frac1_A, 1) + ifelse(is_subclonal_maj, subcloneres$nMaj2_A, 0) * ifelse(is_subclonal_maj, subcloneres$frac2_A, 0) +ploidy = sum((segment_states_min+segment_states_maj) * seg_length) / sum(seg_length) + +# Create user friendly cellularity and ploidy output file +cellularity_ploidy_output = data.frame(cellularity = c(rho), ploidy = c(ploidy), psi = c(psit)) +write.table(cellularity_ploidy_output, paste(start.file, "_cellularity_ploidy.txt", sep=""), quote=F, sep="\t", row.names=F) #DCW 211012 sample.name = strsplit(start.file,"/") @@ -264,7 +279,8 @@ for (chr in chr_names) { LogRchr = LogRvals[LogRvals[,1]==chr,3] LogRposke = LogRvals[LogRvals[,1]==chr,2] - png(filename = paste(start.file,"_subclones_chr",chr,".png",sep=""), width = 2000, height = 2000, res = 200) + png(filename = paste(start.file,"_subclones_",ifelse(grepl("chr", chr), "", "chr"), + chr,".png",sep=""), width = 2000, height = 2000, res = 200) par(mar = c(2.5,2.5,2.5,0.25), cex = 0.4, cex.main=1.5, cex.axis = 1, cex.lab = 1, mfrow = c(2,1)) plot(c(min(pos)/1000000,max(pos)/1000000),c(-1,1),pch=".",type = "n", main = paste(sample.name,", chromosome ", chr, sep=""), xlab = "Position (Mb)", ylab = "LogR") diff --git a/perl/share/battenberg/segmentBAFphased.R b/perl/share/battenberg/segmentBAFphased.R index 063aee4..9293a2e 100755 --- a/perl/share/battenberg/segmentBAFphased.R +++ b/perl/share/battenberg/segmentBAFphased.R @@ -77,7 +77,8 @@ for (chr in unique(BAFraw[,1])) { BAFsegm = res$yhat } - png(filename = paste(sample,"_RAFseg_chr",chr,".png",sep=""), width = 2000, height = 1000, res = 200) + png(filename = paste(sample,"_RAFseg_",ifelse(grepl("chr", chr), "", "chr"), + chr,".png",sep=""), width = 2000, height = 1000, res = 200) par(mar = c(5,5,5,0.5), cex = 0.4, cex.main=3, cex.axis = 2, cex.lab = 2) plot(c(min(pos)/1000000,max(pos)/1000000),c(0,1),pch=".",type = "n", main = paste(sample,", chromosome ", chr, sep=""), xlab = "Position (Mb)", ylab = "BAF (phased)") @@ -94,7 +95,8 @@ for (chr in unique(BAFraw[,1])) { BAFphseg = res$yhat } - png(filename = paste(sample,"_chr",chr,".png",sep=""), width = 2000, height = 1000, res = 200) + png(filename = paste(sample,"_",ifelse(grepl("chr", chr), "", "chr"), + chr,".png",sep=""), width = 2000, height = 1000, res = 200) par(mar = c(5,5,5,0.5), cex = 0.4, cex.main=3, cex.axis = 2, cex.lab = 2) plot(c(min(pos)/1000000,max(pos)/1000000),c(0,1),pch=".",type = "n", main = paste(sample,", chromosome ", chr, sep=""), xlab = "Position (Mb)", ylab = "BAF (phased)") @@ -104,7 +106,7 @@ for (chr in unique(BAFraw[,1])) { dev.off() BAFphased = ifelse(BAFsegm>0.5,BAF,1-BAF) - BAFoutputchr = cbind(rep(chr,length(BAFphseg)),pos,BAF,BAFphased,BAFphseg) + BAFoutputchr = data.frame(Chromosome=rep(chr, length(BAFphseg)), Position=pos, BAF=BAF, BAFphased=BAFphased, BAFseg=BAFphseg) BAFoutput=rbind(BAFoutput,BAFoutputchr) } colnames(BAFoutput) = c("Chromosome","Position","BAF","BAFphased","BAFseg") diff --git a/perl/share/gender/GRCh37d5_Y.loci b/perl/share/gender/GRCh37d5_Y.loci new file mode 100644 index 0000000..a389a21 --- /dev/null +++ b/perl/share/gender/GRCh37d5_Y.loci @@ -0,0 +1,4 @@ +Y 4546684 +Y 2934912 +Y 4550107 +Y 4549638 diff --git a/setup.sh b/setup.sh index a5bb04b..afe7a5f 100755 --- a/setup.sh +++ b/setup.sh @@ -34,13 +34,20 @@ done_message () { fi } -if [ "$#" -ne "1" ] ; then - echo "Please provide an installation path such as /opt/ICGC" +if [[ ($# -ne 1 && $# -ne 2) ]] ; then + echo "Please provide an installation path such as /opt/pancan and optionally perl lib paths to allow, e.g." + echo " ./setup.sh /opt/myBundle" + echo "OR all elements versioned:" + echo " ./setup.sh /opt/cgpBattenberg-X.X.X /opt/PCAP-X.X.X/lib/perl:/opt/alleleCount-X.X.X/lib/perl:/opt/cgpVcf-X.X.X/lib/perl" exit 0 fi INST_PATH=$1 +if [[ $# -eq 2 ]] ; then + CGP_PERLLIBS=$2 +fi + # get current directory INIT_DIR=`pwd` @@ -51,11 +58,12 @@ INST_PATH=`pwd` cd $INIT_DIR # make sure that build is self contained -unset PERL5LIB -ARCHNAME=`perl -e 'use Config; print $Config{archname};'` PERLROOT=$INST_PATH/lib/perl5 -PERLARCH=$PERLROOT/$ARCHNAME -export PERL5LIB="$PERLROOT:$PERLARCH" +if [ -z ${CGP_PERLLIBS+x} ]; then + export PERL5LIB="$PERLROOT" +else + export PERL5LIB="$PERLROOT:$CGP_PERLLIBS" +fi #create a location to build dependencies SETUP_DIR=$INIT_DIR/install_tmp @@ -79,9 +87,17 @@ echo > $INIT_DIR/setup.log PCAP=`perl -le 'eval "require $ARGV[0]" and print $ARGV[0]->VERSION' PCAP` if [[ "x$PCAP" == "x" ]] ; then - echo "PREREQUISITE: Please install PCAP-core before proceeding:" + echo "PREREQUISITE: Please install PCAP-core (v1.12+) before proceeding:" echo " https://github.com/ICGC-TCGA-PanCancer/PCAP-core/releases" exit 1; +else + # need the leading 'v' on versions so comparison of those that don't have hotfix element behave correctly, i.e. (X.X, rather than X.X.X) + GOOD_VER=`perl -Mversion -e "version->parse(q{v$PCAP}) >= version->parse(q{v}.q{1.12}) ? print qq{1\n} : print qq{0\n};"` + if [[ $GOOD_VER -ne '1' ]]; then + echo "PREREQUISITE: Please install PCAP-core (v1.12+) before proceeding:" + echo " https://github.com/ICGC-TCGA-PanCancer/PCAP-core/releases" + exit 1; + fi fi AC=`perl -le 'eval "require $ARGV[0]" and print $ARGV[0]->VERSION' Sanger::CGP::AlleleCount` @@ -93,9 +109,17 @@ fi VCF=`perl -le 'eval "require $ARGV[0]" and print $ARGV[0]->VERSION' Sanger::CGP::Vcf` if [[ "x$VCF" == "x" ]] ; then - echo "PREREQUISITE: Please install cgpVcf before proceeding:" + echo "PREREQUISITE: Please install cgpVcf (v1.3.1+) before proceeding:" echo " https://github.com/cancerit/cgpVcf/releases" exit 1; +else + # need the leading 'v' on versions so comparison of those that don't have hotfix element behave correctly, i.e. (X.X, rather than X.X.X) + GOOD_VER=`perl -Mversion -e "version->parse(q{v$VCF}) >= version->parse(q{v}.q{1.3.1}) ? print qq{1\n} : print qq{0\n};"` + if [[ $GOOD_VER -ne '1' ]]; then + echo "PREREQUISITE: Please install cgpVcf (v1.3.1+) before proceeding:" + echo " https://github.com/cancerit/cgpVcf/releases" + exit 1; + fi fi ## grab cpanm: @@ -150,7 +174,6 @@ echo "Please add the following to beginning of path:" echo " $INST_PATH/bin" echo "Please add the following to beginning of PERL5LIB:" echo " $PERLROOT" -echo " $PERLARCH" echo exit 0