diff --git a/README.md b/README.md index d60c7c5..a478964 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# backmap.pl v0.3 +# backmap.pl v0.4 ## Description __Automatic read mapping and genome size estimation from coverage.__ @@ -31,14 +31,15 @@ Optional: ``` backmap.pl [-a {-p , | -u } | - -pb | -ont } | -b ] + -pb | -hifi | -ont } | -b ] Mandatory: -a STR Assembly were reads should mapped to in fasta format AND AT LEAST ONE OF -p STR Two files with paired Illumina reads comma sperated -u STR Fastq file with unpaired Illumina reads - -pb STR Fasta or fastq file with PacBio reads + -pb STR Fasta or fastq file with PacBio CLR reads + -hifi STR Fasta or fastq file with PacBio HiFi reads -ont STR Fasta or fastq file with Nanopore reads OR -b STR Bam file to calculate coverage from @@ -63,7 +64,8 @@ Options: [default] -ne Do not estimate genome size [off] -kt Keep temporary bam files [off] -bo STR Options passed to bwa [-a -c 10000] - -mo STR Options passed to minimap [PacBio: -H -x map-pb; ONT: -x map-ont] + -mo STR Options passed to minimap [CLR: -H -x map-pb; HiFi: minimap<=2.18 + -x asm20 minimap>2.18 -x map-hifi; ONT: -x map-ont] -qo STR Options passed to qualimap [none] Pass options with quotes e.g. -bo "" -v Print executed commands to STDERR [off] @@ -91,4 +93,4 @@ Ewels P, Magnusson M, Lundin S, Käller M (2016). MultiQC: summarize analysis re - bedtools: Quinlan AR, Hall IM (2010). BEDTools: a flexible suite of utilities for comparing genomic features. _Bioinformatics_, 26(6):841–842, - Rscript: -R Core Team (2019). R: A Language and Environment for Statistical Computing. +R Core Team (2021). R: A Language and Environment for Statistical Computing. diff --git a/backmap.pl b/backmap.pl index 72ccf1b..bb237f6 100755 --- a/backmap.pl +++ b/backmap.pl @@ -7,7 +7,7 @@ use Number::FormatEng qw(:all); use Parallel::Loops; -my $version = "0.3"; +my $version = "0.4"; sub print_help{ print STDOUT "\n"; @@ -18,14 +18,15 @@ sub print_help{ print STDOUT "\n"; print STDOUT "Usage:\n"; print STDOUT "\tbackmap.pl [-a {-p , | -u |\n"; - print STDOUT "\t -pb | -ont } | -b ]\n"; + print STDOUT "\t -pb | -hifi | -ont } | -b ]\n"; print STDOUT "\n"; print STDOUT "Mandatory:\n"; print STDOUT "\t-a STR\t\tAssembly were reads should mapped to in fasta format\n"; print STDOUT "\tAND AT LEAST ONE OF\n"; print STDOUT "\t-p STR\t\tTwo fastq files with paired Illumina reads comma sperated\n"; print STDOUT "\t-u STR\t\tFastq file with unpaired Illumina reads\n"; - print STDOUT "\t-pb STR\t\tFasta or fastq file with PacBio reads\n"; + print STDOUT "\t-pb STR\t\tFasta or fastq file with PacBio CLR reads\n"; + print STDOUT "\t-hifi STR\tFasta or fastq file with PacBio HiFi reads\n"; print STDOUT "\t-ont STR\tFasta or fastq file with Nanopore reads\n"; print STDOUT "\tOR\n"; print STDOUT "\t-b STR\t\tBam file to calculate coverage from\n"; @@ -47,7 +48,7 @@ sub print_help{ print STDOUT "\t-ne\t\tDo not estimate genome size [off]\n"; print STDOUT "\t-kt\t\tKeep temporary bam files [off]\n"; print STDOUT "\t-bo STR\t\tOptions passed to bwa [-a -c 10000]\n"; - print STDOUT "\t-mo STR\t\tOptions passed to minimap [PacBio: -H -x map-pb; ONT: -x map-ont]\n"; + print STDOUT "\t-mo STR\t\tOptions passed to minimap [CLR: -H -x map-pb; HiFi: minimap<=2.18\n\t\t\t-x asm20 minimap>2.18 -x map-hifi; ONT: -x map-ont]\n"; print STDOUT "\t-qo STR\t\tOptions passed to qualimap [none]\n"; print STDOUT "\tPass options with quotes e.g. -bo \"\"\n"; print STDOUT "\t-v\t\tPrint executed commands to STDERR [off]\n"; @@ -84,6 +85,7 @@ sub round_format_pref{ my @paired = (); my @unpaired = (); my @pb = (); +my @hifi = (); my @ont = (); my $threads = 1; my $prefix = ""; @@ -130,6 +132,9 @@ sub round_format_pref{ if ($ARGV[$i] eq "-pb"){ push(@pb,$ARGV[$i+1]); } + if ($ARGV[$i] eq "-hifi"){ + push(@hifi,$ARGV[$i+1]); + } if ($ARGV[$i] eq "-ont"){ push(@ont,$ARGV[$i+1]); } @@ -250,7 +255,7 @@ sub round_format_pref{ print STDERR "ERROR\tFile $assembly_path does not exist!\n"; $input_error = 1; } - if(scalar(@paired) == 0 and scalar(@unpaired) == 0 and scalar(@pb) == 0 and scalar(@ont) == 0){ + if(scalar(@paired) == 0 and scalar(@unpaired) == 0 and scalar(@pb) == 0 and scalar(@hifi) == 0 and scalar(@ont) == 0){ print STDERR "ERROR\tNo reads specified!\n"; $input_error = 1; } @@ -363,6 +368,24 @@ sub round_format_pref{ } } +my %hifi_filter; + +if($assembly_path ne ""){ + foreach(@hifi){ + if(not -f "$_"){ + print STDERR "INFO\tNo file $_ - skipping this file\n"; + } + else{ + if(exists($hifi_filter{abs_path($_)})){ + print STDERR "INFO\tFile " . abs_path($_) . " already specified\n"; + } + else{ + $hifi_filter{abs_path($_)} = 1; + } + } + } +} + my %ont_filter; if($assembly_path ne ""){ @@ -400,7 +423,7 @@ sub round_format_pref{ } if($assembly_path ne ""){ - if(scalar(keys(%paired_filter)) == 0 and scalar(keys(%unpaired_filter)) == 0 and scalar(keys(%pb_filter)) == 0 and scalar(keys(%ont_filter)) == 0){ + if(scalar(keys(%paired_filter)) == 0 and scalar(keys(%unpaired_filter)) == 0 and scalar(keys(%pb_filter)) == 0 and scalar(keys(%hifi_filter)) == 0 and scalar(keys(%ont_filter)) == 0){ print STDERR "ERROR\tNo existing read files specified!\n"; exit 1; } @@ -436,12 +459,16 @@ sub round_format_pref{ } my $minimap_version; +my $minimap_minor_version; if(not defined(can_run("minimap2"))){ $minimap_version = "not detected"; } else{ $minimap_version = `minimap2 --version`; chomp $minimap_version; + $minimap_minor_version = $minimap_version; + $minimap_minor_version =~ s/-.*//; + $minimap_minor_version =~ s/^.*\.//; } my $samtools_version = `samtools --version | head -1 | sed 's/^samtools //'`; @@ -592,6 +619,20 @@ sub round_format_pref{ push(@pb_bam,"$out_dir/$prefix.pb$pb_counter.bam"); } +my $hifi_counter = 0; +my @hifi_bam = (); +foreach(keys(%hifi_filter)){ + $hifi_counter++; + if($minimap_minor_version <= 18){ + $cmd = "minimap2 $minimap_opts-x asm20 -a -t $threads $assembly_path $_ 2> $out_dir/$prefix\_minimap_hifi$hifi_counter.err | samtools view -1 -b - > $out_dir/$prefix.hifi$hifi_counter.bam"; + } + else{ + $cmd = "minimap2 $minimap_opts-x map-hifi -a -t $threads $assembly_path $_ 2> $out_dir/$prefix\_minimap_hifi$hifi_counter.err | samtools view -1 -b - > $out_dir/$prefix.hifi$hifi_counter.bam"; + } + exe_cmd($cmd,$verbose,$dry); + push(@hifi_bam,"$out_dir/$prefix.hifi$hifi_counter.bam"); +} + my $ont_counter = 0; my @ont_bam = (); foreach(keys(%ont_filter)){ @@ -608,6 +649,7 @@ sub round_format_pref{ my $paired_bam_files = join(" ",@paired_bam); my $unpaired_bam_files = join(" ",@unpaired_bam); my $pb_bam_files = join(" ",@pb_bam); + my $hifi_bam_files = join(" ",@hifi_bam); my $ont_bam_files = join(" ",@ont_bam); if($ill_bam_count > 0){ @@ -639,6 +681,20 @@ sub round_format_pref{ push(@merged_bam_file, "$out_dir/$prefix.pb.bam"); } } + + if(scalar(@hifi_bam) > 0){ + if(scalar(@hifi_bam) == 1){ + my $single_bam = $hifi_bam[0]; + $cmd = "ln -fs $single_bam $out_dir/$prefix.hifi.bam"; + exe_cmd($cmd,$verbose,$dry); + push(@merged_bam_file, "$out_dir/$prefix.hifi.bam"); + } + else{ + $cmd = "samtools merge -@ $samtools_threads $out_dir/$prefix.hifi.bam $hifi_bam_files"; + exe_cmd($cmd,$verbose,$dry); + push(@merged_bam_file, "$out_dir/$prefix.hifi.bam"); + } + } if(scalar(@ont_bam) > 0){ if(scalar(@ont_bam) == 1){ @@ -676,7 +732,7 @@ sub round_format_pref{ } if($keep_tmp == 0){ - my $tmp_bams = join(" ",@paired_bam,@unpaired_bam,@pb_bam,@ont_bam,@merged_bam_file); + my $tmp_bams = join(" ",@paired_bam,@unpaired_bam,@pb_bam,@hifi_bam,@ont_bam,@merged_bam_file); $cmd = "rm $tmp_bams"; exe_cmd($cmd,$verbose,$dry); } @@ -720,7 +776,10 @@ sub round_format_pref{ my $tech = "Illumina"; if($_ =~ m/\.pb\.sort\.bam$/){ - $tech = "PacBio"; + $tech = "CLR"; + } + if($_ =~ m/\.hifi\.sort\.bam$/){ + $tech = "HiFi"; } if($_ =~ m/\.ont\.sort\.bam$/){ $tech = "Nanopore"; @@ -780,7 +839,7 @@ sub round_format_pref{ $rscript = "$out_dir/$prefix.plot.all.r"; } if($dry == 0){ - my @techs = ("Illumina","PacBio","Nanopore"); + my @techs = ("Illumina","CLR","HiFi","Nanopore"); open(RALL,'>',"$rscript") or die "ERROR\tCould not open file $rscript\n"; @@ -811,6 +870,10 @@ sub round_format_pref{ push(@col,"\"blue\""); } if($i == 2 and exists($cov_files{$techs[$i]})){ + print RALL "lines($techs[$i]\[,1],$techs[$i]\[,2],type=\"l\",col=\"darkgreen\")\n"; + push(@col,"\"darkgreen\""); + } + if($i == 3 and exists($cov_files{$techs[$i]})){ print RALL "lines($techs[$i]\[,1],$techs[$i]\[,2],type=\"l\",col=\"red\")\n"; push(@col,"\"red\""); } @@ -852,7 +915,10 @@ sub round_format_pref{ my $tech = "Illumina"; if($_ =~ m/\.pb\.sort\.bam$/){ - $tech = "PacBio"; + $tech = "CLR"; + } + if($_ =~ m/\.hifi\.sort\.bam$/){ + $tech = "HiFi"; } if($_ =~ m/\.ont\.sort\.bam$/){ $tech = "Nanopore"; @@ -891,7 +957,7 @@ sub round_format_pref{ print "Output\n"; print "======\n"; - my @techs = ("Illumina","PacBio","Nanopore"); + my @techs = ("Illumina","CLR","HiFi","Nanopore"); for (my $i = 0; $i < scalar(@techs); $i++){ if(exists($results{$techs[$i]})){ print $results{$techs[$i]};