diff --git a/README.md b/README.md
index a478964..b9684bc 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,9 @@
-# backmap.pl v0.4
+# backmap.pl v0.5
## Description
__Automatic read mapping and genome size estimation from coverage.__
-Automatic mapping of paired, unpaired, PacBio and Nanopore reads to an assembly with `bwa mem` or `minimap2`, execution of `qualimap bamqc` and estimation of genome size from mapped nucleotides divided by mode of the coverage distribution (>0). This method was first pulished in Schell et al. (2017) and is slightly refined in this script.
+Automatic mapping of paired, unpaired, PacBio and Nanopore reads to an assembly with `bwa mem` or `minimap2`, execution of `qualimap bamqc` and estimation of genome size from mapped nucleotides divided by mode of the coverage distribution (>0). This method was first pulished in Schell et al. (2017). To show high accuracy and reliability of this method throughout the tree of life, Pfenninger et al. (2021) published a study comparing different estimators. Currently, the estimator Nbm/m (number of back-mapped bases divided by the modal value of the sequencing depth distribution) is implemented in this script only.
The tools `samtools`, `bwa` and/or `minimap2` need to be in your `$PATH`. The tools `qualimap`, `multiqc`, `bedtools` and `Rscript` are optional but needed to create the mapping quality report, coverage histogram as well as genome size estimation and to plot of the coverage distribution respectively.
## Dependencies
@@ -46,8 +46,9 @@ Mandatory:
Skips read mapping
Overrides -nh
Technologies will recognized correctly if filenames end with
- .pb(.sort).bam or .ont(.sort).bam for PacBio and Nanopore respectively.
- Otherwise they are assumed to be from Illumina.
+ .pb(.sort).bam, .hifi(.sort).bam or .ont(.sort).bam for PacBio CLR,
+ PacBio HiFi and Nanopore respectively. Otherwise they are assumed to
+ be from Illumina.
All mandatory options except of -a can be specified multiple times
@@ -55,7 +56,7 @@ Options: [default]
-o STR Output directory [.]
Will be created if not existing
-t INT Number of parallel executed processes [1]
- Affects bwa mem, samtools sort, qualimap bamqc
+ Affects bwa mem, samtools sort/index/view/stats, qualimap bamqc
-pre STR Prefix of output files if -a is used [filename of -a]
-sort Sort the bam file(s) (-b) [off]
-nq Do not run qualimap bamqc [off]
@@ -76,6 +77,8 @@ Options: [default]
```
## Citation
+Pfenninger M, Schönenbeck P & Schell T (2021). ModEst: Accurate estimation of genome size from next generation sequencing data. _Molecular ecology resources_, 00, 1–11.
+
Schell T, Feldmeyer B, Schmidt H, Greshake B, Tills O et al. (2017). An Annotated Draft Genome for _Radix auricularia_ (Gastropoda, Mollusca). _Genome Biology and Evolution_, 9(3):585–592,
__If you use this tool please cite the dependencies as well:__
diff --git a/backmap.pl b/backmap.pl
index bb237f6..8bdd050 100755
--- a/backmap.pl
+++ b/backmap.pl
@@ -7,7 +7,7 @@
use Number::FormatEng qw(:all);
use Parallel::Loops;
-my $version = "0.4";
+my $version = "0.5";
sub print_help{
print STDOUT "\n";
@@ -32,14 +32,15 @@ sub print_help{
print STDOUT "\t-b STR\t\tBam file to calculate coverage from\n";
print STDOUT "\t\t\tSkips read mapping\n";
print STDOUT "\t\t\tOverrides -nh\n";
- print STDOUT "\t\t\tTechnologies will recognized correctly if filenames end with\n\t\t\t.pb(.sort).bam or .ont(.sort).bam for PacBio and Nanopore respectively.\n\t\t\tOtherwise they are assumed to be from Illumina.\n";
+ print STDOUT "\t\t\tTechnologies will recognized correctly if filenames end with\n\t\t\t.pb(.sort).bam, .hifi(.sort).bam or .ont(.sort).bam for PacBio CLR,\n\t\t\tPacBio HiFi and Nanopore respectively. Otherwise they are assumed to\n\t\t\tbe from Illumina.\n";
+ print STDOUT "\n";
print STDOUT "\tAll mandatory options except of -a can be specified multiple times\n";
print STDOUT "\n";
print STDOUT "Options: [default]\n";
print STDOUT "\t-o STR\t\tOutput directory [.]\n";
print STDOUT "\t\t\tWill be created if not existing\n";
print STDOUT "\t-t INT\t\tNumber of parallel executed processes [1]\n";
- print STDOUT "\t\t\tAffects bwa mem, samtools sort, qualimap bamqc\n";
+ print STDOUT "\t\t\tAffects bwa mem, samtools sort/index/view/stats, qualimap bamqc\n";
print STDOUT "\t-pre STR\tPrefix of output files if -a is used [filename of -a]\n";
print STDOUT "\t-sort\t\tSort the bam file(s) (-b) [off]\n";
print STDOUT "\t-nq\t\tDo not run qualimap bamqc [off]\n";
@@ -210,7 +211,7 @@ sub round_format_pref{
$input_error = 1;
}
}
- if(scalar(@pb) > 0 or scalar(@ont) > 0){
+ if(scalar(@pb) > 0 or scalar(@hifi) > 0 or scalar(@ont) > 0){
if(not defined(can_run("minimap2"))){
print STDERR "ERROR\tminimap2 is not in your \$PATH\n";
$input_error = 1;
@@ -738,6 +739,11 @@ sub round_format_pref{
}
}
+foreach(@sorted_bams){
+ $cmd = "samtools index -@ $samtools_threads $_";
+ exe_cmd($cmd,$verbose,$dry);
+}
+
if($run_bamqc_switch == 1){
if(defined(can_run("qualimap"))){
foreach(@sorted_bams){
@@ -766,6 +772,13 @@ sub round_format_pref{
if($create_histo_switch == 1){
+ foreach(@sorted_bams){
+ my $filename = (split(/\//,$_))[-1];
+ my $cov_hist_file = "$out_dir/$filename.cov-hist";
+ $cmd = "samtools view -@ $samtools_threads -b -h -F 256 $_ | bedtools genomecov -ibam stdin -d | awk \'{print \$3}\' | sort -g | uniq -c | awk '{print \$2\"\\t\"\$1}' > $cov_hist_file";
+ exe_cmd($cmd,$verbose,$dry);
+ }
+
my $multiple_histos = Parallel::Loops->new($maxProcs);
$multiple_histos->share(\%cov_files);
$multiple_histos->share(\%peak_cov);
@@ -787,8 +800,6 @@ sub round_format_pref{
my $filename = (split(/\//,$_))[-1];
my $cov_hist_file = "$out_dir/$filename.cov-hist";
- $cmd = "samtools view -b -h -F 256 $_ | bedtools genomecov -ibam stdin -d | awk \'{print \$3}\' | sort -g | uniq -c | awk '{print \$2\"\\t\"\$1}' > $cov_hist_file";
- exe_cmd($cmd,$verbose,$dry);
$cov_files{$tech} = $cov_hist_file;
@@ -899,7 +910,12 @@ sub round_format_pref{
if($estimate_genome_size_switch == 1){
my %results;
-
+
+ foreach(@sorted_bams){
+ $cmd = "samtools stats -@ $samtools_threads $_ > $_.stats 2> $_.stats.err";
+ exe_cmd($cmd,$verbose,$dry);
+ }
+
my $multiple_genome_size = Parallel::Loops->new($maxProcs);
$multiple_genome_size->share(\%results);
@@ -908,8 +924,8 @@ sub round_format_pref{
print STDERR "CMD\tsort -rgk2 $_.cov-hist | awk \'\$1!=0{print \$1}\' | head -1\n";
}
- $cmd = "samtools stats $_ > $_.stats 2> $_.stats.err";
- exe_cmd($cmd,$verbose,$dry);
+# $cmd = "samtools stats $_ > $_.stats 2> $_.stats.err";
+# exe_cmd($cmd,$verbose,$dry);
if($dry == 0){