diff --git a/docker/Dockerfile b/docker/Dockerfile
index ce98bc7d0..52ae5e374 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -6,7 +6,7 @@ FROM ubuntu:16.04
LABEL base.image="biocontainers:latest"
LABEL version="1"
LABEL software="GRIDSS"
-LABEL software.version="2.0.1"
+LABEL software.version="2.2.0"
LABEL about.summary="Genomic Rearrangement IDentification Software Suite"
LABEL about.home="https://github.com/PapenfussLab/gridss"
LABEL about.tags="Genomics"
@@ -32,6 +32,6 @@ RUN chmod 777 /data
USER gridss
RUN mkdir /data/gridss
WORKDIR /data/gridss
-RUN wget https://github.com/PapenfussLab/gridss/releases/download/v2.0.1/gridss-2.0.1-gridss-jar-with-dependencies.jar
+RUN wget https://github.com/PapenfussLab/gridss/releases/download/v2.2.0/gridss-2.2.0-gridss-jar-with-dependencies.jar
-ENTRYPOINT ["java", "-ea", "-Xmx16g", "-Dsamjdk.create_index=true", "-Dsamjdk.use_async_io_read_samtools=true", "-Dsamjdk.use_async_io_write_samtools=true", "-Dsamjdk.use_async_io_write_tribble=true", "-Dgridss.gridss.output_to_temp_file=true", "-cp", "/data/gridss/gridss-2.0.1-gridss-jar-with-dependencies.jar", "gridss.CallVariants", "WORKER_THREADS=4"]
+ENTRYPOINT ["java", "-ea", "-Xmx16g", "-Dsamjdk.create_index=true", "-Dsamjdk.use_async_io_read_samtools=true", "-Dsamjdk.use_async_io_write_samtools=true", "-Dsamjdk.use_async_io_write_tribble=true", "-Dgridss.gridss.output_to_temp_file=true", "-cp", "/data/gridss/gridss-2.2.0-gridss-jar-with-dependencies.jar", "gridss.CallVariants", "WORKER_THREADS=4"]
diff --git a/example/example_regions_of_interest.bed b/example/example_regions_of_interest.bed
new file mode 100644
index 000000000..b6bb0623a
--- /dev/null
+++ b/example/example_regions_of_interest.bed
@@ -0,0 +1 @@
+chr12 1527326 1528326
\ No newline at end of file
diff --git a/example/gridss.sh b/example/gridss.sh
index 584f17a20..8ae6e69ff 100644
--- a/example/gridss.sh
+++ b/example/gridss.sh
@@ -7,7 +7,7 @@ BLACKLIST=wgEncodeDacMapabilityConsensusExcludable.bed
REFERENCE=hg19.fa
OUTPUT=${INPUT/.bam/.sv.vcf}
ASSEMBLY=${OUTPUT/.sv.vcf/.gridss.assembly.bam}
-GRIDSS_JAR=../target/gridss-2.1.1-gridss-jar-with-dependencies.jar
+GRIDSS_JAR=../target/gridss-2.2.0-gridss-jar-with-dependencies.jar
if [[ ! -f "$INPUT" ]] ; then
echo "Missing $INPUT input file."
diff --git a/example/gridss_fully_integrated_fastq_to_vcf_pipeline.sh b/example/gridss_fully_integrated_fastq_to_vcf_pipeline.sh
index 38c8c96d8..1a8754bf7 100644
--- a/example/gridss_fully_integrated_fastq_to_vcf_pipeline.sh
+++ b/example/gridss_fully_integrated_fastq_to_vcf_pipeline.sh
@@ -41,7 +41,7 @@ REFERENCE=~/reference_genomes/human/hg19.fa
INPUT=pipelined.example.input.bam
OUTPUT=pipelined.example.sv.vcf
RAW_GRIDSS_ASSEMBLY=${OUTPUT/.sv.vcf/.gridss.assembly.bam}
-GRIDSS_JAR=~/bin/gridss-2.1.1-jar-with-dependencies.jar
+GRIDSS_JAR=~/bin/gridss-2.2.0-jar-with-dependencies.jar
GRIDSS_JVM_ARGS="
-Dsamjdk.use_async_io_read_samtools=true
-Dsamjdk.use_async_io_write_samtools=true
diff --git a/example/gridss_separate.sh b/example/gridss_separate.sh
index 062e503cb..9eb640de1 100644
--- a/example/gridss_separate.sh
+++ b/example/gridss_separate.sh
@@ -7,7 +7,7 @@ BLACKLIST=wgEncodeDacMapabilityConsensusExcludable.bed
REFERENCE=hg19.fa
OUTPUT=${INPUT/.bam/.sv.vcf}
ASSEMBLY=${OUTPUT/.sv.vcf/.gridss.assembly.bam}
-GRIDSS_JAR=../target/gridss-2.1.1-gridss-jar-with-dependencies.jar
+GRIDSS_JAR=../target/gridss-2.2.0-gridss-jar-with-dependencies.jar
WORKING_DIR=.
diff --git a/example/gridss_targeted.sh b/example/gridss_targeted.sh
index acfd0c19c..08c1bda44 100644
--- a/example/gridss_targeted.sh
+++ b/example/gridss_targeted.sh
@@ -2,25 +2,33 @@
#
# Runs GRIDSS on a set of targeted regions
#
-# Note that to properly call compound breakpoints, this script should be run
+#
+# WARNING: reference read counts will only be correct in targeted regions.
+# If a breakpoint goes from a targeted region to a region not in the BED of
+# interest then the untargeted breakend will be reported as a homozygous variant
+# even when it is not (since the reference-supporting reads were not included).
+# This can be fixed by rerunning gridss.AnnotateReferenceReads against the
+# original input file directly, or as explained below.
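+# A minimal sketch of such a rerun (the gridss.AnnotateReferenceReads argument
+# names below are assumptions; check the tool's usage output before running):
+#   java -cp $GRIDSS_JAR gridss.AnnotateReferenceReads \
+#     REFERENCE_SEQUENCE=$REFERENCE \
+#     INPUT=$INPUT \
+#     INPUT_VCF=$OUTPUT \
+#     OUTPUT_VCF=${OUTPUT/.sv.vcf/.annotated.sv.vcf}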
+#
+# To properly call compound breakpoints, this script should be run
# twice: once for the initial GRIDSS calls, then again on the GRIDSS calls
# This enables accurate calling of complex events for which only some of the
-# breakpoints involved fall within the initially targeted region
+# breakpoints involved fall within the initially targeted region, as well
+# as fixing the above issue with reference read counts.
#
-BED_REGIONS_OF_INTEREST=targetted_regions.bed # If it's already a bed then you don't need the next two lines
-VCF_REGIONS_OF_INTEREST=targetted_regions.vcf # If your input is a VCF you'll need to convert it to a BED file
-Rscript gridss_targeted_vcf_to_region_bed.R --input $VCF_REGIONS_OF_INTEREST --ouput #BED_REGIONS_OF_INTEREST
+BED_REGIONS_OF_INTEREST=example_regions_of_interest.bed
+# If you have a VCF file, you can use the following script to convert it to a
+# BED file that correctly handles symbolic SVs
+#VCF_REGIONS_OF_INTEREST=targetted_regions.vcf # If your input is a VCF you'll need to convert it to a BED file
+#Rscript gridss_targeted_vcf_to_region_bed.R --input $VCF_REGIONS_OF_INTEREST --output $BED_REGIONS_OF_INTEREST
-REGIONS_OF_INTEREST=colo829.region.bed
-REFERENCE=/data/refgenomes/Homo_sapiens.GRCh37.GATK.illumina/Homo_sapiens.GRCh37.GATK.illumina.fasta
-OUTPUT=colo829.lite.vcf
-ASSEMBLY=assembly.lite.bam
-GRIDSS_JAR=gridss-2.1.2-gridss-jar-with-dependencies.jar
-
-# This example uses a paired tumour/normal
-NORMAL=COLO829R_dedup.realigned.bam
-TUMOUR=COLO829T_dedup.realigned.bam
+REGIONS_OF_INTEREST=$BED_REGIONS_OF_INTEREST
+REFERENCE=hg19.fa
+INPUT=chr12.1527326.DEL1024.bam
+OUTPUT=${INPUT/.bam/.targeted.sv.vcf}
+ASSEMBLY=${INPUT/.bam/.targeted.assembly.bam}
+GRIDSS_JAR=../target/gridss-2.2.0-gridss-jar-with-dependencies.jar
# Extract the reads flanking each breakpoint, not just overlapping.
# 2k is chosen since it (should) be larger than the fragment size
@@ -29,18 +37,8 @@ TUMOUR=COLO829T_dedup.realigned.bam
# to that of the full file.
REGION_PADDING_SIZE=2000
-
-
-
-
-INPUT=../../temp/test.bam
-REFERENCE=~/hartwig/temp/Homo_sapiens.GRCh37.GATK.illumina.fa
-OUTPUT=${INPUT/.bam/.targeted.sv.vcf}
-ASSEMBLY=${OUTPUT/.sv.vcf/.targeted.gridss.assembly.bam}
-GRIDSS_JAR=../target/gridss-2.1.2-gridss-jar-with-dependencies.jar
-WORKING_DIR=../../temp/
-TMP_DIR=../../temp/
-
+# Number of reads from which to calculate metrics
+STOP_METRICS_AFTER=10000000
JVM_ARGS="-ea \
-Dreference_fasta="$REFERENCE" \
-Dsamjdk.create_index=true \
@@ -50,36 +48,78 @@ JVM_ARGS="-ea \
-Dsamjdk.buffer_size=4194304 \
-Dgridss.gridss.output_to_temp_file=true \
-cp $GRIDSS_JAR "
-# Don't perform async read-ahead for gridss.IndexedExtractFullReads
-# as there's not point in performing read-ahead of reads we're going
-# ignore anyway
-JVM_ARGS_SINGLE_THREADED="-ea \
- -Dreference_fasta="$REFERENCE" \
- -Dsamjdk.create_index=true \
- -Dsamjdk.use_async_io_write_samtools=true \
- -Dsamjdk.use_async_io_write_tribble=true \
- -Dgridss.gridss.output_to_temp_file=true \
- -cp $GRIDSS_JAR "
-
-
-for F in $NORMAL $TUMOUR ; do
- java -Xmx8g $JVM_ARGS_SINGLE_THREADED gridss.IndexedExtractFullReads B=$REGIONS_OF_INTEREST REGION_PADDING_SIZE=$REGION_PADDING_SIZE I=$F O=$F.lite.bam 2>&1 | tee log.$F.extract.log &
- # If you have are extracting a large portion of genome then
+# for each input file
+for F in $INPUT ; do
+ TARGETED_BAM=$(dirname $F)/$(basename $F .bam).targeted.bam
+ GRIDSS_WORKING_DIR=./$(basename $TARGETED_BAM).gridss.working
+ TMP_DIR=./$(basename $TARGETED_BAM).gridss.working
+ echo "Calculate metrics for $F based on first $STOP_METRICS_AFTER reads"
+ # estimate metrics
+ mkdir -p $GRIDSS_WORKING_DIR
+ java -Xmx8g $JVM_ARGS gridss.analysis.CollectGridssMetrics \
+ TMP_DIR=$GRIDSS_WORKING_DIR \
+ ASSUME_SORTED=true \
+ I=$F \
+ O=$GRIDSS_WORKING_DIR/$(basename $TARGETED_BAM) \
+ THRESHOLD_COVERAGE=25000 \
+ FILE_EXTENSION=null \
+ GRIDSS_PROGRAM=null \
+ GRIDSS_PROGRAM=CollectCigarMetrics \
+ GRIDSS_PROGRAM=CollectMapqMetrics \
+ GRIDSS_PROGRAM=CollectTagMetrics \
+ GRIDSS_PROGRAM=CollectIdsvMetrics \
+ GRIDSS_PROGRAM=ReportThresholdCoverage \
+ PROGRAM=null \
+ PROGRAM=CollectInsertSizeMetrics \
+ STOP_AFTER=$STOP_METRICS_AFTER \
+ | tee log.$F.collectgridssmetrics.log
+ java -Xmx8g $JVM_ARGS gridss.analysis.CollectStructuralVariantReadMetrics \
+ TMP_DIR=$TMP_DIR \
+ I=$F \
+ OUTPUT=$GRIDSS_WORKING_DIR/$(basename $TARGETED_BAM).sv_metrics \
+ INSERT_SIZE_METRICS=$GRIDSS_WORKING_DIR/$(basename $TARGETED_BAM).insert_size_metrics \
+ STOP_AFTER=$STOP_METRICS_AFTER \
+ | tee log.$F.collectsvmetrics.log
+ echo "Extracting all reads from fragments overlapping $REGIONS_OF_INTEREST from $F"
+
+ # IndexedExtractFullReads is much faster than ExtractFullReads when
+ # using a small region of interest. Unfortunately, if the regions of
+ # interest are all SVs, the insert size distribution will be biased
+ ## Don't perform async read-ahead for gridss.IndexedExtractFullReads
+ ## as there's no point in performing read-ahead of reads we're going
+ ## to ignore anyway
+ JVM_ARGS_SINGLE_THREADED="-ea \
+ -Dreference_fasta="$REFERENCE" \
+ -Dsamjdk.create_index=true \
+ -Dsamjdk.use_async_io_write_samtools=true \
+ -Dgridss.gridss.output_to_temp_file=true \
+ -cp $GRIDSS_JAR "
+ java -Xmx8g $JVM_ARGS_SINGLE_THREADED \
+ gridss.IndexedExtractFullReads \
+ B=$REGIONS_OF_INTEREST \
+ REGION_PADDING_SIZE=$REGION_PADDING_SIZE \
+ I=$F \
+ O=$TARGETED_BAM 2>&1 | tee log.$F.extract.log &
+
+ # If you are extracting a large portion of the genome then
# you should perform a full pass of the input file as it will
# be faster than doing millions of index seeks()
- #java -Xmx8g $JVM_ARGS gridss.ExtractFullReads REGION_PADDING_SIZE=$REGION_PADDING_SIZE B=$REGIONS_OF_INTEREST I=$F O=$F.lite.bam 2>&1 | tee log.$F.extract.log &
+ #java -Xmx8g $JVM_ARGS gridss.CollectGridssMetricsAndExtractFullReads \
+ # REGION_PADDING_SIZE=$REGION_PADDING_SIZE \
+ # REGION_BED=$REGIONS_OF_INTEREST \
+ # I=$F \
+ # O=$TARGETED_BAM 2>&1 | tee log.$F.extract.log &
done
wait
+
# Run GRIDSS on the extracted files
-# warning: if you haven't increased your OS file ulimit above 4096 then will probably crash.
-#
+# warning: if you haven't increased your OS file ulimit above 4096 then GRIDSS will probably crash. Refer to the GRIDSS README for more details.
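+# For example, to inspect and raise the open file limit for the current shell
+# (8192 is an illustrative value):
+#   ulimit -n          # show the current soft limit
+#   ulimit -n 8192     # raise it for this shell session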
java -Xmx31g $JVM_ARGS \
-cp $GRIDSS_JAR gridss.CallVariants \
TMP_DIR=. \
WORKING_DIR=. \
REFERENCE_SEQUENCE=$REFERENCE \
- INPUT=$NORMAL.lite.bam \
- INPUT=$TUMOUR.lite.bam \
+ INPUT=$(dirname $INPUT)/$(basename $INPUT .bam).targeted.bam \
OUTPUT=$OUTPUT \
ASSEMBLY=$ASSEMBLY \
WORKER_THREADS=16 \
diff --git a/example/somatic.sh b/example/somatic.sh
index 080539116..c178563f3 100644
--- a/example/somatic.sh
+++ b/example/somatic.sh
@@ -8,7 +8,7 @@ BLACKLIST=wgEncodeDacMapabilityConsensusExcludable.bed
REFERENCE=~/reference_genomes/human/hg19.fa
OUTPUT=somatic.sv.vcf
ASSEMBLY=${OUTPUT/.sv.vcf/.gridss.assembly.bam}
-GRIDSS_JAR=~/bin/gridss-2.0.1-jar-with-dependencies.jar
+GRIDSS_JAR=~/bin/gridss-2.2.0-jar-with-dependencies.jar
if [[ ! -f "$NORMAL" ]] ; then
echo "Missing $NORMAL input file."
diff --git a/pom.xml b/pom.xml
index 3f344834b..e3fabf8db 100644
--- a/pom.xml
+++ b/pom.xml
@@ -4,7 +4,7 @@
au.edu.wehi
gridss
jar
- 2.1.2-gridss
+ 2.2.0-gridss
gridss
https://github.com/PapenfussLab/gridss
diff --git a/scripts/cohort_analysis/0_README.txt b/scripts/cohort_analysis/0_README.txt
index e20e4d979..dd1a11ed0 100644
--- a/scripts/cohort_analysis/0_README.txt
+++ b/scripts/cohort_analysis/0_README.txt
@@ -38,7 +38,7 @@ Instructions
> ./1_setup.sh
This will download the GRIDSS jar file and dependencies
-(https://github.com/PapenfussLab/gridss/releases/download/v2.0.1/gridss-2.0.1-gridss-jar-with-dependencies.jar)
+(https://github.com/PapenfussLab/gridss/releases/download/v2.2.0/gridss-2.2.0-gridss-jar-with-dependencies.jar)
and download and decompress the ENCODE blacklist file
(https://www.encodeproject.org/files/ENCFF001TDO/@@download/ENCFF001TDO.bed.gz).
@@ -54,7 +54,7 @@ of the following files:
BLACKLIST_FILENAME = "data/ENCODE_blacklist_hg19/ENCFF001TDO.bed"
REFERENCE_GENOME = "data/hg19.fa"
-GRIDSS_JARFILE = "./gridss-2.0.1-gridss-jar-with-dependencies.jar"
+GRIDSS_JARFILE = "./gridss-2.2.0-gridss-jar-with-dependencies.jar"
5. Open & edit the samples file sample.csv using a text editor. The file
diff --git a/scripts/cohort_analysis/1_setup.sh b/scripts/cohort_analysis/1_setup.sh
index 29d9d0814..d81d07fdc 100644
--- a/scripts/cohort_analysis/1_setup.sh
+++ b/scripts/cohort_analysis/1_setup.sh
@@ -3,7 +3,7 @@
# Setup script
# Download the GRIDSS jar file and dependencies from:
-wget https://github.com/PapenfussLab/gridss/releases/download/v2.0.1/gridss-2.0.1-gridss-jar-with-dependencies.jar
+wget https://github.com/PapenfussLab/gridss/releases/download/v2.2.0/gridss-2.2.0-gridss-jar-with-dependencies.jar
# Download and decompress the ENCODE blacklist file:
mkdir data
diff --git a/scripts/cohort_analysis/2_generate_analysis_scripts.py b/scripts/cohort_analysis/2_generate_analysis_scripts.py
index 53538bdc6..97a66b6b2 100644
--- a/scripts/cohort_analysis/2_generate_analysis_scripts.py
+++ b/scripts/cohort_analysis/2_generate_analysis_scripts.py
@@ -5,7 +5,7 @@
BLACKLIST_FILENAME = "data/ENCODE_blacklist_hg19/ENCFF001TDO.bed"
REFERENCE_GENOME = "data/hg19/hg19.fa"
-GRIDSS_JARFILE = "./gridss-2.0.1-gridss-jar-with-dependencies.jar"
+GRIDSS_JARFILE = "./gridss-2.2.0-gridss-jar-with-dependencies.jar"
# Read the script template
diff --git a/scripts/cohort_analysis/4_cleanup.py b/scripts/cohort_analysis/4_cleanup.py
index f323828b5..3d06382b6 100644
--- a/scripts/cohort_analysis/4_cleanup.py
+++ b/scripts/cohort_analysis/4_cleanup.py
@@ -4,7 +4,7 @@
rm scripts/*
rm output/*
-rm gridss-2.0.1-gridss-jar-with-dependencies.jar
+rm gridss-2.2.0-gridss-jar-with-dependencies.jar
rm data/ENCODE_blacklist_hg19/ENCFF001TDO.bed
# rm data/hg19/*
diff --git a/src/main/java/au/edu/wehi/idsv/metrics/IdsvSamFileMetrics.java b/src/main/java/au/edu/wehi/idsv/metrics/IdsvSamFileMetrics.java
index 6d2e8defd..67a0b6365 100644
--- a/src/main/java/au/edu/wehi/idsv/metrics/IdsvSamFileMetrics.java
+++ b/src/main/java/au/edu/wehi/idsv/metrics/IdsvSamFileMetrics.java
@@ -153,7 +153,7 @@ public double readPairFoldedCumulativeDistribution(int fragmentSize) {
}
double totalPairs = idsvMetrics.READ_PAIRS_BOTH_MAPPED;
double dpPairs = totalPairs - insertDistribution.getTotalMappedPairs() + pairsFromFragmentDistribution;
- return dpPairs / totalPairs;
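+ // Clamp: the fragment-distribution estimate can push dpPairs above totalPairs, which is not a valid probability.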
+ return Math.min(1, dpPairs / totalPairs);
}
/**
diff --git a/src/main/java/au/edu/wehi/idsv/model/FastEmpiricalReferenceLikelihoodModel.java b/src/main/java/au/edu/wehi/idsv/model/FastEmpiricalReferenceLikelihoodModel.java
index 81234c7ce..61a769922 100644
--- a/src/main/java/au/edu/wehi/idsv/model/FastEmpiricalReferenceLikelihoodModel.java
+++ b/src/main/java/au/edu/wehi/idsv/model/FastEmpiricalReferenceLikelihoodModel.java
@@ -47,7 +47,7 @@ public double scoreReadPair(IdsvSamFileMetrics metrics, int fragmentSize, int ma
public double scoreUnmappedMate(IdsvSamFileMetrics metrics, int mapq) {
IdsvMetrics im = metrics.getIdsvMetrics();
// completely unmapped read pairs are excluded for consistency with sc and dp calculation
- double score = MathUtil.prToPhred((double)im.READ_PAIRS_ONE_MAPPED / (double)(im.MAPPED_READS));
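+ // Math.max guards keep the ratio non-zero and at most 1 when the metric counts are empty (avoids log10(0) and 0/0).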
+ double score = MathUtil.prToPhred((double)Math.max(1, im.READ_PAIRS_ONE_MAPPED) / (double)Math.max(1, Math.max(im.READ_PAIRS_ONE_MAPPED, (im.MAPPED_READS))));
score = Math.min(score, mapq);
return score;
}
diff --git a/src/main/java/au/edu/wehi/idsv/model/Models.java b/src/main/java/au/edu/wehi/idsv/model/Models.java
index 1f72e6d3a..7d4ebbcf2 100644
--- a/src/main/java/au/edu/wehi/idsv/model/Models.java
+++ b/src/main/java/au/edu/wehi/idsv/model/Models.java
@@ -36,17 +36,10 @@ public static BreakendSummary calculateBreakend(LinearGenomicCoordinate lgc, Lis
public BreakendSummary apply(DirectedEvidence input) {
return input.getBreakendSummary();
}
- }), Lists.transform(evidence, new Function<DirectedEvidence, Long>() {
- @Override
- public Long apply(DirectedEvidence input) {
- return ScalingHelper.toScaledWeight(input.getBreakendQual());
- }
- }));
+ }), Lists.transform(evidence, (Function<DirectedEvidence, Long>) input -> ScalingHelper.toScaledWeight(input.getBreakendQual())));
}
/**
* Calculates the most likely breakend interval for the given evidence
- * @param evidence
- * @return breakend interval with highest total evidence quality
*/
public static BreakendSummary calculateBreakend(LinearGenomicCoordinate lgc, List<BreakendSummary> bs, List<Long> weights) {
if (bs == null || bs.size() == 0) throw new IllegalArgumentException("No evidence supplied");
diff --git a/src/main/java/gridss/CollectGridssMetricsAndExtractFullReads.java b/src/main/java/gridss/CollectGridssMetricsAndExtractFullReads.java
index 58a6394ce..ba49272b8 100644
--- a/src/main/java/gridss/CollectGridssMetricsAndExtractFullReads.java
+++ b/src/main/java/gridss/CollectGridssMetricsAndExtractFullReads.java
@@ -2,6 +2,8 @@
import au.edu.wehi.idsv.ReadPairConcordanceMethod;
import gridss.analysis.CollectGridssMetrics;
+import gridss.analysis.CollectIdsvMetrics;
+import gridss.analysis.CollectStructuralVariantReadMetrics;
import gridss.cmdline.CommandLineProgramHelper;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
@@ -25,26 +27,60 @@
programGroup = gridss.cmdline.programgroups.DataConversion.class
)
public class CollectGridssMetricsAndExtractFullReads extends CollectGridssMetrics {
+ // #region SV command line args
+ @Argument(doc="Minimum indel size", optional=true)
+ public int MIN_INDEL_SIZE = 1;
+ @Argument(doc="Minimum bases clipped", optional=true)
+ public int MIN_CLIP_LENGTH = 1;
+ @Argument(doc="Include hard and soft clipped reads in output", optional=true)
+ public boolean CLIPPED = true;
+ @Argument(doc="Include reads containing indels in output", optional=true)
+ public boolean INDELS = true;
+ @Argument(doc="Include split reads in output", optional=true)
+ public boolean SPLIT = true;
+ @Argument(doc="Include read pairs in which only one of the reads is aligned to the reference.", optional=true)
+ public boolean SINGLE_MAPPED_PAIRED = true;
+ @Argument(doc="Include read pairs that do not align in the expected orientation within the expected fragment size distribution.", optional=true)
+ public boolean DISCORDANT_READ_PAIRS = true;
+ @Argument(doc="Method of calculating read pair concordance. Valid values are SAM_FLAG, PERCENTAGE, and FIXED", optional=true)
+ public ReadPairConcordanceMethod READ_PAIR_CONCORDANCE_METHOD = ReadPairConcordanceMethod.SAM_FLAG;
+ @Argument(doc="Minimum concordant read pair fragment size if using the FIXED method of calculation", optional=true)
+ public int FIXED_READ_PAIR_CONCORDANCE_MIN_FRAGMENT_SIZE = 0;
+ @Argument(doc="Maximum concordant read pair fragment size if using the FIXED method of calculation", optional=true)
+ public int FIXED_READ_PAIR_CONCORDANCE_MAX_FRAGMENT_SIZE = 0;
+ @Argument(doc = "Percent (0.0-1.0) of read pairs considered concordant if using the PERCENTAGE method of calculation.", optional=true)
+ public Float READ_PAIR_CONCORDANT_PERCENT = 0.995f;
+ @Argument(doc="Picard tools insert size distribution metrics txt file. Required if using the PERCENTAGE read pair concordance calculation method.", optional=true)
+ public File INSERT_SIZE_METRICS = null;
+ @Argument(doc="Include unmapped reads", optional=true)
+ public boolean UNMAPPED_READS = true;
+ @Argument(doc="If true, also include reads marked as duplicates.")
+ public boolean INCLUDE_DUPLICATES = false;
+ @Argument(shortName="SVO", doc="Output file containing SV metrics", optional=true)
+ public File SV_METRICS_OUTPUT;
+ // #endregion
+ // #region extract command line args
@Argument(shortName="MO", doc="Output file containing SV metrics", optional=true)
- public File REGION_BED;
+ public File REGION_BED = new ExtractFullReads().REGION_BED;
@Argument(doc = "Extract reads whose mate maps to an export region. " +
"If the MC tag is not present, only the starting alignment position of the mate is considered. " +
"When determining whether the mate maps to an export region " +
"only the primary alignment of that mate is considered. Secondary " +
"and supplementary alignments are ignored.")
- public boolean EXTRACT_MATES = true;
- @Argument(doc = "Extract all records for reads that have a chimeric alignment mapping to an export region")
- public boolean EXTRACT_SPLITS = true;
+ public boolean EXTRACT_MATES = new ExtractFullReads().EXTRACT_MATES;
+ @Argument(doc = "Extract all records for reads that have a chimeric alignment mapping to an export region", optional=true)
+ public boolean EXTRACT_SPLITS = new ExtractFullReads().EXTRACT_SPLITS;
@Argument(doc = "Number of bases surrounding each export region to include in the index query. " +
"Setting this to a value slightly greater than the 99.99% fragment size distribution will reduce the number of random access" +
"IO requests made. " +
- "This parameter is not used if a linear scan is performed.")
- public int REGION_PADDING_SIZE = 2000;
- @Argument(doc = "File to write the output reads to.")
+ "This parameter is not used if a linear scan is performed.", optional=true)
+ public int REGION_PADDING_SIZE = new ExtractFullReads().REGION_PADDING_SIZE;
+ @Argument(doc = "File to write the output reads to.", optional=true)
public File READ_OUTPUT;
@Argument(doc="Number of worker threads to spawn. Defaults to number of cores available."
+ " Note that I/O threads are not included in this worker thread count so CPU usage can be higher than the number of worker thread.",
shortName="THREADS")
+ // #endregion
public int WORKER_THREADS = 1;
public static void main(final String[] args) {
new CollectGridssMetricsAndExtractFullReads().instanceMainWithExit(args);
@@ -89,10 +125,57 @@ public boolean supportsMetricAccumulationLevel() {
}
};
}
+ protected CollectStructuralVariantReadMetrics getSVMetrics() {
+ CollectStructuralVariantReadMetrics extract = new CollectStructuralVariantReadMetrics();
+ CommandLineProgramHelper.copyInputs(this, extract);
+ extract.MIN_INDEL_SIZE = MIN_INDEL_SIZE;
+ extract.MIN_CLIP_LENGTH = MIN_CLIP_LENGTH;
+ extract.CLIPPED = CLIPPED;
+ extract.INDELS = INDELS;
+ extract.SPLIT = SPLIT;
+ extract.SINGLE_MAPPED_PAIRED = SINGLE_MAPPED_PAIRED;
+ extract.DISCORDANT_READ_PAIRS = DISCORDANT_READ_PAIRS;
+ extract.READ_PAIR_CONCORDANCE_METHOD = READ_PAIR_CONCORDANCE_METHOD;
+ extract.FIXED_READ_PAIR_CONCORDANCE_MIN_FRAGMENT_SIZE = FIXED_READ_PAIR_CONCORDANCE_MIN_FRAGMENT_SIZE;
+ extract.FIXED_READ_PAIR_CONCORDANCE_MAX_FRAGMENT_SIZE = FIXED_READ_PAIR_CONCORDANCE_MAX_FRAGMENT_SIZE;
+ extract.READ_PAIR_CONCORDANT_PERCENT = READ_PAIR_CONCORDANT_PERCENT;
+ extract.INSERT_SIZE_METRICS = INSERT_SIZE_METRICS;
+ extract.UNMAPPED_READS = UNMAPPED_READS;
+ extract.INCLUDE_DUPLICATES = INCLUDE_DUPLICATES;
+ extract.OUTPUT = SV_METRICS_OUTPUT;
+ extract.INPUT = this.INPUT;
+ extract.ASSUME_SORTED = true;
+ return extract;
+ }
+ public ProgramInterface createSVMetrics() {
+ return new ProgramInterface() {
+ @Override
+ public SinglePassSamProgram makeInstance(final String outbase,
+ final String outext,
+ final File input,
+ final File reference,
+ final Set<MetricAccumulationLevel> metricAccumulationLevel,
+ final File dbSnp,
+ final File intervals,
+ final File refflat,
+ final Set<String> ignoreSequence) {
+ final CollectStructuralVariantReadMetrics program = getSVMetrics();
+ return program.asSinglePassSamProgram();
+ }
+ @Override
+ public boolean needsReferenceSequence() {
+ return false;
+ }
+ @Override
+ public boolean supportsMetricAccumulationLevel() {
+ return false;
+ }
+ };
+ }
@Override
public void setProgramsToRun(Collection<ProgramInterface> programsToRun) {
- // Inject SV read extraction
programsToRun.add(createExtractFullReads());
+ programsToRun.add(createSVMetrics());
super.setProgramsToRun(programsToRun);
}
}
diff --git a/src/main/java/gridss/CollectGridssMetricsAndExtractSVReads.java b/src/main/java/gridss/CollectGridssMetricsAndExtractSVReads.java
index cca6a434f..00c924deb 100644
--- a/src/main/java/gridss/CollectGridssMetricsAndExtractSVReads.java
+++ b/src/main/java/gridss/CollectGridssMetricsAndExtractSVReads.java
@@ -61,10 +61,6 @@ public class CollectGridssMetricsAndExtractSVReads extends CollectGridssMetrics
public static void main(final String[] args) {
new CollectGridssMetricsAndExtractSVReads().instanceMainWithExit(args);
}
- @Override
- protected String[] customCommandLineValidation() {
- return super.customCommandLineValidation();
- }
protected ExtractSVReads getExtractSVReads() {
ExtractSVReads extract = new ExtractSVReads();
CommandLineProgramHelper.copyInputs(this, extract);
diff --git a/src/main/java/gridss/ExtractFullReads.java b/src/main/java/gridss/ExtractFullReads.java
index f5053eba6..cfbff64b3 100644
--- a/src/main/java/gridss/ExtractFullReads.java
+++ b/src/main/java/gridss/ExtractFullReads.java
@@ -33,7 +33,7 @@ public class ExtractFullReads extends SinglePassSamProgram {
@Argument(doc = "Extract all records for reads that have a chimeric alignment mapping to an export region")
public boolean EXTRACT_SPLITS = true;
@Argument(doc = "Number of bases surrounding each export region to include in the index query. ")
- public int REGION_PADDING_SIZE = 2000;
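+ // ExtractFullReads performs a full linear scan, so index-query padding is not needed by default.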
+ public int REGION_PADDING_SIZE = 0;
private File tmpOut;
private SAMFileWriter writer;
diff --git a/src/main/java/gridss/IndexedExtractFullReads.java b/src/main/java/gridss/IndexedExtractFullReads.java
index dbf104841..5b166bb9a 100644
--- a/src/main/java/gridss/IndexedExtractFullReads.java
+++ b/src/main/java/gridss/IndexedExtractFullReads.java
@@ -30,17 +30,17 @@ public class IndexedExtractFullReads extends CommandLineProgram {
@Argument(shortName= StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc="Output file location.")
public File OUTPUT;
@Argument(shortName = "B", doc = "BED file containing regions to export")
- public File REGION_BED;
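+ // Defaults are sourced from ExtractFullReads so the two extraction tools stay in sync.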
+ public File REGION_BED = new ExtractFullReads().REGION_BED;
@Argument(doc = "Extract reads whose mate maps to an export region. " +
"If the MC tag is not present, only the starting alignment position of the mate is considered. " +
"When determining whether the mate maps to an export region " +
"only the primary alignment of that mate is considered. Secondary " +
"and supplementary alignments are ignored.")
- public boolean EXTRACT_MATES = true;
+ public boolean EXTRACT_MATES = new ExtractFullReads().EXTRACT_MATES;
@Argument(doc = "Extract all records for reads that have a chimeric alignment mapping to an export region")
- public boolean EXTRACT_SPLITS = true;
+ public boolean EXTRACT_SPLITS = new ExtractFullReads().EXTRACT_SPLITS;
@Argument(doc = "Number of additional bases surrounding each export region to include in the index query. ")
- public int REGION_PADDING_SIZE = 0;
+ public int REGION_PADDING_SIZE = new ExtractFullReads().REGION_PADDING_SIZE;
@Argument(doc="Number of worker threads to spawn. Defaults to number of cores available."
+ " Note that I/O threads are not included in this worker thread count so CPU usage can be higher than the number of worker thread.",
shortName="THREADS")
diff --git a/src/main/java/gridss/analysis/CigarSizeDistribution.java b/src/main/java/gridss/analysis/CigarSizeDistribution.java
index 4c26a76e7..c9ff2b02f 100644
--- a/src/main/java/gridss/analysis/CigarSizeDistribution.java
+++ b/src/main/java/gridss/analysis/CigarSizeDistribution.java
@@ -33,7 +33,7 @@ private static double[] calcPhred(List<CigarDetailMetrics> sc) {
CigarDetailMetrics m = sc.get(i);
assert(m.LENGTH == i);
if (cumsum > 0) {
- score = -10 * Math.log10((double)cumsum / (double)total);
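+ // Defensive: Math.max keeps the log10 argument positive even if the counts are inconsistent.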
+ score = -10 * Math.log10((double)Math.max(1, cumsum) / (double)Math.max(1, total));
}
phred[i] = score;
cumsum -= m.COUNT;
diff --git a/src/main/java/gridss/cmdline/ProcessStructuralVariantReadsCommandLineProgram.java b/src/main/java/gridss/cmdline/ProcessStructuralVariantReadsCommandLineProgram.java
index e2db2c195..d949ea95d 100644
--- a/src/main/java/gridss/cmdline/ProcessStructuralVariantReadsCommandLineProgram.java
+++ b/src/main/java/gridss/cmdline/ProcessStructuralVariantReadsCommandLineProgram.java
@@ -18,6 +18,7 @@
*/
public abstract class ProcessStructuralVariantReadsCommandLineProgram extends ByReadNameSinglePassSamProgram {
private static final Log log = Log.getInstance(ProcessStructuralVariantReadsCommandLineProgram.class);
+ //#region SV read args
@Argument(doc="Minimum indel size", optional=true)
public int MIN_INDEL_SIZE = 1;
@Argument(doc="Minimum bases clipped", optional=true)
@@ -46,6 +47,8 @@ public abstract class ProcessStructuralVariantReadsCommandLineProgram extends By
public boolean UNMAPPED_READS = true;
@Argument(doc="If true, also include reads marked as duplicates.")
public boolean INCLUDE_DUPLICATES = false;
+ //#endregion SV read args
+
@Override
public String[] customCommandLineValidation() {
if (READ_PAIR_CONCORDANCE_METHOD == ReadPairConcordanceMethod.PERCENTAGE && INSERT_SIZE_METRICS == null) {