Working gridss_targeted.sh example
gridss_targeted.sh now calculates metrics based on the first 10,000,000 reads
Models have been updated to be more robust to approximate metrics
Version bumped to 2.2.0
Daniel Cameron committed Feb 11, 2019
1 parent 4ef7cdc commit d9f0f8b
Showing 21 changed files with 203 additions and 87 deletions.
6 changes: 3 additions & 3 deletions docker/Dockerfile
@@ -6,7 +6,7 @@ FROM ubuntu:16.04
 LABEL base.image="biocontainers:latest"
 LABEL version="1"
 LABEL software="GRIDSS"
-LABEL software.version="2.0.1"
+LABEL software.version="2.2.0"
 LABEL about.summary="Genomic Rearrangement IDentification Software Suite"
 LABEL about.home="https://github.com/PapenfussLab/gridss"
 LABEL about.tags="Genomics"
@@ -32,6 +32,6 @@ RUN chmod 777 /data
 USER gridss
 RUN mkdir /data/gridss
 WORKDIR /data/gridss
-RUN wget https://github.com/PapenfussLab/gridss/releases/download/v2.0.1/gridss-2.0.1-gridss-jar-with-dependencies.jar
+RUN wget https://github.com/PapenfussLab/gridss/releases/download/v2.2.0/gridss-2.2.0-gridss-jar-with-dependencies.jar
 
-ENTRYPOINT ["java", "-ea", "-Xmx16g", "-Dsamjdk.create_index=true", "-Dsamjdk.use_async_io_read_samtools=true", "-Dsamjdk.use_async_io_write_samtools=true", "-Dsamjdk.use_async_io_write_tribble=true", "-Dgridss.gridss.output_to_temp_file=true", "-cp", "/data/gridss/gridss-2.0.1-gridss-jar-with-dependencies.jar", "gridss.CallVariants", "WORKER_THREADS=4"]
+ENTRYPOINT ["java", "-ea", "-Xmx16g", "-Dsamjdk.create_index=true", "-Dsamjdk.use_async_io_read_samtools=true", "-Dsamjdk.use_async_io_write_samtools=true", "-Dsamjdk.use_async_io_write_tribble=true", "-Dgridss.gridss.output_to_temp_file=true", "-cp", "/data/gridss/gridss-2.2.0-gridss-jar-with-dependencies.jar", "gridss.CallVariants", "WORKER_THREADS=4"]
1 change: 1 addition & 0 deletions example/example_regions_of_interest.bed
@@ -0,0 +1 @@
+chr12	1527326	1528326
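This one-line BED spans the 1 kbp around the deletion in the example input chr12.1527326.DEL1024.bam used by gridss_targeted.sh below. A quick sanity check that the example BAM actually has reads over this interval (assuming samtools and a BAM index are available):

samtools view -c chr12.1527326.DEL1024.bam chr12:1527326-1528326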
2 changes: 1 addition & 1 deletion example/gridss.sh
@@ -7,7 +7,7 @@ BLACKLIST=wgEncodeDacMapabilityConsensusExcludable.bed
 REFERENCE=hg19.fa
 OUTPUT=${INPUT/.bam/.sv.vcf}
 ASSEMBLY=${OUTPUT/.sv.vcf/.gridss.assembly.bam}
-GRIDSS_JAR=../target/gridss-2.1.1-gridss-jar-with-dependencies.jar
+GRIDSS_JAR=../target/gridss-2.2.0-gridss-jar-with-dependencies.jar
 
 if [[ ! -f "$INPUT" ]] ; then
 	echo "Missing $INPUT input file."
2 changes: 1 addition & 1 deletion example/gridss_fully_integrated_fastq_to_vcf_pipeline.sh
@@ -41,7 +41,7 @@ REFERENCE=~/reference_genomes/human/hg19.fa
 INPUT=pipelined.example.input.bam
 OUTPUT=pipelined.example.sv.vcf
 RAW_GRIDSS_ASSEMBLY=${OUTPUT/.sv.vcf/.gridss.assembly.bam}
-GRIDSS_JAR=~/bin/gridss-2.1.1-jar-with-dependencies.jar
+GRIDSS_JAR=~/bin/gridss-2.2.0-jar-with-dependencies.jar
 GRIDSS_JVM_ARGS="
 -Dsamjdk.use_async_io_read_samtools=true
 -Dsamjdk.use_async_io_write_samtools=true
2 changes: 1 addition & 1 deletion example/gridss_separate.sh
@@ -7,7 +7,7 @@ BLACKLIST=wgEncodeDacMapabilityConsensusExcludable.bed
 REFERENCE=hg19.fa
 OUTPUT=${INPUT/.bam/.sv.vcf}
 ASSEMBLY=${OUTPUT/.sv.vcf/.gridss.assembly.bam}
-GRIDSS_JAR=../target/gridss-2.1.1-gridss-jar-with-dependencies.jar
+GRIDSS_JAR=../target/gridss-2.2.0-gridss-jar-with-dependencies.jar
 WORKING_DIR=.
 
 
132 changes: 86 additions & 46 deletions example/gridss_targeted.sh
@@ -2,25 +2,33 @@
 #
 # Runs GRIDSS on a set of targeted regions
 #
-# Note that to properly call compound breakpoints, this script should be run
+#
+# WARNING: reference read counts will only be correct in targeted regions.
+# If a breakpoint goes from a targeted region to a region not in the BED of
+# interest then the untargeted breakend will be reported as a homozygous variant
+# even when it is not (since the reference-supporting reads were not included).
+# This can be fixed by rerunning gridss.AnnotateReferenceReads against the
+# original input file directly, or as explained below.
+#
+# To properly call compound breakpoints, this script should be run
 # twice: once for the initial GRIDSS calls, then again on the GRIDSS calls
 # This enables accurate calling of complex events for which only some of the
-# breakpoints involved fall within the initially targeted region
+# breakpoints involved fall within the initially targeted region, as well
+# as fixing the above issue with reference read counts.
 #
 
-BED_REGIONS_OF_INTEREST=targetted_regions.bed # If it's already a bed then you don't need the next two lines
-VCF_REGIONS_OF_INTEREST=targetted_regions.vcf # If your input is a VCF you'll need to convert it to a BED file
-Rscript gridss_targeted_vcf_to_region_bed.R --input $VCF_REGIONS_OF_INTEREST --ouput #BED_REGIONS_OF_INTEREST
+BED_REGIONS_OF_INTEREST=example_regions_of_interest.bed
+# If you have a VCF file, you can use the following script to convert it to a
+# BED file that correctly handles symbolic SVs
+#VCF_REGIONS_OF_INTEREST=targetted_regions.vcf # If your input is a VCF you'll need to convert it to a BED file
+#Rscript gridss_targeted_vcf_to_region_bed.R --input $VCF_REGIONS_OF_INTEREST --ouput $BED_REGIONS_OF_INTEREST
 
-REGIONS_OF_INTEREST=colo829.region.bed
-REFERENCE=/data/refgenomes/Homo_sapiens.GRCh37.GATK.illumina/Homo_sapiens.GRCh37.GATK.illumina.fasta
-OUTPUT=colo829.lite.vcf
-ASSEMBLY=assembly.lite.bam
-GRIDSS_JAR=gridss-2.1.2-gridss-jar-with-dependencies.jar
-
-# This example uses a paired tumour/normal
-NORMAL=COLO829R_dedup.realigned.bam
-TUMOUR=COLO829T_dedup.realigned.bam
+REGIONS_OF_INTEREST=$BED_REGIONS_OF_INTEREST
+REFERENCE=hg19.fa
+INPUT=chr12.1527326.DEL1024.bam
+OUTPUT=${INPUT/.bam/.targeted.sv.vcf}
+ASSEMBLY=${INPUT/.bam/.targeted.assembly.bam}
+GRIDSS_JAR=../target/gridss-2.2.0-gridss-jar-with-dependencies.jar
 
 # Extract the reads flanking each breakpoint, not just overlapping.
 # 2k is chosen since it should be larger than the fragment size
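For intuition, the VCF-to-BED conversion mentioned above boils down to turning each SV record into an interval. A rough sketch for the simple case is below; it assumes symbolic SV records carry an END INFO field, emits 0-based BED starts, and ignores breakend (BND) notation entirely, so it is no substitute for the gridss_targeted_vcf_to_region_bed.R script shipped with GRIDSS:

awk -v OFS='\t' '!/^#/ {
	end = $2
	if (match($8, /(^|;)END=[0-9]+/)) {
		e = substr($8, RSTART, RLENGTH)   # e.g. ";END=1528326"
		sub(/.*=/, "", e)                 # keep only the digits
		end = e
	}
	print $1, $2 - 1, end                     # BED is 0-based, half-open
}' targetted_regions.vcf > targetted_regions.bed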
@@ -29,18 +29,8 @@ TUMOUR=COLO829T_dedup.realigned.bam
 # to that of the full file.
 REGION_PADDING_SIZE=2000
 
-
-
-
-
-INPUT=../../temp/test.bam
-REFERENCE=~/hartwig/temp/Homo_sapiens.GRCh37.GATK.illumina.fa
-OUTPUT=${INPUT/.bam/.targeted.sv.vcf}
-ASSEMBLY=${OUTPUT/.sv.vcf/.targeted.gridss.assembly.bam}
-GRIDSS_JAR=../target/gridss-2.1.2-gridss-jar-with-dependencies.jar
-WORKING_DIR=../../temp/
-TMP_DIR=../../temp/
-
+# Number of reads for which to calculate metrics
+STOP_METRICS_AFTER=10000000
 JVM_ARGS="-ea \
 -Dreference_fasta="$REFERENCE" \
 -Dsamjdk.create_index=true \
@@ -50,36 +48,78 @@ JVM_ARGS="-ea \
 -Dsamjdk.buffer_size=4194304 \
 -Dgridss.gridss.output_to_temp_file=true \
 -cp $GRIDSS_JAR "
-# Don't perform async read-ahead for gridss.IndexedExtractFullReads
-# as there's not point in performing read-ahead of reads we're going
-# ignore anyway
-JVM_ARGS_SINGLE_THREADED="-ea \
--Dreference_fasta="$REFERENCE" \
--Dsamjdk.create_index=true \
--Dsamjdk.use_async_io_write_samtools=true \
--Dsamjdk.use_async_io_write_tribble=true \
--Dgridss.gridss.output_to_temp_file=true \
--cp $GRIDSS_JAR "
-
-
-for F in $NORMAL $TUMOUR ; do
-	java -Xmx8g $JVM_ARGS_SINGLE_THREADED gridss.IndexedExtractFullReads B=$REGIONS_OF_INTEREST REGION_PADDING_SIZE=$REGION_PADDING_SIZE I=$F O=$F.lite.bam 2>&1 | tee log.$F.extract.log &
-	# If you have are extracting a large portion of genome then
+# for each input file
+for F in $INPUT ; do
+	TARGETED_BAM=$(dirname $INPUT)/$(basename $INPUT .bam).targeted.bam
+	GRIDSS_WORKING_DIR=./$(basename $TARGETED_BAM).gridss.working
+	TMP_DIR=./$(basename $TARGETED_BAM).gridss.working
+	echo "Calculate metrics for $F based on first $STOP_METRICS_AFTER reads"
+	# estimate metrics
+	mkdir -p $GRIDSS_WORKING_DIR
+	java -Xmx8g $JVM_ARGS gridss.analysis.CollectGridssMetrics \
+		TMP_DIR=$GRIDSS_WORKING_DIR \
+		ASSUME_SORTED=true \
+		I=$F \
+		O=$GRIDSS_WORKING_DIR/$(basename $TARGETED_BAM) \
+		THRESHOLD_COVERAGE=25000 \
+		FILE_EXTENSION=null \
+		GRIDSS_PROGRAM=null \
+		GRIDSS_PROGRAM=CollectCigarMetrics \
+		GRIDSS_PROGRAM=CollectMapqMetrics \
+		GRIDSS_PROGRAM=CollectTagMetrics \
+		GRIDSS_PROGRAM=CollectIdsvMetrics \
+		GRIDSS_PROGRAM=ReportThresholdCoverage \
+		PROGRAM=null \
+		PROGRAM=CollectInsertSizeMetrics \
+		STOP_AFTER=$STOP_METRICS_AFTER \
+		| tee log.$F.collectgridssmetrics.log
+	java -Xmx8g $JVM_ARGS gridss.analysis.CollectStructuralVariantReadMetrics \
+		TMP_DIR=$TMP_DIR \
+		I=$F \
+		OUTPUT=$GRIDSS_WORKING_DIR/$(basename $TARGETED_BAM).sv_metrics \
+		INSERT_SIZE_METRICS=$GRIDSS_WORKING_DIR/$(basename $TARGETED_BAM).insert_size_metrics \
+		STOP_AFTER=$STOP_METRICS_AFTER \
+		| tee log.$F.collectsvmetrics.log
+	echo "Extracting all reads from fragments overlapping $REGIONS_OF_INTEREST from $F"
+
+	# IndexedExtractFullReads is much faster than ExtractFullReads when
+	# using a small region of interest. Unfortunately, if the regions of
+	# interest are all SVs, the insert size distribution will be biased.
+	## Don't perform async read-ahead for gridss.IndexedExtractFullReads
+	## as there's no point in performing read-ahead of reads we're going
+	## to ignore anyway
+	JVM_ARGS_SINGLE_THREADED="-ea \
+		-Dreference_fasta="$REFERENCE" \
+		-Dsamjdk.create_index=true \
+		-Dsamjdk.use_async_io_write_samtools=true \
+		-Dgridss.gridss.output_to_temp_file=true \
+		-cp $GRIDSS_JAR "
+	java -Xmx8g $JVM_ARGS_SINGLE_THREADED \
+		gridss.IndexedExtractFullReads \
+		B=$REGIONS_OF_INTEREST \
+		REGION_PADDING_SIZE=$REGION_PADDING_SIZE \
+		I=$F \
+		O=$TARGETED_BAM 2>&1 | tee log.$F.extract.log &
+
+	# If you are extracting a large portion of the genome then
 	# you should perform a full pass of the input file as it will
 	# be faster than doing millions of index seeks()
-	#java -Xmx8g $JVM_ARGS gridss.ExtractFullReads REGION_PADDING_SIZE=$REGION_PADDING_SIZE B=$REGIONS_OF_INTEREST I=$F O=$F.lite.bam 2>&1 | tee log.$F.extract.log &
+	#java -Xmx8g $JVM_ARGS gridss.CollectGridssMetricsAndExtractFullReads \
+	#	REGION_PADDING_SIZE=$REGION_PADDING_SIZE \
+	#	REGION_BED=$REGIONS_OF_INTEREST \
+	#	I=$F \
+	#	O=$TARGETED_BAM 2>&1 | tee log.$F.extract.log &
 done
 wait
 
 # Run GRIDSS on the extracted files
-# warning: if you haven't increased your OS file ulimit above 4096 then will probably crash.
+#
+# warning: if you haven't increased your OS file ulimit above 4096 then it will probably crash. Refer to the GRIDSS README for more details
 java -Xmx31g $JVM_ARGS \
 	-cp $GRIDSS_JAR gridss.CallVariants \
 	TMP_DIR=. \
 	WORKING_DIR=. \
 	REFERENCE_SEQUENCE=$REFERENCE \
-	INPUT=$NORMAL.lite.bam \
-	INPUT=$TUMOUR.lite.bam \
+	INPUT=$(dirname $INPUT)/$(basename $INPUT .bam).targeted.bam \
 	OUTPUT=$OUTPUT \
 	ASSEMBLY=$ASSEMBLY \
 	WORKER_THREADS=16 \
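The reference read count caveat flagged at the top of this script can be addressed after the fact by re-annotating the calls against the original, untargeted BAM. The sketch below is a hedged illustration: gridss.AnnotateReferenceReads is the tool named in the script's own comments, but the argument names shown here are assumptions in the Picard style used throughout GRIDSS, so check the tool's usage output before relying on them. The ulimit bump mirrors the warning before the CallVariants step:

ulimit -n 8192	# GRIDSS holds many files open; see the GRIDSS README
# Argument names below are assumed, not verified against the tool's usage
java -Xmx8g $JVM_ARGS gridss.AnnotateReferenceReads \
	REFERENCE_SEQUENCE=$REFERENCE \
	INPUT=$INPUT \
	INPUT_VCF=$OUTPUT \
	OUTPUT_VCF=${OUTPUT/.sv.vcf/.refannotated.sv.vcf} \
	2>&1 | tee log.annotatereferencereads.log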
2 changes: 1 addition & 1 deletion example/somatic.sh
@@ -8,7 +8,7 @@ BLACKLIST=wgEncodeDacMapabilityConsensusExcludable.bed
 REFERENCE=~/reference_genomes/human/hg19.fa
 OUTPUT=somatic.sv.vcf
 ASSEMBLY=${OUTPUT/.sv.vcf/.gridss.assembly.bam}
-GRIDSS_JAR=~/bin/gridss-2.0.1-jar-with-dependencies.jar
+GRIDSS_JAR=~/bin/gridss-2.2.0-jar-with-dependencies.jar
 
 if [[ ! -f "$NORMAL" ]] ; then
 	echo "Missing $NORMAL input file."
2 changes: 1 addition & 1 deletion pom.xml
@@ -4,7 +4,7 @@
 	<groupId>au.edu.wehi</groupId>
 	<artifactId>gridss</artifactId>
 	<packaging>jar</packaging>
-	<version>2.1.2-gridss</version>
+	<version>2.2.0-gridss</version>
 	<name>gridss</name>
 	<url>https://github.com/PapenfussLab/gridss</url>
 	<properties>
4 changes: 2 additions & 2 deletions scripts/cohort_analysis/0_README.txt
@@ -38,7 +38,7 @@ Instructions
 > ./1_setup.sh
 
 This will download the GRIDSS jar file and dependencies
-(https://github.com/PapenfussLab/gridss/releases/download/v2.0.1/gridss-2.0.1-gridss-jar-with-dependencies.jar)
+(https://github.com/PapenfussLab/gridss/releases/download/v2.2.0/gridss-2.2.0-gridss-jar-with-dependencies.jar)
 and download and decompress the ENCODE blacklist file
 (https://www.encodeproject.org/files/ENCFF001TDO/@@download/ENCFF001TDO.bed.gz).
 
@@ -54,7 +54,7 @@ of the following files:
 
 BLACKLIST_FILENAME = "data/ENCODE_blacklist_hg19/ENCFF001TDO.bed"
 REFERENCE_GENOME = "data/hg19.fa"
-GRIDSS_JARFILE = "./gridss-2.0.1-gridss-jar-with-dependencies.jar"
+GRIDSS_JARFILE = "./gridss-2.2.0-gridss-jar-with-dependencies.jar"
 
 
 5. Open & edit the samples file sample.csv using a text editor. The file
2 changes: 1 addition & 1 deletion scripts/cohort_analysis/1_setup.sh
@@ -3,7 +3,7 @@
 # Setup script
 
 # Download the GRIDSS jar file and dependencies from:
-wget https://github.com/PapenfussLab/gridss/releases/download/v2.0.1/gridss-2.0.1-gridss-jar-with-dependencies.jar
+wget https://github.com/PapenfussLab/gridss/releases/download/v2.2.0/gridss-2.2.0-gridss-jar-with-dependencies.jar
 
 # Download and decompress the ENCODE blacklist file:
 mkdir data
2 changes: 1 addition & 1 deletion scripts/cohort_analysis/2_generate_analysis_scripts.py
@@ -5,7 +5,7 @@
 
 BLACKLIST_FILENAME = "data/ENCODE_blacklist_hg19/ENCFF001TDO.bed"
 REFERENCE_GENOME = "data/hg19/hg19.fa"
-GRIDSS_JARFILE = "./gridss-2.0.1-gridss-jar-with-dependencies.jar"
+GRIDSS_JARFILE = "./gridss-2.2.0-gridss-jar-with-dependencies.jar"
 
 
 # Read the script template
2 changes: 1 addition & 1 deletion scripts/cohort_analysis/4_cleanup.py
@@ -4,7 +4,7 @@
 
 rm scripts/*
 rm output/*
-rm gridss-2.0.1-gridss-jar-with-dependencies.jar
+rm gridss-2.2.0-gridss-jar-with-dependencies.jar
 rm data/ENCODE_blacklist_hg19/ENCFF001TDO.bed
 
 # rm data/hg19/*
@@ -153,7 +153,7 @@ public double readPairFoldedCumulativeDistribution(int fragmentSize) {
 		}
 		double totalPairs = idsvMetrics.READ_PAIRS_BOTH_MAPPED;
 		double dpPairs = totalPairs - insertDistribution.getTotalMappedPairs() + pairsFromFragmentDistribution;
-		return dpPairs / totalPairs;
+		return Math.min(1, dpPairs / totalPairs);
 	}
 
 	/**
@@ -47,7 +47,7 @@ public double scoreReadPair(IdsvSamFileMetrics metrics, int fragmentSize, int ma
 	public double scoreUnmappedMate(IdsvSamFileMetrics metrics, int mapq) {
 		IdsvMetrics im = metrics.getIdsvMetrics();
 		// completely unmapped read pairs are excluded for consistency with sc and dp calculation
-		double score = MathUtil.prToPhred((double)im.READ_PAIRS_ONE_MAPPED / (double)(im.MAPPED_READS));
+		double score = MathUtil.prToPhred((double)Math.max(1, im.READ_PAIRS_ONE_MAPPED) / (double)Math.max(1, Math.max(im.READ_PAIRS_ONE_MAPPED, (im.MAPPED_READS))));
 		score = Math.min(score, mapq);
 		return score;
 	}
9 changes: 1 addition & 8 deletions src/main/java/au/edu/wehi/idsv/model/Models.java
@@ -36,17 +36,10 @@ public static BreakendSummary calculateBreakend(LinearGenomicCoordinate lgc, Lis
 			public BreakendSummary apply(DirectedEvidence input) {
 				return input.getBreakendSummary();
 			}
-		}), Lists.transform(evidence, new Function<DirectedEvidence, Long>() {
-			@Override
-			public Long apply(DirectedEvidence input) {
-				return ScalingHelper.toScaledWeight(input.getBreakendQual());
-			}
-		}));
+		}), Lists.transform(evidence, (Function<DirectedEvidence, Long>) input -> ScalingHelper.toScaledWeight(input.getBreakendQual())));
 	}
 	/**
 	 * Calculates the most likely breakend interval for the given evidence
-	 * @param evidence
-	 * @return breakend interval with highest total evidence quality
 	 */
 	public static BreakendSummary calculateBreakend(LinearGenomicCoordinate lgc, List<BreakendSummary> bs, List<Long> weights) {
 		if (bs == null || bs.size() == 0) throw new IllegalArgumentException("No evidence supplied");
