Working gridss_targeted.sh example
gridss_targeted.sh now calculates metrics based on the first 10,000,000 reads
Models have been updated to be more robust to approximate metrics
Version bumped to 2.2.0
Daniel Cameron committed Feb 11, 2019
1 parent 4ef7cdc commit d9f0f8b
Showing 21 changed files with 203 additions and 87 deletions.
6 changes: 3 additions & 3 deletions docker/Dockerfile
@@ -6,7 +6,7 @@ FROM ubuntu:16.04
 LABEL base.image="biocontainers:latest"
 LABEL version="1"
 LABEL software="GRIDSS"
-LABEL software.version="2.0.1"
+LABEL software.version="2.2.0"
 LABEL about.summary="Genomic Rearrangement IDentification Software Suite"
 LABEL about.home="https://github.com/PapenfussLab/gridss"
 LABEL about.tags="Genomics"
@@ -32,6 +32,6 @@ RUN chmod 777 /data
 USER gridss
 RUN mkdir /data/gridss
 WORKDIR /data/gridss
-RUN wget https://github.com/PapenfussLab/gridss/releases/download/v2.0.1/gridss-2.0.1-gridss-jar-with-dependencies.jar
+RUN wget https://github.com/PapenfussLab/gridss/releases/download/v2.2.0/gridss-2.2.0-gridss-jar-with-dependencies.jar
 
-ENTRYPOINT ["java", "-ea", "-Xmx16g", "-Dsamjdk.create_index=true", "-Dsamjdk.use_async_io_read_samtools=true", "-Dsamjdk.use_async_io_write_samtools=true", "-Dsamjdk.use_async_io_write_tribble=true", "-Dgridss.gridss.output_to_temp_file=true", "-cp", "/data/gridss/gridss-2.0.1-gridss-jar-with-dependencies.jar", "gridss.CallVariants", "WORKER_THREADS=4"]
+ENTRYPOINT ["java", "-ea", "-Xmx16g", "-Dsamjdk.create_index=true", "-Dsamjdk.use_async_io_read_samtools=true", "-Dsamjdk.use_async_io_write_samtools=true", "-Dsamjdk.use_async_io_write_tribble=true", "-Dgridss.gridss.output_to_temp_file=true", "-cp", "/data/gridss/gridss-2.2.0-gridss-jar-with-dependencies.jar", "gridss.CallVariants", "WORKER_THREADS=4"]
1 change: 1 addition & 0 deletions example/example_regions_of_interest.bed
@@ -0,0 +1 @@
+chr12	1527326	1528326
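This one-line BED spans the 1 kbp around the deletion in the example input chr12.1527326.DEL1024.bam used by gridss_targeted.sh below. A quick sanity check that the example BAM actually has reads over this interval (assuming samtools and a BAM index are available):

samtools view -c chr12.1527326.DEL1024.bam chr12:1527326-1528326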
2 changes: 1 addition & 1 deletion example/gridss.sh
@@ -7,7 +7,7 @@ BLACKLIST=wgEncodeDacMapabilityConsensusExcludable.bed
 REFERENCE=hg19.fa
 OUTPUT=${INPUT/.bam/.sv.vcf}
 ASSEMBLY=${OUTPUT/.sv.vcf/.gridss.assembly.bam}
-GRIDSS_JAR=../target/gridss-2.1.1-gridss-jar-with-dependencies.jar
+GRIDSS_JAR=../target/gridss-2.2.0-gridss-jar-with-dependencies.jar
 
 if [[ ! -f "$INPUT" ]] ; then
 	echo "Missing $INPUT input file."
2 changes: 1 addition & 1 deletion example/gridss_fully_integrated_fastq_to_vcf_pipeline.sh
@@ -41,7 +41,7 @@ REFERENCE=~/reference_genomes/human/hg19.fa
 INPUT=pipelined.example.input.bam
 OUTPUT=pipelined.example.sv.vcf
 RAW_GRIDSS_ASSEMBLY=${OUTPUT/.sv.vcf/.gridss.assembly.bam}
-GRIDSS_JAR=~/bin/gridss-2.1.1-jar-with-dependencies.jar
+GRIDSS_JAR=~/bin/gridss-2.2.0-jar-with-dependencies.jar
 GRIDSS_JVM_ARGS="
 -Dsamjdk.use_async_io_read_samtools=true
 -Dsamjdk.use_async_io_write_samtools=true
2 changes: 1 addition & 1 deletion example/gridss_separate.sh
@@ -7,7 +7,7 @@ BLACKLIST=wgEncodeDacMapabilityConsensusExcludable.bed
 REFERENCE=hg19.fa
 OUTPUT=${INPUT/.bam/.sv.vcf}
 ASSEMBLY=${OUTPUT/.sv.vcf/.gridss.assembly.bam}
-GRIDSS_JAR=../target/gridss-2.1.1-gridss-jar-with-dependencies.jar
+GRIDSS_JAR=../target/gridss-2.2.0-gridss-jar-with-dependencies.jar
 WORKING_DIR=.
 
 
132 changes: 86 additions & 46 deletions example/gridss_targeted.sh
@@ -2,25 +2,33 @@
 #
 # Runs GRIDSS on a set of targeted regions
 #
-# Note that to properly call compound breakpoints, this script should be run
+#
+# WARNING: reference read counts will only be correct in targeted regions.
+# If a breakpoint goes from a targeted region to a region not in the BED of
+# interest then the untargeted breakend will be reported as a homozygous variant
+# even when it is not (since the reference-supporting reads were not included).
+# This can be fixed by rerunning gridss.AnnotateReferenceReads against the
+# original input file directly, or as explained below.
+#
+# To properly call compound breakpoints, this script should be run
 # twice: once for the initial GRIDSS calls, then again on the GRIDSS calls
 # This enables accurate calling of complex events for which only some of the
-# breakpoints involved fall within the initially targeted region
+# breakpoints involved fall within the initially targeted region, as well
+# as fixing the above issue with reference read counts.
 #
 
-BED_REGIONS_OF_INTEREST=targetted_regions.bed # If it's already a bed then you don't need the next two lines
-VCF_REGIONS_OF_INTEREST=targetted_regions.vcf # If your input is a VCF you'll need to convert it to a BED file
-Rscript gridss_targeted_vcf_to_region_bed.R --input $VCF_REGIONS_OF_INTEREST --ouput #BED_REGIONS_OF_INTEREST
+BED_REGIONS_OF_INTEREST=example_regions_of_interest.bed
+# If you have a VCF file, you can use the following script to convert it to a
+# BED file that correctly handles symbolic SVs
+#VCF_REGIONS_OF_INTEREST=targetted_regions.vcf # If your input is a VCF you'll need to convert it to a BED file
+#Rscript gridss_targeted_vcf_to_region_bed.R --input $VCF_REGIONS_OF_INTEREST --ouput $BED_REGIONS_OF_INTEREST
 
-REGIONS_OF_INTEREST=colo829.region.bed
-REFERENCE=/data/refgenomes/Homo_sapiens.GRCh37.GATK.illumina/Homo_sapiens.GRCh37.GATK.illumina.fasta
-OUTPUT=colo829.lite.vcf
-ASSEMBLY=assembly.lite.bam
-GRIDSS_JAR=gridss-2.1.2-gridss-jar-with-dependencies.jar
-
-# This example uses a paired tumour/normal
-NORMAL=COLO829R_dedup.realigned.bam
-TUMOUR=COLO829T_dedup.realigned.bam
+REGIONS_OF_INTEREST=$BED_REGIONS_OF_INTEREST
+REFERENCE=hg19.fa
+INPUT=chr12.1527326.DEL1024.bam
+OUTPUT=${INPUT/.bam/.targeted.sv.vcf}
+ASSEMBLY=${INPUT/.bam/.targeted.assembly.bam}
+GRIDSS_JAR=../target/gridss-2.2.0-gridss-jar-with-dependencies.jar
 
 # Extract the reads flanking each breakpoint, not just overlapping.
 # 2k is chosen since it should be larger than the fragment size
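For intuition, the VCF-to-BED conversion mentioned above boils down to turning each SV record into an interval. A rough sketch for the simple case is below; it assumes symbolic SV records carry an END INFO field, emits 0-based BED starts, and ignores breakend (BND) notation entirely, so it is no substitute for the gridss_targeted_vcf_to_region_bed.R script shipped with GRIDSS:

awk -v OFS='\t' '!/^#/ {
	end = $2
	if (match($8, /(^|;)END=[0-9]+/)) {
		e = substr($8, RSTART, RLENGTH)   # e.g. ";END=1528326"
		sub(/.*=/, "", e)                 # keep only the digits
		end = e
	}
	print $1, $2 - 1, end                     # BED is 0-based, half-open
}' targetted_regions.vcf > targetted_regions.bed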
@@ -29,18 +29,8 @@ TUMOUR=COLO829T_dedup.realigned.bam
 # to that of the full file.
 REGION_PADDING_SIZE=2000
 
-
-
-
-
-INPUT=../../temp/test.bam
-REFERENCE=~/hartwig/temp/Homo_sapiens.GRCh37.GATK.illumina.fa
-OUTPUT=${INPUT/.bam/.targeted.sv.vcf}
-ASSEMBLY=${OUTPUT/.sv.vcf/.targeted.gridss.assembly.bam}
-GRIDSS_JAR=../target/gridss-2.1.2-gridss-jar-with-dependencies.jar
-WORKING_DIR=../../temp/
-TMP_DIR=../../temp/
-
+# Number of reads for which to calculate metrics
+STOP_METRICS_AFTER=10000000
 JVM_ARGS="-ea \
 -Dreference_fasta="$REFERENCE" \
 -Dsamjdk.create_index=true \
@@ -50,36 +48,78 @@ JVM_ARGS="-ea \
 -Dsamjdk.buffer_size=4194304 \
 -Dgridss.gridss.output_to_temp_file=true \
 -cp $GRIDSS_JAR "
-# Don't perform async read-ahead for gridss.IndexedExtractFullReads
-# as there's not point in performing read-ahead of reads we're going
-# ignore anyway
-JVM_ARGS_SINGLE_THREADED="-ea \
--Dreference_fasta="$REFERENCE" \
--Dsamjdk.create_index=true \
--Dsamjdk.use_async_io_write_samtools=true \
--Dsamjdk.use_async_io_write_tribble=true \
--Dgridss.gridss.output_to_temp_file=true \
--cp $GRIDSS_JAR "
-
-
-for F in $NORMAL $TUMOUR ; do
-	java -Xmx8g $JVM_ARGS_SINGLE_THREADED gridss.IndexedExtractFullReads B=$REGIONS_OF_INTEREST REGION_PADDING_SIZE=$REGION_PADDING_SIZE I=$F O=$F.lite.bam 2>&1 | tee log.$F.extract.log &
-	# If you have are extracting a large portion of genome then
+# for each input file
+for F in $INPUT ; do
+	TARGETED_BAM=$(dirname $INPUT)/$(basename $INPUT .bam).targeted.bam
+	GRIDSS_WORKING_DIR=./$(basename $TARGETED_BAM).gridss.working
+	TMP_DIR=./$(basename $TARGETED_BAM).gridss.working
+	echo "Calculate metrics for $F based on first $STOP_METRICS_AFTER reads"
+	# estimate metrics
+	mkdir -p $GRIDSS_WORKING_DIR
+	java -Xmx8g $JVM_ARGS gridss.analysis.CollectGridssMetrics \
+		TMP_DIR=$GRIDSS_WORKING_DIR \
+		ASSUME_SORTED=true \
+		I=$F \
+		O=$GRIDSS_WORKING_DIR/$(basename $TARGETED_BAM) \
+		THRESHOLD_COVERAGE=25000 \
+		FILE_EXTENSION=null \
+		GRIDSS_PROGRAM=null \
+		GRIDSS_PROGRAM=CollectCigarMetrics \
+		GRIDSS_PROGRAM=CollectMapqMetrics \
+		GRIDSS_PROGRAM=CollectTagMetrics \
+		GRIDSS_PROGRAM=CollectIdsvMetrics \
+		GRIDSS_PROGRAM=ReportThresholdCoverage \
+		PROGRAM=null \
+		PROGRAM=CollectInsertSizeMetrics \
+		STOP_AFTER=$STOP_METRICS_AFTER \
+		| tee log.$F.collectgridssmetrics.log
+	java -Xmx8g $JVM_ARGS gridss.analysis.CollectStructuralVariantReadMetrics \
+		TMP_DIR=$TMP_DIR \
+		I=$F \
+		OUTPUT=$GRIDSS_WORKING_DIR/$(basename $TARGETED_BAM).sv_metrics \
+		INSERT_SIZE_METRICS=$GRIDSS_WORKING_DIR/$(basename $TARGETED_BAM).insert_size_metrics \
+		STOP_AFTER=$STOP_METRICS_AFTER \
+		| tee log.$F.collectsvmetrics.log
+	echo "Extracting all reads from fragments overlapping $REGIONS_OF_INTEREST from $F"
+
+	# IndexedExtractFullReads is much faster than ExtractFullReads when
+	# using a small region of interest. Unfortunately, if the regions of
+	# interest are all SVs, the insert size distribution will be biased.
+	## Don't perform async read-ahead for gridss.IndexedExtractFullReads
+	## as there's no point in performing read-ahead of reads we're going
+	## to ignore anyway
+	JVM_ARGS_SINGLE_THREADED="-ea \
+		-Dreference_fasta="$REFERENCE" \
+		-Dsamjdk.create_index=true \
+		-Dsamjdk.use_async_io_write_samtools=true \
+		-Dgridss.gridss.output_to_temp_file=true \
+		-cp $GRIDSS_JAR "
+	java -Xmx8g $JVM_ARGS_SINGLE_THREADED \
+		gridss.IndexedExtractFullReads \
+		B=$REGIONS_OF_INTEREST \
+		REGION_PADDING_SIZE=$REGION_PADDING_SIZE \
+		I=$F \
+		O=$TARGETED_BAM 2>&1 | tee log.$F.extract.log &
+
+	# If you are extracting a large portion of the genome then
 	# you should perform a full pass of the input file as it will
 	# be faster than doing millions of index seeks()
-	#java -Xmx8g $JVM_ARGS gridss.ExtractFullReads REGION_PADDING_SIZE=$REGION_PADDING_SIZE B=$REGIONS_OF_INTEREST I=$F O=$F.lite.bam 2>&1 | tee log.$F.extract.log &
+	#java -Xmx8g $JVM_ARGS gridss.CollectGridssMetricsAndExtractFullReads \
+	#	REGION_PADDING_SIZE=$REGION_PADDING_SIZE \
+	#	REGION_BED=$REGIONS_OF_INTEREST \
+	#	I=$F \
+	#	O=$TARGETED_BAM 2>&1 | tee log.$F.extract.log &
 done
 wait
 
 # Run GRIDSS on the extracted files
-# warning: if you haven't increased your OS file ulimit above 4096 then will probably crash.
+#
+# warning: if you haven't increased your OS file ulimit above 4096 then it will probably crash. Refer to the GRIDSS README for more details
 java -Xmx31g $JVM_ARGS \
 	-cp $GRIDSS_JAR gridss.CallVariants \
 	TMP_DIR=. \
 	WORKING_DIR=. \
 	REFERENCE_SEQUENCE=$REFERENCE \
-	INPUT=$NORMAL.lite.bam \
-	INPUT=$TUMOUR.lite.bam \
+	INPUT=$(dirname $INPUT)/$(basename $INPUT .bam).targeted.bam \
 	OUTPUT=$OUTPUT \
 	ASSEMBLY=$ASSEMBLY \
 	WORKER_THREADS=16 \
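The reference read count caveat flagged at the top of this script can be addressed after the fact by re-annotating the calls against the original, untargeted BAM. The sketch below is a hedged illustration: gridss.AnnotateReferenceReads is the tool named in the script's own comments, but the argument names shown here are assumptions in the Picard style used throughout GRIDSS, so check the tool's usage output before relying on them. The ulimit bump mirrors the warning before the CallVariants step:

ulimit -n 8192	# GRIDSS holds many files open; see the GRIDSS README
# Argument names below are assumed, not verified against the tool's usage
java -Xmx8g $JVM_ARGS gridss.AnnotateReferenceReads \
	REFERENCE_SEQUENCE=$REFERENCE \
	INPUT=$INPUT \
	INPUT_VCF=$OUTPUT \
	OUTPUT_VCF=${OUTPUT/.sv.vcf/.refannotated.sv.vcf} \
	2>&1 | tee log.annotatereferencereads.log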
2 changes: 1 addition & 1 deletion example/somatic.sh
@@ -8,7 +8,7 @@ BLACKLIST=wgEncodeDacMapabilityConsensusExcludable.bed
 REFERENCE=~/reference_genomes/human/hg19.fa
 OUTPUT=somatic.sv.vcf
 ASSEMBLY=${OUTPUT/.sv.vcf/.gridss.assembly.bam}
-GRIDSS_JAR=~/bin/gridss-2.0.1-jar-with-dependencies.jar
+GRIDSS_JAR=~/bin/gridss-2.2.0-jar-with-dependencies.jar
 
 if [[ ! -f "$NORMAL" ]] ; then
 	echo "Missing $NORMAL input file."
2 changes: 1 addition & 1 deletion pom.xml
@@ -4,7 +4,7 @@
 	<groupId>au.edu.wehi</groupId>
 	<artifactId>gridss</artifactId>
 	<packaging>jar</packaging>
-	<version>2.1.2-gridss</version>
+	<version>2.2.0-gridss</version>
 	<name>gridss</name>
 	<url>https://github.com/PapenfussLab/gridss</url>
 	<properties>
4 changes: 2 additions & 2 deletions scripts/cohort_analysis/0_README.txt
@@ -38,7 +38,7 @@ Instructions
 > ./1_setup.sh
 
 This will download the GRIDSS jar file and dependencies
-(https://github.com/PapenfussLab/gridss/releases/download/v2.0.1/gridss-2.0.1-gridss-jar-with-dependencies.jar)
+(https://github.com/PapenfussLab/gridss/releases/download/v2.2.0/gridss-2.2.0-gridss-jar-with-dependencies.jar)
 and download and decompress the ENCODE blacklist file
 (https://www.encodeproject.org/files/ENCFF001TDO/@@download/ENCFF001TDO.bed.gz).
 
@@ -54,7 +54,7 @@ of the following files:
 
 BLACKLIST_FILENAME = "data/ENCODE_blacklist_hg19/ENCFF001TDO.bed"
 REFERENCE_GENOME = "data/hg19.fa"
-GRIDSS_JARFILE = "./gridss-2.0.1-gridss-jar-with-dependencies.jar"
+GRIDSS_JARFILE = "./gridss-2.2.0-gridss-jar-with-dependencies.jar"
 
 
 5. Open & edit the samples file sample.csv using a text editor. The file
2 changes: 1 addition & 1 deletion scripts/cohort_analysis/1_setup.sh
@@ -3,7 +3,7 @@
 # Setup script
 
 # Download the GRIDSS jar file and dependencies from:
-wget https://github.com/PapenfussLab/gridss/releases/download/v2.0.1/gridss-2.0.1-gridss-jar-with-dependencies.jar
+wget https://github.com/PapenfussLab/gridss/releases/download/v2.2.0/gridss-2.2.0-gridss-jar-with-dependencies.jar
 
 # Download and decompress the ENCODE blacklist file:
 mkdir data
2 changes: 1 addition & 1 deletion scripts/cohort_analysis/2_generate_analysis_scripts.py
@@ -5,7 +5,7 @@
 
 BLACKLIST_FILENAME = "data/ENCODE_blacklist_hg19/ENCFF001TDO.bed"
 REFERENCE_GENOME = "data/hg19/hg19.fa"
-GRIDSS_JARFILE = "./gridss-2.0.1-gridss-jar-with-dependencies.jar"
+GRIDSS_JARFILE = "./gridss-2.2.0-gridss-jar-with-dependencies.jar"
 
 
 # Read the script template
2 changes: 1 addition & 1 deletion scripts/cohort_analysis/4_cleanup.py
@@ -4,7 +4,7 @@
 
 rm scripts/*
 rm output/*
-rm gridss-2.0.1-gridss-jar-with-dependencies.jar
+rm gridss-2.2.0-gridss-jar-with-dependencies.jar
 rm data/ENCODE_blacklist_hg19/ENCFF001TDO.bed
 
 # rm data/hg19/*
@@ -153,7 +153,7 @@ public double readPairFoldedCumulativeDistribution(int fragmentSize) {
 		}
 		double totalPairs = idsvMetrics.READ_PAIRS_BOTH_MAPPED;
 		double dpPairs = totalPairs - insertDistribution.getTotalMappedPairs() + pairsFromFragmentDistribution;
-		return dpPairs / totalPairs;
+		return Math.min(1, dpPairs / totalPairs);
 	}
 
 	/**
@@ -47,7 +47,7 @@ public double scoreReadPair(IdsvSamFileMetrics metrics, int fragmentSize, int ma
 	public double scoreUnmappedMate(IdsvSamFileMetrics metrics, int mapq) {
 		IdsvMetrics im = metrics.getIdsvMetrics();
 		// completely unmapped read pairs are excluded for consistency with sc and dp calculation
-		double score = MathUtil.prToPhred((double)im.READ_PAIRS_ONE_MAPPED / (double)(im.MAPPED_READS));
+		double score = MathUtil.prToPhred((double)Math.max(1, im.READ_PAIRS_ONE_MAPPED) / (double)Math.max(1, Math.max(im.READ_PAIRS_ONE_MAPPED, (im.MAPPED_READS))));
 		score = Math.min(score, mapq);
 		return score;
 	}
9 changes: 1 addition & 8 deletions src/main/java/au/edu/wehi/idsv/model/Models.java
@@ -36,17 +36,10 @@ public static BreakendSummary calculateBreakend(LinearGenomicCoordinate lgc, Lis
 			public BreakendSummary apply(DirectedEvidence input) {
 				return input.getBreakendSummary();
 			}
-		}), Lists.transform(evidence, new Function<DirectedEvidence, Long>() {
-			@Override
-			public Long apply(DirectedEvidence input) {
-				return ScalingHelper.toScaledWeight(input.getBreakendQual());
-			}
-		}));
+		}), Lists.transform(evidence, (Function<DirectedEvidence, Long>) input -> ScalingHelper.toScaledWeight(input.getBreakendQual())));
 	}
 	/**
 	 * Calculates the most likely breakend interval for the given evidence
-	 * @param evidence
-	 * @return breakend interval with highest total evidence quality
 	 */
 	public static BreakendSummary calculateBreakend(LinearGenomicCoordinate lgc, List<BreakendSummary> bs, List<Long> weights) {
 		if (bs == null || bs.size() == 0) throw new IllegalArgumentException("No evidence supplied");
