Add ONT option to run_deeptio and set --cpus 0 for post_processing.

PiperOrigin-RevId: 572358097
yinshiyi · Oct 10, 2023 · 0317638 · 0317638
1 parent 8e14a53
commit 0317638
Show file tree

Hide file tree

Showing 3 changed files with 362 additions and 57 deletions.
diff --git a/docs/deeptrio-ont-case-study.md b/docs/deeptrio-ont-case-study.md
@@ -0,0 +1,308 @@
+# Using DeepTrio for small variant calling from the trio sequenced with ONT R10.4
+
+In this case study, we describe applying [DeepTrio](deeptrio-details.md) to a
+real ONT R10.4 trio. Then we assess the quality of the DeepTrio variant calls
+with `hap.py`. In addition we evaluate a Mendelian violation rate for a merged
+VCF.
+
+To make it faster to run over this case study, we run only on chromosome 20.
+
+## Prepare environment
+
+### Tools
+
+[Docker](https://docs.docker.com/get-docker/) will be used to run DeepTrio and
+[hap.py](https://github.com/illumina/hap.py),
+
+### Download Reference
+
+We will be using GRCh38 for this case study.
+
+```bash
+mkdir -p reference
+
+FTPDIR=ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids
+
+curl ${FTPDIR}/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz | gunzip > reference/GRCh38_no_alt_analysis_set.fasta
+curl ${FTPDIR}/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.fai > reference/GRCh38_no_alt_analysis_set.fasta.fai
+```
+
+### Download Genome in a Bottle Benchmarks
+
+We will benchmark our variant calls against v4.2.1 of the Genome in a Bottle
+small variant benchmarks for HG002, HG003, and HG004 trio.
+
+```bash
+mkdir -p benchmark
+
+FTPDIR=ftp://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/release/AshkenazimTrio
+
+curl ${FTPDIR}/HG002_NA24385_son/NISTv4.2.1/GRCh38/HG002_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed > benchmark/HG002_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed
+curl ${FTPDIR}/HG002_NA24385_son/NISTv4.2.1/GRCh38/HG002_GRCh38_1_22_v4.2.1_benchmark.vcf.gz > benchmark/HG002_GRCh38_1_22_v4.2.1_benchmark.vcf.gz
+curl ${FTPDIR}/HG002_NA24385_son/NISTv4.2.1/GRCh38/HG002_GRCh38_1_22_v4.2.1_benchmark.vcf.gz.tbi > benchmark/HG002_GRCh38_1_22_v4.2.1_benchmark.vcf.gz.tbi
+
+curl ${FTPDIR}/HG003_NA24149_father/NISTv4.2.1/GRCh38/HG003_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed > benchmark/HG003_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed
+curl ${FTPDIR}/HG003_NA24149_father/NISTv4.2.1/GRCh38/HG003_GRCh38_1_22_v4.2.1_benchmark.vcf.gz > benchmark/HG003_GRCh38_1_22_v4.2.1_benchmark.vcf.gz
+curl ${FTPDIR}/HG003_NA24149_father/NISTv4.2.1/GRCh38/HG003_GRCh38_1_22_v4.2.1_benchmark.vcf.gz.tbi > benchmark/HG003_GRCh38_1_22_v4.2.1_benchmark.vcf.gz.tbi
+
+curl ${FTPDIR}/HG004_NA24143_mother/NISTv4.2.1/GRCh38/HG004_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed > benchmark/HG004_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed
+curl ${FTPDIR}/HG004_NA24143_mother/NISTv4.2.1/GRCh38/HG004_GRCh38_1_22_v4.2.1_benchmark.vcf.gz > benchmark/HG004_GRCh38_1_22_v4.2.1_benchmark.vcf.gz
+curl ${FTPDIR}/HG004_NA24143_mother/NISTv4.2.1/GRCh38/HG004_GRCh38_1_22_v4.2.1_benchmark.vcf.gz.tbi > benchmark/HG004_GRCh38_1_22_v4.2.1_benchmark.vcf.gz.tbi
+```
+
+### Download HG002, HG003, and HG004 BAM files
+
+We'll use HG002, HG003, HG004 ONT R10.4 reads. These reads have been
+aligned to the GRCh38_no_alt_analysis reference using minimap2.
+
+```bash
+mkdir -p input
+HTTPDIR=https://storage.googleapis.com/deepvariant/ont-case-study-testdata
+
+curl ${HTTPDIR}/HG002_R104_sup_merged.50x.chr20.bam > input/HG002_R104_sup_merged.50x.chr20.bam
+curl ${HTTPDIR}/HG002_R104_sup_merged.50x.chr20.bam.bai > input/HG002_R104_sup_merged.50x.chr20.bam.bai
+
+curl ${HTTPDIR}/HG003_R104_sup_merged.40x.chr20.bam > input/HG003_R104_sup_merged.40x.chr20.bam
+curl ${HTTPDIR}/HG003_R104_sup_merged.40x.chr20.bam.bai > input/HG003_R104_sup_merged.40x.chr20.bam.bai
+
+curl ${HTTPDIR}/HG004_R104_sup_merged.40x.chr20.bam > input/HG004_R104_sup_merged.40x.chr20.bam
+curl ${HTTPDIR}/HG004_R104_sup_merged.40x.chr20.bam.bai > input/HG004_R104_sup_merged.40x.chr20.bam.bai
+```
+
+## Running DeepTrio with one command
+
+DeepTrio pipeline consists of 4 steps: `make_examples`, `call_variants`,
+`postprocess_variants` and `GLnexus merge`. It is possible to run the first
+three steps with one command using the `run_deeptrio` script. GLnexus
+is run as a separate command.
+
+### Running on a CPU-only machine
+
+```bash
+mkdir -p output
+mkdir -p output/intermediate_results_dir
+
+BIN_VERSION="1.6.0"
+
+sudo apt -y update
+sudo apt-get -y install docker.io
+sudo docker pull google/deepvariant:deeptrio-"${BIN_VERSION}"
+
+time sudo docker run \
+  -v "${PWD}/input":"/input" \
+  -v "${PWD}/output":"/output" \
+  -v "${PWD}/reference":"/reference" \
+  google/deepvariant:deeptrio-"${BIN_VERSION}" \
+  /opt/deepvariant/bin/deeptrio/run_deeptrio \
+  --model_type ONT \
+  --ref /reference/GRCh38_no_alt_analysis_set.fasta \
+  --reads_child /input/HG002_R104_sup_merged.50x.chr20.bam \
+  --reads_parent1 /input/HG003_R104_sup_merged.40x.chr20.bam \
+  --reads_parent2 /input/HG004_R104_sup_merged.40x.chr20.bam \
+  --output_vcf_child /output/HG002.output.vcf.gz \
+  --output_vcf_parent1 /output/HG003.output.vcf.gz \
+  --output_vcf_parent2 /output/HG004.output.vcf.gz \
+  --sample_name_child 'HG002' \
+  --sample_name_parent1 'HG003' \
+  --sample_name_parent2 'HG004' \
+  --num_shards $(nproc) \
+  --intermediate_results_dir /output/intermediate_results_dir \
+  --output_gvcf_child /output/HG002.g.vcf.gz \
+  --output_gvcf_parent1 /output/HG003.g.vcf.gz \
+  --output_gvcf_parent2 /output/HG004.g.vcf.gz \
+  --regions chr20
+```
+
+By specifying `--model_type ONT`, you'll be using a model that is best suited
+for ONT R10.4 Whole Genome Sequencing data.
+
+NOTE: If you want to run each of the steps separately, add `--dry_run=true`
+to the command above to figure out what flags you need in each step. Based on
+the different model types, different flags are needed in the `make_examples`
+step.
+
+`--intermediate_results_dir` flag is optional. By specifying it, the
+intermediate outputs of `make_examples` and `call_variants` stages can be found
+in the directory. After the command, you can find these files in the directory:
+
+```
+call_variants_output_child.tfrecord.gz
+call_variants_output_parent1.tfrecord.gz
+call_variants_output_parent2.tfrecord.gz
+
+gvcf_child.tfrecord-?????-of-?????.gz
+gvcf_parent1.tfrecord-?????-of-?????.gz
+gvcf_parent2.tfrecord-?????-of-?????.gz
+
+make_examples_child.tfrecord-?????-of-?????.gz
+make_examples_parent1.tfrecord-?????-of-?????.gz
+make_examples_parent2.tfrecord-?????-of-?????.gz
+```
+
+For running on GPU machines, or using Singularity instead of Docker, see
+[Quick Start](deepvariant-quick-start.md).
+
+## Merge VCFs using GLnexus
+
+At this step we take all 3 VCFs generated in the previous step and merge them
+using GLnexus.
+
+```bash
+sudo docker pull quay.io/mlin/glnexus:v1.2.7
+
+# bcftools and bgzip are now included in our docker images.
+# You can also install them separately.
+sudo docker run \
+  -v "${PWD}/output":"/output" \
+  quay.io/mlin/glnexus:v1.2.7 \
+  /usr/local/bin/glnexus_cli \
+  --config DeepVariant_unfiltered \
+  /output/HG002.g.vcf.gz \
+  /output/HG003.g.vcf.gz \
+  /output/HG004.g.vcf.gz \
+  | sudo docker run -i google/deepvariant:deeptrio-"${BIN_VERSION}" \
+    bcftools view - \
+  | sudo docker run -i google/deepvariant:deeptrio-"${BIN_VERSION}" \
+    bgzip -c > output/HG002_trio_merged.vcf.gz
+```
+
+After completion of GLnexus command we should have a new merged VCF file in the
+output directory.
+
+```
+HG002_trio_merged.vcf.gz
+```
+
+## Benchmark on chr20
+
+### Calculate Mendelian Violation rate
+
+```bash
+sudo docker pull realtimegenomics/rtg-tools
+
+sudo docker run \
+  -v "${PWD}/input":"/input" \
+  -v "${PWD}/reference":"/reference" \
+  realtimegenomics/rtg-tools format \
+  -o /reference/GRCh38_no_alt_analysis_set.sdf "/reference/GRCh38_no_alt_analysis_set.fasta"
+
+FILE="reference/trio.ped"
+cat <<EOM >$FILE
+#PED format pedigree
+#
+#fam-id/ind-id/pat-id/mat-id: 0=unknown
+#sex: 1=male; 2=female; 0=unknown
+#phenotype: -9=missing, 0=missing; 1=unaffected; 2=affected
+#
+#fam-id ind-id pat-id mat-id sex phen
+1 HG002 HG003 HG004 1 0
+1 HG003 0 0 1 0
+1 HG004 0 0 2 0
+EOM
+
+sudo docker run \
+-v "${PWD}/input":"/input" \
+-v "${PWD}/reference":"/reference" \
+-v "${PWD}/output":"/output" \
+realtimegenomics/rtg-tools mendelian \
+-i "/output/HG002_trio_merged.vcf.gz" \
+-o "/output/HG002_trio_annotated.output.vcf.gz" \
+--pedigree=/reference/trio.ped \
+-t /reference/GRCh38_no_alt_analysis_set.sdf \
+| tee output/deepvariant.input_rtg_output.txt
+```
+
+As a result we should get the following output:
+
+```bash
+Checking: /output/HG002_trio_merged.vcf.gz
+Family: [HG003 + HG004] -> [HG002]
+164 non-pass records were skipped
+Concordance HG002: F:142343/143808 (98.98%)  M:144330/146059 (98.82%)  F+M:135630/138870 (97.67%)
+Sample HG002 has less than 99.0 concordance with both parents. Check for incorrect pedigree or sample mislabelling.
+0/168963 (0.00%) records did not conform to expected call ploidy
+156632/168963 (92.70%) records were variant in at least 1 family member and checked for Mendelian constraints
+16308/156632 (10.41%) records had indeterminate consistency status due to incomplete calls
+4099/156632 (2.62%) records contained a violation of Mendelian constraints
+```
+
+### Benchmark variant calls against 4.2.1 truth set with hap.py
+
+```bash
+mkdir -p happy
+
+sudo docker pull jmcdani20/hap.py:v0.3.12
+
+sudo docker run \
+  -v "${PWD}/benchmark":"/benchmark" \
+  -v "${PWD}/input":"/input" \
+  -v "${PWD}/output":"/output" \
+  -v "${PWD}/reference":"/reference" \
+  -v "${PWD}/happy:/happy" \
+  jmcdani20/hap.py:v0.3.12 /opt/hap.py/bin/hap.py \
+  /benchmark/HG002_GRCh38_1_22_v4.2.1_benchmark.vcf.gz \
+  /output/HG002.output.vcf.gz \
+  -f /benchmark/HG002_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed \
+  -r /reference/GRCh38_no_alt_analysis_set.fasta \
+  -o /happy/HG002.output \
+  --engine=vcfeval \
+  --pass-only \
+  -l chr20
+
+sudo docker run \
+  -v "${PWD}/benchmark":"/benchmark" \
+  -v "${PWD}/input":"/input" \
+  -v "${PWD}/output":"/output" \
+  -v "${PWD}/reference":"/reference" \
+  -v "${PWD}/happy:/happy" \
+  jmcdani20/hap.py:v0.3.12 /opt/hap.py/bin/hap.py \
+  /benchmark/HG003_GRCh38_1_22_v4.2.1_benchmark.vcf.gz \
+  /output/HG003.output.vcf.gz \
+  -f /benchmark/HG003_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed \
+  -r /reference/GRCh38_no_alt_analysis_set.fasta \
+  -o /happy/HG003.output \
+  --engine=vcfeval \
+  --pass-only \
+  -l chr20
+
+sudo docker run \
+  -v "${PWD}/benchmark":"/benchmark" \
+  -v "${PWD}/input":"/input" \
+  -v "${PWD}/output":"/output" \
+  -v "${PWD}/reference":"/reference" \
+  -v "${PWD}/happy:/happy" \
+  jmcdani20/hap.py:v0.3.12 /opt/hap.py/bin/hap.py \
+  /benchmark/HG004_GRCh38_1_22_v4.2.1_benchmark.vcf.gz \
+  /output/HG004.output.vcf.gz \
+  -f /benchmark/HG004_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed \
+  -r /reference/GRCh38_no_alt_analysis_set.fasta \
+  -o /happy/HG004.output \
+  --engine=vcfeval \
+  --pass-only \
+  -l chr20
+```
+
+```
+Benchmarking Summary for HG002:
+Benchmarking Summary:
+Type Filter  TRUTH.TOTAL  TRUTH.TP  TRUTH.FN  QUERY.TOTAL  QUERY.FP  QUERY.UNK  FP.gt  FP.al  METRIC.Recall  METRIC.Precision  METRIC.Frac_NA  METRIC.F1_Score  TRUTH.TOTAL.TiTv_ratio  QUERY.TOTAL.TiTv_ratio  TRUTH.TOTAL.het_hom_ratio  QUERY.TOTAL.het_hom_ratio
+INDEL    ALL        11256      9916      1340        20944      1295       9272    478    519       0.880952          0.889051        0.442704         0.884983                     NaN                     NaN                   1.561710                   2.346610
+INDEL   PASS        11256      9916      1340        20944      1295       9272    478    519       0.880952          0.889051        0.442704         0.884983                     NaN                     NaN                   1.561710                   2.346610
+  SNP    ALL        71333     71199       134       100632        84      29325     59     22       0.998121          0.998822        0.291408         0.998472                2.314904                1.795903                   1.715978                   1.935631
+  SNP   PASS        71333     71199       134       100632        84      29325     59     22       0.998121          0.998822        0.291408         0.998472                2.314904                1.795903                   1.715978                   1.935631
+
+Benchmarking Summary for HG003:
+Benchmarking Summary:
+Type Filter  TRUTH.TOTAL  TRUTH.TP  TRUTH.FN  QUERY.TOTAL  QUERY.FP  QUERY.UNK  FP.gt  FP.al  METRIC.Recall  METRIC.Precision  METRIC.Frac_NA  METRIC.F1_Score  TRUTH.TOTAL.TiTv_ratio  QUERY.TOTAL.TiTv_ratio  TRUTH.TOTAL.het_hom_ratio  QUERY.TOTAL.het_hom_ratio
+INDEL    ALL        10628      9044      1584        20580      1348       9720    571    506       0.850960          0.875875        0.472303         0.863238                     NaN                     NaN                   1.748961                   2.292998
+INDEL   PASS        10628      9044      1584        20580      1348       9720    571    506       0.850960          0.875875        0.472303         0.863238                     NaN                     NaN                   1.748961                   2.292998
+  SNP    ALL        70166     70043       123        99648       138      29480     53     32       0.998247          0.998033        0.295841         0.998140                2.296566                1.833087                   1.883951                   1.994315
+  SNP   PASS        70166     70043       123        99648       138      29480     53     32       0.998247          0.998033        0.295841         0.998140                2.296566                1.833087                   1.883951                   1.994315
+
+Benchmarking Summary for HG004:
+Type Filter  TRUTH.TOTAL  TRUTH.TP  TRUTH.FN  QUERY.TOTAL  QUERY.FP  QUERY.UNK  FP.gt  FP.al  METRIC.Recall  METRIC.Precision  METRIC.Frac_NA  METRIC.F1_Score  TRUTH.TOTAL.TiTv_ratio  QUERY.TOTAL.TiTv_ratio  TRUTH.TOTAL.het_hom_ratio  QUERY.TOTAL.het_hom_ratio
+INDEL    ALL        11000      9480      1520        20898      1271       9674    511    526       0.861818          0.886761        0.462915         0.874111                     NaN                     NaN                   1.792709                   2.374125
+INDEL   PASS        11000      9480      1520        20898      1271       9674    511    526       0.861818          0.886761        0.462915         0.874111                     NaN                     NaN                   1.792709                   2.374125
+  SNP    ALL        71659     71563        96       101494        74      29834     31     40       0.998660          0.998967        0.293948         0.998814                2.310073                1.846227                   1.878340                   1.838233
+  SNP   PASS        71659     71563        96       101494        74      29834     31     40       0.998660          0.998967        0.293948         0.998814                2.310073                1.846227                   1.878340                   1.838233
+```
diff --git a/scripts/run_deeptrio.py b/scripts/run_deeptrio.py
@@ -536,7 +536,7 @@ def _make_examples_command(
     special_args['vsc_min_fraction_indels'] = 0.12
     special_args['alt_aligned_pileup'] = 'diff_channels'
     special_args['add_hp_channel'] = True
-    special_args['min_mapping_quality'] = 1
+    special_args['min_mapping_quality'] = 5
     special_args['track_ref_reads'] = True
     special_args['pileup_image_height_child'] = (
         DEEP_TRIO_ONT_PILEUP_HEIGHT_CHILD
@@ -551,6 +551,7 @@ def _make_examples_command(
     special_args['parse_sam_aux_fields'] = True
     special_args['discard_non_dna_regions'] = True
     special_args['max_reads_for_dynamic_bases_per_region'] = 200
+    special_args['max_reads_per_partition'] = 500
     kwargs = _update_kwargs_with_warning(kwargs, special_args)
 
   if _MODEL_TYPE.value == 'WES':
@@ -643,6 +644,7 @@ def postprocess_variants_command(
   command.extend(['--ref', '"{}"'.format(ref)])
   command.extend(['--infile', '"{}"'.format(infile)])
   command.extend(['--outfile', '"{}"'.format(outfile)])
+  command.extend(['--cpus 0'])
   if nonvariant_site_tfrecord_path is not None:
     command.extend([
         '--nonvariant_site_tfrecord_path',
@@ -866,9 +868,11 @@ def create_all_commands(intermediate_results_dir):
   # helps to better distribute the work between shards.
 
   candidate_partition_modes = [None]
-  # In DeepTrio PacBio mode, we always enable use_candidate_partition because
-  # otherwise it can run out of memory.
-  if _USE_CANDIDATE_PARTITION.value or _MODEL_TYPE.value == 'PACBIO':
+  # In DeepTrio PacBio and ONT mode, we always enable use_candidate_partition
+  # because otherwise it can run out of memory.
+  use_candidate_partition = False
+  if _USE_CANDIDATE_PARTITION.value or _MODEL_TYPE.value in ['PACBIO', 'ONT']:
+    use_candidate_partition = True
     candidate_partition_modes = [
         CandidatePartitionCommand.SWEEP,
         CandidatePartitionCommand.CANDIDATE_PARTITION_INFERENCE,
@@ -891,7 +895,7 @@ def create_all_commands(intermediate_results_dir):
                 intermediate_results_dir, _NUM_SHARDS.value
             ),
             candidate_partition_mode=candidate_partition_mode
-            if _USE_CANDIDATE_PARTITION.value
+            if use_candidate_partition
             else None,
             # kwargs:
             gvcf=nonvariant_site_tfrecord_path,