From 03176384ea00546fe8fbb57598eb17cebb4307de Mon Sep 17 00:00:00 2001 From: shafin Date: Tue, 10 Oct 2023 13:56:44 -0700 Subject: [PATCH] Add ONT option to run_deeptio and set `--cpus 0` for post_processing. PiperOrigin-RevId: 572358097 --- docs/deeptrio-ont-case-study.md | 308 ++++++++++++++++++++++++++++++++ scripts/run_deeptrio.py | 14 +- scripts/run_deeptrio_test.py | 97 +++++----- 3 files changed, 362 insertions(+), 57 deletions(-) create mode 100644 docs/deeptrio-ont-case-study.md diff --git a/docs/deeptrio-ont-case-study.md b/docs/deeptrio-ont-case-study.md new file mode 100644 index 00000000..15c72f92 --- /dev/null +++ b/docs/deeptrio-ont-case-study.md @@ -0,0 +1,308 @@ +# Using DeepTrio for small variant calling from the trio sequenced with ONT R10.4 + +In this case study, we describe applying [DeepTrio](deeptrio-details.md) to a +real ONT R10.4 trio. Then we assess the quality of the DeepTrio variant calls +with `hap.py`. In addition we evaluate a Mendelian violation rate for a merged +VCF. + +To make it faster to run over this case study, we run only on chromosome 20. + +## Prepare environment + +### Tools + +[Docker](https://docs.docker.com/get-docker/) will be used to run DeepTrio and +[hap.py](https://github.com/illumina/hap.py), + +### Download Reference + +We will be using GRCh38 for this case study. + +```bash +mkdir -p reference + +FTPDIR=ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids + +curl ${FTPDIR}/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz | gunzip > reference/GRCh38_no_alt_analysis_set.fasta +curl ${FTPDIR}/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.fai > reference/GRCh38_no_alt_analysis_set.fasta.fai +``` + +### Download Genome in a Bottle Benchmarks + +We will benchmark our variant calls against v4.2.1 of the Genome in a Bottle +small variant benchmarks for HG002, HG003, and HG004 trio. + +```bash +mkdir -p benchmark + +FTPDIR=ftp://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/release/AshkenazimTrio + +curl ${FTPDIR}/HG002_NA24385_son/NISTv4.2.1/GRCh38/HG002_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed > benchmark/HG002_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed +curl ${FTPDIR}/HG002_NA24385_son/NISTv4.2.1/GRCh38/HG002_GRCh38_1_22_v4.2.1_benchmark.vcf.gz > benchmark/HG002_GRCh38_1_22_v4.2.1_benchmark.vcf.gz +curl ${FTPDIR}/HG002_NA24385_son/NISTv4.2.1/GRCh38/HG002_GRCh38_1_22_v4.2.1_benchmark.vcf.gz.tbi > benchmark/HG002_GRCh38_1_22_v4.2.1_benchmark.vcf.gz.tbi + +curl ${FTPDIR}/HG003_NA24149_father/NISTv4.2.1/GRCh38/HG003_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed > benchmark/HG003_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed +curl ${FTPDIR}/HG003_NA24149_father/NISTv4.2.1/GRCh38/HG003_GRCh38_1_22_v4.2.1_benchmark.vcf.gz > benchmark/HG003_GRCh38_1_22_v4.2.1_benchmark.vcf.gz +curl ${FTPDIR}/HG003_NA24149_father/NISTv4.2.1/GRCh38/HG003_GRCh38_1_22_v4.2.1_benchmark.vcf.gz.tbi > benchmark/HG003_GRCh38_1_22_v4.2.1_benchmark.vcf.gz.tbi + +curl ${FTPDIR}/HG004_NA24143_mother/NISTv4.2.1/GRCh38/HG004_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed > benchmark/HG004_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed +curl ${FTPDIR}/HG004_NA24143_mother/NISTv4.2.1/GRCh38/HG004_GRCh38_1_22_v4.2.1_benchmark.vcf.gz > benchmark/HG004_GRCh38_1_22_v4.2.1_benchmark.vcf.gz +curl ${FTPDIR}/HG004_NA24143_mother/NISTv4.2.1/GRCh38/HG004_GRCh38_1_22_v4.2.1_benchmark.vcf.gz.tbi > benchmark/HG004_GRCh38_1_22_v4.2.1_benchmark.vcf.gz.tbi +``` + +### Download HG002, HG003, and HG004 BAM files + +We'll use HG002, HG003, HG004 ONT R10.4 reads. These reads have been +aligned to the GRCh38_no_alt_analysis reference using minimap2. + +```bash +mkdir -p input +HTTPDIR=https://storage.googleapis.com/deepvariant/ont-case-study-testdata + +curl ${HTTPDIR}/HG002_R104_sup_merged.50x.chr20.bam > input/HG002_R104_sup_merged.50x.chr20.bam +curl ${HTTPDIR}/HG002_R104_sup_merged.50x.chr20.bam.bai > input/HG002_R104_sup_merged.50x.chr20.bam.bai + +curl ${HTTPDIR}/HG003_R104_sup_merged.40x.chr20.bam > input/HG003_R104_sup_merged.40x.chr20.bam +curl ${HTTPDIR}/HG003_R104_sup_merged.40x.chr20.bam.bai > input/HG003_R104_sup_merged.40x.chr20.bam.bai + +curl ${HTTPDIR}/HG004_R104_sup_merged.40x.chr20.bam > input/HG004_R104_sup_merged.40x.chr20.bam +curl ${HTTPDIR}/HG004_R104_sup_merged.40x.chr20.bam.bai > input/HG004_R104_sup_merged.40x.chr20.bam.bai +``` + +## Running DeepTrio with one command + +DeepTrio pipeline consists of 4 steps: `make_examples`, `call_variants`, +`postprocess_variants` and `GLnexus merge`. It is possible to run the first +three steps with one command using the `run_deeptrio` script. GLnexus +is run as a separate command. + +### Running on a CPU-only machine + +```bash +mkdir -p output +mkdir -p output/intermediate_results_dir + +BIN_VERSION="1.6.0" + +sudo apt -y update +sudo apt-get -y install docker.io +sudo docker pull google/deepvariant:deeptrio-"${BIN_VERSION}" + +time sudo docker run \ + -v "${PWD}/input":"/input" \ + -v "${PWD}/output":"/output" \ + -v "${PWD}/reference":"/reference" \ + google/deepvariant:deeptrio-"${BIN_VERSION}" \ + /opt/deepvariant/bin/deeptrio/run_deeptrio \ + --model_type ONT \ + --ref /reference/GRCh38_no_alt_analysis_set.fasta \ + --reads_child /input/HG002_R104_sup_merged.50x.chr20.bam \ + --reads_parent1 /input/HG003_R104_sup_merged.40x.chr20.bam \ + --reads_parent2 /input/HG004_R104_sup_merged.40x.chr20.bam \ + --output_vcf_child /output/HG002.output.vcf.gz \ + --output_vcf_parent1 /output/HG003.output.vcf.gz \ + --output_vcf_parent2 /output/HG004.output.vcf.gz \ + --sample_name_child 'HG002' \ + --sample_name_parent1 'HG003' \ + --sample_name_parent2 'HG004' \ + --num_shards $(nproc) \ + --intermediate_results_dir /output/intermediate_results_dir \ + --output_gvcf_child /output/HG002.g.vcf.gz \ + --output_gvcf_parent1 /output/HG003.g.vcf.gz \ + --output_gvcf_parent2 /output/HG004.g.vcf.gz \ + --regions chr20 +``` + +By specifying `--model_type ONT`, you'll be using a model that is best suited +for ONT R10.4 Whole Genome Sequencing data. + +NOTE: If you want to run each of the steps separately, add `--dry_run=true` +to the command above to figure out what flags you need in each step. Based on +the different model types, different flags are needed in the `make_examples` +step. + +`--intermediate_results_dir` flag is optional. By specifying it, the +intermediate outputs of `make_examples` and `call_variants` stages can be found +in the directory. After the command, you can find these files in the directory: + +``` +call_variants_output_child.tfrecord.gz +call_variants_output_parent1.tfrecord.gz +call_variants_output_parent2.tfrecord.gz + +gvcf_child.tfrecord-?????-of-?????.gz +gvcf_parent1.tfrecord-?????-of-?????.gz +gvcf_parent2.tfrecord-?????-of-?????.gz + +make_examples_child.tfrecord-?????-of-?????.gz +make_examples_parent1.tfrecord-?????-of-?????.gz +make_examples_parent2.tfrecord-?????-of-?????.gz +``` + +For running on GPU machines, or using Singularity instead of Docker, see +[Quick Start](deepvariant-quick-start.md). + +## Merge VCFs using GLnexus + +At this step we take all 3 VCFs generated in the previous step and merge them +using GLnexus. + +```bash +sudo docker pull quay.io/mlin/glnexus:v1.2.7 + +# bcftools and bgzip are now included in our docker images. +# You can also install them separately. +sudo docker run \ + -v "${PWD}/output":"/output" \ + quay.io/mlin/glnexus:v1.2.7 \ + /usr/local/bin/glnexus_cli \ + --config DeepVariant_unfiltered \ + /output/HG002.g.vcf.gz \ + /output/HG003.g.vcf.gz \ + /output/HG004.g.vcf.gz \ + | sudo docker run -i google/deepvariant:deeptrio-"${BIN_VERSION}" \ + bcftools view - \ + | sudo docker run -i google/deepvariant:deeptrio-"${BIN_VERSION}" \ + bgzip -c > output/HG002_trio_merged.vcf.gz +``` + +After completion of GLnexus command we should have a new merged VCF file in the +output directory. + +``` +HG002_trio_merged.vcf.gz +``` + +## Benchmark on chr20 + +### Calculate Mendelian Violation rate + +```bash +sudo docker pull realtimegenomics/rtg-tools + +sudo docker run \ + -v "${PWD}/input":"/input" \ + -v "${PWD}/reference":"/reference" \ + realtimegenomics/rtg-tools format \ + -o /reference/GRCh38_no_alt_analysis_set.sdf "/reference/GRCh38_no_alt_analysis_set.fasta" + +FILE="reference/trio.ped" +cat <$FILE +#PED format pedigree +# +#fam-id/ind-id/pat-id/mat-id: 0=unknown +#sex: 1=male; 2=female; 0=unknown +#phenotype: -9=missing, 0=missing; 1=unaffected; 2=affected +# +#fam-id ind-id pat-id mat-id sex phen +1 HG002 HG003 HG004 1 0 +1 HG003 0 0 1 0 +1 HG004 0 0 2 0 +EOM + +sudo docker run \ +-v "${PWD}/input":"/input" \ +-v "${PWD}/reference":"/reference" \ +-v "${PWD}/output":"/output" \ +realtimegenomics/rtg-tools mendelian \ +-i "/output/HG002_trio_merged.vcf.gz" \ +-o "/output/HG002_trio_annotated.output.vcf.gz" \ +--pedigree=/reference/trio.ped \ +-t /reference/GRCh38_no_alt_analysis_set.sdf \ +| tee output/deepvariant.input_rtg_output.txt +``` + +As a result we should get the following output: + +```bash +Checking: /output/HG002_trio_merged.vcf.gz +Family: [HG003 + HG004] -> [HG002] +164 non-pass records were skipped +Concordance HG002: F:142343/143808 (98.98%) M:144330/146059 (98.82%) F+M:135630/138870 (97.67%) +Sample HG002 has less than 99.0 concordance with both parents. Check for incorrect pedigree or sample mislabelling. +0/168963 (0.00%) records did not conform to expected call ploidy +156632/168963 (92.70%) records were variant in at least 1 family member and checked for Mendelian constraints +16308/156632 (10.41%) records had indeterminate consistency status due to incomplete calls +4099/156632 (2.62%) records contained a violation of Mendelian constraints +``` + +### Benchmark variant calls against 4.2.1 truth set with hap.py + +```bash +mkdir -p happy + +sudo docker pull jmcdani20/hap.py:v0.3.12 + +sudo docker run \ + -v "${PWD}/benchmark":"/benchmark" \ + -v "${PWD}/input":"/input" \ + -v "${PWD}/output":"/output" \ + -v "${PWD}/reference":"/reference" \ + -v "${PWD}/happy:/happy" \ + jmcdani20/hap.py:v0.3.12 /opt/hap.py/bin/hap.py \ + /benchmark/HG002_GRCh38_1_22_v4.2.1_benchmark.vcf.gz \ + /output/HG002.output.vcf.gz \ + -f /benchmark/HG002_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed \ + -r /reference/GRCh38_no_alt_analysis_set.fasta \ + -o /happy/HG002.output \ + --engine=vcfeval \ + --pass-only \ + -l chr20 + +sudo docker run \ + -v "${PWD}/benchmark":"/benchmark" \ + -v "${PWD}/input":"/input" \ + -v "${PWD}/output":"/output" \ + -v "${PWD}/reference":"/reference" \ + -v "${PWD}/happy:/happy" \ + jmcdani20/hap.py:v0.3.12 /opt/hap.py/bin/hap.py \ + /benchmark/HG003_GRCh38_1_22_v4.2.1_benchmark.vcf.gz \ + /output/HG003.output.vcf.gz \ + -f /benchmark/HG003_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed \ + -r /reference/GRCh38_no_alt_analysis_set.fasta \ + -o /happy/HG003.output \ + --engine=vcfeval \ + --pass-only \ + -l chr20 + +sudo docker run \ + -v "${PWD}/benchmark":"/benchmark" \ + -v "${PWD}/input":"/input" \ + -v "${PWD}/output":"/output" \ + -v "${PWD}/reference":"/reference" \ + -v "${PWD}/happy:/happy" \ + jmcdani20/hap.py:v0.3.12 /opt/hap.py/bin/hap.py \ + /benchmark/HG004_GRCh38_1_22_v4.2.1_benchmark.vcf.gz \ + /output/HG004.output.vcf.gz \ + -f /benchmark/HG004_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed \ + -r /reference/GRCh38_no_alt_analysis_set.fasta \ + -o /happy/HG004.output \ + --engine=vcfeval \ + --pass-only \ + -l chr20 +``` + +``` +Benchmarking Summary for HG002: +Benchmarking Summary: +Type Filter TRUTH.TOTAL TRUTH.TP TRUTH.FN QUERY.TOTAL QUERY.FP QUERY.UNK FP.gt FP.al METRIC.Recall METRIC.Precision METRIC.Frac_NA METRIC.F1_Score TRUTH.TOTAL.TiTv_ratio QUERY.TOTAL.TiTv_ratio TRUTH.TOTAL.het_hom_ratio QUERY.TOTAL.het_hom_ratio +INDEL ALL 11256 9916 1340 20944 1295 9272 478 519 0.880952 0.889051 0.442704 0.884983 NaN NaN 1.561710 2.346610 +INDEL PASS 11256 9916 1340 20944 1295 9272 478 519 0.880952 0.889051 0.442704 0.884983 NaN NaN 1.561710 2.346610 + SNP ALL 71333 71199 134 100632 84 29325 59 22 0.998121 0.998822 0.291408 0.998472 2.314904 1.795903 1.715978 1.935631 + SNP PASS 71333 71199 134 100632 84 29325 59 22 0.998121 0.998822 0.291408 0.998472 2.314904 1.795903 1.715978 1.935631 + +Benchmarking Summary for HG003: +Benchmarking Summary: +Type Filter TRUTH.TOTAL TRUTH.TP TRUTH.FN QUERY.TOTAL QUERY.FP QUERY.UNK FP.gt FP.al METRIC.Recall METRIC.Precision METRIC.Frac_NA METRIC.F1_Score TRUTH.TOTAL.TiTv_ratio QUERY.TOTAL.TiTv_ratio TRUTH.TOTAL.het_hom_ratio QUERY.TOTAL.het_hom_ratio +INDEL ALL 10628 9044 1584 20580 1348 9720 571 506 0.850960 0.875875 0.472303 0.863238 NaN NaN 1.748961 2.292998 +INDEL PASS 10628 9044 1584 20580 1348 9720 571 506 0.850960 0.875875 0.472303 0.863238 NaN NaN 1.748961 2.292998 + SNP ALL 70166 70043 123 99648 138 29480 53 32 0.998247 0.998033 0.295841 0.998140 2.296566 1.833087 1.883951 1.994315 + SNP PASS 70166 70043 123 99648 138 29480 53 32 0.998247 0.998033 0.295841 0.998140 2.296566 1.833087 1.883951 1.994315 + +Benchmarking Summary for HG004: +Type Filter TRUTH.TOTAL TRUTH.TP TRUTH.FN QUERY.TOTAL QUERY.FP QUERY.UNK FP.gt FP.al METRIC.Recall METRIC.Precision METRIC.Frac_NA METRIC.F1_Score TRUTH.TOTAL.TiTv_ratio QUERY.TOTAL.TiTv_ratio TRUTH.TOTAL.het_hom_ratio QUERY.TOTAL.het_hom_ratio +INDEL ALL 11000 9480 1520 20898 1271 9674 511 526 0.861818 0.886761 0.462915 0.874111 NaN NaN 1.792709 2.374125 +INDEL PASS 11000 9480 1520 20898 1271 9674 511 526 0.861818 0.886761 0.462915 0.874111 NaN NaN 1.792709 2.374125 + SNP ALL 71659 71563 96 101494 74 29834 31 40 0.998660 0.998967 0.293948 0.998814 2.310073 1.846227 1.878340 1.838233 + SNP PASS 71659 71563 96 101494 74 29834 31 40 0.998660 0.998967 0.293948 0.998814 2.310073 1.846227 1.878340 1.838233 +``` diff --git a/scripts/run_deeptrio.py b/scripts/run_deeptrio.py index ea48f02c..5ca75f6a 100644 --- a/scripts/run_deeptrio.py +++ b/scripts/run_deeptrio.py @@ -536,7 +536,7 @@ def _make_examples_command( special_args['vsc_min_fraction_indels'] = 0.12 special_args['alt_aligned_pileup'] = 'diff_channels' special_args['add_hp_channel'] = True - special_args['min_mapping_quality'] = 1 + special_args['min_mapping_quality'] = 5 special_args['track_ref_reads'] = True special_args['pileup_image_height_child'] = ( DEEP_TRIO_ONT_PILEUP_HEIGHT_CHILD @@ -551,6 +551,7 @@ def _make_examples_command( special_args['parse_sam_aux_fields'] = True special_args['discard_non_dna_regions'] = True special_args['max_reads_for_dynamic_bases_per_region'] = 200 + special_args['max_reads_per_partition'] = 500 kwargs = _update_kwargs_with_warning(kwargs, special_args) if _MODEL_TYPE.value == 'WES': @@ -643,6 +644,7 @@ def postprocess_variants_command( command.extend(['--ref', '"{}"'.format(ref)]) command.extend(['--infile', '"{}"'.format(infile)]) command.extend(['--outfile', '"{}"'.format(outfile)]) + command.extend(['--cpus 0']) if nonvariant_site_tfrecord_path is not None: command.extend([ '--nonvariant_site_tfrecord_path', @@ -866,9 +868,11 @@ def create_all_commands(intermediate_results_dir): # helps to better distribute the work between shards. candidate_partition_modes = [None] - # In DeepTrio PacBio mode, we always enable use_candidate_partition because - # otherwise it can run out of memory. - if _USE_CANDIDATE_PARTITION.value or _MODEL_TYPE.value == 'PACBIO': + # In DeepTrio PacBio and ONT mode, we always enable use_candidate_partition + # because otherwise it can run out of memory. + use_candidate_partition = False + if _USE_CANDIDATE_PARTITION.value or _MODEL_TYPE.value in ['PACBIO', 'ONT']: + use_candidate_partition = True candidate_partition_modes = [ CandidatePartitionCommand.SWEEP, CandidatePartitionCommand.CANDIDATE_PARTITION_INFERENCE, @@ -891,7 +895,7 @@ def create_all_commands(intermediate_results_dir): intermediate_results_dir, _NUM_SHARDS.value ), candidate_partition_mode=candidate_partition_mode - if _USE_CANDIDATE_PARTITION.value + if use_candidate_partition else None, # kwargs: gvcf=nonvariant_site_tfrecord_path, diff --git a/scripts/run_deeptrio_test.py b/scripts/run_deeptrio_test.py index 377e551f..7a5f5499 100644 --- a/scripts/run_deeptrio_test.py +++ b/scripts/run_deeptrio_test.py @@ -128,6 +128,7 @@ def test_call_variants_postprocess_variants_commands(self, model_type): '--infile ' '"/tmp/deeptrio_tmp_output/call_variants_output_child.tfrecord.gz" ' '--outfile "your_vcf_child" ' + '--cpus 0 ' '--nonvariant_site_tfrecord_path ' '"/tmp/deeptrio_tmp_output/gvcf_child.tfrecord@64.gz" ' '--gvcf_outfile "your_gvcf_child"' @@ -139,7 +140,8 @@ def test_call_variants_postprocess_variants_commands(self, model_type): 'time /opt/deepvariant/bin/postprocess_variants --ref "your_ref"' ' --infile' ' "/tmp/deeptrio_tmp_output/call_variants_output_parent1.tfrecord.gz"' - ' --outfile "your_vcf_parent1" --nonvariant_site_tfrecord_path' + ' --outfile "your_vcf_parent1" --cpus 0' + ' --nonvariant_site_tfrecord_path' ' "/tmp/deeptrio_tmp_output/gvcf_parent1.tfrecord@64.gz"' ' --gvcf_outfile "your_gvcf_parent1"' ), @@ -150,7 +152,8 @@ def test_call_variants_postprocess_variants_commands(self, model_type): 'time /opt/deepvariant/bin/postprocess_variants --ref "your_ref"' ' --infile' ' "/tmp/deeptrio_tmp_output/call_variants_output_parent2.tfrecord.gz"' - ' --outfile "your_vcf_parent2" --nonvariant_site_tfrecord_path' + ' --outfile "your_vcf_parent2" --cpus 0' + ' --nonvariant_site_tfrecord_path' ' "/tmp/deeptrio_tmp_output/gvcf_parent2.tfrecord@64.gz"' ' --gvcf_outfile "your_gvcf_parent2"' ), @@ -227,6 +230,7 @@ def test_duo_call_variants_postprocess_variants_commands( '--infile ' '"/tmp/deeptrio_tmp_output/call_variants_output_child.tfrecord.gz" ' '--outfile "your_vcf_child" ' + '--cpus 0 ' '--nonvariant_site_tfrecord_path ' '"/tmp/deeptrio_tmp_output/gvcf_child.tfrecord@64.gz" ' '--gvcf_outfile "your_gvcf_child"' @@ -238,7 +242,8 @@ def test_duo_call_variants_postprocess_variants_commands( 'time /opt/deepvariant/bin/postprocess_variants --ref "your_ref"' ' --infile' ' "/tmp/deeptrio_tmp_output/call_variants_output_parent1.tfrecord.gz"' - ' --outfile "your_vcf_parent1" --nonvariant_site_tfrecord_path' + ' --outfile "your_vcf_parent1" --cpus 0' + ' --nonvariant_site_tfrecord_path' ' "/tmp/deeptrio_tmp_output/gvcf_parent1.tfrecord@64.gz"' ' --gvcf_outfile "your_gvcf_parent1"' ), @@ -271,21 +276,8 @@ def test_duo_call_variants_postprocess_variants_commands( + '--pileup_image_height_parent "100" ', ), ( - 'PACBIO', - False, - '--add_hp_channel --alt_aligned_pileup "diff_channels" ' - + '--discard_non_dna_regions ' - + '--gvcf "/tmp/deeptrio_tmp_output/gvcf.tfrecord@64.gz" ' - + '--max_reads_for_dynamic_bases_per_region "200" ' - + '--min_mapping_quality "1" ' - + '--parse_sam_aux_fields --partition_size "25000" --phase_reads ' - + '--pileup_image_height_child "60" ' - + '--pileup_image_height_parent "40" --pileup_image_width "199" ' - + '--norealign_reads --sort_by_haplotypes ' - + '--track_ref_reads ' - + '--vsc_min_fraction_indels "0.12" ', - ), - ( + # For pacbio candidate paritionning is turned on by default. + # Currently, there is no way to disable it. 'PACBIO', True, '--add_hp_channel --alt_aligned_pileup "diff_channels" ' @@ -343,6 +335,8 @@ def test_make_examples_commands_with_types( FLAGS.output_gvcf_parent1 = 'your_gvcf_parent1' FLAGS.output_gvcf_parent2 = 'your_gvcf_parent2' FLAGS.num_shards = 64 + if model_type == 'PACBIO': + use_candidate_partition = True make_examples_command_index = 1 if use_candidate_partition else 0 commands, _, _ = self._create_all_commands_and_check_stdout() self.assertEqual( @@ -463,6 +457,8 @@ def test_make_examples_commands_with_candidate_partition( ( 'PACBIO', '--add_hp_channel --alt_aligned_pileup "diff_channels" ' + + '--candidate_positions ' + + '"/tmp/deeptrio_tmp_output/candidate_positions@64" ' + '--discard_non_dna_regions ' + '--gvcf "/tmp/deeptrio_tmp_output/gvcf.tfrecord@64.gz" ' + '--max_reads_for_dynamic_bases_per_region "200" ' @@ -491,8 +487,12 @@ def test_duo_make_examples_commands_with_types( FLAGS.output_gvcf_parent1 = 'your_gvcf_parent1' FLAGS.num_shards = 64 commands, _, _ = self._create_all_commands_and_check_stdout() + use_candidate_partition = False + if model_type == 'PACBIO': + use_candidate_partition = True + make_examples_command_index = 1 if use_candidate_partition else 0 self.assertEqual( - commands[0], + commands[make_examples_command_index], 'time seq 0 63 ' '| parallel -q --halt 2 --line-buffer ' '/opt/deepvariant/bin/deeptrio/make_examples ' @@ -511,44 +511,36 @@ def test_duo_make_examples_commands_with_types( ( None, ( - '--add_hp_channel ' - '--alt_aligned_pileup "diff_channels" ' - '--discard_non_dna_regions ' - '--gvcf "/tmp/deeptrio_tmp_output/gvcf.tfrecord@64.gz" ' - '--max_reads_for_dynamic_bases_per_region "200" ' - '--min_mapping_quality "1" ' - '--parse_sam_aux_fields ' - '--partition_size "25000" ' - '--phase_reads ' - '--pileup_image_height_child "60" ' - '--pileup_image_height_parent "40" ' - '--pileup_image_width "199" ' - '--norealign_reads ' - '--sort_by_haplotypes ' - '--track_ref_reads ' - '--vsc_min_fraction_indels "0.12" ' + '--add_hp_channel --alt_aligned_pileup "diff_channels"' + ' --candidate_positions' + ' "/tmp/deeptrio_tmp_output/candidate_positions@64"' + ' --discard_non_dna_regions --gvcf' + ' "/tmp/deeptrio_tmp_output/gvcf.tfrecord@64.gz"' + ' --max_reads_for_dynamic_bases_per_region "200"' + ' --min_mapping_quality "1" --parse_sam_aux_fields' + ' --partition_size "10000" --phase_reads' + ' --pileup_image_height_child "60" --pileup_image_height_parent' + ' "40" --pileup_image_width "199" --norealign_reads' + ' --sort_by_haplotypes --track_ref_reads' + ' --vsc_min_fraction_indels "0.12" ' ), None, ), ( 'alt_aligned_pileup="rows",vsc_min_fraction_indels=0.03', ( - '--add_hp_channel ' - '--alt_aligned_pileup "rows" ' - '--discard_non_dna_regions ' - '--gvcf "/tmp/deeptrio_tmp_output/gvcf.tfrecord@64.gz" ' - '--max_reads_for_dynamic_bases_per_region "200" ' - '--min_mapping_quality "1" ' - '--parse_sam_aux_fields ' - '--partition_size "25000" ' - '--phase_reads ' - '--pileup_image_height_child "60" ' - '--pileup_image_height_parent "40" ' - '--pileup_image_width "199" ' - '--norealign_reads ' - '--sort_by_haplotypes ' - '--track_ref_reads ' - '--vsc_min_fraction_indels "0.03" ' + '--add_hp_channel --alt_aligned_pileup "rows"' + ' --candidate_positions' + ' "/tmp/deeptrio_tmp_output/candidate_positions@64"' + ' --discard_non_dna_regions --gvcf' + ' "/tmp/deeptrio_tmp_output/gvcf.tfrecord@64.gz"' + ' --max_reads_for_dynamic_bases_per_region "200"' + ' --min_mapping_quality "1" --parse_sam_aux_fields' + ' --partition_size "10000" --phase_reads' + ' --pileup_image_height_child "60" --pileup_image_height_parent' + ' "40" --pileup_image_width "199" --norealign_reads' + ' --sort_by_haplotypes --track_ref_reads' + ' --vsc_min_fraction_indels "0.03" ' ), # Because PacBio uses candidate_sweep, make_examples got run twice. ( @@ -589,7 +581,7 @@ def test_pacbio_args_overwrite( self.assertEqual( commands[0], 'time seq 0 63 | parallel -q --halt 2 --line-buffer ' - '/opt/deepvariant/bin/deeptrio/make_examples --mode calling ' + '/opt/deepvariant/bin/deeptrio/make_examples --mode candidate_sweep ' '--ref "your_ref" --reads_parent1 "your_bam_parent1" ' '--reads_parent2 "your_bam_parent2" ' '--reads "your_bam_child" ' @@ -751,6 +743,7 @@ def test_postprocess_variants_extra_args( '--infile ' '"/tmp/deeptrio_tmp_output/call_variants_output_child.tfrecord.gz" ' '--outfile "your_vcf_child" ' + '--cpus 0 ' '--nonvariant_site_tfrecord_path ' '"/tmp/deeptrio_tmp_output/gvcf_child.tfrecord@64.gz" ' '--gvcf_outfile "your_gvcf_child" '