From 3692aeb1de8ee607322cf1057729c04dc98bf1e8 Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Tue, 25 Jan 2022 11:22:44 -0800 Subject: [PATCH 1/2] Remove redundant include strains Removes older references from the list of strains to force include in analyses. We use the single Wuhan/Hu-1/2019 strain as the reference for alignment and time tree rooting, so the other two strains are not required. Importantly, these additional strains get used in proximity calculations with priority-based subsampling, leading to the unexpected inclusion of contextual strains that look like these redundant root sequences. We try to avoid this problem in the proximity calculations by defining a list of strains to ignore [1], but this list only includes the strain used to root the time tree. Rather than updating this list to include more strains, we can just remove the strains from the include file. [1] https://github.com/nextstrain/ncov/blob/51ac8143761057c14de3841e46b82e08f7fef4b6/workflow/snakemake_rules/main_workflow.smk#L372 --- defaults/include.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/defaults/include.txt b/defaults/include.txt index 91f5039ad..da823688b 100644 --- a/defaults/include.txt +++ b/defaults/include.txt @@ -1,3 +1 @@ Wuhan/Hu-1/2019 -Wuhan-Hu-1/2019 -Wuhan/WH01/2019 From 9edf570cf5d4d69a8a9b231726b12353b383eab6 Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Tue, 25 Jan 2022 11:33:40 -0800 Subject: [PATCH 2/2] Define list of strains to ignore for proximities Defines a list of strains to ignore, including the two reference sequences we use for GISAID and GenBank builds, and passes this list to the `ignore_seqs` parameter of the proximity calculations instead of the current build's root sequence alone. 
--- defaults/include.txt | 1 + defaults/parameters.yaml | 4 ++++ workflow/snakemake_rules/main_workflow.smk | 2 +- 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/defaults/include.txt b/defaults/include.txt index da823688b..3d7a12350 100644 --- a/defaults/include.txt +++ b/defaults/include.txt @@ -1 +1,2 @@ Wuhan/Hu-1/2019 +Wuhan-Hu-1/2019 diff --git a/defaults/parameters.yaml b/defaults/parameters.yaml index 2369f9f50..188077039 100644 --- a/defaults/parameters.yaml +++ b/defaults/parameters.yaml @@ -89,8 +89,12 @@ filter: # will help reduce the number of genetically identical strains that get chosen, # and allows for more diversity represented on the tree. priorities: + ignore_sequences: + - Wuhan/Hu-1/2019 + - Wuhan-Hu-1/2019 crowding_penalty: 0.1 + # Alignment settings # Alignments are partitioned into smaller groups to speed up the overall alignment process. # The number of sequences per group determines the run time of a single alignment job. diff --git a/workflow/snakemake_rules/main_workflow.smk b/workflow/snakemake_rules/main_workflow.smk index ca546c63c..975de94c2 100644 --- a/workflow/snakemake_rules/main_workflow.smk +++ b/workflow/snakemake_rules/main_workflow.smk @@ -369,7 +369,7 @@ rule proximity_score: "benchmarks/proximity_score_{build_name}_{focus}.txt" params: chunk_size=10000, - ignore_seqs = config['refine']['root'] + ignore_seqs = config['priorities']['ignore_sequences'], resources: # Memory scales at ~0.15 MB * chunk_size (e.g., 0.15 MB * 10000 = 1.5GB). mem_mb=4000