From 3692aeb1de8ee607322cf1057729c04dc98bf1e8 Mon Sep 17 00:00:00 2001
From: John Huddleston <huddlej@gmail.com>
Date: Tue, 25 Jan 2022 11:22:44 -0800
Subject: [PATCH 1/2] Remove redundant include strains

Removes older references from the list of strains to force include in
analyses. We use the single Wuhan/Hu-1/2019 strain as the reference for
alignment and time tree rooting, so the other two strains are not
required. Importantly, these additional strains get used in proximity
calculations with priority-based subsampling, leading to the unexpected
inclusion of contextual strains that look like these redundant root
sequences. We try to avoid this problem in the proximity calculations by
defining a list of strains to ignore [1], but this list only includes
the strain used to root the time tree. Rather than updating this list to
include more strains, we can just remove the strains from the include
file.

[1] https://github.com/nextstrain/ncov/blob/51ac8143761057c14de3841e46b82e08f7fef4b6/workflow/snakemake_rules/main_workflow.smk#L372
---
 defaults/include.txt | 2 --
 1 file changed, 2 deletions(-)

diff --git a/defaults/include.txt b/defaults/include.txt
index 91f5039ad..da823688b 100644
--- a/defaults/include.txt
+++ b/defaults/include.txt
@@ -1,3 +1 @@
 Wuhan/Hu-1/2019
-Wuhan-Hu-1/2019
-Wuhan/WH01/2019

From 9edf570cf5d4d69a8a9b231726b12353b383eab6 Mon Sep 17 00:00:00 2001
From: John Huddleston <huddlej@gmail.com>
Date: Tue, 25 Jan 2022 11:33:40 -0800
Subject: [PATCH 2/2] Define list of strains to ignore for proximities

Defines a list of strains to ignore, including the two reference
sequences we use for GISAID and GenBank builds, and passes this list to
the `ignore_seqs` parameter of the proximity calculations instead of the
current build's root sequence alone.
---
 defaults/include.txt                       | 1 +
 defaults/parameters.yaml                   | 4 ++++
 workflow/snakemake_rules/main_workflow.smk | 2 +-
 3 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/defaults/include.txt b/defaults/include.txt
index da823688b..3d7a12350 100644
--- a/defaults/include.txt
+++ b/defaults/include.txt
@@ -1 +1,2 @@
 Wuhan/Hu-1/2019
+Wuhan-Hu-1/2019
diff --git a/defaults/parameters.yaml b/defaults/parameters.yaml
index 2369f9f50..188077039 100644
--- a/defaults/parameters.yaml
+++ b/defaults/parameters.yaml
@@ -89,8 +89,12 @@ filter:
 # will help reduce the number of genetically identical strains that get chosen,
 # and allows for more diversity represented on the tree.
 priorities:
+  ignore_sequences:
+    - Wuhan/Hu-1/2019
+    - Wuhan-Hu-1/2019
   crowding_penalty: 0.1
 
+
 # Alignment settings
 # Alignments are partitioned into smaller groups to speed up the overall alignment process.
 # The number of sequences per group determines the run time of a single alignment job.
diff --git a/workflow/snakemake_rules/main_workflow.smk b/workflow/snakemake_rules/main_workflow.smk
index ca546c63c..975de94c2 100644
--- a/workflow/snakemake_rules/main_workflow.smk
+++ b/workflow/snakemake_rules/main_workflow.smk
@@ -369,7 +369,7 @@ rule proximity_score:
         "benchmarks/proximity_score_{build_name}_{focus}.txt"
     params:
         chunk_size=10000,
-        ignore_seqs = config['refine']['root']
+        ignore_seqs = config['priorities']['ignore_sequences'],
     resources:
         # Memory scales at ~0.15 MB * chunk_size (e.g., 0.15 MB * 10000 = 1.5GB).
         mem_mb=4000