From 88c13975914ab1d7c4d9c3019e2795e8161d7272 Mon Sep 17 00:00:00 2001
From: Victor Lin <13424970+victorlin@users.noreply.github.com>
Date: Fri, 7 Jun 2024 10:19:52 -0700
Subject: [PATCH] =?UTF-8?q?fixup!=20=F0=9F=9A=A7=20Use=20population-based?=
 =?UTF-8?q?=20weighted=20sampling=20for=20Asia=20builds?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 nextstrain_profiles/nextstrain-gisaid/builds.yaml | 14 +++++++-------
 nextstrain_profiles/nextstrain-open/builds.yaml   | 14 +++++++-------
 workflow/snakemake_rules/main_workflow.smk        |  1 +
 3 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/nextstrain_profiles/nextstrain-gisaid/builds.yaml b/nextstrain_profiles/nextstrain-gisaid/builds.yaml
index b7b0e356b..d85147a5b 100644
--- a/nextstrain_profiles/nextstrain-gisaid/builds.yaml
+++ b/nextstrain_profiles/nextstrain-gisaid/builds.yaml
@@ -186,7 +186,7 @@ subsampling:
     # Early focal samples for Asia
     asia_early:
       group_by: "country year month"
-      group_by_weights: "data/country_population_weights.tsv"
+      group_by_weights: "defaults/population_weights.tsv"
       max_sequences: 700
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'region!=Asia'"
@@ -199,7 +199,7 @@ subsampling:
     # Recent focal samples for Asia
     asia_recent:
       group_by: "country year month"
-      group_by_weights: "data/country_population_weights.tsv"
+      group_by_weights: "defaults/population_weights.tsv"
       max_sequences: 2800
       min_date: "--min-date 1M"
       exclude: "--exclude-where 'region!=Asia'"
@@ -220,7 +220,7 @@ subsampling:
     # Early focal samples for Asia
     asia_early:
       group_by: "country year month"
-      group_by_weights: "data/country_population_weights.tsv"
+      group_by_weights: "defaults/population_weights.tsv"
       max_sequences: 700
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'region!=Asia'"
@@ -233,7 +233,7 @@ subsampling:
     # Recent focal samples for Asia
     asia_recent:
       group_by: "country year month"
-      group_by_weights: "data/country_population_weights.tsv"
+      group_by_weights: "defaults/population_weights.tsv"
       max_sequences: 2800
       min_date: "--min-date 2M"
       exclude: "--exclude-where 'region!=Asia'"
@@ -254,7 +254,7 @@ subsampling:
     # Early focal samples for Asia
     asia_early:
       group_by: "country year month"
-      group_by_weights: "data/country_population_weights.tsv"
+      group_by_weights: "defaults/population_weights.tsv"
       max_sequences: 700
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'region!=Asia'"
@@ -267,7 +267,7 @@ subsampling:
     # Recent focal samples for Asia
     asia_recent:
       group_by: "country year month"
-      group_by_weights: "data/country_population_weights.tsv"
+      group_by_weights: "defaults/population_weights.tsv"
       max_sequences: 2800
       min_date: "--min-date 6M"
       exclude: "--exclude-where 'region!=Asia'"
@@ -286,7 +286,7 @@ subsampling:
     # Focal samples for Asia
     asia:
       group_by: "country year month"
-      group_by_weights: "data/country_population_weights.tsv"
+      group_by_weights: "defaults/population_weights.tsv"
       max_sequences: 3500
       exclude: "--exclude-where 'region!=Asia'"
     # Contextual samples from the rest of the world
diff --git a/nextstrain_profiles/nextstrain-open/builds.yaml b/nextstrain_profiles/nextstrain-open/builds.yaml
index 1819933c2..753b301a8 100644
--- a/nextstrain_profiles/nextstrain-open/builds.yaml
+++ b/nextstrain_profiles/nextstrain-open/builds.yaml
@@ -186,7 +186,7 @@ subsampling:
     # Early focal samples for Asia
     asia_early:
       group_by: "country year month"
-      group_by_weights: "data/country_population_weights.tsv"
+      group_by_weights: "defaults/population_weights.tsv"
       max_sequences: 700
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'region!=Asia'"
@@ -199,7 +199,7 @@ subsampling:
     # Recent focal samples for Asia
     asia_recent:
       group_by: "country year month"
-      group_by_weights: "data/country_population_weights.tsv"
+      group_by_weights: "defaults/population_weights.tsv"
       max_sequences: 2800
       min_date: "--min-date 1M"
       exclude: "--exclude-where 'region!=Asia'"
@@ -220,7 +220,7 @@ subsampling:
     # Early focal samples for Asia
     asia_early:
       group_by: "country year month"
-      group_by_weights: "data/country_population_weights.tsv"
+      group_by_weights: "defaults/population_weights.tsv"
       max_sequences: 700
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'region!=Asia'"
@@ -233,7 +233,7 @@ subsampling:
     # Recent focal samples for Asia
     asia_recent:
       group_by: "country year month"
-      group_by_weights: "data/country_population_weights.tsv"
+      group_by_weights: "defaults/population_weights.tsv"
       max_sequences: 2800
       min_date: "--min-date 2M"
       exclude: "--exclude-where 'region!=Asia'"
@@ -254,7 +254,7 @@ subsampling:
     # Early focal samples for Asia
     asia_early:
       group_by: "country year month"
-      group_by_weights: "data/country_population_weights.tsv"
+      group_by_weights: "defaults/population_weights.tsv"
       max_sequences: 700
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'region!=Asia'"
@@ -267,7 +267,7 @@ subsampling:
     # Recent focal samples for Asia
     asia_recent:
       group_by: "country year month"
-      group_by_weights: "data/country_population_weights.tsv"
+      group_by_weights: "defaults/population_weights.tsv"
       max_sequences: 2800
       min_date: "--min-date 6M"
       exclude: "--exclude-where 'region!=Asia'"
@@ -286,7 +286,7 @@ subsampling:
     # Focal samples for Asia
     asia:
       group_by: "country year month"
-      group_by_weights: "data/country_population_weights.tsv"
+      group_by_weights: "defaults/population_weights.tsv"
       max_sequences: 3500
       exclude: "--exclude-where 'region!=Asia'"
     # Contextual samples from the rest of the world
diff --git a/workflow/snakemake_rules/main_workflow.smk b/workflow/snakemake_rules/main_workflow.smk
index 7ac099561..24c977517 100644
--- a/workflow/snakemake_rules/main_workflow.smk
+++ b/workflow/snakemake_rules/main_workflow.smk
@@ -300,6 +300,7 @@ rule subsample:
         "benchmarks/subsample_{build_name}_{subsample}.txt"
     params:
         group_by = _get_specific_subsampling_setting("group_by", optional=True),
+        # FIXME: should the weights file path come from config.files.population_weights instead of each subsampling scheme's group_by_weights setting?
         group_by_weights = _get_specific_subsampling_setting("group_by_weights", optional=True),
         sequences_per_group = _get_specific_subsampling_setting("seq_per_group", optional=True),
         subsample_max_sequences = _get_specific_subsampling_setting("max_sequences", optional=True),