From db6684c8291274a3c1c0e48a05166f834b168df3 Mon Sep 17 00:00:00 2001
From: Trevor Bedford <trevor@bedford.io>
Date: Tue, 23 Jul 2024 17:19:13 -0700
Subject: [PATCH] Only include more recent context sequences

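When subsampling in the Nextstrain GISAID profile, stop treating the
early contextual samples as everything from the origin of the pandemic
to the beginning of the focal window (e.g. for the 6m analysis, from
2020 to 6m ago). Instead, use a consistent 24m of additional context.
So, for 6m, context runs from 30m ago to 6m ago and focal runs from 6m
ago to present.

Additionally, reduce the number of contextual sequences included, from
a 4:1 ratio of focal to context to a 10:1 ratio of focal to context.
In builds.yaml this is the ratio the comments call "recent to early";
the separate 4:1 split between in-region samples and samples from the
rest of the world within each time window is unchanged.

Also move the `--min-date` argument of the Asia `china_recent` and
`india_recent` entries from the `max_date` key to the `min_date` key.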
---
 .../nextstrain-gisaid/builds.yaml             | 200 +++++++++++-------
 1 file changed, 127 insertions(+), 73 deletions(-)

diff --git a/nextstrain_profiles/nextstrain-gisaid/builds.yaml b/nextstrain_profiles/nextstrain-gisaid/builds.yaml
index 4034e4730..d56530772 100644
--- a/nextstrain_profiles/nextstrain-gisaid/builds.yaml
+++ b/nextstrain_profiles/nextstrain-gisaid/builds.yaml
@@ -165,19 +165,21 @@ subsampling:
   # Custom subsampling logic for regions over 1m
   # Grouping by division for North America and Oceania
   # 4000 total
-  # 4:1 ratio of recent to early
+  # 10:1 ratio of recent to early
   # 4:1 ratio of focal to context
   nextstrain_region_grouped_by_division_1m:
     # Early focal samples for region
     focal_early:
       group_by: "division year month"
-      max_sequences: 640
+      max_sequences: 256
+      min_date: "--min-date 25M"
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'region!={region}'"
     # Early contextual samples from the rest of the world
     context_early:
       group_by: "country year month"
-      max_sequences: 160
+      max_sequences: 64
+      min_date: "--min-date 25M"
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'region={region}'"
     # Recent focal samples for region
@@ -196,19 +198,21 @@ subsampling:
   # Custom subsampling logic for regions over 2m
   # Grouping by division for North America and Oceania
   # 4000 total
-  # 4:1 ratio of recent to early
+  # 10:1 ratio of recent to early
   # 4:1 ratio of focal to context
   nextstrain_region_grouped_by_division_2m:
     # Early focal samples for region
     focal_early:
       group_by: "division year month"
-      max_sequences: 640
+      max_sequences: 256
+      min_date: "--min-date 26M"
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'region!={region}'"
     # Early contextual samples from the rest of the world
     context_early:
       group_by: "country year month"
-      max_sequences: 160
+      max_sequences: 64
+      min_date: "--min-date 26M"
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'region={region}'"
     # Recent focal samples for region
@@ -227,19 +231,21 @@ subsampling:
   # Custom subsampling logic for regions over 6m
   # Grouping by division for North America and Oceania
   # 4000 total
-  # 4:1 ratio of recent to early
+  # 10:1 ratio of recent to early
   # 4:1 ratio of focal to context
   nextstrain_region_grouped_by_division_6m:
     # Early focal samples for region
     focal_early:
       group_by: "division year month"
-      max_sequences: 640
+      max_sequences: 256
+      min_date: "--min-date 30M"
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'region!={region}'"
     # Early contextual samples from the rest of the world
     context_early:
       group_by: "country year month"
-      max_sequences: 160
+      max_sequences: 64
+      min_date: "--min-date 30M"
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'region={region}'"
     # Recent focal samples for region
@@ -275,32 +281,36 @@ subsampling:
   # Grouping by division
   # Separating three buckets for China, India and elsewhere
   # 4375 total
-  # 4:1 ratio of recent to early
+  # 10:1 ratio of recent to early
   # 4:1 ratio of focal to context
   # 3:2:2 proportions of Asia, China, India
   nextstrain_region_asia_grouped_by_division_1m:
     # Early focal samples for Asia
     asia_early:
       group_by: "division year month"
-      max_sequences: 300
+      max_sequences: 120
+      min_date: "--min-date 25M"
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
     # Early focal samples for China
     china_early:
       group_by: "division year month"
-      max_sequences: 200
+      max_sequences: 80
+      min_date: "--min-date 25M"
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'country!=China'"
     # Early focal samples for India
     india_early:
       group_by: "division year month"
-      max_sequences: 200
+      max_sequences: 80
+      min_date: "--min-date 25M"
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'country!=India'"
     # Early contextual samples from the rest of the world
     context_early:
       group_by: "country year month"
-      max_sequences: 175
+      max_sequences: 70
+      min_date: "--min-date 25M"
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'region=Asia'"
     # Recent focal samples for Asia
@@ -313,13 +323,13 @@ subsampling:
     china_recent:
       group_by: "division week"
       max_sequences: 800
-      max_date: "--min-date 1M"
+      min_date: "--min-date 1M"
       exclude: "--exclude-where 'country!=China'"
     # Recent focal samples for India
     india_recent:
       group_by: "division week"
       max_sequences: 800
-      max_date: "--min-date 1M"
+      min_date: "--min-date 1M"
       exclude: "--exclude-where 'country!=India'"
     # Early contextual samples from the rest of the world
     context_recent:
@@ -332,32 +342,36 @@ subsampling:
   # Grouping by division
   # Separating three buckets for China, India and elsewhere
   # 4375 total
-  # 4:1 ratio of recent to early
+  # 10:1 ratio of recent to early
   # 4:1 ratio of focal to context
   # 3:2:2 proportions of Asia, China, India
   nextstrain_region_asia_grouped_by_division_2m:
     # Early focal samples for Asia
     asia_early:
       group_by: "division year month"
-      max_sequences: 300
+      max_sequences: 120
+      min_date: "--min-date 26M"
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
     # Early focal samples for China
     china_early:
       group_by: "division year month"
-      max_sequences: 200
+      max_sequences: 80
+      min_date: "--min-date 26M"
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'country!=China'"
     # Early focal samples for India
     india_early:
       group_by: "division year month"
-      max_sequences: 200
+      max_sequences: 80
+      min_date: "--min-date 26M"
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'country!=India'"
     # Early contextual samples from the rest of the world
     context_early:
       group_by: "country year month"
-      max_sequences: 175
+      max_sequences: 70
+      min_date: "--min-date 26M"
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'region=Asia'"
     # Recent focal samples for Asia
@@ -370,13 +384,13 @@ subsampling:
     china_recent:
       group_by: "division week"
       max_sequences: 800
-      max_date: "--min-date 2M"
+      min_date: "--min-date 2M"
       exclude: "--exclude-where 'country!=China'"
     # Recent focal samples for India
     india_recent:
       group_by: "division week"
       max_sequences: 800
-      max_date: "--min-date 2M"
+      min_date: "--min-date 2M"
       exclude: "--exclude-where 'country!=India'"
     # Early contextual samples from the rest of the world
     context_recent:
@@ -389,32 +403,36 @@ subsampling:
   # Grouping by division
   # Separating three buckets for China, India and elsewhere
   # 4375 total
-  # 4:1 ratio of recent to early
+  # 10:1 ratio of recent to early
   # 4:1 ratio of focal to context
   # 3:2:2 proportions of Asia, China, India
   nextstrain_region_asia_grouped_by_division_6m:
     # Early focal samples for Asia
     asia_early:
       group_by: "division year month"
-      max_sequences: 300
+      max_sequences: 120
+      min_date: "--min-date 30M"
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
     # Early focal samples for China
     china_early:
       group_by: "division year month"
-      max_sequences: 200
+      max_sequences: 80
+      min_date: "--min-date 30M"
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'country!=China'"
     # Early focal samples for India
     india_early:
       group_by: "division year month"
-      max_sequences: 200
+      max_sequences: 80
+      min_date: "--min-date 30M"
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'country!=India'"
     # Early contextual samples from the rest of the world
     context_early:
       group_by: "country year month"
-      max_sequences: 175
+      max_sequences: 70
+      min_date: "--min-date 30M"
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'region=Asia'"
     # Recent focal samples for Asia
@@ -427,13 +445,13 @@ subsampling:
     china_recent:
       group_by: "division year month"
       max_sequences: 800
-      max_date: "--min-date 6M"
+      min_date: "--min-date 6M"
       exclude: "--exclude-where 'country!=China'"
     # Recent focal samples for India
     india_recent:
       group_by: "division year month"
       max_sequences: 800
-      max_date: "--min-date 6M"
+      min_date: "--min-date 6M"
       exclude: "--exclude-where 'country!=India'"
     # Early contextual samples from the rest of the world
     context_recent:
@@ -473,19 +491,21 @@ subsampling:
   # Custom subsampling logic for regions over 1m
   # Grouping by country for Africa, Asia, Europe and South America
   # 4000 total
-  # 4:1 ratio of recent to early
+  # 10:1 ratio of recent to early
   # 4:1 ratio of focal to context
   nextstrain_region_grouped_by_country_1m:
     # Early focal samples for region
     focal_early:
       group_by: "country year month"
-      max_sequences: 640
+      max_sequences: 256
+      min_date: "--min-date 25M"
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'region!={region}'"
     # Early contextual samples from the rest of the world
     context_early:
       group_by: "country year month"
-      max_sequences: 160
+      max_sequences: 64
+      min_date: "--min-date 25M"
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'region={region}'"
     # Recent focal samples for region
@@ -504,19 +524,21 @@ subsampling:
   # Custom subsampling logic for regions over 2m
   # Grouping by country for Africa, Asia, Europe and South America
   # 4000 total
-  # 4:1 ratio of recent to early
+  # 10:1 ratio of recent to early
   # 4:1 ratio of focal to context
   nextstrain_region_grouped_by_country_2m:
     # Early focal samples for region
     focal_early:
       group_by: "country year month"
-      max_sequences: 640
+      max_sequences: 256
+      min_date: "--min-date 26M"
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'region!={region}'"
     # Early contextual samples from the rest of the world
     context_early:
       group_by: "country year month"
-      max_sequences: 160
+      max_sequences: 64
+      min_date: "--min-date 26M"
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'region={region}'"
     # Recent focal samples for region
@@ -535,19 +557,21 @@ subsampling:
   # Custom subsampling logic for regions over 6m
   # Grouping by country for Africa, Asia, Europe and South America
   # 4000 total
-  # 4:1 ratio of recent to early
+  # 10:1 ratio of recent to early
   # 4:1 ratio of focal to context
   nextstrain_region_grouped_by_country_6m:
     # Early focal samples for region
     focal_early:
       group_by: "country year month"
-      max_sequences: 640
+      max_sequences: 256
+      min_date: "--min-date 30M"
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'region!={region}'"
     # Early contextual samples from the rest of the world
     context_early:
       group_by: "country year month"
-      max_sequences: 160
+      max_sequences: 64
+      min_date: "--min-date 30M"
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'region={region}'"
     # Recent focal samples for region
@@ -580,48 +604,58 @@ subsampling:
       exclude: "--exclude-where 'region={region}'"
 
   # Custom subsampling logic for global region over 1m
-  # 5125 total (expect ~3400)
-  # 4:1 ratio of recent to early
-  # all eight regions equal except Oceania at 20%
+  # ~4500 total (expect ~3400)
+  # 10:1 ratio of recent to early
+  # recent is 1m ago to present, n = 4120
+  # early is 25m ago to 1m ago, n = 412
+  # regions are proportional to population size
   nextstrain_global_1m:
     africa_early:
       group_by: "country year month"
-      max_sequences: 150
+      max_sequences: 60
+      min_date: "--min-date 25M"
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'region!=Africa'"
     asia_early:
       group_by: "country year month"
-      max_sequences: 200
+      max_sequences: 80
+      min_date: "--min-date 25M"
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
     china_early:
       group_by: "division year month"
-      max_sequences: 175
+      max_sequences: 70
+      min_date: "--min-date 25M"
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'country!=China'"
     europe_early:
       group_by: "country year month"
-      max_sequences: 125
+      max_sequences: 50
+      min_date: "--min-date 25M"
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'region!=Europe'"
     india_early:
       group_by: "division year month"
-      max_sequences: 175
+      max_sequences: 70
+      min_date: "--min-date 25M"
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'country!=India'"
     north_america_early:
       group_by: "division year month"
-      max_sequences: 100
+      max_sequences: 40
+      min_date: "--min-date 25M"
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'region!=North America'"
     south_america_early:
       group_by: "country year month"
-      max_sequences: 90
+      max_sequences: 36
+      min_date: "--min-date 25M"
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'region!=South America'"
     oceania_early:
       group_by: "division year month"
-      max_sequences: 15
+      max_sequences: 6
+      min_date: "--min-date 25M"
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'region!=Oceania'"
     africa_recent:
@@ -666,48 +700,58 @@ subsampling:
       exclude: "--exclude-where 'region!=Oceania'"
 
   # Custom subsampling logic for global region over 2m
-  # 5125 total (expect ~3400)
-  # 4:1 ratio of recent to early
-  # all eight regions equal except Oceania at 20%
+  # ~4500 total (expect ~3400)
+  # 10:1 ratio of recent to early
+  # recent is 2m ago to present, n = 4120
+  # early is 26m ago to 2m ago, n = 412
+  # regions are proportional to population size
   nextstrain_global_2m:
     africa_early:
       group_by: "country year month"
-      max_sequences: 150
+      max_sequences: 60
+      min_date: "--min-date 26M"
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'region!=Africa'"
     asia_early:
       group_by: "country year month"
-      max_sequences: 200
+      max_sequences: 80
+      min_date: "--min-date 26M"
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
     china_early:
       group_by: "division year month"
-      max_sequences: 175
+      max_sequences: 70
+      min_date: "--min-date 26M"
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'country!=China'"
     europe_early:
       group_by: "country year month"
-      max_sequences: 125
+      max_sequences: 50
+      min_date: "--min-date 26M"
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'region!=Europe'"
     india_early:
       group_by: "division year month"
-      max_sequences: 175
+      max_sequences: 70
+      min_date: "--min-date 26M"
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'country!=India'"
     north_america_early:
       group_by: "division year month"
-      max_sequences: 100
+      max_sequences: 40
+      min_date: "--min-date 26M"
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'region!=North America'"
     south_america_early:
       group_by: "country year month"
-      max_sequences: 90
+      max_sequences: 36
+      min_date: "--min-date 26M"
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'region!=South America'"
     oceania_early:
       group_by: "division year month"
-      max_sequences: 15
+      max_sequences: 6
+      min_date: "--min-date 26M"
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'region!=Oceania'"
     africa_recent:
@@ -752,48 +796,58 @@ subsampling:
       exclude: "--exclude-where 'region!=Oceania'"
 
   # Custom subsampling logic for global region over 6m
-  # 5125 total (expect ~3400)
-  # 4:1 ratio of recent to early
-  # all eight regions equal except Oceania at 20%
+  # ~4500 total (expect ~3400)
+  # 10:1 ratio of recent to early
+  # recent is 6m ago to present, n = 4120
+  # early is 30m ago to 6m ago, n = 412
+  # regions are proportional to population size
   nextstrain_global_6m:
     africa_early:
       group_by: "country year month"
-      max_sequences: 150
+      max_sequences: 60
+      min_date: "--min-date 30M"
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'region!=Africa'"
     asia_early:
       group_by: "country year month"
-      max_sequences: 200
+      max_sequences: 80
+      min_date: "--min-date 30M"
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
     china_early:
       group_by: "division year month"
-      max_sequences: 175
+      max_sequences: 70
+      min_date: "--min-date 30M"
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'country!=China'"
     europe_early:
       group_by: "country year month"
-      max_sequences: 125
+      max_sequences: 50
+      min_date: "--min-date 30M"
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'region!=Europe'"
     india_early:
       group_by: "division year month"
-      max_sequences: 175
+      max_sequences: 70
+      min_date: "--min-date 30M"
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'country!=India'"
     north_america_early:
       group_by: "division year month"
-      max_sequences: 100
+      max_sequences: 40
+      min_date: "--min-date 30M"
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'region!=North America'"
     south_america_early:
       group_by: "country year month"
-      max_sequences: 90
+      max_sequences: 36
+      min_date: "--min-date 30M"
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'region!=South America'"
     oceania_early:
       group_by: "division year month"
-      max_sequences: 15
+      max_sequences: 6
+      min_date: "--min-date 30M"
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'region!=Oceania'"
     africa_recent:
@@ -839,7 +893,7 @@ subsampling:
 
   # Custom subsampling logic for global region over all-time
   # 4320 total (expect ~3200)
-  # all eight regions equal except Oceania at 20%
+  # regions are proportional to population size
   nextstrain_global_all_time:
     africa:
       group_by: "country year month"