From 88c13975914ab1d7c4d9c3019e2795e8161d7272 Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Fri, 7 Jun 2024 10:19:52 -0700 Subject: [PATCH] =?UTF-8?q?fixup!=20=F0=9F=9A=A7=20Use=20population-based?= =?UTF-8?q?=20weighted=20sampling=20for=20Asia=20builds?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- nextstrain_profiles/nextstrain-gisaid/builds.yaml | 14 +++++++------- nextstrain_profiles/nextstrain-open/builds.yaml | 14 +++++++------- workflow/snakemake_rules/main_workflow.smk | 1 + 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/nextstrain_profiles/nextstrain-gisaid/builds.yaml b/nextstrain_profiles/nextstrain-gisaid/builds.yaml index b7b0e356b..d85147a5b 100644 --- a/nextstrain_profiles/nextstrain-gisaid/builds.yaml +++ b/nextstrain_profiles/nextstrain-gisaid/builds.yaml @@ -186,7 +186,7 @@ subsampling: # Early focal samples for Asia asia_early: group_by: "country year month" - group_by_weights: "data/country_population_weights.tsv" + group_by_weights: "defaults/population_weights.tsv" max_sequences: 700 max_date: "--max-date 1M" exclude: "--exclude-where 'region!=Asia'" @@ -199,7 +199,7 @@ subsampling: # Recent focal samples for Asia asia_recent: group_by: "country year month" - group_by_weights: "data/country_population_weights.tsv" + group_by_weights: "defaults/population_weights.tsv" max_sequences: 2800 min_date: "--min-date 1M" exclude: "--exclude-where 'region!=Asia'" @@ -220,7 +220,7 @@ subsampling: # Early focal samples for Asia asia_early: group_by: "country year month" - group_by_weights: "data/country_population_weights.tsv" + group_by_weights: "defaults/population_weights.tsv" max_sequences: 700 max_date: "--max-date 2M" exclude: "--exclude-where 'region!=Asia'" @@ -233,7 +233,7 @@ subsampling: # Recent focal samples for Asia asia_recent: group_by: "country year month" - group_by_weights: "data/country_population_weights.tsv" + group_by_weights: "defaults/population_weights.tsv" max_sequences: 2800 min_date: "--min-date 2M" exclude: "--exclude-where 'region!=Asia'" @@ -254,7 +254,7 @@ subsampling: # Early focal samples for Asia asia_early: group_by: "country year month" - group_by_weights: "data/country_population_weights.tsv" + group_by_weights: "defaults/population_weights.tsv" max_sequences: 700 max_date: "--max-date 6M" exclude: "--exclude-where 'region!=Asia'" @@ -267,7 +267,7 @@ subsampling: # Recent focal samples for Asia asia_recent: group_by: "country year month" - group_by_weights: "data/country_population_weights.tsv" + group_by_weights: "defaults/population_weights.tsv" max_sequences: 2800 min_date: "--min-date 6M" exclude: "--exclude-where 'region!=Asia'" @@ -286,7 +286,7 @@ subsampling: # Focal samples for Asia asia: group_by: "country year month" - group_by_weights: "data/country_population_weights.tsv" + group_by_weights: "defaults/population_weights.tsv" max_sequences: 3500 exclude: "--exclude-where 'region!=Asia'" # Contextual samples from the rest of the world diff --git a/nextstrain_profiles/nextstrain-open/builds.yaml b/nextstrain_profiles/nextstrain-open/builds.yaml index 1819933c2..753b301a8 100644 --- a/nextstrain_profiles/nextstrain-open/builds.yaml +++ b/nextstrain_profiles/nextstrain-open/builds.yaml @@ -186,7 +186,7 @@ subsampling: # Early focal samples for Asia asia_early: group_by: "country year month" - group_by_weights: "data/country_population_weights.tsv" + group_by_weights: "defaults/population_weights.tsv" max_sequences: 700 max_date: "--max-date 1M" exclude: "--exclude-where 'region!=Asia'" @@ -199,7 +199,7 @@ subsampling: # Recent focal samples for Asia asia_recent: group_by: "country year month" - group_by_weights: "data/country_population_weights.tsv" + group_by_weights: "defaults/population_weights.tsv" max_sequences: 2800 min_date: "--min-date 1M" exclude: "--exclude-where 'region!=Asia'" @@ -220,7 +220,7 @@ subsampling: # Early focal samples for Asia asia_early: group_by: "country year month" - group_by_weights: "data/country_population_weights.tsv" + group_by_weights: "defaults/population_weights.tsv" max_sequences: 700 max_date: "--max-date 2M" exclude: "--exclude-where 'region!=Asia'" @@ -233,7 +233,7 @@ subsampling: # Recent focal samples for Asia asia_recent: group_by: "country year month" - group_by_weights: "data/country_population_weights.tsv" + group_by_weights: "defaults/population_weights.tsv" max_sequences: 2800 min_date: "--min-date 2M" exclude: "--exclude-where 'region!=Asia'" @@ -254,7 +254,7 @@ subsampling: # Early focal samples for Asia asia_early: group_by: "country year month" - group_by_weights: "data/country_population_weights.tsv" + group_by_weights: "defaults/population_weights.tsv" max_sequences: 700 max_date: "--max-date 6M" exclude: "--exclude-where 'region!=Asia'" @@ -267,7 +267,7 @@ subsampling: # Recent focal samples for Asia asia_recent: group_by: "country year month" - group_by_weights: "data/country_population_weights.tsv" + group_by_weights: "defaults/population_weights.tsv" max_sequences: 2800 min_date: "--min-date 6M" exclude: "--exclude-where 'region!=Asia'" @@ -286,7 +286,7 @@ subsampling: # Focal samples for Asia asia: group_by: "country year month" - group_by_weights: "data/country_population_weights.tsv" + group_by_weights: "defaults/population_weights.tsv" max_sequences: 3500 exclude: "--exclude-where 'region!=Asia'" # Contextual samples from the rest of the world diff --git a/workflow/snakemake_rules/main_workflow.smk b/workflow/snakemake_rules/main_workflow.smk index 7ac099561..24c977517 100644 --- a/workflow/snakemake_rules/main_workflow.smk +++ b/workflow/snakemake_rules/main_workflow.smk @@ -300,6 +300,7 @@ rule subsample: "benchmarks/subsample_{build_name}_{subsample}.txt" params: group_by = _get_specific_subsampling_setting("group_by", optional=True), + # FIXME: pull this from config.files.population_weights? group_by_weights = _get_specific_subsampling_setting("group_by_weights", optional=True), sequences_per_group = _get_specific_subsampling_setting("seq_per_group", optional=True), subsample_max_sequences = _get_specific_subsampling_setting("max_sequences", optional=True),