From 831014c481d51eea7c04007b4140ec833f12c18c Mon Sep 17 00:00:00 2001
From: Jover Lee
Date: Fri, 7 Jun 2024 10:34:09 -0700
Subject: [PATCH 1/2] ingest: Add Snakemake rules for case counts

The case counts scripts used csvtk before csvtk was officially added to
nextstrain/docker-base, so I worked around this by just running them
directly in the GH Action workflow.

Our push to use short-lived AWS credentials has finally pushed me to put
this into a proper Snakemake workflow.
---
 ingest/Snakefile             |  6 ++++++
 ingest/rules/case_counts.smk | 31 +++++++++++++++++++++++++++++++
 2 files changed, 37 insertions(+)
 create mode 100644 ingest/rules/case_counts.smk

diff --git a/ingest/Snakefile b/ingest/Snakefile
index e5134ff..16e2cf1 100644
--- a/ingest/Snakefile
+++ b/ingest/Snakefile
@@ -22,4 +22,10 @@ rule upload_all_sequence_counts:
         )
 
 
+rule upload_all_case_counts:
+    input:
+        "results/upload_global_case_counts.done"
+
+
 include: "rules/sequence_counts.smk"
+include: "rules/case_counts.smk"
diff --git a/ingest/rules/case_counts.smk b/ingest/rules/case_counts.smk
new file mode 100644
index 0000000..f0cc012
--- /dev/null
+++ b/ingest/rules/case_counts.smk
@@ -0,0 +1,31 @@
+"""
+This part of the workflow summarizes SARS-CoV-2 case counts from public
+external data sources (e.g. Our World in Data) and uploads them to AWS S3 for
+downstream use by the modeling workflow.
+"""
+
+
+rule fetch_global_case_counts:
+    output:
+        global_case_counts = "data/global_case_counts.tsv",
+    shell:
+        """
+        ./bin/fetch-ncov-global-case-counts > {output.global_case_counts}
+        """
+
+
+rule upload_global_case_counts:
+    input:
+        global_case_counts = "data/global_case_counts.tsv",
+    output:
+        upload_flag = "results/upload_global_case_counts.done",
+    params:
+        s3_dst = config["s3_dst"],
+        cloudfront_domain = config["cloudfront_domain"],
+    shell:
+        """
+        ./vendored/upload-to-s3 \
+            {input.global_case_counts} \
+            {params.s3_dst}/cases/global.tsv.gz \
+            {params.cloudfront_domain:q} 2>&1 | tee {output.upload_flag}
+        """

From e9c5772b5a086e8c85689babeda3a269fe78b17b Mon Sep 17 00:00:00 2001
From: Jover Lee
Date: Fri, 7 Jun 2024 10:44:02 -0700
Subject: [PATCH 2/2] Refactor update-ncov-case-counts.yaml

Refactored to use the shared `pathogen-repo-build` GH Action workflow so
that it can use the short-lived AWS credentials that are automatically
set up within the workflow.
---
 .../workflows/update-ncov-case-counts.yaml | 64 ++++++++-----------
 1 file changed, 27 insertions(+), 37 deletions(-)

diff --git a/.github/workflows/update-ncov-case-counts.yaml b/.github/workflows/update-ncov-case-counts.yaml
index 5460458..0c559d6 100644
--- a/.github/workflows/update-ncov-case-counts.yaml
+++ b/.github/workflows/update-ncov-case-counts.yaml
@@ -13,46 +13,36 @@ on:
         required: false
 
 jobs:
-  case_counts:
+  set_s3_dst:
     runs-on: ubuntu-latest
-    env:
-      SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
-      SLACK_CHANNELS: ${{ github.event.inputs.slack_channel || 'nextstrain-counts-updates' }}
-    defaults:
-      run:
-        # Login shell is required to include changes by conda init bash.
-        shell: bash -l -eo pipefail {0}
     steps:
-    - uses: actions/checkout@v4
-    - uses: conda-incubator/setup-miniconda@v3
-      with:
-        python-version: "3.9"
-        miniforge-variant: Mambaforge
-        channels: conda-forge,bioconda
+    - id: s3_dst
+      run: |
+        S3_DST=s3://nextstrain-data/files/workflows/forecasts-ncov
 
-    - name: setup
-      run: mamba install "csvtk>=0.23.0"
+        if [[ "$TRIAL_NAME" ]]; then
+          S3_DST+=/trial/"$TRIAL_NAME"
+        fi
 
-    - name: download case counts
-      run: |
-        ./ingest/bin/fetch-ncov-global-case-counts > global_case_counts.tsv
+        echo "s3_dst=$S3_DST" >> "$GITHUB_OUTPUT"
+      env:
+        TRIAL_NAME: ${{ inputs.trial_name }}
+    outputs:
+      s3_dst: ${{ steps.s3_dst.outputs.s3_dst }}
 
-    - name: upload to S3
+  case_counts:
+    needs: [set_s3_dst]
+    permissions:
+      id-token: write
+    uses: nextstrain/.github/.github/workflows/pathogen-repo-build.yaml@master
+    secrets: inherit
+    with:
+      runtime: docker
       run: |
-        S3_DST=s3://nextstrain-data/files/workflows/forecasts-ncov/cases
-        CLOUDFRONT_DOMAIN="data.nextstrain.org"
-
-        if [[ "$TRIAL_NAME" ]]; then
-          S3_DST+=/trial/"$TRIAL_NAME"
-        fi
-
-        ./ingest/vendored/upload-to-s3 global_case_counts.tsv "$S3_DST"/global.tsv.gz $CLOUDFRONT_DOMAIN
-      env:
-        AWS_DEFAULT_REGION: ${{ vars.AWS_DEFAULT_REGION }}
-        AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
-        AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-        TRIAL_NAME: ${{ github.event.inputs.trial_name }}
-
-    - name: notify_pipeline_failed
-      if: ${{ failure() }}
-      run: ./ingest/vendored/notify-on-job-fail "Case counts ingest" "nextstrain/forecasts-ncov"
+        nextstrain build \
+          ingest \
+          upload_all_case_counts \
+          --config s3_dst="$S3_DST"
+      env: |
+        SLACK_CHANNELS: ${{ inputs.slack_channel || vars.SLACK_CHANNELS }}
+        S3_DST: ${{ needs.set_s3_dst.outputs.s3_dst }}
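
One way to exercise the refactored workflow end to end is a trial dispatch
(a sketch, assuming the workflow keeps its existing workflow_dispatch trigger
and an authenticated GitHub CLI; the trial name and Slack channel below are
placeholders, not values from this patch):

    # Dispatch a trial run that writes under the trial/ prefix instead of production
    gh workflow run update-ncov-case-counts.yaml \
        --repo nextstrain/forecasts-ncov \
        -f trial_name=case-counts-refactor \
        -f slack_channel=test-channel

With trial_name set, the set_s3_dst job emits
s3://nextstrain-data/files/workflows/forecasts-ncov/trial/case-counts-refactor,
so upload_global_case_counts should place the file at
trial/case-counts-refactor/cases/global.tsv.gz rather than the production path.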