Skip to content

Commit

Permalink
Merge pull request #485 from nextstrain/update-accession-links
Browse files Browse the repository at this point in the history
Update links for GISAID and GenBank accessions
  • Loading branch information
joverlee521 authored Dec 20, 2024
2 parents 4d06220 + fcb407c commit e359b7e
Show file tree
Hide file tree
Showing 4 changed files with 44 additions and 5 deletions.
8 changes: 8 additions & 0 deletions bin/fetch-accession-links
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#!/bin/bash
# Print a tab-separated table with the columns genbank_accession and
# gisaid_epi_isl to stdout. The table is downloaded with curl, given column
# names, and then reduced to the two accession columns.
set -euo pipefail

source_url="https://hgwdev.gi.ucsc.edu/~angie/epiToPublicAndDate.latest"
# NOTE(review): the contact address in this header looks like an obfuscated
# placeholder in this view — confirm the real address before relying on it.
user_agent_header='User-Agent: https://github.com/nextstrain/ncov-ingest ([email protected])'

# --fail turns HTTP error responses into a non-zero exit, which pipefail then
# propagates; --silent/--show-error hide the progress meter but keep errors.
curl --fail --silent --show-error --header "$user_agent_header" "$source_url" \
    | csvtk -t add-header --names gisaid_epi_isl,genbank_accession,strain,date \
    | csvtk -t cut --fields genbank_accession,gisaid_epi_isl
3 changes: 1 addition & 2 deletions bin/transform-genbank
Original file line number Diff line number Diff line change
Expand Up @@ -207,8 +207,8 @@ if __name__ == '__main__':
| ParseGeographicColumnsGenbank( base / 'source-data/us-state-codes.tsv' )
| AbbreviateAuthors()
| ApplyUserGeoLocationSubstitutionRules(geoRules)
| MergeUserAnnotatedMetadata(annotations, idKey = 'genbank_accession' )
| MergeUserAnnotatedMetadata(accessions, idKey = 'genbank_accession_rev' )
| MergeUserAnnotatedMetadata(annotations, idKey = 'genbank_accession' )
| FillDefaultLocationData()
| patchUKData(args.cog_uk_accessions, args.cog_uk_metadata)
| GenbankProblematicFilter( args.problem_data,
Expand Down Expand Up @@ -301,4 +301,3 @@ if __name__ == '__main__':
strain_name = updated_strain_names_by_line_no[entry[LINE_NUMBER_KEY]]
print( '>' , strain_name , sep='' , file= fasta_OUT)
print( entry['sequence'] , file= fasta_OUT)

2 changes: 1 addition & 1 deletion bin/transform-gisaid
Original file line number Diff line number Diff line change
Expand Up @@ -182,8 +182,8 @@ if __name__ == '__main__':

pipeline = (pipeline
| ApplyUserGeoLocationSubstitutionRules(geoRules)
| MergeUserAnnotatedMetadata(annotations)
| MergeUserAnnotatedMetadata(accessions)
| MergeUserAnnotatedMetadata(annotations)
| FillDefaultLocationData()
)

Expand Down
36 changes: 34 additions & 2 deletions workflow/snakemake_rules/curate.smk
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,34 @@ Produces different output files for GISAID vs GenBank:
"""


rule fetch_accession_links:
    """
    Download the table linking GISAID and GenBank accessions.

    Runs ./bin/fetch-accession-links and stores its stdout as a temporary
    TSV; the job is retried up to 5 times on failure.
    """
    output:
        accessions = temp("data/accessions.tsv")
    retries: 5
    shell:
        "./bin/fetch-accession-links > {output.accessions:q}"


rule concat_accession_links:
    """
    Combine the accession links kept under source-data/ with the fetched
    ones in data/accessions.tsv, keep one row per
    (genbank_accession, gisaid_epi_isl) pair, and write the result gzipped.
    """
    input:
        source_data = "source-data/accessions.tsv.gz",
        accessions = "data/accessions.tsv",
    output:
        all_accessions = "data/all_accessions.tsv.gz",
    shell:
        # Raw string so the backslash line continuations reach the shell as written.
        r"""
        gunzip -kcfq {input.source_data:q} \
            | csvtk concat -t - {input.accessions:q} \
            | csvtk uniq -t -f genbank_accession,gisaid_epi_isl \
            | gzip -c > {output.all_accessions:q}
        """


rule transform_rki_data:
input:
ndjson="data/rki.ndjson",
Expand Down Expand Up @@ -60,7 +88,8 @@ rule transform_genbank_data:
biosample = "data/genbank/biosample.tsv",
ndjson = "data/genbank.ndjson",
cog_uk_accessions = "data/cog_uk_accessions.tsv",
cog_uk_metadata = "data/cog_uk_metadata.csv.gz"
cog_uk_metadata = "data/cog_uk_metadata.csv.gz",
accessions = "data/all_accessions.tsv.gz",
output:
fasta = "data/genbank_sequences.fasta",
metadata = "data/genbank_metadata_transformed.tsv",
Expand All @@ -75,6 +104,7 @@ rule transform_genbank_data:
--duplicate-biosample {output.duplicate_biosample} \
--cog-uk-accessions {input.cog_uk_accessions} \
--cog-uk-metadata {input.cog_uk_metadata} \
--accessions {input.accessions} \
--output-metadata {output.metadata} \
--output-fasta {output.fasta} > {output.flagged_annotations}
"""
Expand Down Expand Up @@ -105,7 +135,8 @@ rule merge_open_data:

rule transform_gisaid_data:
input:
ndjson = "data/gisaid.ndjson"
ndjson = "data/gisaid.ndjson",
accessions = "data/all_accessions.tsv.gz",
output:
fasta = "data/gisaid/sequences.fasta",
metadata = "data/gisaid/metadata_transformed.tsv",
Expand All @@ -116,6 +147,7 @@ rule transform_gisaid_data:
shell:
"""
./bin/transform-gisaid {input.ndjson} \
--accessions {input.accessions} \
--output-metadata {output.metadata} \
--output-fasta {output.fasta} \
--output-additional-info {output.additional_info} \
Expand Down

0 comments on commit e359b7e

Please sign in to comment.