Skip to content

Commit

Permalink
Merge pull request #3292 from AlexsLemonade/davidsmejia/3283-sra-meta…
Browse files Browse the repository at this point in the history
…data-download

3283 sra metadata download
  • Loading branch information
davidsmejia authored May 19, 2023
2 parents d424091 + f597413 commit 529080f
Show file tree
Hide file tree
Showing 11 changed files with 3,060 additions and 8,253 deletions.
32 changes: 28 additions & 4 deletions .github/workflows/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,11 @@ jobs:
- uses: actions/checkout@v3

- name: Free up space
run: ./.github/scripts/cleanup_instance.sh
uses: jlumbroso/free-disk-space@main
with:
# this might remove tools that are actually needed,
# if set to "true" but frees about 6 GB
tool-cache: false

- name: Login to Packages Container registry
uses: docker/login-action@v2
Expand Down Expand Up @@ -85,6 +89,13 @@ jobs:
steps:
- uses: actions/checkout@v3

- name: Free up space
uses: jlumbroso/free-disk-space@main
with:
# this might remove tools that are actually needed,
# if set to "true" but frees about 6 GB
tool-cache: false

- name: Login to Packages Container registry
uses: docker/login-action@v2
with:
Expand Down Expand Up @@ -128,7 +139,11 @@ jobs:
- uses: actions/checkout@v3

- name: Free up space
run: ./.github/scripts/cleanup_instance.sh
uses: jlumbroso/free-disk-space@main
with:
# this might remove tools that are actually needed,
# if set to "true" but frees about 6 GB
tool-cache: false

- name: Login to Packages Container registry
uses: docker/login-action@v2
Expand Down Expand Up @@ -229,7 +244,11 @@ jobs:
- uses: actions/checkout@v3

- name: Free up space
run: ./.github/scripts/cleanup_instance.sh
uses: jlumbroso/free-disk-space@main
with:
# this might remove tools that are actually needed,
# if set to "true" but frees about 6 GB
tool-cache: false

- name: Login to Packages Container registry
uses: docker/login-action@v2
Expand Down Expand Up @@ -364,8 +383,13 @@ jobs:
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
steps:
- uses: actions/checkout@v3

- name: Free up space
run: ./.github/scripts/cleanup_instance.sh
uses: jlumbroso/free-disk-space@main
with:
# this might remove tools that are actually needed,
# if set to "true" but frees about 6 GB
tool-cache: false

- name: Login to Packages Container registry
uses: docker/login-action@v2
Expand Down
36 changes: 4 additions & 32 deletions foreman/data_refinery_foreman/surveyor/sra.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import random
import re
import xml.etree.ElementTree as ET
from typing import Dict, List
Expand All @@ -19,7 +18,6 @@
SurveyJob,
)
from data_refinery_common.rna_seq import _build_ena_file_url
from data_refinery_common.utils import get_fasp_sra_download
from data_refinery_foreman.surveyor import harmony, utils
from data_refinery_foreman.surveyor.external_source import ExternalSourceSurveyor

Expand All @@ -32,14 +30,7 @@
ENA_FILE_REPORT_URL_TEMPLATE = (
"https://www.ebi.ac.uk/ena/portal/api/filereport?accession={accession}&result=read_run"
)
NCBI_DOWNLOAD_URL_TEMPLATE = (
"anonftp@ftp.ncbi.nlm.nih.gov:/sra/sra-instant/reads/ByRun/sra/"
"{first_three}/{first_six}/{accession}/{accession}.sra"
)
NCBI_PRIVATE_DOWNLOAD_URL_TEMPLATE = (
"anonftp@ftp-private.ncbi.nlm.nih.gov:/sra/sra-instant/reads/ByRun/sra/"
"{first_three}/{first_six}/{accession}/{accession}.sra"
)
NCBI_DOWNLOAD_URL_TEMPLATE = "https://sra-pub-run-odp.s3.amazonaws.com/sra/{accession}/{accession}"


class UnsupportedDataTypeError(Exception):
Expand All @@ -66,7 +57,6 @@ def source_type(self):

@staticmethod
def gather_submission_metadata(metadata: Dict) -> None:

formatted_metadata_URL = ENA_METADATA_URL_TEMPLATE.format(metadata["submission_accession"])
response = utils.requests_retry_session().get(formatted_metadata_URL)
submission_xml = ET.fromstring(response.text)[0]
Expand Down Expand Up @@ -190,7 +180,8 @@ def gather_file_report(run_accession: str) -> List[Dict]:
This endpoint returns a weird format, so some custom parsing is required:
run_accession fastq_ftp fastq_bytes fastq_md5 submitted_ftp submitted_bytes submitted_md5 sra_ftp sra_bytes sra_md5
SRR7353755 ftp.sra.ebi.ac.uk/vol1/fastq/SRR735/005/SRR7353755/SRR7353755.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/SRR735/005/SRR7353755/SRR7353755_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/SRR735/005/SRR7353755/SRR7353755_2.fastq.gz 25176;2856704;3140575 7ef1ba010dcb679217112efa380798b2;6bc5651b7103306d4d65018180ab8d0d;3856c14164612d9879d576a046a9879f"""
SRR7353755 ftp.sra.ebi.ac.uk/vol1/fastq/SRR735/005/SRR7353755/SRR7353755.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/SRR735/005/SRR7353755/SRR7353755_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/SRR735/005/SRR7353755/SRR7353755_2.fastq.gz 25176;2856704;3140575 7ef1ba010dcb679217112efa380798b2;6bc5651b7103306d4d65018180ab8d0d;3856c14164612d9879d576a046a9879f
"""
response = utils.requests_retry_session().get(
ENA_FILE_REPORT_URL_TEMPLATE.format(accession=run_accession)
)
Expand Down Expand Up @@ -338,25 +329,7 @@ def gather_all_metadata(run_accession):
@staticmethod
def _build_ncbi_file_url(run_accession: str):
"""Build the path to the hypothetical .sra file we want"""
accession = run_accession
first_three = accession[:3]
first_six = accession[:6]

# Prefer the FASP-specific endpoints if possible..
download_url = get_fasp_sra_download(run_accession)

if not download_url:
# ..else, load balancing via coin flip.
if random.choice([True, False]):
download_url = NCBI_DOWNLOAD_URL_TEMPLATE.format(
first_three=first_three, first_six=first_six, accession=accession
)
else:
download_url = NCBI_PRIVATE_DOWNLOAD_URL_TEMPLATE.format(
first_three=first_three, first_six=first_six, accession=accession
)

return download_url
return NCBI_DOWNLOAD_URL_TEMPLATE.format(accession=run_accession)

@staticmethod
def _apply_harmonized_metadata_to_sample(sample: Sample, metadata: dict):
Expand Down Expand Up @@ -629,7 +602,6 @@ def discover_experiment_and_samples(self):
accessions_to_run = []
for child in study_links:
if child[0][0].text == "ENA-RUN":

all_runs = child[0][1].text

# Ranges can be disjoint, separated by commas
Expand Down
20 changes: 8 additions & 12 deletions foreman/tests/surveyor/test_sra.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,11 +226,12 @@ def test_metadata_is_gathered_correctly(self):
self.assertEqual(metadata["run_ena_last_update"], "2017-08-11")
self.assertEqual(metadata["run_ena_spot_count"], "32568360")
self.assertEqual(metadata["sample_accession"], "DRS001521")
self.assertEqual(metadata["sample_center_name"], "BioSample")
self.assertEqual(metadata["sample_ena_base_count"], "3256836000")
self.assertEqual(metadata["sample_ena_first_public"], "2013-07-20")
self.assertEqual(metadata["sample_ena_last_update"], "2015-08-24")
self.assertEqual(metadata["sample_ena_spot_count"], "32568360")
self.assertEqual(
metadata["sample_center_name"],
"Group for Morphological Evolution, Center for Developmental Biology, Kobe Institute, RIKEN",
)
self.assertEqual(metadata["sample_ena_first_public"], "2013-02-27")
self.assertEqual(metadata["sample_ena_last_update"], "2014-11-12")
self.assertEqual(
metadata["sample_sample_comment"],
("mRNAseq of chicken at stage HH16 (biological " "replicate 1)"),
Expand All @@ -255,13 +256,8 @@ def test_metadata_is_gathered_correctly(self):
self.assertEqual(metadata["submission_title"], "Submitted by RIKEN_CDB on 19-JUL-2013")

ncbi_url = SraSurveyor._build_ncbi_file_url(metadata["run_accession"])
self.assertTrue(
ncbi_url
in [
"anonftp@ftp.ncbi.nlm.nih.gov:/sra/sra-instant/reads/ByRun/sra/DRR/DRR002/DRR002116/DRR002116.sra",
"anonftp@ftp-private.ncbi.nlm.nih.gov:/sra/sra-instant/reads/ByRun/sra/DRR/DRR002/DRR002116/DRR002116.sra",
"dbtest@sra-download.ncbi.nlm.nih.gov:data/sracloud/traces/dra0/DRR/000002/DRR002116",
]
self.assertEqual(
ncbi_url, "https://sra-pub-run-odp.s3.amazonaws.com/sra/DRR002116/DRR002116"
)

def test_sra_metadata_is_harmonized(self):
Expand Down
Loading

0 comments on commit 529080f

Please sign in to comment.