-
-
Notifications
You must be signed in to change notification settings - Fork 20
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #3292 from AlexsLemonade/davidsmejia/3283-sra-metadata-download
3283 sra metadata download
- Loading branch information
Showing 11 changed files with 3,060 additions and 8,253 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,3 @@ | ||
import random | ||
import re | ||
import xml.etree.ElementTree as ET | ||
from typing import Dict, List | ||
|
@@ -19,7 +18,6 @@ | |
SurveyJob, | ||
) | ||
from data_refinery_common.rna_seq import _build_ena_file_url | ||
from data_refinery_common.utils import get_fasp_sra_download | ||
from data_refinery_foreman.surveyor import harmony, utils | ||
from data_refinery_foreman.surveyor.external_source import ExternalSourceSurveyor | ||
|
||
|
@@ -32,14 +30,7 @@ | |
ENA_FILE_REPORT_URL_TEMPLATE = ( | ||
"https://www.ebi.ac.uk/ena/portal/api/filereport?accession={accession}&result=read_run" | ||
) | ||
NCBI_DOWNLOAD_URL_TEMPLATE = ( | ||
"[email protected]:/sra/sra-instant/reads/ByRun/sra/" | ||
"{first_three}/{first_six}/{accession}/{accession}.sra" | ||
) | ||
NCBI_PRIVATE_DOWNLOAD_URL_TEMPLATE = ( | ||
"[email protected]:/sra/sra-instant/reads/ByRun/sra/" | ||
"{first_three}/{first_six}/{accession}/{accession}.sra" | ||
) | ||
NCBI_DOWNLOAD_URL_TEMPLATE = "https://sra-pub-run-odp.s3.amazonaws.com/sra/{accession}/{accession}" | ||
|
||
|
||
class UnsupportedDataTypeError(Exception): | ||
|
@@ -66,7 +57,6 @@ def source_type(self): | |
|
||
@staticmethod | ||
def gather_submission_metadata(metadata: Dict) -> None: | ||
|
||
formatted_metadata_URL = ENA_METADATA_URL_TEMPLATE.format(metadata["submission_accession"]) | ||
response = utils.requests_retry_session().get(formatted_metadata_URL) | ||
submission_xml = ET.fromstring(response.text)[0] | ||
|
@@ -190,7 +180,8 @@ def gather_file_report(run_accession: str) -> List[Dict]: | |
This endpoint returns a weird format, so some custom parsing is required: | ||
run_accession fastq_ftp fastq_bytes fastq_md5 submitted_ftp submitted_bytes submitted_md5 sra_ftp sra_bytes sra_md5 | ||
SRR7353755 ftp.sra.ebi.ac.uk/vol1/fastq/SRR735/005/SRR7353755/SRR7353755.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/SRR735/005/SRR7353755/SRR7353755_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/SRR735/005/SRR7353755/SRR7353755_2.fastq.gz 25176;2856704;3140575 7ef1ba010dcb679217112efa380798b2;6bc5651b7103306d4d65018180ab8d0d;3856c14164612d9879d576a046a9879f""" | ||
SRR7353755 ftp.sra.ebi.ac.uk/vol1/fastq/SRR735/005/SRR7353755/SRR7353755.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/SRR735/005/SRR7353755/SRR7353755_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/SRR735/005/SRR7353755/SRR7353755_2.fastq.gz 25176;2856704;3140575 7ef1ba010dcb679217112efa380798b2;6bc5651b7103306d4d65018180ab8d0d;3856c14164612d9879d576a046a9879f | ||
""" | ||
response = utils.requests_retry_session().get( | ||
ENA_FILE_REPORT_URL_TEMPLATE.format(accession=run_accession) | ||
) | ||
|
@@ -338,25 +329,7 @@ def gather_all_metadata(run_accession): | |
@staticmethod | ||
def _build_ncbi_file_url(run_accession: str): | ||
"""Build the path to the hypothetical .sra file we want""" | ||
accession = run_accession | ||
first_three = accession[:3] | ||
first_six = accession[:6] | ||
|
||
# Prefer the FASP-specific endpoints if possible.. | ||
download_url = get_fasp_sra_download(run_accession) | ||
|
||
if not download_url: | ||
# ..else, load balancing via coin flip. | ||
if random.choice([True, False]): | ||
download_url = NCBI_DOWNLOAD_URL_TEMPLATE.format( | ||
first_three=first_three, first_six=first_six, accession=accession | ||
) | ||
else: | ||
download_url = NCBI_PRIVATE_DOWNLOAD_URL_TEMPLATE.format( | ||
first_three=first_three, first_six=first_six, accession=accession | ||
) | ||
|
||
return download_url | ||
return NCBI_DOWNLOAD_URL_TEMPLATE.format(accession=run_accession) | ||
|
||
@staticmethod | ||
def _apply_harmonized_metadata_to_sample(sample: Sample, metadata: dict): | ||
|
@@ -629,7 +602,6 @@ def discover_experiment_and_samples(self): | |
accessions_to_run = [] | ||
for child in study_links: | ||
if child[0][0].text == "ENA-RUN": | ||
|
||
all_runs = child[0][1].text | ||
|
||
# Ranges can be disjoint, separated by commas | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -226,11 +226,12 @@ def test_metadata_is_gathered_correctly(self): | |
self.assertEqual(metadata["run_ena_last_update"], "2017-08-11") | ||
self.assertEqual(metadata["run_ena_spot_count"], "32568360") | ||
self.assertEqual(metadata["sample_accession"], "DRS001521") | ||
self.assertEqual(metadata["sample_center_name"], "BioSample") | ||
self.assertEqual(metadata["sample_ena_base_count"], "3256836000") | ||
self.assertEqual(metadata["sample_ena_first_public"], "2013-07-20") | ||
self.assertEqual(metadata["sample_ena_last_update"], "2015-08-24") | ||
self.assertEqual(metadata["sample_ena_spot_count"], "32568360") | ||
self.assertEqual( | ||
metadata["sample_center_name"], | ||
"Group for Morphological Evolution, Center for Developmental Biology, Kobe Institute, RIKEN", | ||
) | ||
self.assertEqual(metadata["sample_ena_first_public"], "2013-02-27") | ||
self.assertEqual(metadata["sample_ena_last_update"], "2014-11-12") | ||
self.assertEqual( | ||
metadata["sample_sample_comment"], | ||
("mRNAseq of chicken at stage HH16 (biological " "replicate 1)"), | ||
|
@@ -255,13 +256,8 @@ def test_metadata_is_gathered_correctly(self): | |
self.assertEqual(metadata["submission_title"], "Submitted by RIKEN_CDB on 19-JUL-2013") | ||
|
||
ncbi_url = SraSurveyor._build_ncbi_file_url(metadata["run_accession"]) | ||
self.assertTrue( | ||
ncbi_url | ||
in [ | ||
"[email protected]:/sra/sra-instant/reads/ByRun/sra/DRR/DRR002/DRR002116/DRR002116.sra", | ||
"[email protected]:/sra/sra-instant/reads/ByRun/sra/DRR/DRR002/DRR002116/DRR002116.sra", | ||
"[email protected]:data/sracloud/traces/dra0/DRR/000002/DRR002116", | ||
] | ||
self.assertEqual( | ||
ncbi_url, "https://sra-pub-run-odp.s3.amazonaws.com/sra/DRR002116/DRR002116" | ||
) | ||
|
||
def test_sra_metadata_is_harmonized(self): | ||
|
Oops, something went wrong.