Skip to content

Commit

Permalink
Fix failing SRA requests in the geo-import process
Browse files Browse the repository at this point in the history
  • Loading branch information
AnzeLovse committed Oct 7, 2024
1 parent 4f57d8d commit 9d7d151
Show file tree
Hide file tree
Showing 3 changed files with 102 additions and 55 deletions.
24 changes: 13 additions & 11 deletions docs/CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ Added
- Allow filtering ``Sample`` by ``Variant``
- Allow filtering variant annotations by variant id
- Add filters to ``Variant`` and ``VariantCall`` objects
- Add ``known_fusions`` file input to ``arriba`` process and
- Add ``known_fusions`` file input to ``arriba`` process and
``gene-fusion-calling-arriba`` workflow

Changed
Expand All @@ -26,15 +26,17 @@ Changed
- Change clinical diagnosis and annotation fields type to text
- When filtering variants do not return duplicated objects
- Optimize resource usage in processes ``bbduk-single``, ``bbduk-paired``,
``upload-fastq-single``, ``upload-fastq-paired``,
``upload-fastq-single``, ``upload-fastq-paired``,
``files-to-fastq-single`` and ``files-to-fastq-paired``
- Update the ``pca`` process with the functionality from the
- Update the ``pca`` process with the functionality from the
deprecated ``pca-beta`` process

Fixed
-----
- Change max char length of REF and ALT fields
for variant model to 150
- Fix failing requests in the ``geo-import`` process by using eutils for
fetching data from SRA


===================
Expand Down Expand Up @@ -75,9 +77,9 @@ Added

Changed
-------
- **BACKWARD INCOMPATIBLE:** Remove obsolete processes and related scripts:
- **BACKWARD INCOMPATIBLE:** Remove obsolete processes and related scripts:
``cuffmerge``, ``chipseq-peakscore``, ``chipseq-genescore``, ``etc-bcm``,
``mergeetc``, ``upload-etc``, ``upload-bam-secondary``,
``mergeetc``, ``upload-etc``, ``upload-bam-secondary``,
``upload-bam-scseq-indexed``, ``create-geneset-venn``,
``upload-proteomics-sample``, ``upload-proteomics-sample-set``,
``upload-header-sam``, ``upload-multiplexed-single``,
Expand All @@ -102,7 +104,7 @@ Changed
- **BACKWARD INCOMPATIBLE:** Implement peak calling step and
add QC reporting to the Cut & Run workflow
- Bump requirements versions
- Report additional QC information in the variant table output
- Report additional QC information in the variant table output
of RNA-seq variant calling pipeline

Fixed
Expand All @@ -111,7 +113,7 @@ Fixed
- Fix the ``mutations-table`` process so that only a single variant instance
is reported for each variant in ``variants`` application. The process now
also correctly handles the ``depth`` field reporting.
- Fix that the number of used threads is correctly coerced to integer in
- Fix that the number of used threads is correctly coerced to integer in
``xengsort-index``
- Fixed data object naming in ``pca-beta`` process

Expand All @@ -127,7 +129,7 @@ Added
Changed
-------
- **BACKWARD INCOMPATIBLE:** Require Resolwe 40.x
- **BACKWARD INCOMPATIBLE:** Use the updated xengsort version
- **BACKWARD INCOMPATIBLE:** Use the updated xengsort version
in processes ``xengsort-index`` and ``xengsort-classify``
- **BACKWARD INCOMPATIBLE:** Remove support for ``Python 3.10``
- Extend the ``mutations-table`` process with the support for writing
Expand All @@ -136,7 +138,7 @@ Changed

Fixed
-----
- Fix handling of multiple instances of STAR aligner input in the
- Fix handling of multiple instances of STAR aligner input in the
``multiqc`` process


Expand Down Expand Up @@ -177,7 +179,7 @@ Changed
- Remove ``rnaseqc-qc`` from RNA-seq workflows
- Remove ``cut_and_run.yml``
- Rename ``workflow-cutnrun-beta`` to ``workflow-cutnrun``
- Remove ``upload-sc-10x``, ``cellranger-count`` and ``cellranger-mkref``
- Remove ``upload-sc-10x``, ``cellranger-count`` and ``cellranger-mkref``
processes


Expand Down Expand Up @@ -225,7 +227,7 @@ Changed
and ``workflow-bbduk-star`` workflows
- Unify the use of ``resolwebio/common:4.1.1`` Docker
image version across processes
- Unify the use of ``resolwebio/base:ubuntu-22.04-14112023`` Docker
- Unify the use of ``resolwebio/base:ubuntu-22.04-14112023`` Docker
image across processes
- Add normalized count matrix output to ``differentialexpression-deseq2`` process

Expand Down
131 changes: 89 additions & 42 deletions resolwe_bio/processes/workflows/geo_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,13 @@

import re
import time
from io import StringIO
from pathlib import Path

import GEOparse
import pandas as pd
import requests
from requests.exceptions import RequestException
from requests.exceptions import HTTPError

from resolwe.process import (
BooleanField,
Expand All @@ -21,6 +22,66 @@
from resolwe.process.models import Data


def _handle_sra_http_error(httperr, default_wait_time=0.5):
"""Handle HTTP errors from SRA eutils."""
# Too many requests.
if httperr.response.status_code == 429:
try:
wait_time = int(httperr.headers["Retry-After"])
except (ValueError, TypeError):
wait_time = default_wait_time
else:
wait_time = default_wait_time

time.sleep(wait_time)


def sra_esearch(term, n_retries, warning):
    """Search SRA for SRX IDs.

    :param term: search term (e.g. an SRX accession)
    :param n_retries: maximum number of request attempts
    :param warning: callable used to report retry/failure messages
    :return: parsed JSON esearch result, or ``None`` when all retries fail
    """
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    params = {
        "db": "sra",
        "term": term,
        "usehistory": "y",
        "retmode": "json",
    }

    last_error = None
    for _ in range(n_retries):
        try:
            response = requests.get(url=url, params=params)
            response.raise_for_status()
            return response.json()

        except HTTPError as httperr:
            last_error = str(httperr)
            _handle_sra_http_error(httperr=httperr)
            warning(f"Retrying search request for {term} experiment.")

    # Mirror sra_efetch: surface the final failure instead of silently
    # returning None after the retry budget is exhausted.
    warning(f"Retry limit reached. {last_error}")


def sra_efetch(webenv, query_key, term, n_retries, warning):
    """Fetch SRA run info.

    :param webenv: WebEnv token from a prior esearch with usehistory
    :param query_key: query key from the same esearch result
    :param term: search term, used only in warning messages
    :param n_retries: maximum number of request attempts
    :param warning: callable used to report retry/failure messages
    :return: run info table parsed from XML, or ``None`` when all retries fail
    """
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    params = {
        "db": "sra",
        "retmode": "xml",
        "rettype": "runinfo",
        "query_key": query_key,
        "WebEnv": webenv,
    }

    # Initialize before the loop so the final warning cannot hit an
    # unbound name when n_retries <= 0 (previously an UnboundLocalError).
    error = None
    for _ in range(n_retries):
        try:
            response = requests.get(url=url, params=params)
            response.raise_for_status()
            return pd.read_xml(StringIO(response.text))

        except HTTPError as httperr:
            error = str(httperr)
            _handle_sra_http_error(httperr=httperr)
            warning(f"Retrying fetch request for {term} experiment.")

    warning(f"Retry limit reached. {error}")


def parse_sample(gse, db_accession, gse_name):
"""Parse sample information from GEO."""
sample = {"Database accession": db_accession}
Expand Down Expand Up @@ -91,7 +152,7 @@ class GeoImport(Process):
},
}
data_name = "{{ gse_accession }}"
version = "2.9.0"
version = "2.9.1"
process_type = "data:geo"
category = "Import"
scheduling_class = SchedulingClass.BATCH
Expand Down Expand Up @@ -189,47 +250,33 @@ def upload_rna_gse(self, inputs, gse):
if sample_found:
for srx_id in sample_found:
sample_info[srx_id] = name
info_file = f"{gse.name}.csv"
retry_count = 0
while retry_count < inputs.advanced.sra_retry_limit:
try:
run_info = requests.get(
url="https://eutils.ncbi.nlm.nih.gov/Traces/sra/sra.cgi",
params={
"save": "efetch",
"db": "sra",
"rettype": "runinfo",
"term": srx_id,
},
)

if run_info.text.isspace():
self.error(
f"Got an empty response from SRA for SRX ID {srx_id} belonging to {gse.name}."
)

run_info.raise_for_status()

with open(info_file, "wb") as handle:
handle.write(run_info.content)

break

except RequestException:
retry_count += 1
if retry_count == inputs.advanced.sra_retry_limit:
self.error(
f"Failed to fetch SRA runs for project {srx_id} belonging to {gse.name} after {retry_count} tries."
)
else:
time.sleep(0.5)
self.warning(
f"Retrying request for SRX ID {srx_id} belonging to {gse.name}."
)

run_info = pd.read_csv(
info_file, usecols=["Run", "SampleName", "LibraryLayout"]

search_result = sra_esearch(
term=srx_id,
n_retries=inputs.advanced.sra_retry_limit,
warning=self.warning,
)

if search_result is None:
self.error(
f"Failed to find {srx_id} experiment belonging to "
f"{gse.name} after {inputs.advanced.sra_retry_limit} tries."
)

run_info = sra_efetch(
webenv=search_result["esearchresult"]["webenv"],
query_key=search_result["esearchresult"]["querykey"],
term=srx_id,
n_retries=inputs.advanced.sra_retry_limit,
warning=self.warning,
)

if run_info is None:
self.error(
f"Failed to fetch SRA runs for {srx_id} belonging to "
f"{gse.name} after {inputs.advanced.sra_retry_limit} tries."
)

run_info = run_info.set_index("Run", drop=False)

process_inputs["sra_accession"] = run_info.index.values.tolist()
Expand Down
2 changes: 0 additions & 2 deletions resolwe_bio/tests/workflows/test_geo_import.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
from pathlib import Path
from unittest import skip

from django.test import LiveServerTestCase

Expand All @@ -9,7 +8,6 @@
from resolwe_bio.utils.test import BioProcessTestCase


@skip("Temporarily skipping test due to web resource being unavailable")
class GeoImportTestCase(BioProcessTestCase, LiveServerTestCase):
@with_resolwe_host
@tag_process("geo-import")
Expand Down

0 comments on commit 9d7d151

Please sign in to comment.