From 9d7d15159be5f972724ef19010e202f9fe8f6171 Mon Sep 17 00:00:00 2001 From: AnzeLovse Date: Mon, 7 Oct 2024 16:23:34 +0200 Subject: [PATCH] Fix failing SRA requests in the geo-import process --- docs/CHANGELOG.rst | 24 ++-- resolwe_bio/processes/workflows/geo_import.py | 131 ++++++++++++------ .../tests/workflows/test_geo_import.py | 2 - 3 files changed, 102 insertions(+), 55 deletions(-) diff --git a/docs/CHANGELOG.rst b/docs/CHANGELOG.rst index a1cb834d8..8c73a5917 100644 --- a/docs/CHANGELOG.rst +++ b/docs/CHANGELOG.rst @@ -16,7 +16,7 @@ Added - Allow filtering ``Sample`` by ``Variant`` - Allow filtering variant annotations by variant id - Add filters to ``Variant`` and ``VariantCall`` objects -- Add ``known_fusions`` file input to ``arriba`` process and +- Add ``known_fusions`` file input to ``arriba`` process and ``gene-fusion-calling-arriba`` workflow Changed @@ -26,15 +26,17 @@ Changed - Change clinical diagnosis and annotation fields type to text - When filtering variants do not return duplicated objects - Optimize resource usage in processes ``bbduk-single``, ``bbduk-paired``, - ``upload-fastq-single``, ``upload-fastq-paired``, + ``upload-fastq-single``, ``upload-fastq-paired``, ``files-to-fastq-single`` and ``files-to-fastq-paired`` -- Update the ``pca`` process with the functionality from the +- Update the ``pca`` process with the functionality from the deprecated ``pca-beta`` process Fixed ----- - Change max char length of REF and ALT fields for variant model to 150 +- Fix failing requests in the ``geo-import`` process by using eutils for + fetching data from SRA =================== @@ -75,9 +77,9 @@ Added Changed ------- -- **BACKWARD INCOMPATIBLE:** Remove obsolete processes and releated scripts: +- **BACKWARD INCOMPATIBLE:** Remove obsolete processes and releated scripts: ``cuffmerge``, ``chipseq-peakscore``, ``chipseq-genescore``, ``etc-bcm``, - ``mergeetc``, ``upload-etc``, ``upload-bam-secondary``, + ``mergeetc``, ``upload-etc``, 
``upload-bam-secondary``, ``upload-bam-scseq-indexed``, ``create-geneset-venn``, ``upload-proteomics-sample``, ``upload-proteomics-sample-set``, ``upload-header-sam``, ``upload-multiplexed-single``, @@ -102,7 +104,7 @@ Changed - **BACKWARD INCOMPATIBLE:** Implement peak calling step and add QC reporting to the Cut & Run workflow - Bump requirements versions -- Report additional QC information in the variant table output +- Report additional QC information in the variant table output of RNA-seq variant calling pipeline Fixed @@ -111,7 +113,7 @@ Fixed - Fix the ``mutations-table`` process so that only a single variant instance is reported for each variant in ``variants`` application. The process now also correcly handles the ``depth`` field reporting. -- Fix that the number of used threads is correctly coerced to integer in +- Fix that the number of used threads is correctly coerced to integer in ``xengsort-index`` - Fixed data object naming in ``pca-beta`` process @@ -127,7 +129,7 @@ Added Changed ------- - **BACKWARD INCOMPATIBLE:** Require Resolwe 40.x -- **BACKWARD INCOMPATIBLE:** Use the updated xengsort version +- **BACKWARD INCOMPATIBLE:** Use the updated xengsort version in processes ``xengsort-index`` and ``xengsort-classify`` - **BACKWARD INCOMPATIBLE:** Remove support for ``Python 3.10`` - Extend the ``mutations-table`` process with the support for writing @@ -136,7 +138,7 @@ Changed Fixed ----- -- Fix handling of multiple instances of STAR aligner input in the +- Fix handling of multiple instances of STAR aligner input in the ``multiqc`` process @@ -177,7 +179,7 @@ Changed - Remove ``rnaseqc-qc`` from RNA-seq workflows - Remove ``cut_and_run.yml`` - Rename ``workflow-cutnrun-beta`` to ``workflow-cutnrun`` -- Remove ``upload-sc-10x``, ``cellranger-count`` and ``cellranger-mkref`` +- Remove ``upload-sc-10x``, ``cellranger-count`` and ``cellranger-mkref`` processes @@ -225,7 +227,7 @@ Changed and ``workflow-bbduk-star`` workflows - Unify the use of 
def _sra_retry_delay(httperr, default_wait_time):
    """Return the number of seconds to wait before retrying an SRA request.

    :param httperr: exception raised for the failed request; its
        ``response`` attribute (if any) is inspected for the status code
        and the ``Retry-After`` header.
    :param default_wait_time: delay used whenever the server does not
        supply a usable ``Retry-After`` value.
    :return: non-negative delay in seconds.
    """
    response = getattr(httperr, "response", None)
    # Only "429 Too Many Requests" is expected to carry a suggested delay.
    if response is None or response.status_code != 429:
        return default_wait_time

    try:
        # The header lives on the response, not on the exception itself.
        wait_time = int(response.headers["Retry-After"])
    except (KeyError, TypeError, ValueError):
        # Header is missing or is not a plain number of seconds
        # (e.g. an HTTP-date), so fall back to the default delay.
        return default_wait_time

    # time.sleep() raises ValueError on negative values.
    return max(wait_time, 0)


def _handle_sra_http_error(httperr, default_wait_time=0.5):
    """Handle HTTP errors from SRA eutils by sleeping before a retry.

    :param httperr: HTTP error raised by ``raise_for_status()``.
    :param default_wait_time: seconds to sleep when the response is not
        a 429 or does not provide a usable ``Retry-After`` header.
    """
    time.sleep(_sra_retry_delay(httperr, default_wait_time))


def sra_esearch(term, n_retries, warning):
    """Search SRA for SRX IDs.

    :param term: search term sent to the ``esearch`` endpoint.
    :param n_retries: maximum number of requests to attempt.
    :param warning: callable taking a message string; called after each
        failed attempt.
    :return: decoded JSON response, or ``None`` if no attempt succeeded.
    """
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    params = {
        "db": "sra",
        "term": term,
        "usehistory": "y",
        "retmode": "json",
    }

    for _ in range(n_retries):
        try:
            response = requests.get(url=url, params=params)
            response.raise_for_status()
            return response.json()

        except HTTPError as httperr:
            _handle_sra_http_error(httperr=httperr)
            warning(f"Retrying search request for {term} experiment.")

    # Callers test for None to detect that every attempt failed.
    return None


def sra_efetch(webenv, query_key, term, n_retries, warning):
    """Fetch SRA run info.

    :param webenv: ``WebEnv`` value sent to the ``efetch`` endpoint.
    :param query_key: ``query_key`` value sent to the ``efetch`` endpoint.
    :param term: identifier used only in warning messages.
    :param n_retries: maximum number of requests to attempt.
    :param warning: callable taking a message string; called after each
        failed attempt and once more when the retry limit is reached.
    :return: run info parsed with ``pandas.read_xml``, or ``None`` if no
        attempt succeeded.
    """
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    params = {
        "db": "sra",
        "retmode": "xml",
        "rettype": "runinfo",
        "query_key": query_key,
        "WebEnv": webenv,
    }

    # Stays None when no request was attempted (n_retries <= 0), so the
    # final warning never touches an unbound name.
    error = None

    for _ in range(n_retries):
        try:
            response = requests.get(url=url, params=params)
            response.raise_for_status()
            return pd.read_xml(StringIO(response.text))

        except HTTPError as httperr:
            error = str(httperr)
            _handle_sra_http_error(httperr=httperr)
            warning(f"Retrying fetch request for {term} experiment.")

    if error is not None:
        warning(f"Retry limit reached. {error}")

    # Callers test for None to detect that every attempt failed.
    return None
+ ) + + run_info = sra_efetch( + webenv=search_result["esearchresult"]["webenv"], + query_key=search_result["esearchresult"]["querykey"], + term=srx_id, + n_retries=inputs.advanced.sra_retry_limit, + warning=self.warning, ) + + if run_info is None: + self.error( + f"Failed to fetch SRA runs for {srx_id} belonging to " + f"{gse.name} after {inputs.advanced.sra_retry_limit} tries." + ) + run_info = run_info.set_index("Run", drop=False) process_inputs["sra_accession"] = run_info.index.values.tolist() diff --git a/resolwe_bio/tests/workflows/test_geo_import.py b/resolwe_bio/tests/workflows/test_geo_import.py index 161f476ed..61dff6a31 100644 --- a/resolwe_bio/tests/workflows/test_geo_import.py +++ b/resolwe_bio/tests/workflows/test_geo_import.py @@ -1,5 +1,4 @@ from pathlib import Path -from unittest import skip from django.test import LiveServerTestCase @@ -9,7 +8,6 @@ from resolwe_bio.utils.test import BioProcessTestCase -@skip("Temporarily skipping test due to web resource being unavailable") class GeoImportTestCase(BioProcessTestCase, LiveServerTestCase): @with_resolwe_host @tag_process("geo-import")