From 9d7d15159be5f972724ef19010e202f9fe8f6171 Mon Sep 17 00:00:00 2001
From: AnzeLovse <AnzeLovse@users.noreply.github.com>
Date: Mon, 7 Oct 2024 16:23:34 +0200
Subject: [PATCH] Fix failing SRA requests in the geo-import process

---
 docs/CHANGELOG.rst                            |  24 ++--
 resolwe_bio/processes/workflows/geo_import.py | 131 ++++++++++++------
 .../tests/workflows/test_geo_import.py        |   2 -
 3 files changed, 102 insertions(+), 55 deletions(-)

diff --git a/docs/CHANGELOG.rst b/docs/CHANGELOG.rst
index a1cb834d8..8c73a5917 100644
--- a/docs/CHANGELOG.rst
+++ b/docs/CHANGELOG.rst
@@ -16,7 +16,7 @@ Added
 - Allow filtering ``Sample`` by ``Variant``
 - Allow filtering variant annotations by variant id
 - Add filters to ``Variant`` and ``VariantCall`` objects
-- Add ``known_fusions`` file input to ``arriba`` process and 
+- Add ``known_fusions`` file input to ``arriba`` process and
   ``gene-fusion-calling-arriba`` workflow
 
 Changed
@@ -26,15 +26,17 @@ Changed
 - Change clinical diagnosis and annotation fields type to text
 - When filtering variants do not return duplicated objects
 - Optimize resource usage in processes ``bbduk-single``, ``bbduk-paired``,
-  ``upload-fastq-single``, ``upload-fastq-paired``, 
+  ``upload-fastq-single``, ``upload-fastq-paired``,
   ``files-to-fastq-single`` and ``files-to-fastq-paired``
-- Update the ``pca`` process with the functionality from the 
+- Update the ``pca`` process with the functionality from the
   deprecated ``pca-beta`` process
 
 Fixed
 -----
 - Change max char length of REF and ALT fields
   for variant model to 150
+- Fix failing requests in the ``geo-import`` process by using eutils for
+  fetching data from SRA
 
 
 ===================
@@ -75,9 +77,9 @@ Added
 
 Changed
 -------
-- **BACKWARD INCOMPATIBLE:** Remove obsolete processes and releated scripts: 
+- **BACKWARD INCOMPATIBLE:** Remove obsolete processes and related scripts:
   ``cuffmerge``, ``chipseq-peakscore``, ``chipseq-genescore``, ``etc-bcm``,
-  ``mergeetc``, ``upload-etc``, ``upload-bam-secondary``, 
+  ``mergeetc``, ``upload-etc``, ``upload-bam-secondary``,
   ``upload-bam-scseq-indexed``, ``create-geneset-venn``,
   ``upload-proteomics-sample``, ``upload-proteomics-sample-set``,
   ``upload-header-sam``, ``upload-multiplexed-single``,
@@ -102,7 +104,7 @@ Changed
 - **BACKWARD INCOMPATIBLE:** Implement peak calling step and
   add QC reporting to the Cut & Run workflow
 - Bump requirements versions
-- Report additional QC information in the variant table output 
+- Report additional QC information in the variant table output
   of RNA-seq variant calling pipeline
 
 Fixed
@@ -111,7 +113,7 @@ Fixed
 - Fix the ``mutations-table`` process so that only a single variant instance
   is reported for each variant in ``variants`` application. The process now
   also correcly handles the ``depth`` field reporting.
-- Fix that the number of used threads is correctly coerced to integer in 
+- Fix that the number of used threads is correctly coerced to integer in
   ``xengsort-index``
 - Fixed data object naming in ``pca-beta`` process
 
@@ -127,7 +129,7 @@ Added
 Changed
 -------
 - **BACKWARD INCOMPATIBLE:** Require Resolwe 40.x
-- **BACKWARD INCOMPATIBLE:** Use the updated xengsort version 
+- **BACKWARD INCOMPATIBLE:** Use the updated xengsort version
   in processes ``xengsort-index`` and ``xengsort-classify``
 - **BACKWARD INCOMPATIBLE:** Remove support for ``Python 3.10``
 - Extend the ``mutations-table`` process with the support for writing
@@ -136,7 +138,7 @@ Changed
 
 Fixed
 -----
-- Fix handling of multiple instances of STAR aligner input in the 
+- Fix handling of multiple instances of STAR aligner input in the
   ``multiqc`` process
 
 
@@ -177,7 +179,7 @@ Changed
 - Remove ``rnaseqc-qc`` from RNA-seq workflows
 - Remove ``cut_and_run.yml``
 - Rename ``workflow-cutnrun-beta`` to ``workflow-cutnrun``
-- Remove ``upload-sc-10x``, ``cellranger-count`` and ``cellranger-mkref`` 
+- Remove ``upload-sc-10x``, ``cellranger-count`` and ``cellranger-mkref``
   processes
 
 
@@ -225,7 +227,7 @@ Changed
   and ``workflow-bbduk-star`` workflows
 - Unify the use of ``resolwebio/common:4.1.1`` Docker
   image version across processes
-- Unify the use of ``resolwebio/base:ubuntu-22.04-14112023`` Docker 
+- Unify the use of ``resolwebio/base:ubuntu-22.04-14112023`` Docker
   image across processes
 - Add normalized count matrix output to ``differentialexpression-deseq2`` process
 
diff --git a/resolwe_bio/processes/workflows/geo_import.py b/resolwe_bio/processes/workflows/geo_import.py
index 41735aabb..9f1155f3d 100644
--- a/resolwe_bio/processes/workflows/geo_import.py
+++ b/resolwe_bio/processes/workflows/geo_import.py
@@ -2,12 +2,13 @@
 
 import re
 import time
+from io import StringIO
 from pathlib import Path
 
 import GEOparse
 import pandas as pd
 import requests
-from requests.exceptions import RequestException
+from requests.exceptions import HTTPError
 
 from resolwe.process import (
     BooleanField,
@@ -21,6 +22,66 @@
 from resolwe.process.models import Data
 
 
+def _handle_sra_http_error(httperr, default_wait_time=0.5):
+    """Sleep before a retry, honouring Retry-After on HTTP 429 responses."""
+    # 429 = too many requests; Retry-After may be missing or not an integer.
+    if httperr.response.status_code == 429:
+        try:
+            wait_time = int(httperr.response.headers["Retry-After"])
+        except (KeyError, ValueError, TypeError):
+            wait_time = default_wait_time
+    else:
+        wait_time = default_wait_time
+
+    time.sleep(wait_time)
+
+
+def sra_esearch(term, n_retries, warning):
+    """Search SRA for ``term``; return parsed JSON, or None if every try fails."""
+    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
+    params = {
+        "db": "sra",
+        "term": term,
+        "usehistory": "y",  # NOTE(review): presumably yields webenv/querykey
+        "retmode": "json",
+    }
+
+    for _ in range(n_retries):
+        try:
+            response = requests.get(url=url, params=params)
+            response.raise_for_status()
+            return response.json()
+
+        except HTTPError as httperr:
+            _handle_sra_http_error(httperr=httperr)
+            warning(f"Retrying search request for {term} experiment.")
+
+
+def sra_efetch(webenv, query_key, term, n_retries, warning):
+    """Fetch SRA run info as a DataFrame; return None once retries run out."""
+    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
+    params = {
+        "db": "sra",
+        "retmode": "xml",
+        "rettype": "runinfo",
+        "query_key": query_key,
+        "WebEnv": webenv,
+    }
+
+    error = None  # text of the last HTTP failure, reported once after the loop
+    for _ in range(n_retries):
+        try:
+            response = requests.get(url=url, params=params)
+            response.raise_for_status()
+            return pd.read_xml(StringIO(response.text))
+        except HTTPError as httperr:
+            error = str(httperr)
+            _handle_sra_http_error(httperr=httperr)
+            warning(f"Retrying fetch request for {term} experiment.")
+
+    warning(f"Retry limit reached. {error}")
+
+
 def parse_sample(gse, db_accession, gse_name):
     """Parse sample information from GEO."""
     sample = {"Database accession": db_accession}
@@ -91,7 +152,7 @@ class GeoImport(Process):
         },
     }
     data_name = "{{ gse_accession }}"
-    version = "2.9.0"
+    version = "2.9.1"
     process_type = "data:geo"
     category = "Import"
     scheduling_class = SchedulingClass.BATCH
@@ -189,47 +250,33 @@ def upload_rna_gse(self, inputs, gse):
             if sample_found:
                 for srx_id in sample_found:
                     sample_info[srx_id] = name
-                    info_file = f"{gse.name}.csv"
-                    retry_count = 0
-                    while retry_count < inputs.advanced.sra_retry_limit:
-                        try:
-                            run_info = requests.get(
-                                url="https://eutils.ncbi.nlm.nih.gov/Traces/sra/sra.cgi",
-                                params={
-                                    "save": "efetch",
-                                    "db": "sra",
-                                    "rettype": "runinfo",
-                                    "term": srx_id,
-                                },
-                            )
-
-                            if run_info.text.isspace():
-                                self.error(
-                                    f"Got an empty response from SRA for SRX ID {srx_id} belonging to {gse.name}."
-                                )
-
-                            run_info.raise_for_status()
-
-                            with open(info_file, "wb") as handle:
-                                handle.write(run_info.content)
-
-                            break
-
-                        except RequestException:
-                            retry_count += 1
-                            if retry_count == inputs.advanced.sra_retry_limit:
-                                self.error(
-                                    f"Failed to fetch SRA runs for project {srx_id} belonging to {gse.name} after {retry_count} tries."
-                                )
-                            else:
-                                time.sleep(0.5)
-                                self.warning(
-                                    f"Retrying request for SRX ID {srx_id} belonging to {gse.name}."
-                                )
-
-                    run_info = pd.read_csv(
-                        info_file, usecols=["Run", "SampleName", "LibraryLayout"]
+
+                    search_result = sra_esearch(
+                        term=srx_id,
+                        n_retries=inputs.advanced.sra_retry_limit,
+                        warning=self.warning,
+                    )
+
+                    if search_result is None:
+                        self.error(
+                            f"Failed to find {srx_id} experiment belonging to "
+                            f"{gse.name} after {inputs.advanced.sra_retry_limit} tries."
+                        )
+
+                    run_info = sra_efetch(
+                        webenv=search_result["esearchresult"]["webenv"],
+                        query_key=search_result["esearchresult"]["querykey"],
+                        term=srx_id,
+                        n_retries=inputs.advanced.sra_retry_limit,
+                        warning=self.warning,
                     )
+
+                    if run_info is None:
+                        self.error(
+                            f"Failed to fetch SRA runs for {srx_id} belonging to "
+                            f"{gse.name} after {inputs.advanced.sra_retry_limit} tries."
+                        )
+
                     run_info = run_info.set_index("Run", drop=False)
 
                     process_inputs["sra_accession"] = run_info.index.values.tolist()
diff --git a/resolwe_bio/tests/workflows/test_geo_import.py b/resolwe_bio/tests/workflows/test_geo_import.py
index 161f476ed..61dff6a31 100644
--- a/resolwe_bio/tests/workflows/test_geo_import.py
+++ b/resolwe_bio/tests/workflows/test_geo_import.py
@@ -1,5 +1,4 @@
 from pathlib import Path
-from unittest import skip
 
 from django.test import LiveServerTestCase
 
@@ -9,7 +8,6 @@
 from resolwe_bio.utils.test import BioProcessTestCase
 
 
-@skip("Temporarily skipping test due to web resource being unavailable")
 class GeoImportTestCase(BioProcessTestCase, LiveServerTestCase):
     @with_resolwe_host
     @tag_process("geo-import")