From 42131478ab4a0be660f21ffb66d9e71080b8de32 Mon Sep 17 00:00:00 2001
From: David Mejia <davidsmejia@gmail.com>
Date: Wed, 27 Sep 2023 11:45:36 -0400
Subject: [PATCH] fetch bacteria fungi by assembly assign species_taxonomy_id

---
 .../surveyor/transcriptome_index.py           | 104 +++++++++++-------
 1 file changed, 63 insertions(+), 41 deletions(-)

diff --git a/foreman/data_refinery_foreman/surveyor/transcriptome_index.py b/foreman/data_refinery_foreman/surveyor/transcriptome_index.py
index 977bf31f0..26c66e75b 100644
--- a/foreman/data_refinery_foreman/surveyor/transcriptome_index.py
+++ b/foreman/data_refinery_foreman/surveyor/transcriptome_index.py
@@ -348,6 +348,63 @@ def survey(self, source_type=None) -> bool:
 
         return True
 
+    @staticmethod
+    def discover_division_species(
+        ensembl_division: str = "Ensembl", organism_name: str = None, strain_mapping: dict = None
+    ):
+        # Fetch the species list of an Ensembl division and narrow it down:
+        # - organism_name only: at most one species whose name matches exactly;
+        # - strain_mapping: the single species with that assembly, or [] if not unique;
+        # - neither: every species in the division.
+        #
+        # The main division has a different base URL for its REST API.
+        if ensembl_division == "Ensembl":
+            r = utils.requests_retry_session().get(MAIN_DIVISION_URL_TEMPLATE)
+            all_species = r.json()["species"]
+        else:
+            formatted_division_url = DIVISION_URL_TEMPLATE.format(division=ensembl_division)
+            r = utils.requests_retry_session().get(formatted_division_url)
+            all_species = r.json()
+
+        # Without a strain mapping, match the organism by exact name and keep the first hit.
+        if not strain_mapping and organism_name:
+            matches = [s for s in all_species if s["name"] == organism_name]
+            return matches[:1]
+
+        # A strain mapping will exist for fungi or bacteria; those are matched by assembly name.
+        if strain_mapping:
+            organism_strain_name = f"{organism_name}_{strain_mapping['strain'].lower()}"
+            assembly_matches = list(
+                filter(lambda s: s["assembly_name"] == strain_mapping["assembly"], all_species)
+            )
+
+            if len(assembly_matches) != 1:
+                # Currently we are unsure if there is always a 1:1 relationship
+                # between organism strain and assembly, so we only continue when
+                # exactly one species in the division has this assembly.
+                # NOTE(review): zero matches also land here, despite the "multiple" wording.
+                logger.error(
+                    "There were multiple matches for {} with assembly {}.",
+                    organism_name,
+                    strain_mapping["assembly"],
+                    matches=len(assembly_matches),
+                    organism_name=organism_strain_name,
+                    ensembl_division=ensembl_division,
+                    assembly=strain_mapping["assembly"],
+                )
+                return []
+
+            # Fungi and Bacteria have a strain identifier in their names, unlike
+            # everything else, so we overwrite the name. This is okay because we only
+            # need to discover one species; the strain mapping makes sure we use the
+            # right strain and assembly. The Organism is created from species_taxonomy_id.
+            assembly_matches[0]["name"] = organism_strain_name
+            assembly_matches[0]["species_taxonomy_id"] = strain_mapping["species_taxonomy_id"]
+            return assembly_matches
+
+        # No organism requested: return the entire division.
+        return all_species
+
     def discover_species(self):
         ensembl_division = SurveyJobKeyValue.objects.get(
             survey_job_id=self.survey_job.id, key__exact="ensembl_division"
@@ -368,6 +425,7 @@ def discover_species(self):
             organism_name = None
 
         strain_mapping = None
+
         if ensembl_division in ["EnsemblFungi", "EnsemblBacteria"]:
             if organism_name is None:
                 logger.error(
@@ -389,51 +447,15 @@ def discover_species(self):
                     )
                     return []
 
-        # The main division has a different base URL for its REST API.
-        if ensembl_division == "Ensembl":
-            r = utils.requests_retry_session().get(MAIN_DIVISION_URL_TEMPLATE)
-
-            # Yes I'm aware that specieses isn't a word. However I need to
-            # distinguish between a singlular species and multiple species.
-            specieses = r.json()["species"]
-        else:
-            formatted_division_url = DIVISION_URL_TEMPLATE.format(division=ensembl_division)
-            r = utils.requests_retry_session().get(formatted_division_url)
-            specieses = r.json()
-
-        all_new_species = []
-        if organism_name:
-            if strain_mapping:
-                organism_name = organism_name + "_" + strain_mapping["strain"].lower()
-
-            for species in specieses:
-                if (
-                    ensembl_division in ["EnsemblFungi", "EnsemblBacteria"]
-                    and organism_name in species["name"]
-                ):
-                    # Fungi and Bacteria have a strain identifier in their
-                    # names. This is different than everything else,
-                    # so we're going to handle this special case by
-                    # just overwriting this. This is okay because we
-                    # just have to discover one species for the
-                    # organism, and then our strain mapping will make
-                    # sure we use the correct strain and assembly.
-                    species["name"] = organism_name
-
-                    all_new_species.append(self._generate_files(species))
-                    break
-                elif "name" in species and organism_name == species["name"]:
-                    all_new_species.append(self._generate_files(species))
-                    break
-        else:
-            for species in specieses:
-                all_new_species.append(self._generate_files(species))
+        discovered_species = TranscriptomeIndexSurveyor.discover_division_species(
+            ensembl_division, organism_name, strain_mapping
+        )
 
-        if len(all_new_species) == 0:
+        if len(discovered_species) == 0:
             logger.error(
                 "Unable to find any species!",
                 ensembl_division=ensembl_division,
                 organism_name=organism_name,
             )
 
-        return all_new_species
+        return list(map(self._generate_files, discovered_species))