@staticmethod
def discover_division_species(
    ensembl_division: str = "Ensembl", organism_name: str = None, strain_mapping: dict = None
):
    """Fetch the species of an Ensembl division and narrow them down.

    Args:
        ensembl_division: Name of the division to query. "Ensembl" (the
            main division) is served from a different REST endpoint than
            every other division.
        organism_name: Optional species name. Without a strain mapping,
            only the first species whose "name" equals this value is kept.
        strain_mapping: Optional dict for fungi/bacteria. It is read for
            the keys "strain", "assembly" and "species_taxonomy_id".

    Returns:
        A list of species dicts:
        * with ``strain_mapping``: the single species whose
          "assembly_name" equals the mapped assembly, with "name" and
          "species_taxonomy_id" overwritten, or ``[]`` (after logging an
          error) when the assembly does not match exactly one species;
        * with only ``organism_name``: at most one species with that name;
        * with neither: every species in the division.
    """
    # The main division has a different base URL for its REST API and
    # wraps the species list in a "species" key.
    if ensembl_division == "Ensembl":
        response = utils.requests_retry_session().get(MAIN_DIVISION_URL_TEMPLATE)
        all_species = response.json()["species"]
    else:
        division_url = DIVISION_URL_TEMPLATE.format(division=ensembl_division)
        response = utils.requests_retry_session().get(division_url)
        all_species = response.json()

    # A strain mapping will exist for fungi or bacteria.
    if strain_mapping:
        assembly = strain_mapping["assembly"]
        organism_strain_name = f"{organism_name}_{strain_mapping['strain'].lower()}"

        # .get() keeps one record lacking "assembly_name" from aborting
        # the whole discovery with a KeyError.
        assembly_matches = [s for s in all_species if s.get("assembly_name") == assembly]

        if len(assembly_matches) != 1:
            # Currently we are unsure if there is always a 1:1 relationship
            # between organism strain and assembly, so we check that the
            # assembly is unique in the entire division. Zero matches and
            # several matches are both reported, because either way we
            # cannot determine which species to use.
            logger.error(
                "Expected exactly one species matching the strain assembly.",
                matches=len(assembly_matches),
                organism_name=organism_strain_name,
                ensembl_division=ensembl_division,
                assembly=assembly,
            )
            return []

        # Fungi and Bacteria have a strain identifier in their names.
        # This is different than everything else, so we handle this
        # special case by overwriting the name. This is okay because we
        # just have to discover one species for the organism, and the
        # strain mapping makes sure we use the correct strain and
        # assembly. The Organism will be created from the
        # species_taxonomy_id.
        species = {
            **assembly_matches[0],
            "name": organism_strain_name,
            "species_taxonomy_id": strain_mapping["species_taxonomy_id"],
        }
        # Return the bacterium or fungus.
        return [species]

    # No mapping: find the specific species, keeping only the first hit.
    if organism_name:
        name_matches = [s for s in all_species if s.get("name") == organism_name]
        return name_matches[:1]

    # Neither filter was given: return the entire division.
    return all_species
- specieses = r.json()["species"] - else: - formatted_division_url = DIVISION_URL_TEMPLATE.format(division=ensembl_division) - r = utils.requests_retry_session().get(formatted_division_url) - specieses = r.json() - - all_new_species = [] - if organism_name: - if strain_mapping: - organism_name = organism_name + "_" + strain_mapping["strain"].lower() - - for species in specieses: - if ( - ensembl_division in ["EnsemblFungi", "EnsemblBacteria"] - and organism_name in species["name"] - ): - # Fungi and Bacteria have a strain identifier in their - # names. This is different than everything else, - # so we're going to handle this special case by - # just overwriting this. This is okay because we - # just have to discover one species for the - # organism, and then our strain mapping will make - # sure we use the correct strain and assembly. - species["name"] = organism_name - - all_new_species.append(self._generate_files(species)) - break - elif "name" in species and organism_name == species["name"]: - all_new_species.append(self._generate_files(species)) - break - else: - for species in specieses: - all_new_species.append(self._generate_files(species)) + discovered_species = TranscriptomeIndexSurveyor.discover_division_species( + ensembl_division, organism_name, strain_mapping + ) - if len(all_new_species) == 0: + if len(discovered_species) == 0: logger.error( "Unable to find any species!", ensembl_division=ensembl_division, organism_name=organism_name, ) - return all_new_species + return list(map(self._generate_files, discovered_species))