Skip to content

Commit

Permalink
fetch bacteria fungi by assembly assign species_taxonomy_id
Browse files Browse the repository at this point in the history
  • Loading branch information
davidsmejia committed Sep 27, 2023
1 parent d476f8e commit 4213147
Showing 1 changed file with 63 additions and 41 deletions.
104 changes: 63 additions & 41 deletions foreman/data_refinery_foreman/surveyor/transcriptome_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,63 @@ def survey(self, source_type=None) -> bool:

return True

@staticmethod
def discover_division_species(
    ensembl_division: str = "Ensembl", organism_name: str = None, strain_mapping: dict = None
):
    """Fetch the species listed for an Ensembl division, optionally narrowed to one.

    The division's species list is requested through
    ``utils.requests_retry_session()`` and then filtered:

    * ``strain_mapping`` given (fungi or bacteria): the species is looked up
      by ``assembly_name == strain_mapping["assembly"]``. Exactly one match
      is required; it is returned with its ``name`` replaced by
      ``"<organism_name>_<strain lowercased>"`` and its
      ``species_taxonomy_id`` replaced by the one from the mapping.
    * only ``organism_name`` given: the first species whose ``name`` equals
      it is returned.
    * neither given: every species in the division is returned.

    Args:
        ensembl_division: Division name. ``"Ensembl"`` selects the main
            division, which uses a different URL template and wraps its
            list in a ``"species"`` key.
        organism_name: Organism to look for, or ``None``.
        strain_mapping: Mapping providing the ``"strain"``, ``"assembly"``
            and ``"species_taxonomy_id"`` keys, or ``None``.

    Returns:
        A list of species dicts. It is empty when nothing matched, or when
        the assembly lookup did not yield exactly one species.
    """
    # The main division has a different base URL for its REST API.
    if ensembl_division == "Ensembl":
        r = utils.requests_retry_session().get(MAIN_DIVISION_URL_TEMPLATE)
        all_species = r.json()["species"]
    else:
        formatted_division_url = DIVISION_URL_TEMPLATE.format(division=ensembl_division)
        r = utils.requests_retry_session().get(formatted_division_url)
        all_species = r.json()

    # This will exist for fungi or bacteria.
    if strain_mapping:
        organism_strain_name = f"{organism_name}_{strain_mapping['strain'].lower()}"
        assembly = strain_mapping["assembly"]
        # .get() so an entry without an assembly name is skipped rather than
        # aborting the whole lookup with a KeyError.
        assembly_matches = [s for s in all_species if s.get("assembly_name") == assembly]

        if len(assembly_matches) != 1:
            # Currently we are unsure if there is always a 1:1 relationship
            # between organism strain and assembly, so we require exactly one
            # match in the entire division. Anything else is logged so that
            # someone can determine which species to use.
            #
            # The message carries no placeholders: all context is passed as
            # keyword arguments, like the other log calls in this file.
            if assembly_matches:
                message = "There were multiple species matching the assembly."
            else:
                message = "There were no species matching the assembly."
            logger.error(
                message,
                matches=len(assembly_matches),
                organism_name=organism_strain_name,
                ensembl_division=ensembl_division,
                assembly=assembly,
            )
            return []

        # Fungi and Bacteria have a strain identifier in their
        # names. This is different than everything else,
        # so we're going to handle this special case by
        # just overwriting this. This is okay because we
        # just have to discover one species for the
        # organism, and then our strain mapping will make
        # sure we use the correct strain and assembly.
        # The Organism will be created from the species_taxonomy_id.
        return [
            {
                **assembly_matches[0],
                "name": organism_strain_name,
                "species_taxonomy_id": strain_mapping["species_taxonomy_id"],
            }
        ]

    # No mapping: find the specific species, keeping only the first match.
    if organism_name:
        return [s for s in all_species if s.get("name") == organism_name][:1]

    # Return the entire division.
    return all_species

def discover_species(self):
ensembl_division = SurveyJobKeyValue.objects.get(
survey_job_id=self.survey_job.id, key__exact="ensembl_division"
Expand All @@ -368,6 +425,7 @@ def discover_species(self):
organism_name = None

strain_mapping = None

if ensembl_division in ["EnsemblFungi", "EnsemblBacteria"]:
if organism_name is None:
logger.error(
Expand All @@ -389,51 +447,15 @@ def discover_species(self):
)
return []

# The main division has a different base URL for its REST API.
if ensembl_division == "Ensembl":
r = utils.requests_retry_session().get(MAIN_DIVISION_URL_TEMPLATE)

# Yes I'm aware that specieses isn't a word. However I need to
# distinguish between a singular species and multiple species.
specieses = r.json()["species"]
else:
formatted_division_url = DIVISION_URL_TEMPLATE.format(division=ensembl_division)
r = utils.requests_retry_session().get(formatted_division_url)
specieses = r.json()

all_new_species = []
if organism_name:
if strain_mapping:
organism_name = organism_name + "_" + strain_mapping["strain"].lower()

for species in specieses:
if (
ensembl_division in ["EnsemblFungi", "EnsemblBacteria"]
and organism_name in species["name"]
):
# Fungi and Bacteria have a strain identifier in their
# names. This is different than everything else,
# so we're going to handle this special case by
# just overwriting this. This is okay because we
# just have to discover one species for the
# organism, and then our strain mapping will make
# sure we use the correct strain and assembly.
species["name"] = organism_name

all_new_species.append(self._generate_files(species))
break
elif "name" in species and organism_name == species["name"]:
all_new_species.append(self._generate_files(species))
break
else:
for species in specieses:
all_new_species.append(self._generate_files(species))
discovered_species = TranscriptomeIndexSurveyor.discover_division_species(
ensembl_division, organism_name, strain_mapping
)

if len(all_new_species) == 0:
if len(discovered_species) == 0:
logger.error(
"Unable to find any species!",
ensembl_division=ensembl_division,
organism_name=organism_name,
)

return all_new_species
return list(map(self._generate_files, discovered_species))

0 comments on commit 4213147

Please sign in to comment.