-
-
Notifications
You must be signed in to change notification settings - Fork 20
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'feature/automatic-accessions' into ark/3111-update-foreman-ami
- Loading branch information
Showing
65 changed files
with
1,821 additions
and
1,184 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
38 changes: 38 additions & 0 deletions
38
common/data_refinery_common/migrations/0071_gatheredaccession.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
# Generated by Django 3.2.7 on 2022-09-13 18:14 | ||
|
||
from django.db import migrations, models | ||
|
||
|
||
class Migration(migrations.Migration):
    """Create the GatheredAccession model, backed by the gathered_accessions table."""

    # Applies on top of the previous auto-generated migration in this app.
    dependencies = [
        ("data_refinery_common", "0070_auto_20211208_2118"),
    ]

    operations = [
        migrations.CreateModel(
            name="GatheredAccession",
            fields=[
                (
                    "id",
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                # Natural key for gathered accessions; enforced unique at the DB level.
                ("accession_code", models.TextField(unique=True)),
                # Timestamps are managed automatically by Django on insert/update.
                ("created_at", models.DateTimeField(auto_now_add=True)),
                ("last_modified_at", models.DateTimeField(auto_now=True)),
                ("organism", models.TextField()),
                ("published_date", models.DateTimeField()),
                ("sample_count", models.PositiveIntegerField(default=0)),
                ("source", models.TextField()),
                ("technology", models.TextField()),
            ],
            options={
                # Explicit table name instead of the default app_model name.
                "db_table": "gathered_accessions",
            },
        ),
    ]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
from datetime import datetime | ||
|
||
from django.db import models | ||
from django.utils import timezone | ||
|
||
|
||
class GatheredAccession(models.Model):
    """An experiment accession gathered from an external data source.

    Identity is defined solely by ``accession_code`` (see ``__eq__`` and
    ``__hash__``), so instances can be deduplicated via sets regardless of
    their database primary key.
    """

    class Meta:
        db_table = "gathered_accessions"

    accession_code = models.TextField(unique=True)
    created_at = models.DateTimeField(auto_now_add=True)
    last_modified_at = models.DateTimeField(auto_now=True)
    organism = models.TextField()
    published_date = models.DateTimeField()
    sample_count = models.PositiveIntegerField(default=0)
    source = models.TextField()
    technology = models.TextField()

    def __eq__(self, other: object) -> bool:
        """Return True if `other` is a GatheredAccession with the same accession code."""
        return isinstance(other, GatheredAccession) and self.accession_code == other.accession_code

    def __hash__(self) -> int:
        """Return a hash derived from the unique accession code (consistent with __eq__)."""
        return hash(self.accession_code)

    def __str__(self) -> str:
        """Return "<code>, <technology>, <source>, <published date>" for display."""
        return ", ".join(
            (
                self.accession_code,
                self.technology,
                self.source,
                str(self.published_date.date()),
            )
        )

    @classmethod
    def create_from_external_entry(cls, data, source, technology, organism=None):
        """Create an unsaved accession object from an external API entry.

        Handles entries from several upstream sources: the accession code is
        taken from "accession" (ArrayExpress/BioStudies), "gse" (GEO), or
        "secondary_study_accession" (ENA/SRA), whichever is present first.

        Args:
            data: dict-like API entry with source-specific keys.
            source: value stored in ``source`` (e.g. "ebi-biostudies").
            technology: value stored in ``technology`` (e.g. "microarray").
            organism: fallback organism name used only when the entry itself
                carries no "organism"/"scientific_name" key.

        Returns:
            An unsaved GatheredAccession instance.

        Raises:
            TypeError: if none of the published-date keys are present
                (``strptime`` receives None).
            ValueError: if the published date is not in "%Y-%m-%d" format.
        """
        accession = cls()

        accession.accession_code = (
            data.get("accession") or data.get("gse") or data.get("secondary_study_accession")
        )

        # Prefer the entry's own organism; fall back to the caller-supplied one.
        organism = data.get("organism") or data.get("scientific_name") or organism
        if organism:
            # Lowercased for consistent cross-source comparison.
            accession.organism = organism.lower()

        published_date = (
            data.get("first_public") or data.get("release_date") or data.get("submission_date")
        )
        # Dates arrive as naive "YYYY-MM-DD" strings; make them timezone-aware
        # in the current Django timezone before storing.
        accession.published_date = timezone.make_aware(
            datetime.strptime(published_date, "%Y-%m-%d")
        )

        accession.source = source
        accession.technology = technology

        return accession
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
# Base image: Ubuntu 18.04 (bionic) — matches the bionic-cran35 CRAN
# repository added below.
FROM ubuntu:18.04

# Fail in case of an error at any stage in the pipe.
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

WORKDIR /home/user

# Prevent tzdata from prompting us for a timezone and hanging the build.
ENV DEBIAN_FRONTEND=noninteractive

# Source: https://github.com/thisbejim/Pyrebase/issues/87#issuecomment-354452082
# For whatever reason this worked and 'en_US.UTF-8' did not.
ENV LANG=C.UTF-8

# Single RUN layer: add PPAs (apt-fast, python3.8 via deadsnakes, llvm-10) and
# the CRAN R repository, install build/runtime packages, then clean apt caches
# in the same layer to keep the image small.
COPY workers/CRAN.gpg .
RUN apt-get update && \
  apt-get install --no-install-recommends -y software-properties-common && \
  add-apt-repository ppa:apt-fast/stable && \
  add-apt-repository ppa:deadsnakes/ppa && \
  add-apt-repository ppa:savoury1/llvm-defaults-10 && \
  apt-get update -qq && \
  apt-get install --no-install-recommends -y \
  apt-fast \
  apt-transport-https \
  gpg-agent && \
  apt-key add CRAN.gpg && \
  echo "deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/" \
  >> /etc/apt/sources.list.d/added_repos.list && \
  apt-fast update -qq && apt-fast install -y \
  build-essential \
  cmake \
  curl \
  cython3 \
  ed \
  git \
  libcairo-dev \
  libcurl4-gnutls-dev \
  libedit-dev \
  libgit2-dev \
  libpq-dev \
  libssl-dev \
  libxml2-dev \
  llvm-10-dev \
  lsb-release \
  mercurial \
  pkg-config \
  python3-pip \
  python3.8 \
  python3.8-dev \
  r-base-core \
  wget && \
  apt-get clean && \
  rm CRAN.gpg && \
  rm -rf /var/lib/apt/lists/* && \
  ln -s /usr/bin/llvm-config-10 /usr/bin/llvm-config && \
  update-alternatives --install \
  /usr/bin/python3 python3 /usr/bin/python3.8 1 && \
  groupadd user && \
  useradd --create-home --home-dir /home/user/ -g user user && \
  chown -R user /home/user/

# Pre-install dev tools.
# R packages go to the shared site-library so later layers/users find them.
ENV R_LIBS=/usr/local/lib/R/site-library
COPY common/R/install_devtools.R .
RUN Rscript install_devtools.R
Empty file.
Empty file.
137 changes: 137 additions & 0 deletions
137
foreman/data_refinery_foreman/gatherer/agents/ae_agent.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,137 @@ | ||
""" | ||
MicroArray ArrayExpress accession gathering automation. | ||
Data source: https://www.ebi.ac.uk/biostudies/help | ||
""" | ||
|
||
from typing import List, Set | ||
|
||
import requests | ||
from retrying import retry | ||
|
||
from data_refinery_common.logging import get_and_configure_logger | ||
from data_refinery_common.models.gathered_accession import GatheredAccession | ||
from data_refinery_foreman.gatherer.agents.base import AccessionAgentBase | ||
|
||
logger = get_and_configure_logger(__name__) | ||
|
||
|
||
class AEAgent(AccessionAgentBase):
    """
    MicroArray ArrayExpress accession gathering agent. The data is fetched from
    the BioStudies database. See https://www.ebi.ac.uk/biostudies/help and
    https://www.ebi.ac.uk/biostudies/arrayexpress/help#programmatic for more
    information about the API endpoints.
    """

    # Number of entries requested per API page (also used as the page size
    # in the search query).
    DATA_CHUNK_SIZE = 100
    # BioStudies search endpoint used for all requests.
    DATA_URL = "https://www.ebi.ac.uk/biostudies/api/v1/search"
    # Stored on each GatheredAccession as its `source`.
    SOURCE = "ebi-biostudies"
    SOURCE_NAME = "microarray-ae"
    # Stored on each GatheredAccession as its `technology`.
    TECHNOLOGY = "microarray"

    def build_query(self) -> dict:
        """Returns a query dict for getting array/organism specific accessions."""
        # NOTE(review): self.since/self.until/self.ids/self.keyword/self.organism
        # presumably come from AccessionAgentBase (not visible here) — verify.
        query_dict = {
            "directsub": "true",
            "page": 1,
            "pageSize": self.DATA_CHUNK_SIZE,
            "release_date": f"[{self.since} TO {self.until}]",
            "type": "study",
        }

        # The three filters are mutually exclusive, in priority order:
        # explicit IDs, then keyword, then organism.
        if self.ids:
            # TODO(ark): figure out better way of array filtering.
            # Also make sure it's equivalent to the array filtering in this query
            # https://github.com/AlexsLemonade/accession_retrieval/blob/master/experiment_accession_retrieval.R#L208
            query_dict.update({"content": ", ".join(self.ids)})
        elif self.keyword:
            query_dict.update({"content": self.keyword})
        elif self.organism:
            query_dict.update({"organism": f'"{self.organism}"'})

        return query_dict

    def collect_data(self) -> Set[str]:
        """Gets new accessions from EBI Biostudies API.

        Returns an empty set without querying when no filter (IDs, keyword,
        or organism) was provided.
        """
        accessions = set()

        # Pick the log message matching whichever filter build_query() will use.
        if self.ids:
            message = (
                "Getting MicroArray ArrayExpress entries by "
                f"ArrayExpress ID(s): {', '.join(self.ids)} for [{self.since} - {self.until}] "
                "range."
            )
        elif self.keyword:
            message = (
                "Getting MicroArray ArrayExpress entries by "
                f'"{self.keyword}" keyword for [{self.since} - {self.until}] range.'
            )
        elif self.organism:
            message = (
                "Getting MicroArray ArrayExpress entries by "
                f'"{self.organism}" organism for [{self.since} - {self.until}] range.'
            )
        else:
            # No filter configured: nothing to fetch.
            return accessions

        logger.debug(message)
        accessions.update(self.fetch_data())

        return accessions

    def fetch_data(self) -> Set[str]:
        """Retrieves accessions from API search endpoint.

        Pages through the search results until a page comes back empty or
        (when self.count is set) enough accessions have been collected.

        NOTE(review): despite the Set[str] annotation, the set is populated
        with GatheredAccession objects built below — confirm callers rely on
        the objects, not plain strings.
        """

        @retry(**self.retry_params)
        def get_response(url, **kwargs):
            """Gets response from an API endpoint."""
            return requests.get(url, **kwargs)

        accessions = set()

        is_done = False
        params = self.build_query()
        while not is_done:
            # 1-based index range of the entries covered by the current page,
            # for debug logging only.
            range_start = (params["page"] - 1) * params["pageSize"] + 1
            range_end = (params["page"] - 1) * params["pageSize"] + self.DATA_CHUNK_SIZE
            logger.debug(f"Processing entries {range_start} - {range_end}")

            response = get_response(self.DATA_URL, params=params)
            entries = response.json().get("hits", ())
            if entries:
                # Lazily convert raw API entries into GatheredAccession objects.
                # The generator is consumed by accessions.update() below.
                entries = (
                    GatheredAccession.create_from_external_entry(
                        entry, self.SOURCE, self.TECHNOLOGY, organism=self.organism
                    )
                    for entry in entries
                )
                params["page"] += 1
            else:
                # An empty page means we've exhausted the search results.
                is_done = True

            # Drop accessions already gathered in previous runs.
            if self.previous_accessions:
                entries = (
                    entry
                    for entry in entries
                    if entry.accession_code not in self.previous_accessions
                )
            accessions.update(entries)

            # Quit after getting a sufficient amount of accessions.
            if self.count and len(accessions) >= self.count:
                is_done = True

        return accessions

    def get_ids(self) -> List[str]:
        """Returns a combined list of passed ArrayExpress IDs.

        Merges IDs given directly via the "ae_id" option with IDs read
        (one per line) from the file named by the "ae_ids_file" option,
        deduplicated and sorted.
        """
        ids = set()

        if self.options["ae_id"]:
            ids.update(self.options["ae_id"])

        if self.options["ae_ids_file"]:
            with open(self.options["ae_ids_file"]) as ae_ids_file:
                ids.update((ae_id.strip() for ae_id in ae_ids_file.readlines()))

        return sorted(ids)
Oops, something went wrong.