Commit

Merge branch 'feature/automatic-accessions' into ark/3111-update-foreman-ami
arkid15r committed Feb 3, 2023
2 parents 8b79bae + 7621cde commit ffd289d
Showing 65 changed files with 1,821 additions and 1,184 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -22,7 +22,7 @@ repos:
- id: isort

- repo: https://github.com/psf/black
-   rev: 19.10b0
+   rev: 22.3.0
hooks:
- id: black
args: [--line-length=100]
6 changes: 4 additions & 2 deletions ami/instance.tf
@@ -25,7 +25,8 @@ resource "aws_instance" "ubuntu-ami-template-instance" {
# Our instance-user-data.sh script is built by Terraform at
# apply-time so that it can put additional files onto the
# instance. For more information see the definition of this resource.
-  user_data = templatefile("ubuntu-instance-user-data.tpl.sh", {
+  user_data = templatefile("ubuntu-instance-user-data.tpl.sh",
+    {
docker_apt_key = data.local_file.docker_apt_key.content
}
)
@@ -49,7 +50,8 @@ resource "aws_instance" "ecs-ami-template-instance" {
# Our instance-user-data.sh script is built by Terraform at
# apply-time so that it can put additional files onto the
# instance. For more information see the definition of this resource.
-  user_data = templatefile("ecs-instance-user-data.tpl.sh", {
+  user_data = templatefile("ecs-instance-user-data.tpl.sh",
+    {
docker_apt_key = data.local_file.docker_apt_key.content
}
)
12 changes: 6 additions & 6 deletions common/install_devtools.R → common/R/install_devtools.R
@@ -16,9 +16,9 @@
# Cranlock was used to find the versions of dependencies to install

# Treat warnings as errors, set CRAN mirror, and set parallelization:
-options(warn=2)
-options(repos=structure(c(CRAN="https://cloud.r-project.org/")))
-options(Ncpus=parallel::detectCores())
+options(warn = 2)
+options(repos = structure(c(CRAN = "https://cloud.r-project.org/")))
+options(Ncpus = parallel::detectCores())


install_package_version <- function(package_name, version) {
@@ -31,18 +31,18 @@ install_package_version <- function(package_name, version) {
package_url <- paste0("https://cloud.r-project.org/src/contrib/", package_tarball)

# Give CRAN a full minute to timeout since it's not always the most reliable.
-  curl_result <- system(paste0("curl --head --connect-timeout 60 ", package_url), intern=TRUE)
+  curl_result <- system(paste0("curl --head --connect-timeout 60 ", package_url), intern = TRUE)
if (grepl("404", curl_result[1])) {
package_url <- paste0("https://cloud.r-project.org/src/contrib/Archive/", package_name, "/", package_tarball)

# Make sure the package actually exists in the archive!
-    curl_result <- system(paste0("curl --head --connect-timeout 120 ", package_url), intern=TRUE)
+    curl_result <- system(paste0("curl --head --connect-timeout 120 ", package_url), intern = TRUE)
if (grepl("404", curl_result[1])) {
stop(paste("Package", package_name, "version", version, "does not exist!"))
}
}

-  install.packages(package_url)
+  install.packages(package_url, Ncpus = 32)
}

# Generated using cranlock
38 changes: 38 additions & 0 deletions common/data_refinery_common/migrations/0071_gatheredaccession.py
@@ -0,0 +1,38 @@
# Generated by Django 3.2.7 on 2022-09-13 18:14

from django.db import migrations, models


class Migration(migrations.Migration):

dependencies = [
("data_refinery_common", "0070_auto_20211208_2118"),
]

operations = [
migrations.CreateModel(
name="GatheredAccession",
fields=[
(
"id",
models.AutoField(
auto_created=True,
primary_key=True,
serialize=False,
verbose_name="ID",
),
),
("accession_code", models.TextField(unique=True)),
("created_at", models.DateTimeField(auto_now_add=True)),
("last_modified_at", models.DateTimeField(auto_now=True)),
("organism", models.TextField()),
("published_date", models.DateTimeField()),
("sample_count", models.PositiveIntegerField(default=0)),
("source", models.TextField()),
("technology", models.TextField()),
],
options={
"db_table": "gathered_accessions",
},
),
]
1 change: 1 addition & 0 deletions common/data_refinery_common/models/__init__.py
@@ -45,6 +45,7 @@
from data_refinery_common.models.dataset_annotation import DatasetAnnotation # noqa
from data_refinery_common.models.experiment import Experiment # noqa
from data_refinery_common.models.experiment_annotation import ExperimentAnnotation # noqa
+from data_refinery_common.models.gathered_accession import GatheredAccession  # noqa
from data_refinery_common.models.jobs.downloader_job import DownloaderJob # noqa
from data_refinery_common.models.jobs.processor_job import ProcessorJob # noqa
from data_refinery_common.models.jobs.survey_job import SurveyJob # noqa
64 changes: 64 additions & 0 deletions common/data_refinery_common/models/gathered_accession.py
@@ -0,0 +1,64 @@
from datetime import datetime

from django.db import models
from django.utils import timezone


class GatheredAccession(models.Model):
"""Gathered accession model."""

class Meta:
db_table = "gathered_accessions"

accession_code = models.TextField(unique=True)
created_at = models.DateTimeField(auto_now_add=True)
last_modified_at = models.DateTimeField(auto_now=True)
organism = models.TextField()
published_date = models.DateTimeField()
sample_count = models.PositiveIntegerField(default=0)
source = models.TextField()
technology = models.TextField()

def __eq__(self, other: object) -> bool:
"""Returns True if two objects are equal. Otherwise returns False."""
return isinstance(other, GatheredAccession) and self.accession_code == other.accession_code

def __hash__(self) -> int:
"""Returns accession object unique hash value."""
return hash(self.accession_code)

def __str__(self) -> str:
"""Returns accession default string representation."""
return ", ".join(
(
self.accession_code,
self.technology,
self.source,
str(self.published_date.date()),
)
)

@staticmethod
def create_from_external_entry(data, source, technology, organism=None):
    """Creates an accession object from an external data source entry."""
accession = GatheredAccession()
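# The accession code key varies by data source ("accession", "gse",
# or "secondary_study_accession"), so fall back across the known keys.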

accession.accession_code = (
data.get("accession") or data.get("gse") or data.get("secondary_study_accession")
)

organism = data.get("organism") or data.get("scientific_name") or organism
if organism:
accession.organism = organism.lower()

published_date = (
data.get("first_public") or data.get("release_date") or data.get("submission_date")
)
accession.published_date = timezone.make_aware(
datetime.strptime(published_date, "%Y-%m-%d")
)

accession.source = source
accession.technology = technology

return accession
65 changes: 65 additions & 0 deletions common/dockerfiles/Dockerfile.base
@@ -0,0 +1,65 @@
FROM ubuntu:18.04

# Fail in case of an error at any stage in the pipe.
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

WORKDIR /home/user

# Prevent tzdata from prompting us for a timezone and hanging the build.
ENV DEBIAN_FRONTEND=noninteractive

# Source: https://github.com/thisbejim/Pyrebase/issues/87#issuecomment-354452082
# For whatever reason this worked and 'en_US.UTF-8' did not.
ENV LANG=C.UTF-8

COPY workers/CRAN.gpg .
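# Register the CRAN repository and helper PPAs, install the system, Python 3.8,
# R, and LLVM build dependencies in a single layer, then set up a non-root user.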
RUN apt-get update && \
apt-get install --no-install-recommends -y software-properties-common && \
add-apt-repository ppa:apt-fast/stable && \
add-apt-repository ppa:deadsnakes/ppa && \
add-apt-repository ppa:savoury1/llvm-defaults-10 && \
apt-get update -qq && \
apt-get install --no-install-recommends -y \
apt-fast \
apt-transport-https \
gpg-agent && \
apt-key add CRAN.gpg && \
echo "deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/" \
>> /etc/apt/sources.list.d/added_repos.list && \
apt-fast update -qq && apt-fast install -y \
build-essential \
cmake \
curl \
cython3 \
ed \
git \
libcairo-dev \
libcurl4-gnutls-dev \
libedit-dev \
libgit2-dev \
libpq-dev \
libssl-dev \
libxml2-dev \
llvm-10-dev \
lsb-release \
mercurial \
pkg-config \
python3-pip \
python3.8 \
python3.8-dev \
r-base-core \
wget && \
apt-get clean && \
rm CRAN.gpg && \
rm -rf /var/lib/apt/lists/* && \
ln -s /usr/bin/llvm-config-10 /usr/bin/llvm-config && \
update-alternatives --install \
/usr/bin/python3 python3 /usr/bin/python3.8 1 && \
groupadd user && \
useradd --create-home --home-dir /home/user/ -g user user && \
chown -R user /home/user/

# Pre-install dev tools.
ENV R_LIBS=/usr/local/lib/R/site-library
COPY common/R/install_devtools.R .
RUN Rscript install_devtools.R
Empty file.
Empty file.
137 changes: 137 additions & 0 deletions foreman/data_refinery_foreman/gatherer/agents/ae_agent.py
@@ -0,0 +1,137 @@
"""
MicroArray ArrayExpress accession gathering automation.
Data source: https://www.ebi.ac.uk/biostudies/help
"""

from typing import List, Set

import requests
from retrying import retry

from data_refinery_common.logging import get_and_configure_logger
from data_refinery_common.models.gathered_accession import GatheredAccession
from data_refinery_foreman.gatherer.agents.base import AccessionAgentBase

logger = get_and_configure_logger(__name__)


class AEAgent(AccessionAgentBase):
"""
MicroArray ArrayExpress accession gathering agent. The data is fetched from
the BioStudies database. See https://www.ebi.ac.uk/biostudies/help and
https://www.ebi.ac.uk/biostudies/arrayexpress/help#programmatic for more
information about the API endpoints.
"""

DATA_CHUNK_SIZE = 100
DATA_URL = "https://www.ebi.ac.uk/biostudies/api/v1/search"
SOURCE = "ebi-biostudies"
SOURCE_NAME = "microarray-ae"
TECHNOLOGY = "microarray"

def build_query(self) -> dict:
"""Returns a query dict for getting array/organism specific accessions."""
query_dict = {
"directsub": "true",
"page": 1,
"pageSize": self.DATA_CHUNK_SIZE,
"release_date": f"[{self.since} TO {self.until}]",
"type": "study",
}

if self.ids:
# TODO(ark): figure out better way of array filtering.
# Also make sure it's equivalent to the array filtering in this query
# https://github.com/AlexsLemonade/accession_retrieval/blob/master/experiment_accession_retrieval.R#L208
query_dict.update({"content": ", ".join(self.ids)})
elif self.keyword:
query_dict.update({"content": self.keyword})
elif self.organism:
query_dict.update({"organism": f'"{self.organism}"'})

return query_dict

def collect_data(self) -> Set[str]:
"""Gets new accessions from EBI Biostudies API."""
accessions = set()

if self.ids:
message = (
"Getting MicroArray ArrayExpress entries by "
f"ArrayExpress ID(s): {', '.join(self.ids)} for [{self.since} - {self.until}] "
"range."
)
elif self.keyword:
message = (
"Getting MicroArray ArrayExpress entries by "
f'"{self.keyword}" keyword for [{self.since} - {self.until}] range.'
)
elif self.organism:
message = (
"Getting MicroArray ArrayExpress entries by "
f'"{self.organism}" organism for [{self.since} - {self.until}] range.'
)
else:
return accessions

logger.debug(message)
accessions.update(self.fetch_data())

return accessions

def fetch_data(self) -> Set[str]:
"""Retrieves accessions from API search endpoint."""

@retry(**self.retry_params)
def get_response(url, **kwargs):
"""Gets response from an API endpoint."""
return requests.get(url, **kwargs)

accessions = set()

is_done = False
params = self.build_query()
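# Page through the search endpoint until it returns no more hits or the
# requested number of accessions has been collected.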
while not is_done:
range_start = (params["page"] - 1) * params["pageSize"] + 1
range_end = (params["page"] - 1) * params["pageSize"] + self.DATA_CHUNK_SIZE
logger.debug(f"Processing entries {range_start} - {range_end}")

response = get_response(self.DATA_URL, params=params)
entries = response.json().get("hits", ())
if entries:
entries = (
GatheredAccession.create_from_external_entry(
entry, self.SOURCE, self.TECHNOLOGY, organism=self.organism
)
for entry in entries
)
params["page"] += 1
else:
is_done = True

if self.previous_accessions:
entries = (
entry
for entry in entries
if entry.accession_code not in self.previous_accessions
)
accessions.update(entries)

# Quit after getting a sufficient amount of accessions.
if self.count and len(accessions) >= self.count:
is_done = True

return accessions

def get_ids(self) -> List[str]:
"""Returns a combined list of passed ArrayExpress IDs."""
ids = set()

if self.options["ae_id"]:
ids.update(self.options["ae_id"])

if self.options["ae_ids_file"]:
with open(self.options["ae_ids_file"]) as ae_ids_file:
ids.update((ae_id.strip() for ae_id in ae_ids_file.readlines()))

return sorted(ids)