Commit

Merge branch 'feature/automatic-accessions' into ark/3111-update-foreman-ami
arkid15r committed Feb 3, 2023
2 parents 8b79bae + 7621cde commit ffd289d
Showing 65 changed files with 1,821 additions and 1,184 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -22,7 +22,7 @@ repos:
- id: isort

- repo: https://github.com/psf/black
-   rev: 19.10b0
+   rev: 22.3.0
hooks:
- id: black
args: [--line-length=100]
6 changes: 4 additions & 2 deletions ami/instance.tf
@@ -25,7 +25,8 @@ resource "aws_instance" "ubuntu-ami-template-instance" {
# Our instance-user-data.sh script is built by Terraform at
# apply-time so that it can put additional files onto the
# instance. For more information see the definition of this resource.
-  user_data = templatefile("ubuntu-instance-user-data.tpl.sh", {
+  user_data = templatefile("ubuntu-instance-user-data.tpl.sh",
+    {
docker_apt_key = data.local_file.docker_apt_key.content
}
)
@@ -49,7 +50,8 @@ resource "aws_instance" "ecs-ami-template-instance" {
# Our instance-user-data.sh script is built by Terraform at
# apply-time so that it can put additional files onto the
# instance. For more information see the definition of this resource.
-  user_data = templatefile("ecs-instance-user-data.tpl.sh", {
+  user_data = templatefile("ecs-instance-user-data.tpl.sh",
+    {
docker_apt_key = data.local_file.docker_apt_key.content
}
)
12 changes: 6 additions & 6 deletions common/install_devtools.R → common/R/install_devtools.R
@@ -16,9 +16,9 @@
# Cranlock was used to find the versions of dependencies to install

# Treat warnings as errors, set CRAN mirror, and set parallelization:
-options(warn=2)
-options(repos=structure(c(CRAN="https://cloud.r-project.org/")))
-options(Ncpus=parallel::detectCores())
+options(warn = 2)
+options(repos = structure(c(CRAN = "https://cloud.r-project.org/")))
+options(Ncpus = parallel::detectCores())


install_package_version <- function(package_name, version) {
@@ -31,18 +31,18 @@ install_package_version <- function(package_name, version) {
package_url <- paste0("https://cloud.r-project.org/src/contrib/", package_tarball)

# Give CRAN a full minute to timeout since it's not always the most reliable.
-  curl_result <- system(paste0("curl --head --connect-timeout 60 ", package_url), intern=TRUE)
+  curl_result <- system(paste0("curl --head --connect-timeout 60 ", package_url), intern = TRUE)
if (grepl("404", curl_result[1])) {
package_url <- paste0("https://cloud.r-project.org/src/contrib/Archive/", package_name, "/", package_tarball)

# Make sure the package actually exists in the archive!
-    curl_result <- system(paste0("curl --head --connect-timeout 120 ", package_url), intern=TRUE)
+    curl_result <- system(paste0("curl --head --connect-timeout 120 ", package_url), intern = TRUE)
if (grepl("404", curl_result[1])) {
stop(paste("Package", package_name, "version", version, "does not exist!"))
}
}

-  install.packages(package_url)
+  install.packages(package_url, Ncpus = 32)
}

# Generated using cranlock
38 changes: 38 additions & 0 deletions common/data_refinery_common/migrations/0071_gatheredaccession.py
@@ -0,0 +1,38 @@
# Generated by Django 3.2.7 on 2022-09-13 18:14

from django.db import migrations, models


class Migration(migrations.Migration):

dependencies = [
("data_refinery_common", "0070_auto_20211208_2118"),
]

operations = [
migrations.CreateModel(
name="GatheredAccession",
fields=[
(
"id",
models.AutoField(
auto_created=True,
primary_key=True,
serialize=False,
verbose_name="ID",
),
),
("accession_code", models.TextField(unique=True)),
("created_at", models.DateTimeField(auto_now_add=True)),
("last_modified_at", models.DateTimeField(auto_now=True)),
("organism", models.TextField()),
("published_date", models.DateTimeField()),
("sample_count", models.PositiveIntegerField(default=0)),
("source", models.TextField()),
("technology", models.TextField()),
],
options={
"db_table": "gathered_accessions",
},
),
]
1 change: 1 addition & 0 deletions common/data_refinery_common/models/__init__.py
@@ -45,6 +45,7 @@
from data_refinery_common.models.dataset_annotation import DatasetAnnotation # noqa
from data_refinery_common.models.experiment import Experiment # noqa
from data_refinery_common.models.experiment_annotation import ExperimentAnnotation # noqa
+from data_refinery_common.models.gathered_accession import GatheredAccession  # noqa
from data_refinery_common.models.jobs.downloader_job import DownloaderJob # noqa
from data_refinery_common.models.jobs.processor_job import ProcessorJob # noqa
from data_refinery_common.models.jobs.survey_job import SurveyJob # noqa
64 changes: 64 additions & 0 deletions common/data_refinery_common/models/gathered_accession.py
@@ -0,0 +1,64 @@
from datetime import datetime

from django.db import models
from django.utils import timezone


class GatheredAccession(models.Model):
"""Gathered accession model."""

class Meta:
db_table = "gathered_accessions"

accession_code = models.TextField(unique=True)
created_at = models.DateTimeField(auto_now_add=True)
last_modified_at = models.DateTimeField(auto_now=True)
organism = models.TextField()
published_date = models.DateTimeField()
sample_count = models.PositiveIntegerField(default=0)
source = models.TextField()
technology = models.TextField()

def __eq__(self, other: object) -> bool:
"""Returns True if two objects are equal. Otherwise returns False."""
return isinstance(other, GatheredAccession) and self.accession_code == other.accession_code

def __hash__(self) -> int:
"""Returns accession object unique hash value."""
return hash(self.accession_code)

def __str__(self) -> str:
"""Returns accession default string representation."""
return ", ".join(
(
self.accession_code,
self.technology,
self.source,
str(self.published_date.date()),
)
)

@staticmethod
def create_from_external_entry(data, source, technology, organism=None):
    """Creates an accession object from an external data source entry."""
accession = GatheredAccession()
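# The accession code key varies by data source ("accession", "gse",
# or "secondary_study_accession"), so fall back across the known keys.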

accession.accession_code = (
data.get("accession") or data.get("gse") or data.get("secondary_study_accession")
)

organism = data.get("organism") or data.get("scientific_name") or organism
if organism:
accession.organism = organism.lower()

published_date = (
data.get("first_public") or data.get("release_date") or data.get("submission_date")
)
accession.published_date = timezone.make_aware(
datetime.strptime(published_date, "%Y-%m-%d")
)

accession.source = source
accession.technology = technology

return accession
65 changes: 65 additions & 0 deletions common/dockerfiles/Dockerfile.base
@@ -0,0 +1,65 @@
FROM ubuntu:18.04

# Fail in case of an error at any stage in the pipe.
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

WORKDIR /home/user

# Prevent tzdata from prompting us for a timezone and hanging the build.
ENV DEBIAN_FRONTEND=noninteractive

# Source: https://github.com/thisbejim/Pyrebase/issues/87#issuecomment-354452082
# For whatever reason this worked and 'en_US.UTF-8' did not.
ENV LANG=C.UTF-8

COPY workers/CRAN.gpg .
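# Register the CRAN repository and helper PPAs, install the system, Python 3.8,
# R, and LLVM build dependencies in a single layer, then set up a non-root user.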
RUN apt-get update && \
apt-get install --no-install-recommends -y software-properties-common && \
add-apt-repository ppa:apt-fast/stable && \
add-apt-repository ppa:deadsnakes/ppa && \
add-apt-repository ppa:savoury1/llvm-defaults-10 && \
apt-get update -qq && \
apt-get install --no-install-recommends -y \
apt-fast \
apt-transport-https \
gpg-agent && \
apt-key add CRAN.gpg && \
echo "deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/" \
>> /etc/apt/sources.list.d/added_repos.list && \
apt-fast update -qq && apt-fast install -y \
build-essential \
cmake \
curl \
cython3 \
ed \
git \
libcairo-dev \
libcurl4-gnutls-dev \
libedit-dev \
libgit2-dev \
libpq-dev \
libssl-dev \
libxml2-dev \
llvm-10-dev \
lsb-release \
mercurial \
pkg-config \
python3-pip \
python3.8 \
python3.8-dev \
r-base-core \
wget && \
apt-get clean && \
rm CRAN.gpg && \
rm -rf /var/lib/apt/lists/* && \
ln -s /usr/bin/llvm-config-10 /usr/bin/llvm-config && \
update-alternatives --install \
/usr/bin/python3 python3 /usr/bin/python3.8 1 && \
groupadd user && \
useradd --create-home --home-dir /home/user/ -g user user && \
chown -R user /home/user/

# Pre-install dev tools.
ENV R_LIBS=/usr/local/lib/R/site-library
COPY common/R/install_devtools.R .
RUN Rscript install_devtools.R
Empty file.
Empty file.
137 changes: 137 additions & 0 deletions foreman/data_refinery_foreman/gatherer/agents/ae_agent.py
@@ -0,0 +1,137 @@
"""
MicroArray ArrayExpress accession gathering automation.
Data source: https://www.ebi.ac.uk/biostudies/help
"""

from typing import List, Set

import requests
from retrying import retry

from data_refinery_common.logging import get_and_configure_logger
from data_refinery_common.models.gathered_accession import GatheredAccession
from data_refinery_foreman.gatherer.agents.base import AccessionAgentBase

logger = get_and_configure_logger(__name__)


class AEAgent(AccessionAgentBase):
"""
MicroArray ArrayExpress accession gathering agent. The data is fetched from
the BioStudies database. See https://www.ebi.ac.uk/biostudies/help and
https://www.ebi.ac.uk/biostudies/arrayexpress/help#programmatic for more
information about the API endpoints.
"""

DATA_CHUNK_SIZE = 100
DATA_URL = "https://www.ebi.ac.uk/biostudies/api/v1/search"
SOURCE = "ebi-biostudies"
SOURCE_NAME = "microarray-ae"
TECHNOLOGY = "microarray"

def build_query(self) -> dict:
"""Returns a query dict for getting array/organism specific accessions."""
query_dict = {
"directsub": "true",
"page": 1,
"pageSize": self.DATA_CHUNK_SIZE,
"release_date": f"[{self.since} TO {self.until}]",
"type": "study",
}

if self.ids:
# TODO(ark): figure out better way of array filtering.
# Also make sure it's equivalent to the array filtering in this query
# https://github.com/AlexsLemonade/accession_retrieval/blob/master/experiment_accession_retrieval.R#L208
query_dict.update({"content": ", ".join(self.ids)})
elif self.keyword:
query_dict.update({"content": self.keyword})
elif self.organism:
query_dict.update({"organism": f'"{self.organism}"'})

return query_dict

def collect_data(self) -> Set[str]:
"""Gets new accessions from EBI Biostudies API."""
accessions = set()

if self.ids:
message = (
"Getting MicroArray ArrayExpress entries by "
f"ArrayExpress ID(s): {', '.join(self.ids)} for [{self.since} - {self.until}] "
"range."
)
elif self.keyword:
message = (
"Getting MicroArray ArrayExpress entries by "
f'"{self.keyword}" keyword for [{self.since} - {self.until}] range.'
)
elif self.organism:
message = (
"Getting MicroArray ArrayExpress entries by "
f'"{self.organism}" organism for [{self.since} - {self.until}] range.'
)
else:
return accessions

logger.debug(message)
accessions.update(self.fetch_data())

return accessions

def fetch_data(self) -> Set[str]:
"""Retrieves accessions from API search endpoint."""

@retry(**self.retry_params)
def get_response(url, **kwargs):
"""Gets response from an API endpoint."""
return requests.get(url, **kwargs)

accessions = set()

is_done = False
params = self.build_query()
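# Page through the search endpoint until it returns no more hits or the
# requested number of accessions has been collected.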
while not is_done:
range_start = (params["page"] - 1) * params["pageSize"] + 1
range_end = (params["page"] - 1) * params["pageSize"] + self.DATA_CHUNK_SIZE
logger.debug(f"Processing entries {range_start} - {range_end}")

response = get_response(self.DATA_URL, params=params)
entries = response.json().get("hits", ())
if entries:
entries = (
GatheredAccession.create_from_external_entry(
entry, self.SOURCE, self.TECHNOLOGY, organism=self.organism
)
for entry in entries
)
params["page"] += 1
else:
is_done = True

if self.previous_accessions:
entries = (
entry
for entry in entries
if entry.accession_code not in self.previous_accessions
)
accessions.update(entries)

# Quit after getting a sufficient amount of accessions.
if self.count and len(accessions) >= self.count:
is_done = True

return accessions

def get_ids(self) -> List[str]:
"""Returns a combined list of passed ArrayExpress IDs."""
ids = set()

if self.options["ae_id"]:
ids.update(self.options["ae_id"])

if self.options["ae_ids_file"]:
with open(self.options["ae_ids_file"]) as ae_ids_file:
ids.update((ae_id.strip() for ae_id in ae_ids_file.readlines()))

return sorted(ids)