From 6b606f2013982e78cee31a1bec8f2d6672cbf629 Mon Sep 17 00:00:00 2001 From: ark Date: Thu, 5 May 2022 20:30:59 -0700 Subject: [PATCH 01/24] Fix test_common job. --- workers/install_affy_only.R | 1 + 1 file changed, 1 insertion(+) diff --git a/workers/install_affy_only.R b/workers/install_affy_only.R index 354e0e592..b488a0287 100644 --- a/workers/install_affy_only.R +++ b/workers/install_affy_only.R @@ -8,6 +8,7 @@ options(Ncpus=parallel::detectCores()) devtools::install_version('dplyr', version='1.0.0') devtools::install_version('tidyr', version='1.1.0') devtools::install_version('ff', version='2.2-14') +devtools::install_version('locfit', version='1.5-9.4') # Helper function that installs a list of packages using the input URLs install_with_url <- function(urls) { From b87cbd21675c51dbaa8a956a3c87a1444ccade14 Mon Sep 17 00:00:00 2001 From: Arkadii Yakovets Date: Thu, 1 Sep 2022 11:03:18 -0700 Subject: [PATCH 02/24] Add `Accession` model. Update pre-commit config. --- .pre-commit-config.yaml | 2 +- .../migrations/0071_auto_20220901_1653.py | 44 +++++++++++++++++++ .../data_refinery_common/models/__init__.py | 1 + .../data_refinery_common/models/accession.py | 22 ++++++++++ 4 files changed, 68 insertions(+), 1 deletion(-) create mode 100644 common/data_refinery_common/migrations/0071_auto_20220901_1653.py create mode 100644 common/data_refinery_common/models/accession.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d024704da..b651ce24a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -22,7 +22,7 @@ repos: - id: isort - repo: https://github.com/psf/black - rev: 19.10b0 + rev: 22.3.0 hooks: - id: black args: [--line-length=100] diff --git a/common/data_refinery_common/migrations/0071_auto_20220901_1653.py b/common/data_refinery_common/migrations/0071_auto_20220901_1653.py new file mode 100644 index 000000000..c7d3b0b63 --- /dev/null +++ b/common/data_refinery_common/migrations/0071_auto_20220901_1653.py @@ -0,0 +1,44 @@ +# Generated by Django 3.2.7 on 2022-09-01 16:53 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("data_refinery_common", "0070_auto_20211208_2118"), + ] + + operations = [ + migrations.CreateModel( + name="Accession", + fields=[ + ( + "id", + models.AutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ("code", models.TextField()), + ("created_at", models.DateTimeField(auto_now_add=True)), + ("last_modified_at", models.DateTimeField(auto_now=True)), + ("organism", models.TextField()), + ("published_date", models.DateTimeField()), + ("sample_count", models.PositiveIntegerField(default=0)), + ("source", models.TextField()), + ("technology", models.TextField()), + ], + options={ + "db_table": "accessions", + }, + ), + migrations.AddConstraint( + model_name="accession", + constraint=models.UniqueConstraint( + fields=("code", "source", "technology"), name="unique_accession" + ), + ), + ] diff --git a/common/data_refinery_common/models/__init__.py b/common/data_refinery_common/models/__init__.py index 39abe7ee3..8e9564153 100644 --- a/common/data_refinery_common/models/__init__.py +++ b/common/data_refinery_common/models/__init__.py @@ -1,3 +1,4 @@ +from data_refinery_common.models.accession import Accession # noqa from data_refinery_common.models.api_token import APIToken # noqa from data_refinery_common.models.associations.compendium_result_organism_association import ( # noqa CompendiumResultOrganismAssociation, diff 
--git a/common/data_refinery_common/models/accession.py b/common/data_refinery_common/models/accession.py new file mode 100644 index 000000000..dc93cfd88 --- /dev/null +++ b/common/data_refinery_common/models/accession.py @@ -0,0 +1,22 @@ +from django.db import models + + +class Accession(models.Model): + """Accession model.""" + + class Meta: + constraints = ( + models.UniqueConstraint( + fields=("code", "source", "technology"), name="unique_accession" + ), + ) + db_table = "accessions" + + code = models.TextField() + created_at = models.DateTimeField(auto_now_add=True) + last_modified_at = models.DateTimeField(auto_now=True) + organism = models.TextField() + published_date = models.DateTimeField() + sample_count = models.PositiveIntegerField(default=0) + source = models.TextField() + technology = models.TextField() From f1b1c06ee75d91a97fa40c2276c3d7b937431a72 Mon Sep 17 00:00:00 2001 From: Arkadii Yakovets Date: Thu, 8 Sep 2022 18:12:39 -0700 Subject: [PATCH 03/24] Port Python script to Django command. - Introduce AccessionBacklogEntry model. - Clean up command flags. - Get previous accessions from the DB. - --- .../migrations/0071_accessionbacklogentry.py | 38 + .../data_refinery_common/models/accession.py | 82 +- .../gatherer/__init__.py | 0 .../gatherer/management/__init__.py | 0 .../gatherer/management/commands/__init__.py | 0 .../management/commands/gather_accessions.py | 731 ++++++++++++++++++ foreman/data_refinery_foreman/settings.py | 17 +- foreman/dockerfiles/Dockerfile.foreman | 2 + 8 files changed, 856 insertions(+), 14 deletions(-) create mode 100644 common/data_refinery_common/migrations/0071_accessionbacklogentry.py create mode 100644 foreman/data_refinery_foreman/gatherer/__init__.py create mode 100644 foreman/data_refinery_foreman/gatherer/management/__init__.py create mode 100644 foreman/data_refinery_foreman/gatherer/management/commands/__init__.py create mode 100644 foreman/data_refinery_foreman/gatherer/management/commands/gather_accessions.py diff --git a/common/data_refinery_common/migrations/0071_accessionbacklogentry.py b/common/data_refinery_common/migrations/0071_accessionbacklogentry.py new file mode 100644 index 000000000..86c04daed --- /dev/null +++ b/common/data_refinery_common/migrations/0071_accessionbacklogentry.py @@ -0,0 +1,38 @@ +# Generated by Django 3.2.7 on 2022-09-07 19:31 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("data_refinery_common", "0070_auto_20211208_2118"), + ] + + operations = [ + migrations.CreateModel( + name="AccessionBacklogEntry", + fields=[ + ( + "id", + models.AutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ("code", models.TextField(unique=True)), + ("created_at", models.DateTimeField(auto_now_add=True)), + ("last_modified_at", models.DateTimeField(auto_now=True)), + ("organism", models.TextField()), + ("published_date", models.DateTimeField()), + ("sample_count", models.PositiveIntegerField(default=0)), + ("source", models.TextField()), + ("technology", models.TextField()), + ], + options={ + "db_table": "accession_backlog", + }, + ), + ] diff --git a/common/data_refinery_common/models/accession.py b/common/data_refinery_common/models/accession.py index dc93cfd88..6ac62da9f 100644 --- a/common/data_refinery_common/models/accession.py +++ b/common/data_refinery_common/models/accession.py @@ -1,18 +1,16 @@ +from datetime import datetime + from django.db import models +from django.utils import timezone -class 
Accession(models.Model): - """Accession model.""" +class AccessionBacklogEntry(models.Model): + """Accession backlog entry model.""" class Meta: - constraints = ( - models.UniqueConstraint( - fields=("code", "source", "technology"), name="unique_accession" - ), - ) - db_table = "accessions" - - code = models.TextField() + db_table = "accession_backlog" + + code = models.TextField(unique=True) created_at = models.DateTimeField(auto_now_add=True) last_modified_at = models.DateTimeField(auto_now=True) organism = models.TextField() @@ -20,3 +18,67 @@ class Meta: sample_count = models.PositiveIntegerField(default=0) source = models.TextField() technology = models.TextField() + + def __eq__(self, other: object) -> bool: + """Returns True if two objects are equal. Otherwise returns False.""" + return isinstance(other, AccessionBacklogEntry) and self.code == other.code + + def __hash__(self) -> int: + """Returns accession object unique hash value.""" + return hash(self.code) + + def __str__(self) -> str: + """Returns accession default string representation.""" + return ", ".join((self.code, self.technology, self.source, str(self.published_date.date()))) + + @staticmethod + def create_from_ma_ae_entry(entry): + """Creates accession object from MicroArray ArrayExpress entry.""" + accession = AccessionBacklogEntry() + accession.code = entry["accession"] + accession.source = "ebi_biostudies" + accession.technology = "microarray" + + if "organism" in entry: + accession.organism = entry["organism"] + if "release_date" in entry: + accession.published_date = timezone.make_aware( + datetime.strptime(entry["release_date"], "%Y-%m-%d") + ) + + return accession + + @staticmethod + def create_from_ma_geo_entry(entry): + """Creates accession object from MicroArray GEO meta DB entry.""" + accession = AccessionBacklogEntry() + accession.code = entry["gse"] + accession.source = "geo_meta_db" + accession.technology = "microarray" + + if "organism" in entry: + accession.organism = entry["organism"].lower() + if "submission_date" in entry: + + accession.published_date = timezone.make_aware( + datetime.strptime(entry["submission_date"], "%Y-%m-%d") + ) + + return accession + + @staticmethod + def create_from_rnaseq_entry(entry): + """Creates accession object from RNA-Seq entry.""" + accession = AccessionBacklogEntry() + accession.code = entry["secondary_study_accession"] + accession.source = "ebi_ena_portal" + accession.technology = "rna-seq" + + if "scientific_name" in entry: + accession.organism = entry["scientific_name"].lower() + if "first_public" in entry: + accession.published_date = timezone.make_aware( + datetime.strptime(entry["first_public"], "%Y-%m-%d") + ) + + return accession diff --git a/foreman/data_refinery_foreman/gatherer/__init__.py b/foreman/data_refinery_foreman/gatherer/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/foreman/data_refinery_foreman/gatherer/management/__init__.py b/foreman/data_refinery_foreman/gatherer/management/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/foreman/data_refinery_foreman/gatherer/management/commands/__init__.py b/foreman/data_refinery_foreman/gatherer/management/commands/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/foreman/data_refinery_foreman/gatherer/management/commands/gather_accessions.py b/foreman/data_refinery_foreman/gatherer/management/commands/gather_accessions.py new file mode 100644 index 000000000..c4808a191 --- /dev/null +++ 
b/foreman/data_refinery_foreman/gatherer/management/commands/gather_accessions.py @@ -0,0 +1,731 @@ +"""MicroArray (ArrayExpress, GEO) and RNA-Seq accession gathering automation. +Data sources: + - https://www.ebi.ac.uk/biostudies/help (MicroArray ArrayExpress). + - local SQLite meta DB from https://www.bioconductor.org/packages/release/bioc/html/GEOmetadb.html + (MicroArray GEO). + - https://www.ebi.ac.uk/ena/portal/api/ (RNA-Seq). +""" + +import argparse +import logging +import os +import re +import sqlite3 +from datetime import datetime +from http.client import RemoteDisconnected +from json.decoder import JSONDecodeError +from typing import List, Set +from urllib.parse import quote + +from django.core.management.base import BaseCommand + +import requests +from requests.exceptions import ConnectionError, ConnectTimeout +from retrying import retry +from urllib3.exceptions import ProtocolError + +from data_refinery_common.logging import get_and_configure_logger +from data_refinery_common.models.accession import AccessionBacklogEntry +from data_refinery_common.models.experiment import Experiment + +log = get_and_configure_logger(__name__) + + +class Command(BaseCommand): + """Creates agents and runs actual accession gathering.""" + + RE_ACCESSION = re.compile(r"(\D+)(\d+)") + RE_DATE = re.compile(r"\d{4}-\d{2}-\d{2}") + + # TODO(ark): remove after upgrade to python3.8 where parser argument + # "extend" action is directly available. + # https://docs.python.org/3.8/library/argparse.html#action + class ExtendAction(argparse.Action): + def __call__(self, parser, namespace, values, option_string=None): + items = getattr(namespace, self.dest) or [] + items.extend(values) + setattr(namespace, self.dest, items) + + def add_arguments(self, parser) -> None: + parser.register("action", "extend", Command.ExtendAction) + + parser.add_argument( + "--ae-id", + action="extend", + nargs="+", + type=str, + help="ArrayExpress ID(s) to use for filtering.", + ) + parser.add_argument( + "--ae-ids-file", + type=str, + help="Path to a file containing ArrayExpress ID(s) to use for filtering.", + ) + parser.add_argument("-c", "--count", type=int, help="Number of accessions to collect.") + parser.add_argument( + "-d", + "--dry-run", + action="store_true", + default=False, + help="Do not write the result to the database.", + ) + parser.add_argument( + "-e", + "--exclude-previous", + action="store_true", + default=True, + help="Exclude previously gathered or surveyed accessions.", + ) + parser.add_argument( + "-ne", + "--no-exclude-previous", + action="store_false", + default=False, + dest="exclude_previous", + help="Do not exclude previously gathered or surveyed accessions.", + ) + parser.add_argument( + "--gpl-id", + action="extend", + nargs="+", + type=str, + help="GEO platform ID(s) to use for filtering.", + ) + parser.add_argument( + "--gpl-ids-file", + type=str, + help="Path to a file containing GEO platform ID(s) to use for filtering.", + ) + parser.add_argument( + "-k", + "--keyword", + type=str, + help="Keyword to use for filtering.", + ) + parser.add_argument( + "-m", + "--microarray", + action="store_true", + default=False, + help="Collect MicroArray accessions.", + ) + parser.add_argument( + "-o", "--organism", type=str, help="Organism name to use for filtering." 
+ ) + parser.add_argument( + "-r", + "--rna-seq", + action="store_true", + default=False, + help="Collect RNA-Seq accessions.", + ) + parser.add_argument( + "-s", + "--since", + type=str, + required=True, + help="Collect accessions made public on or after this date.", + ) + parser.add_argument( + "--taxon-id", + action="extend", + nargs="+", + type=int, + help="Taxon ID(s) to use for filtering.", + ) + parser.add_argument( + "--taxon-ids-file", + type=str, + help="Path to a file containing taxon ID(s) to use for filtering.", + ) + parser.add_argument( + "-u", + "--until", + type=str, + help="Collect accessions made public before or on this date.", + ) + parser.add_argument( + "-lv", + "--log-verbose", + action="store_true", + default=False, + help="Enable verbose log output.", + ) + + def set_verbosity_level(self, options) -> None: + """Configures log verbosity level.""" + if options["log_verbose"]: + log.addHandler(logging.StreamHandler()) + log.setLevel(logging.DEBUG) + else: + log.setLevel(logging.ERROR) + + def validate_args(self, options) -> None: + """Validates arguments.""" + if not options["microarray"] and not options["rna_seq"]: + exit("Either --microarray or --rna-seq must be specified.") + + errors = list() + since = options["since"] + until = options["until"] + if not self.RE_DATE.match(since): + errors.append('The -s, --since value must match "YYYY-MM-DD" format.') + if until and not self.RE_DATE.match(until): + errors.append('The -u, --until value must match "YYYY-MM-DD" format.') + if since and until and since > until: + errors.append("The -s, --since date must be earlier than -u, --until date.") + + keyword = options["keyword"] + organism = options["organism"] + if options["microarray"]: + ae_id = options["ae_id"] or options["ae_ids_file"] + gpl_id = options["gpl_id"] or options["gpl_ids_file"] + ids = ae_id or gpl_id + invalid_options_message = ( + "Exactly one of the keyword [-k, --keyword], organism [-o, --organism] or " + "ArrayExpress ID(s) [--ae-id, --ae-ids-file] / GEO platform ID(s) " + "[--gpl-id, --gpl-ids-file] must be specified." + ) + elif options["rna_seq"]: + taxon_id = options["taxon_id"] or options["taxon_ids_file"] + ids = taxon_id + invalid_options_message = ( + "Exactly one of the keyword [-k, --keyword], organism [-o, --organism] " + "or taxon ID(s) [--taxon-id, --taxon-ids-file] must be specified." + ) + + if len([option for option in (ids, keyword, organism) if option]) != 1: + errors.append(invalid_options_message) + + if errors: + exit("\n".join(errors)) + + def handle(self, *args, **options): + """Runs the accession gathering process.""" + self.validate_args(options) + self.set_verbosity_level(options) + + agents = list() + if options["rna_seq"]: + agents.append(RNASeqAccessionAgent(options)) + elif options["microarray"]: + if ( + options["ae_id"] + or options["ae_ids_file"] + or options["keyword"] + or options["organism"] + ): + agents.append(MicroArrayExpressAccessionAgent(options)) + if ( + options["gpl_id"] + or options["gpl_ids_file"] + or options["keyword"] + or options["organism"] + ): + agents.append(MicroArrayGEOAccessionAgent(options)) + + entries = set() + for agent in agents: + entries.update(agent.collect_data()) + + entries = sorted( # Sort the resulting list. + (entry for entry in entries if self.RE_ACCESSION.match(entry.code)), + key=lambda entry: ( + self.RE_ACCESSION.match(entry.code).group(1), + int(self.RE_ACCESSION.match(entry.code).group(2)), + ), + ) + # Limit the number of output entries. 
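+        # (the -c/--count cap is applied to the combined, sorted result from all agents).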
+ entries = entries[: options["count"]] if options["count"] else entries + + if options["dry_run"]: + if entries: + output = "\n".join((str(entry) for entry in entries)) + else: + output = "No accessions found." + print(output) + else: + AccessionBacklogEntry.objects.bulk_create(entries) + + +class AccessionAgentBase: + "Accession agent base class." + + previous_accessions = set() + retry_params = { + "retry_on_exception": lambda e: isinstance( + e, (ConnectionError, ConnectTimeout, ProtocolError, RemoteDisconnected) + ), + "stop_max_attempt_number": 5, + "wait_exponential_multiplier": 1000, # Seconds. + "wait_exponential_max": 16000, # Seconds. + } + + def __init__(self, options) -> None: + """Populates args and values for major variables.""" + self.options = options + self.count = options["count"] + self.keyword = options["keyword"] + self.organism = options["organism"] + self.since = options["since"] + self.until = options["until"] or datetime.now().strftime("%Y-%m-%d") + + self.populate_previous_accessions() + + def build_query(self): + """Returns query/query dict depending on the accession data source.""" + raise NotImplementedError + + def collect_data(self): + """Generates resulting entry collection.""" + raise NotImplementedError + + def fetch_data(self): + """Fetches data from an external or local data source.""" + raise NotImplementedError + + def get_ids(self): + """Gets IDs for query filtering depending on the accession technology.""" + raise NotImplementedError + + def populate_previous_accessions(self) -> None: + """Populates previous accession set from a provided excluded ids file.""" + if not self.options["exclude_previous"] or self.previous_accessions: + return + + # Gathered accessions. + self.previous_accessions.update( + (entry["code"] for entry in AccessionBacklogEntry.objects.values("code")) + ) + + # Surveyed accessions. + experiments = Experiment.objects.values("accession_code", "alternate_accession_code") + self.previous_accessions.update( + (experiment["accession_code"] for experiment in experiments) + ) + self.previous_accessions.update( + (experiment["alternate_accession_code"] for experiment in experiments) + ) + + +class MicroArrayExpressAccessionAgent(AccessionAgentBase): + """ + MicroArray ArrayExpress accession gathering agent. The data is fetched from + the BioStudies database. See https://www.ebi.ac.uk/biostudies/help and + https://www.ebi.ac.uk/biostudies/arrayexpress/help#programmatic for more + information about the API endpoints. + """ + + def __init__(self, options) -> None: + super().__init__(options) + + self.data_chunk_size = 100 + self.data_url = "https://www.ebi.ac.uk/biostudies/api/v1/search" + self.ids = self.get_ids() + + def build_query(self) -> dict: + """Returns a query dict for getting array/organism specific accessions.""" + query_dict = { + "directsub": "true", + "page": 1, + "pageSize": self.data_chunk_size, + "release_date": f"[{self.since} TO {self.until}]", + "type": "study", + } + + if self.ids: + # TODO(ark): figure out better way of array filtering. 
+ # Also make sure it's equivalent to the array filtering in this query + # https://github.com/AlexsLemonade/accession_retrieval/blob/master/experiment_accession_retrieval.R#L208 + query_dict.update({"content": ", ".join(self.ids)}) + elif self.keyword: + query_dict.update({"content": self.keyword}) + elif self.organism: + query_dict.update({"organism": f'"{self.organism}"'}) + + return query_dict + + def collect_data(self) -> Set[str]: + """Gets new accessions from EBI Biostudies API.""" + accessions = set() + + if self.ids: + message = ( + "Getting MicroArray ArrayExpress entries by " + f"ArrayExpress ID(s): {', '.join(self.ids)} for [{self.since} - {self.until}] " + "range." + ) + elif self.keyword: + message = ( + "Getting MicroArray ArrayExpress entries by " + f'"{self.keyword}" keyword for [{self.since} - {self.until}] range.' + ) + elif self.organism: + message = ( + "Getting MicroArray ArrayExpress entries by " + f'"{self.organism}" organism for [{self.since} - {self.until}] range.' + ) + else: + return accessions + + log.debug(message) + accessions.update(self.fetch_data()) + + return accessions + + def fetch_data(self) -> Set[str]: + """Retrieves accessions from API search endpoint.""" + + @retry(**self.retry_params) + def get_response(url, **kwargs): + """Gets response from an API endpoint.""" + return requests.get(url, **kwargs) + + accessions = set() + + is_done = False + params = self.build_query() + while not is_done: + range_start = (params["page"] - 1) * params["pageSize"] + 1 + range_end = (params["page"] - 1) * params["pageSize"] + self.data_chunk_size + log.debug(f"Processing entries {range_start} - {range_end}") + + response = get_response(self.data_url, params=params) + entries = response.json().get("hits") + if entries: + entries = ( + AccessionBacklogEntry.create_from_ma_ae_entry(entry) for entry in entries + ) + params["page"] += 1 + else: + is_done = True + + if self.previous_accessions: + entries = (entry for entry in entries if entry.code not in self.previous_accessions) + accessions.update(entries) + + # Quit after getting a sufficient amount of accessions. + if self.count and len(accessions) >= self.count: + is_done = True + + return accessions + + def get_ids(self) -> List[str]: + """Returns a combined list of passed ArrayExpress IDs.""" + ids = set() + + if self.options["ae_id"]: + ids.update(self.options["ae_id"]) + + if self.options["ae_ids_file"]: + with open(self.options["ae_ids_file"]) as ae_ids_file: + ids.update((ae_id.strip() for ae_id in ae_ids_file.readlines())) + + return sorted(ids) + + +class MicroArrayGEOAccessionAgent(AccessionAgentBase): + """ + MicroArray GEO accession gathering agent. The data is fetched from a local + SQLite GEO meta database. 
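+    The database file is expected at `data/microarray/GEOmetadb.sqlite`
+    (see `self.db_path` in `__init__`).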
+ """ + + def __init__(self, options) -> None: + super().__init__(options) + + self.db_path = "data/microarray/GEOmetadb.sqlite" + self.ids = self.get_ids() + + def build_query(self) -> str: + """Returns a query for getting GEO accessions from the local SQLite meta DB.""" + tables = [ + f"SELECT *", + "FROM gse_gpl", + "JOIN gpl ON gse_gpl.gpl=gpl.gpl", + "JOIN gse ON gse.gse=gse_gpl.gse", + "GROUP BY gse_gpl.gse", + ] + + conditions = [ + f"HAVING gse.submission_date >= '{self.since}'", + f"gse.submission_date <= '{self.until}'", + ] + + if self.ids: + gpl_ids = (f"'{gpl_id}'" for gpl_id in self.ids) + conditions.append(f"gse_gpl.gpl IN ({', '.join(gpl_ids)})") + elif self.organism: + conditions.append(f"lower(organism)='{self.organism.lower()}'") + + return f"{' '.join(tables)} {' AND '.join(conditions)}" + + def collect_data(self) -> Set[str]: + """Gets new accessions from GEO database.""" + accessions = set() + + if self.ids: + message = ( + "Getting MicroArray GEO entries by GEO platform ID(s): " + f"{', '.join(self.ids)} for [{self.since} - {self.until}] range." + ) + elif self.keyword: + message = ( + f'Getting MicroArray GEO entries by "{self.keyword}" keyword ' + f"for [{self.since} - {self.until}] range." + ) + elif self.organism: + message = ( + f'Getting MicroArray GEO entries by "{self.organism}" organism ' + f"for [{self.since} - {self.until}] range." + ) + else: + return accessions + + log.debug(message) + accessions.update(self.fetch_data()) + + return accessions + + def fetch_data(self) -> Set[str]: + """Retrieves accessions from the GEO meta DB.""" + + def match_keyword(row): + """ + Returns True if `row` matches `self.keyword` based regex. + Otherwise returns False. + """ + return re_keyword.match(" ".join((str(c) for c in row if c))) + + accessions = set() + + if not os.path.exists(self.db_path): + log.error("GEO meta database doesn't exist.") + return accessions + + connection = sqlite3.connect(self.db_path) + connection.row_factory = sqlite3.Row + connection.text_factory = lambda b: b.decode(errors="ignore") + entries = connection.execute(self.build_query()).fetchall() + connection.close() + + if self.keyword: + re_keyword = re.compile(f".*{self.keyword}.*", re.IGNORECASE) # Keyword regex. + entries = filter(match_keyword, entries) + + entries = ({key.lower(): entry[key] for key in entry.keys()} for entry in entries) + entries = set((AccessionBacklogEntry.create_from_ma_geo_entry(entry) for entry in entries)) + + if self.previous_accessions: + entries = (entry for entry in entries if entry.code not in self.previous_accessions) + accessions.update(entries) + + return accessions + + def get_ids(self) -> List[str]: + """Returns a combined list of passed GEO platform IDs.""" + ids = set() + + if self.options["gpl_id"]: + ids.update(self.options["gpl_id"]) + + if self.options["gpl_ids_file"]: + with open(self.options["gpl_ids_file"]) as gpl_ids_file: + ids.update((gpl_id.strip() for gpl_id in gpl_ids_file.readlines())) + + return sorted(ids) + + +class RNASeqAccessionAgent(AccessionAgentBase): + """ + RNA-Seq accession gathering agent. The data is fetched from + The European Nucleotide Archive (ENA) Portal. + See https://www.ebi.ac.uk/ena/portal/api/ for more information about the API + endpoints. 
+ """ + + def __init__(self, options) -> None: + super().__init__(options) + + self.data_chunk_size = 10000 + self.data_url = "https://www.ebi.ac.uk/ena/portal/api/search" + self.ids = self.get_ids() + + def build_query(self, taxon_id: str = None) -> str: + """ + Returns a query to use for getting specific taxon ID accessions. + Some special characters must remain unquoted. + """ + + AND = " AND " + OR = " OR " + instrument_models = ( + "HiSeq X Five", + "HiSeq X Ten", + "Illumina Genome Analyzer II", + "Illumina Genome Analyzer IIx", + "Illumina Genome Analyzer", + "Illumina HiScanSQ", + "Illumina HiSeq 1000", + "Illumina HiSeq 1500", + "Illumina HiSeq 2000", + "Illumina HiSeq 2500", + "Illumina HiSeq 3000", + "Illumina HiSeq 4000", + "Illumina MiSeq", + "Illumina NovaSeq 6000", + "Ion Torrent Proton", + "Ion Torrent S5 XL", + "Ion Torrent S5", + "NextSeq 500", + "NextSeq 550", + ) + + instrument_models = OR.join((f'instrument_model="{im}"' for im in instrument_models)) + conditions = [ + # Relevant date fields: collection_date, collection_date_submitted, + # first_public, last_updated. + f"first_public >= {self.since}", + f"first_public <= {self.until}", + f"({instrument_models})", + 'library_source="TRANSCRIPTOMIC"', + 'library_strategy="RNA-Seq"', + ] + + if taxon_id: + conditions.append(f"tax_eq({taxon_id})") + elif self.keyword: + search_fields = ( + "assembly_software", + "bio_material", + "center_name", + "collected_by", + "experiment_title", + "host_body_site", + "instrument_model", + "instrument_platform", + "library_name", + "project_name", + "sample_title", + "sequencing_method", + "study_title", + ) + search_fields = OR.join( + (f'{sf}="*{self.keyword}*"' for sf in search_fields) + ) # Keyword regex. + conditions.append(f"({search_fields})") + elif self.organism: + # `host`: Natural (as opposed to laboratory) host to the organism from which sample + # was obtained. + # `host_scientific_name`: Scientific name of the natural (as opposed to laboratory) + # host to the organism from which sample was obtained. + # `scientific_name` Scientific name of the organism from which the sample was derived. + # Neither `host_scientific_name` nor `scientific_name` available for search. + # https://www.ebi.ac.uk/ena/portal/api/searchFields?dataPortal=ena&format=json&result=read_study + conditions.append(f'host="{self.organism}"') + + return quote(AND.join(conditions), safe='*()-="<>/ ') # Must remain unquoted. + + def collect_data(self) -> Set[str]: + """Gets new accessions from EBI ENA API.""" + accessions = set() + + if self.ids: + log.debug( + f"Getting RNA-Seq entries by taxon ID(s): " + f"{', '.join((str(idx) for idx in self.ids))} for [{self.since} - {self.until}] range." + ) + total = len(self.ids) + for idx, taxon_id in enumerate(self.ids): + if self.count and len(accessions) >= self.count: + break + + if total > 1: + log.debug(f"Getting entries for taxon ID {taxon_id}, {idx + 1} of {total}.") + accessions.update(self.fetch_data(taxon_id=taxon_id)) + elif self.keyword: + log.debug( + f'Getting RNA-Seq entries by "{self.keyword}" keyword ' + f"for [{self.since} - {self.until}] range." + ) + accessions.update(self.fetch_data()) + elif self.organism: + log.debug( + f'Getting entries by "{self.organism}" organism ' + f"for [{self.since} - {self.until}] range." + ) + accessions.update(self.fetch_data()) + + return accessions + + def fetch_data(self, taxon_id=None) -> Set[str]: + """ + Retrieves accessions from API search endpoint. 
+ The API allows to set limit to 0 (get all in one request) but we do + it in a paginated fashion with `self.data_chunk_size` as a page size. + """ + + @retry(**self.retry_params) + def get_response(url, **kwargs): + """Gets response from an API endpoint.""" + return requests.post(url, **kwargs) + + accessions = set() + + fields = [ + "first_public", + "scientific_name", + "secondary_study_accession", + ] # For DRP/ERP/SRP-prefixed accessions. + data = { + "dataPortal": "ena", + # TODO(ark): add excludeAccessions/excludeAccessionType support. + "fields": ",".join(fields), # Use "all" to get all fields. + "format": "json", + "limit": self.data_chunk_size, + "offset": 0, + "query": self.build_query(taxon_id=taxon_id), + "result": "read_study", + "sortFields": fields, + } + + is_done = False + while not is_done: + log.debug( + f"Processing entries {data['offset'] + 1} - {data['offset'] + self.data_chunk_size}" + ) + entries = () + try: + response = get_response(self.data_url, data=data) + entries = response.json() + # TODO(ark): add `organism` when -o, --organism flag is used. + entries = ( + AccessionBacklogEntry.create_from_rnaseq_entry(entry) for entry in entries + ) + except JSONDecodeError: + is_done = True + except TypeError: + log.error(f"Couldn't get data from {self.data_url}. Response: {entries}") + data["offset"] += self.data_chunk_size + + if self.previous_accessions: + entries = (entry for entry in entries if entry.code not in self.previous_accessions) + accessions.update(entries) + + # Quit after getting a sufficient amount of accessions. + if self.count and len(accessions) >= self.count: + is_done = True + + return accessions + + def get_ids(self) -> List[str]: + """Returns a combined list of passed taxon IDs.""" + ids = set() + + if self.options["taxon_id"]: + ids.update(self.options["taxon_id"]) + + if self.options["taxon_ids_file"]: + with open(self.options["taxon_ids_file"]) as taxon_id_file: + ids.update((taxon_id.strip() for taxon_id in taxon_id_file.readlines())) + + return sorted(ids) diff --git a/foreman/data_refinery_foreman/settings.py b/foreman/data_refinery_foreman/settings.py index 7a489facc..5fea76d71 100644 --- a/foreman/data_refinery_foreman/settings.py +++ b/foreman/data_refinery_foreman/settings.py @@ -47,6 +47,7 @@ "data_refinery_common", "data_refinery_foreman.surveyor", "data_refinery_foreman.foreman", + "data_refinery_foreman.gatherer", "raven.contrib.django.raven_compat", "computedfields", ] @@ -108,10 +109,18 @@ # https://docs.djangoproject.com/en/1.10/ref/settings/#auth-password-validators AUTH_PASSWORD_VALIDATORS = [ - {"NAME": "django.contrib.auth.password_validation.UserAttributeSimilarityValidator",}, - {"NAME": "django.contrib.auth.password_validation.MinimumLengthValidator",}, - {"NAME": "django.contrib.auth.password_validation.CommonPasswordValidator",}, - {"NAME": "django.contrib.auth.password_validation.NumericPasswordValidator",}, + { + "NAME": "django.contrib.auth.password_validation.UserAttributeSimilarityValidator", + }, + { + "NAME": "django.contrib.auth.password_validation.MinimumLengthValidator", + }, + { + "NAME": "django.contrib.auth.password_validation.CommonPasswordValidator", + }, + { + "NAME": "django.contrib.auth.password_validation.NumericPasswordValidator", + }, ] diff --git a/foreman/dockerfiles/Dockerfile.foreman b/foreman/dockerfiles/Dockerfile.foreman index 8c09c6888..929ef2476 100644 --- a/foreman/dockerfiles/Dockerfile.foreman +++ b/foreman/dockerfiles/Dockerfile.foreman @@ -8,6 +8,8 @@ RUN apt-get -y install apt-fast RUN 
apt-fast update -qq && \ apt-fast install -y \ + gcc \ + libpq-dev \ python3 \ python3-pip From 9e520a0e0834368830fa73e409cd23c29de01587 Mon Sep 17 00:00:00 2001 From: Arkadii Yakovets Date: Tue, 13 Sep 2022 17:12:41 -0700 Subject: [PATCH 04/24] Address review comments. --- .../migrations/0071_auto_20220901_1653.py | 44 -- ...klogentry.py => 0071_gatheredaccession.py} | 6 +- .../data_refinery_common/models/__init__.py | 2 +- .../{accession.py => gathered_accession.py} | 20 +- .../gatherer/agents/__init__.py | 0 .../gatherer/agents/base.py | 79 +++ .../gatherer/agents/microarray_ae.py | 126 ++++ .../gatherer/agents/microarray_geo.py | 123 ++++ .../gatherer/agents/rna_seq.py | 204 ++++++ .../management/commands/gather_accessions.py | 643 +++--------------- 10 files changed, 626 insertions(+), 621 deletions(-) delete mode 100644 common/data_refinery_common/migrations/0071_auto_20220901_1653.py rename common/data_refinery_common/migrations/{0071_accessionbacklogentry.py => 0071_gatheredaccession.py} (88%) rename common/data_refinery_common/models/{accession.py => gathered_accession.py} (84%) create mode 100644 foreman/data_refinery_foreman/gatherer/agents/__init__.py create mode 100644 foreman/data_refinery_foreman/gatherer/agents/base.py create mode 100644 foreman/data_refinery_foreman/gatherer/agents/microarray_ae.py create mode 100644 foreman/data_refinery_foreman/gatherer/agents/microarray_geo.py create mode 100644 foreman/data_refinery_foreman/gatherer/agents/rna_seq.py diff --git a/common/data_refinery_common/migrations/0071_auto_20220901_1653.py b/common/data_refinery_common/migrations/0071_auto_20220901_1653.py deleted file mode 100644 index c7d3b0b63..000000000 --- a/common/data_refinery_common/migrations/0071_auto_20220901_1653.py +++ /dev/null @@ -1,44 +0,0 @@ -# Generated by Django 3.2.7 on 2022-09-01 16:53 - -from django.db import migrations, models - - -class Migration(migrations.Migration): - - dependencies = [ - ("data_refinery_common", "0070_auto_20211208_2118"), - ] - - operations = [ - migrations.CreateModel( - name="Accession", - fields=[ - ( - "id", - models.AutoField( - auto_created=True, - primary_key=True, - serialize=False, - verbose_name="ID", - ), - ), - ("code", models.TextField()), - ("created_at", models.DateTimeField(auto_now_add=True)), - ("last_modified_at", models.DateTimeField(auto_now=True)), - ("organism", models.TextField()), - ("published_date", models.DateTimeField()), - ("sample_count", models.PositiveIntegerField(default=0)), - ("source", models.TextField()), - ("technology", models.TextField()), - ], - options={ - "db_table": "accessions", - }, - ), - migrations.AddConstraint( - model_name="accession", - constraint=models.UniqueConstraint( - fields=("code", "source", "technology"), name="unique_accession" - ), - ), - ] diff --git a/common/data_refinery_common/migrations/0071_accessionbacklogentry.py b/common/data_refinery_common/migrations/0071_gatheredaccession.py similarity index 88% rename from common/data_refinery_common/migrations/0071_accessionbacklogentry.py rename to common/data_refinery_common/migrations/0071_gatheredaccession.py index 86c04daed..a1740d96e 100644 --- a/common/data_refinery_common/migrations/0071_accessionbacklogentry.py +++ b/common/data_refinery_common/migrations/0071_gatheredaccession.py @@ -1,4 +1,4 @@ -# Generated by Django 3.2.7 on 2022-09-07 19:31 +# Generated by Django 3.2.7 on 2022-09-13 18:14 from django.db import migrations, models @@ -11,7 +11,7 @@ class Migration(migrations.Migration): operations = [ 
migrations.CreateModel( - name="AccessionBacklogEntry", + name="GatheredAccession", fields=[ ( "id", @@ -32,7 +32,7 @@ class Migration(migrations.Migration): ("technology", models.TextField()), ], options={ - "db_table": "accession_backlog", + "db_table": "gathered_accessions", }, ), ] diff --git a/common/data_refinery_common/models/__init__.py b/common/data_refinery_common/models/__init__.py index 8e9564153..2b544765d 100644 --- a/common/data_refinery_common/models/__init__.py +++ b/common/data_refinery_common/models/__init__.py @@ -1,4 +1,3 @@ -from data_refinery_common.models.accession import Accession # noqa from data_refinery_common.models.api_token import APIToken # noqa from data_refinery_common.models.associations.compendium_result_organism_association import ( # noqa CompendiumResultOrganismAssociation, @@ -46,6 +45,7 @@ from data_refinery_common.models.dataset_annotation import DatasetAnnotation # noqa from data_refinery_common.models.experiment import Experiment # noqa from data_refinery_common.models.experiment_annotation import ExperimentAnnotation # noqa +from data_refinery_common.models.gathered_accession import GatheredAccession # noqa from data_refinery_common.models.jobs.downloader_job import DownloaderJob # noqa from data_refinery_common.models.jobs.processor_job import ProcessorJob # noqa from data_refinery_common.models.jobs.survey_job import SurveyJob # noqa diff --git a/common/data_refinery_common/models/accession.py b/common/data_refinery_common/models/gathered_accession.py similarity index 84% rename from common/data_refinery_common/models/accession.py rename to common/data_refinery_common/models/gathered_accession.py index 6ac62da9f..04b084533 100644 --- a/common/data_refinery_common/models/accession.py +++ b/common/data_refinery_common/models/gathered_accession.py @@ -4,11 +4,11 @@ from django.utils import timezone -class AccessionBacklogEntry(models.Model): - """Accession backlog entry model.""" +class GatheredAccession(models.Model): + """Gathered accession model.""" class Meta: - db_table = "accession_backlog" + db_table = "gathered_accessions" code = models.TextField(unique=True) created_at = models.DateTimeField(auto_now_add=True) @@ -21,7 +21,7 @@ class Meta: def __eq__(self, other: object) -> bool: """Returns True if two objects are equal. 
Otherwise returns False.""" - return isinstance(other, AccessionBacklogEntry) and self.code == other.code + return isinstance(other, GatheredAccession) and self.code == other.code def __hash__(self) -> int: """Returns accession object unique hash value.""" @@ -32,15 +32,15 @@ def __str__(self) -> str: return ", ".join((self.code, self.technology, self.source, str(self.published_date.date()))) @staticmethod - def create_from_ma_ae_entry(entry): + def create_from_ma_ae_entry(entry, organism=None): """Creates accession object from MicroArray ArrayExpress entry.""" - accession = AccessionBacklogEntry() + accession = GatheredAccession() accession.code = entry["accession"] accession.source = "ebi_biostudies" accession.technology = "microarray" - if "organism" in entry: - accession.organism = entry["organism"] + if organism: + accession.organism = organism if "release_date" in entry: accession.published_date = timezone.make_aware( datetime.strptime(entry["release_date"], "%Y-%m-%d") @@ -51,7 +51,7 @@ def create_from_ma_ae_entry(entry): @staticmethod def create_from_ma_geo_entry(entry): """Creates accession object from MicroArray GEO meta DB entry.""" - accession = AccessionBacklogEntry() + accession = GatheredAccession() accession.code = entry["gse"] accession.source = "geo_meta_db" accession.technology = "microarray" @@ -69,7 +69,7 @@ def create_from_ma_geo_entry(entry): @staticmethod def create_from_rnaseq_entry(entry): """Creates accession object from RNA-Seq entry.""" - accession = AccessionBacklogEntry() + accession = GatheredAccession() accession.code = entry["secondary_study_accession"] accession.source = "ebi_ena_portal" accession.technology = "rna-seq" diff --git a/foreman/data_refinery_foreman/gatherer/agents/__init__.py b/foreman/data_refinery_foreman/gatherer/agents/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/foreman/data_refinery_foreman/gatherer/agents/base.py b/foreman/data_refinery_foreman/gatherer/agents/base.py new file mode 100644 index 000000000..3754a4068 --- /dev/null +++ b/foreman/data_refinery_foreman/gatherer/agents/base.py @@ -0,0 +1,79 @@ +"""Abstract base class for accession gathering automation agents.""" + +from abc import ABC, abstractmethod +from datetime import datetime +from http.client import RemoteDisconnected + +from requests.exceptions import ConnectionError, ConnectTimeout +from urllib3.exceptions import ProtocolError + +from data_refinery_common.logging import get_and_configure_logger +from data_refinery_common.models.experiment import Experiment +from data_refinery_common.models.gathered_accession import GatheredAccession + +logger = get_and_configure_logger(__name__) + + +class AccessionAgentBase(ABC): + "Accession agent abstract base class." + + previous_accessions = set() + retry_params = { + "retry_on_exception": lambda e: isinstance( + e, (ConnectionError, ConnectTimeout, ProtocolError, RemoteDisconnected) + ), + "stop_max_attempt_number": 5, + "wait_exponential_multiplier": 1000, # Seconds. + "wait_exponential_max": 16000, # Seconds. 
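+        # NOTE: `retrying` interprets the two wait values above as milliseconds,
+        # i.e. the exponential backoff is capped at 16 seconds per retry.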
+ } + + def __init__(self, options) -> None: + """Populates args and values for major variables.""" + self.options = options + self.count = options["count"] + self.keyword = options["keyword"] + self.organism = options["organism"] + self.since = options["since"] + self.until = options["until"] or datetime.now().strftime("%Y-%m-%d") + + self.ids = self.get_ids() + self.populate_previous_accessions() + + @abstractmethod + def build_query(self): + """Returns query/query dict depending on the accession data source.""" + pass + + @abstractmethod + def collect_data(self): + """Generates resulting entry collection.""" + pass + + @abstractmethod + def fetch_data(self): + """Fetches data from an external or local data source.""" + pass + + @abstractmethod + def get_ids(self): + """Gets IDs for query filtering depending on the accession technology.""" + pass + + def populate_previous_accessions(self) -> None: + """Populates previous accession set from a provided excluded ids file.""" + if not self.options["exclude_previous"] or self.previous_accessions: + return + + # Gathered accessions. + self.previous_accessions.update( + (entry["code"] for entry in GatheredAccession.objects.values("code")) + ) + + # Surveyed accessions. + experiments = Experiment.objects.values("accession_code", "alternate_accession_code") + self.previous_accessions.update( + (experiment["accession_code"] for experiment in experiments) + ) + self.previous_accessions.update( + (experiment["alternate_accession_code"] for experiment in experiments) + ) diff --git a/foreman/data_refinery_foreman/gatherer/agents/microarray_ae.py b/foreman/data_refinery_foreman/gatherer/agents/microarray_ae.py new file mode 100644 index 000000000..b5314302b --- /dev/null +++ b/foreman/data_refinery_foreman/gatherer/agents/microarray_ae.py @@ -0,0 +1,126 @@ +"""MicroArray ArrayExpress accession gathering automation. +Data source: https://www.ebi.ac.uk/biostudies/help""" + +from typing import List, Set + +import requests +from retrying import retry + +from data_refinery_common.logging import get_and_configure_logger +from data_refinery_common.models.gathered_accession import GatheredAccession +from data_refinery_foreman.gatherer.agents.base import AccessionAgentBase + +logger = get_and_configure_logger(__name__) + + +class MicroArrayExpressAccessionAgent(AccessionAgentBase): + """ + MicroArray ArrayExpress accession gathering agent. The data is fetched from + the BioStudies database. See https://www.ebi.ac.uk/biostudies/help and + https://www.ebi.ac.uk/biostudies/arrayexpress/help#programmatic for more + information about the API endpoints. + """ + + DATA_CHUNK_SIZE = 100 + DATA_URL = "https://www.ebi.ac.uk/biostudies/api/v1/search" + + def build_query(self) -> dict: + """Returns a query dict for getting array/organism specific accessions.""" + query_dict = { + "directsub": "true", + "page": 1, + "pageSize": self.DATA_CHUNK_SIZE, + "release_date": f"[{self.since} TO {self.until}]", + "type": "study", + } + + if self.ids: + # TODO(ark): figure out better way of array filtering. 
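+            # (currently the IDs are passed as a single comma-separated "content" term).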
+ # Also make sure it's equivalent to the array filtering in this query + # https://github.com/AlexsLemonade/accession_retrieval/blob/master/experiment_accession_retrieval.R#L208 + query_dict.update({"content": ", ".join(self.ids)}) + elif self.keyword: + query_dict.update({"content": self.keyword}) + elif self.organism: + query_dict.update({"organism": f'"{self.organism}"'}) + + return query_dict + + def collect_data(self) -> Set[str]: + """Gets new accessions from EBI Biostudies API.""" + accessions = set() + + if self.ids: + message = ( + "Getting MicroArray ArrayExpress entries by " + f"ArrayExpress ID(s): {', '.join(self.ids)} for [{self.since} - {self.until}] " + "range." + ) + elif self.keyword: + message = ( + "Getting MicroArray ArrayExpress entries by " + f'"{self.keyword}" keyword for [{self.since} - {self.until}] range.' + ) + elif self.organism: + message = ( + "Getting MicroArray ArrayExpress entries by " + f'"{self.organism}" organism for [{self.since} - {self.until}] range.' + ) + else: + return accessions + + logger.debug(message) + accessions.update(self.fetch_data()) + + return accessions + + def fetch_data(self) -> Set[str]: + """Retrieves accessions from API search endpoint.""" + + @retry(**self.retry_params) + def get_response(url, **kwargs): + """Gets response from an API endpoint.""" + return requests.get(url, **kwargs) + + accessions = set() + + is_done = False + params = self.build_query() + while not is_done: + range_start = (params["page"] - 1) * params["pageSize"] + 1 + range_end = (params["page"] - 1) * params["pageSize"] + self.DATA_CHUNK_SIZE + logger.debug(f"Processing entries {range_start} - {range_end}") + + response = get_response(self.DATA_URL, params=params) + entries = response.json().get("hits") + if entries: + entries = ( + GatheredAccession.create_from_ma_ae_entry(entry, organism=self.organism) + for entry in entries + ) + params["page"] += 1 + else: + is_done = True + + if self.previous_accessions: + entries = (entry for entry in entries if entry.code not in self.previous_accessions) + accessions.update(entries) + + # Quit after getting a sufficient amount of accessions. + if self.count and len(accessions) >= self.count: + is_done = True + + return accessions + + def get_ids(self) -> List[str]: + """Returns a combined list of passed ArrayExpress IDs.""" + ids = set() + + if self.options["ae_id"]: + ids.update(self.options["ae_id"]) + + if self.options["ae_ids_file"]: + with open(self.options["ae_ids_file"]) as ae_ids_file: + ids.update((ae_id.strip() for ae_id in ae_ids_file.readlines())) + + return sorted(ids) diff --git a/foreman/data_refinery_foreman/gatherer/agents/microarray_geo.py b/foreman/data_refinery_foreman/gatherer/agents/microarray_geo.py new file mode 100644 index 000000000..975c715b3 --- /dev/null +++ b/foreman/data_refinery_foreman/gatherer/agents/microarray_geo.py @@ -0,0 +1,123 @@ +"""MicroArray GEO accession gathering automation. +Data source: local SQLite meta DB from https://www.bioconductor.org/packages/release/bioc/html/GEOmetadb.html""" + +import os +import re +import sqlite3 +from typing import List, Set + +from data_refinery_common.logging import get_and_configure_logger +from data_refinery_common.models.gathered_accession import GatheredAccession +from data_refinery_foreman.gatherer.agents.base import AccessionAgentBase + +logger = get_and_configure_logger(__name__) + + +class MicroArrayGEOAccessionAgent(AccessionAgentBase): + """ + MicroArray GEO accession gathering agent. 
The data is fetched from a local + SQLite GEO meta database. + """ + + # TODO(ark): move the DB file from Docker image to S3. + # Implement syncing procedure. + # Update URL once the original file is available again. + DB_PATH = "data/microarray/GEOmetadb.sqlite" + + def build_query(self) -> str: + """Returns a query for getting GEO accessions from the local SQLite meta DB.""" + tables = [ + "SELECT *", + "FROM gse_gpl", + "JOIN gpl ON gse_gpl.gpl=gpl.gpl", + "JOIN gse ON gse.gse=gse_gpl.gse", + "GROUP BY gse_gpl.gse", + ] + + conditions = [ + f"HAVING gse.submission_date >= '{self.since}'", + f"gse.submission_date <= '{self.until}'", + ] + + if self.ids: + gpl_ids = (f"'{gpl_id}'" for gpl_id in self.ids) + conditions.append(f"gse_gpl.gpl IN ({', '.join(gpl_ids)})") + elif self.organism: + conditions.append(f"lower(organism)='{self.organism.lower()}'") + + return f"{' '.join(tables)} {' AND '.join(conditions)}" + + def collect_data(self) -> Set[str]: + """Gets new accessions from GEO database.""" + accessions = set() + + if self.ids: + message = ( + "Getting MicroArray GEO entries by GEO platform ID(s): " + f"{', '.join(self.ids)} for [{self.since} - {self.until}] range." + ) + elif self.keyword: + message = ( + f'Getting MicroArray GEO entries by "{self.keyword}" keyword ' + f"for [{self.since} - {self.until}] range." + ) + elif self.organism: + message = ( + f'Getting MicroArray GEO entries by "{self.organism}" organism ' + f"for [{self.since} - {self.until}] range." + ) + else: + return accessions + + logger.debug(message) + accessions.update(self.fetch_data()) + + return accessions + + def fetch_data(self) -> Set[str]: + """Retrieves accessions from the GEO meta DB.""" + + def match_keyword(row): + """ + Returns True if `row` matches `self.keyword` based regex. + Otherwise returns False. + """ + return re_keyword.match(" ".join((str(c) for c in row if c))) + + accessions = set() + + if not os.path.exists(self.DB_PATH): + logger.error("GEO meta database doesn't exist.") + return accessions + + connection = sqlite3.connect(self.DB_PATH) + connection.row_factory = sqlite3.Row + connection.text_factory = lambda b: b.decode(errors="ignore") + entries = connection.execute(self.build_query()).fetchall() + connection.close() + + if self.keyword: + re_keyword = re.compile(f".*{self.keyword}.*", re.IGNORECASE) # Keyword regex. + entries = filter(match_keyword, entries) + + entries = ({key.lower(): entry[key] for key in entry.keys()} for entry in entries) + entries = set((GatheredAccession.create_from_ma_geo_entry(entry) for entry in entries)) + + if self.previous_accessions: + entries = (entry for entry in entries if entry.code not in self.previous_accessions) + accessions.update(entries) + + return accessions + + def get_ids(self) -> List[str]: + """Returns a combined list of passed GEO platform IDs.""" + ids = set() + + if self.options["gpl_id"]: + ids.update(self.options["gpl_id"]) + + if self.options["gpl_ids_file"]: + with open(self.options["gpl_ids_file"]) as gpl_ids_file: + ids.update((gpl_id.strip() for gpl_id in gpl_ids_file.readlines())) + + return sorted(ids) diff --git a/foreman/data_refinery_foreman/gatherer/agents/rna_seq.py b/foreman/data_refinery_foreman/gatherer/agents/rna_seq.py new file mode 100644 index 000000000..f9497f3ba --- /dev/null +++ b/foreman/data_refinery_foreman/gatherer/agents/rna_seq.py @@ -0,0 +1,204 @@ +"""RNA-Seq accession gathering automation. 
+Data source: https://www.ebi.ac.uk/ena/portal/api/""" + +from json.decoder import JSONDecodeError +from typing import List, Set +from urllib.parse import quote + +import requests +from retrying import retry + +from data_refinery_common.logging import get_and_configure_logger +from data_refinery_common.models.gathered_accession import GatheredAccession +from data_refinery_foreman.gatherer.agents.base import AccessionAgentBase + +logger = get_and_configure_logger(__name__) + + +class RNASeqAccessionAgent(AccessionAgentBase): + """ + RNA-Seq accession gathering agent. The data is fetched from + The European Nucleotide Archive (ENA) Portal. + See https://www.ebi.ac.uk/ena/portal/api/ for more information about the API + endpoints. + """ + + DATA_CHUNK_SIZE = 10000 + DATA_URL = "https://www.ebi.ac.uk/ena/portal/api/search" + + def build_query(self, taxon_id: str = None) -> str: + """ + Returns a query to use for getting specific taxon ID accessions. + Some special characters must remain unquoted. + """ + + AND = " AND " + OR = " OR " + instrument_models = ( + "HiSeq X Five", + "HiSeq X Ten", + "Illumina Genome Analyzer II", + "Illumina Genome Analyzer IIx", + "Illumina Genome Analyzer", + "Illumina HiScanSQ", + "Illumina HiSeq 1000", + "Illumina HiSeq 1500", + "Illumina HiSeq 2000", + "Illumina HiSeq 2500", + "Illumina HiSeq 3000", + "Illumina HiSeq 4000", + "Illumina MiSeq", + "Illumina NovaSeq 6000", + "Ion Torrent Proton", + "Ion Torrent S5 XL", + "Ion Torrent S5", + "NextSeq 500", + "NextSeq 550", + ) + + instrument_models = OR.join((f'instrument_model="{im}"' for im in instrument_models)) + conditions = [ + # Relevant date fields: collection_date, collection_date_submitted, + # first_public, last_updated. + f"first_public >= {self.since}", + f"first_public <= {self.until}", + f"({instrument_models})", + 'library_source="TRANSCRIPTOMIC"', + 'library_strategy="RNA-Seq"', + ] + + if taxon_id: + conditions.append(f"tax_eq({taxon_id})") + elif self.keyword: + search_fields = ( + "assembly_software", + "bio_material", + "center_name", + "collected_by", + "experiment_title", + "host_body_site", + "instrument_model", + "instrument_platform", + "library_name", + "project_name", + "sample_title", + "sequencing_method", + "study_title", + ) + search_fields = OR.join( + (f'{sf}="*{self.keyword}*"' for sf in search_fields) + ) # Keyword regex. + conditions.append(f"({search_fields})") + elif self.organism: + # `host`: Natural (as opposed to laboratory) host to the organism from which sample + # was obtained. + # `host_scientific_name`: Scientific name of the natural (as opposed to laboratory) + # host to the organism from which sample was obtained. + # `scientific_name` Scientific name of the organism from which the sample was derived. + # Neither `host_scientific_name` nor `scientific_name` available for search. + # https://www.ebi.ac.uk/ena/portal/api/searchFields?dataPortal=ena&format=json&result=read_study + conditions.append(f'host="{self.organism}"') + + return quote(AND.join(conditions), safe='*()-="<>/ ') # Must remain unquoted. + + def collect_data(self) -> Set[str]: + """Gets new accessions from EBI ENA API.""" + accessions = set() + + if self.ids: + logger.debug( + f"Getting RNA-Seq entries by taxon ID(s): " + f"{', '.join((str(i) for i in self.ids))} for [{self.since} - {self.until}] range." 
+ ) + total = len(self.ids) + for idx, taxon_id in enumerate(self.ids): + if self.count and len(accessions) >= self.count: + break + + if total > 1: + logger.debug(f"Getting entries for taxon ID {taxon_id}, {idx + 1} of {total}.") + accessions.update(self.fetch_data(taxon_id=taxon_id)) + elif self.keyword: + logger.debug( + f'Getting RNA-Seq entries by "{self.keyword}" keyword ' + f"for [{self.since} - {self.until}] range." + ) + accessions.update(self.fetch_data()) + elif self.organism: + logger.debug( + f'Getting entries by "{self.organism}" organism ' + f"for [{self.since} - {self.until}] range." + ) + accessions.update(self.fetch_data()) + + return accessions + + def fetch_data(self, taxon_id=None) -> Set[str]: + """ + Retrieves accessions from API search endpoint. + The API allows to set limit to 0 (get all in one request) but we do + it in a paginated fashion with `self.DATA_CHUNK_SIZE` as a page size. + """ + + @retry(**self.retry_params) + def get_response(url, **kwargs): + """Gets response from an API endpoint.""" + return requests.post(url, **kwargs) + + accessions = set() + + fields = [ + "first_public", + "scientific_name", + "secondary_study_accession", + ] # For DRP/ERP/SRP-prefixed accessions. + data = { + "dataPortal": "ena", + # TODO(ark): add excludeAccessions/excludeAccessionType support. + "fields": ",".join(fields), # Use "all" to get all fields. + "format": "json", + "limit": self.DATA_CHUNK_SIZE, + "offset": 0, + "query": self.build_query(taxon_id=taxon_id), + "result": "read_study", + "sortFields": fields, + } + + is_done = False + while not is_done: + logger.debug( + f"Processing entries {data['offset'] + 1} - {data['offset'] + self.DATA_CHUNK_SIZE}" + ) + entries = () + try: + response = get_response(self.DATA_URL, data=data) + entries = response.json() + entries = (GatheredAccession.create_from_rnaseq_entry(entry) for entry in entries) + except JSONDecodeError: + is_done = True + except TypeError: + logger.error(f"Couldn't get data from {self.data_url}. Response: {entries}") + data["offset"] += self.DATA_CHUNK_SIZE + + if self.previous_accessions: + entries = (entry for entry in entries if entry.code not in self.previous_accessions) + accessions.update(entries) + + # Quit after getting a sufficient amount of accessions. 
+ if self.count and len(accessions) >= self.count: + is_done = True + + return accessions + + def get_ids(self) -> List[str]: + """Returns a combined list of passed taxon IDs.""" + ids = set() + + if self.options["taxon_id"]: + ids.update(self.options["taxon_id"]) + + if self.options["taxon_ids_file"]: + with open(self.options["taxon_ids_file"]) as taxon_id_file: + ids.update((taxon_id.strip() for taxon_id in taxon_id_file.readlines())) + + return sorted(ids) diff --git a/foreman/data_refinery_foreman/gatherer/management/commands/gather_accessions.py b/foreman/data_refinery_foreman/gatherer/management/commands/gather_accessions.py index c4808a191..445245d3a 100644 --- a/foreman/data_refinery_foreman/gatherer/management/commands/gather_accessions.py +++ b/foreman/data_refinery_foreman/gatherer/management/commands/gather_accessions.py @@ -8,32 +8,27 @@ import argparse import logging -import os import re -import sqlite3 -from datetime import datetime -from http.client import RemoteDisconnected -from json.decoder import JSONDecodeError -from typing import List, Set -from urllib.parse import quote from django.core.management.base import BaseCommand -import requests -from requests.exceptions import ConnectionError, ConnectTimeout -from retrying import retry -from urllib3.exceptions import ProtocolError - from data_refinery_common.logging import get_and_configure_logger -from data_refinery_common.models.accession import AccessionBacklogEntry -from data_refinery_common.models.experiment import Experiment +from data_refinery_common.models.gathered_accession import GatheredAccession +from data_refinery_foreman.gatherer.agents.microarray_ae import MicroArrayExpressAccessionAgent +from data_refinery_foreman.gatherer.agents.microarray_geo import MicroArrayGEOAccessionAgent +from data_refinery_foreman.gatherer.agents.rna_seq import RNASeqAccessionAgent -log = get_and_configure_logger(__name__) +logger = get_and_configure_logger(__name__) class Command(BaseCommand): """Creates agents and runs actual accession gathering.""" + DATA_SOURCE_MA_AE = "microarray-ae" + DATA_SOURCE_MA_GEO = "microarray-geo" + DATA_SOURCE_RNA_SEQ = "rna-seq" + DATA_SOURCES = (DATA_SOURCE_MA_AE, DATA_SOURCE_MA_GEO, DATA_SOURCE_RNA_SEQ) + RE_ACCESSION = re.compile(r"(\D+)(\d+)") RE_DATE = re.compile(r"\d{4}-\d{2}-\d{2}") @@ -76,14 +71,6 @@ def add_arguments(self, parser) -> None: default=True, help="Exclude previously gathered or surveyed accessions.", ) - parser.add_argument( - "-ne", - "--no-exclude-previous", - action="store_false", - default=False, - dest="exclude_previous", - help="Do not exclude previously gathered or surveyed accessions.", - ) parser.add_argument( "--gpl-id", action="extend", @@ -103,21 +90,22 @@ def add_arguments(self, parser) -> None: help="Keyword to use for filtering.", ) parser.add_argument( - "-m", - "--microarray", + "-lv", + "--log-verbose", action="store_true", default=False, - help="Collect MicroArray accessions.", + help="Enable verbose log output.", ) parser.add_argument( - "-o", "--organism", type=str, help="Organism name to use for filtering." + "-ne", + "--no-exclude-previous", + action="store_false", + default=False, + dest="exclude_previous", + help="Do not exclude previously gathered or surveyed accessions.", ) parser.add_argument( - "-r", - "--rna-seq", - action="store_true", - default=False, - help="Collect RNA-Seq accessions.", + "-o", "--organism", type=str, help="Organism name to use for filtering." 
) parser.add_argument( "-s", @@ -126,6 +114,14 @@ def add_arguments(self, parser) -> None: required=True, help="Collect accessions made public on or after this date.", ) + parser.add_argument( + "-src", + "--source", + type=str, + action="extend", + nargs="+", + help="Gather accessions from selected sources.", + ) parser.add_argument( "--taxon-id", action="extend", @@ -144,28 +140,19 @@ def add_arguments(self, parser) -> None: type=str, help="Collect accessions made public before or on this date.", ) - parser.add_argument( - "-lv", - "--log-verbose", - action="store_true", - default=False, - help="Enable verbose log output.", - ) def set_verbosity_level(self, options) -> None: """Configures log verbosity level.""" if options["log_verbose"]: - log.addHandler(logging.StreamHandler()) - log.setLevel(logging.DEBUG) + logger.addHandler(logging.StreamHandler()) + logger.setLevel(logging.DEBUG) else: - log.setLevel(logging.ERROR) + logger.setLevel(logging.ERROR) def validate_args(self, options) -> None: """Validates arguments.""" - if not options["microarray"] and not options["rna_seq"]: - exit("Either --microarray or --rna-seq must be specified.") - errors = list() + since = options["since"] until = options["until"] if not self.RE_DATE.match(since): @@ -177,52 +164,65 @@ def validate_args(self, options) -> None: keyword = options["keyword"] organism = options["organism"] - if options["microarray"]: - ae_id = options["ae_id"] or options["ae_ids_file"] - gpl_id = options["gpl_id"] or options["gpl_ids_file"] - ids = ae_id or gpl_id - invalid_options_message = ( - "Exactly one of the keyword [-k, --keyword], organism [-o, --organism] or " - "ArrayExpress ID(s) [--ae-id, --ae-ids-file] / GEO platform ID(s) " - "[--gpl-id, --gpl-ids-file] must be specified." - ) - elif options["rna_seq"]: - taxon_id = options["taxon_id"] or options["taxon_ids_file"] - ids = taxon_id - invalid_options_message = ( - "Exactly one of the keyword [-k, --keyword], organism [-o, --organism] " - "or taxon ID(s) [--taxon-id, --taxon-ids-file] must be specified." + sources = options["source"] or self.DATA_SOURCES + + for source in sources: + if source in self.DATA_SOURCES: + continue + errors.append( + f"Unknown source: {source}. Supported sources: {', '.join(self.DATA_SOURCES)}" ) - if len([option for option in (ids, keyword, organism) if option]) != 1: - errors.append(invalid_options_message) + if self.DATA_SOURCE_MA_AE in sources: + ids = options["ae_id"] or options["ae_ids_file"] + if not (ids or keyword or organism): + errors.append( + ( + "Exactly one of the keyword [-k, --keyword], organism [-o, --organism] or " + "ArrayExpress ID(s) [--ae-id, --ae-ids-file] must be specified for " + f"'{self.DATA_SOURCE_MA_AE}' source." + ) + ) + if self.DATA_SOURCE_MA_GEO in sources: + ids = options["gpl_id"] or options["gpl_ids_file"] + if not (ids or keyword or organism): + errors.append( + ( + "Exactly one of the keyword [-k, --keyword], organism [-o, --organism] or " + "GEO platform ID(s) [--gpl-id, --gpl-ids-file] must be specified for " + f"'{self.DATA_SOURCE_MA_GEO}' source." + ) + ) + if self.DATA_SOURCE_RNA_SEQ in sources: + ids = options["taxon_id"] or options["taxon_ids_file"] + if not (ids or keyword or organism): + errors.append( + ( + "Exactly one of the keyword [-k, --keyword], organism [-o, --organism] " + "or taxon ID(s) [--taxon-id, --taxon-ids-file] must be specified for " + f"'{self.DATA_SOURCE_RNA_SEQ}' source." 
+ ) + ) if errors: exit("\n".join(errors)) def handle(self, *args, **options): - """Runs the accession gathering process.""" + """Creates agents and runs the accession gathering process.""" self.validate_args(options) self.set_verbosity_level(options) agents = list() - if options["rna_seq"]: + sources = options["source"] or self.DATA_SOURCES + + if self.DATA_SOURCE_RNA_SEQ in sources: agents.append(RNASeqAccessionAgent(options)) - elif options["microarray"]: - if ( - options["ae_id"] - or options["ae_ids_file"] - or options["keyword"] - or options["organism"] - ): - agents.append(MicroArrayExpressAccessionAgent(options)) - if ( - options["gpl_id"] - or options["gpl_ids_file"] - or options["keyword"] - or options["organism"] - ): - agents.append(MicroArrayGEOAccessionAgent(options)) + + if self.DATA_SOURCE_MA_AE in sources: + agents.append(MicroArrayExpressAccessionAgent(options)) + + if self.DATA_SOURCE_MA_GEO in sources: + agents.append(MicroArrayGEOAccessionAgent(options)) entries = set() for agent in agents: @@ -245,487 +245,4 @@ def handle(self, *args, **options): output = "No accessions found." print(output) else: - AccessionBacklogEntry.objects.bulk_create(entries) - - -class AccessionAgentBase: - "Accession agent base class." - - previous_accessions = set() - retry_params = { - "retry_on_exception": lambda e: isinstance( - e, (ConnectionError, ConnectTimeout, ProtocolError, RemoteDisconnected) - ), - "stop_max_attempt_number": 5, - "wait_exponential_multiplier": 1000, # Seconds. - "wait_exponential_max": 16000, # Seconds. - } - - def __init__(self, options) -> None: - """Populates args and values for major variables.""" - self.options = options - self.count = options["count"] - self.keyword = options["keyword"] - self.organism = options["organism"] - self.since = options["since"] - self.until = options["until"] or datetime.now().strftime("%Y-%m-%d") - - self.populate_previous_accessions() - - def build_query(self): - """Returns query/query dict depending on the accession data source.""" - raise NotImplementedError - - def collect_data(self): - """Generates resulting entry collection.""" - raise NotImplementedError - - def fetch_data(self): - """Fetches data from an external or local data source.""" - raise NotImplementedError - - def get_ids(self): - """Gets IDs for query filtering depending on the accession technology.""" - raise NotImplementedError - - def populate_previous_accessions(self) -> None: - """Populates previous accession set from a provided excluded ids file.""" - if not self.options["exclude_previous"] or self.previous_accessions: - return - - # Gathered accessions. - self.previous_accessions.update( - (entry["code"] for entry in AccessionBacklogEntry.objects.values("code")) - ) - - # Surveyed accessions. - experiments = Experiment.objects.values("accession_code", "alternate_accession_code") - self.previous_accessions.update( - (experiment["accession_code"] for experiment in experiments) - ) - self.previous_accessions.update( - (experiment["alternate_accession_code"] for experiment in experiments) - ) - - -class MicroArrayExpressAccessionAgent(AccessionAgentBase): - """ - MicroArray ArrayExpress accession gathering agent. The data is fetched from - the BioStudies database. See https://www.ebi.ac.uk/biostudies/help and - https://www.ebi.ac.uk/biostudies/arrayexpress/help#programmatic for more - information about the API endpoints. 
- """ - - def __init__(self, options) -> None: - super().__init__(options) - - self.data_chunk_size = 100 - self.data_url = "https://www.ebi.ac.uk/biostudies/api/v1/search" - self.ids = self.get_ids() - - def build_query(self) -> dict: - """Returns a query dict for getting array/organism specific accessions.""" - query_dict = { - "directsub": "true", - "page": 1, - "pageSize": self.data_chunk_size, - "release_date": f"[{self.since} TO {self.until}]", - "type": "study", - } - - if self.ids: - # TODO(ark): figure out better way of array filtering. - # Also make sure it's equivalent to the array filtering in this query - # https://github.com/AlexsLemonade/accession_retrieval/blob/master/experiment_accession_retrieval.R#L208 - query_dict.update({"content": ", ".join(self.ids)}) - elif self.keyword: - query_dict.update({"content": self.keyword}) - elif self.organism: - query_dict.update({"organism": f'"{self.organism}"'}) - - return query_dict - - def collect_data(self) -> Set[str]: - """Gets new accessions from EBI Biostudies API.""" - accessions = set() - - if self.ids: - message = ( - "Getting MicroArray ArrayExpress entries by " - f"ArrayExpress ID(s): {', '.join(self.ids)} for [{self.since} - {self.until}] " - "range." - ) - elif self.keyword: - message = ( - "Getting MicroArray ArrayExpress entries by " - f'"{self.keyword}" keyword for [{self.since} - {self.until}] range.' - ) - elif self.organism: - message = ( - "Getting MicroArray ArrayExpress entries by " - f'"{self.organism}" organism for [{self.since} - {self.until}] range.' - ) - else: - return accessions - - log.debug(message) - accessions.update(self.fetch_data()) - - return accessions - - def fetch_data(self) -> Set[str]: - """Retrieves accessions from API search endpoint.""" - - @retry(**self.retry_params) - def get_response(url, **kwargs): - """Gets response from an API endpoint.""" - return requests.get(url, **kwargs) - - accessions = set() - - is_done = False - params = self.build_query() - while not is_done: - range_start = (params["page"] - 1) * params["pageSize"] + 1 - range_end = (params["page"] - 1) * params["pageSize"] + self.data_chunk_size - log.debug(f"Processing entries {range_start} - {range_end}") - - response = get_response(self.data_url, params=params) - entries = response.json().get("hits") - if entries: - entries = ( - AccessionBacklogEntry.create_from_ma_ae_entry(entry) for entry in entries - ) - params["page"] += 1 - else: - is_done = True - - if self.previous_accessions: - entries = (entry for entry in entries if entry.code not in self.previous_accessions) - accessions.update(entries) - - # Quit after getting a sufficient amount of accessions. - if self.count and len(accessions) >= self.count: - is_done = True - - return accessions - - def get_ids(self) -> List[str]: - """Returns a combined list of passed ArrayExpress IDs.""" - ids = set() - - if self.options["ae_id"]: - ids.update(self.options["ae_id"]) - - if self.options["ae_ids_file"]: - with open(self.options["ae_ids_file"]) as ae_ids_file: - ids.update((ae_id.strip() for ae_id in ae_ids_file.readlines())) - - return sorted(ids) - - -class MicroArrayGEOAccessionAgent(AccessionAgentBase): - """ - MicroArray GEO accession gathering agent. The data is fetched from a local - SQLite GEO meta database. 
- """ - - def __init__(self, options) -> None: - super().__init__(options) - - self.db_path = "data/microarray/GEOmetadb.sqlite" - self.ids = self.get_ids() - - def build_query(self) -> str: - """Returns a query for getting GEO accessions from the local SQLite meta DB.""" - tables = [ - f"SELECT *", - "FROM gse_gpl", - "JOIN gpl ON gse_gpl.gpl=gpl.gpl", - "JOIN gse ON gse.gse=gse_gpl.gse", - "GROUP BY gse_gpl.gse", - ] - - conditions = [ - f"HAVING gse.submission_date >= '{self.since}'", - f"gse.submission_date <= '{self.until}'", - ] - - if self.ids: - gpl_ids = (f"'{gpl_id}'" for gpl_id in self.ids) - conditions.append(f"gse_gpl.gpl IN ({', '.join(gpl_ids)})") - elif self.organism: - conditions.append(f"lower(organism)='{self.organism.lower()}'") - - return f"{' '.join(tables)} {' AND '.join(conditions)}" - - def collect_data(self) -> Set[str]: - """Gets new accessions from GEO database.""" - accessions = set() - - if self.ids: - message = ( - "Getting MicroArray GEO entries by GEO platform ID(s): " - f"{', '.join(self.ids)} for [{self.since} - {self.until}] range." - ) - elif self.keyword: - message = ( - f'Getting MicroArray GEO entries by "{self.keyword}" keyword ' - f"for [{self.since} - {self.until}] range." - ) - elif self.organism: - message = ( - f'Getting MicroArray GEO entries by "{self.organism}" organism ' - f"for [{self.since} - {self.until}] range." - ) - else: - return accessions - - log.debug(message) - accessions.update(self.fetch_data()) - - return accessions - - def fetch_data(self) -> Set[str]: - """Retrieves accessions from the GEO meta DB.""" - - def match_keyword(row): - """ - Returns True if `row` matches `self.keyword` based regex. - Otherwise returns False. - """ - return re_keyword.match(" ".join((str(c) for c in row if c))) - - accessions = set() - - if not os.path.exists(self.db_path): - log.error("GEO meta database doesn't exist.") - return accessions - - connection = sqlite3.connect(self.db_path) - connection.row_factory = sqlite3.Row - connection.text_factory = lambda b: b.decode(errors="ignore") - entries = connection.execute(self.build_query()).fetchall() - connection.close() - - if self.keyword: - re_keyword = re.compile(f".*{self.keyword}.*", re.IGNORECASE) # Keyword regex. - entries = filter(match_keyword, entries) - - entries = ({key.lower(): entry[key] for key in entry.keys()} for entry in entries) - entries = set((AccessionBacklogEntry.create_from_ma_geo_entry(entry) for entry in entries)) - - if self.previous_accessions: - entries = (entry for entry in entries if entry.code not in self.previous_accessions) - accessions.update(entries) - - return accessions - - def get_ids(self) -> List[str]: - """Returns a combined list of passed GEO platform IDs.""" - ids = set() - - if self.options["gpl_id"]: - ids.update(self.options["gpl_id"]) - - if self.options["gpl_ids_file"]: - with open(self.options["gpl_ids_file"]) as gpl_ids_file: - ids.update((gpl_id.strip() for gpl_id in gpl_ids_file.readlines())) - - return sorted(ids) - - -class RNASeqAccessionAgent(AccessionAgentBase): - """ - RNA-Seq accession gathering agent. The data is fetched from - The European Nucleotide Archive (ENA) Portal. - See https://www.ebi.ac.uk/ena/portal/api/ for more information about the API - endpoints. 
- """ - - def __init__(self, options) -> None: - super().__init__(options) - - self.data_chunk_size = 10000 - self.data_url = "https://www.ebi.ac.uk/ena/portal/api/search" - self.ids = self.get_ids() - - def build_query(self, taxon_id: str = None) -> str: - """ - Returns a query to use for getting specific taxon ID accessions. - Some special characters must remain unquoted. - """ - - AND = " AND " - OR = " OR " - instrument_models = ( - "HiSeq X Five", - "HiSeq X Ten", - "Illumina Genome Analyzer II", - "Illumina Genome Analyzer IIx", - "Illumina Genome Analyzer", - "Illumina HiScanSQ", - "Illumina HiSeq 1000", - "Illumina HiSeq 1500", - "Illumina HiSeq 2000", - "Illumina HiSeq 2500", - "Illumina HiSeq 3000", - "Illumina HiSeq 4000", - "Illumina MiSeq", - "Illumina NovaSeq 6000", - "Ion Torrent Proton", - "Ion Torrent S5 XL", - "Ion Torrent S5", - "NextSeq 500", - "NextSeq 550", - ) - - instrument_models = OR.join((f'instrument_model="{im}"' for im in instrument_models)) - conditions = [ - # Relevant date fields: collection_date, collection_date_submitted, - # first_public, last_updated. - f"first_public >= {self.since}", - f"first_public <= {self.until}", - f"({instrument_models})", - 'library_source="TRANSCRIPTOMIC"', - 'library_strategy="RNA-Seq"', - ] - - if taxon_id: - conditions.append(f"tax_eq({taxon_id})") - elif self.keyword: - search_fields = ( - "assembly_software", - "bio_material", - "center_name", - "collected_by", - "experiment_title", - "host_body_site", - "instrument_model", - "instrument_platform", - "library_name", - "project_name", - "sample_title", - "sequencing_method", - "study_title", - ) - search_fields = OR.join( - (f'{sf}="*{self.keyword}*"' for sf in search_fields) - ) # Keyword regex. - conditions.append(f"({search_fields})") - elif self.organism: - # `host`: Natural (as opposed to laboratory) host to the organism from which sample - # was obtained. - # `host_scientific_name`: Scientific name of the natural (as opposed to laboratory) - # host to the organism from which sample was obtained. - # `scientific_name` Scientific name of the organism from which the sample was derived. - # Neither `host_scientific_name` nor `scientific_name` available for search. - # https://www.ebi.ac.uk/ena/portal/api/searchFields?dataPortal=ena&format=json&result=read_study - conditions.append(f'host="{self.organism}"') - - return quote(AND.join(conditions), safe='*()-="<>/ ') # Must remain unquoted. - - def collect_data(self) -> Set[str]: - """Gets new accessions from EBI ENA API.""" - accessions = set() - - if self.ids: - log.debug( - f"Getting RNA-Seq entries by taxon ID(s): " - f"{', '.join((str(idx) for idx in self.ids))} for [{self.since} - {self.until}] range." - ) - total = len(self.ids) - for idx, taxon_id in enumerate(self.ids): - if self.count and len(accessions) >= self.count: - break - - if total > 1: - log.debug(f"Getting entries for taxon ID {taxon_id}, {idx + 1} of {total}.") - accessions.update(self.fetch_data(taxon_id=taxon_id)) - elif self.keyword: - log.debug( - f'Getting RNA-Seq entries by "{self.keyword}" keyword ' - f"for [{self.since} - {self.until}] range." - ) - accessions.update(self.fetch_data()) - elif self.organism: - log.debug( - f'Getting entries by "{self.organism}" organism ' - f"for [{self.since} - {self.until}] range." - ) - accessions.update(self.fetch_data()) - - return accessions - - def fetch_data(self, taxon_id=None) -> Set[str]: - """ - Retrieves accessions from API search endpoint. 
- The API allows to set limit to 0 (get all in one request) but we do - it in a paginated fashion with `self.data_chunk_size` as a page size. - """ - - @retry(**self.retry_params) - def get_response(url, **kwargs): - """Gets response from an API endpoint.""" - return requests.post(url, **kwargs) - - accessions = set() - - fields = [ - "first_public", - "scientific_name", - "secondary_study_accession", - ] # For DRP/ERP/SRP-prefixed accessions. - data = { - "dataPortal": "ena", - # TODO(ark): add excludeAccessions/excludeAccessionType support. - "fields": ",".join(fields), # Use "all" to get all fields. - "format": "json", - "limit": self.data_chunk_size, - "offset": 0, - "query": self.build_query(taxon_id=taxon_id), - "result": "read_study", - "sortFields": fields, - } - - is_done = False - while not is_done: - log.debug( - f"Processing entries {data['offset'] + 1} - {data['offset'] + self.data_chunk_size}" - ) - entries = () - try: - response = get_response(self.data_url, data=data) - entries = response.json() - # TODO(ark): add `organism` when -o, --organism flag is used. - entries = ( - AccessionBacklogEntry.create_from_rnaseq_entry(entry) for entry in entries - ) - except JSONDecodeError: - is_done = True - except TypeError: - log.error(f"Couldn't get data from {self.data_url}. Response: {entries}") - data["offset"] += self.data_chunk_size - - if self.previous_accessions: - entries = (entry for entry in entries if entry.code not in self.previous_accessions) - accessions.update(entries) - - # Quit after getting a sufficient amount of accessions. - if self.count and len(accessions) >= self.count: - is_done = True - - return accessions - - def get_ids(self) -> List[str]: - """Returns a combined list of passed taxon IDs.""" - ids = set() - - if self.options["taxon_id"]: - ids.update(self.options["taxon_id"]) - - if self.options["taxon_ids_file"]: - with open(self.options["taxon_ids_file"]) as taxon_id_file: - ids.update((taxon_id.strip() for taxon_id in taxon_id_file.readlines())) - - return sorted(ids) + GatheredAccession.objects.bulk_create(entries) From 30434e042320e418774257cf494f49860be7f57e Mon Sep 17 00:00:00 2001 From: Arkadii Yakovets Date: Tue, 13 Sep 2022 17:17:19 -0700 Subject: [PATCH 05/24] Add a TODO. --- foreman/data_refinery_foreman/gatherer/agents/rna_seq.py | 1 + 1 file changed, 1 insertion(+) diff --git a/foreman/data_refinery_foreman/gatherer/agents/rna_seq.py b/foreman/data_refinery_foreman/gatherer/agents/rna_seq.py index f9497f3ba..f54ba570a 100644 --- a/foreman/data_refinery_foreman/gatherer/agents/rna_seq.py +++ b/foreman/data_refinery_foreman/gatherer/agents/rna_seq.py @@ -34,6 +34,7 @@ def build_query(self, taxon_id: str = None) -> str: AND = " AND " OR = " OR " + # TODO(ark): extract instrument models to a config file. instrument_models = ( "HiSeq X Five", "HiSeq X Ten", From 811b77ff78d303316d1c10beaff18584122b8e4b Mon Sep 17 00:00:00 2001 From: Arkadii Yakovets Date: Tue, 13 Sep 2022 18:37:09 -0700 Subject: [PATCH 06/24] Fix empty response issue. 
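
When the BioStudies search payload omits the "hits" key (e.g., for an empty
result), `response.json().get("hits")` returns None. The `if entries:` check
flips `is_done`, but `accessions.update(entries)` still iterates over
`entries` and raises `TypeError: 'NoneType' object is not iterable`. Passing
`()` as the default keeps that update a safe no-op. A minimal sketch of the
failure mode, with an assumed (not verified) empty-response payload shape:

    # Hypothetical payload for a search with no results; the real BioStudies
    # response may differ, the point is only the missing "hits" key.
    payload = {"totalHits": 0}

    accessions = set()
    entries = payload.get("hits")  # None when the key is absent.
    try:
        accessions.update(entries)  # TypeError: 'NoneType' object is not iterable
    except TypeError as error:
        print(error)

    entries = payload.get("hits", ())  # () with the new default.
    accessions.update(entries)  # Safe no-op.
    print(accessions)  # set()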
--- foreman/data_refinery_foreman/gatherer/agents/microarray_ae.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/foreman/data_refinery_foreman/gatherer/agents/microarray_ae.py b/foreman/data_refinery_foreman/gatherer/agents/microarray_ae.py index b5314302b..541bd86d2 100644 --- a/foreman/data_refinery_foreman/gatherer/agents/microarray_ae.py +++ b/foreman/data_refinery_foreman/gatherer/agents/microarray_ae.py @@ -92,7 +92,7 @@ def get_response(url, **kwargs): logger.debug(f"Processing entries {range_start} - {range_end}") response = get_response(self.DATA_URL, params=params) - entries = response.json().get("hits") + entries = response.json().get("hits", ()) if entries: entries = ( GatheredAccession.create_from_ma_ae_entry(entry, organism=self.organism) From df14fe3d1e63991f681371e3ecf325e7a0d16edc Mon Sep 17 00:00:00 2001 From: Arkadii Yakovets Date: Wed, 14 Sep 2022 18:52:57 -0700 Subject: [PATCH 07/24] Address review comments. --- .../migrations/0071_gatheredaccession.py | 2 +- .../models/gathered_accession.py | 72 +++++++------------ .../gatherer/agents/base.py | 5 +- .../gatherer/agents/microarray_ae.py | 21 ++++-- .../gatherer/agents/microarray_geo.py | 23 ++++-- .../gatherer/agents/rna_seq.py | 24 +++++-- .../management/commands/gather_accessions.py | 60 ++++++++-------- 7 files changed, 112 insertions(+), 95 deletions(-) diff --git a/common/data_refinery_common/migrations/0071_gatheredaccession.py b/common/data_refinery_common/migrations/0071_gatheredaccession.py index a1740d96e..65d192b59 100644 --- a/common/data_refinery_common/migrations/0071_gatheredaccession.py +++ b/common/data_refinery_common/migrations/0071_gatheredaccession.py @@ -22,7 +22,7 @@ class Migration(migrations.Migration): verbose_name="ID", ), ), - ("code", models.TextField(unique=True)), + ("accession_code", models.TextField(unique=True)), ("created_at", models.DateTimeField(auto_now_add=True)), ("last_modified_at", models.DateTimeField(auto_now=True)), ("organism", models.TextField()), diff --git a/common/data_refinery_common/models/gathered_accession.py b/common/data_refinery_common/models/gathered_accession.py index 04b084533..e56ed615c 100644 --- a/common/data_refinery_common/models/gathered_accession.py +++ b/common/data_refinery_common/models/gathered_accession.py @@ -10,7 +10,7 @@ class GatheredAccession(models.Model): class Meta: db_table = "gathered_accessions" - code = models.TextField(unique=True) + accession_code = models.TextField(unique=True) created_at = models.DateTimeField(auto_now_add=True) last_modified_at = models.DateTimeField(auto_now=True) organism = models.TextField() @@ -21,64 +21,44 @@ class Meta: def __eq__(self, other: object) -> bool: """Returns True if two objects are equal. 
Otherwise returns False.""" - return isinstance(other, GatheredAccession) and self.code == other.code + return isinstance(other, GatheredAccession) and self.accession_code == other.accession_code def __hash__(self) -> int: """Returns accession object unique hash value.""" - return hash(self.code) + return hash(self.accession_code) def __str__(self) -> str: """Returns accession default string representation.""" - return ", ".join((self.code, self.technology, self.source, str(self.published_date.date()))) - - @staticmethod - def create_from_ma_ae_entry(entry, organism=None): - """Creates accession object from MicroArray ArrayExpress entry.""" - accession = GatheredAccession() - accession.code = entry["accession"] - accession.source = "ebi_biostudies" - accession.technology = "microarray" - - if organism: - accession.organism = organism - if "release_date" in entry: - accession.published_date = timezone.make_aware( - datetime.strptime(entry["release_date"], "%Y-%m-%d") + return ", ".join( + ( + self.accession_code, + self.technology, + self.source, + str(self.published_date.date()), ) - - return accession + ) @staticmethod - def create_from_ma_geo_entry(entry): - """Creates accession object from MicroArray GEO meta DB entry.""" + def create_from_external_entry(data, source, technology, organism=None): + """Creates accession object from MicroArray ArrayExpress entry.""" accession = GatheredAccession() - accession.code = entry["gse"] - accession.source = "geo_meta_db" - accession.technology = "microarray" - - if "organism" in entry: - accession.organism = entry["organism"].lower() - if "submission_date" in entry: - accession.published_date = timezone.make_aware( - datetime.strptime(entry["submission_date"], "%Y-%m-%d") - ) + accession.accession_code = ( + data.get("accession") or data.get("gse") or data.get("secondary_study_accession") + ) - return accession + organism = data.get("organism") or data.get("scientific_name") or organism + if organism: + accession.organism = organism.lower() - @staticmethod - def create_from_rnaseq_entry(entry): - """Creates accession object from RNA-Seq entry.""" - accession = GatheredAccession() - accession.code = entry["secondary_study_accession"] - accession.source = "ebi_ena_portal" - accession.technology = "rna-seq" + published_date = ( + data.get("first_public") or data.get("release_date") or data.get("submission_date") + ) + accession.published_date = timezone.make_aware( + datetime.strptime(published_date, "%Y-%m-%d") + ) - if "scientific_name" in entry: - accession.organism = entry["scientific_name"].lower() - if "first_public" in entry: - accession.published_date = timezone.make_aware( - datetime.strptime(entry["first_public"], "%Y-%m-%d") - ) + accession.source = source + accession.technology = technology return accession diff --git a/foreman/data_refinery_foreman/gatherer/agents/base.py b/foreman/data_refinery_foreman/gatherer/agents/base.py index 3754a4068..818bbf72c 100644 --- a/foreman/data_refinery_foreman/gatherer/agents/base.py +++ b/foreman/data_refinery_foreman/gatherer/agents/base.py @@ -66,7 +66,10 @@ def populate_previous_accessions(self) -> None: # Gathered accessions. self.previous_accessions.update( - (entry["code"] for entry in GatheredAccession.objects.values("code")) + ( + entry["accession_code"] + for entry in GatheredAccession.objects.values("accession_code") + ) ) # Surveyed accessions. 
diff --git a/foreman/data_refinery_foreman/gatherer/agents/microarray_ae.py b/foreman/data_refinery_foreman/gatherer/agents/microarray_ae.py index 541bd86d2..3bfcf08fe 100644 --- a/foreman/data_refinery_foreman/gatherer/agents/microarray_ae.py +++ b/foreman/data_refinery_foreman/gatherer/agents/microarray_ae.py @@ -1,5 +1,7 @@ -"""MicroArray ArrayExpress accession gathering automation. -Data source: https://www.ebi.ac.uk/biostudies/help""" +""" +MicroArray ArrayExpress accession gathering automation. +Data source: https://www.ebi.ac.uk/biostudies/help +""" from typing import List, Set @@ -13,7 +15,7 @@ logger = get_and_configure_logger(__name__) -class MicroArrayExpressAccessionAgent(AccessionAgentBase): +class AEAgent(AccessionAgentBase): """ MicroArray ArrayExpress accession gathering agent. The data is fetched from the BioStudies database. See https://www.ebi.ac.uk/biostudies/help and @@ -23,6 +25,9 @@ class MicroArrayExpressAccessionAgent(AccessionAgentBase): DATA_CHUNK_SIZE = 100 DATA_URL = "https://www.ebi.ac.uk/biostudies/api/v1/search" + SOURCE = "ebi-biostudies" + SOURCE_NAME = "microarray-ae" + TECHNOLOGY = "microarray" def build_query(self) -> dict: """Returns a query dict for getting array/organism specific accessions.""" @@ -95,7 +100,9 @@ def get_response(url, **kwargs): entries = response.json().get("hits", ()) if entries: entries = ( - GatheredAccession.create_from_ma_ae_entry(entry, organism=self.organism) + GatheredAccession.create_from_external_entry( + entry, self.SOURCE, self.TECHNOLOGY, organism=self.organism + ) for entry in entries ) params["page"] += 1 @@ -103,7 +110,11 @@ def get_response(url, **kwargs): is_done = True if self.previous_accessions: - entries = (entry for entry in entries if entry.code not in self.previous_accessions) + entries = ( + entry + for entry in entries + if entry.accession_code not in self.previous_accessions + ) accessions.update(entries) # Quit after getting a sufficient amount of accessions. diff --git a/foreman/data_refinery_foreman/gatherer/agents/microarray_geo.py b/foreman/data_refinery_foreman/gatherer/agents/microarray_geo.py index 975c715b3..2500bcec5 100644 --- a/foreman/data_refinery_foreman/gatherer/agents/microarray_geo.py +++ b/foreman/data_refinery_foreman/gatherer/agents/microarray_geo.py @@ -1,5 +1,8 @@ -"""MicroArray GEO accession gathering automation. -Data source: local SQLite meta DB from https://www.bioconductor.org/packages/release/bioc/html/GEOmetadb.html""" +""" +MicroArray GEO accession gathering automation. +Data source: local SQLite meta DB from +https://www.bioconductor.org/packages/release/bioc/html/GEOmetadb.html +""" import os import re @@ -13,7 +16,7 @@ logger = get_and_configure_logger(__name__) -class MicroArrayGEOAccessionAgent(AccessionAgentBase): +class GEOAgent(AccessionAgentBase): """ MicroArray GEO accession gathering agent. The data is fetched from a local SQLite GEO meta database. @@ -23,6 +26,9 @@ class MicroArrayGEOAccessionAgent(AccessionAgentBase): # Implement syncing procedure. # Update URL once the original file is available again. 
DB_PATH = "data/microarray/GEOmetadb.sqlite" + SOURCE = "geo-meta-db" + SOURCE_NAME = "microarray-geo" + TECHNOLOGY = "microarray" def build_query(self) -> str: """Returns a query for getting GEO accessions from the local SQLite meta DB.""" @@ -101,10 +107,17 @@ def match_keyword(row): entries = filter(match_keyword, entries) entries = ({key.lower(): entry[key] for key in entry.keys()} for entry in entries) - entries = set((GatheredAccession.create_from_ma_geo_entry(entry) for entry in entries)) + entries = set( + ( + GatheredAccession.create_from_external_entry(entry, self.SOURCE, self.TECHNOLOGY) + for entry in entries + ) + ) if self.previous_accessions: - entries = (entry for entry in entries if entry.code not in self.previous_accessions) + entries = ( + entry for entry in entries if entry.accession_code not in self.previous_accessions + ) accessions.update(entries) return accessions diff --git a/foreman/data_refinery_foreman/gatherer/agents/rna_seq.py b/foreman/data_refinery_foreman/gatherer/agents/rna_seq.py index f54ba570a..577f815b8 100644 --- a/foreman/data_refinery_foreman/gatherer/agents/rna_seq.py +++ b/foreman/data_refinery_foreman/gatherer/agents/rna_seq.py @@ -1,5 +1,7 @@ -"""RNA-Seq accession gathering automation. -Data source: https://www.ebi.ac.uk/ena/portal/api/""" +""" +RNA-Seq accession gathering automation. +Data source: https://www.ebi.ac.uk/ena/portal/api/ +""" from json.decoder import JSONDecodeError from typing import List, Set @@ -15,7 +17,7 @@ logger = get_and_configure_logger(__name__) -class RNASeqAccessionAgent(AccessionAgentBase): +class RNASeqAgent(AccessionAgentBase): """ RNA-Seq accession gathering agent. The data is fetched from The European Nucleotide Archive (ENA) Portal. @@ -25,6 +27,9 @@ class RNASeqAccessionAgent(AccessionAgentBase): DATA_CHUNK_SIZE = 10000 DATA_URL = "https://www.ebi.ac.uk/ena/portal/api/search" + SOURCE = "ebi-ena-portal" + SOURCE_NAME = "rna-seq" + TECHNOLOGY = "rna-seq" def build_query(self, taxon_id: str = None) -> str: """ @@ -174,7 +179,12 @@ def get_response(url, **kwargs): try: response = get_response(self.DATA_URL, data=data) entries = response.json() - entries = (GatheredAccession.create_from_rnaseq_entry(entry) for entry in entries) + entries = ( + GatheredAccession.create_from_external_entry( + entry, self.SOURCE, self.TECHNOLOGY + ) + for entry in entries + ) except JSONDecodeError: is_done = True except TypeError: @@ -182,7 +192,11 @@ def get_response(url, **kwargs): data["offset"] += self.DATA_CHUNK_SIZE if self.previous_accessions: - entries = (entry for entry in entries if entry.code not in self.previous_accessions) + entries = ( + entry + for entry in entries + if entry.accession_code not in self.previous_accessions + ) accessions.update(entries) # Quit after getting a sufficient amount of accessions. 
diff --git a/foreman/data_refinery_foreman/gatherer/management/commands/gather_accessions.py b/foreman/data_refinery_foreman/gatherer/management/commands/gather_accessions.py index 445245d3a..2b073ef45 100644 --- a/foreman/data_refinery_foreman/gatherer/management/commands/gather_accessions.py +++ b/foreman/data_refinery_foreman/gatherer/management/commands/gather_accessions.py @@ -11,12 +11,13 @@ import re from django.core.management.base import BaseCommand +from django.db.utils import IntegrityError from data_refinery_common.logging import get_and_configure_logger from data_refinery_common.models.gathered_accession import GatheredAccession -from data_refinery_foreman.gatherer.agents.microarray_ae import MicroArrayExpressAccessionAgent -from data_refinery_foreman.gatherer.agents.microarray_geo import MicroArrayGEOAccessionAgent -from data_refinery_foreman.gatherer.agents.rna_seq import RNASeqAccessionAgent +from data_refinery_foreman.gatherer.agents.microarray_ae import AEAgent +from data_refinery_foreman.gatherer.agents.microarray_geo import GEOAgent +from data_refinery_foreman.gatherer.agents.rna_seq import RNASeqAgent logger = get_and_configure_logger(__name__) @@ -24,11 +25,8 @@ class Command(BaseCommand): """Creates agents and runs actual accession gathering.""" - DATA_SOURCE_MA_AE = "microarray-ae" - DATA_SOURCE_MA_GEO = "microarray-geo" - DATA_SOURCE_RNA_SEQ = "rna-seq" - DATA_SOURCES = (DATA_SOURCE_MA_AE, DATA_SOURCE_MA_GEO, DATA_SOURCE_RNA_SEQ) - + DATA_AGENTS = (AEAgent, GEOAgent, RNASeqAgent) + DATA_SOURCE_NAMES = [agent.SOURCE_NAME for agent in DATA_AGENTS] RE_ACCESSION = re.compile(r"(\D+)(\d+)") RE_DATE = re.compile(r"\d{4}-\d{2}-\d{2}") @@ -164,43 +162,43 @@ def validate_args(self, options) -> None: keyword = options["keyword"] organism = options["organism"] - sources = options["source"] or self.DATA_SOURCES + source_names = options["source"] or self.DATA_SOURCE_NAMES - for source in sources: - if source in self.DATA_SOURCES: + for source_name in source_names: + if source_name in self.DATA_SOURCE_NAMES: continue errors.append( - f"Unknown source: {source}. Supported sources: {', '.join(self.DATA_SOURCES)}" + f"Unknown source: {source_name}. Supported sources: {', '.join(self.DATA_SOURCE_NAMES)}" ) - if self.DATA_SOURCE_MA_AE in sources: + if AEAgent.SOURCE_NAME in source_names: ids = options["ae_id"] or options["ae_ids_file"] if not (ids or keyword or organism): errors.append( ( "Exactly one of the keyword [-k, --keyword], organism [-o, --organism] or " "ArrayExpress ID(s) [--ae-id, --ae-ids-file] must be specified for " - f"'{self.DATA_SOURCE_MA_AE}' source." + f"'{AEAgent.SOURCE_NAME}' source." ) ) - if self.DATA_SOURCE_MA_GEO in sources: + if GEOAgent.SOURCE_NAME in source_names: ids = options["gpl_id"] or options["gpl_ids_file"] if not (ids or keyword or organism): errors.append( ( "Exactly one of the keyword [-k, --keyword], organism [-o, --organism] or " "GEO platform ID(s) [--gpl-id, --gpl-ids-file] must be specified for " - f"'{self.DATA_SOURCE_MA_GEO}' source." + f"'{GEOAgent.SOURCE_NAME}' source." ) ) - if self.DATA_SOURCE_RNA_SEQ in sources: + if RNASeqAgent.SOURCE_NAME in source_names: ids = options["taxon_id"] or options["taxon_ids_file"] if not (ids or keyword or organism): errors.append( ( "Exactly one of the keyword [-k, --keyword], organism [-o, --organism] " "or taxon ID(s) [--taxon-id, --taxon-ids-file] must be specified for " - f"'{self.DATA_SOURCE_RNA_SEQ}' source." + f"'{RNASeqAgent.SOURCE_NAME}' source." 
) ) @@ -213,26 +211,21 @@ def handle(self, *args, **options): self.set_verbosity_level(options) agents = list() - sources = options["source"] or self.DATA_SOURCES - - if self.DATA_SOURCE_RNA_SEQ in sources: - agents.append(RNASeqAccessionAgent(options)) - - if self.DATA_SOURCE_MA_AE in sources: - agents.append(MicroArrayExpressAccessionAgent(options)) - - if self.DATA_SOURCE_MA_GEO in sources: - agents.append(MicroArrayGEOAccessionAgent(options)) + sources_names = options["source"] or self.DATA_SOURCE_NAMES + for cls in self.DATA_AGENTS: + if cls.SOURCE_NAME not in sources_names: + continue + agents.append(cls(options)) entries = set() for agent in agents: entries.update(agent.collect_data()) entries = sorted( # Sort the resulting list. - (entry for entry in entries if self.RE_ACCESSION.match(entry.code)), + (entry for entry in entries if self.RE_ACCESSION.match(entry.accession_code)), key=lambda entry: ( - self.RE_ACCESSION.match(entry.code).group(1), - int(self.RE_ACCESSION.match(entry.code).group(2)), + self.RE_ACCESSION.match(entry.accession_code).group(1), + int(self.RE_ACCESSION.match(entry.accession_code).group(2)), ), ) # Limit the number of output entries. @@ -245,4 +238,7 @@ def handle(self, *args, **options): output = "No accessions found." print(output) else: - GatheredAccession.objects.bulk_create(entries) + try: + GatheredAccession.objects.bulk_create(entries) + except IntegrityError as e: + logger.exception(f"Could not save new accessions to the database: {e}") From c4b43eecc479255a40902707bd026eeacff95788 Mon Sep 17 00:00:00 2001 From: Arkadii Yakovets Date: Wed, 21 Sep 2022 11:05:24 -0700 Subject: [PATCH 08/24] Rename agent files. --- .../gatherer/agents/{microarray_ae.py => ae_agent.py} | 0 .../gatherer/agents/{microarray_geo.py => geo_agent.py} | 0 .../gatherer/agents/{rna_seq.py => rnaseq_agent.py} | 0 .../gatherer/management/commands/gather_accessions.py | 6 +++--- 4 files changed, 3 insertions(+), 3 deletions(-) rename foreman/data_refinery_foreman/gatherer/agents/{microarray_ae.py => ae_agent.py} (100%) rename foreman/data_refinery_foreman/gatherer/agents/{microarray_geo.py => geo_agent.py} (100%) rename foreman/data_refinery_foreman/gatherer/agents/{rna_seq.py => rnaseq_agent.py} (100%) diff --git a/foreman/data_refinery_foreman/gatherer/agents/microarray_ae.py b/foreman/data_refinery_foreman/gatherer/agents/ae_agent.py similarity index 100% rename from foreman/data_refinery_foreman/gatherer/agents/microarray_ae.py rename to foreman/data_refinery_foreman/gatherer/agents/ae_agent.py diff --git a/foreman/data_refinery_foreman/gatherer/agents/microarray_geo.py b/foreman/data_refinery_foreman/gatherer/agents/geo_agent.py similarity index 100% rename from foreman/data_refinery_foreman/gatherer/agents/microarray_geo.py rename to foreman/data_refinery_foreman/gatherer/agents/geo_agent.py diff --git a/foreman/data_refinery_foreman/gatherer/agents/rna_seq.py b/foreman/data_refinery_foreman/gatherer/agents/rnaseq_agent.py similarity index 100% rename from foreman/data_refinery_foreman/gatherer/agents/rna_seq.py rename to foreman/data_refinery_foreman/gatherer/agents/rnaseq_agent.py diff --git a/foreman/data_refinery_foreman/gatherer/management/commands/gather_accessions.py b/foreman/data_refinery_foreman/gatherer/management/commands/gather_accessions.py index 2b073ef45..554b74350 100644 --- a/foreman/data_refinery_foreman/gatherer/management/commands/gather_accessions.py +++ b/foreman/data_refinery_foreman/gatherer/management/commands/gather_accessions.py @@ -15,9 +15,9 @@ 
from data_refinery_common.logging import get_and_configure_logger from data_refinery_common.models.gathered_accession import GatheredAccession -from data_refinery_foreman.gatherer.agents.microarray_ae import AEAgent -from data_refinery_foreman.gatherer.agents.microarray_geo import GEOAgent -from data_refinery_foreman.gatherer.agents.rna_seq import RNASeqAgent +from data_refinery_foreman.gatherer.agents.ae_agent import AEAgent +from data_refinery_foreman.gatherer.agents.geo_agent import GEOAgent +from data_refinery_foreman.gatherer.agents.rnaseq_agent import RNASeqAgent logger = get_and_configure_logger(__name__) From fef0e6fd4b07eccbdfc25a6d5860000b41203281 Mon Sep 17 00:00:00 2001 From: Arkadii Yakovets Date: Mon, 3 Oct 2022 16:15:57 -0700 Subject: [PATCH 09/24] Update terraform config in order to make it runnable on arm64. - Migrate `template_file` to `templatefile` - Inline user_data templates. - Fix s3 backend configuration. - Set postgres version to 11.16. - Change DB `name` to `db_name`. --- ami/instance.tf | 12 ++- ami/user-data-script.tf | 16 ---- infrastructure/backend.tf | 10 +-- infrastructure/batch.tf | 14 +++- infrastructure/database.tf | 34 ++++---- infrastructure/instances.tf | 162 ++++++++++++++---------------------- infrastructure/variables.tf | 2 +- 7 files changed, 103 insertions(+), 147 deletions(-) diff --git a/ami/instance.tf b/ami/instance.tf index 620b13803..80680f351 100644 --- a/ami/instance.tf +++ b/ami/instance.tf @@ -26,7 +26,11 @@ resource "aws_instance" "ubuntu-ami-template-instance" { # Our instance-user-data.sh script is built by Terraform at # apply-time so that it can put additional files onto the # instance. For more information see the definition of this resource. - user_data = data.template_file.ubuntu_instance_user_data.rendered + user_data = templatefile("ubuntu-instance-user-data.tpl.sh", + { + docker_apt_key = data.local_file.docker_apt_key.content + } + ) subnet_id = data.aws_subnet.ccdl_dev_subnet.id associate_public_ip_address = true @@ -46,7 +50,11 @@ resource "aws_instance" "ecs-ami-template-instance" { # Our instance-user-data.sh script is built by Terraform at # apply-time so that it can put additional files onto the # instance. For more information see the definition of this resource. - user_data = data.template_file.ecs_instance_user_data.rendered + user_data = templatefile("ecs-instance-user-data.tpl.sh", + { + docker_apt_key = data.local_file.docker_apt_key.content + } + ) subnet_id = data.aws_subnet.ccdl_dev_subnet.id associate_public_ip_address = true diff --git a/ami/user-data-script.tf b/ami/user-data-script.tf index d5d0718e3..585beacbe 100644 --- a/ami/user-data-script.tf +++ b/ami/user-data-script.tf @@ -2,19 +2,3 @@ data "local_file" "docker_apt_key" { filename = "docker-apt-key.gpg" } - -data "template_file" "ubuntu_instance_user_data" { - template = file("ubuntu-instance-user-data.tpl.sh") - - vars = { - docker_apt_key = data.local_file.docker_apt_key.content - } -} - -data "template_file" "ecs_instance_user_data" { - template = file("ecs-instance-user-data.tpl.sh") - - vars = { - docker_apt_key = data.local_file.docker_apt_key.content - } -} diff --git a/infrastructure/backend.tf b/infrastructure/backend.tf index 02e25fc39..294f50ef7 100644 --- a/infrastructure/backend.tf +++ b/infrastructure/backend.tf @@ -1,16 +1,8 @@ terraform { backend "s3" { - # Terraform will prompt the user for the other keys. 
- region = "us-east-1" - } -} - -data "terraform_remote_state" "network" { - backend = "s3" - config = { bucket = "refinebio-tfstate-deploy-${var.stage}" + encrypt = true key = "terraform-${var.user}.tfstate" region = "us-east-1" - encrypt = true } } diff --git a/infrastructure/batch.tf b/infrastructure/batch.tf index 8dbbebd94..ae1ab8216 100644 --- a/infrastructure/batch.tf +++ b/infrastructure/batch.tf @@ -16,7 +16,19 @@ module "batch" { data_refinery_keypair = aws_key_pair.data_refinery data_refinery_worker_security_group = aws_security_group.data_refinery_worker - data_refinery_worker_user_data = data.template_file.worker_script_smusher.rendered + data_refinery_worker_user_data = templatefile( + "workers-configuration/workers-instance-user-data.tpl.sh", + { + database_host = aws_instance.pg_bouncer.private_ip + database_name = aws_db_instance.postgres_db.name + database_password = var.database_password + database_port = var.database_port + database_user = var.database_user + region = var.region + stage = var.stage + user = var.user + } + ) data_refinery_worker_ami = var.worker_ami user = var.user diff --git a/infrastructure/database.tf b/infrastructure/database.tf index 8f4f8520e..503fcf75b 100644 --- a/infrastructure/database.tf +++ b/infrastructure/database.tf @@ -137,11 +137,11 @@ resource "aws_db_instance" "postgres_db" { allocated_storage = 100 storage_type = "gp2" engine = "postgres" - engine_version = "11.1" + engine_version = "11.16" allow_major_version_upgrade = true auto_minor_version_upgrade = false instance_class = "db.${var.database_instance_type}" - name = "data_refinery" + db_name = "data_refinery" port = var.database_hidden_port username = var.database_user password = var.database_password @@ -182,7 +182,19 @@ resource "aws_instance" "pg_bouncer" { # Our instance-user-data.sh script is built by Terraform at # apply-time so that it can put additional files onto the # instance. For more information see the definition of this resource. 
- user_data = data.template_file.pg_bouncer_script_smusher.rendered + user_data = templatefile("workers-configuration/pg-bouncer-instance-user-data.tpl.sh", + { + database_host = aws_db_instance.postgres_db.address + database_name = aws_db_instance.postgres_db.db_name + database_password = var.database_password + database_port = var.database_hidden_port + database_user = var.database_user + listen_port = var.database_port + region = var.region + stage = var.stage + user = var.user + } + ) tags = merge( var.default_tags, @@ -198,19 +210,3 @@ resource "aws_instance" "pg_bouncer" { tags = var.default_tags } } - -data "template_file" "pg_bouncer_script_smusher" { - template = file("workers-configuration/pg-bouncer-instance-user-data.tpl.sh") - - vars = { - database_host = aws_db_instance.postgres_db.address - database_user = var.database_user - database_port = var.database_hidden_port - database_password = var.database_password - database_name = aws_db_instance.postgres_db.name - listen_port = var.database_port - user = var.user - stage = var.stage - region = var.region - } -} diff --git a/infrastructure/instances.tf b/infrastructure/instances.tf index 379971e8d..bed2234dd 100644 --- a/infrastructure/instances.tf +++ b/infrastructure/instances.tf @@ -7,34 +7,11 @@ data "aws_ami" "ubuntu" { owners = ["589864003899"] filter { - name = "name" + name = "name" values = ["ccdl-ubuntu-18.04-*"] } } -# This script smusher exists in order to be able to circumvent a -# limitation of AWS which is that you get one script and one script -# only to set up the instance when it boots up. Because there is only -# one script you cannot place additional files your script may need -# onto the instance. Therefore this script smusher templates the files -# the instance-user-data.sh script needs into it, so that once it -# makes its way onto the instance it can spit them back out onto the -# disk. -data "template_file" "worker_script_smusher" { - template = file("workers-configuration/workers-instance-user-data.tpl.sh") - - vars = { - user = var.user - stage = var.stage - region = var.region - database_host = aws_instance.pg_bouncer.private_ip - database_port = var.database_port - database_user = var.database_user - database_password = var.database_password - database_name = aws_db_instance.postgres_db.name - } -} - ## # ElasticSearch ## @@ -53,7 +30,7 @@ data "aws_caller_identity" "current" { } resource "aws_elasticsearch_domain" "es" { - domain_name = "es-${var.user}-${var.stage}" + domain_name = "es-${var.user}-${var.stage}" elasticsearch_version = "6.3" advanced_options = { @@ -109,7 +86,7 @@ CONFIG var.default_tags, { Domain = "es-${var.user}-${var.stage}" - Name = "es-${var.user}-${var.stage}" + Name = "es-${var.user}-${var.stage}" } ) } @@ -130,56 +107,43 @@ data "local_file" "api_environment" { filename = "api-configuration/environment" } -# This script smusher serves a similar purpose to -# ${data.template_file.worker_script_smusher} but for the Nginx/API. 
-data "template_file" "api_server_script_smusher" { - template = file("api-configuration/api-server-instance-user-data.tpl.sh") - - vars = { - nginx_config = data.local_file.api_nginx_config.content - api_environment = data.local_file.api_environment.content - dockerhub_repo = var.dockerhub_repo - api_docker_image = var.api_docker_image - data_refinery_cert_bucket = aws_s3_bucket.data_refinery_cert_bucket.id - user = var.user - stage = var.stage - region = var.region - database_host = aws_instance.pg_bouncer.private_ip - database_user = var.database_user - database_password = var.database_password - database_name = aws_db_instance.postgres_db.name - elasticsearch_host = aws_elasticsearch_domain.es.endpoint - elasticsearch_port = "80" # AWS doesn't support the data transfer protocol on 9200 >:[ - log_group = aws_cloudwatch_log_group.data_refinery_log_group.name - log_stream = aws_cloudwatch_log_stream.log_stream_api.name - } - - depends_on = [ - aws_db_instance.postgres_db, - aws_elasticsearch_domain.es, - aws_instance.pg_bouncer, - aws_security_group_rule.data_refinery_api_http, - aws_security_group_rule.data_refinery_api_outbound, - aws_s3_bucket.data_refinery_cert_bucket, - ] -} - resource "aws_instance" "api_server_1" { - ami = data.aws_ami.ubuntu.id - instance_type = var.api_instance_type - availability_zone = "${var.region}a" + ami = data.aws_ami.ubuntu.id + instance_type = var.api_instance_type + availability_zone = "${var.region}a" vpc_security_group_ids = [aws_security_group.data_refinery_api.id] - iam_instance_profile = aws_iam_instance_profile.data_refinery_api.name - subnet_id = aws_subnet.data_refinery_1a.id + iam_instance_profile = aws_iam_instance_profile.data_refinery_api.name + subnet_id = aws_subnet.data_refinery_1a.id depends_on = [ aws_db_instance.postgres_db, aws_elasticsearch_domain.es, aws_instance.pg_bouncer, + aws_s3_bucket.data_refinery_cert_bucket, aws_security_group_rule.data_refinery_api_http, aws_security_group_rule.data_refinery_api_outbound, ] - user_data = data.template_file.api_server_script_smusher.rendered - key_name = aws_key_pair.data_refinery.key_name + + user_data = templatefile("api-configuration/api-server-instance-user-data.tpl.sh", + { + api_docker_image = var.api_docker_image + api_environment = data.local_file.api_environment.content + data_refinery_cert_bucket = aws_s3_bucket.data_refinery_cert_bucket.id + database_host = aws_instance.pg_bouncer.private_ip + database_name = aws_db_instance.postgres_db.db_name + database_password = var.database_password + database_user = var.database_user + dockerhub_repo = var.dockerhub_repo + elasticsearch_host = aws_elasticsearch_domain.es.endpoint + elasticsearch_port = "80" # AWS doesn't support the data transfer protocol on 9200 >:[ + log_group = aws_cloudwatch_log_group.data_refinery_log_group.name + log_stream = aws_cloudwatch_log_stream.log_stream_api.name + nginx_config = data.local_file.api_nginx_config.content + region = var.region + stage = var.stage + user = var.user + } + ) + key_name = aws_key_pair.data_refinery.key_name tags = merge( var.default_tags, @@ -210,44 +174,44 @@ data "local_file" "foreman_environment" { filename = "foreman-configuration/environment" } -# This script smusher serves a similar purpose to -# ${data.template_file.worker_script_smusher} but for the Foreman. 
-data "template_file" "foreman_server_script_smusher" { - template = file( - "foreman-configuration/foreman-server-instance-user-data.tpl.sh", - ) - - vars = { - foreman_environment = data.local_file.foreman_environment.content - dockerhub_repo = var.dockerhub_repo - foreman_docker_image = var.foreman_docker_image - user = var.user - stage = var.stage - region = var.region - database_host = aws_instance.pg_bouncer.private_ip - database_user = var.database_user - database_password = var.database_password - database_name = aws_db_instance.postgres_db.name - elasticsearch_host = aws_elasticsearch_domain.es.endpoint - elasticsearch_port = var.elasticsearch_port - log_group = aws_cloudwatch_log_group.data_refinery_log_group.name - } -} - resource "aws_instance" "foreman_server_1" { - ami = data.aws_ami.ubuntu.id - instance_type = var.foreman_instance_type - availability_zone = "${var.region}a" + ami = data.aws_ami.ubuntu.id + instance_type = var.foreman_instance_type + availability_zone = "${var.region}a" vpc_security_group_ids = [aws_security_group.data_refinery_foreman.id] - iam_instance_profile = aws_iam_instance_profile.data_refinery_foreman.name - subnet_id = aws_subnet.data_refinery_1a.id + iam_instance_profile = aws_iam_instance_profile.data_refinery_foreman.name + subnet_id = aws_subnet.data_refinery_1a.id + depends_on = [ aws_db_instance.postgres_db, - aws_instance.pg_bouncer, aws_elasticsearch_domain.es, + aws_instance.pg_bouncer, + aws_s3_bucket.data_refinery_cert_bucket, + aws_security_group_rule.data_refinery_api_http, + aws_security_group_rule.data_refinery_api_outbound, ] - user_data = data.template_file.foreman_server_script_smusher.rendered - key_name = aws_key_pair.data_refinery.key_name + + user_data = templatefile("api-configuration/api-server-instance-user-data.tpl.sh", + { + api_docker_image = var.api_docker_image + api_environment = data.local_file.api_environment.content + data_refinery_cert_bucket = aws_s3_bucket.data_refinery_cert_bucket.id + database_host = aws_instance.pg_bouncer.private_ip + database_name = aws_db_instance.postgres_db.db_name + database_password = var.database_password + database_user = var.database_user + dockerhub_repo = var.dockerhub_repo + elasticsearch_host = aws_elasticsearch_domain.es.endpoint + elasticsearch_port = "80" # AWS doesn't support the data transfer protocol on 9200 >:[ + log_group = aws_cloudwatch_log_group.data_refinery_log_group.name + log_stream = aws_cloudwatch_log_stream.log_stream_api.name + nginx_config = data.local_file.api_nginx_config.content + region = var.region + stage = var.stage + user = var.user + } + ) + key_name = aws_key_pair.data_refinery.key_name tags = merge( var.default_tags, diff --git a/infrastructure/variables.tf b/infrastructure/variables.tf index 3d56947a0..ab2589f14 100644 --- a/infrastructure/variables.tf +++ b/infrastructure/variables.tf @@ -256,7 +256,7 @@ output "environment_variables" { }, { name = "DATABASE_NAME" - value = aws_db_instance.postgres_db.name + value = aws_db_instance.postgres_db.db_name }, { name = "DATABASE_HOST" From 96867782b7ce3d38336eb3d78af18357d726741f Mon Sep 17 00:00:00 2001 From: Arkadii Yakovets Date: Thu, 6 Oct 2022 09:19:40 -0700 Subject: [PATCH 10/24] Fix foreman misconfiguration. 
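
Patch 09 left `aws_instance.foreman_server_1` rendering the API server's
user-data template. This change points it at
`foreman-configuration/foreman-server-instance-user-data.tpl.sh` with
foreman-specific variables, renames `run_cron_job_test.sh` to
`run_cron_job.sh` (it now runs the accession gathering job in addition to the
cron job tests), and schedules that job on the new, configurable
`accession_gathering_job_run_day` (default `SAT`). With the defaults, the new
crontab entry should render roughly as:

    0 12 * * SAT /bin/bash /home/ubuntu/run_cron_job.sh foreman gather_weekly_accessions >> /var/log/gather_weekly_accessions.log 2>&1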
--- .../foreman-server-instance-user-data.tpl.sh | 9 +++--- infrastructure/instances.tf | 32 +++++++++---------- infrastructure/variables.tf | 8 +++++ 3 files changed, 28 insertions(+), 21 deletions(-) diff --git a/infrastructure/foreman-configuration/foreman-server-instance-user-data.tpl.sh b/infrastructure/foreman-configuration/foreman-server-instance-user-data.tpl.sh index e77541c75..3598c2a45 100644 --- a/infrastructure/foreman-configuration/foreman-server-instance-user-data.tpl.sh +++ b/infrastructure/foreman-configuration/foreman-server-instance-user-data.tpl.sh @@ -77,8 +77,8 @@ docker run \\ -e DATABASE_PASSWORD=${database_password} \\ -v /tmp:/tmp \\ -it ${dockerhub_repo}/dr_\"\$1\" python3 manage.py \"\$2\" -" >> /home/ubuntu/run_cron_job_test.sh -chmod +x /home/ubuntu/run_cron_job_test.sh +" >> /home/ubuntu/run_cron_job.sh +chmod +x /home/ubuntu/run_cron_job.sh # Use Monit to ensure the Foreman is always running apt-get -y update @@ -112,8 +112,9 @@ service monit restart # Install the cron job tests crontab -l > tempcron cat <> tempcron -0 12 * * MON /bin/bash /home/ubuntu/run_cron_job_test.sh affymetrix check_brainarray_gene_agreement >> /var/log/cron_job_tests.log 2>&1 -0 12 * * MON /bin/bash /home/ubuntu/run_cron_job_test.sh affymetrix check_tx_index_transcript_agreement >> /var/log/cron_job_tests.log 2>&1 +0 12 * * MON /bin/bash /home/ubuntu/run_cron_job.sh affymetrix check_brainarray_gene_agreement >> /var/log/cron_job_tests.log 2>&1 +0 12 * * MON /bin/bash /home/ubuntu/run_cron_job.sh affymetrix check_tx_index_transcript_agreement >> /var/log/cron_job_tests.log 2>&1 +0 12 * * ${accession_gathering_job_run_day} /bin/bash /home/ubuntu/run_cron_job.sh foreman gather_weekly_accessions >> /var/log/gather_weekly_accessions.log 2>&1 EOF # install new cron file crontab tempcron diff --git a/infrastructure/instances.tf b/infrastructure/instances.tf index bed2234dd..0628c31d3 100644 --- a/infrastructure/instances.tf +++ b/infrastructure/instances.tf @@ -191,24 +191,22 @@ resource "aws_instance" "foreman_server_1" { aws_security_group_rule.data_refinery_api_outbound, ] - user_data = templatefile("api-configuration/api-server-instance-user-data.tpl.sh", + user_data = templatefile("foreman-configuration/foreman-server-instance-user-data.tpl.sh", { - api_docker_image = var.api_docker_image - api_environment = data.local_file.api_environment.content - data_refinery_cert_bucket = aws_s3_bucket.data_refinery_cert_bucket.id - database_host = aws_instance.pg_bouncer.private_ip - database_name = aws_db_instance.postgres_db.db_name - database_password = var.database_password - database_user = var.database_user - dockerhub_repo = var.dockerhub_repo - elasticsearch_host = aws_elasticsearch_domain.es.endpoint - elasticsearch_port = "80" # AWS doesn't support the data transfer protocol on 9200 >:[ - log_group = aws_cloudwatch_log_group.data_refinery_log_group.name - log_stream = aws_cloudwatch_log_stream.log_stream_api.name - nginx_config = data.local_file.api_nginx_config.content - region = var.region - stage = var.stage - user = var.user + accession_gathering_job_run_day = var.accession_gathering_job_run_day + database_host = aws_instance.pg_bouncer.private_ip + database_name = aws_db_instance.postgres_db.name + database_password = var.database_password + database_user = var.database_user + dockerhub_repo = var.dockerhub_repo + elasticsearch_host = aws_elasticsearch_domain.es.endpoint + elasticsearch_port = var.elasticsearch_port + foreman_docker_image = var.foreman_docker_image + 
foreman_environment = data.local_file.foreman_environment.content + log_group = aws_cloudwatch_log_group.data_refinery_log_group.name + region = var.region + stage = var.stage + user = var.user } ) key_name = aws_key_pair.data_refinery.key_name diff --git a/infrastructure/variables.tf b/infrastructure/variables.tf index ab2589f14..8bf167971 100644 --- a/infrastructure/variables.tf +++ b/infrastructure/variables.tf @@ -223,6 +223,14 @@ variable "processing_compendia" { default = true } +variable "accession_gathering_job_run_day" { + default = "SAT" +} + +variable "max_accessions_gathered_per_run" { + default = 0 +} + # Output our production environment variables. output "environment_variables" { value = [ From dccc246703eb8d25fbca521ebd625de9fd257298 Mon Sep 17 00:00:00 2001 From: Arkadii Yakovets Date: Wed, 2 Nov 2022 09:11:57 -0700 Subject: [PATCH 11/24] Update workers Docker images: - Optimize and reorder building instructions. - Update Python to v3.8. - Resolve OS level package dependency conflicts. - Add and set up missing OS packages. - Add apt cache clean up instructions. - Use `--ignore-installed` pip flag for better deps management. - --- workers/dockerfiles/Dockerfile.affymetrix | 87 +++++++------- .../dockerfiles/Dockerfile.affymetrix_local | 3 +- workers/dockerfiles/Dockerfile.compendia | 106 +++++++++--------- workers/dockerfiles/Dockerfile.downloaders | 88 ++++++++------- workers/dockerfiles/Dockerfile.illumina | 87 +++++++------- workers/dockerfiles/Dockerfile.no_op | 81 ++++++------- workers/dockerfiles/Dockerfile.salmon | 100 +++++++++-------- workers/dockerfiles/Dockerfile.smasher | 86 +++++++------- workers/dockerfiles/Dockerfile.transcriptome | 61 +++++----- 9 files changed, 362 insertions(+), 337 deletions(-) diff --git a/workers/dockerfiles/Dockerfile.affymetrix b/workers/dockerfiles/Dockerfile.affymetrix index 33c6e2518..151473ecf 100644 --- a/workers/dockerfiles/Dockerfile.affymetrix +++ b/workers/dockerfiles/Dockerfile.affymetrix @@ -1,45 +1,51 @@ FROM ubuntu:20.04 +# Source: https://github.com/thisbejim/Pyrebase/issues/87#issuecomment-354452082 +# For whatever reason this worked and 'en_US.UTF-8' did not. +ENV LANG C.UTF-8 + +# Prevent tzdata from prompting us for a timezone and hanging the build. +ENV DEBIAN_FRONTEND=noninteractive + RUN apt-get update -qq RUN apt-get install -y software-properties-common RUN add-apt-repository ppa:apt-fast/stable -RUN apt-get update -qq -RUN apt-get -y install apt-fast -# Prevent tzdata from prompting us for a timezone and hanging the build. -ENV DEBIAN_FRONTEND=noninteractive +RUN apt-get update -qq +RUN apt-get install -y apt-fast apt-transport-https # The packages related to R are somewhat weird, see the README for more details. - COPY workers/CRAN.gpg . 
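# A brief illustration of the --ignore-installed flag mentioned in the commit
# message above (the package name is hypothetical): if the base image ships an
# older distutils-installed copy of, say, PyYAML, a plain
# `pip3 install -r requirements.txt` can fail with "Cannot uninstall 'PyYAML'.
# It is a distutils installed project"; `pip3 install --ignore-installed -r
# requirements.txt` skips the uninstall step and installs the pinned version on top.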
-RUN \ - apt-fast update -qq && \ - apt-get install -y apt-transport-https && \ - apt-fast install -y lsb-release && \ - echo "deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/" \ - >> /etc/apt/sources.list.d/added_repos.list && \ - apt-key add CRAN.gpg && \ - apt-fast update -qq && \ - apt-fast install -y \ +RUN apt-key add CRAN.gpg +RUN echo "deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/" \ + >> /etc/apt/sources.list.d/added_repos.list + +RUN apt-fast update -qq && apt-fast install -y \ + build-essential \ + cmake \ + curl \ + cython3 \ ed \ git \ - mercurial \ libcairo-dev \ + libcurl4-openssl-dev \ libedit-dev \ + libpq-dev \ + libssl-dev \ + libxml2-dev \ + llvm-10-dev \ lsb-release \ - python3 \ + mercurial \ + pkg-config \ python3-pip \ + python3 \ python3-dev \ r-base-core \ - r-base-dev \ - libpq-dev \ - libxml2-dev \ - libssl-dev \ - libcurl4-openssl-dev \ - curl \ - wget && \ - rm -rf /var/lib/apt/lists/* + wget + RUN rm CRAN.gpg +RUN apt-get clean; rm -rf /var/lib/apt/lists/* +RUN ln -s /usr/bin/llvm-config-10 /usr/bin/llvm-config RUN groupadd user && useradd --create-home --home-dir /home/user -g user user WORKDIR /home/user @@ -47,41 +53,29 @@ WORKDIR /home/user ENV R_LIBS "/usr/local/lib/R/site-library" COPY common/install_devtools.R . - RUN Rscript install_devtools.R COPY workers/R_dependencies/affymetrix/dependencies.R . RUN Rscript dependencies.R COPY workers/affymetrix_dependencies.R . -COPY workers/install_ensg_pkgs.R . - RUN Rscript affymetrix_dependencies.R -# Source: https://github.com/thisbejim/Pyrebase/issues/87#issuecomment-354452082 -# For whatever reason this worked and 'en_US.UTF-8' did not. -ENV LANG C.UTF-8 - RUN pip3 install pip --upgrade +RUN pip3 install setuptools --upgrade -RUN pip3 install setuptools --upgrade && \ - rm -rf /root/.cache - -COPY config/ config/ -COPY .boto .boto - -COPY common/dist/data-refinery-common-* common/ +# Install this one here instead of via requirements.txt because not +# all processors need it. +RUN pip3 install rpy2==3.4.5 +COPY workers/data_refinery_workers/processors/requirements.txt . +RUN pip3 install -r requirements.txt # Get the latest version from the dist directory. +COPY common/dist/data-refinery-common-* common/ RUN pip3 install common/$(ls common -1 | sort --version-sort | tail -1) -COPY workers/data_refinery_workers/processors/requirements.txt . - -RUN pip3 install -r requirements.txt - -# Install this one here instead of via requirements.txt because not -# all processors need it. -RUN pip3 install rpy2==3.4.5 +# Clear out the pip3 cache. +RUN rm -rf /root/.cache ARG SYSTEM_VERSION @@ -89,6 +83,9 @@ ENV SYSTEM_VERSION $SYSTEM_VERSION USER user +COPY .boto .boto +COPY config/ config/ COPY workers/ . +COPY workers/install_ensg_pkgs.R . ENTRYPOINT [] diff --git a/workers/dockerfiles/Dockerfile.affymetrix_local b/workers/dockerfiles/Dockerfile.affymetrix_local index 84309b4e9..9a37692e6 100644 --- a/workers/dockerfiles/Dockerfile.affymetrix_local +++ b/workers/dockerfiles/Dockerfile.affymetrix_local @@ -6,9 +6,8 @@ USER root RUN rm -r common/ RUN pip3 uninstall -y data_refinery_common -# Reinstall common. -COPY common/dist/data-refinery-common-* common/ # Get the latest version from the dist directory. 
+COPY common/dist/data-refinery-common-* common/ RUN pip3 install common/$(ls common -1 | sort --version-sort | tail -1) ARG SYSTEM_VERSION diff --git a/workers/dockerfiles/Dockerfile.compendia b/workers/dockerfiles/Dockerfile.compendia index 3a6df3f5f..2c6a38784 100644 --- a/workers/dockerfiles/Dockerfile.compendia +++ b/workers/dockerfiles/Dockerfile.compendia @@ -1,55 +1,67 @@ -FROM nvidia/cuda:11.1-runtime-ubuntu18.04 +FROM nvidia/cuda:11.8.0-runtime-ubuntu18.04 # This is very similar to the `smasher` image, but comes with OpenBLAS and some # of the other libraries required for fancyimpute. -RUN apt-get update -qq -RUN apt-get install -y software-properties-common -RUN add-apt-repository ppa:apt-fast/stable -RUN apt-get update -qq +# Prevent tzdata from prompting us for a timezone and hanging the build. +ENV DEBIAN_FRONTEND=noninteractive + +# Source: https://github.com/thisbejim/Pyrebase/issues/87#issuecomment-354452082 +# For whatever reason this worked and 'en_US.UTF-8' did not. +ENV LANG C.UTF-8 + +# RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub # via https://github.com/ilikenwf/apt-fast/issues/85#issuecomment-261640099 RUN echo debconf apt-fast/maxdownloads string 16 | debconf-set-selections RUN echo debconf apt-fast/dlflag boolean true | debconf-set-selections RUN echo debconf apt-fast/aptmanager string apt-get | debconf-set-selections -RUN _APTMGR=apt-get apt-get install -y apt-fast - -RUN export DEBIAN_FRONTEND=noninteractive; \ - export DEBCONF_NONINTERACTIVE_SEEN=true; \ - echo 'tzdata tzdata/Areas select Etc' | debconf-set-selections; \ - echo 'tzdata tzdata/Zones/Etc select UTC' | debconf-set-selections; \ - apt-get update -qqy \ - && apt-get install -qqy --no-install-recommends \ - tzdata \ - && apt-get clean \ - && rm -rf /var/lib/apt/lists/* +RUN echo 'tzdata tzdata/Areas select Etc' | debconf-set-selections +RUN echo 'tzdata tzdata/Zones/Etc select UTC' | debconf-set-selections + +RUN apt-get update -qq +RUN apt-get install -y software-properties-common +RUN add-apt-repository ppa:apt-fast/stable +RUN add-apt-repository ppa:deadsnakes/ppa +RUN add-apt-repository ppa:savoury1/llvm-defaults-10 + +RUN apt-get update -qq +RUN apt-get install -y apt-fast apt-transport-https tzdata COPY workers/CRAN.gpg . 
-RUN \ - apt-fast update -qq && \ - apt-fast install -y lsb-release && \ - echo "deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/" \ - >> /etc/apt/sources.list.d/added_repos.list && \ - apt-key add CRAN.gpg && \ - apt-fast update -qq && \ - apt-fast install -y \ +RUN apt-key add CRAN.gpg +RUN echo "deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/" \ + >> /etc/apt/sources.list.d/added_repos.list + +RUN apt-fast update -qq && apt-fast install -y \ + build-essential \ + cmake \ + curl \ + cython3 \ ed \ + gfortran \ git \ - liblapack-dev \ - libopenblas-dev \ - python3 \ - python3-pip \ + libcairo-dev \ libcurl4-openssl-dev \ + libedit-dev \ + libblas-dev \ + liblapack-dev \ libpq-dev \ - r-base-core \ - r-base-dev \ - libpq-dev \ - libxml2-dev \ libssl-dev \ - libcurl4-openssl-dev \ - curl \ - wget && \ - rm -rf /var/lib/apt/lists/* + libxml2-dev \ + llvm-10-dev \ + lsb-release \ + mercurial \ + pkg-config \ + python3-pip \ + python3.8 \ + python3.8-dev \ + r-base-core \ + wget + RUN rm CRAN.gpg +RUN apt-get clean; rm -rf /var/lib/apt/lists/* +RUN ln -s /usr/bin/llvm-config-10 /usr/bin/llvm-config +RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1 RUN groupadd user && useradd --create-home --home-dir /home/user -g user user WORKDIR /home/user @@ -62,7 +74,6 @@ RUN ln -s /usr/local/share/phantomjs-2.1.1-linux-x86_64/bin/phantomjs /usr/local ENV R_LIBS "/usr/local/lib/R/site-library" COPY common/install_devtools.R . - RUN Rscript install_devtools.R COPY workers/R_dependencies/qn/dependencies.R . @@ -72,30 +83,21 @@ COPY workers/qn_dependencies.R . RUN Rscript qn_dependencies.R # End QN-specific -# Source: https://github.com/thisbejim/Pyrebase/issues/87#issuecomment-354452082 -# For whatever reason this worked and 'en_US.UTF-8' did not. -ENV LANG C.UTF-8 - RUN pip3 install --upgrade pip - # Smasher-specific requirements -RUN pip3 install numpy scipy matplotlib pandas==0.25.3 scikit-learn sympy nose rpy2===3.4.5 tzlocal fancySVD +RUN pip3 install --ignore-installed numpy scipy matplotlib pandas==0.25.3 scikit-learn sympy nose rpy2===3.4.5 tzlocal fancySVD # End smasher-specific -COPY config/ config/ -COPY .boto .boto - COPY workers/data_refinery_workers/processors/requirements.txt . +RUN pip3 install --ignore-installed -r requirements.txt -RUN pip3 --no-cache-dir install -r requirements.txt -RUN pip3 install numpy==1.16.0 # Fix a downgrade - -COPY common/dist/data-refinery-common-* common/ +RUN pip3 install --ignore-installed numpy==1.16.0 # Fix a downgrade # Get the latest version from the dist directory. +COPY common/dist/data-refinery-common-* common/ RUN pip3 install common/$(ls common -1 | sort --version-sort | tail -1) -# Clear our the pip3 cache +# Clear out the pip3 cache. RUN rm -rf /root/.cache ARG SYSTEM_VERSION @@ -104,6 +106,8 @@ ENV SYSTEM_VERSION $SYSTEM_VERSION USER user +COPY .boto .boto +COPY config/ config/ COPY workers/ . ENTRYPOINT [] diff --git a/workers/dockerfiles/Dockerfile.downloaders b/workers/dockerfiles/Dockerfile.downloaders index f2fc9e78a..1b3337325 100644 --- a/workers/dockerfiles/Dockerfile.downloaders +++ b/workers/dockerfiles/Dockerfile.downloaders @@ -1,59 +1,64 @@ FROM ubuntu:18.04 -RUN apt-get update -qq +# Prevent tzdata from prompting us for a timezone and hanging the build. +ENV DEBIAN_FRONTEND=noninteractive + +# Source: https://github.com/thisbejim/Pyrebase/issues/87#issuecomment-354452082 +# For whatever reason this worked and 'en_US.UTF-8' did not. 
+ENV LANG C.UTF-8 + +RUN apt-get update RUN apt-get install -y software-properties-common RUN add-apt-repository ppa:apt-fast/stable -RUN apt-get update -qq -RUN apt-get -y install apt-fast +RUN add-apt-repository ppa:deadsnakes/ppa +RUN add-apt-repository ppa:savoury1/llvm-defaults-10 -# Prevent tzdata from prompting us for a timezone and hanging the build. -ENV DEBIAN_FRONTEND=noninteractive +RUN apt-get update -qq +RUN apt-get install -y apt-fast apt-transport-https # The packages related to R are somewhat weird, see the README for more details. - COPY workers/CRAN.gpg . -RUN \ - apt-fast update -qq && \ - apt-get install -y apt-transport-https && \ - apt-fast install -y lsb-release && \ - echo "deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/" \ - >> /etc/apt/sources.list.d/added_repos.list && \ - apt-key add CRAN.gpg && \ - apt-fast update -qq && \ - apt-fast install -y \ +RUN apt-key add CRAN.gpg +RUN echo "deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/" \ + >> /etc/apt/sources.list.d/added_repos.list + +RUN apt-fast update -qq && apt-fast install -y \ + build-essential \ + cmake \ + curl \ + cython3 \ ed \ git \ - python3 \ - python3-pip \ - r-base-core \ - r-base-dev \ - libpq-dev \ - libxml2-dev \ - libssl-dev \ + libcairo-dev \ libcurl4-openssl-dev \ + libedit-dev \ libpq-dev \ - curl \ - wget && \ - rm -rf /var/lib/apt/lists/* + libssl-dev \ + libxml2-dev \ + llvm-10-dev \ + lsb-release \ + mercurial \ + pkg-config \ + python3-pip \ + python3.8 \ + python3.8-dev \ + r-base-core \ + wget + RUN rm CRAN.gpg +RUN apt-get clean; rm -rf /var/lib/apt/lists/* +RUN ln -s /usr/bin/llvm-config-10 /usr/bin/llvm-config +RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1 RUN groupadd user && useradd --create-home --home-dir /home/user -g user user WORKDIR /home/user -# Source: https://github.com/thisbejim/Pyrebase/issues/87#issuecomment-354452082 -# For whatever reason this worked and 'en_US.UTF-8' did not. -ENV LANG C.UTF-8 - -RUN pip3 install --upgrade pip - ENV R_LIBS "/usr/local/lib/R/site-library" COPY common/install_devtools.R . - RUN Rscript install_devtools.R COPY workers/install_downloader_R_only.R . - RUN Rscript install_downloader_R_only.R # Aspera will only install as the current user. @@ -71,22 +76,19 @@ RUN rm aspera-cli-3.9.1-0.tar.bz2 # Now that we're done installing Aspera go back to being root for a bit. USER root -COPY config config -COPY .boto .boto - -COPY workers/data_refinery_workers/downloaders/requirements.txt . - -RUN pip3 install -r requirements.txt - +RUN pip3 install --upgrade pip # Install this rpy2 here instead of via requirements.txt because # pip-compile throws an error for it. RUN pip3 install rpy2==3.4.5 -COPY common/dist/data-refinery-common-* common/ + +COPY workers/data_refinery_workers/downloaders/requirements.txt . +RUN pip3 install --ignore-installed -r requirements.txt # Get the latest version from the dist directory. +COPY common/dist/data-refinery-common-* common/ RUN pip3 install common/$(ls common -1 | sort --version-sort | tail -1) -# Clear our the pip3 cache +# Clear out the pip3 cache. RUN rm -rf /root/.cache ARG SYSTEM_VERSION @@ -95,6 +97,8 @@ ENV SYSTEM_VERSION $SYSTEM_VERSION USER user +COPY .boto .boto +COPY config config COPY workers/ . 
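# Why --version-sort is used in the common/ install step above (the filenames
# here are hypothetical): with data-refinery-common-1.9.0.tar.gz and
# data-refinery-common-1.10.0.tar.gz both present, a plain lexicographic sort
# would rank 1.10.0 before 1.9.0, whereas `ls common -1 | sort --version-sort |
# tail -1` compares the numeric components and correctly picks 1.10.0, i.e. the
# most recently built package.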
ENTRYPOINT [] diff --git a/workers/dockerfiles/Dockerfile.illumina b/workers/dockerfiles/Dockerfile.illumina index bc2294425..e4cc70268 100644 --- a/workers/dockerfiles/Dockerfile.illumina +++ b/workers/dockerfiles/Dockerfile.illumina @@ -1,46 +1,57 @@ FROM ubuntu:18.04 -RUN apt-get update -qq +# Prevent tzdata from prompting us for a timezone and hanging the build. +ENV DEBIAN_FRONTEND=noninteractive + +# Source: https://github.com/thisbejim/Pyrebase/issues/87#issuecomment-354452082 +# For whatever reason this worked and 'en_US.UTF-8' did not. +ENV LANG C.UTF-8 + +RUN apt-get update RUN apt-get install -y software-properties-common RUN add-apt-repository ppa:apt-fast/stable +RUN add-apt-repository ppa:deadsnakes/ppa +RUN add-apt-repository ppa:savoury1/llvm-defaults-10 + RUN apt-get update -qq -RUN apt-get -y install apt-fast +RUN apt-get install -y apt-fast apt-transport-https # The packages related to R are somewhat weird, see the README for more details. - COPY workers/CRAN.gpg . -RUN \ - apt-fast update -qq && \ - apt-get install -y apt-transport-https && \ - apt-fast install -y lsb-release && \ - echo "deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/" \ - >> /etc/apt/sources.list.d/added_repos.list && \ - apt-key add CRAN.gpg && \ - apt-fast update -qq +RUN apt-key add CRAN.gpg +RUN echo "deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/" \ + >> /etc/apt/sources.list.d/added_repos.list -# Prevent tzdata from prompting us for a timezone and hanging the build. -ENV DEBIAN_FRONTEND=noninteractive - -RUN apt-fast install -y \ +RUN apt-fast update -qq && apt-fast install -y \ + build-essential \ + cmake \ + curl \ + cython3 \ ed \ + gfortran \ git \ - mercurial \ libcairo-dev \ + libcurl4-openssl-dev \ libedit-dev \ + libblas-dev \ + liblapack-dev \ + libpq-dev \ + libssl-dev \ + libxml2-dev \ + llvm-10-dev \ lsb-release \ - python3 \ + mercurial \ + pkg-config \ python3-pip \ + python3.8 \ + python3.8-dev \ r-base-core \ - r-base-dev \ - libpq-dev \ - libxml2-dev \ - libssl-dev \ - libcurl4-openssl-dev \ - curl \ - wget && \ - rm -rf /var/lib/apt/lists/* + wget RUN rm CRAN.gpg +RUN apt-get clean; rm -rf /var/lib/apt/lists/* +RUN ln -s /usr/bin/llvm-config-10 /usr/bin/llvm-config +RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1 RUN groupadd user && useradd --create-home --home-dir /home/user -g user user WORKDIR /home/user @@ -48,37 +59,24 @@ WORKDIR /home/user ENV R_LIBS "/usr/local/lib/R/site-library" COPY common/install_devtools.R . - RUN Rscript install_devtools.R COPY workers/R_dependencies/illumina/dependencies.R . RUN Rscript dependencies.R -# These are for Illumina +# These are for Illumina. COPY workers/illumina_dependencies.R . RUN Rscript illumina_dependencies.R -# Source: https://github.com/thisbejim/Pyrebase/issues/87#issuecomment-354452082 -# For whatever reason this worked and 'en_US.UTF-8' did not. -ENV LANG C.UTF-8 - RUN pip3 install --upgrade pip - -COPY config/ config/ -COPY .boto .boto - -COPY workers/illumina_probe_maps/ probe_maps/ - COPY workers/data_refinery_workers/processors/requirements.txt . - -RUN pip3 install -r requirements.txt - -COPY common/dist/data-refinery-common-* common/ +RUN pip3 install --ignore-installed -r requirements.txt # Get the latest version from the dist directory. +COPY common/dist/data-refinery-common-* common/ RUN pip3 install common/$(ls common -1 | sort --version-sort | tail -1) -# Clear our the pip3 cache +# Clear out the pip3 cache. 
RUN rm -rf /root/.cache ARG SYSTEM_VERSION @@ -87,7 +85,10 @@ ENV SYSTEM_VERSION $SYSTEM_VERSION USER user -COPY workers/data_refinery_workers/processors/detect_database.R . +COPY .boto .boto +COPY config/ config/ COPY workers/ . +COPY workers/data_refinery_workers/processors/detect_database.R . +COPY workers/illumina_probe_maps/ probe_maps/ ENTRYPOINT [] diff --git a/workers/dockerfiles/Dockerfile.no_op b/workers/dockerfiles/Dockerfile.no_op index ee59a63a5..98f35d772 100644 --- a/workers/dockerfiles/Dockerfile.no_op +++ b/workers/dockerfiles/Dockerfile.no_op @@ -1,51 +1,56 @@ FROM ubuntu:18.04 +# Prevent tzdata from prompting us for a timezone and hanging the build. +ENV DEBIAN_FRONTEND=noninteractive + +# Source: https://github.com/thisbejim/Pyrebase/issues/87#issuecomment-354452082 +# For whatever reason this worked and 'en_US.UTF-8' did not. +ENV LANG C.UTF-8 + RUN apt-get update -qq RUN apt-get install -y software-properties-common + RUN add-apt-repository ppa:apt-fast/stable # deadsnakes packages new python versions for older Ubuntu releases RUN add-apt-repository ppa:deadsnakes/ppa + RUN apt-get update -qq RUN apt-get -y install apt-fast -# Prevent tzdata from prompting us for a timezone and hanging the build. -ENV DEBIAN_FRONTEND=noninteractive - # The packages related to R are somewhat weird, see the README for more details. - COPY workers/CRAN.gpg . -RUN \ - apt-fast update -qq && \ - apt-get install -y apt-transport-https && \ - apt-fast install -y lsb-release && \ - echo "deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/" \ - >> /etc/apt/sources.list.d/added_repos.list && \ - apt-key add CRAN.gpg && \ - apt-fast update -qq && \ - apt-fast install -y \ +RUN apt-key add CRAN.gpg +RUN echo "deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/" \ + >> /etc/apt/sources.list.d/added_repos.list + +RUN apt-fast update -qq && apt-fast install -y \ + build-essential \ + cmake \ + curl \ + cython3 \ ed \ git \ - mercurial \ libcairo-dev \ + libcurl4-openssl-dev \ libedit-dev \ + libfreetype6-dev \ + libpq-dev \ + libssl-dev \ + libxml2-dev \ + llvm-10-dev \ lsb-release \ - python3.6 \ - python3.6-dev \ + mercurial \ + pkg-config \ python3-pip \ + python3.8 \ + python3.8-dev \ r-base-core \ - r-base-dev \ - libpq-dev \ - libxml2-dev \ - libssl-dev \ - libcurl4-openssl-dev \ - curl \ - unzip \ - wget && \ - rm -rf /var/lib/apt/lists/* -RUN rm CRAN.gpg + wget -# Set the system python version to python3.6 from deadsnakes -RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.6 1 +RUN rm CRAN.gpg +RUN apt-get clean; rm -rf /var/lib/apt/lists/* +RUN ln -s /usr/bin/llvm-config-10 /usr/bin/llvm-config +RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1 RUN groupadd user && useradd --create-home --home-dir /home/user -g user user WORKDIR /home/user @@ -54,7 +59,6 @@ WORKDIR /home/user ENV R_LIBS "/usr/local/lib/R/site-library" COPY common/install_devtools.R . - RUN Rscript install_devtools.R COPY workers/R_dependencies/no_op/dependencies.R . @@ -62,6 +66,7 @@ RUN Rscript dependencies.R COPY workers/install_gene_convert.R . RUN Rscript install_gene_convert.R + RUN mkdir -p gene_indexes WORKDIR /home/user/gene_indexes ENV ID_REFINERY_URL https://zenodo.org/record/1410647/files/all_1536267482.zip @@ -72,23 +77,17 @@ RUN rm *.zip WORKDIR /home/user # End Noop-specific -COPY workers/data_refinery_workers/processors/requirements.txt . 
- -# Source: https://github.com/thisbejim/Pyrebase/issues/87#issuecomment-354452082 -# For whatever reason this worked and 'en_US.UTF-8' did not. -ENV LANG C.UTF-8 - RUN pip3 install --upgrade pip -RUN pip3 install -r requirements.txt +RUN pip3 install numpy -COPY config/ config/ -COPY .boto .boto -COPY common/dist/data-refinery-common-* common/ +COPY workers/data_refinery_workers/processors/requirements.txt . +RUN pip3 install --ignore-installed -r requirements.txt # Get the latest version from the dist directory. +COPY common/dist/data-refinery-common-* common/ RUN pip3 install common/$(ls common -1 | sort --version-sort | tail -1) -# Clear our the pip3 cache +# Clear out the pip3 cache. RUN rm -rf /root/.cache ARG SYSTEM_VERSION @@ -97,6 +96,8 @@ ENV SYSTEM_VERSION $SYSTEM_VERSION USER user +COPY .boto .boto +COPY config/ config/ COPY workers/ . ENTRYPOINT [] diff --git a/workers/dockerfiles/Dockerfile.salmon b/workers/dockerfiles/Dockerfile.salmon index 31922880e..8bfbbe9a7 100644 --- a/workers/dockerfiles/Dockerfile.salmon +++ b/workers/dockerfiles/Dockerfile.salmon @@ -1,44 +1,54 @@ FROM ubuntu:18.04 -RUN apt-get update -qq +# Prevent tzdata from prompting us for a timezone and hanging the build. +ENV DEBIAN_FRONTEND=noninteractive + +# Source: https://github.com/thisbejim/Pyrebase/issues/87#issuecomment-354452082 +# For whatever reason this worked and 'en_US.UTF-8' did not. +ENV LANG C.UTF-8 + +RUN apt-get update RUN apt-get install -y software-properties-common RUN add-apt-repository ppa:apt-fast/stable -RUN apt-get update -qq -RUN apt-get -y install apt-fast +RUN add-apt-repository ppa:deadsnakes/ppa +RUN add-apt-repository ppa:savoury1/llvm-defaults-10 -# Prevent tzdata from prompting us for a timezone and hanging the build. -ENV DEBIAN_FRONTEND=noninteractive +RUN apt-get update -qq +RUN apt-get install -y apt-fast apt-transport-https # The packages related to R are somewhat weird, see the README for more details. - COPY workers/CRAN.gpg . +RUN apt-key add CRAN.gpg +RUN echo "deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/" \ + >> /etc/apt/sources.list.d/added_repos.list -RUN \ - apt-fast update -qq && \ - apt-get install -y apt-transport-https && \ - apt-fast install -y lsb-release && \ - echo "deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/" \ - >> /etc/apt/sources.list.d/added_repos.list && \ - apt-key add CRAN.gpg && \ - apt-fast update -qq && \ - apt-fast install -y \ +RUN apt-fast update -qq && apt-fast install -y \ + build-essential \ + cmake \ + curl \ + cython3 \ ed \ git \ - mercurial \ libcairo-dev \ + libcurl4-openssl-dev \ libedit-dev \ + libpq-dev \ + libssl-dev \ + libxml2-dev \ + llvm-10-dev \ lsb-release \ - python3 \ + mercurial \ + pkg-config \ python3-pip \ - libxml2-dev \ - cmake \ + python3.8 \ + python3.8-dev \ r-base-core \ - libssl-dev \ - libcurl4-openssl-dev \ - curl \ - wget && \ - rm -rf /var/lib/apt/lists/* + wget + RUN rm CRAN.gpg +RUN apt-get upgrade; apt-get clean; rm -rf /var/lib/apt/lists/* +RUN ln -s /usr/bin/llvm-config-10 /usr/bin/llvm-config +RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1 RUN groupadd user && useradd --create-home --home-dir /home/user -g user user WORKDIR /home/user @@ -60,44 +70,40 @@ RUN ln -sf `pwd`/Salmon-${SALMON_VERSION}_linux_x86_64/bin/salmon /usr/local/bin RUN rm -f Salmon-${SALMON_VERSION}_linux_x86_64.tar.gz # End Salmon installation. -# Install R dependencies +# Install R dependencies. COPY common/install_devtools.R . 
RUN Rscript install_devtools.R + COPY workers/R_dependencies/tximport/dependencies.R tximport_dependencies.R RUN Rscript tximport_dependencies.R -# Install tximport +# Install tximport. COPY workers/install_tximport.R . RUN Rscript install_tximport.R -# Install SalmonTools +RUN pip3 install --upgrade pip +RUN pip3 install numpy + +COPY workers/data_refinery_workers/processors/requirements.txt . +RUN pip3 install --ignore-installed -r requirements.txt + +# Get the latest version from the dist directory. +COPY common/dist/data-refinery-common-* common/ +RUN pip3 install common/$(ls common -1 | sort --version-sort | tail -1) + +# Install SalmonTools. RUN git clone https://github.com/COMBINE-lab/SalmonTools.git && cd SalmonTools && git checkout 3e6654c2c10a5225498b623056993947fa688afc RUN cd SalmonTools && cmake . -DCMAKE_INSTALL_PREFIX=/usr/local && make install RUN rm -rf SalmonTools -# Install sra-tools +# Install sra-tools. ENV SRA_VERSION 2.9.1 -RUN wget "http://ftp-trace.ncbi.nlm.nih.gov/sra/sdk/${SRA_VERSION}/sratoolkit.${SRA_VERSION}-ubuntu64.tar.gz" && \ +RUN wget "https://ftp.ncbi.nlm.nih.gov/sra/sdk/${SRA_VERSION}/sratoolkit.${SRA_VERSION}-ubuntu64.tar.gz" && \ tar zxfv sratoolkit.${SRA_VERSION}-ubuntu64.tar.gz && \ cp -r sratoolkit.${SRA_VERSION}-ubuntu64/bin/* /usr/bin -# Source: https://github.com/thisbejim/Pyrebase/issues/87#issuecomment-354452082 -# For whatever reason this worked and 'en_US.UTF-8' did not. -ENV LANG C.UTF-8 - -RUN pip3 install --upgrade pip - -COPY config/ config/ -COPY .boto .boto - -COPY workers/data_refinery_workers/processors/requirements.txt . - -RUN pip3 install -r requirements.txt - -COPY common/dist/data-refinery-common-* common/ - -# Get the latest version from the dist directory. -RUN pip3 install common/$(ls common -1 | sort --version-sort | tail -1) +# Clear out the pip3 cache. +RUN rm -rf /root/.cache ARG SYSTEM_VERSION @@ -105,6 +111,8 @@ ENV SYSTEM_VERSION $SYSTEM_VERSION USER user +COPY .boto .boto +COPY config/ config/ COPY workers/ . ENTRYPOINT [] diff --git a/workers/dockerfiles/Dockerfile.smasher b/workers/dockerfiles/Dockerfile.smasher index 7b95644db..313ba2150 100644 --- a/workers/dockerfiles/Dockerfile.smasher +++ b/workers/dockerfiles/Dockerfile.smasher @@ -1,42 +1,57 @@ FROM ubuntu:18.04 -RUN apt-get update -qq +# Prevent tzdata from prompting us for a timezone and hanging the build. +ENV DEBIAN_FRONTEND=noninteractive + +# Source: https://github.com/thisbejim/Pyrebase/issues/87#issuecomment-354452082 +# For whatever reason this worked and 'en_US.UTF-8' did not. +ENV LANG C.UTF-8 + +RUN apt-get update RUN apt-get install -y software-properties-common RUN add-apt-repository ppa:apt-fast/stable -RUN apt-get update -qq -RUN apt-get -y install apt-fast +RUN add-apt-repository ppa:deadsnakes/ppa +RUN add-apt-repository ppa:savoury1/llvm-defaults-10 -# Prevent tzdata from prompting us for a timezone and hanging the build. -ENV DEBIAN_FRONTEND=noninteractive +RUN apt-get update -qq +RUN apt-get install -y apt-fast apt-transport-https # The packages related to R are somewhat weird, see the README for more details. - COPY workers/CRAN.gpg . 
-RUN \ - apt-fast update -qq && \ - apt-get install -y apt-transport-https && \ - apt-fast install -y lsb-release && \ - echo "deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/" \ - >> /etc/apt/sources.list.d/added_repos.list && \ - apt-key add CRAN.gpg && \ - apt-fast update -qq && \ - apt-fast install -y \ +RUN apt-key add CRAN.gpg +RUN echo "deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/" \ + >> /etc/apt/sources.list.d/added_repos.list + +RUN apt-fast update -qq && apt-fast install -y \ + build-essential \ + cmake \ + curl \ + cython3 \ ed \ + gfortran \ git \ - python3 \ - python3-pip \ + libcairo-dev \ libcurl4-openssl-dev \ + libedit-dev \ + libblas-dev \ + liblapack-dev \ libpq-dev \ - r-base-core \ - r-base-dev \ - libpq-dev \ - libxml2-dev \ libssl-dev \ - libcurl4-openssl-dev \ - curl \ - wget && \ - rm -rf /var/lib/apt/lists/* + libxml2-dev \ + llvm-10-dev \ + lsb-release \ + mercurial \ + pkg-config \ + python3-pip \ + python3.8 \ + python3.8-dev \ + r-base-core \ + wget + RUN rm CRAN.gpg +RUN apt-get clean; rm -rf /var/lib/apt/lists/* +RUN ln -s /usr/bin/llvm-config-10 /usr/bin/llvm-config +RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1 RUN groupadd user && useradd --create-home --home-dir /home/user -g user user WORKDIR /home/user @@ -45,7 +60,6 @@ WORKDIR /home/user ENV R_LIBS "/usr/local/lib/R/site-library" COPY common/install_devtools.R . - RUN Rscript install_devtools.R COPY workers/R_dependencies/qn/dependencies.R . @@ -55,29 +69,19 @@ COPY workers/qn_dependencies.R . RUN Rscript qn_dependencies.R # End QN-specific -# Source: https://github.com/thisbejim/Pyrebase/issues/87#issuecomment-354452082 -# For whatever reason this worked and 'en_US.UTF-8' did not. -ENV LANG C.UTF-8 - RUN pip3 install --upgrade pip - # Smasher-specific requirements -RUN pip3 install numpy scipy matplotlib pandas==0.25.3 scikit-learn sympy nose rpy2==3.4.5 tzlocal +RUN pip3 install --ignore-installed nose numpy rpy2==3.4.5 # End smasher-specific -COPY config/ config/ -COPY .boto .boto - COPY workers/data_refinery_workers/processors/requirements.txt . - -RUN pip3 install -r requirements.txt - -COPY common/dist/data-refinery-common-* common/ +RUN pip3 install --ignore-installed -r requirements.txt # Get the latest version from the dist directory. +COPY common/dist/data-refinery-common-* common/ RUN pip3 install common/$(ls common -1 | sort --version-sort | tail -1) -# Clear our the pip3 cache +# Clear out the pip3 cache. RUN rm -rf /root/.cache ARG SYSTEM_VERSION @@ -86,6 +90,8 @@ ENV SYSTEM_VERSION $SYSTEM_VERSION USER user +COPY .boto .boto +COPY config/ config/ COPY workers/ . ENTRYPOINT [] diff --git a/workers/dockerfiles/Dockerfile.transcriptome b/workers/dockerfiles/Dockerfile.transcriptome index 5cb65a6b2..d1ac0ea63 100644 --- a/workers/dockerfiles/Dockerfile.transcriptome +++ b/workers/dockerfiles/Dockerfile.transcriptome @@ -1,36 +1,49 @@ FROM ubuntu:18.04 +# Prevent tzdata from prompting us for a timezone and hanging the build. +ENV DEBIAN_FRONTEND=noninteractive + +# Source: https://github.com/thisbejim/Pyrebase/issues/87#issuecomment-354452082 +# For whatever reason this worked and 'en_US.UTF-8' did not. 
+ENV LANG C.UTF-8 + RUN apt-get update -qq RUN apt-get install -y software-properties-common RUN add-apt-repository ppa:apt-fast/stable +RUN add-apt-repository ppa:deadsnakes/ppa +RUN add-apt-repository ppa:savoury1/llvm-defaults-10 + RUN apt-get update -qq RUN apt-get -y install apt-fast -# Prevent tzdata from prompting us for a timezone and hanging the build. -ENV DEBIAN_FRONTEND=noninteractive - -RUN \ - apt-fast update -qq && \ - apt-fast install -y \ +RUN apt-fast update -qq && apt-fast install -y \ + build-essential \ + curl \ + cython3 \ ed \ git \ - python3 \ - python3-pip \ libcurl4-openssl-dev \ + libfreetype6-dev \ libpq-dev \ - zlib1g-dev \ - curl \ - wget && \ - rm -rf /var/lib/apt/lists/* + llvm-10-dev \ + pkg-config \ + python3-pip \ + python3.8 \ + python3.8-dev \ + wget \ + zlib1g-dev + +RUN rm CRAN.gpg +RUN apt-get clean; rm -rf /var/lib/apt/lists/* +RUN ln -s /usr/bin/llvm-config-10 /usr/bin/llvm-config +RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1 RUN groupadd user && useradd --create-home --home-dir /home/user -g user user WORKDIR /home/user # It's annoying that this can only be installed via git. RUN git clone https://github.com/deweylab/RSEM.git - RUN cd RSEM && make install - RUN rm -rf RSEM # Install Salmon @@ -48,36 +61,26 @@ ENV SALMON_VERSION 0.13.1 # ENV SALMON_VERSION 0.10.2 RUN wget https://github.com/COMBINE-lab/salmon/releases/download/v${SALMON_VERSION}/Salmon-${SALMON_VERSION}_linux_x86_64.tar.gz - RUN tar -xzf Salmon-${SALMON_VERSION}_linux_x86_64.tar.gz # Salmon can extract to a different directory than the name of the tar file. RUN cp `tar -tzf Salmon-${SALMON_VERSION}_linux_x86_64.tar.gz | head -1 | cut -f1 -d"/"`/bin/salmon /usr/local/bin - RUN cp `tar -tzf Salmon-${SALMON_VERSION}_linux_x86_64.tar.gz | head -1 | cut -f1 -d"/"`/lib/* /usr/local/lib RUN rm -r Salmon* # End Salmon installation. -# Source: https://github.com/thisbejim/Pyrebase/issues/87#issuecomment-354452082 -# For whatever reason this worked and 'en_US.UTF-8' did not. -ENV LANG C.UTF-8 - RUN pip3 install --upgrade pip - -COPY config/ config/ -COPY .boto .boto +RUN pip3 install numpy COPY workers/data_refinery_workers/processors/requirements.txt . - -RUN pip3 install -r requirements.txt - -COPY common/dist/data-refinery-common-* common/ +RUN pip3 install --ignore-installed -r requirements.txt # Get the latest version from the dist directory. +COPY common/dist/data-refinery-common-* common/ RUN pip3 install common/$(ls common -1 | sort --version-sort | tail -1) -# Clear our the pip3 cache +# Clear out the pip3 cache. RUN rm -rf /root/.cache ARG SYSTEM_VERSION @@ -86,6 +89,8 @@ ENV SYSTEM_VERSION $SYSTEM_VERSION USER user +COPY .boto .boto +COPY config/ config/ COPY workers/ . ENTRYPOINT [] From 46dfdae0bc77c8a6d5e66eba4844ab192c3bad45 Mon Sep 17 00:00:00 2001 From: Arkadii Yakovets Date: Thu, 1 Sep 2022 11:03:18 -0700 Subject: [PATCH 12/24] Add `Accession` model. Update pre-commit config. 
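A minimal usage sketch for the new model, assuming a configured Django
environment; the accession code, source, and technology values below are
hypothetical:

    from django.utils import timezone

    from data_refinery_common.models import Accession

    # get_or_create keyed on the fields of the unique (code, source, technology)
    # constraint keeps repeated gathering runs idempotent: inserting the same
    # accession twice returns the existing row instead of creating a duplicate.
    accession, created = Accession.objects.get_or_create(
        code="E-MTAB-0000",  # hypothetical accession code
        source="ebi_biostudies",
        technology="microarray",
        defaults={
            "organism": "homo sapiens",
            "published_date": timezone.now(),
            "sample_count": 8,
        },
    )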
--- .pre-commit-config.yaml | 2 +- .../migrations/0071_auto_20220901_1653.py | 44 +++++++++++++++++++ .../data_refinery_common/models/__init__.py | 1 + .../data_refinery_common/models/accession.py | 22 ++++++++++ 4 files changed, 68 insertions(+), 1 deletion(-) create mode 100644 common/data_refinery_common/migrations/0071_auto_20220901_1653.py create mode 100644 common/data_refinery_common/models/accession.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d024704da..b651ce24a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -22,7 +22,7 @@ repos: - id: isort - repo: https://github.com/psf/black - rev: 19.10b0 + rev: 22.3.0 hooks: - id: black args: [--line-length=100] diff --git a/common/data_refinery_common/migrations/0071_auto_20220901_1653.py b/common/data_refinery_common/migrations/0071_auto_20220901_1653.py new file mode 100644 index 000000000..c7d3b0b63 --- /dev/null +++ b/common/data_refinery_common/migrations/0071_auto_20220901_1653.py @@ -0,0 +1,44 @@ +# Generated by Django 3.2.7 on 2022-09-01 16:53 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("data_refinery_common", "0070_auto_20211208_2118"), + ] + + operations = [ + migrations.CreateModel( + name="Accession", + fields=[ + ( + "id", + models.AutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ("code", models.TextField()), + ("created_at", models.DateTimeField(auto_now_add=True)), + ("last_modified_at", models.DateTimeField(auto_now=True)), + ("organism", models.TextField()), + ("published_date", models.DateTimeField()), + ("sample_count", models.PositiveIntegerField(default=0)), + ("source", models.TextField()), + ("technology", models.TextField()), + ], + options={ + "db_table": "accessions", + }, + ), + migrations.AddConstraint( + model_name="accession", + constraint=models.UniqueConstraint( + fields=("code", "source", "technology"), name="unique_accession" + ), + ), + ] diff --git a/common/data_refinery_common/models/__init__.py b/common/data_refinery_common/models/__init__.py index 39abe7ee3..8e9564153 100644 --- a/common/data_refinery_common/models/__init__.py +++ b/common/data_refinery_common/models/__init__.py @@ -1,3 +1,4 @@ +from data_refinery_common.models.accession import Accession # noqa from data_refinery_common.models.api_token import APIToken # noqa from data_refinery_common.models.associations.compendium_result_organism_association import ( # noqa CompendiumResultOrganismAssociation, diff --git a/common/data_refinery_common/models/accession.py b/common/data_refinery_common/models/accession.py new file mode 100644 index 000000000..dc93cfd88 --- /dev/null +++ b/common/data_refinery_common/models/accession.py @@ -0,0 +1,22 @@ +from django.db import models + + +class Accession(models.Model): + """Accession model.""" + + class Meta: + constraints = ( + models.UniqueConstraint( + fields=("code", "source", "technology"), name="unique_accession" + ), + ) + db_table = "accessions" + + code = models.TextField() + created_at = models.DateTimeField(auto_now_add=True) + last_modified_at = models.DateTimeField(auto_now=True) + organism = models.TextField() + published_date = models.DateTimeField() + sample_count = models.PositiveIntegerField(default=0) + source = models.TextField() + technology = models.TextField() From cb9249981fb8081d4bc5cf041598b17f77d83a55 Mon Sep 17 00:00:00 2001 From: Arkadii Yakovets Date: Thu, 8 Sep 2022 18:12:39 -0700 Subject: [PATCH 13/24] 
Port Python script to Django command. - Introduce AccessionBacklogEntry model. - Clean up command flags. - Get previous accessions from the DB. - --- .../migrations/0071_accessionbacklogentry.py | 38 + .../data_refinery_common/models/accession.py | 82 +- .../gatherer/__init__.py | 0 .../gatherer/management/__init__.py | 0 .../gatherer/management/commands/__init__.py | 0 .../management/commands/gather_accessions.py | 731 ++++++++++++++++++ foreman/data_refinery_foreman/settings.py | 17 +- foreman/dockerfiles/Dockerfile.foreman | 2 + 8 files changed, 856 insertions(+), 14 deletions(-) create mode 100644 common/data_refinery_common/migrations/0071_accessionbacklogentry.py create mode 100644 foreman/data_refinery_foreman/gatherer/__init__.py create mode 100644 foreman/data_refinery_foreman/gatherer/management/__init__.py create mode 100644 foreman/data_refinery_foreman/gatherer/management/commands/__init__.py create mode 100644 foreman/data_refinery_foreman/gatherer/management/commands/gather_accessions.py diff --git a/common/data_refinery_common/migrations/0071_accessionbacklogentry.py b/common/data_refinery_common/migrations/0071_accessionbacklogentry.py new file mode 100644 index 000000000..86c04daed --- /dev/null +++ b/common/data_refinery_common/migrations/0071_accessionbacklogentry.py @@ -0,0 +1,38 @@ +# Generated by Django 3.2.7 on 2022-09-07 19:31 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("data_refinery_common", "0070_auto_20211208_2118"), + ] + + operations = [ + migrations.CreateModel( + name="AccessionBacklogEntry", + fields=[ + ( + "id", + models.AutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ("code", models.TextField(unique=True)), + ("created_at", models.DateTimeField(auto_now_add=True)), + ("last_modified_at", models.DateTimeField(auto_now=True)), + ("organism", models.TextField()), + ("published_date", models.DateTimeField()), + ("sample_count", models.PositiveIntegerField(default=0)), + ("source", models.TextField()), + ("technology", models.TextField()), + ], + options={ + "db_table": "accession_backlog", + }, + ), + ] diff --git a/common/data_refinery_common/models/accession.py b/common/data_refinery_common/models/accession.py index dc93cfd88..6ac62da9f 100644 --- a/common/data_refinery_common/models/accession.py +++ b/common/data_refinery_common/models/accession.py @@ -1,18 +1,16 @@ +from datetime import datetime + from django.db import models +from django.utils import timezone -class Accession(models.Model): - """Accession model.""" +class AccessionBacklogEntry(models.Model): + """Accession backlog entry model.""" class Meta: - constraints = ( - models.UniqueConstraint( - fields=("code", "source", "technology"), name="unique_accession" - ), - ) - db_table = "accessions" - - code = models.TextField() + db_table = "accession_backlog" + + code = models.TextField(unique=True) created_at = models.DateTimeField(auto_now_add=True) last_modified_at = models.DateTimeField(auto_now=True) organism = models.TextField() @@ -20,3 +18,67 @@ class Meta: sample_count = models.PositiveIntegerField(default=0) source = models.TextField() technology = models.TextField() + + def __eq__(self, other: object) -> bool: + """Returns True if two objects are equal. 
Otherwise returns False.""" + return isinstance(other, AccessionBacklogEntry) and self.code == other.code + + def __hash__(self) -> int: + """Returns accession object unique hash value.""" + return hash(self.code) + + def __str__(self) -> str: + """Returns accession default string representation.""" + return ", ".join((self.code, self.technology, self.source, str(self.published_date.date()))) + + @staticmethod + def create_from_ma_ae_entry(entry): + """Creates accession object from MicroArray ArrayExpress entry.""" + accession = AccessionBacklogEntry() + accession.code = entry["accession"] + accession.source = "ebi_biostudies" + accession.technology = "microarray" + + if "organism" in entry: + accession.organism = entry["organism"] + if "release_date" in entry: + accession.published_date = timezone.make_aware( + datetime.strptime(entry["release_date"], "%Y-%m-%d") + ) + + return accession + + @staticmethod + def create_from_ma_geo_entry(entry): + """Creates accession object from MicroArray GEO meta DB entry.""" + accession = AccessionBacklogEntry() + accession.code = entry["gse"] + accession.source = "geo_meta_db" + accession.technology = "microarray" + + if "organism" in entry: + accession.organism = entry["organism"].lower() + if "submission_date" in entry: + + accession.published_date = timezone.make_aware( + datetime.strptime(entry["submission_date"], "%Y-%m-%d") + ) + + return accession + + @staticmethod + def create_from_rnaseq_entry(entry): + """Creates accession object from RNA-Seq entry.""" + accession = AccessionBacklogEntry() + accession.code = entry["secondary_study_accession"] + accession.source = "ebi_ena_portal" + accession.technology = "rna-seq" + + if "scientific_name" in entry: + accession.organism = entry["scientific_name"].lower() + if "first_public" in entry: + accession.published_date = timezone.make_aware( + datetime.strptime(entry["first_public"], "%Y-%m-%d") + ) + + return accession diff --git a/foreman/data_refinery_foreman/gatherer/__init__.py b/foreman/data_refinery_foreman/gatherer/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/foreman/data_refinery_foreman/gatherer/management/__init__.py b/foreman/data_refinery_foreman/gatherer/management/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/foreman/data_refinery_foreman/gatherer/management/commands/__init__.py b/foreman/data_refinery_foreman/gatherer/management/commands/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/foreman/data_refinery_foreman/gatherer/management/commands/gather_accessions.py b/foreman/data_refinery_foreman/gatherer/management/commands/gather_accessions.py new file mode 100644 index 000000000..c4808a191 --- /dev/null +++ b/foreman/data_refinery_foreman/gatherer/management/commands/gather_accessions.py @@ -0,0 +1,731 @@ +"""MicroArray (ArrayExpress, GEO) and RNA-Seq accession gathering automation. +Data sources: + - https://www.ebi.ac.uk/biostudies/help (MicroArray ArrayExpress). + - local SQLite meta DB from https://www.bioconductor.org/packages/release/bioc/html/GEOmetadb.html + (MicroArray GEO). + - https://www.ebi.ac.uk/ena/portal/api/ (RNA-Seq). 
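Example invocation (an illustrative sketch; the taxon ID, date, and count values
are arbitrary):
    python3 manage.py gather_accessions --rna-seq --taxon-id 9606 --since 2022-01-01 --count 100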
+""" + +import argparse +import logging +import os +import re +import sqlite3 +from datetime import datetime +from http.client import RemoteDisconnected +from json.decoder import JSONDecodeError +from typing import List, Set +from urllib.parse import quote + +from django.core.management.base import BaseCommand + +import requests +from requests.exceptions import ConnectionError, ConnectTimeout +from retrying import retry +from urllib3.exceptions import ProtocolError + +from data_refinery_common.logging import get_and_configure_logger +from data_refinery_common.models.accession import AccessionBacklogEntry +from data_refinery_common.models.experiment import Experiment + +log = get_and_configure_logger(__name__) + + +class Command(BaseCommand): + """Creates agents and runs actual accession gathering.""" + + RE_ACCESSION = re.compile(r"(\D+)(\d+)") + RE_DATE = re.compile(r"\d{4}-\d{2}-\d{2}") + + # TODO(ark): remove after upgrade to python3.8 where parser argument + # "extend" action is directly available. + # https://docs.python.org/3.8/library/argparse.html#action + class ExtendAction(argparse.Action): + def __call__(self, parser, namespace, values, option_string=None): + items = getattr(namespace, self.dest) or [] + items.extend(values) + setattr(namespace, self.dest, items) + + def add_arguments(self, parser) -> None: + parser.register("action", "extend", Command.ExtendAction) + + parser.add_argument( + "--ae-id", + action="extend", + nargs="+", + type=str, + help="ArrayExpress ID(s) to use for filtering.", + ) + parser.add_argument( + "--ae-ids-file", + type=str, + help="Path to a file containing ArrayExpress ID(s) to use for filtering.", + ) + parser.add_argument("-c", "--count", type=int, help="Number of accessions to collect.") + parser.add_argument( + "-d", + "--dry-run", + action="store_true", + default=False, + help="Do not write the result to the database.", + ) + parser.add_argument( + "-e", + "--exclude-previous", + action="store_true", + default=True, + help="Exclude previously gathered or surveyed accessions.", + ) + parser.add_argument( + "-ne", + "--no-exclude-previous", + action="store_false", + default=False, + dest="exclude_previous", + help="Do not exclude previously gathered or surveyed accessions.", + ) + parser.add_argument( + "--gpl-id", + action="extend", + nargs="+", + type=str, + help="GEO platform ID(s) to use for filtering.", + ) + parser.add_argument( + "--gpl-ids-file", + type=str, + help="Path to a file containing GEO platform ID(s) to use for filtering.", + ) + parser.add_argument( + "-k", + "--keyword", + type=str, + help="Keyword to use for filtering.", + ) + parser.add_argument( + "-m", + "--microarray", + action="store_true", + default=False, + help="Collect MicroArray accessions.", + ) + parser.add_argument( + "-o", "--organism", type=str, help="Organism name to use for filtering." 
+ ) + parser.add_argument( + "-r", + "--rna-seq", + action="store_true", + default=False, + help="Collect RNA-Seq accessions.", + ) + parser.add_argument( + "-s", + "--since", + type=str, + required=True, + help="Collect accessions made public on or after this date.", + ) + parser.add_argument( + "--taxon-id", + action="extend", + nargs="+", + type=int, + help="Taxon ID(s) to use for filtering.", + ) + parser.add_argument( + "--taxon-ids-file", + type=str, + help="Path to a file containing taxon ID(s) to use for filtering.", + ) + parser.add_argument( + "-u", + "--until", + type=str, + help="Collect accessions made public before or on this date.", + ) + parser.add_argument( + "-lv", + "--log-verbose", + action="store_true", + default=False, + help="Enable verbose log output.", + ) + + def set_verbosity_level(self, options) -> None: + """Configures log verbosity level.""" + if options["log_verbose"]: + log.addHandler(logging.StreamHandler()) + log.setLevel(logging.DEBUG) + else: + log.setLevel(logging.ERROR) + + def validate_args(self, options) -> None: + """Validates arguments.""" + if not options["microarray"] and not options["rna_seq"]: + exit("Either --microarray or --rna-seq must be specified.") + + errors = list() + since = options["since"] + until = options["until"] + if not self.RE_DATE.match(since): + errors.append('The -s, --since value must match "YYYY-MM-DD" format.') + if until and not self.RE_DATE.match(until): + errors.append('The -u, --until value must match "YYYY-MM-DD" format.') + if since and until and since > until: + errors.append("The -s, --since date must be earlier than -u, --until date.") + + keyword = options["keyword"] + organism = options["organism"] + if options["microarray"]: + ae_id = options["ae_id"] or options["ae_ids_file"] + gpl_id = options["gpl_id"] or options["gpl_ids_file"] + ids = ae_id or gpl_id + invalid_options_message = ( + "Exactly one of the keyword [-k, --keyword], organism [-o, --organism] or " + "ArrayExpress ID(s) [--ae-id, --ae-ids-file] / GEO platform ID(s) " + "[--gpl-id, --gpl-ids-file] must be specified." + ) + elif options["rna_seq"]: + taxon_id = options["taxon_id"] or options["taxon_ids_file"] + ids = taxon_id + invalid_options_message = ( + "Exactly one of the keyword [-k, --keyword], organism [-o, --organism] " + "or taxon ID(s) [--taxon-id, --taxon-ids-file] must be specified." + ) + + if len([option for option in (ids, keyword, organism) if option]) != 1: + errors.append(invalid_options_message) + + if errors: + exit("\n".join(errors)) + + def handle(self, *args, **options): + """Runs the accession gathering process.""" + self.validate_args(options) + self.set_verbosity_level(options) + + agents = list() + if options["rna_seq"]: + agents.append(RNASeqAccessionAgent(options)) + elif options["microarray"]: + if ( + options["ae_id"] + or options["ae_ids_file"] + or options["keyword"] + or options["organism"] + ): + agents.append(MicroArrayExpressAccessionAgent(options)) + if ( + options["gpl_id"] + or options["gpl_ids_file"] + or options["keyword"] + or options["organism"] + ): + agents.append(MicroArrayGEOAccessionAgent(options)) + + entries = set() + for agent in agents: + entries.update(agent.collect_data()) + + entries = sorted( # Sort the resulting list. + (entry for entry in entries if self.RE_ACCESSION.match(entry.code)), + key=lambda entry: ( + self.RE_ACCESSION.match(entry.code).group(1), + int(self.RE_ACCESSION.match(entry.code).group(2)), + ), + ) + # Limit the number of output entries. 
+ entries = entries[: options["count"]] if options["count"] else entries + + if options["dry_run"]: + if entries: + output = "\n".join((str(entry) for entry in entries)) + else: + output = "No accessions found." + print(output) + else: + AccessionBacklogEntry.objects.bulk_create(entries) + + +class AccessionAgentBase: + "Accession agent base class." + + previous_accessions = set() + retry_params = { + "retry_on_exception": lambda e: isinstance( + e, (ConnectionError, ConnectTimeout, ProtocolError, RemoteDisconnected) + ), + "stop_max_attempt_number": 5, + "wait_exponential_multiplier": 1000, # Seconds. + "wait_exponential_max": 16000, # Seconds. + } + + def __init__(self, options) -> None: + """Populates args and values for major variables.""" + self.options = options + self.count = options["count"] + self.keyword = options["keyword"] + self.organism = options["organism"] + self.since = options["since"] + self.until = options["until"] or datetime.now().strftime("%Y-%m-%d") + + self.populate_previous_accessions() + + def build_query(self): + """Returns query/query dict depending on the accession data source.""" + raise NotImplementedError + + def collect_data(self): + """Generates resulting entry collection.""" + raise NotImplementedError + + def fetch_data(self): + """Fetches data from an external or local data source.""" + raise NotImplementedError + + def get_ids(self): + """Gets IDs for query filtering depending on the accession technology.""" + raise NotImplementedError + + def populate_previous_accessions(self) -> None: + """Populates previous accession set from a provided excluded ids file.""" + if not self.options["exclude_previous"] or self.previous_accessions: + return + + # Gathered accessions. + self.previous_accessions.update( + (entry["code"] for entry in AccessionBacklogEntry.objects.values("code")) + ) + + # Surveyed accessions. + experiments = Experiment.objects.values("accession_code", "alternate_accession_code") + self.previous_accessions.update( + (experiment["accession_code"] for experiment in experiments) + ) + self.previous_accessions.update( + (experiment["alternate_accession_code"] for experiment in experiments) + ) + + +class MicroArrayExpressAccessionAgent(AccessionAgentBase): + """ + MicroArray ArrayExpress accession gathering agent. The data is fetched from + the BioStudies database. See https://www.ebi.ac.uk/biostudies/help and + https://www.ebi.ac.uk/biostudies/arrayexpress/help#programmatic for more + information about the API endpoints. + """ + + def __init__(self, options) -> None: + super().__init__(options) + + self.data_chunk_size = 100 + self.data_url = "https://www.ebi.ac.uk/biostudies/api/v1/search" + self.ids = self.get_ids() + + def build_query(self) -> dict: + """Returns a query dict for getting array/organism specific accessions.""" + query_dict = { + "directsub": "true", + "page": 1, + "pageSize": self.data_chunk_size, + "release_date": f"[{self.since} TO {self.until}]", + "type": "study", + } + + if self.ids: + # TODO(ark): figure out better way of array filtering. 
+ # Also make sure it's equivalent to the array filtering in this query + # https://github.com/AlexsLemonade/accession_retrieval/blob/master/experiment_accession_retrieval.R#L208 + query_dict.update({"content": ", ".join(self.ids)}) + elif self.keyword: + query_dict.update({"content": self.keyword}) + elif self.organism: + query_dict.update({"organism": f'"{self.organism}"'}) + + return query_dict + + def collect_data(self) -> Set[str]: + """Gets new accessions from EBI Biostudies API.""" + accessions = set() + + if self.ids: + message = ( + "Getting MicroArray ArrayExpress entries by " + f"ArrayExpress ID(s): {', '.join(self.ids)} for [{self.since} - {self.until}] " + "range." + ) + elif self.keyword: + message = ( + "Getting MicroArray ArrayExpress entries by " + f'"{self.keyword}" keyword for [{self.since} - {self.until}] range.' + ) + elif self.organism: + message = ( + "Getting MicroArray ArrayExpress entries by " + f'"{self.organism}" organism for [{self.since} - {self.until}] range.' + ) + else: + return accessions + + log.debug(message) + accessions.update(self.fetch_data()) + + return accessions + + def fetch_data(self) -> Set[str]: + """Retrieves accessions from API search endpoint.""" + + @retry(**self.retry_params) + def get_response(url, **kwargs): + """Gets response from an API endpoint.""" + return requests.get(url, **kwargs) + + accessions = set() + + is_done = False + params = self.build_query() + while not is_done: + range_start = (params["page"] - 1) * params["pageSize"] + 1 + range_end = (params["page"] - 1) * params["pageSize"] + self.data_chunk_size + log.debug(f"Processing entries {range_start} - {range_end}") + + response = get_response(self.data_url, params=params) + entries = response.json().get("hits") + if entries: + entries = ( + AccessionBacklogEntry.create_from_ma_ae_entry(entry) for entry in entries + ) + params["page"] += 1 + else: + is_done = True + + if self.previous_accessions: + entries = (entry for entry in entries if entry.code not in self.previous_accessions) + accessions.update(entries) + + # Quit after getting a sufficient amount of accessions. + if self.count and len(accessions) >= self.count: + is_done = True + + return accessions + + def get_ids(self) -> List[str]: + """Returns a combined list of passed ArrayExpress IDs.""" + ids = set() + + if self.options["ae_id"]: + ids.update(self.options["ae_id"]) + + if self.options["ae_ids_file"]: + with open(self.options["ae_ids_file"]) as ae_ids_file: + ids.update((ae_id.strip() for ae_id in ae_ids_file.readlines())) + + return sorted(ids) + + +class MicroArrayGEOAccessionAgent(AccessionAgentBase): + """ + MicroArray GEO accession gathering agent. The data is fetched from a local + SQLite GEO meta database. 
+ """ + + def __init__(self, options) -> None: + super().__init__(options) + + self.db_path = "data/microarray/GEOmetadb.sqlite" + self.ids = self.get_ids() + + def build_query(self) -> str: + """Returns a query for getting GEO accessions from the local SQLite meta DB.""" + tables = [ + f"SELECT *", + "FROM gse_gpl", + "JOIN gpl ON gse_gpl.gpl=gpl.gpl", + "JOIN gse ON gse.gse=gse_gpl.gse", + "GROUP BY gse_gpl.gse", + ] + + conditions = [ + f"HAVING gse.submission_date >= '{self.since}'", + f"gse.submission_date <= '{self.until}'", + ] + + if self.ids: + gpl_ids = (f"'{gpl_id}'" for gpl_id in self.ids) + conditions.append(f"gse_gpl.gpl IN ({', '.join(gpl_ids)})") + elif self.organism: + conditions.append(f"lower(organism)='{self.organism.lower()}'") + + return f"{' '.join(tables)} {' AND '.join(conditions)}" + + def collect_data(self) -> Set[str]: + """Gets new accessions from GEO database.""" + accessions = set() + + if self.ids: + message = ( + "Getting MicroArray GEO entries by GEO platform ID(s): " + f"{', '.join(self.ids)} for [{self.since} - {self.until}] range." + ) + elif self.keyword: + message = ( + f'Getting MicroArray GEO entries by "{self.keyword}" keyword ' + f"for [{self.since} - {self.until}] range." + ) + elif self.organism: + message = ( + f'Getting MicroArray GEO entries by "{self.organism}" organism ' + f"for [{self.since} - {self.until}] range." + ) + else: + return accessions + + log.debug(message) + accessions.update(self.fetch_data()) + + return accessions + + def fetch_data(self) -> Set[str]: + """Retrieves accessions from the GEO meta DB.""" + + def match_keyword(row): + """ + Returns True if `row` matches `self.keyword` based regex. + Otherwise returns False. + """ + return re_keyword.match(" ".join((str(c) for c in row if c))) + + accessions = set() + + if not os.path.exists(self.db_path): + log.error("GEO meta database doesn't exist.") + return accessions + + connection = sqlite3.connect(self.db_path) + connection.row_factory = sqlite3.Row + connection.text_factory = lambda b: b.decode(errors="ignore") + entries = connection.execute(self.build_query()).fetchall() + connection.close() + + if self.keyword: + re_keyword = re.compile(f".*{self.keyword}.*", re.IGNORECASE) # Keyword regex. + entries = filter(match_keyword, entries) + + entries = ({key.lower(): entry[key] for key in entry.keys()} for entry in entries) + entries = set((AccessionBacklogEntry.create_from_ma_geo_entry(entry) for entry in entries)) + + if self.previous_accessions: + entries = (entry for entry in entries if entry.code not in self.previous_accessions) + accessions.update(entries) + + return accessions + + def get_ids(self) -> List[str]: + """Returns a combined list of passed GEO platform IDs.""" + ids = set() + + if self.options["gpl_id"]: + ids.update(self.options["gpl_id"]) + + if self.options["gpl_ids_file"]: + with open(self.options["gpl_ids_file"]) as gpl_ids_file: + ids.update((gpl_id.strip() for gpl_id in gpl_ids_file.readlines())) + + return sorted(ids) + + +class RNASeqAccessionAgent(AccessionAgentBase): + """ + RNA-Seq accession gathering agent. The data is fetched from + The European Nucleotide Archive (ENA) Portal. + See https://www.ebi.ac.uk/ena/portal/api/ for more information about the API + endpoints. 
+ """ + + def __init__(self, options) -> None: + super().__init__(options) + + self.data_chunk_size = 10000 + self.data_url = "https://www.ebi.ac.uk/ena/portal/api/search" + self.ids = self.get_ids() + + def build_query(self, taxon_id: str = None) -> str: + """ + Returns a query to use for getting specific taxon ID accessions. + Some special characters must remain unquoted. + """ + + AND = " AND " + OR = " OR " + instrument_models = ( + "HiSeq X Five", + "HiSeq X Ten", + "Illumina Genome Analyzer II", + "Illumina Genome Analyzer IIx", + "Illumina Genome Analyzer", + "Illumina HiScanSQ", + "Illumina HiSeq 1000", + "Illumina HiSeq 1500", + "Illumina HiSeq 2000", + "Illumina HiSeq 2500", + "Illumina HiSeq 3000", + "Illumina HiSeq 4000", + "Illumina MiSeq", + "Illumina NovaSeq 6000", + "Ion Torrent Proton", + "Ion Torrent S5 XL", + "Ion Torrent S5", + "NextSeq 500", + "NextSeq 550", + ) + + instrument_models = OR.join((f'instrument_model="{im}"' for im in instrument_models)) + conditions = [ + # Relevant date fields: collection_date, collection_date_submitted, + # first_public, last_updated. + f"first_public >= {self.since}", + f"first_public <= {self.until}", + f"({instrument_models})", + 'library_source="TRANSCRIPTOMIC"', + 'library_strategy="RNA-Seq"', + ] + + if taxon_id: + conditions.append(f"tax_eq({taxon_id})") + elif self.keyword: + search_fields = ( + "assembly_software", + "bio_material", + "center_name", + "collected_by", + "experiment_title", + "host_body_site", + "instrument_model", + "instrument_platform", + "library_name", + "project_name", + "sample_title", + "sequencing_method", + "study_title", + ) + search_fields = OR.join( + (f'{sf}="*{self.keyword}*"' for sf in search_fields) + ) # Keyword regex. + conditions.append(f"({search_fields})") + elif self.organism: + # `host`: Natural (as opposed to laboratory) host to the organism from which sample + # was obtained. + # `host_scientific_name`: Scientific name of the natural (as opposed to laboratory) + # host to the organism from which sample was obtained. + # `scientific_name` Scientific name of the organism from which the sample was derived. + # Neither `host_scientific_name` nor `scientific_name` available for search. + # https://www.ebi.ac.uk/ena/portal/api/searchFields?dataPortal=ena&format=json&result=read_study + conditions.append(f'host="{self.organism}"') + + return quote(AND.join(conditions), safe='*()-="<>/ ') # Must remain unquoted. + + def collect_data(self) -> Set[str]: + """Gets new accessions from EBI ENA API.""" + accessions = set() + + if self.ids: + log.debug( + f"Getting RNA-Seq entries by taxon ID(s): " + f"{', '.join((str(idx) for idx in self.ids))} for [{self.since} - {self.until}] range." + ) + total = len(self.ids) + for idx, taxon_id in enumerate(self.ids): + if self.count and len(accessions) >= self.count: + break + + if total > 1: + log.debug(f"Getting entries for taxon ID {taxon_id}, {idx + 1} of {total}.") + accessions.update(self.fetch_data(taxon_id=taxon_id)) + elif self.keyword: + log.debug( + f'Getting RNA-Seq entries by "{self.keyword}" keyword ' + f"for [{self.since} - {self.until}] range." + ) + accessions.update(self.fetch_data()) + elif self.organism: + log.debug( + f'Getting entries by "{self.organism}" organism ' + f"for [{self.since} - {self.until}] range." + ) + accessions.update(self.fetch_data()) + + return accessions + + def fetch_data(self, taxon_id=None) -> Set[str]: + """ + Retrieves accessions from API search endpoint. 
+ The API allows to set limit to 0 (get all in one request) but we do + it in a paginated fashion with `self.data_chunk_size` as a page size. + """ + + @retry(**self.retry_params) + def get_response(url, **kwargs): + """Gets response from an API endpoint.""" + return requests.post(url, **kwargs) + + accessions = set() + + fields = [ + "first_public", + "scientific_name", + "secondary_study_accession", + ] # For DRP/ERP/SRP-prefixed accessions. + data = { + "dataPortal": "ena", + # TODO(ark): add excludeAccessions/excludeAccessionType support. + "fields": ",".join(fields), # Use "all" to get all fields. + "format": "json", + "limit": self.data_chunk_size, + "offset": 0, + "query": self.build_query(taxon_id=taxon_id), + "result": "read_study", + "sortFields": fields, + } + + is_done = False + while not is_done: + log.debug( + f"Processing entries {data['offset'] + 1} - {data['offset'] + self.data_chunk_size}" + ) + entries = () + try: + response = get_response(self.data_url, data=data) + entries = response.json() + # TODO(ark): add `organism` when -o, --organism flag is used. + entries = ( + AccessionBacklogEntry.create_from_rnaseq_entry(entry) for entry in entries + ) + except JSONDecodeError: + is_done = True + except TypeError: + log.error(f"Couldn't get data from {self.data_url}. Response: {entries}") + data["offset"] += self.data_chunk_size + + if self.previous_accessions: + entries = (entry for entry in entries if entry.code not in self.previous_accessions) + accessions.update(entries) + + # Quit after getting a sufficient amount of accessions. + if self.count and len(accessions) >= self.count: + is_done = True + + return accessions + + def get_ids(self) -> List[str]: + """Returns a combined list of passed taxon IDs.""" + ids = set() + + if self.options["taxon_id"]: + ids.update(self.options["taxon_id"]) + + if self.options["taxon_ids_file"]: + with open(self.options["taxon_ids_file"]) as taxon_id_file: + ids.update((taxon_id.strip() for taxon_id in taxon_id_file.readlines())) + + return sorted(ids) diff --git a/foreman/data_refinery_foreman/settings.py b/foreman/data_refinery_foreman/settings.py index 7a489facc..5fea76d71 100644 --- a/foreman/data_refinery_foreman/settings.py +++ b/foreman/data_refinery_foreman/settings.py @@ -47,6 +47,7 @@ "data_refinery_common", "data_refinery_foreman.surveyor", "data_refinery_foreman.foreman", + "data_refinery_foreman.gatherer", "raven.contrib.django.raven_compat", "computedfields", ] @@ -108,10 +109,18 @@ # https://docs.djangoproject.com/en/1.10/ref/settings/#auth-password-validators AUTH_PASSWORD_VALIDATORS = [ - {"NAME": "django.contrib.auth.password_validation.UserAttributeSimilarityValidator",}, - {"NAME": "django.contrib.auth.password_validation.MinimumLengthValidator",}, - {"NAME": "django.contrib.auth.password_validation.CommonPasswordValidator",}, - {"NAME": "django.contrib.auth.password_validation.NumericPasswordValidator",}, + { + "NAME": "django.contrib.auth.password_validation.UserAttributeSimilarityValidator", + }, + { + "NAME": "django.contrib.auth.password_validation.MinimumLengthValidator", + }, + { + "NAME": "django.contrib.auth.password_validation.CommonPasswordValidator", + }, + { + "NAME": "django.contrib.auth.password_validation.NumericPasswordValidator", + }, ] diff --git a/foreman/dockerfiles/Dockerfile.foreman b/foreman/dockerfiles/Dockerfile.foreman index 8c09c6888..929ef2476 100644 --- a/foreman/dockerfiles/Dockerfile.foreman +++ b/foreman/dockerfiles/Dockerfile.foreman @@ -8,6 +8,8 @@ RUN apt-get -y install apt-fast RUN 
apt-fast update -qq && \ apt-fast install -y \ + gcc \ + libpq-dev \ python3 \ python3-pip From 4c7f8049506257c22d07bb96c606dd2221c1a8f7 Mon Sep 17 00:00:00 2001 From: Arkadii Yakovets Date: Tue, 13 Sep 2022 17:12:41 -0700 Subject: [PATCH 14/24] Address review comments. --- .../migrations/0071_auto_20220901_1653.py | 44 -- ...klogentry.py => 0071_gatheredaccession.py} | 6 +- .../data_refinery_common/models/__init__.py | 2 +- .../{accession.py => gathered_accession.py} | 20 +- .../gatherer/agents/__init__.py | 0 .../gatherer/agents/base.py | 79 +++ .../gatherer/agents/microarray_ae.py | 126 ++++ .../gatherer/agents/microarray_geo.py | 123 ++++ .../gatherer/agents/rna_seq.py | 204 ++++++ .../management/commands/gather_accessions.py | 643 +++--------------- 10 files changed, 626 insertions(+), 621 deletions(-) delete mode 100644 common/data_refinery_common/migrations/0071_auto_20220901_1653.py rename common/data_refinery_common/migrations/{0071_accessionbacklogentry.py => 0071_gatheredaccession.py} (88%) rename common/data_refinery_common/models/{accession.py => gathered_accession.py} (84%) create mode 100644 foreman/data_refinery_foreman/gatherer/agents/__init__.py create mode 100644 foreman/data_refinery_foreman/gatherer/agents/base.py create mode 100644 foreman/data_refinery_foreman/gatherer/agents/microarray_ae.py create mode 100644 foreman/data_refinery_foreman/gatherer/agents/microarray_geo.py create mode 100644 foreman/data_refinery_foreman/gatherer/agents/rna_seq.py diff --git a/common/data_refinery_common/migrations/0071_auto_20220901_1653.py b/common/data_refinery_common/migrations/0071_auto_20220901_1653.py deleted file mode 100644 index c7d3b0b63..000000000 --- a/common/data_refinery_common/migrations/0071_auto_20220901_1653.py +++ /dev/null @@ -1,44 +0,0 @@ -# Generated by Django 3.2.7 on 2022-09-01 16:53 - -from django.db import migrations, models - - -class Migration(migrations.Migration): - - dependencies = [ - ("data_refinery_common", "0070_auto_20211208_2118"), - ] - - operations = [ - migrations.CreateModel( - name="Accession", - fields=[ - ( - "id", - models.AutoField( - auto_created=True, - primary_key=True, - serialize=False, - verbose_name="ID", - ), - ), - ("code", models.TextField()), - ("created_at", models.DateTimeField(auto_now_add=True)), - ("last_modified_at", models.DateTimeField(auto_now=True)), - ("organism", models.TextField()), - ("published_date", models.DateTimeField()), - ("sample_count", models.PositiveIntegerField(default=0)), - ("source", models.TextField()), - ("technology", models.TextField()), - ], - options={ - "db_table": "accessions", - }, - ), - migrations.AddConstraint( - model_name="accession", - constraint=models.UniqueConstraint( - fields=("code", "source", "technology"), name="unique_accession" - ), - ), - ] diff --git a/common/data_refinery_common/migrations/0071_accessionbacklogentry.py b/common/data_refinery_common/migrations/0071_gatheredaccession.py similarity index 88% rename from common/data_refinery_common/migrations/0071_accessionbacklogentry.py rename to common/data_refinery_common/migrations/0071_gatheredaccession.py index 86c04daed..a1740d96e 100644 --- a/common/data_refinery_common/migrations/0071_accessionbacklogentry.py +++ b/common/data_refinery_common/migrations/0071_gatheredaccession.py @@ -1,4 +1,4 @@ -# Generated by Django 3.2.7 on 2022-09-07 19:31 +# Generated by Django 3.2.7 on 2022-09-13 18:14 from django.db import migrations, models @@ -11,7 +11,7 @@ class Migration(migrations.Migration): operations = [ 
migrations.CreateModel( - name="AccessionBacklogEntry", + name="GatheredAccession", fields=[ ( "id", @@ -32,7 +32,7 @@ class Migration(migrations.Migration): ("technology", models.TextField()), ], options={ - "db_table": "accession_backlog", + "db_table": "gathered_accessions", }, ), ] diff --git a/common/data_refinery_common/models/__init__.py b/common/data_refinery_common/models/__init__.py index 8e9564153..2b544765d 100644 --- a/common/data_refinery_common/models/__init__.py +++ b/common/data_refinery_common/models/__init__.py @@ -1,4 +1,3 @@ -from data_refinery_common.models.accession import Accession # noqa from data_refinery_common.models.api_token import APIToken # noqa from data_refinery_common.models.associations.compendium_result_organism_association import ( # noqa CompendiumResultOrganismAssociation, @@ -46,6 +45,7 @@ from data_refinery_common.models.dataset_annotation import DatasetAnnotation # noqa from data_refinery_common.models.experiment import Experiment # noqa from data_refinery_common.models.experiment_annotation import ExperimentAnnotation # noqa +from data_refinery_common.models.gathered_accession import GatheredAccession # noqa from data_refinery_common.models.jobs.downloader_job import DownloaderJob # noqa from data_refinery_common.models.jobs.processor_job import ProcessorJob # noqa from data_refinery_common.models.jobs.survey_job import SurveyJob # noqa diff --git a/common/data_refinery_common/models/accession.py b/common/data_refinery_common/models/gathered_accession.py similarity index 84% rename from common/data_refinery_common/models/accession.py rename to common/data_refinery_common/models/gathered_accession.py index 6ac62da9f..04b084533 100644 --- a/common/data_refinery_common/models/accession.py +++ b/common/data_refinery_common/models/gathered_accession.py @@ -4,11 +4,11 @@ from django.utils import timezone -class AccessionBacklogEntry(models.Model): - """Accession backlog entry model.""" +class GatheredAccession(models.Model): + """Gathered accession model.""" class Meta: - db_table = "accession_backlog" + db_table = "gathered_accessions" code = models.TextField(unique=True) created_at = models.DateTimeField(auto_now_add=True) @@ -21,7 +21,7 @@ class Meta: def __eq__(self, other: object) -> bool: """Returns True if two objects are equal. 
Otherwise returns False.""" - return isinstance(other, AccessionBacklogEntry) and self.code == other.code + return isinstance(other, GatheredAccession) and self.code == other.code def __hash__(self) -> int: """Returns accession object unique hash value.""" @@ -32,15 +32,15 @@ def __str__(self) -> str: return ", ".join((self.code, self.technology, self.source, str(self.published_date.date()))) @staticmethod - def create_from_ma_ae_entry(entry): + def create_from_ma_ae_entry(entry, organism=None): """Creates accession object from MicroArray ArrayExpress entry.""" - accession = AccessionBacklogEntry() + accession = GatheredAccession() accession.code = entry["accession"] accession.source = "ebi_biostudies" accession.technology = "microarray" - if "organism" in entry: - accession.organism = entry["organism"] + if organism: + accession.organism = organism if "release_date" in entry: accession.published_date = timezone.make_aware( datetime.strptime(entry["release_date"], "%Y-%m-%d") @@ -51,7 +51,7 @@ def create_from_ma_ae_entry(entry): @staticmethod def create_from_ma_geo_entry(entry): """Creates accession object from MicroArray GEO meta DB entry.""" - accession = AccessionBacklogEntry() + accession = GatheredAccession() accession.code = entry["gse"] accession.source = "geo_meta_db" accession.technology = "microarray" @@ -69,7 +69,7 @@ def create_from_ma_geo_entry(entry): @staticmethod def create_from_rnaseq_entry(entry): """Creates accession object from RNA-Seq entry.""" - accession = AccessionBacklogEntry() + accession = GatheredAccession() accession.code = entry["secondary_study_accession"] accession.source = "ebi_ena_portal" accession.technology = "rna-seq" diff --git a/foreman/data_refinery_foreman/gatherer/agents/__init__.py b/foreman/data_refinery_foreman/gatherer/agents/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/foreman/data_refinery_foreman/gatherer/agents/base.py b/foreman/data_refinery_foreman/gatherer/agents/base.py new file mode 100644 index 000000000..3754a4068 --- /dev/null +++ b/foreman/data_refinery_foreman/gatherer/agents/base.py @@ -0,0 +1,79 @@ +"""Abstract base class for accession gathering automation agents.""" + +from abc import ABC, abstractmethod +from datetime import datetime +from http.client import RemoteDisconnected + +from requests.exceptions import ConnectionError, ConnectTimeout +from urllib3.exceptions import ProtocolError + +from data_refinery_common.logging import get_and_configure_logger +from data_refinery_common.models.experiment import Experiment +from data_refinery_common.models.gathered_accession import GatheredAccession + +logger = get_and_configure_logger(__name__) + + +class AccessionAgentBase(ABC): + "Accession agent abstract base class." + + previous_accessions = set() + retry_params = { + "retry_on_exception": lambda e: isinstance( + e, (ConnectionError, ConnectTimeout, ProtocolError, RemoteDisconnected) + ), + "stop_max_attempt_number": 5, + "wait_exponential_multiplier": 1000, # Seconds. + "wait_exponential_max": 16000, # Seconds. 
+ } + + def __init__(self, options) -> None: + """Populates args and values for major variables.""" + self.options = options + self.count = options["count"] + self.keyword = options["keyword"] + self.organism = options["organism"] + self.since = options["since"] + self.until = options["until"] or datetime.now().strftime("%Y-%m-%d") + + self.ids = self.get_ids() + self.populate_previous_accessions() + + @abstractmethod + def build_query(self): + """Returns query/query dict depending on the accession data source.""" + pass + + @abstractmethod + def collect_data(self): + """Generates resulting entry collection.""" + pass + + @abstractmethod + def fetch_data(self): + """Fetches data from an external or local data source.""" + pass + + @abstractmethod + def get_ids(self): + """Gets IDs for query filtering depending on the accession technology.""" + pass + + def populate_previous_accessions(self) -> None: + """Populates previous accession set from a provided excluded ids file.""" + if not self.options["exclude_previous"] or self.previous_accessions: + return + + # Gathered accessions. + self.previous_accessions.update( + (entry["code"] for entry in GatheredAccession.objects.values("code")) + ) + + # Surveyed accessions. + experiments = Experiment.objects.values("accession_code", "alternate_accession_code") + self.previous_accessions.update( + (experiment["accession_code"] for experiment in experiments) + ) + self.previous_accessions.update( + (experiment["alternate_accession_code"] for experiment in experiments) + ) diff --git a/foreman/data_refinery_foreman/gatherer/agents/microarray_ae.py b/foreman/data_refinery_foreman/gatherer/agents/microarray_ae.py new file mode 100644 index 000000000..b5314302b --- /dev/null +++ b/foreman/data_refinery_foreman/gatherer/agents/microarray_ae.py @@ -0,0 +1,126 @@ +"""MicroArray ArrayExpress accession gathering automation. +Data source: https://www.ebi.ac.uk/biostudies/help""" + +from typing import List, Set + +import requests +from retrying import retry + +from data_refinery_common.logging import get_and_configure_logger +from data_refinery_common.models.gathered_accession import GatheredAccession +from data_refinery_foreman.gatherer.agents.base import AccessionAgentBase + +logger = get_and_configure_logger(__name__) + + +class MicroArrayExpressAccessionAgent(AccessionAgentBase): + """ + MicroArray ArrayExpress accession gathering agent. The data is fetched from + the BioStudies database. See https://www.ebi.ac.uk/biostudies/help and + https://www.ebi.ac.uk/biostudies/arrayexpress/help#programmatic for more + information about the API endpoints. + """ + + DATA_CHUNK_SIZE = 100 + DATA_URL = "https://www.ebi.ac.uk/biostudies/api/v1/search" + + def build_query(self) -> dict: + """Returns a query dict for getting array/organism specific accessions.""" + query_dict = { + "directsub": "true", + "page": 1, + "pageSize": self.DATA_CHUNK_SIZE, + "release_date": f"[{self.since} TO {self.until}]", + "type": "study", + } + + if self.ids: + # TODO(ark): figure out better way of array filtering. 
+ # Also make sure it's equivalent to the array filtering in this query + # https://github.com/AlexsLemonade/accession_retrieval/blob/master/experiment_accession_retrieval.R#L208 + query_dict.update({"content": ", ".join(self.ids)}) + elif self.keyword: + query_dict.update({"content": self.keyword}) + elif self.organism: + query_dict.update({"organism": f'"{self.organism}"'}) + + return query_dict + + def collect_data(self) -> Set[str]: + """Gets new accessions from EBI Biostudies API.""" + accessions = set() + + if self.ids: + message = ( + "Getting MicroArray ArrayExpress entries by " + f"ArrayExpress ID(s): {', '.join(self.ids)} for [{self.since} - {self.until}] " + "range." + ) + elif self.keyword: + message = ( + "Getting MicroArray ArrayExpress entries by " + f'"{self.keyword}" keyword for [{self.since} - {self.until}] range.' + ) + elif self.organism: + message = ( + "Getting MicroArray ArrayExpress entries by " + f'"{self.organism}" organism for [{self.since} - {self.until}] range.' + ) + else: + return accessions + + logger.debug(message) + accessions.update(self.fetch_data()) + + return accessions + + def fetch_data(self) -> Set[str]: + """Retrieves accessions from API search endpoint.""" + + @retry(**self.retry_params) + def get_response(url, **kwargs): + """Gets response from an API endpoint.""" + return requests.get(url, **kwargs) + + accessions = set() + + is_done = False + params = self.build_query() + while not is_done: + range_start = (params["page"] - 1) * params["pageSize"] + 1 + range_end = (params["page"] - 1) * params["pageSize"] + self.DATA_CHUNK_SIZE + logger.debug(f"Processing entries {range_start} - {range_end}") + + response = get_response(self.DATA_URL, params=params) + entries = response.json().get("hits") + if entries: + entries = ( + GatheredAccession.create_from_ma_ae_entry(entry, organism=self.organism) + for entry in entries + ) + params["page"] += 1 + else: + is_done = True + + if self.previous_accessions: + entries = (entry for entry in entries if entry.code not in self.previous_accessions) + accessions.update(entries) + + # Quit after getting a sufficient amount of accessions. + if self.count and len(accessions) >= self.count: + is_done = True + + return accessions + + def get_ids(self) -> List[str]: + """Returns a combined list of passed ArrayExpress IDs.""" + ids = set() + + if self.options["ae_id"]: + ids.update(self.options["ae_id"]) + + if self.options["ae_ids_file"]: + with open(self.options["ae_ids_file"]) as ae_ids_file: + ids.update((ae_id.strip() for ae_id in ae_ids_file.readlines())) + + return sorted(ids) diff --git a/foreman/data_refinery_foreman/gatherer/agents/microarray_geo.py b/foreman/data_refinery_foreman/gatherer/agents/microarray_geo.py new file mode 100644 index 000000000..975c715b3 --- /dev/null +++ b/foreman/data_refinery_foreman/gatherer/agents/microarray_geo.py @@ -0,0 +1,123 @@ +"""MicroArray GEO accession gathering automation. +Data source: local SQLite meta DB from https://www.bioconductor.org/packages/release/bioc/html/GEOmetadb.html""" + +import os +import re +import sqlite3 +from typing import List, Set + +from data_refinery_common.logging import get_and_configure_logger +from data_refinery_common.models.gathered_accession import GatheredAccession +from data_refinery_foreman.gatherer.agents.base import AccessionAgentBase + +logger = get_and_configure_logger(__name__) + + +class MicroArrayGEOAccessionAgent(AccessionAgentBase): + """ + MicroArray GEO accession gathering agent. 
The data is fetched from a local + SQLite GEO meta database. + """ + + # TODO(ark): move the DB file from Docker image to S3. + # Implement syncing procedure. + # Update URL once the original file is available again. + DB_PATH = "data/microarray/GEOmetadb.sqlite" + + def build_query(self) -> str: + """Returns a query for getting GEO accessions from the local SQLite meta DB.""" + tables = [ + "SELECT *", + "FROM gse_gpl", + "JOIN gpl ON gse_gpl.gpl=gpl.gpl", + "JOIN gse ON gse.gse=gse_gpl.gse", + "GROUP BY gse_gpl.gse", + ] + + conditions = [ + f"HAVING gse.submission_date >= '{self.since}'", + f"gse.submission_date <= '{self.until}'", + ] + + if self.ids: + gpl_ids = (f"'{gpl_id}'" for gpl_id in self.ids) + conditions.append(f"gse_gpl.gpl IN ({', '.join(gpl_ids)})") + elif self.organism: + conditions.append(f"lower(organism)='{self.organism.lower()}'") + + return f"{' '.join(tables)} {' AND '.join(conditions)}" + + def collect_data(self) -> Set[str]: + """Gets new accessions from GEO database.""" + accessions = set() + + if self.ids: + message = ( + "Getting MicroArray GEO entries by GEO platform ID(s): " + f"{', '.join(self.ids)} for [{self.since} - {self.until}] range." + ) + elif self.keyword: + message = ( + f'Getting MicroArray GEO entries by "{self.keyword}" keyword ' + f"for [{self.since} - {self.until}] range." + ) + elif self.organism: + message = ( + f'Getting MicroArray GEO entries by "{self.organism}" organism ' + f"for [{self.since} - {self.until}] range." + ) + else: + return accessions + + logger.debug(message) + accessions.update(self.fetch_data()) + + return accessions + + def fetch_data(self) -> Set[str]: + """Retrieves accessions from the GEO meta DB.""" + + def match_keyword(row): + """ + Returns True if `row` matches `self.keyword` based regex. + Otherwise returns False. + """ + return re_keyword.match(" ".join((str(c) for c in row if c))) + + accessions = set() + + if not os.path.exists(self.DB_PATH): + logger.error("GEO meta database doesn't exist.") + return accessions + + connection = sqlite3.connect(self.DB_PATH) + connection.row_factory = sqlite3.Row + connection.text_factory = lambda b: b.decode(errors="ignore") + entries = connection.execute(self.build_query()).fetchall() + connection.close() + + if self.keyword: + re_keyword = re.compile(f".*{self.keyword}.*", re.IGNORECASE) # Keyword regex. + entries = filter(match_keyword, entries) + + entries = ({key.lower(): entry[key] for key in entry.keys()} for entry in entries) + entries = set((GatheredAccession.create_from_ma_geo_entry(entry) for entry in entries)) + + if self.previous_accessions: + entries = (entry for entry in entries if entry.code not in self.previous_accessions) + accessions.update(entries) + + return accessions + + def get_ids(self) -> List[str]: + """Returns a combined list of passed GEO platform IDs.""" + ids = set() + + if self.options["gpl_id"]: + ids.update(self.options["gpl_id"]) + + if self.options["gpl_ids_file"]: + with open(self.options["gpl_ids_file"]) as gpl_ids_file: + ids.update((gpl_id.strip() for gpl_id in gpl_ids_file.readlines())) + + return sorted(ids) diff --git a/foreman/data_refinery_foreman/gatherer/agents/rna_seq.py b/foreman/data_refinery_foreman/gatherer/agents/rna_seq.py new file mode 100644 index 000000000..f9497f3ba --- /dev/null +++ b/foreman/data_refinery_foreman/gatherer/agents/rna_seq.py @@ -0,0 +1,204 @@ +"""RNA-Seq accession gathering automation. 
+Data source: https://www.ebi.ac.uk/ena/portal/api/""" + +from json.decoder import JSONDecodeError +from typing import List, Set +from urllib.parse import quote + +import requests +from retrying import retry + +from data_refinery_common.logging import get_and_configure_logger +from data_refinery_common.models.gathered_accession import GatheredAccession +from data_refinery_foreman.gatherer.agents.base import AccessionAgentBase + +logger = get_and_configure_logger(__name__) + + +class RNASeqAccessionAgent(AccessionAgentBase): + """ + RNA-Seq accession gathering agent. The data is fetched from + The European Nucleotide Archive (ENA) Portal. + See https://www.ebi.ac.uk/ena/portal/api/ for more information about the API + endpoints. + """ + + DATA_CHUNK_SIZE = 10000 + DATA_URL = "https://www.ebi.ac.uk/ena/portal/api/search" + + def build_query(self, taxon_id: str = None) -> str: + """ + Returns a query to use for getting specific taxon ID accessions. + Some special characters must remain unquoted. + """ + + AND = " AND " + OR = " OR " + instrument_models = ( + "HiSeq X Five", + "HiSeq X Ten", + "Illumina Genome Analyzer II", + "Illumina Genome Analyzer IIx", + "Illumina Genome Analyzer", + "Illumina HiScanSQ", + "Illumina HiSeq 1000", + "Illumina HiSeq 1500", + "Illumina HiSeq 2000", + "Illumina HiSeq 2500", + "Illumina HiSeq 3000", + "Illumina HiSeq 4000", + "Illumina MiSeq", + "Illumina NovaSeq 6000", + "Ion Torrent Proton", + "Ion Torrent S5 XL", + "Ion Torrent S5", + "NextSeq 500", + "NextSeq 550", + ) + + instrument_models = OR.join((f'instrument_model="{im}"' for im in instrument_models)) + conditions = [ + # Relevant date fields: collection_date, collection_date_submitted, + # first_public, last_updated. + f"first_public >= {self.since}", + f"first_public <= {self.until}", + f"({instrument_models})", + 'library_source="TRANSCRIPTOMIC"', + 'library_strategy="RNA-Seq"', + ] + + if taxon_id: + conditions.append(f"tax_eq({taxon_id})") + elif self.keyword: + search_fields = ( + "assembly_software", + "bio_material", + "center_name", + "collected_by", + "experiment_title", + "host_body_site", + "instrument_model", + "instrument_platform", + "library_name", + "project_name", + "sample_title", + "sequencing_method", + "study_title", + ) + search_fields = OR.join( + (f'{sf}="*{self.keyword}*"' for sf in search_fields) + ) # Keyword regex. + conditions.append(f"({search_fields})") + elif self.organism: + # `host`: Natural (as opposed to laboratory) host to the organism from which sample + # was obtained. + # `host_scientific_name`: Scientific name of the natural (as opposed to laboratory) + # host to the organism from which sample was obtained. + # `scientific_name` Scientific name of the organism from which the sample was derived. + # Neither `host_scientific_name` nor `scientific_name` available for search. + # https://www.ebi.ac.uk/ena/portal/api/searchFields?dataPortal=ena&format=json&result=read_study + conditions.append(f'host="{self.organism}"') + + return quote(AND.join(conditions), safe='*()-="<>/ ') # Must remain unquoted. + + def collect_data(self) -> Set[str]: + """Gets new accessions from EBI ENA API.""" + accessions = set() + + if self.ids: + logger.debug( + f"Getting RNA-Seq entries by taxon ID(s): " + f"{', '.join((str(i) for i in self.ids))} for [{self.since} - {self.until}] range." 
+ ) + total = len(self.ids) + for idx, taxon_id in enumerate(self.ids): + if self.count and len(accessions) >= self.count: + break + + if total > 1: + logger.debug(f"Getting entries for taxon ID {taxon_id}, {idx + 1} of {total}.") + accessions.update(self.fetch_data(taxon_id=taxon_id)) + elif self.keyword: + logger.debug( + f'Getting RNA-Seq entries by "{self.keyword}" keyword ' + f"for [{self.since} - {self.until}] range." + ) + accessions.update(self.fetch_data()) + elif self.organism: + logger.debug( + f'Getting entries by "{self.organism}" organism ' + f"for [{self.since} - {self.until}] range." + ) + accessions.update(self.fetch_data()) + + return accessions + + def fetch_data(self, taxon_id=None) -> Set[str]: + """ + Retrieves accessions from API search endpoint. + The API allows to set limit to 0 (get all in one request) but we do + it in a paginated fashion with `self.DATA_CHUNK_SIZE` as a page size. + """ + + @retry(**self.retry_params) + def get_response(url, **kwargs): + """Gets response from an API endpoint.""" + return requests.post(url, **kwargs) + + accessions = set() + + fields = [ + "first_public", + "scientific_name", + "secondary_study_accession", + ] # For DRP/ERP/SRP-prefixed accessions. + data = { + "dataPortal": "ena", + # TODO(ark): add excludeAccessions/excludeAccessionType support. + "fields": ",".join(fields), # Use "all" to get all fields. + "format": "json", + "limit": self.DATA_CHUNK_SIZE, + "offset": 0, + "query": self.build_query(taxon_id=taxon_id), + "result": "read_study", + "sortFields": fields, + } + + is_done = False + while not is_done: + logger.debug( + f"Processing entries {data['offset'] + 1} - {data['offset'] + self.DATA_CHUNK_SIZE}" + ) + entries = () + try: + response = get_response(self.DATA_URL, data=data) + entries = response.json() + entries = (GatheredAccession.create_from_rnaseq_entry(entry) for entry in entries) + except JSONDecodeError: + is_done = True + except TypeError: + logger.error(f"Couldn't get data from {self.data_url}. Response: {entries}") + data["offset"] += self.DATA_CHUNK_SIZE + + if self.previous_accessions: + entries = (entry for entry in entries if entry.code not in self.previous_accessions) + accessions.update(entries) + + # Quit after getting a sufficient amount of accessions. 
+ if self.count and len(accessions) >= self.count: + is_done = True + + return accessions + + def get_ids(self) -> List[str]: + """Returns a combined list of passed taxon IDs.""" + ids = set() + + if self.options["taxon_id"]: + ids.update(self.options["taxon_id"]) + + if self.options["taxon_ids_file"]: + with open(self.options["taxon_ids_file"]) as taxon_id_file: + ids.update((taxon_id.strip() for taxon_id in taxon_id_file.readlines())) + + return sorted(ids) diff --git a/foreman/data_refinery_foreman/gatherer/management/commands/gather_accessions.py b/foreman/data_refinery_foreman/gatherer/management/commands/gather_accessions.py index c4808a191..445245d3a 100644 --- a/foreman/data_refinery_foreman/gatherer/management/commands/gather_accessions.py +++ b/foreman/data_refinery_foreman/gatherer/management/commands/gather_accessions.py @@ -8,32 +8,27 @@ import argparse import logging -import os import re -import sqlite3 -from datetime import datetime -from http.client import RemoteDisconnected -from json.decoder import JSONDecodeError -from typing import List, Set -from urllib.parse import quote from django.core.management.base import BaseCommand -import requests -from requests.exceptions import ConnectionError, ConnectTimeout -from retrying import retry -from urllib3.exceptions import ProtocolError - from data_refinery_common.logging import get_and_configure_logger -from data_refinery_common.models.accession import AccessionBacklogEntry -from data_refinery_common.models.experiment import Experiment +from data_refinery_common.models.gathered_accession import GatheredAccession +from data_refinery_foreman.gatherer.agents.microarray_ae import MicroArrayExpressAccessionAgent +from data_refinery_foreman.gatherer.agents.microarray_geo import MicroArrayGEOAccessionAgent +from data_refinery_foreman.gatherer.agents.rna_seq import RNASeqAccessionAgent -log = get_and_configure_logger(__name__) +logger = get_and_configure_logger(__name__) class Command(BaseCommand): """Creates agents and runs actual accession gathering.""" + DATA_SOURCE_MA_AE = "microarray-ae" + DATA_SOURCE_MA_GEO = "microarray-geo" + DATA_SOURCE_RNA_SEQ = "rna-seq" + DATA_SOURCES = (DATA_SOURCE_MA_AE, DATA_SOURCE_MA_GEO, DATA_SOURCE_RNA_SEQ) + RE_ACCESSION = re.compile(r"(\D+)(\d+)") RE_DATE = re.compile(r"\d{4}-\d{2}-\d{2}") @@ -76,14 +71,6 @@ def add_arguments(self, parser) -> None: default=True, help="Exclude previously gathered or surveyed accessions.", ) - parser.add_argument( - "-ne", - "--no-exclude-previous", - action="store_false", - default=False, - dest="exclude_previous", - help="Do not exclude previously gathered or surveyed accessions.", - ) parser.add_argument( "--gpl-id", action="extend", @@ -103,21 +90,22 @@ def add_arguments(self, parser) -> None: help="Keyword to use for filtering.", ) parser.add_argument( - "-m", - "--microarray", + "-lv", + "--log-verbose", action="store_true", default=False, - help="Collect MicroArray accessions.", + help="Enable verbose log output.", ) parser.add_argument( - "-o", "--organism", type=str, help="Organism name to use for filtering." + "-ne", + "--no-exclude-previous", + action="store_false", + default=False, + dest="exclude_previous", + help="Do not exclude previously gathered or surveyed accessions.", ) parser.add_argument( - "-r", - "--rna-seq", - action="store_true", - default=False, - help="Collect RNA-Seq accessions.", + "-o", "--organism", type=str, help="Organism name to use for filtering." 
) parser.add_argument( "-s", @@ -126,6 +114,14 @@ def add_arguments(self, parser) -> None: required=True, help="Collect accessions made public on or after this date.", ) + parser.add_argument( + "-src", + "--source", + type=str, + action="extend", + nargs="+", + help="Gather accessions from selected sources.", + ) parser.add_argument( "--taxon-id", action="extend", @@ -144,28 +140,19 @@ def add_arguments(self, parser) -> None: type=str, help="Collect accessions made public before or on this date.", ) - parser.add_argument( - "-lv", - "--log-verbose", - action="store_true", - default=False, - help="Enable verbose log output.", - ) def set_verbosity_level(self, options) -> None: """Configures log verbosity level.""" if options["log_verbose"]: - log.addHandler(logging.StreamHandler()) - log.setLevel(logging.DEBUG) + logger.addHandler(logging.StreamHandler()) + logger.setLevel(logging.DEBUG) else: - log.setLevel(logging.ERROR) + logger.setLevel(logging.ERROR) def validate_args(self, options) -> None: """Validates arguments.""" - if not options["microarray"] and not options["rna_seq"]: - exit("Either --microarray or --rna-seq must be specified.") - errors = list() + since = options["since"] until = options["until"] if not self.RE_DATE.match(since): @@ -177,52 +164,65 @@ def validate_args(self, options) -> None: keyword = options["keyword"] organism = options["organism"] - if options["microarray"]: - ae_id = options["ae_id"] or options["ae_ids_file"] - gpl_id = options["gpl_id"] or options["gpl_ids_file"] - ids = ae_id or gpl_id - invalid_options_message = ( - "Exactly one of the keyword [-k, --keyword], organism [-o, --organism] or " - "ArrayExpress ID(s) [--ae-id, --ae-ids-file] / GEO platform ID(s) " - "[--gpl-id, --gpl-ids-file] must be specified." - ) - elif options["rna_seq"]: - taxon_id = options["taxon_id"] or options["taxon_ids_file"] - ids = taxon_id - invalid_options_message = ( - "Exactly one of the keyword [-k, --keyword], organism [-o, --organism] " - "or taxon ID(s) [--taxon-id, --taxon-ids-file] must be specified." + sources = options["source"] or self.DATA_SOURCES + + for source in sources: + if source in self.DATA_SOURCES: + continue + errors.append( + f"Unknown source: {source}. Supported sources: {', '.join(self.DATA_SOURCES)}" ) - if len([option for option in (ids, keyword, organism) if option]) != 1: - errors.append(invalid_options_message) + if self.DATA_SOURCE_MA_AE in sources: + ids = options["ae_id"] or options["ae_ids_file"] + if not (ids or keyword or organism): + errors.append( + ( + "Exactly one of the keyword [-k, --keyword], organism [-o, --organism] or " + "ArrayExpress ID(s) [--ae-id, --ae-ids-file] must be specified for " + f"'{self.DATA_SOURCE_MA_AE}' source." + ) + ) + if self.DATA_SOURCE_MA_GEO in sources: + ids = options["gpl_id"] or options["gpl_ids_file"] + if not (ids or keyword or organism): + errors.append( + ( + "Exactly one of the keyword [-k, --keyword], organism [-o, --organism] or " + "GEO platform ID(s) [--gpl-id, --gpl-ids-file] must be specified for " + f"'{self.DATA_SOURCE_MA_GEO}' source." + ) + ) + if self.DATA_SOURCE_RNA_SEQ in sources: + ids = options["taxon_id"] or options["taxon_ids_file"] + if not (ids or keyword or organism): + errors.append( + ( + "Exactly one of the keyword [-k, --keyword], organism [-o, --organism] " + "or taxon ID(s) [--taxon-id, --taxon-ids-file] must be specified for " + f"'{self.DATA_SOURCE_RNA_SEQ}' source." 
+ ) + ) if errors: exit("\n".join(errors)) def handle(self, *args, **options): - """Runs the accession gathering process.""" + """Creates agents and runs the accession gathering process.""" self.validate_args(options) self.set_verbosity_level(options) agents = list() - if options["rna_seq"]: + sources = options["source"] or self.DATA_SOURCES + + if self.DATA_SOURCE_RNA_SEQ in sources: agents.append(RNASeqAccessionAgent(options)) - elif options["microarray"]: - if ( - options["ae_id"] - or options["ae_ids_file"] - or options["keyword"] - or options["organism"] - ): - agents.append(MicroArrayExpressAccessionAgent(options)) - if ( - options["gpl_id"] - or options["gpl_ids_file"] - or options["keyword"] - or options["organism"] - ): - agents.append(MicroArrayGEOAccessionAgent(options)) + + if self.DATA_SOURCE_MA_AE in sources: + agents.append(MicroArrayExpressAccessionAgent(options)) + + if self.DATA_SOURCE_MA_GEO in sources: + agents.append(MicroArrayGEOAccessionAgent(options)) entries = set() for agent in agents: @@ -245,487 +245,4 @@ def handle(self, *args, **options): output = "No accessions found." print(output) else: - AccessionBacklogEntry.objects.bulk_create(entries) - - -class AccessionAgentBase: - "Accession agent base class." - - previous_accessions = set() - retry_params = { - "retry_on_exception": lambda e: isinstance( - e, (ConnectionError, ConnectTimeout, ProtocolError, RemoteDisconnected) - ), - "stop_max_attempt_number": 5, - "wait_exponential_multiplier": 1000, # Seconds. - "wait_exponential_max": 16000, # Seconds. - } - - def __init__(self, options) -> None: - """Populates args and values for major variables.""" - self.options = options - self.count = options["count"] - self.keyword = options["keyword"] - self.organism = options["organism"] - self.since = options["since"] - self.until = options["until"] or datetime.now().strftime("%Y-%m-%d") - - self.populate_previous_accessions() - - def build_query(self): - """Returns query/query dict depending on the accession data source.""" - raise NotImplementedError - - def collect_data(self): - """Generates resulting entry collection.""" - raise NotImplementedError - - def fetch_data(self): - """Fetches data from an external or local data source.""" - raise NotImplementedError - - def get_ids(self): - """Gets IDs for query filtering depending on the accession technology.""" - raise NotImplementedError - - def populate_previous_accessions(self) -> None: - """Populates previous accession set from a provided excluded ids file.""" - if not self.options["exclude_previous"] or self.previous_accessions: - return - - # Gathered accessions. - self.previous_accessions.update( - (entry["code"] for entry in AccessionBacklogEntry.objects.values("code")) - ) - - # Surveyed accessions. - experiments = Experiment.objects.values("accession_code", "alternate_accession_code") - self.previous_accessions.update( - (experiment["accession_code"] for experiment in experiments) - ) - self.previous_accessions.update( - (experiment["alternate_accession_code"] for experiment in experiments) - ) - - -class MicroArrayExpressAccessionAgent(AccessionAgentBase): - """ - MicroArray ArrayExpress accession gathering agent. The data is fetched from - the BioStudies database. See https://www.ebi.ac.uk/biostudies/help and - https://www.ebi.ac.uk/biostudies/arrayexpress/help#programmatic for more - information about the API endpoints. 
- """ - - def __init__(self, options) -> None: - super().__init__(options) - - self.data_chunk_size = 100 - self.data_url = "https://www.ebi.ac.uk/biostudies/api/v1/search" - self.ids = self.get_ids() - - def build_query(self) -> dict: - """Returns a query dict for getting array/organism specific accessions.""" - query_dict = { - "directsub": "true", - "page": 1, - "pageSize": self.data_chunk_size, - "release_date": f"[{self.since} TO {self.until}]", - "type": "study", - } - - if self.ids: - # TODO(ark): figure out better way of array filtering. - # Also make sure it's equivalent to the array filtering in this query - # https://github.com/AlexsLemonade/accession_retrieval/blob/master/experiment_accession_retrieval.R#L208 - query_dict.update({"content": ", ".join(self.ids)}) - elif self.keyword: - query_dict.update({"content": self.keyword}) - elif self.organism: - query_dict.update({"organism": f'"{self.organism}"'}) - - return query_dict - - def collect_data(self) -> Set[str]: - """Gets new accessions from EBI Biostudies API.""" - accessions = set() - - if self.ids: - message = ( - "Getting MicroArray ArrayExpress entries by " - f"ArrayExpress ID(s): {', '.join(self.ids)} for [{self.since} - {self.until}] " - "range." - ) - elif self.keyword: - message = ( - "Getting MicroArray ArrayExpress entries by " - f'"{self.keyword}" keyword for [{self.since} - {self.until}] range.' - ) - elif self.organism: - message = ( - "Getting MicroArray ArrayExpress entries by " - f'"{self.organism}" organism for [{self.since} - {self.until}] range.' - ) - else: - return accessions - - log.debug(message) - accessions.update(self.fetch_data()) - - return accessions - - def fetch_data(self) -> Set[str]: - """Retrieves accessions from API search endpoint.""" - - @retry(**self.retry_params) - def get_response(url, **kwargs): - """Gets response from an API endpoint.""" - return requests.get(url, **kwargs) - - accessions = set() - - is_done = False - params = self.build_query() - while not is_done: - range_start = (params["page"] - 1) * params["pageSize"] + 1 - range_end = (params["page"] - 1) * params["pageSize"] + self.data_chunk_size - log.debug(f"Processing entries {range_start} - {range_end}") - - response = get_response(self.data_url, params=params) - entries = response.json().get("hits") - if entries: - entries = ( - AccessionBacklogEntry.create_from_ma_ae_entry(entry) for entry in entries - ) - params["page"] += 1 - else: - is_done = True - - if self.previous_accessions: - entries = (entry for entry in entries if entry.code not in self.previous_accessions) - accessions.update(entries) - - # Quit after getting a sufficient amount of accessions. - if self.count and len(accessions) >= self.count: - is_done = True - - return accessions - - def get_ids(self) -> List[str]: - """Returns a combined list of passed ArrayExpress IDs.""" - ids = set() - - if self.options["ae_id"]: - ids.update(self.options["ae_id"]) - - if self.options["ae_ids_file"]: - with open(self.options["ae_ids_file"]) as ae_ids_file: - ids.update((ae_id.strip() for ae_id in ae_ids_file.readlines())) - - return sorted(ids) - - -class MicroArrayGEOAccessionAgent(AccessionAgentBase): - """ - MicroArray GEO accession gathering agent. The data is fetched from a local - SQLite GEO meta database. 
- """ - - def __init__(self, options) -> None: - super().__init__(options) - - self.db_path = "data/microarray/GEOmetadb.sqlite" - self.ids = self.get_ids() - - def build_query(self) -> str: - """Returns a query for getting GEO accessions from the local SQLite meta DB.""" - tables = [ - f"SELECT *", - "FROM gse_gpl", - "JOIN gpl ON gse_gpl.gpl=gpl.gpl", - "JOIN gse ON gse.gse=gse_gpl.gse", - "GROUP BY gse_gpl.gse", - ] - - conditions = [ - f"HAVING gse.submission_date >= '{self.since}'", - f"gse.submission_date <= '{self.until}'", - ] - - if self.ids: - gpl_ids = (f"'{gpl_id}'" for gpl_id in self.ids) - conditions.append(f"gse_gpl.gpl IN ({', '.join(gpl_ids)})") - elif self.organism: - conditions.append(f"lower(organism)='{self.organism.lower()}'") - - return f"{' '.join(tables)} {' AND '.join(conditions)}" - - def collect_data(self) -> Set[str]: - """Gets new accessions from GEO database.""" - accessions = set() - - if self.ids: - message = ( - "Getting MicroArray GEO entries by GEO platform ID(s): " - f"{', '.join(self.ids)} for [{self.since} - {self.until}] range." - ) - elif self.keyword: - message = ( - f'Getting MicroArray GEO entries by "{self.keyword}" keyword ' - f"for [{self.since} - {self.until}] range." - ) - elif self.organism: - message = ( - f'Getting MicroArray GEO entries by "{self.organism}" organism ' - f"for [{self.since} - {self.until}] range." - ) - else: - return accessions - - log.debug(message) - accessions.update(self.fetch_data()) - - return accessions - - def fetch_data(self) -> Set[str]: - """Retrieves accessions from the GEO meta DB.""" - - def match_keyword(row): - """ - Returns True if `row` matches `self.keyword` based regex. - Otherwise returns False. - """ - return re_keyword.match(" ".join((str(c) for c in row if c))) - - accessions = set() - - if not os.path.exists(self.db_path): - log.error("GEO meta database doesn't exist.") - return accessions - - connection = sqlite3.connect(self.db_path) - connection.row_factory = sqlite3.Row - connection.text_factory = lambda b: b.decode(errors="ignore") - entries = connection.execute(self.build_query()).fetchall() - connection.close() - - if self.keyword: - re_keyword = re.compile(f".*{self.keyword}.*", re.IGNORECASE) # Keyword regex. - entries = filter(match_keyword, entries) - - entries = ({key.lower(): entry[key] for key in entry.keys()} for entry in entries) - entries = set((AccessionBacklogEntry.create_from_ma_geo_entry(entry) for entry in entries)) - - if self.previous_accessions: - entries = (entry for entry in entries if entry.code not in self.previous_accessions) - accessions.update(entries) - - return accessions - - def get_ids(self) -> List[str]: - """Returns a combined list of passed GEO platform IDs.""" - ids = set() - - if self.options["gpl_id"]: - ids.update(self.options["gpl_id"]) - - if self.options["gpl_ids_file"]: - with open(self.options["gpl_ids_file"]) as gpl_ids_file: - ids.update((gpl_id.strip() for gpl_id in gpl_ids_file.readlines())) - - return sorted(ids) - - -class RNASeqAccessionAgent(AccessionAgentBase): - """ - RNA-Seq accession gathering agent. The data is fetched from - The European Nucleotide Archive (ENA) Portal. - See https://www.ebi.ac.uk/ena/portal/api/ for more information about the API - endpoints. 
- """ - - def __init__(self, options) -> None: - super().__init__(options) - - self.data_chunk_size = 10000 - self.data_url = "https://www.ebi.ac.uk/ena/portal/api/search" - self.ids = self.get_ids() - - def build_query(self, taxon_id: str = None) -> str: - """ - Returns a query to use for getting specific taxon ID accessions. - Some special characters must remain unquoted. - """ - - AND = " AND " - OR = " OR " - instrument_models = ( - "HiSeq X Five", - "HiSeq X Ten", - "Illumina Genome Analyzer II", - "Illumina Genome Analyzer IIx", - "Illumina Genome Analyzer", - "Illumina HiScanSQ", - "Illumina HiSeq 1000", - "Illumina HiSeq 1500", - "Illumina HiSeq 2000", - "Illumina HiSeq 2500", - "Illumina HiSeq 3000", - "Illumina HiSeq 4000", - "Illumina MiSeq", - "Illumina NovaSeq 6000", - "Ion Torrent Proton", - "Ion Torrent S5 XL", - "Ion Torrent S5", - "NextSeq 500", - "NextSeq 550", - ) - - instrument_models = OR.join((f'instrument_model="{im}"' for im in instrument_models)) - conditions = [ - # Relevant date fields: collection_date, collection_date_submitted, - # first_public, last_updated. - f"first_public >= {self.since}", - f"first_public <= {self.until}", - f"({instrument_models})", - 'library_source="TRANSCRIPTOMIC"', - 'library_strategy="RNA-Seq"', - ] - - if taxon_id: - conditions.append(f"tax_eq({taxon_id})") - elif self.keyword: - search_fields = ( - "assembly_software", - "bio_material", - "center_name", - "collected_by", - "experiment_title", - "host_body_site", - "instrument_model", - "instrument_platform", - "library_name", - "project_name", - "sample_title", - "sequencing_method", - "study_title", - ) - search_fields = OR.join( - (f'{sf}="*{self.keyword}*"' for sf in search_fields) - ) # Keyword regex. - conditions.append(f"({search_fields})") - elif self.organism: - # `host`: Natural (as opposed to laboratory) host to the organism from which sample - # was obtained. - # `host_scientific_name`: Scientific name of the natural (as opposed to laboratory) - # host to the organism from which sample was obtained. - # `scientific_name` Scientific name of the organism from which the sample was derived. - # Neither `host_scientific_name` nor `scientific_name` available for search. - # https://www.ebi.ac.uk/ena/portal/api/searchFields?dataPortal=ena&format=json&result=read_study - conditions.append(f'host="{self.organism}"') - - return quote(AND.join(conditions), safe='*()-="<>/ ') # Must remain unquoted. - - def collect_data(self) -> Set[str]: - """Gets new accessions from EBI ENA API.""" - accessions = set() - - if self.ids: - log.debug( - f"Getting RNA-Seq entries by taxon ID(s): " - f"{', '.join((str(idx) for idx in self.ids))} for [{self.since} - {self.until}] range." - ) - total = len(self.ids) - for idx, taxon_id in enumerate(self.ids): - if self.count and len(accessions) >= self.count: - break - - if total > 1: - log.debug(f"Getting entries for taxon ID {taxon_id}, {idx + 1} of {total}.") - accessions.update(self.fetch_data(taxon_id=taxon_id)) - elif self.keyword: - log.debug( - f'Getting RNA-Seq entries by "{self.keyword}" keyword ' - f"for [{self.since} - {self.until}] range." - ) - accessions.update(self.fetch_data()) - elif self.organism: - log.debug( - f'Getting entries by "{self.organism}" organism ' - f"for [{self.since} - {self.until}] range." - ) - accessions.update(self.fetch_data()) - - return accessions - - def fetch_data(self, taxon_id=None) -> Set[str]: - """ - Retrieves accessions from API search endpoint. 
- The API allows to set limit to 0 (get all in one request) but we do - it in a paginated fashion with `self.data_chunk_size` as a page size. - """ - - @retry(**self.retry_params) - def get_response(url, **kwargs): - """Gets response from an API endpoint.""" - return requests.post(url, **kwargs) - - accessions = set() - - fields = [ - "first_public", - "scientific_name", - "secondary_study_accession", - ] # For DRP/ERP/SRP-prefixed accessions. - data = { - "dataPortal": "ena", - # TODO(ark): add excludeAccessions/excludeAccessionType support. - "fields": ",".join(fields), # Use "all" to get all fields. - "format": "json", - "limit": self.data_chunk_size, - "offset": 0, - "query": self.build_query(taxon_id=taxon_id), - "result": "read_study", - "sortFields": fields, - } - - is_done = False - while not is_done: - log.debug( - f"Processing entries {data['offset'] + 1} - {data['offset'] + self.data_chunk_size}" - ) - entries = () - try: - response = get_response(self.data_url, data=data) - entries = response.json() - # TODO(ark): add `organism` when -o, --organism flag is used. - entries = ( - AccessionBacklogEntry.create_from_rnaseq_entry(entry) for entry in entries - ) - except JSONDecodeError: - is_done = True - except TypeError: - log.error(f"Couldn't get data from {self.data_url}. Response: {entries}") - data["offset"] += self.data_chunk_size - - if self.previous_accessions: - entries = (entry for entry in entries if entry.code not in self.previous_accessions) - accessions.update(entries) - - # Quit after getting a sufficient amount of accessions. - if self.count and len(accessions) >= self.count: - is_done = True - - return accessions - - def get_ids(self) -> List[str]: - """Returns a combined list of passed taxon IDs.""" - ids = set() - - if self.options["taxon_id"]: - ids.update(self.options["taxon_id"]) - - if self.options["taxon_ids_file"]: - with open(self.options["taxon_ids_file"]) as taxon_id_file: - ids.update((taxon_id.strip() for taxon_id in taxon_id_file.readlines())) - - return sorted(ids) + GatheredAccession.objects.bulk_create(entries) From 3fe5d8088d0862a1445cf002c4a4b1f067f4cb55 Mon Sep 17 00:00:00 2001 From: Arkadii Yakovets Date: Tue, 13 Sep 2022 17:17:19 -0700 Subject: [PATCH 15/24] Add a TODO. --- foreman/data_refinery_foreman/gatherer/agents/rna_seq.py | 1 + 1 file changed, 1 insertion(+) diff --git a/foreman/data_refinery_foreman/gatherer/agents/rna_seq.py b/foreman/data_refinery_foreman/gatherer/agents/rna_seq.py index f9497f3ba..f54ba570a 100644 --- a/foreman/data_refinery_foreman/gatherer/agents/rna_seq.py +++ b/foreman/data_refinery_foreman/gatherer/agents/rna_seq.py @@ -34,6 +34,7 @@ def build_query(self, taxon_id: str = None) -> str: AND = " AND " OR = " OR " + # TODO(ark): extract instrument models to a config file. instrument_models = ( "HiSeq X Five", "HiSeq X Ten", From ee66ac812337f7cdf9101c7a906497d9e3ea75ba Mon Sep 17 00:00:00 2001 From: Arkadii Yakovets Date: Tue, 13 Sep 2022 18:37:09 -0700 Subject: [PATCH 16/24] Fix empty response issue. 
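The BioStudies search endpoint can return a payload that has no "hits" key (for example, when a query matches nothing). In that case `response.json().get("hits")` evaluates to None, and the code downstream raises a TypeError because None is not iterable (both the generator expression over `entries` and `accessions.update(entries)` need an iterable). Defaulting to an empty tuple keeps the fetch loop a safe no-op. A minimal sketch of the difference, using a hypothetical empty payload rather than a captured API response:

    payload = {"totalHits": 0}  # assumed shape; note the missing "hits" key

    entries = payload.get("hits")       # None -> TypeError once iterated or passed to set.update()
    entries = payload.get("hits", ())   # ()   -> iteration and set.update() are harmless no-ops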
--- foreman/data_refinery_foreman/gatherer/agents/microarray_ae.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/foreman/data_refinery_foreman/gatherer/agents/microarray_ae.py b/foreman/data_refinery_foreman/gatherer/agents/microarray_ae.py index b5314302b..541bd86d2 100644 --- a/foreman/data_refinery_foreman/gatherer/agents/microarray_ae.py +++ b/foreman/data_refinery_foreman/gatherer/agents/microarray_ae.py @@ -92,7 +92,7 @@ def get_response(url, **kwargs): logger.debug(f"Processing entries {range_start} - {range_end}") response = get_response(self.DATA_URL, params=params) - entries = response.json().get("hits") + entries = response.json().get("hits", ()) if entries: entries = ( GatheredAccession.create_from_ma_ae_entry(entry, organism=self.organism) From 5521286c64b4dc41a9ce070907157fce435c2cfa Mon Sep 17 00:00:00 2001 From: Arkadii Yakovets Date: Wed, 14 Sep 2022 18:52:57 -0700 Subject: [PATCH 17/24] Address review comments. --- .../migrations/0071_gatheredaccession.py | 2 +- .../models/gathered_accession.py | 72 +++++++------------ .../gatherer/agents/base.py | 5 +- .../gatherer/agents/microarray_ae.py | 21 ++++-- .../gatherer/agents/microarray_geo.py | 23 ++++-- .../gatherer/agents/rna_seq.py | 24 +++++-- .../management/commands/gather_accessions.py | 60 ++++++++-------- 7 files changed, 112 insertions(+), 95 deletions(-) diff --git a/common/data_refinery_common/migrations/0071_gatheredaccession.py b/common/data_refinery_common/migrations/0071_gatheredaccession.py index a1740d96e..65d192b59 100644 --- a/common/data_refinery_common/migrations/0071_gatheredaccession.py +++ b/common/data_refinery_common/migrations/0071_gatheredaccession.py @@ -22,7 +22,7 @@ class Migration(migrations.Migration): verbose_name="ID", ), ), - ("code", models.TextField(unique=True)), + ("accession_code", models.TextField(unique=True)), ("created_at", models.DateTimeField(auto_now_add=True)), ("last_modified_at", models.DateTimeField(auto_now=True)), ("organism", models.TextField()), diff --git a/common/data_refinery_common/models/gathered_accession.py b/common/data_refinery_common/models/gathered_accession.py index 04b084533..e56ed615c 100644 --- a/common/data_refinery_common/models/gathered_accession.py +++ b/common/data_refinery_common/models/gathered_accession.py @@ -10,7 +10,7 @@ class GatheredAccession(models.Model): class Meta: db_table = "gathered_accessions" - code = models.TextField(unique=True) + accession_code = models.TextField(unique=True) created_at = models.DateTimeField(auto_now_add=True) last_modified_at = models.DateTimeField(auto_now=True) organism = models.TextField() @@ -21,64 +21,44 @@ class Meta: def __eq__(self, other: object) -> bool: """Returns True if two objects are equal. 
Otherwise returns False.""" - return isinstance(other, GatheredAccession) and self.code == other.code + return isinstance(other, GatheredAccession) and self.accession_code == other.accession_code def __hash__(self) -> int: """Returns accession object unique hash value.""" - return hash(self.code) + return hash(self.accession_code) def __str__(self) -> str: """Returns accession default string representation.""" - return ", ".join((self.code, self.technology, self.source, str(self.published_date.date()))) - - @staticmethod - def create_from_ma_ae_entry(entry, organism=None): - """Creates accession object from MicroArray ArrayExpress entry.""" - accession = GatheredAccession() - accession.code = entry["accession"] - accession.source = "ebi_biostudies" - accession.technology = "microarray" - - if organism: - accession.organism = organism - if "release_date" in entry: - accession.published_date = timezone.make_aware( - datetime.strptime(entry["release_date"], "%Y-%m-%d") + return ", ".join( + ( + self.accession_code, + self.technology, + self.source, + str(self.published_date.date()), ) - - return accession + ) @staticmethod - def create_from_ma_geo_entry(entry): - """Creates accession object from MicroArray GEO meta DB entry.""" + def create_from_external_entry(data, source, technology, organism=None): + """Creates accession object from an external source entry.""" accession = GatheredAccession() - accession.code = entry["gse"] - accession.source = "geo_meta_db" - accession.technology = "microarray" - - if "organism" in entry: - accession.organism = entry["organism"].lower() - if "submission_date" in entry: - accession.published_date = timezone.make_aware( - datetime.strptime(entry["submission_date"], "%Y-%m-%d") + accession.accession_code = ( + data.get("accession") or data.get("gse") or data.get("secondary_study_accession") ) - return accession + organism = data.get("organism") or data.get("scientific_name") or organism + if organism: + accession.organism = organism.lower() - @staticmethod - def create_from_rnaseq_entry(entry): - """Creates accession object from RNA-Seq entry.""" - accession = GatheredAccession() - accession.code = entry["secondary_study_accession"] - accession.source = "ebi_ena_portal" - accession.technology = "rna-seq" + published_date = ( + data.get("first_public") or data.get("release_date") or data.get("submission_date") + ) + accession.published_date = timezone.make_aware( + datetime.strptime(published_date, "%Y-%m-%d") + ) - if "scientific_name" in entry: - accession.organism = entry["scientific_name"].lower() - if "first_public" in entry: - accession.published_date = timezone.make_aware( - datetime.strptime(entry["first_public"], "%Y-%m-%d") - ) + accession.source = source + accession.technology = technology return accession diff --git a/foreman/data_refinery_foreman/gatherer/agents/base.py b/foreman/data_refinery_foreman/gatherer/agents/base.py index 3754a4068..818bbf72c 100644 --- a/foreman/data_refinery_foreman/gatherer/agents/base.py +++ b/foreman/data_refinery_foreman/gatherer/agents/base.py @@ -66,7 +66,10 @@ def populate_previous_accessions(self) -> None: # Gathered accessions. self.previous_accessions.update( - (entry["code"] for entry in GatheredAccession.objects.values("code")) + ( + entry["accession_code"] + for entry in GatheredAccession.objects.values("accession_code") + ) ) # Surveyed accessions.
diff --git a/foreman/data_refinery_foreman/gatherer/agents/microarray_ae.py b/foreman/data_refinery_foreman/gatherer/agents/microarray_ae.py index 541bd86d2..3bfcf08fe 100644 --- a/foreman/data_refinery_foreman/gatherer/agents/microarray_ae.py +++ b/foreman/data_refinery_foreman/gatherer/agents/microarray_ae.py @@ -1,5 +1,7 @@ -"""MicroArray ArrayExpress accession gathering automation. -Data source: https://www.ebi.ac.uk/biostudies/help""" +""" +MicroArray ArrayExpress accession gathering automation. +Data source: https://www.ebi.ac.uk/biostudies/help +""" from typing import List, Set @@ -13,7 +15,7 @@ logger = get_and_configure_logger(__name__) -class MicroArrayExpressAccessionAgent(AccessionAgentBase): +class AEAgent(AccessionAgentBase): """ MicroArray ArrayExpress accession gathering agent. The data is fetched from the BioStudies database. See https://www.ebi.ac.uk/biostudies/help and @@ -23,6 +25,9 @@ class MicroArrayExpressAccessionAgent(AccessionAgentBase): DATA_CHUNK_SIZE = 100 DATA_URL = "https://www.ebi.ac.uk/biostudies/api/v1/search" + SOURCE = "ebi-biostudies" + SOURCE_NAME = "microarray-ae" + TECHNOLOGY = "microarray" def build_query(self) -> dict: """Returns a query dict for getting array/organism specific accessions.""" @@ -95,7 +100,9 @@ def get_response(url, **kwargs): entries = response.json().get("hits", ()) if entries: entries = ( - GatheredAccession.create_from_ma_ae_entry(entry, organism=self.organism) + GatheredAccession.create_from_external_entry( + entry, self.SOURCE, self.TECHNOLOGY, organism=self.organism + ) for entry in entries ) params["page"] += 1 @@ -103,7 +110,11 @@ def get_response(url, **kwargs): is_done = True if self.previous_accessions: - entries = (entry for entry in entries if entry.code not in self.previous_accessions) + entries = ( + entry + for entry in entries + if entry.accession_code not in self.previous_accessions + ) accessions.update(entries) # Quit after getting a sufficient amount of accessions. diff --git a/foreman/data_refinery_foreman/gatherer/agents/microarray_geo.py b/foreman/data_refinery_foreman/gatherer/agents/microarray_geo.py index 975c715b3..2500bcec5 100644 --- a/foreman/data_refinery_foreman/gatherer/agents/microarray_geo.py +++ b/foreman/data_refinery_foreman/gatherer/agents/microarray_geo.py @@ -1,5 +1,8 @@ -"""MicroArray GEO accession gathering automation. -Data source: local SQLite meta DB from https://www.bioconductor.org/packages/release/bioc/html/GEOmetadb.html""" +""" +MicroArray GEO accession gathering automation. +Data source: local SQLite meta DB from +https://www.bioconductor.org/packages/release/bioc/html/GEOmetadb.html +""" import os import re @@ -13,7 +16,7 @@ logger = get_and_configure_logger(__name__) -class MicroArrayGEOAccessionAgent(AccessionAgentBase): +class GEOAgent(AccessionAgentBase): """ MicroArray GEO accession gathering agent. The data is fetched from a local SQLite GEO meta database. @@ -23,6 +26,9 @@ class MicroArrayGEOAccessionAgent(AccessionAgentBase): # Implement syncing procedure. # Update URL once the original file is available again. 
DB_PATH = "data/microarray/GEOmetadb.sqlite" + SOURCE = "geo-meta-db" + SOURCE_NAME = "microarray-geo" + TECHNOLOGY = "microarray" def build_query(self) -> str: """Returns a query for getting GEO accessions from the local SQLite meta DB.""" @@ -101,10 +107,17 @@ def match_keyword(row): entries = filter(match_keyword, entries) entries = ({key.lower(): entry[key] for key in entry.keys()} for entry in entries) - entries = set((GatheredAccession.create_from_ma_geo_entry(entry) for entry in entries)) + entries = set( + ( + GatheredAccession.create_from_external_entry(entry, self.SOURCE, self.TECHNOLOGY) + for entry in entries + ) + ) if self.previous_accessions: - entries = (entry for entry in entries if entry.code not in self.previous_accessions) + entries = ( + entry for entry in entries if entry.accession_code not in self.previous_accessions + ) accessions.update(entries) return accessions diff --git a/foreman/data_refinery_foreman/gatherer/agents/rna_seq.py b/foreman/data_refinery_foreman/gatherer/agents/rna_seq.py index f54ba570a..577f815b8 100644 --- a/foreman/data_refinery_foreman/gatherer/agents/rna_seq.py +++ b/foreman/data_refinery_foreman/gatherer/agents/rna_seq.py @@ -1,5 +1,7 @@ -"""RNA-Seq accession gathering automation. -Data source: https://www.ebi.ac.uk/ena/portal/api/""" +""" +RNA-Seq accession gathering automation. +Data source: https://www.ebi.ac.uk/ena/portal/api/ +""" from json.decoder import JSONDecodeError from typing import List, Set @@ -15,7 +17,7 @@ logger = get_and_configure_logger(__name__) -class RNASeqAccessionAgent(AccessionAgentBase): +class RNASeqAgent(AccessionAgentBase): """ RNA-Seq accession gathering agent. The data is fetched from The European Nucleotide Archive (ENA) Portal. @@ -25,6 +27,9 @@ class RNASeqAccessionAgent(AccessionAgentBase): DATA_CHUNK_SIZE = 10000 DATA_URL = "https://www.ebi.ac.uk/ena/portal/api/search" + SOURCE = "ebi-ena-portal" + SOURCE_NAME = "rna-seq" + TECHNOLOGY = "rna-seq" def build_query(self, taxon_id: str = None) -> str: """ @@ -174,7 +179,12 @@ def get_response(url, **kwargs): try: response = get_response(self.DATA_URL, data=data) entries = response.json() - entries = (GatheredAccession.create_from_rnaseq_entry(entry) for entry in entries) + entries = ( + GatheredAccession.create_from_external_entry( + entry, self.SOURCE, self.TECHNOLOGY + ) + for entry in entries + ) except JSONDecodeError: is_done = True except TypeError: @@ -182,7 +192,11 @@ def get_response(url, **kwargs): data["offset"] += self.DATA_CHUNK_SIZE if self.previous_accessions: - entries = (entry for entry in entries if entry.code not in self.previous_accessions) + entries = ( + entry + for entry in entries + if entry.accession_code not in self.previous_accessions + ) accessions.update(entries) # Quit after getting a sufficient amount of accessions. 
diff --git a/foreman/data_refinery_foreman/gatherer/management/commands/gather_accessions.py b/foreman/data_refinery_foreman/gatherer/management/commands/gather_accessions.py index 445245d3a..2b073ef45 100644 --- a/foreman/data_refinery_foreman/gatherer/management/commands/gather_accessions.py +++ b/foreman/data_refinery_foreman/gatherer/management/commands/gather_accessions.py @@ -11,12 +11,13 @@ import re from django.core.management.base import BaseCommand +from django.db.utils import IntegrityError from data_refinery_common.logging import get_and_configure_logger from data_refinery_common.models.gathered_accession import GatheredAccession -from data_refinery_foreman.gatherer.agents.microarray_ae import MicroArrayExpressAccessionAgent -from data_refinery_foreman.gatherer.agents.microarray_geo import MicroArrayGEOAccessionAgent -from data_refinery_foreman.gatherer.agents.rna_seq import RNASeqAccessionAgent +from data_refinery_foreman.gatherer.agents.microarray_ae import AEAgent +from data_refinery_foreman.gatherer.agents.microarray_geo import GEOAgent +from data_refinery_foreman.gatherer.agents.rna_seq import RNASeqAgent logger = get_and_configure_logger(__name__) @@ -24,11 +25,8 @@ class Command(BaseCommand): """Creates agents and runs actual accession gathering.""" - DATA_SOURCE_MA_AE = "microarray-ae" - DATA_SOURCE_MA_GEO = "microarray-geo" - DATA_SOURCE_RNA_SEQ = "rna-seq" - DATA_SOURCES = (DATA_SOURCE_MA_AE, DATA_SOURCE_MA_GEO, DATA_SOURCE_RNA_SEQ) - + DATA_AGENTS = (AEAgent, GEOAgent, RNASeqAgent) + DATA_SOURCE_NAMES = [agent.SOURCE_NAME for agent in DATA_AGENTS] RE_ACCESSION = re.compile(r"(\D+)(\d+)") RE_DATE = re.compile(r"\d{4}-\d{2}-\d{2}") @@ -164,43 +162,43 @@ def validate_args(self, options) -> None: keyword = options["keyword"] organism = options["organism"] - sources = options["source"] or self.DATA_SOURCES + source_names = options["source"] or self.DATA_SOURCE_NAMES - for source in sources: - if source in self.DATA_SOURCES: + for source_name in source_names: + if source_name in self.DATA_SOURCE_NAMES: continue errors.append( - f"Unknown source: {source}. Supported sources: {', '.join(self.DATA_SOURCES)}" + f"Unknown source: {source_name}. Supported sources: {', '.join(self.DATA_SOURCE_NAMES)}" ) - if self.DATA_SOURCE_MA_AE in sources: + if AEAgent.SOURCE_NAME in source_names: ids = options["ae_id"] or options["ae_ids_file"] if not (ids or keyword or organism): errors.append( ( "Exactly one of the keyword [-k, --keyword], organism [-o, --organism] or " "ArrayExpress ID(s) [--ae-id, --ae-ids-file] must be specified for " - f"'{self.DATA_SOURCE_MA_AE}' source." + f"'{AEAgent.SOURCE_NAME}' source." ) ) - if self.DATA_SOURCE_MA_GEO in sources: + if GEOAgent.SOURCE_NAME in source_names: ids = options["gpl_id"] or options["gpl_ids_file"] if not (ids or keyword or organism): errors.append( ( "Exactly one of the keyword [-k, --keyword], organism [-o, --organism] or " "GEO platform ID(s) [--gpl-id, --gpl-ids-file] must be specified for " - f"'{self.DATA_SOURCE_MA_GEO}' source." + f"'{GEOAgent.SOURCE_NAME}' source." ) ) - if self.DATA_SOURCE_RNA_SEQ in sources: + if RNASeqAgent.SOURCE_NAME in source_names: ids = options["taxon_id"] or options["taxon_ids_file"] if not (ids or keyword or organism): errors.append( ( "Exactly one of the keyword [-k, --keyword], organism [-o, --organism] " "or taxon ID(s) [--taxon-id, --taxon-ids-file] must be specified for " - f"'{self.DATA_SOURCE_RNA_SEQ}' source." + f"'{RNASeqAgent.SOURCE_NAME}' source." 
) ) @@ -213,26 +211,21 @@ def handle(self, *args, **options): self.set_verbosity_level(options) agents = list() - sources = options["source"] or self.DATA_SOURCES - - if self.DATA_SOURCE_RNA_SEQ in sources: - agents.append(RNASeqAccessionAgent(options)) - - if self.DATA_SOURCE_MA_AE in sources: - agents.append(MicroArrayExpressAccessionAgent(options)) - - if self.DATA_SOURCE_MA_GEO in sources: - agents.append(MicroArrayGEOAccessionAgent(options)) + sources_names = options["source"] or self.DATA_SOURCE_NAMES + for cls in self.DATA_AGENTS: + if cls.SOURCE_NAME not in sources_names: + continue + agents.append(cls(options)) entries = set() for agent in agents: entries.update(agent.collect_data()) entries = sorted( # Sort the resulting list. - (entry for entry in entries if self.RE_ACCESSION.match(entry.code)), + (entry for entry in entries if self.RE_ACCESSION.match(entry.accession_code)), key=lambda entry: ( - self.RE_ACCESSION.match(entry.code).group(1), - int(self.RE_ACCESSION.match(entry.code).group(2)), + self.RE_ACCESSION.match(entry.accession_code).group(1), + int(self.RE_ACCESSION.match(entry.accession_code).group(2)), ), ) # Limit the number of output entries. @@ -245,4 +238,7 @@ def handle(self, *args, **options): output = "No accessions found." print(output) else: - GatheredAccession.objects.bulk_create(entries) + try: + GatheredAccession.objects.bulk_create(entries) + except IntegrityError as e: + logger.exception(f"Could not save new accessions to the database: {e}") From 6d3e17988765f045a69ebac8618a6fffad55a907 Mon Sep 17 00:00:00 2001 From: Arkadii Yakovets Date: Wed, 21 Sep 2022 11:05:24 -0700 Subject: [PATCH 18/24] Rename agent files. --- .../gatherer/agents/{microarray_ae.py => ae_agent.py} | 0 .../gatherer/agents/{microarray_geo.py => geo_agent.py} | 0 .../gatherer/agents/{rna_seq.py => rnaseq_agent.py} | 0 .../gatherer/management/commands/gather_accessions.py | 6 +++--- 4 files changed, 3 insertions(+), 3 deletions(-) rename foreman/data_refinery_foreman/gatherer/agents/{microarray_ae.py => ae_agent.py} (100%) rename foreman/data_refinery_foreman/gatherer/agents/{microarray_geo.py => geo_agent.py} (100%) rename foreman/data_refinery_foreman/gatherer/agents/{rna_seq.py => rnaseq_agent.py} (100%) diff --git a/foreman/data_refinery_foreman/gatherer/agents/microarray_ae.py b/foreman/data_refinery_foreman/gatherer/agents/ae_agent.py similarity index 100% rename from foreman/data_refinery_foreman/gatherer/agents/microarray_ae.py rename to foreman/data_refinery_foreman/gatherer/agents/ae_agent.py diff --git a/foreman/data_refinery_foreman/gatherer/agents/microarray_geo.py b/foreman/data_refinery_foreman/gatherer/agents/geo_agent.py similarity index 100% rename from foreman/data_refinery_foreman/gatherer/agents/microarray_geo.py rename to foreman/data_refinery_foreman/gatherer/agents/geo_agent.py diff --git a/foreman/data_refinery_foreman/gatherer/agents/rna_seq.py b/foreman/data_refinery_foreman/gatherer/agents/rnaseq_agent.py similarity index 100% rename from foreman/data_refinery_foreman/gatherer/agents/rna_seq.py rename to foreman/data_refinery_foreman/gatherer/agents/rnaseq_agent.py diff --git a/foreman/data_refinery_foreman/gatherer/management/commands/gather_accessions.py b/foreman/data_refinery_foreman/gatherer/management/commands/gather_accessions.py index 2b073ef45..554b74350 100644 --- a/foreman/data_refinery_foreman/gatherer/management/commands/gather_accessions.py +++ b/foreman/data_refinery_foreman/gatherer/management/commands/gather_accessions.py @@ -15,9 +15,9 @@ 
from data_refinery_common.logging import get_and_configure_logger from data_refinery_common.models.gathered_accession import GatheredAccession -from data_refinery_foreman.gatherer.agents.microarray_ae import AEAgent -from data_refinery_foreman.gatherer.agents.microarray_geo import GEOAgent -from data_refinery_foreman.gatherer.agents.rna_seq import RNASeqAgent +from data_refinery_foreman.gatherer.agents.ae_agent import AEAgent +from data_refinery_foreman.gatherer.agents.geo_agent import GEOAgent +from data_refinery_foreman.gatherer.agents.rnaseq_agent import RNASeqAgent logger = get_and_configure_logger(__name__) From 2f6ff4a36d263433241b75629bef7ad041227652 Mon Sep 17 00:00:00 2001 From: Arkadii Yakovets Date: Thu, 17 Nov 2022 14:46:30 -0800 Subject: [PATCH 19/24] Workers Docker images refactoring. --- common/{ => R}/install_devtools.R | 12 +- .../dependencies}/README.md | 0 .../affymetrix/bioc/dependencies.R | 221 ++++++++++++++++++ .../affymetrix/cran}/dependencies.R | 0 .../affymetrix/cran}/packages.txt | 0 .../affymetrix/cran}/versions.tsv | 0 .../affymetrix/install_affy_only.R | 38 +++ .../affymetrix}/install_ensg_pkgs.R | 8 +- .../illumina/bioc/dependencies.R} | 0 .../illumina/cran}/dependencies.R | 0 .../dependencies/illumina/cran}/packages.txt | 0 .../dependencies/illumina/cran}/versions.tsv | 0 workers/{ => R/dependencies}/install_bioc.R | 10 +- .../dependencies}/install_downloader_R_only.R | 14 +- .../dependencies/no_op/cran}/dependencies.R | 0 .../dependencies/no_op/cran}/packages.txt | 0 .../dependencies/no_op/cran}/versions.tsv | 0 .../dependencies/no_op/install_gene_convert.R | 43 ++++ .../dependencies/qn/bioc/dependencies.R} | 12 +- .../dependencies/qn/cran}/dependencies.R | 0 .../dependencies/qn/cran}/packages.txt | 0 .../dependencies/qn/cran}/versions.tsv | 0 .../dependencies/tximport/cran/dependencies.R | 25 ++ .../dependencies/tximport/cran}/packages.txt | 0 .../dependencies/tximport/cran}/versions.tsv | 0 .../dependencies/tximport/install_tximport.R | 7 + .../R_dependencies/tximport/dependencies.R | 25 -- workers/affymetrix_dependencies.R | 220 ----------------- workers/ccache.conf | 5 + workers/dockerfiles/Dockerfile.affymetrix | 90 ++----- .../dockerfiles/Dockerfile.affymetrix_local | 12 +- workers/dockerfiles/Dockerfile.compendia | 154 ++++++------ workers/dockerfiles/Dockerfile.downloaders | 87 ++----- workers/dockerfiles/Dockerfile.illumina | 80 +------ workers/dockerfiles/Dockerfile.no_op | 90 ++----- workers/dockerfiles/Dockerfile.salmon | 114 +++------ workers/dockerfiles/Dockerfile.smasher | 90 ++----- workers/dockerfiles/Dockerfile.transcriptome | 90 ++----- workers/dockerfiles/Dockerfile.worker_base | 68 ++++++ workers/install_affy_only.R | 38 --- workers/install_gene_convert.R | 43 ---- workers/install_tximport.R | 7 - 42 files changed, 650 insertions(+), 953 deletions(-) rename common/{ => R}/install_devtools.R (95%) rename workers/{R_dependencies => R/dependencies}/README.md (100%) create mode 100644 workers/R/dependencies/affymetrix/bioc/dependencies.R rename workers/{R_dependencies/affymetrix => R/dependencies/affymetrix/cran}/dependencies.R (100%) rename workers/{R_dependencies/affymetrix => R/dependencies/affymetrix/cran}/packages.txt (100%) rename workers/{R_dependencies/affymetrix => R/dependencies/affymetrix/cran}/versions.tsv (100%) create mode 100644 workers/R/dependencies/affymetrix/install_affy_only.R rename workers/{ => R/dependencies/affymetrix}/install_ensg_pkgs.R (89%) rename workers/{illumina_dependencies.R => 
R/dependencies/illumina/bioc/dependencies.R} (100%) rename workers/{R_dependencies/illumina => R/dependencies/illumina/cran}/dependencies.R (100%) rename workers/{R_dependencies/illumina => R/dependencies/illumina/cran}/packages.txt (100%) rename workers/{R_dependencies/illumina => R/dependencies/illumina/cran}/versions.tsv (100%) rename workers/{ => R/dependencies}/install_bioc.R (58%) rename workers/{ => R/dependencies}/install_downloader_R_only.R (54%) rename workers/{R_dependencies/no_op => R/dependencies/no_op/cran}/dependencies.R (100%) rename workers/{R_dependencies/no_op => R/dependencies/no_op/cran}/packages.txt (100%) rename workers/{R_dependencies/no_op => R/dependencies/no_op/cran}/versions.tsv (100%) create mode 100644 workers/R/dependencies/no_op/install_gene_convert.R rename workers/{qn_dependencies.R => R/dependencies/qn/bioc/dependencies.R} (61%) rename workers/{R_dependencies/qn => R/dependencies/qn/cran}/dependencies.R (100%) rename workers/{R_dependencies/qn => R/dependencies/qn/cran}/packages.txt (100%) rename workers/{R_dependencies/qn => R/dependencies/qn/cran}/versions.tsv (100%) create mode 100644 workers/R/dependencies/tximport/cran/dependencies.R rename workers/{R_dependencies/tximport => R/dependencies/tximport/cran}/packages.txt (100%) rename workers/{R_dependencies/tximport => R/dependencies/tximport/cran}/versions.tsv (100%) create mode 100644 workers/R/dependencies/tximport/install_tximport.R delete mode 100644 workers/R_dependencies/tximport/dependencies.R delete mode 100644 workers/affymetrix_dependencies.R create mode 100644 workers/ccache.conf create mode 100644 workers/dockerfiles/Dockerfile.worker_base delete mode 100644 workers/install_affy_only.R delete mode 100644 workers/install_gene_convert.R delete mode 100644 workers/install_tximport.R diff --git a/common/install_devtools.R b/common/R/install_devtools.R similarity index 95% rename from common/install_devtools.R rename to common/R/install_devtools.R index 678418f19..e241152ea 100644 --- a/common/install_devtools.R +++ b/common/R/install_devtools.R @@ -16,9 +16,9 @@ # Cranlock was used to find the versions of dependencies to install # Treat warnings as errors, set CRAN mirror, and set parallelization: -options(warn=2) -options(repos=structure(c(CRAN="https://cloud.r-project.org/"))) -options(Ncpus=parallel::detectCores()) +options(warn = 2) +options(repos = structure(c(CRAN = "https://cloud.r-project.org/"))) +options(Ncpus = parallel::detectCores()) install_package_version <- function(package_name, version) { @@ -31,18 +31,18 @@ install_package_version <- function(package_name, version) { package_url <- paste0("https://cloud.r-project.org/src/contrib/", package_tarball) # Give CRAN a full minute to timeout since it's not always the most reliable. - curl_result <- system(paste0("curl --head --connect-timeout 60 ", package_url), intern=TRUE) + curl_result <- system(paste0("curl --head --connect-timeout 60 ", package_url), intern = TRUE) if (grepl("404", curl_result[1])) { package_url <- paste0("https://cloud.r-project.org/src/contrib/Archive/", package_name, "/", package_tarball) # Make sure the package actually exists in the archive! 
- curl_result <- system(paste0("curl --head --connect-timeout 120 ", package_url), intern=TRUE) + curl_result <- system(paste0("curl --head --connect-timeout 120 ", package_url), intern = TRUE) if (grepl("404", curl_result[1])) { stop(paste("Package", package_name, "version", version, "does not exist!")) } } - install.packages(package_url) + install.packages(package_url, Ncpus = 32) } # Generated using cranlock diff --git a/workers/R_dependencies/README.md b/workers/R/dependencies/README.md similarity index 100% rename from workers/R_dependencies/README.md rename to workers/R/dependencies/README.md diff --git a/workers/R/dependencies/affymetrix/bioc/dependencies.R b/workers/R/dependencies/affymetrix/bioc/dependencies.R new file mode 100644 index 000000000..1570292e0 --- /dev/null +++ b/workers/R/dependencies/affymetrix/bioc/dependencies.R @@ -0,0 +1,221 @@ +# Turn warnings into errors because biocLite throws warnings instead # of error if it fails to install something. +options(warn = 2) +options(repos = structure(c(CRAN = "https://cloud.r-project.org"))) +options(Ncpus = parallel::detectCores()) + +# Bioconductor packages, installed by devtools::install_url() + +# Helper function that installs a list of packages using the input URLs +install_with_url <- function(urls) { + pkg_ids <- devtools::install_url(urls) + if (any(is.na(pkg_ids))) { + pkg_fails <- paste(urls[is.na(pkg_ids)], collapse = "; ") + stop(paste("Failed to install package(s):", pkg_fails)) + } + return(pkg_ids) +} + +devtools::install_version("dplyr", version = "1.0.2") +devtools::install_version("locfit", version = "1.5-9.4") + +bioc_pkg_urls <- c( + "https://bioconductor.org/packages/3.11/bioc/src/contrib/oligoClasses_1.50.4.tar.gz", + "https://bioconductor.org/packages/3.11/bioc/src/contrib/oligo_1.52.1.tar.gz", + "https://bioconductor.org/packages/3.11/bioc/src/contrib/GEOquery_2.56.0.tar.gz", + "https://bioconductor.org/packages/3.11/bioc/src/contrib/SCAN.UPC_2.30.0.tar.gz", + "https://bioconductor.org/packages/3.11/bioc/src/contrib/affy_1.66.0.tar.gz", + "https://bioconductor.org/packages/3.11/bioc/src/contrib/affyio_1.58.0.tar.gz", + "https://bioconductor.org/packages/3.11/bioc/src/contrib/AnnotationDbi_1.50.3.tar.gz", + "https://bioconductor.org/packages/3.11/bioc/src/contrib/zlibbioc_1.34.0.tar.gz", + "https://bioconductor.org/packages/3.11/bioc/src/contrib/preprocessCore_1.50.0.tar.gz", + "https://bioconductor.org/packages/3.11/bioc/src/contrib/genefilter_1.70.0.tar.gz", + "https://bioconductor.org/packages/3.11/bioc/src/contrib/sva_3.36.0.tar.gz", + "https://bioconductor.org/packages/3.11/bioc/src/contrib/limma_3.44.3.tar.gz" +) +install_with_url(bioc_pkg_urls) + +# Invoke another R script to install BrainArray ensg packages +source("install_ensg_pkgs.R") + +# Install Bioconductor platform design (pd) packages +pd_experiment_pkgs <- c( + "https://bioconductor.org/packages/3.11/data/experiment/src/contrib/pd.atdschip.tiling_0.26.0.tar.gz" +) +install_with_url(pd_experiment_pkgs) + +pd_annotation_pkgs <- c( + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.081229.hg18.promoter.medip.hx1_0.99.4.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.2006.07.18.hg18.refseq.promoter_1.8.1.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.2006.07.18.mm8.refseq.promoter_0.99.3.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.2006.10.31.rn34.refseq.promoter_0.99.3.tar.gz", + 
"https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.ag_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.aragene.1.0.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.aragene.1.1.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.ath1.121501_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.barley1_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.bovgene.1.0.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.bovgene.1.1.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.bovine_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.bsubtilis_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.cangene.1.0.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.cangene.1.1.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.canine_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.canine.2_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.celegans_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.charm.hg18.example_0.99.4.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.chicken_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.chigene.1.0.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.chigene.1.1.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.chogene.2.0.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.chogene.2.1.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.citrus_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.clariom.d.human_3.14.1.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.clariom.s.human_3.14.1.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.clariom.s.human.ht_3.14.1.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.clariom.s.mouse_3.14.1.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.clariom.s.mouse.ht_3.14.1.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.clariom.s.rat_3.14.1.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.clariom.s.rat.ht_3.14.1.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.cotton_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.cyngene.1.0.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.cyngene.1.1.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.cyrgene.1.0.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.cyrgene.1.1.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.cytogenetics.array_3.12.0.tar.gz", + 
"https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.drogene.1.0.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.drogene.1.1.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.drosgenome1_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.drosophila.2_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.e.coli.2_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.ecoli_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.ecoli.asv2_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.elegene.1.0.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.elegene.1.1.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.equgene.1.0.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.equgene.1.1.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.feinberg.hg18.me.hx1_0.99.3.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.feinberg.mm8.me.hx1_0.99.3.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.felgene.1.0.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.felgene.1.1.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.fingene.1.0.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.fingene.1.1.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.genomewidesnp.5_3.14.1.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.genomewidesnp.6_3.14.1.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.guigene.1.0.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.guigene.1.1.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hc.g110_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hg.focus_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hg.u133.plus.2_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hg.u133a_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hg.u133a.2_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hg.u133a.tag_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hg.u133b_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hg.u219_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hg.u95a_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hg.u95av2_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hg.u95b_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hg.u95c_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hg.u95d_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hg.u95e_3.12.0.tar.gz", + 
"https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hg18.60mer.expr_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.ht.hg.u133.plus.pm_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.ht.hg.u133a_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.ht.mg.430a_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hta.2.0_3.12.2.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hu6800_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.huex.1.0.st.v2_3.14.1.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hugene.1.0.st.v1_3.14.1.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hugene.1.1.st.v1_3.14.1.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hugene.2.0.st_3.14.1.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hugene.2.1.st_3.14.1.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.maize_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mapping250k.nsp_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mapping250k.sty_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mapping50k.hind240_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mapping50k.xba240_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.margene.1.0.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.margene.1.1.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.medgene.1.0.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.medgene.1.1.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.medicago_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mg.u74a_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mg.u74av2_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mg.u74b_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mg.u74bv2_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mg.u74c_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mg.u74cv2_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mirna.1.0_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mirna.2.0_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mirna.3.0_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mirna.3.1_3.8.1.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mirna.4.0_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.moe430a_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.moe430b_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.moex.1.0.st.v1_3.14.1.tar.gz", + 
"https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mogene.1.0.st.v1_3.14.1.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mogene.1.1.st.v1_3.14.1.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mogene.2.0.st_3.14.1.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mogene.2.1.st_3.14.1.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mouse430.2_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mouse430a.2_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mta.1.0_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mu11ksuba_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mu11ksubb_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.nugo.hs1a520180_3.4.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.nugo.mm1a520177_3.4.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.ovigene.1.0.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.ovigene.1.1.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.pae.g1a_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.plasmodium.anopheles_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.poplar_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.porcine_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.porgene.1.0.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.porgene.1.1.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rabgene.1.0.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rabgene.1.1.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rae230a_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rae230b_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.raex.1.0.st.v1_3.14.1.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.ragene.1.0.st.v1_3.14.1.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.ragene.1.1.st.v1_3.14.1.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.ragene.2.0.st_3.14.1.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.ragene.2.1.st_3.14.1.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rat230.2_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rcngene.1.0.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rcngene.1.1.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rg.u34a_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rg.u34b_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rg.u34c_3.12.0.tar.gz", + 
"https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rhegene.1.0.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rhegene.1.1.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rhesus_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rice_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rjpgene.1.0.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rjpgene.1.1.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rn.u34_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rta.1.0_3.12.2.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rusgene.1.0.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rusgene.1.1.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.s.aureus_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.soybean_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.soygene.1.0.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.soygene.1.1.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.sugar.cane_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.tomato_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.u133.x3p_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.vitis.vinifera_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.wheat_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.x.laevis.2_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.x.tropicalis_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.xenopus.laevis_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.yeast.2_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.yg.s98_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.zebgene.1.0.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.zebgene.1.1.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.zebrafish_3.12.0.tar.gz" +) +install_with_url(pd_annotation_pkgs) + +# Load this libraries because apparently just installing it isn't +# enough to verify that the correct versions of dependencies are installed. 
+library("foreach") diff --git a/workers/R_dependencies/affymetrix/dependencies.R b/workers/R/dependencies/affymetrix/cran/dependencies.R similarity index 100% rename from workers/R_dependencies/affymetrix/dependencies.R rename to workers/R/dependencies/affymetrix/cran/dependencies.R diff --git a/workers/R_dependencies/affymetrix/packages.txt b/workers/R/dependencies/affymetrix/cran/packages.txt similarity index 100% rename from workers/R_dependencies/affymetrix/packages.txt rename to workers/R/dependencies/affymetrix/cran/packages.txt diff --git a/workers/R_dependencies/affymetrix/versions.tsv b/workers/R/dependencies/affymetrix/cran/versions.tsv similarity index 100% rename from workers/R_dependencies/affymetrix/versions.tsv rename to workers/R/dependencies/affymetrix/cran/versions.tsv diff --git a/workers/R/dependencies/affymetrix/install_affy_only.R b/workers/R/dependencies/affymetrix/install_affy_only.R new file mode 100644 index 000000000..2bce94bc4 --- /dev/null +++ b/workers/R/dependencies/affymetrix/install_affy_only.R @@ -0,0 +1,38 @@ +# Turn warnings into errors because biocLite throws warnings instead +# of error if it fails to install something. +options(warn = 2) +options(repos = structure(c(CRAN = "https://cloud.r-project.org"))) +options(Ncpus = parallel::detectCores()) + +# Use devtools::install_version() to install packages in cran. +devtools::install_version("dplyr", version = "1.0.0") +devtools::install_version("tidyr", version = "1.1.0") +devtools::install_version("ff", version = "2.2-14") +devtools::install_version("locfit", version = "1.5-9.4") + +# Helper function that installs a list of packages using the input URLs +install_with_url <- function(urls) { + pkg_ids <- devtools::install_url(urls) + if (any(is.na(pkg_ids))) { + pkg_fails <- paste(urls[is.na(pkg_ids)], collapse = "; ") + stop(paste("Failed to install package(s):", pkg_fails)) + } + return(pkg_ids) +} + +bioc_pkgs <- c( + "https://bioconductor.org/packages/3.11/bioc/src/contrib/oligoClasses_1.50.4.tar.gz", + "https://bioconductor.org/packages/3.11/bioc/src/contrib/oligo_1.52.1.tar.gz", + "https://bioconductor.org/packages/3.11/bioc/src/contrib/GEOquery_2.56.0.tar.gz", + "https://bioconductor.org/packages/3.11/bioc/src/contrib/SCAN.UPC_2.30.0.tar.gz", + "https://bioconductor.org/packages/3.11/bioc/src/contrib/affy_1.66.0.tar.gz", + "https://bioconductor.org/packages/3.11/bioc/src/contrib/affyio_1.58.0.tar.gz", + "https://bioconductor.org/packages/3.11/bioc/src/contrib/AnnotationDbi_1.50.3.tar.gz", + "https://bioconductor.org/packages/3.11/bioc/src/contrib/zlibbioc_1.34.0.tar.gz", + "https://bioconductor.org/packages/3.11/bioc/src/contrib/preprocessCore_1.50.0.tar.gz", + "https://bioconductor.org/packages/3.11/bioc/src/contrib/genefilter_1.70.0.tar.gz", + "https://bioconductor.org/packages/3.11/bioc/src/contrib/sva_3.36.0.tar.gz", + "https://bioconductor.org/packages/3.11/bioc/src/contrib/tximport_1.16.1.tar.gz", + "https://bioconductor.org/packages/3.11/bioc/src/contrib/limma_3.44.3.tar.gz" +) +install_with_url(bioc_pkgs) diff --git a/workers/install_ensg_pkgs.R b/workers/R/dependencies/affymetrix/install_ensg_pkgs.R similarity index 89% rename from workers/install_ensg_pkgs.R rename to workers/R/dependencies/affymetrix/install_ensg_pkgs.R index 3f3b5bf08..5de25a262 100644 --- a/workers/install_ensg_pkgs.R +++ b/workers/R/dependencies/affymetrix/install_ensg_pkgs.R @@ -1,4 +1,4 @@ -options(Ncpus=parallel::detectCores()) +options(Ncpus = parallel::detectCores()) install.packages("xml2") library("xml2") 
ensg_url <- "http://brainarray.mbni.med.umich.edu/Brainarray/Database/CustomCDF/22.0.0/ensg.asp" @@ -39,8 +39,10 @@ lapply(data_rows, save_chip_pkg) # Write chips and pkg_urls to a tab-delimited file output_filename <- "/home/user/r_ensg_probe_pkgs.txt" -write.table(list(chips, pkg_urls), file=output_filename, quote=FALSE, - row.names=FALSE, col.names=FALSE, sep="\t") +write.table(list(chips, pkg_urls), + file = output_filename, quote = FALSE, + row.names = FALSE, col.names = FALSE, sep = "\t" +) # Install these ensg packages lapply(pkg_urls, devtools::install_url) diff --git a/workers/illumina_dependencies.R b/workers/R/dependencies/illumina/bioc/dependencies.R similarity index 100% rename from workers/illumina_dependencies.R rename to workers/R/dependencies/illumina/bioc/dependencies.R diff --git a/workers/R_dependencies/illumina/dependencies.R b/workers/R/dependencies/illumina/cran/dependencies.R similarity index 100% rename from workers/R_dependencies/illumina/dependencies.R rename to workers/R/dependencies/illumina/cran/dependencies.R diff --git a/workers/R_dependencies/illumina/packages.txt b/workers/R/dependencies/illumina/cran/packages.txt similarity index 100% rename from workers/R_dependencies/illumina/packages.txt rename to workers/R/dependencies/illumina/cran/packages.txt diff --git a/workers/R_dependencies/illumina/versions.tsv b/workers/R/dependencies/illumina/cran/versions.tsv similarity index 100% rename from workers/R_dependencies/illumina/versions.tsv rename to workers/R/dependencies/illumina/cran/versions.tsv diff --git a/workers/install_bioc.R b/workers/R/dependencies/install_bioc.R similarity index 58% rename from workers/install_bioc.R rename to workers/R/dependencies/install_bioc.R index c51d36988..3eb5aa29a 100644 --- a/workers/install_bioc.R +++ b/workers/R/dependencies/install_bioc.R @@ -1,12 +1,12 @@ # Turn warnings into errors because biocLite throws warnings instead # of error if it fails to install something. -options(warn=2) -options(repos=structure(c(CRAN="https://cloud.r-project.org"))) -options(Ncpus=parallel::detectCores()) +options(warn = 2) +options(repos = structure(c(CRAN = "https://cloud.r-project.org"))) +options(Ncpus = parallel::detectCores()) # Use devtools::install_version() to install packages in cran. -devtools::install_version('dplyr', version='1.0.0') -devtools::install_version('tidyr', version='1.1.0') +devtools::install_version("dplyr", version = "1.0.0") +devtools::install_version("tidyr", version = "1.1.0") # devtools::install_url() requires BiocInstaller # install.packages('https://bioconductor.org/packages/3.6/bioc/src/contrib/BiocInstaller_1.28.0.tar.gz') diff --git a/workers/install_downloader_R_only.R b/workers/R/dependencies/install_downloader_R_only.R similarity index 54% rename from workers/install_downloader_R_only.R rename to workers/R/dependencies/install_downloader_R_only.R index 02feb6275..85eb866b1 100644 --- a/workers/install_downloader_R_only.R +++ b/workers/R/dependencies/install_downloader_R_only.R @@ -1,8 +1,8 @@ # Turn warnings into errors because biocLite throws warnings instead # of error if it fails to install something. 
-options(warn=2) -options(repos=structure(c(CRAN="https://cloud.r-project.org"))) -options(Ncpus=parallel::detectCores()) +options(warn = 2) +options(repos = structure(c(CRAN = "https://cloud.r-project.org"))) +options(Ncpus = parallel::detectCores()) # Bioconductor packages, installed by devtools::install_url() @@ -10,15 +10,15 @@ options(Ncpus=parallel::detectCores()) # Helper function that installs a list of packages using the input URLs install_with_url <- function(urls) { pkg_ids <- devtools::install_url(urls) - if(any(is.na(pkg_ids))) { + if (any(is.na(pkg_ids))) { pkg_fails <- paste(urls[is.na(pkg_ids)], collapse = "; ") - stop(paste("Failed to install package(s):", pkg_fails )) + stop(paste("Failed to install package(s):", pkg_fails)) } return(pkg_ids) } bioc_pkgs <- c( - 'https://bioconductor.org/packages/3.11/bioc/src/contrib/affyio_1.58.0.tar.gz', - 'https://bioconductor.org/packages/3.11/bioc/src/contrib/zlibbioc_1.34.0.tar.gz' + "https://bioconductor.org/packages/3.11/bioc/src/contrib/affyio_1.58.0.tar.gz", + "https://bioconductor.org/packages/3.11/bioc/src/contrib/zlibbioc_1.34.0.tar.gz" ) install_with_url(bioc_pkgs) diff --git a/workers/R_dependencies/no_op/dependencies.R b/workers/R/dependencies/no_op/cran/dependencies.R similarity index 100% rename from workers/R_dependencies/no_op/dependencies.R rename to workers/R/dependencies/no_op/cran/dependencies.R diff --git a/workers/R_dependencies/no_op/packages.txt b/workers/R/dependencies/no_op/cran/packages.txt similarity index 100% rename from workers/R_dependencies/no_op/packages.txt rename to workers/R/dependencies/no_op/cran/packages.txt diff --git a/workers/R_dependencies/no_op/versions.tsv b/workers/R/dependencies/no_op/cran/versions.tsv similarity index 100% rename from workers/R_dependencies/no_op/versions.tsv rename to workers/R/dependencies/no_op/cran/versions.tsv diff --git a/workers/R/dependencies/no_op/install_gene_convert.R b/workers/R/dependencies/no_op/install_gene_convert.R new file mode 100644 index 000000000..2ad4afb84 --- /dev/null +++ b/workers/R/dependencies/no_op/install_gene_convert.R @@ -0,0 +1,43 @@ +# Turn warnings into errors because biocLite throws warnings instead +# of error if it fails to install something. 
+options(warn = 2) +options(Ncpus = parallel::detectCores()) +options(repos = structure(c(CRAN = "https://cloud.r-project.org"))) + +# Helper function that installs a list of packages using the input URLs +install_with_url <- function(urls) { + pkg_ids <- devtools::install_url(urls) + if (any(is.na(pkg_ids))) { + pkg_fails <- paste(urls[is.na(pkg_ids)], collapse = "; ") + stop(paste("Failed to install package(s):", pkg_fails)) + } + return(pkg_ids) +} + +devtools::install_version("dplyr", version = "1.0.2") + +bioc_pkgs <- c( + "https://bioconductor.org/packages/3.11/bioc/src/contrib/AnnotationDbi_1.50.3.tar.gz" +) +install_with_url(bioc_pkgs) + +illumina_pkgs <- c( + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/illuminaHumanv1.db_1.26.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/illuminaHumanv2.db_1.26.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/illuminaHumanv3.db_1.26.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/illuminaHumanv4.db_1.26.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/illuminaMousev1.db_1.26.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/illuminaMousev1p1.db_1.26.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/illuminaMousev2.db_1.26.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/illuminaRatv1.db_1.26.0.tar.gz" +) +install_with_url(illumina_pkgs) + +# Load these libraries because apparently just installing them isn't +# enough to verify that they have complementary versions. +library("optparse") +library(data.table) +library("dplyr") +library("rlang") +library(lazyeval) +library(AnnotationDbi) diff --git a/workers/qn_dependencies.R b/workers/R/dependencies/qn/bioc/dependencies.R similarity index 61% rename from workers/qn_dependencies.R rename to workers/R/dependencies/qn/bioc/dependencies.R index 8238bb7a4..2abd45f49 100644 --- a/workers/qn_dependencies.R +++ b/workers/R/dependencies/qn/bioc/dependencies.R @@ -1,19 +1,19 @@ -options(warn=2) -options(repos=structure(c(CRAN="https://cloud.r-project.org"))) -options(Ncpus=parallel::detectCores()) +options(warn = 2) +options(repos = structure(c(CRAN = "https://cloud.r-project.org"))) +options(Ncpus = parallel::detectCores()) # Helper function that installs a list of packages using the input URLs install_with_url <- function(urls) { pkg_ids <- devtools::install_url(urls) - if(any(is.na(pkg_ids))) { + if (any(is.na(pkg_ids))) { pkg_fails <- paste(urls[is.na(pkg_ids)], collapse = "; ") - stop(paste("Failed to install package(s):", pkg_fails )) + stop(paste("Failed to install package(s):", pkg_fails)) } return(pkg_ids) } bioc_pkgs <- c( - 'https://bioconductor.org/packages/3.11/bioc/src/contrib/preprocessCore_1.50.0.tar.gz' + "https://bioconductor.org/packages/3.11/bioc/src/contrib/preprocessCore_1.50.0.tar.gz" ) install_with_url(bioc_pkgs) diff --git a/workers/R_dependencies/qn/dependencies.R b/workers/R/dependencies/qn/cran/dependencies.R similarity index 100% rename from workers/R_dependencies/qn/dependencies.R rename to workers/R/dependencies/qn/cran/dependencies.R diff --git a/workers/R_dependencies/qn/packages.txt b/workers/R/dependencies/qn/cran/packages.txt similarity index 100% rename from workers/R_dependencies/qn/packages.txt rename to workers/R/dependencies/qn/cran/packages.txt diff --git a/workers/R_dependencies/qn/versions.tsv 
b/workers/R/dependencies/qn/cran/versions.tsv similarity index 100% rename from workers/R_dependencies/qn/versions.tsv rename to workers/R/dependencies/qn/cran/versions.tsv diff --git a/workers/R/dependencies/tximport/cran/dependencies.R b/workers/R/dependencies/tximport/cran/dependencies.R new file mode 100644 index 000000000..02c5ed011 --- /dev/null +++ b/workers/R/dependencies/tximport/cran/dependencies.R @@ -0,0 +1,25 @@ +# Generated from cranlock +options(warn = 2) +options(Ncpus = parallel::detectCores()) +options(repos = structure(c(CRAN = "https://cloud.r-project.org"))) +devtools::install_version("getopt", version = "1.20.3") +devtools::install_version("optparse", version = "1.4.4") +devtools::install_version("rjson", version = "0.2.19") +devtools::install_version("R6", version = "2.4.0") +devtools::install_version("pkgconfig", version = "2.0.2") +devtools::install_version("rlang", version = "0.4.0") +devtools::install_version("zeallot", version = "0.1.0") +devtools::install_version("backports", version = "1.1.4") +devtools::install_version("glue", version = "1.3.1") +devtools::install_version("digest", version = "0.6.19") +devtools::install_version("vctrs", version = "0.1.0") +devtools::install_version("hms", version = "0.4.2") +devtools::install_version("Rcpp", version = "1.0.1") +devtools::install_version("assertthat", version = "0.2.1") +devtools::install_version("crayon", version = "1.3.4") +devtools::install_version("cli", version = "1.1.0") +devtools::install_version("utf8", version = "1.1.4") +devtools::install_version("fansi", version = "0.4.0") +devtools::install_version("pillar", version = "1.4.2") +devtools::install_version("tibble", version = "2.1.3") +devtools::install_version("readr", version = "1.1.1") diff --git a/workers/R_dependencies/tximport/packages.txt b/workers/R/dependencies/tximport/cran/packages.txt similarity index 100% rename from workers/R_dependencies/tximport/packages.txt rename to workers/R/dependencies/tximport/cran/packages.txt diff --git a/workers/R_dependencies/tximport/versions.tsv b/workers/R/dependencies/tximport/cran/versions.tsv similarity index 100% rename from workers/R_dependencies/tximport/versions.tsv rename to workers/R/dependencies/tximport/cran/versions.tsv diff --git a/workers/R/dependencies/tximport/install_tximport.R b/workers/R/dependencies/tximport/install_tximport.R new file mode 100644 index 000000000..f28c97538 --- /dev/null +++ b/workers/R/dependencies/tximport/install_tximport.R @@ -0,0 +1,7 @@ +# Turn warnings into errors because biocLite throws warnings instead +# of error if it fails to install something. 
+options(warn = 2) +options(Ncpus = parallel::detectCores()) +options(repos = structure(c(CRAN = "https://cloud.r-project.org"))) + +devtools::install_url("https://bioconductor.org/packages/3.11/bioc/src/contrib/tximport_1.16.1.tar.gz") diff --git a/workers/R_dependencies/tximport/dependencies.R b/workers/R_dependencies/tximport/dependencies.R deleted file mode 100644 index 62fce5df5..000000000 --- a/workers/R_dependencies/tximport/dependencies.R +++ /dev/null @@ -1,25 +0,0 @@ -# Generated from cranlock -options(warn=2) -options(Ncpus=parallel::detectCores()) -options(repos=structure(c(CRAN="https://cloud.r-project.org"))) -devtools::install_version('getopt', version='1.20.3') -devtools::install_version('optparse', version='1.4.4') -devtools::install_version('rjson', version='0.2.19') -devtools::install_version('R6', version='2.4.0') -devtools::install_version('pkgconfig', version='2.0.2') -devtools::install_version('rlang', version='0.4.0') -devtools::install_version('zeallot', version='0.1.0') -devtools::install_version('backports', version='1.1.4') -devtools::install_version('glue', version='1.3.1') -devtools::install_version('digest', version='0.6.19') -devtools::install_version('vctrs', version='0.1.0') -devtools::install_version('hms', version='0.4.2') -devtools::install_version('Rcpp', version='1.0.1') -devtools::install_version('assertthat', version='0.2.1') -devtools::install_version('crayon', version='1.3.4') -devtools::install_version('cli', version='1.1.0') -devtools::install_version('utf8', version='1.1.4') -devtools::install_version('fansi', version='0.4.0') -devtools::install_version('pillar', version='1.4.2') -devtools::install_version('tibble', version='2.1.3') -devtools::install_version('readr', version='1.1.1') diff --git a/workers/affymetrix_dependencies.R b/workers/affymetrix_dependencies.R deleted file mode 100644 index 5fd501002..000000000 --- a/workers/affymetrix_dependencies.R +++ /dev/null @@ -1,220 +0,0 @@ -# Turn warnings into errors because biocLite throws warnings instead # of error if it fails to install something. 
-options(warn=2) -options(repos=structure(c(CRAN="https://cloud.r-project.org"))) -options(Ncpus=parallel::detectCores()) - -# Bioconductor packages, installed by devtools::install_url() - -# Helper function that installs a list of packages using the input URLs -install_with_url <- function(urls) { - pkg_ids <- devtools::install_url(urls) - if(any(is.na(pkg_ids))) { - pkg_fails <- paste(urls[is.na(pkg_ids)], collapse = "; ") - stop(paste("Failed to install package(s):", pkg_fails )) - } - return(pkg_ids) -} - -devtools::install_version('dplyr', version='1.0.2') - -bioc_pkg_urls <- c( - 'https://bioconductor.org/packages/3.11/bioc/src/contrib/oligoClasses_1.50.4.tar.gz', - 'https://bioconductor.org/packages/3.11/bioc/src/contrib/oligo_1.52.1.tar.gz', - 'https://bioconductor.org/packages/3.11/bioc/src/contrib/GEOquery_2.56.0.tar.gz', - 'https://bioconductor.org/packages/3.11/bioc/src/contrib/SCAN.UPC_2.30.0.tar.gz', - 'https://bioconductor.org/packages/3.11/bioc/src/contrib/affy_1.66.0.tar.gz', - 'https://bioconductor.org/packages/3.11/bioc/src/contrib/affyio_1.58.0.tar.gz', - 'https://bioconductor.org/packages/3.11/bioc/src/contrib/AnnotationDbi_1.50.3.tar.gz', - 'https://bioconductor.org/packages/3.11/bioc/src/contrib/zlibbioc_1.34.0.tar.gz', - 'https://bioconductor.org/packages/3.11/bioc/src/contrib/preprocessCore_1.50.0.tar.gz', - 'https://bioconductor.org/packages/3.11/bioc/src/contrib/genefilter_1.70.0.tar.gz', - 'https://bioconductor.org/packages/3.11/bioc/src/contrib/sva_3.36.0.tar.gz', - 'https://bioconductor.org/packages/3.11/bioc/src/contrib/limma_3.44.3.tar.gz' -) -install_with_url(bioc_pkg_urls) - -# Invoke another R script to install BrainArray ensg packages -source("install_ensg_pkgs.R") - -# Install Bioconductor platform design (pd) packages -pd_experiment_pkgs <- c( - 'https://bioconductor.org/packages/3.11/data/experiment/src/contrib/pd.atdschip.tiling_0.26.0.tar.gz' -) -install_with_url(pd_experiment_pkgs) - -pd_annotation_pkgs <- c( - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.081229.hg18.promoter.medip.hx1_0.99.4.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.2006.07.18.hg18.refseq.promoter_1.8.1.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.2006.07.18.mm8.refseq.promoter_0.99.3.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.2006.10.31.rn34.refseq.promoter_0.99.3.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.ag_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.aragene.1.0.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.aragene.1.1.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.ath1.121501_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.barley1_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.bovgene.1.0.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.bovgene.1.1.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.bovine_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.bsubtilis_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.cangene.1.0.st_3.12.0.tar.gz', - 
'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.cangene.1.1.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.canine_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.canine.2_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.celegans_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.charm.hg18.example_0.99.4.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.chicken_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.chigene.1.0.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.chigene.1.1.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.chogene.2.0.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.chogene.2.1.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.citrus_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.clariom.d.human_3.14.1.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.clariom.s.human_3.14.1.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.clariom.s.human.ht_3.14.1.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.clariom.s.mouse_3.14.1.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.clariom.s.mouse.ht_3.14.1.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.clariom.s.rat_3.14.1.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.clariom.s.rat.ht_3.14.1.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.cotton_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.cyngene.1.0.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.cyngene.1.1.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.cyrgene.1.0.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.cyrgene.1.1.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.cytogenetics.array_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.drogene.1.0.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.drogene.1.1.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.drosgenome1_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.drosophila.2_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.e.coli.2_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.ecoli_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.ecoli.asv2_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.elegene.1.0.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.elegene.1.1.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.equgene.1.0.st_3.12.0.tar.gz', - 
'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.equgene.1.1.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.feinberg.hg18.me.hx1_0.99.3.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.feinberg.mm8.me.hx1_0.99.3.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.felgene.1.0.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.felgene.1.1.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.fingene.1.0.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.fingene.1.1.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.genomewidesnp.5_3.14.1.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.genomewidesnp.6_3.14.1.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.guigene.1.0.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.guigene.1.1.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hc.g110_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hg.focus_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hg.u133.plus.2_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hg.u133a_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hg.u133a.2_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hg.u133a.tag_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hg.u133b_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hg.u219_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hg.u95a_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hg.u95av2_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hg.u95b_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hg.u95c_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hg.u95d_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hg.u95e_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hg18.60mer.expr_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.ht.hg.u133.plus.pm_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.ht.hg.u133a_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.ht.mg.430a_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hta.2.0_3.12.2.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hu6800_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.huex.1.0.st.v2_3.14.1.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hugene.1.0.st.v1_3.14.1.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hugene.1.1.st.v1_3.14.1.tar.gz', - 
'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hugene.2.0.st_3.14.1.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hugene.2.1.st_3.14.1.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.maize_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mapping250k.nsp_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mapping250k.sty_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mapping50k.hind240_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mapping50k.xba240_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.margene.1.0.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.margene.1.1.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.medgene.1.0.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.medgene.1.1.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.medicago_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mg.u74a_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mg.u74av2_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mg.u74b_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mg.u74bv2_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mg.u74c_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mg.u74cv2_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mirna.1.0_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mirna.2.0_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mirna.3.0_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mirna.3.1_3.8.1.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mirna.4.0_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.moe430a_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.moe430b_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.moex.1.0.st.v1_3.14.1.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mogene.1.0.st.v1_3.14.1.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mogene.1.1.st.v1_3.14.1.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mogene.2.0.st_3.14.1.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mogene.2.1.st_3.14.1.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mouse430.2_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mouse430a.2_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mta.1.0_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mu11ksuba_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mu11ksubb_3.12.0.tar.gz', - 
'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.nugo.hs1a520180_3.4.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.nugo.mm1a520177_3.4.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.ovigene.1.0.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.ovigene.1.1.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.pae.g1a_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.plasmodium.anopheles_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.poplar_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.porcine_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.porgene.1.0.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.porgene.1.1.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rabgene.1.0.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rabgene.1.1.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rae230a_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rae230b_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.raex.1.0.st.v1_3.14.1.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.ragene.1.0.st.v1_3.14.1.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.ragene.1.1.st.v1_3.14.1.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.ragene.2.0.st_3.14.1.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.ragene.2.1.st_3.14.1.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rat230.2_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rcngene.1.0.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rcngene.1.1.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rg.u34a_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rg.u34b_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rg.u34c_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rhegene.1.0.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rhegene.1.1.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rhesus_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rice_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rjpgene.1.0.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rjpgene.1.1.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rn.u34_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rta.1.0_3.12.2.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rusgene.1.0.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rusgene.1.1.st_3.12.0.tar.gz', - 
'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.s.aureus_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.soybean_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.soygene.1.0.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.soygene.1.1.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.sugar.cane_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.tomato_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.u133.x3p_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.vitis.vinifera_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.wheat_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.x.laevis.2_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.x.tropicalis_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.xenopus.laevis_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.yeast.2_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.yg.s98_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.zebgene.1.0.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.zebgene.1.1.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.zebrafish_3.12.0.tar.gz' -) -install_with_url(pd_annotation_pkgs) - -# Load this libraries because apparently just installing it isn't -# enough to verify that the correct versions of dependencies are installed. -library('foreach') diff --git a/workers/ccache.conf b/workers/ccache.conf new file mode 100644 index 000000000..cde43e665 --- /dev/null +++ b/workers/ccache.conf @@ -0,0 +1,5 @@ +max_size = 5.0G +# important for R CMD INSTALL *.tar.gz as tarballs are expanded freshly -> fresh ctime +sloppiness = include_file_ctime +# also important as the (temp.) directory name will differ +hash_dir = false diff --git a/workers/dockerfiles/Dockerfile.affymetrix b/workers/dockerfiles/Dockerfile.affymetrix index 151473ecf..3ee59ea1a 100644 --- a/workers/dockerfiles/Dockerfile.affymetrix +++ b/workers/dockerfiles/Dockerfile.affymetrix @@ -1,81 +1,30 @@ -FROM ubuntu:20.04 +FROM ccdlstaging/dr_worker_base:latest -# Source: https://github.com/thisbejim/Pyrebase/issues/87#issuecomment-354452082 -# For whatever reason this worked and 'en_US.UTF-8' did not. -ENV LANG C.UTF-8 +# Fail in case of an error at any stage in the pipe. +SHELL ["/bin/bash", "-o", "pipefail", "-c"] -# Prevent tzdata from prompting us for a timezone and hanging the build. -ENV DEBIAN_FRONTEND=noninteractive - -RUN apt-get update -qq -RUN apt-get install -y software-properties-common -RUN add-apt-repository ppa:apt-fast/stable - -RUN apt-get update -qq -RUN apt-get install -y apt-fast apt-transport-https - -# The packages related to R are somewhat weird, see the README for more details. -COPY workers/CRAN.gpg . 
-RUN apt-key add CRAN.gpg -RUN echo "deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/" \ - >> /etc/apt/sources.list.d/added_repos.list - -RUN apt-fast update -qq && apt-fast install -y \ - build-essential \ - cmake \ - curl \ - cython3 \ - ed \ - git \ - libcairo-dev \ - libcurl4-openssl-dev \ - libedit-dev \ - libpq-dev \ - libssl-dev \ - libxml2-dev \ - llvm-10-dev \ - lsb-release \ - mercurial \ - pkg-config \ - python3-pip \ - python3 \ - python3-dev \ - r-base-core \ - wget - -RUN rm CRAN.gpg -RUN apt-get clean; rm -rf /var/lib/apt/lists/* -RUN ln -s /usr/bin/llvm-config-10 /usr/bin/llvm-config - -RUN groupadd user && useradd --create-home --home-dir /home/user -g user user WORKDIR /home/user -ENV R_LIBS "/usr/local/lib/R/site-library" - -COPY common/install_devtools.R . -RUN Rscript install_devtools.R - -COPY workers/R_dependencies/affymetrix/dependencies.R . -RUN Rscript dependencies.R - -COPY workers/affymetrix_dependencies.R . -RUN Rscript affymetrix_dependencies.R +RUN pip3 install --ignore-installed --no-cache-dir --upgrade pip setuptools && \ + # Install this one here instead of via requirements.txt because not + # all processors need it. + pip3 install rpy2==3.4.5 -RUN pip3 install pip --upgrade -RUN pip3 install setuptools --upgrade +# Get the latest version from the dist directory. +COPY common/dist/data-refinery-common-* common/ +RUN pip3 install --ignore-installed --no-cache-dir \ + common/$(ls common -1 | sort --version-sort | tail -1) -# Install this one here instead of via requirements.txt because not -# all processors need it. -RUN pip3 install rpy2==3.4.5 COPY workers/data_refinery_workers/processors/requirements.txt . -RUN pip3 install -r requirements.txt +RUN pip3 install --ignore-installed --no-cache-dir -r requirements.txt -# Get the latest version from the dist directory. -COPY common/dist/data-refinery-common-* common/ -RUN pip3 install common/$(ls common -1 | sort --version-sort | tail -1) +COPY workers/R/dependencies/affymetrix/cran/dependencies.R dependencies.R +RUN Rscript dependencies.R + +COPY workers/R/dependencies/affymetrix/install_ensg_pkgs.R . -# Clear out the pip3 cache. -RUN rm -rf /root/.cache +COPY workers/R/dependencies/affymetrix/bioc/dependencies.R dependencies.R +RUN Rscript dependencies.R ARG SYSTEM_VERSION @@ -86,6 +35,7 @@ USER user COPY .boto .boto COPY config/ config/ COPY workers/ . -COPY workers/install_ensg_pkgs.R . + +RUN ccache -s ENTRYPOINT [] diff --git a/workers/dockerfiles/Dockerfile.affymetrix_local b/workers/dockerfiles/Dockerfile.affymetrix_local index 9a37692e6..3d7eff18f 100644 --- a/workers/dockerfiles/Dockerfile.affymetrix_local +++ b/workers/dockerfiles/Dockerfile.affymetrix_local @@ -1,14 +1,18 @@ FROM ccdlstaging/dr_affymetrix:latest -USER root +# Fail in case of an error at any stage in the pipe. +SHELL ["/bin/bash", "-o", "pipefail", "-c"] + +WORKDIR /home/user # Remove the version of common already installed. -RUN rm -r common/ -RUN pip3 uninstall -y data_refinery_common +RUN rm -r common && \ + pip3 uninstall -y data_refinery_common # Get the latest version from the dist directory. 
COPY common/dist/data-refinery-common-* common/ -RUN pip3 install common/$(ls common -1 | sort --version-sort | tail -1) +RUN pip3 install --ignore-installed --no-cache-dir \ + common/$(ls common -1 | sort --version-sort | tail -1) ARG SYSTEM_VERSION diff --git a/workers/dockerfiles/Dockerfile.compendia b/workers/dockerfiles/Dockerfile.compendia index 2c6a38784..82e14df01 100644 --- a/workers/dockerfiles/Dockerfile.compendia +++ b/workers/dockerfiles/Dockerfile.compendia @@ -3,102 +3,98 @@ FROM nvidia/cuda:11.8.0-runtime-ubuntu18.04 # This is very similar to the `smasher` image, but comes with OpenBLAS and some # of the other libraries required for fancyimpute. +# Fail in case of an error at any stage in the pipe. +SHELL ["/bin/bash", "-o", "pipefail", "-c"] + +WORKDIR /home/user + # Prevent tzdata from prompting us for a timezone and hanging the build. ENV DEBIAN_FRONTEND=noninteractive # Source: https://github.com/thisbejim/Pyrebase/issues/87#issuecomment-354452082 # For whatever reason this worked and 'en_US.UTF-8' did not. -ENV LANG C.UTF-8 - -# RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub -# via https://github.com/ilikenwf/apt-fast/issues/85#issuecomment-261640099 -RUN echo debconf apt-fast/maxdownloads string 16 | debconf-set-selections -RUN echo debconf apt-fast/dlflag boolean true | debconf-set-selections -RUN echo debconf apt-fast/aptmanager string apt-get | debconf-set-selections -RUN echo 'tzdata tzdata/Areas select Etc' | debconf-set-selections -RUN echo 'tzdata tzdata/Zones/Etc select UTC' | debconf-set-selections - -RUN apt-get update -qq -RUN apt-get install -y software-properties-common -RUN add-apt-repository ppa:apt-fast/stable -RUN add-apt-repository ppa:deadsnakes/ppa -RUN add-apt-repository ppa:savoury1/llvm-defaults-10 - -RUN apt-get update -qq -RUN apt-get install -y apt-fast apt-transport-https tzdata +ENV LANG=C.UTF-8 COPY workers/CRAN.gpg . -RUN apt-key add CRAN.gpg -RUN echo "deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/" \ - >> /etc/apt/sources.list.d/added_repos.list - -RUN apt-fast update -qq && apt-fast install -y \ - build-essential \ - cmake \ - curl \ - cython3 \ - ed \ - gfortran \ - git \ - libcairo-dev \ - libcurl4-openssl-dev \ - libedit-dev \ - libblas-dev \ - liblapack-dev \ - libpq-dev \ - libssl-dev \ - libxml2-dev \ - llvm-10-dev \ - lsb-release \ - mercurial \ - pkg-config \ - python3-pip \ - python3.8 \ - python3.8-dev \ - r-base-core \ - wget - -RUN rm CRAN.gpg -RUN apt-get clean; rm -rf /var/lib/apt/lists/* -RUN ln -s /usr/bin/llvm-config-10 /usr/bin/llvm-config -RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1 - -RUN groupadd user && useradd --create-home --home-dir /home/user -g user user -WORKDIR /home/user - -RUN wget https://bitbucket.org/ariya/phantomjs/downloads/phantomjs-2.1.1-linux-x86_64.tar.bz2 -RUN tar xvjf phantomjs-2.1.1-linux-x86_64.tar.bz2 -C /usr/local/share/ -RUN ln -s /usr/local/share/phantomjs-2.1.1-linux-x86_64/bin/phantomjs /usr/local/bin/ -# We need a few special packages for QN -ENV R_LIBS "/usr/local/lib/R/site-library" - -COPY common/install_devtools.R . 
+# RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub +# via https://github.com/ilikenwf/apt-fast/issues/85#issuecomment-261640099 +RUN echo debconf apt-fast/maxdownloads string 16 | debconf-set-selections && \ + echo debconf apt-fast/dlflag boolean true | debconf-set-selections && \ + echo debconf apt-fast/aptmanager string apt-get | debconf-set-selections && \ + echo 'tzdata tzdata/Areas select Etc' | debconf-set-selections && \ + echo 'tzdata tzdata/Zones/Etc select UTC' | debconf-set-selections && \ + apt-get update -qq && \ + apt-get install --no-install-recommends -y software-properties-common && \ + add-apt-repository ppa:apt-fast/stable && \ + add-apt-repository ppa:deadsnakes/ppa && \ + add-apt-repository ppa:savoury1/llvm-defaults-10 && \ + apt-get update -qq && \ + apt-get install --no-install-recommends -y apt-fast apt-transport-https gpg-agent tzdata && \ + apt-key add CRAN.gpg && \ + echo "deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/" \ + >> /etc/apt/sources.list.d/added_repos.list && \ + apt-fast update -qq && \ + apt-fast install -y \ + build-essential \ + ccache \ + cmake \ + curl \ + cython3 \ + ed \ + gfortran \ + git \ + libblas-dev \ + libcairo-dev \ + libcurl4-openssl-dev \ + libedit-dev \ + liblapack-dev \ + libpq-dev \ + libssl-dev \ + libxml2-dev \ + llvm-10-dev \ + lsb-release \ + mercurial \ + pkg-config \ + python3-pip \ + python3.8 \ + python3.8-dev \ + r-base-core \ + wget && \ + rm CRAN.gpg && \ + apt-get clean; rm -rf /var/lib/apt/lists/* && \ + ln -s /usr/bin/llvm-config-10 /usr/bin/llvm-config & \ + update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1 && \ + groupadd user && useradd --create-home --home-dir /home/user -g user user && \ + wget -q https://bitbucket.org/ariya/phantomjs/downloads/phantomjs-2.1.1-linux-x86_64.tar.bz2 && \ + tar xvjf phantomjs-2.1.1-linux-x86_64.tar.bz2 -C /usr/local/share/ && \ + ln -s /usr/local/share/phantomjs-2.1.1-linux-x86_64/bin/phantomjs /usr/local/bin/ + +# We need a few special packages for QN. +ENV R_LIBS="/usr/local/lib/R/site-library" + +COPY common/R/install_devtools.R . RUN Rscript install_devtools.R -COPY workers/R_dependencies/qn/dependencies.R . +COPY workers/R/dependencies/qn/cran/dependencies.R . RUN Rscript dependencies.R -COPY workers/qn_dependencies.R . -RUN Rscript qn_dependencies.R +COPY workers/R/dependencies/qn/bioc/dependencies.R . +RUN Rscript dependencies.R # End QN-specific -RUN pip3 install --upgrade pip -# Smasher-specific requirements -RUN pip3 install --ignore-installed numpy scipy matplotlib pandas==0.25.3 scikit-learn sympy nose rpy2===3.4.5 tzlocal fancySVD -# End smasher-specific - -COPY workers/data_refinery_workers/processors/requirements.txt . -RUN pip3 install --ignore-installed -r requirements.txt - -RUN pip3 install --ignore-installed numpy==1.16.0 # Fix a downgrade - # Get the latest version from the dist directory. COPY common/dist/data-refinery-common-* common/ -RUN pip3 install common/$(ls common -1 | sort --version-sort | tail -1) +RUN pip3 install --ignore-installed --no-cache-dir \ + common/$(ls common -1 | sort --version-sort | tail -1) -# Clear out the pip3 cache. -RUN rm -rf /root/.cache +RUN pip3 install --ignore-installed --no-cache-dir --upgrade pip && \ + pip3 install --ignore-installed --no-cache-dir numpy scipy matplotlib \ + pandas==0.25.3 scikit-learn sympy nose rpy2===3.4.5 tzlocal fancySVD + +COPY workers/data_refinery_workers/processors/requirements.txt . 
+RUN pip3 install --ignore-installed --no-cache-dir -r requirements.txt && \ + pip3 install --ignore-installed --no-cache-dir numpy==1.16.0 ARG SYSTEM_VERSION diff --git a/workers/dockerfiles/Dockerfile.downloaders b/workers/dockerfiles/Dockerfile.downloaders index 1b3337325..3a5974491 100644 --- a/workers/dockerfiles/Dockerfile.downloaders +++ b/workers/dockerfiles/Dockerfile.downloaders @@ -1,64 +1,11 @@ -FROM ubuntu:18.04 +FROM ccdlstaging/dr_worker_base:latest -# Prevent tzdata from prompting us for a timezone and hanging the build. -ENV DEBIAN_FRONTEND=noninteractive +# Fail in case of an error at any stage in the pipe. +SHELL ["/bin/bash", "-o", "pipefail", "-c"] -# Source: https://github.com/thisbejim/Pyrebase/issues/87#issuecomment-354452082 -# For whatever reason this worked and 'en_US.UTF-8' did not. -ENV LANG C.UTF-8 - -RUN apt-get update -RUN apt-get install -y software-properties-common -RUN add-apt-repository ppa:apt-fast/stable -RUN add-apt-repository ppa:deadsnakes/ppa -RUN add-apt-repository ppa:savoury1/llvm-defaults-10 - -RUN apt-get update -qq -RUN apt-get install -y apt-fast apt-transport-https - -# The packages related to R are somewhat weird, see the README for more details. -COPY workers/CRAN.gpg . -RUN apt-key add CRAN.gpg -RUN echo "deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/" \ - >> /etc/apt/sources.list.d/added_repos.list - -RUN apt-fast update -qq && apt-fast install -y \ - build-essential \ - cmake \ - curl \ - cython3 \ - ed \ - git \ - libcairo-dev \ - libcurl4-openssl-dev \ - libedit-dev \ - libpq-dev \ - libssl-dev \ - libxml2-dev \ - llvm-10-dev \ - lsb-release \ - mercurial \ - pkg-config \ - python3-pip \ - python3.8 \ - python3.8-dev \ - r-base-core \ - wget - -RUN rm CRAN.gpg -RUN apt-get clean; rm -rf /var/lib/apt/lists/* -RUN ln -s /usr/bin/llvm-config-10 /usr/bin/llvm-config -RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1 - -RUN groupadd user && useradd --create-home --home-dir /home/user -g user user WORKDIR /home/user -ENV R_LIBS "/usr/local/lib/R/site-library" - -COPY common/install_devtools.R . -RUN Rscript install_devtools.R - -COPY workers/install_downloader_R_only.R . +COPY workers/R/dependencies/install_downloader_R_only.R . RUN Rscript install_downloader_R_only.R # Aspera will only install as the current user. @@ -67,29 +14,25 @@ USER user # Install Aspera. We have to install it using Holland Computing Center's conda # repo because download.asperasoft.com now returns 403s -RUN wget -q https://anaconda.org/HCC/aspera-cli/3.9.1/download/linux-64/aspera-cli-3.9.1-0.tar.bz2 -RUN [ "$(sha256sum aspera-cli-3.9.1-0.tar.bz2 | cut -d' ' -f1)" = 60a09a7f3795186954079869106aa89a64183b7be8e0da7cbbe9d57c66c9bcdb ] -RUN mkdir -p .aspera/cli -RUN tar xf aspera-cli-3.9.1-0.tar.bz2 -C .aspera/cli -RUN rm aspera-cli-3.9.1-0.tar.bz2 +RUN wget -q https://anaconda.org/HCC/aspera-cli/3.9.1/download/linux-64/aspera-cli-3.9.1-0.tar.bz2 && \ + [ "$(sha256sum aspera-cli-3.9.1-0.tar.bz2 | cut -d ' ' -f1)" = 60a09a7f3795186954079869106aa89a64183b7be8e0da7cbbe9d57c66c9bcdb ] && \ + mkdir -p .aspera/cli && \ + tar xf aspera-cli-3.9.1-0.tar.bz2 -C .aspera/cli && \ + rm aspera-cli-3.9.1-0.tar.bz2 # Now that we're done installing Aspera go back to being root for a bit. USER root -RUN pip3 install --upgrade pip -# Install this rpy2 here instead of via requirements.txt because -# pip-compile throws an error for it. -RUN pip3 install rpy2==3.4.5 - -COPY workers/data_refinery_workers/downloaders/requirements.txt . 
-RUN pip3 install --ignore-installed -r requirements.txt +RUN pip3 install --ignore-installed --no-cache-dir --upgrade pip && \ + pip3 install --ignore-installed --no-cache-dir rpy2==3.4.5 # Get the latest version from the dist directory. COPY common/dist/data-refinery-common-* common/ -RUN pip3 install common/$(ls common -1 | sort --version-sort | tail -1) +RUN pip3 install --ignore-installed --no-cache-dir \ + common/$(ls common -1 | sort --version-sort | tail -1) -# Clear out the pip3 cache. -RUN rm -rf /root/.cache +COPY workers/data_refinery_workers/downloaders/requirements.txt . +RUN pip3 install --ignore-installed --no-cache-dir -r requirements.txt ARG SYSTEM_VERSION diff --git a/workers/dockerfiles/Dockerfile.illumina b/workers/dockerfiles/Dockerfile.illumina index e4cc70268..949c22405 100644 --- a/workers/dockerfiles/Dockerfile.illumina +++ b/workers/dockerfiles/Dockerfile.illumina @@ -1,83 +1,25 @@ -FROM ubuntu:18.04 +FROM ccdlstaging/dr_worker_base:latest -# Prevent tzdata from prompting us for a timezone and hanging the build. -ENV DEBIAN_FRONTEND=noninteractive +# Fail in case of an error at any stage in the pipe. +SHELL ["/bin/bash", "-o", "pipefail", "-c"] -# Source: https://github.com/thisbejim/Pyrebase/issues/87#issuecomment-354452082 -# For whatever reason this worked and 'en_US.UTF-8' did not. -ENV LANG C.UTF-8 - -RUN apt-get update -RUN apt-get install -y software-properties-common -RUN add-apt-repository ppa:apt-fast/stable -RUN add-apt-repository ppa:deadsnakes/ppa -RUN add-apt-repository ppa:savoury1/llvm-defaults-10 - -RUN apt-get update -qq -RUN apt-get install -y apt-fast apt-transport-https - -# The packages related to R are somewhat weird, see the README for more details. -COPY workers/CRAN.gpg . -RUN apt-key add CRAN.gpg -RUN echo "deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/" \ - >> /etc/apt/sources.list.d/added_repos.list - -RUN apt-fast update -qq && apt-fast install -y \ - build-essential \ - cmake \ - curl \ - cython3 \ - ed \ - gfortran \ - git \ - libcairo-dev \ - libcurl4-openssl-dev \ - libedit-dev \ - libblas-dev \ - liblapack-dev \ - libpq-dev \ - libssl-dev \ - libxml2-dev \ - llvm-10-dev \ - lsb-release \ - mercurial \ - pkg-config \ - python3-pip \ - python3.8 \ - python3.8-dev \ - r-base-core \ - wget - -RUN rm CRAN.gpg -RUN apt-get clean; rm -rf /var/lib/apt/lists/* -RUN ln -s /usr/bin/llvm-config-10 /usr/bin/llvm-config -RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1 - -RUN groupadd user && useradd --create-home --home-dir /home/user -g user user WORKDIR /home/user -ENV R_LIBS "/usr/local/lib/R/site-library" - -COPY common/install_devtools.R . -RUN Rscript install_devtools.R - -COPY workers/R_dependencies/illumina/dependencies.R . +COPY workers/R/dependencies/illumina/cran/dependencies.R . RUN Rscript dependencies.R # These are for Illumina. -COPY workers/illumina_dependencies.R . -RUN Rscript illumina_dependencies.R - -RUN pip3 install --upgrade pip -COPY workers/data_refinery_workers/processors/requirements.txt . -RUN pip3 install --ignore-installed -r requirements.txt +COPY workers/R/dependencies/illumina/bioc/dependencies.R . +RUN Rscript dependencies.R # Get the latest version from the dist directory. COPY common/dist/data-refinery-common-* common/ -RUN pip3 install common/$(ls common -1 | sort --version-sort | tail -1) +RUN pip3 install --ignore-installed --no-cache-dir \ + common/$(ls common -1 | sort --version-sort | tail -1) -# Clear out the pip3 cache. 
-RUN rm -rf /root/.cache +RUN pip3 install --ignore-installed --no-cache-dir --upgrade pip +COPY workers/data_refinery_workers/processors/requirements.txt . +RUN pip3 install --ignore-installed -r requirements.txt ARG SYSTEM_VERSION diff --git a/workers/dockerfiles/Dockerfile.no_op b/workers/dockerfiles/Dockerfile.no_op index 98f35d772..a0cb7855e 100644 --- a/workers/dockerfiles/Dockerfile.no_op +++ b/workers/dockerfiles/Dockerfile.no_op @@ -1,94 +1,38 @@ -FROM ubuntu:18.04 +FROM ccdlstaging/dr_worker_base:latest -# Prevent tzdata from prompting us for a timezone and hanging the build. -ENV DEBIAN_FRONTEND=noninteractive +# Fail in case of an error at any stage in the pipe. +SHELL ["/bin/bash", "-o", "pipefail", "-c"] -# Source: https://github.com/thisbejim/Pyrebase/issues/87#issuecomment-354452082 -# For whatever reason this worked and 'en_US.UTF-8' did not. -ENV LANG C.UTF-8 - -RUN apt-get update -qq -RUN apt-get install -y software-properties-common - -RUN add-apt-repository ppa:apt-fast/stable -# deadsnakes packages new python versions for older Ubuntu releases -RUN add-apt-repository ppa:deadsnakes/ppa - -RUN apt-get update -qq -RUN apt-get -y install apt-fast - -# The packages related to R are somewhat weird, see the README for more details. -COPY workers/CRAN.gpg . -RUN apt-key add CRAN.gpg -RUN echo "deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/" \ - >> /etc/apt/sources.list.d/added_repos.list - -RUN apt-fast update -qq && apt-fast install -y \ - build-essential \ - cmake \ - curl \ - cython3 \ - ed \ - git \ - libcairo-dev \ - libcurl4-openssl-dev \ - libedit-dev \ - libfreetype6-dev \ - libpq-dev \ - libssl-dev \ - libxml2-dev \ - llvm-10-dev \ - lsb-release \ - mercurial \ - pkg-config \ - python3-pip \ - python3.8 \ - python3.8-dev \ - r-base-core \ - wget - -RUN rm CRAN.gpg -RUN apt-get clean; rm -rf /var/lib/apt/lists/* -RUN ln -s /usr/bin/llvm-config-10 /usr/bin/llvm-config -RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1 - -RUN groupadd user && useradd --create-home --home-dir /home/user -g user user WORKDIR /home/user -# Noop-specific requirements -ENV R_LIBS "/usr/local/lib/R/site-library" - -COPY common/install_devtools.R . -RUN Rscript install_devtools.R - -COPY workers/R_dependencies/no_op/dependencies.R . +COPY workers/R/dependencies/no_op/cran/dependencies.R . RUN Rscript dependencies.R -COPY workers/install_gene_convert.R . +COPY workers/R/dependencies/no_op/install_gene_convert.R . RUN Rscript install_gene_convert.R +# Noop-specific. RUN mkdir -p gene_indexes WORKDIR /home/user/gene_indexes ENV ID_REFINERY_URL https://zenodo.org/record/1410647/files/all_1536267482.zip -RUN curl -O $ID_REFINERY_URL -RUN echo $ID_REFINERY_URL > /etc/identifier_refinery_url -RUN unzip *.zip -RUN rm *.zip +RUN curl -O $ID_REFINERY_URL && \ + echo $ID_REFINERY_URL > /etc/identifier_refinery_url && \ + unzip *.zip && \ + rm *.zip +# End Noop-specific. + WORKDIR /home/user -# End Noop-specific -RUN pip3 install --upgrade pip -RUN pip3 install numpy +RUN pip3 install --ignore-installed --no-cache-dir --upgrade pip && \ + pip3 install --ignore-installed --no-cache-dir numpy COPY workers/data_refinery_workers/processors/requirements.txt . -RUN pip3 install --ignore-installed -r requirements.txt +RUN pip3 install --ignore-installed --no-cache-dir -r requirements.txt # Get the latest version from the dist directory. 
COPY common/dist/data-refinery-common-* common/ -RUN pip3 install common/$(ls common -1 | sort --version-sort | tail -1) - -# Clear out the pip3 cache. -RUN rm -rf /root/.cache +RUN pip3 install --ignore-installed --no-cache-dir \ + common/$(ls common -1 | sort --version-sort | tail -1) ARG SYSTEM_VERSION diff --git a/workers/dockerfiles/Dockerfile.salmon b/workers/dockerfiles/Dockerfile.salmon index 8bfbbe9a7..724ddac60 100644 --- a/workers/dockerfiles/Dockerfile.salmon +++ b/workers/dockerfiles/Dockerfile.salmon @@ -1,109 +1,59 @@ -FROM ubuntu:18.04 - -# Prevent tzdata from prompting us for a timezone and hanging the build. -ENV DEBIAN_FRONTEND=noninteractive - -# Source: https://github.com/thisbejim/Pyrebase/issues/87#issuecomment-354452082 -# For whatever reason this worked and 'en_US.UTF-8' did not. -ENV LANG C.UTF-8 - -RUN apt-get update -RUN apt-get install -y software-properties-common -RUN add-apt-repository ppa:apt-fast/stable -RUN add-apt-repository ppa:deadsnakes/ppa -RUN add-apt-repository ppa:savoury1/llvm-defaults-10 - -RUN apt-get update -qq -RUN apt-get install -y apt-fast apt-transport-https - -# The packages related to R are somewhat weird, see the README for more details. -COPY workers/CRAN.gpg . -RUN apt-key add CRAN.gpg -RUN echo "deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/" \ - >> /etc/apt/sources.list.d/added_repos.list - -RUN apt-fast update -qq && apt-fast install -y \ - build-essential \ - cmake \ - curl \ - cython3 \ - ed \ - git \ - libcairo-dev \ - libcurl4-openssl-dev \ - libedit-dev \ - libpq-dev \ - libssl-dev \ - libxml2-dev \ - llvm-10-dev \ - lsb-release \ - mercurial \ - pkg-config \ - python3-pip \ - python3.8 \ - python3.8-dev \ - r-base-core \ - wget - -RUN rm CRAN.gpg -RUN apt-get upgrade; apt-get clean; rm -rf /var/lib/apt/lists/* -RUN ln -s /usr/bin/llvm-config-10 /usr/bin/llvm-config -RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1 - -RUN groupadd user && useradd --create-home --home-dir /home/user -g user user +FROM ccdlstaging/dr_worker_base:latest + +# Fail in case of an error at any stage in the pipe. +SHELL ["/bin/bash", "-o", "pipefail", "-c"] + WORKDIR /home/user # Install Salmon - -# Tximport requires all experiments to be processed with the same version of Salmon to work https://github.com/AlexsLemonade/refinebio/issues/1496 +# Tximport requires all experiments to be processed with the same version of +# Salmon to work https://github.com/AlexsLemonade/refinebio/issues/1496. # This is something that should be considered when updating salmon, because # all samples from incomplete experiments must have salmon run on them again. 
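
Illustration only, not part of this patch: the comment above is why SALMON_VERSION stays pinned. tximport later aggregates every sample's Salmon quant.sf output into one matrix per experiment, so mixing Salmon versions within an experiment would mix incompatible quantifications. A minimal R sketch of that aggregation step, with hypothetical file paths and a hypothetical tx2gene mapping (not refinebio's actual layout):

# Minimal sketch of the tximport step the comment above is protecting;
# sample names, paths, and tx2gene.tsv are hypothetical placeholders.
library(tximport)
library(readr)

samples <- c("SRR000001", "SRR000002")             # hypothetical sample accessions
files <- file.path("quants", samples, "quant.sf")  # per-sample Salmon outputs
names(files) <- samples

# Transcript-to-gene mapping, assumed to be prepared elsewhere.
tx2gene <- read_tsv("tx2gene.tsv", col_names = c("TXNAME", "GENEID"))

# Every quant.sf aggregated here must come from a compatible Salmon version,
# hence the pinned SALMON_VERSION in this Dockerfile.
txi <- tximport(files, type = "salmon", tx2gene = tx2gene)
head(txi$counts)
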
ENV SALMON_VERSION 0.13.1 -RUN wget https://github.com/COMBINE-lab/salmon/releases/download/v${SALMON_VERSION}/Salmon-${SALMON_VERSION}_linux_x86_64.tar.gz -RUN mkdir Salmon-${SALMON_VERSION}_linux_x86_64 # On version 0.13.1 salmon was being extracted to a folder with an all lowercase name # the options `-C` and `--strip-components` allow us to specify the name for the resulting file -RUN tar -xzf Salmon-${SALMON_VERSION}_linux_x86_64.tar.gz -C Salmon-${SALMON_VERSION}_linux_x86_64 --strip-components 1 -# Create soft link `/usr/local/bin/salmon` that points to the actual program -RUN ln -sf `pwd`/Salmon-${SALMON_VERSION}_linux_x86_64/bin/salmon /usr/local/bin/ -RUN rm -f Salmon-${SALMON_VERSION}_linux_x86_64.tar.gz +RUN wget -q "https://github.com/COMBINE-lab/salmon/releases/download/v${SALMON_VERSION}/Salmon-${SALMON_VERSION}_linux_x86_64.tar.gz" && \ + mkdir "Salmon-${SALMON_VERSION}_linux_x86_64" && \ + tar -xzf "Salmon-${SALMON_VERSION}_linux_x86_64.tar.gz" \ + -C "Salmon-${SALMON_VERSION}_linux_x86_64" --strip-components 1 && \ + ln -sf "$(pwd)/Salmon-${SALMON_VERSION}_linux_x86_64/bin/salmon" \ + /usr/local/bin/ && \ + rm -f "Salmon-${SALMON_VERSION}_linux_x86_64.tar.gz" # End Salmon installation. -# Install R dependencies. -COPY common/install_devtools.R . -RUN Rscript install_devtools.R - -COPY workers/R_dependencies/tximport/dependencies.R tximport_dependencies.R +COPY workers/R/dependencies/tximport/cran/dependencies.R tximport_dependencies.R RUN Rscript tximport_dependencies.R # Install tximport. -COPY workers/install_tximport.R . +COPY workers/R/dependencies/tximport/install_tximport.R . RUN Rscript install_tximport.R -RUN pip3 install --upgrade pip -RUN pip3 install numpy - -COPY workers/data_refinery_workers/processors/requirements.txt . -RUN pip3 install --ignore-installed -r requirements.txt +RUN pip3 install --ignore-installed --no-cache-dir --upgrade pip && \ + pip3 install --ignore-installed --no-cache-dir numpy # Get the latest version from the dist directory. COPY common/dist/data-refinery-common-* common/ -RUN pip3 install common/$(ls common -1 | sort --version-sort | tail -1) +RUN pip3 install --ignore-installed --no-cache-dir \ + common/$(ls common -1 | sort --version-sort | tail -1) + +COPY workers/data_refinery_workers/processors/requirements.txt . +RUN pip3 install --ignore-installed --no-cache-dir -r requirements.txt # Install SalmonTools. -RUN git clone https://github.com/COMBINE-lab/SalmonTools.git && cd SalmonTools && git checkout 3e6654c2c10a5225498b623056993947fa688afc -RUN cd SalmonTools && cmake . -DCMAKE_INSTALL_PREFIX=/usr/local && make install -RUN rm -rf SalmonTools +RUN git clone https://github.com/COMBINE-lab/SalmonTools.git && \ + cd SalmonTools && \ + git checkout 3e6654c2c10a5225498b623056993947fa688afc && \ + cmake . -DCMAKE_INSTALL_PREFIX=/usr/local && \ + make install && \ + rm -rf SalmonTools # Install sra-tools. ENV SRA_VERSION 2.9.1 -RUN wget "https://ftp.ncbi.nlm.nih.gov/sra/sdk/${SRA_VERSION}/sratoolkit.${SRA_VERSION}-ubuntu64.tar.gz" && \ - tar zxfv sratoolkit.${SRA_VERSION}-ubuntu64.tar.gz && \ - cp -r sratoolkit.${SRA_VERSION}-ubuntu64/bin/* /usr/bin - -# Clear out the pip3 cache. 
-RUN rm -rf /root/.cache +RUN wget -q "https://ftp.ncbi.nlm.nih.gov/sra/sdk/${SRA_VERSION}/sratoolkit.${SRA_VERSION}-ubuntu64.tar.gz" && \ + tar zxfv "sratoolkit.${SRA_VERSION}-ubuntu64.tar.gz" && \ + cp -r "sratoolkit.${SRA_VERSION}-ubuntu64/bin/"* /usr/bin ARG SYSTEM_VERSION diff --git a/workers/dockerfiles/Dockerfile.smasher b/workers/dockerfiles/Dockerfile.smasher index 313ba2150..00ea02baa 100644 --- a/workers/dockerfiles/Dockerfile.smasher +++ b/workers/dockerfiles/Dockerfile.smasher @@ -1,88 +1,26 @@ -FROM ubuntu:18.04 +FROM ccdlstaging/dr_worker_base:latest -# Prevent tzdata from prompting us for a timezone and hanging the build. -ENV DEBIAN_FRONTEND=noninteractive +# Fail in case of an error at any stage in the pipe. +SHELL ["/bin/bash", "-o", "pipefail", "-c"] -# Source: https://github.com/thisbejim/Pyrebase/issues/87#issuecomment-354452082 -# For whatever reason this worked and 'en_US.UTF-8' did not. -ENV LANG C.UTF-8 - -RUN apt-get update -RUN apt-get install -y software-properties-common -RUN add-apt-repository ppa:apt-fast/stable -RUN add-apt-repository ppa:deadsnakes/ppa -RUN add-apt-repository ppa:savoury1/llvm-defaults-10 - -RUN apt-get update -qq -RUN apt-get install -y apt-fast apt-transport-https - -# The packages related to R are somewhat weird, see the README for more details. -COPY workers/CRAN.gpg . -RUN apt-key add CRAN.gpg -RUN echo "deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/" \ - >> /etc/apt/sources.list.d/added_repos.list - -RUN apt-fast update -qq && apt-fast install -y \ - build-essential \ - cmake \ - curl \ - cython3 \ - ed \ - gfortran \ - git \ - libcairo-dev \ - libcurl4-openssl-dev \ - libedit-dev \ - libblas-dev \ - liblapack-dev \ - libpq-dev \ - libssl-dev \ - libxml2-dev \ - llvm-10-dev \ - lsb-release \ - mercurial \ - pkg-config \ - python3-pip \ - python3.8 \ - python3.8-dev \ - r-base-core \ - wget - -RUN rm CRAN.gpg -RUN apt-get clean; rm -rf /var/lib/apt/lists/* -RUN ln -s /usr/bin/llvm-config-10 /usr/bin/llvm-config -RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1 - -RUN groupadd user && useradd --create-home --home-dir /home/user -g user user WORKDIR /home/user -# We need a few special packages for QN -ENV R_LIBS "/usr/local/lib/R/site-library" - -COPY common/install_devtools.R . -RUN Rscript install_devtools.R - -COPY workers/R_dependencies/qn/dependencies.R . -RUN Rscript dependencies.R - -COPY workers/qn_dependencies.R . -RUN Rscript qn_dependencies.R -# End QN-specific +RUN pip3 install --no-cache-dir --upgrade pip && \ + pip3 install --ignore-installed --no-cache-dir nose numpy rpy2==3.4.5 -RUN pip3 install --upgrade pip -# Smasher-specific requirements -RUN pip3 install --ignore-installed nose numpy rpy2==3.4.5 -# End smasher-specific +# Get the latest version from the dist directory. +COPY common/dist/data-refinery-common-* common/ +RUN pip3 install --ignore-installed --no-cache-dir \ + common/$(ls common -1 | sort --version-sort | tail -1) COPY workers/data_refinery_workers/processors/requirements.txt . -RUN pip3 install --ignore-installed -r requirements.txt +RUN pip3 install --ignore-installed --no-cache-dir -r requirements.txt -# Get the latest version from the dist directory. -COPY common/dist/data-refinery-common-* common/ -RUN pip3 install common/$(ls common -1 | sort --version-sort | tail -1) +COPY workers/R/dependencies/qn/cran/dependencies.R . +RUN Rscript dependencies.R -# Clear out the pip3 cache. 
-RUN rm -rf /root/.cache +COPY workers/R/dependencies/qn/bioc/dependencies.R . +RUN Rscript dependencies.R ARG SYSTEM_VERSION diff --git a/workers/dockerfiles/Dockerfile.transcriptome b/workers/dockerfiles/Dockerfile.transcriptome index d1ac0ea63..1c04e7390 100644 --- a/workers/dockerfiles/Dockerfile.transcriptome +++ b/workers/dockerfiles/Dockerfile.transcriptome @@ -1,87 +1,41 @@ -FROM ubuntu:18.04 +FROM ccdlstaging/dr_worker_base:latest -# Prevent tzdata from prompting us for a timezone and hanging the build. -ENV DEBIAN_FRONTEND=noninteractive +# Fail in case of an error at any stage in the pipe. +SHELL ["/bin/bash", "-o", "pipefail", "-c"] -# Source: https://github.com/thisbejim/Pyrebase/issues/87#issuecomment-354452082 -# For whatever reason this worked and 'en_US.UTF-8' did not. -ENV LANG C.UTF-8 - -RUN apt-get update -qq -RUN apt-get install -y software-properties-common -RUN add-apt-repository ppa:apt-fast/stable -RUN add-apt-repository ppa:deadsnakes/ppa -RUN add-apt-repository ppa:savoury1/llvm-defaults-10 - -RUN apt-get update -qq -RUN apt-get -y install apt-fast - -RUN apt-fast update -qq && apt-fast install -y \ - build-essential \ - curl \ - cython3 \ - ed \ - git \ - libcurl4-openssl-dev \ - libfreetype6-dev \ - libpq-dev \ - llvm-10-dev \ - pkg-config \ - python3-pip \ - python3.8 \ - python3.8-dev \ - wget \ - zlib1g-dev - -RUN rm CRAN.gpg -RUN apt-get clean; rm -rf /var/lib/apt/lists/* -RUN ln -s /usr/bin/llvm-config-10 /usr/bin/llvm-config -RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1 - -RUN groupadd user && useradd --create-home --home-dir /home/user -g user user WORKDIR /home/user # It's annoying that this can only be installed via git. -RUN git clone https://github.com/deweylab/RSEM.git -RUN cd RSEM && make install -RUN rm -rf RSEM - -# Install Salmon +RUN git clone https://github.com/deweylab/RSEM.git && \ + cd RSEM && make install && \ + rm -rf RSEM -# Tximport requires all experiments to be processed with the same version of Salmon to work https://github.com/AlexsLemonade/refinebio/issues/1496 +# Install Salmon. +# Tximport requires all experiments to be processed with the same version of +# Salmon to work https://github.com/AlexsLemonade/refinebio/issues/1496. # This is something that should be considered when updating salmon, because # all samples from incomplete experiments must have salmon run on them again. ENV SALMON_VERSION 0.13.1 -# Doesn't work: -# salmon: relocation error: /usr/local/bin/../lib/librt.so.1: symbol __vdso_clock_gettime, version GLIBC_PRIVATE not defined in file libc.so.6 with link time reference -# ENV SALMON_VERSION 0.10.0 - -# Binary releases moved to bioconda, doesn't work anymore. -# ENV SALMON_VERSION 0.10.2 - -RUN wget https://github.com/COMBINE-lab/salmon/releases/download/v${SALMON_VERSION}/Salmon-${SALMON_VERSION}_linux_x86_64.tar.gz -RUN tar -xzf Salmon-${SALMON_VERSION}_linux_x86_64.tar.gz - # Salmon can extract to a different directory than the name of the tar file. 
-RUN cp `tar -tzf Salmon-${SALMON_VERSION}_linux_x86_64.tar.gz | head -1 | cut -f1 -d"/"`/bin/salmon /usr/local/bin -RUN cp `tar -tzf Salmon-${SALMON_VERSION}_linux_x86_64.tar.gz | head -1 | cut -f1 -d"/"`/lib/* /usr/local/lib - -RUN rm -r Salmon* +RUN wget -q "https://github.com/COMBINE-lab/salmon/releases/download/v${SALMON_VERSION}/Salmon-${SALMON_VERSION}_linux_x86_64.tar.gz" && \ + tar -xzf "Salmon-${SALMON_VERSION}_linux_x86_64.tar.gz" && \ + cp "$(tar -tzf Salmon-${SALMON_VERSION}_linux_x86_64.tar.gz | \ + head -1 | cut -f1 -d '/')/bin/salmon" /usr/local/bin && \ + cp "$(tar -tzf Salmon-${SALMON_VERSION}_linux_x86_64.tar.gz | \ + head -1 | cut -f1 -d '/')/lib/"* /usr/local/lib/ && \ + rm -r Salmon* # End Salmon installation. -RUN pip3 install --upgrade pip -RUN pip3 install numpy +RUN pip3 install --ignore-installed --no-cache-dir --upgrade pip && \ + pip3 install --ignore-installed --no-cache-dir numpy -COPY workers/data_refinery_workers/processors/requirements.txt . -RUN pip3 install --ignore-installed -r requirements.txt - -# Get the latest version from the dist directory. COPY common/dist/data-refinery-common-* common/ -RUN pip3 install common/$(ls common -1 | sort --version-sort | tail -1) +RUN pip3 install --ignore-installed --no-cache-dir \ + common/$(ls common -1 | sort --version-sort | tail -1) -# Clear out the pip3 cache. -RUN rm -rf /root/.cache +COPY workers/data_refinery_workers/processors/requirements.txt . +RUN pip3 install --ignore-installed --no-cache-dir -r requirements.txt ARG SYSTEM_VERSION diff --git a/workers/dockerfiles/Dockerfile.worker_base b/workers/dockerfiles/Dockerfile.worker_base new file mode 100644 index 000000000..d8f1a0588 --- /dev/null +++ b/workers/dockerfiles/Dockerfile.worker_base @@ -0,0 +1,68 @@ +FROM ubuntu:18.04 + +# Fail in case of an error at any stage in the pipe. +SHELL ["/bin/bash", "-o", "pipefail", "-c"] + +WORKDIR /home/user + +# Prevent tzdata from prompting us for a timezone and hanging the build. +ENV DEBIAN_FRONTEND=noninteractive + +# Source: https://github.com/thisbejim/Pyrebase/issues/87#issuecomment-354452082 +# For whatever reason this worked and 'en_US.UTF-8' did not. +ENV LANG=C.UTF-8 + +RUN apt-get update && \ + apt-get install --no-install-recommends -y software-properties-common && \ + add-apt-repository ppa:apt-fast/stable && \ + add-apt-repository ppa:deadsnakes/ppa && \ + add-apt-repository ppa:savoury1/llvm-defaults-10 && \ + apt-get update -qq && \ + apt-get install --no-install-recommends -y apt-fast apt-transport-https gpg-agent + +# The packages related to R are somewhat weird, see the README for more details. +COPY workers/CRAN.gpg . +RUN apt-key add CRAN.gpg && \ + echo "deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/" \ + >> /etc/apt/sources.list.d/added_repos.list + +RUN apt-fast update -qq && apt-fast install -y \ + build-essential \ + ccache \ + cmake \ + curl \ + cython3 \ + ed \ + git \ + libcairo-dev \ + libcurl4-gnutls-dev \ + libedit-dev \ + libgit2-dev \ + libpq-dev \ + libssl-dev \ + libxml2-dev \ + llvm-10-dev \ + lsb-release \ + mercurial \ + pkg-config \ + python3-pip \ + python3.8 \ + python3.8-dev \ + r-base-core \ + wget && \ + rm CRAN.gpg && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* && \ + ln -s /usr/bin/llvm-config-10 /usr/bin/llvm-config && \ + update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1 && \ + groupadd user && useradd --create-home --home-dir /home/user -g user user + +# Set up ccache. 
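Note: the symlinks created just below put ccache ahead of c++/cc/g++/gcc on the PATH, so compiler invocations made while R packages are compiled from source are routed through the compiler cache. The contents of workers/ccache.conf are not shown in this patch; once an image is built, the cache can be inspected with the stock ccache CLI, which is exactly what Dockerfile.affymetrix does later in this series:

ccache --show-stats   # report the hit/miss counters for the build that just ran
ccache --clear        # wipe the cache once the statistics have been reviewed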
+COPY workers/ccache.conf /root/.ccache/ccache.conf +RUN for i in c++ cc g++ gcc; do ln -s /usr/bin/ccache /usr/local/bin/$i; done +ENV PATH="/usr/local/bin:${PATH}" + +# Pre-install dev tools. +ENV R_LIBS="/usr/local/lib/R/site-library" +COPY common/R/install_devtools.R . +RUN Rscript install_devtools.R diff --git a/workers/install_affy_only.R b/workers/install_affy_only.R deleted file mode 100644 index b488a0287..000000000 --- a/workers/install_affy_only.R +++ /dev/null @@ -1,38 +0,0 @@ -# Turn warnings into errors because biocLite throws warnings instead -# of error if it fails to install something. -options(warn=2) -options(repos=structure(c(CRAN="https://cloud.r-project.org"))) -options(Ncpus=parallel::detectCores()) - -# Use devtools::install_version() to install packages in cran. -devtools::install_version('dplyr', version='1.0.0') -devtools::install_version('tidyr', version='1.1.0') -devtools::install_version('ff', version='2.2-14') -devtools::install_version('locfit', version='1.5-9.4') - -# Helper function that installs a list of packages using the input URLs -install_with_url <- function(urls) { - pkg_ids <- devtools::install_url(urls) - if(any(is.na(pkg_ids))) { - pkg_fails <- paste(urls[is.na(pkg_ids)], collapse = "; ") - stop(paste("Failed to install package(s):", pkg_fails )) - } - return(pkg_ids) -} - -bioc_pkgs <- c( - 'https://bioconductor.org/packages/3.11/bioc/src/contrib/oligoClasses_1.50.4.tar.gz', - 'https://bioconductor.org/packages/3.11/bioc/src/contrib/oligo_1.52.1.tar.gz', - 'https://bioconductor.org/packages/3.11/bioc/src/contrib/GEOquery_2.56.0.tar.gz', - 'https://bioconductor.org/packages/3.11/bioc/src/contrib/SCAN.UPC_2.30.0.tar.gz', - 'https://bioconductor.org/packages/3.11/bioc/src/contrib/affy_1.66.0.tar.gz', - 'https://bioconductor.org/packages/3.11/bioc/src/contrib/affyio_1.58.0.tar.gz', - 'https://bioconductor.org/packages/3.11/bioc/src/contrib/AnnotationDbi_1.50.3.tar.gz', - 'https://bioconductor.org/packages/3.11/bioc/src/contrib/zlibbioc_1.34.0.tar.gz', - 'https://bioconductor.org/packages/3.11/bioc/src/contrib/preprocessCore_1.50.0.tar.gz', - 'https://bioconductor.org/packages/3.11/bioc/src/contrib/genefilter_1.70.0.tar.gz', - 'https://bioconductor.org/packages/3.11/bioc/src/contrib/sva_3.36.0.tar.gz', - 'https://bioconductor.org/packages/3.11/bioc/src/contrib/tximport_1.16.1.tar.gz', - 'https://bioconductor.org/packages/3.11/bioc/src/contrib/limma_3.44.3.tar.gz' -) -install_with_url(bioc_pkgs) diff --git a/workers/install_gene_convert.R b/workers/install_gene_convert.R deleted file mode 100644 index 5e053f9c6..000000000 --- a/workers/install_gene_convert.R +++ /dev/null @@ -1,43 +0,0 @@ -# Turn warnings into errors because biocLite throws warnings instead -# of error if it fails to install something. 
-options(warn=2) -options(Ncpus=parallel::detectCores()) -options(repos=structure(c(CRAN="https://cloud.r-project.org"))) - -# Helper function that installs a list of packages using the input URLs -install_with_url <- function(urls) { - pkg_ids <- devtools::install_url(urls) - if(any(is.na(pkg_ids))) { - pkg_fails <- paste(urls[is.na(pkg_ids)], collapse = "; ") - stop(paste("Failed to install package(s):", pkg_fails )) - } - return(pkg_ids) -} - -devtools::install_version('dplyr', version='1.0.2') - -bioc_pkgs <- c( - 'https://bioconductor.org/packages/3.11/bioc/src/contrib/AnnotationDbi_1.50.3.tar.gz' -) -install_with_url(bioc_pkgs) - -illumina_pkgs <- c( - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/illuminaHumanv1.db_1.26.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/illuminaHumanv2.db_1.26.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/illuminaHumanv3.db_1.26.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/illuminaHumanv4.db_1.26.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/illuminaMousev1.db_1.26.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/illuminaMousev1p1.db_1.26.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/illuminaMousev2.db_1.26.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/illuminaRatv1.db_1.26.0.tar.gz' -) -install_with_url(illumina_pkgs) - -# Load these libraries because apparently just installing them isn't -# enough to verify that they have complementary versions. -library("optparse") -library(data.table) -library("dplyr") -library("rlang") -library(lazyeval) -library(AnnotationDbi) diff --git a/workers/install_tximport.R b/workers/install_tximport.R deleted file mode 100644 index 2889fc630..000000000 --- a/workers/install_tximport.R +++ /dev/null @@ -1,7 +0,0 @@ -# Turn warnings into errors because biocLite throws warnings instead -# of error if it fails to install something. -options(warn=2) -options(Ncpus=parallel::detectCores()) -options(repos=structure(c(CRAN="https://cloud.r-project.org"))) - -devtools::install_url('https://bioconductor.org/packages/3.11/bioc/src/contrib/tximport_1.16.1.tar.gz') From 5f3b5625a0c13a72a9e191ff7c15255cb0fdd415 Mon Sep 17 00:00:00 2001 From: Arkadii Yakovets Date: Tue, 22 Nov 2022 13:45:21 -0800 Subject: [PATCH 20/24] Update docker images. 
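This consolidates the shared setup into a single base image: the old workers/dockerfiles/Dockerfile.worker_base is deleted, common/dockerfiles/Dockerfile.base is added in its place, and the per-worker images begin switching their FROM lines to the rebuilt ccdlstaging/dr_base image (smasher and transcriptome here, with downloaders, illumina, no_op and salmon following in the next commit), so each worker Dockerfile only carries its own R and Python layers. The implied build order is base first, workers second; roughly as below, assuming common/dockerfiles/Dockerfile.base is what produces the ccdlstaging/dr_base tag referenced in the diffs (the dr_smasher tag and the exact docker build invocations are illustrative assumptions, not taken from this patch):

# Build the shared base from the repository root, then a worker on top of it.
docker build -f common/dockerfiles/Dockerfile.base \
    -t ccdlstaging/dr_base:latest .
docker build -f workers/dockerfiles/Dockerfile.smasher \
    -t ccdlstaging/dr_smasher:latest .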
--- common/dockerfiles/Dockerfile.base | 65 +++++++++++ .../dependencies/illumina/bioc/dependencies.R | 36 +++---- workers/dockerfiles/Dockerfile.affymetrix | 41 ++++--- .../dockerfiles/Dockerfile.affymetrix_local | 3 +- workers/dockerfiles/Dockerfile.compendia | 102 +++++++++--------- workers/dockerfiles/Dockerfile.downloaders | 23 ++-- workers/dockerfiles/Dockerfile.illumina | 12 ++- workers/dockerfiles/Dockerfile.no_op | 23 ++-- workers/dockerfiles/Dockerfile.salmon | 28 ++--- workers/dockerfiles/Dockerfile.smasher | 27 ++--- workers/dockerfiles/Dockerfile.transcriptome | 20 ++-- workers/dockerfiles/Dockerfile.worker_base | 68 ------------ 12 files changed, 238 insertions(+), 210 deletions(-) create mode 100644 common/dockerfiles/Dockerfile.base delete mode 100644 workers/dockerfiles/Dockerfile.worker_base diff --git a/common/dockerfiles/Dockerfile.base b/common/dockerfiles/Dockerfile.base new file mode 100644 index 000000000..5b833cef9 --- /dev/null +++ b/common/dockerfiles/Dockerfile.base @@ -0,0 +1,65 @@ +FROM ubuntu:18.04 + +# Fail in case of an error at any stage in the pipe. +SHELL ["/bin/bash", "-o", "pipefail", "-c"] + +WORKDIR /home/user + +# Prevent tzdata from prompting us for a timezone and hanging the build. +ENV DEBIAN_FRONTEND=noninteractive + +# Source: https://github.com/thisbejim/Pyrebase/issues/87#issuecomment-354452082 +# For whatever reason this worked and 'en_US.UTF-8' did not. +ENV LANG=C.UTF-8 + +COPY workers/CRAN.gpg . +RUN apt-get update && \ + apt-get install --no-install-recommends -y software-properties-common && \ + add-apt-repository ppa:apt-fast/stable && \ + add-apt-repository ppa:deadsnakes/ppa && \ + add-apt-repository ppa:savoury1/llvm-defaults-10 && \ + apt-get update -qq && \ + apt-get install --no-install-recommends -y \ + apt-fast \ + apt-transport-https \ + gpg-agent && \ + apt-key add CRAN.gpg && \ + echo "deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/" \ + >> /etc/apt/sources.list.d/added_repos.list && \ + apt-fast update -qq && apt-fast install -y \ + build-essential \ + cmake \ + curl \ + cython3 \ + ed \ + git \ + libcairo-dev \ + libcurl4-gnutls-dev \ + libedit-dev \ + libgit2-dev \ + libpq-dev \ + libssl-dev \ + libxml2-dev \ + llvm-10-dev \ + lsb-release \ + mercurial \ + pkg-config \ + python3-pip \ + python3.8 \ + python3.8-dev \ + r-base-core \ + wget && \ + apt-get clean && \ + rm CRAN.gpg && \ + rm -rf /var/lib/apt/lists/* && \ + ln -s /usr/bin/llvm-config-10 /usr/bin/llvm-config && \ + update-alternatives --install \ + /usr/bin/python3 python3 /usr/bin/python3.8 1 && \ + groupadd user && \ + useradd --create-home --home-dir /home/user/ -g user user && \ + chown -R user /home/user/ + +# Pre-install dev tools. +ENV R_LIBS=/usr/local/lib/R/site-library +COPY common/R/install_devtools.R . 
+RUN Rscript install_devtools.R diff --git a/workers/R/dependencies/illumina/bioc/dependencies.R b/workers/R/dependencies/illumina/bioc/dependencies.R index b3e598f79..06aee98a5 100644 --- a/workers/R/dependencies/illumina/bioc/dependencies.R +++ b/workers/R/dependencies/illumina/bioc/dependencies.R @@ -1,36 +1,36 @@ -options(warn=2) -options(repos=structure(c(CRAN="https://cloud.r-project.org"))) -options(Ncpus=parallel::detectCores()) +options(warn = 2) +options(repos = structure(c(CRAN = "https://cloud.r-project.org"))) +options(Ncpus = parallel::detectCores()) # Helper function that installs a list of packages using the input URLs install_with_url <- function(urls) { pkg_ids <- devtools::install_url(urls) - if(any(is.na(pkg_ids))) { + if (any(is.na(pkg_ids))) { pkg_fails <- paste(urls[is.na(pkg_ids)], collapse = "; ") - stop(paste("Failed to install package(s):", pkg_fails )) + stop(paste("Failed to install package(s):", pkg_fails)) } return(pkg_ids) } -devtools::install_version('dplyr', version='1.0.2') +devtools::install_version("dplyr", version = "1.0.2") bioc_pkgs <- c( - 'https://bioconductor.org/packages/3.11/bioc/src/contrib/oligoClasses_1.50.4.tar.gz', - 'https://bioconductor.org/packages/3.11/bioc/src/contrib/oligo_1.52.1.tar.gz', - 'https://bioconductor.org/packages/3.11/bioc/src/contrib/AnnotationDbi_1.50.3.tar.gz', - 'https://bioconductor.org/packages/3.11/bioc/src/contrib/limma_3.44.3.tar.gz' + "https://bioconductor.org/packages/3.11/bioc/src/contrib/oligoClasses_1.50.4.tar.gz", + "https://bioconductor.org/packages/3.11/bioc/src/contrib/oligo_1.52.1.tar.gz", + "https://bioconductor.org/packages/3.11/bioc/src/contrib/AnnotationDbi_1.50.3.tar.gz", + "https://bioconductor.org/packages/3.11/bioc/src/contrib/limma_3.44.3.tar.gz" ) install_with_url(bioc_pkgs) illumina_pkgs <- c( - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/illuminaHumanv1.db_1.26.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/illuminaHumanv2.db_1.26.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/illuminaHumanv3.db_1.26.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/illuminaHumanv4.db_1.26.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/illuminaMousev1.db_1.26.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/illuminaMousev1p1.db_1.26.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/illuminaMousev2.db_1.26.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/illuminaRatv1.db_1.26.0.tar.gz' + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/illuminaHumanv1.db_1.26.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/illuminaHumanv2.db_1.26.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/illuminaHumanv3.db_1.26.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/illuminaHumanv4.db_1.26.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/illuminaMousev1.db_1.26.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/illuminaMousev1p1.db_1.26.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/illuminaMousev2.db_1.26.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/illuminaRatv1.db_1.26.0.tar.gz" ) install_with_url(illumina_pkgs) diff --git 
a/workers/dockerfiles/Dockerfile.affymetrix b/workers/dockerfiles/Dockerfile.affymetrix index 3ee59ea1a..e90300240 100644 --- a/workers/dockerfiles/Dockerfile.affymetrix +++ b/workers/dockerfiles/Dockerfile.affymetrix @@ -5,18 +5,12 @@ SHELL ["/bin/bash", "-o", "pipefail", "-c"] WORKDIR /home/user -RUN pip3 install --ignore-installed --no-cache-dir --upgrade pip setuptools && \ - # Install this one here instead of via requirements.txt because not - # all processors need it. - pip3 install rpy2==3.4.5 - -# Get the latest version from the dist directory. -COPY common/dist/data-refinery-common-* common/ -RUN pip3 install --ignore-installed --no-cache-dir \ - common/$(ls common -1 | sort --version-sort | tail -1) - -COPY workers/data_refinery_workers/processors/requirements.txt . -RUN pip3 install --ignore-installed --no-cache-dir -r requirements.txt +# Install and set up ccache. +RUN apt-get update && \ + apt-get install --no-install-recommends -y ccache && \ + for i in c++ cc g++ gcc; do ln -fs /usr/bin/ccache /usr/local/bin/$i; done +COPY workers/ccache.conf /root/.ccache/ccache.conf +ENV PATH="/usr/local/bin:${PATH}" COPY workers/R/dependencies/affymetrix/cran/dependencies.R dependencies.R RUN Rscript dependencies.R @@ -26,9 +20,28 @@ COPY workers/R/dependencies/affymetrix/install_ensg_pkgs.R . COPY workers/R/dependencies/affymetrix/bioc/dependencies.R dependencies.R RUN Rscript dependencies.R +RUN pip3 install --ignore-installed --upgrade pip setuptools && \ + # Install this one here instead of via requirements.txt because not + # all processors need it. + pip3 install --ignore-installed rpy2==3.4.5 + +# Get the latest version from the dist directory. +COPY common/dist/data-refinery-common-* common/ +RUN pip3 install --ignore-installed \ + common/$(ls common -1 | sort --version-sort | tail -1) + +COPY workers/data_refinery_workers/processors/requirements.txt . +RUN pip3 install --ignore-installed -r requirements.txt && \ + # Clear out the pip3 cache. + rm -r /root/.cache + ARG SYSTEM_VERSION +ENV SYSTEM_VERSION=$SYSTEM_VERSION -ENV SYSTEM_VERSION $SYSTEM_VERSION +# Print compiler cache stats (the cache hit ratio should be ~90%). +RUN ccache --show-stats && \ + # Clear out the ccache. + ccache --clear USER user @@ -36,6 +49,4 @@ COPY .boto .boto COPY config/ config/ COPY workers/ . -RUN ccache -s - ENTRYPOINT [] diff --git a/workers/dockerfiles/Dockerfile.affymetrix_local b/workers/dockerfiles/Dockerfile.affymetrix_local index 3d7eff18f..7f1af8ad6 100644 --- a/workers/dockerfiles/Dockerfile.affymetrix_local +++ b/workers/dockerfiles/Dockerfile.affymetrix_local @@ -15,8 +15,7 @@ RUN pip3 install --ignore-installed --no-cache-dir \ common/$(ls common -1 | sort --version-sort | tail -1) ARG SYSTEM_VERSION - -ENV SYSTEM_VERSION $SYSTEM_VERSION +ENV SYSTEM_VERSION=$SYSTEM_VERSION USER user diff --git a/workers/dockerfiles/Dockerfile.compendia b/workers/dockerfiles/Dockerfile.compendia index 82e14df01..f2afa185f 100644 --- a/workers/dockerfiles/Dockerfile.compendia +++ b/workers/dockerfiles/Dockerfile.compendia @@ -1,13 +1,11 @@ -FROM nvidia/cuda:11.8.0-runtime-ubuntu18.04 - # This is very similar to the `smasher` image, but comes with OpenBLAS and some # of the other libraries required for fancyimpute. +FROM nvidia/cuda:11.8.0-runtime-ubuntu18.04 + # Fail in case of an error at any stage in the pipe. SHELL ["/bin/bash", "-o", "pipefail", "-c"] -WORKDIR /home/user - # Prevent tzdata from prompting us for a timezone and hanging the build. 
ENV DEBIAN_FRONTEND=noninteractive @@ -33,45 +31,52 @@ RUN echo debconf apt-fast/maxdownloads string 16 | debconf-set-selections && \ apt-get install --no-install-recommends -y apt-fast apt-transport-https gpg-agent tzdata && \ apt-key add CRAN.gpg && \ echo "deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/" \ - >> /etc/apt/sources.list.d/added_repos.list && \ + >> /etc/apt/sources.list.d/added_repos.list && \ apt-fast update -qq && \ apt-fast install -y \ - build-essential \ - ccache \ - cmake \ - curl \ - cython3 \ - ed \ - gfortran \ - git \ - libblas-dev \ - libcairo-dev \ - libcurl4-openssl-dev \ - libedit-dev \ - liblapack-dev \ - libpq-dev \ - libssl-dev \ - libxml2-dev \ - llvm-10-dev \ - lsb-release \ - mercurial \ - pkg-config \ - python3-pip \ - python3.8 \ - python3.8-dev \ - r-base-core \ - wget && \ + build-essential \ + ccache \ + cmake \ + curl \ + cython3 \ + ed \ + gfortran \ + git \ + libblas-dev \ + libcairo-dev \ + libcurl4-openssl-dev \ + libedit-dev \ + liblapack-dev \ + libpq-dev \ + libssl-dev \ + libxml2-dev \ + llvm-10-dev \ + lsb-release \ + mercurial \ + pkg-config \ + python3-pip \ + python3.8 \ + python3.8-dev \ + r-base-core \ + wget && \ + apt-get clean && \ rm CRAN.gpg && \ - apt-get clean; rm -rf /var/lib/apt/lists/* && \ - ln -s /usr/bin/llvm-config-10 /usr/bin/llvm-config & \ - update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1 && \ - groupadd user && useradd --create-home --home-dir /home/user -g user user && \ - wget -q https://bitbucket.org/ariya/phantomjs/downloads/phantomjs-2.1.1-linux-x86_64.tar.bz2 && \ + rm -rf /var/lib/apt/lists/* && \ + ln -s /usr/bin/llvm-config-10 /usr/bin/llvm-config && \ + update-alternatives --install /usr/bin/python3 python3 \ + /usr/bin/python3.8 1 && \ + groupadd user && \ + useradd --create-home --home-dir /home/user -g user user + +WORKDIR /home/user +RUN wget -q https://bitbucket.org/ariya/phantomjs/downloads/\ +phantomjs-2.1.1-linux-x86_64.tar.bz2 && \ tar xvjf phantomjs-2.1.1-linux-x86_64.tar.bz2 -C /usr/local/share/ && \ - ln -s /usr/local/share/phantomjs-2.1.1-linux-x86_64/bin/phantomjs /usr/local/bin/ + ln -s /usr/local/share/phantomjs-2.1.1-linux-x86_64/bin/phantomjs \ + /usr/local/bin/ # We need a few special packages for QN. -ENV R_LIBS="/usr/local/lib/R/site-library" +ENV R_LIBS=/usr/local/lib/R/site-library COPY common/R/install_devtools.R . RUN Rscript install_devtools.R @@ -83,22 +88,23 @@ COPY workers/R/dependencies/qn/bioc/dependencies.R . RUN Rscript dependencies.R # End QN-specific +RUN pip3 install --ignore-installed --upgrade pip && \ + pip3 install --ignore-installed numpy scipy matplotlib \ + pandas==0.25.3 scikit-learn sympy nose rpy2===3.4.5 tzlocal fancySVD + +COPY workers/data_refinery_workers/processors/requirements.txt . +RUN pip3 install --ignore-installed -r requirements.txt && \ + # Pin setuptools as a workaround for + # https://github.com/pypa/setuptools/issues/3693 + pip3 install --ignore-installed setuptools==65.0.1 numpy==1.16.0 + # Get the latest version from the dist directory. 
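Note: the `$(ls common -1 | sort --version-sort | tail -1)` idiom used in the install lines around here depends on version sort rather than plain lexical sort, because lexically a 1.9.x artifact would outrank 1.10.x. For example (the version numbers are made up for illustration):

printf 'data-refinery-common-1.9.0\ndata-refinery-common-1.10.0\n' \
    | sort --version-sort | tail -1
# prints: data-refinery-common-1.10.0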
COPY common/dist/data-refinery-common-* common/ -RUN pip3 install --ignore-installed --no-cache-dir \ +RUN pip3 install --ignore-installed \ common/$(ls common -1 | sort --version-sort | tail -1) -RUN pip3 install --ignore-installed --no-cache-dir --upgrade pip && \ - pip3 install --ignore-installed --no-cache-dir numpy scipy matplotlib \ - pandas==0.25.3 scikit-learn sympy nose rpy2===3.4.5 tzlocal fancySVD - -COPY workers/data_refinery_workers/processors/requirements.txt . -RUN pip3 install --ignore-installed --no-cache-dir -r requirements.txt && \ - pip3 install --ignore-installed --no-cache-dir numpy==1.16.0 - ARG SYSTEM_VERSION - -ENV SYSTEM_VERSION $SYSTEM_VERSION +ENV SYSTEM_VERSION=$SYSTEM_VERSION USER user diff --git a/workers/dockerfiles/Dockerfile.downloaders b/workers/dockerfiles/Dockerfile.downloaders index 3a5974491..c65faf5dc 100644 --- a/workers/dockerfiles/Dockerfile.downloaders +++ b/workers/dockerfiles/Dockerfile.downloaders @@ -10,12 +10,16 @@ RUN Rscript install_downloader_R_only.R # Aspera will only install as the current user. # Even using `su - user &&` doesn't work... -USER user +USER user # Install Aspera. We have to install it using Holland Computing Center's conda # repo because download.asperasoft.com now returns 403s -RUN wget -q https://anaconda.org/HCC/aspera-cli/3.9.1/download/linux-64/aspera-cli-3.9.1-0.tar.bz2 && \ - [ "$(sha256sum aspera-cli-3.9.1-0.tar.bz2 | cut -d ' ' -f1)" = 60a09a7f3795186954079869106aa89a64183b7be8e0da7cbbe9d57c66c9bcdb ] && \ + +RUN wget -q "https://anaconda.org/HCC/aspera-cli/3.9.1/download/\ +linux-64/aspera-cli-3.9.1-0.tar.bz2" && \ + [ "$(sha256sum aspera-cli-3.9.1-0.tar.bz2 | cut -d ' ' -f1)" =\ + "60a09a7f3795186954079869106aa89a64183b7be8e0da7cbbe9d57c66c9bcdb" ] && \ + rm -rf .aspera && \ mkdir -p .aspera/cli && \ tar xf aspera-cli-3.9.1-0.tar.bz2 -C .aspera/cli && \ rm aspera-cli-3.9.1-0.tar.bz2 @@ -23,20 +27,21 @@ RUN wget -q https://anaconda.org/HCC/aspera-cli/3.9.1/download/linux-64/aspera-c # Now that we're done installing Aspera go back to being root for a bit. USER root -RUN pip3 install --ignore-installed --no-cache-dir --upgrade pip && \ - pip3 install --ignore-installed --no-cache-dir rpy2==3.4.5 +RUN pip3 install --ignore-installed --upgrade pip && \ + pip3 install --ignore-installed rpy2==3.4.5 # Get the latest version from the dist directory. COPY common/dist/data-refinery-common-* common/ -RUN pip3 install --ignore-installed --no-cache-dir \ +RUN pip3 install --ignore-installed \ common/$(ls common -1 | sort --version-sort | tail -1) COPY workers/data_refinery_workers/downloaders/requirements.txt . -RUN pip3 install --ignore-installed --no-cache-dir -r requirements.txt +RUN pip3 install --ignore-installed -r requirements.txt && \ + # Clear out the pip3 cache. + rm -r /root/.cache ARG SYSTEM_VERSION - -ENV SYSTEM_VERSION $SYSTEM_VERSION +ENV SYSTEM_VERSION=$SYSTEM_VERSION USER user diff --git a/workers/dockerfiles/Dockerfile.illumina b/workers/dockerfiles/Dockerfile.illumina index 949c22405..fe65bf7ab 100644 --- a/workers/dockerfiles/Dockerfile.illumina +++ b/workers/dockerfiles/Dockerfile.illumina @@ -14,16 +14,18 @@ RUN Rscript dependencies.R # Get the latest version from the dist directory. 
COPY common/dist/data-refinery-common-* common/ -RUN pip3 install --ignore-installed --no-cache-dir \ +RUN pip3 install --ignore-installed \ common/$(ls common -1 | sort --version-sort | tail -1) -RUN pip3 install --ignore-installed --no-cache-dir --upgrade pip +RUN pip3 install --ignore-installed --upgrade pip + COPY workers/data_refinery_workers/processors/requirements.txt . -RUN pip3 install --ignore-installed -r requirements.txt +RUN pip3 install --ignore-installed -r requirements.txt && \ + # Clear out the pip3 cache. + rm -r /root/.cache ARG SYSTEM_VERSION - -ENV SYSTEM_VERSION $SYSTEM_VERSION +ENV SYSTEM_VERSION=$SYSTEM_VERSION USER user diff --git a/workers/dockerfiles/Dockerfile.no_op b/workers/dockerfiles/Dockerfile.no_op index a0cb7855e..7f55589d0 100644 --- a/workers/dockerfiles/Dockerfile.no_op +++ b/workers/dockerfiles/Dockerfile.no_op @@ -14,29 +14,30 @@ RUN Rscript install_gene_convert.R # Noop-specific. RUN mkdir -p gene_indexes WORKDIR /home/user/gene_indexes -ENV ID_REFINERY_URL https://zenodo.org/record/1410647/files/all_1536267482.zip +ENV ID_REFINERY_URL=https://zenodo.org/record/1410647/files/all_1536267482.zip RUN curl -O $ID_REFINERY_URL && \ echo $ID_REFINERY_URL > /etc/identifier_refinery_url && \ unzip *.zip && \ rm *.zip # End Noop-specific. -WORKDIR /home/user - -RUN pip3 install --ignore-installed --no-cache-dir --upgrade pip && \ - pip3 install --ignore-installed --no-cache-dir numpy - -COPY workers/data_refinery_workers/processors/requirements.txt . -RUN pip3 install --ignore-installed --no-cache-dir -r requirements.txt +WORKDIR /home/user/ # Get the latest version from the dist directory. COPY common/dist/data-refinery-common-* common/ -RUN pip3 install --ignore-installed --no-cache-dir \ +RUN pip3 install --ignore-installed \ common/$(ls common -1 | sort --version-sort | tail -1) -ARG SYSTEM_VERSION +RUN pip3 install --ignore-installed --upgrade pip && \ + pip3 install --ignore-installed numpy + +COPY workers/data_refinery_workers/processors/requirements.txt . +RUN pip3 install --ignore-installed -r requirements.txt && \ + # Clear out the pip3 cache. + rm -r /root/.cache -ENV SYSTEM_VERSION $SYSTEM_VERSION +ARG SYSTEM_VERSION +ENV SYSTEM_VERSION=$SYSTEM_VERSION USER user diff --git a/workers/dockerfiles/Dockerfile.salmon b/workers/dockerfiles/Dockerfile.salmon index 724ddac60..1bb8de6e6 100644 --- a/workers/dockerfiles/Dockerfile.salmon +++ b/workers/dockerfiles/Dockerfile.salmon @@ -10,11 +10,13 @@ WORKDIR /home/user # Salmon to work https://github.com/AlexsLemonade/refinebio/issues/1496. # This is something that should be considered when updating salmon, because # all samples from incomplete experiments must have salmon run on them again. -ENV SALMON_VERSION 0.13.1 +ENV SALMON_VERSION=0.13.1 -# On version 0.13.1 salmon was being extracted to a folder with an all lowercase name -# the options `-C` and `--strip-components` allow us to specify the name for the resulting file -RUN wget -q "https://github.com/COMBINE-lab/salmon/releases/download/v${SALMON_VERSION}/Salmon-${SALMON_VERSION}_linux_x86_64.tar.gz" && \ +# On version 0.13.1 salmon was being extracted to a folder with an all +# lowercase name the options `-C` and `--strip-components` allow us to specify +#the name for the resulting file. 
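Note: as the comment above says, the `-C` plus `--strip-components` pair lets the build pick the output directory name instead of trusting whatever top-level folder the archive happens to contain. Generically (the file and directory names here are placeholders, not the Salmon release archive):

mkdir extracted
# Unpack the archive's contents into ./extracted, dropping its own top-level directory.
tar -xzf some-release.tar.gz -C extracted --strip-components 1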
+RUN wget -q "https://github.com/COMBINE-lab/salmon/releases/download/\ +v${SALMON_VERSION}/Salmon-${SALMON_VERSION}_linux_x86_64.tar.gz" && \ mkdir "Salmon-${SALMON_VERSION}_linux_x86_64" && \ tar -xzf "Salmon-${SALMON_VERSION}_linux_x86_64.tar.gz" \ -C "Salmon-${SALMON_VERSION}_linux_x86_64" --strip-components 1 && \ @@ -30,16 +32,18 @@ RUN Rscript tximport_dependencies.R COPY workers/R/dependencies/tximport/install_tximport.R . RUN Rscript install_tximport.R -RUN pip3 install --ignore-installed --no-cache-dir --upgrade pip && \ - pip3 install --ignore-installed --no-cache-dir numpy +RUN pip3 install --ignore-installed --upgrade pip && \ + pip3 install --ignore-installed numpy # Get the latest version from the dist directory. COPY common/dist/data-refinery-common-* common/ -RUN pip3 install --ignore-installed --no-cache-dir \ +RUN pip3 install --ignore-installed \ common/$(ls common -1 | sort --version-sort | tail -1) COPY workers/data_refinery_workers/processors/requirements.txt . -RUN pip3 install --ignore-installed --no-cache-dir -r requirements.txt +RUN pip3 install --ignore-installed -r requirements.txt && \ + # Clear out the pip3 cache. + rm -r /root/.cache # Install SalmonTools. RUN git clone https://github.com/COMBINE-lab/SalmonTools.git && \ @@ -50,14 +54,14 @@ RUN git clone https://github.com/COMBINE-lab/SalmonTools.git && \ rm -rf SalmonTools # Install sra-tools. -ENV SRA_VERSION 2.9.1 -RUN wget -q "https://ftp.ncbi.nlm.nih.gov/sra/sdk/${SRA_VERSION}/sratoolkit.${SRA_VERSION}-ubuntu64.tar.gz" && \ +ENV SRA_VERSION=2.9.1 +RUN wget -q "https://ftp.ncbi.nlm.nih.gov/sra/sdk/${SRA_VERSION}/\ +sratoolkit.${SRA_VERSION}-ubuntu64.tar.gz" && \ tar zxfv "sratoolkit.${SRA_VERSION}-ubuntu64.tar.gz" && \ cp -r "sratoolkit.${SRA_VERSION}-ubuntu64/bin/"* /usr/bin ARG SYSTEM_VERSION - -ENV SYSTEM_VERSION $SYSTEM_VERSION +ENV SYSTEM_VERSION=$SYSTEM_VERSION USER user diff --git a/workers/dockerfiles/Dockerfile.smasher b/workers/dockerfiles/Dockerfile.smasher index 00ea02baa..ebdf924a4 100644 --- a/workers/dockerfiles/Dockerfile.smasher +++ b/workers/dockerfiles/Dockerfile.smasher @@ -1,30 +1,31 @@ -FROM ccdlstaging/dr_worker_base:latest +FROM ccdlstaging/dr_base:latest # Fail in case of an error at any stage in the pipe. SHELL ["/bin/bash", "-o", "pipefail", "-c"] WORKDIR /home/user -RUN pip3 install --no-cache-dir --upgrade pip && \ - pip3 install --ignore-installed --no-cache-dir nose numpy rpy2==3.4.5 +COPY workers/R/dependencies/qn/cran/dependencies.R . +RUN Rscript dependencies.R + +COPY workers/R/dependencies/qn/bioc/dependencies.R . +RUN Rscript dependencies.R + +RUN pip3 install --ignore-installed --upgrade pip && \ + pip3 install --ignore-installed nose numpy rpy2==3.4.5 # Get the latest version from the dist directory. COPY common/dist/data-refinery-common-* common/ -RUN pip3 install --ignore-installed --no-cache-dir \ +RUN pip3 install --ignore-installed \ common/$(ls common -1 | sort --version-sort | tail -1) COPY workers/data_refinery_workers/processors/requirements.txt . -RUN pip3 install --ignore-installed --no-cache-dir -r requirements.txt - -COPY workers/R/dependencies/qn/cran/dependencies.R . -RUN Rscript dependencies.R - -COPY workers/R/dependencies/qn/bioc/dependencies.R . -RUN Rscript dependencies.R +RUN pip3 install --ignore-installed -r requirements.txt && \ + # Clear out the pip3 cache. 
+ rm -rf /root/.cache ARG SYSTEM_VERSION - -ENV SYSTEM_VERSION $SYSTEM_VERSION +ENV SYSTEM_VERSION=$SYSTEM_VERSION USER user diff --git a/workers/dockerfiles/Dockerfile.transcriptome b/workers/dockerfiles/Dockerfile.transcriptome index 1c04e7390..8ce0d974a 100644 --- a/workers/dockerfiles/Dockerfile.transcriptome +++ b/workers/dockerfiles/Dockerfile.transcriptome @@ -1,4 +1,4 @@ -FROM ccdlstaging/dr_worker_base:latest +FROM ccdlstaging/dr_base:latest # Fail in case of an error at any stage in the pipe. SHELL ["/bin/bash", "-o", "pipefail", "-c"] @@ -15,10 +15,11 @@ RUN git clone https://github.com/deweylab/RSEM.git && \ # Salmon to work https://github.com/AlexsLemonade/refinebio/issues/1496. # This is something that should be considered when updating salmon, because # all samples from incomplete experiments must have salmon run on them again. -ENV SALMON_VERSION 0.13.1 +ENV SALMON_VERSION=0.13.1 # Salmon can extract to a different directory than the name of the tar file. -RUN wget -q "https://github.com/COMBINE-lab/salmon/releases/download/v${SALMON_VERSION}/Salmon-${SALMON_VERSION}_linux_x86_64.tar.gz" && \ +RUN wget -q "https://github.com/COMBINE-lab/salmon/releases/download/\ +v${SALMON_VERSION}/Salmon-${SALMON_VERSION}_linux_x86_64.tar.gz" && \ tar -xzf "Salmon-${SALMON_VERSION}_linux_x86_64.tar.gz" && \ cp "$(tar -tzf Salmon-${SALMON_VERSION}_linux_x86_64.tar.gz | \ head -1 | cut -f1 -d '/')/bin/salmon" /usr/local/bin && \ @@ -27,19 +28,20 @@ RUN wget -q "https://github.com/COMBINE-lab/salmon/releases/download/v${SALMON_V rm -r Salmon* # End Salmon installation. -RUN pip3 install --ignore-installed --no-cache-dir --upgrade pip && \ - pip3 install --ignore-installed --no-cache-dir numpy +RUN pip3 install --ignore-installed --upgrade pip && \ + pip3 install --ignore-installed numpy COPY common/dist/data-refinery-common-* common/ -RUN pip3 install --ignore-installed --no-cache-dir \ +RUN pip3 install --ignore-installed \ common/$(ls common -1 | sort --version-sort | tail -1) COPY workers/data_refinery_workers/processors/requirements.txt . -RUN pip3 install --ignore-installed --no-cache-dir -r requirements.txt +RUN pip3 install --ignore-installed -r requirements.txt && \ + # Clear out the pip3 cache. + rm -rf /root/.cache ARG SYSTEM_VERSION - -ENV SYSTEM_VERSION $SYSTEM_VERSION +ENV SYSTEM_VERSION=$SYSTEM_VERSION USER user diff --git a/workers/dockerfiles/Dockerfile.worker_base b/workers/dockerfiles/Dockerfile.worker_base deleted file mode 100644 index d8f1a0588..000000000 --- a/workers/dockerfiles/Dockerfile.worker_base +++ /dev/null @@ -1,68 +0,0 @@ -FROM ubuntu:18.04 - -# Fail in case of an error at any stage in the pipe. -SHELL ["/bin/bash", "-o", "pipefail", "-c"] - -WORKDIR /home/user - -# Prevent tzdata from prompting us for a timezone and hanging the build. -ENV DEBIAN_FRONTEND=noninteractive - -# Source: https://github.com/thisbejim/Pyrebase/issues/87#issuecomment-354452082 -# For whatever reason this worked and 'en_US.UTF-8' did not. -ENV LANG=C.UTF-8 - -RUN apt-get update && \ - apt-get install --no-install-recommends -y software-properties-common && \ - add-apt-repository ppa:apt-fast/stable && \ - add-apt-repository ppa:deadsnakes/ppa && \ - add-apt-repository ppa:savoury1/llvm-defaults-10 && \ - apt-get update -qq && \ - apt-get install --no-install-recommends -y apt-fast apt-transport-https gpg-agent - -# The packages related to R are somewhat weird, see the README for more details. -COPY workers/CRAN.gpg . 
-RUN apt-key add CRAN.gpg && \ - echo "deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/" \ - >> /etc/apt/sources.list.d/added_repos.list - -RUN apt-fast update -qq && apt-fast install -y \ - build-essential \ - ccache \ - cmake \ - curl \ - cython3 \ - ed \ - git \ - libcairo-dev \ - libcurl4-gnutls-dev \ - libedit-dev \ - libgit2-dev \ - libpq-dev \ - libssl-dev \ - libxml2-dev \ - llvm-10-dev \ - lsb-release \ - mercurial \ - pkg-config \ - python3-pip \ - python3.8 \ - python3.8-dev \ - r-base-core \ - wget && \ - rm CRAN.gpg && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* && \ - ln -s /usr/bin/llvm-config-10 /usr/bin/llvm-config && \ - update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1 && \ - groupadd user && useradd --create-home --home-dir /home/user -g user user - -# Set up ccache. -COPY workers/ccache.conf /root/.ccache/ccache.conf -RUN for i in c++ cc g++ gcc; do ln -s /usr/bin/ccache /usr/local/bin/$i; done -ENV PATH="/usr/local/bin:${PATH}" - -# Pre-install dev tools. -ENV R_LIBS="/usr/local/lib/R/site-library" -COPY common/R/install_devtools.R . -RUN Rscript install_devtools.R From 13089441a9f96b8778fcb8852e62be26eb017b6d Mon Sep 17 00:00:00 2001 From: Arkadii Yakovets Date: Mon, 28 Nov 2022 10:42:12 -0800 Subject: [PATCH 21/24] Add another docker images update. --- workers/dockerfiles/Dockerfile.compendia | 8 +++----- workers/dockerfiles/Dockerfile.downloaders | 2 +- workers/dockerfiles/Dockerfile.illumina | 2 +- workers/dockerfiles/Dockerfile.no_op | 2 +- workers/dockerfiles/Dockerfile.salmon | 2 +- 5 files changed, 7 insertions(+), 9 deletions(-) diff --git a/workers/dockerfiles/Dockerfile.compendia b/workers/dockerfiles/Dockerfile.compendia index f2afa185f..4dd80559f 100644 --- a/workers/dockerfiles/Dockerfile.compendia +++ b/workers/dockerfiles/Dockerfile.compendia @@ -69,8 +69,8 @@ RUN echo debconf apt-fast/maxdownloads string 16 | debconf-set-selections && \ useradd --create-home --home-dir /home/user -g user user WORKDIR /home/user -RUN wget -q https://bitbucket.org/ariya/phantomjs/downloads/\ -phantomjs-2.1.1-linux-x86_64.tar.bz2 && \ +RUN wget "https://bitbucket.org/ariya/phantomjs/downloads/\ +phantomjs-2.1.1-linux-x86_64.tar.bz2" && \ tar xvjf phantomjs-2.1.1-linux-x86_64.tar.bz2 -C /usr/local/share/ && \ ln -s /usr/local/share/phantomjs-2.1.1-linux-x86_64/bin/phantomjs \ /usr/local/bin/ @@ -94,9 +94,7 @@ RUN pip3 install --ignore-installed --upgrade pip && \ COPY workers/data_refinery_workers/processors/requirements.txt . RUN pip3 install --ignore-installed -r requirements.txt && \ - # Pin setuptools as a workaround for - # https://github.com/pypa/setuptools/issues/3693 - pip3 install --ignore-installed setuptools==65.0.1 numpy==1.16.0 + pip3 install --ignore-installed numpy==1.16.0 # Get the latest version from the dist directory. COPY common/dist/data-refinery-common-* common/ diff --git a/workers/dockerfiles/Dockerfile.downloaders b/workers/dockerfiles/Dockerfile.downloaders index c65faf5dc..145d567e1 100644 --- a/workers/dockerfiles/Dockerfile.downloaders +++ b/workers/dockerfiles/Dockerfile.downloaders @@ -1,4 +1,4 @@ -FROM ccdlstaging/dr_worker_base:latest +FROM ccdlstaging/dr_base:latest # Fail in case of an error at any stage in the pipe. 
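Note: the pipefail comment above is worth spelling out. Docker's default /bin/sh -c shell has no pipefail option, so a RUN pipeline's exit status is simply that of its last command and an earlier failure can be swallowed. With the bash pipefail SHELL set below, a step like the following fails when the download fails instead of reporting sha256sum's successful exit (the URL is a placeholder):

# Under the default shell this "succeeds" even if wget fails, because
# sha256sum happily hashes an empty stream; with pipefail it fails as intended.
RUN wget -qO- https://example.com/archive.tar.gz | sha256sum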
SHELL ["/bin/bash", "-o", "pipefail", "-c"] diff --git a/workers/dockerfiles/Dockerfile.illumina b/workers/dockerfiles/Dockerfile.illumina index fe65bf7ab..648d2e758 100644 --- a/workers/dockerfiles/Dockerfile.illumina +++ b/workers/dockerfiles/Dockerfile.illumina @@ -1,4 +1,4 @@ -FROM ccdlstaging/dr_worker_base:latest +FROM ccdlstaging/dr_base:latest # Fail in case of an error at any stage in the pipe. SHELL ["/bin/bash", "-o", "pipefail", "-c"] diff --git a/workers/dockerfiles/Dockerfile.no_op b/workers/dockerfiles/Dockerfile.no_op index 7f55589d0..8d55281ca 100644 --- a/workers/dockerfiles/Dockerfile.no_op +++ b/workers/dockerfiles/Dockerfile.no_op @@ -1,4 +1,4 @@ -FROM ccdlstaging/dr_worker_base:latest +FROM ccdlstaging/dr_base:latest # Fail in case of an error at any stage in the pipe. SHELL ["/bin/bash", "-o", "pipefail", "-c"] diff --git a/workers/dockerfiles/Dockerfile.salmon b/workers/dockerfiles/Dockerfile.salmon index 1bb8de6e6..b6395ef58 100644 --- a/workers/dockerfiles/Dockerfile.salmon +++ b/workers/dockerfiles/Dockerfile.salmon @@ -1,4 +1,4 @@ -FROM ccdlstaging/dr_worker_base:latest +FROM ccdlstaging/dr_base:latest # Fail in case of an error at any stage in the pipe. SHELL ["/bin/bash", "-o", "pipefail", "-c"] From f7191badbd0d28d8ef944bd91e5287d8ba3a918b Mon Sep 17 00:00:00 2001 From: Arkadii Yakovets Date: Thu, 1 Dec 2022 17:05:49 -0800 Subject: [PATCH 22/24] Optimize docker images, put cache invalidating instructions to the tail. --- .pre-commit-config.yaml | 2 +- .../dockerfiles/Dockerfile.affymetrix_local | 10 +-- workers/dockerfiles/Dockerfile.compendia | 36 +++++------ workers/dockerfiles/Dockerfile.downloaders | 34 +++++------ workers/dockerfiles/Dockerfile.illumina | 25 ++++---- workers/dockerfiles/Dockerfile.no_op | 35 +++++------ workers/dockerfiles/Dockerfile.salmon | 61 +++++++++---------- workers/dockerfiles/Dockerfile.smasher | 19 +++--- workers/dockerfiles/Dockerfile.transcriptome | 29 +++++---- 9 files changed, 124 insertions(+), 127 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d024704da..b651ce24a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -22,7 +22,7 @@ repos: - id: isort - repo: https://github.com/psf/black - rev: 19.10b0 + rev: 22.3.0 hooks: - id: black args: [--line-length=100] diff --git a/workers/dockerfiles/Dockerfile.affymetrix_local b/workers/dockerfiles/Dockerfile.affymetrix_local index 7f1af8ad6..08db9e197 100644 --- a/workers/dockerfiles/Dockerfile.affymetrix_local +++ b/workers/dockerfiles/Dockerfile.affymetrix_local @@ -6,20 +6,20 @@ SHELL ["/bin/bash", "-o", "pipefail", "-c"] WORKDIR /home/user # Remove the version of common already installed. -RUN rm -r common && \ +RUN rm -rf common && \ pip3 uninstall -y data_refinery_common # Get the latest version from the dist directory. COPY common/dist/data-refinery-common-* common/ RUN pip3 install --ignore-installed --no-cache-dir \ - common/$(ls common -1 | sort --version-sort | tail -1) + common/$(ls common -1 | sort --version-sort | tail -1) + +COPY config/ config/ +COPY workers/ . ARG SYSTEM_VERSION ENV SYSTEM_VERSION=$SYSTEM_VERSION USER user -COPY config/ config/ -COPY workers/ . 
- ENTRYPOINT [] diff --git a/workers/dockerfiles/Dockerfile.compendia b/workers/dockerfiles/Dockerfile.compendia index 4dd80559f..9a44a7ecc 100644 --- a/workers/dockerfiles/Dockerfile.compendia +++ b/workers/dockerfiles/Dockerfile.compendia @@ -69,11 +69,6 @@ RUN echo debconf apt-fast/maxdownloads string 16 | debconf-set-selections && \ useradd --create-home --home-dir /home/user -g user user WORKDIR /home/user -RUN wget "https://bitbucket.org/ariya/phantomjs/downloads/\ -phantomjs-2.1.1-linux-x86_64.tar.bz2" && \ - tar xvjf phantomjs-2.1.1-linux-x86_64.tar.bz2 -C /usr/local/share/ && \ - ln -s /usr/local/share/phantomjs-2.1.1-linux-x86_64/bin/phantomjs \ - /usr/local/bin/ # We need a few special packages for QN. ENV R_LIBS=/usr/local/lib/R/site-library @@ -86,28 +81,35 @@ RUN Rscript dependencies.R COPY workers/R/dependencies/qn/bioc/dependencies.R . RUN Rscript dependencies.R -# End QN-specific - -RUN pip3 install --ignore-installed --upgrade pip && \ - pip3 install --ignore-installed numpy scipy matplotlib \ - pandas==0.25.3 scikit-learn sympy nose rpy2===3.4.5 tzlocal fancySVD +# End QN-specific. COPY workers/data_refinery_workers/processors/requirements.txt . -RUN pip3 install --ignore-installed -r requirements.txt && \ - pip3 install --ignore-installed numpy==1.16.0 +RUN pip3 install --ignore-installed --upgrade pip && \ + pip3 install --ignore-installed nose numpy scipy matplotlib \ + pandas==0.25.3 scikit-learn sympy rpy2===3.4.5 tzlocal fancySVD && \ + pip3 install --ignore-installed -r requirements.txt && \ + pip3 install --ignore-installed numpy==1.16.0 && \ + # Clear out the pip cache. + rm -rf /root/.cache # Get the latest version from the dist directory. COPY common/dist/data-refinery-common-* common/ RUN pip3 install --ignore-installed \ - common/$(ls common -1 | sort --version-sort | tail -1) + common/$(ls common -1 | sort --version-sort | tail -1) && \ + # Install phantomjs. + wget "https://bitbucket.org/ariya/phantomjs/downloads/\ +phantomjs-2.1.1-linux-x86_64.tar.bz2" && \ + tar xvjf phantomjs-2.1.1-linux-x86_64.tar.bz2 -C /usr/local/share/ && \ + ln -s /usr/local/share/phantomjs-2.1.1-linux-x86_64/bin/phantomjs \ + /usr/local/bin/ + +COPY .boto .boto +COPY config/ config/ +COPY workers/ . ARG SYSTEM_VERSION ENV SYSTEM_VERSION=$SYSTEM_VERSION USER user -COPY .boto .boto -COPY config/ config/ -COPY workers/ . - ENTRYPOINT [] diff --git a/workers/dockerfiles/Dockerfile.downloaders b/workers/dockerfiles/Dockerfile.downloaders index 145d567e1..02917518d 100644 --- a/workers/dockerfiles/Dockerfile.downloaders +++ b/workers/dockerfiles/Dockerfile.downloaders @@ -8,13 +8,24 @@ WORKDIR /home/user COPY workers/R/dependencies/install_downloader_R_only.R . RUN Rscript install_downloader_R_only.R +COPY workers/data_refinery_workers/downloaders/requirements.txt . +RUN pip3 install --ignore-installed --upgrade pip && \ + pip3 install --ignore-installed rpy2==3.4.5 && \ + pip3 install --ignore-installed -r requirements.txt && \ + # Clear out the pip cache. + rm -rf /root/.cache + +# Get the latest version from the dist directory. +COPY common/dist/data-refinery-common-* common/ +RUN pip3 install --ignore-installed \ + common/$(ls common -1 | sort --version-sort | tail -1) + # Aspera will only install as the current user. # Even using `su - user &&` doesn't work... - USER user + # Install Aspera. 
We have to install it using Holland Computing Center's conda # repo because download.asperasoft.com now returns 403s - RUN wget -q "https://anaconda.org/HCC/aspera-cli/3.9.1/download/\ linux-64/aspera-cli-3.9.1-0.tar.bz2" && \ [ "$(sha256sum aspera-cli-3.9.1-0.tar.bz2 | cut -d ' ' -f1)" =\ @@ -27,26 +38,13 @@ linux-64/aspera-cli-3.9.1-0.tar.bz2" && \ # Now that we're done installing Aspera go back to being root for a bit. USER root -RUN pip3 install --ignore-installed --upgrade pip && \ - pip3 install --ignore-installed rpy2==3.4.5 - -# Get the latest version from the dist directory. -COPY common/dist/data-refinery-common-* common/ -RUN pip3 install --ignore-installed \ - common/$(ls common -1 | sort --version-sort | tail -1) - -COPY workers/data_refinery_workers/downloaders/requirements.txt . -RUN pip3 install --ignore-installed -r requirements.txt && \ - # Clear out the pip3 cache. - rm -r /root/.cache +COPY .boto .boto +COPY config config +COPY workers/ . ARG SYSTEM_VERSION ENV SYSTEM_VERSION=$SYSTEM_VERSION USER user -COPY .boto .boto -COPY config config -COPY workers/ . - ENTRYPOINT [] diff --git a/workers/dockerfiles/Dockerfile.illumina b/workers/dockerfiles/Dockerfile.illumina index 648d2e758..e578b0b69 100644 --- a/workers/dockerfiles/Dockerfile.illumina +++ b/workers/dockerfiles/Dockerfile.illumina @@ -12,22 +12,16 @@ RUN Rscript dependencies.R COPY workers/R/dependencies/illumina/bioc/dependencies.R . RUN Rscript dependencies.R +COPY workers/data_refinery_workers/processors/requirements.txt . +RUN pip3 install --ignore-installed --upgrade pip && \ + pip3 install --ignore-installed -r requirements.txt && \ + # Clear out the pip cache. + rm -rf /root/.cache + # Get the latest version from the dist directory. COPY common/dist/data-refinery-common-* common/ RUN pip3 install --ignore-installed \ - common/$(ls common -1 | sort --version-sort | tail -1) - -RUN pip3 install --ignore-installed --upgrade pip - -COPY workers/data_refinery_workers/processors/requirements.txt . -RUN pip3 install --ignore-installed -r requirements.txt && \ - # Clear out the pip3 cache. - rm -r /root/.cache - -ARG SYSTEM_VERSION -ENV SYSTEM_VERSION=$SYSTEM_VERSION - -USER user + common/$(ls common -1 | sort --version-sort | tail -1) COPY .boto .boto COPY config/ config/ @@ -35,4 +29,9 @@ COPY workers/ . COPY workers/data_refinery_workers/processors/detect_database.R . COPY workers/illumina_probe_maps/ probe_maps/ +ARG SYSTEM_VERSION +ENV SYSTEM_VERSION=$SYSTEM_VERSION + +USER user + ENTRYPOINT [] diff --git a/workers/dockerfiles/Dockerfile.no_op b/workers/dockerfiles/Dockerfile.no_op index 8d55281ca..0d98229ee 100644 --- a/workers/dockerfiles/Dockerfile.no_op +++ b/workers/dockerfiles/Dockerfile.no_op @@ -11,9 +11,23 @@ RUN Rscript dependencies.R COPY workers/R/dependencies/no_op/install_gene_convert.R . RUN Rscript install_gene_convert.R +COPY workers/data_refinery_workers/processors/requirements.txt . +RUN pip3 install --ignore-installed --upgrade pip && \ + pip3 install --ignore-installed numpy && \ + pip3 install --ignore-installed -r requirements.txt && \ + # Clear out the pip cache. + rm -rf /root/.cache + +# Get the latest version from the dist directory. +COPY common/dist/data-refinery-common-* common/ +RUN pip3 install --ignore-installed \ + common/$(ls common -1 | sort --version-sort | tail -1) + # Noop-specific. 
RUN mkdir -p gene_indexes + WORKDIR /home/user/gene_indexes + ENV ID_REFINERY_URL=https://zenodo.org/record/1410647/files/all_1536267482.zip RUN curl -O $ID_REFINERY_URL && \ echo $ID_REFINERY_URL > /etc/identifier_refinery_url && \ @@ -21,28 +35,15 @@ RUN curl -O $ID_REFINERY_URL && \ rm *.zip # End Noop-specific. -WORKDIR /home/user/ - -# Get the latest version from the dist directory. -COPY common/dist/data-refinery-common-* common/ -RUN pip3 install --ignore-installed \ - common/$(ls common -1 | sort --version-sort | tail -1) - -RUN pip3 install --ignore-installed --upgrade pip && \ - pip3 install --ignore-installed numpy +WORKDIR /home/user -COPY workers/data_refinery_workers/processors/requirements.txt . -RUN pip3 install --ignore-installed -r requirements.txt && \ - # Clear out the pip3 cache. - rm -r /root/.cache +COPY .boto .boto +COPY config/ config/ +COPY workers/ . ARG SYSTEM_VERSION ENV SYSTEM_VERSION=$SYSTEM_VERSION USER user -COPY .boto .boto -COPY config/ config/ -COPY workers/ . - ENTRYPOINT [] diff --git a/workers/dockerfiles/Dockerfile.salmon b/workers/dockerfiles/Dockerfile.salmon index b6395ef58..b4493b2e9 100644 --- a/workers/dockerfiles/Dockerfile.salmon +++ b/workers/dockerfiles/Dockerfile.salmon @@ -5,6 +5,25 @@ SHELL ["/bin/bash", "-o", "pipefail", "-c"] WORKDIR /home/user +COPY workers/R/dependencies/tximport/cran/dependencies.R tximport_dependencies.R +RUN Rscript tximport_dependencies.R + +# Install tximport. +COPY workers/R/dependencies/tximport/install_tximport.R . +RUN Rscript install_tximport.R + +COPY workers/data_refinery_workers/processors/requirements.txt . +RUN pip3 install --ignore-installed --upgrade pip && \ + pip3 install --ignore-installed numpy && \ + pip3 install --ignore-installed -r requirements.txt && \ + # Clear out the pip cache. + rm -rf /root/.cache + +# Get the latest version from the dist directory. +COPY common/dist/data-refinery-common-* common/ +RUN pip3 install --ignore-installed \ + common/$(ls common -1 | sort --version-sort | tail -1) + # Install Salmon # Tximport requires all experiments to be processed with the same version of # Salmon to work https://github.com/AlexsLemonade/refinebio/issues/1496. @@ -25,48 +44,28 @@ v${SALMON_VERSION}/Salmon-${SALMON_VERSION}_linux_x86_64.tar.gz" && \ rm -f "Salmon-${SALMON_VERSION}_linux_x86_64.tar.gz" # End Salmon installation. -COPY workers/R/dependencies/tximport/cran/dependencies.R tximport_dependencies.R -RUN Rscript tximport_dependencies.R - -# Install tximport. -COPY workers/R/dependencies/tximport/install_tximport.R . -RUN Rscript install_tximport.R - -RUN pip3 install --ignore-installed --upgrade pip && \ - pip3 install --ignore-installed numpy - -# Get the latest version from the dist directory. -COPY common/dist/data-refinery-common-* common/ -RUN pip3 install --ignore-installed \ - common/$(ls common -1 | sort --version-sort | tail -1) - -COPY workers/data_refinery_workers/processors/requirements.txt . -RUN pip3 install --ignore-installed -r requirements.txt && \ - # Clear out the pip3 cache. - rm -r /root/.cache - -# Install SalmonTools. -RUN git clone https://github.com/COMBINE-lab/SalmonTools.git && \ +ENV SRA_VERSION=2.9.1 +RUN \ + # Install SalmonTools. + git clone https://github.com/COMBINE-lab/SalmonTools.git && \ cd SalmonTools && \ git checkout 3e6654c2c10a5225498b623056993947fa688afc && \ cmake . -DCMAKE_INSTALL_PREFIX=/usr/local && \ make install && \ - rm -rf SalmonTools - -# Install sra-tools. 
-ENV SRA_VERSION=2.9.1 -RUN wget -q "https://ftp.ncbi.nlm.nih.gov/sra/sdk/${SRA_VERSION}/\ + rm -rf SalmonTools && \ + # Install sra-tools. + wget -q "https://ftp.ncbi.nlm.nih.gov/sra/sdk/${SRA_VERSION}/\ sratoolkit.${SRA_VERSION}-ubuntu64.tar.gz" && \ tar zxfv "sratoolkit.${SRA_VERSION}-ubuntu64.tar.gz" && \ cp -r "sratoolkit.${SRA_VERSION}-ubuntu64/bin/"* /usr/bin +COPY .boto .boto +COPY config/ config/ +COPY workers/ . + ARG SYSTEM_VERSION ENV SYSTEM_VERSION=$SYSTEM_VERSION USER user -COPY .boto .boto -COPY config/ config/ -COPY workers/ . - ENTRYPOINT [] diff --git a/workers/dockerfiles/Dockerfile.smasher b/workers/dockerfiles/Dockerfile.smasher index ebdf924a4..bdf146474 100644 --- a/workers/dockerfiles/Dockerfile.smasher +++ b/workers/dockerfiles/Dockerfile.smasher @@ -11,26 +11,25 @@ RUN Rscript dependencies.R COPY workers/R/dependencies/qn/bioc/dependencies.R . RUN Rscript dependencies.R +COPY workers/data_refinery_workers/processors/requirements.txt . RUN pip3 install --ignore-installed --upgrade pip && \ - pip3 install --ignore-installed nose numpy rpy2==3.4.5 + pip3 install --ignore-installed nose numpy rpy2==3.4.5 && \ + pip3 install --ignore-installed -r requirements.txt && \ + # Clear out the pip cache. + rm -rf /root/.cache # Get the latest version from the dist directory. COPY common/dist/data-refinery-common-* common/ RUN pip3 install --ignore-installed \ - common/$(ls common -1 | sort --version-sort | tail -1) + common/$(ls common -1 | sort --version-sort | tail -1) -COPY workers/data_refinery_workers/processors/requirements.txt . -RUN pip3 install --ignore-installed -r requirements.txt && \ - # Clear out the pip3 cache. - rm -rf /root/.cache +COPY .boto .boto +COPY config/ config/ +COPY workers/ . ARG SYSTEM_VERSION ENV SYSTEM_VERSION=$SYSTEM_VERSION USER user -COPY .boto .boto -COPY config/ config/ -COPY workers/ . - ENTRYPOINT [] diff --git a/workers/dockerfiles/Dockerfile.transcriptome b/workers/dockerfiles/Dockerfile.transcriptome index 8ce0d974a..ae75c87a8 100644 --- a/workers/dockerfiles/Dockerfile.transcriptome +++ b/workers/dockerfiles/Dockerfile.transcriptome @@ -5,6 +5,17 @@ SHELL ["/bin/bash", "-o", "pipefail", "-c"] WORKDIR /home/user +COPY workers/data_refinery_workers/processors/requirements.txt . +RUN pip3 install --ignore-installed --upgrade pip && \ + pip3 install --ignore-installed numpy && \ + pip3 install --ignore-installed -r requirements.txt && \ + # Clear out the pip cache. + rm -rf /root/.cache + +COPY common/dist/data-refinery-common-* common/ +RUN pip3 install --ignore-installed \ + common/$(ls common -1 | sort --version-sort | tail -1) + # It's annoying that this can only be installed via git. RUN git clone https://github.com/deweylab/RSEM.git && \ cd RSEM && make install && \ @@ -28,25 +39,13 @@ v${SALMON_VERSION}/Salmon-${SALMON_VERSION}_linux_x86_64.tar.gz" && \ rm -r Salmon* # End Salmon installation. -RUN pip3 install --ignore-installed --upgrade pip && \ - pip3 install --ignore-installed numpy - -COPY common/dist/data-refinery-common-* common/ -RUN pip3 install --ignore-installed \ - common/$(ls common -1 | sort --version-sort | tail -1) - -COPY workers/data_refinery_workers/processors/requirements.txt . -RUN pip3 install --ignore-installed -r requirements.txt && \ - # Clear out the pip3 cache. - rm -rf /root/.cache +COPY .boto .boto +COPY config/ config/ +COPY workers/ . ARG SYSTEM_VERSION ENV SYSTEM_VERSION=$SYSTEM_VERSION USER user -COPY .boto .boto -COPY config/ config/ -COPY workers/ . 
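Note: in the images above, the `rm -rf /root/.cache` cleanup is chained with `&&` into the same RUN as the pip install on purpose: deleting the cache in a separate, later RUN would not shrink the image, because the cached wheels would already be committed to the earlier layer. The following commit replaces the manual cleanup with pip's own flag; in terms of final layer size the two patterns below are roughly equivalent (a generic sketch, not a line taken from one of these Dockerfiles):

RUN pip3 install --ignore-installed -r requirements.txt && rm -rf /root/.cache
RUN pip3 install --ignore-installed --no-cache-dir -r requirements.txt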
- ENTRYPOINT [] From a10865346ddce740acdbc1d6f6bb9513c94f03f5 Mon Sep 17 00:00:00 2001 From: Arkadii Yakovets Date: Fri, 2 Dec 2022 17:04:51 -0800 Subject: [PATCH 23/24] Add `--no-cache-dir` to pip. Move common package intallation to the bottom. --- workers/dockerfiles/Dockerfile.compendia | 23 +++++++++---------- workers/dockerfiles/Dockerfile.downloaders | 18 +++++++-------- workers/dockerfiles/Dockerfile.illumina | 8 +++---- workers/dockerfiles/Dockerfile.no_op | 18 +++++++-------- workers/dockerfiles/Dockerfile.salmon | 24 +++++++++----------- workers/dockerfiles/Dockerfile.smasher | 10 ++++---- workers/dockerfiles/Dockerfile.transcriptome | 21 ++++++++--------- 7 files changed, 54 insertions(+), 68 deletions(-) diff --git a/workers/dockerfiles/Dockerfile.compendia b/workers/dockerfiles/Dockerfile.compendia index 9a44a7ecc..8790d859c 100644 --- a/workers/dockerfiles/Dockerfile.compendia +++ b/workers/dockerfiles/Dockerfile.compendia @@ -84,25 +84,24 @@ RUN Rscript dependencies.R # End QN-specific. COPY workers/data_refinery_workers/processors/requirements.txt . -RUN pip3 install --ignore-installed --upgrade pip && \ - pip3 install --ignore-installed nose numpy scipy matplotlib \ +RUN pip3 install --ignore-installed --no-cache-dir --upgrade pip && \ + pip3 install --ignore-installed --no-cache-dir nose numpy scipy matplotlib \ pandas==0.25.3 scikit-learn sympy rpy2===3.4.5 tzlocal fancySVD && \ - pip3 install --ignore-installed -r requirements.txt && \ - pip3 install --ignore-installed numpy==1.16.0 && \ - # Clear out the pip cache. - rm -rf /root/.cache + pip3 install --ignore-installed --no-cache-dir -r requirements.txt && \ + pip3 install --ignore-installed --no-cache-dir numpy==1.16.0 -# Get the latest version from the dist directory. -COPY common/dist/data-refinery-common-* common/ -RUN pip3 install --ignore-installed \ - common/$(ls common -1 | sort --version-sort | tail -1) && \ - # Install phantomjs. - wget "https://bitbucket.org/ariya/phantomjs/downloads/\ +# Install phantomjs. +RUN wget -q "https://bitbucket.org/ariya/phantomjs/downloads/\ phantomjs-2.1.1-linux-x86_64.tar.bz2" && \ tar xvjf phantomjs-2.1.1-linux-x86_64.tar.bz2 -C /usr/local/share/ && \ ln -s /usr/local/share/phantomjs-2.1.1-linux-x86_64/bin/phantomjs \ /usr/local/bin/ +# Get the latest version from the dist directory. +COPY common/dist/data-refinery-common-* common/ +RUN pip3 install --ignore-installed --no-cache-dir \ + common/$(ls common -1 | sort --version-sort | tail -1) + COPY .boto .boto COPY config/ config/ COPY workers/ . diff --git a/workers/dockerfiles/Dockerfile.downloaders b/workers/dockerfiles/Dockerfile.downloaders index 02917518d..9b611956d 100644 --- a/workers/dockerfiles/Dockerfile.downloaders +++ b/workers/dockerfiles/Dockerfile.downloaders @@ -9,16 +9,9 @@ COPY workers/R/dependencies/install_downloader_R_only.R . RUN Rscript install_downloader_R_only.R COPY workers/data_refinery_workers/downloaders/requirements.txt . -RUN pip3 install --ignore-installed --upgrade pip && \ - pip3 install --ignore-installed rpy2==3.4.5 && \ - pip3 install --ignore-installed -r requirements.txt && \ - # Clear out the pip cache. - rm -rf /root/.cache - -# Get the latest version from the dist directory. 
-COPY common/dist/data-refinery-common-* common/ -RUN pip3 install --ignore-installed \ - common/$(ls common -1 | sort --version-sort | tail -1) +RUN pip3 install --ignore-installed --no-cache-dir --upgrade pip && \ + pip3 install --ignore-installed --no-cache-dir rpy2==3.4.5 && \ + pip3 install --ignore-installed --no-cache-dir -r requirements.txt # Aspera will only install as the current user. # Even using `su - user &&` doesn't work... @@ -38,6 +31,11 @@ linux-64/aspera-cli-3.9.1-0.tar.bz2" && \ # Now that we're done installing Aspera go back to being root for a bit. USER root +# Get the latest version from the dist directory. +COPY common/dist/data-refinery-common-* common/ +RUN pip3 install --ignore-installed --no-cache-dir \ + common/$(ls common -1 | sort --version-sort | tail -1) + COPY .boto .boto COPY config config COPY workers/ . diff --git a/workers/dockerfiles/Dockerfile.illumina b/workers/dockerfiles/Dockerfile.illumina index e578b0b69..200b37725 100644 --- a/workers/dockerfiles/Dockerfile.illumina +++ b/workers/dockerfiles/Dockerfile.illumina @@ -13,14 +13,12 @@ COPY workers/R/dependencies/illumina/bioc/dependencies.R . RUN Rscript dependencies.R COPY workers/data_refinery_workers/processors/requirements.txt . -RUN pip3 install --ignore-installed --upgrade pip && \ - pip3 install --ignore-installed -r requirements.txt && \ - # Clear out the pip cache. - rm -rf /root/.cache +RUN pip3 install --ignore-installed --no-cache-dir --upgrade pip && \ + pip3 install --ignore-installed --no-cache-dir -r requirements.txt # Get the latest version from the dist directory. COPY common/dist/data-refinery-common-* common/ -RUN pip3 install --ignore-installed \ +RUN pip3 install --ignore-installed --no-cache-dir \ common/$(ls common -1 | sort --version-sort | tail -1) COPY .boto .boto diff --git a/workers/dockerfiles/Dockerfile.no_op b/workers/dockerfiles/Dockerfile.no_op index 0d98229ee..60f686265 100644 --- a/workers/dockerfiles/Dockerfile.no_op +++ b/workers/dockerfiles/Dockerfile.no_op @@ -12,16 +12,9 @@ COPY workers/R/dependencies/no_op/install_gene_convert.R . RUN Rscript install_gene_convert.R COPY workers/data_refinery_workers/processors/requirements.txt . -RUN pip3 install --ignore-installed --upgrade pip && \ - pip3 install --ignore-installed numpy && \ - pip3 install --ignore-installed -r requirements.txt && \ - # Clear out the pip cache. - rm -rf /root/.cache - -# Get the latest version from the dist directory. -COPY common/dist/data-refinery-common-* common/ -RUN pip3 install --ignore-installed \ - common/$(ls common -1 | sort --version-sort | tail -1) +RUN pip3 install --ignore-installed --no-cache-dir --upgrade pip && \ + pip3 install --ignore-installed --no-cache-dir numpy && \ + pip3 install --ignore-installed --no-cache-dir -r requirements.txt # Noop-specific. RUN mkdir -p gene_indexes @@ -37,6 +30,11 @@ RUN curl -O $ID_REFINERY_URL && \ WORKDIR /home/user +# Get the latest version from the dist directory. +COPY common/dist/data-refinery-common-* common/ +RUN pip3 install --ignore-installed --no-cache-dir \ + common/$(ls common -1 | sort --version-sort | tail -1) + COPY .boto .boto COPY config/ config/ COPY workers/ . diff --git a/workers/dockerfiles/Dockerfile.salmon b/workers/dockerfiles/Dockerfile.salmon index b4493b2e9..046ee0b89 100644 --- a/workers/dockerfiles/Dockerfile.salmon +++ b/workers/dockerfiles/Dockerfile.salmon @@ -13,16 +13,9 @@ COPY workers/R/dependencies/tximport/install_tximport.R . 
RUN Rscript install_tximport.R COPY workers/data_refinery_workers/processors/requirements.txt . -RUN pip3 install --ignore-installed --upgrade pip && \ - pip3 install --ignore-installed numpy && \ - pip3 install --ignore-installed -r requirements.txt && \ - # Clear out the pip cache. - rm -rf /root/.cache - -# Get the latest version from the dist directory. -COPY common/dist/data-refinery-common-* common/ -RUN pip3 install --ignore-installed \ - common/$(ls common -1 | sort --version-sort | tail -1) +RUN pip3 install --ignore-installed --no-cache-dir --upgrade pip && \ + pip3 install --ignore-installed --no-cache-dir numpy && \ + pip3 install --ignore-installed --no-cache-dir -r requirements.txt # Install Salmon # Tximport requires all experiments to be processed with the same version of @@ -45,9 +38,9 @@ v${SALMON_VERSION}/Salmon-${SALMON_VERSION}_linux_x86_64.tar.gz" && \ # End Salmon installation. ENV SRA_VERSION=2.9.1 -RUN \ - # Install SalmonTools. - git clone https://github.com/COMBINE-lab/SalmonTools.git && \ + +# Install SalmonTools. +RUN git clone https://github.com/COMBINE-lab/SalmonTools.git && \ cd SalmonTools && \ git checkout 3e6654c2c10a5225498b623056993947fa688afc && \ cmake . -DCMAKE_INSTALL_PREFIX=/usr/local && \ @@ -59,6 +52,11 @@ sratoolkit.${SRA_VERSION}-ubuntu64.tar.gz" && \ tar zxfv "sratoolkit.${SRA_VERSION}-ubuntu64.tar.gz" && \ cp -r "sratoolkit.${SRA_VERSION}-ubuntu64/bin/"* /usr/bin +# Get the latest version from the dist directory. +COPY common/dist/data-refinery-common-* common/ +RUN pip3 install --ignore-installed --no-cache-dir \ + common/$(ls common -1 | sort --version-sort | tail -1) + COPY .boto .boto COPY config/ config/ COPY workers/ . diff --git a/workers/dockerfiles/Dockerfile.smasher b/workers/dockerfiles/Dockerfile.smasher index bdf146474..926f98208 100644 --- a/workers/dockerfiles/Dockerfile.smasher +++ b/workers/dockerfiles/Dockerfile.smasher @@ -12,15 +12,13 @@ COPY workers/R/dependencies/qn/bioc/dependencies.R . RUN Rscript dependencies.R COPY workers/data_refinery_workers/processors/requirements.txt . -RUN pip3 install --ignore-installed --upgrade pip && \ - pip3 install --ignore-installed nose numpy rpy2==3.4.5 && \ - pip3 install --ignore-installed -r requirements.txt && \ - # Clear out the pip cache. - rm -rf /root/.cache +RUN pip3 install --ignore-installed --no-cache-dir --upgrade pip && \ + pip3 install --ignore-installed --no-cache-dir nose numpy rpy2==3.4.5 && \ + pip3 install --ignore-installed --no-cache-dir -r requirements.txt # Get the latest version from the dist directory. COPY common/dist/data-refinery-common-* common/ -RUN pip3 install --ignore-installed \ +RUN pip3 install --ignore-installed --no-cache-dir \ common/$(ls common -1 | sort --version-sort | tail -1) COPY .boto .boto diff --git a/workers/dockerfiles/Dockerfile.transcriptome b/workers/dockerfiles/Dockerfile.transcriptome index ae75c87a8..f7dbe3676 100644 --- a/workers/dockerfiles/Dockerfile.transcriptome +++ b/workers/dockerfiles/Dockerfile.transcriptome @@ -6,18 +6,11 @@ SHELL ["/bin/bash", "-o", "pipefail", "-c"] WORKDIR /home/user COPY workers/data_refinery_workers/processors/requirements.txt . -RUN pip3 install --ignore-installed --upgrade pip && \ - pip3 install --ignore-installed numpy && \ - pip3 install --ignore-installed -r requirements.txt && \ - # Clear out the pip cache. 
- rm -rf /root/.cache - -COPY common/dist/data-refinery-common-* common/ -RUN pip3 install --ignore-installed \ - common/$(ls common -1 | sort --version-sort | tail -1) - -# It's annoying that this can only be installed via git. -RUN git clone https://github.com/deweylab/RSEM.git && \ +RUN pip3 install --ignore-installed --no-cache-dir --upgrade pip && \ + pip3 install --ignore-installed --no-cache-dir numpy && \ + pip3 install --ignore-installed --no-cache-dir -r requirements.txt && \ + # It's annoying that this can only be installed via git. + git clone https://github.com/deweylab/RSEM.git && \ cd RSEM && make install && \ rm -rf RSEM @@ -39,6 +32,10 @@ v${SALMON_VERSION}/Salmon-${SALMON_VERSION}_linux_x86_64.tar.gz" && \ rm -r Salmon* # End Salmon installation. +COPY common/dist/data-refinery-common-* common/ +RUN pip3 install --ignore-installed --no-cache-dir \ + common/$(ls common -1 | sort --version-sort | tail -1) + COPY .boto .boto COPY config/ config/ COPY workers/ . From 29b8d24e7f8db875c92fa785b70730ffdaa7834a Mon Sep 17 00:00:00 2001 From: Arkadii Yakovets Date: Thu, 2 Feb 2023 10:25:52 -0800 Subject: [PATCH 24/24] Rename cron job script. --- .../foreman-server-instance-user-data.tpl.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/infrastructure/foreman-configuration/foreman-server-instance-user-data.tpl.sh b/infrastructure/foreman-configuration/foreman-server-instance-user-data.tpl.sh index 3598c2a45..c53d368bf 100644 --- a/infrastructure/foreman-configuration/foreman-server-instance-user-data.tpl.sh +++ b/infrastructure/foreman-configuration/foreman-server-instance-user-data.tpl.sh @@ -77,8 +77,8 @@ docker run \\ -e DATABASE_PASSWORD=${database_password} \\ -v /tmp:/tmp \\ -it ${dockerhub_repo}/dr_\"\$1\" python3 manage.py \"\$2\" -" >> /home/ubuntu/run_cron_job.sh -chmod +x /home/ubuntu/run_cron_job.sh +" >> /home/ubuntu/run_manage_command.sh +chmod +x /home/ubuntu/run_manage_command.sh # Use Monit to ensure the Foreman is always running apt-get -y update @@ -112,9 +112,9 @@ service monit restart # Install the cron job tests crontab -l > tempcron cat <> tempcron -0 12 * * MON /bin/bash /home/ubuntu/run_cron_job.sh affymetrix check_brainarray_gene_agreement >> /var/log/cron_job_tests.log 2>&1 -0 12 * * MON /bin/bash /home/ubuntu/run_cron_job.sh affymetrix check_tx_index_transcript_agreement >> /var/log/cron_job_tests.log 2>&1 -0 12 * * ${accession_gathering_job_run_day} /bin/bash /home/ubuntu/run_cron_job.sh foreman gather_weekly_accessions >> /var/log/gather_weekly_accessions.log 2>&1 +0 12 * * MON /bin/bash /home/ubuntu/run_manage_command.sh affymetrix check_brainarray_gene_agreement >> /var/log/affymetrix_checks.log 2>&1 +0 12 * * MON /bin/bash /home/ubuntu/run_manage_command.sh affymetrix check_tx_index_transcript_agreement >> /var/log/affymetrix_checks.log 2>&1 +0 12 * * ${accession_gathering_job_run_day} /bin/bash /home/ubuntu/run_manage_command.sh foreman gather_weekly_accessions >> /var/log/weekly_accessions.log 2>&1 EOF # install new cron file crontab tempcron
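The recurring change across these worker Dockerfiles is layer ordering: the slow, rarely-changing steps (R and Python dependency installation, Salmon, sra-tools) stay near the top, while the inputs that change on almost every build (.boto, config/, workers/) are copied last, just before USER and ENTRYPOINT, so Docker's layer cache is not invalidated for the expensive steps. A minimal sketch of that ordering, using a placeholder base image, user, and paths rather than the project's real ones:

    # Sketch only; base image, user, and copied paths are placeholders.
    FROM python:3.8-slim
    RUN useradd --create-home user
    WORKDIR /home/user

    # Rarely-changing dependency layer: rebuilt only when requirements.txt changes.
    COPY requirements.txt .
    RUN pip3 install --ignore-installed --no-cache-dir --upgrade pip && \
        pip3 install --ignore-installed --no-cache-dir -r requirements.txt

    # Frequently-changing inputs go last so the layers above stay cached.
    COPY config/ config/
    COPY workers/ .

    ARG SYSTEM_VERSION
    ENV SYSTEM_VERSION=$SYSTEM_VERSION
    USER user

    ENTRYPOINT []

The tradeoff is that anything the dependency layer needs (here just requirements.txt) has to be copied separately, ahead of the rest of the source tree.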
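The `--no-cache-dir` flag added in patch 23 makes pip skip writing downloaded packages to its cache, which is what the removed `rm -rf /root/.cache` lines were cleaning up; deleting the cache only shrinks the image when it happens inside the same RUN layer, so disabling it outright is simpler. Roughly the before/after applied across these images:

    # Before: populate pip's cache, then delete it inside the same layer.
    RUN pip3 install --ignore-installed -r requirements.txt && \
        # Clear out the pip cache.
        rm -rf /root/.cache

    # After: never write the cache, so there is nothing to clean up.
    RUN pip3 install --ignore-installed --no-cache-dir -r requirements.txt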
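The last patch renames the foreman instance user-data helper from run_cron_job.sh to run_manage_command.sh, which better matches what it does: run an arbitrary `manage.py` command inside the matching `dr_<image>` container. It also gives the Affymetrix checks and the weekly accession gathering separate log files. Invoked by hand, the helper and the crontab append-and-reload pattern from the template look roughly like this (command names and log paths are taken from the patch; the final `rm tempcron` is an assumed cleanup step that falls outside this excerpt):

    # Run a Django management command inside the dr_foreman worker image.
    /bin/bash /home/ubuntu/run_manage_command.sh foreman gather_weekly_accessions

    # Append a new entry to the existing crontab and reload it.
    crontab -l > tempcron
    echo '0 12 * * MON /bin/bash /home/ubuntu/run_manage_command.sh affymetrix check_brainarray_gene_agreement >> /var/log/affymetrix_checks.log 2>&1' >> tempcron
    crontab tempcron
    rm tempcron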