From 6b606f2013982e78cee31a1bec8f2d6672cbf629 Mon Sep 17 00:00:00 2001 From: ark Date: Thu, 5 May 2022 20:30:59 -0700 Subject: [PATCH 01/24] Fix test_common job. --- workers/install_affy_only.R | 1 + 1 file changed, 1 insertion(+) diff --git a/workers/install_affy_only.R b/workers/install_affy_only.R index 354e0e592..b488a0287 100644 --- a/workers/install_affy_only.R +++ b/workers/install_affy_only.R @@ -8,6 +8,7 @@ options(Ncpus=parallel::detectCores()) devtools::install_version('dplyr', version='1.0.0') devtools::install_version('tidyr', version='1.1.0') devtools::install_version('ff', version='2.2-14') +devtools::install_version('locfit', version='1.5-9.4') # Helper function that installs a list of packages using the input URLs install_with_url <- function(urls) { From b87cbd21675c51dbaa8a956a3c87a1444ccade14 Mon Sep 17 00:00:00 2001 From: Arkadii Yakovets Date: Thu, 1 Sep 2022 11:03:18 -0700 Subject: [PATCH 02/24] Add `Accession` model. Update pre-commit config. --- .pre-commit-config.yaml | 2 +- .../migrations/0071_auto_20220901_1653.py | 44 +++++++++++++++++++ .../data_refinery_common/models/__init__.py | 1 + .../data_refinery_common/models/accession.py | 22 ++++++++++ 4 files changed, 68 insertions(+), 1 deletion(-) create mode 100644 common/data_refinery_common/migrations/0071_auto_20220901_1653.py create mode 100644 common/data_refinery_common/models/accession.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d024704da..b651ce24a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -22,7 +22,7 @@ repos: - id: isort - repo: https://github.com/psf/black - rev: 19.10b0 + rev: 22.3.0 hooks: - id: black args: [--line-length=100] diff --git a/common/data_refinery_common/migrations/0071_auto_20220901_1653.py b/common/data_refinery_common/migrations/0071_auto_20220901_1653.py new file mode 100644 index 000000000..c7d3b0b63 --- /dev/null +++ b/common/data_refinery_common/migrations/0071_auto_20220901_1653.py @@ -0,0 +1,44 @@ +# Generated by Django 3.2.7 on 2022-09-01 16:53 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("data_refinery_common", "0070_auto_20211208_2118"), + ] + + operations = [ + migrations.CreateModel( + name="Accession", + fields=[ + ( + "id", + models.AutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ("code", models.TextField()), + ("created_at", models.DateTimeField(auto_now_add=True)), + ("last_modified_at", models.DateTimeField(auto_now=True)), + ("organism", models.TextField()), + ("published_date", models.DateTimeField()), + ("sample_count", models.PositiveIntegerField(default=0)), + ("source", models.TextField()), + ("technology", models.TextField()), + ], + options={ + "db_table": "accessions", + }, + ), + migrations.AddConstraint( + model_name="accession", + constraint=models.UniqueConstraint( + fields=("code", "source", "technology"), name="unique_accession" + ), + ), + ] diff --git a/common/data_refinery_common/models/__init__.py b/common/data_refinery_common/models/__init__.py index 39abe7ee3..8e9564153 100644 --- a/common/data_refinery_common/models/__init__.py +++ b/common/data_refinery_common/models/__init__.py @@ -1,3 +1,4 @@ +from data_refinery_common.models.accession import Accession # noqa from data_refinery_common.models.api_token import APIToken # noqa from data_refinery_common.models.associations.compendium_result_organism_association import ( # noqa CompendiumResultOrganismAssociation, diff 
--git a/common/data_refinery_common/models/accession.py b/common/data_refinery_common/models/accession.py new file mode 100644 index 000000000..dc93cfd88 --- /dev/null +++ b/common/data_refinery_common/models/accession.py @@ -0,0 +1,22 @@ +from django.db import models + + +class Accession(models.Model): + """Accession model.""" + + class Meta: + constraints = ( + models.UniqueConstraint( + fields=("code", "source", "technology"), name="unique_accession" + ), + ) + db_table = "accessions" + + code = models.TextField() + created_at = models.DateTimeField(auto_now_add=True) + last_modified_at = models.DateTimeField(auto_now=True) + organism = models.TextField() + published_date = models.DateTimeField() + sample_count = models.PositiveIntegerField(default=0) + source = models.TextField() + technology = models.TextField() From f1b1c06ee75d91a97fa40c2276c3d7b937431a72 Mon Sep 17 00:00:00 2001 From: Arkadii Yakovets Date: Thu, 8 Sep 2022 18:12:39 -0700 Subject: [PATCH 03/24] Port Python script to Django command. - Introduce AccessionBacklogEntry model. - Clean up command flags. - Get previous accessions from the DB. - --- .../migrations/0071_accessionbacklogentry.py | 38 + .../data_refinery_common/models/accession.py | 82 +- .../gatherer/__init__.py | 0 .../gatherer/management/__init__.py | 0 .../gatherer/management/commands/__init__.py | 0 .../management/commands/gather_accessions.py | 731 ++++++++++++++++++ foreman/data_refinery_foreman/settings.py | 17 +- foreman/dockerfiles/Dockerfile.foreman | 2 + 8 files changed, 856 insertions(+), 14 deletions(-) create mode 100644 common/data_refinery_common/migrations/0071_accessionbacklogentry.py create mode 100644 foreman/data_refinery_foreman/gatherer/__init__.py create mode 100644 foreman/data_refinery_foreman/gatherer/management/__init__.py create mode 100644 foreman/data_refinery_foreman/gatherer/management/commands/__init__.py create mode 100644 foreman/data_refinery_foreman/gatherer/management/commands/gather_accessions.py diff --git a/common/data_refinery_common/migrations/0071_accessionbacklogentry.py b/common/data_refinery_common/migrations/0071_accessionbacklogentry.py new file mode 100644 index 000000000..86c04daed --- /dev/null +++ b/common/data_refinery_common/migrations/0071_accessionbacklogentry.py @@ -0,0 +1,38 @@ +# Generated by Django 3.2.7 on 2022-09-07 19:31 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("data_refinery_common", "0070_auto_20211208_2118"), + ] + + operations = [ + migrations.CreateModel( + name="AccessionBacklogEntry", + fields=[ + ( + "id", + models.AutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ("code", models.TextField(unique=True)), + ("created_at", models.DateTimeField(auto_now_add=True)), + ("last_modified_at", models.DateTimeField(auto_now=True)), + ("organism", models.TextField()), + ("published_date", models.DateTimeField()), + ("sample_count", models.PositiveIntegerField(default=0)), + ("source", models.TextField()), + ("technology", models.TextField()), + ], + options={ + "db_table": "accession_backlog", + }, + ), + ] diff --git a/common/data_refinery_common/models/accession.py b/common/data_refinery_common/models/accession.py index dc93cfd88..6ac62da9f 100644 --- a/common/data_refinery_common/models/accession.py +++ b/common/data_refinery_common/models/accession.py @@ -1,18 +1,16 @@ +from datetime import datetime + from django.db import models +from django.utils import timezone -class 
Accession(models.Model): - """Accession model.""" +class AccessionBacklogEntry(models.Model): + """Accession backlog entry model.""" class Meta: - constraints = ( - models.UniqueConstraint( - fields=("code", "source", "technology"), name="unique_accession" - ), - ) - db_table = "accessions" - - code = models.TextField() + db_table = "accession_backlog" + + code = models.TextField(unique=True) created_at = models.DateTimeField(auto_now_add=True) last_modified_at = models.DateTimeField(auto_now=True) organism = models.TextField() @@ -20,3 +18,67 @@ class Meta: sample_count = models.PositiveIntegerField(default=0) source = models.TextField() technology = models.TextField() + + def __eq__(self, other: object) -> bool: + """Returns True if two objects are equal. Otherwise returns False.""" + return isinstance(other, AccessionBacklogEntry) and self.code == other.code + + def __hash__(self) -> int: + """Returns accession object unique hash value.""" + return hash(self.code) + + def __str__(self) -> str: + """Returns accession default string representation.""" + return ", ".join((self.code, self.technology, self.source, str(self.published_date.date()))) + + @staticmethod + def create_from_ma_ae_entry(entry): + """Creates accession object from MicroArray ArrayExpress entry.""" + accession = AccessionBacklogEntry() + accession.code = entry["accession"] + accession.source = "ebi_biostudies" + accession.technology = "microarray" + + if "organism" in entry: + accession.organism = entry["organism"] + if "release_date" in entry: + accession.published_date = timezone.make_aware( + datetime.strptime(entry["release_date"], "%Y-%m-%d") + ) + + return accession + + @staticmethod + def create_from_ma_geo_entry(entry): + """Creates accession object from MicroArray GEO meta DB entry.""" + accession = AccessionBacklogEntry() + accession.code = entry["gse"] + accession.source = "geo_meta_db" + accession.technology = "microarray" + + if "organism" in entry: + accession.organism = entry["organism"].lower() + if "submission_date" in entry: + + accession.published_date = timezone.make_aware( + datetime.strptime(entry["submission_date"], "%Y-%m-%d") + ) + + return accession + + @staticmethod + def create_from_rnaseq_entry(entry): + """Creates accession object from RNA-Seq entry.""" + accession = AccessionBacklogEntry() + accession.code = entry["secondary_study_accession"] + accession.source = "ebi_ena_portal" + accession.technology = "rna-seq" + + if "scientific_name" in entry: + accession.organism = entry["scientific_name"].lower() + if "first_public" in entry: + accession.published_date = timezone.make_aware( + datetime.strptime(entry["first_public"], "%Y-%m-%d") + ) + + return accession diff --git a/foreman/data_refinery_foreman/gatherer/__init__.py b/foreman/data_refinery_foreman/gatherer/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/foreman/data_refinery_foreman/gatherer/management/__init__.py b/foreman/data_refinery_foreman/gatherer/management/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/foreman/data_refinery_foreman/gatherer/management/commands/__init__.py b/foreman/data_refinery_foreman/gatherer/management/commands/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/foreman/data_refinery_foreman/gatherer/management/commands/gather_accessions.py b/foreman/data_refinery_foreman/gatherer/management/commands/gather_accessions.py new file mode 100644 index 000000000..c4808a191 --- /dev/null +++ 
b/foreman/data_refinery_foreman/gatherer/management/commands/gather_accessions.py @@ -0,0 +1,731 @@ +"""MicroArray (ArrayExpress, GEO) and RNA-Seq accession gathering automation. +Data sources: + - https://www.ebi.ac.uk/biostudies/help (MicroArray ArrayExpress). + - local SQLite meta DB from https://www.bioconductor.org/packages/release/bioc/html/GEOmetadb.html + (MicroArray GEO). + - https://www.ebi.ac.uk/ena/portal/api/ (RNA-Seq). +""" + +import argparse +import logging +import os +import re +import sqlite3 +from datetime import datetime +from http.client import RemoteDisconnected +from json.decoder import JSONDecodeError +from typing import List, Set +from urllib.parse import quote + +from django.core.management.base import BaseCommand + +import requests +from requests.exceptions import ConnectionError, ConnectTimeout +from retrying import retry +from urllib3.exceptions import ProtocolError + +from data_refinery_common.logging import get_and_configure_logger +from data_refinery_common.models.accession import AccessionBacklogEntry +from data_refinery_common.models.experiment import Experiment + +log = get_and_configure_logger(__name__) + + +class Command(BaseCommand): + """Creates agents and runs actual accession gathering.""" + + RE_ACCESSION = re.compile(r"(\D+)(\d+)") + RE_DATE = re.compile(r"\d{4}-\d{2}-\d{2}") + + # TODO(ark): remove after upgrade to python3.8 where parser argument + # "extend" action is directly available. + # https://docs.python.org/3.8/library/argparse.html#action + class ExtendAction(argparse.Action): + def __call__(self, parser, namespace, values, option_string=None): + items = getattr(namespace, self.dest) or [] + items.extend(values) + setattr(namespace, self.dest, items) + + def add_arguments(self, parser) -> None: + parser.register("action", "extend", Command.ExtendAction) + + parser.add_argument( + "--ae-id", + action="extend", + nargs="+", + type=str, + help="ArrayExpress ID(s) to use for filtering.", + ) + parser.add_argument( + "--ae-ids-file", + type=str, + help="Path to a file containing ArrayExpress ID(s) to use for filtering.", + ) + parser.add_argument("-c", "--count", type=int, help="Number of accessions to collect.") + parser.add_argument( + "-d", + "--dry-run", + action="store_true", + default=False, + help="Do not write the result to the database.", + ) + parser.add_argument( + "-e", + "--exclude-previous", + action="store_true", + default=True, + help="Exclude previously gathered or surveyed accessions.", + ) + parser.add_argument( + "-ne", + "--no-exclude-previous", + action="store_false", + default=False, + dest="exclude_previous", + help="Do not exclude previously gathered or surveyed accessions.", + ) + parser.add_argument( + "--gpl-id", + action="extend", + nargs="+", + type=str, + help="GEO platform ID(s) to use for filtering.", + ) + parser.add_argument( + "--gpl-ids-file", + type=str, + help="Path to a file containing GEO platform ID(s) to use for filtering.", + ) + parser.add_argument( + "-k", + "--keyword", + type=str, + help="Keyword to use for filtering.", + ) + parser.add_argument( + "-m", + "--microarray", + action="store_true", + default=False, + help="Collect MicroArray accessions.", + ) + parser.add_argument( + "-o", "--organism", type=str, help="Organism name to use for filtering." 
+ ) + parser.add_argument( + "-r", + "--rna-seq", + action="store_true", + default=False, + help="Collect RNA-Seq accessions.", + ) + parser.add_argument( + "-s", + "--since", + type=str, + required=True, + help="Collect accessions made public on or after this date.", + ) + parser.add_argument( + "--taxon-id", + action="extend", + nargs="+", + type=int, + help="Taxon ID(s) to use for filtering.", + ) + parser.add_argument( + "--taxon-ids-file", + type=str, + help="Path to a file containing taxon ID(s) to use for filtering.", + ) + parser.add_argument( + "-u", + "--until", + type=str, + help="Collect accessions made public before or on this date.", + ) + parser.add_argument( + "-lv", + "--log-verbose", + action="store_true", + default=False, + help="Enable verbose log output.", + ) + + def set_verbosity_level(self, options) -> None: + """Configures log verbosity level.""" + if options["log_verbose"]: + log.addHandler(logging.StreamHandler()) + log.setLevel(logging.DEBUG) + else: + log.setLevel(logging.ERROR) + + def validate_args(self, options) -> None: + """Validates arguments.""" + if not options["microarray"] and not options["rna_seq"]: + exit("Either --microarray or --rna-seq must be specified.") + + errors = list() + since = options["since"] + until = options["until"] + if not self.RE_DATE.match(since): + errors.append('The -s, --since value must match "YYYY-MM-DD" format.') + if until and not self.RE_DATE.match(until): + errors.append('The -u, --until value must match "YYYY-MM-DD" format.') + if since and until and since > until: + errors.append("The -s, --since date must be earlier than -u, --until date.") + + keyword = options["keyword"] + organism = options["organism"] + if options["microarray"]: + ae_id = options["ae_id"] or options["ae_ids_file"] + gpl_id = options["gpl_id"] or options["gpl_ids_file"] + ids = ae_id or gpl_id + invalid_options_message = ( + "Exactly one of the keyword [-k, --keyword], organism [-o, --organism] or " + "ArrayExpress ID(s) [--ae-id, --ae-ids-file] / GEO platform ID(s) " + "[--gpl-id, --gpl-ids-file] must be specified." + ) + elif options["rna_seq"]: + taxon_id = options["taxon_id"] or options["taxon_ids_file"] + ids = taxon_id + invalid_options_message = ( + "Exactly one of the keyword [-k, --keyword], organism [-o, --organism] " + "or taxon ID(s) [--taxon-id, --taxon-ids-file] must be specified." + ) + + if len([option for option in (ids, keyword, organism) if option]) != 1: + errors.append(invalid_options_message) + + if errors: + exit("\n".join(errors)) + + def handle(self, *args, **options): + """Runs the accession gathering process.""" + self.validate_args(options) + self.set_verbosity_level(options) + + agents = list() + if options["rna_seq"]: + agents.append(RNASeqAccessionAgent(options)) + elif options["microarray"]: + if ( + options["ae_id"] + or options["ae_ids_file"] + or options["keyword"] + or options["organism"] + ): + agents.append(MicroArrayExpressAccessionAgent(options)) + if ( + options["gpl_id"] + or options["gpl_ids_file"] + or options["keyword"] + or options["organism"] + ): + agents.append(MicroArrayGEOAccessionAgent(options)) + + entries = set() + for agent in agents: + entries.update(agent.collect_data()) + + entries = sorted( # Sort the resulting list. + (entry for entry in entries if self.RE_ACCESSION.match(entry.code)), + key=lambda entry: ( + self.RE_ACCESSION.match(entry.code).group(1), + int(self.RE_ACCESSION.match(entry.code).group(2)), + ), + ) + # Limit the number of output entries. 
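+        # (the -c/--count cap is applied to the combined, sorted result from all agents).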
+ entries = entries[: options["count"]] if options["count"] else entries + + if options["dry_run"]: + if entries: + output = "\n".join((str(entry) for entry in entries)) + else: + output = "No accessions found." + print(output) + else: + AccessionBacklogEntry.objects.bulk_create(entries) + + +class AccessionAgentBase: + "Accession agent base class." + + previous_accessions = set() + retry_params = { + "retry_on_exception": lambda e: isinstance( + e, (ConnectionError, ConnectTimeout, ProtocolError, RemoteDisconnected) + ), + "stop_max_attempt_number": 5, + "wait_exponential_multiplier": 1000, # Seconds. + "wait_exponential_max": 16000, # Seconds. + } + + def __init__(self, options) -> None: + """Populates args and values for major variables.""" + self.options = options + self.count = options["count"] + self.keyword = options["keyword"] + self.organism = options["organism"] + self.since = options["since"] + self.until = options["until"] or datetime.now().strftime("%Y-%m-%d") + + self.populate_previous_accessions() + + def build_query(self): + """Returns query/query dict depending on the accession data source.""" + raise NotImplementedError + + def collect_data(self): + """Generates resulting entry collection.""" + raise NotImplementedError + + def fetch_data(self): + """Fetches data from an external or local data source.""" + raise NotImplementedError + + def get_ids(self): + """Gets IDs for query filtering depending on the accession technology.""" + raise NotImplementedError + + def populate_previous_accessions(self) -> None: + """Populates previous accession set from a provided excluded ids file.""" + if not self.options["exclude_previous"] or self.previous_accessions: + return + + # Gathered accessions. + self.previous_accessions.update( + (entry["code"] for entry in AccessionBacklogEntry.objects.values("code")) + ) + + # Surveyed accessions. + experiments = Experiment.objects.values("accession_code", "alternate_accession_code") + self.previous_accessions.update( + (experiment["accession_code"] for experiment in experiments) + ) + self.previous_accessions.update( + (experiment["alternate_accession_code"] for experiment in experiments) + ) + + +class MicroArrayExpressAccessionAgent(AccessionAgentBase): + """ + MicroArray ArrayExpress accession gathering agent. The data is fetched from + the BioStudies database. See https://www.ebi.ac.uk/biostudies/help and + https://www.ebi.ac.uk/biostudies/arrayexpress/help#programmatic for more + information about the API endpoints. + """ + + def __init__(self, options) -> None: + super().__init__(options) + + self.data_chunk_size = 100 + self.data_url = "https://www.ebi.ac.uk/biostudies/api/v1/search" + self.ids = self.get_ids() + + def build_query(self) -> dict: + """Returns a query dict for getting array/organism specific accessions.""" + query_dict = { + "directsub": "true", + "page": 1, + "pageSize": self.data_chunk_size, + "release_date": f"[{self.since} TO {self.until}]", + "type": "study", + } + + if self.ids: + # TODO(ark): figure out better way of array filtering. 
+ # Also make sure it's equivalent to the array filtering in this query + # https://github.com/AlexsLemonade/accession_retrieval/blob/master/experiment_accession_retrieval.R#L208 + query_dict.update({"content": ", ".join(self.ids)}) + elif self.keyword: + query_dict.update({"content": self.keyword}) + elif self.organism: + query_dict.update({"organism": f'"{self.organism}"'}) + + return query_dict + + def collect_data(self) -> Set[str]: + """Gets new accessions from EBI Biostudies API.""" + accessions = set() + + if self.ids: + message = ( + "Getting MicroArray ArrayExpress entries by " + f"ArrayExpress ID(s): {', '.join(self.ids)} for [{self.since} - {self.until}] " + "range." + ) + elif self.keyword: + message = ( + "Getting MicroArray ArrayExpress entries by " + f'"{self.keyword}" keyword for [{self.since} - {self.until}] range.' + ) + elif self.organism: + message = ( + "Getting MicroArray ArrayExpress entries by " + f'"{self.organism}" organism for [{self.since} - {self.until}] range.' + ) + else: + return accessions + + log.debug(message) + accessions.update(self.fetch_data()) + + return accessions + + def fetch_data(self) -> Set[str]: + """Retrieves accessions from API search endpoint.""" + + @retry(**self.retry_params) + def get_response(url, **kwargs): + """Gets response from an API endpoint.""" + return requests.get(url, **kwargs) + + accessions = set() + + is_done = False + params = self.build_query() + while not is_done: + range_start = (params["page"] - 1) * params["pageSize"] + 1 + range_end = (params["page"] - 1) * params["pageSize"] + self.data_chunk_size + log.debug(f"Processing entries {range_start} - {range_end}") + + response = get_response(self.data_url, params=params) + entries = response.json().get("hits") + if entries: + entries = ( + AccessionBacklogEntry.create_from_ma_ae_entry(entry) for entry in entries + ) + params["page"] += 1 + else: + is_done = True + + if self.previous_accessions: + entries = (entry for entry in entries if entry.code not in self.previous_accessions) + accessions.update(entries) + + # Quit after getting a sufficient amount of accessions. + if self.count and len(accessions) >= self.count: + is_done = True + + return accessions + + def get_ids(self) -> List[str]: + """Returns a combined list of passed ArrayExpress IDs.""" + ids = set() + + if self.options["ae_id"]: + ids.update(self.options["ae_id"]) + + if self.options["ae_ids_file"]: + with open(self.options["ae_ids_file"]) as ae_ids_file: + ids.update((ae_id.strip() for ae_id in ae_ids_file.readlines())) + + return sorted(ids) + + +class MicroArrayGEOAccessionAgent(AccessionAgentBase): + """ + MicroArray GEO accession gathering agent. The data is fetched from a local + SQLite GEO meta database. 
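+    The database file is expected at `data/microarray/GEOmetadb.sqlite`
+    (see `self.db_path` in `__init__`).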
+ """ + + def __init__(self, options) -> None: + super().__init__(options) + + self.db_path = "data/microarray/GEOmetadb.sqlite" + self.ids = self.get_ids() + + def build_query(self) -> str: + """Returns a query for getting GEO accessions from the local SQLite meta DB.""" + tables = [ + f"SELECT *", + "FROM gse_gpl", + "JOIN gpl ON gse_gpl.gpl=gpl.gpl", + "JOIN gse ON gse.gse=gse_gpl.gse", + "GROUP BY gse_gpl.gse", + ] + + conditions = [ + f"HAVING gse.submission_date >= '{self.since}'", + f"gse.submission_date <= '{self.until}'", + ] + + if self.ids: + gpl_ids = (f"'{gpl_id}'" for gpl_id in self.ids) + conditions.append(f"gse_gpl.gpl IN ({', '.join(gpl_ids)})") + elif self.organism: + conditions.append(f"lower(organism)='{self.organism.lower()}'") + + return f"{' '.join(tables)} {' AND '.join(conditions)}" + + def collect_data(self) -> Set[str]: + """Gets new accessions from GEO database.""" + accessions = set() + + if self.ids: + message = ( + "Getting MicroArray GEO entries by GEO platform ID(s): " + f"{', '.join(self.ids)} for [{self.since} - {self.until}] range." + ) + elif self.keyword: + message = ( + f'Getting MicroArray GEO entries by "{self.keyword}" keyword ' + f"for [{self.since} - {self.until}] range." + ) + elif self.organism: + message = ( + f'Getting MicroArray GEO entries by "{self.organism}" organism ' + f"for [{self.since} - {self.until}] range." + ) + else: + return accessions + + log.debug(message) + accessions.update(self.fetch_data()) + + return accessions + + def fetch_data(self) -> Set[str]: + """Retrieves accessions from the GEO meta DB.""" + + def match_keyword(row): + """ + Returns True if `row` matches `self.keyword` based regex. + Otherwise returns False. + """ + return re_keyword.match(" ".join((str(c) for c in row if c))) + + accessions = set() + + if not os.path.exists(self.db_path): + log.error("GEO meta database doesn't exist.") + return accessions + + connection = sqlite3.connect(self.db_path) + connection.row_factory = sqlite3.Row + connection.text_factory = lambda b: b.decode(errors="ignore") + entries = connection.execute(self.build_query()).fetchall() + connection.close() + + if self.keyword: + re_keyword = re.compile(f".*{self.keyword}.*", re.IGNORECASE) # Keyword regex. + entries = filter(match_keyword, entries) + + entries = ({key.lower(): entry[key] for key in entry.keys()} for entry in entries) + entries = set((AccessionBacklogEntry.create_from_ma_geo_entry(entry) for entry in entries)) + + if self.previous_accessions: + entries = (entry for entry in entries if entry.code not in self.previous_accessions) + accessions.update(entries) + + return accessions + + def get_ids(self) -> List[str]: + """Returns a combined list of passed GEO platform IDs.""" + ids = set() + + if self.options["gpl_id"]: + ids.update(self.options["gpl_id"]) + + if self.options["gpl_ids_file"]: + with open(self.options["gpl_ids_file"]) as gpl_ids_file: + ids.update((gpl_id.strip() for gpl_id in gpl_ids_file.readlines())) + + return sorted(ids) + + +class RNASeqAccessionAgent(AccessionAgentBase): + """ + RNA-Seq accession gathering agent. The data is fetched from + The European Nucleotide Archive (ENA) Portal. + See https://www.ebi.ac.uk/ena/portal/api/ for more information about the API + endpoints. 
+ """ + + def __init__(self, options) -> None: + super().__init__(options) + + self.data_chunk_size = 10000 + self.data_url = "https://www.ebi.ac.uk/ena/portal/api/search" + self.ids = self.get_ids() + + def build_query(self, taxon_id: str = None) -> str: + """ + Returns a query to use for getting specific taxon ID accessions. + Some special characters must remain unquoted. + """ + + AND = " AND " + OR = " OR " + instrument_models = ( + "HiSeq X Five", + "HiSeq X Ten", + "Illumina Genome Analyzer II", + "Illumina Genome Analyzer IIx", + "Illumina Genome Analyzer", + "Illumina HiScanSQ", + "Illumina HiSeq 1000", + "Illumina HiSeq 1500", + "Illumina HiSeq 2000", + "Illumina HiSeq 2500", + "Illumina HiSeq 3000", + "Illumina HiSeq 4000", + "Illumina MiSeq", + "Illumina NovaSeq 6000", + "Ion Torrent Proton", + "Ion Torrent S5 XL", + "Ion Torrent S5", + "NextSeq 500", + "NextSeq 550", + ) + + instrument_models = OR.join((f'instrument_model="{im}"' for im in instrument_models)) + conditions = [ + # Relevant date fields: collection_date, collection_date_submitted, + # first_public, last_updated. + f"first_public >= {self.since}", + f"first_public <= {self.until}", + f"({instrument_models})", + 'library_source="TRANSCRIPTOMIC"', + 'library_strategy="RNA-Seq"', + ] + + if taxon_id: + conditions.append(f"tax_eq({taxon_id})") + elif self.keyword: + search_fields = ( + "assembly_software", + "bio_material", + "center_name", + "collected_by", + "experiment_title", + "host_body_site", + "instrument_model", + "instrument_platform", + "library_name", + "project_name", + "sample_title", + "sequencing_method", + "study_title", + ) + search_fields = OR.join( + (f'{sf}="*{self.keyword}*"' for sf in search_fields) + ) # Keyword regex. + conditions.append(f"({search_fields})") + elif self.organism: + # `host`: Natural (as opposed to laboratory) host to the organism from which sample + # was obtained. + # `host_scientific_name`: Scientific name of the natural (as opposed to laboratory) + # host to the organism from which sample was obtained. + # `scientific_name` Scientific name of the organism from which the sample was derived. + # Neither `host_scientific_name` nor `scientific_name` available for search. + # https://www.ebi.ac.uk/ena/portal/api/searchFields?dataPortal=ena&format=json&result=read_study + conditions.append(f'host="{self.organism}"') + + return quote(AND.join(conditions), safe='*()-="<>/ ') # Must remain unquoted. + + def collect_data(self) -> Set[str]: + """Gets new accessions from EBI ENA API.""" + accessions = set() + + if self.ids: + log.debug( + f"Getting RNA-Seq entries by taxon ID(s): " + f"{', '.join((str(idx) for idx in self.ids))} for [{self.since} - {self.until}] range." + ) + total = len(self.ids) + for idx, taxon_id in enumerate(self.ids): + if self.count and len(accessions) >= self.count: + break + + if total > 1: + log.debug(f"Getting entries for taxon ID {taxon_id}, {idx + 1} of {total}.") + accessions.update(self.fetch_data(taxon_id=taxon_id)) + elif self.keyword: + log.debug( + f'Getting RNA-Seq entries by "{self.keyword}" keyword ' + f"for [{self.since} - {self.until}] range." + ) + accessions.update(self.fetch_data()) + elif self.organism: + log.debug( + f'Getting entries by "{self.organism}" organism ' + f"for [{self.since} - {self.until}] range." + ) + accessions.update(self.fetch_data()) + + return accessions + + def fetch_data(self, taxon_id=None) -> Set[str]: + """ + Retrieves accessions from API search endpoint. 
+ The API allows to set limit to 0 (get all in one request) but we do + it in a paginated fashion with `self.data_chunk_size` as a page size. + """ + + @retry(**self.retry_params) + def get_response(url, **kwargs): + """Gets response from an API endpoint.""" + return requests.post(url, **kwargs) + + accessions = set() + + fields = [ + "first_public", + "scientific_name", + "secondary_study_accession", + ] # For DRP/ERP/SRP-prefixed accessions. + data = { + "dataPortal": "ena", + # TODO(ark): add excludeAccessions/excludeAccessionType support. + "fields": ",".join(fields), # Use "all" to get all fields. + "format": "json", + "limit": self.data_chunk_size, + "offset": 0, + "query": self.build_query(taxon_id=taxon_id), + "result": "read_study", + "sortFields": fields, + } + + is_done = False + while not is_done: + log.debug( + f"Processing entries {data['offset'] + 1} - {data['offset'] + self.data_chunk_size}" + ) + entries = () + try: + response = get_response(self.data_url, data=data) + entries = response.json() + # TODO(ark): add `organism` when -o, --organism flag is used. + entries = ( + AccessionBacklogEntry.create_from_rnaseq_entry(entry) for entry in entries + ) + except JSONDecodeError: + is_done = True + except TypeError: + log.error(f"Couldn't get data from {self.data_url}. Response: {entries}") + data["offset"] += self.data_chunk_size + + if self.previous_accessions: + entries = (entry for entry in entries if entry.code not in self.previous_accessions) + accessions.update(entries) + + # Quit after getting a sufficient amount of accessions. + if self.count and len(accessions) >= self.count: + is_done = True + + return accessions + + def get_ids(self) -> List[str]: + """Returns a combined list of passed taxon IDs.""" + ids = set() + + if self.options["taxon_id"]: + ids.update(self.options["taxon_id"]) + + if self.options["taxon_ids_file"]: + with open(self.options["taxon_ids_file"]) as taxon_id_file: + ids.update((taxon_id.strip() for taxon_id in taxon_id_file.readlines())) + + return sorted(ids) diff --git a/foreman/data_refinery_foreman/settings.py b/foreman/data_refinery_foreman/settings.py index 7a489facc..5fea76d71 100644 --- a/foreman/data_refinery_foreman/settings.py +++ b/foreman/data_refinery_foreman/settings.py @@ -47,6 +47,7 @@ "data_refinery_common", "data_refinery_foreman.surveyor", "data_refinery_foreman.foreman", + "data_refinery_foreman.gatherer", "raven.contrib.django.raven_compat", "computedfields", ] @@ -108,10 +109,18 @@ # https://docs.djangoproject.com/en/1.10/ref/settings/#auth-password-validators AUTH_PASSWORD_VALIDATORS = [ - {"NAME": "django.contrib.auth.password_validation.UserAttributeSimilarityValidator",}, - {"NAME": "django.contrib.auth.password_validation.MinimumLengthValidator",}, - {"NAME": "django.contrib.auth.password_validation.CommonPasswordValidator",}, - {"NAME": "django.contrib.auth.password_validation.NumericPasswordValidator",}, + { + "NAME": "django.contrib.auth.password_validation.UserAttributeSimilarityValidator", + }, + { + "NAME": "django.contrib.auth.password_validation.MinimumLengthValidator", + }, + { + "NAME": "django.contrib.auth.password_validation.CommonPasswordValidator", + }, + { + "NAME": "django.contrib.auth.password_validation.NumericPasswordValidator", + }, ] diff --git a/foreman/dockerfiles/Dockerfile.foreman b/foreman/dockerfiles/Dockerfile.foreman index 8c09c6888..929ef2476 100644 --- a/foreman/dockerfiles/Dockerfile.foreman +++ b/foreman/dockerfiles/Dockerfile.foreman @@ -8,6 +8,8 @@ RUN apt-get -y install apt-fast RUN 
apt-fast update -qq && \ apt-fast install -y \ + gcc \ + libpq-dev \ python3 \ python3-pip From 9e520a0e0834368830fa73e409cd23c29de01587 Mon Sep 17 00:00:00 2001 From: Arkadii Yakovets Date: Tue, 13 Sep 2022 17:12:41 -0700 Subject: [PATCH 04/24] Address review comments. --- .../migrations/0071_auto_20220901_1653.py | 44 -- ...klogentry.py => 0071_gatheredaccession.py} | 6 +- .../data_refinery_common/models/__init__.py | 2 +- .../{accession.py => gathered_accession.py} | 20 +- .../gatherer/agents/__init__.py | 0 .../gatherer/agents/base.py | 79 +++ .../gatherer/agents/microarray_ae.py | 126 ++++ .../gatherer/agents/microarray_geo.py | 123 ++++ .../gatherer/agents/rna_seq.py | 204 ++++++ .../management/commands/gather_accessions.py | 643 +++--------------- 10 files changed, 626 insertions(+), 621 deletions(-) delete mode 100644 common/data_refinery_common/migrations/0071_auto_20220901_1653.py rename common/data_refinery_common/migrations/{0071_accessionbacklogentry.py => 0071_gatheredaccession.py} (88%) rename common/data_refinery_common/models/{accession.py => gathered_accession.py} (84%) create mode 100644 foreman/data_refinery_foreman/gatherer/agents/__init__.py create mode 100644 foreman/data_refinery_foreman/gatherer/agents/base.py create mode 100644 foreman/data_refinery_foreman/gatherer/agents/microarray_ae.py create mode 100644 foreman/data_refinery_foreman/gatherer/agents/microarray_geo.py create mode 100644 foreman/data_refinery_foreman/gatherer/agents/rna_seq.py diff --git a/common/data_refinery_common/migrations/0071_auto_20220901_1653.py b/common/data_refinery_common/migrations/0071_auto_20220901_1653.py deleted file mode 100644 index c7d3b0b63..000000000 --- a/common/data_refinery_common/migrations/0071_auto_20220901_1653.py +++ /dev/null @@ -1,44 +0,0 @@ -# Generated by Django 3.2.7 on 2022-09-01 16:53 - -from django.db import migrations, models - - -class Migration(migrations.Migration): - - dependencies = [ - ("data_refinery_common", "0070_auto_20211208_2118"), - ] - - operations = [ - migrations.CreateModel( - name="Accession", - fields=[ - ( - "id", - models.AutoField( - auto_created=True, - primary_key=True, - serialize=False, - verbose_name="ID", - ), - ), - ("code", models.TextField()), - ("created_at", models.DateTimeField(auto_now_add=True)), - ("last_modified_at", models.DateTimeField(auto_now=True)), - ("organism", models.TextField()), - ("published_date", models.DateTimeField()), - ("sample_count", models.PositiveIntegerField(default=0)), - ("source", models.TextField()), - ("technology", models.TextField()), - ], - options={ - "db_table": "accessions", - }, - ), - migrations.AddConstraint( - model_name="accession", - constraint=models.UniqueConstraint( - fields=("code", "source", "technology"), name="unique_accession" - ), - ), - ] diff --git a/common/data_refinery_common/migrations/0071_accessionbacklogentry.py b/common/data_refinery_common/migrations/0071_gatheredaccession.py similarity index 88% rename from common/data_refinery_common/migrations/0071_accessionbacklogentry.py rename to common/data_refinery_common/migrations/0071_gatheredaccession.py index 86c04daed..a1740d96e 100644 --- a/common/data_refinery_common/migrations/0071_accessionbacklogentry.py +++ b/common/data_refinery_common/migrations/0071_gatheredaccession.py @@ -1,4 +1,4 @@ -# Generated by Django 3.2.7 on 2022-09-07 19:31 +# Generated by Django 3.2.7 on 2022-09-13 18:14 from django.db import migrations, models @@ -11,7 +11,7 @@ class Migration(migrations.Migration): operations = [ 
migrations.CreateModel( - name="AccessionBacklogEntry", + name="GatheredAccession", fields=[ ( "id", @@ -32,7 +32,7 @@ class Migration(migrations.Migration): ("technology", models.TextField()), ], options={ - "db_table": "accession_backlog", + "db_table": "gathered_accessions", }, ), ] diff --git a/common/data_refinery_common/models/__init__.py b/common/data_refinery_common/models/__init__.py index 8e9564153..2b544765d 100644 --- a/common/data_refinery_common/models/__init__.py +++ b/common/data_refinery_common/models/__init__.py @@ -1,4 +1,3 @@ -from data_refinery_common.models.accession import Accession # noqa from data_refinery_common.models.api_token import APIToken # noqa from data_refinery_common.models.associations.compendium_result_organism_association import ( # noqa CompendiumResultOrganismAssociation, @@ -46,6 +45,7 @@ from data_refinery_common.models.dataset_annotation import DatasetAnnotation # noqa from data_refinery_common.models.experiment import Experiment # noqa from data_refinery_common.models.experiment_annotation import ExperimentAnnotation # noqa +from data_refinery_common.models.gathered_accession import GatheredAccession # noqa from data_refinery_common.models.jobs.downloader_job import DownloaderJob # noqa from data_refinery_common.models.jobs.processor_job import ProcessorJob # noqa from data_refinery_common.models.jobs.survey_job import SurveyJob # noqa diff --git a/common/data_refinery_common/models/accession.py b/common/data_refinery_common/models/gathered_accession.py similarity index 84% rename from common/data_refinery_common/models/accession.py rename to common/data_refinery_common/models/gathered_accession.py index 6ac62da9f..04b084533 100644 --- a/common/data_refinery_common/models/accession.py +++ b/common/data_refinery_common/models/gathered_accession.py @@ -4,11 +4,11 @@ from django.utils import timezone -class AccessionBacklogEntry(models.Model): - """Accession backlog entry model.""" +class GatheredAccession(models.Model): + """Gathered accession model.""" class Meta: - db_table = "accession_backlog" + db_table = "gathered_accessions" code = models.TextField(unique=True) created_at = models.DateTimeField(auto_now_add=True) @@ -21,7 +21,7 @@ class Meta: def __eq__(self, other: object) -> bool: """Returns True if two objects are equal. 
Otherwise returns False.""" - return isinstance(other, AccessionBacklogEntry) and self.code == other.code + return isinstance(other, GatheredAccession) and self.code == other.code def __hash__(self) -> int: """Returns accession object unique hash value.""" @@ -32,15 +32,15 @@ def __str__(self) -> str: return ", ".join((self.code, self.technology, self.source, str(self.published_date.date()))) @staticmethod - def create_from_ma_ae_entry(entry): + def create_from_ma_ae_entry(entry, organism=None): """Creates accession object from MicroArray ArrayExpress entry.""" - accession = AccessionBacklogEntry() + accession = GatheredAccession() accession.code = entry["accession"] accession.source = "ebi_biostudies" accession.technology = "microarray" - if "organism" in entry: - accession.organism = entry["organism"] + if organism: + accession.organism = organism if "release_date" in entry: accession.published_date = timezone.make_aware( datetime.strptime(entry["release_date"], "%Y-%m-%d") @@ -51,7 +51,7 @@ def create_from_ma_ae_entry(entry): @staticmethod def create_from_ma_geo_entry(entry): """Creates accession object from MicroArray GEO meta DB entry.""" - accession = AccessionBacklogEntry() + accession = GatheredAccession() accession.code = entry["gse"] accession.source = "geo_meta_db" accession.technology = "microarray" @@ -69,7 +69,7 @@ def create_from_ma_geo_entry(entry): @staticmethod def create_from_rnaseq_entry(entry): """Creates accession object from RNA-Seq entry.""" - accession = AccessionBacklogEntry() + accession = GatheredAccession() accession.code = entry["secondary_study_accession"] accession.source = "ebi_ena_portal" accession.technology = "rna-seq" diff --git a/foreman/data_refinery_foreman/gatherer/agents/__init__.py b/foreman/data_refinery_foreman/gatherer/agents/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/foreman/data_refinery_foreman/gatherer/agents/base.py b/foreman/data_refinery_foreman/gatherer/agents/base.py new file mode 100644 index 000000000..3754a4068 --- /dev/null +++ b/foreman/data_refinery_foreman/gatherer/agents/base.py @@ -0,0 +1,79 @@ +"""Abstract base class for accession gathering automation agents.""" + +from abc import ABC, abstractmethod +from datetime import datetime +from http.client import RemoteDisconnected + +from requests.exceptions import ConnectionError, ConnectTimeout +from urllib3.exceptions import ProtocolError + +from data_refinery_common.logging import get_and_configure_logger +from data_refinery_common.models.experiment import Experiment +from data_refinery_common.models.gathered_accession import GatheredAccession + +logger = get_and_configure_logger(__name__) + + +class AccessionAgentBase(ABC): + "Accession agent abstract base class." + + previous_accessions = set() + retry_params = { + "retry_on_exception": lambda e: isinstance( + e, (ConnectionError, ConnectTimeout, ProtocolError, RemoteDisconnected) + ), + "stop_max_attempt_number": 5, + "wait_exponential_multiplier": 1000, # Seconds. + "wait_exponential_max": 16000, # Seconds. 
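+        # NOTE: `retrying` interprets the two wait values above as milliseconds,
+        # i.e. the exponential backoff is capped at 16 seconds per retry.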
+ } + + def __init__(self, options) -> None: + """Populates args and values for major variables.""" + self.options = options + self.count = options["count"] + self.keyword = options["keyword"] + self.organism = options["organism"] + self.since = options["since"] + self.until = options["until"] or datetime.now().strftime("%Y-%m-%d") + + self.ids = self.get_ids() + self.populate_previous_accessions() + + @abstractmethod + def build_query(self): + """Returns query/query dict depending on the accession data source.""" + pass + + @abstractmethod + def collect_data(self): + """Generates resulting entry collection.""" + pass + + @abstractmethod + def fetch_data(self): + """Fetches data from an external or local data source.""" + pass + + @abstractmethod + def get_ids(self): + """Gets IDs for query filtering depending on the accession technology.""" + pass + + def populate_previous_accessions(self) -> None: + """Populates previous accession set from a provided excluded ids file.""" + if not self.options["exclude_previous"] or self.previous_accessions: + return + + # Gathered accessions. + self.previous_accessions.update( + (entry["code"] for entry in GatheredAccession.objects.values("code")) + ) + + # Surveyed accessions. + experiments = Experiment.objects.values("accession_code", "alternate_accession_code") + self.previous_accessions.update( + (experiment["accession_code"] for experiment in experiments) + ) + self.previous_accessions.update( + (experiment["alternate_accession_code"] for experiment in experiments) + ) diff --git a/foreman/data_refinery_foreman/gatherer/agents/microarray_ae.py b/foreman/data_refinery_foreman/gatherer/agents/microarray_ae.py new file mode 100644 index 000000000..b5314302b --- /dev/null +++ b/foreman/data_refinery_foreman/gatherer/agents/microarray_ae.py @@ -0,0 +1,126 @@ +"""MicroArray ArrayExpress accession gathering automation. +Data source: https://www.ebi.ac.uk/biostudies/help""" + +from typing import List, Set + +import requests +from retrying import retry + +from data_refinery_common.logging import get_and_configure_logger +from data_refinery_common.models.gathered_accession import GatheredAccession +from data_refinery_foreman.gatherer.agents.base import AccessionAgentBase + +logger = get_and_configure_logger(__name__) + + +class MicroArrayExpressAccessionAgent(AccessionAgentBase): + """ + MicroArray ArrayExpress accession gathering agent. The data is fetched from + the BioStudies database. See https://www.ebi.ac.uk/biostudies/help and + https://www.ebi.ac.uk/biostudies/arrayexpress/help#programmatic for more + information about the API endpoints. + """ + + DATA_CHUNK_SIZE = 100 + DATA_URL = "https://www.ebi.ac.uk/biostudies/api/v1/search" + + def build_query(self) -> dict: + """Returns a query dict for getting array/organism specific accessions.""" + query_dict = { + "directsub": "true", + "page": 1, + "pageSize": self.DATA_CHUNK_SIZE, + "release_date": f"[{self.since} TO {self.until}]", + "type": "study", + } + + if self.ids: + # TODO(ark): figure out better way of array filtering. 
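+            # (currently the IDs are passed as a single comma-separated "content" term).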
+ # Also make sure it's equivalent to the array filtering in this query + # https://github.com/AlexsLemonade/accession_retrieval/blob/master/experiment_accession_retrieval.R#L208 + query_dict.update({"content": ", ".join(self.ids)}) + elif self.keyword: + query_dict.update({"content": self.keyword}) + elif self.organism: + query_dict.update({"organism": f'"{self.organism}"'}) + + return query_dict + + def collect_data(self) -> Set[str]: + """Gets new accessions from EBI Biostudies API.""" + accessions = set() + + if self.ids: + message = ( + "Getting MicroArray ArrayExpress entries by " + f"ArrayExpress ID(s): {', '.join(self.ids)} for [{self.since} - {self.until}] " + "range." + ) + elif self.keyword: + message = ( + "Getting MicroArray ArrayExpress entries by " + f'"{self.keyword}" keyword for [{self.since} - {self.until}] range.' + ) + elif self.organism: + message = ( + "Getting MicroArray ArrayExpress entries by " + f'"{self.organism}" organism for [{self.since} - {self.until}] range.' + ) + else: + return accessions + + logger.debug(message) + accessions.update(self.fetch_data()) + + return accessions + + def fetch_data(self) -> Set[str]: + """Retrieves accessions from API search endpoint.""" + + @retry(**self.retry_params) + def get_response(url, **kwargs): + """Gets response from an API endpoint.""" + return requests.get(url, **kwargs) + + accessions = set() + + is_done = False + params = self.build_query() + while not is_done: + range_start = (params["page"] - 1) * params["pageSize"] + 1 + range_end = (params["page"] - 1) * params["pageSize"] + self.DATA_CHUNK_SIZE + logger.debug(f"Processing entries {range_start} - {range_end}") + + response = get_response(self.DATA_URL, params=params) + entries = response.json().get("hits") + if entries: + entries = ( + GatheredAccession.create_from_ma_ae_entry(entry, organism=self.organism) + for entry in entries + ) + params["page"] += 1 + else: + is_done = True + + if self.previous_accessions: + entries = (entry for entry in entries if entry.code not in self.previous_accessions) + accessions.update(entries) + + # Quit after getting a sufficient amount of accessions. + if self.count and len(accessions) >= self.count: + is_done = True + + return accessions + + def get_ids(self) -> List[str]: + """Returns a combined list of passed ArrayExpress IDs.""" + ids = set() + + if self.options["ae_id"]: + ids.update(self.options["ae_id"]) + + if self.options["ae_ids_file"]: + with open(self.options["ae_ids_file"]) as ae_ids_file: + ids.update((ae_id.strip() for ae_id in ae_ids_file.readlines())) + + return sorted(ids) diff --git a/foreman/data_refinery_foreman/gatherer/agents/microarray_geo.py b/foreman/data_refinery_foreman/gatherer/agents/microarray_geo.py new file mode 100644 index 000000000..975c715b3 --- /dev/null +++ b/foreman/data_refinery_foreman/gatherer/agents/microarray_geo.py @@ -0,0 +1,123 @@ +"""MicroArray GEO accession gathering automation. +Data source: local SQLite meta DB from https://www.bioconductor.org/packages/release/bioc/html/GEOmetadb.html""" + +import os +import re +import sqlite3 +from typing import List, Set + +from data_refinery_common.logging import get_and_configure_logger +from data_refinery_common.models.gathered_accession import GatheredAccession +from data_refinery_foreman.gatherer.agents.base import AccessionAgentBase + +logger = get_and_configure_logger(__name__) + + +class MicroArrayGEOAccessionAgent(AccessionAgentBase): + """ + MicroArray GEO accession gathering agent. 
The data is fetched from a local + SQLite GEO meta database. + """ + + # TODO(ark): move the DB file from Docker image to S3. + # Implement syncing procedure. + # Update URL once the original file is available again. + DB_PATH = "data/microarray/GEOmetadb.sqlite" + + def build_query(self) -> str: + """Returns a query for getting GEO accessions from the local SQLite meta DB.""" + tables = [ + "SELECT *", + "FROM gse_gpl", + "JOIN gpl ON gse_gpl.gpl=gpl.gpl", + "JOIN gse ON gse.gse=gse_gpl.gse", + "GROUP BY gse_gpl.gse", + ] + + conditions = [ + f"HAVING gse.submission_date >= '{self.since}'", + f"gse.submission_date <= '{self.until}'", + ] + + if self.ids: + gpl_ids = (f"'{gpl_id}'" for gpl_id in self.ids) + conditions.append(f"gse_gpl.gpl IN ({', '.join(gpl_ids)})") + elif self.organism: + conditions.append(f"lower(organism)='{self.organism.lower()}'") + + return f"{' '.join(tables)} {' AND '.join(conditions)}" + + def collect_data(self) -> Set[str]: + """Gets new accessions from GEO database.""" + accessions = set() + + if self.ids: + message = ( + "Getting MicroArray GEO entries by GEO platform ID(s): " + f"{', '.join(self.ids)} for [{self.since} - {self.until}] range." + ) + elif self.keyword: + message = ( + f'Getting MicroArray GEO entries by "{self.keyword}" keyword ' + f"for [{self.since} - {self.until}] range." + ) + elif self.organism: + message = ( + f'Getting MicroArray GEO entries by "{self.organism}" organism ' + f"for [{self.since} - {self.until}] range." + ) + else: + return accessions + + logger.debug(message) + accessions.update(self.fetch_data()) + + return accessions + + def fetch_data(self) -> Set[str]: + """Retrieves accessions from the GEO meta DB.""" + + def match_keyword(row): + """ + Returns True if `row` matches `self.keyword` based regex. + Otherwise returns False. + """ + return re_keyword.match(" ".join((str(c) for c in row if c))) + + accessions = set() + + if not os.path.exists(self.DB_PATH): + logger.error("GEO meta database doesn't exist.") + return accessions + + connection = sqlite3.connect(self.DB_PATH) + connection.row_factory = sqlite3.Row + connection.text_factory = lambda b: b.decode(errors="ignore") + entries = connection.execute(self.build_query()).fetchall() + connection.close() + + if self.keyword: + re_keyword = re.compile(f".*{self.keyword}.*", re.IGNORECASE) # Keyword regex. + entries = filter(match_keyword, entries) + + entries = ({key.lower(): entry[key] for key in entry.keys()} for entry in entries) + entries = set((GatheredAccession.create_from_ma_geo_entry(entry) for entry in entries)) + + if self.previous_accessions: + entries = (entry for entry in entries if entry.code not in self.previous_accessions) + accessions.update(entries) + + return accessions + + def get_ids(self) -> List[str]: + """Returns a combined list of passed GEO platform IDs.""" + ids = set() + + if self.options["gpl_id"]: + ids.update(self.options["gpl_id"]) + + if self.options["gpl_ids_file"]: + with open(self.options["gpl_ids_file"]) as gpl_ids_file: + ids.update((gpl_id.strip() for gpl_id in gpl_ids_file.readlines())) + + return sorted(ids) diff --git a/foreman/data_refinery_foreman/gatherer/agents/rna_seq.py b/foreman/data_refinery_foreman/gatherer/agents/rna_seq.py new file mode 100644 index 000000000..f9497f3ba --- /dev/null +++ b/foreman/data_refinery_foreman/gatherer/agents/rna_seq.py @@ -0,0 +1,204 @@ +"""RNA-Seq accession gathering automation. 
+Data source: https://www.ebi.ac.uk/ena/portal/api/""" + +from json.decoder import JSONDecodeError +from typing import List, Set +from urllib.parse import quote + +import requests +from retrying import retry + +from data_refinery_common.logging import get_and_configure_logger +from data_refinery_common.models.gathered_accession import GatheredAccession +from data_refinery_foreman.gatherer.agents.base import AccessionAgentBase + +logger = get_and_configure_logger(__name__) + + +class RNASeqAccessionAgent(AccessionAgentBase): + """ + RNA-Seq accession gathering agent. The data is fetched from + The European Nucleotide Archive (ENA) Portal. + See https://www.ebi.ac.uk/ena/portal/api/ for more information about the API + endpoints. + """ + + DATA_CHUNK_SIZE = 10000 + DATA_URL = "https://www.ebi.ac.uk/ena/portal/api/search" + + def build_query(self, taxon_id: str = None) -> str: + """ + Returns a query to use for getting specific taxon ID accessions. + Some special characters must remain unquoted. + """ + + AND = " AND " + OR = " OR " + instrument_models = ( + "HiSeq X Five", + "HiSeq X Ten", + "Illumina Genome Analyzer II", + "Illumina Genome Analyzer IIx", + "Illumina Genome Analyzer", + "Illumina HiScanSQ", + "Illumina HiSeq 1000", + "Illumina HiSeq 1500", + "Illumina HiSeq 2000", + "Illumina HiSeq 2500", + "Illumina HiSeq 3000", + "Illumina HiSeq 4000", + "Illumina MiSeq", + "Illumina NovaSeq 6000", + "Ion Torrent Proton", + "Ion Torrent S5 XL", + "Ion Torrent S5", + "NextSeq 500", + "NextSeq 550", + ) + + instrument_models = OR.join((f'instrument_model="{im}"' for im in instrument_models)) + conditions = [ + # Relevant date fields: collection_date, collection_date_submitted, + # first_public, last_updated. + f"first_public >= {self.since}", + f"first_public <= {self.until}", + f"({instrument_models})", + 'library_source="TRANSCRIPTOMIC"', + 'library_strategy="RNA-Seq"', + ] + + if taxon_id: + conditions.append(f"tax_eq({taxon_id})") + elif self.keyword: + search_fields = ( + "assembly_software", + "bio_material", + "center_name", + "collected_by", + "experiment_title", + "host_body_site", + "instrument_model", + "instrument_platform", + "library_name", + "project_name", + "sample_title", + "sequencing_method", + "study_title", + ) + search_fields = OR.join( + (f'{sf}="*{self.keyword}*"' for sf in search_fields) + ) # Keyword regex. + conditions.append(f"({search_fields})") + elif self.organism: + # `host`: Natural (as opposed to laboratory) host to the organism from which sample + # was obtained. + # `host_scientific_name`: Scientific name of the natural (as opposed to laboratory) + # host to the organism from which sample was obtained. + # `scientific_name` Scientific name of the organism from which the sample was derived. + # Neither `host_scientific_name` nor `scientific_name` available for search. + # https://www.ebi.ac.uk/ena/portal/api/searchFields?dataPortal=ena&format=json&result=read_study + conditions.append(f'host="{self.organism}"') + + return quote(AND.join(conditions), safe='*()-="<>/ ') # Must remain unquoted. + + def collect_data(self) -> Set[str]: + """Gets new accessions from EBI ENA API.""" + accessions = set() + + if self.ids: + logger.debug( + f"Getting RNA-Seq entries by taxon ID(s): " + f"{', '.join((str(i) for i in self.ids))} for [{self.since} - {self.until}] range." 
+ ) + total = len(self.ids) + for idx, taxon_id in enumerate(self.ids): + if self.count and len(accessions) >= self.count: + break + + if total > 1: + logger.debug(f"Getting entries for taxon ID {taxon_id}, {idx + 1} of {total}.") + accessions.update(self.fetch_data(taxon_id=taxon_id)) + elif self.keyword: + logger.debug( + f'Getting RNA-Seq entries by "{self.keyword}" keyword ' + f"for [{self.since} - {self.until}] range." + ) + accessions.update(self.fetch_data()) + elif self.organism: + logger.debug( + f'Getting entries by "{self.organism}" organism ' + f"for [{self.since} - {self.until}] range." + ) + accessions.update(self.fetch_data()) + + return accessions + + def fetch_data(self, taxon_id=None) -> Set[str]: + """ + Retrieves accessions from API search endpoint. + The API allows to set limit to 0 (get all in one request) but we do + it in a paginated fashion with `self.DATA_CHUNK_SIZE` as a page size. + """ + + @retry(**self.retry_params) + def get_response(url, **kwargs): + """Gets response from an API endpoint.""" + return requests.post(url, **kwargs) + + accessions = set() + + fields = [ + "first_public", + "scientific_name", + "secondary_study_accession", + ] # For DRP/ERP/SRP-prefixed accessions. + data = { + "dataPortal": "ena", + # TODO(ark): add excludeAccessions/excludeAccessionType support. + "fields": ",".join(fields), # Use "all" to get all fields. + "format": "json", + "limit": self.DATA_CHUNK_SIZE, + "offset": 0, + "query": self.build_query(taxon_id=taxon_id), + "result": "read_study", + "sortFields": fields, + } + + is_done = False + while not is_done: + logger.debug( + f"Processing entries {data['offset'] + 1} - {data['offset'] + self.DATA_CHUNK_SIZE}" + ) + entries = () + try: + response = get_response(self.DATA_URL, data=data) + entries = response.json() + entries = (GatheredAccession.create_from_rnaseq_entry(entry) for entry in entries) + except JSONDecodeError: + is_done = True + except TypeError: + logger.error(f"Couldn't get data from {self.data_url}. Response: {entries}") + data["offset"] += self.DATA_CHUNK_SIZE + + if self.previous_accessions: + entries = (entry for entry in entries if entry.code not in self.previous_accessions) + accessions.update(entries) + + # Quit after getting a sufficient amount of accessions. 
+ if self.count and len(accessions) >= self.count: + is_done = True + + return accessions + + def get_ids(self) -> List[str]: + """Returns a combined list of passed taxon IDs.""" + ids = set() + + if self.options["taxon_id"]: + ids.update(self.options["taxon_id"]) + + if self.options["taxon_ids_file"]: + with open(self.options["taxon_ids_file"]) as taxon_id_file: + ids.update((taxon_id.strip() for taxon_id in taxon_id_file.readlines())) + + return sorted(ids) diff --git a/foreman/data_refinery_foreman/gatherer/management/commands/gather_accessions.py b/foreman/data_refinery_foreman/gatherer/management/commands/gather_accessions.py index c4808a191..445245d3a 100644 --- a/foreman/data_refinery_foreman/gatherer/management/commands/gather_accessions.py +++ b/foreman/data_refinery_foreman/gatherer/management/commands/gather_accessions.py @@ -8,32 +8,27 @@ import argparse import logging -import os import re -import sqlite3 -from datetime import datetime -from http.client import RemoteDisconnected -from json.decoder import JSONDecodeError -from typing import List, Set -from urllib.parse import quote from django.core.management.base import BaseCommand -import requests -from requests.exceptions import ConnectionError, ConnectTimeout -from retrying import retry -from urllib3.exceptions import ProtocolError - from data_refinery_common.logging import get_and_configure_logger -from data_refinery_common.models.accession import AccessionBacklogEntry -from data_refinery_common.models.experiment import Experiment +from data_refinery_common.models.gathered_accession import GatheredAccession +from data_refinery_foreman.gatherer.agents.microarray_ae import MicroArrayExpressAccessionAgent +from data_refinery_foreman.gatherer.agents.microarray_geo import MicroArrayGEOAccessionAgent +from data_refinery_foreman.gatherer.agents.rna_seq import RNASeqAccessionAgent -log = get_and_configure_logger(__name__) +logger = get_and_configure_logger(__name__) class Command(BaseCommand): """Creates agents and runs actual accession gathering.""" + DATA_SOURCE_MA_AE = "microarray-ae" + DATA_SOURCE_MA_GEO = "microarray-geo" + DATA_SOURCE_RNA_SEQ = "rna-seq" + DATA_SOURCES = (DATA_SOURCE_MA_AE, DATA_SOURCE_MA_GEO, DATA_SOURCE_RNA_SEQ) + RE_ACCESSION = re.compile(r"(\D+)(\d+)") RE_DATE = re.compile(r"\d{4}-\d{2}-\d{2}") @@ -76,14 +71,6 @@ def add_arguments(self, parser) -> None: default=True, help="Exclude previously gathered or surveyed accessions.", ) - parser.add_argument( - "-ne", - "--no-exclude-previous", - action="store_false", - default=False, - dest="exclude_previous", - help="Do not exclude previously gathered or surveyed accessions.", - ) parser.add_argument( "--gpl-id", action="extend", @@ -103,21 +90,22 @@ def add_arguments(self, parser) -> None: help="Keyword to use for filtering.", ) parser.add_argument( - "-m", - "--microarray", + "-lv", + "--log-verbose", action="store_true", default=False, - help="Collect MicroArray accessions.", + help="Enable verbose log output.", ) parser.add_argument( - "-o", "--organism", type=str, help="Organism name to use for filtering." + "-ne", + "--no-exclude-previous", + action="store_false", + default=False, + dest="exclude_previous", + help="Do not exclude previously gathered or surveyed accessions.", ) parser.add_argument( - "-r", - "--rna-seq", - action="store_true", - default=False, - help="Collect RNA-Seq accessions.", + "-o", "--organism", type=str, help="Organism name to use for filtering." 
) parser.add_argument( "-s", @@ -126,6 +114,14 @@ def add_arguments(self, parser) -> None: required=True, help="Collect accessions made public on or after this date.", ) + parser.add_argument( + "-src", + "--source", + type=str, + action="extend", + nargs="+", + help="Gather accessions from selected sources.", + ) parser.add_argument( "--taxon-id", action="extend", @@ -144,28 +140,19 @@ def add_arguments(self, parser) -> None: type=str, help="Collect accessions made public before or on this date.", ) - parser.add_argument( - "-lv", - "--log-verbose", - action="store_true", - default=False, - help="Enable verbose log output.", - ) def set_verbosity_level(self, options) -> None: """Configures log verbosity level.""" if options["log_verbose"]: - log.addHandler(logging.StreamHandler()) - log.setLevel(logging.DEBUG) + logger.addHandler(logging.StreamHandler()) + logger.setLevel(logging.DEBUG) else: - log.setLevel(logging.ERROR) + logger.setLevel(logging.ERROR) def validate_args(self, options) -> None: """Validates arguments.""" - if not options["microarray"] and not options["rna_seq"]: - exit("Either --microarray or --rna-seq must be specified.") - errors = list() + since = options["since"] until = options["until"] if not self.RE_DATE.match(since): @@ -177,52 +164,65 @@ def validate_args(self, options) -> None: keyword = options["keyword"] organism = options["organism"] - if options["microarray"]: - ae_id = options["ae_id"] or options["ae_ids_file"] - gpl_id = options["gpl_id"] or options["gpl_ids_file"] - ids = ae_id or gpl_id - invalid_options_message = ( - "Exactly one of the keyword [-k, --keyword], organism [-o, --organism] or " - "ArrayExpress ID(s) [--ae-id, --ae-ids-file] / GEO platform ID(s) " - "[--gpl-id, --gpl-ids-file] must be specified." - ) - elif options["rna_seq"]: - taxon_id = options["taxon_id"] or options["taxon_ids_file"] - ids = taxon_id - invalid_options_message = ( - "Exactly one of the keyword [-k, --keyword], organism [-o, --organism] " - "or taxon ID(s) [--taxon-id, --taxon-ids-file] must be specified." + sources = options["source"] or self.DATA_SOURCES + + for source in sources: + if source in self.DATA_SOURCES: + continue + errors.append( + f"Unknown source: {source}. Supported sources: {', '.join(self.DATA_SOURCES)}" ) - if len([option for option in (ids, keyword, organism) if option]) != 1: - errors.append(invalid_options_message) + if self.DATA_SOURCE_MA_AE in sources: + ids = options["ae_id"] or options["ae_ids_file"] + if not (ids or keyword or organism): + errors.append( + ( + "Exactly one of the keyword [-k, --keyword], organism [-o, --organism] or " + "ArrayExpress ID(s) [--ae-id, --ae-ids-file] must be specified for " + f"'{self.DATA_SOURCE_MA_AE}' source." + ) + ) + if self.DATA_SOURCE_MA_GEO in sources: + ids = options["gpl_id"] or options["gpl_ids_file"] + if not (ids or keyword or organism): + errors.append( + ( + "Exactly one of the keyword [-k, --keyword], organism [-o, --organism] or " + "GEO platform ID(s) [--gpl-id, --gpl-ids-file] must be specified for " + f"'{self.DATA_SOURCE_MA_GEO}' source." + ) + ) + if self.DATA_SOURCE_RNA_SEQ in sources: + ids = options["taxon_id"] or options["taxon_ids_file"] + if not (ids or keyword or organism): + errors.append( + ( + "Exactly one of the keyword [-k, --keyword], organism [-o, --organism] " + "or taxon ID(s) [--taxon-id, --taxon-ids-file] must be specified for " + f"'{self.DATA_SOURCE_RNA_SEQ}' source." 
+ ) + ) if errors: exit("\n".join(errors)) def handle(self, *args, **options): - """Runs the accession gathering process.""" + """Creates agents and runs the accession gathering process.""" self.validate_args(options) self.set_verbosity_level(options) agents = list() - if options["rna_seq"]: + sources = options["source"] or self.DATA_SOURCES + + if self.DATA_SOURCE_RNA_SEQ in sources: agents.append(RNASeqAccessionAgent(options)) - elif options["microarray"]: - if ( - options["ae_id"] - or options["ae_ids_file"] - or options["keyword"] - or options["organism"] - ): - agents.append(MicroArrayExpressAccessionAgent(options)) - if ( - options["gpl_id"] - or options["gpl_ids_file"] - or options["keyword"] - or options["organism"] - ): - agents.append(MicroArrayGEOAccessionAgent(options)) + + if self.DATA_SOURCE_MA_AE in sources: + agents.append(MicroArrayExpressAccessionAgent(options)) + + if self.DATA_SOURCE_MA_GEO in sources: + agents.append(MicroArrayGEOAccessionAgent(options)) entries = set() for agent in agents: @@ -245,487 +245,4 @@ def handle(self, *args, **options): output = "No accessions found." print(output) else: - AccessionBacklogEntry.objects.bulk_create(entries) - - -class AccessionAgentBase: - "Accession agent base class." - - previous_accessions = set() - retry_params = { - "retry_on_exception": lambda e: isinstance( - e, (ConnectionError, ConnectTimeout, ProtocolError, RemoteDisconnected) - ), - "stop_max_attempt_number": 5, - "wait_exponential_multiplier": 1000, # Seconds. - "wait_exponential_max": 16000, # Seconds. - } - - def __init__(self, options) -> None: - """Populates args and values for major variables.""" - self.options = options - self.count = options["count"] - self.keyword = options["keyword"] - self.organism = options["organism"] - self.since = options["since"] - self.until = options["until"] or datetime.now().strftime("%Y-%m-%d") - - self.populate_previous_accessions() - - def build_query(self): - """Returns query/query dict depending on the accession data source.""" - raise NotImplementedError - - def collect_data(self): - """Generates resulting entry collection.""" - raise NotImplementedError - - def fetch_data(self): - """Fetches data from an external or local data source.""" - raise NotImplementedError - - def get_ids(self): - """Gets IDs for query filtering depending on the accession technology.""" - raise NotImplementedError - - def populate_previous_accessions(self) -> None: - """Populates previous accession set from a provided excluded ids file.""" - if not self.options["exclude_previous"] or self.previous_accessions: - return - - # Gathered accessions. - self.previous_accessions.update( - (entry["code"] for entry in AccessionBacklogEntry.objects.values("code")) - ) - - # Surveyed accessions. - experiments = Experiment.objects.values("accession_code", "alternate_accession_code") - self.previous_accessions.update( - (experiment["accession_code"] for experiment in experiments) - ) - self.previous_accessions.update( - (experiment["alternate_accession_code"] for experiment in experiments) - ) - - -class MicroArrayExpressAccessionAgent(AccessionAgentBase): - """ - MicroArray ArrayExpress accession gathering agent. The data is fetched from - the BioStudies database. See https://www.ebi.ac.uk/biostudies/help and - https://www.ebi.ac.uk/biostudies/arrayexpress/help#programmatic for more - information about the API endpoints. 
- """ - - def __init__(self, options) -> None: - super().__init__(options) - - self.data_chunk_size = 100 - self.data_url = "https://www.ebi.ac.uk/biostudies/api/v1/search" - self.ids = self.get_ids() - - def build_query(self) -> dict: - """Returns a query dict for getting array/organism specific accessions.""" - query_dict = { - "directsub": "true", - "page": 1, - "pageSize": self.data_chunk_size, - "release_date": f"[{self.since} TO {self.until}]", - "type": "study", - } - - if self.ids: - # TODO(ark): figure out better way of array filtering. - # Also make sure it's equivalent to the array filtering in this query - # https://github.com/AlexsLemonade/accession_retrieval/blob/master/experiment_accession_retrieval.R#L208 - query_dict.update({"content": ", ".join(self.ids)}) - elif self.keyword: - query_dict.update({"content": self.keyword}) - elif self.organism: - query_dict.update({"organism": f'"{self.organism}"'}) - - return query_dict - - def collect_data(self) -> Set[str]: - """Gets new accessions from EBI Biostudies API.""" - accessions = set() - - if self.ids: - message = ( - "Getting MicroArray ArrayExpress entries by " - f"ArrayExpress ID(s): {', '.join(self.ids)} for [{self.since} - {self.until}] " - "range." - ) - elif self.keyword: - message = ( - "Getting MicroArray ArrayExpress entries by " - f'"{self.keyword}" keyword for [{self.since} - {self.until}] range.' - ) - elif self.organism: - message = ( - "Getting MicroArray ArrayExpress entries by " - f'"{self.organism}" organism for [{self.since} - {self.until}] range.' - ) - else: - return accessions - - log.debug(message) - accessions.update(self.fetch_data()) - - return accessions - - def fetch_data(self) -> Set[str]: - """Retrieves accessions from API search endpoint.""" - - @retry(**self.retry_params) - def get_response(url, **kwargs): - """Gets response from an API endpoint.""" - return requests.get(url, **kwargs) - - accessions = set() - - is_done = False - params = self.build_query() - while not is_done: - range_start = (params["page"] - 1) * params["pageSize"] + 1 - range_end = (params["page"] - 1) * params["pageSize"] + self.data_chunk_size - log.debug(f"Processing entries {range_start} - {range_end}") - - response = get_response(self.data_url, params=params) - entries = response.json().get("hits") - if entries: - entries = ( - AccessionBacklogEntry.create_from_ma_ae_entry(entry) for entry in entries - ) - params["page"] += 1 - else: - is_done = True - - if self.previous_accessions: - entries = (entry for entry in entries if entry.code not in self.previous_accessions) - accessions.update(entries) - - # Quit after getting a sufficient amount of accessions. - if self.count and len(accessions) >= self.count: - is_done = True - - return accessions - - def get_ids(self) -> List[str]: - """Returns a combined list of passed ArrayExpress IDs.""" - ids = set() - - if self.options["ae_id"]: - ids.update(self.options["ae_id"]) - - if self.options["ae_ids_file"]: - with open(self.options["ae_ids_file"]) as ae_ids_file: - ids.update((ae_id.strip() for ae_id in ae_ids_file.readlines())) - - return sorted(ids) - - -class MicroArrayGEOAccessionAgent(AccessionAgentBase): - """ - MicroArray GEO accession gathering agent. The data is fetched from a local - SQLite GEO meta database. 
- """ - - def __init__(self, options) -> None: - super().__init__(options) - - self.db_path = "data/microarray/GEOmetadb.sqlite" - self.ids = self.get_ids() - - def build_query(self) -> str: - """Returns a query for getting GEO accessions from the local SQLite meta DB.""" - tables = [ - f"SELECT *", - "FROM gse_gpl", - "JOIN gpl ON gse_gpl.gpl=gpl.gpl", - "JOIN gse ON gse.gse=gse_gpl.gse", - "GROUP BY gse_gpl.gse", - ] - - conditions = [ - f"HAVING gse.submission_date >= '{self.since}'", - f"gse.submission_date <= '{self.until}'", - ] - - if self.ids: - gpl_ids = (f"'{gpl_id}'" for gpl_id in self.ids) - conditions.append(f"gse_gpl.gpl IN ({', '.join(gpl_ids)})") - elif self.organism: - conditions.append(f"lower(organism)='{self.organism.lower()}'") - - return f"{' '.join(tables)} {' AND '.join(conditions)}" - - def collect_data(self) -> Set[str]: - """Gets new accessions from GEO database.""" - accessions = set() - - if self.ids: - message = ( - "Getting MicroArray GEO entries by GEO platform ID(s): " - f"{', '.join(self.ids)} for [{self.since} - {self.until}] range." - ) - elif self.keyword: - message = ( - f'Getting MicroArray GEO entries by "{self.keyword}" keyword ' - f"for [{self.since} - {self.until}] range." - ) - elif self.organism: - message = ( - f'Getting MicroArray GEO entries by "{self.organism}" organism ' - f"for [{self.since} - {self.until}] range." - ) - else: - return accessions - - log.debug(message) - accessions.update(self.fetch_data()) - - return accessions - - def fetch_data(self) -> Set[str]: - """Retrieves accessions from the GEO meta DB.""" - - def match_keyword(row): - """ - Returns True if `row` matches `self.keyword` based regex. - Otherwise returns False. - """ - return re_keyword.match(" ".join((str(c) for c in row if c))) - - accessions = set() - - if not os.path.exists(self.db_path): - log.error("GEO meta database doesn't exist.") - return accessions - - connection = sqlite3.connect(self.db_path) - connection.row_factory = sqlite3.Row - connection.text_factory = lambda b: b.decode(errors="ignore") - entries = connection.execute(self.build_query()).fetchall() - connection.close() - - if self.keyword: - re_keyword = re.compile(f".*{self.keyword}.*", re.IGNORECASE) # Keyword regex. - entries = filter(match_keyword, entries) - - entries = ({key.lower(): entry[key] for key in entry.keys()} for entry in entries) - entries = set((AccessionBacklogEntry.create_from_ma_geo_entry(entry) for entry in entries)) - - if self.previous_accessions: - entries = (entry for entry in entries if entry.code not in self.previous_accessions) - accessions.update(entries) - - return accessions - - def get_ids(self) -> List[str]: - """Returns a combined list of passed GEO platform IDs.""" - ids = set() - - if self.options["gpl_id"]: - ids.update(self.options["gpl_id"]) - - if self.options["gpl_ids_file"]: - with open(self.options["gpl_ids_file"]) as gpl_ids_file: - ids.update((gpl_id.strip() for gpl_id in gpl_ids_file.readlines())) - - return sorted(ids) - - -class RNASeqAccessionAgent(AccessionAgentBase): - """ - RNA-Seq accession gathering agent. The data is fetched from - The European Nucleotide Archive (ENA) Portal. - See https://www.ebi.ac.uk/ena/portal/api/ for more information about the API - endpoints. 
- """ - - def __init__(self, options) -> None: - super().__init__(options) - - self.data_chunk_size = 10000 - self.data_url = "https://www.ebi.ac.uk/ena/portal/api/search" - self.ids = self.get_ids() - - def build_query(self, taxon_id: str = None) -> str: - """ - Returns a query to use for getting specific taxon ID accessions. - Some special characters must remain unquoted. - """ - - AND = " AND " - OR = " OR " - instrument_models = ( - "HiSeq X Five", - "HiSeq X Ten", - "Illumina Genome Analyzer II", - "Illumina Genome Analyzer IIx", - "Illumina Genome Analyzer", - "Illumina HiScanSQ", - "Illumina HiSeq 1000", - "Illumina HiSeq 1500", - "Illumina HiSeq 2000", - "Illumina HiSeq 2500", - "Illumina HiSeq 3000", - "Illumina HiSeq 4000", - "Illumina MiSeq", - "Illumina NovaSeq 6000", - "Ion Torrent Proton", - "Ion Torrent S5 XL", - "Ion Torrent S5", - "NextSeq 500", - "NextSeq 550", - ) - - instrument_models = OR.join((f'instrument_model="{im}"' for im in instrument_models)) - conditions = [ - # Relevant date fields: collection_date, collection_date_submitted, - # first_public, last_updated. - f"first_public >= {self.since}", - f"first_public <= {self.until}", - f"({instrument_models})", - 'library_source="TRANSCRIPTOMIC"', - 'library_strategy="RNA-Seq"', - ] - - if taxon_id: - conditions.append(f"tax_eq({taxon_id})") - elif self.keyword: - search_fields = ( - "assembly_software", - "bio_material", - "center_name", - "collected_by", - "experiment_title", - "host_body_site", - "instrument_model", - "instrument_platform", - "library_name", - "project_name", - "sample_title", - "sequencing_method", - "study_title", - ) - search_fields = OR.join( - (f'{sf}="*{self.keyword}*"' for sf in search_fields) - ) # Keyword regex. - conditions.append(f"({search_fields})") - elif self.organism: - # `host`: Natural (as opposed to laboratory) host to the organism from which sample - # was obtained. - # `host_scientific_name`: Scientific name of the natural (as opposed to laboratory) - # host to the organism from which sample was obtained. - # `scientific_name` Scientific name of the organism from which the sample was derived. - # Neither `host_scientific_name` nor `scientific_name` available for search. - # https://www.ebi.ac.uk/ena/portal/api/searchFields?dataPortal=ena&format=json&result=read_study - conditions.append(f'host="{self.organism}"') - - return quote(AND.join(conditions), safe='*()-="<>/ ') # Must remain unquoted. - - def collect_data(self) -> Set[str]: - """Gets new accessions from EBI ENA API.""" - accessions = set() - - if self.ids: - log.debug( - f"Getting RNA-Seq entries by taxon ID(s): " - f"{', '.join((str(idx) for idx in self.ids))} for [{self.since} - {self.until}] range." - ) - total = len(self.ids) - for idx, taxon_id in enumerate(self.ids): - if self.count and len(accessions) >= self.count: - break - - if total > 1: - log.debug(f"Getting entries for taxon ID {taxon_id}, {idx + 1} of {total}.") - accessions.update(self.fetch_data(taxon_id=taxon_id)) - elif self.keyword: - log.debug( - f'Getting RNA-Seq entries by "{self.keyword}" keyword ' - f"for [{self.since} - {self.until}] range." - ) - accessions.update(self.fetch_data()) - elif self.organism: - log.debug( - f'Getting entries by "{self.organism}" organism ' - f"for [{self.since} - {self.until}] range." - ) - accessions.update(self.fetch_data()) - - return accessions - - def fetch_data(self, taxon_id=None) -> Set[str]: - """ - Retrieves accessions from API search endpoint. 
- The API allows to set limit to 0 (get all in one request) but we do - it in a paginated fashion with `self.data_chunk_size` as a page size. - """ - - @retry(**self.retry_params) - def get_response(url, **kwargs): - """Gets response from an API endpoint.""" - return requests.post(url, **kwargs) - - accessions = set() - - fields = [ - "first_public", - "scientific_name", - "secondary_study_accession", - ] # For DRP/ERP/SRP-prefixed accessions. - data = { - "dataPortal": "ena", - # TODO(ark): add excludeAccessions/excludeAccessionType support. - "fields": ",".join(fields), # Use "all" to get all fields. - "format": "json", - "limit": self.data_chunk_size, - "offset": 0, - "query": self.build_query(taxon_id=taxon_id), - "result": "read_study", - "sortFields": fields, - } - - is_done = False - while not is_done: - log.debug( - f"Processing entries {data['offset'] + 1} - {data['offset'] + self.data_chunk_size}" - ) - entries = () - try: - response = get_response(self.data_url, data=data) - entries = response.json() - # TODO(ark): add `organism` when -o, --organism flag is used. - entries = ( - AccessionBacklogEntry.create_from_rnaseq_entry(entry) for entry in entries - ) - except JSONDecodeError: - is_done = True - except TypeError: - log.error(f"Couldn't get data from {self.data_url}. Response: {entries}") - data["offset"] += self.data_chunk_size - - if self.previous_accessions: - entries = (entry for entry in entries if entry.code not in self.previous_accessions) - accessions.update(entries) - - # Quit after getting a sufficient amount of accessions. - if self.count and len(accessions) >= self.count: - is_done = True - - return accessions - - def get_ids(self) -> List[str]: - """Returns a combined list of passed taxon IDs.""" - ids = set() - - if self.options["taxon_id"]: - ids.update(self.options["taxon_id"]) - - if self.options["taxon_ids_file"]: - with open(self.options["taxon_ids_file"]) as taxon_id_file: - ids.update((taxon_id.strip() for taxon_id in taxon_id_file.readlines())) - - return sorted(ids) + GatheredAccession.objects.bulk_create(entries) From 30434e042320e418774257cf494f49860be7f57e Mon Sep 17 00:00:00 2001 From: Arkadii Yakovets Date: Tue, 13 Sep 2022 17:17:19 -0700 Subject: [PATCH 05/24] Add a TODO. --- foreman/data_refinery_foreman/gatherer/agents/rna_seq.py | 1 + 1 file changed, 1 insertion(+) diff --git a/foreman/data_refinery_foreman/gatherer/agents/rna_seq.py b/foreman/data_refinery_foreman/gatherer/agents/rna_seq.py index f9497f3ba..f54ba570a 100644 --- a/foreman/data_refinery_foreman/gatherer/agents/rna_seq.py +++ b/foreman/data_refinery_foreman/gatherer/agents/rna_seq.py @@ -34,6 +34,7 @@ def build_query(self, taxon_id: str = None) -> str: AND = " AND " OR = " OR " + # TODO(ark): extract instrument models to a config file. instrument_models = ( "HiSeq X Five", "HiSeq X Ten", From 811b77ff78d303316d1c10beaff18584122b8e4b Mon Sep 17 00:00:00 2001 From: Arkadii Yakovets Date: Tue, 13 Sep 2022 18:37:09 -0700 Subject: [PATCH 06/24] Fix empty response issue. 
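
When the BioStudies search payload omits the "hits" key (e.g., for an empty
result), `response.json().get("hits")` returns None. The `if entries:` check
flips `is_done`, but `accessions.update(entries)` still iterates over
`entries` and raises `TypeError: 'NoneType' object is not iterable`. Passing
`()` as the default keeps that update a safe no-op. A minimal sketch of the
failure mode, with an assumed (not verified) empty-response payload shape:

    # Hypothetical payload for a search with no results; the real BioStudies
    # response may differ, the point is only the missing "hits" key.
    payload = {"totalHits": 0}

    accessions = set()
    entries = payload.get("hits")  # None when the key is absent.
    try:
        accessions.update(entries)  # TypeError: 'NoneType' object is not iterable
    except TypeError as error:
        print(error)

    entries = payload.get("hits", ())  # () with the new default.
    accessions.update(entries)  # Safe no-op.
    print(accessions)  # set()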
--- foreman/data_refinery_foreman/gatherer/agents/microarray_ae.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/foreman/data_refinery_foreman/gatherer/agents/microarray_ae.py b/foreman/data_refinery_foreman/gatherer/agents/microarray_ae.py index b5314302b..541bd86d2 100644 --- a/foreman/data_refinery_foreman/gatherer/agents/microarray_ae.py +++ b/foreman/data_refinery_foreman/gatherer/agents/microarray_ae.py @@ -92,7 +92,7 @@ def get_response(url, **kwargs): logger.debug(f"Processing entries {range_start} - {range_end}") response = get_response(self.DATA_URL, params=params) - entries = response.json().get("hits") + entries = response.json().get("hits", ()) if entries: entries = ( GatheredAccession.create_from_ma_ae_entry(entry, organism=self.organism) From df14fe3d1e63991f681371e3ecf325e7a0d16edc Mon Sep 17 00:00:00 2001 From: Arkadii Yakovets Date: Wed, 14 Sep 2022 18:52:57 -0700 Subject: [PATCH 07/24] Address review comments. --- .../migrations/0071_gatheredaccession.py | 2 +- .../models/gathered_accession.py | 72 +++++++------------ .../gatherer/agents/base.py | 5 +- .../gatherer/agents/microarray_ae.py | 21 ++++-- .../gatherer/agents/microarray_geo.py | 23 ++++-- .../gatherer/agents/rna_seq.py | 24 +++++-- .../management/commands/gather_accessions.py | 60 ++++++++-------- 7 files changed, 112 insertions(+), 95 deletions(-) diff --git a/common/data_refinery_common/migrations/0071_gatheredaccession.py b/common/data_refinery_common/migrations/0071_gatheredaccession.py index a1740d96e..65d192b59 100644 --- a/common/data_refinery_common/migrations/0071_gatheredaccession.py +++ b/common/data_refinery_common/migrations/0071_gatheredaccession.py @@ -22,7 +22,7 @@ class Migration(migrations.Migration): verbose_name="ID", ), ), - ("code", models.TextField(unique=True)), + ("accession_code", models.TextField(unique=True)), ("created_at", models.DateTimeField(auto_now_add=True)), ("last_modified_at", models.DateTimeField(auto_now=True)), ("organism", models.TextField()), diff --git a/common/data_refinery_common/models/gathered_accession.py b/common/data_refinery_common/models/gathered_accession.py index 04b084533..e56ed615c 100644 --- a/common/data_refinery_common/models/gathered_accession.py +++ b/common/data_refinery_common/models/gathered_accession.py @@ -10,7 +10,7 @@ class GatheredAccession(models.Model): class Meta: db_table = "gathered_accessions" - code = models.TextField(unique=True) + accession_code = models.TextField(unique=True) created_at = models.DateTimeField(auto_now_add=True) last_modified_at = models.DateTimeField(auto_now=True) organism = models.TextField() @@ -21,64 +21,44 @@ class Meta: def __eq__(self, other: object) -> bool: """Returns True if two objects are equal. 
Otherwise returns False.""" - return isinstance(other, GatheredAccession) and self.code == other.code + return isinstance(other, GatheredAccession) and self.accession_code == other.accession_code def __hash__(self) -> int: """Returns accession object unique hash value.""" - return hash(self.code) + return hash(self.accession_code) def __str__(self) -> str: """Returns accession default string representation.""" - return ", ".join((self.code, self.technology, self.source, str(self.published_date.date()))) - - @staticmethod - def create_from_ma_ae_entry(entry, organism=None): - """Creates accession object from MicroArray ArrayExpress entry.""" - accession = GatheredAccession() - accession.code = entry["accession"] - accession.source = "ebi_biostudies" - accession.technology = "microarray" - - if organism: - accession.organism = organism - if "release_date" in entry: - accession.published_date = timezone.make_aware( - datetime.strptime(entry["release_date"], "%Y-%m-%d") + return ", ".join( + ( + self.accession_code, + self.technology, + self.source, + str(self.published_date.date()), ) - - return accession + ) @staticmethod - def create_from_ma_geo_entry(entry): - """Creates accession object from MicroArray GEO meta DB entry.""" + def create_from_external_entry(data, source, technology, organism=None): + """Creates accession object from MicroArray ArrayExpress entry.""" accession = GatheredAccession() - accession.code = entry["gse"] - accession.source = "geo_meta_db" - accession.technology = "microarray" - - if "organism" in entry: - accession.organism = entry["organism"].lower() - if "submission_date" in entry: - accession.published_date = timezone.make_aware( - datetime.strptime(entry["submission_date"], "%Y-%m-%d") - ) + accession.accession_code = ( + data.get("accession") or data.get("gse") or data.get("secondary_study_accession") + ) - return accession + organism = data.get("organism") or data.get("scientific_name") or organism + if organism: + accession.organism = organism.lower() - @staticmethod - def create_from_rnaseq_entry(entry): - """Creates accession object from RNA-Seq entry.""" - accession = GatheredAccession() - accession.code = entry["secondary_study_accession"] - accession.source = "ebi_ena_portal" - accession.technology = "rna-seq" + published_date = ( + data.get("first_public") or data.get("release_date") or data.get("submission_date") + ) + accession.published_date = timezone.make_aware( + datetime.strptime(published_date, "%Y-%m-%d") + ) - if "scientific_name" in entry: - accession.organism = entry["scientific_name"].lower() - if "first_public" in entry: - accession.published_date = timezone.make_aware( - datetime.strptime(entry["first_public"], "%Y-%m-%d") - ) + accession.source = source + accession.technology = technology return accession diff --git a/foreman/data_refinery_foreman/gatherer/agents/base.py b/foreman/data_refinery_foreman/gatherer/agents/base.py index 3754a4068..818bbf72c 100644 --- a/foreman/data_refinery_foreman/gatherer/agents/base.py +++ b/foreman/data_refinery_foreman/gatherer/agents/base.py @@ -66,7 +66,10 @@ def populate_previous_accessions(self) -> None: # Gathered accessions. self.previous_accessions.update( - (entry["code"] for entry in GatheredAccession.objects.values("code")) + ( + entry["accession_code"] + for entry in GatheredAccession.objects.values("accession_code") + ) ) # Surveyed accessions. 
diff --git a/foreman/data_refinery_foreman/gatherer/agents/microarray_ae.py b/foreman/data_refinery_foreman/gatherer/agents/microarray_ae.py index 541bd86d2..3bfcf08fe 100644 --- a/foreman/data_refinery_foreman/gatherer/agents/microarray_ae.py +++ b/foreman/data_refinery_foreman/gatherer/agents/microarray_ae.py @@ -1,5 +1,7 @@ -"""MicroArray ArrayExpress accession gathering automation. -Data source: https://www.ebi.ac.uk/biostudies/help""" +""" +MicroArray ArrayExpress accession gathering automation. +Data source: https://www.ebi.ac.uk/biostudies/help +""" from typing import List, Set @@ -13,7 +15,7 @@ logger = get_and_configure_logger(__name__) -class MicroArrayExpressAccessionAgent(AccessionAgentBase): +class AEAgent(AccessionAgentBase): """ MicroArray ArrayExpress accession gathering agent. The data is fetched from the BioStudies database. See https://www.ebi.ac.uk/biostudies/help and @@ -23,6 +25,9 @@ class MicroArrayExpressAccessionAgent(AccessionAgentBase): DATA_CHUNK_SIZE = 100 DATA_URL = "https://www.ebi.ac.uk/biostudies/api/v1/search" + SOURCE = "ebi-biostudies" + SOURCE_NAME = "microarray-ae" + TECHNOLOGY = "microarray" def build_query(self) -> dict: """Returns a query dict for getting array/organism specific accessions.""" @@ -95,7 +100,9 @@ def get_response(url, **kwargs): entries = response.json().get("hits", ()) if entries: entries = ( - GatheredAccession.create_from_ma_ae_entry(entry, organism=self.organism) + GatheredAccession.create_from_external_entry( + entry, self.SOURCE, self.TECHNOLOGY, organism=self.organism + ) for entry in entries ) params["page"] += 1 @@ -103,7 +110,11 @@ def get_response(url, **kwargs): is_done = True if self.previous_accessions: - entries = (entry for entry in entries if entry.code not in self.previous_accessions) + entries = ( + entry + for entry in entries + if entry.accession_code not in self.previous_accessions + ) accessions.update(entries) # Quit after getting a sufficient amount of accessions. diff --git a/foreman/data_refinery_foreman/gatherer/agents/microarray_geo.py b/foreman/data_refinery_foreman/gatherer/agents/microarray_geo.py index 975c715b3..2500bcec5 100644 --- a/foreman/data_refinery_foreman/gatherer/agents/microarray_geo.py +++ b/foreman/data_refinery_foreman/gatherer/agents/microarray_geo.py @@ -1,5 +1,8 @@ -"""MicroArray GEO accession gathering automation. -Data source: local SQLite meta DB from https://www.bioconductor.org/packages/release/bioc/html/GEOmetadb.html""" +""" +MicroArray GEO accession gathering automation. +Data source: local SQLite meta DB from +https://www.bioconductor.org/packages/release/bioc/html/GEOmetadb.html +""" import os import re @@ -13,7 +16,7 @@ logger = get_and_configure_logger(__name__) -class MicroArrayGEOAccessionAgent(AccessionAgentBase): +class GEOAgent(AccessionAgentBase): """ MicroArray GEO accession gathering agent. The data is fetched from a local SQLite GEO meta database. @@ -23,6 +26,9 @@ class MicroArrayGEOAccessionAgent(AccessionAgentBase): # Implement syncing procedure. # Update URL once the original file is available again. 
DB_PATH = "data/microarray/GEOmetadb.sqlite" + SOURCE = "geo-meta-db" + SOURCE_NAME = "microarray-geo" + TECHNOLOGY = "microarray" def build_query(self) -> str: """Returns a query for getting GEO accessions from the local SQLite meta DB.""" @@ -101,10 +107,17 @@ def match_keyword(row): entries = filter(match_keyword, entries) entries = ({key.lower(): entry[key] for key in entry.keys()} for entry in entries) - entries = set((GatheredAccession.create_from_ma_geo_entry(entry) for entry in entries)) + entries = set( + ( + GatheredAccession.create_from_external_entry(entry, self.SOURCE, self.TECHNOLOGY) + for entry in entries + ) + ) if self.previous_accessions: - entries = (entry for entry in entries if entry.code not in self.previous_accessions) + entries = ( + entry for entry in entries if entry.accession_code not in self.previous_accessions + ) accessions.update(entries) return accessions diff --git a/foreman/data_refinery_foreman/gatherer/agents/rna_seq.py b/foreman/data_refinery_foreman/gatherer/agents/rna_seq.py index f54ba570a..577f815b8 100644 --- a/foreman/data_refinery_foreman/gatherer/agents/rna_seq.py +++ b/foreman/data_refinery_foreman/gatherer/agents/rna_seq.py @@ -1,5 +1,7 @@ -"""RNA-Seq accession gathering automation. -Data source: https://www.ebi.ac.uk/ena/portal/api/""" +""" +RNA-Seq accession gathering automation. +Data source: https://www.ebi.ac.uk/ena/portal/api/ +""" from json.decoder import JSONDecodeError from typing import List, Set @@ -15,7 +17,7 @@ logger = get_and_configure_logger(__name__) -class RNASeqAccessionAgent(AccessionAgentBase): +class RNASeqAgent(AccessionAgentBase): """ RNA-Seq accession gathering agent. The data is fetched from The European Nucleotide Archive (ENA) Portal. @@ -25,6 +27,9 @@ class RNASeqAccessionAgent(AccessionAgentBase): DATA_CHUNK_SIZE = 10000 DATA_URL = "https://www.ebi.ac.uk/ena/portal/api/search" + SOURCE = "ebi-ena-portal" + SOURCE_NAME = "rna-seq" + TECHNOLOGY = "rna-seq" def build_query(self, taxon_id: str = None) -> str: """ @@ -174,7 +179,12 @@ def get_response(url, **kwargs): try: response = get_response(self.DATA_URL, data=data) entries = response.json() - entries = (GatheredAccession.create_from_rnaseq_entry(entry) for entry in entries) + entries = ( + GatheredAccession.create_from_external_entry( + entry, self.SOURCE, self.TECHNOLOGY + ) + for entry in entries + ) except JSONDecodeError: is_done = True except TypeError: @@ -182,7 +192,11 @@ def get_response(url, **kwargs): data["offset"] += self.DATA_CHUNK_SIZE if self.previous_accessions: - entries = (entry for entry in entries if entry.code not in self.previous_accessions) + entries = ( + entry + for entry in entries + if entry.accession_code not in self.previous_accessions + ) accessions.update(entries) # Quit after getting a sufficient amount of accessions. 
diff --git a/foreman/data_refinery_foreman/gatherer/management/commands/gather_accessions.py b/foreman/data_refinery_foreman/gatherer/management/commands/gather_accessions.py index 445245d3a..2b073ef45 100644 --- a/foreman/data_refinery_foreman/gatherer/management/commands/gather_accessions.py +++ b/foreman/data_refinery_foreman/gatherer/management/commands/gather_accessions.py @@ -11,12 +11,13 @@ import re from django.core.management.base import BaseCommand +from django.db.utils import IntegrityError from data_refinery_common.logging import get_and_configure_logger from data_refinery_common.models.gathered_accession import GatheredAccession -from data_refinery_foreman.gatherer.agents.microarray_ae import MicroArrayExpressAccessionAgent -from data_refinery_foreman.gatherer.agents.microarray_geo import MicroArrayGEOAccessionAgent -from data_refinery_foreman.gatherer.agents.rna_seq import RNASeqAccessionAgent +from data_refinery_foreman.gatherer.agents.microarray_ae import AEAgent +from data_refinery_foreman.gatherer.agents.microarray_geo import GEOAgent +from data_refinery_foreman.gatherer.agents.rna_seq import RNASeqAgent logger = get_and_configure_logger(__name__) @@ -24,11 +25,8 @@ class Command(BaseCommand): """Creates agents and runs actual accession gathering.""" - DATA_SOURCE_MA_AE = "microarray-ae" - DATA_SOURCE_MA_GEO = "microarray-geo" - DATA_SOURCE_RNA_SEQ = "rna-seq" - DATA_SOURCES = (DATA_SOURCE_MA_AE, DATA_SOURCE_MA_GEO, DATA_SOURCE_RNA_SEQ) - + DATA_AGENTS = (AEAgent, GEOAgent, RNASeqAgent) + DATA_SOURCE_NAMES = [agent.SOURCE_NAME for agent in DATA_AGENTS] RE_ACCESSION = re.compile(r"(\D+)(\d+)") RE_DATE = re.compile(r"\d{4}-\d{2}-\d{2}") @@ -164,43 +162,43 @@ def validate_args(self, options) -> None: keyword = options["keyword"] organism = options["organism"] - sources = options["source"] or self.DATA_SOURCES + source_names = options["source"] or self.DATA_SOURCE_NAMES - for source in sources: - if source in self.DATA_SOURCES: + for source_name in source_names: + if source_name in self.DATA_SOURCE_NAMES: continue errors.append( - f"Unknown source: {source}. Supported sources: {', '.join(self.DATA_SOURCES)}" + f"Unknown source: {source_name}. Supported sources: {', '.join(self.DATA_SOURCE_NAMES)}" ) - if self.DATA_SOURCE_MA_AE in sources: + if AEAgent.SOURCE_NAME in source_names: ids = options["ae_id"] or options["ae_ids_file"] if not (ids or keyword or organism): errors.append( ( "Exactly one of the keyword [-k, --keyword], organism [-o, --organism] or " "ArrayExpress ID(s) [--ae-id, --ae-ids-file] must be specified for " - f"'{self.DATA_SOURCE_MA_AE}' source." + f"'{AEAgent.SOURCE_NAME}' source." ) ) - if self.DATA_SOURCE_MA_GEO in sources: + if GEOAgent.SOURCE_NAME in source_names: ids = options["gpl_id"] or options["gpl_ids_file"] if not (ids or keyword or organism): errors.append( ( "Exactly one of the keyword [-k, --keyword], organism [-o, --organism] or " "GEO platform ID(s) [--gpl-id, --gpl-ids-file] must be specified for " - f"'{self.DATA_SOURCE_MA_GEO}' source." + f"'{GEOAgent.SOURCE_NAME}' source." ) ) - if self.DATA_SOURCE_RNA_SEQ in sources: + if RNASeqAgent.SOURCE_NAME in source_names: ids = options["taxon_id"] or options["taxon_ids_file"] if not (ids or keyword or organism): errors.append( ( "Exactly one of the keyword [-k, --keyword], organism [-o, --organism] " "or taxon ID(s) [--taxon-id, --taxon-ids-file] must be specified for " - f"'{self.DATA_SOURCE_RNA_SEQ}' source." + f"'{RNASeqAgent.SOURCE_NAME}' source." 
) ) @@ -213,26 +211,21 @@ def handle(self, *args, **options): self.set_verbosity_level(options) agents = list() - sources = options["source"] or self.DATA_SOURCES - - if self.DATA_SOURCE_RNA_SEQ in sources: - agents.append(RNASeqAccessionAgent(options)) - - if self.DATA_SOURCE_MA_AE in sources: - agents.append(MicroArrayExpressAccessionAgent(options)) - - if self.DATA_SOURCE_MA_GEO in sources: - agents.append(MicroArrayGEOAccessionAgent(options)) + sources_names = options["source"] or self.DATA_SOURCE_NAMES + for cls in self.DATA_AGENTS: + if cls.SOURCE_NAME not in sources_names: + continue + agents.append(cls(options)) entries = set() for agent in agents: entries.update(agent.collect_data()) entries = sorted( # Sort the resulting list. - (entry for entry in entries if self.RE_ACCESSION.match(entry.code)), + (entry for entry in entries if self.RE_ACCESSION.match(entry.accession_code)), key=lambda entry: ( - self.RE_ACCESSION.match(entry.code).group(1), - int(self.RE_ACCESSION.match(entry.code).group(2)), + self.RE_ACCESSION.match(entry.accession_code).group(1), + int(self.RE_ACCESSION.match(entry.accession_code).group(2)), ), ) # Limit the number of output entries. @@ -245,4 +238,7 @@ def handle(self, *args, **options): output = "No accessions found." print(output) else: - GatheredAccession.objects.bulk_create(entries) + try: + GatheredAccession.objects.bulk_create(entries) + except IntegrityError as e: + logger.exception(f"Could not save new accessions to the database: {e}") From c4b43eecc479255a40902707bd026eeacff95788 Mon Sep 17 00:00:00 2001 From: Arkadii Yakovets Date: Wed, 21 Sep 2022 11:05:24 -0700 Subject: [PATCH 08/24] Rename agent files. --- .../gatherer/agents/{microarray_ae.py => ae_agent.py} | 0 .../gatherer/agents/{microarray_geo.py => geo_agent.py} | 0 .../gatherer/agents/{rna_seq.py => rnaseq_agent.py} | 0 .../gatherer/management/commands/gather_accessions.py | 6 +++--- 4 files changed, 3 insertions(+), 3 deletions(-) rename foreman/data_refinery_foreman/gatherer/agents/{microarray_ae.py => ae_agent.py} (100%) rename foreman/data_refinery_foreman/gatherer/agents/{microarray_geo.py => geo_agent.py} (100%) rename foreman/data_refinery_foreman/gatherer/agents/{rna_seq.py => rnaseq_agent.py} (100%) diff --git a/foreman/data_refinery_foreman/gatherer/agents/microarray_ae.py b/foreman/data_refinery_foreman/gatherer/agents/ae_agent.py similarity index 100% rename from foreman/data_refinery_foreman/gatherer/agents/microarray_ae.py rename to foreman/data_refinery_foreman/gatherer/agents/ae_agent.py diff --git a/foreman/data_refinery_foreman/gatherer/agents/microarray_geo.py b/foreman/data_refinery_foreman/gatherer/agents/geo_agent.py similarity index 100% rename from foreman/data_refinery_foreman/gatherer/agents/microarray_geo.py rename to foreman/data_refinery_foreman/gatherer/agents/geo_agent.py diff --git a/foreman/data_refinery_foreman/gatherer/agents/rna_seq.py b/foreman/data_refinery_foreman/gatherer/agents/rnaseq_agent.py similarity index 100% rename from foreman/data_refinery_foreman/gatherer/agents/rna_seq.py rename to foreman/data_refinery_foreman/gatherer/agents/rnaseq_agent.py diff --git a/foreman/data_refinery_foreman/gatherer/management/commands/gather_accessions.py b/foreman/data_refinery_foreman/gatherer/management/commands/gather_accessions.py index 2b073ef45..554b74350 100644 --- a/foreman/data_refinery_foreman/gatherer/management/commands/gather_accessions.py +++ b/foreman/data_refinery_foreman/gatherer/management/commands/gather_accessions.py @@ -15,9 +15,9 @@ 
from data_refinery_common.logging import get_and_configure_logger from data_refinery_common.models.gathered_accession import GatheredAccession -from data_refinery_foreman.gatherer.agents.microarray_ae import AEAgent -from data_refinery_foreman.gatherer.agents.microarray_geo import GEOAgent -from data_refinery_foreman.gatherer.agents.rna_seq import RNASeqAgent +from data_refinery_foreman.gatherer.agents.ae_agent import AEAgent +from data_refinery_foreman.gatherer.agents.geo_agent import GEOAgent +from data_refinery_foreman.gatherer.agents.rnaseq_agent import RNASeqAgent logger = get_and_configure_logger(__name__) From fef0e6fd4b07eccbdfc25a6d5860000b41203281 Mon Sep 17 00:00:00 2001 From: Arkadii Yakovets Date: Mon, 3 Oct 2022 16:15:57 -0700 Subject: [PATCH 09/24] Update terraform config in order to make it runnable on arm64. - Migrate `template_file` to `templatefile` - Inline user_data templates. - Fix s3 backend configuration. - Set postgres version to 11.16. - Change DB `name` to `db_name`. --- ami/instance.tf | 12 ++- ami/user-data-script.tf | 16 ---- infrastructure/backend.tf | 10 +-- infrastructure/batch.tf | 14 +++- infrastructure/database.tf | 34 ++++---- infrastructure/instances.tf | 162 ++++++++++++++---------------------- infrastructure/variables.tf | 2 +- 7 files changed, 103 insertions(+), 147 deletions(-) diff --git a/ami/instance.tf b/ami/instance.tf index 620b13803..80680f351 100644 --- a/ami/instance.tf +++ b/ami/instance.tf @@ -26,7 +26,11 @@ resource "aws_instance" "ubuntu-ami-template-instance" { # Our instance-user-data.sh script is built by Terraform at # apply-time so that it can put additional files onto the # instance. For more information see the definition of this resource. - user_data = data.template_file.ubuntu_instance_user_data.rendered + user_data = templatefile("ubuntu-instance-user-data.tpl.sh", + { + docker_apt_key = data.local_file.docker_apt_key.content + } + ) subnet_id = data.aws_subnet.ccdl_dev_subnet.id associate_public_ip_address = true @@ -46,7 +50,11 @@ resource "aws_instance" "ecs-ami-template-instance" { # Our instance-user-data.sh script is built by Terraform at # apply-time so that it can put additional files onto the # instance. For more information see the definition of this resource. - user_data = data.template_file.ecs_instance_user_data.rendered + user_data = templatefile("ecs-instance-user-data.tpl.sh", + { + docker_apt_key = data.local_file.docker_apt_key.content + } + ) subnet_id = data.aws_subnet.ccdl_dev_subnet.id associate_public_ip_address = true diff --git a/ami/user-data-script.tf b/ami/user-data-script.tf index d5d0718e3..585beacbe 100644 --- a/ami/user-data-script.tf +++ b/ami/user-data-script.tf @@ -2,19 +2,3 @@ data "local_file" "docker_apt_key" { filename = "docker-apt-key.gpg" } - -data "template_file" "ubuntu_instance_user_data" { - template = file("ubuntu-instance-user-data.tpl.sh") - - vars = { - docker_apt_key = data.local_file.docker_apt_key.content - } -} - -data "template_file" "ecs_instance_user_data" { - template = file("ecs-instance-user-data.tpl.sh") - - vars = { - docker_apt_key = data.local_file.docker_apt_key.content - } -} diff --git a/infrastructure/backend.tf b/infrastructure/backend.tf index 02e25fc39..294f50ef7 100644 --- a/infrastructure/backend.tf +++ b/infrastructure/backend.tf @@ -1,16 +1,8 @@ terraform { backend "s3" { - # Terraform will prompt the user for the other keys. 
- region = "us-east-1" - } -} - -data "terraform_remote_state" "network" { - backend = "s3" - config = { bucket = "refinebio-tfstate-deploy-${var.stage}" + encrypt = true key = "terraform-${var.user}.tfstate" region = "us-east-1" - encrypt = true } } diff --git a/infrastructure/batch.tf b/infrastructure/batch.tf index 8dbbebd94..ae1ab8216 100644 --- a/infrastructure/batch.tf +++ b/infrastructure/batch.tf @@ -16,7 +16,19 @@ module "batch" { data_refinery_keypair = aws_key_pair.data_refinery data_refinery_worker_security_group = aws_security_group.data_refinery_worker - data_refinery_worker_user_data = data.template_file.worker_script_smusher.rendered + data_refinery_worker_user_data = templatefile( + "workers-configuration/workers-instance-user-data.tpl.sh", + { + database_host = aws_instance.pg_bouncer.private_ip + database_name = aws_db_instance.postgres_db.name + database_password = var.database_password + database_port = var.database_port + database_user = var.database_user + region = var.region + stage = var.stage + user = var.user + } + ) data_refinery_worker_ami = var.worker_ami user = var.user diff --git a/infrastructure/database.tf b/infrastructure/database.tf index 8f4f8520e..503fcf75b 100644 --- a/infrastructure/database.tf +++ b/infrastructure/database.tf @@ -137,11 +137,11 @@ resource "aws_db_instance" "postgres_db" { allocated_storage = 100 storage_type = "gp2" engine = "postgres" - engine_version = "11.1" + engine_version = "11.16" allow_major_version_upgrade = true auto_minor_version_upgrade = false instance_class = "db.${var.database_instance_type}" - name = "data_refinery" + db_name = "data_refinery" port = var.database_hidden_port username = var.database_user password = var.database_password @@ -182,7 +182,19 @@ resource "aws_instance" "pg_bouncer" { # Our instance-user-data.sh script is built by Terraform at # apply-time so that it can put additional files onto the # instance. For more information see the definition of this resource. 
- user_data = data.template_file.pg_bouncer_script_smusher.rendered + user_data = templatefile("workers-configuration/pg-bouncer-instance-user-data.tpl.sh", + { + database_host = aws_db_instance.postgres_db.address + database_name = aws_db_instance.postgres_db.db_name + database_password = var.database_password + database_port = var.database_hidden_port + database_user = var.database_user + listen_port = var.database_port + region = var.region + stage = var.stage + user = var.user + } + ) tags = merge( var.default_tags, @@ -198,19 +210,3 @@ resource "aws_instance" "pg_bouncer" { tags = var.default_tags } } - -data "template_file" "pg_bouncer_script_smusher" { - template = file("workers-configuration/pg-bouncer-instance-user-data.tpl.sh") - - vars = { - database_host = aws_db_instance.postgres_db.address - database_user = var.database_user - database_port = var.database_hidden_port - database_password = var.database_password - database_name = aws_db_instance.postgres_db.name - listen_port = var.database_port - user = var.user - stage = var.stage - region = var.region - } -} diff --git a/infrastructure/instances.tf b/infrastructure/instances.tf index 379971e8d..bed2234dd 100644 --- a/infrastructure/instances.tf +++ b/infrastructure/instances.tf @@ -7,34 +7,11 @@ data "aws_ami" "ubuntu" { owners = ["589864003899"] filter { - name = "name" + name = "name" values = ["ccdl-ubuntu-18.04-*"] } } -# This script smusher exists in order to be able to circumvent a -# limitation of AWS which is that you get one script and one script -# only to set up the instance when it boots up. Because there is only -# one script you cannot place additional files your script may need -# onto the instance. Therefore this script smusher templates the files -# the instance-user-data.sh script needs into it, so that once it -# makes its way onto the instance it can spit them back out onto the -# disk. -data "template_file" "worker_script_smusher" { - template = file("workers-configuration/workers-instance-user-data.tpl.sh") - - vars = { - user = var.user - stage = var.stage - region = var.region - database_host = aws_instance.pg_bouncer.private_ip - database_port = var.database_port - database_user = var.database_user - database_password = var.database_password - database_name = aws_db_instance.postgres_db.name - } -} - ## # ElasticSearch ## @@ -53,7 +30,7 @@ data "aws_caller_identity" "current" { } resource "aws_elasticsearch_domain" "es" { - domain_name = "es-${var.user}-${var.stage}" + domain_name = "es-${var.user}-${var.stage}" elasticsearch_version = "6.3" advanced_options = { @@ -109,7 +86,7 @@ CONFIG var.default_tags, { Domain = "es-${var.user}-${var.stage}" - Name = "es-${var.user}-${var.stage}" + Name = "es-${var.user}-${var.stage}" } ) } @@ -130,56 +107,43 @@ data "local_file" "api_environment" { filename = "api-configuration/environment" } -# This script smusher serves a similar purpose to -# ${data.template_file.worker_script_smusher} but for the Nginx/API. 
-data "template_file" "api_server_script_smusher" { - template = file("api-configuration/api-server-instance-user-data.tpl.sh") - - vars = { - nginx_config = data.local_file.api_nginx_config.content - api_environment = data.local_file.api_environment.content - dockerhub_repo = var.dockerhub_repo - api_docker_image = var.api_docker_image - data_refinery_cert_bucket = aws_s3_bucket.data_refinery_cert_bucket.id - user = var.user - stage = var.stage - region = var.region - database_host = aws_instance.pg_bouncer.private_ip - database_user = var.database_user - database_password = var.database_password - database_name = aws_db_instance.postgres_db.name - elasticsearch_host = aws_elasticsearch_domain.es.endpoint - elasticsearch_port = "80" # AWS doesn't support the data transfer protocol on 9200 >:[ - log_group = aws_cloudwatch_log_group.data_refinery_log_group.name - log_stream = aws_cloudwatch_log_stream.log_stream_api.name - } - - depends_on = [ - aws_db_instance.postgres_db, - aws_elasticsearch_domain.es, - aws_instance.pg_bouncer, - aws_security_group_rule.data_refinery_api_http, - aws_security_group_rule.data_refinery_api_outbound, - aws_s3_bucket.data_refinery_cert_bucket, - ] -} - resource "aws_instance" "api_server_1" { - ami = data.aws_ami.ubuntu.id - instance_type = var.api_instance_type - availability_zone = "${var.region}a" + ami = data.aws_ami.ubuntu.id + instance_type = var.api_instance_type + availability_zone = "${var.region}a" vpc_security_group_ids = [aws_security_group.data_refinery_api.id] - iam_instance_profile = aws_iam_instance_profile.data_refinery_api.name - subnet_id = aws_subnet.data_refinery_1a.id + iam_instance_profile = aws_iam_instance_profile.data_refinery_api.name + subnet_id = aws_subnet.data_refinery_1a.id depends_on = [ aws_db_instance.postgres_db, aws_elasticsearch_domain.es, aws_instance.pg_bouncer, + aws_s3_bucket.data_refinery_cert_bucket, aws_security_group_rule.data_refinery_api_http, aws_security_group_rule.data_refinery_api_outbound, ] - user_data = data.template_file.api_server_script_smusher.rendered - key_name = aws_key_pair.data_refinery.key_name + + user_data = templatefile("api-configuration/api-server-instance-user-data.tpl.sh", + { + api_docker_image = var.api_docker_image + api_environment = data.local_file.api_environment.content + data_refinery_cert_bucket = aws_s3_bucket.data_refinery_cert_bucket.id + database_host = aws_instance.pg_bouncer.private_ip + database_name = aws_db_instance.postgres_db.db_name + database_password = var.database_password + database_user = var.database_user + dockerhub_repo = var.dockerhub_repo + elasticsearch_host = aws_elasticsearch_domain.es.endpoint + elasticsearch_port = "80" # AWS doesn't support the data transfer protocol on 9200 >:[ + log_group = aws_cloudwatch_log_group.data_refinery_log_group.name + log_stream = aws_cloudwatch_log_stream.log_stream_api.name + nginx_config = data.local_file.api_nginx_config.content + region = var.region + stage = var.stage + user = var.user + } + ) + key_name = aws_key_pair.data_refinery.key_name tags = merge( var.default_tags, @@ -210,44 +174,44 @@ data "local_file" "foreman_environment" { filename = "foreman-configuration/environment" } -# This script smusher serves a similar purpose to -# ${data.template_file.worker_script_smusher} but for the Foreman. 
-data "template_file" "foreman_server_script_smusher" { - template = file( - "foreman-configuration/foreman-server-instance-user-data.tpl.sh", - ) - - vars = { - foreman_environment = data.local_file.foreman_environment.content - dockerhub_repo = var.dockerhub_repo - foreman_docker_image = var.foreman_docker_image - user = var.user - stage = var.stage - region = var.region - database_host = aws_instance.pg_bouncer.private_ip - database_user = var.database_user - database_password = var.database_password - database_name = aws_db_instance.postgres_db.name - elasticsearch_host = aws_elasticsearch_domain.es.endpoint - elasticsearch_port = var.elasticsearch_port - log_group = aws_cloudwatch_log_group.data_refinery_log_group.name - } -} - resource "aws_instance" "foreman_server_1" { - ami = data.aws_ami.ubuntu.id - instance_type = var.foreman_instance_type - availability_zone = "${var.region}a" + ami = data.aws_ami.ubuntu.id + instance_type = var.foreman_instance_type + availability_zone = "${var.region}a" vpc_security_group_ids = [aws_security_group.data_refinery_foreman.id] - iam_instance_profile = aws_iam_instance_profile.data_refinery_foreman.name - subnet_id = aws_subnet.data_refinery_1a.id + iam_instance_profile = aws_iam_instance_profile.data_refinery_foreman.name + subnet_id = aws_subnet.data_refinery_1a.id + depends_on = [ aws_db_instance.postgres_db, - aws_instance.pg_bouncer, aws_elasticsearch_domain.es, + aws_instance.pg_bouncer, + aws_s3_bucket.data_refinery_cert_bucket, + aws_security_group_rule.data_refinery_api_http, + aws_security_group_rule.data_refinery_api_outbound, ] - user_data = data.template_file.foreman_server_script_smusher.rendered - key_name = aws_key_pair.data_refinery.key_name + + user_data = templatefile("api-configuration/api-server-instance-user-data.tpl.sh", + { + api_docker_image = var.api_docker_image + api_environment = data.local_file.api_environment.content + data_refinery_cert_bucket = aws_s3_bucket.data_refinery_cert_bucket.id + database_host = aws_instance.pg_bouncer.private_ip + database_name = aws_db_instance.postgres_db.db_name + database_password = var.database_password + database_user = var.database_user + dockerhub_repo = var.dockerhub_repo + elasticsearch_host = aws_elasticsearch_domain.es.endpoint + elasticsearch_port = "80" # AWS doesn't support the data transfer protocol on 9200 >:[ + log_group = aws_cloudwatch_log_group.data_refinery_log_group.name + log_stream = aws_cloudwatch_log_stream.log_stream_api.name + nginx_config = data.local_file.api_nginx_config.content + region = var.region + stage = var.stage + user = var.user + } + ) + key_name = aws_key_pair.data_refinery.key_name tags = merge( var.default_tags, diff --git a/infrastructure/variables.tf b/infrastructure/variables.tf index 3d56947a0..ab2589f14 100644 --- a/infrastructure/variables.tf +++ b/infrastructure/variables.tf @@ -256,7 +256,7 @@ output "environment_variables" { }, { name = "DATABASE_NAME" - value = aws_db_instance.postgres_db.name + value = aws_db_instance.postgres_db.db_name }, { name = "DATABASE_HOST" From 96867782b7ce3d38336eb3d78af18357d726741f Mon Sep 17 00:00:00 2001 From: Arkadii Yakovets Date: Thu, 6 Oct 2022 09:19:40 -0700 Subject: [PATCH 10/24] Fix foreman misconfiguration. 
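
Patch 09 left `aws_instance.foreman_server_1` rendering the API server's
user-data template. This change points it at
`foreman-configuration/foreman-server-instance-user-data.tpl.sh` with
foreman-specific variables, renames `run_cron_job_test.sh` to
`run_cron_job.sh` (it now runs the accession gathering job in addition to the
cron job tests), and schedules that job on the new, configurable
`accession_gathering_job_run_day` (default `SAT`). With the defaults, the new
crontab entry should render roughly as:

    0 12 * * SAT /bin/bash /home/ubuntu/run_cron_job.sh foreman gather_weekly_accessions >> /var/log/gather_weekly_accessions.log 2>&1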
--- .../foreman-server-instance-user-data.tpl.sh | 9 +++--- infrastructure/instances.tf | 32 +++++++++---------- infrastructure/variables.tf | 8 +++++ 3 files changed, 28 insertions(+), 21 deletions(-) diff --git a/infrastructure/foreman-configuration/foreman-server-instance-user-data.tpl.sh b/infrastructure/foreman-configuration/foreman-server-instance-user-data.tpl.sh index e77541c75..3598c2a45 100644 --- a/infrastructure/foreman-configuration/foreman-server-instance-user-data.tpl.sh +++ b/infrastructure/foreman-configuration/foreman-server-instance-user-data.tpl.sh @@ -77,8 +77,8 @@ docker run \\ -e DATABASE_PASSWORD=${database_password} \\ -v /tmp:/tmp \\ -it ${dockerhub_repo}/dr_\"\$1\" python3 manage.py \"\$2\" -" >> /home/ubuntu/run_cron_job_test.sh -chmod +x /home/ubuntu/run_cron_job_test.sh +" >> /home/ubuntu/run_cron_job.sh +chmod +x /home/ubuntu/run_cron_job.sh # Use Monit to ensure the Foreman is always running apt-get -y update @@ -112,8 +112,9 @@ service monit restart # Install the cron job tests crontab -l > tempcron cat <> tempcron -0 12 * * MON /bin/bash /home/ubuntu/run_cron_job_test.sh affymetrix check_brainarray_gene_agreement >> /var/log/cron_job_tests.log 2>&1 -0 12 * * MON /bin/bash /home/ubuntu/run_cron_job_test.sh affymetrix check_tx_index_transcript_agreement >> /var/log/cron_job_tests.log 2>&1 +0 12 * * MON /bin/bash /home/ubuntu/run_cron_job.sh affymetrix check_brainarray_gene_agreement >> /var/log/cron_job_tests.log 2>&1 +0 12 * * MON /bin/bash /home/ubuntu/run_cron_job.sh affymetrix check_tx_index_transcript_agreement >> /var/log/cron_job_tests.log 2>&1 +0 12 * * ${accession_gathering_job_run_day} /bin/bash /home/ubuntu/run_cron_job.sh foreman gather_weekly_accessions >> /var/log/gather_weekly_accessions.log 2>&1 EOF # install new cron file crontab tempcron diff --git a/infrastructure/instances.tf b/infrastructure/instances.tf index bed2234dd..0628c31d3 100644 --- a/infrastructure/instances.tf +++ b/infrastructure/instances.tf @@ -191,24 +191,22 @@ resource "aws_instance" "foreman_server_1" { aws_security_group_rule.data_refinery_api_outbound, ] - user_data = templatefile("api-configuration/api-server-instance-user-data.tpl.sh", + user_data = templatefile("foreman-configuration/foreman-server-instance-user-data.tpl.sh", { - api_docker_image = var.api_docker_image - api_environment = data.local_file.api_environment.content - data_refinery_cert_bucket = aws_s3_bucket.data_refinery_cert_bucket.id - database_host = aws_instance.pg_bouncer.private_ip - database_name = aws_db_instance.postgres_db.db_name - database_password = var.database_password - database_user = var.database_user - dockerhub_repo = var.dockerhub_repo - elasticsearch_host = aws_elasticsearch_domain.es.endpoint - elasticsearch_port = "80" # AWS doesn't support the data transfer protocol on 9200 >:[ - log_group = aws_cloudwatch_log_group.data_refinery_log_group.name - log_stream = aws_cloudwatch_log_stream.log_stream_api.name - nginx_config = data.local_file.api_nginx_config.content - region = var.region - stage = var.stage - user = var.user + accession_gathering_job_run_day = var.accession_gathering_job_run_day + database_host = aws_instance.pg_bouncer.private_ip + database_name = aws_db_instance.postgres_db.name + database_password = var.database_password + database_user = var.database_user + dockerhub_repo = var.dockerhub_repo + elasticsearch_host = aws_elasticsearch_domain.es.endpoint + elasticsearch_port = var.elasticsearch_port + foreman_docker_image = var.foreman_docker_image + 
foreman_environment = data.local_file.foreman_environment.content + log_group = aws_cloudwatch_log_group.data_refinery_log_group.name + region = var.region + stage = var.stage + user = var.user } ) key_name = aws_key_pair.data_refinery.key_name diff --git a/infrastructure/variables.tf b/infrastructure/variables.tf index ab2589f14..8bf167971 100644 --- a/infrastructure/variables.tf +++ b/infrastructure/variables.tf @@ -223,6 +223,14 @@ variable "processing_compendia" { default = true } +variable "accession_gathering_job_run_day" { + default = "SAT" +} + +variable "max_accessions_gathered_per_run" { + default = 0 +} + # Output our production environment variables. output "environment_variables" { value = [ From dccc246703eb8d25fbca521ebd625de9fd257298 Mon Sep 17 00:00:00 2001 From: Arkadii Yakovets Date: Wed, 2 Nov 2022 09:11:57 -0700 Subject: [PATCH 11/24] Update workers Docker images: - Optimize and reorder building instructions. - Update Python to v3.8. - Resolve OS level package dependency conflicts. - Add and set up missing OS packages. - Add apt cache clean up instructions. - Use `--ignore-installed` pip flag for better deps management. - --- workers/dockerfiles/Dockerfile.affymetrix | 87 +++++++------- .../dockerfiles/Dockerfile.affymetrix_local | 3 +- workers/dockerfiles/Dockerfile.compendia | 106 +++++++++--------- workers/dockerfiles/Dockerfile.downloaders | 88 ++++++++------- workers/dockerfiles/Dockerfile.illumina | 87 +++++++------- workers/dockerfiles/Dockerfile.no_op | 81 ++++++------- workers/dockerfiles/Dockerfile.salmon | 100 +++++++++-------- workers/dockerfiles/Dockerfile.smasher | 86 +++++++------- workers/dockerfiles/Dockerfile.transcriptome | 61 +++++----- 9 files changed, 362 insertions(+), 337 deletions(-) diff --git a/workers/dockerfiles/Dockerfile.affymetrix b/workers/dockerfiles/Dockerfile.affymetrix index 33c6e2518..151473ecf 100644 --- a/workers/dockerfiles/Dockerfile.affymetrix +++ b/workers/dockerfiles/Dockerfile.affymetrix @@ -1,45 +1,51 @@ FROM ubuntu:20.04 +# Source: https://github.com/thisbejim/Pyrebase/issues/87#issuecomment-354452082 +# For whatever reason this worked and 'en_US.UTF-8' did not. +ENV LANG C.UTF-8 + +# Prevent tzdata from prompting us for a timezone and hanging the build. +ENV DEBIAN_FRONTEND=noninteractive + RUN apt-get update -qq RUN apt-get install -y software-properties-common RUN add-apt-repository ppa:apt-fast/stable -RUN apt-get update -qq -RUN apt-get -y install apt-fast -# Prevent tzdata from prompting us for a timezone and hanging the build. -ENV DEBIAN_FRONTEND=noninteractive +RUN apt-get update -qq +RUN apt-get install -y apt-fast apt-transport-https # The packages related to R are somewhat weird, see the README for more details. - COPY workers/CRAN.gpg . 
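# A brief illustration of the --ignore-installed flag mentioned in the commit
# message above (the package name is hypothetical): if the base image ships an
# older distutils-installed copy of, say, PyYAML, a plain
# `pip3 install -r requirements.txt` can fail with "Cannot uninstall 'PyYAML'.
# It is a distutils installed project"; `pip3 install --ignore-installed -r
# requirements.txt` skips the uninstall step and installs the pinned version on top.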
-RUN \ - apt-fast update -qq && \ - apt-get install -y apt-transport-https && \ - apt-fast install -y lsb-release && \ - echo "deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/" \ - >> /etc/apt/sources.list.d/added_repos.list && \ - apt-key add CRAN.gpg && \ - apt-fast update -qq && \ - apt-fast install -y \ +RUN apt-key add CRAN.gpg +RUN echo "deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/" \ + >> /etc/apt/sources.list.d/added_repos.list + +RUN apt-fast update -qq && apt-fast install -y \ + build-essential \ + cmake \ + curl \ + cython3 \ ed \ git \ - mercurial \ libcairo-dev \ + libcurl4-openssl-dev \ libedit-dev \ + libpq-dev \ + libssl-dev \ + libxml2-dev \ + llvm-10-dev \ lsb-release \ - python3 \ + mercurial \ + pkg-config \ python3-pip \ + python3 \ python3-dev \ r-base-core \ - r-base-dev \ - libpq-dev \ - libxml2-dev \ - libssl-dev \ - libcurl4-openssl-dev \ - curl \ - wget && \ - rm -rf /var/lib/apt/lists/* + wget + RUN rm CRAN.gpg +RUN apt-get clean; rm -rf /var/lib/apt/lists/* +RUN ln -s /usr/bin/llvm-config-10 /usr/bin/llvm-config RUN groupadd user && useradd --create-home --home-dir /home/user -g user user WORKDIR /home/user @@ -47,41 +53,29 @@ WORKDIR /home/user ENV R_LIBS "/usr/local/lib/R/site-library" COPY common/install_devtools.R . - RUN Rscript install_devtools.R COPY workers/R_dependencies/affymetrix/dependencies.R . RUN Rscript dependencies.R COPY workers/affymetrix_dependencies.R . -COPY workers/install_ensg_pkgs.R . - RUN Rscript affymetrix_dependencies.R -# Source: https://github.com/thisbejim/Pyrebase/issues/87#issuecomment-354452082 -# For whatever reason this worked and 'en_US.UTF-8' did not. -ENV LANG C.UTF-8 - RUN pip3 install pip --upgrade +RUN pip3 install setuptools --upgrade -RUN pip3 install setuptools --upgrade && \ - rm -rf /root/.cache - -COPY config/ config/ -COPY .boto .boto - -COPY common/dist/data-refinery-common-* common/ +# Install this one here instead of via requirements.txt because not +# all processors need it. +RUN pip3 install rpy2==3.4.5 +COPY workers/data_refinery_workers/processors/requirements.txt . +RUN pip3 install -r requirements.txt # Get the latest version from the dist directory. +COPY common/dist/data-refinery-common-* common/ RUN pip3 install common/$(ls common -1 | sort --version-sort | tail -1) -COPY workers/data_refinery_workers/processors/requirements.txt . - -RUN pip3 install -r requirements.txt - -# Install this one here instead of via requirements.txt because not -# all processors need it. -RUN pip3 install rpy2==3.4.5 +# Clear out the pip3 cache. +RUN rm -rf /root/.cache ARG SYSTEM_VERSION @@ -89,6 +83,9 @@ ENV SYSTEM_VERSION $SYSTEM_VERSION USER user +COPY .boto .boto +COPY config/ config/ COPY workers/ . +COPY workers/install_ensg_pkgs.R . ENTRYPOINT [] diff --git a/workers/dockerfiles/Dockerfile.affymetrix_local b/workers/dockerfiles/Dockerfile.affymetrix_local index 84309b4e9..9a37692e6 100644 --- a/workers/dockerfiles/Dockerfile.affymetrix_local +++ b/workers/dockerfiles/Dockerfile.affymetrix_local @@ -6,9 +6,8 @@ USER root RUN rm -r common/ RUN pip3 uninstall -y data_refinery_common -# Reinstall common. -COPY common/dist/data-refinery-common-* common/ # Get the latest version from the dist directory. 
+COPY common/dist/data-refinery-common-* common/ RUN pip3 install common/$(ls common -1 | sort --version-sort | tail -1) ARG SYSTEM_VERSION diff --git a/workers/dockerfiles/Dockerfile.compendia b/workers/dockerfiles/Dockerfile.compendia index 3a6df3f5f..2c6a38784 100644 --- a/workers/dockerfiles/Dockerfile.compendia +++ b/workers/dockerfiles/Dockerfile.compendia @@ -1,55 +1,67 @@ -FROM nvidia/cuda:11.1-runtime-ubuntu18.04 +FROM nvidia/cuda:11.8.0-runtime-ubuntu18.04 # This is very similar to the `smasher` image, but comes with OpenBLAS and some # of the other libraries required for fancyimpute. -RUN apt-get update -qq -RUN apt-get install -y software-properties-common -RUN add-apt-repository ppa:apt-fast/stable -RUN apt-get update -qq +# Prevent tzdata from prompting us for a timezone and hanging the build. +ENV DEBIAN_FRONTEND=noninteractive + +# Source: https://github.com/thisbejim/Pyrebase/issues/87#issuecomment-354452082 +# For whatever reason this worked and 'en_US.UTF-8' did not. +ENV LANG C.UTF-8 + +# RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub # via https://github.com/ilikenwf/apt-fast/issues/85#issuecomment-261640099 RUN echo debconf apt-fast/maxdownloads string 16 | debconf-set-selections RUN echo debconf apt-fast/dlflag boolean true | debconf-set-selections RUN echo debconf apt-fast/aptmanager string apt-get | debconf-set-selections -RUN _APTMGR=apt-get apt-get install -y apt-fast - -RUN export DEBIAN_FRONTEND=noninteractive; \ - export DEBCONF_NONINTERACTIVE_SEEN=true; \ - echo 'tzdata tzdata/Areas select Etc' | debconf-set-selections; \ - echo 'tzdata tzdata/Zones/Etc select UTC' | debconf-set-selections; \ - apt-get update -qqy \ - && apt-get install -qqy --no-install-recommends \ - tzdata \ - && apt-get clean \ - && rm -rf /var/lib/apt/lists/* +RUN echo 'tzdata tzdata/Areas select Etc' | debconf-set-selections +RUN echo 'tzdata tzdata/Zones/Etc select UTC' | debconf-set-selections + +RUN apt-get update -qq +RUN apt-get install -y software-properties-common +RUN add-apt-repository ppa:apt-fast/stable +RUN add-apt-repository ppa:deadsnakes/ppa +RUN add-apt-repository ppa:savoury1/llvm-defaults-10 + +RUN apt-get update -qq +RUN apt-get install -y apt-fast apt-transport-https tzdata COPY workers/CRAN.gpg . 
-RUN \ - apt-fast update -qq && \ - apt-fast install -y lsb-release && \ - echo "deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/" \ - >> /etc/apt/sources.list.d/added_repos.list && \ - apt-key add CRAN.gpg && \ - apt-fast update -qq && \ - apt-fast install -y \ +RUN apt-key add CRAN.gpg +RUN echo "deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/" \ + >> /etc/apt/sources.list.d/added_repos.list + +RUN apt-fast update -qq && apt-fast install -y \ + build-essential \ + cmake \ + curl \ + cython3 \ ed \ + gfortran \ git \ - liblapack-dev \ - libopenblas-dev \ - python3 \ - python3-pip \ + libcairo-dev \ libcurl4-openssl-dev \ + libedit-dev \ + libblas-dev \ + liblapack-dev \ libpq-dev \ - r-base-core \ - r-base-dev \ - libpq-dev \ - libxml2-dev \ libssl-dev \ - libcurl4-openssl-dev \ - curl \ - wget && \ - rm -rf /var/lib/apt/lists/* + libxml2-dev \ + llvm-10-dev \ + lsb-release \ + mercurial \ + pkg-config \ + python3-pip \ + python3.8 \ + python3.8-dev \ + r-base-core \ + wget + RUN rm CRAN.gpg +RUN apt-get clean; rm -rf /var/lib/apt/lists/* +RUN ln -s /usr/bin/llvm-config-10 /usr/bin/llvm-config +RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1 RUN groupadd user && useradd --create-home --home-dir /home/user -g user user WORKDIR /home/user @@ -62,7 +74,6 @@ RUN ln -s /usr/local/share/phantomjs-2.1.1-linux-x86_64/bin/phantomjs /usr/local ENV R_LIBS "/usr/local/lib/R/site-library" COPY common/install_devtools.R . - RUN Rscript install_devtools.R COPY workers/R_dependencies/qn/dependencies.R . @@ -72,30 +83,21 @@ COPY workers/qn_dependencies.R . RUN Rscript qn_dependencies.R # End QN-specific -# Source: https://github.com/thisbejim/Pyrebase/issues/87#issuecomment-354452082 -# For whatever reason this worked and 'en_US.UTF-8' did not. -ENV LANG C.UTF-8 - RUN pip3 install --upgrade pip - # Smasher-specific requirements -RUN pip3 install numpy scipy matplotlib pandas==0.25.3 scikit-learn sympy nose rpy2===3.4.5 tzlocal fancySVD +RUN pip3 install --ignore-installed numpy scipy matplotlib pandas==0.25.3 scikit-learn sympy nose rpy2===3.4.5 tzlocal fancySVD # End smasher-specific -COPY config/ config/ -COPY .boto .boto - COPY workers/data_refinery_workers/processors/requirements.txt . +RUN pip3 install --ignore-installed -r requirements.txt -RUN pip3 --no-cache-dir install -r requirements.txt -RUN pip3 install numpy==1.16.0 # Fix a downgrade - -COPY common/dist/data-refinery-common-* common/ +RUN pip3 install --ignore-installed numpy==1.16.0 # Fix a downgrade # Get the latest version from the dist directory. +COPY common/dist/data-refinery-common-* common/ RUN pip3 install common/$(ls common -1 | sort --version-sort | tail -1) -# Clear our the pip3 cache +# Clear out the pip3 cache. RUN rm -rf /root/.cache ARG SYSTEM_VERSION @@ -104,6 +106,8 @@ ENV SYSTEM_VERSION $SYSTEM_VERSION USER user +COPY .boto .boto +COPY config/ config/ COPY workers/ . ENTRYPOINT [] diff --git a/workers/dockerfiles/Dockerfile.downloaders b/workers/dockerfiles/Dockerfile.downloaders index f2fc9e78a..1b3337325 100644 --- a/workers/dockerfiles/Dockerfile.downloaders +++ b/workers/dockerfiles/Dockerfile.downloaders @@ -1,59 +1,64 @@ FROM ubuntu:18.04 -RUN apt-get update -qq +# Prevent tzdata from prompting us for a timezone and hanging the build. +ENV DEBIAN_FRONTEND=noninteractive + +# Source: https://github.com/thisbejim/Pyrebase/issues/87#issuecomment-354452082 +# For whatever reason this worked and 'en_US.UTF-8' did not. 
+ENV LANG C.UTF-8 + +RUN apt-get update RUN apt-get install -y software-properties-common RUN add-apt-repository ppa:apt-fast/stable -RUN apt-get update -qq -RUN apt-get -y install apt-fast +RUN add-apt-repository ppa:deadsnakes/ppa +RUN add-apt-repository ppa:savoury1/llvm-defaults-10 -# Prevent tzdata from prompting us for a timezone and hanging the build. -ENV DEBIAN_FRONTEND=noninteractive +RUN apt-get update -qq +RUN apt-get install -y apt-fast apt-transport-https # The packages related to R are somewhat weird, see the README for more details. - COPY workers/CRAN.gpg . -RUN \ - apt-fast update -qq && \ - apt-get install -y apt-transport-https && \ - apt-fast install -y lsb-release && \ - echo "deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/" \ - >> /etc/apt/sources.list.d/added_repos.list && \ - apt-key add CRAN.gpg && \ - apt-fast update -qq && \ - apt-fast install -y \ +RUN apt-key add CRAN.gpg +RUN echo "deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/" \ + >> /etc/apt/sources.list.d/added_repos.list + +RUN apt-fast update -qq && apt-fast install -y \ + build-essential \ + cmake \ + curl \ + cython3 \ ed \ git \ - python3 \ - python3-pip \ - r-base-core \ - r-base-dev \ - libpq-dev \ - libxml2-dev \ - libssl-dev \ + libcairo-dev \ libcurl4-openssl-dev \ + libedit-dev \ libpq-dev \ - curl \ - wget && \ - rm -rf /var/lib/apt/lists/* + libssl-dev \ + libxml2-dev \ + llvm-10-dev \ + lsb-release \ + mercurial \ + pkg-config \ + python3-pip \ + python3.8 \ + python3.8-dev \ + r-base-core \ + wget + RUN rm CRAN.gpg +RUN apt-get clean; rm -rf /var/lib/apt/lists/* +RUN ln -s /usr/bin/llvm-config-10 /usr/bin/llvm-config +RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1 RUN groupadd user && useradd --create-home --home-dir /home/user -g user user WORKDIR /home/user -# Source: https://github.com/thisbejim/Pyrebase/issues/87#issuecomment-354452082 -# For whatever reason this worked and 'en_US.UTF-8' did not. -ENV LANG C.UTF-8 - -RUN pip3 install --upgrade pip - ENV R_LIBS "/usr/local/lib/R/site-library" COPY common/install_devtools.R . - RUN Rscript install_devtools.R COPY workers/install_downloader_R_only.R . - RUN Rscript install_downloader_R_only.R # Aspera will only install as the current user. @@ -71,22 +76,19 @@ RUN rm aspera-cli-3.9.1-0.tar.bz2 # Now that we're done installing Aspera go back to being root for a bit. USER root -COPY config config -COPY .boto .boto - -COPY workers/data_refinery_workers/downloaders/requirements.txt . - -RUN pip3 install -r requirements.txt - +RUN pip3 install --upgrade pip # Install this rpy2 here instead of via requirements.txt because # pip-compile throws an error for it. RUN pip3 install rpy2==3.4.5 -COPY common/dist/data-refinery-common-* common/ + +COPY workers/data_refinery_workers/downloaders/requirements.txt . +RUN pip3 install --ignore-installed -r requirements.txt # Get the latest version from the dist directory. +COPY common/dist/data-refinery-common-* common/ RUN pip3 install common/$(ls common -1 | sort --version-sort | tail -1) -# Clear our the pip3 cache +# Clear out the pip3 cache. RUN rm -rf /root/.cache ARG SYSTEM_VERSION @@ -95,6 +97,8 @@ ENV SYSTEM_VERSION $SYSTEM_VERSION USER user +COPY .boto .boto +COPY config config COPY workers/ . 
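# Why --version-sort is used in the common/ install step above (the filenames
# here are hypothetical): with data-refinery-common-1.9.0.tar.gz and
# data-refinery-common-1.10.0.tar.gz both present, a plain lexicographic sort
# would rank 1.10.0 before 1.9.0, whereas `ls common -1 | sort --version-sort |
# tail -1` compares the numeric components and correctly picks 1.10.0, i.e. the
# most recently built package.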
ENTRYPOINT [] diff --git a/workers/dockerfiles/Dockerfile.illumina b/workers/dockerfiles/Dockerfile.illumina index bc2294425..e4cc70268 100644 --- a/workers/dockerfiles/Dockerfile.illumina +++ b/workers/dockerfiles/Dockerfile.illumina @@ -1,46 +1,57 @@ FROM ubuntu:18.04 -RUN apt-get update -qq +# Prevent tzdata from prompting us for a timezone and hanging the build. +ENV DEBIAN_FRONTEND=noninteractive + +# Source: https://github.com/thisbejim/Pyrebase/issues/87#issuecomment-354452082 +# For whatever reason this worked and 'en_US.UTF-8' did not. +ENV LANG C.UTF-8 + +RUN apt-get update RUN apt-get install -y software-properties-common RUN add-apt-repository ppa:apt-fast/stable +RUN add-apt-repository ppa:deadsnakes/ppa +RUN add-apt-repository ppa:savoury1/llvm-defaults-10 + RUN apt-get update -qq -RUN apt-get -y install apt-fast +RUN apt-get install -y apt-fast apt-transport-https # The packages related to R are somewhat weird, see the README for more details. - COPY workers/CRAN.gpg . -RUN \ - apt-fast update -qq && \ - apt-get install -y apt-transport-https && \ - apt-fast install -y lsb-release && \ - echo "deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/" \ - >> /etc/apt/sources.list.d/added_repos.list && \ - apt-key add CRAN.gpg && \ - apt-fast update -qq +RUN apt-key add CRAN.gpg +RUN echo "deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/" \ + >> /etc/apt/sources.list.d/added_repos.list -# Prevent tzdata from prompting us for a timezone and hanging the build. -ENV DEBIAN_FRONTEND=noninteractive - -RUN apt-fast install -y \ +RUN apt-fast update -qq && apt-fast install -y \ + build-essential \ + cmake \ + curl \ + cython3 \ ed \ + gfortran \ git \ - mercurial \ libcairo-dev \ + libcurl4-openssl-dev \ libedit-dev \ + libblas-dev \ + liblapack-dev \ + libpq-dev \ + libssl-dev \ + libxml2-dev \ + llvm-10-dev \ lsb-release \ - python3 \ + mercurial \ + pkg-config \ python3-pip \ + python3.8 \ + python3.8-dev \ r-base-core \ - r-base-dev \ - libpq-dev \ - libxml2-dev \ - libssl-dev \ - libcurl4-openssl-dev \ - curl \ - wget && \ - rm -rf /var/lib/apt/lists/* + wget RUN rm CRAN.gpg +RUN apt-get clean; rm -rf /var/lib/apt/lists/* +RUN ln -s /usr/bin/llvm-config-10 /usr/bin/llvm-config +RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1 RUN groupadd user && useradd --create-home --home-dir /home/user -g user user WORKDIR /home/user @@ -48,37 +59,24 @@ WORKDIR /home/user ENV R_LIBS "/usr/local/lib/R/site-library" COPY common/install_devtools.R . - RUN Rscript install_devtools.R COPY workers/R_dependencies/illumina/dependencies.R . RUN Rscript dependencies.R -# These are for Illumina +# These are for Illumina. COPY workers/illumina_dependencies.R . RUN Rscript illumina_dependencies.R -# Source: https://github.com/thisbejim/Pyrebase/issues/87#issuecomment-354452082 -# For whatever reason this worked and 'en_US.UTF-8' did not. -ENV LANG C.UTF-8 - RUN pip3 install --upgrade pip - -COPY config/ config/ -COPY .boto .boto - -COPY workers/illumina_probe_maps/ probe_maps/ - COPY workers/data_refinery_workers/processors/requirements.txt . - -RUN pip3 install -r requirements.txt - -COPY common/dist/data-refinery-common-* common/ +RUN pip3 install --ignore-installed -r requirements.txt # Get the latest version from the dist directory. +COPY common/dist/data-refinery-common-* common/ RUN pip3 install common/$(ls common -1 | sort --version-sort | tail -1) -# Clear our the pip3 cache +# Clear out the pip3 cache. 
RUN rm -rf /root/.cache ARG SYSTEM_VERSION @@ -87,7 +85,10 @@ ENV SYSTEM_VERSION $SYSTEM_VERSION USER user -COPY workers/data_refinery_workers/processors/detect_database.R . +COPY .boto .boto +COPY config/ config/ COPY workers/ . +COPY workers/data_refinery_workers/processors/detect_database.R . +COPY workers/illumina_probe_maps/ probe_maps/ ENTRYPOINT [] diff --git a/workers/dockerfiles/Dockerfile.no_op b/workers/dockerfiles/Dockerfile.no_op index ee59a63a5..98f35d772 100644 --- a/workers/dockerfiles/Dockerfile.no_op +++ b/workers/dockerfiles/Dockerfile.no_op @@ -1,51 +1,56 @@ FROM ubuntu:18.04 +# Prevent tzdata from prompting us for a timezone and hanging the build. +ENV DEBIAN_FRONTEND=noninteractive + +# Source: https://github.com/thisbejim/Pyrebase/issues/87#issuecomment-354452082 +# For whatever reason this worked and 'en_US.UTF-8' did not. +ENV LANG C.UTF-8 + RUN apt-get update -qq RUN apt-get install -y software-properties-common + RUN add-apt-repository ppa:apt-fast/stable # deadsnakes packages new python versions for older Ubuntu releases RUN add-apt-repository ppa:deadsnakes/ppa + RUN apt-get update -qq RUN apt-get -y install apt-fast -# Prevent tzdata from prompting us for a timezone and hanging the build. -ENV DEBIAN_FRONTEND=noninteractive - # The packages related to R are somewhat weird, see the README for more details. - COPY workers/CRAN.gpg . -RUN \ - apt-fast update -qq && \ - apt-get install -y apt-transport-https && \ - apt-fast install -y lsb-release && \ - echo "deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/" \ - >> /etc/apt/sources.list.d/added_repos.list && \ - apt-key add CRAN.gpg && \ - apt-fast update -qq && \ - apt-fast install -y \ +RUN apt-key add CRAN.gpg +RUN echo "deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/" \ + >> /etc/apt/sources.list.d/added_repos.list + +RUN apt-fast update -qq && apt-fast install -y \ + build-essential \ + cmake \ + curl \ + cython3 \ ed \ git \ - mercurial \ libcairo-dev \ + libcurl4-openssl-dev \ libedit-dev \ + libfreetype6-dev \ + libpq-dev \ + libssl-dev \ + libxml2-dev \ + llvm-10-dev \ lsb-release \ - python3.6 \ - python3.6-dev \ + mercurial \ + pkg-config \ python3-pip \ + python3.8 \ + python3.8-dev \ r-base-core \ - r-base-dev \ - libpq-dev \ - libxml2-dev \ - libssl-dev \ - libcurl4-openssl-dev \ - curl \ - unzip \ - wget && \ - rm -rf /var/lib/apt/lists/* -RUN rm CRAN.gpg + wget -# Set the system python version to python3.6 from deadsnakes -RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.6 1 +RUN rm CRAN.gpg +RUN apt-get clean; rm -rf /var/lib/apt/lists/* +RUN ln -s /usr/bin/llvm-config-10 /usr/bin/llvm-config +RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1 RUN groupadd user && useradd --create-home --home-dir /home/user -g user user WORKDIR /home/user @@ -54,7 +59,6 @@ WORKDIR /home/user ENV R_LIBS "/usr/local/lib/R/site-library" COPY common/install_devtools.R . - RUN Rscript install_devtools.R COPY workers/R_dependencies/no_op/dependencies.R . @@ -62,6 +66,7 @@ RUN Rscript dependencies.R COPY workers/install_gene_convert.R . RUN Rscript install_gene_convert.R + RUN mkdir -p gene_indexes WORKDIR /home/user/gene_indexes ENV ID_REFINERY_URL https://zenodo.org/record/1410647/files/all_1536267482.zip @@ -72,23 +77,17 @@ RUN rm *.zip WORKDIR /home/user # End Noop-specific -COPY workers/data_refinery_workers/processors/requirements.txt . 
- -# Source: https://github.com/thisbejim/Pyrebase/issues/87#issuecomment-354452082 -# For whatever reason this worked and 'en_US.UTF-8' did not. -ENV LANG C.UTF-8 - RUN pip3 install --upgrade pip -RUN pip3 install -r requirements.txt +RUN pip3 install numpy -COPY config/ config/ -COPY .boto .boto -COPY common/dist/data-refinery-common-* common/ +COPY workers/data_refinery_workers/processors/requirements.txt . +RUN pip3 install --ignore-installed -r requirements.txt # Get the latest version from the dist directory. +COPY common/dist/data-refinery-common-* common/ RUN pip3 install common/$(ls common -1 | sort --version-sort | tail -1) -# Clear our the pip3 cache +# Clear out the pip3 cache. RUN rm -rf /root/.cache ARG SYSTEM_VERSION @@ -97,6 +96,8 @@ ENV SYSTEM_VERSION $SYSTEM_VERSION USER user +COPY .boto .boto +COPY config/ config/ COPY workers/ . ENTRYPOINT [] diff --git a/workers/dockerfiles/Dockerfile.salmon b/workers/dockerfiles/Dockerfile.salmon index 31922880e..8bfbbe9a7 100644 --- a/workers/dockerfiles/Dockerfile.salmon +++ b/workers/dockerfiles/Dockerfile.salmon @@ -1,44 +1,54 @@ FROM ubuntu:18.04 -RUN apt-get update -qq +# Prevent tzdata from prompting us for a timezone and hanging the build. +ENV DEBIAN_FRONTEND=noninteractive + +# Source: https://github.com/thisbejim/Pyrebase/issues/87#issuecomment-354452082 +# For whatever reason this worked and 'en_US.UTF-8' did not. +ENV LANG C.UTF-8 + +RUN apt-get update RUN apt-get install -y software-properties-common RUN add-apt-repository ppa:apt-fast/stable -RUN apt-get update -qq -RUN apt-get -y install apt-fast +RUN add-apt-repository ppa:deadsnakes/ppa +RUN add-apt-repository ppa:savoury1/llvm-defaults-10 -# Prevent tzdata from prompting us for a timezone and hanging the build. -ENV DEBIAN_FRONTEND=noninteractive +RUN apt-get update -qq +RUN apt-get install -y apt-fast apt-transport-https # The packages related to R are somewhat weird, see the README for more details. - COPY workers/CRAN.gpg . +RUN apt-key add CRAN.gpg +RUN echo "deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/" \ + >> /etc/apt/sources.list.d/added_repos.list -RUN \ - apt-fast update -qq && \ - apt-get install -y apt-transport-https && \ - apt-fast install -y lsb-release && \ - echo "deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/" \ - >> /etc/apt/sources.list.d/added_repos.list && \ - apt-key add CRAN.gpg && \ - apt-fast update -qq && \ - apt-fast install -y \ +RUN apt-fast update -qq && apt-fast install -y \ + build-essential \ + cmake \ + curl \ + cython3 \ ed \ git \ - mercurial \ libcairo-dev \ + libcurl4-openssl-dev \ libedit-dev \ + libpq-dev \ + libssl-dev \ + libxml2-dev \ + llvm-10-dev \ lsb-release \ - python3 \ + mercurial \ + pkg-config \ python3-pip \ - libxml2-dev \ - cmake \ + python3.8 \ + python3.8-dev \ r-base-core \ - libssl-dev \ - libcurl4-openssl-dev \ - curl \ - wget && \ - rm -rf /var/lib/apt/lists/* + wget + RUN rm CRAN.gpg +RUN apt-get upgrade; apt-get clean; rm -rf /var/lib/apt/lists/* +RUN ln -s /usr/bin/llvm-config-10 /usr/bin/llvm-config +RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1 RUN groupadd user && useradd --create-home --home-dir /home/user -g user user WORKDIR /home/user @@ -60,44 +70,40 @@ RUN ln -sf `pwd`/Salmon-${SALMON_VERSION}_linux_x86_64/bin/salmon /usr/local/bin RUN rm -f Salmon-${SALMON_VERSION}_linux_x86_64.tar.gz # End Salmon installation. -# Install R dependencies +# Install R dependencies. COPY common/install_devtools.R . 
RUN Rscript install_devtools.R + COPY workers/R_dependencies/tximport/dependencies.R tximport_dependencies.R RUN Rscript tximport_dependencies.R -# Install tximport +# Install tximport. COPY workers/install_tximport.R . RUN Rscript install_tximport.R -# Install SalmonTools +RUN pip3 install --upgrade pip +RUN pip3 install numpy + +COPY workers/data_refinery_workers/processors/requirements.txt . +RUN pip3 install --ignore-installed -r requirements.txt + +# Get the latest version from the dist directory. +COPY common/dist/data-refinery-common-* common/ +RUN pip3 install common/$(ls common -1 | sort --version-sort | tail -1) + +# Install SalmonTools. RUN git clone https://github.com/COMBINE-lab/SalmonTools.git && cd SalmonTools && git checkout 3e6654c2c10a5225498b623056993947fa688afc RUN cd SalmonTools && cmake . -DCMAKE_INSTALL_PREFIX=/usr/local && make install RUN rm -rf SalmonTools -# Install sra-tools +# Install sra-tools. ENV SRA_VERSION 2.9.1 -RUN wget "http://ftp-trace.ncbi.nlm.nih.gov/sra/sdk/${SRA_VERSION}/sratoolkit.${SRA_VERSION}-ubuntu64.tar.gz" && \ +RUN wget "https://ftp.ncbi.nlm.nih.gov/sra/sdk/${SRA_VERSION}/sratoolkit.${SRA_VERSION}-ubuntu64.tar.gz" && \ tar zxfv sratoolkit.${SRA_VERSION}-ubuntu64.tar.gz && \ cp -r sratoolkit.${SRA_VERSION}-ubuntu64/bin/* /usr/bin -# Source: https://github.com/thisbejim/Pyrebase/issues/87#issuecomment-354452082 -# For whatever reason this worked and 'en_US.UTF-8' did not. -ENV LANG C.UTF-8 - -RUN pip3 install --upgrade pip - -COPY config/ config/ -COPY .boto .boto - -COPY workers/data_refinery_workers/processors/requirements.txt . - -RUN pip3 install -r requirements.txt - -COPY common/dist/data-refinery-common-* common/ - -# Get the latest version from the dist directory. -RUN pip3 install common/$(ls common -1 | sort --version-sort | tail -1) +# Clear out the pip3 cache. +RUN rm -rf /root/.cache ARG SYSTEM_VERSION @@ -105,6 +111,8 @@ ENV SYSTEM_VERSION $SYSTEM_VERSION USER user +COPY .boto .boto +COPY config/ config/ COPY workers/ . ENTRYPOINT [] diff --git a/workers/dockerfiles/Dockerfile.smasher b/workers/dockerfiles/Dockerfile.smasher index 7b95644db..313ba2150 100644 --- a/workers/dockerfiles/Dockerfile.smasher +++ b/workers/dockerfiles/Dockerfile.smasher @@ -1,42 +1,57 @@ FROM ubuntu:18.04 -RUN apt-get update -qq +# Prevent tzdata from prompting us for a timezone and hanging the build. +ENV DEBIAN_FRONTEND=noninteractive + +# Source: https://github.com/thisbejim/Pyrebase/issues/87#issuecomment-354452082 +# For whatever reason this worked and 'en_US.UTF-8' did not. +ENV LANG C.UTF-8 + +RUN apt-get update RUN apt-get install -y software-properties-common RUN add-apt-repository ppa:apt-fast/stable -RUN apt-get update -qq -RUN apt-get -y install apt-fast +RUN add-apt-repository ppa:deadsnakes/ppa +RUN add-apt-repository ppa:savoury1/llvm-defaults-10 -# Prevent tzdata from prompting us for a timezone and hanging the build. -ENV DEBIAN_FRONTEND=noninteractive +RUN apt-get update -qq +RUN apt-get install -y apt-fast apt-transport-https # The packages related to R are somewhat weird, see the README for more details. - COPY workers/CRAN.gpg . 
-RUN \ - apt-fast update -qq && \ - apt-get install -y apt-transport-https && \ - apt-fast install -y lsb-release && \ - echo "deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/" \ - >> /etc/apt/sources.list.d/added_repos.list && \ - apt-key add CRAN.gpg && \ - apt-fast update -qq && \ - apt-fast install -y \ +RUN apt-key add CRAN.gpg +RUN echo "deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/" \ + >> /etc/apt/sources.list.d/added_repos.list + +RUN apt-fast update -qq && apt-fast install -y \ + build-essential \ + cmake \ + curl \ + cython3 \ ed \ + gfortran \ git \ - python3 \ - python3-pip \ + libcairo-dev \ libcurl4-openssl-dev \ + libedit-dev \ + libblas-dev \ + liblapack-dev \ libpq-dev \ - r-base-core \ - r-base-dev \ - libpq-dev \ - libxml2-dev \ libssl-dev \ - libcurl4-openssl-dev \ - curl \ - wget && \ - rm -rf /var/lib/apt/lists/* + libxml2-dev \ + llvm-10-dev \ + lsb-release \ + mercurial \ + pkg-config \ + python3-pip \ + python3.8 \ + python3.8-dev \ + r-base-core \ + wget + RUN rm CRAN.gpg +RUN apt-get clean; rm -rf /var/lib/apt/lists/* +RUN ln -s /usr/bin/llvm-config-10 /usr/bin/llvm-config +RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1 RUN groupadd user && useradd --create-home --home-dir /home/user -g user user WORKDIR /home/user @@ -45,7 +60,6 @@ WORKDIR /home/user ENV R_LIBS "/usr/local/lib/R/site-library" COPY common/install_devtools.R . - RUN Rscript install_devtools.R COPY workers/R_dependencies/qn/dependencies.R . @@ -55,29 +69,19 @@ COPY workers/qn_dependencies.R . RUN Rscript qn_dependencies.R # End QN-specific -# Source: https://github.com/thisbejim/Pyrebase/issues/87#issuecomment-354452082 -# For whatever reason this worked and 'en_US.UTF-8' did not. -ENV LANG C.UTF-8 - RUN pip3 install --upgrade pip - # Smasher-specific requirements -RUN pip3 install numpy scipy matplotlib pandas==0.25.3 scikit-learn sympy nose rpy2==3.4.5 tzlocal +RUN pip3 install --ignore-installed nose numpy rpy2==3.4.5 # End smasher-specific -COPY config/ config/ -COPY .boto .boto - COPY workers/data_refinery_workers/processors/requirements.txt . - -RUN pip3 install -r requirements.txt - -COPY common/dist/data-refinery-common-* common/ +RUN pip3 install --ignore-installed -r requirements.txt # Get the latest version from the dist directory. +COPY common/dist/data-refinery-common-* common/ RUN pip3 install common/$(ls common -1 | sort --version-sort | tail -1) -# Clear our the pip3 cache +# Clear out the pip3 cache. RUN rm -rf /root/.cache ARG SYSTEM_VERSION @@ -86,6 +90,8 @@ ENV SYSTEM_VERSION $SYSTEM_VERSION USER user +COPY .boto .boto +COPY config/ config/ COPY workers/ . ENTRYPOINT [] diff --git a/workers/dockerfiles/Dockerfile.transcriptome b/workers/dockerfiles/Dockerfile.transcriptome index 5cb65a6b2..d1ac0ea63 100644 --- a/workers/dockerfiles/Dockerfile.transcriptome +++ b/workers/dockerfiles/Dockerfile.transcriptome @@ -1,36 +1,49 @@ FROM ubuntu:18.04 +# Prevent tzdata from prompting us for a timezone and hanging the build. +ENV DEBIAN_FRONTEND=noninteractive + +# Source: https://github.com/thisbejim/Pyrebase/issues/87#issuecomment-354452082 +# For whatever reason this worked and 'en_US.UTF-8' did not. 
+ENV LANG C.UTF-8 + RUN apt-get update -qq RUN apt-get install -y software-properties-common RUN add-apt-repository ppa:apt-fast/stable +RUN add-apt-repository ppa:deadsnakes/ppa +RUN add-apt-repository ppa:savoury1/llvm-defaults-10 + RUN apt-get update -qq RUN apt-get -y install apt-fast -# Prevent tzdata from prompting us for a timezone and hanging the build. -ENV DEBIAN_FRONTEND=noninteractive - -RUN \ - apt-fast update -qq && \ - apt-fast install -y \ +RUN apt-fast update -qq && apt-fast install -y \ + build-essential \ + curl \ + cython3 \ ed \ git \ - python3 \ - python3-pip \ libcurl4-openssl-dev \ + libfreetype6-dev \ libpq-dev \ - zlib1g-dev \ - curl \ - wget && \ - rm -rf /var/lib/apt/lists/* + llvm-10-dev \ + pkg-config \ + python3-pip \ + python3.8 \ + python3.8-dev \ + wget \ + zlib1g-dev + +RUN rm CRAN.gpg +RUN apt-get clean; rm -rf /var/lib/apt/lists/* +RUN ln -s /usr/bin/llvm-config-10 /usr/bin/llvm-config +RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1 RUN groupadd user && useradd --create-home --home-dir /home/user -g user user WORKDIR /home/user # It's annoying that this can only be installed via git. RUN git clone https://github.com/deweylab/RSEM.git - RUN cd RSEM && make install - RUN rm -rf RSEM # Install Salmon @@ -48,36 +61,26 @@ ENV SALMON_VERSION 0.13.1 # ENV SALMON_VERSION 0.10.2 RUN wget https://github.com/COMBINE-lab/salmon/releases/download/v${SALMON_VERSION}/Salmon-${SALMON_VERSION}_linux_x86_64.tar.gz - RUN tar -xzf Salmon-${SALMON_VERSION}_linux_x86_64.tar.gz # Salmon can extract to a different directory than the name of the tar file. RUN cp `tar -tzf Salmon-${SALMON_VERSION}_linux_x86_64.tar.gz | head -1 | cut -f1 -d"/"`/bin/salmon /usr/local/bin - RUN cp `tar -tzf Salmon-${SALMON_VERSION}_linux_x86_64.tar.gz | head -1 | cut -f1 -d"/"`/lib/* /usr/local/lib RUN rm -r Salmon* # End Salmon installation. -# Source: https://github.com/thisbejim/Pyrebase/issues/87#issuecomment-354452082 -# For whatever reason this worked and 'en_US.UTF-8' did not. -ENV LANG C.UTF-8 - RUN pip3 install --upgrade pip - -COPY config/ config/ -COPY .boto .boto +RUN pip3 install numpy COPY workers/data_refinery_workers/processors/requirements.txt . - -RUN pip3 install -r requirements.txt - -COPY common/dist/data-refinery-common-* common/ +RUN pip3 install --ignore-installed -r requirements.txt # Get the latest version from the dist directory. +COPY common/dist/data-refinery-common-* common/ RUN pip3 install common/$(ls common -1 | sort --version-sort | tail -1) -# Clear our the pip3 cache +# Clear out the pip3 cache. RUN rm -rf /root/.cache ARG SYSTEM_VERSION @@ -86,6 +89,8 @@ ENV SYSTEM_VERSION $SYSTEM_VERSION USER user +COPY .boto .boto +COPY config/ config/ COPY workers/ . ENTRYPOINT [] From 46dfdae0bc77c8a6d5e66eba4844ab192c3bad45 Mon Sep 17 00:00:00 2001 From: Arkadii Yakovets Date: Thu, 1 Sep 2022 11:03:18 -0700 Subject: [PATCH 12/24] Add `Accession` model. Update pre-commit config. 
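A minimal usage sketch for the new model, assuming a configured Django
environment; the accession code, source, and technology values below are
hypothetical:

    from django.utils import timezone

    from data_refinery_common.models import Accession

    # get_or_create keyed on the fields of the unique (code, source, technology)
    # constraint keeps repeated gathering runs idempotent: inserting the same
    # accession twice returns the existing row instead of creating a duplicate.
    accession, created = Accession.objects.get_or_create(
        code="E-MTAB-0000",  # hypothetical accession code
        source="ebi_biostudies",
        technology="microarray",
        defaults={
            "organism": "homo sapiens",
            "published_date": timezone.now(),
            "sample_count": 8,
        },
    )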
--- .pre-commit-config.yaml | 2 +- .../migrations/0071_auto_20220901_1653.py | 44 +++++++++++++++++++ .../data_refinery_common/models/__init__.py | 1 + .../data_refinery_common/models/accession.py | 22 ++++++++++ 4 files changed, 68 insertions(+), 1 deletion(-) create mode 100644 common/data_refinery_common/migrations/0071_auto_20220901_1653.py create mode 100644 common/data_refinery_common/models/accession.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d024704da..b651ce24a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -22,7 +22,7 @@ repos: - id: isort - repo: https://github.com/psf/black - rev: 19.10b0 + rev: 22.3.0 hooks: - id: black args: [--line-length=100] diff --git a/common/data_refinery_common/migrations/0071_auto_20220901_1653.py b/common/data_refinery_common/migrations/0071_auto_20220901_1653.py new file mode 100644 index 000000000..c7d3b0b63 --- /dev/null +++ b/common/data_refinery_common/migrations/0071_auto_20220901_1653.py @@ -0,0 +1,44 @@ +# Generated by Django 3.2.7 on 2022-09-01 16:53 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("data_refinery_common", "0070_auto_20211208_2118"), + ] + + operations = [ + migrations.CreateModel( + name="Accession", + fields=[ + ( + "id", + models.AutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ("code", models.TextField()), + ("created_at", models.DateTimeField(auto_now_add=True)), + ("last_modified_at", models.DateTimeField(auto_now=True)), + ("organism", models.TextField()), + ("published_date", models.DateTimeField()), + ("sample_count", models.PositiveIntegerField(default=0)), + ("source", models.TextField()), + ("technology", models.TextField()), + ], + options={ + "db_table": "accessions", + }, + ), + migrations.AddConstraint( + model_name="accession", + constraint=models.UniqueConstraint( + fields=("code", "source", "technology"), name="unique_accession" + ), + ), + ] diff --git a/common/data_refinery_common/models/__init__.py b/common/data_refinery_common/models/__init__.py index 39abe7ee3..8e9564153 100644 --- a/common/data_refinery_common/models/__init__.py +++ b/common/data_refinery_common/models/__init__.py @@ -1,3 +1,4 @@ +from data_refinery_common.models.accession import Accession # noqa from data_refinery_common.models.api_token import APIToken # noqa from data_refinery_common.models.associations.compendium_result_organism_association import ( # noqa CompendiumResultOrganismAssociation, diff --git a/common/data_refinery_common/models/accession.py b/common/data_refinery_common/models/accession.py new file mode 100644 index 000000000..dc93cfd88 --- /dev/null +++ b/common/data_refinery_common/models/accession.py @@ -0,0 +1,22 @@ +from django.db import models + + +class Accession(models.Model): + """Accession model.""" + + class Meta: + constraints = ( + models.UniqueConstraint( + fields=("code", "source", "technology"), name="unique_accession" + ), + ) + db_table = "accessions" + + code = models.TextField() + created_at = models.DateTimeField(auto_now_add=True) + last_modified_at = models.DateTimeField(auto_now=True) + organism = models.TextField() + published_date = models.DateTimeField() + sample_count = models.PositiveIntegerField(default=0) + source = models.TextField() + technology = models.TextField() From cb9249981fb8081d4bc5cf041598b17f77d83a55 Mon Sep 17 00:00:00 2001 From: Arkadii Yakovets Date: Thu, 8 Sep 2022 18:12:39 -0700 Subject: [PATCH 13/24] 
Port Python script to Django command. - Introduce AccessionBacklogEntry model. - Clean up command flags. - Get previous accessions from the DB. - --- .../migrations/0071_accessionbacklogentry.py | 38 + .../data_refinery_common/models/accession.py | 82 +- .../gatherer/__init__.py | 0 .../gatherer/management/__init__.py | 0 .../gatherer/management/commands/__init__.py | 0 .../management/commands/gather_accessions.py | 731 ++++++++++++++++++ foreman/data_refinery_foreman/settings.py | 17 +- foreman/dockerfiles/Dockerfile.foreman | 2 + 8 files changed, 856 insertions(+), 14 deletions(-) create mode 100644 common/data_refinery_common/migrations/0071_accessionbacklogentry.py create mode 100644 foreman/data_refinery_foreman/gatherer/__init__.py create mode 100644 foreman/data_refinery_foreman/gatherer/management/__init__.py create mode 100644 foreman/data_refinery_foreman/gatherer/management/commands/__init__.py create mode 100644 foreman/data_refinery_foreman/gatherer/management/commands/gather_accessions.py diff --git a/common/data_refinery_common/migrations/0071_accessionbacklogentry.py b/common/data_refinery_common/migrations/0071_accessionbacklogentry.py new file mode 100644 index 000000000..86c04daed --- /dev/null +++ b/common/data_refinery_common/migrations/0071_accessionbacklogentry.py @@ -0,0 +1,38 @@ +# Generated by Django 3.2.7 on 2022-09-07 19:31 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("data_refinery_common", "0070_auto_20211208_2118"), + ] + + operations = [ + migrations.CreateModel( + name="AccessionBacklogEntry", + fields=[ + ( + "id", + models.AutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ("code", models.TextField(unique=True)), + ("created_at", models.DateTimeField(auto_now_add=True)), + ("last_modified_at", models.DateTimeField(auto_now=True)), + ("organism", models.TextField()), + ("published_date", models.DateTimeField()), + ("sample_count", models.PositiveIntegerField(default=0)), + ("source", models.TextField()), + ("technology", models.TextField()), + ], + options={ + "db_table": "accession_backlog", + }, + ), + ] diff --git a/common/data_refinery_common/models/accession.py b/common/data_refinery_common/models/accession.py index dc93cfd88..6ac62da9f 100644 --- a/common/data_refinery_common/models/accession.py +++ b/common/data_refinery_common/models/accession.py @@ -1,18 +1,16 @@ +from datetime import datetime + from django.db import models +from django.utils import timezone -class Accession(models.Model): - """Accession model.""" +class AccessionBacklogEntry(models.Model): + """Accession backlog entry model.""" class Meta: - constraints = ( - models.UniqueConstraint( - fields=("code", "source", "technology"), name="unique_accession" - ), - ) - db_table = "accessions" - - code = models.TextField() + db_table = "accession_backlog" + + code = models.TextField(unique=True) created_at = models.DateTimeField(auto_now_add=True) last_modified_at = models.DateTimeField(auto_now=True) organism = models.TextField() @@ -20,3 +18,67 @@ class Meta: sample_count = models.PositiveIntegerField(default=0) source = models.TextField() technology = models.TextField() + + def __eq__(self, other: object) -> bool: + """Returns True if two objects are equal. 
Otherwise returns False.""" + return isinstance(other, AccessionBacklogEntry) and self.code == other.code + + def __hash__(self) -> int: + """Returns accession object unique hash value.""" + return hash(self.code) + + def __str__(self) -> str: + """Returns accession default string representation.""" + return ", ".join((self.code, self.technology, self.source, str(self.published_date.date()))) + + @staticmethod + def create_from_ma_ae_entry(entry): + """Creates accession object from MicroArray ArrayExpress entry.""" + accession = AccessionBacklogEntry() + accession.code = entry["accession"] + accession.source = "ebi_biostudies" + accession.technology = "microarray" + + if "organism" in entry: + accession.organism = entry["organism"] + if "release_date" in entry: + accession.published_date = timezone.make_aware( + datetime.strptime(entry["release_date"], "%Y-%m-%d") + ) + + return accession + + @staticmethod + def create_from_ma_geo_entry(entry): + """Creates accession object from MicroArray GEO meta DB entry.""" + accession = AccessionBacklogEntry() + accession.code = entry["gse"] + accession.source = "geo_meta_db" + accession.technology = "microarray" + + if "organism" in entry: + accession.organism = entry["organism"].lower() + if "submission_date" in entry: + + accession.published_date = timezone.make_aware( + datetime.strptime(entry["submission_date"], "%Y-%m-%d") + ) + + return accession + + @staticmethod + def create_from_rnaseq_entry(entry): + """Creates accession object from RNA-Seq entry.""" + accession = AccessionBacklogEntry() + accession.code = entry["secondary_study_accession"] + accession.source = "ebi_ena_portal" + accession.technology = "rna-seq" + + if "scientific_name" in entry: + accession.organism = entry["scientific_name"].lower() + if "first_public" in entry: + accession.published_date = timezone.make_aware( + datetime.strptime(entry["first_public"], "%Y-%m-%d") + ) + + return accession diff --git a/foreman/data_refinery_foreman/gatherer/__init__.py b/foreman/data_refinery_foreman/gatherer/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/foreman/data_refinery_foreman/gatherer/management/__init__.py b/foreman/data_refinery_foreman/gatherer/management/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/foreman/data_refinery_foreman/gatherer/management/commands/__init__.py b/foreman/data_refinery_foreman/gatherer/management/commands/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/foreman/data_refinery_foreman/gatherer/management/commands/gather_accessions.py b/foreman/data_refinery_foreman/gatherer/management/commands/gather_accessions.py new file mode 100644 index 000000000..c4808a191 --- /dev/null +++ b/foreman/data_refinery_foreman/gatherer/management/commands/gather_accessions.py @@ -0,0 +1,731 @@ +"""MicroArray (ArrayExpress, GEO) and RNA-Seq accession gathering automation. +Data sources: + - https://www.ebi.ac.uk/biostudies/help (MicroArray ArrayExpress). + - local SQLite meta DB from https://www.bioconductor.org/packages/release/bioc/html/GEOmetadb.html + (MicroArray GEO). + - https://www.ebi.ac.uk/ena/portal/api/ (RNA-Seq). 
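Example invocation (an illustrative sketch; the taxon ID, date, and count values
are arbitrary):
    python3 manage.py gather_accessions --rna-seq --taxon-id 9606 --since 2022-01-01 --count 100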
+""" + +import argparse +import logging +import os +import re +import sqlite3 +from datetime import datetime +from http.client import RemoteDisconnected +from json.decoder import JSONDecodeError +from typing import List, Set +from urllib.parse import quote + +from django.core.management.base import BaseCommand + +import requests +from requests.exceptions import ConnectionError, ConnectTimeout +from retrying import retry +from urllib3.exceptions import ProtocolError + +from data_refinery_common.logging import get_and_configure_logger +from data_refinery_common.models.accession import AccessionBacklogEntry +from data_refinery_common.models.experiment import Experiment + +log = get_and_configure_logger(__name__) + + +class Command(BaseCommand): + """Creates agents and runs actual accession gathering.""" + + RE_ACCESSION = re.compile(r"(\D+)(\d+)") + RE_DATE = re.compile(r"\d{4}-\d{2}-\d{2}") + + # TODO(ark): remove after upgrade to python3.8 where parser argument + # "extend" action is directly available. + # https://docs.python.org/3.8/library/argparse.html#action + class ExtendAction(argparse.Action): + def __call__(self, parser, namespace, values, option_string=None): + items = getattr(namespace, self.dest) or [] + items.extend(values) + setattr(namespace, self.dest, items) + + def add_arguments(self, parser) -> None: + parser.register("action", "extend", Command.ExtendAction) + + parser.add_argument( + "--ae-id", + action="extend", + nargs="+", + type=str, + help="ArrayExpress ID(s) to use for filtering.", + ) + parser.add_argument( + "--ae-ids-file", + type=str, + help="Path to a file containing ArrayExpress ID(s) to use for filtering.", + ) + parser.add_argument("-c", "--count", type=int, help="Number of accessions to collect.") + parser.add_argument( + "-d", + "--dry-run", + action="store_true", + default=False, + help="Do not write the result to the database.", + ) + parser.add_argument( + "-e", + "--exclude-previous", + action="store_true", + default=True, + help="Exclude previously gathered or surveyed accessions.", + ) + parser.add_argument( + "-ne", + "--no-exclude-previous", + action="store_false", + default=False, + dest="exclude_previous", + help="Do not exclude previously gathered or surveyed accessions.", + ) + parser.add_argument( + "--gpl-id", + action="extend", + nargs="+", + type=str, + help="GEO platform ID(s) to use for filtering.", + ) + parser.add_argument( + "--gpl-ids-file", + type=str, + help="Path to a file containing GEO platform ID(s) to use for filtering.", + ) + parser.add_argument( + "-k", + "--keyword", + type=str, + help="Keyword to use for filtering.", + ) + parser.add_argument( + "-m", + "--microarray", + action="store_true", + default=False, + help="Collect MicroArray accessions.", + ) + parser.add_argument( + "-o", "--organism", type=str, help="Organism name to use for filtering." 
+ ) + parser.add_argument( + "-r", + "--rna-seq", + action="store_true", + default=False, + help="Collect RNA-Seq accessions.", + ) + parser.add_argument( + "-s", + "--since", + type=str, + required=True, + help="Collect accessions made public on or after this date.", + ) + parser.add_argument( + "--taxon-id", + action="extend", + nargs="+", + type=int, + help="Taxon ID(s) to use for filtering.", + ) + parser.add_argument( + "--taxon-ids-file", + type=str, + help="Path to a file containing taxon ID(s) to use for filtering.", + ) + parser.add_argument( + "-u", + "--until", + type=str, + help="Collect accessions made public before or on this date.", + ) + parser.add_argument( + "-lv", + "--log-verbose", + action="store_true", + default=False, + help="Enable verbose log output.", + ) + + def set_verbosity_level(self, options) -> None: + """Configures log verbosity level.""" + if options["log_verbose"]: + log.addHandler(logging.StreamHandler()) + log.setLevel(logging.DEBUG) + else: + log.setLevel(logging.ERROR) + + def validate_args(self, options) -> None: + """Validates arguments.""" + if not options["microarray"] and not options["rna_seq"]: + exit("Either --microarray or --rna-seq must be specified.") + + errors = list() + since = options["since"] + until = options["until"] + if not self.RE_DATE.match(since): + errors.append('The -s, --since value must match "YYYY-MM-DD" format.') + if until and not self.RE_DATE.match(until): + errors.append('The -u, --until value must match "YYYY-MM-DD" format.') + if since and until and since > until: + errors.append("The -s, --since date must be earlier than -u, --until date.") + + keyword = options["keyword"] + organism = options["organism"] + if options["microarray"]: + ae_id = options["ae_id"] or options["ae_ids_file"] + gpl_id = options["gpl_id"] or options["gpl_ids_file"] + ids = ae_id or gpl_id + invalid_options_message = ( + "Exactly one of the keyword [-k, --keyword], organism [-o, --organism] or " + "ArrayExpress ID(s) [--ae-id, --ae-ids-file] / GEO platform ID(s) " + "[--gpl-id, --gpl-ids-file] must be specified." + ) + elif options["rna_seq"]: + taxon_id = options["taxon_id"] or options["taxon_ids_file"] + ids = taxon_id + invalid_options_message = ( + "Exactly one of the keyword [-k, --keyword], organism [-o, --organism] " + "or taxon ID(s) [--taxon-id, --taxon-ids-file] must be specified." + ) + + if len([option for option in (ids, keyword, organism) if option]) != 1: + errors.append(invalid_options_message) + + if errors: + exit("\n".join(errors)) + + def handle(self, *args, **options): + """Runs the accession gathering process.""" + self.validate_args(options) + self.set_verbosity_level(options) + + agents = list() + if options["rna_seq"]: + agents.append(RNASeqAccessionAgent(options)) + elif options["microarray"]: + if ( + options["ae_id"] + or options["ae_ids_file"] + or options["keyword"] + or options["organism"] + ): + agents.append(MicroArrayExpressAccessionAgent(options)) + if ( + options["gpl_id"] + or options["gpl_ids_file"] + or options["keyword"] + or options["organism"] + ): + agents.append(MicroArrayGEOAccessionAgent(options)) + + entries = set() + for agent in agents: + entries.update(agent.collect_data()) + + entries = sorted( # Sort the resulting list. + (entry for entry in entries if self.RE_ACCESSION.match(entry.code)), + key=lambda entry: ( + self.RE_ACCESSION.match(entry.code).group(1), + int(self.RE_ACCESSION.match(entry.code).group(2)), + ), + ) + # Limit the number of output entries. 
+ entries = entries[: options["count"]] if options["count"] else entries + + if options["dry_run"]: + if entries: + output = "\n".join((str(entry) for entry in entries)) + else: + output = "No accessions found." + print(output) + else: + AccessionBacklogEntry.objects.bulk_create(entries) + + +class AccessionAgentBase: + "Accession agent base class." + + previous_accessions = set() + retry_params = { + "retry_on_exception": lambda e: isinstance( + e, (ConnectionError, ConnectTimeout, ProtocolError, RemoteDisconnected) + ), + "stop_max_attempt_number": 5, + "wait_exponential_multiplier": 1000, # Seconds. + "wait_exponential_max": 16000, # Seconds. + } + + def __init__(self, options) -> None: + """Populates args and values for major variables.""" + self.options = options + self.count = options["count"] + self.keyword = options["keyword"] + self.organism = options["organism"] + self.since = options["since"] + self.until = options["until"] or datetime.now().strftime("%Y-%m-%d") + + self.populate_previous_accessions() + + def build_query(self): + """Returns query/query dict depending on the accession data source.""" + raise NotImplementedError + + def collect_data(self): + """Generates resulting entry collection.""" + raise NotImplementedError + + def fetch_data(self): + """Fetches data from an external or local data source.""" + raise NotImplementedError + + def get_ids(self): + """Gets IDs for query filtering depending on the accession technology.""" + raise NotImplementedError + + def populate_previous_accessions(self) -> None: + """Populates previous accession set from a provided excluded ids file.""" + if not self.options["exclude_previous"] or self.previous_accessions: + return + + # Gathered accessions. + self.previous_accessions.update( + (entry["code"] for entry in AccessionBacklogEntry.objects.values("code")) + ) + + # Surveyed accessions. + experiments = Experiment.objects.values("accession_code", "alternate_accession_code") + self.previous_accessions.update( + (experiment["accession_code"] for experiment in experiments) + ) + self.previous_accessions.update( + (experiment["alternate_accession_code"] for experiment in experiments) + ) + + +class MicroArrayExpressAccessionAgent(AccessionAgentBase): + """ + MicroArray ArrayExpress accession gathering agent. The data is fetched from + the BioStudies database. See https://www.ebi.ac.uk/biostudies/help and + https://www.ebi.ac.uk/biostudies/arrayexpress/help#programmatic for more + information about the API endpoints. + """ + + def __init__(self, options) -> None: + super().__init__(options) + + self.data_chunk_size = 100 + self.data_url = "https://www.ebi.ac.uk/biostudies/api/v1/search" + self.ids = self.get_ids() + + def build_query(self) -> dict: + """Returns a query dict for getting array/organism specific accessions.""" + query_dict = { + "directsub": "true", + "page": 1, + "pageSize": self.data_chunk_size, + "release_date": f"[{self.since} TO {self.until}]", + "type": "study", + } + + if self.ids: + # TODO(ark): figure out better way of array filtering. 
+ # Also make sure it's equivalent to the array filtering in this query + # https://github.com/AlexsLemonade/accession_retrieval/blob/master/experiment_accession_retrieval.R#L208 + query_dict.update({"content": ", ".join(self.ids)}) + elif self.keyword: + query_dict.update({"content": self.keyword}) + elif self.organism: + query_dict.update({"organism": f'"{self.organism}"'}) + + return query_dict + + def collect_data(self) -> Set[str]: + """Gets new accessions from EBI Biostudies API.""" + accessions = set() + + if self.ids: + message = ( + "Getting MicroArray ArrayExpress entries by " + f"ArrayExpress ID(s): {', '.join(self.ids)} for [{self.since} - {self.until}] " + "range." + ) + elif self.keyword: + message = ( + "Getting MicroArray ArrayExpress entries by " + f'"{self.keyword}" keyword for [{self.since} - {self.until}] range.' + ) + elif self.organism: + message = ( + "Getting MicroArray ArrayExpress entries by " + f'"{self.organism}" organism for [{self.since} - {self.until}] range.' + ) + else: + return accessions + + log.debug(message) + accessions.update(self.fetch_data()) + + return accessions + + def fetch_data(self) -> Set[str]: + """Retrieves accessions from API search endpoint.""" + + @retry(**self.retry_params) + def get_response(url, **kwargs): + """Gets response from an API endpoint.""" + return requests.get(url, **kwargs) + + accessions = set() + + is_done = False + params = self.build_query() + while not is_done: + range_start = (params["page"] - 1) * params["pageSize"] + 1 + range_end = (params["page"] - 1) * params["pageSize"] + self.data_chunk_size + log.debug(f"Processing entries {range_start} - {range_end}") + + response = get_response(self.data_url, params=params) + entries = response.json().get("hits") + if entries: + entries = ( + AccessionBacklogEntry.create_from_ma_ae_entry(entry) for entry in entries + ) + params["page"] += 1 + else: + is_done = True + + if self.previous_accessions: + entries = (entry for entry in entries if entry.code not in self.previous_accessions) + accessions.update(entries) + + # Quit after getting a sufficient amount of accessions. + if self.count and len(accessions) >= self.count: + is_done = True + + return accessions + + def get_ids(self) -> List[str]: + """Returns a combined list of passed ArrayExpress IDs.""" + ids = set() + + if self.options["ae_id"]: + ids.update(self.options["ae_id"]) + + if self.options["ae_ids_file"]: + with open(self.options["ae_ids_file"]) as ae_ids_file: + ids.update((ae_id.strip() for ae_id in ae_ids_file.readlines())) + + return sorted(ids) + + +class MicroArrayGEOAccessionAgent(AccessionAgentBase): + """ + MicroArray GEO accession gathering agent. The data is fetched from a local + SQLite GEO meta database. 
+ """ + + def __init__(self, options) -> None: + super().__init__(options) + + self.db_path = "data/microarray/GEOmetadb.sqlite" + self.ids = self.get_ids() + + def build_query(self) -> str: + """Returns a query for getting GEO accessions from the local SQLite meta DB.""" + tables = [ + f"SELECT *", + "FROM gse_gpl", + "JOIN gpl ON gse_gpl.gpl=gpl.gpl", + "JOIN gse ON gse.gse=gse_gpl.gse", + "GROUP BY gse_gpl.gse", + ] + + conditions = [ + f"HAVING gse.submission_date >= '{self.since}'", + f"gse.submission_date <= '{self.until}'", + ] + + if self.ids: + gpl_ids = (f"'{gpl_id}'" for gpl_id in self.ids) + conditions.append(f"gse_gpl.gpl IN ({', '.join(gpl_ids)})") + elif self.organism: + conditions.append(f"lower(organism)='{self.organism.lower()}'") + + return f"{' '.join(tables)} {' AND '.join(conditions)}" + + def collect_data(self) -> Set[str]: + """Gets new accessions from GEO database.""" + accessions = set() + + if self.ids: + message = ( + "Getting MicroArray GEO entries by GEO platform ID(s): " + f"{', '.join(self.ids)} for [{self.since} - {self.until}] range." + ) + elif self.keyword: + message = ( + f'Getting MicroArray GEO entries by "{self.keyword}" keyword ' + f"for [{self.since} - {self.until}] range." + ) + elif self.organism: + message = ( + f'Getting MicroArray GEO entries by "{self.organism}" organism ' + f"for [{self.since} - {self.until}] range." + ) + else: + return accessions + + log.debug(message) + accessions.update(self.fetch_data()) + + return accessions + + def fetch_data(self) -> Set[str]: + """Retrieves accessions from the GEO meta DB.""" + + def match_keyword(row): + """ + Returns True if `row` matches `self.keyword` based regex. + Otherwise returns False. + """ + return re_keyword.match(" ".join((str(c) for c in row if c))) + + accessions = set() + + if not os.path.exists(self.db_path): + log.error("GEO meta database doesn't exist.") + return accessions + + connection = sqlite3.connect(self.db_path) + connection.row_factory = sqlite3.Row + connection.text_factory = lambda b: b.decode(errors="ignore") + entries = connection.execute(self.build_query()).fetchall() + connection.close() + + if self.keyword: + re_keyword = re.compile(f".*{self.keyword}.*", re.IGNORECASE) # Keyword regex. + entries = filter(match_keyword, entries) + + entries = ({key.lower(): entry[key] for key in entry.keys()} for entry in entries) + entries = set((AccessionBacklogEntry.create_from_ma_geo_entry(entry) for entry in entries)) + + if self.previous_accessions: + entries = (entry for entry in entries if entry.code not in self.previous_accessions) + accessions.update(entries) + + return accessions + + def get_ids(self) -> List[str]: + """Returns a combined list of passed GEO platform IDs.""" + ids = set() + + if self.options["gpl_id"]: + ids.update(self.options["gpl_id"]) + + if self.options["gpl_ids_file"]: + with open(self.options["gpl_ids_file"]) as gpl_ids_file: + ids.update((gpl_id.strip() for gpl_id in gpl_ids_file.readlines())) + + return sorted(ids) + + +class RNASeqAccessionAgent(AccessionAgentBase): + """ + RNA-Seq accession gathering agent. The data is fetched from + The European Nucleotide Archive (ENA) Portal. + See https://www.ebi.ac.uk/ena/portal/api/ for more information about the API + endpoints. 
+ """ + + def __init__(self, options) -> None: + super().__init__(options) + + self.data_chunk_size = 10000 + self.data_url = "https://www.ebi.ac.uk/ena/portal/api/search" + self.ids = self.get_ids() + + def build_query(self, taxon_id: str = None) -> str: + """ + Returns a query to use for getting specific taxon ID accessions. + Some special characters must remain unquoted. + """ + + AND = " AND " + OR = " OR " + instrument_models = ( + "HiSeq X Five", + "HiSeq X Ten", + "Illumina Genome Analyzer II", + "Illumina Genome Analyzer IIx", + "Illumina Genome Analyzer", + "Illumina HiScanSQ", + "Illumina HiSeq 1000", + "Illumina HiSeq 1500", + "Illumina HiSeq 2000", + "Illumina HiSeq 2500", + "Illumina HiSeq 3000", + "Illumina HiSeq 4000", + "Illumina MiSeq", + "Illumina NovaSeq 6000", + "Ion Torrent Proton", + "Ion Torrent S5 XL", + "Ion Torrent S5", + "NextSeq 500", + "NextSeq 550", + ) + + instrument_models = OR.join((f'instrument_model="{im}"' for im in instrument_models)) + conditions = [ + # Relevant date fields: collection_date, collection_date_submitted, + # first_public, last_updated. + f"first_public >= {self.since}", + f"first_public <= {self.until}", + f"({instrument_models})", + 'library_source="TRANSCRIPTOMIC"', + 'library_strategy="RNA-Seq"', + ] + + if taxon_id: + conditions.append(f"tax_eq({taxon_id})") + elif self.keyword: + search_fields = ( + "assembly_software", + "bio_material", + "center_name", + "collected_by", + "experiment_title", + "host_body_site", + "instrument_model", + "instrument_platform", + "library_name", + "project_name", + "sample_title", + "sequencing_method", + "study_title", + ) + search_fields = OR.join( + (f'{sf}="*{self.keyword}*"' for sf in search_fields) + ) # Keyword regex. + conditions.append(f"({search_fields})") + elif self.organism: + # `host`: Natural (as opposed to laboratory) host to the organism from which sample + # was obtained. + # `host_scientific_name`: Scientific name of the natural (as opposed to laboratory) + # host to the organism from which sample was obtained. + # `scientific_name` Scientific name of the organism from which the sample was derived. + # Neither `host_scientific_name` nor `scientific_name` available for search. + # https://www.ebi.ac.uk/ena/portal/api/searchFields?dataPortal=ena&format=json&result=read_study + conditions.append(f'host="{self.organism}"') + + return quote(AND.join(conditions), safe='*()-="<>/ ') # Must remain unquoted. + + def collect_data(self) -> Set[str]: + """Gets new accessions from EBI ENA API.""" + accessions = set() + + if self.ids: + log.debug( + f"Getting RNA-Seq entries by taxon ID(s): " + f"{', '.join((str(idx) for idx in self.ids))} for [{self.since} - {self.until}] range." + ) + total = len(self.ids) + for idx, taxon_id in enumerate(self.ids): + if self.count and len(accessions) >= self.count: + break + + if total > 1: + log.debug(f"Getting entries for taxon ID {taxon_id}, {idx + 1} of {total}.") + accessions.update(self.fetch_data(taxon_id=taxon_id)) + elif self.keyword: + log.debug( + f'Getting RNA-Seq entries by "{self.keyword}" keyword ' + f"for [{self.since} - {self.until}] range." + ) + accessions.update(self.fetch_data()) + elif self.organism: + log.debug( + f'Getting entries by "{self.organism}" organism ' + f"for [{self.since} - {self.until}] range." + ) + accessions.update(self.fetch_data()) + + return accessions + + def fetch_data(self, taxon_id=None) -> Set[str]: + """ + Retrieves accessions from API search endpoint. 
+ The API allows to set limit to 0 (get all in one request) but we do + it in a paginated fashion with `self.data_chunk_size` as a page size. + """ + + @retry(**self.retry_params) + def get_response(url, **kwargs): + """Gets response from an API endpoint.""" + return requests.post(url, **kwargs) + + accessions = set() + + fields = [ + "first_public", + "scientific_name", + "secondary_study_accession", + ] # For DRP/ERP/SRP-prefixed accessions. + data = { + "dataPortal": "ena", + # TODO(ark): add excludeAccessions/excludeAccessionType support. + "fields": ",".join(fields), # Use "all" to get all fields. + "format": "json", + "limit": self.data_chunk_size, + "offset": 0, + "query": self.build_query(taxon_id=taxon_id), + "result": "read_study", + "sortFields": fields, + } + + is_done = False + while not is_done: + log.debug( + f"Processing entries {data['offset'] + 1} - {data['offset'] + self.data_chunk_size}" + ) + entries = () + try: + response = get_response(self.data_url, data=data) + entries = response.json() + # TODO(ark): add `organism` when -o, --organism flag is used. + entries = ( + AccessionBacklogEntry.create_from_rnaseq_entry(entry) for entry in entries + ) + except JSONDecodeError: + is_done = True + except TypeError: + log.error(f"Couldn't get data from {self.data_url}. Response: {entries}") + data["offset"] += self.data_chunk_size + + if self.previous_accessions: + entries = (entry for entry in entries if entry.code not in self.previous_accessions) + accessions.update(entries) + + # Quit after getting a sufficient amount of accessions. + if self.count and len(accessions) >= self.count: + is_done = True + + return accessions + + def get_ids(self) -> List[str]: + """Returns a combined list of passed taxon IDs.""" + ids = set() + + if self.options["taxon_id"]: + ids.update(self.options["taxon_id"]) + + if self.options["taxon_ids_file"]: + with open(self.options["taxon_ids_file"]) as taxon_id_file: + ids.update((taxon_id.strip() for taxon_id in taxon_id_file.readlines())) + + return sorted(ids) diff --git a/foreman/data_refinery_foreman/settings.py b/foreman/data_refinery_foreman/settings.py index 7a489facc..5fea76d71 100644 --- a/foreman/data_refinery_foreman/settings.py +++ b/foreman/data_refinery_foreman/settings.py @@ -47,6 +47,7 @@ "data_refinery_common", "data_refinery_foreman.surveyor", "data_refinery_foreman.foreman", + "data_refinery_foreman.gatherer", "raven.contrib.django.raven_compat", "computedfields", ] @@ -108,10 +109,18 @@ # https://docs.djangoproject.com/en/1.10/ref/settings/#auth-password-validators AUTH_PASSWORD_VALIDATORS = [ - {"NAME": "django.contrib.auth.password_validation.UserAttributeSimilarityValidator",}, - {"NAME": "django.contrib.auth.password_validation.MinimumLengthValidator",}, - {"NAME": "django.contrib.auth.password_validation.CommonPasswordValidator",}, - {"NAME": "django.contrib.auth.password_validation.NumericPasswordValidator",}, + { + "NAME": "django.contrib.auth.password_validation.UserAttributeSimilarityValidator", + }, + { + "NAME": "django.contrib.auth.password_validation.MinimumLengthValidator", + }, + { + "NAME": "django.contrib.auth.password_validation.CommonPasswordValidator", + }, + { + "NAME": "django.contrib.auth.password_validation.NumericPasswordValidator", + }, ] diff --git a/foreman/dockerfiles/Dockerfile.foreman b/foreman/dockerfiles/Dockerfile.foreman index 8c09c6888..929ef2476 100644 --- a/foreman/dockerfiles/Dockerfile.foreman +++ b/foreman/dockerfiles/Dockerfile.foreman @@ -8,6 +8,8 @@ RUN apt-get -y install apt-fast RUN 
apt-fast update -qq && \ apt-fast install -y \ + gcc \ + libpq-dev \ python3 \ python3-pip From 4c7f8049506257c22d07bb96c606dd2221c1a8f7 Mon Sep 17 00:00:00 2001 From: Arkadii Yakovets Date: Tue, 13 Sep 2022 17:12:41 -0700 Subject: [PATCH 14/24] Address review comments. --- .../migrations/0071_auto_20220901_1653.py | 44 -- ...klogentry.py => 0071_gatheredaccession.py} | 6 +- .../data_refinery_common/models/__init__.py | 2 +- .../{accession.py => gathered_accession.py} | 20 +- .../gatherer/agents/__init__.py | 0 .../gatherer/agents/base.py | 79 +++ .../gatherer/agents/microarray_ae.py | 126 ++++ .../gatherer/agents/microarray_geo.py | 123 ++++ .../gatherer/agents/rna_seq.py | 204 ++++++ .../management/commands/gather_accessions.py | 643 +++--------------- 10 files changed, 626 insertions(+), 621 deletions(-) delete mode 100644 common/data_refinery_common/migrations/0071_auto_20220901_1653.py rename common/data_refinery_common/migrations/{0071_accessionbacklogentry.py => 0071_gatheredaccession.py} (88%) rename common/data_refinery_common/models/{accession.py => gathered_accession.py} (84%) create mode 100644 foreman/data_refinery_foreman/gatherer/agents/__init__.py create mode 100644 foreman/data_refinery_foreman/gatherer/agents/base.py create mode 100644 foreman/data_refinery_foreman/gatherer/agents/microarray_ae.py create mode 100644 foreman/data_refinery_foreman/gatherer/agents/microarray_geo.py create mode 100644 foreman/data_refinery_foreman/gatherer/agents/rna_seq.py diff --git a/common/data_refinery_common/migrations/0071_auto_20220901_1653.py b/common/data_refinery_common/migrations/0071_auto_20220901_1653.py deleted file mode 100644 index c7d3b0b63..000000000 --- a/common/data_refinery_common/migrations/0071_auto_20220901_1653.py +++ /dev/null @@ -1,44 +0,0 @@ -# Generated by Django 3.2.7 on 2022-09-01 16:53 - -from django.db import migrations, models - - -class Migration(migrations.Migration): - - dependencies = [ - ("data_refinery_common", "0070_auto_20211208_2118"), - ] - - operations = [ - migrations.CreateModel( - name="Accession", - fields=[ - ( - "id", - models.AutoField( - auto_created=True, - primary_key=True, - serialize=False, - verbose_name="ID", - ), - ), - ("code", models.TextField()), - ("created_at", models.DateTimeField(auto_now_add=True)), - ("last_modified_at", models.DateTimeField(auto_now=True)), - ("organism", models.TextField()), - ("published_date", models.DateTimeField()), - ("sample_count", models.PositiveIntegerField(default=0)), - ("source", models.TextField()), - ("technology", models.TextField()), - ], - options={ - "db_table": "accessions", - }, - ), - migrations.AddConstraint( - model_name="accession", - constraint=models.UniqueConstraint( - fields=("code", "source", "technology"), name="unique_accession" - ), - ), - ] diff --git a/common/data_refinery_common/migrations/0071_accessionbacklogentry.py b/common/data_refinery_common/migrations/0071_gatheredaccession.py similarity index 88% rename from common/data_refinery_common/migrations/0071_accessionbacklogentry.py rename to common/data_refinery_common/migrations/0071_gatheredaccession.py index 86c04daed..a1740d96e 100644 --- a/common/data_refinery_common/migrations/0071_accessionbacklogentry.py +++ b/common/data_refinery_common/migrations/0071_gatheredaccession.py @@ -1,4 +1,4 @@ -# Generated by Django 3.2.7 on 2022-09-07 19:31 +# Generated by Django 3.2.7 on 2022-09-13 18:14 from django.db import migrations, models @@ -11,7 +11,7 @@ class Migration(migrations.Migration): operations = [ 
migrations.CreateModel( - name="AccessionBacklogEntry", + name="GatheredAccession", fields=[ ( "id", @@ -32,7 +32,7 @@ class Migration(migrations.Migration): ("technology", models.TextField()), ], options={ - "db_table": "accession_backlog", + "db_table": "gathered_accessions", }, ), ] diff --git a/common/data_refinery_common/models/__init__.py b/common/data_refinery_common/models/__init__.py index 8e9564153..2b544765d 100644 --- a/common/data_refinery_common/models/__init__.py +++ b/common/data_refinery_common/models/__init__.py @@ -1,4 +1,3 @@ -from data_refinery_common.models.accession import Accession # noqa from data_refinery_common.models.api_token import APIToken # noqa from data_refinery_common.models.associations.compendium_result_organism_association import ( # noqa CompendiumResultOrganismAssociation, @@ -46,6 +45,7 @@ from data_refinery_common.models.dataset_annotation import DatasetAnnotation # noqa from data_refinery_common.models.experiment import Experiment # noqa from data_refinery_common.models.experiment_annotation import ExperimentAnnotation # noqa +from data_refinery_common.models.gathered_accession import GatheredAccession # noqa from data_refinery_common.models.jobs.downloader_job import DownloaderJob # noqa from data_refinery_common.models.jobs.processor_job import ProcessorJob # noqa from data_refinery_common.models.jobs.survey_job import SurveyJob # noqa diff --git a/common/data_refinery_common/models/accession.py b/common/data_refinery_common/models/gathered_accession.py similarity index 84% rename from common/data_refinery_common/models/accession.py rename to common/data_refinery_common/models/gathered_accession.py index 6ac62da9f..04b084533 100644 --- a/common/data_refinery_common/models/accession.py +++ b/common/data_refinery_common/models/gathered_accession.py @@ -4,11 +4,11 @@ from django.utils import timezone -class AccessionBacklogEntry(models.Model): - """Accession backlog entry model.""" +class GatheredAccession(models.Model): + """Gathered accession model.""" class Meta: - db_table = "accession_backlog" + db_table = "gathered_accessions" code = models.TextField(unique=True) created_at = models.DateTimeField(auto_now_add=True) @@ -21,7 +21,7 @@ class Meta: def __eq__(self, other: object) -> bool: """Returns True if two objects are equal. 
Otherwise returns False.""" - return isinstance(other, AccessionBacklogEntry) and self.code == other.code + return isinstance(other, GatheredAccession) and self.code == other.code def __hash__(self) -> int: """Returns accession object unique hash value.""" @@ -32,15 +32,15 @@ def __str__(self) -> str: return ", ".join((self.code, self.technology, self.source, str(self.published_date.date()))) @staticmethod - def create_from_ma_ae_entry(entry): + def create_from_ma_ae_entry(entry, organism=None): """Creates accession object from MicroArray ArrayExpress entry.""" - accession = AccessionBacklogEntry() + accession = GatheredAccession() accession.code = entry["accession"] accession.source = "ebi_biostudies" accession.technology = "microarray" - if "organism" in entry: - accession.organism = entry["organism"] + if organism: + accession.organism = organism if "release_date" in entry: accession.published_date = timezone.make_aware( datetime.strptime(entry["release_date"], "%Y-%m-%d") @@ -51,7 +51,7 @@ def create_from_ma_ae_entry(entry): @staticmethod def create_from_ma_geo_entry(entry): """Creates accession object from MicroArray GEO meta DB entry.""" - accession = AccessionBacklogEntry() + accession = GatheredAccession() accession.code = entry["gse"] accession.source = "geo_meta_db" accession.technology = "microarray" @@ -69,7 +69,7 @@ def create_from_ma_geo_entry(entry): @staticmethod def create_from_rnaseq_entry(entry): """Creates accession object from RNA-Seq entry.""" - accession = AccessionBacklogEntry() + accession = GatheredAccession() accession.code = entry["secondary_study_accession"] accession.source = "ebi_ena_portal" accession.technology = "rna-seq" diff --git a/foreman/data_refinery_foreman/gatherer/agents/__init__.py b/foreman/data_refinery_foreman/gatherer/agents/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/foreman/data_refinery_foreman/gatherer/agents/base.py b/foreman/data_refinery_foreman/gatherer/agents/base.py new file mode 100644 index 000000000..3754a4068 --- /dev/null +++ b/foreman/data_refinery_foreman/gatherer/agents/base.py @@ -0,0 +1,79 @@ +"""Abstract base class for accession gathering automation agents.""" + +from abc import ABC, abstractmethod +from datetime import datetime +from http.client import RemoteDisconnected + +from requests.exceptions import ConnectionError, ConnectTimeout +from urllib3.exceptions import ProtocolError + +from data_refinery_common.logging import get_and_configure_logger +from data_refinery_common.models.experiment import Experiment +from data_refinery_common.models.gathered_accession import GatheredAccession + +logger = get_and_configure_logger(__name__) + + +class AccessionAgentBase(ABC): + "Accession agent abstract base class." + + previous_accessions = set() + retry_params = { + "retry_on_exception": lambda e: isinstance( + e, (ConnectionError, ConnectTimeout, ProtocolError, RemoteDisconnected) + ), + "stop_max_attempt_number": 5, + "wait_exponential_multiplier": 1000, # Seconds. + "wait_exponential_max": 16000, # Seconds. 
+ } + + def __init__(self, options) -> None: + """Populates args and values for major variables.""" + self.options = options + self.count = options["count"] + self.keyword = options["keyword"] + self.organism = options["organism"] + self.since = options["since"] + self.until = options["until"] or datetime.now().strftime("%Y-%m-%d") + + self.ids = self.get_ids() + self.populate_previous_accessions() + + @abstractmethod + def build_query(self): + """Returns query/query dict depending on the accession data source.""" + pass + + @abstractmethod + def collect_data(self): + """Generates resulting entry collection.""" + pass + + @abstractmethod + def fetch_data(self): + """Fetches data from an external or local data source.""" + pass + + @abstractmethod + def get_ids(self): + """Gets IDs for query filtering depending on the accession technology.""" + pass + + def populate_previous_accessions(self) -> None: + """Populates previous accession set from a provided excluded ids file.""" + if not self.options["exclude_previous"] or self.previous_accessions: + return + + # Gathered accessions. + self.previous_accessions.update( + (entry["code"] for entry in GatheredAccession.objects.values("code")) + ) + + # Surveyed accessions. + experiments = Experiment.objects.values("accession_code", "alternate_accession_code") + self.previous_accessions.update( + (experiment["accession_code"] for experiment in experiments) + ) + self.previous_accessions.update( + (experiment["alternate_accession_code"] for experiment in experiments) + ) diff --git a/foreman/data_refinery_foreman/gatherer/agents/microarray_ae.py b/foreman/data_refinery_foreman/gatherer/agents/microarray_ae.py new file mode 100644 index 000000000..b5314302b --- /dev/null +++ b/foreman/data_refinery_foreman/gatherer/agents/microarray_ae.py @@ -0,0 +1,126 @@ +"""MicroArray ArrayExpress accession gathering automation. +Data source: https://www.ebi.ac.uk/biostudies/help""" + +from typing import List, Set + +import requests +from retrying import retry + +from data_refinery_common.logging import get_and_configure_logger +from data_refinery_common.models.gathered_accession import GatheredAccession +from data_refinery_foreman.gatherer.agents.base import AccessionAgentBase + +logger = get_and_configure_logger(__name__) + + +class MicroArrayExpressAccessionAgent(AccessionAgentBase): + """ + MicroArray ArrayExpress accession gathering agent. The data is fetched from + the BioStudies database. See https://www.ebi.ac.uk/biostudies/help and + https://www.ebi.ac.uk/biostudies/arrayexpress/help#programmatic for more + information about the API endpoints. + """ + + DATA_CHUNK_SIZE = 100 + DATA_URL = "https://www.ebi.ac.uk/biostudies/api/v1/search" + + def build_query(self) -> dict: + """Returns a query dict for getting array/organism specific accessions.""" + query_dict = { + "directsub": "true", + "page": 1, + "pageSize": self.DATA_CHUNK_SIZE, + "release_date": f"[{self.since} TO {self.until}]", + "type": "study", + } + + if self.ids: + # TODO(ark): figure out better way of array filtering. 
+ # Also make sure it's equivalent to the array filtering in this query + # https://github.com/AlexsLemonade/accession_retrieval/blob/master/experiment_accession_retrieval.R#L208 + query_dict.update({"content": ", ".join(self.ids)}) + elif self.keyword: + query_dict.update({"content": self.keyword}) + elif self.organism: + query_dict.update({"organism": f'"{self.organism}"'}) + + return query_dict + + def collect_data(self) -> Set[str]: + """Gets new accessions from EBI Biostudies API.""" + accessions = set() + + if self.ids: + message = ( + "Getting MicroArray ArrayExpress entries by " + f"ArrayExpress ID(s): {', '.join(self.ids)} for [{self.since} - {self.until}] " + "range." + ) + elif self.keyword: + message = ( + "Getting MicroArray ArrayExpress entries by " + f'"{self.keyword}" keyword for [{self.since} - {self.until}] range.' + ) + elif self.organism: + message = ( + "Getting MicroArray ArrayExpress entries by " + f'"{self.organism}" organism for [{self.since} - {self.until}] range.' + ) + else: + return accessions + + logger.debug(message) + accessions.update(self.fetch_data()) + + return accessions + + def fetch_data(self) -> Set[str]: + """Retrieves accessions from API search endpoint.""" + + @retry(**self.retry_params) + def get_response(url, **kwargs): + """Gets response from an API endpoint.""" + return requests.get(url, **kwargs) + + accessions = set() + + is_done = False + params = self.build_query() + while not is_done: + range_start = (params["page"] - 1) * params["pageSize"] + 1 + range_end = (params["page"] - 1) * params["pageSize"] + self.DATA_CHUNK_SIZE + logger.debug(f"Processing entries {range_start} - {range_end}") + + response = get_response(self.DATA_URL, params=params) + entries = response.json().get("hits") + if entries: + entries = ( + GatheredAccession.create_from_ma_ae_entry(entry, organism=self.organism) + for entry in entries + ) + params["page"] += 1 + else: + is_done = True + + if self.previous_accessions: + entries = (entry for entry in entries if entry.code not in self.previous_accessions) + accessions.update(entries) + + # Quit after getting a sufficient amount of accessions. + if self.count and len(accessions) >= self.count: + is_done = True + + return accessions + + def get_ids(self) -> List[str]: + """Returns a combined list of passed ArrayExpress IDs.""" + ids = set() + + if self.options["ae_id"]: + ids.update(self.options["ae_id"]) + + if self.options["ae_ids_file"]: + with open(self.options["ae_ids_file"]) as ae_ids_file: + ids.update((ae_id.strip() for ae_id in ae_ids_file.readlines())) + + return sorted(ids) diff --git a/foreman/data_refinery_foreman/gatherer/agents/microarray_geo.py b/foreman/data_refinery_foreman/gatherer/agents/microarray_geo.py new file mode 100644 index 000000000..975c715b3 --- /dev/null +++ b/foreman/data_refinery_foreman/gatherer/agents/microarray_geo.py @@ -0,0 +1,123 @@ +"""MicroArray GEO accession gathering automation. +Data source: local SQLite meta DB from https://www.bioconductor.org/packages/release/bioc/html/GEOmetadb.html""" + +import os +import re +import sqlite3 +from typing import List, Set + +from data_refinery_common.logging import get_and_configure_logger +from data_refinery_common.models.gathered_accession import GatheredAccession +from data_refinery_foreman.gatherer.agents.base import AccessionAgentBase + +logger = get_and_configure_logger(__name__) + + +class MicroArrayGEOAccessionAgent(AccessionAgentBase): + """ + MicroArray GEO accession gathering agent. 
The data is fetched from a local + SQLite GEO meta database. + """ + + # TODO(ark): move the DB file from Docker image to S3. + # Implement syncing procedure. + # Update URL once the original file is available again. + DB_PATH = "data/microarray/GEOmetadb.sqlite" + + def build_query(self) -> str: + """Returns a query for getting GEO accessions from the local SQLite meta DB.""" + tables = [ + "SELECT *", + "FROM gse_gpl", + "JOIN gpl ON gse_gpl.gpl=gpl.gpl", + "JOIN gse ON gse.gse=gse_gpl.gse", + "GROUP BY gse_gpl.gse", + ] + + conditions = [ + f"HAVING gse.submission_date >= '{self.since}'", + f"gse.submission_date <= '{self.until}'", + ] + + if self.ids: + gpl_ids = (f"'{gpl_id}'" for gpl_id in self.ids) + conditions.append(f"gse_gpl.gpl IN ({', '.join(gpl_ids)})") + elif self.organism: + conditions.append(f"lower(organism)='{self.organism.lower()}'") + + return f"{' '.join(tables)} {' AND '.join(conditions)}" + + def collect_data(self) -> Set[str]: + """Gets new accessions from GEO database.""" + accessions = set() + + if self.ids: + message = ( + "Getting MicroArray GEO entries by GEO platform ID(s): " + f"{', '.join(self.ids)} for [{self.since} - {self.until}] range." + ) + elif self.keyword: + message = ( + f'Getting MicroArray GEO entries by "{self.keyword}" keyword ' + f"for [{self.since} - {self.until}] range." + ) + elif self.organism: + message = ( + f'Getting MicroArray GEO entries by "{self.organism}" organism ' + f"for [{self.since} - {self.until}] range." + ) + else: + return accessions + + logger.debug(message) + accessions.update(self.fetch_data()) + + return accessions + + def fetch_data(self) -> Set[str]: + """Retrieves accessions from the GEO meta DB.""" + + def match_keyword(row): + """ + Returns True if `row` matches `self.keyword` based regex. + Otherwise returns False. + """ + return re_keyword.match(" ".join((str(c) for c in row if c))) + + accessions = set() + + if not os.path.exists(self.DB_PATH): + logger.error("GEO meta database doesn't exist.") + return accessions + + connection = sqlite3.connect(self.DB_PATH) + connection.row_factory = sqlite3.Row + connection.text_factory = lambda b: b.decode(errors="ignore") + entries = connection.execute(self.build_query()).fetchall() + connection.close() + + if self.keyword: + re_keyword = re.compile(f".*{self.keyword}.*", re.IGNORECASE) # Keyword regex. + entries = filter(match_keyword, entries) + + entries = ({key.lower(): entry[key] for key in entry.keys()} for entry in entries) + entries = set((GatheredAccession.create_from_ma_geo_entry(entry) for entry in entries)) + + if self.previous_accessions: + entries = (entry for entry in entries if entry.code not in self.previous_accessions) + accessions.update(entries) + + return accessions + + def get_ids(self) -> List[str]: + """Returns a combined list of passed GEO platform IDs.""" + ids = set() + + if self.options["gpl_id"]: + ids.update(self.options["gpl_id"]) + + if self.options["gpl_ids_file"]: + with open(self.options["gpl_ids_file"]) as gpl_ids_file: + ids.update((gpl_id.strip() for gpl_id in gpl_ids_file.readlines())) + + return sorted(ids) diff --git a/foreman/data_refinery_foreman/gatherer/agents/rna_seq.py b/foreman/data_refinery_foreman/gatherer/agents/rna_seq.py new file mode 100644 index 000000000..f9497f3ba --- /dev/null +++ b/foreman/data_refinery_foreman/gatherer/agents/rna_seq.py @@ -0,0 +1,204 @@ +"""RNA-Seq accession gathering automation. 
+Data source: https://www.ebi.ac.uk/ena/portal/api/""" + +from json.decoder import JSONDecodeError +from typing import List, Set +from urllib.parse import quote + +import requests +from retrying import retry + +from data_refinery_common.logging import get_and_configure_logger +from data_refinery_common.models.gathered_accession import GatheredAccession +from data_refinery_foreman.gatherer.agents.base import AccessionAgentBase + +logger = get_and_configure_logger(__name__) + + +class RNASeqAccessionAgent(AccessionAgentBase): + """ + RNA-Seq accession gathering agent. The data is fetched from + The European Nucleotide Archive (ENA) Portal. + See https://www.ebi.ac.uk/ena/portal/api/ for more information about the API + endpoints. + """ + + DATA_CHUNK_SIZE = 10000 + DATA_URL = "https://www.ebi.ac.uk/ena/portal/api/search" + + def build_query(self, taxon_id: str = None) -> str: + """ + Returns a query to use for getting specific taxon ID accessions. + Some special characters must remain unquoted. + """ + + AND = " AND " + OR = " OR " + instrument_models = ( + "HiSeq X Five", + "HiSeq X Ten", + "Illumina Genome Analyzer II", + "Illumina Genome Analyzer IIx", + "Illumina Genome Analyzer", + "Illumina HiScanSQ", + "Illumina HiSeq 1000", + "Illumina HiSeq 1500", + "Illumina HiSeq 2000", + "Illumina HiSeq 2500", + "Illumina HiSeq 3000", + "Illumina HiSeq 4000", + "Illumina MiSeq", + "Illumina NovaSeq 6000", + "Ion Torrent Proton", + "Ion Torrent S5 XL", + "Ion Torrent S5", + "NextSeq 500", + "NextSeq 550", + ) + + instrument_models = OR.join((f'instrument_model="{im}"' for im in instrument_models)) + conditions = [ + # Relevant date fields: collection_date, collection_date_submitted, + # first_public, last_updated. + f"first_public >= {self.since}", + f"first_public <= {self.until}", + f"({instrument_models})", + 'library_source="TRANSCRIPTOMIC"', + 'library_strategy="RNA-Seq"', + ] + + if taxon_id: + conditions.append(f"tax_eq({taxon_id})") + elif self.keyword: + search_fields = ( + "assembly_software", + "bio_material", + "center_name", + "collected_by", + "experiment_title", + "host_body_site", + "instrument_model", + "instrument_platform", + "library_name", + "project_name", + "sample_title", + "sequencing_method", + "study_title", + ) + search_fields = OR.join( + (f'{sf}="*{self.keyword}*"' for sf in search_fields) + ) # Keyword regex. + conditions.append(f"({search_fields})") + elif self.organism: + # `host`: Natural (as opposed to laboratory) host to the organism from which sample + # was obtained. + # `host_scientific_name`: Scientific name of the natural (as opposed to laboratory) + # host to the organism from which sample was obtained. + # `scientific_name` Scientific name of the organism from which the sample was derived. + # Neither `host_scientific_name` nor `scientific_name` available for search. + # https://www.ebi.ac.uk/ena/portal/api/searchFields?dataPortal=ena&format=json&result=read_study + conditions.append(f'host="{self.organism}"') + + return quote(AND.join(conditions), safe='*()-="<>/ ') # Must remain unquoted. + + def collect_data(self) -> Set[str]: + """Gets new accessions from EBI ENA API.""" + accessions = set() + + if self.ids: + logger.debug( + f"Getting RNA-Seq entries by taxon ID(s): " + f"{', '.join((str(i) for i in self.ids))} for [{self.since} - {self.until}] range." 
+ ) + total = len(self.ids) + for idx, taxon_id in enumerate(self.ids): + if self.count and len(accessions) >= self.count: + break + + if total > 1: + logger.debug(f"Getting entries for taxon ID {taxon_id}, {idx + 1} of {total}.") + accessions.update(self.fetch_data(taxon_id=taxon_id)) + elif self.keyword: + logger.debug( + f'Getting RNA-Seq entries by "{self.keyword}" keyword ' + f"for [{self.since} - {self.until}] range." + ) + accessions.update(self.fetch_data()) + elif self.organism: + logger.debug( + f'Getting entries by "{self.organism}" organism ' + f"for [{self.since} - {self.until}] range." + ) + accessions.update(self.fetch_data()) + + return accessions + + def fetch_data(self, taxon_id=None) -> Set[str]: + """ + Retrieves accessions from API search endpoint. + The API allows to set limit to 0 (get all in one request) but we do + it in a paginated fashion with `self.DATA_CHUNK_SIZE` as a page size. + """ + + @retry(**self.retry_params) + def get_response(url, **kwargs): + """Gets response from an API endpoint.""" + return requests.post(url, **kwargs) + + accessions = set() + + fields = [ + "first_public", + "scientific_name", + "secondary_study_accession", + ] # For DRP/ERP/SRP-prefixed accessions. + data = { + "dataPortal": "ena", + # TODO(ark): add excludeAccessions/excludeAccessionType support. + "fields": ",".join(fields), # Use "all" to get all fields. + "format": "json", + "limit": self.DATA_CHUNK_SIZE, + "offset": 0, + "query": self.build_query(taxon_id=taxon_id), + "result": "read_study", + "sortFields": fields, + } + + is_done = False + while not is_done: + logger.debug( + f"Processing entries {data['offset'] + 1} - {data['offset'] + self.DATA_CHUNK_SIZE}" + ) + entries = () + try: + response = get_response(self.DATA_URL, data=data) + entries = response.json() + entries = (GatheredAccession.create_from_rnaseq_entry(entry) for entry in entries) + except JSONDecodeError: + is_done = True + except TypeError: + logger.error(f"Couldn't get data from {self.data_url}. Response: {entries}") + data["offset"] += self.DATA_CHUNK_SIZE + + if self.previous_accessions: + entries = (entry for entry in entries if entry.code not in self.previous_accessions) + accessions.update(entries) + + # Quit after getting a sufficient amount of accessions. 
+ if self.count and len(accessions) >= self.count: + is_done = True + + return accessions + + def get_ids(self) -> List[str]: + """Returns a combined list of passed taxon IDs.""" + ids = set() + + if self.options["taxon_id"]: + ids.update(self.options["taxon_id"]) + + if self.options["taxon_ids_file"]: + with open(self.options["taxon_ids_file"]) as taxon_id_file: + ids.update((taxon_id.strip() for taxon_id in taxon_id_file.readlines())) + + return sorted(ids) diff --git a/foreman/data_refinery_foreman/gatherer/management/commands/gather_accessions.py b/foreman/data_refinery_foreman/gatherer/management/commands/gather_accessions.py index c4808a191..445245d3a 100644 --- a/foreman/data_refinery_foreman/gatherer/management/commands/gather_accessions.py +++ b/foreman/data_refinery_foreman/gatherer/management/commands/gather_accessions.py @@ -8,32 +8,27 @@ import argparse import logging -import os import re -import sqlite3 -from datetime import datetime -from http.client import RemoteDisconnected -from json.decoder import JSONDecodeError -from typing import List, Set -from urllib.parse import quote from django.core.management.base import BaseCommand -import requests -from requests.exceptions import ConnectionError, ConnectTimeout -from retrying import retry -from urllib3.exceptions import ProtocolError - from data_refinery_common.logging import get_and_configure_logger -from data_refinery_common.models.accession import AccessionBacklogEntry -from data_refinery_common.models.experiment import Experiment +from data_refinery_common.models.gathered_accession import GatheredAccession +from data_refinery_foreman.gatherer.agents.microarray_ae import MicroArrayExpressAccessionAgent +from data_refinery_foreman.gatherer.agents.microarray_geo import MicroArrayGEOAccessionAgent +from data_refinery_foreman.gatherer.agents.rna_seq import RNASeqAccessionAgent -log = get_and_configure_logger(__name__) +logger = get_and_configure_logger(__name__) class Command(BaseCommand): """Creates agents and runs actual accession gathering.""" + DATA_SOURCE_MA_AE = "microarray-ae" + DATA_SOURCE_MA_GEO = "microarray-geo" + DATA_SOURCE_RNA_SEQ = "rna-seq" + DATA_SOURCES = (DATA_SOURCE_MA_AE, DATA_SOURCE_MA_GEO, DATA_SOURCE_RNA_SEQ) + RE_ACCESSION = re.compile(r"(\D+)(\d+)") RE_DATE = re.compile(r"\d{4}-\d{2}-\d{2}") @@ -76,14 +71,6 @@ def add_arguments(self, parser) -> None: default=True, help="Exclude previously gathered or surveyed accessions.", ) - parser.add_argument( - "-ne", - "--no-exclude-previous", - action="store_false", - default=False, - dest="exclude_previous", - help="Do not exclude previously gathered or surveyed accessions.", - ) parser.add_argument( "--gpl-id", action="extend", @@ -103,21 +90,22 @@ def add_arguments(self, parser) -> None: help="Keyword to use for filtering.", ) parser.add_argument( - "-m", - "--microarray", + "-lv", + "--log-verbose", action="store_true", default=False, - help="Collect MicroArray accessions.", + help="Enable verbose log output.", ) parser.add_argument( - "-o", "--organism", type=str, help="Organism name to use for filtering." + "-ne", + "--no-exclude-previous", + action="store_false", + default=False, + dest="exclude_previous", + help="Do not exclude previously gathered or surveyed accessions.", ) parser.add_argument( - "-r", - "--rna-seq", - action="store_true", - default=False, - help="Collect RNA-Seq accessions.", + "-o", "--organism", type=str, help="Organism name to use for filtering." 
) parser.add_argument( "-s", @@ -126,6 +114,14 @@ def add_arguments(self, parser) -> None: required=True, help="Collect accessions made public on or after this date.", ) + parser.add_argument( + "-src", + "--source", + type=str, + action="extend", + nargs="+", + help="Gather accessions from selected sources.", + ) parser.add_argument( "--taxon-id", action="extend", @@ -144,28 +140,19 @@ def add_arguments(self, parser) -> None: type=str, help="Collect accessions made public before or on this date.", ) - parser.add_argument( - "-lv", - "--log-verbose", - action="store_true", - default=False, - help="Enable verbose log output.", - ) def set_verbosity_level(self, options) -> None: """Configures log verbosity level.""" if options["log_verbose"]: - log.addHandler(logging.StreamHandler()) - log.setLevel(logging.DEBUG) + logger.addHandler(logging.StreamHandler()) + logger.setLevel(logging.DEBUG) else: - log.setLevel(logging.ERROR) + logger.setLevel(logging.ERROR) def validate_args(self, options) -> None: """Validates arguments.""" - if not options["microarray"] and not options["rna_seq"]: - exit("Either --microarray or --rna-seq must be specified.") - errors = list() + since = options["since"] until = options["until"] if not self.RE_DATE.match(since): @@ -177,52 +164,65 @@ def validate_args(self, options) -> None: keyword = options["keyword"] organism = options["organism"] - if options["microarray"]: - ae_id = options["ae_id"] or options["ae_ids_file"] - gpl_id = options["gpl_id"] or options["gpl_ids_file"] - ids = ae_id or gpl_id - invalid_options_message = ( - "Exactly one of the keyword [-k, --keyword], organism [-o, --organism] or " - "ArrayExpress ID(s) [--ae-id, --ae-ids-file] / GEO platform ID(s) " - "[--gpl-id, --gpl-ids-file] must be specified." - ) - elif options["rna_seq"]: - taxon_id = options["taxon_id"] or options["taxon_ids_file"] - ids = taxon_id - invalid_options_message = ( - "Exactly one of the keyword [-k, --keyword], organism [-o, --organism] " - "or taxon ID(s) [--taxon-id, --taxon-ids-file] must be specified." + sources = options["source"] or self.DATA_SOURCES + + for source in sources: + if source in self.DATA_SOURCES: + continue + errors.append( + f"Unknown source: {source}. Supported sources: {', '.join(self.DATA_SOURCES)}" ) - if len([option for option in (ids, keyword, organism) if option]) != 1: - errors.append(invalid_options_message) + if self.DATA_SOURCE_MA_AE in sources: + ids = options["ae_id"] or options["ae_ids_file"] + if not (ids or keyword or organism): + errors.append( + ( + "Exactly one of the keyword [-k, --keyword], organism [-o, --organism] or " + "ArrayExpress ID(s) [--ae-id, --ae-ids-file] must be specified for " + f"'{self.DATA_SOURCE_MA_AE}' source." + ) + ) + if self.DATA_SOURCE_MA_GEO in sources: + ids = options["gpl_id"] or options["gpl_ids_file"] + if not (ids or keyword or organism): + errors.append( + ( + "Exactly one of the keyword [-k, --keyword], organism [-o, --organism] or " + "GEO platform ID(s) [--gpl-id, --gpl-ids-file] must be specified for " + f"'{self.DATA_SOURCE_MA_GEO}' source." + ) + ) + if self.DATA_SOURCE_RNA_SEQ in sources: + ids = options["taxon_id"] or options["taxon_ids_file"] + if not (ids or keyword or organism): + errors.append( + ( + "Exactly one of the keyword [-k, --keyword], organism [-o, --organism] " + "or taxon ID(s) [--taxon-id, --taxon-ids-file] must be specified for " + f"'{self.DATA_SOURCE_RNA_SEQ}' source." 
+ ) + ) if errors: exit("\n".join(errors)) def handle(self, *args, **options): - """Runs the accession gathering process.""" + """Creates agents and runs the accession gathering process.""" self.validate_args(options) self.set_verbosity_level(options) agents = list() - if options["rna_seq"]: + sources = options["source"] or self.DATA_SOURCES + + if self.DATA_SOURCE_RNA_SEQ in sources: agents.append(RNASeqAccessionAgent(options)) - elif options["microarray"]: - if ( - options["ae_id"] - or options["ae_ids_file"] - or options["keyword"] - or options["organism"] - ): - agents.append(MicroArrayExpressAccessionAgent(options)) - if ( - options["gpl_id"] - or options["gpl_ids_file"] - or options["keyword"] - or options["organism"] - ): - agents.append(MicroArrayGEOAccessionAgent(options)) + + if self.DATA_SOURCE_MA_AE in sources: + agents.append(MicroArrayExpressAccessionAgent(options)) + + if self.DATA_SOURCE_MA_GEO in sources: + agents.append(MicroArrayGEOAccessionAgent(options)) entries = set() for agent in agents: @@ -245,487 +245,4 @@ def handle(self, *args, **options): output = "No accessions found." print(output) else: - AccessionBacklogEntry.objects.bulk_create(entries) - - -class AccessionAgentBase: - "Accession agent base class." - - previous_accessions = set() - retry_params = { - "retry_on_exception": lambda e: isinstance( - e, (ConnectionError, ConnectTimeout, ProtocolError, RemoteDisconnected) - ), - "stop_max_attempt_number": 5, - "wait_exponential_multiplier": 1000, # Seconds. - "wait_exponential_max": 16000, # Seconds. - } - - def __init__(self, options) -> None: - """Populates args and values for major variables.""" - self.options = options - self.count = options["count"] - self.keyword = options["keyword"] - self.organism = options["organism"] - self.since = options["since"] - self.until = options["until"] or datetime.now().strftime("%Y-%m-%d") - - self.populate_previous_accessions() - - def build_query(self): - """Returns query/query dict depending on the accession data source.""" - raise NotImplementedError - - def collect_data(self): - """Generates resulting entry collection.""" - raise NotImplementedError - - def fetch_data(self): - """Fetches data from an external or local data source.""" - raise NotImplementedError - - def get_ids(self): - """Gets IDs for query filtering depending on the accession technology.""" - raise NotImplementedError - - def populate_previous_accessions(self) -> None: - """Populates previous accession set from a provided excluded ids file.""" - if not self.options["exclude_previous"] or self.previous_accessions: - return - - # Gathered accessions. - self.previous_accessions.update( - (entry["code"] for entry in AccessionBacklogEntry.objects.values("code")) - ) - - # Surveyed accessions. - experiments = Experiment.objects.values("accession_code", "alternate_accession_code") - self.previous_accessions.update( - (experiment["accession_code"] for experiment in experiments) - ) - self.previous_accessions.update( - (experiment["alternate_accession_code"] for experiment in experiments) - ) - - -class MicroArrayExpressAccessionAgent(AccessionAgentBase): - """ - MicroArray ArrayExpress accession gathering agent. The data is fetched from - the BioStudies database. See https://www.ebi.ac.uk/biostudies/help and - https://www.ebi.ac.uk/biostudies/arrayexpress/help#programmatic for more - information about the API endpoints. 
- """ - - def __init__(self, options) -> None: - super().__init__(options) - - self.data_chunk_size = 100 - self.data_url = "https://www.ebi.ac.uk/biostudies/api/v1/search" - self.ids = self.get_ids() - - def build_query(self) -> dict: - """Returns a query dict for getting array/organism specific accessions.""" - query_dict = { - "directsub": "true", - "page": 1, - "pageSize": self.data_chunk_size, - "release_date": f"[{self.since} TO {self.until}]", - "type": "study", - } - - if self.ids: - # TODO(ark): figure out better way of array filtering. - # Also make sure it's equivalent to the array filtering in this query - # https://github.com/AlexsLemonade/accession_retrieval/blob/master/experiment_accession_retrieval.R#L208 - query_dict.update({"content": ", ".join(self.ids)}) - elif self.keyword: - query_dict.update({"content": self.keyword}) - elif self.organism: - query_dict.update({"organism": f'"{self.organism}"'}) - - return query_dict - - def collect_data(self) -> Set[str]: - """Gets new accessions from EBI Biostudies API.""" - accessions = set() - - if self.ids: - message = ( - "Getting MicroArray ArrayExpress entries by " - f"ArrayExpress ID(s): {', '.join(self.ids)} for [{self.since} - {self.until}] " - "range." - ) - elif self.keyword: - message = ( - "Getting MicroArray ArrayExpress entries by " - f'"{self.keyword}" keyword for [{self.since} - {self.until}] range.' - ) - elif self.organism: - message = ( - "Getting MicroArray ArrayExpress entries by " - f'"{self.organism}" organism for [{self.since} - {self.until}] range.' - ) - else: - return accessions - - log.debug(message) - accessions.update(self.fetch_data()) - - return accessions - - def fetch_data(self) -> Set[str]: - """Retrieves accessions from API search endpoint.""" - - @retry(**self.retry_params) - def get_response(url, **kwargs): - """Gets response from an API endpoint.""" - return requests.get(url, **kwargs) - - accessions = set() - - is_done = False - params = self.build_query() - while not is_done: - range_start = (params["page"] - 1) * params["pageSize"] + 1 - range_end = (params["page"] - 1) * params["pageSize"] + self.data_chunk_size - log.debug(f"Processing entries {range_start} - {range_end}") - - response = get_response(self.data_url, params=params) - entries = response.json().get("hits") - if entries: - entries = ( - AccessionBacklogEntry.create_from_ma_ae_entry(entry) for entry in entries - ) - params["page"] += 1 - else: - is_done = True - - if self.previous_accessions: - entries = (entry for entry in entries if entry.code not in self.previous_accessions) - accessions.update(entries) - - # Quit after getting a sufficient amount of accessions. - if self.count and len(accessions) >= self.count: - is_done = True - - return accessions - - def get_ids(self) -> List[str]: - """Returns a combined list of passed ArrayExpress IDs.""" - ids = set() - - if self.options["ae_id"]: - ids.update(self.options["ae_id"]) - - if self.options["ae_ids_file"]: - with open(self.options["ae_ids_file"]) as ae_ids_file: - ids.update((ae_id.strip() for ae_id in ae_ids_file.readlines())) - - return sorted(ids) - - -class MicroArrayGEOAccessionAgent(AccessionAgentBase): - """ - MicroArray GEO accession gathering agent. The data is fetched from a local - SQLite GEO meta database. 
- """ - - def __init__(self, options) -> None: - super().__init__(options) - - self.db_path = "data/microarray/GEOmetadb.sqlite" - self.ids = self.get_ids() - - def build_query(self) -> str: - """Returns a query for getting GEO accessions from the local SQLite meta DB.""" - tables = [ - f"SELECT *", - "FROM gse_gpl", - "JOIN gpl ON gse_gpl.gpl=gpl.gpl", - "JOIN gse ON gse.gse=gse_gpl.gse", - "GROUP BY gse_gpl.gse", - ] - - conditions = [ - f"HAVING gse.submission_date >= '{self.since}'", - f"gse.submission_date <= '{self.until}'", - ] - - if self.ids: - gpl_ids = (f"'{gpl_id}'" for gpl_id in self.ids) - conditions.append(f"gse_gpl.gpl IN ({', '.join(gpl_ids)})") - elif self.organism: - conditions.append(f"lower(organism)='{self.organism.lower()}'") - - return f"{' '.join(tables)} {' AND '.join(conditions)}" - - def collect_data(self) -> Set[str]: - """Gets new accessions from GEO database.""" - accessions = set() - - if self.ids: - message = ( - "Getting MicroArray GEO entries by GEO platform ID(s): " - f"{', '.join(self.ids)} for [{self.since} - {self.until}] range." - ) - elif self.keyword: - message = ( - f'Getting MicroArray GEO entries by "{self.keyword}" keyword ' - f"for [{self.since} - {self.until}] range." - ) - elif self.organism: - message = ( - f'Getting MicroArray GEO entries by "{self.organism}" organism ' - f"for [{self.since} - {self.until}] range." - ) - else: - return accessions - - log.debug(message) - accessions.update(self.fetch_data()) - - return accessions - - def fetch_data(self) -> Set[str]: - """Retrieves accessions from the GEO meta DB.""" - - def match_keyword(row): - """ - Returns True if `row` matches `self.keyword` based regex. - Otherwise returns False. - """ - return re_keyword.match(" ".join((str(c) for c in row if c))) - - accessions = set() - - if not os.path.exists(self.db_path): - log.error("GEO meta database doesn't exist.") - return accessions - - connection = sqlite3.connect(self.db_path) - connection.row_factory = sqlite3.Row - connection.text_factory = lambda b: b.decode(errors="ignore") - entries = connection.execute(self.build_query()).fetchall() - connection.close() - - if self.keyword: - re_keyword = re.compile(f".*{self.keyword}.*", re.IGNORECASE) # Keyword regex. - entries = filter(match_keyword, entries) - - entries = ({key.lower(): entry[key] for key in entry.keys()} for entry in entries) - entries = set((AccessionBacklogEntry.create_from_ma_geo_entry(entry) for entry in entries)) - - if self.previous_accessions: - entries = (entry for entry in entries if entry.code not in self.previous_accessions) - accessions.update(entries) - - return accessions - - def get_ids(self) -> List[str]: - """Returns a combined list of passed GEO platform IDs.""" - ids = set() - - if self.options["gpl_id"]: - ids.update(self.options["gpl_id"]) - - if self.options["gpl_ids_file"]: - with open(self.options["gpl_ids_file"]) as gpl_ids_file: - ids.update((gpl_id.strip() for gpl_id in gpl_ids_file.readlines())) - - return sorted(ids) - - -class RNASeqAccessionAgent(AccessionAgentBase): - """ - RNA-Seq accession gathering agent. The data is fetched from - The European Nucleotide Archive (ENA) Portal. - See https://www.ebi.ac.uk/ena/portal/api/ for more information about the API - endpoints. 
- """ - - def __init__(self, options) -> None: - super().__init__(options) - - self.data_chunk_size = 10000 - self.data_url = "https://www.ebi.ac.uk/ena/portal/api/search" - self.ids = self.get_ids() - - def build_query(self, taxon_id: str = None) -> str: - """ - Returns a query to use for getting specific taxon ID accessions. - Some special characters must remain unquoted. - """ - - AND = " AND " - OR = " OR " - instrument_models = ( - "HiSeq X Five", - "HiSeq X Ten", - "Illumina Genome Analyzer II", - "Illumina Genome Analyzer IIx", - "Illumina Genome Analyzer", - "Illumina HiScanSQ", - "Illumina HiSeq 1000", - "Illumina HiSeq 1500", - "Illumina HiSeq 2000", - "Illumina HiSeq 2500", - "Illumina HiSeq 3000", - "Illumina HiSeq 4000", - "Illumina MiSeq", - "Illumina NovaSeq 6000", - "Ion Torrent Proton", - "Ion Torrent S5 XL", - "Ion Torrent S5", - "NextSeq 500", - "NextSeq 550", - ) - - instrument_models = OR.join((f'instrument_model="{im}"' for im in instrument_models)) - conditions = [ - # Relevant date fields: collection_date, collection_date_submitted, - # first_public, last_updated. - f"first_public >= {self.since}", - f"first_public <= {self.until}", - f"({instrument_models})", - 'library_source="TRANSCRIPTOMIC"', - 'library_strategy="RNA-Seq"', - ] - - if taxon_id: - conditions.append(f"tax_eq({taxon_id})") - elif self.keyword: - search_fields = ( - "assembly_software", - "bio_material", - "center_name", - "collected_by", - "experiment_title", - "host_body_site", - "instrument_model", - "instrument_platform", - "library_name", - "project_name", - "sample_title", - "sequencing_method", - "study_title", - ) - search_fields = OR.join( - (f'{sf}="*{self.keyword}*"' for sf in search_fields) - ) # Keyword regex. - conditions.append(f"({search_fields})") - elif self.organism: - # `host`: Natural (as opposed to laboratory) host to the organism from which sample - # was obtained. - # `host_scientific_name`: Scientific name of the natural (as opposed to laboratory) - # host to the organism from which sample was obtained. - # `scientific_name` Scientific name of the organism from which the sample was derived. - # Neither `host_scientific_name` nor `scientific_name` available for search. - # https://www.ebi.ac.uk/ena/portal/api/searchFields?dataPortal=ena&format=json&result=read_study - conditions.append(f'host="{self.organism}"') - - return quote(AND.join(conditions), safe='*()-="<>/ ') # Must remain unquoted. - - def collect_data(self) -> Set[str]: - """Gets new accessions from EBI ENA API.""" - accessions = set() - - if self.ids: - log.debug( - f"Getting RNA-Seq entries by taxon ID(s): " - f"{', '.join((str(idx) for idx in self.ids))} for [{self.since} - {self.until}] range." - ) - total = len(self.ids) - for idx, taxon_id in enumerate(self.ids): - if self.count and len(accessions) >= self.count: - break - - if total > 1: - log.debug(f"Getting entries for taxon ID {taxon_id}, {idx + 1} of {total}.") - accessions.update(self.fetch_data(taxon_id=taxon_id)) - elif self.keyword: - log.debug( - f'Getting RNA-Seq entries by "{self.keyword}" keyword ' - f"for [{self.since} - {self.until}] range." - ) - accessions.update(self.fetch_data()) - elif self.organism: - log.debug( - f'Getting entries by "{self.organism}" organism ' - f"for [{self.since} - {self.until}] range." - ) - accessions.update(self.fetch_data()) - - return accessions - - def fetch_data(self, taxon_id=None) -> Set[str]: - """ - Retrieves accessions from API search endpoint. 
- The API allows to set limit to 0 (get all in one request) but we do - it in a paginated fashion with `self.data_chunk_size` as a page size. - """ - - @retry(**self.retry_params) - def get_response(url, **kwargs): - """Gets response from an API endpoint.""" - return requests.post(url, **kwargs) - - accessions = set() - - fields = [ - "first_public", - "scientific_name", - "secondary_study_accession", - ] # For DRP/ERP/SRP-prefixed accessions. - data = { - "dataPortal": "ena", - # TODO(ark): add excludeAccessions/excludeAccessionType support. - "fields": ",".join(fields), # Use "all" to get all fields. - "format": "json", - "limit": self.data_chunk_size, - "offset": 0, - "query": self.build_query(taxon_id=taxon_id), - "result": "read_study", - "sortFields": fields, - } - - is_done = False - while not is_done: - log.debug( - f"Processing entries {data['offset'] + 1} - {data['offset'] + self.data_chunk_size}" - ) - entries = () - try: - response = get_response(self.data_url, data=data) - entries = response.json() - # TODO(ark): add `organism` when -o, --organism flag is used. - entries = ( - AccessionBacklogEntry.create_from_rnaseq_entry(entry) for entry in entries - ) - except JSONDecodeError: - is_done = True - except TypeError: - log.error(f"Couldn't get data from {self.data_url}. Response: {entries}") - data["offset"] += self.data_chunk_size - - if self.previous_accessions: - entries = (entry for entry in entries if entry.code not in self.previous_accessions) - accessions.update(entries) - - # Quit after getting a sufficient amount of accessions. - if self.count and len(accessions) >= self.count: - is_done = True - - return accessions - - def get_ids(self) -> List[str]: - """Returns a combined list of passed taxon IDs.""" - ids = set() - - if self.options["taxon_id"]: - ids.update(self.options["taxon_id"]) - - if self.options["taxon_ids_file"]: - with open(self.options["taxon_ids_file"]) as taxon_id_file: - ids.update((taxon_id.strip() for taxon_id in taxon_id_file.readlines())) - - return sorted(ids) + GatheredAccession.objects.bulk_create(entries) From 3fe5d8088d0862a1445cf002c4a4b1f067f4cb55 Mon Sep 17 00:00:00 2001 From: Arkadii Yakovets Date: Tue, 13 Sep 2022 17:17:19 -0700 Subject: [PATCH 15/24] Add a TODO. --- foreman/data_refinery_foreman/gatherer/agents/rna_seq.py | 1 + 1 file changed, 1 insertion(+) diff --git a/foreman/data_refinery_foreman/gatherer/agents/rna_seq.py b/foreman/data_refinery_foreman/gatherer/agents/rna_seq.py index f9497f3ba..f54ba570a 100644 --- a/foreman/data_refinery_foreman/gatherer/agents/rna_seq.py +++ b/foreman/data_refinery_foreman/gatherer/agents/rna_seq.py @@ -34,6 +34,7 @@ def build_query(self, taxon_id: str = None) -> str: AND = " AND " OR = " OR " + # TODO(ark): extract instrument models to a config file. instrument_models = ( "HiSeq X Five", "HiSeq X Ten", From ee66ac812337f7cdf9101c7a906497d9e3ea75ba Mon Sep 17 00:00:00 2001 From: Arkadii Yakovets Date: Tue, 13 Sep 2022 18:37:09 -0700 Subject: [PATCH 16/24] Fix empty response issue. 
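The BioStudies search endpoint can return a payload that has no "hits" key (for example, when a query matches nothing). In that case `response.json().get("hits")` evaluates to None, and the code downstream raises a TypeError because None is not iterable (both the generator expression over `entries` and `accessions.update(entries)` need an iterable). Defaulting to an empty tuple keeps the fetch loop a safe no-op. A minimal sketch of the difference, using a hypothetical empty payload rather than a captured API response:

    payload = {"totalHits": 0}  # assumed shape; note the missing "hits" key

    entries = payload.get("hits")       # None -> TypeError once iterated or passed to set.update()
    entries = payload.get("hits", ())   # ()   -> iteration and set.update() are harmless no-ops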
--- foreman/data_refinery_foreman/gatherer/agents/microarray_ae.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/foreman/data_refinery_foreman/gatherer/agents/microarray_ae.py b/foreman/data_refinery_foreman/gatherer/agents/microarray_ae.py index b5314302b..541bd86d2 100644 --- a/foreman/data_refinery_foreman/gatherer/agents/microarray_ae.py +++ b/foreman/data_refinery_foreman/gatherer/agents/microarray_ae.py @@ -92,7 +92,7 @@ def get_response(url, **kwargs): logger.debug(f"Processing entries {range_start} - {range_end}") response = get_response(self.DATA_URL, params=params) - entries = response.json().get("hits") + entries = response.json().get("hits", ()) if entries: entries = ( GatheredAccession.create_from_ma_ae_entry(entry, organism=self.organism) From 5521286c64b4dc41a9ce070907157fce435c2cfa Mon Sep 17 00:00:00 2001 From: Arkadii Yakovets Date: Wed, 14 Sep 2022 18:52:57 -0700 Subject: [PATCH 17/24] Address review comments. --- .../migrations/0071_gatheredaccession.py | 2 +- .../models/gathered_accession.py | 72 +++++++------------ .../gatherer/agents/base.py | 5 +- .../gatherer/agents/microarray_ae.py | 21 ++++-- .../gatherer/agents/microarray_geo.py | 23 ++++-- .../gatherer/agents/rna_seq.py | 24 +++++-- .../management/commands/gather_accessions.py | 60 ++++++++-------- 7 files changed, 112 insertions(+), 95 deletions(-) diff --git a/common/data_refinery_common/migrations/0071_gatheredaccession.py b/common/data_refinery_common/migrations/0071_gatheredaccession.py index a1740d96e..65d192b59 100644 --- a/common/data_refinery_common/migrations/0071_gatheredaccession.py +++ b/common/data_refinery_common/migrations/0071_gatheredaccession.py @@ -22,7 +22,7 @@ class Migration(migrations.Migration): verbose_name="ID", ), ), - ("code", models.TextField(unique=True)), + ("accession_code", models.TextField(unique=True)), ("created_at", models.DateTimeField(auto_now_add=True)), ("last_modified_at", models.DateTimeField(auto_now=True)), ("organism", models.TextField()), diff --git a/common/data_refinery_common/models/gathered_accession.py b/common/data_refinery_common/models/gathered_accession.py index 04b084533..e56ed615c 100644 --- a/common/data_refinery_common/models/gathered_accession.py +++ b/common/data_refinery_common/models/gathered_accession.py @@ -10,7 +10,7 @@ class GatheredAccession(models.Model): class Meta: db_table = "gathered_accessions" - code = models.TextField(unique=True) + accession_code = models.TextField(unique=True) created_at = models.DateTimeField(auto_now_add=True) last_modified_at = models.DateTimeField(auto_now=True) organism = models.TextField() @@ -21,64 +21,44 @@ class Meta: def __eq__(self, other: object) -> bool: """Returns True if two objects are equal. 
Otherwise returns False.""" - return isinstance(other, GatheredAccession) and self.code == other.code + return isinstance(other, GatheredAccession) and self.accession_code == other.accession_code def __hash__(self) -> int: """Returns accession object unique hash value.""" - return hash(self.code) + return hash(self.accession_code) def __str__(self) -> str: """Returns accession default string representation.""" - return ", ".join((self.code, self.technology, self.source, str(self.published_date.date()))) - - @staticmethod - def create_from_ma_ae_entry(entry, organism=None): - """Creates accession object from MicroArray ArrayExpress entry.""" - accession = GatheredAccession() - accession.code = entry["accession"] - accession.source = "ebi_biostudies" - accession.technology = "microarray" - - if organism: - accession.organism = organism - if "release_date" in entry: - accession.published_date = timezone.make_aware( - datetime.strptime(entry["release_date"], "%Y-%m-%d") + return ", ".join( + ( + self.accession_code, + self.technology, + self.source, + str(self.published_date.date()), ) - - return accession + ) @staticmethod - def create_from_ma_geo_entry(entry): - """Creates accession object from MicroArray GEO meta DB entry.""" + def create_from_external_entry(data, source, technology, organism=None): + """Creates accession object from an external source entry.""" accession = GatheredAccession() - accession.code = entry["gse"] - accession.source = "geo_meta_db" - accession.technology = "microarray" - - if "organism" in entry: - accession.organism = entry["organism"].lower() - if "submission_date" in entry: - accession.published_date = timezone.make_aware( - datetime.strptime(entry["submission_date"], "%Y-%m-%d") + accession.accession_code = ( + data.get("accession") or data.get("gse") or data.get("secondary_study_accession") ) - return accession + organism = data.get("organism") or data.get("scientific_name") or organism + if organism: + accession.organism = organism.lower() - @staticmethod - def create_from_rnaseq_entry(entry): - """Creates accession object from RNA-Seq entry.""" - accession = GatheredAccession() - accession.code = entry["secondary_study_accession"] - accession.source = "ebi_ena_portal" - accession.technology = "rna-seq" + published_date = ( + data.get("first_public") or data.get("release_date") or data.get("submission_date") + ) + accession.published_date = timezone.make_aware( + datetime.strptime(published_date, "%Y-%m-%d") + ) - if "scientific_name" in entry: - accession.organism = entry["scientific_name"].lower() - if "first_public" in entry: - accession.published_date = timezone.make_aware( - datetime.strptime(entry["first_public"], "%Y-%m-%d") - ) + accession.source = source + accession.technology = technology return accession diff --git a/foreman/data_refinery_foreman/gatherer/agents/base.py b/foreman/data_refinery_foreman/gatherer/agents/base.py index 3754a4068..818bbf72c 100644 --- a/foreman/data_refinery_foreman/gatherer/agents/base.py +++ b/foreman/data_refinery_foreman/gatherer/agents/base.py @@ -66,7 +66,10 @@ def populate_previous_accessions(self) -> None: # Gathered accessions. self.previous_accessions.update( - (entry["code"] for entry in GatheredAccession.objects.values("code")) + ( + entry["accession_code"] + for entry in GatheredAccession.objects.values("accession_code") + ) ) # Surveyed accessions.
diff --git a/foreman/data_refinery_foreman/gatherer/agents/microarray_ae.py b/foreman/data_refinery_foreman/gatherer/agents/microarray_ae.py index 541bd86d2..3bfcf08fe 100644 --- a/foreman/data_refinery_foreman/gatherer/agents/microarray_ae.py +++ b/foreman/data_refinery_foreman/gatherer/agents/microarray_ae.py @@ -1,5 +1,7 @@ -"""MicroArray ArrayExpress accession gathering automation. -Data source: https://www.ebi.ac.uk/biostudies/help""" +""" +MicroArray ArrayExpress accession gathering automation. +Data source: https://www.ebi.ac.uk/biostudies/help +""" from typing import List, Set @@ -13,7 +15,7 @@ logger = get_and_configure_logger(__name__) -class MicroArrayExpressAccessionAgent(AccessionAgentBase): +class AEAgent(AccessionAgentBase): """ MicroArray ArrayExpress accession gathering agent. The data is fetched from the BioStudies database. See https://www.ebi.ac.uk/biostudies/help and @@ -23,6 +25,9 @@ class MicroArrayExpressAccessionAgent(AccessionAgentBase): DATA_CHUNK_SIZE = 100 DATA_URL = "https://www.ebi.ac.uk/biostudies/api/v1/search" + SOURCE = "ebi-biostudies" + SOURCE_NAME = "microarray-ae" + TECHNOLOGY = "microarray" def build_query(self) -> dict: """Returns a query dict for getting array/organism specific accessions.""" @@ -95,7 +100,9 @@ def get_response(url, **kwargs): entries = response.json().get("hits", ()) if entries: entries = ( - GatheredAccession.create_from_ma_ae_entry(entry, organism=self.organism) + GatheredAccession.create_from_external_entry( + entry, self.SOURCE, self.TECHNOLOGY, organism=self.organism + ) for entry in entries ) params["page"] += 1 @@ -103,7 +110,11 @@ def get_response(url, **kwargs): is_done = True if self.previous_accessions: - entries = (entry for entry in entries if entry.code not in self.previous_accessions) + entries = ( + entry + for entry in entries + if entry.accession_code not in self.previous_accessions + ) accessions.update(entries) # Quit after getting a sufficient amount of accessions. diff --git a/foreman/data_refinery_foreman/gatherer/agents/microarray_geo.py b/foreman/data_refinery_foreman/gatherer/agents/microarray_geo.py index 975c715b3..2500bcec5 100644 --- a/foreman/data_refinery_foreman/gatherer/agents/microarray_geo.py +++ b/foreman/data_refinery_foreman/gatherer/agents/microarray_geo.py @@ -1,5 +1,8 @@ -"""MicroArray GEO accession gathering automation. -Data source: local SQLite meta DB from https://www.bioconductor.org/packages/release/bioc/html/GEOmetadb.html""" +""" +MicroArray GEO accession gathering automation. +Data source: local SQLite meta DB from +https://www.bioconductor.org/packages/release/bioc/html/GEOmetadb.html +""" import os import re @@ -13,7 +16,7 @@ logger = get_and_configure_logger(__name__) -class MicroArrayGEOAccessionAgent(AccessionAgentBase): +class GEOAgent(AccessionAgentBase): """ MicroArray GEO accession gathering agent. The data is fetched from a local SQLite GEO meta database. @@ -23,6 +26,9 @@ class MicroArrayGEOAccessionAgent(AccessionAgentBase): # Implement syncing procedure. # Update URL once the original file is available again. 
DB_PATH = "data/microarray/GEOmetadb.sqlite" + SOURCE = "geo-meta-db" + SOURCE_NAME = "microarray-geo" + TECHNOLOGY = "microarray" def build_query(self) -> str: """Returns a query for getting GEO accessions from the local SQLite meta DB.""" @@ -101,10 +107,17 @@ def match_keyword(row): entries = filter(match_keyword, entries) entries = ({key.lower(): entry[key] for key in entry.keys()} for entry in entries) - entries = set((GatheredAccession.create_from_ma_geo_entry(entry) for entry in entries)) + entries = set( + ( + GatheredAccession.create_from_external_entry(entry, self.SOURCE, self.TECHNOLOGY) + for entry in entries + ) + ) if self.previous_accessions: - entries = (entry for entry in entries if entry.code not in self.previous_accessions) + entries = ( + entry for entry in entries if entry.accession_code not in self.previous_accessions + ) accessions.update(entries) return accessions diff --git a/foreman/data_refinery_foreman/gatherer/agents/rna_seq.py b/foreman/data_refinery_foreman/gatherer/agents/rna_seq.py index f54ba570a..577f815b8 100644 --- a/foreman/data_refinery_foreman/gatherer/agents/rna_seq.py +++ b/foreman/data_refinery_foreman/gatherer/agents/rna_seq.py @@ -1,5 +1,7 @@ -"""RNA-Seq accession gathering automation. -Data source: https://www.ebi.ac.uk/ena/portal/api/""" +""" +RNA-Seq accession gathering automation. +Data source: https://www.ebi.ac.uk/ena/portal/api/ +""" from json.decoder import JSONDecodeError from typing import List, Set @@ -15,7 +17,7 @@ logger = get_and_configure_logger(__name__) -class RNASeqAccessionAgent(AccessionAgentBase): +class RNASeqAgent(AccessionAgentBase): """ RNA-Seq accession gathering agent. The data is fetched from The European Nucleotide Archive (ENA) Portal. @@ -25,6 +27,9 @@ class RNASeqAccessionAgent(AccessionAgentBase): DATA_CHUNK_SIZE = 10000 DATA_URL = "https://www.ebi.ac.uk/ena/portal/api/search" + SOURCE = "ebi-ena-portal" + SOURCE_NAME = "rna-seq" + TECHNOLOGY = "rna-seq" def build_query(self, taxon_id: str = None) -> str: """ @@ -174,7 +179,12 @@ def get_response(url, **kwargs): try: response = get_response(self.DATA_URL, data=data) entries = response.json() - entries = (GatheredAccession.create_from_rnaseq_entry(entry) for entry in entries) + entries = ( + GatheredAccession.create_from_external_entry( + entry, self.SOURCE, self.TECHNOLOGY + ) + for entry in entries + ) except JSONDecodeError: is_done = True except TypeError: @@ -182,7 +192,11 @@ def get_response(url, **kwargs): data["offset"] += self.DATA_CHUNK_SIZE if self.previous_accessions: - entries = (entry for entry in entries if entry.code not in self.previous_accessions) + entries = ( + entry + for entry in entries + if entry.accession_code not in self.previous_accessions + ) accessions.update(entries) # Quit after getting a sufficient amount of accessions. 
diff --git a/foreman/data_refinery_foreman/gatherer/management/commands/gather_accessions.py b/foreman/data_refinery_foreman/gatherer/management/commands/gather_accessions.py index 445245d3a..2b073ef45 100644 --- a/foreman/data_refinery_foreman/gatherer/management/commands/gather_accessions.py +++ b/foreman/data_refinery_foreman/gatherer/management/commands/gather_accessions.py @@ -11,12 +11,13 @@ import re from django.core.management.base import BaseCommand +from django.db.utils import IntegrityError from data_refinery_common.logging import get_and_configure_logger from data_refinery_common.models.gathered_accession import GatheredAccession -from data_refinery_foreman.gatherer.agents.microarray_ae import MicroArrayExpressAccessionAgent -from data_refinery_foreman.gatherer.agents.microarray_geo import MicroArrayGEOAccessionAgent -from data_refinery_foreman.gatherer.agents.rna_seq import RNASeqAccessionAgent +from data_refinery_foreman.gatherer.agents.microarray_ae import AEAgent +from data_refinery_foreman.gatherer.agents.microarray_geo import GEOAgent +from data_refinery_foreman.gatherer.agents.rna_seq import RNASeqAgent logger = get_and_configure_logger(__name__) @@ -24,11 +25,8 @@ class Command(BaseCommand): """Creates agents and runs actual accession gathering.""" - DATA_SOURCE_MA_AE = "microarray-ae" - DATA_SOURCE_MA_GEO = "microarray-geo" - DATA_SOURCE_RNA_SEQ = "rna-seq" - DATA_SOURCES = (DATA_SOURCE_MA_AE, DATA_SOURCE_MA_GEO, DATA_SOURCE_RNA_SEQ) - + DATA_AGENTS = (AEAgent, GEOAgent, RNASeqAgent) + DATA_SOURCE_NAMES = [agent.SOURCE_NAME for agent in DATA_AGENTS] RE_ACCESSION = re.compile(r"(\D+)(\d+)") RE_DATE = re.compile(r"\d{4}-\d{2}-\d{2}") @@ -164,43 +162,43 @@ def validate_args(self, options) -> None: keyword = options["keyword"] organism = options["organism"] - sources = options["source"] or self.DATA_SOURCES + source_names = options["source"] or self.DATA_SOURCE_NAMES - for source in sources: - if source in self.DATA_SOURCES: + for source_name in source_names: + if source_name in self.DATA_SOURCE_NAMES: continue errors.append( - f"Unknown source: {source}. Supported sources: {', '.join(self.DATA_SOURCES)}" + f"Unknown source: {source_name}. Supported sources: {', '.join(self.DATA_SOURCE_NAMES)}" ) - if self.DATA_SOURCE_MA_AE in sources: + if AEAgent.SOURCE_NAME in source_names: ids = options["ae_id"] or options["ae_ids_file"] if not (ids or keyword or organism): errors.append( ( "Exactly one of the keyword [-k, --keyword], organism [-o, --organism] or " "ArrayExpress ID(s) [--ae-id, --ae-ids-file] must be specified for " - f"'{self.DATA_SOURCE_MA_AE}' source." + f"'{AEAgent.SOURCE_NAME}' source." ) ) - if self.DATA_SOURCE_MA_GEO in sources: + if GEOAgent.SOURCE_NAME in source_names: ids = options["gpl_id"] or options["gpl_ids_file"] if not (ids or keyword or organism): errors.append( ( "Exactly one of the keyword [-k, --keyword], organism [-o, --organism] or " "GEO platform ID(s) [--gpl-id, --gpl-ids-file] must be specified for " - f"'{self.DATA_SOURCE_MA_GEO}' source." + f"'{GEOAgent.SOURCE_NAME}' source." ) ) - if self.DATA_SOURCE_RNA_SEQ in sources: + if RNASeqAgent.SOURCE_NAME in source_names: ids = options["taxon_id"] or options["taxon_ids_file"] if not (ids or keyword or organism): errors.append( ( "Exactly one of the keyword [-k, --keyword], organism [-o, --organism] " "or taxon ID(s) [--taxon-id, --taxon-ids-file] must be specified for " - f"'{self.DATA_SOURCE_RNA_SEQ}' source." + f"'{RNASeqAgent.SOURCE_NAME}' source." 
) ) @@ -213,26 +211,21 @@ def handle(self, *args, **options): self.set_verbosity_level(options) agents = list() - sources = options["source"] or self.DATA_SOURCES - - if self.DATA_SOURCE_RNA_SEQ in sources: - agents.append(RNASeqAccessionAgent(options)) - - if self.DATA_SOURCE_MA_AE in sources: - agents.append(MicroArrayExpressAccessionAgent(options)) - - if self.DATA_SOURCE_MA_GEO in sources: - agents.append(MicroArrayGEOAccessionAgent(options)) + sources_names = options["source"] or self.DATA_SOURCE_NAMES + for cls in self.DATA_AGENTS: + if cls.SOURCE_NAME not in sources_names: + continue + agents.append(cls(options)) entries = set() for agent in agents: entries.update(agent.collect_data()) entries = sorted( # Sort the resulting list. - (entry for entry in entries if self.RE_ACCESSION.match(entry.code)), + (entry for entry in entries if self.RE_ACCESSION.match(entry.accession_code)), key=lambda entry: ( - self.RE_ACCESSION.match(entry.code).group(1), - int(self.RE_ACCESSION.match(entry.code).group(2)), + self.RE_ACCESSION.match(entry.accession_code).group(1), + int(self.RE_ACCESSION.match(entry.accession_code).group(2)), ), ) # Limit the number of output entries. @@ -245,4 +238,7 @@ def handle(self, *args, **options): output = "No accessions found." print(output) else: - GatheredAccession.objects.bulk_create(entries) + try: + GatheredAccession.objects.bulk_create(entries) + except IntegrityError as e: + logger.exception(f"Could not save new accessions to the database: {e}") From 6d3e17988765f045a69ebac8618a6fffad55a907 Mon Sep 17 00:00:00 2001 From: Arkadii Yakovets Date: Wed, 21 Sep 2022 11:05:24 -0700 Subject: [PATCH 18/24] Rename agent files. --- .../gatherer/agents/{microarray_ae.py => ae_agent.py} | 0 .../gatherer/agents/{microarray_geo.py => geo_agent.py} | 0 .../gatherer/agents/{rna_seq.py => rnaseq_agent.py} | 0 .../gatherer/management/commands/gather_accessions.py | 6 +++--- 4 files changed, 3 insertions(+), 3 deletions(-) rename foreman/data_refinery_foreman/gatherer/agents/{microarray_ae.py => ae_agent.py} (100%) rename foreman/data_refinery_foreman/gatherer/agents/{microarray_geo.py => geo_agent.py} (100%) rename foreman/data_refinery_foreman/gatherer/agents/{rna_seq.py => rnaseq_agent.py} (100%) diff --git a/foreman/data_refinery_foreman/gatherer/agents/microarray_ae.py b/foreman/data_refinery_foreman/gatherer/agents/ae_agent.py similarity index 100% rename from foreman/data_refinery_foreman/gatherer/agents/microarray_ae.py rename to foreman/data_refinery_foreman/gatherer/agents/ae_agent.py diff --git a/foreman/data_refinery_foreman/gatherer/agents/microarray_geo.py b/foreman/data_refinery_foreman/gatherer/agents/geo_agent.py similarity index 100% rename from foreman/data_refinery_foreman/gatherer/agents/microarray_geo.py rename to foreman/data_refinery_foreman/gatherer/agents/geo_agent.py diff --git a/foreman/data_refinery_foreman/gatherer/agents/rna_seq.py b/foreman/data_refinery_foreman/gatherer/agents/rnaseq_agent.py similarity index 100% rename from foreman/data_refinery_foreman/gatherer/agents/rna_seq.py rename to foreman/data_refinery_foreman/gatherer/agents/rnaseq_agent.py diff --git a/foreman/data_refinery_foreman/gatherer/management/commands/gather_accessions.py b/foreman/data_refinery_foreman/gatherer/management/commands/gather_accessions.py index 2b073ef45..554b74350 100644 --- a/foreman/data_refinery_foreman/gatherer/management/commands/gather_accessions.py +++ b/foreman/data_refinery_foreman/gatherer/management/commands/gather_accessions.py @@ -15,9 +15,9 @@ 
from data_refinery_common.logging import get_and_configure_logger from data_refinery_common.models.gathered_accession import GatheredAccession -from data_refinery_foreman.gatherer.agents.microarray_ae import AEAgent -from data_refinery_foreman.gatherer.agents.microarray_geo import GEOAgent -from data_refinery_foreman.gatherer.agents.rna_seq import RNASeqAgent +from data_refinery_foreman.gatherer.agents.ae_agent import AEAgent +from data_refinery_foreman.gatherer.agents.geo_agent import GEOAgent +from data_refinery_foreman.gatherer.agents.rnaseq_agent import RNASeqAgent logger = get_and_configure_logger(__name__) From 2f6ff4a36d263433241b75629bef7ad041227652 Mon Sep 17 00:00:00 2001 From: Arkadii Yakovets Date: Thu, 17 Nov 2022 14:46:30 -0800 Subject: [PATCH 19/24] Workers Docker images refactoring. --- common/{ => R}/install_devtools.R | 12 +- .../dependencies}/README.md | 0 .../affymetrix/bioc/dependencies.R | 221 ++++++++++++++++++ .../affymetrix/cran}/dependencies.R | 0 .../affymetrix/cran}/packages.txt | 0 .../affymetrix/cran}/versions.tsv | 0 .../affymetrix/install_affy_only.R | 38 +++ .../affymetrix}/install_ensg_pkgs.R | 8 +- .../illumina/bioc/dependencies.R} | 0 .../illumina/cran}/dependencies.R | 0 .../dependencies/illumina/cran}/packages.txt | 0 .../dependencies/illumina/cran}/versions.tsv | 0 workers/{ => R/dependencies}/install_bioc.R | 10 +- .../dependencies}/install_downloader_R_only.R | 14 +- .../dependencies/no_op/cran}/dependencies.R | 0 .../dependencies/no_op/cran}/packages.txt | 0 .../dependencies/no_op/cran}/versions.tsv | 0 .../dependencies/no_op/install_gene_convert.R | 43 ++++ .../dependencies/qn/bioc/dependencies.R} | 12 +- .../dependencies/qn/cran}/dependencies.R | 0 .../dependencies/qn/cran}/packages.txt | 0 .../dependencies/qn/cran}/versions.tsv | 0 .../dependencies/tximport/cran/dependencies.R | 25 ++ .../dependencies/tximport/cran}/packages.txt | 0 .../dependencies/tximport/cran}/versions.tsv | 0 .../dependencies/tximport/install_tximport.R | 7 + .../R_dependencies/tximport/dependencies.R | 25 -- workers/affymetrix_dependencies.R | 220 ----------------- workers/ccache.conf | 5 + workers/dockerfiles/Dockerfile.affymetrix | 90 ++----- .../dockerfiles/Dockerfile.affymetrix_local | 12 +- workers/dockerfiles/Dockerfile.compendia | 154 ++++++------ workers/dockerfiles/Dockerfile.downloaders | 87 ++----- workers/dockerfiles/Dockerfile.illumina | 80 +------ workers/dockerfiles/Dockerfile.no_op | 90 ++----- workers/dockerfiles/Dockerfile.salmon | 114 +++------ workers/dockerfiles/Dockerfile.smasher | 90 ++----- workers/dockerfiles/Dockerfile.transcriptome | 90 ++----- workers/dockerfiles/Dockerfile.worker_base | 68 ++++++ workers/install_affy_only.R | 38 --- workers/install_gene_convert.R | 43 ---- workers/install_tximport.R | 7 - 42 files changed, 650 insertions(+), 953 deletions(-) rename common/{ => R}/install_devtools.R (95%) rename workers/{R_dependencies => R/dependencies}/README.md (100%) create mode 100644 workers/R/dependencies/affymetrix/bioc/dependencies.R rename workers/{R_dependencies/affymetrix => R/dependencies/affymetrix/cran}/dependencies.R (100%) rename workers/{R_dependencies/affymetrix => R/dependencies/affymetrix/cran}/packages.txt (100%) rename workers/{R_dependencies/affymetrix => R/dependencies/affymetrix/cran}/versions.tsv (100%) create mode 100644 workers/R/dependencies/affymetrix/install_affy_only.R rename workers/{ => R/dependencies/affymetrix}/install_ensg_pkgs.R (89%) rename workers/{illumina_dependencies.R => 
R/dependencies/illumina/bioc/dependencies.R} (100%) rename workers/{R_dependencies/illumina => R/dependencies/illumina/cran}/dependencies.R (100%) rename workers/{R_dependencies/illumina => R/dependencies/illumina/cran}/packages.txt (100%) rename workers/{R_dependencies/illumina => R/dependencies/illumina/cran}/versions.tsv (100%) rename workers/{ => R/dependencies}/install_bioc.R (58%) rename workers/{ => R/dependencies}/install_downloader_R_only.R (54%) rename workers/{R_dependencies/no_op => R/dependencies/no_op/cran}/dependencies.R (100%) rename workers/{R_dependencies/no_op => R/dependencies/no_op/cran}/packages.txt (100%) rename workers/{R_dependencies/no_op => R/dependencies/no_op/cran}/versions.tsv (100%) create mode 100644 workers/R/dependencies/no_op/install_gene_convert.R rename workers/{qn_dependencies.R => R/dependencies/qn/bioc/dependencies.R} (61%) rename workers/{R_dependencies/qn => R/dependencies/qn/cran}/dependencies.R (100%) rename workers/{R_dependencies/qn => R/dependencies/qn/cran}/packages.txt (100%) rename workers/{R_dependencies/qn => R/dependencies/qn/cran}/versions.tsv (100%) create mode 100644 workers/R/dependencies/tximport/cran/dependencies.R rename workers/{R_dependencies/tximport => R/dependencies/tximport/cran}/packages.txt (100%) rename workers/{R_dependencies/tximport => R/dependencies/tximport/cran}/versions.tsv (100%) create mode 100644 workers/R/dependencies/tximport/install_tximport.R delete mode 100644 workers/R_dependencies/tximport/dependencies.R delete mode 100644 workers/affymetrix_dependencies.R create mode 100644 workers/ccache.conf create mode 100644 workers/dockerfiles/Dockerfile.worker_base delete mode 100644 workers/install_affy_only.R delete mode 100644 workers/install_gene_convert.R delete mode 100644 workers/install_tximport.R diff --git a/common/install_devtools.R b/common/R/install_devtools.R similarity index 95% rename from common/install_devtools.R rename to common/R/install_devtools.R index 678418f19..e241152ea 100644 --- a/common/install_devtools.R +++ b/common/R/install_devtools.R @@ -16,9 +16,9 @@ # Cranlock was used to find the versions of dependencies to install # Treat warnings as errors, set CRAN mirror, and set parallelization: -options(warn=2) -options(repos=structure(c(CRAN="https://cloud.r-project.org/"))) -options(Ncpus=parallel::detectCores()) +options(warn = 2) +options(repos = structure(c(CRAN = "https://cloud.r-project.org/"))) +options(Ncpus = parallel::detectCores()) install_package_version <- function(package_name, version) { @@ -31,18 +31,18 @@ install_package_version <- function(package_name, version) { package_url <- paste0("https://cloud.r-project.org/src/contrib/", package_tarball) # Give CRAN a full minute to timeout since it's not always the most reliable. - curl_result <- system(paste0("curl --head --connect-timeout 60 ", package_url), intern=TRUE) + curl_result <- system(paste0("curl --head --connect-timeout 60 ", package_url), intern = TRUE) if (grepl("404", curl_result[1])) { package_url <- paste0("https://cloud.r-project.org/src/contrib/Archive/", package_name, "/", package_tarball) # Make sure the package actually exists in the archive! 
- curl_result <- system(paste0("curl --head --connect-timeout 120 ", package_url), intern=TRUE) + curl_result <- system(paste0("curl --head --connect-timeout 120 ", package_url), intern = TRUE) if (grepl("404", curl_result[1])) { stop(paste("Package", package_name, "version", version, "does not exist!")) } } - install.packages(package_url) + install.packages(package_url, Ncpus = 32) } # Generated using cranlock diff --git a/workers/R_dependencies/README.md b/workers/R/dependencies/README.md similarity index 100% rename from workers/R_dependencies/README.md rename to workers/R/dependencies/README.md diff --git a/workers/R/dependencies/affymetrix/bioc/dependencies.R b/workers/R/dependencies/affymetrix/bioc/dependencies.R new file mode 100644 index 000000000..1570292e0 --- /dev/null +++ b/workers/R/dependencies/affymetrix/bioc/dependencies.R @@ -0,0 +1,221 @@ +# Turn warnings into errors because biocLite throws warnings instead # of error if it fails to install something. +options(warn = 2) +options(repos = structure(c(CRAN = "https://cloud.r-project.org"))) +options(Ncpus = parallel::detectCores()) + +# Bioconductor packages, installed by devtools::install_url() + +# Helper function that installs a list of packages using the input URLs +install_with_url <- function(urls) { + pkg_ids <- devtools::install_url(urls) + if (any(is.na(pkg_ids))) { + pkg_fails <- paste(urls[is.na(pkg_ids)], collapse = "; ") + stop(paste("Failed to install package(s):", pkg_fails)) + } + return(pkg_ids) +} + +devtools::install_version("dplyr", version = "1.0.2") +devtools::install_version("locfit", version = "1.5-9.4") + +bioc_pkg_urls <- c( + "https://bioconductor.org/packages/3.11/bioc/src/contrib/oligoClasses_1.50.4.tar.gz", + "https://bioconductor.org/packages/3.11/bioc/src/contrib/oligo_1.52.1.tar.gz", + "https://bioconductor.org/packages/3.11/bioc/src/contrib/GEOquery_2.56.0.tar.gz", + "https://bioconductor.org/packages/3.11/bioc/src/contrib/SCAN.UPC_2.30.0.tar.gz", + "https://bioconductor.org/packages/3.11/bioc/src/contrib/affy_1.66.0.tar.gz", + "https://bioconductor.org/packages/3.11/bioc/src/contrib/affyio_1.58.0.tar.gz", + "https://bioconductor.org/packages/3.11/bioc/src/contrib/AnnotationDbi_1.50.3.tar.gz", + "https://bioconductor.org/packages/3.11/bioc/src/contrib/zlibbioc_1.34.0.tar.gz", + "https://bioconductor.org/packages/3.11/bioc/src/contrib/preprocessCore_1.50.0.tar.gz", + "https://bioconductor.org/packages/3.11/bioc/src/contrib/genefilter_1.70.0.tar.gz", + "https://bioconductor.org/packages/3.11/bioc/src/contrib/sva_3.36.0.tar.gz", + "https://bioconductor.org/packages/3.11/bioc/src/contrib/limma_3.44.3.tar.gz" +) +install_with_url(bioc_pkg_urls) + +# Invoke another R script to install BrainArray ensg packages +source("install_ensg_pkgs.R") + +# Install Bioconductor platform design (pd) packages +pd_experiment_pkgs <- c( + "https://bioconductor.org/packages/3.11/data/experiment/src/contrib/pd.atdschip.tiling_0.26.0.tar.gz" +) +install_with_url(pd_experiment_pkgs) + +pd_annotation_pkgs <- c( + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.081229.hg18.promoter.medip.hx1_0.99.4.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.2006.07.18.hg18.refseq.promoter_1.8.1.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.2006.07.18.mm8.refseq.promoter_0.99.3.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.2006.10.31.rn34.refseq.promoter_0.99.3.tar.gz", + 
"https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.ag_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.aragene.1.0.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.aragene.1.1.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.ath1.121501_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.barley1_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.bovgene.1.0.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.bovgene.1.1.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.bovine_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.bsubtilis_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.cangene.1.0.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.cangene.1.1.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.canine_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.canine.2_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.celegans_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.charm.hg18.example_0.99.4.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.chicken_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.chigene.1.0.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.chigene.1.1.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.chogene.2.0.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.chogene.2.1.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.citrus_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.clariom.d.human_3.14.1.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.clariom.s.human_3.14.1.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.clariom.s.human.ht_3.14.1.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.clariom.s.mouse_3.14.1.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.clariom.s.mouse.ht_3.14.1.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.clariom.s.rat_3.14.1.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.clariom.s.rat.ht_3.14.1.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.cotton_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.cyngene.1.0.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.cyngene.1.1.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.cyrgene.1.0.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.cyrgene.1.1.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.cytogenetics.array_3.12.0.tar.gz", + 
"https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.drogene.1.0.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.drogene.1.1.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.drosgenome1_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.drosophila.2_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.e.coli.2_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.ecoli_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.ecoli.asv2_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.elegene.1.0.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.elegene.1.1.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.equgene.1.0.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.equgene.1.1.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.feinberg.hg18.me.hx1_0.99.3.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.feinberg.mm8.me.hx1_0.99.3.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.felgene.1.0.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.felgene.1.1.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.fingene.1.0.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.fingene.1.1.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.genomewidesnp.5_3.14.1.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.genomewidesnp.6_3.14.1.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.guigene.1.0.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.guigene.1.1.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hc.g110_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hg.focus_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hg.u133.plus.2_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hg.u133a_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hg.u133a.2_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hg.u133a.tag_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hg.u133b_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hg.u219_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hg.u95a_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hg.u95av2_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hg.u95b_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hg.u95c_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hg.u95d_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hg.u95e_3.12.0.tar.gz", + 
"https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hg18.60mer.expr_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.ht.hg.u133.plus.pm_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.ht.hg.u133a_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.ht.mg.430a_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hta.2.0_3.12.2.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hu6800_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.huex.1.0.st.v2_3.14.1.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hugene.1.0.st.v1_3.14.1.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hugene.1.1.st.v1_3.14.1.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hugene.2.0.st_3.14.1.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hugene.2.1.st_3.14.1.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.maize_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mapping250k.nsp_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mapping250k.sty_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mapping50k.hind240_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mapping50k.xba240_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.margene.1.0.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.margene.1.1.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.medgene.1.0.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.medgene.1.1.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.medicago_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mg.u74a_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mg.u74av2_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mg.u74b_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mg.u74bv2_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mg.u74c_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mg.u74cv2_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mirna.1.0_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mirna.2.0_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mirna.3.0_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mirna.3.1_3.8.1.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mirna.4.0_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.moe430a_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.moe430b_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.moex.1.0.st.v1_3.14.1.tar.gz", + 
"https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mogene.1.0.st.v1_3.14.1.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mogene.1.1.st.v1_3.14.1.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mogene.2.0.st_3.14.1.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mogene.2.1.st_3.14.1.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mouse430.2_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mouse430a.2_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mta.1.0_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mu11ksuba_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mu11ksubb_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.nugo.hs1a520180_3.4.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.nugo.mm1a520177_3.4.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.ovigene.1.0.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.ovigene.1.1.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.pae.g1a_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.plasmodium.anopheles_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.poplar_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.porcine_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.porgene.1.0.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.porgene.1.1.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rabgene.1.0.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rabgene.1.1.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rae230a_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rae230b_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.raex.1.0.st.v1_3.14.1.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.ragene.1.0.st.v1_3.14.1.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.ragene.1.1.st.v1_3.14.1.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.ragene.2.0.st_3.14.1.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.ragene.2.1.st_3.14.1.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rat230.2_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rcngene.1.0.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rcngene.1.1.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rg.u34a_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rg.u34b_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rg.u34c_3.12.0.tar.gz", + 
"https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rhegene.1.0.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rhegene.1.1.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rhesus_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rice_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rjpgene.1.0.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rjpgene.1.1.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rn.u34_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rta.1.0_3.12.2.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rusgene.1.0.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rusgene.1.1.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.s.aureus_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.soybean_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.soygene.1.0.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.soygene.1.1.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.sugar.cane_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.tomato_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.u133.x3p_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.vitis.vinifera_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.wheat_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.x.laevis.2_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.x.tropicalis_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.xenopus.laevis_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.yeast.2_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.yg.s98_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.zebgene.1.0.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.zebgene.1.1.st_3.12.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.zebrafish_3.12.0.tar.gz" +) +install_with_url(pd_annotation_pkgs) + +# Load this libraries because apparently just installing it isn't +# enough to verify that the correct versions of dependencies are installed. 
+library("foreach") diff --git a/workers/R_dependencies/affymetrix/dependencies.R b/workers/R/dependencies/affymetrix/cran/dependencies.R similarity index 100% rename from workers/R_dependencies/affymetrix/dependencies.R rename to workers/R/dependencies/affymetrix/cran/dependencies.R diff --git a/workers/R_dependencies/affymetrix/packages.txt b/workers/R/dependencies/affymetrix/cran/packages.txt similarity index 100% rename from workers/R_dependencies/affymetrix/packages.txt rename to workers/R/dependencies/affymetrix/cran/packages.txt diff --git a/workers/R_dependencies/affymetrix/versions.tsv b/workers/R/dependencies/affymetrix/cran/versions.tsv similarity index 100% rename from workers/R_dependencies/affymetrix/versions.tsv rename to workers/R/dependencies/affymetrix/cran/versions.tsv diff --git a/workers/R/dependencies/affymetrix/install_affy_only.R b/workers/R/dependencies/affymetrix/install_affy_only.R new file mode 100644 index 000000000..2bce94bc4 --- /dev/null +++ b/workers/R/dependencies/affymetrix/install_affy_only.R @@ -0,0 +1,38 @@ +# Turn warnings into errors because biocLite throws warnings instead +# of error if it fails to install something. +options(warn = 2) +options(repos = structure(c(CRAN = "https://cloud.r-project.org"))) +options(Ncpus = parallel::detectCores()) + +# Use devtools::install_version() to install packages in cran. +devtools::install_version("dplyr", version = "1.0.0") +devtools::install_version("tidyr", version = "1.1.0") +devtools::install_version("ff", version = "2.2-14") +devtools::install_version("locfit", version = "1.5-9.4") + +# Helper function that installs a list of packages using the input URLs +install_with_url <- function(urls) { + pkg_ids <- devtools::install_url(urls) + if (any(is.na(pkg_ids))) { + pkg_fails <- paste(urls[is.na(pkg_ids)], collapse = "; ") + stop(paste("Failed to install package(s):", pkg_fails)) + } + return(pkg_ids) +} + +bioc_pkgs <- c( + "https://bioconductor.org/packages/3.11/bioc/src/contrib/oligoClasses_1.50.4.tar.gz", + "https://bioconductor.org/packages/3.11/bioc/src/contrib/oligo_1.52.1.tar.gz", + "https://bioconductor.org/packages/3.11/bioc/src/contrib/GEOquery_2.56.0.tar.gz", + "https://bioconductor.org/packages/3.11/bioc/src/contrib/SCAN.UPC_2.30.0.tar.gz", + "https://bioconductor.org/packages/3.11/bioc/src/contrib/affy_1.66.0.tar.gz", + "https://bioconductor.org/packages/3.11/bioc/src/contrib/affyio_1.58.0.tar.gz", + "https://bioconductor.org/packages/3.11/bioc/src/contrib/AnnotationDbi_1.50.3.tar.gz", + "https://bioconductor.org/packages/3.11/bioc/src/contrib/zlibbioc_1.34.0.tar.gz", + "https://bioconductor.org/packages/3.11/bioc/src/contrib/preprocessCore_1.50.0.tar.gz", + "https://bioconductor.org/packages/3.11/bioc/src/contrib/genefilter_1.70.0.tar.gz", + "https://bioconductor.org/packages/3.11/bioc/src/contrib/sva_3.36.0.tar.gz", + "https://bioconductor.org/packages/3.11/bioc/src/contrib/tximport_1.16.1.tar.gz", + "https://bioconductor.org/packages/3.11/bioc/src/contrib/limma_3.44.3.tar.gz" +) +install_with_url(bioc_pkgs) diff --git a/workers/install_ensg_pkgs.R b/workers/R/dependencies/affymetrix/install_ensg_pkgs.R similarity index 89% rename from workers/install_ensg_pkgs.R rename to workers/R/dependencies/affymetrix/install_ensg_pkgs.R index 3f3b5bf08..5de25a262 100644 --- a/workers/install_ensg_pkgs.R +++ b/workers/R/dependencies/affymetrix/install_ensg_pkgs.R @@ -1,4 +1,4 @@ -options(Ncpus=parallel::detectCores()) +options(Ncpus = parallel::detectCores()) install.packages("xml2") library("xml2") 
ensg_url <- "http://brainarray.mbni.med.umich.edu/Brainarray/Database/CustomCDF/22.0.0/ensg.asp" @@ -39,8 +39,10 @@ lapply(data_rows, save_chip_pkg) # Write chips and pkg_urls to a tab-delimited file output_filename <- "/home/user/r_ensg_probe_pkgs.txt" -write.table(list(chips, pkg_urls), file=output_filename, quote=FALSE, - row.names=FALSE, col.names=FALSE, sep="\t") +write.table(list(chips, pkg_urls), + file = output_filename, quote = FALSE, + row.names = FALSE, col.names = FALSE, sep = "\t" +) # Install these ensg packages lapply(pkg_urls, devtools::install_url) diff --git a/workers/illumina_dependencies.R b/workers/R/dependencies/illumina/bioc/dependencies.R similarity index 100% rename from workers/illumina_dependencies.R rename to workers/R/dependencies/illumina/bioc/dependencies.R diff --git a/workers/R_dependencies/illumina/dependencies.R b/workers/R/dependencies/illumina/cran/dependencies.R similarity index 100% rename from workers/R_dependencies/illumina/dependencies.R rename to workers/R/dependencies/illumina/cran/dependencies.R diff --git a/workers/R_dependencies/illumina/packages.txt b/workers/R/dependencies/illumina/cran/packages.txt similarity index 100% rename from workers/R_dependencies/illumina/packages.txt rename to workers/R/dependencies/illumina/cran/packages.txt diff --git a/workers/R_dependencies/illumina/versions.tsv b/workers/R/dependencies/illumina/cran/versions.tsv similarity index 100% rename from workers/R_dependencies/illumina/versions.tsv rename to workers/R/dependencies/illumina/cran/versions.tsv diff --git a/workers/install_bioc.R b/workers/R/dependencies/install_bioc.R similarity index 58% rename from workers/install_bioc.R rename to workers/R/dependencies/install_bioc.R index c51d36988..3eb5aa29a 100644 --- a/workers/install_bioc.R +++ b/workers/R/dependencies/install_bioc.R @@ -1,12 +1,12 @@ # Turn warnings into errors because biocLite throws warnings instead # of error if it fails to install something. -options(warn=2) -options(repos=structure(c(CRAN="https://cloud.r-project.org"))) -options(Ncpus=parallel::detectCores()) +options(warn = 2) +options(repos = structure(c(CRAN = "https://cloud.r-project.org"))) +options(Ncpus = parallel::detectCores()) # Use devtools::install_version() to install packages in cran. -devtools::install_version('dplyr', version='1.0.0') -devtools::install_version('tidyr', version='1.1.0') +devtools::install_version("dplyr", version = "1.0.0") +devtools::install_version("tidyr", version = "1.1.0") # devtools::install_url() requires BiocInstaller # install.packages('https://bioconductor.org/packages/3.6/bioc/src/contrib/BiocInstaller_1.28.0.tar.gz') diff --git a/workers/install_downloader_R_only.R b/workers/R/dependencies/install_downloader_R_only.R similarity index 54% rename from workers/install_downloader_R_only.R rename to workers/R/dependencies/install_downloader_R_only.R index 02feb6275..85eb866b1 100644 --- a/workers/install_downloader_R_only.R +++ b/workers/R/dependencies/install_downloader_R_only.R @@ -1,8 +1,8 @@ # Turn warnings into errors because biocLite throws warnings instead # of error if it fails to install something. 
-options(warn=2) -options(repos=structure(c(CRAN="https://cloud.r-project.org"))) -options(Ncpus=parallel::detectCores()) +options(warn = 2) +options(repos = structure(c(CRAN = "https://cloud.r-project.org"))) +options(Ncpus = parallel::detectCores()) # Bioconductor packages, installed by devtools::install_url() @@ -10,15 +10,15 @@ options(Ncpus=parallel::detectCores()) # Helper function that installs a list of packages using the input URLs install_with_url <- function(urls) { pkg_ids <- devtools::install_url(urls) - if(any(is.na(pkg_ids))) { + if (any(is.na(pkg_ids))) { pkg_fails <- paste(urls[is.na(pkg_ids)], collapse = "; ") - stop(paste("Failed to install package(s):", pkg_fails )) + stop(paste("Failed to install package(s):", pkg_fails)) } return(pkg_ids) } bioc_pkgs <- c( - 'https://bioconductor.org/packages/3.11/bioc/src/contrib/affyio_1.58.0.tar.gz', - 'https://bioconductor.org/packages/3.11/bioc/src/contrib/zlibbioc_1.34.0.tar.gz' + "https://bioconductor.org/packages/3.11/bioc/src/contrib/affyio_1.58.0.tar.gz", + "https://bioconductor.org/packages/3.11/bioc/src/contrib/zlibbioc_1.34.0.tar.gz" ) install_with_url(bioc_pkgs) diff --git a/workers/R_dependencies/no_op/dependencies.R b/workers/R/dependencies/no_op/cran/dependencies.R similarity index 100% rename from workers/R_dependencies/no_op/dependencies.R rename to workers/R/dependencies/no_op/cran/dependencies.R diff --git a/workers/R_dependencies/no_op/packages.txt b/workers/R/dependencies/no_op/cran/packages.txt similarity index 100% rename from workers/R_dependencies/no_op/packages.txt rename to workers/R/dependencies/no_op/cran/packages.txt diff --git a/workers/R_dependencies/no_op/versions.tsv b/workers/R/dependencies/no_op/cran/versions.tsv similarity index 100% rename from workers/R_dependencies/no_op/versions.tsv rename to workers/R/dependencies/no_op/cran/versions.tsv diff --git a/workers/R/dependencies/no_op/install_gene_convert.R b/workers/R/dependencies/no_op/install_gene_convert.R new file mode 100644 index 000000000..2ad4afb84 --- /dev/null +++ b/workers/R/dependencies/no_op/install_gene_convert.R @@ -0,0 +1,43 @@ +# Turn warnings into errors because biocLite throws warnings instead +# of error if it fails to install something. 
+options(warn = 2) +options(Ncpus = parallel::detectCores()) +options(repos = structure(c(CRAN = "https://cloud.r-project.org"))) + +# Helper function that installs a list of packages using the input URLs +install_with_url <- function(urls) { + pkg_ids <- devtools::install_url(urls) + if (any(is.na(pkg_ids))) { + pkg_fails <- paste(urls[is.na(pkg_ids)], collapse = "; ") + stop(paste("Failed to install package(s):", pkg_fails)) + } + return(pkg_ids) +} + +devtools::install_version("dplyr", version = "1.0.2") + +bioc_pkgs <- c( + "https://bioconductor.org/packages/3.11/bioc/src/contrib/AnnotationDbi_1.50.3.tar.gz" +) +install_with_url(bioc_pkgs) + +illumina_pkgs <- c( + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/illuminaHumanv1.db_1.26.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/illuminaHumanv2.db_1.26.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/illuminaHumanv3.db_1.26.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/illuminaHumanv4.db_1.26.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/illuminaMousev1.db_1.26.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/illuminaMousev1p1.db_1.26.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/illuminaMousev2.db_1.26.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/illuminaRatv1.db_1.26.0.tar.gz" +) +install_with_url(illumina_pkgs) + +# Load these libraries because apparently just installing them isn't +# enough to verify that they have complementary versions. +library("optparse") +library(data.table) +library("dplyr") +library("rlang") +library(lazyeval) +library(AnnotationDbi) diff --git a/workers/qn_dependencies.R b/workers/R/dependencies/qn/bioc/dependencies.R similarity index 61% rename from workers/qn_dependencies.R rename to workers/R/dependencies/qn/bioc/dependencies.R index 8238bb7a4..2abd45f49 100644 --- a/workers/qn_dependencies.R +++ b/workers/R/dependencies/qn/bioc/dependencies.R @@ -1,19 +1,19 @@ -options(warn=2) -options(repos=structure(c(CRAN="https://cloud.r-project.org"))) -options(Ncpus=parallel::detectCores()) +options(warn = 2) +options(repos = structure(c(CRAN = "https://cloud.r-project.org"))) +options(Ncpus = parallel::detectCores()) # Helper function that installs a list of packages using the input URLs install_with_url <- function(urls) { pkg_ids <- devtools::install_url(urls) - if(any(is.na(pkg_ids))) { + if (any(is.na(pkg_ids))) { pkg_fails <- paste(urls[is.na(pkg_ids)], collapse = "; ") - stop(paste("Failed to install package(s):", pkg_fails )) + stop(paste("Failed to install package(s):", pkg_fails)) } return(pkg_ids) } bioc_pkgs <- c( - 'https://bioconductor.org/packages/3.11/bioc/src/contrib/preprocessCore_1.50.0.tar.gz' + "https://bioconductor.org/packages/3.11/bioc/src/contrib/preprocessCore_1.50.0.tar.gz" ) install_with_url(bioc_pkgs) diff --git a/workers/R_dependencies/qn/dependencies.R b/workers/R/dependencies/qn/cran/dependencies.R similarity index 100% rename from workers/R_dependencies/qn/dependencies.R rename to workers/R/dependencies/qn/cran/dependencies.R diff --git a/workers/R_dependencies/qn/packages.txt b/workers/R/dependencies/qn/cran/packages.txt similarity index 100% rename from workers/R_dependencies/qn/packages.txt rename to workers/R/dependencies/qn/cran/packages.txt diff --git a/workers/R_dependencies/qn/versions.tsv 
b/workers/R/dependencies/qn/cran/versions.tsv similarity index 100% rename from workers/R_dependencies/qn/versions.tsv rename to workers/R/dependencies/qn/cran/versions.tsv diff --git a/workers/R/dependencies/tximport/cran/dependencies.R b/workers/R/dependencies/tximport/cran/dependencies.R new file mode 100644 index 000000000..02c5ed011 --- /dev/null +++ b/workers/R/dependencies/tximport/cran/dependencies.R @@ -0,0 +1,25 @@ +# Generated from cranlock +options(warn = 2) +options(Ncpus = parallel::detectCores()) +options(repos = structure(c(CRAN = "https://cloud.r-project.org"))) +devtools::install_version("getopt", version = "1.20.3") +devtools::install_version("optparse", version = "1.4.4") +devtools::install_version("rjson", version = "0.2.19") +devtools::install_version("R6", version = "2.4.0") +devtools::install_version("pkgconfig", version = "2.0.2") +devtools::install_version("rlang", version = "0.4.0") +devtools::install_version("zeallot", version = "0.1.0") +devtools::install_version("backports", version = "1.1.4") +devtools::install_version("glue", version = "1.3.1") +devtools::install_version("digest", version = "0.6.19") +devtools::install_version("vctrs", version = "0.1.0") +devtools::install_version("hms", version = "0.4.2") +devtools::install_version("Rcpp", version = "1.0.1") +devtools::install_version("assertthat", version = "0.2.1") +devtools::install_version("crayon", version = "1.3.4") +devtools::install_version("cli", version = "1.1.0") +devtools::install_version("utf8", version = "1.1.4") +devtools::install_version("fansi", version = "0.4.0") +devtools::install_version("pillar", version = "1.4.2") +devtools::install_version("tibble", version = "2.1.3") +devtools::install_version("readr", version = "1.1.1") diff --git a/workers/R_dependencies/tximport/packages.txt b/workers/R/dependencies/tximport/cran/packages.txt similarity index 100% rename from workers/R_dependencies/tximport/packages.txt rename to workers/R/dependencies/tximport/cran/packages.txt diff --git a/workers/R_dependencies/tximport/versions.tsv b/workers/R/dependencies/tximport/cran/versions.tsv similarity index 100% rename from workers/R_dependencies/tximport/versions.tsv rename to workers/R/dependencies/tximport/cran/versions.tsv diff --git a/workers/R/dependencies/tximport/install_tximport.R b/workers/R/dependencies/tximport/install_tximport.R new file mode 100644 index 000000000..f28c97538 --- /dev/null +++ b/workers/R/dependencies/tximport/install_tximport.R @@ -0,0 +1,7 @@ +# Turn warnings into errors because biocLite throws warnings instead +# of error if it fails to install something. 
+options(warn = 2) +options(Ncpus = parallel::detectCores()) +options(repos = structure(c(CRAN = "https://cloud.r-project.org"))) + +devtools::install_url("https://bioconductor.org/packages/3.11/bioc/src/contrib/tximport_1.16.1.tar.gz") diff --git a/workers/R_dependencies/tximport/dependencies.R b/workers/R_dependencies/tximport/dependencies.R deleted file mode 100644 index 62fce5df5..000000000 --- a/workers/R_dependencies/tximport/dependencies.R +++ /dev/null @@ -1,25 +0,0 @@ -# Generated from cranlock -options(warn=2) -options(Ncpus=parallel::detectCores()) -options(repos=structure(c(CRAN="https://cloud.r-project.org"))) -devtools::install_version('getopt', version='1.20.3') -devtools::install_version('optparse', version='1.4.4') -devtools::install_version('rjson', version='0.2.19') -devtools::install_version('R6', version='2.4.0') -devtools::install_version('pkgconfig', version='2.0.2') -devtools::install_version('rlang', version='0.4.0') -devtools::install_version('zeallot', version='0.1.0') -devtools::install_version('backports', version='1.1.4') -devtools::install_version('glue', version='1.3.1') -devtools::install_version('digest', version='0.6.19') -devtools::install_version('vctrs', version='0.1.0') -devtools::install_version('hms', version='0.4.2') -devtools::install_version('Rcpp', version='1.0.1') -devtools::install_version('assertthat', version='0.2.1') -devtools::install_version('crayon', version='1.3.4') -devtools::install_version('cli', version='1.1.0') -devtools::install_version('utf8', version='1.1.4') -devtools::install_version('fansi', version='0.4.0') -devtools::install_version('pillar', version='1.4.2') -devtools::install_version('tibble', version='2.1.3') -devtools::install_version('readr', version='1.1.1') diff --git a/workers/affymetrix_dependencies.R b/workers/affymetrix_dependencies.R deleted file mode 100644 index 5fd501002..000000000 --- a/workers/affymetrix_dependencies.R +++ /dev/null @@ -1,220 +0,0 @@ -# Turn warnings into errors because biocLite throws warnings instead # of error if it fails to install something. 
-options(warn=2) -options(repos=structure(c(CRAN="https://cloud.r-project.org"))) -options(Ncpus=parallel::detectCores()) - -# Bioconductor packages, installed by devtools::install_url() - -# Helper function that installs a list of packages using the input URLs -install_with_url <- function(urls) { - pkg_ids <- devtools::install_url(urls) - if(any(is.na(pkg_ids))) { - pkg_fails <- paste(urls[is.na(pkg_ids)], collapse = "; ") - stop(paste("Failed to install package(s):", pkg_fails )) - } - return(pkg_ids) -} - -devtools::install_version('dplyr', version='1.0.2') - -bioc_pkg_urls <- c( - 'https://bioconductor.org/packages/3.11/bioc/src/contrib/oligoClasses_1.50.4.tar.gz', - 'https://bioconductor.org/packages/3.11/bioc/src/contrib/oligo_1.52.1.tar.gz', - 'https://bioconductor.org/packages/3.11/bioc/src/contrib/GEOquery_2.56.0.tar.gz', - 'https://bioconductor.org/packages/3.11/bioc/src/contrib/SCAN.UPC_2.30.0.tar.gz', - 'https://bioconductor.org/packages/3.11/bioc/src/contrib/affy_1.66.0.tar.gz', - 'https://bioconductor.org/packages/3.11/bioc/src/contrib/affyio_1.58.0.tar.gz', - 'https://bioconductor.org/packages/3.11/bioc/src/contrib/AnnotationDbi_1.50.3.tar.gz', - 'https://bioconductor.org/packages/3.11/bioc/src/contrib/zlibbioc_1.34.0.tar.gz', - 'https://bioconductor.org/packages/3.11/bioc/src/contrib/preprocessCore_1.50.0.tar.gz', - 'https://bioconductor.org/packages/3.11/bioc/src/contrib/genefilter_1.70.0.tar.gz', - 'https://bioconductor.org/packages/3.11/bioc/src/contrib/sva_3.36.0.tar.gz', - 'https://bioconductor.org/packages/3.11/bioc/src/contrib/limma_3.44.3.tar.gz' -) -install_with_url(bioc_pkg_urls) - -# Invoke another R script to install BrainArray ensg packages -source("install_ensg_pkgs.R") - -# Install Bioconductor platform design (pd) packages -pd_experiment_pkgs <- c( - 'https://bioconductor.org/packages/3.11/data/experiment/src/contrib/pd.atdschip.tiling_0.26.0.tar.gz' -) -install_with_url(pd_experiment_pkgs) - -pd_annotation_pkgs <- c( - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.081229.hg18.promoter.medip.hx1_0.99.4.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.2006.07.18.hg18.refseq.promoter_1.8.1.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.2006.07.18.mm8.refseq.promoter_0.99.3.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.2006.10.31.rn34.refseq.promoter_0.99.3.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.ag_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.aragene.1.0.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.aragene.1.1.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.ath1.121501_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.barley1_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.bovgene.1.0.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.bovgene.1.1.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.bovine_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.bsubtilis_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.cangene.1.0.st_3.12.0.tar.gz', - 
'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.cangene.1.1.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.canine_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.canine.2_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.celegans_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.charm.hg18.example_0.99.4.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.chicken_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.chigene.1.0.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.chigene.1.1.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.chogene.2.0.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.chogene.2.1.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.citrus_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.clariom.d.human_3.14.1.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.clariom.s.human_3.14.1.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.clariom.s.human.ht_3.14.1.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.clariom.s.mouse_3.14.1.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.clariom.s.mouse.ht_3.14.1.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.clariom.s.rat_3.14.1.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.clariom.s.rat.ht_3.14.1.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.cotton_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.cyngene.1.0.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.cyngene.1.1.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.cyrgene.1.0.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.cyrgene.1.1.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.cytogenetics.array_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.drogene.1.0.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.drogene.1.1.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.drosgenome1_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.drosophila.2_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.e.coli.2_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.ecoli_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.ecoli.asv2_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.elegene.1.0.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.elegene.1.1.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.equgene.1.0.st_3.12.0.tar.gz', - 
'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.equgene.1.1.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.feinberg.hg18.me.hx1_0.99.3.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.feinberg.mm8.me.hx1_0.99.3.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.felgene.1.0.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.felgene.1.1.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.fingene.1.0.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.fingene.1.1.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.genomewidesnp.5_3.14.1.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.genomewidesnp.6_3.14.1.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.guigene.1.0.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.guigene.1.1.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hc.g110_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hg.focus_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hg.u133.plus.2_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hg.u133a_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hg.u133a.2_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hg.u133a.tag_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hg.u133b_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hg.u219_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hg.u95a_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hg.u95av2_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hg.u95b_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hg.u95c_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hg.u95d_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hg.u95e_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hg18.60mer.expr_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.ht.hg.u133.plus.pm_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.ht.hg.u133a_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.ht.mg.430a_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hta.2.0_3.12.2.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hu6800_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.huex.1.0.st.v2_3.14.1.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hugene.1.0.st.v1_3.14.1.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hugene.1.1.st.v1_3.14.1.tar.gz', - 
'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hugene.2.0.st_3.14.1.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.hugene.2.1.st_3.14.1.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.maize_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mapping250k.nsp_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mapping250k.sty_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mapping50k.hind240_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mapping50k.xba240_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.margene.1.0.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.margene.1.1.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.medgene.1.0.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.medgene.1.1.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.medicago_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mg.u74a_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mg.u74av2_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mg.u74b_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mg.u74bv2_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mg.u74c_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mg.u74cv2_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mirna.1.0_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mirna.2.0_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mirna.3.0_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mirna.3.1_3.8.1.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mirna.4.0_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.moe430a_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.moe430b_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.moex.1.0.st.v1_3.14.1.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mogene.1.0.st.v1_3.14.1.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mogene.1.1.st.v1_3.14.1.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mogene.2.0.st_3.14.1.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mogene.2.1.st_3.14.1.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mouse430.2_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mouse430a.2_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mta.1.0_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mu11ksuba_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.mu11ksubb_3.12.0.tar.gz', - 
'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.nugo.hs1a520180_3.4.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.nugo.mm1a520177_3.4.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.ovigene.1.0.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.ovigene.1.1.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.pae.g1a_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.plasmodium.anopheles_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.poplar_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.porcine_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.porgene.1.0.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.porgene.1.1.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rabgene.1.0.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rabgene.1.1.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rae230a_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rae230b_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.raex.1.0.st.v1_3.14.1.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.ragene.1.0.st.v1_3.14.1.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.ragene.1.1.st.v1_3.14.1.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.ragene.2.0.st_3.14.1.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.ragene.2.1.st_3.14.1.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rat230.2_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rcngene.1.0.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rcngene.1.1.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rg.u34a_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rg.u34b_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rg.u34c_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rhegene.1.0.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rhegene.1.1.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rhesus_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rice_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rjpgene.1.0.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rjpgene.1.1.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rn.u34_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rta.1.0_3.12.2.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rusgene.1.0.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.rusgene.1.1.st_3.12.0.tar.gz', - 
'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.s.aureus_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.soybean_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.soygene.1.0.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.soygene.1.1.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.sugar.cane_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.tomato_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.u133.x3p_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.vitis.vinifera_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.wheat_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.x.laevis.2_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.x.tropicalis_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.xenopus.laevis_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.yeast.2_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.yg.s98_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.zebgene.1.0.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.zebgene.1.1.st_3.12.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/pd.zebrafish_3.12.0.tar.gz' -) -install_with_url(pd_annotation_pkgs) - -# Load this libraries because apparently just installing it isn't -# enough to verify that the correct versions of dependencies are installed. -library('foreach') diff --git a/workers/ccache.conf b/workers/ccache.conf new file mode 100644 index 000000000..cde43e665 --- /dev/null +++ b/workers/ccache.conf @@ -0,0 +1,5 @@ +max_size = 5.0G +# important for R CMD INSTALL *.tar.gz as tarballs are expanded freshly -> fresh ctime +sloppiness = include_file_ctime +# also important as the (temp.) directory name will differ +hash_dir = false diff --git a/workers/dockerfiles/Dockerfile.affymetrix b/workers/dockerfiles/Dockerfile.affymetrix index 151473ecf..3ee59ea1a 100644 --- a/workers/dockerfiles/Dockerfile.affymetrix +++ b/workers/dockerfiles/Dockerfile.affymetrix @@ -1,81 +1,30 @@ -FROM ubuntu:20.04 +FROM ccdlstaging/dr_worker_base:latest -# Source: https://github.com/thisbejim/Pyrebase/issues/87#issuecomment-354452082 -# For whatever reason this worked and 'en_US.UTF-8' did not. -ENV LANG C.UTF-8 +# Fail in case of an error at any stage in the pipe. +SHELL ["/bin/bash", "-o", "pipefail", "-c"] -# Prevent tzdata from prompting us for a timezone and hanging the build. -ENV DEBIAN_FRONTEND=noninteractive - -RUN apt-get update -qq -RUN apt-get install -y software-properties-common -RUN add-apt-repository ppa:apt-fast/stable - -RUN apt-get update -qq -RUN apt-get install -y apt-fast apt-transport-https - -# The packages related to R are somewhat weird, see the README for more details. -COPY workers/CRAN.gpg . 
-RUN apt-key add CRAN.gpg -RUN echo "deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/" \ - >> /etc/apt/sources.list.d/added_repos.list - -RUN apt-fast update -qq && apt-fast install -y \ - build-essential \ - cmake \ - curl \ - cython3 \ - ed \ - git \ - libcairo-dev \ - libcurl4-openssl-dev \ - libedit-dev \ - libpq-dev \ - libssl-dev \ - libxml2-dev \ - llvm-10-dev \ - lsb-release \ - mercurial \ - pkg-config \ - python3-pip \ - python3 \ - python3-dev \ - r-base-core \ - wget - -RUN rm CRAN.gpg -RUN apt-get clean; rm -rf /var/lib/apt/lists/* -RUN ln -s /usr/bin/llvm-config-10 /usr/bin/llvm-config - -RUN groupadd user && useradd --create-home --home-dir /home/user -g user user WORKDIR /home/user -ENV R_LIBS "/usr/local/lib/R/site-library" - -COPY common/install_devtools.R . -RUN Rscript install_devtools.R - -COPY workers/R_dependencies/affymetrix/dependencies.R . -RUN Rscript dependencies.R - -COPY workers/affymetrix_dependencies.R . -RUN Rscript affymetrix_dependencies.R +RUN pip3 install --ignore-installed --no-cache-dir --upgrade pip setuptools && \ + # Install this one here instead of via requirements.txt because not + # all processors need it. + pip3 install rpy2==3.4.5 -RUN pip3 install pip --upgrade -RUN pip3 install setuptools --upgrade +# Get the latest version from the dist directory. +COPY common/dist/data-refinery-common-* common/ +RUN pip3 install --ignore-installed --no-cache-dir \ + common/$(ls common -1 | sort --version-sort | tail -1) -# Install this one here instead of via requirements.txt because not -# all processors need it. -RUN pip3 install rpy2==3.4.5 COPY workers/data_refinery_workers/processors/requirements.txt . -RUN pip3 install -r requirements.txt +RUN pip3 install --ignore-installed --no-cache-dir -r requirements.txt -# Get the latest version from the dist directory. -COPY common/dist/data-refinery-common-* common/ -RUN pip3 install common/$(ls common -1 | sort --version-sort | tail -1) +COPY workers/R/dependencies/affymetrix/cran/dependencies.R dependencies.R +RUN Rscript dependencies.R + +COPY workers/R/dependencies/affymetrix/install_ensg_pkgs.R . -# Clear out the pip3 cache. -RUN rm -rf /root/.cache +COPY workers/R/dependencies/affymetrix/bioc/dependencies.R dependencies.R +RUN Rscript dependencies.R ARG SYSTEM_VERSION @@ -86,6 +35,7 @@ USER user COPY .boto .boto COPY config/ config/ COPY workers/ . -COPY workers/install_ensg_pkgs.R . + +RUN ccache -s ENTRYPOINT [] diff --git a/workers/dockerfiles/Dockerfile.affymetrix_local b/workers/dockerfiles/Dockerfile.affymetrix_local index 9a37692e6..3d7eff18f 100644 --- a/workers/dockerfiles/Dockerfile.affymetrix_local +++ b/workers/dockerfiles/Dockerfile.affymetrix_local @@ -1,14 +1,18 @@ FROM ccdlstaging/dr_affymetrix:latest -USER root +# Fail in case of an error at any stage in the pipe. +SHELL ["/bin/bash", "-o", "pipefail", "-c"] + +WORKDIR /home/user # Remove the version of common already installed. -RUN rm -r common/ -RUN pip3 uninstall -y data_refinery_common +RUN rm -r common && \ + pip3 uninstall -y data_refinery_common # Get the latest version from the dist directory. 
COPY common/dist/data-refinery-common-* common/ -RUN pip3 install common/$(ls common -1 | sort --version-sort | tail -1) +RUN pip3 install --ignore-installed --no-cache-dir \ + common/$(ls common -1 | sort --version-sort | tail -1) ARG SYSTEM_VERSION diff --git a/workers/dockerfiles/Dockerfile.compendia b/workers/dockerfiles/Dockerfile.compendia index 2c6a38784..82e14df01 100644 --- a/workers/dockerfiles/Dockerfile.compendia +++ b/workers/dockerfiles/Dockerfile.compendia @@ -3,102 +3,98 @@ FROM nvidia/cuda:11.8.0-runtime-ubuntu18.04 # This is very similar to the `smasher` image, but comes with OpenBLAS and some # of the other libraries required for fancyimpute. +# Fail in case of an error at any stage in the pipe. +SHELL ["/bin/bash", "-o", "pipefail", "-c"] + +WORKDIR /home/user + # Prevent tzdata from prompting us for a timezone and hanging the build. ENV DEBIAN_FRONTEND=noninteractive # Source: https://github.com/thisbejim/Pyrebase/issues/87#issuecomment-354452082 # For whatever reason this worked and 'en_US.UTF-8' did not. -ENV LANG C.UTF-8 - -# RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub -# via https://github.com/ilikenwf/apt-fast/issues/85#issuecomment-261640099 -RUN echo debconf apt-fast/maxdownloads string 16 | debconf-set-selections -RUN echo debconf apt-fast/dlflag boolean true | debconf-set-selections -RUN echo debconf apt-fast/aptmanager string apt-get | debconf-set-selections -RUN echo 'tzdata tzdata/Areas select Etc' | debconf-set-selections -RUN echo 'tzdata tzdata/Zones/Etc select UTC' | debconf-set-selections - -RUN apt-get update -qq -RUN apt-get install -y software-properties-common -RUN add-apt-repository ppa:apt-fast/stable -RUN add-apt-repository ppa:deadsnakes/ppa -RUN add-apt-repository ppa:savoury1/llvm-defaults-10 - -RUN apt-get update -qq -RUN apt-get install -y apt-fast apt-transport-https tzdata +ENV LANG=C.UTF-8 COPY workers/CRAN.gpg . -RUN apt-key add CRAN.gpg -RUN echo "deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/" \ - >> /etc/apt/sources.list.d/added_repos.list - -RUN apt-fast update -qq && apt-fast install -y \ - build-essential \ - cmake \ - curl \ - cython3 \ - ed \ - gfortran \ - git \ - libcairo-dev \ - libcurl4-openssl-dev \ - libedit-dev \ - libblas-dev \ - liblapack-dev \ - libpq-dev \ - libssl-dev \ - libxml2-dev \ - llvm-10-dev \ - lsb-release \ - mercurial \ - pkg-config \ - python3-pip \ - python3.8 \ - python3.8-dev \ - r-base-core \ - wget - -RUN rm CRAN.gpg -RUN apt-get clean; rm -rf /var/lib/apt/lists/* -RUN ln -s /usr/bin/llvm-config-10 /usr/bin/llvm-config -RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1 - -RUN groupadd user && useradd --create-home --home-dir /home/user -g user user -WORKDIR /home/user - -RUN wget https://bitbucket.org/ariya/phantomjs/downloads/phantomjs-2.1.1-linux-x86_64.tar.bz2 -RUN tar xvjf phantomjs-2.1.1-linux-x86_64.tar.bz2 -C /usr/local/share/ -RUN ln -s /usr/local/share/phantomjs-2.1.1-linux-x86_64/bin/phantomjs /usr/local/bin/ -# We need a few special packages for QN -ENV R_LIBS "/usr/local/lib/R/site-library" - -COPY common/install_devtools.R . 
+# RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub +# via https://github.com/ilikenwf/apt-fast/issues/85#issuecomment-261640099 +RUN echo debconf apt-fast/maxdownloads string 16 | debconf-set-selections && \ + echo debconf apt-fast/dlflag boolean true | debconf-set-selections && \ + echo debconf apt-fast/aptmanager string apt-get | debconf-set-selections && \ + echo 'tzdata tzdata/Areas select Etc' | debconf-set-selections && \ + echo 'tzdata tzdata/Zones/Etc select UTC' | debconf-set-selections && \ + apt-get update -qq && \ + apt-get install --no-install-recommends -y software-properties-common && \ + add-apt-repository ppa:apt-fast/stable && \ + add-apt-repository ppa:deadsnakes/ppa && \ + add-apt-repository ppa:savoury1/llvm-defaults-10 && \ + apt-get update -qq && \ + apt-get install --no-install-recommends -y apt-fast apt-transport-https gpg-agent tzdata && \ + apt-key add CRAN.gpg && \ + echo "deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/" \ + >> /etc/apt/sources.list.d/added_repos.list && \ + apt-fast update -qq && \ + apt-fast install -y \ + build-essential \ + ccache \ + cmake \ + curl \ + cython3 \ + ed \ + gfortran \ + git \ + libblas-dev \ + libcairo-dev \ + libcurl4-openssl-dev \ + libedit-dev \ + liblapack-dev \ + libpq-dev \ + libssl-dev \ + libxml2-dev \ + llvm-10-dev \ + lsb-release \ + mercurial \ + pkg-config \ + python3-pip \ + python3.8 \ + python3.8-dev \ + r-base-core \ + wget && \ + rm CRAN.gpg && \ + apt-get clean; rm -rf /var/lib/apt/lists/* && \ + ln -s /usr/bin/llvm-config-10 /usr/bin/llvm-config & \ + update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1 && \ + groupadd user && useradd --create-home --home-dir /home/user -g user user && \ + wget -q https://bitbucket.org/ariya/phantomjs/downloads/phantomjs-2.1.1-linux-x86_64.tar.bz2 && \ + tar xvjf phantomjs-2.1.1-linux-x86_64.tar.bz2 -C /usr/local/share/ && \ + ln -s /usr/local/share/phantomjs-2.1.1-linux-x86_64/bin/phantomjs /usr/local/bin/ + +# We need a few special packages for QN. +ENV R_LIBS="/usr/local/lib/R/site-library" + +COPY common/R/install_devtools.R . RUN Rscript install_devtools.R -COPY workers/R_dependencies/qn/dependencies.R . +COPY workers/R/dependencies/qn/cran/dependencies.R . RUN Rscript dependencies.R -COPY workers/qn_dependencies.R . -RUN Rscript qn_dependencies.R +COPY workers/R/dependencies/qn/bioc/dependencies.R . +RUN Rscript dependencies.R # End QN-specific -RUN pip3 install --upgrade pip -# Smasher-specific requirements -RUN pip3 install --ignore-installed numpy scipy matplotlib pandas==0.25.3 scikit-learn sympy nose rpy2===3.4.5 tzlocal fancySVD -# End smasher-specific - -COPY workers/data_refinery_workers/processors/requirements.txt . -RUN pip3 install --ignore-installed -r requirements.txt - -RUN pip3 install --ignore-installed numpy==1.16.0 # Fix a downgrade - # Get the latest version from the dist directory. COPY common/dist/data-refinery-common-* common/ -RUN pip3 install common/$(ls common -1 | sort --version-sort | tail -1) +RUN pip3 install --ignore-installed --no-cache-dir \ + common/$(ls common -1 | sort --version-sort | tail -1) -# Clear out the pip3 cache. -RUN rm -rf /root/.cache +RUN pip3 install --ignore-installed --no-cache-dir --upgrade pip && \ + pip3 install --ignore-installed --no-cache-dir numpy scipy matplotlib \ + pandas==0.25.3 scikit-learn sympy nose rpy2===3.4.5 tzlocal fancySVD + +COPY workers/data_refinery_workers/processors/requirements.txt . 
+RUN pip3 install --ignore-installed --no-cache-dir -r requirements.txt && \ + pip3 install --ignore-installed --no-cache-dir numpy==1.16.0 ARG SYSTEM_VERSION diff --git a/workers/dockerfiles/Dockerfile.downloaders b/workers/dockerfiles/Dockerfile.downloaders index 1b3337325..3a5974491 100644 --- a/workers/dockerfiles/Dockerfile.downloaders +++ b/workers/dockerfiles/Dockerfile.downloaders @@ -1,64 +1,11 @@ -FROM ubuntu:18.04 +FROM ccdlstaging/dr_worker_base:latest -# Prevent tzdata from prompting us for a timezone and hanging the build. -ENV DEBIAN_FRONTEND=noninteractive +# Fail in case of an error at any stage in the pipe. +SHELL ["/bin/bash", "-o", "pipefail", "-c"] -# Source: https://github.com/thisbejim/Pyrebase/issues/87#issuecomment-354452082 -# For whatever reason this worked and 'en_US.UTF-8' did not. -ENV LANG C.UTF-8 - -RUN apt-get update -RUN apt-get install -y software-properties-common -RUN add-apt-repository ppa:apt-fast/stable -RUN add-apt-repository ppa:deadsnakes/ppa -RUN add-apt-repository ppa:savoury1/llvm-defaults-10 - -RUN apt-get update -qq -RUN apt-get install -y apt-fast apt-transport-https - -# The packages related to R are somewhat weird, see the README for more details. -COPY workers/CRAN.gpg . -RUN apt-key add CRAN.gpg -RUN echo "deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/" \ - >> /etc/apt/sources.list.d/added_repos.list - -RUN apt-fast update -qq && apt-fast install -y \ - build-essential \ - cmake \ - curl \ - cython3 \ - ed \ - git \ - libcairo-dev \ - libcurl4-openssl-dev \ - libedit-dev \ - libpq-dev \ - libssl-dev \ - libxml2-dev \ - llvm-10-dev \ - lsb-release \ - mercurial \ - pkg-config \ - python3-pip \ - python3.8 \ - python3.8-dev \ - r-base-core \ - wget - -RUN rm CRAN.gpg -RUN apt-get clean; rm -rf /var/lib/apt/lists/* -RUN ln -s /usr/bin/llvm-config-10 /usr/bin/llvm-config -RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1 - -RUN groupadd user && useradd --create-home --home-dir /home/user -g user user WORKDIR /home/user -ENV R_LIBS "/usr/local/lib/R/site-library" - -COPY common/install_devtools.R . -RUN Rscript install_devtools.R - -COPY workers/install_downloader_R_only.R . +COPY workers/R/dependencies/install_downloader_R_only.R . RUN Rscript install_downloader_R_only.R # Aspera will only install as the current user. @@ -67,29 +14,25 @@ USER user # Install Aspera. We have to install it using Holland Computing Center's conda # repo because download.asperasoft.com now returns 403s -RUN wget -q https://anaconda.org/HCC/aspera-cli/3.9.1/download/linux-64/aspera-cli-3.9.1-0.tar.bz2 -RUN [ "$(sha256sum aspera-cli-3.9.1-0.tar.bz2 | cut -d' ' -f1)" = 60a09a7f3795186954079869106aa89a64183b7be8e0da7cbbe9d57c66c9bcdb ] -RUN mkdir -p .aspera/cli -RUN tar xf aspera-cli-3.9.1-0.tar.bz2 -C .aspera/cli -RUN rm aspera-cli-3.9.1-0.tar.bz2 +RUN wget -q https://anaconda.org/HCC/aspera-cli/3.9.1/download/linux-64/aspera-cli-3.9.1-0.tar.bz2 && \ + [ "$(sha256sum aspera-cli-3.9.1-0.tar.bz2 | cut -d ' ' -f1)" = 60a09a7f3795186954079869106aa89a64183b7be8e0da7cbbe9d57c66c9bcdb ] && \ + mkdir -p .aspera/cli && \ + tar xf aspera-cli-3.9.1-0.tar.bz2 -C .aspera/cli && \ + rm aspera-cli-3.9.1-0.tar.bz2 # Now that we're done installing Aspera go back to being root for a bit. USER root -RUN pip3 install --upgrade pip -# Install this rpy2 here instead of via requirements.txt because -# pip-compile throws an error for it. -RUN pip3 install rpy2==3.4.5 - -COPY workers/data_refinery_workers/downloaders/requirements.txt . 
-RUN pip3 install --ignore-installed -r requirements.txt +RUN pip3 install --ignore-installed --no-cache-dir --upgrade pip && \ + pip3 install --ignore-installed --no-cache-dir rpy2==3.4.5 # Get the latest version from the dist directory. COPY common/dist/data-refinery-common-* common/ -RUN pip3 install common/$(ls common -1 | sort --version-sort | tail -1) +RUN pip3 install --ignore-installed --no-cache-dir \ + common/$(ls common -1 | sort --version-sort | tail -1) -# Clear out the pip3 cache. -RUN rm -rf /root/.cache +COPY workers/data_refinery_workers/downloaders/requirements.txt . +RUN pip3 install --ignore-installed --no-cache-dir -r requirements.txt ARG SYSTEM_VERSION diff --git a/workers/dockerfiles/Dockerfile.illumina b/workers/dockerfiles/Dockerfile.illumina index e4cc70268..949c22405 100644 --- a/workers/dockerfiles/Dockerfile.illumina +++ b/workers/dockerfiles/Dockerfile.illumina @@ -1,83 +1,25 @@ -FROM ubuntu:18.04 +FROM ccdlstaging/dr_worker_base:latest -# Prevent tzdata from prompting us for a timezone and hanging the build. -ENV DEBIAN_FRONTEND=noninteractive +# Fail in case of an error at any stage in the pipe. +SHELL ["/bin/bash", "-o", "pipefail", "-c"] -# Source: https://github.com/thisbejim/Pyrebase/issues/87#issuecomment-354452082 -# For whatever reason this worked and 'en_US.UTF-8' did not. -ENV LANG C.UTF-8 - -RUN apt-get update -RUN apt-get install -y software-properties-common -RUN add-apt-repository ppa:apt-fast/stable -RUN add-apt-repository ppa:deadsnakes/ppa -RUN add-apt-repository ppa:savoury1/llvm-defaults-10 - -RUN apt-get update -qq -RUN apt-get install -y apt-fast apt-transport-https - -# The packages related to R are somewhat weird, see the README for more details. -COPY workers/CRAN.gpg . -RUN apt-key add CRAN.gpg -RUN echo "deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/" \ - >> /etc/apt/sources.list.d/added_repos.list - -RUN apt-fast update -qq && apt-fast install -y \ - build-essential \ - cmake \ - curl \ - cython3 \ - ed \ - gfortran \ - git \ - libcairo-dev \ - libcurl4-openssl-dev \ - libedit-dev \ - libblas-dev \ - liblapack-dev \ - libpq-dev \ - libssl-dev \ - libxml2-dev \ - llvm-10-dev \ - lsb-release \ - mercurial \ - pkg-config \ - python3-pip \ - python3.8 \ - python3.8-dev \ - r-base-core \ - wget - -RUN rm CRAN.gpg -RUN apt-get clean; rm -rf /var/lib/apt/lists/* -RUN ln -s /usr/bin/llvm-config-10 /usr/bin/llvm-config -RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1 - -RUN groupadd user && useradd --create-home --home-dir /home/user -g user user WORKDIR /home/user -ENV R_LIBS "/usr/local/lib/R/site-library" - -COPY common/install_devtools.R . -RUN Rscript install_devtools.R - -COPY workers/R_dependencies/illumina/dependencies.R . +COPY workers/R/dependencies/illumina/cran/dependencies.R . RUN Rscript dependencies.R # These are for Illumina. -COPY workers/illumina_dependencies.R . -RUN Rscript illumina_dependencies.R - -RUN pip3 install --upgrade pip -COPY workers/data_refinery_workers/processors/requirements.txt . -RUN pip3 install --ignore-installed -r requirements.txt +COPY workers/R/dependencies/illumina/bioc/dependencies.R . +RUN Rscript dependencies.R # Get the latest version from the dist directory. COPY common/dist/data-refinery-common-* common/ -RUN pip3 install common/$(ls common -1 | sort --version-sort | tail -1) +RUN pip3 install --ignore-installed --no-cache-dir \ + common/$(ls common -1 | sort --version-sort | tail -1) -# Clear out the pip3 cache. 
-RUN rm -rf /root/.cache +RUN pip3 install --ignore-installed --no-cache-dir --upgrade pip +COPY workers/data_refinery_workers/processors/requirements.txt . +RUN pip3 install --ignore-installed -r requirements.txt ARG SYSTEM_VERSION diff --git a/workers/dockerfiles/Dockerfile.no_op b/workers/dockerfiles/Dockerfile.no_op index 98f35d772..a0cb7855e 100644 --- a/workers/dockerfiles/Dockerfile.no_op +++ b/workers/dockerfiles/Dockerfile.no_op @@ -1,94 +1,38 @@ -FROM ubuntu:18.04 +FROM ccdlstaging/dr_worker_base:latest -# Prevent tzdata from prompting us for a timezone and hanging the build. -ENV DEBIAN_FRONTEND=noninteractive +# Fail in case of an error at any stage in the pipe. +SHELL ["/bin/bash", "-o", "pipefail", "-c"] -# Source: https://github.com/thisbejim/Pyrebase/issues/87#issuecomment-354452082 -# For whatever reason this worked and 'en_US.UTF-8' did not. -ENV LANG C.UTF-8 - -RUN apt-get update -qq -RUN apt-get install -y software-properties-common - -RUN add-apt-repository ppa:apt-fast/stable -# deadsnakes packages new python versions for older Ubuntu releases -RUN add-apt-repository ppa:deadsnakes/ppa - -RUN apt-get update -qq -RUN apt-get -y install apt-fast - -# The packages related to R are somewhat weird, see the README for more details. -COPY workers/CRAN.gpg . -RUN apt-key add CRAN.gpg -RUN echo "deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/" \ - >> /etc/apt/sources.list.d/added_repos.list - -RUN apt-fast update -qq && apt-fast install -y \ - build-essential \ - cmake \ - curl \ - cython3 \ - ed \ - git \ - libcairo-dev \ - libcurl4-openssl-dev \ - libedit-dev \ - libfreetype6-dev \ - libpq-dev \ - libssl-dev \ - libxml2-dev \ - llvm-10-dev \ - lsb-release \ - mercurial \ - pkg-config \ - python3-pip \ - python3.8 \ - python3.8-dev \ - r-base-core \ - wget - -RUN rm CRAN.gpg -RUN apt-get clean; rm -rf /var/lib/apt/lists/* -RUN ln -s /usr/bin/llvm-config-10 /usr/bin/llvm-config -RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1 - -RUN groupadd user && useradd --create-home --home-dir /home/user -g user user WORKDIR /home/user -# Noop-specific requirements -ENV R_LIBS "/usr/local/lib/R/site-library" - -COPY common/install_devtools.R . -RUN Rscript install_devtools.R - -COPY workers/R_dependencies/no_op/dependencies.R . +COPY workers/R/dependencies/no_op/cran/dependencies.R . RUN Rscript dependencies.R -COPY workers/install_gene_convert.R . +COPY workers/R/dependencies/no_op/install_gene_convert.R . RUN Rscript install_gene_convert.R +# Noop-specific. RUN mkdir -p gene_indexes WORKDIR /home/user/gene_indexes ENV ID_REFINERY_URL https://zenodo.org/record/1410647/files/all_1536267482.zip -RUN curl -O $ID_REFINERY_URL -RUN echo $ID_REFINERY_URL > /etc/identifier_refinery_url -RUN unzip *.zip -RUN rm *.zip +RUN curl -O $ID_REFINERY_URL && \ + echo $ID_REFINERY_URL > /etc/identifier_refinery_url && \ + unzip *.zip && \ + rm *.zip +# End Noop-specific. + WORKDIR /home/user -# End Noop-specific -RUN pip3 install --upgrade pip -RUN pip3 install numpy +RUN pip3 install --ignore-installed --no-cache-dir --upgrade pip && \ + pip3 install --ignore-installed --no-cache-dir numpy COPY workers/data_refinery_workers/processors/requirements.txt . -RUN pip3 install --ignore-installed -r requirements.txt +RUN pip3 install --ignore-installed --no-cache-dir -r requirements.txt # Get the latest version from the dist directory. 
COPY common/dist/data-refinery-common-* common/ -RUN pip3 install common/$(ls common -1 | sort --version-sort | tail -1) - -# Clear out the pip3 cache. -RUN rm -rf /root/.cache +RUN pip3 install --ignore-installed --no-cache-dir \ + common/$(ls common -1 | sort --version-sort | tail -1) ARG SYSTEM_VERSION diff --git a/workers/dockerfiles/Dockerfile.salmon b/workers/dockerfiles/Dockerfile.salmon index 8bfbbe9a7..724ddac60 100644 --- a/workers/dockerfiles/Dockerfile.salmon +++ b/workers/dockerfiles/Dockerfile.salmon @@ -1,109 +1,59 @@ -FROM ubuntu:18.04 - -# Prevent tzdata from prompting us for a timezone and hanging the build. -ENV DEBIAN_FRONTEND=noninteractive - -# Source: https://github.com/thisbejim/Pyrebase/issues/87#issuecomment-354452082 -# For whatever reason this worked and 'en_US.UTF-8' did not. -ENV LANG C.UTF-8 - -RUN apt-get update -RUN apt-get install -y software-properties-common -RUN add-apt-repository ppa:apt-fast/stable -RUN add-apt-repository ppa:deadsnakes/ppa -RUN add-apt-repository ppa:savoury1/llvm-defaults-10 - -RUN apt-get update -qq -RUN apt-get install -y apt-fast apt-transport-https - -# The packages related to R are somewhat weird, see the README for more details. -COPY workers/CRAN.gpg . -RUN apt-key add CRAN.gpg -RUN echo "deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/" \ - >> /etc/apt/sources.list.d/added_repos.list - -RUN apt-fast update -qq && apt-fast install -y \ - build-essential \ - cmake \ - curl \ - cython3 \ - ed \ - git \ - libcairo-dev \ - libcurl4-openssl-dev \ - libedit-dev \ - libpq-dev \ - libssl-dev \ - libxml2-dev \ - llvm-10-dev \ - lsb-release \ - mercurial \ - pkg-config \ - python3-pip \ - python3.8 \ - python3.8-dev \ - r-base-core \ - wget - -RUN rm CRAN.gpg -RUN apt-get upgrade; apt-get clean; rm -rf /var/lib/apt/lists/* -RUN ln -s /usr/bin/llvm-config-10 /usr/bin/llvm-config -RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1 - -RUN groupadd user && useradd --create-home --home-dir /home/user -g user user +FROM ccdlstaging/dr_worker_base:latest + +# Fail in case of an error at any stage in the pipe. +SHELL ["/bin/bash", "-o", "pipefail", "-c"] + WORKDIR /home/user # Install Salmon - -# Tximport requires all experiments to be processed with the same version of Salmon to work https://github.com/AlexsLemonade/refinebio/issues/1496 +# Tximport requires all experiments to be processed with the same version of +# Salmon to work https://github.com/AlexsLemonade/refinebio/issues/1496. # This is something that should be considered when updating salmon, because # all samples from incomplete experiments must have salmon run on them again. 
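
Illustration only, not part of this patch: the comment above is why SALMON_VERSION stays pinned. tximport later aggregates every sample's Salmon quant.sf output into one matrix per experiment, so mixing Salmon versions within an experiment would mix incompatible quantifications. A minimal R sketch of that aggregation step, with hypothetical file paths and a hypothetical tx2gene mapping (not refinebio's actual layout):

# Minimal sketch of the tximport step the comment above is protecting;
# sample names, paths, and tx2gene.tsv are hypothetical placeholders.
library(tximport)
library(readr)

samples <- c("SRR000001", "SRR000002")             # hypothetical sample accessions
files <- file.path("quants", samples, "quant.sf")  # per-sample Salmon outputs
names(files) <- samples

# Transcript-to-gene mapping, assumed to be prepared elsewhere.
tx2gene <- read_tsv("tx2gene.tsv", col_names = c("TXNAME", "GENEID"))

# Every quant.sf aggregated here must come from a compatible Salmon version,
# hence the pinned SALMON_VERSION in this Dockerfile.
txi <- tximport(files, type = "salmon", tx2gene = tx2gene)
head(txi$counts)
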
ENV SALMON_VERSION 0.13.1 -RUN wget https://github.com/COMBINE-lab/salmon/releases/download/v${SALMON_VERSION}/Salmon-${SALMON_VERSION}_linux_x86_64.tar.gz -RUN mkdir Salmon-${SALMON_VERSION}_linux_x86_64 # On version 0.13.1 salmon was being extracted to a folder with an all lowercase name # the options `-C` and `--strip-components` allow us to specify the name for the resulting file -RUN tar -xzf Salmon-${SALMON_VERSION}_linux_x86_64.tar.gz -C Salmon-${SALMON_VERSION}_linux_x86_64 --strip-components 1 -# Create soft link `/usr/local/bin/salmon` that points to the actual program -RUN ln -sf `pwd`/Salmon-${SALMON_VERSION}_linux_x86_64/bin/salmon /usr/local/bin/ -RUN rm -f Salmon-${SALMON_VERSION}_linux_x86_64.tar.gz +RUN wget -q "https://github.com/COMBINE-lab/salmon/releases/download/v${SALMON_VERSION}/Salmon-${SALMON_VERSION}_linux_x86_64.tar.gz" && \ + mkdir "Salmon-${SALMON_VERSION}_linux_x86_64" && \ + tar -xzf "Salmon-${SALMON_VERSION}_linux_x86_64.tar.gz" \ + -C "Salmon-${SALMON_VERSION}_linux_x86_64" --strip-components 1 && \ + ln -sf "$(pwd)/Salmon-${SALMON_VERSION}_linux_x86_64/bin/salmon" \ + /usr/local/bin/ && \ + rm -f "Salmon-${SALMON_VERSION}_linux_x86_64.tar.gz" # End Salmon installation. -# Install R dependencies. -COPY common/install_devtools.R . -RUN Rscript install_devtools.R - -COPY workers/R_dependencies/tximport/dependencies.R tximport_dependencies.R +COPY workers/R/dependencies/tximport/cran/dependencies.R tximport_dependencies.R RUN Rscript tximport_dependencies.R # Install tximport. -COPY workers/install_tximport.R . +COPY workers/R/dependencies/tximport/install_tximport.R . RUN Rscript install_tximport.R -RUN pip3 install --upgrade pip -RUN pip3 install numpy - -COPY workers/data_refinery_workers/processors/requirements.txt . -RUN pip3 install --ignore-installed -r requirements.txt +RUN pip3 install --ignore-installed --no-cache-dir --upgrade pip && \ + pip3 install --ignore-installed --no-cache-dir numpy # Get the latest version from the dist directory. COPY common/dist/data-refinery-common-* common/ -RUN pip3 install common/$(ls common -1 | sort --version-sort | tail -1) +RUN pip3 install --ignore-installed --no-cache-dir \ + common/$(ls common -1 | sort --version-sort | tail -1) + +COPY workers/data_refinery_workers/processors/requirements.txt . +RUN pip3 install --ignore-installed --no-cache-dir -r requirements.txt # Install SalmonTools. -RUN git clone https://github.com/COMBINE-lab/SalmonTools.git && cd SalmonTools && git checkout 3e6654c2c10a5225498b623056993947fa688afc -RUN cd SalmonTools && cmake . -DCMAKE_INSTALL_PREFIX=/usr/local && make install -RUN rm -rf SalmonTools +RUN git clone https://github.com/COMBINE-lab/SalmonTools.git && \ + cd SalmonTools && \ + git checkout 3e6654c2c10a5225498b623056993947fa688afc && \ + cmake . -DCMAKE_INSTALL_PREFIX=/usr/local && \ + make install && \ + rm -rf SalmonTools # Install sra-tools. ENV SRA_VERSION 2.9.1 -RUN wget "https://ftp.ncbi.nlm.nih.gov/sra/sdk/${SRA_VERSION}/sratoolkit.${SRA_VERSION}-ubuntu64.tar.gz" && \ - tar zxfv sratoolkit.${SRA_VERSION}-ubuntu64.tar.gz && \ - cp -r sratoolkit.${SRA_VERSION}-ubuntu64/bin/* /usr/bin - -# Clear out the pip3 cache. 
-RUN rm -rf /root/.cache +RUN wget -q "https://ftp.ncbi.nlm.nih.gov/sra/sdk/${SRA_VERSION}/sratoolkit.${SRA_VERSION}-ubuntu64.tar.gz" && \ + tar zxfv "sratoolkit.${SRA_VERSION}-ubuntu64.tar.gz" && \ + cp -r "sratoolkit.${SRA_VERSION}-ubuntu64/bin/"* /usr/bin ARG SYSTEM_VERSION diff --git a/workers/dockerfiles/Dockerfile.smasher b/workers/dockerfiles/Dockerfile.smasher index 313ba2150..00ea02baa 100644 --- a/workers/dockerfiles/Dockerfile.smasher +++ b/workers/dockerfiles/Dockerfile.smasher @@ -1,88 +1,26 @@ -FROM ubuntu:18.04 +FROM ccdlstaging/dr_worker_base:latest -# Prevent tzdata from prompting us for a timezone and hanging the build. -ENV DEBIAN_FRONTEND=noninteractive +# Fail in case of an error at any stage in the pipe. +SHELL ["/bin/bash", "-o", "pipefail", "-c"] -# Source: https://github.com/thisbejim/Pyrebase/issues/87#issuecomment-354452082 -# For whatever reason this worked and 'en_US.UTF-8' did not. -ENV LANG C.UTF-8 - -RUN apt-get update -RUN apt-get install -y software-properties-common -RUN add-apt-repository ppa:apt-fast/stable -RUN add-apt-repository ppa:deadsnakes/ppa -RUN add-apt-repository ppa:savoury1/llvm-defaults-10 - -RUN apt-get update -qq -RUN apt-get install -y apt-fast apt-transport-https - -# The packages related to R are somewhat weird, see the README for more details. -COPY workers/CRAN.gpg . -RUN apt-key add CRAN.gpg -RUN echo "deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/" \ - >> /etc/apt/sources.list.d/added_repos.list - -RUN apt-fast update -qq && apt-fast install -y \ - build-essential \ - cmake \ - curl \ - cython3 \ - ed \ - gfortran \ - git \ - libcairo-dev \ - libcurl4-openssl-dev \ - libedit-dev \ - libblas-dev \ - liblapack-dev \ - libpq-dev \ - libssl-dev \ - libxml2-dev \ - llvm-10-dev \ - lsb-release \ - mercurial \ - pkg-config \ - python3-pip \ - python3.8 \ - python3.8-dev \ - r-base-core \ - wget - -RUN rm CRAN.gpg -RUN apt-get clean; rm -rf /var/lib/apt/lists/* -RUN ln -s /usr/bin/llvm-config-10 /usr/bin/llvm-config -RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1 - -RUN groupadd user && useradd --create-home --home-dir /home/user -g user user WORKDIR /home/user -# We need a few special packages for QN -ENV R_LIBS "/usr/local/lib/R/site-library" - -COPY common/install_devtools.R . -RUN Rscript install_devtools.R - -COPY workers/R_dependencies/qn/dependencies.R . -RUN Rscript dependencies.R - -COPY workers/qn_dependencies.R . -RUN Rscript qn_dependencies.R -# End QN-specific +RUN pip3 install --no-cache-dir --upgrade pip && \ + pip3 install --ignore-installed --no-cache-dir nose numpy rpy2==3.4.5 -RUN pip3 install --upgrade pip -# Smasher-specific requirements -RUN pip3 install --ignore-installed nose numpy rpy2==3.4.5 -# End smasher-specific +# Get the latest version from the dist directory. +COPY common/dist/data-refinery-common-* common/ +RUN pip3 install --ignore-installed --no-cache-dir \ + common/$(ls common -1 | sort --version-sort | tail -1) COPY workers/data_refinery_workers/processors/requirements.txt . -RUN pip3 install --ignore-installed -r requirements.txt +RUN pip3 install --ignore-installed --no-cache-dir -r requirements.txt -# Get the latest version from the dist directory. -COPY common/dist/data-refinery-common-* common/ -RUN pip3 install common/$(ls common -1 | sort --version-sort | tail -1) +COPY workers/R/dependencies/qn/cran/dependencies.R . +RUN Rscript dependencies.R -# Clear out the pip3 cache. 
-RUN rm -rf /root/.cache +COPY workers/R/dependencies/qn/bioc/dependencies.R . +RUN Rscript dependencies.R ARG SYSTEM_VERSION diff --git a/workers/dockerfiles/Dockerfile.transcriptome b/workers/dockerfiles/Dockerfile.transcriptome index d1ac0ea63..1c04e7390 100644 --- a/workers/dockerfiles/Dockerfile.transcriptome +++ b/workers/dockerfiles/Dockerfile.transcriptome @@ -1,87 +1,41 @@ -FROM ubuntu:18.04 +FROM ccdlstaging/dr_worker_base:latest -# Prevent tzdata from prompting us for a timezone and hanging the build. -ENV DEBIAN_FRONTEND=noninteractive +# Fail in case of an error at any stage in the pipe. +SHELL ["/bin/bash", "-o", "pipefail", "-c"] -# Source: https://github.com/thisbejim/Pyrebase/issues/87#issuecomment-354452082 -# For whatever reason this worked and 'en_US.UTF-8' did not. -ENV LANG C.UTF-8 - -RUN apt-get update -qq -RUN apt-get install -y software-properties-common -RUN add-apt-repository ppa:apt-fast/stable -RUN add-apt-repository ppa:deadsnakes/ppa -RUN add-apt-repository ppa:savoury1/llvm-defaults-10 - -RUN apt-get update -qq -RUN apt-get -y install apt-fast - -RUN apt-fast update -qq && apt-fast install -y \ - build-essential \ - curl \ - cython3 \ - ed \ - git \ - libcurl4-openssl-dev \ - libfreetype6-dev \ - libpq-dev \ - llvm-10-dev \ - pkg-config \ - python3-pip \ - python3.8 \ - python3.8-dev \ - wget \ - zlib1g-dev - -RUN rm CRAN.gpg -RUN apt-get clean; rm -rf /var/lib/apt/lists/* -RUN ln -s /usr/bin/llvm-config-10 /usr/bin/llvm-config -RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1 - -RUN groupadd user && useradd --create-home --home-dir /home/user -g user user WORKDIR /home/user # It's annoying that this can only be installed via git. -RUN git clone https://github.com/deweylab/RSEM.git -RUN cd RSEM && make install -RUN rm -rf RSEM - -# Install Salmon +RUN git clone https://github.com/deweylab/RSEM.git && \ + cd RSEM && make install && \ + rm -rf RSEM -# Tximport requires all experiments to be processed with the same version of Salmon to work https://github.com/AlexsLemonade/refinebio/issues/1496 +# Install Salmon. +# Tximport requires all experiments to be processed with the same version of +# Salmon to work https://github.com/AlexsLemonade/refinebio/issues/1496. # This is something that should be considered when updating salmon, because # all samples from incomplete experiments must have salmon run on them again. ENV SALMON_VERSION 0.13.1 -# Doesn't work: -# salmon: relocation error: /usr/local/bin/../lib/librt.so.1: symbol __vdso_clock_gettime, version GLIBC_PRIVATE not defined in file libc.so.6 with link time reference -# ENV SALMON_VERSION 0.10.0 - -# Binary releases moved to bioconda, doesn't work anymore. -# ENV SALMON_VERSION 0.10.2 - -RUN wget https://github.com/COMBINE-lab/salmon/releases/download/v${SALMON_VERSION}/Salmon-${SALMON_VERSION}_linux_x86_64.tar.gz -RUN tar -xzf Salmon-${SALMON_VERSION}_linux_x86_64.tar.gz - # Salmon can extract to a different directory than the name of the tar file. 
-RUN cp `tar -tzf Salmon-${SALMON_VERSION}_linux_x86_64.tar.gz | head -1 | cut -f1 -d"/"`/bin/salmon /usr/local/bin -RUN cp `tar -tzf Salmon-${SALMON_VERSION}_linux_x86_64.tar.gz | head -1 | cut -f1 -d"/"`/lib/* /usr/local/lib - -RUN rm -r Salmon* +RUN wget -q "https://github.com/COMBINE-lab/salmon/releases/download/v${SALMON_VERSION}/Salmon-${SALMON_VERSION}_linux_x86_64.tar.gz" && \ + tar -xzf "Salmon-${SALMON_VERSION}_linux_x86_64.tar.gz" && \ + cp "$(tar -tzf Salmon-${SALMON_VERSION}_linux_x86_64.tar.gz | \ + head -1 | cut -f1 -d '/')/bin/salmon" /usr/local/bin && \ + cp "$(tar -tzf Salmon-${SALMON_VERSION}_linux_x86_64.tar.gz | \ + head -1 | cut -f1 -d '/')/lib/"* /usr/local/lib/ && \ + rm -r Salmon* # End Salmon installation. -RUN pip3 install --upgrade pip -RUN pip3 install numpy +RUN pip3 install --ignore-installed --no-cache-dir --upgrade pip && \ + pip3 install --ignore-installed --no-cache-dir numpy -COPY workers/data_refinery_workers/processors/requirements.txt . -RUN pip3 install --ignore-installed -r requirements.txt - -# Get the latest version from the dist directory. COPY common/dist/data-refinery-common-* common/ -RUN pip3 install common/$(ls common -1 | sort --version-sort | tail -1) +RUN pip3 install --ignore-installed --no-cache-dir \ + common/$(ls common -1 | sort --version-sort | tail -1) -# Clear out the pip3 cache. -RUN rm -rf /root/.cache +COPY workers/data_refinery_workers/processors/requirements.txt . +RUN pip3 install --ignore-installed --no-cache-dir -r requirements.txt ARG SYSTEM_VERSION diff --git a/workers/dockerfiles/Dockerfile.worker_base b/workers/dockerfiles/Dockerfile.worker_base new file mode 100644 index 000000000..d8f1a0588 --- /dev/null +++ b/workers/dockerfiles/Dockerfile.worker_base @@ -0,0 +1,68 @@ +FROM ubuntu:18.04 + +# Fail in case of an error at any stage in the pipe. +SHELL ["/bin/bash", "-o", "pipefail", "-c"] + +WORKDIR /home/user + +# Prevent tzdata from prompting us for a timezone and hanging the build. +ENV DEBIAN_FRONTEND=noninteractive + +# Source: https://github.com/thisbejim/Pyrebase/issues/87#issuecomment-354452082 +# For whatever reason this worked and 'en_US.UTF-8' did not. +ENV LANG=C.UTF-8 + +RUN apt-get update && \ + apt-get install --no-install-recommends -y software-properties-common && \ + add-apt-repository ppa:apt-fast/stable && \ + add-apt-repository ppa:deadsnakes/ppa && \ + add-apt-repository ppa:savoury1/llvm-defaults-10 && \ + apt-get update -qq && \ + apt-get install --no-install-recommends -y apt-fast apt-transport-https gpg-agent + +# The packages related to R are somewhat weird, see the README for more details. +COPY workers/CRAN.gpg . +RUN apt-key add CRAN.gpg && \ + echo "deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/" \ + >> /etc/apt/sources.list.d/added_repos.list + +RUN apt-fast update -qq && apt-fast install -y \ + build-essential \ + ccache \ + cmake \ + curl \ + cython3 \ + ed \ + git \ + libcairo-dev \ + libcurl4-gnutls-dev \ + libedit-dev \ + libgit2-dev \ + libpq-dev \ + libssl-dev \ + libxml2-dev \ + llvm-10-dev \ + lsb-release \ + mercurial \ + pkg-config \ + python3-pip \ + python3.8 \ + python3.8-dev \ + r-base-core \ + wget && \ + rm CRAN.gpg && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* && \ + ln -s /usr/bin/llvm-config-10 /usr/bin/llvm-config && \ + update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1 && \ + groupadd user && useradd --create-home --home-dir /home/user -g user user + +# Set up ccache. 
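Note: the symlinks created just below put ccache ahead of c++/cc/g++/gcc on the PATH, so compiler invocations made while R packages are compiled from source are routed through the compiler cache. The contents of workers/ccache.conf are not shown in this patch; once an image is built, the cache can be inspected with the stock ccache CLI, which is exactly what Dockerfile.affymetrix does later in this series:

ccache --show-stats   # report the hit/miss counters for the build that just ran
ccache --clear        # wipe the cache once the statistics have been reviewed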
+COPY workers/ccache.conf /root/.ccache/ccache.conf +RUN for i in c++ cc g++ gcc; do ln -s /usr/bin/ccache /usr/local/bin/$i; done +ENV PATH="/usr/local/bin:${PATH}" + +# Pre-install dev tools. +ENV R_LIBS="/usr/local/lib/R/site-library" +COPY common/R/install_devtools.R . +RUN Rscript install_devtools.R diff --git a/workers/install_affy_only.R b/workers/install_affy_only.R deleted file mode 100644 index b488a0287..000000000 --- a/workers/install_affy_only.R +++ /dev/null @@ -1,38 +0,0 @@ -# Turn warnings into errors because biocLite throws warnings instead -# of error if it fails to install something. -options(warn=2) -options(repos=structure(c(CRAN="https://cloud.r-project.org"))) -options(Ncpus=parallel::detectCores()) - -# Use devtools::install_version() to install packages in cran. -devtools::install_version('dplyr', version='1.0.0') -devtools::install_version('tidyr', version='1.1.0') -devtools::install_version('ff', version='2.2-14') -devtools::install_version('locfit', version='1.5-9.4') - -# Helper function that installs a list of packages using the input URLs -install_with_url <- function(urls) { - pkg_ids <- devtools::install_url(urls) - if(any(is.na(pkg_ids))) { - pkg_fails <- paste(urls[is.na(pkg_ids)], collapse = "; ") - stop(paste("Failed to install package(s):", pkg_fails )) - } - return(pkg_ids) -} - -bioc_pkgs <- c( - 'https://bioconductor.org/packages/3.11/bioc/src/contrib/oligoClasses_1.50.4.tar.gz', - 'https://bioconductor.org/packages/3.11/bioc/src/contrib/oligo_1.52.1.tar.gz', - 'https://bioconductor.org/packages/3.11/bioc/src/contrib/GEOquery_2.56.0.tar.gz', - 'https://bioconductor.org/packages/3.11/bioc/src/contrib/SCAN.UPC_2.30.0.tar.gz', - 'https://bioconductor.org/packages/3.11/bioc/src/contrib/affy_1.66.0.tar.gz', - 'https://bioconductor.org/packages/3.11/bioc/src/contrib/affyio_1.58.0.tar.gz', - 'https://bioconductor.org/packages/3.11/bioc/src/contrib/AnnotationDbi_1.50.3.tar.gz', - 'https://bioconductor.org/packages/3.11/bioc/src/contrib/zlibbioc_1.34.0.tar.gz', - 'https://bioconductor.org/packages/3.11/bioc/src/contrib/preprocessCore_1.50.0.tar.gz', - 'https://bioconductor.org/packages/3.11/bioc/src/contrib/genefilter_1.70.0.tar.gz', - 'https://bioconductor.org/packages/3.11/bioc/src/contrib/sva_3.36.0.tar.gz', - 'https://bioconductor.org/packages/3.11/bioc/src/contrib/tximport_1.16.1.tar.gz', - 'https://bioconductor.org/packages/3.11/bioc/src/contrib/limma_3.44.3.tar.gz' -) -install_with_url(bioc_pkgs) diff --git a/workers/install_gene_convert.R b/workers/install_gene_convert.R deleted file mode 100644 index 5e053f9c6..000000000 --- a/workers/install_gene_convert.R +++ /dev/null @@ -1,43 +0,0 @@ -# Turn warnings into errors because biocLite throws warnings instead -# of error if it fails to install something. 
-options(warn=2) -options(Ncpus=parallel::detectCores()) -options(repos=structure(c(CRAN="https://cloud.r-project.org"))) - -# Helper function that installs a list of packages using the input URLs -install_with_url <- function(urls) { - pkg_ids <- devtools::install_url(urls) - if(any(is.na(pkg_ids))) { - pkg_fails <- paste(urls[is.na(pkg_ids)], collapse = "; ") - stop(paste("Failed to install package(s):", pkg_fails )) - } - return(pkg_ids) -} - -devtools::install_version('dplyr', version='1.0.2') - -bioc_pkgs <- c( - 'https://bioconductor.org/packages/3.11/bioc/src/contrib/AnnotationDbi_1.50.3.tar.gz' -) -install_with_url(bioc_pkgs) - -illumina_pkgs <- c( - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/illuminaHumanv1.db_1.26.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/illuminaHumanv2.db_1.26.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/illuminaHumanv3.db_1.26.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/illuminaHumanv4.db_1.26.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/illuminaMousev1.db_1.26.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/illuminaMousev1p1.db_1.26.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/illuminaMousev2.db_1.26.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/illuminaRatv1.db_1.26.0.tar.gz' -) -install_with_url(illumina_pkgs) - -# Load these libraries because apparently just installing them isn't -# enough to verify that they have complementary versions. -library("optparse") -library(data.table) -library("dplyr") -library("rlang") -library(lazyeval) -library(AnnotationDbi) diff --git a/workers/install_tximport.R b/workers/install_tximport.R deleted file mode 100644 index 2889fc630..000000000 --- a/workers/install_tximport.R +++ /dev/null @@ -1,7 +0,0 @@ -# Turn warnings into errors because biocLite throws warnings instead -# of error if it fails to install something. -options(warn=2) -options(Ncpus=parallel::detectCores()) -options(repos=structure(c(CRAN="https://cloud.r-project.org"))) - -devtools::install_url('https://bioconductor.org/packages/3.11/bioc/src/contrib/tximport_1.16.1.tar.gz') From 5f3b5625a0c13a72a9e191ff7c15255cb0fdd415 Mon Sep 17 00:00:00 2001 From: Arkadii Yakovets Date: Tue, 22 Nov 2022 13:45:21 -0800 Subject: [PATCH 20/24] Update docker images. 
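This consolidates the shared setup into a single base image: the old workers/dockerfiles/Dockerfile.worker_base is deleted, common/dockerfiles/Dockerfile.base is added in its place, and the per-worker images begin switching their FROM lines to the rebuilt ccdlstaging/dr_base image (smasher and transcriptome here, with downloaders, illumina, no_op and salmon following in the next commit), so each worker Dockerfile only carries its own R and Python layers. The implied build order is base first, workers second; roughly as below, assuming common/dockerfiles/Dockerfile.base is what produces the ccdlstaging/dr_base tag referenced in the diffs (the dr_smasher tag and the exact docker build invocations are illustrative assumptions, not taken from this patch):

# Build the shared base from the repository root, then a worker on top of it.
docker build -f common/dockerfiles/Dockerfile.base \
    -t ccdlstaging/dr_base:latest .
docker build -f workers/dockerfiles/Dockerfile.smasher \
    -t ccdlstaging/dr_smasher:latest .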
--- common/dockerfiles/Dockerfile.base | 65 +++++++++++ .../dependencies/illumina/bioc/dependencies.R | 36 +++---- workers/dockerfiles/Dockerfile.affymetrix | 41 ++++--- .../dockerfiles/Dockerfile.affymetrix_local | 3 +- workers/dockerfiles/Dockerfile.compendia | 102 +++++++++--------- workers/dockerfiles/Dockerfile.downloaders | 23 ++-- workers/dockerfiles/Dockerfile.illumina | 12 ++- workers/dockerfiles/Dockerfile.no_op | 23 ++-- workers/dockerfiles/Dockerfile.salmon | 28 ++--- workers/dockerfiles/Dockerfile.smasher | 27 ++--- workers/dockerfiles/Dockerfile.transcriptome | 20 ++-- workers/dockerfiles/Dockerfile.worker_base | 68 ------------ 12 files changed, 238 insertions(+), 210 deletions(-) create mode 100644 common/dockerfiles/Dockerfile.base delete mode 100644 workers/dockerfiles/Dockerfile.worker_base diff --git a/common/dockerfiles/Dockerfile.base b/common/dockerfiles/Dockerfile.base new file mode 100644 index 000000000..5b833cef9 --- /dev/null +++ b/common/dockerfiles/Dockerfile.base @@ -0,0 +1,65 @@ +FROM ubuntu:18.04 + +# Fail in case of an error at any stage in the pipe. +SHELL ["/bin/bash", "-o", "pipefail", "-c"] + +WORKDIR /home/user + +# Prevent tzdata from prompting us for a timezone and hanging the build. +ENV DEBIAN_FRONTEND=noninteractive + +# Source: https://github.com/thisbejim/Pyrebase/issues/87#issuecomment-354452082 +# For whatever reason this worked and 'en_US.UTF-8' did not. +ENV LANG=C.UTF-8 + +COPY workers/CRAN.gpg . +RUN apt-get update && \ + apt-get install --no-install-recommends -y software-properties-common && \ + add-apt-repository ppa:apt-fast/stable && \ + add-apt-repository ppa:deadsnakes/ppa && \ + add-apt-repository ppa:savoury1/llvm-defaults-10 && \ + apt-get update -qq && \ + apt-get install --no-install-recommends -y \ + apt-fast \ + apt-transport-https \ + gpg-agent && \ + apt-key add CRAN.gpg && \ + echo "deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/" \ + >> /etc/apt/sources.list.d/added_repos.list && \ + apt-fast update -qq && apt-fast install -y \ + build-essential \ + cmake \ + curl \ + cython3 \ + ed \ + git \ + libcairo-dev \ + libcurl4-gnutls-dev \ + libedit-dev \ + libgit2-dev \ + libpq-dev \ + libssl-dev \ + libxml2-dev \ + llvm-10-dev \ + lsb-release \ + mercurial \ + pkg-config \ + python3-pip \ + python3.8 \ + python3.8-dev \ + r-base-core \ + wget && \ + apt-get clean && \ + rm CRAN.gpg && \ + rm -rf /var/lib/apt/lists/* && \ + ln -s /usr/bin/llvm-config-10 /usr/bin/llvm-config && \ + update-alternatives --install \ + /usr/bin/python3 python3 /usr/bin/python3.8 1 && \ + groupadd user && \ + useradd --create-home --home-dir /home/user/ -g user user && \ + chown -R user /home/user/ + +# Pre-install dev tools. +ENV R_LIBS=/usr/local/lib/R/site-library +COPY common/R/install_devtools.R . 
+RUN Rscript install_devtools.R diff --git a/workers/R/dependencies/illumina/bioc/dependencies.R b/workers/R/dependencies/illumina/bioc/dependencies.R index b3e598f79..06aee98a5 100644 --- a/workers/R/dependencies/illumina/bioc/dependencies.R +++ b/workers/R/dependencies/illumina/bioc/dependencies.R @@ -1,36 +1,36 @@ -options(warn=2) -options(repos=structure(c(CRAN="https://cloud.r-project.org"))) -options(Ncpus=parallel::detectCores()) +options(warn = 2) +options(repos = structure(c(CRAN = "https://cloud.r-project.org"))) +options(Ncpus = parallel::detectCores()) # Helper function that installs a list of packages using the input URLs install_with_url <- function(urls) { pkg_ids <- devtools::install_url(urls) - if(any(is.na(pkg_ids))) { + if (any(is.na(pkg_ids))) { pkg_fails <- paste(urls[is.na(pkg_ids)], collapse = "; ") - stop(paste("Failed to install package(s):", pkg_fails )) + stop(paste("Failed to install package(s):", pkg_fails)) } return(pkg_ids) } -devtools::install_version('dplyr', version='1.0.2') +devtools::install_version("dplyr", version = "1.0.2") bioc_pkgs <- c( - 'https://bioconductor.org/packages/3.11/bioc/src/contrib/oligoClasses_1.50.4.tar.gz', - 'https://bioconductor.org/packages/3.11/bioc/src/contrib/oligo_1.52.1.tar.gz', - 'https://bioconductor.org/packages/3.11/bioc/src/contrib/AnnotationDbi_1.50.3.tar.gz', - 'https://bioconductor.org/packages/3.11/bioc/src/contrib/limma_3.44.3.tar.gz' + "https://bioconductor.org/packages/3.11/bioc/src/contrib/oligoClasses_1.50.4.tar.gz", + "https://bioconductor.org/packages/3.11/bioc/src/contrib/oligo_1.52.1.tar.gz", + "https://bioconductor.org/packages/3.11/bioc/src/contrib/AnnotationDbi_1.50.3.tar.gz", + "https://bioconductor.org/packages/3.11/bioc/src/contrib/limma_3.44.3.tar.gz" ) install_with_url(bioc_pkgs) illumina_pkgs <- c( - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/illuminaHumanv1.db_1.26.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/illuminaHumanv2.db_1.26.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/illuminaHumanv3.db_1.26.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/illuminaHumanv4.db_1.26.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/illuminaMousev1.db_1.26.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/illuminaMousev1p1.db_1.26.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/illuminaMousev2.db_1.26.0.tar.gz', - 'https://bioconductor.org/packages/3.11/data/annotation/src/contrib/illuminaRatv1.db_1.26.0.tar.gz' + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/illuminaHumanv1.db_1.26.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/illuminaHumanv2.db_1.26.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/illuminaHumanv3.db_1.26.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/illuminaHumanv4.db_1.26.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/illuminaMousev1.db_1.26.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/illuminaMousev1p1.db_1.26.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/illuminaMousev2.db_1.26.0.tar.gz", + "https://bioconductor.org/packages/3.11/data/annotation/src/contrib/illuminaRatv1.db_1.26.0.tar.gz" ) install_with_url(illumina_pkgs) diff --git 
a/workers/dockerfiles/Dockerfile.affymetrix b/workers/dockerfiles/Dockerfile.affymetrix index 3ee59ea1a..e90300240 100644 --- a/workers/dockerfiles/Dockerfile.affymetrix +++ b/workers/dockerfiles/Dockerfile.affymetrix @@ -5,18 +5,12 @@ SHELL ["/bin/bash", "-o", "pipefail", "-c"] WORKDIR /home/user -RUN pip3 install --ignore-installed --no-cache-dir --upgrade pip setuptools && \ - # Install this one here instead of via requirements.txt because not - # all processors need it. - pip3 install rpy2==3.4.5 - -# Get the latest version from the dist directory. -COPY common/dist/data-refinery-common-* common/ -RUN pip3 install --ignore-installed --no-cache-dir \ - common/$(ls common -1 | sort --version-sort | tail -1) - -COPY workers/data_refinery_workers/processors/requirements.txt . -RUN pip3 install --ignore-installed --no-cache-dir -r requirements.txt +# Install and set up ccache. +RUN apt-get update && \ + apt-get install --no-install-recommends -y ccache && \ + for i in c++ cc g++ gcc; do ln -fs /usr/bin/ccache /usr/local/bin/$i; done +COPY workers/ccache.conf /root/.ccache/ccache.conf +ENV PATH="/usr/local/bin:${PATH}" COPY workers/R/dependencies/affymetrix/cran/dependencies.R dependencies.R RUN Rscript dependencies.R @@ -26,9 +20,28 @@ COPY workers/R/dependencies/affymetrix/install_ensg_pkgs.R . COPY workers/R/dependencies/affymetrix/bioc/dependencies.R dependencies.R RUN Rscript dependencies.R +RUN pip3 install --ignore-installed --upgrade pip setuptools && \ + # Install this one here instead of via requirements.txt because not + # all processors need it. + pip3 install --ignore-installed rpy2==3.4.5 + +# Get the latest version from the dist directory. +COPY common/dist/data-refinery-common-* common/ +RUN pip3 install --ignore-installed \ + common/$(ls common -1 | sort --version-sort | tail -1) + +COPY workers/data_refinery_workers/processors/requirements.txt . +RUN pip3 install --ignore-installed -r requirements.txt && \ + # Clear out the pip3 cache. + rm -r /root/.cache + ARG SYSTEM_VERSION +ENV SYSTEM_VERSION=$SYSTEM_VERSION -ENV SYSTEM_VERSION $SYSTEM_VERSION +# Print compiler cache stats (the cache hit ratio should be ~90%). +RUN ccache --show-stats && \ + # Clear out the ccache. + ccache --clear USER user @@ -36,6 +49,4 @@ COPY .boto .boto COPY config/ config/ COPY workers/ . -RUN ccache -s - ENTRYPOINT [] diff --git a/workers/dockerfiles/Dockerfile.affymetrix_local b/workers/dockerfiles/Dockerfile.affymetrix_local index 3d7eff18f..7f1af8ad6 100644 --- a/workers/dockerfiles/Dockerfile.affymetrix_local +++ b/workers/dockerfiles/Dockerfile.affymetrix_local @@ -15,8 +15,7 @@ RUN pip3 install --ignore-installed --no-cache-dir \ common/$(ls common -1 | sort --version-sort | tail -1) ARG SYSTEM_VERSION - -ENV SYSTEM_VERSION $SYSTEM_VERSION +ENV SYSTEM_VERSION=$SYSTEM_VERSION USER user diff --git a/workers/dockerfiles/Dockerfile.compendia b/workers/dockerfiles/Dockerfile.compendia index 82e14df01..f2afa185f 100644 --- a/workers/dockerfiles/Dockerfile.compendia +++ b/workers/dockerfiles/Dockerfile.compendia @@ -1,13 +1,11 @@ -FROM nvidia/cuda:11.8.0-runtime-ubuntu18.04 - # This is very similar to the `smasher` image, but comes with OpenBLAS and some # of the other libraries required for fancyimpute. +FROM nvidia/cuda:11.8.0-runtime-ubuntu18.04 + # Fail in case of an error at any stage in the pipe. SHELL ["/bin/bash", "-o", "pipefail", "-c"] -WORKDIR /home/user - # Prevent tzdata from prompting us for a timezone and hanging the build. 
ENV DEBIAN_FRONTEND=noninteractive @@ -33,45 +31,52 @@ RUN echo debconf apt-fast/maxdownloads string 16 | debconf-set-selections && \ apt-get install --no-install-recommends -y apt-fast apt-transport-https gpg-agent tzdata && \ apt-key add CRAN.gpg && \ echo "deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/" \ - >> /etc/apt/sources.list.d/added_repos.list && \ + >> /etc/apt/sources.list.d/added_repos.list && \ apt-fast update -qq && \ apt-fast install -y \ - build-essential \ - ccache \ - cmake \ - curl \ - cython3 \ - ed \ - gfortran \ - git \ - libblas-dev \ - libcairo-dev \ - libcurl4-openssl-dev \ - libedit-dev \ - liblapack-dev \ - libpq-dev \ - libssl-dev \ - libxml2-dev \ - llvm-10-dev \ - lsb-release \ - mercurial \ - pkg-config \ - python3-pip \ - python3.8 \ - python3.8-dev \ - r-base-core \ - wget && \ + build-essential \ + ccache \ + cmake \ + curl \ + cython3 \ + ed \ + gfortran \ + git \ + libblas-dev \ + libcairo-dev \ + libcurl4-openssl-dev \ + libedit-dev \ + liblapack-dev \ + libpq-dev \ + libssl-dev \ + libxml2-dev \ + llvm-10-dev \ + lsb-release \ + mercurial \ + pkg-config \ + python3-pip \ + python3.8 \ + python3.8-dev \ + r-base-core \ + wget && \ + apt-get clean && \ rm CRAN.gpg && \ - apt-get clean; rm -rf /var/lib/apt/lists/* && \ - ln -s /usr/bin/llvm-config-10 /usr/bin/llvm-config & \ - update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1 && \ - groupadd user && useradd --create-home --home-dir /home/user -g user user && \ - wget -q https://bitbucket.org/ariya/phantomjs/downloads/phantomjs-2.1.1-linux-x86_64.tar.bz2 && \ + rm -rf /var/lib/apt/lists/* && \ + ln -s /usr/bin/llvm-config-10 /usr/bin/llvm-config && \ + update-alternatives --install /usr/bin/python3 python3 \ + /usr/bin/python3.8 1 && \ + groupadd user && \ + useradd --create-home --home-dir /home/user -g user user + +WORKDIR /home/user +RUN wget -q https://bitbucket.org/ariya/phantomjs/downloads/\ +phantomjs-2.1.1-linux-x86_64.tar.bz2 && \ tar xvjf phantomjs-2.1.1-linux-x86_64.tar.bz2 -C /usr/local/share/ && \ - ln -s /usr/local/share/phantomjs-2.1.1-linux-x86_64/bin/phantomjs /usr/local/bin/ + ln -s /usr/local/share/phantomjs-2.1.1-linux-x86_64/bin/phantomjs \ + /usr/local/bin/ # We need a few special packages for QN. -ENV R_LIBS="/usr/local/lib/R/site-library" +ENV R_LIBS=/usr/local/lib/R/site-library COPY common/R/install_devtools.R . RUN Rscript install_devtools.R @@ -83,22 +88,23 @@ COPY workers/R/dependencies/qn/bioc/dependencies.R . RUN Rscript dependencies.R # End QN-specific +RUN pip3 install --ignore-installed --upgrade pip && \ + pip3 install --ignore-installed numpy scipy matplotlib \ + pandas==0.25.3 scikit-learn sympy nose rpy2===3.4.5 tzlocal fancySVD + +COPY workers/data_refinery_workers/processors/requirements.txt . +RUN pip3 install --ignore-installed -r requirements.txt && \ + # Pin setuptools as a workaround for + # https://github.com/pypa/setuptools/issues/3693 + pip3 install --ignore-installed setuptools==65.0.1 numpy==1.16.0 + # Get the latest version from the dist directory. 
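Note: the `$(ls common -1 | sort --version-sort | tail -1)` idiom used in the install lines around here depends on version sort rather than plain lexical sort, because lexically a 1.9.x artifact would outrank 1.10.x. For example (the version numbers are made up for illustration):

printf 'data-refinery-common-1.9.0\ndata-refinery-common-1.10.0\n' \
    | sort --version-sort | tail -1
# prints: data-refinery-common-1.10.0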
COPY common/dist/data-refinery-common-* common/ -RUN pip3 install --ignore-installed --no-cache-dir \ +RUN pip3 install --ignore-installed \ common/$(ls common -1 | sort --version-sort | tail -1) -RUN pip3 install --ignore-installed --no-cache-dir --upgrade pip && \ - pip3 install --ignore-installed --no-cache-dir numpy scipy matplotlib \ - pandas==0.25.3 scikit-learn sympy nose rpy2===3.4.5 tzlocal fancySVD - -COPY workers/data_refinery_workers/processors/requirements.txt . -RUN pip3 install --ignore-installed --no-cache-dir -r requirements.txt && \ - pip3 install --ignore-installed --no-cache-dir numpy==1.16.0 - ARG SYSTEM_VERSION - -ENV SYSTEM_VERSION $SYSTEM_VERSION +ENV SYSTEM_VERSION=$SYSTEM_VERSION USER user diff --git a/workers/dockerfiles/Dockerfile.downloaders b/workers/dockerfiles/Dockerfile.downloaders index 3a5974491..c65faf5dc 100644 --- a/workers/dockerfiles/Dockerfile.downloaders +++ b/workers/dockerfiles/Dockerfile.downloaders @@ -10,12 +10,16 @@ RUN Rscript install_downloader_R_only.R # Aspera will only install as the current user. # Even using `su - user &&` doesn't work... -USER user +USER user # Install Aspera. We have to install it using Holland Computing Center's conda # repo because download.asperasoft.com now returns 403s -RUN wget -q https://anaconda.org/HCC/aspera-cli/3.9.1/download/linux-64/aspera-cli-3.9.1-0.tar.bz2 && \ - [ "$(sha256sum aspera-cli-3.9.1-0.tar.bz2 | cut -d ' ' -f1)" = 60a09a7f3795186954079869106aa89a64183b7be8e0da7cbbe9d57c66c9bcdb ] && \ + +RUN wget -q "https://anaconda.org/HCC/aspera-cli/3.9.1/download/\ +linux-64/aspera-cli-3.9.1-0.tar.bz2" && \ + [ "$(sha256sum aspera-cli-3.9.1-0.tar.bz2 | cut -d ' ' -f1)" =\ + "60a09a7f3795186954079869106aa89a64183b7be8e0da7cbbe9d57c66c9bcdb" ] && \ + rm -rf .aspera && \ mkdir -p .aspera/cli && \ tar xf aspera-cli-3.9.1-0.tar.bz2 -C .aspera/cli && \ rm aspera-cli-3.9.1-0.tar.bz2 @@ -23,20 +27,21 @@ RUN wget -q https://anaconda.org/HCC/aspera-cli/3.9.1/download/linux-64/aspera-c # Now that we're done installing Aspera go back to being root for a bit. USER root -RUN pip3 install --ignore-installed --no-cache-dir --upgrade pip && \ - pip3 install --ignore-installed --no-cache-dir rpy2==3.4.5 +RUN pip3 install --ignore-installed --upgrade pip && \ + pip3 install --ignore-installed rpy2==3.4.5 # Get the latest version from the dist directory. COPY common/dist/data-refinery-common-* common/ -RUN pip3 install --ignore-installed --no-cache-dir \ +RUN pip3 install --ignore-installed \ common/$(ls common -1 | sort --version-sort | tail -1) COPY workers/data_refinery_workers/downloaders/requirements.txt . -RUN pip3 install --ignore-installed --no-cache-dir -r requirements.txt +RUN pip3 install --ignore-installed -r requirements.txt && \ + # Clear out the pip3 cache. + rm -r /root/.cache ARG SYSTEM_VERSION - -ENV SYSTEM_VERSION $SYSTEM_VERSION +ENV SYSTEM_VERSION=$SYSTEM_VERSION USER user diff --git a/workers/dockerfiles/Dockerfile.illumina b/workers/dockerfiles/Dockerfile.illumina index 949c22405..fe65bf7ab 100644 --- a/workers/dockerfiles/Dockerfile.illumina +++ b/workers/dockerfiles/Dockerfile.illumina @@ -14,16 +14,18 @@ RUN Rscript dependencies.R # Get the latest version from the dist directory. 
COPY common/dist/data-refinery-common-* common/ -RUN pip3 install --ignore-installed --no-cache-dir \ +RUN pip3 install --ignore-installed \ common/$(ls common -1 | sort --version-sort | tail -1) -RUN pip3 install --ignore-installed --no-cache-dir --upgrade pip +RUN pip3 install --ignore-installed --upgrade pip + COPY workers/data_refinery_workers/processors/requirements.txt . -RUN pip3 install --ignore-installed -r requirements.txt +RUN pip3 install --ignore-installed -r requirements.txt && \ + # Clear out the pip3 cache. + rm -r /root/.cache ARG SYSTEM_VERSION - -ENV SYSTEM_VERSION $SYSTEM_VERSION +ENV SYSTEM_VERSION=$SYSTEM_VERSION USER user diff --git a/workers/dockerfiles/Dockerfile.no_op b/workers/dockerfiles/Dockerfile.no_op index a0cb7855e..7f55589d0 100644 --- a/workers/dockerfiles/Dockerfile.no_op +++ b/workers/dockerfiles/Dockerfile.no_op @@ -14,29 +14,30 @@ RUN Rscript install_gene_convert.R # Noop-specific. RUN mkdir -p gene_indexes WORKDIR /home/user/gene_indexes -ENV ID_REFINERY_URL https://zenodo.org/record/1410647/files/all_1536267482.zip +ENV ID_REFINERY_URL=https://zenodo.org/record/1410647/files/all_1536267482.zip RUN curl -O $ID_REFINERY_URL && \ echo $ID_REFINERY_URL > /etc/identifier_refinery_url && \ unzip *.zip && \ rm *.zip # End Noop-specific. -WORKDIR /home/user - -RUN pip3 install --ignore-installed --no-cache-dir --upgrade pip && \ - pip3 install --ignore-installed --no-cache-dir numpy - -COPY workers/data_refinery_workers/processors/requirements.txt . -RUN pip3 install --ignore-installed --no-cache-dir -r requirements.txt +WORKDIR /home/user/ # Get the latest version from the dist directory. COPY common/dist/data-refinery-common-* common/ -RUN pip3 install --ignore-installed --no-cache-dir \ +RUN pip3 install --ignore-installed \ common/$(ls common -1 | sort --version-sort | tail -1) -ARG SYSTEM_VERSION +RUN pip3 install --ignore-installed --upgrade pip && \ + pip3 install --ignore-installed numpy + +COPY workers/data_refinery_workers/processors/requirements.txt . +RUN pip3 install --ignore-installed -r requirements.txt && \ + # Clear out the pip3 cache. + rm -r /root/.cache -ENV SYSTEM_VERSION $SYSTEM_VERSION +ARG SYSTEM_VERSION +ENV SYSTEM_VERSION=$SYSTEM_VERSION USER user diff --git a/workers/dockerfiles/Dockerfile.salmon b/workers/dockerfiles/Dockerfile.salmon index 724ddac60..1bb8de6e6 100644 --- a/workers/dockerfiles/Dockerfile.salmon +++ b/workers/dockerfiles/Dockerfile.salmon @@ -10,11 +10,13 @@ WORKDIR /home/user # Salmon to work https://github.com/AlexsLemonade/refinebio/issues/1496. # This is something that should be considered when updating salmon, because # all samples from incomplete experiments must have salmon run on them again. -ENV SALMON_VERSION 0.13.1 +ENV SALMON_VERSION=0.13.1 -# On version 0.13.1 salmon was being extracted to a folder with an all lowercase name -# the options `-C` and `--strip-components` allow us to specify the name for the resulting file -RUN wget -q "https://github.com/COMBINE-lab/salmon/releases/download/v${SALMON_VERSION}/Salmon-${SALMON_VERSION}_linux_x86_64.tar.gz" && \ +# On version 0.13.1 salmon was being extracted to a folder with an all +# lowercase name the options `-C` and `--strip-components` allow us to specify +#the name for the resulting file. 
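Note: as the comment above says, the `-C` plus `--strip-components` pair lets the build pick the output directory name instead of trusting whatever top-level folder the archive happens to contain. Generically (the file and directory names here are placeholders, not the Salmon release archive):

mkdir extracted
# Unpack the archive's contents into ./extracted, dropping its own top-level directory.
tar -xzf some-release.tar.gz -C extracted --strip-components 1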
+RUN wget -q "https://github.com/COMBINE-lab/salmon/releases/download/\ +v${SALMON_VERSION}/Salmon-${SALMON_VERSION}_linux_x86_64.tar.gz" && \ mkdir "Salmon-${SALMON_VERSION}_linux_x86_64" && \ tar -xzf "Salmon-${SALMON_VERSION}_linux_x86_64.tar.gz" \ -C "Salmon-${SALMON_VERSION}_linux_x86_64" --strip-components 1 && \ @@ -30,16 +32,18 @@ RUN Rscript tximport_dependencies.R COPY workers/R/dependencies/tximport/install_tximport.R . RUN Rscript install_tximport.R -RUN pip3 install --ignore-installed --no-cache-dir --upgrade pip && \ - pip3 install --ignore-installed --no-cache-dir numpy +RUN pip3 install --ignore-installed --upgrade pip && \ + pip3 install --ignore-installed numpy # Get the latest version from the dist directory. COPY common/dist/data-refinery-common-* common/ -RUN pip3 install --ignore-installed --no-cache-dir \ +RUN pip3 install --ignore-installed \ common/$(ls common -1 | sort --version-sort | tail -1) COPY workers/data_refinery_workers/processors/requirements.txt . -RUN pip3 install --ignore-installed --no-cache-dir -r requirements.txt +RUN pip3 install --ignore-installed -r requirements.txt && \ + # Clear out the pip3 cache. + rm -r /root/.cache # Install SalmonTools. RUN git clone https://github.com/COMBINE-lab/SalmonTools.git && \ @@ -50,14 +54,14 @@ RUN git clone https://github.com/COMBINE-lab/SalmonTools.git && \ rm -rf SalmonTools # Install sra-tools. -ENV SRA_VERSION 2.9.1 -RUN wget -q "https://ftp.ncbi.nlm.nih.gov/sra/sdk/${SRA_VERSION}/sratoolkit.${SRA_VERSION}-ubuntu64.tar.gz" && \ +ENV SRA_VERSION=2.9.1 +RUN wget -q "https://ftp.ncbi.nlm.nih.gov/sra/sdk/${SRA_VERSION}/\ +sratoolkit.${SRA_VERSION}-ubuntu64.tar.gz" && \ tar zxfv "sratoolkit.${SRA_VERSION}-ubuntu64.tar.gz" && \ cp -r "sratoolkit.${SRA_VERSION}-ubuntu64/bin/"* /usr/bin ARG SYSTEM_VERSION - -ENV SYSTEM_VERSION $SYSTEM_VERSION +ENV SYSTEM_VERSION=$SYSTEM_VERSION USER user diff --git a/workers/dockerfiles/Dockerfile.smasher b/workers/dockerfiles/Dockerfile.smasher index 00ea02baa..ebdf924a4 100644 --- a/workers/dockerfiles/Dockerfile.smasher +++ b/workers/dockerfiles/Dockerfile.smasher @@ -1,30 +1,31 @@ -FROM ccdlstaging/dr_worker_base:latest +FROM ccdlstaging/dr_base:latest # Fail in case of an error at any stage in the pipe. SHELL ["/bin/bash", "-o", "pipefail", "-c"] WORKDIR /home/user -RUN pip3 install --no-cache-dir --upgrade pip && \ - pip3 install --ignore-installed --no-cache-dir nose numpy rpy2==3.4.5 +COPY workers/R/dependencies/qn/cran/dependencies.R . +RUN Rscript dependencies.R + +COPY workers/R/dependencies/qn/bioc/dependencies.R . +RUN Rscript dependencies.R + +RUN pip3 install --ignore-installed --upgrade pip && \ + pip3 install --ignore-installed nose numpy rpy2==3.4.5 # Get the latest version from the dist directory. COPY common/dist/data-refinery-common-* common/ -RUN pip3 install --ignore-installed --no-cache-dir \ +RUN pip3 install --ignore-installed \ common/$(ls common -1 | sort --version-sort | tail -1) COPY workers/data_refinery_workers/processors/requirements.txt . -RUN pip3 install --ignore-installed --no-cache-dir -r requirements.txt - -COPY workers/R/dependencies/qn/cran/dependencies.R . -RUN Rscript dependencies.R - -COPY workers/R/dependencies/qn/bioc/dependencies.R . -RUN Rscript dependencies.R +RUN pip3 install --ignore-installed -r requirements.txt && \ + # Clear out the pip3 cache. 
+ rm -rf /root/.cache ARG SYSTEM_VERSION - -ENV SYSTEM_VERSION $SYSTEM_VERSION +ENV SYSTEM_VERSION=$SYSTEM_VERSION USER user diff --git a/workers/dockerfiles/Dockerfile.transcriptome b/workers/dockerfiles/Dockerfile.transcriptome index 1c04e7390..8ce0d974a 100644 --- a/workers/dockerfiles/Dockerfile.transcriptome +++ b/workers/dockerfiles/Dockerfile.transcriptome @@ -1,4 +1,4 @@ -FROM ccdlstaging/dr_worker_base:latest +FROM ccdlstaging/dr_base:latest # Fail in case of an error at any stage in the pipe. SHELL ["/bin/bash", "-o", "pipefail", "-c"] @@ -15,10 +15,11 @@ RUN git clone https://github.com/deweylab/RSEM.git && \ # Salmon to work https://github.com/AlexsLemonade/refinebio/issues/1496. # This is something that should be considered when updating salmon, because # all samples from incomplete experiments must have salmon run on them again. -ENV SALMON_VERSION 0.13.1 +ENV SALMON_VERSION=0.13.1 # Salmon can extract to a different directory than the name of the tar file. -RUN wget -q "https://github.com/COMBINE-lab/salmon/releases/download/v${SALMON_VERSION}/Salmon-${SALMON_VERSION}_linux_x86_64.tar.gz" && \ +RUN wget -q "https://github.com/COMBINE-lab/salmon/releases/download/\ +v${SALMON_VERSION}/Salmon-${SALMON_VERSION}_linux_x86_64.tar.gz" && \ tar -xzf "Salmon-${SALMON_VERSION}_linux_x86_64.tar.gz" && \ cp "$(tar -tzf Salmon-${SALMON_VERSION}_linux_x86_64.tar.gz | \ head -1 | cut -f1 -d '/')/bin/salmon" /usr/local/bin && \ @@ -27,19 +28,20 @@ RUN wget -q "https://github.com/COMBINE-lab/salmon/releases/download/v${SALMON_V rm -r Salmon* # End Salmon installation. -RUN pip3 install --ignore-installed --no-cache-dir --upgrade pip && \ - pip3 install --ignore-installed --no-cache-dir numpy +RUN pip3 install --ignore-installed --upgrade pip && \ + pip3 install --ignore-installed numpy COPY common/dist/data-refinery-common-* common/ -RUN pip3 install --ignore-installed --no-cache-dir \ +RUN pip3 install --ignore-installed \ common/$(ls common -1 | sort --version-sort | tail -1) COPY workers/data_refinery_workers/processors/requirements.txt . -RUN pip3 install --ignore-installed --no-cache-dir -r requirements.txt +RUN pip3 install --ignore-installed -r requirements.txt && \ + # Clear out the pip3 cache. + rm -rf /root/.cache ARG SYSTEM_VERSION - -ENV SYSTEM_VERSION $SYSTEM_VERSION +ENV SYSTEM_VERSION=$SYSTEM_VERSION USER user diff --git a/workers/dockerfiles/Dockerfile.worker_base b/workers/dockerfiles/Dockerfile.worker_base deleted file mode 100644 index d8f1a0588..000000000 --- a/workers/dockerfiles/Dockerfile.worker_base +++ /dev/null @@ -1,68 +0,0 @@ -FROM ubuntu:18.04 - -# Fail in case of an error at any stage in the pipe. -SHELL ["/bin/bash", "-o", "pipefail", "-c"] - -WORKDIR /home/user - -# Prevent tzdata from prompting us for a timezone and hanging the build. -ENV DEBIAN_FRONTEND=noninteractive - -# Source: https://github.com/thisbejim/Pyrebase/issues/87#issuecomment-354452082 -# For whatever reason this worked and 'en_US.UTF-8' did not. -ENV LANG=C.UTF-8 - -RUN apt-get update && \ - apt-get install --no-install-recommends -y software-properties-common && \ - add-apt-repository ppa:apt-fast/stable && \ - add-apt-repository ppa:deadsnakes/ppa && \ - add-apt-repository ppa:savoury1/llvm-defaults-10 && \ - apt-get update -qq && \ - apt-get install --no-install-recommends -y apt-fast apt-transport-https gpg-agent - -# The packages related to R are somewhat weird, see the README for more details. -COPY workers/CRAN.gpg . 
-RUN apt-key add CRAN.gpg && \ - echo "deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/" \ - >> /etc/apt/sources.list.d/added_repos.list - -RUN apt-fast update -qq && apt-fast install -y \ - build-essential \ - ccache \ - cmake \ - curl \ - cython3 \ - ed \ - git \ - libcairo-dev \ - libcurl4-gnutls-dev \ - libedit-dev \ - libgit2-dev \ - libpq-dev \ - libssl-dev \ - libxml2-dev \ - llvm-10-dev \ - lsb-release \ - mercurial \ - pkg-config \ - python3-pip \ - python3.8 \ - python3.8-dev \ - r-base-core \ - wget && \ - rm CRAN.gpg && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* && \ - ln -s /usr/bin/llvm-config-10 /usr/bin/llvm-config && \ - update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1 && \ - groupadd user && useradd --create-home --home-dir /home/user -g user user - -# Set up ccache. -COPY workers/ccache.conf /root/.ccache/ccache.conf -RUN for i in c++ cc g++ gcc; do ln -s /usr/bin/ccache /usr/local/bin/$i; done -ENV PATH="/usr/local/bin:${PATH}" - -# Pre-install dev tools. -ENV R_LIBS="/usr/local/lib/R/site-library" -COPY common/R/install_devtools.R . -RUN Rscript install_devtools.R From 13089441a9f96b8778fcb8852e62be26eb017b6d Mon Sep 17 00:00:00 2001 From: Arkadii Yakovets Date: Mon, 28 Nov 2022 10:42:12 -0800 Subject: [PATCH 21/24] Add another docker images update. --- workers/dockerfiles/Dockerfile.compendia | 8 +++----- workers/dockerfiles/Dockerfile.downloaders | 2 +- workers/dockerfiles/Dockerfile.illumina | 2 +- workers/dockerfiles/Dockerfile.no_op | 2 +- workers/dockerfiles/Dockerfile.salmon | 2 +- 5 files changed, 7 insertions(+), 9 deletions(-) diff --git a/workers/dockerfiles/Dockerfile.compendia b/workers/dockerfiles/Dockerfile.compendia index f2afa185f..4dd80559f 100644 --- a/workers/dockerfiles/Dockerfile.compendia +++ b/workers/dockerfiles/Dockerfile.compendia @@ -69,8 +69,8 @@ RUN echo debconf apt-fast/maxdownloads string 16 | debconf-set-selections && \ useradd --create-home --home-dir /home/user -g user user WORKDIR /home/user -RUN wget -q https://bitbucket.org/ariya/phantomjs/downloads/\ -phantomjs-2.1.1-linux-x86_64.tar.bz2 && \ +RUN wget "https://bitbucket.org/ariya/phantomjs/downloads/\ +phantomjs-2.1.1-linux-x86_64.tar.bz2" && \ tar xvjf phantomjs-2.1.1-linux-x86_64.tar.bz2 -C /usr/local/share/ && \ ln -s /usr/local/share/phantomjs-2.1.1-linux-x86_64/bin/phantomjs \ /usr/local/bin/ @@ -94,9 +94,7 @@ RUN pip3 install --ignore-installed --upgrade pip && \ COPY workers/data_refinery_workers/processors/requirements.txt . RUN pip3 install --ignore-installed -r requirements.txt && \ - # Pin setuptools as a workaround for - # https://github.com/pypa/setuptools/issues/3693 - pip3 install --ignore-installed setuptools==65.0.1 numpy==1.16.0 + pip3 install --ignore-installed numpy==1.16.0 # Get the latest version from the dist directory. COPY common/dist/data-refinery-common-* common/ diff --git a/workers/dockerfiles/Dockerfile.downloaders b/workers/dockerfiles/Dockerfile.downloaders index c65faf5dc..145d567e1 100644 --- a/workers/dockerfiles/Dockerfile.downloaders +++ b/workers/dockerfiles/Dockerfile.downloaders @@ -1,4 +1,4 @@ -FROM ccdlstaging/dr_worker_base:latest +FROM ccdlstaging/dr_base:latest # Fail in case of an error at any stage in the pipe. 
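Note: the pipefail comment above is worth spelling out. Docker's default /bin/sh -c shell has no pipefail option, so a RUN pipeline's exit status is simply that of its last command and an earlier failure can be swallowed. With the bash pipefail SHELL set below, a step like the following fails when the download fails instead of reporting sha256sum's successful exit (the URL is a placeholder):

# Under the default shell this "succeeds" even if wget fails, because
# sha256sum happily hashes an empty stream; with pipefail it fails as intended.
RUN wget -qO- https://example.com/archive.tar.gz | sha256sum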
SHELL ["/bin/bash", "-o", "pipefail", "-c"] diff --git a/workers/dockerfiles/Dockerfile.illumina b/workers/dockerfiles/Dockerfile.illumina index fe65bf7ab..648d2e758 100644 --- a/workers/dockerfiles/Dockerfile.illumina +++ b/workers/dockerfiles/Dockerfile.illumina @@ -1,4 +1,4 @@ -FROM ccdlstaging/dr_worker_base:latest +FROM ccdlstaging/dr_base:latest # Fail in case of an error at any stage in the pipe. SHELL ["/bin/bash", "-o", "pipefail", "-c"] diff --git a/workers/dockerfiles/Dockerfile.no_op b/workers/dockerfiles/Dockerfile.no_op index 7f55589d0..8d55281ca 100644 --- a/workers/dockerfiles/Dockerfile.no_op +++ b/workers/dockerfiles/Dockerfile.no_op @@ -1,4 +1,4 @@ -FROM ccdlstaging/dr_worker_base:latest +FROM ccdlstaging/dr_base:latest # Fail in case of an error at any stage in the pipe. SHELL ["/bin/bash", "-o", "pipefail", "-c"] diff --git a/workers/dockerfiles/Dockerfile.salmon b/workers/dockerfiles/Dockerfile.salmon index 1bb8de6e6..b6395ef58 100644 --- a/workers/dockerfiles/Dockerfile.salmon +++ b/workers/dockerfiles/Dockerfile.salmon @@ -1,4 +1,4 @@ -FROM ccdlstaging/dr_worker_base:latest +FROM ccdlstaging/dr_base:latest # Fail in case of an error at any stage in the pipe. SHELL ["/bin/bash", "-o", "pipefail", "-c"] From f7191badbd0d28d8ef944bd91e5287d8ba3a918b Mon Sep 17 00:00:00 2001 From: Arkadii Yakovets Date: Thu, 1 Dec 2022 17:05:49 -0800 Subject: [PATCH 22/24] Optimize docker images, put cache invalidating instructions to the tail. --- .pre-commit-config.yaml | 2 +- .../dockerfiles/Dockerfile.affymetrix_local | 10 +-- workers/dockerfiles/Dockerfile.compendia | 36 +++++------ workers/dockerfiles/Dockerfile.downloaders | 34 +++++------ workers/dockerfiles/Dockerfile.illumina | 25 ++++---- workers/dockerfiles/Dockerfile.no_op | 35 +++++------ workers/dockerfiles/Dockerfile.salmon | 61 +++++++++---------- workers/dockerfiles/Dockerfile.smasher | 19 +++--- workers/dockerfiles/Dockerfile.transcriptome | 29 +++++---- 9 files changed, 124 insertions(+), 127 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d024704da..b651ce24a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -22,7 +22,7 @@ repos: - id: isort - repo: https://github.com/psf/black - rev: 19.10b0 + rev: 22.3.0 hooks: - id: black args: [--line-length=100] diff --git a/workers/dockerfiles/Dockerfile.affymetrix_local b/workers/dockerfiles/Dockerfile.affymetrix_local index 7f1af8ad6..08db9e197 100644 --- a/workers/dockerfiles/Dockerfile.affymetrix_local +++ b/workers/dockerfiles/Dockerfile.affymetrix_local @@ -6,20 +6,20 @@ SHELL ["/bin/bash", "-o", "pipefail", "-c"] WORKDIR /home/user # Remove the version of common already installed. -RUN rm -r common && \ +RUN rm -rf common && \ pip3 uninstall -y data_refinery_common # Get the latest version from the dist directory. COPY common/dist/data-refinery-common-* common/ RUN pip3 install --ignore-installed --no-cache-dir \ - common/$(ls common -1 | sort --version-sort | tail -1) + common/$(ls common -1 | sort --version-sort | tail -1) + +COPY config/ config/ +COPY workers/ . ARG SYSTEM_VERSION ENV SYSTEM_VERSION=$SYSTEM_VERSION USER user -COPY config/ config/ -COPY workers/ . 
- ENTRYPOINT [] diff --git a/workers/dockerfiles/Dockerfile.compendia b/workers/dockerfiles/Dockerfile.compendia index 4dd80559f..9a44a7ecc 100644 --- a/workers/dockerfiles/Dockerfile.compendia +++ b/workers/dockerfiles/Dockerfile.compendia @@ -69,11 +69,6 @@ RUN echo debconf apt-fast/maxdownloads string 16 | debconf-set-selections && \ useradd --create-home --home-dir /home/user -g user user WORKDIR /home/user -RUN wget "https://bitbucket.org/ariya/phantomjs/downloads/\ -phantomjs-2.1.1-linux-x86_64.tar.bz2" && \ - tar xvjf phantomjs-2.1.1-linux-x86_64.tar.bz2 -C /usr/local/share/ && \ - ln -s /usr/local/share/phantomjs-2.1.1-linux-x86_64/bin/phantomjs \ - /usr/local/bin/ # We need a few special packages for QN. ENV R_LIBS=/usr/local/lib/R/site-library @@ -86,28 +81,35 @@ RUN Rscript dependencies.R COPY workers/R/dependencies/qn/bioc/dependencies.R . RUN Rscript dependencies.R -# End QN-specific - -RUN pip3 install --ignore-installed --upgrade pip && \ - pip3 install --ignore-installed numpy scipy matplotlib \ - pandas==0.25.3 scikit-learn sympy nose rpy2===3.4.5 tzlocal fancySVD +# End QN-specific. COPY workers/data_refinery_workers/processors/requirements.txt . -RUN pip3 install --ignore-installed -r requirements.txt && \ - pip3 install --ignore-installed numpy==1.16.0 +RUN pip3 install --ignore-installed --upgrade pip && \ + pip3 install --ignore-installed nose numpy scipy matplotlib \ + pandas==0.25.3 scikit-learn sympy rpy2===3.4.5 tzlocal fancySVD && \ + pip3 install --ignore-installed -r requirements.txt && \ + pip3 install --ignore-installed numpy==1.16.0 && \ + # Clear out the pip cache. + rm -rf /root/.cache # Get the latest version from the dist directory. COPY common/dist/data-refinery-common-* common/ RUN pip3 install --ignore-installed \ - common/$(ls common -1 | sort --version-sort | tail -1) + common/$(ls common -1 | sort --version-sort | tail -1) && \ + # Install phantomjs. + wget "https://bitbucket.org/ariya/phantomjs/downloads/\ +phantomjs-2.1.1-linux-x86_64.tar.bz2" && \ + tar xvjf phantomjs-2.1.1-linux-x86_64.tar.bz2 -C /usr/local/share/ && \ + ln -s /usr/local/share/phantomjs-2.1.1-linux-x86_64/bin/phantomjs \ + /usr/local/bin/ + +COPY .boto .boto +COPY config/ config/ +COPY workers/ . ARG SYSTEM_VERSION ENV SYSTEM_VERSION=$SYSTEM_VERSION USER user -COPY .boto .boto -COPY config/ config/ -COPY workers/ . - ENTRYPOINT [] diff --git a/workers/dockerfiles/Dockerfile.downloaders b/workers/dockerfiles/Dockerfile.downloaders index 145d567e1..02917518d 100644 --- a/workers/dockerfiles/Dockerfile.downloaders +++ b/workers/dockerfiles/Dockerfile.downloaders @@ -8,13 +8,24 @@ WORKDIR /home/user COPY workers/R/dependencies/install_downloader_R_only.R . RUN Rscript install_downloader_R_only.R +COPY workers/data_refinery_workers/downloaders/requirements.txt . +RUN pip3 install --ignore-installed --upgrade pip && \ + pip3 install --ignore-installed rpy2==3.4.5 && \ + pip3 install --ignore-installed -r requirements.txt && \ + # Clear out the pip cache. + rm -rf /root/.cache + +# Get the latest version from the dist directory. +COPY common/dist/data-refinery-common-* common/ +RUN pip3 install --ignore-installed \ + common/$(ls common -1 | sort --version-sort | tail -1) + # Aspera will only install as the current user. # Even using `su - user &&` doesn't work... - USER user + # Install Aspera. 
We have to install it using Holland Computing Center's conda # repo because download.asperasoft.com now returns 403s - RUN wget -q "https://anaconda.org/HCC/aspera-cli/3.9.1/download/\ linux-64/aspera-cli-3.9.1-0.tar.bz2" && \ [ "$(sha256sum aspera-cli-3.9.1-0.tar.bz2 | cut -d ' ' -f1)" =\ @@ -27,26 +38,13 @@ linux-64/aspera-cli-3.9.1-0.tar.bz2" && \ # Now that we're done installing Aspera go back to being root for a bit. USER root -RUN pip3 install --ignore-installed --upgrade pip && \ - pip3 install --ignore-installed rpy2==3.4.5 - -# Get the latest version from the dist directory. -COPY common/dist/data-refinery-common-* common/ -RUN pip3 install --ignore-installed \ - common/$(ls common -1 | sort --version-sort | tail -1) - -COPY workers/data_refinery_workers/downloaders/requirements.txt . -RUN pip3 install --ignore-installed -r requirements.txt && \ - # Clear out the pip3 cache. - rm -r /root/.cache +COPY .boto .boto +COPY config config +COPY workers/ . ARG SYSTEM_VERSION ENV SYSTEM_VERSION=$SYSTEM_VERSION USER user -COPY .boto .boto -COPY config config -COPY workers/ . - ENTRYPOINT [] diff --git a/workers/dockerfiles/Dockerfile.illumina b/workers/dockerfiles/Dockerfile.illumina index 648d2e758..e578b0b69 100644 --- a/workers/dockerfiles/Dockerfile.illumina +++ b/workers/dockerfiles/Dockerfile.illumina @@ -12,22 +12,16 @@ RUN Rscript dependencies.R COPY workers/R/dependencies/illumina/bioc/dependencies.R . RUN Rscript dependencies.R +COPY workers/data_refinery_workers/processors/requirements.txt . +RUN pip3 install --ignore-installed --upgrade pip && \ + pip3 install --ignore-installed -r requirements.txt && \ + # Clear out the pip cache. + rm -rf /root/.cache + # Get the latest version from the dist directory. COPY common/dist/data-refinery-common-* common/ RUN pip3 install --ignore-installed \ - common/$(ls common -1 | sort --version-sort | tail -1) - -RUN pip3 install --ignore-installed --upgrade pip - -COPY workers/data_refinery_workers/processors/requirements.txt . -RUN pip3 install --ignore-installed -r requirements.txt && \ - # Clear out the pip3 cache. - rm -r /root/.cache - -ARG SYSTEM_VERSION -ENV SYSTEM_VERSION=$SYSTEM_VERSION - -USER user + common/$(ls common -1 | sort --version-sort | tail -1) COPY .boto .boto COPY config/ config/ @@ -35,4 +29,9 @@ COPY workers/ . COPY workers/data_refinery_workers/processors/detect_database.R . COPY workers/illumina_probe_maps/ probe_maps/ +ARG SYSTEM_VERSION +ENV SYSTEM_VERSION=$SYSTEM_VERSION + +USER user + ENTRYPOINT [] diff --git a/workers/dockerfiles/Dockerfile.no_op b/workers/dockerfiles/Dockerfile.no_op index 8d55281ca..0d98229ee 100644 --- a/workers/dockerfiles/Dockerfile.no_op +++ b/workers/dockerfiles/Dockerfile.no_op @@ -11,9 +11,23 @@ RUN Rscript dependencies.R COPY workers/R/dependencies/no_op/install_gene_convert.R . RUN Rscript install_gene_convert.R +COPY workers/data_refinery_workers/processors/requirements.txt . +RUN pip3 install --ignore-installed --upgrade pip && \ + pip3 install --ignore-installed numpy && \ + pip3 install --ignore-installed -r requirements.txt && \ + # Clear out the pip cache. + rm -rf /root/.cache + +# Get the latest version from the dist directory. +COPY common/dist/data-refinery-common-* common/ +RUN pip3 install --ignore-installed \ + common/$(ls common -1 | sort --version-sort | tail -1) + # Noop-specific. 
RUN mkdir -p gene_indexes + WORKDIR /home/user/gene_indexes + ENV ID_REFINERY_URL=https://zenodo.org/record/1410647/files/all_1536267482.zip RUN curl -O $ID_REFINERY_URL && \ echo $ID_REFINERY_URL > /etc/identifier_refinery_url && \ @@ -21,28 +35,15 @@ RUN curl -O $ID_REFINERY_URL && \ rm *.zip # End Noop-specific. -WORKDIR /home/user/ - -# Get the latest version from the dist directory. -COPY common/dist/data-refinery-common-* common/ -RUN pip3 install --ignore-installed \ - common/$(ls common -1 | sort --version-sort | tail -1) - -RUN pip3 install --ignore-installed --upgrade pip && \ - pip3 install --ignore-installed numpy +WORKDIR /home/user -COPY workers/data_refinery_workers/processors/requirements.txt . -RUN pip3 install --ignore-installed -r requirements.txt && \ - # Clear out the pip3 cache. - rm -r /root/.cache +COPY .boto .boto +COPY config/ config/ +COPY workers/ . ARG SYSTEM_VERSION ENV SYSTEM_VERSION=$SYSTEM_VERSION USER user -COPY .boto .boto -COPY config/ config/ -COPY workers/ . - ENTRYPOINT [] diff --git a/workers/dockerfiles/Dockerfile.salmon b/workers/dockerfiles/Dockerfile.salmon index b6395ef58..b4493b2e9 100644 --- a/workers/dockerfiles/Dockerfile.salmon +++ b/workers/dockerfiles/Dockerfile.salmon @@ -5,6 +5,25 @@ SHELL ["/bin/bash", "-o", "pipefail", "-c"] WORKDIR /home/user +COPY workers/R/dependencies/tximport/cran/dependencies.R tximport_dependencies.R +RUN Rscript tximport_dependencies.R + +# Install tximport. +COPY workers/R/dependencies/tximport/install_tximport.R . +RUN Rscript install_tximport.R + +COPY workers/data_refinery_workers/processors/requirements.txt . +RUN pip3 install --ignore-installed --upgrade pip && \ + pip3 install --ignore-installed numpy && \ + pip3 install --ignore-installed -r requirements.txt && \ + # Clear out the pip cache. + rm -rf /root/.cache + +# Get the latest version from the dist directory. +COPY common/dist/data-refinery-common-* common/ +RUN pip3 install --ignore-installed \ + common/$(ls common -1 | sort --version-sort | tail -1) + # Install Salmon # Tximport requires all experiments to be processed with the same version of # Salmon to work https://github.com/AlexsLemonade/refinebio/issues/1496. @@ -25,48 +44,28 @@ v${SALMON_VERSION}/Salmon-${SALMON_VERSION}_linux_x86_64.tar.gz" && \ rm -f "Salmon-${SALMON_VERSION}_linux_x86_64.tar.gz" # End Salmon installation. -COPY workers/R/dependencies/tximport/cran/dependencies.R tximport_dependencies.R -RUN Rscript tximport_dependencies.R - -# Install tximport. -COPY workers/R/dependencies/tximport/install_tximport.R . -RUN Rscript install_tximport.R - -RUN pip3 install --ignore-installed --upgrade pip && \ - pip3 install --ignore-installed numpy - -# Get the latest version from the dist directory. -COPY common/dist/data-refinery-common-* common/ -RUN pip3 install --ignore-installed \ - common/$(ls common -1 | sort --version-sort | tail -1) - -COPY workers/data_refinery_workers/processors/requirements.txt . -RUN pip3 install --ignore-installed -r requirements.txt && \ - # Clear out the pip3 cache. - rm -r /root/.cache - -# Install SalmonTools. -RUN git clone https://github.com/COMBINE-lab/SalmonTools.git && \ +ENV SRA_VERSION=2.9.1 +RUN \ + # Install SalmonTools. + git clone https://github.com/COMBINE-lab/SalmonTools.git && \ cd SalmonTools && \ git checkout 3e6654c2c10a5225498b623056993947fa688afc && \ cmake . -DCMAKE_INSTALL_PREFIX=/usr/local && \ make install && \ - rm -rf SalmonTools - -# Install sra-tools. 
-ENV SRA_VERSION=2.9.1 -RUN wget -q "https://ftp.ncbi.nlm.nih.gov/sra/sdk/${SRA_VERSION}/\ + rm -rf SalmonTools && \ + # Install sra-tools. + wget -q "https://ftp.ncbi.nlm.nih.gov/sra/sdk/${SRA_VERSION}/\ sratoolkit.${SRA_VERSION}-ubuntu64.tar.gz" && \ tar zxfv "sratoolkit.${SRA_VERSION}-ubuntu64.tar.gz" && \ cp -r "sratoolkit.${SRA_VERSION}-ubuntu64/bin/"* /usr/bin +COPY .boto .boto +COPY config/ config/ +COPY workers/ . + ARG SYSTEM_VERSION ENV SYSTEM_VERSION=$SYSTEM_VERSION USER user -COPY .boto .boto -COPY config/ config/ -COPY workers/ . - ENTRYPOINT [] diff --git a/workers/dockerfiles/Dockerfile.smasher b/workers/dockerfiles/Dockerfile.smasher index ebdf924a4..bdf146474 100644 --- a/workers/dockerfiles/Dockerfile.smasher +++ b/workers/dockerfiles/Dockerfile.smasher @@ -11,26 +11,25 @@ RUN Rscript dependencies.R COPY workers/R/dependencies/qn/bioc/dependencies.R . RUN Rscript dependencies.R +COPY workers/data_refinery_workers/processors/requirements.txt . RUN pip3 install --ignore-installed --upgrade pip && \ - pip3 install --ignore-installed nose numpy rpy2==3.4.5 + pip3 install --ignore-installed nose numpy rpy2==3.4.5 && \ + pip3 install --ignore-installed -r requirements.txt && \ + # Clear out the pip cache. + rm -rf /root/.cache # Get the latest version from the dist directory. COPY common/dist/data-refinery-common-* common/ RUN pip3 install --ignore-installed \ - common/$(ls common -1 | sort --version-sort | tail -1) + common/$(ls common -1 | sort --version-sort | tail -1) -COPY workers/data_refinery_workers/processors/requirements.txt . -RUN pip3 install --ignore-installed -r requirements.txt && \ - # Clear out the pip3 cache. - rm -rf /root/.cache +COPY .boto .boto +COPY config/ config/ +COPY workers/ . ARG SYSTEM_VERSION ENV SYSTEM_VERSION=$SYSTEM_VERSION USER user -COPY .boto .boto -COPY config/ config/ -COPY workers/ . - ENTRYPOINT [] diff --git a/workers/dockerfiles/Dockerfile.transcriptome b/workers/dockerfiles/Dockerfile.transcriptome index 8ce0d974a..ae75c87a8 100644 --- a/workers/dockerfiles/Dockerfile.transcriptome +++ b/workers/dockerfiles/Dockerfile.transcriptome @@ -5,6 +5,17 @@ SHELL ["/bin/bash", "-o", "pipefail", "-c"] WORKDIR /home/user +COPY workers/data_refinery_workers/processors/requirements.txt . +RUN pip3 install --ignore-installed --upgrade pip && \ + pip3 install --ignore-installed numpy && \ + pip3 install --ignore-installed -r requirements.txt && \ + # Clear out the pip cache. + rm -rf /root/.cache + +COPY common/dist/data-refinery-common-* common/ +RUN pip3 install --ignore-installed \ + common/$(ls common -1 | sort --version-sort | tail -1) + # It's annoying that this can only be installed via git. RUN git clone https://github.com/deweylab/RSEM.git && \ cd RSEM && make install && \ @@ -28,25 +39,13 @@ v${SALMON_VERSION}/Salmon-${SALMON_VERSION}_linux_x86_64.tar.gz" && \ rm -r Salmon* # End Salmon installation. -RUN pip3 install --ignore-installed --upgrade pip && \ - pip3 install --ignore-installed numpy - -COPY common/dist/data-refinery-common-* common/ -RUN pip3 install --ignore-installed \ - common/$(ls common -1 | sort --version-sort | tail -1) - -COPY workers/data_refinery_workers/processors/requirements.txt . -RUN pip3 install --ignore-installed -r requirements.txt && \ - # Clear out the pip3 cache. - rm -rf /root/.cache +COPY .boto .boto +COPY config/ config/ +COPY workers/ . ARG SYSTEM_VERSION ENV SYSTEM_VERSION=$SYSTEM_VERSION USER user -COPY .boto .boto -COPY config/ config/ -COPY workers/ . 
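Note: in the images above, the `rm -rf /root/.cache` cleanup is chained with `&&` into the same RUN as the pip install on purpose: deleting the cache in a separate, later RUN would not shrink the image, because the cached wheels would already be committed to the earlier layer. The following commit replaces the manual cleanup with pip's own flag; in terms of final layer size the two patterns below are roughly equivalent (a generic sketch, not a line taken from one of these Dockerfiles):

RUN pip3 install --ignore-installed -r requirements.txt && rm -rf /root/.cache
RUN pip3 install --ignore-installed --no-cache-dir -r requirements.txt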
- ENTRYPOINT [] From a10865346ddce740acdbc1d6f6bb9513c94f03f5 Mon Sep 17 00:00:00 2001 From: Arkadii Yakovets Date: Fri, 2 Dec 2022 17:04:51 -0800 Subject: [PATCH 23/24] Add `--no-cache-dir` to pip. Move common package intallation to the bottom. --- workers/dockerfiles/Dockerfile.compendia | 23 +++++++++---------- workers/dockerfiles/Dockerfile.downloaders | 18 +++++++-------- workers/dockerfiles/Dockerfile.illumina | 8 +++---- workers/dockerfiles/Dockerfile.no_op | 18 +++++++-------- workers/dockerfiles/Dockerfile.salmon | 24 +++++++++----------- workers/dockerfiles/Dockerfile.smasher | 10 ++++---- workers/dockerfiles/Dockerfile.transcriptome | 21 ++++++++--------- 7 files changed, 54 insertions(+), 68 deletions(-) diff --git a/workers/dockerfiles/Dockerfile.compendia b/workers/dockerfiles/Dockerfile.compendia index 9a44a7ecc..8790d859c 100644 --- a/workers/dockerfiles/Dockerfile.compendia +++ b/workers/dockerfiles/Dockerfile.compendia @@ -84,25 +84,24 @@ RUN Rscript dependencies.R # End QN-specific. COPY workers/data_refinery_workers/processors/requirements.txt . -RUN pip3 install --ignore-installed --upgrade pip && \ - pip3 install --ignore-installed nose numpy scipy matplotlib \ +RUN pip3 install --ignore-installed --no-cache-dir --upgrade pip && \ + pip3 install --ignore-installed --no-cache-dir nose numpy scipy matplotlib \ pandas==0.25.3 scikit-learn sympy rpy2===3.4.5 tzlocal fancySVD && \ - pip3 install --ignore-installed -r requirements.txt && \ - pip3 install --ignore-installed numpy==1.16.0 && \ - # Clear out the pip cache. - rm -rf /root/.cache + pip3 install --ignore-installed --no-cache-dir -r requirements.txt && \ + pip3 install --ignore-installed --no-cache-dir numpy==1.16.0 -# Get the latest version from the dist directory. -COPY common/dist/data-refinery-common-* common/ -RUN pip3 install --ignore-installed \ - common/$(ls common -1 | sort --version-sort | tail -1) && \ - # Install phantomjs. - wget "https://bitbucket.org/ariya/phantomjs/downloads/\ +# Install phantomjs. +RUN wget -q "https://bitbucket.org/ariya/phantomjs/downloads/\ phantomjs-2.1.1-linux-x86_64.tar.bz2" && \ tar xvjf phantomjs-2.1.1-linux-x86_64.tar.bz2 -C /usr/local/share/ && \ ln -s /usr/local/share/phantomjs-2.1.1-linux-x86_64/bin/phantomjs \ /usr/local/bin/ +# Get the latest version from the dist directory. +COPY common/dist/data-refinery-common-* common/ +RUN pip3 install --ignore-installed --no-cache-dir \ + common/$(ls common -1 | sort --version-sort | tail -1) + COPY .boto .boto COPY config/ config/ COPY workers/ . diff --git a/workers/dockerfiles/Dockerfile.downloaders b/workers/dockerfiles/Dockerfile.downloaders index 02917518d..9b611956d 100644 --- a/workers/dockerfiles/Dockerfile.downloaders +++ b/workers/dockerfiles/Dockerfile.downloaders @@ -9,16 +9,9 @@ COPY workers/R/dependencies/install_downloader_R_only.R . RUN Rscript install_downloader_R_only.R COPY workers/data_refinery_workers/downloaders/requirements.txt . -RUN pip3 install --ignore-installed --upgrade pip && \ - pip3 install --ignore-installed rpy2==3.4.5 && \ - pip3 install --ignore-installed -r requirements.txt && \ - # Clear out the pip cache. - rm -rf /root/.cache - -# Get the latest version from the dist directory. 
-COPY common/dist/data-refinery-common-* common/ -RUN pip3 install --ignore-installed \ - common/$(ls common -1 | sort --version-sort | tail -1) +RUN pip3 install --ignore-installed --no-cache-dir --upgrade pip && \ + pip3 install --ignore-installed --no-cache-dir rpy2==3.4.5 && \ + pip3 install --ignore-installed --no-cache-dir -r requirements.txt # Aspera will only install as the current user. # Even using `su - user &&` doesn't work... @@ -38,6 +31,11 @@ linux-64/aspera-cli-3.9.1-0.tar.bz2" && \ # Now that we're done installing Aspera go back to being root for a bit. USER root +# Get the latest version from the dist directory. +COPY common/dist/data-refinery-common-* common/ +RUN pip3 install --ignore-installed --no-cache-dir \ + common/$(ls common -1 | sort --version-sort | tail -1) + COPY .boto .boto COPY config config COPY workers/ . diff --git a/workers/dockerfiles/Dockerfile.illumina b/workers/dockerfiles/Dockerfile.illumina index e578b0b69..200b37725 100644 --- a/workers/dockerfiles/Dockerfile.illumina +++ b/workers/dockerfiles/Dockerfile.illumina @@ -13,14 +13,12 @@ COPY workers/R/dependencies/illumina/bioc/dependencies.R . RUN Rscript dependencies.R COPY workers/data_refinery_workers/processors/requirements.txt . -RUN pip3 install --ignore-installed --upgrade pip && \ - pip3 install --ignore-installed -r requirements.txt && \ - # Clear out the pip cache. - rm -rf /root/.cache +RUN pip3 install --ignore-installed --no-cache-dir --upgrade pip && \ + pip3 install --ignore-installed --no-cache-dir -r requirements.txt # Get the latest version from the dist directory. COPY common/dist/data-refinery-common-* common/ -RUN pip3 install --ignore-installed \ +RUN pip3 install --ignore-installed --no-cache-dir \ common/$(ls common -1 | sort --version-sort | tail -1) COPY .boto .boto diff --git a/workers/dockerfiles/Dockerfile.no_op b/workers/dockerfiles/Dockerfile.no_op index 0d98229ee..60f686265 100644 --- a/workers/dockerfiles/Dockerfile.no_op +++ b/workers/dockerfiles/Dockerfile.no_op @@ -12,16 +12,9 @@ COPY workers/R/dependencies/no_op/install_gene_convert.R . RUN Rscript install_gene_convert.R COPY workers/data_refinery_workers/processors/requirements.txt . -RUN pip3 install --ignore-installed --upgrade pip && \ - pip3 install --ignore-installed numpy && \ - pip3 install --ignore-installed -r requirements.txt && \ - # Clear out the pip cache. - rm -rf /root/.cache - -# Get the latest version from the dist directory. -COPY common/dist/data-refinery-common-* common/ -RUN pip3 install --ignore-installed \ - common/$(ls common -1 | sort --version-sort | tail -1) +RUN pip3 install --ignore-installed --no-cache-dir --upgrade pip && \ + pip3 install --ignore-installed --no-cache-dir numpy && \ + pip3 install --ignore-installed --no-cache-dir -r requirements.txt # Noop-specific. RUN mkdir -p gene_indexes @@ -37,6 +30,11 @@ RUN curl -O $ID_REFINERY_URL && \ WORKDIR /home/user +# Get the latest version from the dist directory. +COPY common/dist/data-refinery-common-* common/ +RUN pip3 install --ignore-installed --no-cache-dir \ + common/$(ls common -1 | sort --version-sort | tail -1) + COPY .boto .boto COPY config/ config/ COPY workers/ . diff --git a/workers/dockerfiles/Dockerfile.salmon b/workers/dockerfiles/Dockerfile.salmon index b4493b2e9..046ee0b89 100644 --- a/workers/dockerfiles/Dockerfile.salmon +++ b/workers/dockerfiles/Dockerfile.salmon @@ -13,16 +13,9 @@ COPY workers/R/dependencies/tximport/install_tximport.R . 
RUN Rscript install_tximport.R COPY workers/data_refinery_workers/processors/requirements.txt . -RUN pip3 install --ignore-installed --upgrade pip && \ - pip3 install --ignore-installed numpy && \ - pip3 install --ignore-installed -r requirements.txt && \ - # Clear out the pip cache. - rm -rf /root/.cache - -# Get the latest version from the dist directory. -COPY common/dist/data-refinery-common-* common/ -RUN pip3 install --ignore-installed \ - common/$(ls common -1 | sort --version-sort | tail -1) +RUN pip3 install --ignore-installed --no-cache-dir --upgrade pip && \ + pip3 install --ignore-installed --no-cache-dir numpy && \ + pip3 install --ignore-installed --no-cache-dir -r requirements.txt # Install Salmon # Tximport requires all experiments to be processed with the same version of @@ -45,9 +38,9 @@ v${SALMON_VERSION}/Salmon-${SALMON_VERSION}_linux_x86_64.tar.gz" && \ # End Salmon installation. ENV SRA_VERSION=2.9.1 -RUN \ - # Install SalmonTools. - git clone https://github.com/COMBINE-lab/SalmonTools.git && \ + +# Install SalmonTools. +RUN git clone https://github.com/COMBINE-lab/SalmonTools.git && \ cd SalmonTools && \ git checkout 3e6654c2c10a5225498b623056993947fa688afc && \ cmake . -DCMAKE_INSTALL_PREFIX=/usr/local && \ @@ -59,6 +52,11 @@ sratoolkit.${SRA_VERSION}-ubuntu64.tar.gz" && \ tar zxfv "sratoolkit.${SRA_VERSION}-ubuntu64.tar.gz" && \ cp -r "sratoolkit.${SRA_VERSION}-ubuntu64/bin/"* /usr/bin +# Get the latest version from the dist directory. +COPY common/dist/data-refinery-common-* common/ +RUN pip3 install --ignore-installed --no-cache-dir \ + common/$(ls common -1 | sort --version-sort | tail -1) + COPY .boto .boto COPY config/ config/ COPY workers/ . diff --git a/workers/dockerfiles/Dockerfile.smasher b/workers/dockerfiles/Dockerfile.smasher index bdf146474..926f98208 100644 --- a/workers/dockerfiles/Dockerfile.smasher +++ b/workers/dockerfiles/Dockerfile.smasher @@ -12,15 +12,13 @@ COPY workers/R/dependencies/qn/bioc/dependencies.R . RUN Rscript dependencies.R COPY workers/data_refinery_workers/processors/requirements.txt . -RUN pip3 install --ignore-installed --upgrade pip && \ - pip3 install --ignore-installed nose numpy rpy2==3.4.5 && \ - pip3 install --ignore-installed -r requirements.txt && \ - # Clear out the pip cache. - rm -rf /root/.cache +RUN pip3 install --ignore-installed --no-cache-dir --upgrade pip && \ + pip3 install --ignore-installed --no-cache-dir nose numpy rpy2==3.4.5 && \ + pip3 install --ignore-installed --no-cache-dir -r requirements.txt # Get the latest version from the dist directory. COPY common/dist/data-refinery-common-* common/ -RUN pip3 install --ignore-installed \ +RUN pip3 install --ignore-installed --no-cache-dir \ common/$(ls common -1 | sort --version-sort | tail -1) COPY .boto .boto diff --git a/workers/dockerfiles/Dockerfile.transcriptome b/workers/dockerfiles/Dockerfile.transcriptome index ae75c87a8..f7dbe3676 100644 --- a/workers/dockerfiles/Dockerfile.transcriptome +++ b/workers/dockerfiles/Dockerfile.transcriptome @@ -6,18 +6,11 @@ SHELL ["/bin/bash", "-o", "pipefail", "-c"] WORKDIR /home/user COPY workers/data_refinery_workers/processors/requirements.txt . -RUN pip3 install --ignore-installed --upgrade pip && \ - pip3 install --ignore-installed numpy && \ - pip3 install --ignore-installed -r requirements.txt && \ - # Clear out the pip cache. 
- rm -rf /root/.cache - -COPY common/dist/data-refinery-common-* common/ -RUN pip3 install --ignore-installed \ - common/$(ls common -1 | sort --version-sort | tail -1) - -# It's annoying that this can only be installed via git. -RUN git clone https://github.com/deweylab/RSEM.git && \ +RUN pip3 install --ignore-installed --no-cache-dir --upgrade pip && \ + pip3 install --ignore-installed --no-cache-dir numpy && \ + pip3 install --ignore-installed --no-cache-dir -r requirements.txt && \ + # It's annoying that this can only be installed via git. + git clone https://github.com/deweylab/RSEM.git && \ cd RSEM && make install && \ rm -rf RSEM @@ -39,6 +32,10 @@ v${SALMON_VERSION}/Salmon-${SALMON_VERSION}_linux_x86_64.tar.gz" && \ rm -r Salmon* # End Salmon installation. +COPY common/dist/data-refinery-common-* common/ +RUN pip3 install --ignore-installed --no-cache-dir \ + common/$(ls common -1 | sort --version-sort | tail -1) + COPY .boto .boto COPY config/ config/ COPY workers/ . From 29b8d24e7f8db875c92fa785b70730ffdaa7834a Mon Sep 17 00:00:00 2001 From: Arkadii Yakovets Date: Thu, 2 Feb 2023 10:25:52 -0800 Subject: [PATCH 24/24] Rename cron job script. --- .../foreman-server-instance-user-data.tpl.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/infrastructure/foreman-configuration/foreman-server-instance-user-data.tpl.sh b/infrastructure/foreman-configuration/foreman-server-instance-user-data.tpl.sh index 3598c2a45..c53d368bf 100644 --- a/infrastructure/foreman-configuration/foreman-server-instance-user-data.tpl.sh +++ b/infrastructure/foreman-configuration/foreman-server-instance-user-data.tpl.sh @@ -77,8 +77,8 @@ docker run \\ -e DATABASE_PASSWORD=${database_password} \\ -v /tmp:/tmp \\ -it ${dockerhub_repo}/dr_\"\$1\" python3 manage.py \"\$2\" -" >> /home/ubuntu/run_cron_job.sh -chmod +x /home/ubuntu/run_cron_job.sh +" >> /home/ubuntu/run_manage_command.sh +chmod +x /home/ubuntu/run_manage_command.sh # Use Monit to ensure the Foreman is always running apt-get -y update @@ -112,9 +112,9 @@ service monit restart # Install the cron job tests crontab -l > tempcron cat <> tempcron -0 12 * * MON /bin/bash /home/ubuntu/run_cron_job.sh affymetrix check_brainarray_gene_agreement >> /var/log/cron_job_tests.log 2>&1 -0 12 * * MON /bin/bash /home/ubuntu/run_cron_job.sh affymetrix check_tx_index_transcript_agreement >> /var/log/cron_job_tests.log 2>&1 -0 12 * * ${accession_gathering_job_run_day} /bin/bash /home/ubuntu/run_cron_job.sh foreman gather_weekly_accessions >> /var/log/gather_weekly_accessions.log 2>&1 +0 12 * * MON /bin/bash /home/ubuntu/run_manage_command.sh affymetrix check_brainarray_gene_agreement >> /var/log/affymetrix_checks.log 2>&1 +0 12 * * MON /bin/bash /home/ubuntu/run_manage_command.sh affymetrix check_tx_index_transcript_agreement >> /var/log/affymetrix_checks.log 2>&1 +0 12 * * ${accession_gathering_job_run_day} /bin/bash /home/ubuntu/run_manage_command.sh foreman gather_weekly_accessions >> /var/log/weekly_accessions.log 2>&1 EOF # install new cron file crontab tempcron
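The recurring change across these worker Dockerfiles is layer ordering: the slow, rarely-changing steps (R and Python dependency installation, Salmon, sra-tools) stay near the top, while the inputs that change on almost every build (.boto, config/, workers/) are copied last, just before USER and ENTRYPOINT, so Docker's layer cache is not invalidated for the expensive steps. A minimal sketch of that ordering, using a placeholder base image, user, and paths rather than the project's real ones:

    # Sketch only; base image, user, and copied paths are placeholders.
    FROM python:3.8-slim
    RUN useradd --create-home user
    WORKDIR /home/user

    # Rarely-changing dependency layer: rebuilt only when requirements.txt changes.
    COPY requirements.txt .
    RUN pip3 install --ignore-installed --no-cache-dir --upgrade pip && \
        pip3 install --ignore-installed --no-cache-dir -r requirements.txt

    # Frequently-changing inputs go last so the layers above stay cached.
    COPY config/ config/
    COPY workers/ .

    ARG SYSTEM_VERSION
    ENV SYSTEM_VERSION=$SYSTEM_VERSION
    USER user

    ENTRYPOINT []

The tradeoff is that anything the dependency layer needs (here just requirements.txt) has to be copied separately, ahead of the rest of the source tree.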
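The `--no-cache-dir` flag added in patch 23 makes pip skip writing downloaded packages to its cache, which is what the removed `rm -rf /root/.cache` lines were cleaning up; deleting the cache only shrinks the image when it happens inside the same RUN layer, so disabling it outright is simpler. Roughly the before/after applied across these images:

    # Before: populate pip's cache, then delete it inside the same layer.
    RUN pip3 install --ignore-installed -r requirements.txt && \
        # Clear out the pip cache.
        rm -rf /root/.cache

    # After: never write the cache, so there is nothing to clean up.
    RUN pip3 install --ignore-installed --no-cache-dir -r requirements.txt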
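The last patch renames the foreman instance user-data helper from run_cron_job.sh to run_manage_command.sh, which better matches what it does: run an arbitrary `manage.py` command inside the matching `dr_<image>` container. It also gives the Affymetrix checks and the weekly accession gathering separate log files. Invoked by hand, the helper and the crontab append-and-reload pattern from the template look roughly like this (command names and log paths are taken from the patch; the final `rm tempcron` is an assumed cleanup step that falls outside this excerpt):

    # Run a Django management command inside the dr_foreman worker image.
    /bin/bash /home/ubuntu/run_manage_command.sh foreman gather_weekly_accessions

    # Append a new entry to the existing crontab and reload it.
    crontab -l > tempcron
    echo '0 12 * * MON /bin/bash /home/ubuntu/run_manage_command.sh affymetrix check_brainarray_gene_agreement >> /var/log/affymetrix_checks.log 2>&1' >> tempcron
    crontab tempcron
    rm tempcron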