fix line endings and trailing whitespaces

AlexsLemonade · Jan 9, 2020 · ed41812 · ed41812
1 parent 87e2e98
commit ed41812
Show file tree

Hide file tree

Showing 25 changed files with 10,235 additions and 10,226 deletions.
diff --git a/.circleci/filter_tests.sh b/.circleci/filter_tests.sh
@@ -1,3 +1,5 @@
+#!/bin/bash
+
 echo $(git log --format=oneline -n 1 "$CIRCLE_SHA1");
 if [[ $(git log --format=oneline -n 1 "$CIRCLE_SHA1") = *"noslow"* ]];
 then

diff --git a/.circleci/git_decrypt.sh b/.circleci/git_decrypt.sh
@@ -1,3 +1,5 @@
+#!/bin/bash
+
 # Unlock encrypted files
 cd ~/refinebio/.circleci
 git clean -f

diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
@@ -8,9 +8,9 @@ Describe the big picture of your changes.
 
 ## Methods
 
-If this pull request has any implications for data or metadata processing or addresses an issue labeled `sci review`, please include an overview of the methods used (e.g., briefly explain _how_ the data gets processed). 
+If this pull request has any implications for data or metadata processing or addresses an issue labeled `sci review`, please include an overview of the methods used (e.g., briefly explain _how_ the data gets processed).
 See [#267](https://github.com/AlexsLemonade/refinebio/pull/267) for rationale.
-Include sufficient detail for reviewers or users that are not expert developers to evaluate the validity of the approach. 
+Include sufficient detail for reviewers or users that are not expert developers to evaluate the validity of the approach.
 Please attach or link to example input and output data if applicable.
 It may also be appropriate to include a description of any functional or unit tests in this section depending on their content.
 Any pull request with a methods section requires scientific review in addition to code review.

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -1,12 +1,26 @@
 ---
 repos:
-  - repo: https://github.com/psf/black
-    rev: stable
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v2.4.0
     hooks:
-      - id: black
-        args: [--line-length=100]
+      - id: end-of-file-fixer
+      - id: mixed-line-ending
+        args: [--fix=lf]
+      - id: trailing-whitespace
+        args: [--markdown-linebreak-ext=md]
+      - id: check-added-large-files
+      - id: check-executables-have-shebangs
+      - id: check-merge-conflict
+      - id: check-docstring-first
+      - id: check-yaml
 
   - repo: https://github.com/pre-commit/mirrors-isort
     rev: v4.3.21
     hooks:
       - id: isort
+
+  - repo: https://github.com/psf/black
+    rev: stable
+    hooks:
+      - id: black
+        args: [--line-length=100]
diff --git a/LICENSE b/LICENSE
@@ -1,6 +1,6 @@
 BSD 3-Clause License
 
-Copyright (c) 2017-2018, Greene Laboratory, University of Pennsylvania and 
+Copyright (c) 2017-2018, Greene Laboratory, University of Pennsylvania and
 Childhood Cancer Data Lab, Alex's Lemonade Stand Foundation.
 All rights reserved.
 

diff --git a/api/data_refinery_api/views.py b/api/data_refinery_api/views.py
@@ -744,7 +744,9 @@ def get_queryset(self):
         """
         queryset = (
             Sample.public_objects.prefetch_related("organism")
-            .prefetch_related(Prefetch("results", queryset=ComputationalResult.objects.order_by('time_start')))
+            .prefetch_related(
+                Prefetch("results", queryset=ComputationalResult.objects.order_by("time_start"))
+            )
             .prefetch_related("results__processor")
             .prefetch_related("results__computationalresultannotation_set")
             .prefetch_related("results__computedfile_set")

diff --git a/common/data_refinery_common/models/models.py b/common/data_refinery_common/models/models.py
@@ -1,3 +1,19 @@
+"""
+# First Order Classes
+
+These represent the primary data types we will be querying
+and filtering against.
+
+# Files
+
+These are the database representations of files
+which live on local disk, on ephemeral storage,
+or on AWS cloud services.
+
+# Associations
+
+These represent the relationships between items in the other tables.
+"""
 import os
 import shutil
 import uuid
@@ -36,14 +52,6 @@
 CURRENT_SALMON_VERSION = "salmon " + get_env_variable("SALMON_VERSION", "0.13.1")
 CHUNK_SIZE = 1024 * 256  # chunk_size is in bytes
 
-"""
-# First Order Classes
-
-This represent the primary data types we will be querying
-and filtering against.
-
-"""
-
 
 class PublicObjectsManager(models.Manager):
     """
@@ -771,15 +779,6 @@ def save(self, *args, **kwargs):
         return super(OrganismIndex, self).save(*args, **kwargs)
 
 
-"""
-# Files
-
-These are the database representations of files
-which live on local disk, on ephemeral storage,
-or on AWS cloud services.
-"""
-
-
 class OriginalFile(models.Model):
     """ A representation of a file from an external source """
 
@@ -1524,13 +1523,6 @@ def terms_and_conditions(self):
         return settings.TERMS_AND_CONDITIONS
 
 
-"""
-# Associations
-
-These represent the relationships between items in the other tables.
-"""
-
-
 class ExperimentSampleAssociation(models.Model):
 
     experiment = models.ForeignKey(Experiment, blank=False, null=False, on_delete=models.CASCADE)

diff --git a/foreman/supported_platforms.csv b/foreman/supported_platforms.csv
diff --git a/infrastructure/data-refinery-key.pem b/infrastructure/data-refinery-key.pem
diff --git a/infrastructure/logging.tf b/infrastructure/logging.tf
@@ -53,4 +53,4 @@ resource "aws_cloudwatch_log_stream" "log_stream_api_nginx_access" {
 resource "aws_cloudwatch_log_stream" "log_stream_api_nginx_error" {
   name           = "log-stream-api-nginx-error-${var.user}-${var.stage}"
   log_group_name = "${aws_cloudwatch_log_group.data_refinery_log_group.name}"
-}
+}
diff --git a/infrastructure/nomad-configuration/lead_server.hcl b/infrastructure/nomad-configuration/lead_server.hcl
@@ -22,4 +22,3 @@ consul {
   server_auto_join    = false
   client_auto_join    = false
 }
-
diff --git a/workers/CRAN.gpg b/workers/CRAN.gpg
@@ -38,4 +38,3 @@ xai/btgtyvoSde69e9T8xKb3tiJ90b1RgIT32cdOKhIzRDdyCm7kSmlU2tzbfaDIba+UfOGy
 y00HHnlxSYBfZ8TPG5GKktLSKrgvTNQZ2B2OfiegYPMY
 =IwiZ
 -----END PGP PUBLIC KEY BLOCK-----
-
diff --git a/workers/LICENSE_DATASET.txt b/workers/LICENSE_DATASET.txt
@@ -55,4 +55,3 @@ UNLESS OTHERWISE MUTUALLY AGREED TO BY THE PARTIES IN WRITING, LICENSOR OFFERS T
     No term or provision of this License shall be deemed waived and no breach consented to unless such waiver or consent shall be in writing and signed by the party to be charged with such waiver or consent.
     This License constitutes the entire agreement between the parties with respect to the Work licensed here. There are no understandings, agreements or representations with respect to the Work not specified here. Licensor shall not be bound by any additional provisions that may appear in any communication from You. This License may not be modified without the mutual written agreement of the Licensor and You.
     The rights granted under, and the subject matter referenced, in this License were drafted utilizing the terminology of the Berne Convention for the Protection of Literary and Artistic Works (as amended on September 28, 1979), the Rome Convention of 1961, the WIPO Copyright Treaty of 1996, the WIPO Performances and Phonograms Treaty of 1996 and the Universal Copyright Convention (as revised on July 24, 1971). These rights and subject matter take effect in the relevant jurisdiction in which the License terms are sought to be enforced according to the corresponding provisions of the implementation of those treaty provisions in the applicable national law. If the standard suite of rights granted under applicable copyright law includes additional rights not granted under this License, such additional rights are deemed to be included in the License; this License is not intended to restrict the license of any rights under applicable law.
-
diff --git a/workers/README_DATASET.md b/workers/README_DATASET.md
@@ -14,7 +14,7 @@ Currently, we only support skipping quantile normalization for RNA-seq experimen
 * Individual gene expression matrices and their corresponding sample metadata files are in their own directories.
 
 * Gene expression matrices are the tab-separated value (TSV) files named by the experiment accession number (if aggregated by experiment) or species name (if aggregated by species).
-Note that samples are _columns_ and rows are _genes_ or _features_. 
+Note that samples are _columns_ and rows are _genes_ or _features_.
 This pattern is consistent with the input for many programs specifically designed for working with high-throughput gene expression data but may be transposed from what other machine learning libraries are expecting.
 
 * Sample metadata (e.g. disease vs. control labels) are contained in TSV files with `metadata` in the filename as well as any JSON files.
@@ -72,6 +72,6 @@ If you would prefer to report issues via e-mail, you can also email [ccdl@alexsl
 
 Please use the following:
 
-Casey S. Greene, Dongbo Hu, Richard W. W. Jones, Stephanie Liu, David S. Mejia, Rob Patro, Stephen R. Piccolo, Ariel Rodriguez Romero, Hirak Sarkar, Candace L. Savonen, Jaclyn N. Taroni, William E. Vauclain, Deepashree Venkatesh Prasad, Kurt G. Wheeler. **refine.bio: a resource of uniformly processed publicly available gene expression datasets.** URL: https://www.refine.bio 
+Casey S. Greene, Dongbo Hu, Richard W. W. Jones, Stephanie Liu, David S. Mejia, Rob Patro, Stephen R. Piccolo, Ariel Rodriguez Romero, Hirak Sarkar, Candace L. Savonen, Jaclyn N. Taroni, William E. Vauclain, Deepashree Venkatesh Prasad, Kurt G. Wheeler. **refine.bio: a resource of uniformly processed publicly available gene expression datasets.** URL: https://www.refine.bio
 
 _Note that the contributor list is in alphabetical order as we prepare a manuscript for submission._
diff --git a/workers/README_NORMALIZED.md b/workers/README_NORMALIZED.md
@@ -1,6 +1,6 @@
 # refine.bio Normalized Compendium
 
-This is a refine.bio normalized compendium comprised of all the samples from a species that we were able to process, aggregate, and normalize. 
+This is a refine.bio normalized compendium comprised of all the samples from a species that we were able to process, aggregate, and normalize.
 Normalized compendia provide a snapshot of the most complete collection of gene expression that refine.bio can produce for each supported organism.
 
 You can read more about how we process refine.bio compendia in [our documentation](http://docs.refine.bio/en/latest/main_text.html#refine-bio-compendia).
@@ -9,12 +9,12 @@ You can read more about how we process refine.bio compendia in [our documentatio
 
 This download includes a gene expression matrix and experiment and sample metadata for all samples from a given organism that are fit for inclusion in the normalized compendium.
 
-* The `aggregated_metadata.json` file contains experiment metadata and information about the transformation applied to the data. 
+* The `aggregated_metadata.json` file contains experiment metadata and information about the transformation applied to the data.
 Specifically, the `scale_by` field notes any row-wise transformation that was performed on the gene expression data. For normalized compendia, this value should always be `NONE`.
 
-* The gene expression matrix is the tab-separated value (TSV) file that bears the species name. 
+* The gene expression matrix is the tab-separated value (TSV) file that bears the species name.
 For example, if you have downloaded the zebrafish normalized compendium, you would find the gene expression matrix in the file `DANIO_RERIO/DANIO_RERIO.tsv`.
-Note that samples are _columns_ and rows are _genes_ or _features_. 
+Note that samples are _columns_ and rows are _genes_ or _features_.
 This pattern is consistent with the input for many programs specifically designed for working with high-throughput gene expression data but may be transposed from what other machine learning libraries are expecting.
 
 * Sample metadata (e.g. disease vs. control labels) are contained in the TSV file with `metadata` in the filename as well as any JSON files.
@@ -25,24 +25,24 @@ The contents of a sample's `refinebio_annotations` field include the submitter-s
 
 Please see [our documentation](http://docs.refine.bio/en/latest/) for more details.
 
-## Notes and observations 
+## Notes and observations
 
-Combining all samples from a given species is a technical challenge, as it necessitates the integration of different microarray platforms and microarray data with RNA-seq data. 
-Although the normalization steps we perform eliminate some sources of technical bias, it is imperfect and an active area of development. 
-We strongly encourage you to consider using methods or models that can account for such biases and to explore and visualize the data with particular concern for samples' technology of origin (RNA-seq, microarray). 
+Combining all samples from a given species is a technical challenge, as it necessitates the integration of different microarray platforms and microarray data with RNA-seq data.
+Although the normalization steps we perform eliminate some sources of technical bias, the process is imperfect and an active area of development.
+We strongly encourage you to consider using methods or models that can account for such biases and to explore and visualize the data with particular concern for samples' technology of origin (RNA-seq, microarray).
 
 ### Methods evaluation and exploratory data analysis
 
-To identify appropriate methods for processing the initial releases of normalized compendia (described [here](http://docs.refine.bio/en/latest/main_text.html#species-compendia)), we performed a series of evaluations in a small zebrafish test compendium. 
+To identify appropriate methods for processing the initial releases of normalized compendia (described [here](http://docs.refine.bio/en/latest/main_text.html#species-compendia)), we performed a series of evaluations in a small zebrafish test compendium.
 We've made these evaluations available and have documented our rationale on GitHub [here](https://github.com/AlexsLemonade/compendium-processing/tree/94089d2de170f0ca7b87e9e5c32239a8591faaa7/select_imputation_method).
 
-We have also performed exploratory analyses in a larger zebrafish test compendium ([GitHub](https://github.com/AlexsLemonade/compendium-processing/tree/94089d2de170f0ca7b87e9e5c32239a8591faaa7/quality_check)). 
+We have also performed exploratory analyses in a larger zebrafish test compendium ([GitHub](https://github.com/AlexsLemonade/compendium-processing/tree/94089d2de170f0ca7b87e9e5c32239a8591faaa7/quality_check)).
 We _briefly_ summarize our findings below, including links to relevant notebooks or plots:
 
 * Genes that are longer tend to have higher values in RNA-seq data as compared to microarray data ([notebook](https://alexslemonade.github.io/compendium-processing/quality_check/07-technology_diff_exp.nb.html)).
 * Unsurprisingly, shorter genes are less likely to be observed in RNA-seq data ([notebook](https://alexslemonade.github.io/compendium-processing/quality_check/06-lowly_expressed_genes.nb.html)).
 * Genes that are often zero in RNA-seq data have lower average expression in microarray data ([notebook](https://alexslemonade.github.io/compendium-processing/quality_check/08-gene_lengths.nb.html)).
-* We observe some differences in technology in the first two principle components, but there is also a group of RNA-seq samples that are different from all other samples (see below). 
+* We observe some differences in technology in the first two principal components, but there is also a group of RNA-seq samples that are different from all other samples (see below).
 These are samples from the Wellcome Sanger Institute Zebrafish Mutation Project ([notebook](https://alexslemonade.github.io/compendium-processing/quality_check/11-rnaseq_bias.nb.html)).
 
 
@@ -95,6 +95,6 @@ If you would prefer to report issues via e-mail, you can also email [ccdl@alexsl
 
 Please use the following:
 
-Casey S. Greene, Dongbo Hu, Richard W. W. Jones, Stephanie Liu, David S. Mejia, Rob Patro, Stephen R. Piccolo, Ariel Rodriguez Romero, Hirak Sarkar, Candace L. Savonen, Jaclyn N. Taroni, William E. Vauclain, Deepashree Venkatesh Prasad, Kurt G. Wheeler. **refine.bio: a resource of uniformly processed publicly available gene expression datasets.** URL: https://www.refine.bio 
+Casey S. Greene, Dongbo Hu, Richard W. W. Jones, Stephanie Liu, David S. Mejia, Rob Patro, Stephen R. Piccolo, Ariel Rodriguez Romero, Hirak Sarkar, Candace L. Savonen, Jaclyn N. Taroni, William E. Vauclain, Deepashree Venkatesh Prasad, Kurt G. Wheeler. **refine.bio: a resource of uniformly processed publicly available gene expression datasets.** URL: https://www.refine.bio
 
-_Note that the contributor list is in alphabetical order as we prepare a manuscript for submission._
+_Note that the contributor list is in alphabetical order as we prepare a manuscript for submission._
diff --git a/workers/README_QUANT.md b/workers/README_QUANT.md
@@ -36,6 +36,6 @@ If you would prefer to report issues via e-mail, you can also email [ccdl@alexsl
 
 Please use the following:
 
-Casey S. Greene, Dongbo Hu, Richard W. W. Jones, Stephanie Liu, David S. Mejia, Rob Patro, Stephen R. Piccolo, Ariel Rodriguez Romero, Hirak Sarkar, Candace L. Savonen, Jaclyn N. Taroni, William E. Vauclain, Deepashree Venkatesh Prasad, Kurt G. Wheeler. **refine.bio: a resource of uniformly processed publicly available gene expression datasets.** URL: https://www.refine.bio 
+Casey S. Greene, Dongbo Hu, Richard W. W. Jones, Stephanie Liu, David S. Mejia, Rob Patro, Stephen R. Piccolo, Ariel Rodriguez Romero, Hirak Sarkar, Candace L. Savonen, Jaclyn N. Taroni, William E. Vauclain, Deepashree Venkatesh Prasad, Kurt G. Wheeler. **refine.bio: a resource of uniformly processed publicly available gene expression datasets.** URL: https://www.refine.bio
 
-_Note that the contributor list is in alphabetical order as we prepare a manuscript for submission._
+_Note that the contributor list is in alphabetical order as we prepare a manuscript for submission._
diff --git a/workers/R_dependencies/README.md b/workers/R_dependencies/README.md
@@ -1,7 +1,7 @@
 # R dependencies
 In this directory, we store all the scripts for installing our R dependencies.
 These scripts are managed via `cranlock`, our system for locking versions of
-transitive dependencies. 
+transitive dependencies.
 
 ## Layout
 In each directory, there are three files: `packages.txt`,

diff --git a/workers/data_refinery_workers/processors/gene_convert_illumina.R b/workers/data_refinery_workers/processors/gene_convert_illumina.R
@@ -19,11 +19,11 @@ library("rlang")
 suppressPackageStartupMessages(library(AnnotationDbi))
 
 option_list = list(
-  make_option(c("-p", "--platform"), type="character", default="", 
+  make_option(c("-p", "--platform"), type="character", default="",
               help="Platform", metavar="character"),
-  make_option(c("-i", "--inputFile"), type="character", default="", 
+  make_option(c("-i", "--inputFile"), type="character", default="",
               help="inputFile", metavar="character"),
-  make_option(c("-o", "--outputFile"), type="character", default="", 
+  make_option(c("-o", "--outputFile"), type="character", default="",
               help="outputFile", metavar="character")
 )
 
@@ -36,14 +36,14 @@ outFilePath <- opt$outputFile
 
 # Read the data file
 message("Reading data file...")
-suppressWarnings(exprs <- fread(filePath, 
-					stringsAsFactors=FALSE, 
-					sep="\t", header=TRUE, 
-					autostart=10, 
-					data.table=FALSE, 
-					check.names=FALSE, 
-					fill=TRUE, 
-					na.strings="", 
+suppressWarnings(exprs <- fread(filePath,
+					stringsAsFactors=FALSE,
+					sep="\t", header=TRUE,
+					autostart=10,
+					data.table=FALSE,
+					check.names=FALSE,
+					fill=TRUE,
+					na.strings="",
 					showProgress=FALSE)
 				)
 

diff --git a/workers/data_refinery_workers/processors/illumina.R b/workers/data_refinery_workers/processors/illumina.R
@@ -1,7 +1,7 @@
 ###
 # illumina.R
 #
-# Originally written by Stephen Piccolo, 
+# Originally written by Stephen Piccolo,
 # modified by Rich Jones for Alex's Lemonade Stand Foundation.
 #
 ###
@@ -318,21 +318,21 @@ sig = function(y, m, verbose=TRUE)
 suppressPackageStartupMessages(library("optparse"))
 
 option_list = list(
-  make_option(c("-p", "--probeId"), type="character", default="PROBE_ID", 
+  make_option(c("-p", "--probeId"), type="character", default="PROBE_ID",
               help="Probe ID", metavar="character"),
-  make_option(c("-e", "--expression"), type="character", default=".AVG_Signal", 
+  make_option(c("-e", "--expression"), type="character", default=".AVG_Signal",
               help="expression", metavar="character"),
-  make_option(c("-d", "--detection"), type="character", default="Detection Pval", 
+  make_option(c("-d", "--detection"), type="character", default="Detection Pval",
               help="Detection Pval", metavar="character"),
-  make_option(c("-l", "--platform"), type="character", default="illuminaHumanv4", 
+  make_option(c("-l", "--platform"), type="character", default="illuminaHumanv4",
               help="Platform", metavar="character"),
-  make_option(c("-c", "--cores"), type="character", default="1", 
+  make_option(c("-c", "--cores"), type="character", default="1",
               help="Number of cores", metavar="character"),
-  make_option(c("-i", "--inputFile"), type="character", default="", 
+  make_option(c("-i", "--inputFile"), type="character", default="",
               help="inputFile", metavar="character"),
-  make_option(c("-o", "--outputFile"), type="character", default="", 
+  make_option(c("-o", "--outputFile"), type="character", default="",
               help="outputFile", metavar="character")
-); 
+);
 
 opt_parser = OptionParser(option_list=option_list);
 opt = parse_args(opt_parser);
Original file line number	Diff line number	Diff line change
Expand Up		@@ -22,4 +22,3 @@ consul {
		server_auto_join = false
		client_auto_join = false
		}
Original file line number	Diff line number	Diff line change
Expand Up		@@ -38,4 +38,3 @@ xai/btgtyvoSde69e9T8xKb3tiJ90b1RgIT32cdOKhIzRDdyCm7kSmlU2tzbfaDIba+UfOGy
		y00HHnlxSYBfZ8TPG5GKktLSKrgvTNQZ2B2OfiegYPMY
		=IwiZ
		-----END PGP PUBLIC KEY BLOCK-----
Original file line number	Diff line number	Diff line change
Expand Up		@@ -55,4 +55,3 @@ UNLESS OTHERWISE MUTUALLY AGREED TO BY THE PARTIES IN WRITING, LICENSOR OFFERS T
		No term or provision of this License shall be deemed waived and no breach consented to unless such waiver or consent shall be in writing and signed by the party to be charged with such waiver or consent.
		This License constitutes the entire agreement between the parties with respect to the Work licensed here. There are no understandings, agreements or representations with respect to the Work not specified here. Licensor shall not be bound by any additional provisions that may appear in any communication from You. This License may not be modified without the mutual written agreement of the Licensor and You.
		The rights granted under, and the subject matter referenced, in this License were drafted utilizing the terminology of the Berne Convention for the Protection of Literary and Artistic Works (as amended on September 28, 1979), the Rome Convention of 1961, the WIPO Copyright Treaty of 1996, the WIPO Performances and Phonograms Treaty of 1996 and the Universal Copyright Convention (as revised on July 24, 1971). These rights and subject matter take effect in the relevant jurisdiction in which the License terms are sought to be enforced according to the corresponding provisions of the implementation of those treaty provisions in the applicable national law. If the standard suite of rights granted under applicable copyright law includes additional rights not granted under this License, such additional rights are deemed to be included in the License; this License is not intended to restrict the license of any rights under applicable law.