[GEN-1574] Add merge and decode to staging (#161)
* initial commit of changes to migrate to staging

* add tests

* update dockerfile to use rstudio

* add merge and decode as individual step in nextflow

* remove unused library

* add save_synapse

* add merge and decode as option

* add logging

* remove extra bracket

* update to use params docker

* add ability to add version comment

* add missing comment arg in call

* change comment shorthand

* remove short flag

* add comment method

* update readme

* add better instructions

* revert comment storage, fix comments
rxu17 authored Nov 1, 2024
1 parent 86afb66 commit 24a49b7
Showing 10 changed files with 301 additions and 148 deletions.
`.github/workflows/build-docker-images.yml` — 2 changes: 1 addition & 1 deletion

```diff
@@ -13,7 +13,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        module: ["references", "table_updates"] # Define the modules you want to loop through for builds
+        module: ["references", "table_updates", "uploads"] # Define the modules you want to loop through for builds
     env:
       REGISTRY: ghcr.io
       IMAGE_NAME: sage-bionetworks/genie-bpc-pipeline
```
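The new `uploads` entry makes the CI matrix build and push an image for this module alongside the existing two. A roughly equivalent manual build, as a sketch (the per-module tag naming is an assumption; the workflow's tagging step is not shown in this diff):

```bash
# Sketch: what the matrix job does for the new "uploads" entry (tag name assumed)
docker build -t ghcr.io/sage-bionetworks/genie-bpc-pipeline:uploads scripts/uploads/
```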
`main.nf` — 11 changes: 7 additions & 4 deletions

```diff
@@ -93,14 +93,17 @@ workflow BPC_PIPELINE {
     ch_comment = Channel.value(params.comment)
 
     if (params.step == "update_potential_phi_fields_table") {
-        update_potential_phi_fields_table(ch_comment, params.production)// validate_data.out.view()
-    } else if (params.step == "update_data_table") {
+        update_potential_phi_fields_table(ch_comment, params.production)
+        // validate_data.out.view()
+    } else if (params.step == "merge_and_uncode_rca_uploads"){
+        merge_and_uncode_rca_uploads("default", ch_cohort, ch_comment, params.production)
+    } else if (params.step == "update_data_table") {
         update_data_table("default", ch_cohort, ch_comment, params.production)
     } else if (params.step == "genie_bpc_pipeline"){
         update_potential_phi_fields_table(ch_comment, params.production)
         run_quac_upload_report_error(update_potential_phi_fields_table.out, ch_cohort)
         run_quac_upload_report_warning(run_quac_upload_report_error.out, ch_cohort, params.production)
-        merge_and_uncode_rca_uploads(run_quac_upload_report_warning.out, ch_cohort, params.production)
+        merge_and_uncode_rca_uploads(run_quac_upload_report_warning.out, ch_cohort, ch_comment, params.production)
         // remove_patients_from_merged(merge_and_uncode_rca_uploads.out, ch_cohort, params.production)
         update_data_table(merge_and_uncode_rca_uploads.out, ch_cohort, ch_comment, params.production)
         update_date_tracking_table(update_data_table.out, ch_cohort, ch_comment, params.production)
```
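With the new `merge_and_uncode_rca_uploads` branch, the merge-and-uncode step can now be launched on its own rather than only inside the full `genie_bpc_pipeline` run. A minimal launch sketch, assuming a `cohort` parameter feeds `ch_cohort` (only `step`, `comment`, and `production` are visible in this diff):

```bash
# Sketch: run merge-and-uncode as a standalone staging step
# (--cohort is an assumed parameter name; NSCLC mirrors the README examples)
nextflow run main.nf \
    --step merge_and_uncode_rca_uploads \
    --cohort NSCLC \
    --comment "GEN-1574 staging test" \
    --production false
```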
`modules/merge_and_uncode_rca_uploads.nf` — 7 changes: 4 additions & 3 deletions

```diff
@@ -3,13 +3,14 @@ Merge and uncode REDcap export data files.
 */
 process merge_and_uncode_rca_uploads {
 
-    container 'sagebionetworks/genie-bpc-pipeline-uploads'
+    container "$params.uploads_docker"
     secret 'SYNAPSE_AUTH_TOKEN'
     debug true
 
     input:
    val previous
    val cohort
+   val comment
    val production
 
    output:
@@ -19,13 +20,13 @@ process merge_and_uncode_rca_uploads {
    if (production) {
    """
    cd /usr/local/src/myscripts/
-   Rscript merge_and_uncode_rca_uploads.R -c $cohort -u -v
+   Rscript merge_and_uncode_rca_uploads.R -c $cohort -v --production --save_synapse --comment $comment
    """
    }
    else {
    """
    cd /usr/local/src/myscripts/
-   Rscript merge_and_uncode_rca_uploads.R -c $cohort -v
+   Rscript merge_and_uncode_rca_uploads.R -c $cohort -v --save_synapse --comment $comment
    """
    }
}
```
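The process reads `SYNAPSE_AUTH_TOKEN` from Nextflow's secrets store rather than the shell environment. For local runs the token can be registered once; a sketch, assuming Nextflow 22.10+ with the secrets CLI:

```bash
# Register the Synapse PAT with Nextflow so `secret 'SYNAPSE_AUTH_TOKEN'` resolves
nextflow secrets set SYNAPSE_AUTH_TOKEN "<your_synapse_PAT>"
```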
`nextflow.config` — 1 change: 1 addition & 0 deletions

```diff
@@ -53,6 +53,7 @@ profiles {
         params {
             // docker image parameters, see nextflow_schema.json for details
             references_docker = "sagebionetworks/genie-bpc-pipeline-references"
+            uploads_docker = "sagebionetworks/genie-bpc-pipeline-uploads"
             table_updates_docker = "sagebionetworks/genie-bpc-pipeline-table-updates"
         }
     }
```
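Because the process container is now `"$params.uploads_docker"`, a staging run can point the uploads step at a different image from the command line instead of editing the module. A sketch (the feature-branch tag shown is hypothetical):

```bash
# Hypothetical: override the default uploads image for a single run
nextflow run main.nf \
    --step merge_and_uncode_rca_uploads \
    --uploads_docker ghcr.io/sage-bionetworks/genie-bpc-pipeline:uploads-my-branch
```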
`nextflow_schema.json` — 9 changes: 7 additions & 2 deletions

```diff
@@ -50,14 +50,19 @@
       "description": "Available BPC steps",
       "enum": [
         "update_potential_phi_fields_table",
-        "genie_bpc_pipeline",
-        "update_data_table"
+        "merge_and_uncode_rca_uploads",
+        "update_data_table",
+        "genie_bpc_pipeline"
       ]
     },
     "references_docker":{
       "type": "string",
       "description": "Name of docker to use in processes in scripts/references"
     },
+    "uploads_docker":{
+      "type": "string",
+      "description": "Name of docker to use in processes in scripts/uploads"
+    },
     "table_updates_docker":{
       "type": "string",
       "description": "Name of docker to use in processes in scripts/table_updates"
```
`scripts/uploads/Dockerfile` — 35 changes: 24 additions & 11 deletions

```diff
@@ -1,21 +1,34 @@
-FROM r-base:4.0.0
+FROM rstudio/r-base:4.0-bullseye
 
+# Set working directory
 WORKDIR /usr/local/src/myscripts
 
+# Set environment variable for renv version
 ENV RENV_VERSION 0.14.0
 
-RUN rm /etc/apt/apt.conf.d/default
-RUN apt-get update -y
-RUN apt-get install -y dpkg-dev zlib1g-dev libssl-dev libffi-dev
-# procps is required for nextflow tower
-RUN apt-get install -y curl libcurl4-openssl-dev procps
-RUN R -e "install.packages('synapser', repos=c('http://ran.synapse.org', 'http://cran.fhcrc.org'))"
+# Update apt-get and install system dependencies (only install required)
+RUN apt-get update -y && \
+    apt-get install -y --no-install-recommends \
+    dpkg-dev zlib1g-dev libssl-dev libffi-dev \
+    libcurl4-openssl-dev curl procps && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
 
-ENV PYTHON /usr/local/lib/R/site-library/PythonEmbedInR/bin/python3.6
+# Install R packages including remotes and renv
+RUN R -e "install.packages('remotes', repos = 'https://cloud.r-project.org')" && \
+    R -e "remotes::install_github('rstudio/renv', ref = '${RENV_VERSION}')" || true
 
-RUN R -e "install.packages('remotes', repos = c(CRAN = 'https://cloud.r-project.org'))"
-RUN R -e "remotes::install_github('rstudio/renv@${RENV_VERSION}')"
+# Install synapser with specific version
+RUN R -e "remotes::install_version('synapser', version = '0.11.7', repos = c('http://ran.synapse.org', 'http://cran.fhcrc.org'))"
 
-COPY . .
+# Set Python environment variable for R
+ENV PYTHON /usr/local/lib/R/site-library/PythonEmbedInR/bin/python3.6
+
+# Copy only renv.lock first to leverage docker cache for dependencies
+COPY renv.lock renv.lock
 
+# Restore R environment with renv
 RUN R -e "renv::restore()"
+
+# Copy the local project files into the container
+COPY . .
```
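Copying `renv.lock` on its own layer before the final `COPY . .` means the expensive `renv::restore()` layer is rebuilt only when the lockfile changes, not on every source edit. A local build-and-smoke-test sketch (the local image tag is illustrative):

```bash
# Build the uploads image and confirm the pinned synapser package loads
docker build -t genie-bpc-pipeline-uploads scripts/uploads/
docker run --rm genie-bpc-pipeline-uploads R -e "library(synapser)"
```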
`scripts/uploads/README.md` — 68 changes: 64 additions & 4 deletions

````diff
@@ -2,6 +2,10 @@
 [![automated](https://img.shields.io/docker/cloud/automated/sagebionetworks/genie-bpc-pipeline-uploads)](https://hub.docker.com/r/sagebionetworks/genie-bpc-pipeline-uploads)
 ![status](https://img.shields.io/docker/cloud/build/sagebionetworks/genie-bpc-pipeline-uploads)
 
+## Setup
+
+You will want to run on an EC2 instance with Docker installed.
+
 ## Installation
 
 Clone this repository and navigate to the directory:
@@ -10,11 +14,43 @@
 git clone git@github.com:Sage-Bionetworks/genie-bpc-pipeline.git
 cd genie-bpc-pipeline/bpc/uploads/
 ```
 
-Install all required R packages:
-```
-R -e 'renv::restore()'
-```
+### Local installation
+
+Install all required R packages by copying and running the commands in the Dockerfile
+at `scripts/uploads/Dockerfile`, except for the last step, which copies your current repo.
+
+### Docker
+
+Alternatively, you can pull the docker image associated with this module from [here](https://github.com/Sage-Bionetworks/genie-bpc-pipeline/pkgs/container/genie-bpc-pipeline) into your EC2.
+
+You can also pull the production docker image:
+
+```bash
+docker pull sagebionetworks/genie-bpc-pipeline-uploads
+```
+
+To run the docker image:
+
+1. Start the container:
+
+```bash
+docker run -d --name <nickname_for_container> <image_name> /bin/bash -c "while true; do sleep 1; done"
+```
+
+2. **Optional.** Make any changes you need inside the container (e.g., copy current local changes into it):
+
+```bash
+docker cp ./. <nickname_for_container>:/usr/local/src/myscripts
+```
+
+3. Open a bash session inside the container:
+
+```bash
+docker exec -it <nickname_for_container> /bin/bash
+```
+
+Now you can run the script commands.
 
 ## Synapse credentials
 
 Cache your Synapse personal access token (PAT) as an environment variable:
@@ -45,7 +81,7 @@ Usage: merge_and_uncode_rca_uploads.R [options]
 Options:
     -c COHORT, --cohort=COHORT
         BPC cohort
     -u, --save_synapse
         Save output to Synapse
@@ -54,13 +90,21 @@
     -h, --help
         Show this help message and exit
+    --production
+        Whether to run in production mode (as opposed to staging mode)
 ```
 
-Example run:
+Example run (staging):
 ```
 Rscript merge_and_uncode_rca_uploads.R -c NSCLC -u -a $SYNAPSE_AUTH_TOKEN
 ```
+
+Example run (production):
+```
+Rscript merge_and_uncode_rca_uploads.R -c NSCLC -u -a $SYNAPSE_AUTH_TOKEN --production
+```
 
 ## Usage: remove patient IDs from a REDCap formatted file
 
 To display the command line interface:
@@ -100,3 +144,19 @@
 ```
 Rscript remove_patients_from_merged.R -i syn23285494 -c NSCLC -r syn29266682 -v
 ```
+
+## Running tests
+
+There are unit tests under `scripts/uploads/tests`.
+
+1. Install the `mockery` and `testthat` packages via the command line:
+
+```bash
+R -e "remotes::install_cran(c('testthat', 'mockery'))"
+```
+
+2. Run the following in an R session in your EC2:
+
+```R
+library(testthat)
+test_dir("/usr/local/src/myscripts/tests")
+```
````
`scripts/uploads/config.yaml` — 3 changes: 2 additions & 1 deletion

```diff
@@ -10,7 +10,8 @@ synapse:
     type: table
     description: PRISSMM formatted data dictionaries for REDCap
   rca_files:
-    id: syn23286928
+    production_id: syn23286928
+    staging_id: syn63887337
     name: curated
     type: folder
    description: REDCap academic formatted files of merged BPC data ready for import to Synapse tables
```
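Splitting the single `id` into `production_id` and `staging_id` lets the upload scripts choose the Synapse destination folder from the `--production` flag instead of hardcoding one ID. A hypothetical lookup from the shell (assumes `yq` v4 is installed; the R code that actually consumes this config is in files not shown above):

```bash
# Hypothetical: resolve the rca_files folder id for the current mode
MODE=staging   # or "production"
yq ".synapse.rca_files.${MODE}_id" scripts/uploads/config.yaml
```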
(The remaining 2 of the 10 changed files are not shown.)