From 1ecf20e8d77a6721f6613d45cc69cfbe2963a234 Mon Sep 17 00:00:00 2001 From: sof202 Date: Tue, 4 Feb 2025 13:43:11 +0000 Subject: [PATCH 1/2] refactor: specifically take in $RCONFIG for config file location --- .../DNAm/preprocessing/CETYGOdeconvolution.r | 4 +- array/DNAm/preprocessing/calcQCMetrics.r | 6 +- .../preprocessing/checkColnamesSampleSheet.r | 88 +++++++++---------- array/DNAm/preprocessing/checkRconfigFile.r | 3 +- array/DNAm/preprocessing/clusterCellTypes.r | 6 +- .../jobSubmission/1_runDNAmQC.sh | 14 +-- array/DNAm/preprocessing/loadDataGDS.r | 4 +- array/DNAm/preprocessing/normalisation.r | 7 +- 8 files changed, 65 insertions(+), 67 deletions(-) diff --git a/array/DNAm/preprocessing/CETYGOdeconvolution.r b/array/DNAm/preprocessing/CETYGOdeconvolution.r index 8c527485..d694dad0 100755 --- a/array/DNAm/preprocessing/CETYGOdeconvolution.r +++ b/array/DNAm/preprocessing/CETYGOdeconvolution.r @@ -21,10 +21,10 @@ #----------------------------------------------------------------------# args<-commandArgs(trailingOnly = TRUE) -dataDir <- args[1] +dataDir <- args[[1]] +configFile <- args[[2]] gdsFile <-paste0(dataDir, "/2_gds/raw.gds") -configFile <- paste0(dataDir, "/config.r") source(configFile) diff --git a/array/DNAm/preprocessing/calcQCMetrics.r b/array/DNAm/preprocessing/calcQCMetrics.r index cc9d48b7..f029dada 100755 --- a/array/DNAm/preprocessing/calcQCMetrics.r +++ b/array/DNAm/preprocessing/calcQCMetrics.r @@ -21,13 +21,13 @@ # DEFINE PARAMETERS #----------------------------------------------------------------------# args<-commandArgs(trailingOnly = TRUE) -dataDir <- args[1] -refDir <- args[2] +dataDir <- args[[1]] +refDir <- args[[2]] +configFile <- args[[3]] gdsFile <-paste0(dataDir, "/2_gds/raw.gds") qcData <-paste0(dataDir, "/2_gds/QCmetrics/QCmetrics.rdata") genoFile <- paste0(dataDir, "/0_metadata/epicSNPs.raw") -configFile <- paste0(dataDir, "/config.r") source(configFile) diff --git a/array/DNAm/preprocessing/checkColnamesSampleSheet.r 
b/array/DNAm/preprocessing/checkColnamesSampleSheet.r index 42cd7865..cc7bdd76 100644 --- a/array/DNAm/preprocessing/checkColnamesSampleSheet.r +++ b/array/DNAm/preprocessing/checkColnamesSampleSheet.r @@ -1,80 +1,78 @@ - - ## This script checks that sample sheet columns are formatted correctly prior to DNAm QC ## args <- commandArgs(trailingOnly = TRUE) -dataDir <- args[1] -configFile <- paste0(dataDir, "/config.r") +configFile <- args[[1]] # Load libraries library(stringdist, warn.conflicts = FALSE, quietly = TRUE) # for amatch() -'%ni%' <- Negate('%in%') # define '%ni%' (not in) +"%ni%" <- Negate("%in%") # define '%ni%' (not in) # Load sample sheet sampleSheet <- read.csv(paste0(dataDir, "/0_metadata/sampleSheet.csv"), na.strings = c("", "NA"), stringsAsFactors = FALSE) # Column names to test -req_cols <- c('Sample_ID','Individual_ID') # required column names -bsnm_cols <- c('Basename','Chip_ID','Chip_Location','Sentrix_ID','Sentrix_Position') # required when Basename not present -opt_cols <- c('Age') # optional column names -cond_cols <- c('Sex','Genotype_IID','Cell_Type') # conditionally column names +req_cols <- c("Sample_ID", "Individual_ID") # required column names +bsnm_cols <- c("Basename", "Chip_ID", "Chip_Location", "Sentrix_ID", "Sentrix_Position") # required when Basename not present +opt_cols <- c("Age") # optional column names +cond_cols <- c("Sex", "Genotype_IID", "Cell_Type") # conditionally column names # source checkColnames() source("checkColnamesFunction.r") -#1. Check required column names ------------------------------------------------------------------------------ +# 1. Check required column names ------------------------------------------------------------------------------ cat("1. 
Checking required column names: ") -cat(c('Basename',req_cols,'\n')) +cat(c("Basename", req_cols, "\n")) # check Basename first -b <- checkColnames(sampleSheet, bsnm_cols[1], type='Required', verbose=F) - -if(b$allPresent){ - # if Basename present, continue to check Basename alongside other required columns - checkColnames(sampleSheet, c('Basename',req_cols), type='Required') -}else{ - # if Basename not present, check Chip and Sentrix as alternatives - chip <- checkColnames(sampleSheet, bsnm_cols[2:3], type='Required', verbose=F) - sntrx <- checkColnames(sampleSheet, bsnm_cols[4:5], type='Required', verbose=F) - - # if either Chip or Sentrix present, continue to check other required columns - if(any(chip$allPresent | sntrx$allPresent)){ - cat("Basename column not found, but at least 2 of the following alternative columns are present: ", bsnm_cols[2:5],'\n') - cat("Checking remaining required columns",'\n') - checkColnames(sampleSheet, req_cols, type='Required') - }else{ - cat("Basename column not found, and neither set of alternative column names are present: ", bsnm_cols[2:5],'\n') - cat("Checking remaining required columns",'\n') - checkColnames(sampleSheet, req_cols, type='Required') - } +b <- checkColnames(sampleSheet, bsnm_cols[1], type = "Required", verbose = F) + +if (b$allPresent) { + # if Basename present, continue to check Basename alongside other required columns + checkColnames(sampleSheet, c("Basename", req_cols), type = "Required") +} else { + # if Basename not present, check Chip and Sentrix as alternatives + chip <- checkColnames(sampleSheet, bsnm_cols[2:3], type = "Required", verbose = F) + sntrx <- checkColnames(sampleSheet, bsnm_cols[4:5], type = "Required", verbose = F) + + # if either Chip or Sentrix present, continue to check other required columns + if (any(chip$allPresent | sntrx$allPresent)) { + cat("Basename column not found, but at least 2 of the following alternative columns are present: ", bsnm_cols[2:5], "\n") + cat("Checking remaining 
required columns", "\n") + checkColnames(sampleSheet, req_cols, type = "Required") + } else { + cat("Basename column not found, and neither set of alternative column names are present: ", bsnm_cols[2:5], "\n") + cat("Checking remaining required columns", "\n") + checkColnames(sampleSheet, req_cols, type = "Required") + } } -#2. Check optional column names ------------------------------------------------------------------------------ +# 2. Check optional column names ------------------------------------------------------------------------------ cat("2. Checking optional column names: ") -cat(opt_cols,'\n') -checkColnames(sampleSheet, opt_cols, type='Optional') +cat(opt_cols, "\n") +checkColnames(sampleSheet, opt_cols, type = "Optional") -#3. Check conditional column names --------------------------------------------------------------------------- -cat("Sourcing conditional variables from config.r",'\n') +# 3. Check conditional column names --------------------------------------------------------------------------- +cat("Sourcing conditional variables from config.r", "\n") source(configFile) -cond_status <- c(sexCheck,snpCheck,ctCheck) # T/Fs from config file -cat(paste0(c("sexCheck","snpCheck","ctCheck"),"=", cond_status),'\n') +cond_status <- c(sexCheck, snpCheck, ctCheck) # T/Fs from config file +cat(paste0(c("sexCheck", "snpCheck", "ctCheck"), "=", cond_status), "\n") # subset conditional colnames to those TRUE in config cond_cols.filtered <- cond_cols[cond_status] -if(all(cond_status==F)){ - cat("No conditional variables to check") -}else{ - cat("3. Checking conditional column names: ",'\n') - cat(cond_cols.filtered,'\n') - checkColnames(sampleSheet, cond_cols.filtered, type='Conditional') +if (all(cond_status == F)) { + cat("No conditional variables to check") +} else { + cat("3. 
Checking conditional column names: ", "\n") + cat(cond_cols.filtered, "\n") + checkColnames(sampleSheet, cond_cols.filtered, type = "Conditional") } -# ---------------------------------------------------------------------------------------------------------- # \ No newline at end of file +# ---------------------------------------------------------------------------------------------------------- # + diff --git a/array/DNAm/preprocessing/checkRconfigFile.r b/array/DNAm/preprocessing/checkRconfigFile.r index e10447d4..6165831f 100755 --- a/array/DNAm/preprocessing/checkRconfigFile.r +++ b/array/DNAm/preprocessing/checkRconfigFile.r @@ -22,8 +22,7 @@ print("checking config.r file parameters are present and correctly formatted...") args <- commandArgs(trailingOnly = TRUE) -dataDir <- args[1] -configFile <- file.path(dataDir, "config.r") +configFile <- args[[1]] source(configFile) diff --git a/array/DNAm/preprocessing/clusterCellTypes.r b/array/DNAm/preprocessing/clusterCellTypes.r index c0883f85..88b04f2a 100755 --- a/array/DNAm/preprocessing/clusterCellTypes.r +++ b/array/DNAm/preprocessing/clusterCellTypes.r @@ -18,14 +18,14 @@ #----------------------------------------------------------------------# args<-commandArgs(trailingOnly = TRUE) -dataDir <- args[1] -refDir <- args[2] +dataDir <- args[[1]] +refDir <- args[[2]] +configFile <- args[[3]] gdsFile <-paste0(dataDir, "/2_gds/raw.gds") qcOutFolder<-paste0(dataDir, "/2_gds/QCmetrics") qcData <-paste0(dataDir, "/2_gds/QCmetrics/QCmetrics.rdata") genoFile <- paste0(dataDir, "/0_metadata/epicSNPs.raw") -configFile <- paste0(dataDir, "/config.r") source(configFile) diff --git a/array/DNAm/preprocessing/jobSubmission/1_runDNAmQC.sh b/array/DNAm/preprocessing/jobSubmission/1_runDNAmQC.sh index 4dacf2c2..7c13ecd6 100755 --- a/array/DNAm/preprocessing/jobSubmission/1_runDNAmQC.sh +++ b/array/DNAm/preprocessing/jobSubmission/1_runDNAmQC.sh @@ -52,7 +52,7 @@ module load $RVERS # load specified R version cd 
${SCRIPTSDIR}/array/DNAm/preprocessing/ -Rscript checkRconfigFile.r ${DATADIR} +Rscript checkRconfigFile.r ${RCONFIG} config_malformed=$? if [[ "${config_malformed}" -ne 0 ]]; then print_error_message \ @@ -71,7 +71,7 @@ if [[ "${library_did_not_install}" -ne 0 ]]; then "Exiting..." fi -Rscript checkColnamesSampleSheet.r ${DATADIR} +Rscript checkColnamesSampleSheet.r ${RCONFIG} sample_sheet_malformed=$? if [[ "${sample_sheet_malformed}" -ne 0 ]]; then print_error_message \ @@ -81,7 +81,7 @@ fi mkdir -p ${GDSDIR}/QCmetrics -Rscript loadDataGDS.r ${DATADIR} +Rscript loadDataGDS.r ${DATADIR} ${RCONFIG} gds_problem_identified=$? if [[ "${gds_problem_identified}" -ne 0 ]]; then print_error_message \ @@ -91,9 +91,9 @@ fi chmod 755 ${DATADIR}/2_gds/raw.gds -Rscript calcQCMetrics.r ${DATADIR} ${REFDIR} +Rscript calcQCMetrics.r ${DATADIR} ${REFDIR} ${RCONFIG} -Rscript clusterCellTypes.r ${DATADIR} ${REFDIR} +Rscript clusterCellTypes.r ${DATADIR} ${REFDIR} ${RCONFIG} most_recent_git_tag=$(git describe --tags --abbrev=0) current_commit_hash=$(git rev-parse HEAD) @@ -106,12 +106,12 @@ mv QC.html ${GDSDIR}/QCmetrics/ mkdir -p ${DATADIR}/3_normalised -Rscript normalisation.r ${DATADIR} ${REFDIR} +Rscript normalisation.r ${DATADIR} ${REFDIR} ${RCONFIG} chmod 755 ${DATADIR}/2_gds/rawNorm.gds mkdir -p ${GDSDIR}/QCmetrics/CETYGO -Rscript CETYGOdeconvolution.r ${DATADIR} +Rscript CETYGOdeconvolution.r ${DATADIR} ${RCONFIG} ## print finish date and time echo Job finished on: diff --git a/array/DNAm/preprocessing/loadDataGDS.r b/array/DNAm/preprocessing/loadDataGDS.r index 77373280..2b2f5d77 100755 --- a/array/DNAm/preprocessing/loadDataGDS.r +++ b/array/DNAm/preprocessing/loadDataGDS.r @@ -17,11 +17,11 @@ # DEFINE PARAMETERS #----------------------------------------------------------------------# args <- commandArgs(trailingOnly = TRUE) -dataDir <- args[1] +dataDir <- args[[1]] +configFile <- args[[2]] gdsFile <- file.path(dataDir, "2_gds/raw.gds") -configFile <- paste0(dataDir, 
"/config.r") source(configFile) arrayType <- toupper(arrayType) diff --git a/array/DNAm/preprocessing/normalisation.r b/array/DNAm/preprocessing/normalisation.r index 9847779e..2464c6d2 100755 --- a/array/DNAm/preprocessing/normalisation.r +++ b/array/DNAm/preprocessing/normalisation.r @@ -19,13 +19,14 @@ # DEFINE PARAMETERS #----------------------------------------------------------------------# args<-commandArgs(trailingOnly = TRUE) -dataDir <- args[1] -refDir <- args[2] +dataDir <- args[[1]] +refDir <- args[[2]] +configFile <- args[[3]] + gdsFile <-file.path(dataDir, "/2_gds/raw.gds") normgdsFile<-sub("\\.gds", "Norm.gds", gdsFile) qcOutFolder<-file.path(dataDir, "/2_gds/QCmetrics") normData<-file.path(dataDir, "/3_normalised/normalised.rdata") -configFile <- paste0(dataDir, "/config.r") source(configFile) From 23e73997c0d586f2803e09cd68161c8a79ce4f67 Mon Sep 17 00:00:00 2001 From: sof202 Date: Tue, 4 Feb 2025 13:43:32 +0000 Subject: [PATCH 2/2] refactor: make argument assignment consistent with other scripts --- array/DNAm/preprocessing/QC.rmd | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/array/DNAm/preprocessing/QC.rmd b/array/DNAm/preprocessing/QC.rmd index 92f1ae8d..0e15fe13 100755 --- a/array/DNAm/preprocessing/QC.rmd +++ b/array/DNAm/preprocessing/QC.rmd @@ -23,16 +23,15 @@ library(RColorBrewer, warn.conflicts = FALSE, quietly = TRUE) library(pheatmap, warn.conflicts = FALSE, quietly = TRUE) library(data.table, warn.conflicts = FALSE, quietly = TRUE) - -source(args[4]) ### change the content of this file to run QC with different thresholds -### prior to running this Rmarkdown which summarises the QC output, QC metrics must have been generated - -dataDir <- args[2] -refDir <- args[3] -most_recent_git_tag <- args[6] -current_commit_hash <- args[7] +dataDir <- args[[2]] +refDir <- args[[3]] +configFile <- args[[4]] +most_recent_git_tag <- args[[6]] +current_commit_hash <- args[[7]] setwd(dataDir) +source(configFile) + 
qcData <-paste0(dataDir, "/2_gds/QCmetrics/QCmetrics.rdata") load(qcData)