From 1961a5ae82242666161806022e339d12bb618330 Mon Sep 17 00:00:00 2001
From: Matiss Ozols <mo11@sanger.ac.uk>
Date: Tue, 28 Nov 2023 15:48:24 +0000
Subject: [PATCH] harriets changes

---
 assets/deploy_scripts/bsub.sh                 |   2 +-
 assets/deploy_scripts/bsub__removeWork.sh     |   2 +-
 assets/deploy_scripts/bsub_test.sh            |   2 +-
 assets/deploy_scripts/bsub_test_celltypes.sh  |   2 +-
 assets/deploy_scripts/bsub_test_recluster.sh  |  29 +
 .../input_setups/recluster_profile.nf         | 138 +++++
 .../nohup_start_nextflow_lsf.sh               |   2 +-
 .../nohup_start_nextflow_lsf__removeWork.sh   |   2 +-
 .../nohup_start_nextflow_lsf_celltypes.sh     |   2 +-
 .../nohup_start_nextflow_lsf_recluster.sh     |  27 +
 .../nohup_start_nextflow_lsf_test.sh          |   2 +-
 bin/0026-plot_filtered_cells.py               |  17 +-
 bin/0028-plot_predicted_sex.py                |   5 +-
 bin/0030-estimate_pca_elbow.py                |   5 +-
 bin/0035-scanpy_normalize_pca.py              | 143 +----
 ...canpy_cluster_validate_resolution-keras.py |   3 +-
 bin/pca_anndata.py                            | 556 ++++++++++++++++++
 conf/base.conf                                |  15 +-
 main.nf                                       |  13 +-
 .../nf-core/modules/clustering/functions.nf   |   2 +-
 modules/nf-core/modules/clustering/main.nf    |  82 ++-
 .../modules/estimate_pca_elbow/main.nf        |   3 -
 .../nf-core/modules/normalise_and_pca/main.nf |  95 +--
 subworkflows/qc.nf                            |  84 ++-
 workflows/yascp.nf                            |  21 +-
 25 files changed, 954 insertions(+), 300 deletions(-)
 create mode 100755 assets/deploy_scripts/bsub_test_recluster.sh
 create mode 100644 assets/deploy_scripts/input_setups/recluster_profile.nf
 create mode 100755 assets/deploy_scripts/nohup_start_nextflow_lsf_recluster.sh
 create mode 100755 bin/pca_anndata.py

diff --git a/assets/deploy_scripts/bsub.sh b/assets/deploy_scripts/bsub.sh
index bcbe9189..0d9012b8 100755
--- a/assets/deploy_scripts/bsub.sh
+++ b/assets/deploy_scripts/bsub.sh
@@ -21,5 +21,5 @@ if ["$varname" = ''];
 fi
 sample="$RUN_ID"
 echo -e "\n Submitting yascp (https://github.com/wtsi-hgi/yascp) with input file $INPUT_FILE"
-bsub -R'select[mem>8000] rusage[mem=8000]' -J $sample -n 1 -M 8000 -o $sample.o -e $sample.e -q long bash /software/hgi/pipelines/yascp_versions/yascp_v1.2/assets/deploy_scripts/nohup_start_nextflow_lsf.sh $INPUT_FILE
+bsub -R'select[mem>8000] rusage[mem=8000]' -J $sample -n 1 -M 8000 -o $sample.o -e $sample.e -q long bash /software/hgi/pipelines/yascp_versions/yascp_v1.3__work/assets/deploy_scripts/nohup_start_nextflow_lsf.sh $INPUT_FILE
 echo "Submitted job can be killed with: bkill -J $sample"
\ No newline at end of file
diff --git a/assets/deploy_scripts/bsub__removeWork.sh b/assets/deploy_scripts/bsub__removeWork.sh
index 1f2e5dfa..f1bffef1 100755
--- a/assets/deploy_scripts/bsub__removeWork.sh
+++ b/assets/deploy_scripts/bsub__removeWork.sh
@@ -5,5 +5,5 @@ INPUT_FILE=$1
 export RUN_ID="${PWD##*/}"
 sample="$RUN_ID.yascp"
 echo "Cleaning the work directory (https://github.com/wtsi-hgi/yascp) with input file $INPUT_FILE by using '-entry WORK_DIR_REMOVAL --remove_work_dir' "
-bsub -R'select[mem>4000] rusage[mem=4000]' -J $sample -n 1 -M 4000 -o $sample.o -e $sample.e -q long bash /software/hgi/pipelines/yascp_versions/yascp_v1.2/assets/deploy_scripts/nohup_start_nextflow_lsf__removeWork.sh $INPUT_FILE
+bsub -R'select[mem>4000] rusage[mem=4000]' -J $sample -n 1 -M 4000 -o $sample.o -e $sample.e -q long bash /software/hgi/pipelines/yascp_versions/yascp_v1.3__work/assets/deploy_scripts/nohup_start_nextflow_lsf__removeWork.sh $INPUT_FILE
 echo "Submitted job can be killed with: bkill -J $sample"
\ No newline at end of file
diff --git a/assets/deploy_scripts/bsub_test.sh b/assets/deploy_scripts/bsub_test.sh
index 8a163fff..52b474dd 100755
--- a/assets/deploy_scripts/bsub_test.sh
+++ b/assets/deploy_scripts/bsub_test.sh
@@ -25,5 +25,5 @@ fi
 
 sample="$RUN_ID.yascp"
 echo -e "\nSubmitting yascp (https://github.com/wtsi-hgi/yascp) in test mode withsample OneK1k dataset"
-bsub -R'select[mem>4000] rusage[mem=4000]' -J yascp_test -n 1 -M 4000 -o yascp_test.o -e yascp_test.e -q normal bash /software/hgi/pipelines/yascp_versions/yascp_v1.2/assets/deploy_scripts/nohup_start_nextflow_lsf_test.sh
+bsub -R'select[mem>4000] rusage[mem=4000]' -J yascp_test -n 1 -M 4000 -o yascp_test.o -e yascp_test.e -q normal bash /software/hgi/pipelines/yascp_versions/yascp_v1.3__work/assets/deploy_scripts/nohup_start_nextflow_lsf_test.sh
 echo "Submitted job can be killed with: bkill -J yascp_test"
\ No newline at end of file
diff --git a/assets/deploy_scripts/bsub_test_celltypes.sh b/assets/deploy_scripts/bsub_test_celltypes.sh
index 3c6dc200..7a12a9ac 100755
--- a/assets/deploy_scripts/bsub_test_celltypes.sh
+++ b/assets/deploy_scripts/bsub_test_celltypes.sh
@@ -25,5 +25,5 @@ fi
 
 sample="$RUN_ID.yascp"
 echo -e "\nSubmitting yascp (https://github.com/wtsi-hgi/yascp) in JUST_CELLTYPES mode with input file $INPUT_FILE"
-bsub -R'select[mem>4000] rusage[mem=4000]' -J yascp_celltypes -n 1 -M 4000 -o yascp_celltypes.o -e yascp_celltypes.e -q normal bash /software/hgi/pipelines/yascp_versions/yascp_v1.2/assets/deploy_scripts/nohup_start_nextflow_lsf_celltypes.sh  $INPUT_FILE
+bsub -R'select[mem>4000] rusage[mem=4000]' -J yascp_celltypes -n 1 -M 4000 -o yascp_celltypes.o -e yascp_celltypes.e -q normal bash /software/hgi/pipelines/yascp_versions/yascp_v1.3__work/assets/deploy_scripts/nohup_start_nextflow_lsf_celltypes.sh  $INPUT_FILE
 echo "Submitted job can be killed with: bkill -J yascp_celltypes"
\ No newline at end of file
diff --git a/assets/deploy_scripts/bsub_test_recluster.sh b/assets/deploy_scripts/bsub_test_recluster.sh
new file mode 100755
index 00000000..7c8d6b97
--- /dev/null
+++ b/assets/deploy_scripts/bsub_test_recluster.sh
@@ -0,0 +1,29 @@
+#!/usr/bin/env bash
+CWD1="$PWD"
+parentdir="$(dirname "$CWD1")"
+INPUT_FILE=$1
+export RUN_ID="${PWD##*/}"
+
+# export SINGULARITY_CACHEDIR='/software/hgi/containers/yascp'
+
+export NXF_OPTS="-Xms5G -Xmx5G"
+export SINGULARITY_TMPDIR=$PWD/work/tmp
+export TEMP=$PWD/work/tmp
+export TMP_DIR=$PWD/work/tmp
+
+echo press ENTER to NOT fetch containers, otherwise provide writable path:
+read varname
+
+if ["$varname" = ''];
+    then
+        export NXF_SINGULARITY_CACHEDIR='/software/hgi/containers/yascp'
+        export SINGULARITY_DISABLE_CACHE=0
+    else
+        echo Yascp Will fetch the containers and place them in $varname
+        export NXF_SINGULARITY_CACHEDIR=$varname
+fi
+
+sample="$RUN_ID.yascp"
+echo -e "\nSubmitting yascp (https://github.com/wtsi-hgi/yascp) in JUST_RECLUSTER mode with input file $INPUT_FILE"
+bsub -R'select[mem>4000] rusage[mem=4000]' -J yascp_cluster -n 1 -M 4000 -o yascp_cluster.o -e yascp_cluster.e -q normal bash /software/hgi/pipelines/yascp_versions/yascp_v1.3__work/assets/deploy_scripts/nohup_start_nextflow_lsf_recluster.sh  $INPUT_FILE
+echo "Submitted job can be killed with: bkill -J yascp_cluster"
\ No newline at end of file
diff --git a/assets/deploy_scripts/input_setups/recluster_profile.nf b/assets/deploy_scripts/input_setups/recluster_profile.nf
new file mode 100644
index 00000000..cd84a1b0
--- /dev/null
+++ b/assets/deploy_scripts/input_setups/recluster_profile.nf
@@ -0,0 +1,138 @@
+params {
+
+    lisi{
+        run_process=true
+    }
+    replace_genotype_ids=false
+    write_h5=true
+    cluster_validate_resolution_keras = true
+    // run_celltype_assignment = true
+    project_name = 'T_Cell_Bio_Response'
+    filter_outliers = false
+    extra_sample_metadata =""
+    output_dir = outdir= "${launchDir}/recluster_resolutions"
+    cellex_cluster_markers=true 
+    cluster_markers = false
+    normalise_andata = false
+    skip_handover = true
+    // output_dir = outdir= "${launchDir}/results"
+    // run_celltype_assignment=true
+    split_ad_per_bach=true //if not splitting the celltype assignment will be run on full tranche
+    // input_data_table = "$outdir/handover/Summary_plots/$RUN_ID/Fetch Pipeline/Input/input_table.tsv"
+    // cellbender_location="${output_dir}/nf-preprocessing/cellbender" //!!!!! if cellbender is run already then can skip this by selecting  input = 'existing_cellbender' instead input = 'cellbender'
+    // existing_cellsnp="${output_dir}/cellsnp"
+    cellbender_location="/lustre/scratch123/hgi/teams/hgi/mo11/tmp_projects/harriet/qc/results_11_09_2023/nf-preprocessing/cellbender" //!!!!! if cellbender is run already then can skip this by selecting  input = 'existing_cellbender' instead input = 'cellbender'
+    existing_cellsnp="/lustre/scratch123/hgi/teams/hgi/mo11/tmp_projects/harriet/qc/results/cellsnp"
+
+    skip_preprocessing = true
+    // file__anndata_merged = '/lustre/scratch126/humgen/projects/sc-eqtl-ibd/analysis/harriet_analysis/230313_hb58_yascp_analysis/231114_h5ad_files_for_MCC/231120_TCs_only_regressed_counts_HVGs.h5ad'
+
+    harmony{
+        run_process= true
+    }
+    umap{
+        run_process = true
+        colors_quantitative{
+            description = 'Comma separated string of quantitative variables that will be used to color points.'
+            value = 'n_cells,total_counts,pct_counts_gene_group__mito_transcript,prob_doublet,pct_counts_gene_group__ribo_rna,Azimuth:predicted.celltype.l2.score,Azimuth:mapping.score,log10_ngenes_by_count'
+        }
+        colors_categorical{
+            description = 'Comma separated string of categorical variables that will be used to color points.'
+            value = 'cell_passes_qc,cell_passes_qc-per:Azimuth:L0_predicted.celltype.l2,experiment_id,Azimuth:predicted.celltype.l2,Celltypist:Immune_All_Low:predicted_labels,Celltypist:Immune_All_High:predicted_labels,donor_id'
+        }
+    }
+
+    mads_categories ='pct_counts_gene_group__mito_transcript,pct_counts_gene_group__mito_protein,pct_counts_gene_group__ribo_protein,pct_counts_gene_group__ribo_rna,total_counts,n_genes_by_counts,log10_ngenes_by_count'
+    // hard_filters_file       = "${projectDir}/../sample_qc.yml"
+    // hard_filters_drop = false //#This indicates whether we want to drop the cells that fail hard filters of just flag them
+
+    cluster{
+        description = """Parameters for clustering. All pairwise combinations of
+        method and resolution will be performed."""
+        number_neighbors{
+            description = """Number of neighbors. If <= 0, uses number of unique
+            experiment_id."""
+            value = 15
+        }
+        methods{
+            description = 'Clustering method. Valid options [leiden|louvain].'
+            value = 'leiden'
+        }
+        resolutions{
+            description = 'Clustering resolution.'
+            value = [0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]
+        }
+
+        variables_boxplot{
+            decription = 'Generate boxplots of these variables for each cluster.'
+            value ='n_cells,total_counts,pct_counts_gene_group__mito_transcript'
+        }
+
+        known_markers{
+            run_process = false
+            description = """Files with markers that will be used to generate
+            dotplots. Each marker file should be the full path and have the
+            following columns: cell_type, hgnc_symbol. The following columns
+            are optional: p_value_adj. Use "" for a single entry in the
+            file_id and file value to indicate no plots."""
+            value = [
+                [ file_id: 'SmillieCS_31348891', file: '/lustre/scratch119/humgen/projects/sc-eqtl-ibd/data/marker_gene_db-raw_data/database/celltypes/colon/SmillieCS-31348891/database.tsv' ],
+                [ file_id: 'ParikhK_30814735', file: '/lustre/scratch119/humgen/projects/sc-eqtl-ibd/data/marker_gene_db-raw_data/database/celltypes/colon/ParikhK-30814735/database.tsv' ],
+                [ file_id: 'JamesKR_32066951', file: '/lustre/scratch119/humgen/projects/sc-eqtl-ibd/data/marker_gene_db-raw_data/database/celltypes/colon-immune/JamesKR-32066951/database.tsv' ]
+            ]
+        }
+
+
+
+
+    }  
+    bbknn{
+        run_process = true
+    }
+
+    celltype_assignment{
+        run_celltype_assignment=false
+        run_azimuth=true
+        run_keras=false
+        run_celltypist=true
+    }
+    reduced_dims{
+        vars_to_regress{
+            value = ''   
+        }
+    }
+
+}
+
+process {
+
+    withName: plot_distributions{
+        containerOptions = "--containall --cleanenv --workdir /tmp -B /tmp"
+    }
+
+    withName: cellex_cluster_markers{
+        maxForks=7
+        memory = 300.GB
+    }
+    
+    withName: GATHER_DATA{
+        maxForks=7
+        memory = 100.GB
+    }
+    withName: LISI{
+        maxForks=7
+        memory = 300.GB
+    }
+    withName: cluster_validate_resolution_keras{
+        memory = 300.GB
+    }
+
+    withName: umap_calculate_and_plot{
+        memory = 300.GB
+    }
+
+    withName: sccaf_assess_clustering{
+        memory = 300.GB
+    }
+    
+}
diff --git a/assets/deploy_scripts/nohup_start_nextflow_lsf.sh b/assets/deploy_scripts/nohup_start_nextflow_lsf.sh
index f1dfcbc0..ee8066ab 100755
--- a/assets/deploy_scripts/nohup_start_nextflow_lsf.sh
+++ b/assets/deploy_scripts/nohup_start_nextflow_lsf.sh
@@ -17,7 +17,7 @@ parentdir="$(dirname "$CWD1")"
 export RUN_ID="${PWD##*/}"
 mkdir $PWD/work || echo 'exists'
 mkdir $PWD/work/tmp || echo 'exists'
-echo $RUN_ID | nextflow run /software/hgi/pipelines/yascp_versions/yascp_v1.2 -profile sanger -c $INPUT_FILE --nf_ci_loc $PWD -resume > nextflow.nohup.log 2>&1 & 
+echo $RUN_ID | nextflow run /software/hgi/pipelines/yascp_versions/yascp_v1.3__work -profile sanger -c $INPUT_FILE --nf_ci_loc $PWD -resume > nextflow.nohup.log 2>&1 & 
 
 # get process PID 
 sleep 1 && export PID=$(pgrep -f "\\-\\-nf_ci_loc $RUN_DIR")
diff --git a/assets/deploy_scripts/nohup_start_nextflow_lsf__removeWork.sh b/assets/deploy_scripts/nohup_start_nextflow_lsf__removeWork.sh
index f640bbf3..28db82dc 100755
--- a/assets/deploy_scripts/nohup_start_nextflow_lsf__removeWork.sh
+++ b/assets/deploy_scripts/nohup_start_nextflow_lsf__removeWork.sh
@@ -21,7 +21,7 @@ export RUN_ID="${PWD##*/}"
 # export TEMP=$PWD/tmp
 # export TMP_DIR=$PWD/tmp
 
-echo $RUN_ID | nextflow run /software/hgi/pipelines/yascp_versions/yascp_v1.2 -profile sanger  -c $INPUT_FILE --nf_ci_loc $PWD -entry WORK_DIR_REMOVAL --remove_work_dir -resume > nextflow.nohup.log 2>&1 & 
+echo $RUN_ID | nextflow run /software/hgi/pipelines/yascp_versions/yascp_v1.3__work -profile sanger  -c $INPUT_FILE --nf_ci_loc $PWD -entry WORK_DIR_REMOVAL --remove_work_dir -resume > nextflow.nohup.log 2>&1 & 
 
 # get process PID 
 sleep 1 && export PID=$(pgrep -f "\\-\\-nf_ci_loc $RUN_DIR")
diff --git a/assets/deploy_scripts/nohup_start_nextflow_lsf_celltypes.sh b/assets/deploy_scripts/nohup_start_nextflow_lsf_celltypes.sh
index 800475d7..295cf5c7 100755
--- a/assets/deploy_scripts/nohup_start_nextflow_lsf_celltypes.sh
+++ b/assets/deploy_scripts/nohup_start_nextflow_lsf_celltypes.sh
@@ -17,7 +17,7 @@ parentdir="$(dirname "$CWD1")"
 export RUN_ID="${PWD##*/}"
 mkdir $PWD/work || echo 'exists'
 mkdir $PWD/work/tmp || echo 'exists'
-echo $RUN_ID | nextflow run /software/hgi/pipelines/yascp_versions/yascp_v1.2 -profile sanger -entry JUST_CELLTYPES -c $INPUT_FILE  --nf_ci_loc $PWD -resume > nextflow.nohup.log 2>&1 & 
+echo $RUN_ID | nextflow run /software/hgi/pipelines/yascp_versions/yascp_v1.3__work -profile sanger -entry JUST_CELLTYPES -c $INPUT_FILE  --nf_ci_loc $PWD -resume > nextflow.nohup.log 2>&1 & 
 
 # get process PID 
 sleep 1 && export PID=$(pgrep -f "\\-\\-nf_ci_loc $RUN_DIR")
diff --git a/assets/deploy_scripts/nohup_start_nextflow_lsf_recluster.sh b/assets/deploy_scripts/nohup_start_nextflow_lsf_recluster.sh
new file mode 100755
index 00000000..995e07e5
--- /dev/null
+++ b/assets/deploy_scripts/nohup_start_nextflow_lsf_recluster.sh
@@ -0,0 +1,27 @@
+#!/usr/bin/env bash
+INPUT_FILE=$1
+dt=`date +"%Y_%m_%d_%T"`
+cp nextflow.nohup.log ./nextflow.nohup_$dt.log2 || echo 'first time running'
+# activate Nextflow conda env
+
+# clean up previous run files
+rm -f *.log
+rm -f nextflow.nohup.PID.txt 
+
+# start Nextflow in background:
+export NXF_OPTS="-Xms5G -Xmx5G"
+
+CWD1="$PWD"
+parentdir="$(dirname "$CWD1")"
+# export RUN_ID="${parentdir##*/}"
+export RUN_ID="${PWD##*/}"
+mkdir $PWD/work || echo 'exists'
+mkdir $PWD/work/tmp || echo 'exists'
+echo $RUN_ID | nextflow run /software/hgi/pipelines/yascp_versions/yascp_v1.3__work -profile sanger -entry JUST_RECLUSTER -c /software/hgi/pipelines/yascp_versions/yascp_v1.3__work/assets/deploy_scripts/input_setups/recluster_profile.nf -c $INPUT_FILE  --nf_ci_loc $PWD -resume > nextflow.nohup.log 2>&1 & 
+
+# get process PID 
+sleep 1 && export PID=$(pgrep -f "\\-\\-nf_ci_loc $RUN_DIR")
+echo $PID > nextflow.nohup.PID.txt
+echo "Nextflow PID is $PID (saved in ./nextflow.nohup.PID.txt)" 
+echo kill with \"kill $PID\"
+echo "check logs files nextflow.nohup.log and .nextflow.log"
diff --git a/assets/deploy_scripts/nohup_start_nextflow_lsf_test.sh b/assets/deploy_scripts/nohup_start_nextflow_lsf_test.sh
index 6a8e1946..cc5fd45d 100755
--- a/assets/deploy_scripts/nohup_start_nextflow_lsf_test.sh
+++ b/assets/deploy_scripts/nohup_start_nextflow_lsf_test.sh
@@ -16,7 +16,7 @@ parentdir="$(dirname "$CWD1")"
 export RUN_ID="${PWD##*/}"
 mkdir $PWD/work || echo 'exists'
 mkdir $PWD/work/tmp || echo 'exists'
-echo $RUN_ID | nextflow run /software/hgi/pipelines/yascp_versions/yascp_v1.2 -profile sanger,test --nf_ci_loc $PWD -resume > nextflow.nohup.log 2>&1 & 
+echo $RUN_ID | nextflow run /software/hgi/pipelines/yascp_versions/yascp_v1.3__work -profile sanger,test --nf_ci_loc $PWD -resume > nextflow.nohup.log 2>&1 & 
 
 # get process PID 
 sleep 1 && export PID=$(pgrep -f "\\-\\-nf_ci_loc $RUN_DIR")
diff --git a/bin/0026-plot_filtered_cells.py b/bin/0026-plot_filtered_cells.py
index 1752b769..7295070b 100755
--- a/bin/0026-plot_filtered_cells.py
+++ b/bin/0026-plot_filtered_cells.py
@@ -67,13 +67,16 @@ def main():
     # Check if any difference between before and after filters.	If not,
     # return early.
     df_after_filters = df[df.filter_type.isin(['after_filters'])]
-    filt = df_after_filters.n_cells_left_in_adata == df_before_filters.loc[
-        df_after_filters.experiment_id,
-        'n_cells_left_in_adata'
-    ].values
-    if all(filt):
-        print("No difference detected before and after filters. No plots.")
-        return()
+    try:
+        filt = df_after_filters.n_cells_left_in_adata == df_before_filters.loc[
+            df_after_filters.experiment_id,
+            'n_cells_left_in_adata'
+        ].values
+        if all(filt):
+            print("No difference detected before and after filters. No plots.")
+            return()
+    except:
+            return()
 
     # Set some plotting parameters
     plt_height = 16  # 1.5 * df.experiment_id.nunique()
diff --git a/bin/0028-plot_predicted_sex.py b/bin/0028-plot_predicted_sex.py
index 700e98d3..fad7fab6 100755
--- a/bin/0028-plot_predicted_sex.py
+++ b/bin/0028-plot_predicted_sex.py
@@ -60,7 +60,10 @@ def main():
 
     # Load the AnnData file
     adata = sc.read_h5ad(filename=options.h5)
-
+    try:
+        adata.X=adata.layers['counts']
+    except:
+        _='counts may be already set'
     # If we have a flag for cells that pass QC then filter down to them
     if 'cell_passes_qc' in adata.obs:
         adata = adata[adata.obs['cell_passes_qc'], :]
diff --git a/bin/0030-estimate_pca_elbow.py b/bin/0030-estimate_pca_elbow.py
index 75c1e490..0a952160 100755
--- a/bin/0030-estimate_pca_elbow.py
+++ b/bin/0030-estimate_pca_elbow.py
@@ -78,7 +78,10 @@ def main():
 
     # Read in the dataframe
     adata = sc.read_h5ad(filename=options.h5)
-
+    try:
+        adata.X=adata.layers['counts']
+    except:
+        _='counts may be already set'
     kneedle_dict = {}
     output_dict = {}
 
diff --git a/bin/0035-scanpy_normalize_pca.py b/bin/0035-scanpy_normalize_pca.py
index 99747ec5..93c6ba67 100755
--- a/bin/0035-scanpy_normalize_pca.py
+++ b/bin/0035-scanpy_normalize_pca.py
@@ -372,7 +372,6 @@ def scanpy_normalize_and_pca(
     sc.pp.filter_genes(adata, min_cells=5)
     # Only consider genes expressed in more than 0.5% of cells:
     # sc.pp.filter_genes(adata, min_cells=0.005*len(adata.obs.index))
-
     # Total-count normalize (library-size correct) the data matrix X to
     # counts per million, so that counts become comparable among cells.
     sc.pp.normalize_total(
@@ -385,26 +384,8 @@ def scanpy_normalize_and_pca(
     # Logarithmize the data: X = log(X + 1) where log = natural logorithm.
     # Numpy has a nice function to undo this np.expm1(adata.X).
     sc.pp.log1p(adata)
-    # Delete automatically added uns - UPDATE: bad idea to delete as this slot
-    # is used in _highly_variable_genes_single_batch.
-    # del adata.uns['log1p']
-    # Add record of this operation.
-    # adata.layers['log1p_cpm'] = adata.X.copy()
-    # adata.uns['log1p_cpm'] = {'transformation': 'ln(CPM+1)'}
     adata.layers['log1p_cp10k'] = adata.X.copy()
     adata.uns['log1p_cp10k'] = {'transformation': 'ln(CP10k+1)'}
-
-    # Stash the unprocessed data in the raw slot.
-    # adata.raw.X.data is now ln(CPM+1).
-    # NOTE: - Layers are not preserved in adata.raw, though obs, var, uns are.
-    #       - If genes are filtered (e.g.,
-    #         sc.pp.filter_genes(adata, min_cells=1)), the full dataset will
-    #         remain in the raw slot.
-    #       - We store in the raw slot because later for UMAP and marker gene
-    #         analysis, we can easily tell scanpy to use the raw slot via the
-    #         use_raw = True flag. Raw was specifically designed for this use
-    #         case of ln(CPM+1),
-    # Can be deleted later: del adata.raw
     adata.raw = adata
     # adata_raw = adata.raw.to_adata()
 
@@ -433,32 +414,7 @@ def scanpy_normalize_and_pca(
         batch_key=variable_feature_batch_key,
         inplace=True
     )
-    if verbose:
-        print('{}: {} (all batches); {} ({})'.format(
-            'Number of variable features detected',
-            adata.var['highly_variable_intersection'].sum(),
-            adata.var['highly_variable'].sum(),
-            'after ranking the number of batches where a feature is variable'
-        ))
-    # If n_top_genes = None, then one needs to set 'highly_variable'.
-    # Here, highly_variable_intersection is only true for genes variable across
-    # all batch keys (i.e., 'highly_variable_nbatches' = n_batch_keys):
-    # adata.var.loc[
-    #     adata.var["highly_variable_intersection"],
-    #     ["highly_variable_nbatches"]
-    # ]
-    #
-    # If n_top_genes = None, then one also needs needs to set highly_variable'.
-    # Fix bug in PCA when we have set batch_key. More below:
-    # https://github.com/theislab/scanpy/issues/1032
-    # adata.var['highly_variable'] = adata.var['highly_variable_intersection']
-    #
-    # Alternatively, if one specifies n_top_genes, then genes are ranked by
-    # 'highly_variable_nbatches' and highly_variable is set to the top n.
-    # adata.var.loc[
-    #     adata.var["highly_variable"],
-    #     ["highly_variable_nbatches"]
-    # ]
+
 
     if plot:
         # Plot highly variable genes.
@@ -609,106 +565,15 @@ def scanpy_normalize_and_pca(
             copy=False
         )
 
+
     # Keep a record of the different gene scores
     if score_genes_df is not None:
         adata.uns['df_score_genes'] = score_genes_df_updated
 
-    # Calculate PCs.
-
-    seed_value = 0
-    # 0. Set `PYTHONHASHSEED` environment variable at a fixed value
-    os.environ['PYTHONHASHSEED'] = str(seed_value)
-    # 1. Set `python` built-in pseudo-random generator at a fixed value
-    random.seed(seed_value)
-    # 2. Set `numpy` pseudo-random generator at a fixed value
-    np.random.seed(seed_value)
-
-    sc.tl.pca(
-        adata,
-        n_comps=min(200, adata.var['highly_variable'].sum()),
-        zero_center=True,  # Set to true for standard PCA
-        svd_solver='arpack',  # arpack reproducible when zero_center = True
-        use_highly_variable=True,
-        copy=False,
-        random_state=np.random.RandomState(0),
-        chunked=False
-    )
-    # pca(
-    #     adata,
-    #     n_comps=min(200, adata.var['highly_variable'].sum()),
-    #     svd_solver='arpack',  # lobpcg not found in current sklearn
-    #     use_highly_variable=True,
-    #     copy=False
-    # )
-
-    # Save PCs to a seperate file for Harmony.
-    pca_df = pd.DataFrame(
-        adata.obsm['X_pca'],
-        index=adata.obs_names,
-        columns=[
-            'PC{}'.format(x) for x in range(1, adata.obsm['X_pca'].shape[1]+1)
-        ]
-    )
-    pca_df.to_csv(
-        '{}-pcs.tsv.gz'.format(output_file),
-        sep='\t',
-        index=True,
-        index_label='cell_barcode',
-        na_rep='',
-        compression=compression_opts
-    )
-
-    # Save the metadata to a seperate file for Harmony.
-    adata.obs.to_csv(
-        '{}-metadata.tsv.gz'.format(output_file),
-        sep='\t',
-        index=True,
-        quoting=csv.QUOTE_NONNUMERIC,
-        index_label='cell_barcode',
-        na_rep='',
-        compression=compression_opts
-    )
-
     # Save the data.
     adata.write(
-        '{}-normalized_pca.h5ad'.format(output_file),
+        '{}-normalized.h5ad'.format(output_file),
         compression='gzip'
-        #compression_opts=anndata_compression_opts
-    )
-    # adata_merged.write_csvs(output_file)
-    # adata_merged.write_loom(output_file+".loom"))
-
-    # Plot the PC info.
-    if plot:
-        # Plot the vanilla PCs.
-        # sc.pl.pca(
-        #     adata,
-        #     color='experiment_id',
-        #     components=['1,2', '3,4']
-        # )
-        _ = sc.pl.pca_variance_ratio(
-            adata,
-            n_pcs=adata.obsm['X_pca'].shape[1],
-            log=False,
-            show=False,
-            save='-{}.pdf'.format(output_file)
-        )
-        _ = sc.pl.pca_variance_ratio(
-            adata,
-            n_pcs=adata.obsm['X_pca'].shape[1],
-            log=True,
-            show=False,
-            save='-{}-log.pdf'.format(output_file)
-        )
-
-    # Save the filtered count matrix for input to other software like scVI
-    adata.X = adata.layers['counts']
-    del adata.layers['counts']
-    del adata.raw
-    adata.write(
-        '{}-normalized_pca-counts.h5ad'.format(output_file),
-        compression='gzip'
-        #compression_opts=anndata_compression_opts
     )
 
     return(output_file)
@@ -852,7 +717,7 @@ def main():
     drop_cell_passes_qc_from_clustering=options.drop_cell_passes_qc_from_clustering
     # Load the AnnData file
     adata = sc.read_h5ad(filename=options.h5)
-
+    # adata_comp = sc.read_h5ad(filename='/lustre/scratch123/hgi/teams/hgi/mo11/tmp_projects/harriet/test_recluster/work/6f/e30114c18a6dc6f620da63e187f348/f9d037b7109a2a7f96cb3ad63b97ff/outlier_filtered_adata.h5ad')
     # if this is the subclustering analysis, the count matrix should be used
     # by default, the analysis is "conventional" and thus will be skipped
     if options.layer != "none":
diff --git a/bin/0057-scanpy_cluster_validate_resolution-keras.py b/bin/0057-scanpy_cluster_validate_resolution-keras.py
index 0dafeafa..f5579e9a 100755
--- a/bin/0057-scanpy_cluster_validate_resolution-keras.py
+++ b/bin/0057-scanpy_cluster_validate_resolution-keras.py
@@ -519,7 +519,8 @@ def main():
             # Virtual devices must be set before GPUs have been initialized
             print(e)
     else:
-        raise Exception('ERROR: no GPUs detected.')
+        _ = 'running without gpus'
+        # raise Exception('ERROR: no GPUs detected.')
 
     # Get additional data we are going to append to the output model info
     dict_add = {}
diff --git a/bin/pca_anndata.py b/bin/pca_anndata.py
new file mode 100755
index 00000000..591c9ad4
--- /dev/null
+++ b/bin/pca_anndata.py
@@ -0,0 +1,556 @@
+#!/usr/bin/env python
+
+
+__date__ = '2020-03-13'
+__version__ = '0.0.1'
+
+import argparse
+from distutils.version import LooseVersion
+import os
+os.environ['NUMBA_CACHE_DIR']='/tmp'
+os.environ['MPLCONFIGDIR']='/tmp'
+import random
+import numpy as np
+import scipy as sp
+# import sklearn.utils
+import sklearn.decomposition
+import pandas as pd
+import scanpy as sc
+import csv
+import time
+from datetime import timedelta
+
+# Set seed for reproducibility
+seed_value = 0
+# 0. Set `PYTHONHASHSEED` environment variable at a fixed value
+os.environ['PYTHONHASHSEED'] = str(seed_value)
+# 1. Set `python` built-in pseudo-random generator at a fixed value
+random.seed(seed_value)
+# 2. Set `numpy` pseudo-random generator at a fixed value
+np.random.seed(seed_value)
+
+# Set scanpy settings
+# sc verbosity: errors (0), warnings (1), info (2), hints (3)
+# sc.settings.verbosity = 3
+# sc.logging.print_versions()
+# sc.settings.set_figure_params(dpi=80)
+
+
+def pca(
+    data,
+    n_comps=None,
+    svd_solver='arpack',
+    use_highly_variable=None,
+    copy=False
+):
+    """Compute PCA coordinates, loadings and variance decomposition.
+
+    Derived from scanpy 1.5.1.
+    Principal component analysis [Pedregosa11]_.]
+    Uses the implementation of *scikit-learn* [Pedregosa11]_.
+
+    Parameters
+    ----------
+    data
+        The (annotated) data matrix of shape `n_obs` × `n_vars`.
+        Rows correspond to cells and columns to genes.
+    n_comps
+        Number of principal components to compute. Defaults to 50, or 1 -
+        minimum dimension size of selected representation.
+    svd_solver
+        SVD solver to use:
+        `'arpack'` (the default)
+          for the ARPACK wrapper in SciPy (:func:`~scipy.sparse.linalg.svds`)
+        `'randomized'`
+          for the randomized algorithm due to Halko (2009).
+        `'auto'`
+          chooses automatically depending on the size of the problem.
+        `'lobpcg'`
+          An alternative SciPy solver.
+        .. versionchanged:: 1.4.5
+           Default value changed from `'auto'` to `'arpack'`.
+        Efficient computation of the principal components of a sparse matrix
+        currently only works with the `'arpack`' or `'lobpcg'` solvers.
+    use_highly_variable
+        Whether to use highly variable genes only, stored in
+        `.var['highly_variable']`.
+        By default uses them if they have been determined beforehand.
+    copy
+        If an :class:`~anndata.AnnData` is passed, determines whether a copy
+        is returned. Is ignored otherwise.
+    Returns
+    -------
+    adata : anndata.AnnData
+        …otherwise if `copy=True` it returns or else adds fields to `adata`:
+        `.obsm['X_pca']`
+             PCA representation of data.
+        `.varm['PCs']`
+             The principal components containing the loadings.
+        `.uns['pca']['variance_ratio']`
+             Ratio of explained variance.
+        `.uns['pca']['variance']`
+             Explained variance, equivalent to the eigenvalues of the
+             covariance matrix.
+    """
+    adata = data.copy() if copy else data
+
+    if use_highly_variable and 'highly_variable' not in adata.var.keys():
+        raise ValueError(
+            'Did not find adata.var[\'highly_variable\']. '
+            'Either your data already only consists of highly-variable genes '
+            'or consider running `pp.highly_variable_genes` first.'
+        )
+    if use_highly_variable is None:
+        if 'highly_variable' in adata.var.keys():
+            use_highly_variable = True
+        else:
+            use_highly_variable = False
+
+    if use_highly_variable:
+        adata_comp = (
+            adata[:, adata.var['highly_variable']]
+        )
+    else:
+        adata_comp = adata
+
+    if n_comps is None:
+        min_dim = min(adata_comp.n_vars, adata_comp.n_obs)
+        n_comps = min_dim - 1
+
+    # random_state = sklearn.utils.check_random_state(random_state)
+    X = adata_comp.X
+
+    # If sparse, make dense.
+    # Another option:
+    # output = _pca_with_sparse(
+    #     X, n_comps, solver=svd_solver, random_state=random_state
+    # )
+    if sp.sparse.issparse(X):
+        X = X.toarray()
+
+    # Sort out the solver
+    if svd_solver == 'auto':
+        svd_solver = 'arpack'
+    if svd_solver not in {'arpack', 'randomized'}:
+        raise ValueError(
+            'svd_solver: {svd_solver} can not be used with sparse input.'
+        )
+
+    pca_ = sklearn.decomposition.PCA(
+        n_components=n_comps,
+        svd_solver=svd_solver,
+        random_state=0
+    )
+    X_pca = pca_.fit_transform(X)
+
+    # Cast to whatever datatype.
+    # dtype = 'float32'
+    # dtype
+    #     Numpy data type string to which to convert the result.
+    # if X_pca.dtype.descr != np.dtype(dtype).descr:
+    #     X_pca = X_pca.astype(dtype)
+
+    # Update the adata frame (if copy=False, then this is the same input adata
+    # that the user provided)
+    adata.obsm['X_pca'] = X_pca
+    adata.uns['pca'] = {}
+    adata.uns['pca']['params'] = {
+        'zero_center': True,
+        'use_highly_variable': use_highly_variable,
+    }
+    if use_highly_variable:
+        adata.varm['PCs'] = np.zeros(shape=(adata.n_vars, n_comps))
+        adata.varm['PCs'][adata.var['highly_variable']] = pca_.components_.T
+    else:
+        adata.varm['PCs'] = pca_.components_.T
+    adata.uns['pca']['variance'] = pca_.explained_variance_
+    adata.uns['pca']['variance_ratio'] = pca_.explained_variance_ratio_
+
+    return adata if copy else None
+
+
+def score_cells(
+    adata,
+    score_genes_df,
+    score_genes_df_column='ensembl_gene_id',
+    only_use_variable_genes=False
+):
+    """Scores each cell.
+
+    Parameters
+    ----------
+    adata : AnnData
+        Input AnnData object. Assume adata.X is norm->log1p->scaled data.
+    score_genes_df : pd.DataFrame
+        Dataframe of marker genes. Needs to have score_genes_df_column and
+        score_id column. If one score_id == 'cell_cycle', then requires a
+        grouping_id column with 'G2/M' and 'S'.
+    score_genes_df_column : string
+        Column in score_genes_df to use for gene ids (e.g., hgnc_symbol,
+        ensembl_gene_id)
+    only_use_variable_genes : boolean
+        Only use variable genes to calculate scores. If True, score_id will
+        be changed to <score_id>__hvg_only. Note this flage does not apply
+        to score_id == 'cell_cycle'.
+
+
+    Returns
+    -------
+    adata : AnnData
+        AnnData object with scores calculated and stored in
+        adata.obs[<score_id>].
+    score_genes_df : pd.DataFrame
+        The score_genes_df with the following columns added:
+        gene_found_in_adata, gene_found_is_highly_variable. It is suggested
+        that this dataframe is added to the adata.uns slot.
+    """
+    verbose = False  # For debugging purposes.
+
+    # Update the score_genes_df with details on the genes and if they were
+    # found in adata and if they are highly variable.
+    score_genes_df['gene_found_in_adata'] = np.in1d(
+        score_genes_df[score_genes_df_column],
+        adata.var.index
+    )
+    score_genes_df['gene_found_is_highly_variable'] = np.in1d(
+        score_genes_df[score_genes_df_column],
+        adata.var.index[adata.var['highly_variable']]
+    )
+
+    # Set the gene pool parameter.
+    gene_pool = None  # If None, all genes are randomly sampled for background
+    if only_use_variable_genes:
+        gene_pool = adata.var.index[adata.var['highly_variable']]
+
+    # Loop over each score_id in score_genes_df, updating adata.
+    for score_id, df_group in score_genes_df.groupby('score_id'):
+        # Downsample to only those genes found in the data.
+        df_group = df_group.loc[
+            df_group['gene_found_in_adata'], :
+        ]
+        if df_group.shape[0] == 0:
+            continue
+
+        # If we are supposed to use only_use_variable_genes, then do so.
+        if only_use_variable_genes:
+            if score_id == 'cell_cycle':
+                continue
+            score_id = '{}__hvg_only'.format(score_id)
+            df_group = df_group.loc[
+                df_group['gene_found_is_highly_variable'], :
+            ]
+            if df_group.shape[0] == 0:
+                continue
+        if verbose:
+            print('Scoring {}'.format(score_id))
+
+        # Set the number of control genes.
+        ctrl_size = 50
+        if df_group.shape[0] > 50:
+            ctrl_size = df_group.shape[0]
+        if gene_pool is not None:
+            if ctrl_size > len(gene_pool):
+                raise Exception(
+                    'Error in gene scoring ctrl_size > len(gene_pool)'
+                )
+
+        # If the score_id is cell_cycle, then use the specific cell cycle
+        # scoring function.
+        if score_id == 'cell_cycle':
+            # NOTE: Setting ctrl_size` is not possible, as it's set as
+            #       `min(len(s_genes), len(g2m_genes))`.
+            sc.tl.score_genes_cell_cycle(
+                adata,
+                s_genes=df_group.loc[
+                    df_group['grouping_id'] == 'S', score_genes_df_column
+                ],
+                g2m_genes=df_group.loc[
+                    df_group['grouping_id'] == 'G2/M', score_genes_df_column
+                ],
+                copy=False,
+                gene_pool=gene_pool,  # Default is None (aka, use all)
+                n_bins=25,  # Default is 25
+                use_raw=False
+            )
+        else:
+            sc.tl.score_genes(
+                adata,
+                df_group[score_genes_df_column],
+                ctrl_size=ctrl_size,  # Default is 50
+                gene_pool=gene_pool,  # Default is None (aka, use all)
+                n_bins=25,  # Default is 25
+                score_name=score_id,
+                random_state=0,  # Default is 0
+                copy=False,
+                use_raw=False
+            )
+
+    return adata, score_genes_df
+
+
+def pca_analysis(
+    adata,
+    output_file,
+    variable_feature_batch_key='experiment_id',
+    n_variable_features=2000,
+    exclude_hv_gene_df=[],
+    score_genes_df=None,
+    verbose=True,
+    plot=True,
+    anndata_compression_opts=4
+):
+
+    # Calculate PCs.
+    seed_value = 0
+    # 0. Set `PYTHONHASHSEED` environment variable at a fixed value
+    os.environ['PYTHONHASHSEED'] = str(seed_value)
+    # 1. Set `python` built-in pseudo-random generator at a fixed value
+    random.seed(seed_value)
+    # 2. Set `numpy` pseudo-random generator at a fixed value
+    np.random.seed(seed_value)
+
+    sc.tl.pca(
+        adata,
+        n_comps=min(200, adata.var['highly_variable'].sum()),
+        zero_center=True,  # Set to true for standard PCA
+        svd_solver='arpack',  # arpack reproducible when zero_center = True
+        use_highly_variable=True,
+        copy=False,
+        random_state=np.random.RandomState(0),
+        chunked=False
+    )
+
+    # Save PCs to a seperate file for Harmony.
+    pca_df = pd.DataFrame(
+        adata.obsm['X_pca'],
+        index=adata.obs_names,
+        columns=[
+            'PC{}'.format(x) for x in range(1, adata.obsm['X_pca'].shape[1]+1)
+        ]
+    )
+    
+    compression_opts = 'gzip'
+    if LooseVersion(pd.__version__) > '1.0.0':
+        compression_opts = dict(
+            method='gzip',
+            compresslevel=9
+        )
+    
+    pca_df.to_csv(
+        '{}-pcs.tsv.gz'.format(output_file),
+        sep='\t',
+        index=True,
+        index_label='cell_barcode',
+        na_rep='',
+        compression=compression_opts
+    )
+
+    # Save the metadata to a seperate file for Harmony.
+    adata.obs.to_csv(
+        '{}-metadata.tsv.gz'.format(output_file),
+        sep='\t',
+        index=True,
+        quoting=csv.QUOTE_NONNUMERIC,
+        index_label='cell_barcode',
+        na_rep='',
+        compression=compression_opts
+    )
+
+    # Save the data.
+    adata.write(
+        '{}-normalized_pca.h5ad'.format(output_file),
+        compression='gzip'
+    )
+    # Plot the PC info.
+    if plot:
+        # Plot the vanilla PCs.
+        sc.pl.pca_variance_ratio(
+            adata,
+            n_pcs=adata.obsm['X_pca'].shape[1],
+            log=False,
+            show=False,
+            save='-{}.pdf'.format(output_file)
+        )
+        
+        sc.pl.pca_variance_ratio(
+            adata,
+            n_pcs=adata.obsm['X_pca'].shape[1],
+            log=True,
+            show=False,
+            save='-{}-log.pdf'.format(output_file)
+        )
+
+    # Save the filtered count matrix for input to other software like scVI
+    adata.X = adata.layers['counts']
+    del adata.layers['counts']
+    del adata.raw
+    adata.write(
+        '{}-normalized_pca-counts.h5ad'.format(output_file),
+        compression='gzip'
+        #compression_opts=anndata_compression_opts
+    )
+
+
+def main():
+    """Run CLI."""
+    parser = argparse.ArgumentParser(
+        description="""
+            Read anndata object. Normalize, calculate PCs. Save new anndata
+            object along with csv file of PCs.
+            """
+    )
+
+    parser.add_argument(
+        '-v', '--version',
+        action='version',
+        version='%(prog)s {version}'.format(version=__version__)
+    )
+
+    parser.add_argument(
+        '-h5', '--h5_anndata',
+        action='store',
+        dest='h5',
+        required=True,
+        help='H5 AnnData file.'
+    )
+
+    parser.add_argument(
+        '-layer', '--overwrite_x_with_layer',
+        action='store',
+        dest='layer',
+        default='none',
+        help='Specify a layer of the AnnData file, which should be used for \
+            the following normalization and downstream analysis. This should \
+            go together with the analysis mode of the pipeline as \
+            "conventional" or "subclustering". \
+            (default: %(default)s)'
+    )
+
+    parser.add_argument(
+        '-bk', '--batch_key',
+        action='store',
+        dest='bk',
+        default='experiment_id',
+        help='Batch key for highly-variable feature (e.g., gene) detection.\
+            If specified, highly-variable features are selected within each\
+            batch separately and merged.\
+            (default: %(default)s)'
+    )
+
+    parser.add_argument(
+        '-nvf', '--number_variable_features',
+        action='store',
+        dest='nvf',
+        default=2000,
+        type=int,
+        help='After calculating variable features within each batch set via\
+            <batch_key>, rank features by number of batches where they are\
+            variable and select the top <number_variable_features>.\
+            (default: %(default)s)'
+    )
+
+    parser.add_argument(
+        '-vge', '--variable_genes_exclude',
+        action='store',
+        dest='vge',
+        default='',
+        help='Tab-delimited file with genes to exclude from the highly\
+            variable gene list. Must contain ensembl_gene_id column.\
+            (default: None - keep all variable genes)'
+    )
+
+    parser.add_argument(
+        '-vr', '--vars_to_regress',
+        action='store',
+        dest='vr',
+        default='',
+        help='Comma seperated list of metadata variables to regress prior to\
+            calculating PCs. Example: gene_group__mito_transcript,n_count.\
+            (default: "" and sc.pp.regress_out is not called)'
+    )
+
+    parser.add_argument(
+        '-sg', '--score_genes',
+        action='store',
+        dest='sg',
+        default='',
+        help='Tab-delimited file of genes for scores. Needs to have\
+            ensembl_gene_id and score_id column. If one\
+            score_id == "cell_cycle", then requires a grouping_id column with\
+            "G2/M" and "S".'
+    )
+
+    parser.add_argument(
+        '-drop_cell_passes_qc_from_clustering', '--drop_cell_passes_qc_from_clusteringdrop_cell_passes_qc_from_clustering',
+        action='store',
+        dest='drop_cell_passes_qc_from_clustering',
+        default=False,
+        help='Whether we want to drop cells before clustering based on the cell_passes_qc filter established by outlier filter part of pipeline'
+    )
+
+
+    parser.add_argument(
+        '-ncpu', '--number_cpu',
+        action='store',
+        dest='ncpu',
+        default=4,
+        type=int,
+        help='Number of CPUs to use.\
+            (default: %(default)s)'
+    )
+
+    parser.add_argument(
+        '--anndata_compression_opts',
+        action='store',
+        dest='anndata_compression_opts',
+        default=4,
+        type=int,
+        help='Compression level in anndata. A larger value decreases disk \
+            space requirements at the cost of compression time. \
+            (default: %(default)s)'
+    )
+
+    parser.add_argument(
+        '-of', '--output_file',
+        action='store',
+        dest='of',
+        default='adata-normalize_pca',
+        help='Directory and basename of output files.\
+            (default: %(default)s)'
+    )
+
+    options = parser.parse_args()
+
+    # Scanpy settings
+    sc.settings.figdir = os.getcwd()  # figure output directory to match base.
+    sc.settings.n_jobs = options.ncpu  # number CPUs
+    # sc.settings.max_memory = 500  # in Gb
+    # sc.set_figure_params(dpi_save = 300)
+    drop_cell_passes_qc_from_clustering=options.drop_cell_passes_qc_from_clustering
+    # Load the AnnData file
+    adata = sc.read_h5ad(filename=options.h5)
+    try:
+        del adata.uns
+    except:
+        _='still there'
+    # adata_comp = sc.read_h5ad(filename='/lustre/scratch123/hgi/teams/hgi/mo11/tmp_projects/harriet/test_recluster/work/6f/e30114c18a6dc6f620da63e187f348/f9d037b7109a2a7f96cb3ad63b97ff/outlier_filtered_adata.h5ad')
+        # adata_comp = sc.read_h5ad(filename='/lustre/scratch123/hgi/teams/hgi/mo11/tmp_projects/harriet/test_recluster/work/91/676237d4521fd78b293a8c4e548394/adata-normalized_pca.h5ad')
+    
+    start_time = time.time()
+    
+    pca_analysis(
+        adata,
+        output_file=options.of,
+        variable_feature_batch_key=options.bk,
+        n_variable_features=options.nvf,
+        anndata_compression_opts=options.anndata_compression_opts
+    )
+    execution_summary = "Analysis execution time [{}]:\t{}".format(
+        "pca.py",
+        str(timedelta(seconds=time.time()-start_time))
+    )
+    print(execution_summary)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/conf/base.conf b/conf/base.conf
index d31eed54..cc41cee9 100755
--- a/conf/base.conf
+++ b/conf/base.conf
@@ -36,15 +36,16 @@ params{
     mem1= 12000
     copy_mode = "rellink"
     split_bam = false
+    cluster_markers = true
     existing_cellsnp="${projectDir}/assets/existing_cellsnp"
     existing_vireo=''
-    skip_preprocessing{
-        value=false
-        gt_match_file="" // #We prvide this if we want to exclude a particular samples matched to a ceirtain GT cohortc from the adaptive qc
-        gt_match_based_adaptive_qc_exclusion_pattern = '' // #We run the adaptive QC on these patterns independently regardless on assigned celltype.        
-        file__anndata_merged = ''
-        file__cells_filtered = ''
-    }
+    normalise_andata = true
+    skip_preprocessing=false
+    gt_match_file="" // #We prvide this if we want to exclude a particular samples matched to a ceirtain GT cohortc from the adaptive qc
+    gt_match_based_adaptive_qc_exclusion_pattern = '' // #We run the adaptive QC on these patterns independently regardless on assigned celltype.        
+    file__anndata_merged = ''
+    file__cells_filtered = ''
+    id_in='experiment_id'
     genotype_phenotype_mapping_file =''
     extra_sample_metadata = ''
     use_phenotype_ids_for_gt_match = true //#if false this will keep the genotype ids, for this to be used have to set a genotype_phenotype_mapping_file to a path to csv where firs column contains genotype ids and second contains phenotype ids to replace these to.
diff --git a/main.nf b/main.nf
index c6dea7ed..138101bf 100755
--- a/main.nf
+++ b/main.nf
@@ -13,6 +13,9 @@ include { YASCP } from "$projectDir/workflows/yascp"
 include { RETRIEVE_RECOURSES;RETRIEVE_RECOURSES_TEST_DATASET } from "$projectDir/subworkflows/local/retrieve_recourses"
 include {RSYNC_RESULTS_REMOVE_WORK_DIR} from "$projectDir/modules/local/rsync_results_remove_work_dir/main"
 include {celltype} from "$projectDir/subworkflows/celltype"
+include {qc} from "$projectDir/subworkflows/qc"
+include {dummy_filtered_channel} from "$projectDir/modules/nf-core/modules/merge_samples/functions"
+
 ////// WORKFLOW: Run main nf-core/yascp analysis pipeline
 // This is the default entry point, we have others to update ceirtain parts of the results. 
 // Please go to ./workflows/yascp to see the main Yascp workflow.
@@ -46,11 +49,19 @@ workflow {
 
 
 workflow JUST_CELLTYPES{
-    file__anndata_merged = Channel.from(params.skip_preprocessing.file__anndata_merged)
+    file__anndata_merged = Channel.from(params.file__anndata_merged)
     celltype(file__anndata_merged)
 }
 
 
+workflow JUST_RECLUSTER{
+    file__anndata_merged = Channel.from(params.file__anndata_merged)
+    gt_outlier_input = Channel.from("$projectDir/assets/fake_file.fq")
+    dummy_filtered_channel(file__anndata_merged,params.id_in)
+    file__cells_filtered = dummy_filtered_channel.out.anndata_metadata
+    qc(file__anndata_merged,file__cells_filtered,gt_outlier_input) //This runs the Clusterring and qc assessments of the datasets.
+                
+}
 
 
 ////// You do not need to concern about the workflows bellow as these are Cardinal Specific and used for development
diff --git a/modules/nf-core/modules/clustering/functions.nf b/modules/nf-core/modules/clustering/functions.nf
index 5f6efa32..b54847fb 100755
--- a/modules/nf-core/modules/clustering/functions.nf
+++ b/modules/nf-core/modules/clustering/functions.nf
@@ -341,7 +341,7 @@ process cluster_validate_resolution_keras {
     // ------------------------------------------------------------------------
     //cache false        // cache results from run
     //maxForks 2         // hard to control memory usage. limit to 3 concurrent
-    label 'gpu'        // use GPU
+    label 'process_low'        // use GPU
     scratch false      // use tmp directory
     if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
         container "https://yascp.cog.sanger.ac.uk/public/singularity_images/wtsihgi_nf_scrna_qc_6bb6af5-2021-12-23-3270149cf265.sif"
diff --git a/modules/nf-core/modules/clustering/main.nf b/modules/nf-core/modules/clustering/main.nf
index ea4e0b2a..2c850608 100755
--- a/modules/nf-core/modules/clustering/main.nf
+++ b/modules/nf-core/modules/clustering/main.nf
@@ -71,37 +71,34 @@ workflow CLUSTERING {
         //     cluster_validate_resolution__sparsity,
         //     cluster_validate_resolution__train_size_cells
         // )
-        if (params.utilise_gpu){
-            if (params.cluster_validate_resolution_keras){
+        
+    if (params.cluster_validate_resolution_keras){
 
-            
-            cluster_validate_resolution_keras( 
-                cluster.out.outdir,
-                cluster.out.anndata,
-                cluster.out.metadata,
-                cluster.out.pcs,
-                cluster.out.reduced_dims,
-                cluster.out.clusters,
-                cluster_validate_resolution__sparsity,
-                cluster_validate_resolution__train_size_cells,
-                cluster.out.outdir__reduced_dims
-            )
+        
+        cluster_validate_resolution_keras( 
+            cluster.out.outdir,
+            cluster.out.anndata,
+            cluster.out.metadata,
+            cluster.out.pcs,
+            cluster.out.reduced_dims,
+            cluster.out.clusters,
+            cluster_validate_resolution__sparsity,
+            cluster_validate_resolution__train_size_cells,
+            cluster.out.outdir__reduced_dims
+        )
 
-            plot_resolution_validate(
-                cluster_validate_resolution_keras.out.plot_input.groupTuple()
-            )
-            }
-            
+        plot_resolution_validate(
+            cluster_validate_resolution_keras.out.plot_input.groupTuple()
+        )
         }
+            
+        
 
         SCCAF(cluster.out.outdir,
           cluster.out.anndata,
           cluster.out.clusters,
           sccaf_minacc)
 
-
-
-
         // // Generate UMAPs of the results.
         umap_calculate_and_plot(
             cluster.out.outdir,
@@ -118,28 +115,29 @@ workflow CLUSTERING {
         )
         dummy_output=umap_calculate_and_plot.out.dummy_output
         // // Find marker genes for clusters
-        cluster_markers(
-            cluster.out.outdir,
-            cluster.out.anndata,
-            cluster.out.metadata,
-            cluster.out.pcs,
-            cluster.out.reduced_dims,
-            cluster.out.clusters,
-            cluster_marker__methods
-        )
-
-        // // Find marker genes for clusters using CELLEX
-        cellex_cluster_markers(
-            cluster.out.outdir,
-            cluster.out.anndata
-        )
+        if (params.cluster_markers){
+            cluster_markers(
+                cluster.out.outdir,
+                cluster.out.anndata,
+                cluster.out.metadata,
+                cluster.out.pcs,
+                cluster.out.reduced_dims,
+                cluster.out.clusters,
+                cluster_marker__methods
+            )
 
-        // Prep adata file for cellxgene website
-        prep_cellxgene(
-            cluster.out.outdir,
-            cluster.out.anndata
-        )
+            // // Find marker genes for clusters using CELLEX
+            cellex_cluster_markers(
+                cluster.out.outdir,
+                cluster.out.anndata
+            )
 
+            // Prep adata file for cellxgene website
+            prep_cellxgene(
+                cluster.out.outdir,
+                cluster.out.anndata
+            )
+        }
         emit:
             dummy_output
 
diff --git a/modules/nf-core/modules/estimate_pca_elbow/main.nf b/modules/nf-core/modules/estimate_pca_elbow/main.nf
index 05d3d9ac..2251e3af 100755
--- a/modules/nf-core/modules/estimate_pca_elbow/main.nf
+++ b/modules/nf-core/modules/estimate_pca_elbow/main.nf
@@ -43,10 +43,7 @@ process ESTIMATE_PCA_ELBOW {
     script:
         
         outdir = "${outdir_prev}"
-        log.info("""outdir = ${outdir}""")
-        // from the file__anndata job.
         outfile = "${file__anndata}".minus(".h5ad")
-            .split("-").drop(1).join("-")
         outfile = "${outfile}-knee"
         """
             rm -fr plots
diff --git a/modules/nf-core/modules/normalise_and_pca/main.nf b/modules/nf-core/modules/normalise_and_pca/main.nf
index b0f81031..696128c9 100755
--- a/modules/nf-core/modules/normalise_and_pca/main.nf
+++ b/modules/nf-core/modules/normalise_and_pca/main.nf
@@ -3,6 +3,59 @@ def random_hex(n) {
     Long.toUnsignedString(new Random().nextLong(), n).toUpperCase()
 }
 
+process PCA {
+
+    label 'process_medium'
+    if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
+        container "https://yascp.cog.sanger.ac.uk/public/singularity_images/nf_qc_scrna_v1.img"
+        // /software/hgi/containers/nf_qc_scrna_v1.img
+    } else {
+        container "mercury/nf_qc_scrna:v1"
+    }
+
+    publishDir  path: "${outdir}",
+                saveAs: {filename -> filename.replaceAll("-", "")},
+                mode: "${params.copy_mode}",
+                overwrite: "true"
+    
+    input:
+        path(file__anndata)
+        val(outdir)
+        val(layer)
+
+    output:
+        val(outdir, emit: outdir)
+        val("${outdir}", emit: outdir3)
+        path("adata-normalized_pca.h5ad", emit: anndata)
+        path("adata-metadata.tsv.gz", emit: metadata)
+        path("adata-pcs.tsv.gz", emit: pcs)
+        path(
+            "adata-normalized_pca-counts.h5ad",
+            emit: anndata_filtered_counts
+        )
+        val("${param_details}", emit: param_details)
+        path("plots/*.pdf")
+        path("plots/*.png") optional true
+
+    script:
+
+        """
+        rm -fr plots
+        pca_anndata.py \
+            --h5_anndata ${file__anndata} \
+            --overwrite_x_with_layer ${layer} \
+            --output_file adata \
+            --number_cpu ${task.cpus} \
+            --drop_cell_passes_qc_from_clustering ${params.drop_cell_passes_qc_from_clustering}
+        mkdir plots
+        
+        mv *pdf plots/ 2>/dev/null || true
+        mv *png plots/ 2>/dev/null || true
+        """
+
+}
+
+
 process NORMALISE_AND_PCA {
     // Takes annData object, nomalizes across samples, calculates PCs.
     // NOTE: Once normalization is set, it would be faster to normalize per
@@ -25,7 +78,7 @@ process NORMALISE_AND_PCA {
                 overwrite: "true"
 
     input:
-        val(outdir_prev)
+        
         path(file__anndata)
         val(analysis_mode)
         val(layer)
@@ -36,16 +89,8 @@ process NORMALISE_AND_PCA {
     output:
         
         val(outdir, emit: outdir)
-        
         val("${outdir}", emit: outdir3)
-        path("adata-normalized_pca.h5ad", emit: anndata)
-        path("adata-metadata.tsv.gz", emit: metadata)
-        path("adata-pcs.tsv.gz", emit: pcs)
-        
-        path(
-            "adata-normalized_pca-counts.h5ad",
-            emit: anndata_filtered_counts
-        )
+        path("adata-normalized.h5ad", emit: anndata)
         val("${param_details}", emit: param_details)
         path("plots/*.pdf")
         path("plots/*.png") optional true
@@ -66,19 +111,14 @@ process NORMALISE_AND_PCA {
             cmd__vars_to_regress = "--vars_to_regress ${vars_to_regress}"
         }
 
-        // todo - mo11 - these paths are confusing
-
-        outdir = "${outdir_prev}/normalize=total_count.${param_details}"
+        outdir = "${params.outdir}/clustering/normalize=total_count.${param_details}"
         // Add details on the genes we are exlcuding from hgv list.
         file_vge = "${file__genes_exclude_hvg.getSimpleName()}"
         outdir = "${outdir}.hvg_exclude=${file_vge}"
         // Add details on the scores we are using.
         file_score = "${file__genes_score.getSimpleName()}"
         outdir = "${outdir}.scores=${file_score}"
-
-
         // this is where the subfolder 1 is determined
-
         // Customize command for optional files.
         cmd__genes_exclude_hvg = ""
         if (file__genes_exclude_hvg.name != "no_file__genes_exclude_hvg") {
@@ -89,7 +129,6 @@ process NORMALISE_AND_PCA {
             cmd__genes_score = "--score_genes ${file__genes_score}"
         }
 
-
         """
         rm -fr plots
         0035-scanpy_normalize_pca.py \
@@ -106,26 +145,4 @@ process NORMALISE_AND_PCA {
         mv *pdf plots/ 2>/dev/null || true
         mv *png plots/ 2>/dev/null || true
         """
-        // Old version with bash evaluation of optional commands
-        //
-        // echo "normalize_pca: ${process_info}"
-        // # If there are entries in the variable_genes_exclude file, add it to
-        // # the call.
-        // cmd__vg_exclude="--variable_genes_exclude ${file__genes_exclude_hvg}"
-        // val=\$(cat ${file__genes_exclude_hvg} | wc -l)
-        // if [ \$val -eq 0 ]; then cmd__vg_exclude=""; fi
-        // # If there are entries in the score_genes file, add it to the call.
-        // cmd__score_genes="--score_genes ${file__genes_score}"
-        // val=\$(cat ${file__genes_score} | wc -l)
-        // if [ \$val -eq 0 ]; then cmd__score_genes=""; fi
-        // 0035-scanpy_normalize_pca.py \
-        //     --h5_anndata ${file__anndata} \
-        //     --output_file adata \
-        //     --number_cpu ${task.cpus} \
-        //     ${cmd__vars_to_regress} \
-        //     \${cmd__vg_exclude} \
-        //     \${cmd__score_genes}
-        // mkdir plots
-        // mv *pdf plots/ 2>/dev/null || true
-        // mv *png plots/ 2>/dev/null || true
 }
diff --git a/subworkflows/qc.nf b/subworkflows/qc.nf
index 9ea2ff74..2d88024f 100755
--- a/subworkflows/qc.nf
+++ b/subworkflows/qc.nf
@@ -5,7 +5,7 @@ include {OUTLIER_FILTER} from "$projectDir/modules/nf-core/modules/outlier_filte
 include {PLOT_STATS} from "$projectDir/modules/nf-core/modules/plot_stats/main"
 include {ESTIMATE_PCA_ELBOW} from "$projectDir/modules/nf-core/modules/estimate_pca_elbow/main"
 include {SUBSET_PCS} from "$projectDir/modules/nf-core/modules/subset_pcs/main"
-include {NORMALISE_AND_PCA} from "$projectDir/modules/nf-core/modules/normalise_and_pca/main"
+include {NORMALISE_AND_PCA; PCA} from "$projectDir/modules/nf-core/modules/normalise_and_pca/main"
 include {HARMONY} from "$projectDir/modules/nf-core/modules/harmony/main"
 include {BBKNN} from "$projectDir/modules/nf-core/modules/bbknn/main"
 include {ADD_EXTRA_METADATA_TO_H5AD} from "$projectDir/modules/nf-core/modules/adata_manipulations/main"
@@ -19,7 +19,7 @@ workflow qc {
     take:
         file__anndata_merged
         file__cells_filtered
-        assignments_all_pools
+        gt_outlier_input
     main:
         log.info "--- Running QC metrics --- "
         // if(params.extra_metadata!=''){
@@ -29,26 +29,14 @@ workflow qc {
         // }else{
         //     log.info '''--- No extra metadata to add to h5ad ---'''
         // }
-        file__anndata_merged.map{val1 -> tuple('full', val1)}.set{out1}
 
         CELL_HARD_FILTERS(file__anndata_merged,params.hard_filters_file,params.hard_filters_drop)
         if(params.hard_filters_file != "no_file__file_sample_qc"){
             file__anndata_merged = CELL_HARD_FILTERS.out.anndata
         }
         
-        // Next we define an input channel to outlier filtering strategy in case if params.skip_preprocessing.gt_match_based_adaptive_qc_exclusion_pattern !=''
-        // i.e - if we want to exclude a particular cohort that has been matched by gt match from the adaptive qc we feed this in the outlier_filter()
-        if(params.skip_preprocessing.gt_match_based_adaptive_qc_exclusion_pattern !=''){
-            gt_outlier_input = assignments_all_pools
-        }else{
-            gt_outlier_input = Channel.from("$projectDir/assets/fake_file.fq")
-        }
-
-        file__anndata_merged.subscribe { println "value1: $it" }
-        file__cells_filtered.subscribe { println "value2: $it" }
-        gt_outlier_input.subscribe { println "value3: $it" }
         //FILTERING OUTLIER CELLS
-        if (params.sample_qc.cell_filters.filter_outliers.run_process) {
+        if (params.filter_outliers) {
             log.info """---Running automatic outlier cell filtering.----"""
             OUTLIER_FILTER(
                 params.outdir,
@@ -67,18 +55,28 @@ workflow qc {
         }
 
         
+        if (params.normalise_andata){
+            NORMALISE_AND_PCA(
+                file__anndata_merged,
+                params.mode,
+                params.layer,
+                params.genes_exclude_hvg,
+                params.genes_score,
+                params.reduced_dims.vars_to_regress.value)
+            andata = NORMALISE_AND_PCA.out.anndata
+            outdir = NORMALISE_AND_PCA.out.outdir
+
+        }else{
+            andata = file__anndata_merged
+            outdir = "${params.outdir}"
+            LI4 = Channel.of([1, 'dummy_lisi'])
+        }
 
-        NORMALISE_AND_PCA(params.outdir+'/clustering',
-            file__anndata_merged,
-            params.mode,
-            params.layer,
-            params.genes_exclude_hvg,
-            params.genes_score,
-            params.reduced_dims.vars_to_regress.value)
+        PCA(andata,params.outdir,params.layer)
 
         ESTIMATE_PCA_ELBOW(
-            NORMALISE_AND_PCA.out.outdir,
-            NORMALISE_AND_PCA.out.anndata,
+            PCA.out.outdir,
+            PCA.out.anndata,
             params.reduced_dims.n_dims.add_n_to_estimate
         )
 
@@ -91,21 +89,21 @@ workflow qc {
         }
 
         SUBSET_PCS(
-            NORMALISE_AND_PCA.out.outdir,
-            NORMALISE_AND_PCA.out.anndata,
-            NORMALISE_AND_PCA.out.metadata,
-            NORMALISE_AND_PCA.out.pcs,
-            NORMALISE_AND_PCA.out.param_details,
+            PCA.out.outdir,
+            PCA.out.anndata,
+            PCA.out.metadata,
+            PCA.out.pcs,
+            PCA.out.param_details,
             n_pcs
         )
-        file__anndata_merged.subscribe { println "PLOT_STATS input: $it" }
+        PCA.out.outdir.subscribe { println "outdir input: $it" }
         PLOT_STATS(file__anndata_merged,
                     file__cells_filtered,
                     SUBSET_PCS.out.outdir,
                     SUBSET_PCS.out.anndata,
                     n_pcs)
 
-        file__anndata_merged = NORMALISE_AND_PCA.out.anndata
+        file__anndata_merged = PCA.out.anndata
         
         LI4 = PLOT_STATS.out.LI
 
@@ -120,11 +118,11 @@ workflow qc {
         // "Correct" PCs using Harmony or BBKNN
         if (params.harmony.run_process) {
             HARMONY(
-                NORMALISE_AND_PCA.out.outdir,
-                NORMALISE_AND_PCA.out.anndata,
-                NORMALISE_AND_PCA.out.metadata,
-                NORMALISE_AND_PCA.out.pcs,
-                NORMALISE_AND_PCA.out.param_details,
+                PCA.out.outdir,
+                PCA.out.anndata,
+                PCA.out.metadata,
+                PCA.out.pcs,
+                PCA.out.param_details,
                 n_pcs,
                 Channel.fromList( params.harmony.variables_and_thetas.value)
             )
@@ -187,11 +185,11 @@ workflow qc {
 
         if (params.bbknn.run_process) {
             BBKNN(
-                NORMALISE_AND_PCA.out.outdir,
-                NORMALISE_AND_PCA.out.anndata,
-                NORMALISE_AND_PCA.out.metadata,
-                NORMALISE_AND_PCA.out.pcs,
-                NORMALISE_AND_PCA.out.param_details,
+                PCA.out.outdir,
+                PCA.out.anndata,
+                PCA.out.metadata,
+                PCA.out.pcs,
+                PCA.out.param_details,
                 n_pcs,
                 params.bbknn.batch_variable.value
             )
@@ -256,8 +254,8 @@ workflow qc {
             lisi_input_second = lisi_input_first.mix(lisi_input3)
 
             LISI(
-                NORMALISE_AND_PCA.out.outdir,
-                NORMALISE_AND_PCA.out.metadata,
+                PCA.out.outdir,
+                PCA.out.metadata,
                 params.lisi.variables.value,
                 lisi_input_second.collect()
             )
diff --git a/workflows/yascp.nf b/workflows/yascp.nf
index 6e425d2f..f3009565 100755
--- a/workflows/yascp.nf
+++ b/workflows/yascp.nf
@@ -57,7 +57,7 @@ workflow YASCP {
                 
         if(!params.just_reports){
             // sometimes we just want to rerun report generation as a result of alterations, hence if we set params.just_reports =True pipeline will use the results directory and generate a new reports.
-            if (!params.skip_preprocessing.value){
+            if (!params.skip_preprocessing){
                 // The input table should contain the folowing columns - experiment_id	n_pooled	donor_vcf_ids	data_path_10x_format
                 // prepearing the inputs from a standard 10x dataset folders.
                 prepare_inputs(input_channel)
@@ -125,23 +125,23 @@ workflow YASCP {
             }else{
                 // This option skips all the deconvolution and and takes a preprocessed yascp h5ad file to run the downstream clustering and celltype annotation.
                 log.info '''----Skipping Preprocessing since we already have prepeared h5ad input file----'''
-                file__anndata_merged = Channel.from(params.skip_preprocessing.file__anndata_merged)
+                file__anndata_merged = Channel.from(params.file__anndata_merged)
 
 
                 if("${mode}"!='default'){
                     // Here we have rerun GT matching upstream - done for freeze1
                     assignments_all_pools = mode
                 }else{
-                    if (params.skip_preprocessing.file__anndata_merged !=''){
-                        assignments_all_pools = Channel.from(params.skip_preprocessing.gt_match_file)
+                    if (params.file__anndata_merged !=''){
+                        assignments_all_pools = Channel.from(params.gt_match_file)
                     }else{
                         assignments_all_pools = Channel.from("$projectDir/assets/fake_file.fq")
                     }
                 }
                 
-                if (params.skip_preprocessing.file__cells_filtered ==''){
+                if (params.file__cells_filtered ==''){
                     log.info '''--- No cells filtered input ----'''
-                    dummy_filtered_channel(file__anndata_merged,params.skip_preprocessing.id_in)
+                    dummy_filtered_channel(file__anndata_merged,params.id_in)
                     file__cells_filtered = dummy_filtered_channel.out.anndata_metadata
                 }else{
                     file__cells_filtered = Channel.from(params.skip_preprocessing.file__cells_filtered)
@@ -175,7 +175,14 @@ workflow YASCP {
             // ###################################
 
             if (!params.skip_qc){
-                qc(file__anndata_merged,file__cells_filtered,assignments_all_pools) //This runs the Clusterring and qc assessments of the datasets.
+
+                if(params.skip_preprocessing.gt_match_based_adaptive_qc_exclusion_pattern !=''){
+                    gt_outlier_input = assignments_all_pools
+                }else{
+                    gt_outlier_input = Channel.from("$projectDir/assets/fake_file.fq")
+                }
+
+                qc(file__anndata_merged,file__cells_filtered,gt_outlier_input) //This runs the Clusterring and qc assessments of the datasets.
                 process_finish_check_channel = qc.out.LI
                 file__anndata_merged = qc.out.file__anndata_merged
             }else{