From 1961a5ae82242666161806022e339d12bb618330 Mon Sep 17 00:00:00 2001 From: Matiss Ozols Date: Tue, 28 Nov 2023 15:48:24 +0000 Subject: [PATCH] harriets changes --- assets/deploy_scripts/bsub.sh | 2 +- assets/deploy_scripts/bsub__removeWork.sh | 2 +- assets/deploy_scripts/bsub_test.sh | 2 +- assets/deploy_scripts/bsub_test_celltypes.sh | 2 +- assets/deploy_scripts/bsub_test_recluster.sh | 29 + .../input_setups/recluster_profile.nf | 138 +++++ .../nohup_start_nextflow_lsf.sh | 2 +- .../nohup_start_nextflow_lsf__removeWork.sh | 2 +- .../nohup_start_nextflow_lsf_celltypes.sh | 2 +- .../nohup_start_nextflow_lsf_recluster.sh | 27 + .../nohup_start_nextflow_lsf_test.sh | 2 +- bin/0026-plot_filtered_cells.py | 17 +- bin/0028-plot_predicted_sex.py | 5 +- bin/0030-estimate_pca_elbow.py | 5 +- bin/0035-scanpy_normalize_pca.py | 143 +---- ...canpy_cluster_validate_resolution-keras.py | 3 +- bin/pca_anndata.py | 556 ++++++++++++++++++ conf/base.conf | 15 +- main.nf | 13 +- .../nf-core/modules/clustering/functions.nf | 2 +- modules/nf-core/modules/clustering/main.nf | 82 ++- .../modules/estimate_pca_elbow/main.nf | 3 - .../nf-core/modules/normalise_and_pca/main.nf | 95 +-- subworkflows/qc.nf | 84 ++- workflows/yascp.nf | 21 +- 25 files changed, 954 insertions(+), 300 deletions(-) create mode 100755 assets/deploy_scripts/bsub_test_recluster.sh create mode 100644 assets/deploy_scripts/input_setups/recluster_profile.nf create mode 100755 assets/deploy_scripts/nohup_start_nextflow_lsf_recluster.sh create mode 100755 bin/pca_anndata.py diff --git a/assets/deploy_scripts/bsub.sh b/assets/deploy_scripts/bsub.sh index bcbe9189..0d9012b8 100755 --- a/assets/deploy_scripts/bsub.sh +++ b/assets/deploy_scripts/bsub.sh @@ -21,5 +21,5 @@ if ["$varname" = '']; fi sample="$RUN_ID" echo -e "\n Submitting yascp (https://github.com/wtsi-hgi/yascp) with input file $INPUT_FILE" -bsub -R'select[mem>8000] rusage[mem=8000]' -J $sample -n 1 -M 8000 -o $sample.o -e $sample.e -q long bash /software/hgi/pipelines/yascp_versions/yascp_v1.2/assets/deploy_scripts/nohup_start_nextflow_lsf.sh $INPUT_FILE +bsub -R'select[mem>8000] rusage[mem=8000]' -J $sample -n 1 -M 8000 -o $sample.o -e $sample.e -q long bash /software/hgi/pipelines/yascp_versions/yascp_v1.3__work/assets/deploy_scripts/nohup_start_nextflow_lsf.sh $INPUT_FILE echo "Submitted job can be killed with: bkill -J $sample" \ No newline at end of file diff --git a/assets/deploy_scripts/bsub__removeWork.sh b/assets/deploy_scripts/bsub__removeWork.sh index 1f2e5dfa..f1bffef1 100755 --- a/assets/deploy_scripts/bsub__removeWork.sh +++ b/assets/deploy_scripts/bsub__removeWork.sh @@ -5,5 +5,5 @@ INPUT_FILE=$1 export RUN_ID="${PWD##*/}" sample="$RUN_ID.yascp" echo "Cleaning the work directory (https://github.com/wtsi-hgi/yascp) with input file $INPUT_FILE by using '-entry WORK_DIR_REMOVAL --remove_work_dir' " -bsub -R'select[mem>4000] rusage[mem=4000]' -J $sample -n 1 -M 4000 -o $sample.o -e $sample.e -q long bash /software/hgi/pipelines/yascp_versions/yascp_v1.2/assets/deploy_scripts/nohup_start_nextflow_lsf__removeWork.sh $INPUT_FILE +bsub -R'select[mem>4000] rusage[mem=4000]' -J $sample -n 1 -M 4000 -o $sample.o -e $sample.e -q long bash /software/hgi/pipelines/yascp_versions/yascp_v1.3__work/assets/deploy_scripts/nohup_start_nextflow_lsf__removeWork.sh $INPUT_FILE echo "Submitted job can be killed with: bkill -J $sample" \ No newline at end of file diff --git a/assets/deploy_scripts/bsub_test.sh b/assets/deploy_scripts/bsub_test.sh index 8a163fff..52b474dd 100755 --- a/assets/deploy_scripts/bsub_test.sh +++ b/assets/deploy_scripts/bsub_test.sh @@ -25,5 +25,5 @@ fi sample="$RUN_ID.yascp" echo -e "\nSubmitting yascp (https://github.com/wtsi-hgi/yascp) in test mode withsample OneK1k dataset" -bsub -R'select[mem>4000] rusage[mem=4000]' -J yascp_test -n 1 -M 4000 -o yascp_test.o -e yascp_test.e -q normal bash /software/hgi/pipelines/yascp_versions/yascp_v1.2/assets/deploy_scripts/nohup_start_nextflow_lsf_test.sh +bsub -R'select[mem>4000] rusage[mem=4000]' -J yascp_test -n 1 -M 4000 -o yascp_test.o -e yascp_test.e -q normal bash /software/hgi/pipelines/yascp_versions/yascp_v1.3__work/assets/deploy_scripts/nohup_start_nextflow_lsf_test.sh echo "Submitted job can be killed with: bkill -J yascp_test" \ No newline at end of file diff --git a/assets/deploy_scripts/bsub_test_celltypes.sh b/assets/deploy_scripts/bsub_test_celltypes.sh index 3c6dc200..7a12a9ac 100755 --- a/assets/deploy_scripts/bsub_test_celltypes.sh +++ b/assets/deploy_scripts/bsub_test_celltypes.sh @@ -25,5 +25,5 @@ fi sample="$RUN_ID.yascp" echo -e "\nSubmitting yascp (https://github.com/wtsi-hgi/yascp) in JUST_CELLTYPES mode with input file $INPUT_FILE" -bsub -R'select[mem>4000] rusage[mem=4000]' -J yascp_celltypes -n 1 -M 4000 -o yascp_celltypes.o -e yascp_celltypes.e -q normal bash /software/hgi/pipelines/yascp_versions/yascp_v1.2/assets/deploy_scripts/nohup_start_nextflow_lsf_celltypes.sh $INPUT_FILE +bsub -R'select[mem>4000] rusage[mem=4000]' -J yascp_celltypes -n 1 -M 4000 -o yascp_celltypes.o -e yascp_celltypes.e -q normal bash /software/hgi/pipelines/yascp_versions/yascp_v1.3__work/assets/deploy_scripts/nohup_start_nextflow_lsf_celltypes.sh $INPUT_FILE echo "Submitted job can be killed with: bkill -J yascp_celltypes" \ No newline at end of file diff --git a/assets/deploy_scripts/bsub_test_recluster.sh b/assets/deploy_scripts/bsub_test_recluster.sh new file mode 100755 index 00000000..7c8d6b97 --- /dev/null +++ b/assets/deploy_scripts/bsub_test_recluster.sh @@ -0,0 +1,29 @@ +#!/usr/bin/env bash +CWD1="$PWD" +parentdir="$(dirname "$CWD1")" +INPUT_FILE=$1 +export RUN_ID="${PWD##*/}" + +# export SINGULARITY_CACHEDIR='/software/hgi/containers/yascp' + +export NXF_OPTS="-Xms5G -Xmx5G" +export SINGULARITY_TMPDIR=$PWD/work/tmp +export TEMP=$PWD/work/tmp +export TMP_DIR=$PWD/work/tmp + +echo press ENTER to NOT fetch containers, otherwise provide writable path: +read varname + +if ["$varname" = '']; + then + export NXF_SINGULARITY_CACHEDIR='/software/hgi/containers/yascp' + export SINGULARITY_DISABLE_CACHE=0 + else + echo Yascp Will fetch the containers and place them in $varname + export NXF_SINGULARITY_CACHEDIR=$varname +fi + +sample="$RUN_ID.yascp" +echo -e "\nSubmitting yascp (https://github.com/wtsi-hgi/yascp) in JUST_RECLUSTER mode with input file $INPUT_FILE" +bsub -R'select[mem>4000] rusage[mem=4000]' -J yascp_cluster -n 1 -M 4000 -o yascp_cluster.o -e yascp_cluster.e -q normal bash /software/hgi/pipelines/yascp_versions/yascp_v1.3__work/assets/deploy_scripts/nohup_start_nextflow_lsf_recluster.sh $INPUT_FILE +echo "Submitted job can be killed with: bkill -J yascp_cluster" \ No newline at end of file diff --git a/assets/deploy_scripts/input_setups/recluster_profile.nf b/assets/deploy_scripts/input_setups/recluster_profile.nf new file mode 100644 index 00000000..cd84a1b0 --- /dev/null +++ b/assets/deploy_scripts/input_setups/recluster_profile.nf @@ -0,0 +1,138 @@ +params { + + lisi{ + run_process=true + } + replace_genotype_ids=false + write_h5=true + cluster_validate_resolution_keras = true + // run_celltype_assignment = true + project_name = 'T_Cell_Bio_Response' + filter_outliers = false + extra_sample_metadata ="" + output_dir = outdir= "${launchDir}/recluster_resolutions" + cellex_cluster_markers=true + cluster_markers = false + normalise_andata = false + skip_handover = true + // output_dir = outdir= "${launchDir}/results" + // run_celltype_assignment=true + split_ad_per_bach=true //if not splitting the celltype assignment will be run on full tranche + // input_data_table = "$outdir/handover/Summary_plots/$RUN_ID/Fetch Pipeline/Input/input_table.tsv" + // cellbender_location="${output_dir}/nf-preprocessing/cellbender" //!!!!! if cellbender is run already then can skip this by selecting input = 'existing_cellbender' instead input = 'cellbender' + // existing_cellsnp="${output_dir}/cellsnp" + cellbender_location="/lustre/scratch123/hgi/teams/hgi/mo11/tmp_projects/harriet/qc/results_11_09_2023/nf-preprocessing/cellbender" //!!!!! if cellbender is run already then can skip this by selecting input = 'existing_cellbender' instead input = 'cellbender' + existing_cellsnp="/lustre/scratch123/hgi/teams/hgi/mo11/tmp_projects/harriet/qc/results/cellsnp" + + skip_preprocessing = true + // file__anndata_merged = '/lustre/scratch126/humgen/projects/sc-eqtl-ibd/analysis/harriet_analysis/230313_hb58_yascp_analysis/231114_h5ad_files_for_MCC/231120_TCs_only_regressed_counts_HVGs.h5ad' + + harmony{ + run_process= true + } + umap{ + run_process = true + colors_quantitative{ + description = 'Comma separated string of quantitative variables that will be used to color points.' + value = 'n_cells,total_counts,pct_counts_gene_group__mito_transcript,prob_doublet,pct_counts_gene_group__ribo_rna,Azimuth:predicted.celltype.l2.score,Azimuth:mapping.score,log10_ngenes_by_count' + } + colors_categorical{ + description = 'Comma separated string of categorical variables that will be used to color points.' + value = 'cell_passes_qc,cell_passes_qc-per:Azimuth:L0_predicted.celltype.l2,experiment_id,Azimuth:predicted.celltype.l2,Celltypist:Immune_All_Low:predicted_labels,Celltypist:Immune_All_High:predicted_labels,donor_id' + } + } + + mads_categories ='pct_counts_gene_group__mito_transcript,pct_counts_gene_group__mito_protein,pct_counts_gene_group__ribo_protein,pct_counts_gene_group__ribo_rna,total_counts,n_genes_by_counts,log10_ngenes_by_count' + // hard_filters_file = "${projectDir}/../sample_qc.yml" + // hard_filters_drop = false //#This indicates whether we want to drop the cells that fail hard filters of just flag them + + cluster{ + description = """Parameters for clustering. All pairwise combinations of + method and resolution will be performed.""" + number_neighbors{ + description = """Number of neighbors. If <= 0, uses number of unique + experiment_id.""" + value = 15 + } + methods{ + description = 'Clustering method. Valid options [leiden|louvain].' + value = 'leiden' + } + resolutions{ + description = 'Clustering resolution.' + value = [0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0] + } + + variables_boxplot{ + decription = 'Generate boxplots of these variables for each cluster.' + value ='n_cells,total_counts,pct_counts_gene_group__mito_transcript' + } + + known_markers{ + run_process = false + description = """Files with markers that will be used to generate + dotplots. Each marker file should be the full path and have the + following columns: cell_type, hgnc_symbol. The following columns + are optional: p_value_adj. Use "" for a single entry in the + file_id and file value to indicate no plots.""" + value = [ + [ file_id: 'SmillieCS_31348891', file: '/lustre/scratch119/humgen/projects/sc-eqtl-ibd/data/marker_gene_db-raw_data/database/celltypes/colon/SmillieCS-31348891/database.tsv' ], + [ file_id: 'ParikhK_30814735', file: '/lustre/scratch119/humgen/projects/sc-eqtl-ibd/data/marker_gene_db-raw_data/database/celltypes/colon/ParikhK-30814735/database.tsv' ], + [ file_id: 'JamesKR_32066951', file: '/lustre/scratch119/humgen/projects/sc-eqtl-ibd/data/marker_gene_db-raw_data/database/celltypes/colon-immune/JamesKR-32066951/database.tsv' ] + ] + } + + + + + } + bbknn{ + run_process = true + } + + celltype_assignment{ + run_celltype_assignment=false + run_azimuth=true + run_keras=false + run_celltypist=true + } + reduced_dims{ + vars_to_regress{ + value = '' + } + } + +} + +process { + + withName: plot_distributions{ + containerOptions = "--containall --cleanenv --workdir /tmp -B /tmp" + } + + withName: cellex_cluster_markers{ + maxForks=7 + memory = 300.GB + } + + withName: GATHER_DATA{ + maxForks=7 + memory = 100.GB + } + withName: LISI{ + maxForks=7 + memory = 300.GB + } + withName: cluster_validate_resolution_keras{ + memory = 300.GB + } + + withName: umap_calculate_and_plot{ + memory = 300.GB + } + + withName: sccaf_assess_clustering{ + memory = 300.GB + } + +} diff --git a/assets/deploy_scripts/nohup_start_nextflow_lsf.sh b/assets/deploy_scripts/nohup_start_nextflow_lsf.sh index f1dfcbc0..ee8066ab 100755 --- a/assets/deploy_scripts/nohup_start_nextflow_lsf.sh +++ b/assets/deploy_scripts/nohup_start_nextflow_lsf.sh @@ -17,7 +17,7 @@ parentdir="$(dirname "$CWD1")" export RUN_ID="${PWD##*/}" mkdir $PWD/work || echo 'exists' mkdir $PWD/work/tmp || echo 'exists' -echo $RUN_ID | nextflow run /software/hgi/pipelines/yascp_versions/yascp_v1.2 -profile sanger -c $INPUT_FILE --nf_ci_loc $PWD -resume > nextflow.nohup.log 2>&1 & +echo $RUN_ID | nextflow run /software/hgi/pipelines/yascp_versions/yascp_v1.3__work -profile sanger -c $INPUT_FILE --nf_ci_loc $PWD -resume > nextflow.nohup.log 2>&1 & # get process PID sleep 1 && export PID=$(pgrep -f "\\-\\-nf_ci_loc $RUN_DIR") diff --git a/assets/deploy_scripts/nohup_start_nextflow_lsf__removeWork.sh b/assets/deploy_scripts/nohup_start_nextflow_lsf__removeWork.sh index f640bbf3..28db82dc 100755 --- a/assets/deploy_scripts/nohup_start_nextflow_lsf__removeWork.sh +++ b/assets/deploy_scripts/nohup_start_nextflow_lsf__removeWork.sh @@ -21,7 +21,7 @@ export RUN_ID="${PWD##*/}" # export TEMP=$PWD/tmp # export TMP_DIR=$PWD/tmp -echo $RUN_ID | nextflow run /software/hgi/pipelines/yascp_versions/yascp_v1.2 -profile sanger -c $INPUT_FILE --nf_ci_loc $PWD -entry WORK_DIR_REMOVAL --remove_work_dir -resume > nextflow.nohup.log 2>&1 & +echo $RUN_ID | nextflow run /software/hgi/pipelines/yascp_versions/yascp_v1.3__work -profile sanger -c $INPUT_FILE --nf_ci_loc $PWD -entry WORK_DIR_REMOVAL --remove_work_dir -resume > nextflow.nohup.log 2>&1 & # get process PID sleep 1 && export PID=$(pgrep -f "\\-\\-nf_ci_loc $RUN_DIR") diff --git a/assets/deploy_scripts/nohup_start_nextflow_lsf_celltypes.sh b/assets/deploy_scripts/nohup_start_nextflow_lsf_celltypes.sh index 800475d7..295cf5c7 100755 --- a/assets/deploy_scripts/nohup_start_nextflow_lsf_celltypes.sh +++ b/assets/deploy_scripts/nohup_start_nextflow_lsf_celltypes.sh @@ -17,7 +17,7 @@ parentdir="$(dirname "$CWD1")" export RUN_ID="${PWD##*/}" mkdir $PWD/work || echo 'exists' mkdir $PWD/work/tmp || echo 'exists' -echo $RUN_ID | nextflow run /software/hgi/pipelines/yascp_versions/yascp_v1.2 -profile sanger -entry JUST_CELLTYPES -c $INPUT_FILE --nf_ci_loc $PWD -resume > nextflow.nohup.log 2>&1 & +echo $RUN_ID | nextflow run /software/hgi/pipelines/yascp_versions/yascp_v1.3__work -profile sanger -entry JUST_CELLTYPES -c $INPUT_FILE --nf_ci_loc $PWD -resume > nextflow.nohup.log 2>&1 & # get process PID sleep 1 && export PID=$(pgrep -f "\\-\\-nf_ci_loc $RUN_DIR") diff --git a/assets/deploy_scripts/nohup_start_nextflow_lsf_recluster.sh b/assets/deploy_scripts/nohup_start_nextflow_lsf_recluster.sh new file mode 100755 index 00000000..995e07e5 --- /dev/null +++ b/assets/deploy_scripts/nohup_start_nextflow_lsf_recluster.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash +INPUT_FILE=$1 +dt=`date +"%Y_%m_%d_%T"` +cp nextflow.nohup.log ./nextflow.nohup_$dt.log2 || echo 'first time running' +# activate Nextflow conda env + +# clean up previous run files +rm -f *.log +rm -f nextflow.nohup.PID.txt + +# start Nextflow in background: +export NXF_OPTS="-Xms5G -Xmx5G" + +CWD1="$PWD" +parentdir="$(dirname "$CWD1")" +# export RUN_ID="${parentdir##*/}" +export RUN_ID="${PWD##*/}" +mkdir $PWD/work || echo 'exists' +mkdir $PWD/work/tmp || echo 'exists' +echo $RUN_ID | nextflow run /software/hgi/pipelines/yascp_versions/yascp_v1.3__work -profile sanger -entry JUST_RECLUSTER -c /software/hgi/pipelines/yascp_versions/yascp_v1.3__work/assets/deploy_scripts/input_setups/recluster_profile.nf -c $INPUT_FILE --nf_ci_loc $PWD -resume > nextflow.nohup.log 2>&1 & + +# get process PID +sleep 1 && export PID=$(pgrep -f "\\-\\-nf_ci_loc $RUN_DIR") +echo $PID > nextflow.nohup.PID.txt +echo "Nextflow PID is $PID (saved in ./nextflow.nohup.PID.txt)" +echo kill with \"kill $PID\" +echo "check logs files nextflow.nohup.log and .nextflow.log" diff --git a/assets/deploy_scripts/nohup_start_nextflow_lsf_test.sh b/assets/deploy_scripts/nohup_start_nextflow_lsf_test.sh index 6a8e1946..cc5fd45d 100755 --- a/assets/deploy_scripts/nohup_start_nextflow_lsf_test.sh +++ b/assets/deploy_scripts/nohup_start_nextflow_lsf_test.sh @@ -16,7 +16,7 @@ parentdir="$(dirname "$CWD1")" export RUN_ID="${PWD##*/}" mkdir $PWD/work || echo 'exists' mkdir $PWD/work/tmp || echo 'exists' -echo $RUN_ID | nextflow run /software/hgi/pipelines/yascp_versions/yascp_v1.2 -profile sanger,test --nf_ci_loc $PWD -resume > nextflow.nohup.log 2>&1 & +echo $RUN_ID | nextflow run /software/hgi/pipelines/yascp_versions/yascp_v1.3__work -profile sanger,test --nf_ci_loc $PWD -resume > nextflow.nohup.log 2>&1 & # get process PID sleep 1 && export PID=$(pgrep -f "\\-\\-nf_ci_loc $RUN_DIR") diff --git a/bin/0026-plot_filtered_cells.py b/bin/0026-plot_filtered_cells.py index 1752b769..7295070b 100755 --- a/bin/0026-plot_filtered_cells.py +++ b/bin/0026-plot_filtered_cells.py @@ -67,13 +67,16 @@ def main(): # Check if any difference between before and after filters. If not, # return early. df_after_filters = df[df.filter_type.isin(['after_filters'])] - filt = df_after_filters.n_cells_left_in_adata == df_before_filters.loc[ - df_after_filters.experiment_id, - 'n_cells_left_in_adata' - ].values - if all(filt): - print("No difference detected before and after filters. No plots.") - return() + try: + filt = df_after_filters.n_cells_left_in_adata == df_before_filters.loc[ + df_after_filters.experiment_id, + 'n_cells_left_in_adata' + ].values + if all(filt): + print("No difference detected before and after filters. No plots.") + return() + except: + return() # Set some plotting parameters plt_height = 16 # 1.5 * df.experiment_id.nunique() diff --git a/bin/0028-plot_predicted_sex.py b/bin/0028-plot_predicted_sex.py index 700e98d3..fad7fab6 100755 --- a/bin/0028-plot_predicted_sex.py +++ b/bin/0028-plot_predicted_sex.py @@ -60,7 +60,10 @@ def main(): # Load the AnnData file adata = sc.read_h5ad(filename=options.h5) - + try: + adata.X=adata.layers['counts'] + except: + _='counts may be already set' # If we have a flag for cells that pass QC then filter down to them if 'cell_passes_qc' in adata.obs: adata = adata[adata.obs['cell_passes_qc'], :] diff --git a/bin/0030-estimate_pca_elbow.py b/bin/0030-estimate_pca_elbow.py index 75c1e490..0a952160 100755 --- a/bin/0030-estimate_pca_elbow.py +++ b/bin/0030-estimate_pca_elbow.py @@ -78,7 +78,10 @@ def main(): # Read in the dataframe adata = sc.read_h5ad(filename=options.h5) - + try: + adata.X=adata.layers['counts'] + except: + _='counts may be already set' kneedle_dict = {} output_dict = {} diff --git a/bin/0035-scanpy_normalize_pca.py b/bin/0035-scanpy_normalize_pca.py index 99747ec5..93c6ba67 100755 --- a/bin/0035-scanpy_normalize_pca.py +++ b/bin/0035-scanpy_normalize_pca.py @@ -372,7 +372,6 @@ def scanpy_normalize_and_pca( sc.pp.filter_genes(adata, min_cells=5) # Only consider genes expressed in more than 0.5% of cells: # sc.pp.filter_genes(adata, min_cells=0.005*len(adata.obs.index)) - # Total-count normalize (library-size correct) the data matrix X to # counts per million, so that counts become comparable among cells. sc.pp.normalize_total( @@ -385,26 +384,8 @@ def scanpy_normalize_and_pca( # Logarithmize the data: X = log(X + 1) where log = natural logorithm. # Numpy has a nice function to undo this np.expm1(adata.X). sc.pp.log1p(adata) - # Delete automatically added uns - UPDATE: bad idea to delete as this slot - # is used in _highly_variable_genes_single_batch. - # del adata.uns['log1p'] - # Add record of this operation. - # adata.layers['log1p_cpm'] = adata.X.copy() - # adata.uns['log1p_cpm'] = {'transformation': 'ln(CPM+1)'} adata.layers['log1p_cp10k'] = adata.X.copy() adata.uns['log1p_cp10k'] = {'transformation': 'ln(CP10k+1)'} - - # Stash the unprocessed data in the raw slot. - # adata.raw.X.data is now ln(CPM+1). - # NOTE: - Layers are not preserved in adata.raw, though obs, var, uns are. - # - If genes are filtered (e.g., - # sc.pp.filter_genes(adata, min_cells=1)), the full dataset will - # remain in the raw slot. - # - We store in the raw slot because later for UMAP and marker gene - # analysis, we can easily tell scanpy to use the raw slot via the - # use_raw = True flag. Raw was specifically designed for this use - # case of ln(CPM+1), - # Can be deleted later: del adata.raw adata.raw = adata # adata_raw = adata.raw.to_adata() @@ -433,32 +414,7 @@ def scanpy_normalize_and_pca( batch_key=variable_feature_batch_key, inplace=True ) - if verbose: - print('{}: {} (all batches); {} ({})'.format( - 'Number of variable features detected', - adata.var['highly_variable_intersection'].sum(), - adata.var['highly_variable'].sum(), - 'after ranking the number of batches where a feature is variable' - )) - # If n_top_genes = None, then one needs to set 'highly_variable'. - # Here, highly_variable_intersection is only true for genes variable across - # all batch keys (i.e., 'highly_variable_nbatches' = n_batch_keys): - # adata.var.loc[ - # adata.var["highly_variable_intersection"], - # ["highly_variable_nbatches"] - # ] - # - # If n_top_genes = None, then one also needs needs to set highly_variable'. - # Fix bug in PCA when we have set batch_key. More below: - # https://github.com/theislab/scanpy/issues/1032 - # adata.var['highly_variable'] = adata.var['highly_variable_intersection'] - # - # Alternatively, if one specifies n_top_genes, then genes are ranked by - # 'highly_variable_nbatches' and highly_variable is set to the top n. - # adata.var.loc[ - # adata.var["highly_variable"], - # ["highly_variable_nbatches"] - # ] + if plot: # Plot highly variable genes. @@ -609,106 +565,15 @@ def scanpy_normalize_and_pca( copy=False ) + # Keep a record of the different gene scores if score_genes_df is not None: adata.uns['df_score_genes'] = score_genes_df_updated - # Calculate PCs. - - seed_value = 0 - # 0. Set `PYTHONHASHSEED` environment variable at a fixed value - os.environ['PYTHONHASHSEED'] = str(seed_value) - # 1. Set `python` built-in pseudo-random generator at a fixed value - random.seed(seed_value) - # 2. Set `numpy` pseudo-random generator at a fixed value - np.random.seed(seed_value) - - sc.tl.pca( - adata, - n_comps=min(200, adata.var['highly_variable'].sum()), - zero_center=True, # Set to true for standard PCA - svd_solver='arpack', # arpack reproducible when zero_center = True - use_highly_variable=True, - copy=False, - random_state=np.random.RandomState(0), - chunked=False - ) - # pca( - # adata, - # n_comps=min(200, adata.var['highly_variable'].sum()), - # svd_solver='arpack', # lobpcg not found in current sklearn - # use_highly_variable=True, - # copy=False - # ) - - # Save PCs to a seperate file for Harmony. - pca_df = pd.DataFrame( - adata.obsm['X_pca'], - index=adata.obs_names, - columns=[ - 'PC{}'.format(x) for x in range(1, adata.obsm['X_pca'].shape[1]+1) - ] - ) - pca_df.to_csv( - '{}-pcs.tsv.gz'.format(output_file), - sep='\t', - index=True, - index_label='cell_barcode', - na_rep='', - compression=compression_opts - ) - - # Save the metadata to a seperate file for Harmony. - adata.obs.to_csv( - '{}-metadata.tsv.gz'.format(output_file), - sep='\t', - index=True, - quoting=csv.QUOTE_NONNUMERIC, - index_label='cell_barcode', - na_rep='', - compression=compression_opts - ) - # Save the data. adata.write( - '{}-normalized_pca.h5ad'.format(output_file), + '{}-normalized.h5ad'.format(output_file), compression='gzip' - #compression_opts=anndata_compression_opts - ) - # adata_merged.write_csvs(output_file) - # adata_merged.write_loom(output_file+".loom")) - - # Plot the PC info. - if plot: - # Plot the vanilla PCs. - # sc.pl.pca( - # adata, - # color='experiment_id', - # components=['1,2', '3,4'] - # ) - _ = sc.pl.pca_variance_ratio( - adata, - n_pcs=adata.obsm['X_pca'].shape[1], - log=False, - show=False, - save='-{}.pdf'.format(output_file) - ) - _ = sc.pl.pca_variance_ratio( - adata, - n_pcs=adata.obsm['X_pca'].shape[1], - log=True, - show=False, - save='-{}-log.pdf'.format(output_file) - ) - - # Save the filtered count matrix for input to other software like scVI - adata.X = adata.layers['counts'] - del adata.layers['counts'] - del adata.raw - adata.write( - '{}-normalized_pca-counts.h5ad'.format(output_file), - compression='gzip' - #compression_opts=anndata_compression_opts ) return(output_file) @@ -852,7 +717,7 @@ def main(): drop_cell_passes_qc_from_clustering=options.drop_cell_passes_qc_from_clustering # Load the AnnData file adata = sc.read_h5ad(filename=options.h5) - + # adata_comp = sc.read_h5ad(filename='/lustre/scratch123/hgi/teams/hgi/mo11/tmp_projects/harriet/test_recluster/work/6f/e30114c18a6dc6f620da63e187f348/f9d037b7109a2a7f96cb3ad63b97ff/outlier_filtered_adata.h5ad') # if this is the subclustering analysis, the count matrix should be used # by default, the analysis is "conventional" and thus will be skipped if options.layer != "none": diff --git a/bin/0057-scanpy_cluster_validate_resolution-keras.py b/bin/0057-scanpy_cluster_validate_resolution-keras.py index 0dafeafa..f5579e9a 100755 --- a/bin/0057-scanpy_cluster_validate_resolution-keras.py +++ b/bin/0057-scanpy_cluster_validate_resolution-keras.py @@ -519,7 +519,8 @@ def main(): # Virtual devices must be set before GPUs have been initialized print(e) else: - raise Exception('ERROR: no GPUs detected.') + _ = 'running without gpus' + # raise Exception('ERROR: no GPUs detected.') # Get additional data we are going to append to the output model info dict_add = {} diff --git a/bin/pca_anndata.py b/bin/pca_anndata.py new file mode 100755 index 00000000..591c9ad4 --- /dev/null +++ b/bin/pca_anndata.py @@ -0,0 +1,556 @@ +#!/usr/bin/env python + + +__date__ = '2020-03-13' +__version__ = '0.0.1' + +import argparse +from distutils.version import LooseVersion +import os +os.environ['NUMBA_CACHE_DIR']='/tmp' +os.environ['MPLCONFIGDIR']='/tmp' +import random +import numpy as np +import scipy as sp +# import sklearn.utils +import sklearn.decomposition +import pandas as pd +import scanpy as sc +import csv +import time +from datetime import timedelta + +# Set seed for reproducibility +seed_value = 0 +# 0. Set `PYTHONHASHSEED` environment variable at a fixed value +os.environ['PYTHONHASHSEED'] = str(seed_value) +# 1. Set `python` built-in pseudo-random generator at a fixed value +random.seed(seed_value) +# 2. Set `numpy` pseudo-random generator at a fixed value +np.random.seed(seed_value) + +# Set scanpy settings +# sc verbosity: errors (0), warnings (1), info (2), hints (3) +# sc.settings.verbosity = 3 +# sc.logging.print_versions() +# sc.settings.set_figure_params(dpi=80) + + +def pca( + data, + n_comps=None, + svd_solver='arpack', + use_highly_variable=None, + copy=False +): + """Compute PCA coordinates, loadings and variance decomposition. + + Derived from scanpy 1.5.1. + Principal component analysis [Pedregosa11]_.] + Uses the implementation of *scikit-learn* [Pedregosa11]_. + + Parameters + ---------- + data + The (annotated) data matrix of shape `n_obs` × `n_vars`. + Rows correspond to cells and columns to genes. + n_comps + Number of principal components to compute. Defaults to 50, or 1 - + minimum dimension size of selected representation. + svd_solver + SVD solver to use: + `'arpack'` (the default) + for the ARPACK wrapper in SciPy (:func:`~scipy.sparse.linalg.svds`) + `'randomized'` + for the randomized algorithm due to Halko (2009). + `'auto'` + chooses automatically depending on the size of the problem. + `'lobpcg'` + An alternative SciPy solver. + .. versionchanged:: 1.4.5 + Default value changed from `'auto'` to `'arpack'`. + Efficient computation of the principal components of a sparse matrix + currently only works with the `'arpack`' or `'lobpcg'` solvers. + use_highly_variable + Whether to use highly variable genes only, stored in + `.var['highly_variable']`. + By default uses them if they have been determined beforehand. + copy + If an :class:`~anndata.AnnData` is passed, determines whether a copy + is returned. Is ignored otherwise. + Returns + ------- + adata : anndata.AnnData + …otherwise if `copy=True` it returns or else adds fields to `adata`: + `.obsm['X_pca']` + PCA representation of data. + `.varm['PCs']` + The principal components containing the loadings. + `.uns['pca']['variance_ratio']` + Ratio of explained variance. + `.uns['pca']['variance']` + Explained variance, equivalent to the eigenvalues of the + covariance matrix. + """ + adata = data.copy() if copy else data + + if use_highly_variable and 'highly_variable' not in adata.var.keys(): + raise ValueError( + 'Did not find adata.var[\'highly_variable\']. ' + 'Either your data already only consists of highly-variable genes ' + 'or consider running `pp.highly_variable_genes` first.' + ) + if use_highly_variable is None: + if 'highly_variable' in adata.var.keys(): + use_highly_variable = True + else: + use_highly_variable = False + + if use_highly_variable: + adata_comp = ( + adata[:, adata.var['highly_variable']] + ) + else: + adata_comp = adata + + if n_comps is None: + min_dim = min(adata_comp.n_vars, adata_comp.n_obs) + n_comps = min_dim - 1 + + # random_state = sklearn.utils.check_random_state(random_state) + X = adata_comp.X + + # If sparse, make dense. + # Another option: + # output = _pca_with_sparse( + # X, n_comps, solver=svd_solver, random_state=random_state + # ) + if sp.sparse.issparse(X): + X = X.toarray() + + # Sort out the solver + if svd_solver == 'auto': + svd_solver = 'arpack' + if svd_solver not in {'arpack', 'randomized'}: + raise ValueError( + 'svd_solver: {svd_solver} can not be used with sparse input.' + ) + + pca_ = sklearn.decomposition.PCA( + n_components=n_comps, + svd_solver=svd_solver, + random_state=0 + ) + X_pca = pca_.fit_transform(X) + + # Cast to whatever datatype. + # dtype = 'float32' + # dtype + # Numpy data type string to which to convert the result. + # if X_pca.dtype.descr != np.dtype(dtype).descr: + # X_pca = X_pca.astype(dtype) + + # Update the adata frame (if copy=False, then this is the same input adata + # that the user provided) + adata.obsm['X_pca'] = X_pca + adata.uns['pca'] = {} + adata.uns['pca']['params'] = { + 'zero_center': True, + 'use_highly_variable': use_highly_variable, + } + if use_highly_variable: + adata.varm['PCs'] = np.zeros(shape=(adata.n_vars, n_comps)) + adata.varm['PCs'][adata.var['highly_variable']] = pca_.components_.T + else: + adata.varm['PCs'] = pca_.components_.T + adata.uns['pca']['variance'] = pca_.explained_variance_ + adata.uns['pca']['variance_ratio'] = pca_.explained_variance_ratio_ + + return adata if copy else None + + +def score_cells( + adata, + score_genes_df, + score_genes_df_column='ensembl_gene_id', + only_use_variable_genes=False +): + """Scores each cell. + + Parameters + ---------- + adata : AnnData + Input AnnData object. Assume adata.X is norm->log1p->scaled data. + score_genes_df : pd.DataFrame + Dataframe of marker genes. Needs to have score_genes_df_column and + score_id column. If one score_id == 'cell_cycle', then requires a + grouping_id column with 'G2/M' and 'S'. + score_genes_df_column : string + Column in score_genes_df to use for gene ids (e.g., hgnc_symbol, + ensembl_gene_id) + only_use_variable_genes : boolean + Only use variable genes to calculate scores. If True, score_id will + be changed to __hvg_only. Note this flage does not apply + to score_id == 'cell_cycle'. + + + Returns + ------- + adata : AnnData + AnnData object with scores calculated and stored in + adata.obs[]. + score_genes_df : pd.DataFrame + The score_genes_df with the following columns added: + gene_found_in_adata, gene_found_is_highly_variable. It is suggested + that this dataframe is added to the adata.uns slot. + """ + verbose = False # For debugging purposes. + + # Update the score_genes_df with details on the genes and if they were + # found in adata and if they are highly variable. + score_genes_df['gene_found_in_adata'] = np.in1d( + score_genes_df[score_genes_df_column], + adata.var.index + ) + score_genes_df['gene_found_is_highly_variable'] = np.in1d( + score_genes_df[score_genes_df_column], + adata.var.index[adata.var['highly_variable']] + ) + + # Set the gene pool parameter. + gene_pool = None # If None, all genes are randomly sampled for background + if only_use_variable_genes: + gene_pool = adata.var.index[adata.var['highly_variable']] + + # Loop over each score_id in score_genes_df, updating adata. + for score_id, df_group in score_genes_df.groupby('score_id'): + # Downsample to only those genes found in the data. + df_group = df_group.loc[ + df_group['gene_found_in_adata'], : + ] + if df_group.shape[0] == 0: + continue + + # If we are supposed to use only_use_variable_genes, then do so. + if only_use_variable_genes: + if score_id == 'cell_cycle': + continue + score_id = '{}__hvg_only'.format(score_id) + df_group = df_group.loc[ + df_group['gene_found_is_highly_variable'], : + ] + if df_group.shape[0] == 0: + continue + if verbose: + print('Scoring {}'.format(score_id)) + + # Set the number of control genes. + ctrl_size = 50 + if df_group.shape[0] > 50: + ctrl_size = df_group.shape[0] + if gene_pool is not None: + if ctrl_size > len(gene_pool): + raise Exception( + 'Error in gene scoring ctrl_size > len(gene_pool)' + ) + + # If the score_id is cell_cycle, then use the specific cell cycle + # scoring function. + if score_id == 'cell_cycle': + # NOTE: Setting ctrl_size` is not possible, as it's set as + # `min(len(s_genes), len(g2m_genes))`. + sc.tl.score_genes_cell_cycle( + adata, + s_genes=df_group.loc[ + df_group['grouping_id'] == 'S', score_genes_df_column + ], + g2m_genes=df_group.loc[ + df_group['grouping_id'] == 'G2/M', score_genes_df_column + ], + copy=False, + gene_pool=gene_pool, # Default is None (aka, use all) + n_bins=25, # Default is 25 + use_raw=False + ) + else: + sc.tl.score_genes( + adata, + df_group[score_genes_df_column], + ctrl_size=ctrl_size, # Default is 50 + gene_pool=gene_pool, # Default is None (aka, use all) + n_bins=25, # Default is 25 + score_name=score_id, + random_state=0, # Default is 0 + copy=False, + use_raw=False + ) + + return adata, score_genes_df + + +def pca_analysis( + adata, + output_file, + variable_feature_batch_key='experiment_id', + n_variable_features=2000, + exclude_hv_gene_df=[], + score_genes_df=None, + verbose=True, + plot=True, + anndata_compression_opts=4 +): + + # Calculate PCs. + seed_value = 0 + # 0. Set `PYTHONHASHSEED` environment variable at a fixed value + os.environ['PYTHONHASHSEED'] = str(seed_value) + # 1. Set `python` built-in pseudo-random generator at a fixed value + random.seed(seed_value) + # 2. Set `numpy` pseudo-random generator at a fixed value + np.random.seed(seed_value) + + sc.tl.pca( + adata, + n_comps=min(200, adata.var['highly_variable'].sum()), + zero_center=True, # Set to true for standard PCA + svd_solver='arpack', # arpack reproducible when zero_center = True + use_highly_variable=True, + copy=False, + random_state=np.random.RandomState(0), + chunked=False + ) + + # Save PCs to a seperate file for Harmony. + pca_df = pd.DataFrame( + adata.obsm['X_pca'], + index=adata.obs_names, + columns=[ + 'PC{}'.format(x) for x in range(1, adata.obsm['X_pca'].shape[1]+1) + ] + ) + + compression_opts = 'gzip' + if LooseVersion(pd.__version__) > '1.0.0': + compression_opts = dict( + method='gzip', + compresslevel=9 + ) + + pca_df.to_csv( + '{}-pcs.tsv.gz'.format(output_file), + sep='\t', + index=True, + index_label='cell_barcode', + na_rep='', + compression=compression_opts + ) + + # Save the metadata to a seperate file for Harmony. + adata.obs.to_csv( + '{}-metadata.tsv.gz'.format(output_file), + sep='\t', + index=True, + quoting=csv.QUOTE_NONNUMERIC, + index_label='cell_barcode', + na_rep='', + compression=compression_opts + ) + + # Save the data. + adata.write( + '{}-normalized_pca.h5ad'.format(output_file), + compression='gzip' + ) + # Plot the PC info. + if plot: + # Plot the vanilla PCs. + sc.pl.pca_variance_ratio( + adata, + n_pcs=adata.obsm['X_pca'].shape[1], + log=False, + show=False, + save='-{}.pdf'.format(output_file) + ) + + sc.pl.pca_variance_ratio( + adata, + n_pcs=adata.obsm['X_pca'].shape[1], + log=True, + show=False, + save='-{}-log.pdf'.format(output_file) + ) + + # Save the filtered count matrix for input to other software like scVI + adata.X = adata.layers['counts'] + del adata.layers['counts'] + del adata.raw + adata.write( + '{}-normalized_pca-counts.h5ad'.format(output_file), + compression='gzip' + #compression_opts=anndata_compression_opts + ) + + +def main(): + """Run CLI.""" + parser = argparse.ArgumentParser( + description=""" + Read anndata object. Normalize, calculate PCs. Save new anndata + object along with csv file of PCs. + """ + ) + + parser.add_argument( + '-v', '--version', + action='version', + version='%(prog)s {version}'.format(version=__version__) + ) + + parser.add_argument( + '-h5', '--h5_anndata', + action='store', + dest='h5', + required=True, + help='H5 AnnData file.' + ) + + parser.add_argument( + '-layer', '--overwrite_x_with_layer', + action='store', + dest='layer', + default='none', + help='Specify a layer of the AnnData file, which should be used for \ + the following normalization and downstream analysis. This should \ + go together with the analysis mode of the pipeline as \ + "conventional" or "subclustering". \ + (default: %(default)s)' + ) + + parser.add_argument( + '-bk', '--batch_key', + action='store', + dest='bk', + default='experiment_id', + help='Batch key for highly-variable feature (e.g., gene) detection.\ + If specified, highly-variable features are selected within each\ + batch separately and merged.\ + (default: %(default)s)' + ) + + parser.add_argument( + '-nvf', '--number_variable_features', + action='store', + dest='nvf', + default=2000, + type=int, + help='After calculating variable features within each batch set via\ + , rank features by number of batches where they are\ + variable and select the top .\ + (default: %(default)s)' + ) + + parser.add_argument( + '-vge', '--variable_genes_exclude', + action='store', + dest='vge', + default='', + help='Tab-delimited file with genes to exclude from the highly\ + variable gene list. Must contain ensembl_gene_id column.\ + (default: None - keep all variable genes)' + ) + + parser.add_argument( + '-vr', '--vars_to_regress', + action='store', + dest='vr', + default='', + help='Comma seperated list of metadata variables to regress prior to\ + calculating PCs. Example: gene_group__mito_transcript,n_count.\ + (default: "" and sc.pp.regress_out is not called)' + ) + + parser.add_argument( + '-sg', '--score_genes', + action='store', + dest='sg', + default='', + help='Tab-delimited file of genes for scores. Needs to have\ + ensembl_gene_id and score_id column. If one\ + score_id == "cell_cycle", then requires a grouping_id column with\ + "G2/M" and "S".' + ) + + parser.add_argument( + '-drop_cell_passes_qc_from_clustering', '--drop_cell_passes_qc_from_clusteringdrop_cell_passes_qc_from_clustering', + action='store', + dest='drop_cell_passes_qc_from_clustering', + default=False, + help='Whether we want to drop cells before clustering based on the cell_passes_qc filter established by outlier filter part of pipeline' + ) + + + parser.add_argument( + '-ncpu', '--number_cpu', + action='store', + dest='ncpu', + default=4, + type=int, + help='Number of CPUs to use.\ + (default: %(default)s)' + ) + + parser.add_argument( + '--anndata_compression_opts', + action='store', + dest='anndata_compression_opts', + default=4, + type=int, + help='Compression level in anndata. A larger value decreases disk \ + space requirements at the cost of compression time. \ + (default: %(default)s)' + ) + + parser.add_argument( + '-of', '--output_file', + action='store', + dest='of', + default='adata-normalize_pca', + help='Directory and basename of output files.\ + (default: %(default)s)' + ) + + options = parser.parse_args() + + # Scanpy settings + sc.settings.figdir = os.getcwd() # figure output directory to match base. + sc.settings.n_jobs = options.ncpu # number CPUs + # sc.settings.max_memory = 500 # in Gb + # sc.set_figure_params(dpi_save = 300) + drop_cell_passes_qc_from_clustering=options.drop_cell_passes_qc_from_clustering + # Load the AnnData file + adata = sc.read_h5ad(filename=options.h5) + try: + del adata.uns + except: + _='still there' + # adata_comp = sc.read_h5ad(filename='/lustre/scratch123/hgi/teams/hgi/mo11/tmp_projects/harriet/test_recluster/work/6f/e30114c18a6dc6f620da63e187f348/f9d037b7109a2a7f96cb3ad63b97ff/outlier_filtered_adata.h5ad') + # adata_comp = sc.read_h5ad(filename='/lustre/scratch123/hgi/teams/hgi/mo11/tmp_projects/harriet/test_recluster/work/91/676237d4521fd78b293a8c4e548394/adata-normalized_pca.h5ad') + + start_time = time.time() + + pca_analysis( + adata, + output_file=options.of, + variable_feature_batch_key=options.bk, + n_variable_features=options.nvf, + anndata_compression_opts=options.anndata_compression_opts + ) + execution_summary = "Analysis execution time [{}]:\t{}".format( + "pca.py", + str(timedelta(seconds=time.time()-start_time)) + ) + print(execution_summary) + + +if __name__ == '__main__': + main() diff --git a/conf/base.conf b/conf/base.conf index d31eed54..cc41cee9 100755 --- a/conf/base.conf +++ b/conf/base.conf @@ -36,15 +36,16 @@ params{ mem1= 12000 copy_mode = "rellink" split_bam = false + cluster_markers = true existing_cellsnp="${projectDir}/assets/existing_cellsnp" existing_vireo='' - skip_preprocessing{ - value=false - gt_match_file="" // #We prvide this if we want to exclude a particular samples matched to a ceirtain GT cohortc from the adaptive qc - gt_match_based_adaptive_qc_exclusion_pattern = '' // #We run the adaptive QC on these patterns independently regardless on assigned celltype. - file__anndata_merged = '' - file__cells_filtered = '' - } + normalise_andata = true + skip_preprocessing=false + gt_match_file="" // #We prvide this if we want to exclude a particular samples matched to a ceirtain GT cohortc from the adaptive qc + gt_match_based_adaptive_qc_exclusion_pattern = '' // #We run the adaptive QC on these patterns independently regardless on assigned celltype. + file__anndata_merged = '' + file__cells_filtered = '' + id_in='experiment_id' genotype_phenotype_mapping_file ='' extra_sample_metadata = '' use_phenotype_ids_for_gt_match = true //#if false this will keep the genotype ids, for this to be used have to set a genotype_phenotype_mapping_file to a path to csv where firs column contains genotype ids and second contains phenotype ids to replace these to. diff --git a/main.nf b/main.nf index c6dea7ed..138101bf 100755 --- a/main.nf +++ b/main.nf @@ -13,6 +13,9 @@ include { YASCP } from "$projectDir/workflows/yascp" include { RETRIEVE_RECOURSES;RETRIEVE_RECOURSES_TEST_DATASET } from "$projectDir/subworkflows/local/retrieve_recourses" include {RSYNC_RESULTS_REMOVE_WORK_DIR} from "$projectDir/modules/local/rsync_results_remove_work_dir/main" include {celltype} from "$projectDir/subworkflows/celltype" +include {qc} from "$projectDir/subworkflows/qc" +include {dummy_filtered_channel} from "$projectDir/modules/nf-core/modules/merge_samples/functions" + ////// WORKFLOW: Run main nf-core/yascp analysis pipeline // This is the default entry point, we have others to update ceirtain parts of the results. // Please go to ./workflows/yascp to see the main Yascp workflow. @@ -46,11 +49,19 @@ workflow { workflow JUST_CELLTYPES{ - file__anndata_merged = Channel.from(params.skip_preprocessing.file__anndata_merged) + file__anndata_merged = Channel.from(params.file__anndata_merged) celltype(file__anndata_merged) } +workflow JUST_RECLUSTER{ + file__anndata_merged = Channel.from(params.file__anndata_merged) + gt_outlier_input = Channel.from("$projectDir/assets/fake_file.fq") + dummy_filtered_channel(file__anndata_merged,params.id_in) + file__cells_filtered = dummy_filtered_channel.out.anndata_metadata + qc(file__anndata_merged,file__cells_filtered,gt_outlier_input) //This runs the Clusterring and qc assessments of the datasets. + +} ////// You do not need to concern about the workflows bellow as these are Cardinal Specific and used for development diff --git a/modules/nf-core/modules/clustering/functions.nf b/modules/nf-core/modules/clustering/functions.nf index 5f6efa32..b54847fb 100755 --- a/modules/nf-core/modules/clustering/functions.nf +++ b/modules/nf-core/modules/clustering/functions.nf @@ -341,7 +341,7 @@ process cluster_validate_resolution_keras { // ------------------------------------------------------------------------ //cache false // cache results from run //maxForks 2 // hard to control memory usage. limit to 3 concurrent - label 'gpu' // use GPU + label 'process_low' // use GPU scratch false // use tmp directory if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { container "https://yascp.cog.sanger.ac.uk/public/singularity_images/wtsihgi_nf_scrna_qc_6bb6af5-2021-12-23-3270149cf265.sif" diff --git a/modules/nf-core/modules/clustering/main.nf b/modules/nf-core/modules/clustering/main.nf index ea4e0b2a..2c850608 100755 --- a/modules/nf-core/modules/clustering/main.nf +++ b/modules/nf-core/modules/clustering/main.nf @@ -71,37 +71,34 @@ workflow CLUSTERING { // cluster_validate_resolution__sparsity, // cluster_validate_resolution__train_size_cells // ) - if (params.utilise_gpu){ - if (params.cluster_validate_resolution_keras){ + + if (params.cluster_validate_resolution_keras){ - - cluster_validate_resolution_keras( - cluster.out.outdir, - cluster.out.anndata, - cluster.out.metadata, - cluster.out.pcs, - cluster.out.reduced_dims, - cluster.out.clusters, - cluster_validate_resolution__sparsity, - cluster_validate_resolution__train_size_cells, - cluster.out.outdir__reduced_dims - ) + + cluster_validate_resolution_keras( + cluster.out.outdir, + cluster.out.anndata, + cluster.out.metadata, + cluster.out.pcs, + cluster.out.reduced_dims, + cluster.out.clusters, + cluster_validate_resolution__sparsity, + cluster_validate_resolution__train_size_cells, + cluster.out.outdir__reduced_dims + ) - plot_resolution_validate( - cluster_validate_resolution_keras.out.plot_input.groupTuple() - ) - } - + plot_resolution_validate( + cluster_validate_resolution_keras.out.plot_input.groupTuple() + ) } + + SCCAF(cluster.out.outdir, cluster.out.anndata, cluster.out.clusters, sccaf_minacc) - - - // // Generate UMAPs of the results. umap_calculate_and_plot( cluster.out.outdir, @@ -118,28 +115,29 @@ workflow CLUSTERING { ) dummy_output=umap_calculate_and_plot.out.dummy_output // // Find marker genes for clusters - cluster_markers( - cluster.out.outdir, - cluster.out.anndata, - cluster.out.metadata, - cluster.out.pcs, - cluster.out.reduced_dims, - cluster.out.clusters, - cluster_marker__methods - ) - - // // Find marker genes for clusters using CELLEX - cellex_cluster_markers( - cluster.out.outdir, - cluster.out.anndata - ) + if (params.cluster_markers){ + cluster_markers( + cluster.out.outdir, + cluster.out.anndata, + cluster.out.metadata, + cluster.out.pcs, + cluster.out.reduced_dims, + cluster.out.clusters, + cluster_marker__methods + ) - // Prep adata file for cellxgene website - prep_cellxgene( - cluster.out.outdir, - cluster.out.anndata - ) + // // Find marker genes for clusters using CELLEX + cellex_cluster_markers( + cluster.out.outdir, + cluster.out.anndata + ) + // Prep adata file for cellxgene website + prep_cellxgene( + cluster.out.outdir, + cluster.out.anndata + ) + } emit: dummy_output diff --git a/modules/nf-core/modules/estimate_pca_elbow/main.nf b/modules/nf-core/modules/estimate_pca_elbow/main.nf index 05d3d9ac..2251e3af 100755 --- a/modules/nf-core/modules/estimate_pca_elbow/main.nf +++ b/modules/nf-core/modules/estimate_pca_elbow/main.nf @@ -43,10 +43,7 @@ process ESTIMATE_PCA_ELBOW { script: outdir = "${outdir_prev}" - log.info("""outdir = ${outdir}""") - // from the file__anndata job. outfile = "${file__anndata}".minus(".h5ad") - .split("-").drop(1).join("-") outfile = "${outfile}-knee" """ rm -fr plots diff --git a/modules/nf-core/modules/normalise_and_pca/main.nf b/modules/nf-core/modules/normalise_and_pca/main.nf index b0f81031..696128c9 100755 --- a/modules/nf-core/modules/normalise_and_pca/main.nf +++ b/modules/nf-core/modules/normalise_and_pca/main.nf @@ -3,6 +3,59 @@ def random_hex(n) { Long.toUnsignedString(new Random().nextLong(), n).toUpperCase() } +process PCA { + + label 'process_medium' + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://yascp.cog.sanger.ac.uk/public/singularity_images/nf_qc_scrna_v1.img" + // /software/hgi/containers/nf_qc_scrna_v1.img + } else { + container "mercury/nf_qc_scrna:v1" + } + + publishDir path: "${outdir}", + saveAs: {filename -> filename.replaceAll("-", "")}, + mode: "${params.copy_mode}", + overwrite: "true" + + input: + path(file__anndata) + val(outdir) + val(layer) + + output: + val(outdir, emit: outdir) + val("${outdir}", emit: outdir3) + path("adata-normalized_pca.h5ad", emit: anndata) + path("adata-metadata.tsv.gz", emit: metadata) + path("adata-pcs.tsv.gz", emit: pcs) + path( + "adata-normalized_pca-counts.h5ad", + emit: anndata_filtered_counts + ) + val("${param_details}", emit: param_details) + path("plots/*.pdf") + path("plots/*.png") optional true + + script: + + """ + rm -fr plots + pca_anndata.py \ + --h5_anndata ${file__anndata} \ + --overwrite_x_with_layer ${layer} \ + --output_file adata \ + --number_cpu ${task.cpus} \ + --drop_cell_passes_qc_from_clustering ${params.drop_cell_passes_qc_from_clustering} + mkdir plots + + mv *pdf plots/ 2>/dev/null || true + mv *png plots/ 2>/dev/null || true + """ + +} + + process NORMALISE_AND_PCA { // Takes annData object, nomalizes across samples, calculates PCs. // NOTE: Once normalization is set, it would be faster to normalize per @@ -25,7 +78,7 @@ process NORMALISE_AND_PCA { overwrite: "true" input: - val(outdir_prev) + path(file__anndata) val(analysis_mode) val(layer) @@ -36,16 +89,8 @@ process NORMALISE_AND_PCA { output: val(outdir, emit: outdir) - val("${outdir}", emit: outdir3) - path("adata-normalized_pca.h5ad", emit: anndata) - path("adata-metadata.tsv.gz", emit: metadata) - path("adata-pcs.tsv.gz", emit: pcs) - - path( - "adata-normalized_pca-counts.h5ad", - emit: anndata_filtered_counts - ) + path("adata-normalized.h5ad", emit: anndata) val("${param_details}", emit: param_details) path("plots/*.pdf") path("plots/*.png") optional true @@ -66,19 +111,14 @@ process NORMALISE_AND_PCA { cmd__vars_to_regress = "--vars_to_regress ${vars_to_regress}" } - // todo - mo11 - these paths are confusing - - outdir = "${outdir_prev}/normalize=total_count.${param_details}" + outdir = "${params.outdir}/clustering/normalize=total_count.${param_details}" // Add details on the genes we are exlcuding from hgv list. file_vge = "${file__genes_exclude_hvg.getSimpleName()}" outdir = "${outdir}.hvg_exclude=${file_vge}" // Add details on the scores we are using. file_score = "${file__genes_score.getSimpleName()}" outdir = "${outdir}.scores=${file_score}" - - // this is where the subfolder 1 is determined - // Customize command for optional files. cmd__genes_exclude_hvg = "" if (file__genes_exclude_hvg.name != "no_file__genes_exclude_hvg") { @@ -89,7 +129,6 @@ process NORMALISE_AND_PCA { cmd__genes_score = "--score_genes ${file__genes_score}" } - """ rm -fr plots 0035-scanpy_normalize_pca.py \ @@ -106,26 +145,4 @@ process NORMALISE_AND_PCA { mv *pdf plots/ 2>/dev/null || true mv *png plots/ 2>/dev/null || true """ - // Old version with bash evaluation of optional commands - // - // echo "normalize_pca: ${process_info}" - // # If there are entries in the variable_genes_exclude file, add it to - // # the call. - // cmd__vg_exclude="--variable_genes_exclude ${file__genes_exclude_hvg}" - // val=\$(cat ${file__genes_exclude_hvg} | wc -l) - // if [ \$val -eq 0 ]; then cmd__vg_exclude=""; fi - // # If there are entries in the score_genes file, add it to the call. - // cmd__score_genes="--score_genes ${file__genes_score}" - // val=\$(cat ${file__genes_score} | wc -l) - // if [ \$val -eq 0 ]; then cmd__score_genes=""; fi - // 0035-scanpy_normalize_pca.py \ - // --h5_anndata ${file__anndata} \ - // --output_file adata \ - // --number_cpu ${task.cpus} \ - // ${cmd__vars_to_regress} \ - // \${cmd__vg_exclude} \ - // \${cmd__score_genes} - // mkdir plots - // mv *pdf plots/ 2>/dev/null || true - // mv *png plots/ 2>/dev/null || true } diff --git a/subworkflows/qc.nf b/subworkflows/qc.nf index 9ea2ff74..2d88024f 100755 --- a/subworkflows/qc.nf +++ b/subworkflows/qc.nf @@ -5,7 +5,7 @@ include {OUTLIER_FILTER} from "$projectDir/modules/nf-core/modules/outlier_filte include {PLOT_STATS} from "$projectDir/modules/nf-core/modules/plot_stats/main" include {ESTIMATE_PCA_ELBOW} from "$projectDir/modules/nf-core/modules/estimate_pca_elbow/main" include {SUBSET_PCS} from "$projectDir/modules/nf-core/modules/subset_pcs/main" -include {NORMALISE_AND_PCA} from "$projectDir/modules/nf-core/modules/normalise_and_pca/main" +include {NORMALISE_AND_PCA; PCA} from "$projectDir/modules/nf-core/modules/normalise_and_pca/main" include {HARMONY} from "$projectDir/modules/nf-core/modules/harmony/main" include {BBKNN} from "$projectDir/modules/nf-core/modules/bbknn/main" include {ADD_EXTRA_METADATA_TO_H5AD} from "$projectDir/modules/nf-core/modules/adata_manipulations/main" @@ -19,7 +19,7 @@ workflow qc { take: file__anndata_merged file__cells_filtered - assignments_all_pools + gt_outlier_input main: log.info "--- Running QC metrics --- " // if(params.extra_metadata!=''){ @@ -29,26 +29,14 @@ workflow qc { // }else{ // log.info '''--- No extra metadata to add to h5ad ---''' // } - file__anndata_merged.map{val1 -> tuple('full', val1)}.set{out1} CELL_HARD_FILTERS(file__anndata_merged,params.hard_filters_file,params.hard_filters_drop) if(params.hard_filters_file != "no_file__file_sample_qc"){ file__anndata_merged = CELL_HARD_FILTERS.out.anndata } - // Next we define an input channel to outlier filtering strategy in case if params.skip_preprocessing.gt_match_based_adaptive_qc_exclusion_pattern !='' - // i.e - if we want to exclude a particular cohort that has been matched by gt match from the adaptive qc we feed this in the outlier_filter() - if(params.skip_preprocessing.gt_match_based_adaptive_qc_exclusion_pattern !=''){ - gt_outlier_input = assignments_all_pools - }else{ - gt_outlier_input = Channel.from("$projectDir/assets/fake_file.fq") - } - - file__anndata_merged.subscribe { println "value1: $it" } - file__cells_filtered.subscribe { println "value2: $it" } - gt_outlier_input.subscribe { println "value3: $it" } //FILTERING OUTLIER CELLS - if (params.sample_qc.cell_filters.filter_outliers.run_process) { + if (params.filter_outliers) { log.info """---Running automatic outlier cell filtering.----""" OUTLIER_FILTER( params.outdir, @@ -67,18 +55,28 @@ workflow qc { } + if (params.normalise_andata){ + NORMALISE_AND_PCA( + file__anndata_merged, + params.mode, + params.layer, + params.genes_exclude_hvg, + params.genes_score, + params.reduced_dims.vars_to_regress.value) + andata = NORMALISE_AND_PCA.out.anndata + outdir = NORMALISE_AND_PCA.out.outdir + + }else{ + andata = file__anndata_merged + outdir = "${params.outdir}" + LI4 = Channel.of([1, 'dummy_lisi']) + } - NORMALISE_AND_PCA(params.outdir+'/clustering', - file__anndata_merged, - params.mode, - params.layer, - params.genes_exclude_hvg, - params.genes_score, - params.reduced_dims.vars_to_regress.value) + PCA(andata,params.outdir,params.layer) ESTIMATE_PCA_ELBOW( - NORMALISE_AND_PCA.out.outdir, - NORMALISE_AND_PCA.out.anndata, + PCA.out.outdir, + PCA.out.anndata, params.reduced_dims.n_dims.add_n_to_estimate ) @@ -91,21 +89,21 @@ workflow qc { } SUBSET_PCS( - NORMALISE_AND_PCA.out.outdir, - NORMALISE_AND_PCA.out.anndata, - NORMALISE_AND_PCA.out.metadata, - NORMALISE_AND_PCA.out.pcs, - NORMALISE_AND_PCA.out.param_details, + PCA.out.outdir, + PCA.out.anndata, + PCA.out.metadata, + PCA.out.pcs, + PCA.out.param_details, n_pcs ) - file__anndata_merged.subscribe { println "PLOT_STATS input: $it" } + PCA.out.outdir.subscribe { println "outdir input: $it" } PLOT_STATS(file__anndata_merged, file__cells_filtered, SUBSET_PCS.out.outdir, SUBSET_PCS.out.anndata, n_pcs) - file__anndata_merged = NORMALISE_AND_PCA.out.anndata + file__anndata_merged = PCA.out.anndata LI4 = PLOT_STATS.out.LI @@ -120,11 +118,11 @@ workflow qc { // "Correct" PCs using Harmony or BBKNN if (params.harmony.run_process) { HARMONY( - NORMALISE_AND_PCA.out.outdir, - NORMALISE_AND_PCA.out.anndata, - NORMALISE_AND_PCA.out.metadata, - NORMALISE_AND_PCA.out.pcs, - NORMALISE_AND_PCA.out.param_details, + PCA.out.outdir, + PCA.out.anndata, + PCA.out.metadata, + PCA.out.pcs, + PCA.out.param_details, n_pcs, Channel.fromList( params.harmony.variables_and_thetas.value) ) @@ -187,11 +185,11 @@ workflow qc { if (params.bbknn.run_process) { BBKNN( - NORMALISE_AND_PCA.out.outdir, - NORMALISE_AND_PCA.out.anndata, - NORMALISE_AND_PCA.out.metadata, - NORMALISE_AND_PCA.out.pcs, - NORMALISE_AND_PCA.out.param_details, + PCA.out.outdir, + PCA.out.anndata, + PCA.out.metadata, + PCA.out.pcs, + PCA.out.param_details, n_pcs, params.bbknn.batch_variable.value ) @@ -256,8 +254,8 @@ workflow qc { lisi_input_second = lisi_input_first.mix(lisi_input3) LISI( - NORMALISE_AND_PCA.out.outdir, - NORMALISE_AND_PCA.out.metadata, + PCA.out.outdir, + PCA.out.metadata, params.lisi.variables.value, lisi_input_second.collect() ) diff --git a/workflows/yascp.nf b/workflows/yascp.nf index 6e425d2f..f3009565 100755 --- a/workflows/yascp.nf +++ b/workflows/yascp.nf @@ -57,7 +57,7 @@ workflow YASCP { if(!params.just_reports){ // sometimes we just want to rerun report generation as a result of alterations, hence if we set params.just_reports =True pipeline will use the results directory and generate a new reports. - if (!params.skip_preprocessing.value){ + if (!params.skip_preprocessing){ // The input table should contain the folowing columns - experiment_id n_pooled donor_vcf_ids data_path_10x_format // prepearing the inputs from a standard 10x dataset folders. prepare_inputs(input_channel) @@ -125,23 +125,23 @@ workflow YASCP { }else{ // This option skips all the deconvolution and and takes a preprocessed yascp h5ad file to run the downstream clustering and celltype annotation. log.info '''----Skipping Preprocessing since we already have prepeared h5ad input file----''' - file__anndata_merged = Channel.from(params.skip_preprocessing.file__anndata_merged) + file__anndata_merged = Channel.from(params.file__anndata_merged) if("${mode}"!='default'){ // Here we have rerun GT matching upstream - done for freeze1 assignments_all_pools = mode }else{ - if (params.skip_preprocessing.file__anndata_merged !=''){ - assignments_all_pools = Channel.from(params.skip_preprocessing.gt_match_file) + if (params.file__anndata_merged !=''){ + assignments_all_pools = Channel.from(params.gt_match_file) }else{ assignments_all_pools = Channel.from("$projectDir/assets/fake_file.fq") } } - if (params.skip_preprocessing.file__cells_filtered ==''){ + if (params.file__cells_filtered ==''){ log.info '''--- No cells filtered input ----''' - dummy_filtered_channel(file__anndata_merged,params.skip_preprocessing.id_in) + dummy_filtered_channel(file__anndata_merged,params.id_in) file__cells_filtered = dummy_filtered_channel.out.anndata_metadata }else{ file__cells_filtered = Channel.from(params.skip_preprocessing.file__cells_filtered) @@ -175,7 +175,14 @@ workflow YASCP { // ################################### if (!params.skip_qc){ - qc(file__anndata_merged,file__cells_filtered,assignments_all_pools) //This runs the Clusterring and qc assessments of the datasets. + + if(params.skip_preprocessing.gt_match_based_adaptive_qc_exclusion_pattern !=''){ + gt_outlier_input = assignments_all_pools + }else{ + gt_outlier_input = Channel.from("$projectDir/assets/fake_file.fq") + } + + qc(file__anndata_merged,file__cells_filtered,gt_outlier_input) //This runs the Clusterring and qc assessments of the datasets. process_finish_check_channel = qc.out.LI file__anndata_merged = qc.out.file__anndata_merged }else{