diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index c3393e7..4edb2f0 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -9,7 +9,7 @@ variables: # Only common file inputs and option values need to be given here # (not things such as -profile) CI_FLAVOUR: "new" - NF_WORKFLOW_OPTS: "--fastq test_data/fastq \ + NF_WORKFLOW_OPTS: "-executor.\\$$local.memory 16GB --fastq test_data/fastq \ --reference_genome test_data/grch38/grch38_chr19_22.fa.gz \ --targets test_data/targets.bed --full_report --threads 4" @@ -34,12 +34,12 @@ docker-run: - if: $MATRIX_NAME == "multi-sample" variables: NF_WORKFLOW_OPTS: - "--fastq test_data/fastq \ + "-executor.\\$$local.memory 16GB --fastq test_data/fastq \ --reference_genome test_data/grch38/grch38_chr19_22.fa.gz \ --targets test_data/targets.bed --full_report --threads 4" - if: $MATRIX_NAME == "one-sample" variables: NF_WORKFLOW_OPTS: - "--fastq 'test_data/fastq/sample_2/sample_2 ontarget.fastq.gz' \ + "-executor.\\$$local.memory 16GB --fastq 'test_data/fastq/sample_2/sample_2 ontarget.fastq.gz' \ --reference_genome test_data/grch38/grch38_chr19_22.fa.gz \ --targets test_data/targets.bed --full_report --threads 4" \ No newline at end of file diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6583953..8134142 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,22 +1,14 @@ repos: - repo: local hooks: - - id: docs_schema - name: docs_schema - entry: parse_docs -p docs -e .md -s intro links -oj nextflow_schema.json - language: python - always_run: true - pass_filenames: false - additional_dependencies: - - epi2melabs - id: docs_readme name: docs_readme - entry: parse_docs -p docs -e .md -s header intro quickstart links -ot README.md + entry: parse_docs -p docs -e .md -s 01_brief_description 02_introduction 03_compute_requirements 04_install_and_run 05_related_protocols 06_inputs 07_outputs 08_pipeline_overview 09_troubleshooting 10_FAQ 11_other -ot README.md -od output_definition.json -ns 
nextflow_schema.json language: python always_run: true pass_filenames: false additional_dependencies: - - epi2melabs + - epi2melabs>=0.0.50 - id: build_models name: build_models entry: datamodel-codegen --strict-nullable --base-class workflow_glue.results_schema_helpers.BaseModel --use-schema-description --disable-timestamp --input results_schema.yml --input-file-type openapi --output bin/workflow_glue/results_schema.py diff --git a/README.md b/README.md index 5ab2de9..8f97f69 100644 --- a/README.md +++ b/README.md @@ -1,105 +1,213 @@ -# wf-cas9 - -wf-cas9 is a [nextflow](https://www.nextflow.io/) workflow -for the multiplexed analysis of Oxford Nanopore Cas9 enrichment sequencing. +# Cas9 enrichment workflow +Summarise the results of Cas9 enrichment sequencing. ## Introduction + + The ONT Cas9 sequencing kit allows the enrichment of genomic regions of interest by amplifying target regions from adapters ligated to Cas9 cleavage sites. The purpose of this workflow is to assess the effectiveness of such Cas9 enrichment, but it can be applied to other enrichment approaches. The workflow outputs -help assess the effectiveness -of the enrichement strategy and can be used to diagnose issues such as poorly performing probes. +help assess the effectiveness of the enrichement strategy and can be used to diagnose issues such as poorly performing probes. -Inputs to the workflow are: a reference genome file, FASTQ reads from enrichment sequencing, -and a BED file detailing the regions of interests (targets). -The main outputs are a report containing summary statistics and plots which give an overview of -the enrichment, and a BAM file with target-overlapping reads. +This workflow can be used for the following: -The main steps of the workflow are alignemnt of reads to the genome using -[minimap2](https://github.com/lh3/minimap2) and the analaysis -of read-target overlap with [bedtools](https://github.com/arq5x/bedtools2). 
++ To obtain simple Cas9 enrichment sequencing summaries. ++ Statistics of the coverage of each target. ++ Plots of coverage across each target. ++ Identification of off-target hot-spots +## Compute requirements +Recommended requirements: ++ CPUs = 16 ++ Memory = 64 GB +Minimum requirements: ++ CPUs = 6 ++ Memory = 16 GB -## Quickstart +Approximate run time: Approximately 30 minutes to process a single sample of 100K reads wit the minimum requirements. -The workflow uses [nextflow](https://www.nextflow.io/) to manage compute and -software resources, as such nextflow will need to be installed before attempting -to run the workflow. +ARM processor support: True -The workflow can currently be run using either -[Docker](https://www.docker.com/products/docker-desktop) or -[singularity](https://docs.sylabs.io/guides/latest/user-guide/) to provide isolation of -the required software. Both methods are automated out-of-the-box provided -either docker or singularity is installed. -It is not required to clone or download the git repository in order to run the workflow. -For more information on running EPI2ME Labs workflows [visit out website](https://labs.epi2me.io/wfindex). -**Workflow options** -To obtain the workflow, having installed `nextflow`, users can run: +## Install and run -``` -nextflow run epi2me-labs/wf-cas9 --help -``` -to see the options for the workflow. + +These are instructions to install and run the workflow on command line. You can also access the workflow via the [EPI2ME application](https://labs.epi2me.io/downloads/). + +The workflow uses [Nextflow](https://www.nextflow.io/) to manage compute and software resources, therefore nextflow will need to be installed before attempting to run the workflow. + +The workflow can currently be run using either [Docker](https://www.docker.com/products/docker-desktop) or +[Singularity](https://docs.sylabs.io/guides/3.0/user-guide/index.html) to provide isolation of +the required software. 
Both methods are automated out-of-the-box provided +either docker or singularity is installed. This is controlled by the [`-profile`](https://www.nextflow.io/docs/latest/config.html#config-profiles) parameter as exemplified below. -The main inputs are: -* Folder of FASTQ reads. -* Genome reference file. -* Target BED file with 4 columns: - * chromosome - * start - * end - * target_name +It is not required to clone or download the git repository in order to run the workflow. +More information on running EPI2ME workflows can be found on our [website](https://labs.epi2me.io/wfindex). +The following command can be used to obtain the workflow. This will pull the repository into the assets folder of nextflow and provide a list of all parameters available for the workflow as well as an example command: -To test on a small dataset with two targets and two chromosomes: -first download and unpack the demo data -```shell +``` +nextflow run epi2me-labs/wf-cas9 --help +``` +A demo dataset is provided for testing of the workflow. It can be downloaded using: +``` wget https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-cas9/wf-cas9-demo.tar.gz \ && tar -xvf wf-cas9-demo.tar.gz - -```shell +``` +The workflow can be run with the demo data using: +``` nextflow run epi2me-labs/wf-cas9 \ --fastq wf-cas9-demo/fastq/ \ --reference_genome wf-cas9-demo/grch38/grch38_chr19_22.fa.gz \ --targets wf-cas9-demo/targets.bed ``` +For further information about running a workflow on the cmd line see https://labs.epi2me.io/wfquickstart/ + + + +## Related protocols + + + +This workflow is designed to take input sequences that have been produced from [Oxford Nanopore Technologies](https://nanoporetech.com/) devices. + +Find related protocols in the [Nanopore community](https://community.nanoporetech.com/docs/). 
+ + + +## Inputs + +### Input Options + +| Nextflow parameter name | Type | Description | Help | Default | +|--------------------------|------|-------------|------|---------| +| fastq | string | FASTQ files to use in the analysis. | This accepts one of three cases: (i) the path to a single FASTQ file; (ii) the path to a top-level directory containing FASTQ files; (iii) the path to a directory containing one level of sub-directories which in turn contain FASTQ files. In the first and second case, a sample name can be supplied with `--sample`. In the last case, the data is assumed to be multiplexed with the names of the sub-directories as barcodes. In this case, a sample sheet can be provided with `--sample_sheet`. | | +| reference_genome | string | FASTA reference file. | Full path to a FASTA reference genome file conating the target regions of interest. | | +| targets | string | A tab-delimited BED file of target regions. | Each row should contain the following fields: chromosome/contig name, start and end of target region, and a name to identify the target. An example row would look like this: `chr19 13204400 13211100 SCA6` | | +| analyse_unclassified | boolean | Analyse unclassified reads from input directory. By default the workflow will not process reads in the unclassified directory. | If selected and if the input is a multiplex directory the workflow will also process the unclassified directory. | False | + + +### Sample Options + +| Nextflow parameter name | Type | Description | Help | Default | +|--------------------------|------|-------------|------|---------| +| sample_sheet | string | A CSV file used to map barcodes to sample aliases. The sample sheet can be provided when the input data is a directory containing sub-directories with FASTQ files. | The sample sheet is a CSV file with, minimally, columns named `barcode` and `alias`. Extra columns are allowed. 
A `type` column is required for certain workflows and should have the following values; `test_sample`, `positive_control`, `negative_control`, `no_template_control`. | | +| sample | string | A single sample name for non-multiplexed data. Permissible if passing a single .fastq(.gz) file or directory of .fastq(.gz) files. | | | + + +### Output Options + +| Nextflow parameter name | Type | Description | Help | Default | +|--------------------------|------|-------------|------|---------| +| out_dir | string | Directory for output of all workflow results. | | output | +| full_report | boolean | Select this option to write a full report that contains plots giving a graphical representation of coverage at each target region. | In cases where there are many target to visualise, the report loading time can be slow and so it's is recommended to set `full_report` to false in such cases. | False | + + +### Miscellaneous Options + +| Nextflow parameter name | Type | Description | Help | Default | +|--------------------------|------|-------------|------|---------| +| disable_ping | boolean | Enable to prevent sending a workflow ping. | | False | + + + + + + +## Outputs + +Outputs files may be aggregated including information for all samples or provided per sample. Per-sample files will be prefixed with respective aliases and represented below as {{ alias }}. + +| Title | File path | Description | Per sample or aggregated | +|-------|-----------|-------------|--------------------------| +| workflow report | ./wf-cas9-report.html | Report for all samples | aggregated | +| Sample summary table | ./sample_summary.csv | Summary statistics for each sample. | aggregated | +| Target summary table | ./target_summary.csv | Summary statistics for each sample-target combination. | aggregated | +| Per file read stats | ./{{ alias }}/{{ alias }}_per-read-stats.tsv.gz | A TSV with per-file read statistics, including all samples. 
| per-sample | +| On-target BAM | .//{{ alias }}/{{ alias }}_on_target.bam | BAM file containing alignments that map to one of the given targets. | per-sample | +| On-target BED | .//{{ alias }}/{{ alias }}_on_target.bed | BED file containing summarising alignments that map to one of the given targets. | per-sample | +| On-target FASTQ | .//{{ alias }}/{{ alias }}_on_target.fastq | FASTQ file containing reads that map to one of the given targets. | per-sample | + + + + +## Pipeline overview + + +### 1. Concatenate input files and generate per read stats. + +The [fastcat/bamstats](https://github.com/epi2me-labs/fastcat) tool is used to concatenate multifile samples to be processed by the workflow. +### 2. Align read to the reference genome. +The reads are then aligned to the reference genome supplied by the user using [minimap2](https://github.com/lh3/minimap2). + +### 3. Generate target coverage data. +This stage of the workflow generates coverage data for each of targets that are used to make +the plots and tables in the report. + +First, the reference genome is split into consecutive windows of 100bp. The coverage statistics are +calcaulted across these windows. + +The user must supply a tab-delinted BED file (`targets`) detailing the genomic locations of the targets of interest. +Here is an example file describing two targets. + +``` +chr19 13204400 13211100 SCA6 +chr22 45791500 45799400 SCA10 +``` +The columns are: ++ chromosome ++ start position ++ end position ++ target name + +*Note*: the file does not contain column names. + +With the target locations defined, [bedtools](https://bedtools.readthedocs.io/en/latest/) is used to generate +information regarding the alignment coverage at each of the targets and also of +background reads (see section 4) per strand. + +### 4. 
Background identification +In a sequencing enrichment experiment, it can be useful to know if there are any off-target genomic regions (regions that are not defined in the `targets` file) that are being preferentially encountered. This information can be used to inform primer design. + +We define off-target regions here as any region not within 1kb of a defined target. +Hot-spots are further defined as contiguous regions of off-target alignments containing at least 10 reads. + + + + + +## Troubleshooting + + ++ If the workflow fails please run it with the demo data set to ensure the workflow itself is working. This will help us determine if the issue is related to the environment, input parameters or a bug. ++ See how to interpret some common nextflow exit codes [here](https://labs.epi2me.io/trouble-shooting/). + + + +## FAQ's + + + +If your question is not answered here, please report any issues or suggestions on the [github issues](https://github.com/epi2me-labs/wf-cas9/issues) page or start a discussion on the [community](https://community.nanoporetech.com/). -**Workflow outputs** -The primary outputs of the workflow include: -* A folder per sample containing: - * BAM file filtered to contain reads overlapping with targets (*_on_target.bam). - * BED file with alignment information for on-target reads (*on_target.bed). - * BED file containing windowed coverage for each target (*target_cov.bed). - * A simple text file providing a summary of sequencing reads (*.stats). -* sample_summary.csv - read and alignment summary for each sample. -* target_summary.csv - read and alignment summary for reads overlapping each target. -* A combined HTML report detailing the primary findings of the workflow across all samples including: - * Sequencing quality plots. - * Tables summarising targeted sequencing results. - * Plots of stranded coverage at each target. - * Histograms of on and off-target coverage for each sample. - * Off-target hotspot region tables. 
+## Related blog posts +See the [EPI2ME website](https://labs.epi2me.io/) for lots of other resources and blog posts. -## Useful links -* [nextflow](https://www.nextflow.io/) -* [docker](https://www.docker.com/products/docker-desktop) -* [singularity](https://docs.sylabs.io/guides/3.5/user-guide/introduction.html) \ No newline at end of file diff --git a/bin/workflow_glue/build_tables.py b/bin/workflow_glue/build_tables.py index 1e696b3..0bdf50e 100755 --- a/bin/workflow_glue/build_tables.py +++ b/bin/workflow_glue/build_tables.py @@ -51,14 +51,14 @@ def main(args): frames = [] - df_read_to_taget = pd.read_csv( + df_read_to_target = pd.read_csv( args.read_to_target, sep='\t', names=['chr', 'start', 'end', 'read_id', 'target', 'sample_id'], index_col=False) read_stats_df = pd.read_csv(args.aln_summary, sep='\t', index_col=False) - df_read_to_taget = df_read_to_taget.merge( + df_read_to_target = df_read_to_target.merge( read_stats_df[['name', 'read_length']], left_on='read_id', right_on='name') @@ -69,19 +69,33 @@ def main(args): df = df.drop(columns=['sample_id']) if len(df) == 0: continue - df_read_to_taget = df_read_to_taget.astype({ + df_read_to_target = df_read_to_target.astype({ 'start': int, 'end': int, 'read_length': int }) - read_len = df_read_to_taget.groupby(['target'])[['read_length']].mean() - read_len.columns = ['mean_read_length'] + read_len = ( + df_read_to_target[['target', 'read_length']] + .groupby(['target']) + .agg(mean_read_length=('read_length', 'mean')) + ) + if len(read_len) > 0: df = df.merge(read_len, left_on='target', right_index=True) else: df['mean_read_length'] = 0 - kbases = df_read_to_taget.groupby(['target']).sum()[['read_length']] / 1000 + df_read_to_target['align_len'] = ( + df_read_to_target['end'] - df_read_to_target['start'] + ) + + # Kbases is the approximate number of bases mapping to a target. 
+ # Deletions and insertions within the reads will mean the actual value may + # vary slightly + kbases = ( + df_read_to_target[['target', 'align_len']] + .groupby(['target']).sum() / 1000 + ) kbases.columns = ['kbases'] if len(kbases) > 0: df = df.merge(kbases, left_on='target', right_index=True) @@ -144,5 +158,5 @@ def main(args): sample_summary.to_csv('sample_summary.csv') - read_target_summary_table = read_target_summary(df_read_to_taget) + read_target_summary_table = read_target_summary(df_read_to_target) read_target_summary_table.to_csv('read_target_summary.tsv', sep='\t') diff --git a/bin/workflow_glue/check_sample_sheet.py b/bin/workflow_glue/check_sample_sheet.py index d77f1ab..fe4fc37 100755 --- a/bin/workflow_glue/check_sample_sheet.py +++ b/bin/workflow_glue/check_sample_sheet.py @@ -2,6 +2,7 @@ import codecs import csv import os +import re import sys from .util import get_named_logger, wf_parser # noqa: ABS101 @@ -79,6 +80,19 @@ def main(args): sys.stdout.write(f"Parsing error: {e}") sys.exit() + # check barcodes are correct format + for barcode in barcodes: + if not re.match(r'^barcode\d\d+$', barcode): + sys.stdout.write("values in 'barcode' column are incorrect format") + sys.exit() + + # check barcodes are all the same length + first_length = len(barcodes[0]) + for barcode in barcodes[1:]: + if len(barcode) != first_length: + sys.stdout.write("values in 'barcode' column are different lengths") + sys.exit() + # check barcode and alias values are unique if len(barcodes) > len(set(barcodes)): sys.stdout.write("values in 'barcode' column not unique") diff --git a/bin/workflow_glue/report.py b/bin/workflow_glue/report.py index a8b9b62..eba92da 100755 --- a/bin/workflow_glue/report.py +++ b/bin/workflow_glue/report.py @@ -186,7 +186,7 @@ def tiled_coverage_hist(report, tiled_coverage_tsv): Show on-target / off-target coverage at 100bp tiled genome bins. 
""" with report.add_section('Coverage distribution', 'Coverage distribution'): - p('''Theese plots show the on-target / off-target coverage distribution + p('''These plots show the on-target / off-target coverage distribution of genomic regions binned by 100bp. Off-target regions are defined as any region not within 1kb of a target. diff --git a/docs/01_brief_description.md b/docs/01_brief_description.md new file mode 100644 index 0000000..c7fd0e3 --- /dev/null +++ b/docs/01_brief_description.md @@ -0,0 +1 @@ +Summarise the results of Cas9 enrichment sequencing. \ No newline at end of file diff --git a/docs/02_introduction.md b/docs/02_introduction.md new file mode 100644 index 0000000..8e6dffd --- /dev/null +++ b/docs/02_introduction.md @@ -0,0 +1,13 @@ + +The ONT Cas9 sequencing kit allows the enrichment of genomic +regions of interest by amplifying target regions from adapters ligated to Cas9 cleavage sites. +The purpose of this workflow is to assess the effectiveness of such Cas9 enrichment, +but it can be applied to other enrichment approaches. The workflow outputs +help assess the effectiveness of the enrichement strategy and can be used to diagnose issues such as poorly performing probes. + +This workflow can be used for the following: + ++ To obtain simple Cas9 enrichment sequencing summaries. ++ Statistics of the coverage of each target. ++ Plots of coverage across each target. ++ Identification of off-target hot-spots \ No newline at end of file diff --git a/docs/03_compute_requirements.md b/docs/03_compute_requirements.md new file mode 100644 index 0000000..a9d559a --- /dev/null +++ b/docs/03_compute_requirements.md @@ -0,0 +1,13 @@ +Recommended requirements: + ++ CPUs = 16 ++ Memory = 64 GB + +Minimum requirements: + ++ CPUs = 6 ++ Memory = 16 GB + +Approximate run time: Approximately 30 minutes to process a single sample of 100K reads wit the minimum requirements. 
+ +ARM processor support: True diff --git a/docs/04_install_and_run.md b/docs/04_install_and_run.md new file mode 100644 index 0000000..142407e --- /dev/null +++ b/docs/04_install_and_run.md @@ -0,0 +1,31 @@ + +These are instructions to install and run the workflow on command line. You can also access the workflow via the [EPI2ME application](https://labs.epi2me.io/downloads/). + +The workflow uses [Nextflow](https://www.nextflow.io/) to manage compute and software resources, therefore nextflow will need to be installed before attempting to run the workflow. + +The workflow can currently be run using either [Docker](https://www.docker.com/products/docker-desktop) or +[Singularity](https://docs.sylabs.io/guides/3.0/user-guide/index.html) to provide isolation of +the required software. Both methods are automated out-of-the-box provided +either docker or singularity is installed. This is controlled by the [`-profile`](https://www.nextflow.io/docs/latest/config.html#config-profiles) parameter as exemplified below. + +It is not required to clone or download the git repository in order to run the workflow. +More information on running EPI2ME workflows can be found on our [website](https://labs.epi2me.io/wfindex). + +The following command can be used to obtain the workflow. This will pull the repository into the assets folder of nextflow and provide a list of all parameters available for the workflow as well as an example command: + +``` +nextflow run epi2me-labs/wf-cas9 --help +``` +A demo dataset is provided for testing of the workflow. 
It can be downloaded using: +``` +wget https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-cas9/wf-cas9-demo.tar.gz \ + && tar -xvf wf-cas9-demo.tar.gz +``` +The workflow can be run with the demo data using: +``` +nextflow run epi2me-labs/wf-cas9 \ + --fastq wf-cas9-demo/fastq/ \ + --reference_genome wf-cas9-demo/grch38/grch38_chr19_22.fa.gz \ + --targets wf-cas9-demo/targets.bed +``` +For further information about running a workflow on the cmd line see https://labs.epi2me.io/wfquickstart/ \ No newline at end of file diff --git a/docs/05_related_protocols.md b/docs/05_related_protocols.md new file mode 100644 index 0000000..55a9647 --- /dev/null +++ b/docs/05_related_protocols.md @@ -0,0 +1,5 @@ + + +This workflow is designed to take input sequences that have been produced from [Oxford Nanopore Technologies](https://nanoporetech.com/) devices. + +Find related protocols in the [Nanopore community](https://community.nanoporetech.com/docs/). \ No newline at end of file diff --git a/docs/06_inputs.md b/docs/06_inputs.md new file mode 100644 index 0000000..37a5c9b --- /dev/null +++ b/docs/06_inputs.md @@ -0,0 +1,33 @@ +### Input Options + +| Nextflow parameter name | Type | Description | Help | Default | +|--------------------------|------|-------------|------|---------| +| fastq | string | FASTQ files to use in the analysis. | This accepts one of three cases: (i) the path to a single FASTQ file; (ii) the path to a top-level directory containing FASTQ files; (iii) the path to a directory containing one level of sub-directories which in turn contain FASTQ files. In the first and second case, a sample name can be supplied with `--sample`. In the last case, the data is assumed to be multiplexed with the names of the sub-directories as barcodes. In this case, a sample sheet can be provided with `--sample_sheet`. | | +| reference_genome | string | FASTA reference file. | Full path to a FASTA reference genome file conating the target regions of interest. 
| | +| targets | string | A tab-delimited BED file of target regions. | Each row should contain the following fields: chromosome/contig name, start and end of target region, and a name to identify the target. An example row would look like this: `chr19 13204400 13211100 SCA6` | | +| analyse_unclassified | boolean | Analyse unclassified reads from input directory. By default the workflow will not process reads in the unclassified directory. | If selected and if the input is a multiplex directory the workflow will also process the unclassified directory. | False | + + +### Sample Options + +| Nextflow parameter name | Type | Description | Help | Default | +|--------------------------|------|-------------|------|---------| +| sample_sheet | string | A CSV file used to map barcodes to sample aliases. The sample sheet can be provided when the input data is a directory containing sub-directories with FASTQ files. | The sample sheet is a CSV file with, minimally, columns named `barcode` and `alias`. Extra columns are allowed. A `type` column is required for certain workflows and should have the following values; `test_sample`, `positive_control`, `negative_control`, `no_template_control`. | | +| sample | string | A single sample name for non-multiplexed data. Permissible if passing a single .fastq(.gz) file or directory of .fastq(.gz) files. | | | + + +### Output Options + +| Nextflow parameter name | Type | Description | Help | Default | +|--------------------------|------|-------------|------|---------| +| out_dir | string | Directory for output of all workflow results. | | output | +| full_report | boolean | Select this option to write a full report that contains plots giving a graphical representation of coverage at each target region. | In cases where there are many target to visualise, the report loading time can be slow and so it's is recommended to set `full_report` to false in such cases. 
| False | + + +### Miscellaneous Options + +| Nextflow parameter name | Type | Description | Help | Default | +|--------------------------|------|-------------|------|---------| +| disable_ping | boolean | Enable to prevent sending a workflow ping. | | False | + + diff --git a/docs/07_outputs.md b/docs/07_outputs.md new file mode 100644 index 0000000..3720543 --- /dev/null +++ b/docs/07_outputs.md @@ -0,0 +1,11 @@ +Outputs files may be aggregated including information for all samples or provided per sample. Per-sample files will be prefixed with respective aliases and represented below as {{ alias }}. + +| Title | File path | Description | Per sample or aggregated | +|-------|-----------|-------------|--------------------------| +| workflow report | ./wf-cas9-report.html | Report for all samples | aggregated | +| Sample summary table | ./sample_summary.csv | Summary statistics for each sample. | aggregated | +| Target summary table | ./target_summary.csv | Summary statistics for each sample-target combination. | aggregated | +| Per file read stats | ./{{ alias }}/{{ alias }}_per-read-stats.tsv.gz | A TSV with per-file read statistics, including all samples. | per-sample | +| On-target BAM | .//{{ alias }}/{{ alias }}_on_target.bam | BAM file containing alignments that map to one of the given targets. | per-sample | +| On-target BED | .//{{ alias }}/{{ alias }}_on_target.bed | BED file containing summarising alignments that map to one of the given targets. | per-sample | +| On-target FASTQ | .//{{ alias }}/{{ alias }}_on_target.fastq | FASTQ file containing reads that map to one of the given targets. | per-sample | diff --git a/docs/08_pipeline_overview.md b/docs/08_pipeline_overview.md new file mode 100644 index 0000000..d26cea9 --- /dev/null +++ b/docs/08_pipeline_overview.md @@ -0,0 +1,39 @@ + +### 1. Concatenate input files and generate per read stats. 
+ +The [fastcat/bamstats](https://github.com/epi2me-labs/fastcat) tool is used to concatenate multifile samples to be processed by the workflow. +### 2. Align reads to the reference genome. +The reads are then aligned to the reference genome supplied by the user using [minimap2](https://github.com/lh3/minimap2). + +### 3. Generate target coverage data. +This stage of the workflow generates coverage data for each of the targets that are used to make +the plots and tables in the report. + +First, the reference genome is split into consecutive windows of 100bp. The coverage statistics are +calculated across these windows. + +The user must supply a tab-delimited BED file (`targets`) detailing the genomic locations of the targets of interest. +Here is an example file describing two targets. + +``` +chr19 13204400 13211100 SCA6 +chr22 45791500 45799400 SCA10 +``` +The columns are: ++ chromosome ++ start position ++ end position ++ target name + +*Note*: the file does not contain column names. + +With the target locations defined, [bedtools](https://bedtools.readthedocs.io/en/latest/) is used to generate +information regarding the alignment coverage at each of the targets and also of +background reads (see section 4) per strand. + +### 4. Background identification +In a sequencing enrichment experiment, it can be useful to know if there are any off-target genomic regions (regions that are not defined in the `targets` file) that are being preferentially encountered. This information can be used to inform primer design. + +We define off-target regions here as any region not within 1kb of a defined target. +Hot-spots are further defined as contiguous regions of off-target alignments containing at least 10 reads. 
+ diff --git a/docs/09_troubleshooting.md b/docs/09_troubleshooting.md new file mode 100644 index 0000000..e8ee712 --- /dev/null +++ b/docs/09_troubleshooting.md @@ -0,0 +1,3 @@ + ++ If the workflow fails please run it with the demo data set to ensure the workflow itself is working. This will help us determine if the issue is related to the environment, input parameters or a bug. ++ See how to interpret some common nextflow exit codes [here](https://labs.epi2me.io/trouble-shooting/). \ No newline at end of file diff --git a/docs/10_FAQ.md b/docs/10_FAQ.md new file mode 100644 index 0000000..d1bf184 --- /dev/null +++ b/docs/10_FAQ.md @@ -0,0 +1,3 @@ + + +If your question is not answered here, please report any issues or suggestions on the [github issues](https://github.com/epi2me-labs/wf-cas9/issues) page or start a discussion on the [community](https://community.nanoporetech.com/). \ No newline at end of file diff --git a/docs/11_other.md b/docs/11_other.md new file mode 100644 index 0000000..fa8e298 --- /dev/null +++ b/docs/11_other.md @@ -0,0 +1,2 @@ + +See the [EPI2ME website](https://labs.epi2me.io/) for lots of other resources and blog posts. \ No newline at end of file diff --git a/docs/header.md b/docs/header.md deleted file mode 100644 index 462153d..0000000 --- a/docs/header.md +++ /dev/null @@ -1,4 +0,0 @@ -# wf-cas9 - -wf-cas9 is a [nextflow](https://www.nextflow.io/) workflow -for the multiplexed analysis of Oxford Nanopore Cas9 enrichment sequencing. diff --git a/docs/intro.md b/docs/intro.md deleted file mode 100644 index 46ceed5..0000000 --- a/docs/intro.md +++ /dev/null @@ -1,20 +0,0 @@ -## Introduction -The ONT Cas9 sequencing kit allows the enrichment of genomic -regions of interest by amplifying target regions from adapters ligated to Cas9 cleavage sites. -The purpose of this workflow is to assess the effectiveness of such Cas9 enrichment, -but it can be applied to other enrichment approaches. 
The workflow outputs -help assess the effectiveness -of the enrichement strategy and can be used to diagnose issues such as poorly performing probes. - -Inputs to the workflow are: a reference genome file, FASTQ reads from enrichment sequencing, -and a BED file detailing the regions of interests (targets). -The main outputs are a report containing summary statistics and plots which give an overview of -the enrichment, and a BAM file with target-overlapping reads. - -The main steps of the workflow are alignemnt of reads to the genome using -[minimap2](https://github.com/lh3/minimap2) and the analaysis -of read-target overlap with [bedtools](https://github.com/arq5x/bedtools2). - - - - diff --git a/docs/links.md b/docs/links.md deleted file mode 100644 index cc5b8b1..0000000 --- a/docs/links.md +++ /dev/null @@ -1,5 +0,0 @@ -## Useful links - -* [nextflow](https://www.nextflow.io/) -* [docker](https://www.docker.com/products/docker-desktop) -* [singularity](https://docs.sylabs.io/guides/3.5/user-guide/introduction.html) \ No newline at end of file diff --git a/docs/quickstart.md b/docs/quickstart.md deleted file mode 100644 index 9ac4cfd..0000000 --- a/docs/quickstart.md +++ /dev/null @@ -1,64 +0,0 @@ -## Quickstart - -The workflow uses [nextflow](https://www.nextflow.io/) to manage compute and -software resources, as such nextflow will need to be installed before attempting -to run the workflow. - -The workflow can currently be run using either -[Docker](https://www.docker.com/products/docker-desktop) or -[singularity](https://docs.sylabs.io/guides/latest/user-guide/) to provide isolation of -the required software. Both methods are automated out-of-the-box provided -either docker or singularity is installed. - -It is not required to clone or download the git repository in order to run the workflow. -For more information on running EPI2ME Labs workflows [visit out website](https://labs.epi2me.io/wfindex). 
- -**Workflow options** - -To obtain the workflow, having installed `nextflow`, users can run: - -``` -nextflow run epi2me-labs/wf-cas9 --help -``` -to see the options for the workflow. - -The main inputs are: -* Folder of FASTQ reads. -* Genome reference file. -* Target BED file with 4 columns: - * chromosome - * start - * end - * target_name - - -To test on a small dataset with two targets and two chromosomes: -first download and unpack the demo data -```shell -wget https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-cas9/wf-cas9-demo.tar.gz \ - && tar -xvf wf-cas9-demo.tar.gz - -```shell -nextflow run epi2me-labs/wf-cas9 \ - --fastq wf-cas9-demo/fastq/ \ - --reference_genome wf-cas9-demo/grch38/grch38_chr19_22.fa.gz \ - --targets wf-cas9-demo/targets.bed -``` - -**Workflow outputs** - -The primary outputs of the workflow include: - -* A folder per sample containing: - * BAM file filtered to contain reads overlapping with targets (*_on_target.bam). - * BED file with alignment information for on-target reads (*on_target.bed). - * BED file containing windowed coverage for each target (*target_cov.bed). - * A simple text file providing a summary of sequencing reads (*.stats). -* sample_summary.csv - read and alignment summary for each sample. -* target_summary.csv - read and alignment summary for reads overlapping each target. -* A combined HTML report detailing the primary findings of the workflow across all samples including: - * Sequencing quality plots. - * Tables summarising targeted sequencing results. - * Plots of stranded coverage at each target. - * Histograms of on and off-target coverage for each sample. - * Off-target hotspot region tables. 
diff --git a/lib/NfcoreSchema.groovy b/lib/NfcoreSchema.groovy index 3b29be1..81fdc2e 100644 --- a/lib/NfcoreSchema.groovy +++ b/lib/NfcoreSchema.groovy @@ -141,7 +141,7 @@ class NfcoreSchema { for (specifiedParam in params.keySet()) { // nextflow params if (nf_params.contains(specifiedParam)) { - log.error "ERROR: You used a core Nextflow option with two hyphens: '--${specifiedParam}'. Please resubmit with '-${specifiedParam}'" + log.error "You used a core Nextflow option with two hyphens: '--${specifiedParam}'. Please resubmit with '-${specifiedParam}'" has_error = true } // unexpected params @@ -180,7 +180,7 @@ class NfcoreSchema { schema.validate(params_json) } catch (ValidationException e) { println '' - log.error 'ERROR: Validation of pipeline parameters failed!' + log.error 'Validation of pipeline parameters failed!' JSONObject exceptionJSON = e.toJSON() HashSet observed_exceptions = [] printExceptions(exceptionJSON, params_json, log, enums, raw_schema, observed_exceptions) diff --git a/lib/ingress.nf b/lib/ingress.nf index 5839730..6ca2d11 100644 --- a/lib/ingress.nf +++ b/lib/ingress.nf @@ -149,13 +149,12 @@ def xam_ingress(Map arguments) def input = get_valid_inputs(margs, xam_extensions) + // check BAM headers to see if any samples are uBAM ch_result = input.dirs | map { meta, path -> [meta, get_target_files_in_dir(path, xam_extensions)] } | mix(input.files) - - ch_is_unaligned = ch_result | checkBamHeaders - | map { meta, is_unaligned_env, mixed_headers_env -> + | map { meta, paths, is_unaligned_env, mixed_headers_env -> // convert the env. variables from strings ('0' or '1') into bools boolean is_unaligned = is_unaligned_env as int as boolean boolean mixed_headers = mixed_headers_env as int as boolean @@ -163,14 +162,11 @@ def xam_ingress(Map arguments) if (mixed_headers) { error "Found mixed headers in (u)BAM files of sample '${meta.alias}'." 
} - [meta, is_unaligned] + // add `is_unaligned` to the metamap (note the use of `+` to create a copy of + // `meta` to avoid modifying every item in the channel; + // https://github.com/nextflow-io/nextflow/issues/2660) + [meta + [is_unaligned: is_unaligned], paths] } - - ch_result = ch_result | join(ch_is_unaligned) - // add `is_unaligned` to the metamap (note the use of `+` to create a copy of `meta` - // to avoid modifying every item in the channel; - // https://github.com/nextflow-io/nextflow/issues/2660) - | map { meta, paths, is_unaligned -> [meta + [is_unaligned: is_unaligned], paths] } | branch { meta, paths -> // set `paths` to `null` for uBAM samples if unallowed (they will be added to // the results channel in shape of `[meta, null]` at the end of the function @@ -240,11 +236,17 @@ process checkBamHeaders { label "ingress" label "wf_common" cpus 1 + memory "2 GB" input: tuple val(meta), path("input_dir/reads*.bam") output: // set the two env variables by `eval`-ing the output of the python script // checking the XAM headers - tuple val(meta), env(IS_UNALIGNED), env(MIXED_HEADERS) + tuple( + val(meta), + path("input_dir/reads*.bam", includeInputs: true), + env(IS_UNALIGNED), + env(MIXED_HEADERS), + ) script: """ workflow-glue check_bam_headers_in_dir input_dir > env.vars @@ -257,6 +259,7 @@ process mergeBams { label "ingress" label "wf_common" cpus 3 + memory "4 GB" input: tuple val(meta), path("input_bams/reads*.bam") output: tuple val(meta), path("reads.bam") shell: @@ -271,6 +274,7 @@ process catSortBams { label "ingress" label "wf_common" cpus 4 + memory "4 GB" input: tuple val(meta), path("input_bams/reads*.bam") output: tuple val(meta), path("reads.bam") script: @@ -285,6 +289,7 @@ process sortBam { label "ingress" label "wf_common" cpus 3 + memory "4 GB" input: tuple val(meta), path("reads.bam") output: tuple val(meta), path("reads.sorted.bam") script: @@ -298,6 +303,7 @@ process bamstats { label "ingress" label "wf_common" cpus 3 + memory "4 GB" 
input: tuple val(meta), path("reads.bam") output: @@ -414,6 +420,7 @@ process move_or_compress_fq_file { label "ingress" label "wf_common" cpus 1 + memory "2 GB" input: // don't stage `input` with a literal because we check the file extension tuple val(meta), path(input) @@ -439,6 +446,7 @@ process fastcat { label "ingress" label "wf_common" cpus 3 + memory "2 GB" input: tuple val(meta), path("input") val extra_args @@ -737,6 +745,7 @@ process validate_sample_sheet { cpus 1 label "ingress" label "wf_common" + memory "2 GB" input: path "sample_sheet.csv" val required_sample_types diff --git a/main.nf b/main.nf index c2bf21e..22a954f 100644 --- a/main.nf +++ b/main.nf @@ -9,6 +9,7 @@ include { fastq_ingress } from './lib/ingress' process getVersions { label "cas9" cpus 1 + memory "2 GB" output: path "versions.txt" script: @@ -22,7 +23,8 @@ process getVersions { process getParams { label "cas9" - cpus 1 + cpus 2 + memory "2 GB" output: path "params.json" script: @@ -35,7 +37,8 @@ process getParams { process make_tiles { label 'cas9' - cpus 1 + cpus 2 + memory "2 GB" input: path "chrom.sizes" path "targets.bed" @@ -55,6 +58,7 @@ process build_index{ */ label "cas9" cpus params.threads + memory "16 GB" input: path "reference" output: @@ -62,7 +66,7 @@ process build_index{ path "chrom.sizes", emit: chrom_sizes script: """ - minimap2 -t $task.cpus -x map-ont -d genome_index.mmi reference + minimap2 -t $task.cpus -I 16G -x map-ont -d genome_index.mmi reference samtools faidx reference cut -f 1,2 reference.fai >> chrom.sizes """ @@ -70,21 +74,22 @@ process build_index{ process align_reads { label "cas9" - cpus params.threads - memory params.minimap2_max_memory + cpus Math.min(params.threads, 20) + memory "16 GB" input: path "genome_index.mmi" - path "reference" + path reference_fasta tuple val(meta), path("reads.fastq") output: path "${meta.alias}_aln_stats.csv", emit: aln_stats tuple val(meta), path("${meta.alias}_fastq_pass.bed"), emit: bed tuple val(meta), 
path("${meta.alias}.bam"), emit: bam script: + def mm2_threads = Math.max(task.cpus - 4, 1) """ - minimap2 -t $task.cpus -ax map-ont "genome_index.mmi" "reads.fastq" | \ - samtools sort -O bam -@ $task.cpus - | tee "${meta.alias}.bam" | \ - bedtools bamtobed -i stdin | sort -k 1,1 -k2,2n --parallel $task.cpus > "${meta.alias}_fastq_pass.bed" + minimap2 -t ${mm2_threads} -K 20M -ax map-ont "${reference_fasta}" "reads.fastq" | \ + samtools sort -m 400M -O bam -@ 2 - | tee "${meta.alias}.bam" | \ + bedtools bamtobed -i stdin | sort -k 1,1 -k2,2n > "${meta.alias}_fastq_pass.bed" # Get a csv with columns: [read_id, alignment_accuracy] samtools index "${meta.alias}.bam" bamstats "${meta.alias}.bam" | \ @@ -107,7 +112,8 @@ process target_coverage { */ label "cas9" - cpus 1 + cpus 3 + memory "2 GB" input: path 'targets.tsv' path 'tiles.tsv' @@ -166,7 +172,8 @@ process target_summary { strand reads */ label "cas9" - cpus 1 + cpus 2 + memory "2 GB" input: path "targets.bed" path "tiles.bed" @@ -209,7 +216,8 @@ process target_summary { process coverage_summary { label "cas9" - cpus 1 + cpus 2 + memory "2 GB" input: path 'targets.bed' tuple val(meta), @@ -250,7 +258,8 @@ process coverage_summary { process background { label "cas9" - cpus 1 + cpus 2 + memory "4 GB" input: path 'targets.tsv' path 'tiles.tsv' @@ -297,7 +306,8 @@ process background { process get_on_target_reads { label "cas9" - cpus 1 + cpus 2 + memory "2 GB" input: tuple val(meta), path("input.fastq"), @@ -317,6 +327,7 @@ process get_on_target_reads { process get_on_target_bams { label "cas9" cpus 1 + memory "2 GB" input: tuple val(meta), path("on_target.bed"), @@ -336,7 +347,7 @@ process get_on_target_bams { process build_tables { label "cas9" - cpus 1 + cpus 2 input: path 'read_to_target.tsv' path 'aln_summary.tsv' @@ -356,7 +367,8 @@ process build_tables { process makeReport { label "cas9" - cpus 1 + cpus 2 + memory "4 GB" input: path "versions/*" path "params.json" @@ -390,6 +402,8 @@ process makeReport { 
process combine_stats { label "cas9" + cpus 2 + memory "2 GB" input: tuple val(meta), path('stats.tsv.gz') @@ -404,7 +418,8 @@ process combine_stats { process pack_files_into_sample_dirs { label "cas9" - cpus 1 + cpus 2 + memory "2 GB" input: tuple val(meta), path(sample_files) @@ -424,7 +439,8 @@ process pack_files_into_sample_dirs { process output { // publish inputs to output directory label "cas9" - cpus 1 + cpus 2 + memory "4 GB" publishDir "${params.out_dir}", mode: 'copy', pattern: "*" input: path fname @@ -542,7 +558,7 @@ workflow { ref_genome = file(params.reference_genome, type: "file") if (!ref_genome.exists()) { - error "--ref_genome: File doesn't exist, check path." + error "--reference_genome: File doesn't exist, check path." } targets = file(params.targets, type: "file") if (!targets.exists()) { diff --git a/nextflow.config b/nextflow.config index e540408..4b90d58 100644 --- a/nextflow.config +++ b/nextflow.config @@ -18,7 +18,6 @@ params { reference_genome = null targets = null threads = 8 - minimap2_max_memory = "4 GB" out_dir = "output" full_report = false sample = null @@ -42,7 +41,7 @@ params { "--targets 'wf-cas9-demo/targets.bed'" ] - container_sha = "shac399dc15598ec278919fc045a646fe325f2b608b" + container_sha = "shaa7b95018145dc9c753d6092309ac6be5166a491a" common_sha = "sha91452ece4f647f62b32dac3a614635a6f0d7f8b5" } } @@ -63,13 +62,6 @@ epi2melabs { icon = "faBullseye" } -executor { - $local { - cpus = 8 - memory = "16 GB" - } -} - env { PYTHONNOUSERSITE = 1 } @@ -122,10 +114,10 @@ profiles { queue = "${params.aws_queue}" memory = '8G' withLabel:cas9 { - container = "${params.aws_image_prefix}-wf-cas9:${params.wf.container_sha}-root" + container = "${params.aws_image_prefix}-wf-cas9:${params.wf.container_sha}" } withLabel:wf_common { - container = "${params.aws_image_prefix}-wf-common:${params.wf.common_sha}-root" + container = "${params.aws_image_prefix}-wf-common:${params.wf.common_sha}" } shell = ['/bin/bash', '-euo', 'pipefail'] } diff 
--git a/nextflow_schema.json b/nextflow_schema.json index d9addbc..192d9c7 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -2,6 +2,7 @@ "$schema": "http://json-schema.org/draft-07/schema", "$id": "https://raw.githubusercontent.com/./master/nextflow_schema.json", "title": "epi2me-labs/wf-cas9", + "workflow_title": "Cas9 enrichment workflow", "description": "Summarise the results of Cas9 enrichment sequencing.", "demo_url": "https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-cas9/wf-cas9-demo.tar.gz", "aws_demo_url": "https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-cas9/wf-cas9-demo/aws.nextflow.config", @@ -96,13 +97,6 @@ "description": "Number of CPU threads to use per workflow task.", "help_text": "The total CPU resource used by the workflow is constrained by the executor configuration.", "default": 8 - }, - "minimap2_max_memory": { - "type": "string", - "title": "Minimap2 maximum memory", - "default": "4 GB", - "description": "Maximum allowed memory usage for the minimap2 stage,", - "help_text": "The total memory resource used by the workflow is constrained by the executor configuration" } } }, @@ -171,8 +165,16 @@ "type": "boolean" } }, - "docs": { - "intro": "## Introduction\nThe ONT Cas9 sequencing kit allows the enrichment of genomic\nregions of interest by amplifying target regions from adapters ligated to Cas9 cleavage sites.\nThe purpose of this workflow is to assess the effectiveness of such Cas9 enrichment, \nbut it can be applied to other enrichment approaches. 
The workflow outputs\nhelp assess the effectiveness \nof the enrichement strategy and can be used to diagnose issues such as poorly performing probes.\n\nInputs to the workflow are: a reference genome file, FASTQ reads from enrichment sequencing,\nand a BED file detailing the regions of interests (targets).\nThe main outputs are a report containing summary statistics and plots which give an overview of \nthe enrichment, and a BAM file with target-overlapping reads.\n\nThe main steps of the workflow are alignemnt of reads to the genome using \n[minimap2](https://github.com/lh3/minimap2) and the analaysis\nof read-target overlap with [bedtools](https://github.com/arq5x/bedtools2).\n\n\n\n\n", - "links": "## Useful links\n\n* [nextflow](https://www.nextflow.io/)\n* [docker](https://www.docker.com/products/docker-desktop)\n* [singularity](https://docs.sylabs.io/guides/3.5/user-guide/introduction.html)" + "resources": { + "recommended": { + "cpus": 16, + "memory": "64 GB" + }, + "minimum": { + "cpus": 6, + "memory": "16 GB" + }, + "run_time": "Approximately 30 minutes to process a single sample of 100K reads with the minimum requirements.", + "arm_support": true } } \ No newline at end of file diff --git a/output_definition.json b/output_definition.json new file mode 100644 index 0000000..7bc5041 --- /dev/null +++ b/output_definition.json @@ -0,0 +1,60 @@ +{ + "files": { + "workflow-report": { + "filepath": "./wf-cas9-report.html", + "title": "workflow report", + "description": "Report for all samples", + "mime-type": "text/html", + "optional": false, + "type": "aggregated" + }, + "sample-summary": { + "filepath": "./sample_summary.csv", + "title": "Sample summary table", + "description": "Summary statistics for each sample.", + "mime-type": "text/tab-separated-values", + "optional": false, + "type": "aggregated" + }, + "target-summary": { + "filepath": "./target_summary.csv", + "title": "Target summary table", + "description": "Summary statistics for each sample-target 
combination.", + "mime-type": "text/tab-separated-values", + "optional": false, + "type": "aggregated" + }, + "read-stats-per-file": { + "filepath": "./{{ alias }}/{{ alias }}_per-read-stats.tsv.gz", + "title": "Per file read stats", + "description": "A TSV with per-file read statistics, including all samples.", + "mime-type": "application/gzip", + "optional": false, + "type": "per-sample" + }, + "on-target-bam": { + "filepath": ".//{{ alias }}/{{ alias }}_on_target.bam", + "title": "On-target BAM", + "description": "BAM file containing alignments that map to one of the given targets.", + "mime-type": "application/gzip", + "optional": false, + "type": "per-sample" + }, + "on-target-bed": { + "filepath": ".//{{ alias }}/{{ alias }}_on_target.bed", + "title": "On-target BED", + "description": "BED file summarising alignments that map to one of the given targets.", + "mime-type": "text/tab-separated-values", + "optional": false, + "type": "per-sample" + }, + "on-target-fastq": { + "filepath": ".//{{ alias }}/{{ alias }}_on_target.fastq", + "title": "On-target FASTQ", + "description": "FASTQ file containing reads that map to one of the given targets.", + "mime-type": "text/plain", + "optional": false, + "type": "per-sample" + } + } +} \ No newline at end of file