From 56e700d6aacdaaafc41ef6d1fe1656d457732e1b Mon Sep 17 00:00:00 2001
From: Fredrik Boulund
Date: Mon, 30 Apr 2018 10:12:44 +0200
Subject: [PATCH] Use pathlib (#31)

* Use pathlib in Snakefile
* Add logdir config param. Get tired because Snakemake doesn't support Path
  objects as input or log files.
* Use pathlib for all paths. Add version printout to Snakefile
* Add info about pathlib use
* Add details on branching structure to CONTRIBUTING.md
* Bump docs version
---
 CONTRIBUTING.md                          | 48 +++++++++++++----
 Snakefile                                | 27 +++++++---
 config.yaml                              |  1 +
 docs/source/conf.py                      |  4 +-
 rules/antibiotic_resistance/megares.smk  | 66 +++++++++++++-----------
 rules/mappers/bbmap.smk                  | 54 +++++++++----------
 rules/mappers/bowtie2.smk                | 56 ++++++++++----------
 rules/preproc/bbcountunique.smk          | 15 +++---
 rules/preproc/read_quality.smk           | 33 ++++++------
 rules/preproc/remove_human.smk           | 64 ++++++++++++-----------
 rules/sketch_compare/sketch_compare.smk  | 29 +++++------
 rules/taxonomic_profiling/centrifuge.smk | 25 ++++-----
 rules/taxonomic_profiling/kaiju.smk      | 54 ++++++++++---------
 rules/taxonomic_profiling/metaphlan2.smk | 62 ++++++++++++----------
 14 files changed, 299 insertions(+), 239 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 820e338..283d6dc 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -5,18 +5,41 @@ We use the issue tracker in Github. Submit issues for things such as bug
 reports, feature requests, or general improvement discussion topics.
 
 ## Submitting changes
-The typical procedure to develop new features or fix bugs in StaG-mwc looks
-something like this:
+The master branch of StaG-mwc should always be stable and reliable. All
+development should be based on the develop branch. Please create new feature
+branches from the develop branch. The develop branch is then merged into the
+master branch when enough improvements have accrued. The typical procedure to
+develop new features or fix bugs in StaG-mwc looks something like this:
 
 1. Fork or clone the repository.
-2. Create a branch with a descriptive name based on your intended changes using
-   dashes to separate words, e.g. `branch-to-add-megahit-assembly-step`.
-3. Insert your code into the respective folders, i.e. scripts, rules and envs.
-   Define the entry point of the workflow in the Snakefile and the main
-   configuration in the config.yaml file.
+2. Check out the develop branch and create a new feature branch from there.
+   Use a descriptive name and use dashes to separate words:
+   ```
+   git checkout develop
+   git checkout -b add-megahit-assembly-step
+   ```
+3. Write or modify code in the scripts, rules and envs folders. Define the
+   entry point of the workflow in the Snakefile and the main configuration in
+   the config.yaml file.
+4. If a new feature has been added, document it in the Sphinx documentation.
-4. Commit changes to your fork/clone.
+5. Commit changes to your fork/clone.
-5. Create a pull request (PR) with some motivation behind the work you have
-   done and possibly some explanations for tricky bits.
+6. Create a pull request (PR) with a description of the work you have done
+   and explanations for any tricky bits.
+7. When the feature is considered complete, we bump the version number and
+   merge the PR back to the develop branch.
+
+### Releases
+New releases are made whenever enough new features have accrued on the develop
+branch.
+Before creating a new release, ensure the following things have been
+taken care of:
+
+* All pending features that should be included in the upcoming release are
+  merged into the develop branch.
+* Double-check that documentation is up-to-date for implemented features.
+* Check that the version number in the documentation matches the Snakefile.
+
+Then, merge the develop branch into master, squashing all commits, and tag
+the new release.
 
 ## Code organization
@@ -76,7 +99,12 @@ designed to allow some inclusion logic in the main Snakefile, so components
 can be turned on or off without too much trouble. Output should typically be
 in a subfolder inside the overall `outdir` folder. `outdir` is available as a
 string in all rule files, as it is defined in the main Snakefile based on the value
-set in `config.yaml`.
+set in `config.yaml`.
+
+Declare paths to input, output and log files using the pathlib Path objects
+INPUTDIR, OUTDIR, and LOGDIR. Note that Snakemake is not yet fully pathlib
+compatible, so convert Path objects to strings inside `expand` statements and
+log file declarations.
 
 Tools that require databases or other reference material to work can be
 confusing or annoying to users of the workflow. To minimize the amount of
diff --git a/Snakefile b/Snakefile
index 0674ef1..a4ade3f 100644
--- a/Snakefile
+++ b/Snakefile
@@ -1,21 +1,32 @@
 # vim: syntax=python expandtab
 #
-# StaG
-# mwc - Metagenomic Workflow Collaboration
+# StaG Metagenomic Workflow Collaboration
+# StaG-mwc
 #
 # Copyright (c) 2018 Authors
 #
-# Running snakemake -n in a clone of this repository should successfully
-# execute a test dry-run of the workflow.
+# Running snakemake --use-conda -n in a clone of this repository should
+# successfully execute a test dry run of the workflow.
+from pathlib import Path
+
 from snakemake.exceptions import WorkflowError
+from snakemake.utils import min_version
+min_version("4.8.1") # TODO: Bump version when Snakemake is pathlib compatible
-from sys import exit
-import os.path
+stag_version = "0.1.1-dev"
+print("="*60)
+print("StaG Metagenomic Workflow Collaboration".center(60))
+print("StaG-mwc".center(60))
+print(stag_version.center(60))
+print("="*60)
 
 configfile: "config.yaml"
-outdir = config["outdir"]
+INPUTDIR = Path(config["inputdir"])
+OUTDIR = Path(config["outdir"])
+LOGDIR = Path(config["logdir"])
+DBDIR = Path(config["dbdir"])
 all_outputs = []
-SAMPLES = set(glob_wildcards(config["inputdir"]+"/"+config["input_fn_pattern"]).sample)
+SAMPLES = set(glob_wildcards(INPUTDIR/config["input_fn_pattern"]).sample)
 
 #############################
 # Pre-processing
diff --git a/config.yaml b/config.yaml
index fb7ec59..fd3a8a2 100644
--- a/config.yaml
+++ b/config.yaml
@@ -15,6 +15,7 @@ inputdir: "input"
 input_fn_pattern: "{sample}_R{readpair}.fastq.gz"
 
 outdir: "output_dir"
+logdir: "output_dir/logs"
 dbdir: "databases"  # Databases will be downloaded to this dir, if requested
 
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 9798d9b..f187dad 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -56,9 +56,9 @@
 # built documents.
 #
 # The short X.Y version.
-version = '0.1.0'
+version = '0.1.1'
 # The full version, including alpha/beta/rc tags.
-release = '0.1.0-dev'
+release = '0.1.1-dev'
 
 # reStructuredText prolog contains a string of reStructuredText that will be
 # included at the beginning of every source file that is read.
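The pathlib note in CONTRIBUTING.md above summarizes the convention that all the rule files in this patch follow: build paths with the Path objects defined in the Snakefile, and fall back to strings where Snakemake does not yet accept Path objects. A minimal sketch of a rule written in this style (the rule name and file names are illustrative, not part of the patch):

```
# Assumes the Snakefile-scope objects introduced by this patch:
# INPUTDIR = Path(config["inputdir"]), OUTDIR = Path(config["outdir"]),
# LOGDIR = Path(config["logdir"]), and the SAMPLES set.

# expand() needs a string pattern, so the Path is wrapped in str():
example_outputs = expand(str(OUTDIR/"example/{sample}.txt"), sample=SAMPLES)

rule example:
    input:
        INPUTDIR/"{sample}.fastq.gz"        # Path objects work as input
    output:
        OUTDIR/"example/{sample}.txt"       # and as output
    log:
        str(LOGDIR/"example/{sample}.log")  # log files must be declared as strings
    shell:
        "cp {input} {output} 2> {log}"
```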
diff --git a/rules/antibiotic_resistance/megares.smk b/rules/antibiotic_resistance/megares.smk index fcf781b..f00d267 100644 --- a/rules/antibiotic_resistance/megares.smk +++ b/rules/antibiotic_resistance/megares.smk @@ -1,19 +1,20 @@ # Generic rules for detection of antibiotic resistance genes using MEGARes +# TODO: Remove superfluous str conversions when Snakemake is pathlib compatible. +from pathlib import Path from snakemake.exceptions import WorkflowError -import os.path localrules: download_megares -if not os.path.isdir(os.path.join(config["megares"]["db_path"], "ref")): - err_message = "No MEGARes database found at: '{}'!\n".format(config["megares"]["db_path"]) +megares_db_path = Path(config["megares"]["db_path"]) +if not Path(megares_db_path/"ref").exists(): + err_message = "No MEGARes database found at: '{}'!\n".format(megares_db_path) err_message += "Specify the DB path in the megares section of config.yaml.\n" - err_message += "Run 'snakemake create_megares_index' to download and build a BBMap index in '{dbdir}/megares'\n".format(dbdir=config["dbdir"]) + err_message += "Run 'snakemake create_megares_index' to download and build a BBMap index in '{dbdir}'\n".format(dbdir=DBDIR/"megares") err_message += "If you do not want to map reads against MEGARes for antibiotic resistance gene detection, set antibiotic_resistance: False in config.yaml" raise WorkflowError(err_message) -megares_outputs = expand("{outdir}/megares/{sample}.{output_type}", - outdir=outdir, +megares_outputs = expand(str(OUTDIR/"megares/{sample}.{output_type}"), sample=SAMPLES, output_type=("sam.gz", "mapped_reads.fq.gz", "mhist.txt", "covstats.txt", "rpkm.txt")) all_outputs.extend(megares_outputs) @@ -21,46 +22,53 @@ all_outputs.extend(megares_outputs) rule download_megares: """Download MEGARes database.""" output: - config["dbdir"]+"/megares/megares_annotations_v1.01.csv", - config["dbdir"]+"/megares/megares_database_v1.01.fasta", - config["dbdir"]+"/megares/megares_to_external_header_mappings_v1.01.tsv", + DBDIR/"megares/megares_annotations_v1.01.csv", + DBDIR/"megares/megares_database_v1.01.fasta", + DBDIR/"megares/megares_to_external_header_mappings_v1.01.tsv", + log: + str(LOGDIR/"megares/megares.download.log") shadow: "shallow" params: - dbdir=config["dbdir"]+"/megares" + dbdir=DBDIR/"megares" shell: """ cd {params.dbdir} wget http://megares.meglab.org/download/megares_v1.01.zip \ + > {log} \ && \ unzip megares_v1.01.zip \ + >> {log} \ && \ mv megares_v1.01/* . 
\ && \ - rm -rfv megares_v1.01 megares_v1.01.zip + rm -rfv megares_v1.01 megares_v1.01.zip \ + >> {log} """ rule create_megares_index: """Create BBMap index for MEGARes.""" input: - fasta=config["dbdir"]+"/megares/megares_database_v1.01.fasta" + fasta=DBDIR/"megares/megares_database_v1.01.fasta" output: - config["dbdir"]+"/megares/ref/genome/1/chr1.chrom.gz", - config["dbdir"]+"/megares/ref/genome/1/info.txt", - config["dbdir"]+"/megares/ref/genome/1/scaffolds.txt.gz", - config["dbdir"]+"/megares/ref/genome/1/summary.txt", - config["dbdir"]+"/megares/ref/index/1/chr1_index_k13_c8_b1.block", - config["dbdir"]+"/megares/ref/index/1/chr1_index_k13_c8_b1.block2.gz", + DBDIR/"megares/ref/genome/1/chr1.chrom.gz", + DBDIR/"megares/ref/genome/1/info.txt", + DBDIR/"megares/ref/genome/1/scaffolds.txt.gz", + DBDIR/"megares/ref/genome/1/summary.txt", + DBDIR/"megares/ref/index/1/chr1_index_k13_c8_b1.block", + DBDIR/"megares/ref/index/1/chr1_index_k13_c8_b1.block2.gz", + log: + str(LOGDIR/"megares/megares.bbmap_index.log") shadow: "shallow" conda: "../../envs/stag-mwc.yaml" params: - dbdir=config["dbdir"]+"/megares" + dbdir=DBDIR/"megares" shell: """ - bbmap.sh ref={input} path={params.dbdir} + bbmap.sh ref={input} path={params.dbdir} > {log} """ @@ -68,17 +76,17 @@ megares_config = config["megares"] rule bbmap_to_megares: """BBMap to MEGARes.""" input: - read1=config["outdir"]+"/filtered_human/{sample}_R1.filtered_human.fq.gz", - read2=config["outdir"]+"/filtered_human/{sample}_R2.filtered_human.fq.gz", + read1=OUTDIR/"filtered_human/{sample}_R1.filtered_human.fq.gz", + read2=OUTDIR/"filtered_human/{sample}_R2.filtered_human.fq.gz", output: - sam=config["outdir"]+"/megares/{sample}.sam.gz", - mapped_reads=config["outdir"]+"/megares/{sample}.mapped_reads.fq.gz", - covstats=config["outdir"]+"/megares/{sample}.covstats.txt", - rpkm=config["outdir"]+"/megares/{sample}.rpkm.txt", - mhist=config["outdir"]+"/megares/{sample}.mhist.txt", + sam=OUTDIR/"megares/{sample}.sam.gz", + mapped_reads=OUTDIR/"megares/{sample}.mapped_reads.fq.gz", + covstats=OUTDIR/"megares/{sample}.covstats.txt", + rpkm=OUTDIR/"megares/{sample}.rpkm.txt", + mhist=OUTDIR/"megares/{sample}.mhist.txt", log: - stdout=config["outdir"]+"/logs/megares/{sample}.bbmap.stdout.log", - stderr=config["outdir"]+"/logs/megares/{sample}.bbmap.statsfile.txt" + stdout=str(LOGDIR/"megares/{sample}.bbmap.stdout.log"), + stderr=str(LOGDIR/"megares/{sample}.bbmap.statsfile.txt"), shadow: "shallow" conda: diff --git a/rules/mappers/bbmap.smk b/rules/mappers/bbmap.smk index ded95ca..c126197 100644 --- a/rules/mappers/bbmap.smk +++ b/rules/mappers/bbmap.smk @@ -1,43 +1,43 @@ # Rules for generic read mapping using BBMap +# TODO: Remove superfluous str conversions when Snakemake is pathlib compatible. +from pathlib import Path + from snakemake.exceptions import WorkflowError -import os.path localrules: bbmap_counts_table bbmap_featureCounts -if not os.path.isdir(os.path.join(config["bbmap"]["db_path"], "ref")): - err_message = "BBMap index not found at: '{}'\n".format(config["bbmap"]["db_path"]) +db_path = Path(config["bbmap"]["db_path"]) +if not Path(db_path/"ref").exists(): + err_message = "BBMap index not found at: '{}'\n".format(db_path) err_message += "Check path in config setting 'bbmap:db_path'.\n" err_message += "If you want to skip mapping with BBMap, set mappers:bbmap:False in config.yaml." raise WorkflowError(err_message) # Add final output files from this module to 'all_outputs' from the main # Snakefile scope. 
SAMPLES is also from the main Snakefile scope. -bbmap_alignments = expand("{outdir}/bbmap/{db_name}/{sample}.{output_type}", - outdir=config["outdir"], +bbmap_alignments = expand(str(OUTDIR/"bbmap/{db_name}/{sample}.{output_type}"), db_name=config["bbmap"]["db_name"], sample=SAMPLES, output_type=("sam.gz", "covstats.txt", "rpkm.txt")) -counts_table = expand("{outdir}/bbmap/{db_name}/all_samples.counts_table.tab", - outdir=config["outdir"], +counts_table = expand(str(OUTDIR/"bbmap/{db_name}/all_samples.counts_table.tab"), db_name=config["bbmap"]["db_name"], sample=SAMPLES) -featureCounts = expand("{outdir}/bbmap/{db_name}/all_samples.featureCounts{output_type}", - outdir=config["outdir"], +featureCounts = expand(str(OUTDIR/"bbmap/{db_name}/all_samples.featureCounts{output_type}"), db_name=config["bbmap"]["db_name"], sample=SAMPLES, output_type=["", ".summary", ".table.tsv"]) all_outputs.extend(bbmap_alignments) if config["bbmap"]["counts_table"]["annotations"]: - if not os.path.isfile(config["bbmap"]["counts_table"]["annotations"]): + if not Path(config["bbmap"]["counts_table"]["annotations"]).exists(): err_message = "BBMap counts table annotations not found at: '{}'\n".format(config["bbmap"]["counts_table"]["annotations"]) err_message += "Check path in config setting 'bbmap:counts_table:annotations'.\n" err_message += "If you want to skip read counts summary for BBMap, set bbmap:counts_table:annotations to '' in config.yaml." raise WorkflowError(err_message) all_outputs.extend(counts_table) if config["bbmap"]["featureCounts"]["annotations"]: - if not os.path.isfile(config["bbmap"]["featureCounts"]["annotations"]): + if not Path(config["bbmap"]["featureCounts"]["annotations"]).exists(): err_message = "BBMap featureCounts annotations not found at: '{}'\n".format(config["bbmap"]["featureCounts"]["annotations"]) err_message += "Check path in config setting 'bbmap:featureCounts:annotations'.\n" err_message += "If you want to skip mapping with BBMap, set mappers:bbmap:False in config.yaml." 
@@ -45,19 +45,19 @@ if config["bbmap"]["featureCounts"]["annotations"]: all_outputs.extend(featureCounts) bbmap_config = config["bbmap"] -bbmap_output_folder = config["outdir"]+"/bbmap/{db_name}/".format(db_name=bbmap_config["db_name"]) +bbmap_output_folder = OUTDIR/"bbmap/{db_name}".format(db_name=bbmap_config["db_name"]) rule bbmap: """BBMap""" input: - read1=config["outdir"]+"/filtered_human/{sample}_R1.filtered_human.fq.gz", - read2=config["outdir"]+"/filtered_human/{sample}_R2.filtered_human.fq.gz", + read1=OUTDIR/"filtered_human/{sample}_R1.filtered_human.fq.gz", + read2=OUTDIR/"filtered_human/{sample}_R2.filtered_human.fq.gz", output: - sam=bbmap_output_folder+"{sample}.sam.gz", - covstats=bbmap_output_folder+"{sample}.covstats.txt", - rpkm=bbmap_output_folder+"{sample}.rpkm.txt", + sam=bbmap_output_folder/"{sample}.sam.gz", + covstats=bbmap_output_folder/"{sample}.covstats.txt", + rpkm=bbmap_output_folder/"{sample}.rpkm.txt", log: - stdout=config["outdir"]+"/logs/bbmap/{sample}.bbmap.stdout.log", - stderr=config["outdir"]+"/logs/bbmap/{sample}.bbmap.statsfile.txt" + stdout=str(LOGDIR/"bbmap/{sample}.bbmap.stdout.log"), + stderr=str(LOGDIR/"bbmap/{sample}.bbmap.statsfile.txt"), shadow: "shallow" conda: @@ -87,13 +87,13 @@ rule bbmap: rule bbmap_counts_table: input: - rpkms=expand(config["outdir"]+"/bbmap/{dbname}/{sample}.rpkm.txt", + rpkms=expand(str(OUTDIR/"bbmap/{dbname}/{sample}.rpkm.txt"), dbname=bbmap_config["db_name"], sample=SAMPLES) output: - counts=config["outdir"]+"/bbmap/{dbname}/all_samples.counts_table.tab".format(dbname=bbmap_config["db_name"]), + counts=OUTDIR/"bbmap/{dbname}/all_samples.counts_table.tab".format(dbname=bbmap_config["db_name"]), log: - config["outdir"]+"/logs/bbmap/{dbname}/all_samples.counts_table.log".format(dbname=bbmap_config["db_name"]) + str(LOGDIR/"bbmap/{dbname}/all_samples.counts_table.log".format(dbname=bbmap_config["db_name"])) shadow: "shallow" conda: @@ -115,15 +115,15 @@ rule bbmap_counts_table: fc_config = bbmap_config["featureCounts"] rule bbmap_featureCounts: input: - bams=expand(config["outdir"]+"/bbmap/{dbname}/{sample}.sam.gz", + bams=expand(str(OUTDIR/"bbmap/{dbname}/{sample}.sam.gz"), dbname=bbmap_config["db_name"], sample=SAMPLES) output: - counts=config["outdir"]+"/bbmap/{dbname}/all_samples.featureCounts".format(dbname=bbmap_config["db_name"]), - counts_table=config["outdir"]+"/bbmap/{dbname}/all_samples.featureCounts.table.tsv".format(dbname=bbmap_config["db_name"]), - summary=config["outdir"]+"/bbmap/{dbname}/all_samples.featureCounts.summary".format(dbname=bbmap_config["db_name"]), + counts=OUTDIR/"bbmap/{dbname}/all_samples.featureCounts".format(dbname=bbmap_config["db_name"]), + counts_table=OUTDIR/"bbmap/{dbname}/all_samples.featureCounts.table.tsv".format(dbname=bbmap_config["db_name"]), + summary=OUTDIR/"bbmap/{dbname}/all_samples.featureCounts.summary".format(dbname=bbmap_config["db_name"]), log: - config["outdir"]+"/logs/bbmap/{dbname}/all_samples.featureCounts.log".format(dbname=bbmap_config["db_name"]) + str(LOGDIR/"bbmap/{dbname}/all_samples.featureCounts.log".format(dbname=bbmap_config["db_name"])) shadow: "shallow" conda: diff --git a/rules/mappers/bowtie2.smk b/rules/mappers/bowtie2.smk index b671e97..d7667e6 100644 --- a/rules/mappers/bowtie2.smk +++ b/rules/mappers/bowtie2.smk @@ -1,50 +1,48 @@ # Generic rules for alignment of reads to a reference database using Bowtie2 +# TODO: Remove superfluous str conversions when Snakemake is pathlib compatible. 
+from pathlib import Path
+
 from snakemake.exceptions import WorkflowError
-import os.path
 
 localrules:
     bowtie2_counts_table
     bowtie2_featureCounts
 
 bt2_db_extensions = (".1.bt2", ".1.bt2l")
-if not any([os.path.isfile(config["bowtie2"]["db_prefix"]+ext) for ext in bt2_db_extensions]):
+if not any([Path(config["bowtie2"]["db_prefix"] + ext).exists() for ext in bt2_db_extensions]):
     err_message = "Bowtie2 index not found at: '{}'\n".format(config["bowtie2"]["db_prefix"])
     err_message += "Check path in config setting 'bowtie2:db_prefix'.\n"
     err_message += "If you want to skip mapping with bowtie2, set mappers:bowtie2:False in config.yaml."
     raise WorkflowError(err_message)
-bt2_db_name = os.path.basename(config["bowtie2"]["db_prefix"])
+bt2_db_name = Path(config["bowtie2"]["db_prefix"]).name
 
 # Add final output files from this module to 'all_outputs' from the main
 # Snakefile scope. SAMPLES is also from the main Snakefile scope.
-bowtie2_alignments = expand("{outdir}/bowtie2/{db_name}/{sample}.bam",
-        outdir=config["outdir"],
+bowtie2_alignments = expand(str(OUTDIR/"bowtie2/{db_name}/{sample}.bam"),
         sample=SAMPLES,
         db_name=bt2_db_name)
-bowtie2_stats = expand("{outdir}/bowtie2/{db_name}/{sample}.{stats}.txt",
-        outdir=config["outdir"],
+bowtie2_stats = expand(str(OUTDIR/"bowtie2/{db_name}/{sample}.{stats}.txt"),
         sample=SAMPLES,
         stats=["covstats", "rpkm"],
         db_name=bt2_db_name)
-counts_table = expand("{outdir}/bowtie2/{db_name}/all_samples.counts_table.tab",
-        outdir=config["outdir"],
+counts_table = expand(str(OUTDIR/"bowtie2/{db_name}/all_samples.counts_table.tab"),
         db_name=bt2_db_name,
         sample=SAMPLES)
-featureCounts = expand("{outdir}/bowtie2/{db_name}/all_samples.featureCounts{output_type}",
-        outdir=config["outdir"],
+featureCounts = expand(str(OUTDIR/"bowtie2/{db_name}/all_samples.featureCounts{output_type}"),
         db_name=bt2_db_name,
         sample=SAMPLES,
         output_type=["", ".summary", ".table.tsv"])
 all_outputs.extend(bowtie2_alignments)
 all_outputs.extend(bowtie2_stats)
 
 if config["bowtie2"]["counts_table"]["annotations"]:
-    if not os.path.isfile(config["bowtie2"]["counts_table"]["annotations"]):
+    if not Path(config["bowtie2"]["counts_table"]["annotations"]).exists():
         err_message = "Bowtie2 counts_table annotations not found at: '{}'\n".format(config["bowtie2"]["counts_table"]["annotations"])
         err_message += "Check path in config setting 'bowtie2:counts_table:annotations'.\n"
         err_message += "If you want to skip the counts table summary for Bowtie2, set bowtie2:counts_table:annotations to '' in config.yaml."
         raise WorkflowError(err_message)
     all_outputs.extend(counts_table)
 if config["bowtie2"]["featureCounts"]["annotations"]:
-    if not os.path.isfile(config["bowtie2"]["featureCounts"]["annotations"]):
+    if not Path(config["bowtie2"]["featureCounts"]["annotations"]).exists():
         err_message = "Bowtie2 featureCounts annotations not found at: '{}'\n".format(config["bowtie2"]["featureCounts"]["annotations"])
         err_message += "Check path in config setting 'bowtie2:featureCounts:annotations'.\n"
         err_message += "If you want to skip mapping with Bowtie2, set mappers:bowtie2:False in config.yaml."
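A side note on the index check at the top of this file: a bare Path object is always truthy, so only an explicit .exists() call actually tests the filesystem, and plain string concatenation is preferable to Path.with_suffix() here because with_suffix() replaces an existing suffix instead of appending one. A quick illustration (the file names are made up):

```
>>> from pathlib import Path
>>> bool(Path("/no/such/index.1.bt2"))  # truthy even though the file is missing
True
>>> Path("genome.v2").with_suffix(".1.bt2")  # replaces ".v2"
PosixPath('genome.1.bt2')
>>> Path("genome.v2" + ".1.bt2")  # concatenation keeps the full prefix
PosixPath('genome.v2.1.bt2')
```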
@@ -54,12 +52,12 @@ if config["bowtie2"]["featureCounts"]["annotations"]: rule bowtie2: """Align reads using Bowtie2.""" input: - sample=[config["outdir"]+"/filtered_human/{sample}_R1.filtered_human.fq.gz", - config["outdir"]+"/filtered_human/{sample}_R2.filtered_human.fq.gz"] + sample=[OUTDIR/"filtered_human/{sample}_R1.filtered_human.fq.gz", + OUTDIR/"filtered_human/{sample}_R2.filtered_human.fq.gz"] output: - config["outdir"]+"/bowtie2/{db_name}/{{sample}}.bam".format(db_name=bt2_db_name) + OUTDIR/"bowtie2/{db_name}/{{sample}}.bam".format(db_name=bt2_db_name) log: - config["outdir"]+"/logs/bowtie2/{db_name}/{{sample}}.log".format(db_name=bt2_db_name) + str(LOGDIR/"bowtie2/{db_name}/{{sample}}.log".format(db_name=bt2_db_name)) params: index=config["bowtie2"]["db_prefix"], extra=config["bowtie2"]["extra"], @@ -72,12 +70,12 @@ rule bowtie2: rule bowtie2_mapping_stats: """Summarize bowtie2 mapping statistics.""" input: - bam=config["outdir"]+"/bowtie2/{dbname}/{{sample}}.bam".format(dbname=bt2_db_name) + bam=OUTDIR/"bowtie2/{dbname}/{{sample}}.bam".format(dbname=bt2_db_name) output: - covstats=config["outdir"]+"/bowtie2/{dbname}/{{sample}}.covstats.txt".format(dbname=bt2_db_name), - rpkm=config["outdir"]+"/bowtie2/{dbname}/{{sample}}.rpkm.txt".format(dbname=bt2_db_name) + covstats=OUTDIR/"bowtie2/{dbname}/{{sample}}.covstats.txt".format(dbname=bt2_db_name), + rpkm=OUTDIR/"bowtie2/{dbname}/{{sample}}.rpkm.txt".format(dbname=bt2_db_name) log: - config["outdir"]+"/logs/bowtie2/{dbname}/{{sample}}.pileup.log".format(dbname=bt2_db_name) + str(LOGDIR/"bowtie2/{dbname}/{{sample}}.pileup.log".format(dbname=bt2_db_name)) shadow: "shallow" conda: @@ -93,13 +91,13 @@ rule bowtie2_mapping_stats: rule bowtie2_counts_table: input: - rpkms=expand(config["outdir"]+"/bowtie2/{dbname}/{sample}.rpkm.txt", + rpkms=expand(str(OUTDIR/"bowtie2/{dbname}/{sample}.rpkm.txt"), dbname=bt2_db_name, sample=SAMPLES) output: - counts=config["outdir"]+"/bowtie2/{dbname}/all_samples.counts_table.tab".format(dbname=bt2_db_name), + counts=OUTDIR/"bowtie2/{dbname}/all_samples.counts_table.tab".format(dbname=bt2_db_name), log: - config["outdir"]+"/logs/bowtie2/{dbname}/all_samples.counts_table.log".format(dbname=bt2_db_name) + str(LOGDIR/"bowtie2/{dbname}/all_samples.counts_table.log".format(dbname=bt2_db_name)) shadow: "shallow" conda: @@ -121,15 +119,15 @@ rule bowtie2_counts_table: fc_config = config["bowtie2"]["featureCounts"] rule bowtie2_featureCounts: input: - bams=expand(config["outdir"]+"/bowtie2/{dbname}/{sample}.bam", + bams=expand(str(OUTDIR/"bowtie2/{dbname}/{sample}.bam"), dbname=bt2_db_name, sample=SAMPLES) output: - counts=config["outdir"]+"/bowtie2/{dbname}/all_samples.featureCounts".format(dbname=bt2_db_name), - counts_table=config["outdir"]+"/bowtie2/{dbname}/all_samples.featureCounts.table.tsv".format(dbname=bt2_db_name), - summary=config["outdir"]+"/bowtie2/{dbname}/all_samples.featureCounts.summary".format(dbname=bt2_db_name), + counts=OUTDIR/"bowtie2/{dbname}/all_samples.featureCounts".format(dbname=bt2_db_name), + counts_table=OUTDIR/"bowtie2/{dbname}/all_samples.featureCounts.table.tsv".format(dbname=bt2_db_name), + summary=OUTDIR/"bowtie2/{dbname}/all_samples.featureCounts.summary".format(dbname=bt2_db_name), log: - config["outdir"]+"/logs/bowtie2/{dbname}/all_samples.featureCounts.log".format(dbname=bt2_db_name) + str(LOGDIR/"bowtie2/{dbname}/all_samples.featureCounts.log".format(dbname=bt2_db_name)) shadow: "shallow" conda: diff --git a/rules/preproc/bbcountunique.smk 
b/rules/preproc/bbcountunique.smk index 5f3c8c6..3790623 100644 --- a/rules/preproc/bbcountunique.smk +++ b/rules/preproc/bbcountunique.smk @@ -1,11 +1,10 @@ # vim: syntax=python expandtab # Assess sequencing depth of sample using BBCountUnique from the BBMap suite. -import os.path +# TODO: Remove superfluous str conversions when Snakemake is pathlib compatible. # Add final output files from this module to 'all_outputs' from # the main Snakefile scope. SAMPLES is also from the main Snakefile scope. -bcu_output = expand("{outdir}/bbcountunique/{sample}.{output_type}", - outdir=config["outdir"], +bcu_output = expand(str(OUTDIR/"bbcountunique/{sample}.{output_type}"), sample=SAMPLES, output_type=["bbcountunique.txt", "bbcountunique.pdf"]) all_outputs.extend(bcu_output) @@ -13,13 +12,13 @@ all_outputs.extend(bcu_output) rule bbcountunique: """Assess sequencing depth using BBCountUnique.""" input: - os.path.join(config["inputdir"], config["input_fn_pattern"]).format(sample="{sample}", readpair="1") + INPUTDIR/config["input_fn_pattern"].format(sample="{sample}", readpair="1") output: - txt=config["outdir"]+"/bbcountunique/{sample}.bbcountunique.txt", - pdf=config["outdir"]+"/bbcountunique/{sample}.bbcountunique.pdf", + txt=OUTDIR/"bbcountunique/{sample}.bbcountunique.txt", + pdf=OUTDIR/"bbcountunique/{sample}.bbcountunique.pdf", log: - stdout=config["outdir"]+"/logs/bbcountunique/{sample}.bbcountunique.stdout.log", - stderr=config["outdir"]+"/logs/bbcountunique/{sample}.bbcountunique.stderr.log", + stdout=str(LOGDIR/"bbcountunique/{sample}.bbcountunique.stdout.log"), + stderr=str(LOGDIR/"bbcountunique/{sample}.bbcountunique.stderr.log"), shadow: "shallow" threads: diff --git a/rules/preproc/read_quality.smk b/rules/preproc/read_quality.smk index c306765..d23cfc2 100644 --- a/rules/preproc/read_quality.smk +++ b/rules/preproc/read_quality.smk @@ -1,27 +1,28 @@ # vim: syntax=python expandtab # MWC read pre-processing rules -import os.path +#TODO: Remove superfluous str conversions of paths in expand and log statements +# when Snakemake is pathlib compatible. # Add final output files from this module to 'all_outputs' from # the main Snakefile scope. SAMPLES is also from the main Snakefile scope. 
-fastqc_output = expand("{outdir}/fastqc/{sample}_R{readpair}_fastqc.{ext}",
-        outdir=config["outdir"],
+fastqc_output = expand(str(OUTDIR/"fastqc/{sample}_R{readpair}_fastqc.{ext}"),
         sample=SAMPLES,
-        readpair=[1,2],
+        readpair=[1, 2],
         ext=["zip", "html"])
-trimmed_qa = expand("{outdir}/trimmed_qa/{sample}_R{readpair}.trimmed_qa.fq.gz",
-        outdir=config["outdir"],
+trimmed_qa = expand(str(OUTDIR/"trimmed_qa/{sample}_R{readpair}.trimmed_qa.fq.gz"),
         sample=SAMPLES,
-        readpair=[1,2])
+        readpair=[1, 2])
 all_outputs.extend(fastqc_output)
 all_outputs.extend(trimmed_qa)
 
 rule fastqc:
     input:
-        os.path.join(config["inputdir"], config["input_fn_pattern"])
+        INPUTDIR/config["input_fn_pattern"]
     output:
-        html=config["outdir"]+"/fastqc/{sample}_R{readpair}_fastqc.html",
-        zip=config["outdir"]+"/fastqc/{sample}_R{readpair}_fastqc.zip",
+        html=OUTDIR/"fastqc/{sample}_R{readpair}_fastqc.html",
+        zip=OUTDIR/"fastqc/{sample}_R{readpair}_fastqc.zip",
+    log:
+        str(LOGDIR/"fastqc/{sample}_R{readpair}_fastqc.log")
     shadow:
         "shallow"
     wrapper:
@@ -31,14 +32,14 @@ bbduk_config = config["bbduk"]
 rule trim_adapters_quality:
     input:
-        read1=os.path.join(config["inputdir"], config["input_fn_pattern"]).format(sample="{sample}", readpair="1"),
-        read2=os.path.join(config["inputdir"], config["input_fn_pattern"]).format(sample="{sample}", readpair="2")
+        read1=INPUTDIR/config["input_fn_pattern"].format(sample="{sample}", readpair="1"),
+        read2=INPUTDIR/config["input_fn_pattern"].format(sample="{sample}", readpair="2")
     output:
-        read1=config["outdir"]+"/trimmed_qa/{sample}_R1.trimmed_qa.fq.gz",
-        read2=config["outdir"]+"/trimmed_qa/{sample}_R2.trimmed_qa.fq.gz",
+        read1=OUTDIR/"trimmed_qa/{sample}_R1.trimmed_qa.fq.gz",
+        read2=OUTDIR/"trimmed_qa/{sample}_R2.trimmed_qa.fq.gz",
     log:
-        stdout=config["outdir"]+"/logs/bbduk/{sample}.stdout.log",
-        stderr=config["outdir"]+"/logs/bbduk/{sample}.stderr.log",
+        stdout=str(LOGDIR/"bbduk/{sample}.stdout.log"),
+        stderr=str(LOGDIR/"bbduk/{sample}.stderr.log"),
     shadow:
         "shallow"
     conda:
diff --git a/rules/preproc/remove_human.smk b/rules/preproc/remove_human.smk
index ccf361d..a6ccac4 100644
--- a/rules/preproc/remove_human.smk
+++ b/rules/preproc/remove_human.smk
@@ -1,21 +1,23 @@
 # vim: syntax=python expandtab
 # Rules to filter human sequences from metagenomic reads
+# TODO: Remove superfluous str conversions when Snakemake is pathlib compatible.
+from pathlib import Path
+
 from snakemake.exceptions import WorkflowError
-import os.path
 
 localrules:
     download_hg19
 
-if not os.path.isdir(os.path.join(config["remove_human"]["hg19_path"], "ref")):
-    err_message = "Cannot find hg19 database for human sequence removal at: '{}'!\n".format(config["remove_human"]["hg19_path"])
+hg19_path = Path(config["remove_human"]["hg19_path"])
+if not Path(hg19_path/"ref").exists():
+    err_message = "Cannot find hg19 database for human sequence removal at: '{}'!\n".format(hg19_path)
     err_message += "Specify path to folder containing BBMap index of hg19 files in config.yaml.\n"
-    err_message += "Run 'snakemake index_hg19' to download and create a BBMap index in '{dbdir}/hg19'".format(dbdir=config["dbdir"])
+    err_message += "Run 'snakemake index_hg19' to download and create a BBMap index in '{dbdir}'".format(dbdir=DBDIR/"hg19")
     raise WorkflowError(err_message)
 
 # Add final output files from this module to 'all_outputs' from the main
 # Snakefile scope. SAMPLES is also from the main Snakefile scope.
-filtered_human = expand("{outdir}/filtered_human/{sample}_R{readpair}.filtered_human.fq.gz",
-        outdir=config["outdir"],
+filtered_human = expand(str(OUTDIR/"filtered_human/{sample}_R{readpair}.filtered_human.fq.gz"),
         sample=SAMPLES,
         readpair=[1,2])
 all_outputs.extend(filtered_human)
 
@@ -25,11 +27,11 @@ rule download_hg19:
     """Download masked hg19 from:
     https://drive.google.com/file/d/0B3llHR93L14wd0pSSnFULUlhcUk"""
     output:
-        config["dbdir"]+"/hg19/hg19_main_mask_ribo_animal_allplant_allfungus.fa",
+        DBDIR/"hg19/hg19_main_mask_ribo_animal_allplant_allfungus.fa",
     conda:
         "../../envs/stag-mwc.yaml"
     params:
-        dbdir=config["dbdir"]+"/hg19/"
+        dbdir=DBDIR/"hg19"
     shell:
         """
         scripts/download_from_gdrive.py \
@@ -43,26 +45,26 @@ rule index_hg19:
     """Create BBMap index of hg19 fasta file."""
     input:
-        config["dbdir"]+"/hg19/hg19_main_mask_ribo_animal_allplant_allfungus.fa",
+        DBDIR/"hg19/hg19_main_mask_ribo_animal_allplant_allfungus.fa",
     output:
-        config["dbdir"]+"/hg19/ref/genome/1/chr1.chrom.gz",
-        config["dbdir"]+"/hg19/ref/genome/1/chr2.chrom.gz",
-        config["dbdir"]+"/hg19/ref/genome/1/chr3.chrom.gz",
-        config["dbdir"]+"/hg19/ref/genome/1/chr4.chrom.gz",
-        config["dbdir"]+"/hg19/ref/genome/1/chr5.chrom.gz",
-        config["dbdir"]+"/hg19/ref/genome/1/chr6.chrom.gz",
-        config["dbdir"]+"/hg19/ref/genome/1/chr7.chrom.gz",
-        config["dbdir"]+"/hg19/ref/genome/1/info.txt",
-        config["dbdir"]+"/hg19/ref/genome/1/scaffolds.txt.gz",
-        config["dbdir"]+"/hg19/ref/genome/1/summary.txt",
-        config["dbdir"]+"/hg19/ref/index/1/chr1-3_index_k13_c2_b1.block",
-        config["dbdir"]+"/hg19/ref/index/1/chr1-3_index_k13_c2_b1.block2.gz",
-        config["dbdir"]+"/hg19/ref/index/1/chr4-7_index_k13_c2_b1.block",
-        config["dbdir"]+"/hg19/ref/index/1/chr4-7_index_k13_c2_b1.block2.gz",
+        DBDIR/"hg19/ref/genome/1/chr1.chrom.gz",
+        DBDIR/"hg19/ref/genome/1/chr2.chrom.gz",
+        DBDIR/"hg19/ref/genome/1/chr3.chrom.gz",
+        DBDIR/"hg19/ref/genome/1/chr4.chrom.gz",
+        DBDIR/"hg19/ref/genome/1/chr5.chrom.gz",
+        DBDIR/"hg19/ref/genome/1/chr6.chrom.gz",
+        DBDIR/"hg19/ref/genome/1/chr7.chrom.gz",
+        DBDIR/"hg19/ref/genome/1/info.txt",
+        DBDIR/"hg19/ref/genome/1/scaffolds.txt.gz",
+        DBDIR/"hg19/ref/genome/1/summary.txt",
+        DBDIR/"hg19/ref/index/1/chr1-3_index_k13_c2_b1.block",
+        DBDIR/"hg19/ref/index/1/chr1-3_index_k13_c2_b1.block2.gz",
+        DBDIR/"hg19/ref/index/1/chr4-7_index_k13_c2_b1.block",
+        DBDIR/"hg19/ref/index/1/chr4-7_index_k13_c2_b1.block2.gz",
     conda:
         "../../envs/stag-mwc.yaml"
     params:
-        dbdir=config["dbdir"]+"/hg19/"
+        dbdir=DBDIR/"hg19"
     shell:
         """
         bbmap.sh \
@@ -75,15 +77,15 @@ rh_config = config["remove_human"]
 rule remove_human:
     """Filter reads matching hg19.
     NB: requires about 16GB of memory."""
     input:
-        read1=config["outdir"]+"/trimmed_qa/{sample}_R1.trimmed_qa.fq.gz",
-        read2=config["outdir"]+"/trimmed_qa/{sample}_R2.trimmed_qa.fq.gz",
+        read1=OUTDIR/"trimmed_qa/{sample}_R1.trimmed_qa.fq.gz",
+        read2=OUTDIR/"trimmed_qa/{sample}_R2.trimmed_qa.fq.gz",
     output:
-        read1=config["outdir"]+"/filtered_human/{sample}_R1.filtered_human.fq.gz",
-        read2=config["outdir"]+"/filtered_human/{sample}_R2.filtered_human.fq.gz",
-        human=config["outdir"]+"/filtered_human/{sample}_human.fq.gz",
+        read1=OUTDIR/"filtered_human/{sample}_R1.filtered_human.fq.gz",
+        read2=OUTDIR/"filtered_human/{sample}_R2.filtered_human.fq.gz",
+        human=OUTDIR/"filtered_human/{sample}_human.fq.gz",
     log:
-        statsfile=config["outdir"]+"/logs/remove_human/{sample}.statsfile.txt",
-        stderr=config["outdir"]+"/logs/remove_human/{sample}.stderr.log",
+        statsfile=str(LOGDIR/"remove_human/{sample}.statsfile.txt"),
+        stderr=str(LOGDIR/"remove_human/{sample}.stderr.log"),
     shadow:
         "shallow"
     conda:
diff --git a/rules/sketch_compare/sketch_compare.smk b/rules/sketch_compare/sketch_compare.smk
index fcfb0e4..22f6a9f 100644
--- a/rules/sketch_compare/sketch_compare.smk
+++ b/rules/sketch_compare/sketch_compare.smk
@@ -1,6 +1,6 @@
 # vim: syntax=python expandtab
 # Compare all samples against all samples using MinHash sketches
-import os.path
+# TODO: Remove superfluous str conversions when Snakemake is pathlib compatible.
 
 localrules:
     compare_sketches
@@ -9,20 +9,19 @@
 # Add final output files from this module to 'all_outputs' from the
 # main Snakefile scope.
-sample_similarity_plot = expand("{outdir}/sketch_compare/sample_similarity.pdf",
-        outdir=outdir)
-all_outputs.extend(sample_similarity_plot)
+sample_similarity_plot = str(OUTDIR/"sketch_compare/sample_similarity.pdf")
+all_outputs.append(sample_similarity_plot)
 
 rule sketch:
     """Create MinHash sketches of samples using BBMap's sketch.sh.
Uses only the first readpair of each sample.""" input: - os.path.join(config["inputdir"], config["input_fn_pattern"]).format(sample="{sample}", readpair="1") + INPUTDIR/config["input_fn_pattern"].format(sample="{sample}", readpair="1") output: - sketch=config["outdir"]+"/sketch_compare/{sample}.sketch.gz", + sketch=OUTDIR/"sketch_compare/{sample}.sketch.gz", log: - config["outdir"]+"/logs/sketch_compare/{sample}.sketch.log" + str(LOGDIR/"sketch_compare/{sample}.sketch.log") shadow: "shallow" conda: @@ -42,12 +41,12 @@ rule sketch: rule compare_sketches: """Compare all samples using BBMap's comparesketch.sh""" input: - samples=expand(config["outdir"]+"/sketch_compare/{sample}.sketch.gz", - sample=SAMPLES) + samples=expand(str(OUTDIR/"sketch_compare/{sample}.sketch.gz"), + sample=SAMPLES) output: - alltoall=config["outdir"]+"/sketch_compare/alltoall.txt", + alltoall=OUTDIR/"sketch_compare/alltoall.txt", log: - config["outdir"]+"/logs/sketch_compare/comparesketch.log" + str(LOGDIR/"sketch_compare/comparesketch.log") shadow: "shallow" conda: @@ -66,12 +65,12 @@ rule compare_sketches: rule plot_sample_similarity: """Plot sample sketch similarity matrix""" input: - config["outdir"]+"/sketch_compare/alltoall.txt" + OUTDIR/"sketch_compare/alltoall.txt" output: - config["outdir"]+"/sketch_compare/sample_similarity.pdf" + OUTDIR/"sketch_compare/sample_similarity.pdf" log: - stdout=config["outdir"]+"/logs/sketch_compare/sample_similarity_plot.stdout.log", - stderr=config["outdir"]+"/logs/sketch_compare/sample_similarity_plot.stderr.log", + stdout=str(LOGDIR/"sketch_compare/sample_similarity_plot.stdout.log"), + stderr=str(LOGDIR/"sketch_compare/sample_similarity_plot.stderr.log"), conda: "../../envs/stag-mwc.yaml" shell: diff --git a/rules/taxonomic_profiling/centrifuge.smk b/rules/taxonomic_profiling/centrifuge.smk index 8043938..0f3694b 100644 --- a/rules/taxonomic_profiling/centrifuge.smk +++ b/rules/taxonomic_profiling/centrifuge.smk @@ -1,34 +1,35 @@ # vim: syntax=python expandtab # Taxonomic classification of metagenomic reads using Centrifuge +# TODO: Remove superfluous str conversions when Snakemake is pathlib compatible. +from pathlib import Path + from snakemake.exceptions import WorkflowError -import os.path localrules: download_centrifuge_database centrifuge_db_ext = ".1.cf" -if not os.path.isfile(config["centrifuge"]["db_prefix"]+centrifuge_db_ext): +if not Path(config["centrifuge"]["db_prefix"]).with_suffix(centrifuge_db_ext).exists(): err_message = "No Centrifuge database found at: '{}'!\n".format(config["centrifuge"]["db_prefix"]) err_message += "Specify Centrifuge database prefix in the Centrifuge section of config.yaml.\n" - err_message += "Run 'snakemake download_centrifuge_database' to download a copy into '{dbdir}/centrifuge'\n".format(dbdir=config["dbdir"]) + err_message += "Run 'snakemake download_centrifuge_database' to download a copy into '{dbdir}'\n".format(dbdir=DBDIR/"centrifuge") err_message += "If you do not want to run Centrifuge for taxonomic profiling, set centrifuge: False in config.yaml" raise WorkflowError(err_message) # Add final output files from this module to 'all_outputs' from the main # Snakefile scope. SAMPLES is also from the main Snakefile scope. 
-centrifuge = expand("{outdir}/centrifuge/{sample}.{output_type}.tsv", - outdir=config["outdir"], +centrifuge = expand(str(OUTDIR/"centrifuge/{sample}.{output_type}.tsv"), sample=SAMPLES, output_type=("centrifuge", "centrifuge_report")) all_outputs.extend(centrifuge) rule download_centrifuge_database: output: - db=[config["dbdir"]+"/centrifuge/p+h+v.{n}.cf".format(n=num) for num in (1,2,3)] + db=[DBDIR/"centrifuge/p+h+v.{n}.cf".format(n=num) for num in (1,2,3)] shadow: "shallow" params: - dbdir=config["dbdir"]+"/centrifuge" + dbdir=DBDIR/"centrifuge" shell: """ cd {params.dbdir} @@ -43,13 +44,13 @@ rule download_centrifuge_database: cf_config = config["centrifuge"] rule centrifuge: input: - read1=config["outdir"]+"/filtered_human/{sample}_R1.filtered_human.fq.gz", - read2=config["outdir"]+"/filtered_human/{sample}_R2.filtered_human.fq.gz", + read1=OUTDIR/"filtered_human/{sample}_R1.filtered_human.fq.gz", + read2=OUTDIR/"filtered_human/{sample}_R2.filtered_human.fq.gz", output: - classifications=config["outdir"]+"/centrifuge/{sample}.centrifuge.tsv", - report=config["outdir"]+"/centrifuge/{sample}.centrifuge_report.tsv", + classifications=OUTDIR/"centrifuge/{sample}.centrifuge.tsv", + report=OUTDIR/"centrifuge/{sample}.centrifuge_report.tsv", log: - config["outdir"]+"/logs/centrifuge/{sample}.centrifuge.log" + str(LOGDIR/"centrifuge/{sample}.centrifuge.log") shadow: "shallow" conda: diff --git a/rules/taxonomic_profiling/kaiju.smk b/rules/taxonomic_profiling/kaiju.smk index 8be9339..8299b7a 100644 --- a/rules/taxonomic_profiling/kaiju.smk +++ b/rules/taxonomic_profiling/kaiju.smk @@ -1,40 +1,42 @@ # vim: syntax=python expandtab # Taxonomic classification of metagenomic reads using Kaiju +# TODO: Remove superfluous str conversions when Snakemake is pathlib compatible. +from pathlib import Path + from snakemake.exceptions import WorkflowError -import os.path localrules: download_kaiju_database create_kaiju_krona_plot kaiju_config = config["kaiju"] -if not all([os.path.isfile(kaiju_config["db"]), - os.path.isfile(kaiju_config["nodes"]), - os.path.isfile(kaiju_config["names"])]): +if not all([Path(kaiju_config["db"]).exists(), + Path(kaiju_config["nodes"]).exists(), + Path(kaiju_config["names"]).exists()]): err_message = "No Kaiju database files at: '{}', '{}', '{}'!\n".format(kaiju_config["db"], kaiju_config["nodes"], kaiju_config["names"]) err_message += "Specify relevant paths in the kaiju section of config.yaml.\n" - err_message += "Run 'snakemake download_kaiju_database' to download a copy into '{dbdir}/kaiju'\n".format(dbdir=config["dbdir"]) + err_message += "Run 'snakemake download_kaiju_database' to download a copy into '{dbdir}'\n".format(dbdir=DBDIR/"kaiju") err_message += "If you do not want to run Kaiju for taxonomic profiling, set 'kaiju: False' in config.yaml" raise WorkflowError(err_message) # Add Kaiju output files to 'all_outputs' from the main Snakefile scope. # SAMPLES is also from the main Snakefile scope. 
-kaiju = expand("{outdir}/kaiju/{sample}.kaiju", outdir=outdir, sample=SAMPLES) -kaiju_reports = expand("{outdir}/kaiju/{sample}.kaiju.summary.species", outdir=outdir, sample=SAMPLES) -kaiju_krona = expand("{outdir}/kaiju/all_samples.kaiju.krona.html", outdir=outdir) +kaiju = expand(str(OUTDIR/"kaiju/{sample}.kaiju"), sample=SAMPLES) +kaiju_reports = expand(str(OUTDIR/"kaiju/{sample}.kaiju.summary.species"), sample=SAMPLES) +kaiju_krona = str(OUTDIR/"kaiju/all_samples.kaiju.krona.html") all_outputs.extend(kaiju) all_outputs.extend(kaiju_reports) -all_outputs.extend(kaiju_krona) +all_outputs.append(kaiju_krona) rule download_kaiju_database: output: - db=config["dbdir"]+"/kaiju/kaiju_db.fmi", - names=config["dbdir"]+"/kaiju/names.dmp", - nodes=config["dbdir"]+"/kaiju/nodes.dmp" + db=DBDIR/"kaiju/kaiju_db.fmi", + names=DBDIR/"kaiju/names.dmp", + nodes=DBDIR/"kaiju/nodes.dmp" shadow: "shallow" params: - dbdir=config["dbdir"]+"/kaiju" + dbdir=DBDIR/"kaiju" shell: """ wget http://kaiju.binf.ku.dk/database/kaiju_index_pg.tgz \ @@ -46,10 +48,12 @@ rule download_kaiju_database: rule kaiju: input: - read1=config["outdir"]+"/filtered_human/{sample}_R1.filtered_human.fq.gz", - read2=config["outdir"]+"/filtered_human/{sample}_R2.filtered_human.fq.gz", + read1=OUTDIR/"filtered_human/{sample}_R1.filtered_human.fq.gz", + read2=OUTDIR/"filtered_human/{sample}_R2.filtered_human.fq.gz", output: - kaiju=config["outdir"]+"/kaiju/{sample}.kaiju", + kaiju=OUTDIR/"kaiju/{sample}.kaiju", + log: + str(LOGDIR/"kaiju/{sample}.kaiju.log") shadow: "shallow" threads: @@ -69,7 +73,7 @@ rule kaiju: -f {params.db} \ -i <(gunzip -c {input.read1}) \ -j <(gunzip -c {input.read2}) \ - -o {output.kaiju} + -o {output.kaiju} > {log} else kaiju \ -z {threads} \ @@ -77,19 +81,19 @@ rule kaiju: -f {params.db} \ -i {input.read1} \ -j {input.read2} \ - -o {output.kaiju} + -o {output.kaiju} > {log} fi """ rule kaiju_report: input: - kaiju=config["outdir"]+"/kaiju/{sample}.kaiju", + kaiju=OUTDIR/"kaiju/{sample}.kaiju", output: - krona=config["outdir"]+"/kaiju/{sample}.krona", - family=config["outdir"]+"/kaiju/{sample}.summary.family", - genus=config["outdir"]+"/kaiju/{sample}.kaiju.summary.genus", - species=config["outdir"]+"/kaiju/{sample}.kaiju.summary.species", + krona=OUTDIR/"kaiju/{sample}.krona", + family=OUTDIR/"kaiju/{sample}.summary.family", + genus=OUTDIR/"kaiju/{sample}.kaiju.summary.genus", + species=OUTDIR/"kaiju/{sample}.kaiju.summary.species", shadow: "shallow" params: @@ -130,9 +134,9 @@ rule kaiju_report: rule create_kaiju_krona_plot: input: - expand(config["outdir"]+"/kaiju/{sample}.krona", sample=SAMPLES) + expand(str(OUTDIR/"kaiju/{sample}.krona"), sample=SAMPLES) output: - krona_html=config["outdir"]+"/kaiju/all_samples.kaiju.krona.html", + krona_html=OUTDIR/"kaiju/all_samples.kaiju.krona.html", shadow: "shallow" conda: diff --git a/rules/taxonomic_profiling/metaphlan2.smk b/rules/taxonomic_profiling/metaphlan2.smk index 527f0e5..bc0aa1f 100644 --- a/rules/taxonomic_profiling/metaphlan2.smk +++ b/rules/taxonomic_profiling/metaphlan2.smk @@ -1,7 +1,9 @@ # vim: syntax=python expandtab # Taxonomic classification of metagenomic reads using MetaPhlAn2 +# TODO: Remove superfluous str conversions when Snakemake is pathlib compatible. 
+from pathlib import Path + from snakemake.exceptions import WorkflowError -import os.path localrules: download_metaphlan2_database @@ -9,22 +11,20 @@ localrules: mpa_config = config["metaphlan2"] bt2_db_ext = ".1.bt2" -if not any([os.path.isfile(mpa_config["mpa_pkl"]), - os.path.isfile(mpa_config["bt2_db_prefix"]+bt2_db_ext)]): +if not any([Path(mpa_config["mpa_pkl"]).exists(), + Path(mpa_config["bt2_db_prefix"]).with_suffix(bt2_db_ext).exists()]): err_message = "No MetaPhlAn2 pickle or database found at: '{}', '{}'!\n".format(mpa_config["mpa_pkl"], mpa_config["bt2_db_prefix"]) err_message += "Specify relevant paths in the metaphlan2 section of config.yaml.\n" - err_message += "Run 'snakemake build_metaphlan2_index' to download and build the default mpa_v20_m200 database in '{dbdir}/metaphlan2'\n".format(dbdir=config["dbdir"]) + err_message += "Run 'snakemake build_metaphlan2_index' to download and build the default mpa_v20_m200 database in '{dbdir}'\n".format(dbdir=DBDIR/"metaphlan2") err_message += "If you do not want to run MetaPhlAn2 for taxonomic profiling, set metaphlan2: False in config.yaml" raise WorkflowError(err_message) # Add MetaPhlAn2 output files to 'all_outputs' from the main Snakefile scope. # SAMPLES is also from the main Snakefile scope. -mpa_outputs = expand("{outdir}/metaphlan2/{sample}.{output_type}", - outdir=config["outdir"], +mpa_outputs = expand(str(OUTDIR/"metaphlan2/{sample}.{output_type}"), sample=SAMPLES, output_type=("bowtie2.bz2", "metaphlan2.txt", "metaphlan2.krona")) -mpa_combined = expand("{outdir}/metaphlan2/all_samples.metaphlan2.{ext}", - outdir=config["outdir"], +mpa_combined = expand(str(OUTDIR/"metaphlan2/all_samples.metaphlan2.{ext}"), ext=("txt", "pdf", "krona.html")) all_outputs.extend(mpa_outputs) all_outputs.extend(mpa_combined) @@ -32,8 +32,10 @@ all_outputs.extend(mpa_combined) rule download_metaphlan2_database: """Download MetaPhlAn2 db_v20_m200""" output: - config["dbdir"]+"/metaphlan2/mpa_v20_m200.fna", - config["dbdir"]+"/metaphlan2/mpa_v20_m200.pkl", + DBDIR/"metaphlan2/mpa_v20_m200.fna", + DBDIR/"metaphlan2/mpa_v20_m200.pkl", + log: + str(LOGDIR/"metaphlan2/mpa_v20_m200.download.log") shadow: "shallow" conda: @@ -44,22 +46,27 @@ rule download_metaphlan2_database: """ cd {params.dbdir} wget https://bitbucket.org/biobakery/metaphlan2/downloads/mpa_v20_m200.tar \ + > {log} \ && \ tar -xf mpa_v20_m200.tar \ + >> {log} \ && \ bunzip2 mpa_v20_m200.fna.bz2 \ + >> {log} \ && \ - rm -v mpa_v20_m200.tar + rm -v mpa_v20_m200.tar \ + >> {log} """ rule build_metaphlan2_index: """Build MetaPhlAn2 bowtie2 index.""" input: - config["dbdir"]+"/metaphlan2/mpa_v20_m200.fna" + DBDIR/"metaphlan2/mpa_v20_m200.fna" output: - [config["dbdir"]+"/metaphlan2/mpa_v20_m200.{n}.bt2".format(n=num) for num in (1,2,3,4)], - [config["dbdir"]+"/metaphlan2/mpa_v20_m200.rev.{n}.bt2".format(n=num) for num in (1,2)], + [DBDIR/"metaphlan2/mpa_v20_m200.{n}.bt2".format(n=num) for num in (1,2,3,4)], + [DBDIR/"metaphlan2/mpa_v20_m200.rev.{n}.bt2".format(n=num) for num in (1,2)], log: + str(LOGDIR/"metaphlan2/mpa_v20_m200.build.log") shadow: "shallow" conda: @@ -74,21 +81,22 @@ rule build_metaphlan2_index: bowtie2-build \ mpa_v20_m200.fna \ mpa_v20_m200 \ - --threads {threads} + --threads {threads} \ + > {log} """ rule metaphlan2: """Taxonomic profiling using MetaPhlAn2.""" input: - read1=config["outdir"]+"/filtered_human/{sample}_R1.filtered_human.fq.gz", - read2=config["outdir"]+"/filtered_human/{sample}_R2.filtered_human.fq.gz", + 
read1=OUTDIR/"filtered_human/{sample}_R1.filtered_human.fq.gz", + read2=OUTDIR/"filtered_human/{sample}_R2.filtered_human.fq.gz", output: - bt2_out=config["outdir"]+"/metaphlan2/{sample}.bowtie2.bz2", - mpa_out=config["outdir"]+"/metaphlan2/{sample}.metaphlan2.txt", - krona=config["outdir"]+"/metaphlan2/{sample}.metaphlan2.krona", + bt2_out=OUTDIR/"metaphlan2/{sample}.bowtie2.bz2", + mpa_out=OUTDIR/"metaphlan2/{sample}.metaphlan2.txt", + krona=OUTDIR/"metaphlan2/{sample}.metaphlan2.krona", log: - stdout=config["outdir"]+"/logs/metaphlan2/{sample}.metaphlan2.stdout.log", - stderr=config["outdir"]+"/logs/metaphlan2/{sample}.metaphlan2.stderr.log", + stdout=str(LOGDIR/"metaphlan2/{sample}.metaphlan2.stdout.log"), + stderr=str(LOGDIR/"metaphlan2/{sample}.metaphlan2.stderr.log"), shadow: "shallow" conda: @@ -123,10 +131,10 @@ rule metaphlan2: rule combine_metaphlan2_outputs: """Combine metaphlan2 outputs into a large table and plot heatmap.""" input: - expand(config["outdir"]+"/metaphlan2/{sample}.metaphlan2.txt", sample=SAMPLES) + expand(str(OUTDIR/"metaphlan2/{sample}.metaphlan2.txt"), sample=SAMPLES) output: - txt=config["outdir"]+"/metaphlan2/all_samples.metaphlan2.txt", - pdf=config["outdir"]+"/metaphlan2/all_samples.metaphlan2.pdf", + txt=OUTDIR/"metaphlan2/all_samples.metaphlan2.txt", + pdf=OUTDIR/"metaphlan2/all_samples.metaphlan2.pdf", shadow: "shallow" conda: @@ -162,9 +170,9 @@ rule combine_metaphlan2_outputs: rule create_metaphlan2_krona_plots: input: - expand(config["outdir"]+"/metaphlan2/{sample}.metaphlan2.krona", sample=SAMPLES) + expand(str(OUTDIR/"metaphlan2/{sample}.metaphlan2.krona"), sample=SAMPLES) output: - html=config["outdir"]+"/metaphlan2/all_samples.metaphlan2.krona.html", + html=OUTDIR/"metaphlan2/all_samples.metaphlan2.krona.html", shadow: "shallow" conda:
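Each optional rule file in this patch opens with the same guard pattern: resolve the configured database path with pathlib, test it with .exists(), and raise a WorkflowError that tells the user either how to download or build the database into DBDIR, or how to switch the step off in config.yaml. A condensed sketch of that pattern (the "some_tool" names are illustrative, not from the patch):

```
from pathlib import Path

from snakemake.exceptions import WorkflowError

# DBDIR is the Path object defined in the main Snakefile.
db_path = Path(config["some_tool"]["db_path"])
if not (db_path/"ref").exists():
    err_message = "No some_tool database found at: '{}'!\n".format(db_path)
    err_message += "Run 'snakemake download_some_tool_db' to download a copy into '{}'\n".format(DBDIR/"some_tool")
    err_message += "If you do not want to run some_tool, set some_tool: False in config.yaml"
    raise WorkflowError(err_message)
```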