diff --git a/.github/workflows/dag.yaml b/.github/workflows/dag.yaml
index 2b963de..dcdea43 100644
--- a/.github/workflows/dag.yaml
+++ b/.github/workflows/dag.yaml
@@ -36,6 +36,7 @@ jobs:
         run: |
           python -m pip install --upgrade pip
           pip install -r requirements.txt
+          pip install pulp==2.7.0  # must be < 2.8 for compatibility with Snakemake
       - name: Build DAG
         run: snakemake --configfile tests/example_config.yaml --dag > dag.dot
diff --git a/README.md b/README.md
index ed8dd93..aac65a0 100644
--- a/README.md
+++ b/README.md
@@ -79,7 +79,7 @@ Sometimes, when snakemake unexpectedly exits (e.g., due to a server connection t
 8. Align all reads against the host genome
     - HoMi will by default download the GRCh38 human reference genome, but you can provide an alternative genome (fna + gtf) if it's already downloaded
-    - BBmap is used to map the reads, and featureCounts is used to generate a read count table
+    - Either BBMap or HISAT2 is used to map the reads (HISAT2 by default), and featureCounts is used to generate a read count table
 
 ### DAG
 ![DAG](figures/dag.svg)
diff --git a/conda_envs/hisat2.yaml b/conda_envs/hisat2.yaml
new file mode 100644
index 0000000..63724d3
--- /dev/null
+++ b/conda_envs/hisat2.yaml
@@ -0,0 +1,7 @@
+name: hisat2
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bioconda::hisat2
\ No newline at end of file
diff --git a/figures/dag.svg b/figures/dag.svg
index e60f7d9..24850aa 100644
--- a/figures/dag.svg
+++ b/figures/dag.svg
[Regenerated workflow DAG figure; verbose SVG markup omitted. Only the rule labels change: "bbmap_host" becomes "map_host" and "build_human_genome_index_bbmap" becomes "build_host_genome_index".]
diff --git a/snakefile b/snakefile
index 7892f0a..ddecc52 100644
--- a/snakefile
+++ b/snakefile
@@ -1,7 +1,7 @@
 import pandas as pd
 from os.path import join as pj
 from os.path import split
-from src.snake_utils import hostile_db_to_path, get_adapters_path, get_nonpareil_rmd_path, get_nonpareil_html_path, get_agg_script_path, get_mphlan_conv_script_path, get_taxa_barplot_rmd_path, get_sam2bam_path, get_func_barplot_rmd_path, get_partition, get_mem, get_runtime, get_threads, get_host_mapping_samples, get_slurm_extra, get_gmm_rmd_path, get_kraken_db_loc, get_tpm_converter_path
+from src.snake_utils import hostile_db_to_path, get_adapters_path, get_nonpareil_rmd_path, get_nonpareil_html_path, get_agg_script_path, get_mphlan_conv_script_path, get_taxa_barplot_rmd_path, get_sam2bam_path, get_func_barplot_rmd_path, get_partition, get_mem, get_runtime, get_threads, get_host_mapping_samples, get_slurm_extra, get_gmm_rmd_path, get_kraken_db_loc, get_tpm_converter_path, get_host_map_method
@@ -998,7 +998,7 @@ rule nonpareil_curves:
 # pull human genome
 rule pull_host_genome:
     """
-    This rule downloads the human GRCh38 reference genome and annotation.
+    This rule downloads the host (default human GRCh38) reference genome and annotation.
     """
     output:
         GENOME=config['host_ref_fna'],
@@ -1028,65 +1028,79 @@ rule pull_host_genome:
     """
 
 
-# make bbmap index for human genome
-rule build_human_genome_index_bbmap:
+# make index for host genome mapping
+rule build_host_genome_index:
     """
-    This rule builds a Bbmap index for the host genome.
+    This rule builds an index for mapping the host genome.
""" input: config['host_ref_fna'] output: directory(pj(f"{trim_trunc_path}.host", "ref/")) - conda: "conda_envs/bbmap.yaml" + conda: f"conda_envs/{get_host_map_method(config).lower()}.yaml" resources: - partition=get_partition("short", config, "build_human_genome_index_bbmap"), - mem_mb=get_mem(int(30*1000), config, "build_human_genome_index_bbmap"), # MB, or 30 GB - runtime=get_runtime(int(2*60), config, "build_human_genome_index_bbmap"), # min, or 2 hrs - slurm=get_slurm_extra(config, "build_human_genome_index_bbmap") - threads: get_threads(8, config, "build_human_genome_index_bbmap") + partition=get_partition("short", config, "build_host_genome_index"), + mem_mb=get_mem(int(32*1000), config, "build_host_genome_index"), # MB, or 30 GB + runtime=get_runtime(int(2*60), config, "build_host_genome_index"), # min, or 2 hrs + slurm=get_slurm_extra(config, "build_host_genome_index") + threads: get_threads(8, config, "build_host_genome_index") params: - ref_dir=f"{trim_trunc_path}.host" + ref_dir=f"{trim_trunc_path}.host", + method=get_host_map_method(config) shell: """ mkdir -p {params.ref_dir} - bbmap.sh ref={input} path={params.ref_dir} threads={threads} -Xmx{resources.mem_mb}m + if [ "{method}" == "BBMap" ]; then + bbmap.sh ref={input} path={params.ref_dir} threads={threads} -Xmx{resources.mem_mb}m + elif [ "{method}" == "HISAT2" ]; then + hisat2-build {input} {output} -p {threads} + fi """ -# Map to human genome -rule bbmap_host: +# Map to host genome +rule map_host: """ - This rule maps reads to the host genome using Bbmap and compresses SAM files to BAM files. + This rule maps reads to the host genome and compresses SAM files to BAM files. """ input: - REF=pj(f"{trim_trunc_path}.host", "ref/"), - FWD=pj(trim_trunc_path, - "{sample}.R1.fq"), - REV=pj(trim_trunc_path, - "{sample}.R2.fq") + REF=pj(f"{trim_trunc_path}.host", "ref/"), + FWD=pj(trim_trunc_path, + "{sample}.R1.fq"), + REV=pj(trim_trunc_path, + "{sample}.R2.fq") output: SAM=pj(f"{trim_trunc_path}.host", "{sample}.sam"), BAM=pj(f"{trim_trunc_path}.host", "{sample}.bam") - conda: "conda_envs/bbmap.yaml" + conda: f"conda_envs/{get_host_map_method(config).lower()}.yaml" resources: - partition=get_partition("short", config, "bbmap_host"), - mem_mb=get_mem(int(210*1000), config, "bbmap_host"), # MB, or 210 GB, can cut to ~100 for 16 threads - runtime=get_runtime(int(23.9*60), config, "bbmap_host"), # min, or almost 24 hrs - slurm=get_slurm_extra(config, "bbmap_host") - threads: get_threads(32, config, "bbmap_host") + partition=get_partition("short", config, "map_host"), + mem_mb=get_mem(int(210*1000), config, "map_host"), # MB, or 210 GB, can cut to ~100 for 16 threads + runtime=get_runtime(int(23.9*60), config, "map_host"), # min, or almost 24 hrs + slurm=get_slurm_extra(config, "map_host") + threads: get_threads(32, config, "map_host") params: out_dir=f"{trim_trunc_path}.host", - sam2bam_path=get_sam2bam_path() + sam2bam_path=get_sam2bam_path(), + method=get_host_map_method(config) shell: """ cd {params.out_dir} - bbmap.sh in=../{input.FWD} in2=../{input.REV} \ - out={wildcards.sample}.sam \ - trimreaddescriptions=t \ - threads={threads} \ - -Xmx{resources.mem_mb}m - + if [ "{method}" == "BBMap" ]; then + bbmap.sh in=../{input.FWD} in2=../{input.REV} \ + out={wildcards.sample}.sam \ + trimreaddescriptions=t \ + threads={threads} \ + -Xmx{resources.mem_mb}m + + elif [ "{method}" == "HISAT2" ]; then + hisat2 -1 ../{input.FWD} -2 ../{input.REV} \ + -S {wildcards.sample}.sam \ + -x ref/ \ + -p {threads} + fi + bash {params.sam2bam_path} 
       """
diff --git a/src/snake_utils.py b/src/snake_utils.py
index bfbebf8..2382b85 100644
--- a/src/snake_utils.py
+++ b/src/snake_utils.py
@@ -185,4 +185,13 @@ def get_tpm_converter_path():
     if path.exists(converter):
         return converter
     else:
-        raise FileNotFoundError(f"TPM conversion script was not found at {converter}.")
\ No newline at end of file
+        raise FileNotFoundError(f"TPM conversion script was not found at {converter}.")
+
+def get_host_map_method(config):
+    implemented_mappers = ["HISAT2", "BBMap"]
+    map_method = config.get("host_map_method")
+    if map_method is None:
+        return "HISAT2"
+    if map_method in implemented_mappers:
+        return map_method
+    raise NotImplementedError(f"Mapping host transcriptomes with {map_method} is not yet an implemented option. Please use an option from {implemented_mappers}")
\ No newline at end of file
diff --git a/tests/example_config.yaml b/tests/example_config.yaml
index ef80ace..11a4ce6 100644
--- a/tests/example_config.yaml
+++ b/tests/example_config.yaml
@@ -61,12 +61,14 @@ kraken_db: data/kraken2_db
 
 #################################
-### Host read mapping (BBmap) ###
+### Host read mapping ###
 # provide the path for the host reference genome
 # if these files do not exist yet, HoMi will by default download the GRCh38 human reference genome
 host_ref_fna: GRCh38/GRCh38_full_analysis_set.fna
 host_ref_gtf: GRCh38/GRCh38_full_analysis_set.refseq.gtf
 
+# Specify the host mapping method here. Options include BBMap and HISAT2.
+host_map_method: BBMap
 
 ################################
 ### Resources ###
@@ -78,10 +80,10 @@ host_ref_gtf: GRCh38/GRCh38_full_analysis_set.refseq.gtf
 # rule_name_mem_mb: 10000 # (10 GB)
 # rule_name_runtime: 600 # (10 hours)
 # rule_name_threads: 8
-bbmap_host_partition: long
-bbmap_host_runtime: 2160 # 36 hours
-bbmap_host_threads: 48
-bbmap_host_mem_mb: 250000
+map_host_partition: long
+map_host_runtime: 2160 # 36 hours
+map_host_threads: 48
+map_host_mem_mb: 250000
 
 ################################
 ### Partition ###
@@ -105,6 +107,6 @@ default_long_partition_name: long
 # = =
 # examples:
 # default_slurm_extra: email=sample_email@colorado.edu
-# bbmap_host_slurm_extra: qos=long email=sample_email@colorado.edu
+# map_host_slurm_extra: qos=long email=sample_email@colorado.edu
 # All rules would receive the email param,
-# and the bbmap host rule would also receive the qos param
+# and the map_host rule would also receive the qos param
diff --git a/tests/test_snake_utils.py b/tests/test_snake_utils.py
index 2267639..c80721b 100644
--- a/tests/test_snake_utils.py
+++ b/tests/test_snake_utils.py
@@ -52,4 +52,19 @@ def test_get_kraken_db_default():
 
 def test_get_kraken_db_nondefault():
     config = {"kraken_db": "NEW_location"}
-    assert su.get_kraken_db_loc(default="data_location", config=config) == "NEW_location"
\ No newline at end of file
+    assert su.get_kraken_db_loc(default="data_location", config=config) == "NEW_location"
+
+
+def test_get_host_map_method_bbmap():
+    config = {"host_map_method": "BBMap"}
+    assert su.get_host_map_method(config) == "BBMap"
+
+
+def test_get_host_map_method_default():
+    config = {"Nothing relevant": "is here"}
+    assert su.get_host_map_method(config) == "HISAT2"
+
+def test_get_host_map_method_not_implemented():
+    config = {"host_map_method": "A method that isn't implemented"}
+    with pytest.raises(NotImplementedError):
+        su.get_host_map_method(config)
\ No newline at end of file
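
Note for reviewers (not part of the patch): a minimal sketch of how the new host_map_method option resolves, mirroring get_host_map_method as added in src/snake_utils.py. It assumes the snippet is run from the repo root; the config dicts and the "bowtie2" value are illustrative only.

    from src.snake_utils import get_host_map_method

    # Explicit, case-sensitive choice; must match an entry in implemented_mappers.
    assert get_host_map_method({"host_map_method": "BBMap"}) == "BBMap"

    # No "host_map_method" key in the config: HISAT2 is used by default.
    assert get_host_map_method({}) == "HISAT2"

    # Any other value (e.g. a hypothetical "bowtie2") fails fast with NotImplementedError
    # when the snakefile is parsed, rather than partway through a run.
    try:
        get_host_map_method({"host_map_method": "bowtie2"})
    except NotImplementedError as err:
        print(err)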