diff --git a/.github/workflows/dag.yaml b/.github/workflows/dag.yaml
index 2b963de..dcdea43 100644
--- a/.github/workflows/dag.yaml
+++ b/.github/workflows/dag.yaml
@@ -36,6 +36,7 @@ jobs:
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
+ pip install pulp==2.7.0 # must < 2.8 for integration with Snakemake
- name: Build DAG
run: snakemake --configfile tests/example_config.yaml --dag > dag.dot
diff --git a/README.md b/README.md
index ed8dd93..aac65a0 100644
--- a/README.md
+++ b/README.md
@@ -79,7 +79,7 @@ Sometimes, when snakemake unexpectedly exits (e.g., due to a server connection t
8. Align all reads against the host genome
- HoMi will by default download the GRCh38 human reference genome, but you can provide an alternative genome (fna + gtf) if it's already downloaded
- - BBmap is used to map the reads, and featureCounts is used to generate a read count table
+ - Either BBmap or HISAT2 is used to map the reads (HISAT2 by default), and featureCounts is used to generate a read count table
### DAG
![DAG](figures/dag.svg)
diff --git a/conda_envs/hisat2.yaml b/conda_envs/hisat2.yaml
new file mode 100644
index 0000000..63724d3
--- /dev/null
+++ b/conda_envs/hisat2.yaml
@@ -0,0 +1,7 @@
+name: hisat2
+channels:
+ - conda-forge
+ - bioconda
+ - defaults
+dependencies:
+ - bioconda::hisat2
\ No newline at end of file
diff --git a/figures/dag.svg b/figures/dag.svg
index e60f7d9..24850aa 100644
--- a/figures/dag.svg
+++ b/figures/dag.svg
@@ -5,542 +5,542 @@
-->
diff --git a/snakefile b/snakefile
index 7892f0a..ddecc52 100644
--- a/snakefile
+++ b/snakefile
@@ -1,7 +1,7 @@
import pandas as pd
from os.path import join as pj
from os.path import split
-from src.snake_utils import hostile_db_to_path, get_adapters_path, get_nonpareil_rmd_path, get_nonpareil_html_path, get_agg_script_path, get_mphlan_conv_script_path, get_taxa_barplot_rmd_path, get_sam2bam_path, get_func_barplot_rmd_path, get_partition, get_mem, get_runtime, get_threads, get_host_mapping_samples, get_slurm_extra, get_gmm_rmd_path, get_kraken_db_loc, get_tpm_converter_path
+from src.snake_utils import hostile_db_to_path, get_adapters_path, get_nonpareil_rmd_path, get_nonpareil_html_path, get_agg_script_path, get_mphlan_conv_script_path, get_taxa_barplot_rmd_path, get_sam2bam_path, get_func_barplot_rmd_path, get_partition, get_mem, get_runtime, get_threads, get_host_mapping_samples, get_slurm_extra, get_gmm_rmd_path, get_kraken_db_loc, get_tpm_converter_path, get_host_map_method
@@ -998,7 +998,7 @@ rule nonpareil_curves:
# pull human genome
rule pull_host_genome:
"""
- This rule downloads the human GRCh38 reference genome and annotation.
+ This rule downloads the host (default human GRCh38) reference genome and annotation.
"""
output:
GENOME=config['host_ref_fna'],
@@ -1028,65 +1028,79 @@ rule pull_host_genome:
"""
-# make bbmap index for human genome
-rule build_human_genome_index_bbmap:
+# make index for host genome mapping
+rule build_host_genome_index:
"""
- This rule builds a Bbmap index for the host genome.
+ This rule builds an index for mapping the host genome.
"""
input:
config['host_ref_fna']
output:
directory(pj(f"{trim_trunc_path}.host", "ref/"))
- conda: "conda_envs/bbmap.yaml"
+ conda: f"conda_envs/{get_host_map_method(config).lower()}.yaml"
resources:
- partition=get_partition("short", config, "build_human_genome_index_bbmap"),
- mem_mb=get_mem(int(30*1000), config, "build_human_genome_index_bbmap"), # MB, or 30 GB
- runtime=get_runtime(int(2*60), config, "build_human_genome_index_bbmap"), # min, or 2 hrs
- slurm=get_slurm_extra(config, "build_human_genome_index_bbmap")
- threads: get_threads(8, config, "build_human_genome_index_bbmap")
+ partition=get_partition("short", config, "build_host_genome_index"),
+ mem_mb=get_mem(int(32*1000), config, "build_host_genome_index"), # MB, or 30 GB
+ runtime=get_runtime(int(2*60), config, "build_host_genome_index"), # min, or 2 hrs
+ slurm=get_slurm_extra(config, "build_host_genome_index")
+ threads: get_threads(8, config, "build_host_genome_index")
params:
- ref_dir=f"{trim_trunc_path}.host"
+ ref_dir=f"{trim_trunc_path}.host",
+ method=get_host_map_method(config)
shell:
"""
mkdir -p {params.ref_dir}
- bbmap.sh ref={input} path={params.ref_dir} threads={threads} -Xmx{resources.mem_mb}m
+ if [ "{method}" == "BBMap" ]; then
+ bbmap.sh ref={input} path={params.ref_dir} threads={threads} -Xmx{resources.mem_mb}m
+ elif [ "{method}" == "HISAT2" ]; then
+ hisat2-build {input} {output} -p {threads}
+ fi
"""
-# Map to human genome
-rule bbmap_host:
+# Map to host genome
+rule map_host:
"""
- This rule maps reads to the host genome using Bbmap and compresses SAM files to BAM files.
+ This rule maps reads to the host genome and compresses SAM files to BAM files.
"""
input:
- REF=pj(f"{trim_trunc_path}.host", "ref/"),
- FWD=pj(trim_trunc_path,
- "{sample}.R1.fq"),
- REV=pj(trim_trunc_path,
- "{sample}.R2.fq")
+ REF=pj(f"{trim_trunc_path}.host", "ref/"),
+ FWD=pj(trim_trunc_path,
+ "{sample}.R1.fq"),
+ REV=pj(trim_trunc_path,
+ "{sample}.R2.fq")
output:
SAM=pj(f"{trim_trunc_path}.host", "{sample}.sam"),
BAM=pj(f"{trim_trunc_path}.host", "{sample}.bam")
- conda: "conda_envs/bbmap.yaml"
+ conda: f"conda_envs/{get_host_map_method(config).lower()}.yaml"
resources:
- partition=get_partition("short", config, "bbmap_host"),
- mem_mb=get_mem(int(210*1000), config, "bbmap_host"), # MB, or 210 GB, can cut to ~100 for 16 threads
- runtime=get_runtime(int(23.9*60), config, "bbmap_host"), # min, or almost 24 hrs
- slurm=get_slurm_extra(config, "bbmap_host")
- threads: get_threads(32, config, "bbmap_host")
+ partition=get_partition("short", config, "map_host"),
+ mem_mb=get_mem(int(210*1000), config, "map_host"), # MB, or 210 GB, can cut to ~100 for 16 threads
+ runtime=get_runtime(int(23.9*60), config, "map_host"), # min, or almost 24 hrs
+ slurm=get_slurm_extra(config, "map_host")
+ threads: get_threads(32, config, "map_host")
params:
out_dir=f"{trim_trunc_path}.host",
- sam2bam_path=get_sam2bam_path()
+ sam2bam_path=get_sam2bam_path(),
+ method=get_host_map_method(config)
shell:
"""
cd {params.out_dir}
- bbmap.sh in=../{input.FWD} in2=../{input.REV} \
- out={wildcards.sample}.sam \
- trimreaddescriptions=t \
- threads={threads} \
- -Xmx{resources.mem_mb}m
-
+ if [ "{method}" == "BBMap" ]; then
+ bbmap.sh in=../{input.FWD} in2=../{input.REV} \
+ out={wildcards.sample}.sam \
+ trimreaddescriptions=t \
+ threads={threads} \
+ -Xmx{resources.mem_mb}m
+
+ elif [ "{method}" == "HISAT2" ]; then
+ hisat2 -1 ../{input.FWD} -2 ../{input.REV} \
+ -S {wildcards.sample}.sam \
+ -x ref/ \
+ -p {threads}
+ fi
+
bash {params.sam2bam_path} {wildcards.sample}.sam
"""
diff --git a/src/snake_utils.py b/src/snake_utils.py
index bfbebf8..2382b85 100644
--- a/src/snake_utils.py
+++ b/src/snake_utils.py
@@ -185,4 +185,13 @@ def get_tpm_converter_path():
if path.exists(converter):
return converter
else:
- raise FileNotFoundError(f"TPM conversion script was not found at {converter}.")
\ No newline at end of file
+ raise FileNotFoundError(f"TPM conversion script was not found at {converter}.")
+
+def get_host_map_method(config):
+ implemented_mappers = ["HISAT2", "BBMap"]
+ map_method = config.get("host_map_method")
+ if map_method is None:
+ return "HISAT2"
+ if map_method in implemented_mappers:
+ return map_method
+ raise NotImplementedError(f"Mapping host transcriptomes with {map_method} is not yet an implemented option. Please use an option from {implemented_mappers}")
\ No newline at end of file
diff --git a/tests/example_config.yaml b/tests/example_config.yaml
index ef80ace..11a4ce6 100644
--- a/tests/example_config.yaml
+++ b/tests/example_config.yaml
@@ -61,12 +61,14 @@ kraken_db: data/kraken2_db
#################################
-### Host read mapping (BBmap) ###
+### Host read mapping ###
# provide the path for the host reference genome
# if these files do not exist yet, HoMi will by default download the GRCh38 human reference genome
host_ref_fna: GRCh38/GRCh38_full_analysis_set.fna
host_ref_gtf: GRCh38/GRCh38_full_analysis_set.refseq.gtf
+# Specifying the host mapping method here. options include BBMap and HISAT2
+host_map_method: BBMap
################################
### Resources ###
@@ -78,10 +80,10 @@ host_ref_gtf: GRCh38/GRCh38_full_analysis_set.refseq.gtf
# rule_name_mem_mb: 10000 # (10 GB)
# rule_name_runtime: 600 # (10 hours)
# rule_name_threads: 8
-bbmap_host_partition: long
-bbmap_host_runtime: 2160 # 36 hours
-bbmap_host_threads: 48
-bbmap_host_mem_mb: 250000
+map_host_partition: long
+map_host_runtime: 2160 # 36 hours
+map_host_threads: 48
+map_host_mem_mb: 250000
################################
### Partition ###
@@ -105,6 +107,6 @@ default_long_partition_name: long
# = =
# examples:
# default_slurm_extra: email=sample_email@colorado.edu
-# bbmap_host_slurm_extra: qos=long email=sample_email@colorado.edu
+# map_host_slurm_extra: qos=long email=sample_email@colorado.edu
# All rules would receive the email param,
# and the bbmap host rule would also receive the qos param
diff --git a/tests/test_snake_utils.py b/tests/test_snake_utils.py
index 2267639..c80721b 100644
--- a/tests/test_snake_utils.py
+++ b/tests/test_snake_utils.py
@@ -52,4 +52,19 @@ def test_get_kraken_db_default():
def test_get_kraken_db_nondefault():
config = {"kraken_db": "NEW_location"}
- assert su.get_kraken_db_loc(default="data_location", config=config) == "NEW_location"
\ No newline at end of file
+ assert su.get_kraken_db_loc(default="data_location", config=config) == "NEW_location"
+
+
+def test_get_host_map_method_bbmap():
+ config = {"host_map_method": "BBMap"}
+ assert su.get_host_map_method(config) == "BBMap"
+
+
+def test_get_host_map_method_default():
+ config = {"Nothing relevant": "is here"}
+ assert su.get_host_map_method(config) == "HISAT2"
+
+def test_get_host_map_method_not_implemented():
+ config = {"host_map_method": "A method that isn't implemented"}
+ with pytest.raises(NotImplementedError):
+ out = su.get_host_map_method(config)
\ No newline at end of file