Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add hisat2 functionality #54

Merged
merged 10 commits into from
Feb 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/dag.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ jobs:
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
pip install pulp==2.7.0 # must < 2.8 for integration with Snakemake

- name: Build DAG
run: snakemake --configfile tests/example_config.yaml --dag > dag.dot
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ Sometimes, when snakemake unexpectedly exits (e.g., due to a server connection t

8. Align all reads against the host genome
- HoMi will by default download the GRCh38 human reference genome, but you can provide an alternative genome (fna + gtf) if it's already downloaded
- BBmap is used to map the reads, and featureCounts is used to generate a read count table
- Either BBmap or HISAT2 is used to map the reads (HISAT2 by default), and featureCounts is used to generate a read count table

### DAG
![DAG](figures/dag.svg)
Expand Down
7 changes: 7 additions & 0 deletions conda_envs/hisat2.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
name: hisat2
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- bioconda::hisat2
366 changes: 183 additions & 183 deletions figures/dag.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
82 changes: 48 additions & 34 deletions snakefile
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import pandas as pd
from os.path import join as pj
from os.path import split
from src.snake_utils import hostile_db_to_path, get_adapters_path, get_nonpareil_rmd_path, get_nonpareil_html_path, get_agg_script_path, get_mphlan_conv_script_path, get_taxa_barplot_rmd_path, get_sam2bam_path, get_func_barplot_rmd_path, get_partition, get_mem, get_runtime, get_threads, get_host_mapping_samples, get_slurm_extra, get_gmm_rmd_path, get_kraken_db_loc, get_tpm_converter_path
from src.snake_utils import hostile_db_to_path, get_adapters_path, get_nonpareil_rmd_path, get_nonpareil_html_path, get_agg_script_path, get_mphlan_conv_script_path, get_taxa_barplot_rmd_path, get_sam2bam_path, get_func_barplot_rmd_path, get_partition, get_mem, get_runtime, get_threads, get_host_mapping_samples, get_slurm_extra, get_gmm_rmd_path, get_kraken_db_loc, get_tpm_converter_path, get_host_map_method



Expand Down Expand Up @@ -998,7 +998,7 @@ rule nonpareil_curves:
# pull human genome
rule pull_host_genome:
"""
This rule downloads the human GRCh38 reference genome and annotation.
This rule downloads the host (default human GRCh38) reference genome and annotation.
"""
output:
GENOME=config['host_ref_fna'],
Expand Down Expand Up @@ -1028,65 +1028,79 @@ rule pull_host_genome:
"""


# make bbmap index for human genome
rule build_human_genome_index_bbmap:
# make index for host genome mapping
rule build_host_genome_index:
"""
This rule builds a Bbmap index for the host genome.
This rule builds an index for mapping the host genome.
"""
input:
config['host_ref_fna']
output:
directory(pj(f"{trim_trunc_path}.host", "ref/"))
conda: "conda_envs/bbmap.yaml"
conda: f"conda_envs/{get_host_map_method(config).lower()}.yaml"
resources:
partition=get_partition("short", config, "build_human_genome_index_bbmap"),
mem_mb=get_mem(int(30*1000), config, "build_human_genome_index_bbmap"), # MB, or 30 GB
runtime=get_runtime(int(2*60), config, "build_human_genome_index_bbmap"), # min, or 2 hrs
slurm=get_slurm_extra(config, "build_human_genome_index_bbmap")
threads: get_threads(8, config, "build_human_genome_index_bbmap")
partition=get_partition("short", config, "build_host_genome_index"),
mem_mb=get_mem(int(32*1000), config, "build_host_genome_index"), # MB, or 30 GB
runtime=get_runtime(int(2*60), config, "build_host_genome_index"), # min, or 2 hrs
slurm=get_slurm_extra(config, "build_host_genome_index")
threads: get_threads(8, config, "build_host_genome_index")
params:
ref_dir=f"{trim_trunc_path}.host"
ref_dir=f"{trim_trunc_path}.host",
method=get_host_map_method(config)
shell:
"""
mkdir -p {params.ref_dir}
bbmap.sh ref={input} path={params.ref_dir} threads={threads} -Xmx{resources.mem_mb}m
if [ "{method}" == "BBMap" ]; then
bbmap.sh ref={input} path={params.ref_dir} threads={threads} -Xmx{resources.mem_mb}m
elif [ "{method}" == "HISAT2" ]; then
hisat2-build {input} {output} -p {threads}
fi
"""


# Map to human genome
rule bbmap_host:
# Map to host genome
rule map_host:
"""
This rule maps reads to the host genome using Bbmap and compresses SAM files to BAM files.
This rule maps reads to the host genome and compresses SAM files to BAM files.
"""
input:
REF=pj(f"{trim_trunc_path}.host", "ref/"),
FWD=pj(trim_trunc_path,
"{sample}.R1.fq"),
REV=pj(trim_trunc_path,
"{sample}.R2.fq")
REF=pj(f"{trim_trunc_path}.host", "ref/"),
FWD=pj(trim_trunc_path,
"{sample}.R1.fq"),
REV=pj(trim_trunc_path,
"{sample}.R2.fq")
output:
SAM=pj(f"{trim_trunc_path}.host", "{sample}.sam"),
BAM=pj(f"{trim_trunc_path}.host", "{sample}.bam")
conda: "conda_envs/bbmap.yaml"
conda: f"conda_envs/{get_host_map_method(config).lower()}.yaml"
resources:
partition=get_partition("short", config, "bbmap_host"),
mem_mb=get_mem(int(210*1000), config, "bbmap_host"), # MB, or 210 GB, can cut to ~100 for 16 threads
runtime=get_runtime(int(23.9*60), config, "bbmap_host"), # min, or almost 24 hrs
slurm=get_slurm_extra(config, "bbmap_host")
threads: get_threads(32, config, "bbmap_host")
partition=get_partition("short", config, "map_host"),
mem_mb=get_mem(int(210*1000), config, "map_host"), # MB, or 210 GB, can cut to ~100 for 16 threads
runtime=get_runtime(int(23.9*60), config, "map_host"), # min, or almost 24 hrs
slurm=get_slurm_extra(config, "map_host")
threads: get_threads(32, config, "map_host")
params:
out_dir=f"{trim_trunc_path}.host",
sam2bam_path=get_sam2bam_path()
sam2bam_path=get_sam2bam_path(),
method=get_host_map_method(config)
shell:
"""
cd {params.out_dir}

bbmap.sh in=../{input.FWD} in2=../{input.REV} \
out={wildcards.sample}.sam \
trimreaddescriptions=t \
threads={threads} \
-Xmx{resources.mem_mb}m

if [ "{method}" == "BBMap" ]; then
bbmap.sh in=../{input.FWD} in2=../{input.REV} \
out={wildcards.sample}.sam \
trimreaddescriptions=t \
threads={threads} \
-Xmx{resources.mem_mb}m

elif [ "{method}" == "HISAT2" ]; then
hisat2 -1 ../{input.FWD} -2 ../{input.REV} \
-S {wildcards.sample}.sam \
-x ref/ \
-p {threads}
fi

bash {params.sam2bam_path} {wildcards.sample}.sam
"""

Expand Down
11 changes: 10 additions & 1 deletion src/snake_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,4 +185,13 @@ def get_tpm_converter_path():
if path.exists(converter):
return converter
else:
raise FileNotFoundError(f"TPM conversion script was not found at {converter}.")
raise FileNotFoundError(f"TPM conversion script was not found at {converter}.")

def get_host_map_method(config):
implemented_mappers = ["HISAT2", "BBMap"]
map_method = config.get("host_map_method")
if map_method is None:
return "HISAT2"
if map_method in implemented_mappers:
return map_method
raise NotImplementedError(f"Mapping host transcriptomes with {map_method} is not yet an implemented option. Please use an option from {implemented_mappers}")
14 changes: 8 additions & 6 deletions tests/example_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -61,12 +61,14 @@ kraken_db: data/kraken2_db


#################################
### Host read mapping (BBmap) ###
### Host read mapping ###
# provide the path for the host reference genome
# if these files do not exist yet, HoMi will by default download the GRCh38 human reference genome
host_ref_fna: GRCh38/GRCh38_full_analysis_set.fna
host_ref_gtf: GRCh38/GRCh38_full_analysis_set.refseq.gtf

# Specifying the host mapping method here. options include BBMap and HISAT2
host_map_method: BBMap

################################
### Resources ###
Expand All @@ -78,10 +80,10 @@ host_ref_gtf: GRCh38/GRCh38_full_analysis_set.refseq.gtf
# rule_name_mem_mb: 10000 # (10 GB)
# rule_name_runtime: 600 # (10 hours)
# rule_name_threads: 8
bbmap_host_partition: long
bbmap_host_runtime: 2160 # 36 hours
bbmap_host_threads: 48
bbmap_host_mem_mb: 250000
map_host_partition: long
map_host_runtime: 2160 # 36 hours
map_host_threads: 48
map_host_mem_mb: 250000

################################
### Partition ###
Expand All @@ -105,6 +107,6 @@ default_long_partition_name: long
# <slurm_long_name>=<parameter> <slurm_long_name_2>=<parameter_2>
# examples:
# default_slurm_extra: [email protected]
# bbmap_host_slurm_extra: qos=long [email protected]
# map_host_slurm_extra: qos=long [email protected]
# All rules would receive the email param,
# and the bbmap host rule would also receive the qos param
17 changes: 16 additions & 1 deletion tests/test_snake_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,4 +52,19 @@ def test_get_kraken_db_default():

def test_get_kraken_db_nondefault():
config = {"kraken_db": "NEW_location"}
assert su.get_kraken_db_loc(default="data_location", config=config) == "NEW_location"
assert su.get_kraken_db_loc(default="data_location", config=config) == "NEW_location"


def test_get_host_map_method_bbmap():
config = {"host_map_method": "BBMap"}
assert su.get_host_map_method(config) == "BBMap"


def test_get_host_map_method_default():
config = {"Nothing relevant": "is here"}
assert su.get_host_map_method(config) == "HISAT2"

def test_get_host_map_method_not_implemented():
config = {"host_map_method": "A method that isn't implemented"}
with pytest.raises(NotImplementedError):
out = su.get_host_map_method(config)