Adding option for STAR aligner #113

Open
wants to merge 3 commits into base: master
3 changes: 2 additions & 1 deletion .test/config/config_paired_end.yaml
@@ -9,9 +9,10 @@ results-jupyterbook:
activate: True

fastq:
auto: True # set auto if your files follow
auto: True # set auto if your files follow
paired: True


# Paths for reference files
reference:
genome:
5 changes: 4 additions & 1 deletion workflow/Snakefile
@@ -23,12 +23,15 @@ welcome(version="v2.0.2")

include: "rules/qc.smk"
include: "rules/diffexp.smk"
include: "rules/star-freebayes.smk"
include: "rules/utilities.smk"
include: "rules/hisat2-freebayes.smk"
include: "rules/snpEff.smk"
include: "rules/variantAnalysis.smk"
include: "rules/jupyter-book.smk"

#if config['aligner'] == 'HISAT2':
# include: "rules/hisat2-freebayes.smk"

if config['pipeline'] == 'parabricks':
include: "rules/star-haplotypecaller.smk"

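The commented-out lines above hint at choosing the aligner via the config rather than including both variant-calling rule files. A minimal sketch of that idea, assuming a hypothetical config['aligner'] key that is not defined in this PR's config files:

if config.get("aligner", "STAR") == "STAR":
    include: "rules/star-freebayes.smk"
else:
    include: "rules/hisat2-freebayes.smk"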
1 change: 1 addition & 0 deletions workflow/envs/variants.yaml
@@ -3,6 +3,7 @@ channels:
- conda-forge
dependencies:
- hisat2
- star
- samtools
- freebayes=1.3.8
- bcftools
40 changes: 22 additions & 18 deletions workflow/rules/common.smk
@@ -17,52 +17,56 @@ def load_metadata(metadata_path):
return metadata



def getFASTQs(wildcards, rules=None):
"""
Get FASTQ files from unit sheet.
If there are more than one wildcard (aka, sample), only return one fastq file
If the rule is HISAT2align, then return the fastqs with -1 and -2 flags
If the rule is STARalign, return the fastqs without flags.
"""
metadata = load_metadata(config["metadata"])

if config['fastq']['paired'] == True:
fastq_cols = ['fq1', 'fq2']
else:
fastq_cols = ['fq1']
# Determine if the FASTQs are paired or single-end
fastq_cols = ['fq1', 'fq2'] if config['fastq']['paired'] else ['fq1']

if config["QualityControl"]["fastp-trim"]["activate"] == True:
if rules in ["KallistoQuant", "HISAT2align", "HISAT2align_input"]:
# Check for fastp trimming activation
if config["QualityControl"]["fastp-trim"]["activate"]:
if rules in ["KallistoQuant", "HISAT2align", "HISAT2align_input", "STARalign", "STARalign_input"]:
for i, col in enumerate(fastq_cols):
metadata = metadata.assign(**{col: f"resources/reads/trimmed/" + metadata["sampleID"] + f"_{i+1}.fastq.gz"})
metadata = metadata.set_index("sampleID")

u = metadata.loc[wildcards.sample, fastq_cols].dropna()
if rules == "HISAT2align":
return [f"-1 {u.fq1} -2 {u.fq2}"] if config['fastq']['paired'] == True else f"-U {u.fq1}"
return [f"-1 {u.fq1} -2 {u.fq2}"] if config['fastq']['paired'] else f"-U {u.fq1}"
elif rules in ["STARalign", "STARalign_input"]:
return [u.fq1, u.fq2] if config['fastq']['paired'] else [u.fq1]
else:
return [u.fq1, u.fq2] if config['fastq']['paired'] == True else [u.fq1]
return [u.fq1, u.fq2] if config['fastq']['paired'] else [u.fq1]

# Auto-generate FASTQ paths if enabled
if config["fastq"]["auto"]:
for i, col in enumerate(fastq_cols):
metadata = metadata.assign(**{col: f"resources/reads/" + metadata["sampleID"] + f"_{i+1}.fastq.gz"})
metadata = metadata.set_index("sampleID")
else:
assert (
"fq1" in metadata.columns
), f"The fq1 column in the metadata does not seem to exist. Please create one, or use the 'auto' option and name the fastq files as specified in the config/README.md"
assert "fq1" in metadata.columns, "The fq1 column in the metadata does not seem to exist. Please create one, or use the 'auto' option."
if config['fastq']['paired']:
assert (
"fq2" in metadata.columns
), f"The fq2 column in the metadata does not seem to exist. Please create one, or use the 'auto' option and name the fastq files as specified in the config/README.md"
assert "fq2" in metadata.columns, "The fq2 column in the metadata does not seem to exist. Please create one, or use the 'auto' option."

metadata = metadata.set_index("sampleID")

# Retrieve FASTQ paths based on the calling rule
u = metadata.loc[wildcards.sample, fastq_cols].dropna()
if rules == "HISAT2align":
return [f"-1 {u.fq1} -2 {u.fq2}"] if config['fastq']['paired'] == True else f"-U {u.fq1}"
return [f"-1 {u.fq1} -2 {u.fq2}"] if config['fastq']['paired'] else f"-U {u.fq1}"
elif rules in ["STARalign", "STARalign_input"]:
return [u.fq1, u.fq2] if config['fastq']['paired'] else [u.fq1]
else:
return [u.fq1, u.fq2] if config['fastq']['paired'] == True else [u.fq1]
return [u.fq1, u.fq2] if config['fastq']['paired'] else [u.fq1]





def getBAM(wildcards):
@@ -283,4 +287,4 @@ def welcome(version):
print(f"Author: Sanjay Curtis Nagi")
print(f"Workflow Version: {version}")
print("Execution time: ", datetime.datetime.now())
print(f"Dataset: {config['dataset']}", "\n")
print(f"Dataset: {config['dataset']}", "\n")
35 changes: 17 additions & 18 deletions workflow/rules/hisat2-freebayes.smk
@@ -1,22 +1,21 @@

rule HISAT2index:
"""
Make a HISAT2 index of the reference genome
"""
input:
fasta=config["reference"]["genome"].rstrip(".gz"),
output:
"resources/reference/ht2index/idx.1.ht2",
touch("resources/reference/ht2index/.complete"),
log:
"logs/HISAT2/HISAT2index.log",
conda:
"../envs/variants.yaml"
params:
prefix=lambda w, output: output[0].split(os.extsep)[0],
threads: 8
shell:
"hisat2-build -p {threads} {input.fasta} {params.prefix} 2> {log}"
"""
Make a HISAT2 index of the reference genome
"""
input:
fasta=config["reference"]["genome"].rstrip(".gz"),
output:
"resources/reference/ht2index/idx.1.ht2",
touch("resources/reference/ht2index/.complete"),
log:
"logs/HISAT2/HISAT2index.log",
conda:
"../envs/variants.yaml"
params:
prefix=lambda w, output: output[0].split(os.extsep)[0],
threads: 8
shell:
"hisat2-build -p {threads} {input.fasta} {params.prefix} 2> {log}"


rule HISAT2align:
14 changes: 5 additions & 9 deletions workflow/rules/qc.smk
@@ -23,10 +23,9 @@ rule CheckInputs:
"../scripts/checkInputs.py"



rule fastp:
input:
sample = getFASTQs,
sample=getFASTQs,
output:
trimmed=["resources/reads/trimmed/{sample}_1.fastq.gz", "resources/reads/trimmed/{sample}_2.fastq.gz"] if config['fastq']['paired'] else ["resources/reads/trimmed/{sample}_1.fastq.gz"],
html="results/qc/{sample}.html",
@@ -37,30 +36,27 @@ rule fastp:
wrapper:
"v2.2.1/bio/fastp"



rule BamStats:
"""
QC alignment statistics
"""
input:
bam="results/alignments/{sample}.star.bam" if config['pipeline'] == 'parabricks' else "results/alignments/{sample}.hisat2.bam",
idx="results/alignments/{sample}.star.bam.bai" if config['pipeline'] == 'parabricks' else "results/alignments/{sample}.hisat2.bam.bai",
bam="results/alignments/{sample}.star.bam", # if config['aligner'] == 'STAR' else "results/alignments/{sample}.hisat2.bam",
idx="results/alignments/{sample}.star.bam.bai", # if config['aligner'] == 'STAR' else "results/alignments/{sample}.hisat2.bam.bai",
output:
stats="results/qc/alignments/{sample}.flagstat",
log:
"logs/BamStats/{sample}.log",
wrapper:
"v3.12.1/bio/samtools/flagstat"


rule Coverage:
"""
Calculate coverage with mosdepth
"""
input:
bam="results/alignments/{sample}.star.bam" if config['pipeline'] == 'parabricks' else "results/alignments/{sample}.hisat2.bam",
idx="results/alignments/{sample}.star.bam.bai" if config['pipeline'] == 'parabricks' else "results/alignments/{sample}.hisat2.bam.bai",
bam="results/alignments/{sample}.star.bam",# if config['aligner'] == 'STAR' else "results/alignments/{sample}.hisat2.bam",
idx="results/alignments/{sample}.star.bam.bai",# if config['aligner'] == 'STAR' else "results/alignments/{sample}.hisat2.bam.bai",
output:
"results/qc/coverage/{sample}.mosdepth.summary.txt",
log:
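The commented-out conditionals in BamStats and Coverage suggest a single aligner-aware path helper rather than per-rule switches. A hypothetical sketch (the getAlignmentPath name and the config['aligner'] key are assumptions, not part of this PR):

def getAlignmentPath(wildcards, suffix=".bam"):
    # Hypothetical: derive BAM/BAI paths from one aligner setting
    aligner = config.get("aligner", "STAR").lower()  # assumed config key
    return f"results/alignments/{wildcards.sample}.{aligner}{suffix}"

Rules could then declare bam=getAlignmentPath and idx=lambda wc: getAlignmentPath(wc, ".bam.bai").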
125 changes: 125 additions & 0 deletions workflow/rules/star-freebayes.smk
@@ -0,0 +1,125 @@
rule STARindex:
"""
Make a STAR index of the reference genome.
"""
input:
gtfFile=config["reference"]["gff"].rstrip(".gz"),
fasta=config["reference"]["genome"].rstrip(".gz"),
output:
directory("resources/reference/starindex/"), # Flagging the output as a directory
touch("resources/reference/starindex/.complete"), # Completion tracking
log:
"logs/STAR/STARindex.log",
conda:
"../envs/variants.yaml"
params:
prefix="resources/reference/starindex/",
threads: 8
shell:
"""
STAR --runMode genomeGenerate --genomeDir {params.prefix} --genomeFastaFiles {input.fasta} --sjdbGTFfile {input.gtfFile} --sjdbGTFtagExonParentTranscript Parent --sjdbOverhang 100 --runThreadN {threads} --outFileNamePrefix {params.prefix} 2> {log}
"""

rule STARalign:
"""
Align reads to the genome with STAR, generate an unsorted BAM, and then sort with samtools.
"""
input:
reads=lambda wildcards: getFASTQs(wildcards=wildcards, rules="STARalign"),
idx="resources/reference/starindex/"
output:
"results/alignments/{sample}.star.bam"
log:
align="logs/STAR/{sample}_align.log", # Log for alignment
sort="logs/samtoolsSort/{sample}.log" # Log for sorting
conda:
"../envs/variants.yaml" # Conda environment
params:
readflags=lambda wildcards: " ".join(getFASTQs(wildcards=wildcards, rules="STARalign")),
extra="--outSAMtype BAM Unsorted", # STAR output type
rg=lambda wildcards: f"--outSAMattrRGline ID:{wildcards.sample} SM:{wildcards.sample} PL:ILLUMINA"
threads: 12
shell:
"""
# Generate unsorted BAM with STAR
STAR --genomeDir {input.idx} --readFilesIn {params.readflags} --readFilesCommand zcat {params.extra} {params.rg} --runThreadN {threads} --outFileNamePrefix results/alignments/{wildcards.sample}. 2> {log.align}

# Sort the BAM file with samtools, following output file convention
samtools sort -@{threads} -o {output} results/alignments/{wildcards.sample}.Aligned.out.bam 2>> {log.sort}

# Remove intermediate unsorted BAM to save space
rm results/alignments/{wildcards.sample}.Aligned.out.bam
"""

chunks = np.arange(1, config["VariantAnalysis"]["chunks"])

rule GenerateFreebayesParams:
input:
ref_idx=config["reference"]["genome"].rstrip(".gz"),
index=config["reference"]["genome"].rstrip(".gz") + ".fai",
bams=expand("results/alignments/{sample}.star.bam", sample=samples),
output:
bamlist="results/alignments/bam.list",
pops="results/alignments/populations.tsv",
regions=expand(
"results/variantAnalysis/regions/genome.{contig}.region.{i}.bed",
contig=config["contigs"],
i=chunks,
),
log:
"logs/GenerateFreebayesParams.log",
params:
aligner='STAR',
metadata=config["metadata"],
contigs=config["contigs"],
chunks=config["VariantAnalysis"]["chunks"],
conda:
"../envs/diffexp.yaml"
script:
"../scripts/starGenerateFreebayesParams.R"


rule VariantCallingFreebayes:
"""
Run freebayes on chunks of the genome, splitting the samples by population (strain)
"""
input:
bams=expand("results/alignments/{sample}.star.bam", sample=samples),
index=expand("results/alignments/{sample}.star.bam.bai", sample=samples),
ref=config["reference"]["genome"].rstrip(".gz"),
samples="results/alignments/bam.list",
pops="results/alignments/populations.tsv",
regions="results/variantAnalysis/regions/genome.{contig}.region.{i}.bed",
output:
temp("results/variantAnalysis/vcfs/freebayes/{contig}/variants.{i}.vcf"),
log:
"logs/VariantCallingFreebayes/{contig}.{i}.log",
params:
ploidy=config["VariantAnalysis"]["ploidy"],
conda:
"../envs/variants.yaml"
threads: 8
shell:
"""
freebayes -f {input.ref} -t {input.regions} --ploidy {params.ploidy} --populations {input.pops} --pooled-discrete --use-best-n-alleles 5 --min-alternate-fraction 0.05 -L {input.samples} > {output} 2> {log}
"""


rule ConcatVCFs:
"""
Concatenate VCFs together
"""
input:
calls=expand(
"results/variantAnalysis/vcfs/freebayes/{{contig}}/variants.{i}.vcf",
i=chunks,
),
output:
temp("results/variantAnalysis/vcfs/freebayes/variants.{contig}.vcf"),
log:
"logs/ConcatVCFs/{contig}.log",
conda:
"../envs/variants.yaml"
threads: 4
shell:
"bcftools concat {input.calls} | vcfuniq > {output} 2> {log}"
14 changes: 9 additions & 5 deletions workflow/rules/utilities.smk
@@ -44,13 +44,17 @@ rule IndexBams:
Index bams with samtools
"""
input:
bam="results/alignments/{sample}.hisat2.bam",
bam="results/alignments/{sample}.star.bam", #if config['aligner'] == 'STAR' else "results/alignments/{sample}.hisat2.bam",
output:
idx="results/alignments/{sample}.hisat2.bam.bai",
idx="results/alignments/{sample}.star.bam.bai", #if config['aligner'] == 'STAR' else "results/alignments/{sample}.hisat2.bam",
log:
"logs/IndexBams/{sample}.log",
wrapper:
"v1.15.0/bio/samtools/index"
conda:
"../envs/variants.yaml"
shell:
"""
samtools index {input.bam} {output.idx} 2> {log}
"""


rule RestrictToSNPs:
@@ -66,4 +70,4 @@ rule RestrictToSNPs:
params:
extra="-v snps",
wrapper:
"v1.15.0/bio/bcftools/view"
"v1.15.0/bio/bcftools/view"
4 changes: 2 additions & 2 deletions workflow/rules/variantsOfInterest.smk
@@ -3,8 +3,8 @@ rule mpileupVariantsOfInterest:
Get allele count tables of variants of choice (specified in config file ("IRmutations.tsv"))
"""
input:
bam="results/alignments/{sample}.star.bam" if config['pipeline'] == 'parabricks' else "results/alignments/{sample}.hisat2.bam",
idx="results/alignments/{sample}.star.bam.bai" if config['pipeline'] == 'parabricks' else "results/alignments/{sample}.hisat2.bam.bai",
bam="results/alignments/{sample}.star.bam", #if config['pipeline'] == 'parabricks' else "results/alignments/{sample}.hisat2.bam",
idx="results/alignments/{sample}.star.bam.bai", #if config['pipeline'] == 'parabricks' else "results/alignments/{sample}.hisat2.bam.bai",
output:
"results/variantAnalysis/variantsOfInterest/counts/{sample}_{mut}_allele_counts.tsv",
conda: