Skip to content

Commit

Permalink
Merge pull request #257 from metagenome-atlas/GTDB
Browse files Browse the repository at this point in the history
Finalize GTDB database with Tree
  • Loading branch information
SilasK authored Oct 30, 2019
2 parents 028dfd1 + ea3b0c3 commit 3afa564
Show file tree
Hide file tree
Showing 18 changed files with 275 additions and 214 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,11 @@ Atlas is still under active development; therefore, you may want to install the

Create a conda environment with all primary dependencies. All further dependencies are installed on the fly.
```
conda create -n atlas -c conda-forge -c bioconda python=3.6 snakemake pandas bbmap=37.78 click=7 ruamel.yaml biopython
conda create -n atlasenv -c conda-forge -c bioconda python=3.6 snakemake pandas bbmap=37.78 click=7 ruamel.yaml biopython
```
Load the environment:
```
source activate atlas
source activate atlasenv
```
copy code from GitHub and install:
```
Expand Down
9 changes: 4 additions & 5 deletions atlas/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,6 @@ include: "rules/genomes.smk"
include: "rules/genecatalog.snakefile"
include: "rules/cat_taxonomy.smk"
include: "rules/gtdbtk.smk"
include: "rules/tree.smk"



Expand Down Expand Up @@ -237,14 +236,14 @@ def get_genome_annotations():
annotation_file_names={"cat_taxonomy":"genomes/taxonomy/taxonomy.tsv",
"ssu":"genomes/SSU/ssu_summary.tsv",
"checkm_taxonomy":"genomes/checkm/taxonomy.tsv",
"checkm_tree": "genomes/tree/tree.nwk",
"gtdb_tree":"genomes/taxonomy/gtdbtk.unrooted.tree",
"gtdb_taxonomy":"genomes/taxonomy/gtdbtk.bac120.summary.tsv"}
"checkm_tree": "genomes/tree/checkm.nwk",
"gtdb_tree":"genomes/tree/finished_gtdb_trees",
"gtdb_taxonomy":"genomes/taxonomy/gtdb/classify"}

annotations_requested= config['annotations']

try:
annotations_files=["genomes/annotations/genes"]+[annotation_file_names[an] for an in annotations_requested]
annotations_files=["genomes/annotations/genes/predicted"]+[annotation_file_names[an] for an in annotations_requested]

except Exception as e:

Expand Down
1 change: 0 additions & 1 deletion atlas/envs/tree.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,5 @@ channels:
- defaults
dependencies:
- python=3.6
- pandas=0.20.3
- ete3=3.1.1
- fasttree=2.1.8
1 change: 1 addition & 0 deletions atlas/rules/binning.snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -304,6 +304,7 @@ rule run_checkm_lineage_wf:
bins = "{sample}/binning/{binner}/bins" # actualy path to fastas
output:
"{sample}/binning/{binner}/checkm/completeness.tsv",
"{sample}/binning/{binner}/checkm/storage/tree/concatenated.fasta"
params:
output_dir = lambda wc, output: os.path.dirname(output[0])
conda:
Expand Down
2 changes: 1 addition & 1 deletion atlas/rules/download.snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ rule download:
expand("{dir}/{filename}", dir=EGGNOG_DIR,
filename=["OG_fasta","eggnog.db","eggnog_proteins.dmnd","og2level.tsv"]),
CHECKMFILES,
CAT_flag_downloaded,
os.path.join(GTDBTK_DATA_PATH,'downloaded_success')



Expand Down
9 changes: 5 additions & 4 deletions atlas/rules/genecatalog.snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ import os


if config['genecatalog']['source']=='contigs':

#TODO: cat with python
localrules: concat_genes
rule concat_genes:
input:
Expand All @@ -21,13 +21,14 @@ else:
localrules: concat_genes
rule concat_genes:
input:
"genomes/annotations/genes"
faa= lambda wc: get_all_genes(wc,".faa"),
fna= lambda wc: get_all_genes(wc,".fna")
output:
faa= temp("Genecatalog/all_genes_unfiltered.faa"),
fna = temp("Genecatalog/all_genes_unfiltered.fna"),
shell:
" cat {input}/*.faa > {output.faa} ;"
" cat {input}/*.fna > {output.fna}"
" cat {input.faa} > {output.faa} ;"
" cat {input.fna} > {output.fna}"


localrules: filter_genes
Expand Down
77 changes: 47 additions & 30 deletions atlas/rules/genomes.smk
Original file line number Diff line number Diff line change
Expand Up @@ -54,9 +54,11 @@ rule merge_checkm:
sample= SAMPLES, binner= config['final_binner']),
taxonomy= expand("{sample}/binning/{binner}/checkm/taxonomy.tsv",
sample= SAMPLES, binner= config['final_binner']),

markers= expand("{sample}/binning/{binner}/checkm/storage/tree/concatenated.fasta",
sample= SAMPLES, binner= config['final_binner'])
output:
checkm="genomes/checkm/checkm_all_bins.tsv",
markers= "genomes/checkm/all_bins_markers.fasta"
run:

import pandas as pd
Expand All @@ -71,7 +73,9 @@ rule merge_checkm:
D= pd.concat(D,axis=0)
D.to_csv(output.checkm,sep='\t')


with open(output.markers,'wb') as fout:
for fasta in input.markers:
shutil.copyfileobj(open(fasta,'rb'),fout)



Expand Down Expand Up @@ -213,6 +217,7 @@ rule run_all_checkm_lineage_wf:
dir = genome_dir
output:
"genomes/checkm/completeness.tsv",
"genomes/checkm/storage/tree/concatenated.fasta"
params:
output_dir = lambda wc, output: os.path.dirname(output[0]),
conda:
Expand Down Expand Up @@ -434,43 +439,55 @@ rule combine_bined_coverages_MAGs:

Median_abund.to_csv(output.median_abund,sep='\t')

# rule predict_genes_genomes:
# input:
# dir=genome_dir
# output:
# directory("genomes/annotations/genes")
# conda:
# "%s/required_packages.yaml" % CONDAENV
# log:
# "logs/genomes/prodigal.log"
# shadow:
# "shallow"
# threads:
# config.get("threads", 1)
# script:
# "predict_genes_of_genomes.py"



rule predict_genes_genomes:
input:
dir=genome_dir
"genomes/genomes/{genome}.fasta"
output:
directory("genomes/annotations/genes")
fna = "genomes/annotations/genes/{genome}.fna",
faa = "genomes/annotations/genes/{genome}.faa",
gff = "genomes/annotations/genes/{genome}.gff"
conda:
"%s/required_packages.yaml" % CONDAENV
log:
"logs/genomes/prodigal.log"
shadow:
"shallow"
"logs/genomes/prodigal/{genome}.txt"
threads:
config.get("threads", 1)
script:
"predict_genes_of_genomes.py"


1
resources:
mem= config['simplejob_mem']
shell:
"""
prodigal -i {input} -o {output.gff} -d {output.fna} \
-a {output.faa} -p meta -f gff 2> >(tee {log})
"""

# rule predict_genes_genomes:
# input:
# "genomes/genomes/{genome}.fasta"
# output:
# fna = "genomes/annotations/genes/{genome}.fna",
# faa = "genomes/annotations/genes/{genome}.faa",
# gff = "genomes/annotations/genes/{genome}.gff"
# conda:
# "%s/required_packages.yaml" % CONDAENV
# log:
# "logs/genomes/prodigal/{genome}.txt"
# threads:
# 1
# shell:
# """
# prodigal -i {input} -o {output.gff} -d {output.fna} \
# -a {output.faa} -p meta -f gff 2> >(tee {log})
# """
def get_all_genes(wildcards,extension='.faa'):
return expand("genomes/annotations/genes/{genome}{extension}",
genome=get_genomes_(wildcards),extension=extension)

localrules: all_prodigal
rule all_prodigal:
input:
get_all_genes
output:
touch("genomes/annotations/genes/predicted")



Expand Down
92 changes: 64 additions & 28 deletions atlas/rules/gtdbtk.smk
Original file line number Diff line number Diff line change
@@ -1,66 +1,69 @@


gtdb_dir="genomes/taxonomy/gtdb"


rule identify:
input:
dir=genome_dir,
flag= rules.download_gtdb.output
output:
directory("genomes/taxonomy/identify")
directory(f"{gtdb_dir}/identify")
threads:
config['threads']
conda:
"../envs/gtdbtk.yaml"
log:
"logs/taxonomy/gtdbtk_identify.txt",
"genomes/taxonomy/gtdbtk.log"
"logs/taxonomy/gtdbtk/identify.txt",
f"{gtdb_dir}/gtdbtk.log"
params:
outdir= "genomes/taxonomy",
outdir= gtdb_dir,
extension="fasta",
shell:
"GTDBTK_DATA_PATH={GTDBTK_DATA_PATH} ; "
"gtdbtk identify --genome_dir {input.dir} --out_dir {params.outdir} "
"--extension {params.extension} "
"--cpus {threads} &> {log[0]}"

rule align:
checkpoint align:
input:
"genomes/taxonomy/identify"
f"{gtdb_dir}/identify"
output:
"genomes/taxonomy/gtdbtk.bac120.user_msa.fasta"
directory(f"{gtdb_dir}/align")
threads:
config['threads']
conda:
"../envs/gtdbtk.yaml"
log:
"logs/taxonomy/gtdbtk_align.txt",
"genomes/taxonomy/gtdbtk.log"
"logs/taxonomy/gtdbtk/align.txt",
f"{gtdb_dir}/gtdbtk.log"
params:
outdir= "genomes/taxonomy"
outdir= gtdb_dir
shell:
"GTDBTK_DATA_PATH={GTDBTK_DATA_PATH} ; "
"gtdbtk align --identify_dir {params.outdir} --out_dir {params.outdir} "
"--cpus {threads} &> {log[0]}"




rule classify:
input:
rules.align.output,
genome_dir=genome_dir,
output:
"genomes/taxonomy/gtdbtk.bac120.summary.tsv",
directory(f"{gtdb_dir}/classify"),
threads:
8 #pplacer needs much memory for not many threads
config['threads'] #pplacer needs much memory for not many threads
resources:
mem=config['large_mem']
conda:
"../envs/gtdbtk.yaml"
log:
"logs/taxonomy/gtdbtk_classify.txt",
"genomes/taxonomy/gtdbtk.log"
"logs/taxonomy/gtdbtk/classify.txt",
f"{gtdb_dir}/gtdbtk.log"
params:
outdir= "genomes/taxonomy",
outdir= gtdb_dir,
extension="fasta",
shell:
"GTDBTK_DATA_PATH={GTDBTK_DATA_PATH} ; "
Expand All @@ -69,22 +72,55 @@ rule classify:
"--extension {params.extension} "
"--cpus {threads} &> {log[0]}"

msa_paths={'checkm':"genomes/checkm/storage/tree/concatenated.fasta",
'gtdbtk.bac120': f"{gtdb_dir}/align/gtdbtk.bac120.user_msa.fasta",
'gtdbtk.ar122': f"{gtdb_dir}/align/gtdbtk.ar122.user_msa.fasta"
}

rule infer:
rule fasttree:
input:
"genomes/taxonomy/gtdbtk.bac120.user_msa.fasta"
lambda wildcards: msa_paths[wildcards.msa]
output:
"genomes/taxonomy/gtdbtk.unrooted.tree"
temp("genomes/tree/{msa}.unrooted.nwk")
log:
"logs/genomes/tree/FastTree_{msa}.log"
threads:
config['threads']
max(config['threads'],3)
conda:
"../envs/gtdbtk.yaml"
log:
"logs/taxonomy/gtdbtk_infer.txt",
"genomes/taxonomy/gtdbtk.log"
params:
outdir="genomes/taxonomy"
"%s/tree.yaml" % CONDAENV
shell:
"GTDBTK_DATA_PATH={GTDBTK_DATA_PATH} ; "
"gtdbtk infer --msa_file {input} --out_dir {params.outdir} "
"--cpus {threads} &> {log[0]}"
"export OMP_NUM_THREADS={threads}; "
"FastTree -log {log} {input} > {output} "


localrules: root_tree
rule root_tree:
input:
tree="genomes/tree/{msa}.unrooted.nwk",
wildcard_constraints:
msa="((?!unrooted).)*"
output:
tree="genomes/tree/{msa}.nwk",
conda:
"%s/tree.yaml" % CONDAENV
threads:
1
log:
"logs/genomes/tree/root_tree_{msa}.log"
script:
"../scripts/root_tree.py"


def all_gtdb_trees_input(wildcards):
dir= checkpoints.align.get().output[0]

domains = glob_wildcards(f"{dir}/gtdbtk.{{domain}}.user_msa.fasta").domain

return expand("genomes/tree/gtdbtk.{domain}.nwk",domain=domains)


rule all_gtdb_trees:
input:
all_gtdb_trees_input
output:
touch("genomes/tree/finished_gtdb_trees")
31 changes: 0 additions & 31 deletions atlas/rules/tree.smk

This file was deleted.

Loading

0 comments on commit 3afa564

Please sign in to comment.