diff --git a/README.md b/README.md index d9d44a38..82d9609f 100644 --- a/README.md +++ b/README.md @@ -36,11 +36,11 @@ Atlas is still under active development; therefore, you may want to install the Create a conda environment with all primary dependencies. All further dependencies are installed on the fly. ``` -conda create -n atlas -c conda-forge -c bioconda python=3.6 snakemake pandas bbmap=37.78 click=7 ruamel.yaml biopython +conda create -n atlasenv -c conda-forge -c bioconda python=3.6 snakemake pandas bbmap=37.78 click=7 ruamel.yaml biopython ``` Load the environment: ``` -source activate atlas +source activate atlasenv ``` copy code from GitHub and install: ``` diff --git a/atlas/Snakefile b/atlas/Snakefile index 993bd1a9..9e7531cc 100644 --- a/atlas/Snakefile +++ b/atlas/Snakefile @@ -204,7 +204,6 @@ include: "rules/genomes.smk" include: "rules/genecatalog.snakefile" include: "rules/cat_taxonomy.smk" include: "rules/gtdbtk.smk" -include: "rules/tree.smk" @@ -237,14 +236,14 @@ def get_genome_annotations(): annotation_file_names={"cat_taxonomy":"genomes/taxonomy/taxonomy.tsv", "ssu":"genomes/SSU/ssu_summary.tsv", "checkm_taxonomy":"genomes/checkm/taxonomy.tsv", - "checkm_tree": "genomes/tree/tree.nwk", - "gtdb_tree":"genomes/taxonomy/gtdbtk.unrooted.tree", - "gtdb_taxonomy":"genomes/taxonomy/gtdbtk.bac120.summary.tsv"} + "checkm_tree": "genomes/tree/checkm.nwk", + "gtdb_tree":"genomes/tree/finished_gtdb_trees", + "gtdb_taxonomy":"genomes/taxonomy/gtdb/classify"} annotations_requested= config['annotations'] try: - annotations_files=["genomes/annotations/genes"]+[annotation_file_names[an] for an in annotations_requested] + annotations_files=["genomes/annotations/genes/predicted"]+[annotation_file_names[an] for an in annotations_requested] except Exception as e: diff --git a/atlas/envs/tree.yaml b/atlas/envs/tree.yaml index 4feaa3b4..f87eedb9 100644 --- a/atlas/envs/tree.yaml +++ b/atlas/envs/tree.yaml @@ -4,6 +4,5 @@ channels: - defaults dependencies: - 
python=3.6 - - pandas=0.20.3 - ete3=3.1.1 - fasttree=2.1.8 diff --git a/atlas/rules/binning.snakefile b/atlas/rules/binning.snakefile index f1863d84..aa52718a 100644 --- a/atlas/rules/binning.snakefile +++ b/atlas/rules/binning.snakefile @@ -304,6 +304,7 @@ rule run_checkm_lineage_wf: bins = "{sample}/binning/{binner}/bins" # actualy path to fastas output: "{sample}/binning/{binner}/checkm/completeness.tsv", + "{sample}/binning/{binner}/checkm/storage/tree/concatenated.fasta" params: output_dir = lambda wc, output: os.path.dirname(output[0]) conda: diff --git a/atlas/rules/download.snakefile b/atlas/rules/download.snakefile index af199411..eab45897 100644 --- a/atlas/rules/download.snakefile +++ b/atlas/rules/download.snakefile @@ -89,7 +89,7 @@ rule download: expand("{dir}/{filename}", dir=EGGNOG_DIR, filename=["OG_fasta","eggnog.db","eggnog_proteins.dmnd","og2level.tsv"]), CHECKMFILES, - CAT_flag_downloaded, + os.path.join(GTDBTK_DATA_PATH,'downloaded_success') diff --git a/atlas/rules/genecatalog.snakefile b/atlas/rules/genecatalog.snakefile index da339084..cfe1d078 100644 --- a/atlas/rules/genecatalog.snakefile +++ b/atlas/rules/genecatalog.snakefile @@ -3,7 +3,7 @@ import os if config['genecatalog']['source']=='contigs': - +#TODO: cat with python localrules: concat_genes rule concat_genes: input: @@ -21,13 +21,14 @@ else: localrules: concat_genes rule concat_genes: input: - "genomes/annotations/genes" + faa= lambda wc: get_all_genes(wc,".faa"), + fna= lambda wc: get_all_genes(wc,".fna") output: faa= temp("Genecatalog/all_genes_unfiltered.faa"), fna = temp("Genecatalog/all_genes_unfiltered.fna"), shell: - " cat {input}/*.faa > {output.faa} ;" - " cat {input}/*.fna > {output.fna}" + " cat {input.faa} > {output.faa} ;" + " cat {input.fna} > {output.fna}" localrules: filter_genes diff --git a/atlas/rules/genomes.smk b/atlas/rules/genomes.smk index f42e2e3d..89895ae0 100644 --- a/atlas/rules/genomes.smk +++ b/atlas/rules/genomes.smk @@ -54,9 +54,11 @@ rule 
merge_checkm: sample= SAMPLES, binner= config['final_binner']), taxonomy= expand("{sample}/binning/{binner}/checkm/taxonomy.tsv", sample= SAMPLES, binner= config['final_binner']), - + markers= expand("{sample}/binning/{binner}/checkm/storage/tree/concatenated.fasta", + sample= SAMPLES, binner= config['final_binner']) output: checkm="genomes/checkm/checkm_all_bins.tsv", + markers= "genomes/checkm/all_bins_markers.fasta" run: import pandas as pd @@ -71,7 +73,9 @@ rule merge_checkm: D= pd.concat(D,axis=0) D.to_csv(output.checkm,sep='\t') - + with open(output.markers,'wb') as fout: + for fasta in input.markers: + shutil.copyfileobj(open(fasta,'rb'),fout) @@ -213,6 +217,7 @@ rule run_all_checkm_lineage_wf: dir = genome_dir output: "genomes/checkm/completeness.tsv", + "genomes/checkm/storage/tree/concatenated.fasta" params: output_dir = lambda wc, output: os.path.dirname(output[0]), conda: @@ -434,43 +439,55 @@ rule combine_bined_coverages_MAGs: Median_abund.to_csv(output.median_abund,sep='\t') +# rule predict_genes_genomes: +# input: +# dir=genome_dir +# output: +# directory("genomes/annotations/genes") +# conda: +# "%s/required_packages.yaml" % CONDAENV +# log: +# "logs/genomes/prodigal.log" +# shadow: +# "shallow" +# threads: +# config.get("threads", 1) +# script: +# "predict_genes_of_genomes.py" + + + rule predict_genes_genomes: input: - dir=genome_dir + "genomes/genomes/{genome}.fasta" output: - directory("genomes/annotations/genes") + fna = "genomes/annotations/genes/{genome}.fna", + faa = "genomes/annotations/genes/{genome}.faa", + gff = "genomes/annotations/genes/{genome}.gff" conda: "%s/required_packages.yaml" % CONDAENV log: - "logs/genomes/prodigal.log" - shadow: - "shallow" + "logs/genomes/prodigal/{genome}.txt" threads: - config.get("threads", 1) - script: - "predict_genes_of_genomes.py" - - + 1 + resources: + mem= config['simplejob_mem'] + shell: + """ + prodigal -i {input} -o {output.gff} -d {output.fna} \ + -a {output.faa} -p meta -f gff 2> >(tee {log}) + 
""" -# rule predict_genes_genomes: -# input: -# "genomes/genomes/{genome}.fasta" -# output: -# fna = "genomes/annotations/genes/{genome}.fna", -# faa = "genomes/annotations/genes/{genome}.faa", -# gff = "genomes/annotations/genes/{genome}.gff" -# conda: -# "%s/required_packages.yaml" % CONDAENV -# log: -# "logs/genomes/prodigal/{genome}.txt" -# threads: -# 1 -# shell: -# """ -# prodigal -i {input} -o {output.gff} -d {output.fna} \ -# -a {output.faa} -p meta -f gff 2> >(tee {log}) -# """ +def get_all_genes(wildcards,extension='.faa'): + return expand("genomes/annotations/genes/{genome}{extension}", + genome=get_genomes_(wildcards),extension=extension) +localrules: all_prodigal +rule all_prodigal: + input: + get_all_genes + output: + touch("genomes/annotations/genes/predicted") diff --git a/atlas/rules/gtdbtk.smk b/atlas/rules/gtdbtk.smk index cd34b9af..cd34ea75 100644 --- a/atlas/rules/gtdbtk.smk +++ b/atlas/rules/gtdbtk.smk @@ -1,5 +1,6 @@ +gtdb_dir="genomes/taxonomy/gtdb" rule identify: @@ -7,16 +8,16 @@ rule identify: dir=genome_dir, flag= rules.download_gtdb.output output: - directory("genomes/taxonomy/identify") + directory(f"{gtdb_dir}/identify") threads: config['threads'] conda: "../envs/gtdbtk.yaml" log: - "logs/taxonomy/gtdbtk_identify.txt", - "genomes/taxonomy/gtdbtk.log" + "logs/taxonomy/gtdbtk/identify.txt", + f"{gtdb_dir}/gtdbtk.log" params: - outdir= "genomes/taxonomy", + outdir= gtdb_dir, extension="fasta", shell: "GTDBTK_DATA_PATH={GTDBTK_DATA_PATH} ; " @@ -24,43 +25,45 @@ rule identify: "--extension {params.extension} " "--cpus {threads} &> {log[0]}" -rule align: +checkpoint align: input: - "genomes/taxonomy/identify" + f"{gtdb_dir}/identify" output: - "genomes/taxonomy/gtdbtk.bac120.user_msa.fasta" + directory(f"{gtdb_dir}/align") threads: config['threads'] conda: "../envs/gtdbtk.yaml" log: - "logs/taxonomy/gtdbtk_align.txt", - "genomes/taxonomy/gtdbtk.log" + "logs/taxonomy/gtdbtk/align.txt", + f"{gtdb_dir}/gtdbtk.log" params: - outdir= 
"genomes/taxonomy" + outdir= gtdb_dir shell: "GTDBTK_DATA_PATH={GTDBTK_DATA_PATH} ; " "gtdbtk align --identify_dir {params.outdir} --out_dir {params.outdir} " "--cpus {threads} &> {log[0]}" + + rule classify: input: rules.align.output, genome_dir=genome_dir, output: - "genomes/taxonomy/gtdbtk.bac120.summary.tsv", + directory(f"{gtdb_dir}/classify"), threads: - 8 #pplacer needs much memory for not many threads + config['threads'] #pplacer needs much memory for not many threads resources: mem=config['large_mem'] conda: "../envs/gtdbtk.yaml" log: - "logs/taxonomy/gtdbtk_classify.txt", - "genomes/taxonomy/gtdbtk.log" + "logs/taxonomy/gtdbtk/classify.txt", + f"{gtdb_dir}/gtdbtk.log" params: - outdir= "genomes/taxonomy", + outdir= gtdb_dir, extension="fasta", shell: "GTDBTK_DATA_PATH={GTDBTK_DATA_PATH} ; " @@ -69,22 +72,55 @@ rule classify: "--extension {params.extension} " "--cpus {threads} &> {log[0]}" +msa_paths={'checkm':"genomes/checkm/storage/tree/concatenated.fasta", + 'gtdbtk.bac120': f"{gtdb_dir}/align/gtdbtk.bac120.user_msa.fasta", + 'gtdbtk.ar122': f"{gtdb_dir}/align/gtdbtk.ar122.user_msa.fasta" +} -rule infer: +rule fasttree: input: - "genomes/taxonomy/gtdbtk.bac120.user_msa.fasta" + lambda wildcards: msa_paths[wildcards.msa] output: - "genomes/taxonomy/gtdbtk.unrooted.tree" + temp("genomes/tree/{msa}.unrooted.nwk") + log: + "logs/genomes/tree/FastTree_{msa}.log" threads: - config['threads'] + max(config['threads'],3) conda: - "../envs/gtdbtk.yaml" - log: - "logs/taxonomy/gtdbtk_infer.txt", - "genomes/taxonomy/gtdbtk.log" - params: - outdir="genomes/taxonomy" + "%s/tree.yaml" % CONDAENV shell: - "GTDBTK_DATA_PATH={GTDBTK_DATA_PATH} ; " - "gtdbtk infer --msa_file {input} --out_dir {params.outdir} " - "--cpus {threads} &> {log[0]}" + "export OMP_NUM_THREADS={threads}; " + "FastTree -log {log} {input} > {output} " + + +localrules: root_tree +rule root_tree: + input: + tree="genomes/tree/{msa}.unrooted.nwk", + wildcard_constraints: + msa="((?!unrooted).)*" + 
output: + tree="genomes/tree/{msa}.nwk", + conda: + "%s/tree.yaml" % CONDAENV + threads: + 1 + log: + "logs/genomes/tree/root_tree_{msa}.log" + script: + "../scripts/root_tree.py" + + +def all_gtdb_trees_input(wildcards): + dir= checkpoints.align.get().output[0] + + domains = glob_wildcards(f"{dir}/gtdbtk.{{domain}}.user_msa.fasta").domain + + return expand("genomes/tree/gtdbtk.{domain}.nwk",domain=domains) + + +rule all_gtdb_trees: + input: + all_gtdb_trees_input + output: + touch("genomes/tree/finished_gtdb_trees") diff --git a/atlas/rules/tree.smk b/atlas/rules/tree.smk deleted file mode 100644 index a43f2385..00000000 --- a/atlas/rules/tree.smk +++ /dev/null @@ -1,31 +0,0 @@ - - - -rule fasttree: - input: - "genomes/checkm/storage/tree/concatenated.fasta" - output: - "genomes/checkm/storage/tree/fasttree.nwk" - log: - "logs/genomes/tree/FastTree.log" - threads: - max(config['threads'],3) - conda: - "%s/tree.yaml" % CONDAENV - shell: - "export OMP_NUM_THREADS={threads}; " - "FastTree -log {log} {input} > {output} " - -localrules: root_tree -rule root_tree: - input: - tree="genomes/checkm/storage/tree/fasttree.nwk", - taxonomy="genomes/checkm/taxonomy.tsv" - output: - tree="genomes/tree/tree.nwk", - conda: - "%s/tree.yaml" % CONDAENV - threads: - 1 - script: - "../scripts/utils/tree.py" diff --git a/atlas/scripts/root_tree.py b/atlas/scripts/root_tree.py new file mode 100644 index 00000000..7bac694d --- /dev/null +++ b/atlas/scripts/root_tree.py @@ -0,0 +1,10 @@ +import sys +import ete3 + + +sys.stdout= open(snakemake.log[0],"w") + +T= ete3.Tree(snakemake.input.tree,quoted_node_names=True,format=1) +T.unroot() +T.set_outgroup(T.get_midpoint_outgroup()) +T.write(outfile=snakemake.output.tree) diff --git a/atlas/scripts/utils/parsers_checkm.py b/atlas/scripts/utils/parsers_checkm.py index 66072898..17e64cd1 100644 --- a/atlas/scripts/utils/parsers_checkm.py +++ b/atlas/scripts/utils/parsers_checkm.py @@ -1,7 +1,6 @@ - import pandas as pd -import os -import warnings 
+ + def read_checkm_output(taxonomy_table, completness_table): @@ -23,22 +22,3 @@ def read_checkm_output(taxonomy_table, completness_table): ] df = pd.concat([c_df, t_df], axis=1) return df - - -def load_checkm_tax(checkm_taxonomy_file): - - checkmTax= pd.read_table(checkm_taxonomy_file,index_col=0) - - checkmTax = checkmTax['Taxonomy (contained)'] - - if checkmTax.isnull().any(): - warnings.warn("Some samples have no taxonomy asigned based on checkm. Samples:\n"+ \ - ', '.join(checkmTax.index[checkmTax.isnull()]) - ) - checkmTax= checkmTax.dropna().astype(str) - - checkmTax= pd.DataFrame(list( checkmTax.apply(lambda s: s.split(';'))), - index=checkmTax.index) - - checkmTax.columns=['kindom','phylum','class','order','family','genus','species'] - return checkmTax diff --git a/atlas/scripts/utils/taxonomy.py b/atlas/scripts/utils/taxonomy.py new file mode 100644 index 00000000..bae5fb1e --- /dev/null +++ b/atlas/scripts/utils/taxonomy.py @@ -0,0 +1,51 @@ +import pandas as pd +import numpy as np +import warnings + +TAXONMIC_LEVELS=['Domain','phylum','class','order','family','genus','species'] + +def tax2table(Taxonomy_Series,split_character=';',remove_prefix=False): + """ + Transforms (green_genes) taxonomy to a table + Expect the following input format: + d__Bacteria;p__Bacteroidota;c__Bacteroidia;f__ + + Replaces empty values and can remove prefix 'c__' + + """ + + if Taxonomy_Series.isnull().any(): + warnings.warn("Some samples have no taxonomy asigned. 
Samples:\n"+ \ + ', '.join(Taxonomy_Series.index[Taxonomy_Series.isnull()]) + ) + Taxonomy_Series= Taxonomy_Series.dropna().astype(str) + + Tax= pd.DataFrame(list( Taxonomy_Series.apply(lambda s: s.split(split_character))), + index=Taxonomy_Series.index) + + # rows with fewer levels than the deepest one are padded with None; they become NaN below + Tax.columns= TAXONMIC_LEVELS[:len(Tax.columns)] + + if remove_prefix: + Tax=Tax.applymap(lambda s: s[3:] if isinstance(s,str) else '').replace('',np.nan) + else: + Tax=Tax.where(Tax.applymap(lambda s: isinstance(s,str) and len(s)!=3)) + + return Tax + +def load_checkm_tax(taxonomy_file,remove_prefix=False): + """Read a tab-separated CheckM taxonomy table and split its 'Taxonomy (contained)' column into one column per level.""" + D= pd.read_table(taxonomy_file,index_col=0) + + checkmTax = tax2table(D['Taxonomy (contained)'],remove_prefix=remove_prefix) + + return checkmTax + + +def load_gtdb_tax(taxonomy_file,remove_prefix=False): + """Read a tab-separated GTDB taxonomy table and split its 'classification' column into one column per level.""" + D= pd.read_table(taxonomy_file,index_col=0) + + gtdbTax = tax2table(D['classification'],remove_prefix=remove_prefix) + + return gtdbTax diff --git a/atlas/scripts/utils/tree.py b/atlas/scripts/utils/tree.py index 0dc83b6c..d552546a 100644 --- a/atlas/scripts/utils/tree.py +++ b/atlas/scripts/utils/tree.py @@ -5,47 +5,30 @@ import pandas as pd import warnings -#from . import parsers_checkm -def load_checkm_tax(checkm_taxonomy_file): - - checkmTax= pd.read_table(checkm_taxonomy_file,index_col=0) - - checkmTax = checkmTax['Taxonomy (contained)'] - - if checkmTax.isnull().any(): - warnings.warn("Some samples have no taxonomy asigned based on checkm. 
Samples:\n"+ \ - ', '.join(checkmTax.index[checkmTax.isnull()]) - ) - checkmTax= checkmTax.dropna().astype(str) - - checkmTax= pd.DataFrame(list( checkmTax.apply(lambda s: s.split(';'))), - index=checkmTax.index) - - checkmTax.columns=['kindom','phylum','class','order','family','genus','species'] - return checkmTax - def load_tree(netwik_file): return ete3.Tree(netwik_file,quoted_node_names=True,format=1) def root_tree_by_phyla(T,phyla): - """ Root the tree next to the least frequent phyla if possible + """ Root the tree next to the phylum that is as far apart as possible from the other phyla """ + phylum_LCA={} + for p in phyla.unique(): + phylum_LCA[p]=T.get_common_ancestor(*tuple(phyla.index[phyla==p].values)) - Freq_pyla= phyla.value_counts() - - for p in reversed(Freq_pyla.index): - LCA = T.get_common_ancestor(*tuple(phyla.index[phyla==p].values)) + Dist= pd.DataFrame() + for p1,lca1 in phylum_LCA.items(): + for p2,lca2 in phylum_LCA.items(): + Dist.loc[p1,p2]=T.get_distance(lca1,lca2) - if not T== LCA: - T.set_outgroup(LCA) - print(f"set {p} as outgroup for Tree rooting") - break + furthest_phylum= Dist.mean().idxmax() + outgroup=phylum_LCA[furthest_phylum] + if not outgroup== T: + T.set_outgroup(outgroup) - T.unroot() def layout_black_circles(node): # If node is a leaf @@ -64,14 +47,3 @@ def render_tree(T,out): ts.show_scale=False T.render(out,tree_style=ts,layout=layout_black_circles) - - -if __name__ == "__main__": - - - T= load_tree(snakemake.input.tree) - phyla= load_checkm_tax(snakemake.input.taxonomy).phylum - - root_tree_by_phyla(T,phyla) - - T.write(outfile=snakemake.output.tree) diff --git a/atlas/template_config.yaml b/atlas/template_config.yaml index e3a3a8cb..498721cc 100644 --- a/atlas/template_config.yaml +++ b/atlas/template_config.yaml @@ -9,7 +9,7 @@ #### #### ################################################################### # For more details about the config values see: -# https://metagenome-atlas.readthedocs.io +# 
https://metagenome-atlas.rtfd.io ################################################################### ######################## @@ -19,7 +19,7 @@ threads: 32 # Memory for most jobs especially from BBtools, which are memory demanding mem: 32 -# Memory and threads for jobs needing high amount of memory. e.g CAT taxonomy GTDB-tk +# Memory and threads for jobs needing high amount of memory. e.g. GTDB-Tk large_mem: 250 large_threads: 16 # can be a subset of threads or altered if rule run_spades or run_megahit are being defined differently in your cluster configuration @@ -179,19 +179,11 @@ genome_dereplication: annotations: - gtdb_tree - gtdb_taxonomy - - checkm_taxonomy -# - ssu -# - cat_taxonomy +# - checkm_taxonomy +# - checkm_tree + + -######################## -# taxonomy -####################### -# Diamond needs up to 100 GB of memory for building the taxonomy database -# If you prefer you can dowload an existing CAT database, see docs. -# number of top hits considered for taxonomic annotation -cat_range: 5 -# fraction of support needed for classification, <0.5 can give rise to duble classification. -cat_fraction: 0.3 ######################## diff --git a/docs/details/configuration.rst b/docs/details/configuration.rst index d5a7f085..411f0d1c 100644 --- a/docs/details/configuration.rst +++ b/docs/details/configuration.rst @@ -34,3 +34,4 @@ Detailed configuration ../advanced/qc ../advanced/assembly + longreads diff --git a/docs/details/longreads.rst b/docs/details/longreads.rst new file mode 100644 index 00000000..36feae5d --- /dev/null +++ b/docs/details/longreads.rst @@ -0,0 +1,14 @@ + +.. _longreads: + +Use long reads with spades +========================== + +Limitation: Hybrid assembly of long and short reads is supported with spades and metaSpades. +However metaSpades needs a paired-end short-read library. + +The path of the (preprocessed) long reads should be added manually to the +sample table under a new column heading 'longreads'. 
+ +In addition, the type of the long reads should be defined in the config file: +``longread_type``: one of ["pacbio", "nanopore", "sanger", "trusted-contigs", "untrusted-contigs"] diff --git a/docs/details/pipeline.rst b/docs/details/pipeline.rst index 4af24987..43dfc5ab 100644 --- a/docs/details/pipeline.rst +++ b/docs/details/pipeline.rst @@ -52,27 +52,23 @@ Besides the `reports/assembly_report.html`_ this rule outputs the following file - ``{sample}/{sample}_contigs.fasta`` - ``{sample}/sequence_alignment/{sample}.bam`` - - ``{sample}/assembly/contig_stats/postfilter_coverage_stats.txt`` - - ``{sample}/assembly/contig_stats/prefilter_contig_stats.txt`` - ``{sample}/assembly/contig_stats/final_contig_stats.txt`` .. _reports/assembly_report.html: ../_static/assembly_report.html -Genomes ---------------- -:: - - atlas run genomes - #or - atlas run all Binning -``````` +--------------- +:: + + atlas run binning + #or + atlas run all When you use different binners (e.g. metabat, maxbin) and a binner-reconciliator (e.g. DAS Tool), then Atlas will produce for each binner and sample: @@ -83,29 +79,52 @@ which shows the attribution of contigs to bins. For the final_binner it produces - ``reports/bin_report_{binner}.html`` -See an `example <../_static/bin_report.html>`_ +See an `example <../_static/bin_report.html>`_ as a summary of the quality of all bins. -As a summary of the quality of all bins. These bins are then De-replicated using DeRep. -The Metagenome assembled genomes are then renamed, but we keep mapping files. +Genomes +--------------- +:: - - ``genomes/Dereplication`` - - ``genomes/clustering/contig2genome.tsv`` - - ``genomes/clustering/allbins2genome.tsv`` + atlas run genomes + #or + atlas run all +As the binning can predict the same genome several times, it is recommended to de-replicate these genomes. +For now we use DeRep to filter and de-replicate the genomes. +The metagenome-assembled genomes are then renamed, but we keep mapping files. 
+ - ``genomes/Dereplication`` + - ``genomes/clustering/contig2genome.tsv`` + - ``genomes/clustering/allbins2genome.tsv`` -The main output files -`````````````````````` +The fasta sequences of the dereplicated and renamed genomes can be found in ``genomes/genomes`` +and their quality estimates are in ``genomes/checkm/completeness.tsv``. +The quantification of the genomes can be found in: - - ``genomes/genomes`` - - ``genomes/annotations/genes`` - - ``genomes/checkm/completeness.tsv`` - - ``genomes/taxonomy/taxonomy_names.tsv`` - ``genomes/counts/median_coverage_genomes.tsv`` - ``genomes/counts/raw_counts_genomes.tsv`` +See in `Atlas example `_ how to analyze these abundances. + +The predicted genes and translated protein sequences are in ``genomes/annotations/genes``. + +Taxonomic annotation +````````````````````` +:: + + annotations: + - gtdb_tree + - gtdb_taxonomy + - checkm_tree + - checkm_taxonomy +Different annotations can be turned on and off in the config file under the heading ``annotations``: +A taxonomy for the dereplicated genomes is proposed by GTDB-Tk. +The results can be found in ``genomes/taxonomy``. +The genomes are placed in a phylogenetic tree separately for bacteria and archaea (if there are any) using the GTDB markers. +In addition a tree for bacteria and archaea can be generated based on the checkm markers. +All trees are properly rooted using the midpoint. The files can be found in ``genomes/tree``. Gene Catalog @@ -117,7 +136,7 @@ Gene Catalog # or atlas run genecatalog -The gene catalog takes all genes predicted from the genomes and clusters them +The gene catalog takes either genes predicted from the genomes or all genes predicted on the contigs and clusters them according to the configuration. This rule produces the following output file for the whole dataset. 
diff --git a/docs/usage/getting_started.rst b/docs/usage/getting_started.rst index 399f5fc1..14af04f6 100644 --- a/docs/usage/getting_started.rst +++ b/docs/usage/getting_started.rst @@ -1,7 +1,7 @@ Install ======== -1a. Create conda environment +A. Create conda environment ---------------------------- You need to install anaconda or miniconda. @@ -15,13 +15,13 @@ Then install metagenome-atlas:: conda install -y -c bioconda -c conda-forge metagenome-atlas -1b. Install the development version from GitHub +B. Install the development version from GitHub ----------------------------------------------- Atlas is still under active development, therefore you may want to install the up to date atlas from GitHub. Create an conda environment with all primary dependencies. All further dependencies are installed on the fly:: - conda create -n atlasenv -c bioconda -c conda-forge python>=3.6 snakemake pandas bbmap=37.78 click=7 ruamel.yaml biopython + conda create -n atlasenv -c conda-forge -c bioconda python=3.6 snakemake pandas bbmap=37.78 click=7 ruamel.yaml biopython Load the environment:: @@ -41,14 +41,14 @@ Now you should be able to run atlas:: -2. Download all databases first -------------------------------- - -May be you want to make sure that all databases are downloaded correctly. Simply run:: - - atlas download --db-dir path/to/databases - -To reassure you, most of the databases are md5 checked. The downloads use approximately 30 GB of disk space. +.. 2. Download all databases first +.. ------------------------------- +.. +.. May be you want to make sure that all databases are downloaded correctly. Simply run:: +.. +.. atlas download --db-dir path/to/databases +.. +.. To reassure you, most of the databases are md5 checked. The downloads use approximately 30 GB of disk space. .. 3. Test installation .. 
-------------------- @@ -79,7 +79,7 @@ atlas init [default: /Users/silas/Documents/Debug_atlas /databases] -w, --working-dir PATH location to run atlas - --assembler [megahit|spades] assembler [default: megahit] + --assembler [megahit|spades] assembler [default: spades] --data-type [metagenome|metatranscriptome] sample data type [default: metagenome] --threads INTEGER number of threads to use per multi-threaded @@ -113,7 +113,7 @@ atlas run :: - Usage: atlas run [OPTIONS] [[qc|assembly|genomes|genecatalog|None|all]] + Usage: atlas run [OPTIONS] [[qc|assembly|binning|genomes|genecatalog|None|all]] [SNAKEMAKE_ARGS]... Runs the ATLAS pipline