diff --git a/atlas/__init__.py b/atlas/__init__.py
index 33be32d6..e46471ab 100644
--- a/atlas/__init__.py
+++ b/atlas/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "2.0.5"
+__version__ = "2.0.6"
 
 from .scripts import utils
 
diff --git a/atlas/report/assembly_report.py b/atlas/report/assembly_report.py
index 3b8c8a06..9e1bab01 100644
--- a/atlas/report/assembly_report.py
+++ b/atlas/report/assembly_report.py
@@ -162,20 +162,35 @@ def main(samples, contig_stats, gene_tables, mapping_logs, report_out, combined_
 
     report(report_str, report_out, Table_1=combined_stats, stylesheet=os.path.join(atlas_dir,'report', "report.css"))
 
+
+
 if __name__ == "__main__":
-    p = argparse.ArgumentParser()
-    p.add_argument("--samples", nargs="+")
-    p.add_argument("--contig-stats", nargs="+")
-    p.add_argument("--gene-tables", nargs="+")
-    p.add_argument("--mapping-logs", nargs="+")
-    p.add_argument("--report-out")
-    p.add_argument("--combined-stats")
-    args = p.parse_args()
-    main(
-        args.samples,
-        args.contig_stats,
-        args.gene_tables,
-        args.mapping_logs,
-        args.report_out,
-        args.combined_stats,
-    )
+
+    try:
+        main(
+            samples=snakemake.params.samples,
+            contig_stats=snakemake.input.contig_stats,
+            gene_tables=snakemake.input.gene_tables,
+            mapping_logs=snakemake.input.mapping_logs,
+            report_out=snakemake.output.report,
+            combined_stats=snakemake.output.combined_contig_stats
+        )
+
+    except NameError:
+
+        p = argparse.ArgumentParser()
+        p.add_argument("--samples", nargs="+")
+        p.add_argument("--contig-stats", nargs="+")
+        p.add_argument("--gene-tables", nargs="+")
+        p.add_argument("--mapping-logs", nargs="+")
+        p.add_argument("--report-out")
+        p.add_argument("--combined-stats")
+        args = p.parse_args()
+        main(
+            args.samples,
+            args.contig_stats,
+            args.gene_tables,
+            args.mapping_logs,
+            args.report_out,
+            args.combined_stats,
+        )
diff --git a/atlas/report/bin_report.py b/atlas/report/bin_report.py
index 02ca2160..20ba2c47 100644
--- a/atlas/report/bin_report.py
+++ b/atlas/report/bin_report.py
@@ -12,52 +12,51 @@
 )
 
 
-def parse_checkm_output(sample_data, out_tsv):
-    df = pd.DataFrame()
-    for sample in sample_data.keys():
-        c_df = pd.read_table(sample_data[sample]["completeness"], index_col=0)[
-            ["Completeness", "Contamination", "Strain heterogeneity"]
-        ]
-        t_df = pd.read_table(sample_data[sample]["taxonomy"], index_col=0)[
-            [
-                "# unique markers (of 43)",
-                "# multi-copy",
-                "Insertion branch UID",
-                "Taxonomy (contained)",
-                "Taxonomy (sister lineage)",
-                "GC",
-                "Genome size (Mbp)",
-                "Gene count",
-                "Coding density",
-            ]
+def read_checkm_output(taxonomy_table, completness_table):
+
+    c_df = pd.read_csv(completness_table, index_col=0,sep='\t')[
+        ["Completeness", "Contamination", "Strain heterogeneity"]
+    ]
+    t_df = pd.read_csv(taxonomy_table, index_col=0,sep='\t')[
+        [
+            "# unique markers (of 43)",
+            "# multi-copy",
+            "Insertion branch UID",
+            "Taxonomy (contained)",
+            "Taxonomy (sister lineage)",
+            "GC",
+            "Genome size (Mbp)",
+            "Gene count",
+            "Coding density",
         ]
-        df = df.append(pd.concat([c_df, t_df], axis=1))
-    df.to_csv(out_tsv, sep="\t")
-    df["Sample"] = df.index.map( lambda s: str(s).split(".")[0])
+    ]
+    df = pd.concat([c_df, t_df], axis=1)
     return df
 
 
+
 def main(samples, completeness_files, taxonomy_files, report_out, bin_table):
     sample_data = {}
     div = {}
-    for sample in samples:
-        sample_data[sample] = {}
-        for completeness_file in completeness_files:
-            # underscore version was for simplified local testing
-            # if "%s_" % sample in completeness_file:
-            if "%s/" % sample in completeness_file:
-                sample_data[sample]["completeness"] = completeness_file
-        for taxonomy_file in taxonomy_files:
-            # if "%s_" % sample in taxonomy_file:
-            if "%s/" % sample in taxonomy_file:
-                sample_data[sample]["taxonomy"] = taxonomy_file
-    df = parse_checkm_output(sample_data, bin_table)
+
+    df= pd.DataFrame()
+
+    for i,sample in enumerate(samples):
+        sample_data= read_checkm_output(taxonomy_table=taxonomy_files[i],
+                                        completness_table=completeness_files[i])
+        sample_data['Sample']= sample
+
+        df= df.append(sample_data)
+
+
+    df.to_csv(bin_table,sep='\t')
+
     div["bin_scatter"] = offline.plot(
         {
             "data": [
                 {
-                    "x": df[df["Sample"] == sample]["Completeness"],
-                    "y": df[df["Sample"] == sample]["Contamination"],
+                    "x": df.loc[df["Sample"] == sample,"Completeness"],
+                    "y": df.loc[df["Sample"] == sample,"Contamination"],
                     "name": sample,
                     "mode": "markers",
                     "text": df.index[df["Sample"] == sample],
@@ -156,13 +155,24 @@ def main(samples, completeness_files, taxonomy_files, report_out, bin_table):
 
 
 if __name__ == "__main__":
-    p = argparse.ArgumentParser()
-    p.add_argument("--samples", nargs="+")
-    p.add_argument("--completeness", nargs="+")
-    p.add_argument("--taxonomy", nargs="+")
-    p.add_argument("--report-out")
-    p.add_argument("--bin-table")
-    args = p.parse_args()
-    main(
-        args.samples, args.completeness, args.taxonomy, args.report_out, args.bin_table
-    )
+
+    try:
+        main(
+            samples=snakemake.params.samples,
+            taxonomy_files=snakemake.input.taxonomy_files,
+            completeness_files=snakemake.input.completeness_files,
+            report_out=snakemake.output.report,
+            bin_table=snakemake.output.bin_table
+        )
+
+    except NameError:
+        p = argparse.ArgumentParser()
+        p.add_argument("--samples", nargs="+")
+        p.add_argument("--completeness", nargs="+")
+        p.add_argument("--taxonomy", nargs="+")
+        p.add_argument("--report-out")
+        p.add_argument("--bin-table")
+        args = p.parse_args()
+        main(
+            args.samples, args.completeness, args.taxonomy, args.report_out, args.bin_table
+        )
diff --git a/atlas/rules/assemble.snakefile b/atlas/rules/assemble.snakefile
index bb37c35c..98de51f9 100644
--- a/atlas/rules/assemble.snakefile
+++ b/atlas/rules/assemble.snakefile
@@ -712,23 +712,15 @@ rule build_assembly_report:
     input:
         contig_stats = expand("{sample}/assembly/contig_stats/final_contig_stats.txt", sample=SAMPLES),
         gene_tables = expand("{sample}/annotation/predicted_genes/{sample}.tsv", sample=SAMPLES),
-        mapping_log_files = expand("{sample}/logs/assembly/calculate_coverage/align_reads_from_{sample}_to_filtered_contigs.log", sample=SAMPLES),
+        mapping_logs = expand("{sample}/logs/assembly/calculate_coverage/align_reads_from_{sample}_to_filtered_contigs.log", sample=SAMPLES),
         # mapping logs will be incomplete unless we wait on alignment to finish
         bams = expand("{sample}/sequence_alignment/{sample}.bam", sample=SAMPLES)
     output:
         report = "reports/assembly_report.html",
         combined_contig_stats = 'stats/combined_contig_stats.tsv'
     params:
-        samples = " ".join(SAMPLES)
+        samples = SAMPLES
     conda:
         "%s/report.yaml" % CONDAENV
-    shell:
-        """
-        python %s/report/assembly_report.py \
-            --samples {params.samples} \
-            --contig-stats {input.contig_stats} \
-            --gene-tables {input.gene_tables} \
-            --mapping-logs {input.mapping_log_files} \
-            --report-out {output.report} \
-            --combined-stats {output.combined_contig_stats}
-        """ % os.path.dirname(os.path.abspath(workflow.snakefile))
+    script:
+        "../report/assembly_report.py"
diff --git a/atlas/rules/binning.snakefile b/atlas/rules/binning.snakefile
index bac5509d..467ee56c 100644
--- a/atlas/rules/binning.snakefile
+++ b/atlas/rules/binning.snakefile
@@ -469,19 +469,11 @@ rule build_bin_report:
     output:
         report = "reports/bin_report_{binner}.html",
         bin_table = "reports/genomic_bins_{binner}.tsv"
     params:
-        samples = " ".join(SAMPLES),
-        script_dir = os.path.dirname(os.path.abspath(workflow.snakefile))
+        samples = SAMPLES,
     conda:
         "%s/report.yaml" % CONDAENV
-    shell:
-        """
-        python {params.script_dir}/report/bin_report.py \
-            --samples {params.samples} \
-            --completeness {input.completeness_files} \
-            --taxonomy {input.taxonomy_files} \
-            --report-out {output.report} \
-            --bin-table {output.bin_table}
-        """
+    script:
+        "../report/bin_report.py"
 localrules: get_unique_bin_ids
 rule get_unique_bin_ids:
diff --git a/atlas/rules/download.snakefile b/atlas/rules/download.snakefile
index e957ef9f..15aabea2 100644
--- a/atlas/rules/download.snakefile
+++ b/atlas/rules/download.snakefile
@@ -139,7 +139,7 @@ rule unpack_checkm_data:
     input:
         os.path.join(DBDIR, CHECKM_ARCHIVE)
     output:
-        CHECKMFILES
+        protected(CHECKMFILES)
     params:
         path = CHECKMDIR
     shell:
@@ -172,7 +172,7 @@ rule download_cat_db:
     params:
         db_folder=CAT_DIR
    resources:
-        mem= config.get('diamond_mem',10)
+        mem= config.get('diamond_mem',100)
     threads:
         config.get('diamond_threads',10)
     conda:
@@ -180,6 +180,16 @@ rule download_cat_db:
     shell:
         " CAT prepare -d {params.db_folder} -t {params.db_folder} --existing --nproc {threads}"
 
+# output:
+#     "{dir}/{date}.{extension}",
+#             extension=["nr.fastaid2LCAtaxid",
+#                        "nr.dmnd",
+#                        "nr.taxids_with_multiple_offspring",
+#                        ,"prot.accession2taxid.gz"]
+#             "names.dmp",
+#             "nodes.dmp"
+#     temp("{dir}/{date}.nr.gz")
+
 
 onsuccess:
     print("All databases have downloaded and validated successfully")