v2.0.6 (#180)
- Fixes de-replication of bins
- Fixes the bin report
- Uses Snakemake's `script` directive for all report rules (a short sketch of the pattern follows the change summary below)
SilasK authored Mar 5, 2019
1 parent 85598a6 commit 38fea10
Showing 6 changed files with 106 additions and 87 deletions.
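For context on the `script` directive mentioned above: a rule that ends in `script:` runs the Python file with a `snakemake` object injected into its namespace, so inputs, outputs and params are read from that object instead of from the command line. The report scripts in this commit keep their argparse interface as a fallback by catching the `NameError` raised when no `snakemake` object exists. A minimal sketch of that pattern (the argument names below are illustrative, not the exact ones used by atlas):

```python
import argparse


def main(samples, report_out):
    # stand-in for the real report-building logic
    print("building", report_out, "for", len(samples), "samples")


if __name__ == "__main__":
    try:
        # `snakemake` only exists when this file is run via a `script:` directive
        main(samples=snakemake.params.samples,
             report_out=snakemake.output.report)
    except NameError:
        # plain command-line execution falls back to argparse
        p = argparse.ArgumentParser()
        p.add_argument("--samples", nargs="+")
        p.add_argument("--report-out")
        args = p.parse_args()
        main(samples=args.samples, report_out=args.report_out)
```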
2 changes: 1 addition & 1 deletion atlas/__init__.py
@@ -1,4 +1,4 @@
__version__ = "2.0.5"
__version__ = "2.0.6"

from .scripts import utils

47 changes: 31 additions & 16 deletions atlas/report/assembly_report.py
@@ -162,20 +162,35 @@ def main(samples, contig_stats, gene_tables, mapping_logs, report_out, combined_
report(report_str, report_out, Table_1=combined_stats, stylesheet=os.path.join(atlas_dir,'report', "report.css"))




if __name__ == "__main__":
    p = argparse.ArgumentParser()
    p.add_argument("--samples", nargs="+")
    p.add_argument("--contig-stats", nargs="+")
    p.add_argument("--gene-tables", nargs="+")
    p.add_argument("--mapping-logs", nargs="+")
    p.add_argument("--report-out")
    p.add_argument("--combined-stats")
    args = p.parse_args()
    main(
        args.samples,
        args.contig_stats,
        args.gene_tables,
        args.mapping_logs,
        args.report_out,
        args.combined_stats,
    )

    try:
        main(
            samples=snakemake.params.samples,
            contig_stats=snakemake.input.contig_stats,
            gene_tables=snakemake.input.gene_tables,
            mapping_logs=snakemake.input.mapping_logs,
            report_out=snakemake.output.report,
            combined_stats=snakemake.output.combined_contig_stats
        )

    except NameError:

        p = argparse.ArgumentParser()
        p.add_argument("--samples", nargs="+")
        p.add_argument("--contig-stats", nargs="+")
        p.add_argument("--gene-tables", nargs="+")
        p.add_argument("--mapping-logs", nargs="+")
        p.add_argument("--report-out")
        p.add_argument("--combined-stats")
        args = p.parse_args()
        main(
            args.samples,
            args.contig_stats,
            args.gene_tables,
            args.mapping_logs,
            args.report_out,
            args.combined_stats,
        )
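A note on the `snakemake.*` accesses above, for readers who have not used the `script:` directive: `snakemake.input`, `snakemake.output` and `snakemake.params` are list-like objects whose entries can also be reached by the names given in the rule. A rough stand-in, not the real Snakemake class, that mimics only the access pattern relied on here (the file paths are made up to echo the rule further below):

```python
# Simplified imitation of Snakemake's named input/params objects; the real
# implementation lives in the snakemake package. This only illustrates why
# both positional and named access work inside the report scripts.
class FakeNamedList(list):
    def __init__(self, **named):
        super().__init__(named.values())
        for key, value in named.items():
            setattr(self, key, value)


inputs = FakeNamedList(
    contig_stats=["sample1/assembly/contig_stats/final_contig_stats.txt"],
    gene_tables=["sample1/annotation/predicted_genes/sample1.tsv"],
)
print(inputs.contig_stats)  # named access, as in snakemake.input.contig_stats
print(list(inputs))         # the same entries are available positionally
```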
100 changes: 55 additions & 45 deletions atlas/report/bin_report.py
@@ -12,52 +12,51 @@
)


def parse_checkm_output(sample_data, out_tsv):
    df = pd.DataFrame()
    for sample in sample_data.keys():
        c_df = pd.read_table(sample_data[sample]["completeness"], index_col=0)[
            ["Completeness", "Contamination", "Strain heterogeneity"]
        ]
        t_df = pd.read_table(sample_data[sample]["taxonomy"], index_col=0)[
            [
                "# unique markers (of 43)",
                "# multi-copy",
                "Insertion branch UID",
                "Taxonomy (contained)",
                "Taxonomy (sister lineage)",
                "GC",
                "Genome size (Mbp)",
                "Gene count",
                "Coding density",
            ]
def read_checkm_output(taxonomy_table, completness_table):

    c_df = pd.read_csv(completness_table, index_col=0,sep='\t')[
        ["Completeness", "Contamination", "Strain heterogeneity"]
    ]
    t_df = pd.read_csv(taxonomy_table, index_col=0,sep='\t')[
        [
            "# unique markers (of 43)",
            "# multi-copy",
            "Insertion branch UID",
            "Taxonomy (contained)",
            "Taxonomy (sister lineage)",
            "GC",
            "Genome size (Mbp)",
            "Gene count",
            "Coding density",
        ]
        df = df.append(pd.concat([c_df, t_df], axis=1))
    df.to_csv(out_tsv, sep="\t")
    df["Sample"] = df.index.map( lambda s: str(s).split(".")[0])
    ]
    df = pd.concat([c_df, t_df], axis=1)
    return df



def main(samples, completeness_files, taxonomy_files, report_out, bin_table):
    sample_data = {}
    div = {}
    for sample in samples:
        sample_data[sample] = {}
        for completeness_file in completeness_files:
            # underscore version was for simplified local testing
            # if "%s_" % sample in completeness_file:
            if "%s/" % sample in completeness_file:
                sample_data[sample]["completeness"] = completeness_file
        for taxonomy_file in taxonomy_files:
            # if "%s_" % sample in taxonomy_file:
            if "%s/" % sample in taxonomy_file:
                sample_data[sample]["taxonomy"] = taxonomy_file
    df = parse_checkm_output(sample_data, bin_table)

    df= pd.DataFrame()

    for i,sample in enumerate(samples):
        sample_data= read_checkm_output(taxonomy_table=taxonomy_files[i],
                                        completness_table=completeness_files[i])
        sample_data['Sample']= sample

        df= df.append(sample_data)


    df.to_csv(bin_table,sep='\t')

div["bin_scatter"] = offline.plot(
{
"data": [
{
"x": df[df["Sample"] == sample]["Completeness"],
"y": df[df["Sample"] == sample]["Contamination"],
"x": df.loc[df["Sample"] == sample,"Completeness"],
"y": df.loc[df["Sample"] == sample,"Contamination"],
"name": sample,
"mode": "markers",
"text": df.index[df["Sample"] == sample],
@@ -156,13 +155,24 @@ def main(samples, completeness_files, taxonomy_files, report_out, bin_table):


if __name__ == "__main__":
    p = argparse.ArgumentParser()
    p.add_argument("--samples", nargs="+")
    p.add_argument("--completeness", nargs="+")
    p.add_argument("--taxonomy", nargs="+")
    p.add_argument("--report-out")
    p.add_argument("--bin-table")
    args = p.parse_args()
    main(
        args.samples, args.completeness, args.taxonomy, args.report_out, args.bin_table
    )

    try:
        main(
            samples=snakemake.params.samples,
            taxonomy_files=snakemake.input.taxonomy_files,
            completeness_files=snakemake.input.completeness_files,
            report_out=snakemake.output.report,
            bin_table=snakemake.output.bin_table
        )

    except NameError:
        p = argparse.ArgumentParser()
        p.add_argument("--samples", nargs="+")
        p.add_argument("--completeness", nargs="+")
        p.add_argument("--taxonomy", nargs="+")
        p.add_argument("--report-out")
        p.add_argument("--bin-table")
        args = p.parse_args()
        main(
            args.samples, args.completeness, args.taxonomy, args.report_out, args.bin_table
        )
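A side note on the loop in the new `main` above: `DataFrame.append` was deprecated in pandas 1.4 and removed in pandas 2.0, so on current pandas the per-sample CheckM tables are better collected in a list and concatenated once. A small sketch under that assumption; the sample names, bin names and values are invented for illustration:

```python
import pandas as pd

# Toy stand-ins for the per-sample tables that read_checkm_output would return.
tables = {
    "sample1": pd.DataFrame({"Completeness": [98.2], "Contamination": [1.1]}, index=["sample1.1"]),
    "sample2": pd.DataFrame({"Completeness": [87.5], "Contamination": [3.4]}, index=["sample2.1"]),
}

per_sample = []
for sample, sample_data in tables.items():
    sample_data = sample_data.copy()
    sample_data["Sample"] = sample
    per_sample.append(sample_data)

# one concat at the end replaces the repeated df = df.append(...) calls
df = pd.concat(per_sample)
print(df)
```

The switch from `df[df["Sample"] == sample]["Completeness"]` to `df.loc[df["Sample"] == sample, "Completeness"]` in the plotting code reads the same data but avoids chained indexing.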
16 changes: 4 additions & 12 deletions atlas/rules/assemble.snakefile
@@ -712,23 +712,15 @@ rule build_assembly_report:
    input:
        contig_stats = expand("{sample}/assembly/contig_stats/final_contig_stats.txt", sample=SAMPLES),
        gene_tables = expand("{sample}/annotation/predicted_genes/{sample}.tsv", sample=SAMPLES),
        mapping_log_files = expand("{sample}/logs/assembly/calculate_coverage/align_reads_from_{sample}_to_filtered_contigs.log", sample=SAMPLES),
        mapping_logs = expand("{sample}/logs/assembly/calculate_coverage/align_reads_from_{sample}_to_filtered_contigs.log", sample=SAMPLES),
        # mapping logs will be incomplete unless we wait on alignment to finish
        bams = expand("{sample}/sequence_alignment/{sample}.bam", sample=SAMPLES)
    output:
        report = "reports/assembly_report.html",
        combined_contig_stats = 'stats/combined_contig_stats.tsv'
    params:
        samples = " ".join(SAMPLES)
        samples = SAMPLES
    conda:
        "%s/report.yaml" % CONDAENV
    shell:
        """
        python %s/report/assembly_report.py \
            --samples {params.samples} \
            --contig-stats {input.contig_stats} \
            --gene-tables {input.gene_tables} \
            --mapping-logs {input.mapping_log_files} \
            --report-out {output.report} \
            --combined-stats {output.combined_contig_stats}
        """ % os.path.dirname(os.path.abspath(workflow.snakefile))
    script:
        "../report/assembly_report.py"
14 changes: 3 additions & 11 deletions atlas/rules/binning.snakefile
@@ -469,19 +469,11 @@ rule build_bin_report:
report = "reports/bin_report_{binner}.html",
bin_table = "reports/genomic_bins_{binner}.tsv"
params:
samples = " ".join(SAMPLES),
script_dir = os.path.dirname(os.path.abspath(workflow.snakefile))
samples = SAMPLES,
conda:
"%s/report.yaml" % CONDAENV
shell:
"""
python {params.script_dir}/report/bin_report.py \
--samples {params.samples} \
--completeness {input.completeness_files} \
--taxonomy {input.taxonomy_files} \
--report-out {output.report} \
--bin-table {output.bin_table}
"""
script:
"../report/bin_report.py"

localrules: get_unique_bin_ids
rule get_unique_bin_ids:
14 changes: 12 additions & 2 deletions atlas/rules/download.snakefile
Expand Up @@ -139,7 +139,7 @@ rule unpack_checkm_data:
    input:
        os.path.join(DBDIR, CHECKM_ARCHIVE)
    output:
        CHECKMFILES
        protected(CHECKMFILES)
    params:
        path = CHECKMDIR
    shell:
@@ -172,14 +172,24 @@ rule download_cat_db:
    params:
        db_folder=CAT_DIR
    resources:
        mem= config.get('diamond_mem',10)
        mem= config.get('diamond_mem',100)
    threads:
        config.get('diamond_threads',10)
    conda:
        "%s/cat.yaml" % CONDAENV
    shell:
        " CAT prepare -d {params.db_folder} -t {params.db_folder} --existing --nproc {threads}"

# output:
# "{dir}/{date}.{extension}",
# extension=["nr.fastaid2LCAtaxid",
# "nr.dmnd",
# "nr.taxids_with_multiple_offspring",
# ,"prot.accession2taxid.gz"]
# "names.dmp",
# "nodes.dmp"
# temp("{dir}/{date}.nr.gz")


onsuccess:
print("All databases have downloaded and validated successfully")
