diff --git a/.circleci/config.yml b/.circleci/config.yml index 9c77e65e..0e3ded5c 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -59,6 +59,12 @@ jobs: command: | source activate ./atlasenv test/dryrun.sh + - run: + name: Test_init + command: | + ls -l test + source activate ./atlasenv + ./test/test_init_many_samples.sh - persist_to_workspace: root: /root/project/ @@ -264,8 +270,8 @@ jobs: path: wd/logs destination: binning_logs - store_artifacts: - path: wd/Streptococcus/logs - destination: binning_logs/sample + path: wd/sample1/logs + destination: sample_logs # # diff --git a/.gitignore b/.gitignore index be0261d0..2446bd0f 100644 --- a/.gitignore +++ b/.gitignore @@ -35,15 +35,9 @@ databases *.manifest *.spec databases -test/* .test/* test/* -!test/dryrun.sh -!test/test_ci.sh -!test/test_assembly.sh -!test/test_binning.sh -!test/test_sra.sh -!test/test_external_genomes.sh +!test/*.sh # Installer logs pip-log.txt pip-delete-this-directory.txt diff --git a/atlas/atlas.py b/atlas/atlas.py index 033f2e38..6edcfee2 100644 --- a/atlas/atlas.py +++ b/atlas/atlas.py @@ -8,8 +8,8 @@ from snakemake.io import load_configfile -from .make_config import make_config, validate_config -from .init.atlas_init import run_init, run_init_sra +from .make_config import validate_config +from .init.atlas_init import run_init#, run_init_sra from .__init__ import __version__ @@ -66,7 +66,7 @@ def cli(obj): cli.add_command(run_init) -cli.add_command(run_init_sra) +#cli.add_command(run_init_sra) def get_snakefile(file="workflow/Snakefile"): diff --git a/atlas/init/atlas_init.py b/atlas/init/atlas_init.py index 88c76677..76068002 100644 --- a/atlas/init/atlas_init.py +++ b/atlas/init/atlas_init.py @@ -1,20 +1,16 @@ import os, sys from ..color_logger import logger -import multiprocessing -import tempfile -from snakemake import utils -from snakemake.io import load_configfile import pandas as pd import numpy as np -from collections import defaultdict import click from pathlib import Path - from ..make_config import make_config, validate_config from .create_sample_table import get_samples_from_fastq, simplify_sample_names from ..sample_table import ( validate_sample_table, - load_sample_table, + validate_bingroup_size_cobinning, + validate_bingroup_size_metabat, + BinGroupSizeError, ADDITIONAL_SAMPLEFILE_HEADERS, ) @@ -64,7 +60,14 @@ def prepare_sample_table_for_atlas( for h in Headers: sample_table[h] = np.nan - sample_table["BinGroup"] = sample_table.index + sample_table["BinGroup"] = "All" + + if sample_table.shape[0] >=50: + logger.warning( + "You have more than 50 samples in your sample table. " + "You should consider to split your samples into multiple BinGroups" + "For this modify the 'BinGroup' column in your sample table" + ) validate_sample_table(sample_table) @@ -146,6 +149,38 @@ def run_init( os.makedirs(working_dir, exist_ok=True) os.makedirs(db_dir, exist_ok=True) + sample_table = get_samples_from_fastq(path_to_fastq) + + prepare_sample_table_for_atlas( + sample_table, + reads_are_QC=skip_qc, + outfile=os.path.join(working_dir, "samples.tsv"), + ) + + # Set default binner depending on number of samples + n_samples = sample_table.shape[0] + if n_samples <= 7: + logger.info("You don't have many samples in your dataset. 
" + "I set 'metabat' as binner" + ) + binner = "metabat" + + try: + validate_bingroup_size_metabat(sample_table,logger) + except BinGroupSizeError: + pass + + + else: + binner = "vamb" + try: + validate_bingroup_size_cobinning(sample_table,logger) + + except BinGroupSizeError: + pass + + + make_config( db_dir, threads, @@ -153,14 +188,9 @@ def run_init( data_type, interleaved_fastq, os.path.join(working_dir, "config.yaml"), + binner= binner ) - sample_table = get_samples_from_fastq(path_to_fastq) - prepare_sample_table_for_atlas( - sample_table, - reads_are_QC=skip_qc, - outfile=os.path.join(working_dir, "samples.tsv"), - ) ########### Public init download data from SRA ############## diff --git a/atlas/init/parse_sra.py b/atlas/init/parse_sra.py index 2472c58b..953c03b6 100644 --- a/atlas/init/parse_sra.py +++ b/atlas/init/parse_sra.py @@ -96,7 +96,7 @@ def filter_runinfo(RunTable, ignore_paired=False): RunTable = RunTable.query("LibraryLayout == 'SINGLE'") else: - logger.warn(f"I drop {N_library_layout['SINGLE']} single end libraries") + logger.warning(f"I drop {N_library_layout['SINGLE']} single end libraries") RunTable = RunTable.query("LibraryLayout == 'PAIRED'") @@ -105,7 +105,7 @@ def filter_runinfo(RunTable, ignore_paired=False): if not RunTable.Platform.isin(["ILLUMINA"]).all(): Platforms = ", ".join(RunTable.Platform.unique()) - logger.warn( + logger.warning( f"Your samples are sequenced on the folowing platform: {Platforms}\n" "I don't know how well Atlas handles non-illumina reads.\n" "If you have long-reads, specify them via a the longreads, column in the sample table." @@ -160,7 +160,7 @@ def validate_merging_runinfo(path): else: problematic_samples_list = " ".join(problematic_samples) - logger.warn( + logger.warning( "You attemt to merge runs from the same sample. " f"But for {len(problematic_samples)} samples the runs have different {key}: {problematic_samples_list}\n" f"You can modify the table {path} and rerun the command.\n" diff --git a/atlas/make_config.py b/atlas/make_config.py index ac7d4e38..dd8365c1 100644 --- a/atlas/make_config.py +++ b/atlas/make_config.py @@ -155,7 +155,6 @@ def make_default_config(): config["cobining_min_contig_length"] = 2000 config["cobining_min_bin_size"] = 200 * 1000 - config["semibin_options"] = " --max-node 1 --max-edges 200 " config["cobinning_separator"] = ":" config["annotations"] = ["gtdb_taxonomy", "checkm_taxonomy", "gtdb_tree"] @@ -173,6 +172,7 @@ def make_config( data_type="metagenome", interleaved_fastq=False, config="config.yaml", + binner="vamb", ): """ Reads template config file with comments from ../workflow/config/template_config.yaml @@ -221,6 +221,8 @@ def make_config( # conf["refseq_tree"] = os.path.join(database_dir, "refseq.tree") # conf["diamond_db"] = os.path.join(database_dir, "refseq.dmnd") + conf["final_binner"] = binner + if os.path.exists(config): logger.warning( f"Config file {config} already exists, I didn't dare to overwrite it. continue..." @@ -234,6 +236,8 @@ def make_config( ) + + def validate_config(config, workflow): conf = load_configfile(config) diff --git a/atlas/sample_table.py b/atlas/sample_table.py index dabe3b0c..cdd4a7ea 100644 --- a/atlas/sample_table.py +++ b/atlas/sample_table.py @@ -25,21 +25,126 @@ def validate_sample_table(sampleTable): exit(1) if sampleTable.index.str.match("^\d").any(): - logger.warning( + logger.error( f"Sample names shouldn't start with a digit. 
This can lead to incompatibilities.\n {list(sampleTable.index)}"
         )
+        exit(1)
 
     if sampleTable.index.str.contains("_").any():
-        logger.warning(
+        logger.error(
             f"Sample names shouldn't contain underscores. This can lead to incompatibilities. \n {list(sampleTable.index)}"
         )
+        exit(1)
+
     if sampleTable.index.str.count("-").max() > 1:
-        logger.warning(
-            f"Sample names shouldn't have more than one hypon '-'. This can lead to incompatibilities.\n {list(sampleTable.index)}"
+        logger.error(
+            f"Sample names shouldn't have more than one hyphen '-'. This can lead to incompatibilities.\n {list(sampleTable.index)}"
+        )
+        exit(1)
+
+    ### Validate BinGroup
+
+    if sampleTable.BinGroup.isnull().any():
+        logger.warning("Found empty values in the sample table column 'BinGroup'")
+
+    if sampleTable.BinGroup.str.contains("_").any():
+        logger.error(
+            f"BinGroup names shouldn't contain underscores. This can lead to incompatibilities. \n {list(sampleTable.BinGroup)}"
         )
+        exit(1)
+
+    if sampleTable.BinGroup.str.contains("-").any():
+        logger.error(
+            f"BinGroup names shouldn't contain hyphens '-'. This can lead to incompatibilities.\n {list(sampleTable.BinGroup)}"
+        )
+        exit(1)
 
 
 def load_sample_table(sample_table="samples.tsv"):
     sampleTable = pd.read_csv(sample_table, index_col=0, sep="\t")
     validate_sample_table(sampleTable)
     return sampleTable
+
+
+class BinGroupSizeError(Exception):
+    """
+    Raised when a BinGroup is too small or too large for the chosen binner.
+    """
+
+    def __init__(self, message):
+        super(BinGroupSizeError, self).__init__(message)
+
+
+def validate_bingroup_size_cobinning(sampleTable, logger):
+    """
+    Validate that the BinGroups are neither too large nor too small for co-binning,
+    e.g. with vamb or SemiBin.
+    """
+
+    bin_group_sizes = sampleTable.BinGroup.value_counts()
+
+    if bin_group_sizes.max() > 180:
+        logger.warning(
+            f"Found a bin group with more than 180 samples. This might lead to memory issues. \n {bin_group_sizes}"
+        )
+
+    if bin_group_sizes.min() < 10:
+        logger.error(
+            "If you want to use co-binning, you should have at least 5-10 samples per bin group.\n"
+        )
+        raise BinGroupSizeError("BinGroup too small")
+
+
+def validate_bingroup_size_metabat(sampleTable, logger):
+    bin_group_sizes = sampleTable.BinGroup.value_counts()
+
+    max_bin_group_size = bin_group_sizes.max()
+
+    warn_message = (
+        "Co-binning with metabat uses cross-mapping, which scales quadratically. "
+        f"You have a BinGroup with {max_bin_group_size} samples, which already leads to {max_bin_group_size*max_bin_group_size} cross-mappings. "
+    )
+
+    if max_bin_group_size > 50:
+        logger.error(
+            warn_message
+            + "This is too much for metabat. Please use vamb or SemiBin, or split your samples into smaller BinGroups."
+        )
+        raise BinGroupSizeError("BinGroup too large")
+
+    if max_bin_group_size > 10:
+        logger.warning(
+            warn_message
+            + "This might be too much for metabat. Consider using vamb or SemiBin, or split your samples into smaller BinGroups."
+        )
+
+    elif max_bin_group_size == 1:
+        logger.warning(
+            "You have only one sample per BinGroup. This doesn't use the co-abundance information."
+        )
+
+
+def validate_bingroup_size(sampleTable, config, logger):
+    if config["final_binner"] == "DASTool":
+        binners = config["binner"]
+
+        logger.info(f"DASTool uses the following binners: {binners}")
+
+        if ("vamb" in binners) or ("SemiBin" in binners):
+            validate_bingroup_size_cobinning(sampleTable, logger)
+
+        if "metabat" in binners:
+            validate_bingroup_size_metabat(sampleTable, logger)
+
+    elif config["final_binner"] == "metabat":
+        validate_bingroup_size_metabat(sampleTable, logger)
+
+    elif config["final_binner"] in ["vamb", "SemiBin"]:
+        validate_bingroup_size_cobinning(sampleTable, logger)
+
+    elif config["final_binner"] == "maxbin":
+        logger.warning("maxbin doesn't use co-abundance information for binning.")
+
+    else:
+        raise Exception(f"Unknown final binner: {config['final_binner']}")
diff --git a/config/default_config.yaml b/config/default_config.yaml
index 9fa32f5c..e9c565e8 100644
--- a/config/default_config.yaml
+++ b/config/default_config.yaml
@@ -22,9 +22,18 @@ genome_aligner: "minimap"
 bin_quality_asesser: checkm2 #[ checkm2, busco, cehckm]
 
-
+semibin_options: ""
 semibin_train_extra: ""
 filter_chimieric_bins: true
 gunc_database: "progenomes" # progenomes or gtdb
+
+
+binner: # If DASTool is used as final_binner, use the predictions of these binners
+  - metabat
+  - maxbin
+#  - vamb
+
+
+cobinning_readmapping_id: 0.95 # when mapping reads from different samples to another sample's contigs, use a less stringent alignment threshold
\ No newline at end of file
diff --git a/config/template_config.yaml b/config/template_config.yaml
index 2cfeb4d2..4532d543 100644
--- a/config/template_config.yaml
+++ b/config/template_config.yaml
@@ -156,13 +156,10 @@ minimum_map_quality: 0
 # Binning
 ########################
 
-final_binner: DASTool # [SemiBin, DASTool, vamb, maxbin]
+final_binner: vamb # [SemiBin, vamb, metabat, DASTool]
 
-binner: # If DASTool is used as final_binner, use predictions of this binners
-  - metabat
-  - maxbin
-#  - vamb
+semibin_options: ""
 
 metabat:
   sensitivity: sensitive
diff --git a/docs/usage/configuration.rst b/docs/usage/configuration.rst
index e54b069c..836a3456 100644
--- a/docs/usage/configuration.rst
+++ b/docs/usage/configuration.rst
@@ -1,10 +1,12 @@
-.. _configuration:
+
+.. _configuration:
 
 
 Configure Atlas
 ***************
 
-.. _contaminants:
+
+.. _contaminants:
 
 Remove reads from Host
 ======================
@@ -16,6 +18,70 @@ We recommend you to use genomes where repetitive sequences are masked.
 See here for more details `human genome `_.
 
 
+Co-abundance Binning
+====================
+
+.. _cobinning:
+
+While binning each sample individually is faster, using co-abundance information for binning is recommended.
+Quantifying the coverage of contigs across multiple samples provides valuable information about how contigs co-vary.
+
+There are two primary strategies for co-abundance binning:
+
+1. **Cross mapping:** Map the reads from multiple samples to each sample's contigs.
+2. **Co-binning:** Concatenate the contigs from multiple samples and map all the reads to these combined contigs.
+
+`final_binner: metabat` uses cross-mapping, while `vamb` and `SemiBin` use co-binning.
+
+The samples to be binned together are specified via the `BinGroup` column in the `samples.tsv` file.
+The size of a BinGroup should be chosen based on the binner and the co-binning strategy in use.
+
+Cross-mapping complexity scales quadratically with the size of the BinGroup, since the reads of each sample are mapped to the contigs of every other sample.
+This might yield better results for complex metagenomes, although no definitive benchmark is known.
+On the other hand, co-binning is more efficient, as it maps each sample's reads only once, albeit to a potentially large combined assembly.
+
+Default Behavior
+----------------
+
+Starting with version 2.18, Atlas places all samples in a single BinGroup and defaults to `vamb` as the binner, unless there are very few samples.
+For fewer than 8 samples, `metabat` is the default binner.
+
+.. note::
+    This is a departure from previous versions, where each sample had its own BinGroup.
+    Running `vamb` in those versions would consider all samples, regardless of their BinGroup.
+    This change might cause errors if you use a `samples.tsv` file from an older Atlas version.
+    Typically, you can resolve this by assigning a unique BinGroup to each sample.
+
+The mapping threshold has been lowered to 95% identity (single-sample binning uses 97%) so that reads from different strains,
+but not from other species, can map to contigs from a different sample.
+
+If you co-bin more than 150-200 samples or cross-map more than 50 samples, Atlas will warn you that a BinGroup contains too many samples.
+Although VAMB's publication suggests it can handle up to 1000 samples, this demands substantial resources.
+
+In that case, splitting your samples into multiple BinGroups is recommended.
+Ideally, related samples, or samples in which the same species are expected, should belong to the same BinGroup.
+
+Single-sample Binning
+---------------------
+
+To use single-sample binning, simply assign each sample to its own BinGroup and select `metabat` or `DASTool` as the `final_binner`.
+
+Although it is not recommended, it is also possible to use `DASTool` as the final binner and feed it the predictions of `metabat` together with co-abundance-based binners.
+
+Add the following lines to your `config.yaml`:
+
+
+.. code-block:: yaml
+
+    final_binner: DASTool
+
+    binner:
+      - metabat
+      - maxbin
+      - vamb
+
+
+
 .. _longreads:
 
 Long reads
@@ -35,7 +101,8 @@ Example config file
 ===================
 
 
-.. include:: ../../workflow/../config/template_config.yaml
+
+.. include:: ../../workflow/../config/template_config.yaml
     :code:
 
 
@@ -44,7 +111,8 @@ Example config file
 Detailed configuration
 ======================
 
-.. toctree::
+
+.. toctree::
     :maxdepth: 1
 
     ../advanced/qc
diff --git a/docs/usage/getting_started.rst b/docs/usage/getting_started.rst
index 75721d9c..8df5ea8d 100644
--- a/docs/usage/getting_started.rst
+++ b/docs/usage/getting_started.rst
@@ -122,10 +122,9 @@ Have a look at them with a normal text editor and check if the samples names
 are
 See the :download:`example sample table <../reports/samples.tsv>`
 
 The ``BinGroup`` parameter is used during the genomic binning.
-In short: all samples in which you expect the same strain to
-be found should belong to the same group,
-e.g. all metagenome samples from mice in the same cage or location.
-
+In short: if you have between 5 and 150 samples, the default (putting everything in one group) is fine.
+If you have fewer than 5 samples, put each sample in its own BinGroup and use `metabat` as the final binner.
+If you have more samples, see the :ref:`cobinning` section for more details.
 
 .. note::
     If you want to use :ref:`long reads ` for a hybrid assembly, you can also specify them in the sample table.
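To make the BinGroup guidance above concrete, here is a minimal sketch of how the `BinGroup` column of `samples.tsv` could be reassigned from Python using the helpers added in `atlas/sample_table.py`. The group size of 100 and the `G1`, `G2`, ... group names are arbitrary choices for this example, not Atlas defaults.

# Sketch: split samples into BinGroups of at most ~100 samples and re-validate.
# Assumes a standard Atlas samples.tsv; group_size and the "G1", "G2", ... names
# are illustrative only.
import logging

from atlas.sample_table import (
    BinGroupSizeError,
    load_sample_table,
    validate_bingroup_size_cobinning,
    validate_sample_table,
)

logger = logging.getLogger("split_bingroups")

sample_table = load_sample_table("samples.tsv")

group_size = 100  # pick based on the binner and available resources
for i, sample in enumerate(sample_table.index):
    sample_table.loc[sample, "BinGroup"] = f"G{i // group_size + 1}"

validate_sample_table(sample_table)
try:
    validate_bingroup_size_cobinning(sample_table, logger)
except BinGroupSizeError:
    logger.error("A BinGroup is too small for co-binning; adjust group_size.")

sample_table.to_csv("samples.tsv", sep="\t")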
diff --git a/test/dryrun.sh b/test/dryrun.sh index 39980b45..f6c8d733 100755 --- a/test/dryrun.sh +++ b/test/dryrun.sh @@ -16,7 +16,7 @@ reads_dir='test/reads/empty' echo "touch reads dir" mkdir -p $reads_dir -for sample in Sample1 Sample2 ; +for sample in Sample1 Sample2 Sample3 Sample4 Sample5 Sample6; do for fraction in R1 R2; do @@ -43,7 +43,7 @@ echo "Dryrun strains" atlas run genomes strains -w $WD --max-mem $MaxMem --jobs $NThreads --dryrun $@ -for binner in SemiBin vamb DASTool ; do +for binner in metabat SemiBin vamb DASTool ; do echo " Dryrun Binner $binner diff --git a/test/test_ci.sh b/test/test_ci.sh old mode 100644 new mode 100755 diff --git a/test/test_init_many_samples.sh b/test/test_init_many_samples.sh new file mode 100755 index 00000000..ff0faac2 --- /dev/null +++ b/test/test_init_many_samples.sh @@ -0,0 +1,57 @@ +#!/bin/bash +set -euo pipefail + + +NThreads=2 +MaxMem=3 + +atlas --version +atlas run --help + + +databaseDir="test/databases" + + + + +create_reads_dir() { + + local reads_dir="$1" + local N=$2 + +echo "touch reads dir" + +rm -rf $reads_dir +mkdir -p $reads_dir + +for (( i=1; i<=$N; i++ )); do + sample="Sample$i" + + for fraction in R1 R2; + do + touch $reads_dir/${sample}_${fraction}.fastq.gz + done +done +} + + + + + +for N in 5 10 50 300 ; +do + +echo "test init with $N samples" + +WD="test/test_init/$N" +reads_dir="test/test_init/reads_${N}_samples/" + +rm -rf $WD $reads_dir + + + +create_reads_dir $reads_dir $N + +atlas init --db-dir $databaseDir -w $WD $reads_dir + +done diff --git a/workflow/Snakefile b/workflow/Snakefile index f7fb2d46..3e520ff6 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -33,12 +33,10 @@ min_version("6.1") container: "docker://continuumio/miniconda3:4.4.10" - - - wildcard_constraints: binner="[A-Za-z]+", + include: "rules/sample_table.smk" include: "rules/download.smk" # contains hard coded variables include: "rules/qc.smk" @@ -173,7 +171,7 @@ rule binning: rule assembly_one_sample: input: - "{sample}/{sample}_contigs.fasta", + get_assembly, "{sample}/sequence_alignment/{sample}.bam", "{sample}/assembly/contig_stats/postfilter_coverage_stats.txt", "{sample}/assembly/contig_stats/final_contig_stats.txt", @@ -228,11 +226,9 @@ for r in workflow.rules: if "mem" in r.resources: r.resources["mem_mb"] = r.resources["mem"] * 1000 else: + # default r.resources["mem_mb"] = config["mem"] * 1000 - # snakemake has a new name for that - if not "mem_mib" in r.resources: - r.resources["mem_mib"] = r.resources["mem_mb"] # add time if ot present. 
Simple jobs use simple time diff --git a/workflow/rules/assemble.smk b/workflow/rules/assemble.smk index ce96e2fb..c3d123bc 100644 --- a/workflow/rules/assemble.smk +++ b/workflow/rules/assemble.smk @@ -491,7 +491,6 @@ rule rename_contigs: " minscaf={params.minlength} &> {log} " - if config["filter_contigs"]: ruleorder: align_reads_to_prefilter_contigs > align_reads_to_final_contigs @@ -604,11 +603,9 @@ rule finalize_contigs: os.symlink(os.path.relpath(input[0], os.path.dirname(output[0])), output[0]) - - rule calculate_contigs_stats: input: - "{sample}/{sample}_contigs.fasta", + get_assembly, output: "{sample}/assembly/contig_stats/final_contig_stats.txt", conda: @@ -623,8 +620,6 @@ rule calculate_contigs_stats: "stats.sh in={input} format=3 out={output} &> {log}" - - # generalized rule so that reads from any "sample" can be aligned to contigs from "sample_contigs" rule align_reads_to_final_contigs: input: @@ -648,7 +643,7 @@ rule align_reads_to_final_contigs: rule pileup_contigs_sample: input: - fasta="{sample}/{sample}_contigs.fasta", + fasta=get_assembly, bam="{sample}/sequence_alignment/{sample}.bam", output: covhist="{sample}/assembly/contig_stats/postfilter_coverage_histogram.txt", @@ -702,7 +697,7 @@ rule create_bam_index: rule predict_genes: input: - "{sample}/{sample}_contigs.fasta", + get_assembly, output: fna="{sample}/annotation/predicted_genes/{sample}.fna", faa="{sample}/annotation/predicted_genes/{sample}.faa", diff --git a/workflow/rules/binning.smk b/workflow/rules/binning.smk index d6fd550f..a1082726 100644 --- a/workflow/rules/binning.smk +++ b/workflow/rules/binning.smk @@ -1,14 +1,12 @@ from glob import glob -BINNING_CONTIGS = "{sample}/{sample}_contigs.fasta" - include: "bin_quality.smk" rule pileup_for_binning: input: - fasta=BINNING_CONTIGS, + fasta=get_assembly, bam="{sample}/sequence_alignment/{sample_reads}.bam", output: covstats="{sample}/binning/coverage/{sample_reads}_coverage_stats.txt", @@ -79,7 +77,7 @@ rule combine_coverages: rule run_concoct: input: coverage="{sample}/binning/coverage/combined_coverage.tsv", - fasta=BINNING_CONTIGS, + fasta=get_assembly, output: "{{sample}}/binning/concoct/intermediate_files/clustering_gt{}.csv".format( config["concoct"]["min_contig_length"] @@ -128,7 +126,7 @@ rule convert_concoct_csv_to_tsv: ## METABAT rule get_metabat_depth_file: input: - bam=lambda wc: expand( + bams=lambda wc: expand( "{sample}/sequence_alignment/{sample_reads}.bam", sample_reads=get_alls_samples_of_group(wc), sample=wc.sample, @@ -138,15 +136,17 @@ rule get_metabat_depth_file: log: "{sample}/binning/metabat/metabat.log", conda: - "%s/metabat.yaml" % CONDAENV - threads: config["threads"] + "../envs/metabat.yaml" + threads: config["threads"] # multithreaded trough OMP_NUM_THREADS resources: - mem=config["mem"], + mem_mb=config["mem"]*1000, + params: + minid = lambda wc, input: config["cobinning_readmapping_id"] *100 if len(input.bams)>1 else 97 shell: - """ - jgi_summarize_bam_contig_depths --outputDepth {output} {input.bam} \ - &> {log} - """ + "jgi_summarize_bam_contig_depths " + " --percentIdentity {params.minid} " + " --outputDepth {output} " + " {input.bams} &> {log} " def get_metabat_sensitivity(): @@ -159,7 +159,7 @@ def get_metabat_sensitivity(): rule metabat: input: depth_file=rules.get_metabat_depth_file.output, - contigs=BINNING_CONTIGS, + contigs=get_assembly, output: "{sample}/binning/metabat/cluster_attribution.tmp", params: @@ -190,7 +190,7 @@ rule metabat: rule maxbin: input: - fasta=BINNING_CONTIGS, + fasta=get_assembly, 
abund="{sample}/binning/coverage/{sample}_coverage.txt", output: directory("{sample}/binning/maxbin/intermediate_files"), @@ -307,7 +307,7 @@ rule get_maxbin_cluster_attribution: rule get_bins: input: cluster_attribution="{sample}/binning/{binner}/cluster_attribution.tsv", - contigs=BINNING_CONTIGS, + contigs=get_assembly, output: directory("{sample}/binning/{binner}/bins"), conda: @@ -315,7 +315,7 @@ rule get_bins: log: "{sample}/logs/binning/get_bins_{binner}.log", script: - "get_fasta_of_bins.py" + "../scripts/get_fasta_of_bins.py" localrules: @@ -337,7 +337,7 @@ rule run_das_tool: "{{sample}}/binning/DASTool/{binner}.scaffolds2bin", binner=config["binner"], ), - contigs=BINNING_CONTIGS, + contigs=get_assembly, proteins="{sample}/annotation/predicted_genes/{sample}.faa", output: "{sample}/binning/DASTool/{sample}_DASTool_summary.tsv", diff --git a/workflow/rules/cobinning.smk b/workflow/rules/cobinning.smk index 4df0fd55..d68d7d79 100644 --- a/workflow/rules/cobinning.smk +++ b/workflow/rules/cobinning.smk @@ -8,9 +8,9 @@ localrules: rule filter_contigs: input: - "{sample}/{sample}_contigs.fasta", + get_assembly, output: - temp("Cobinning/filtered_contigs/{sample}.fasta"), + "Intermediate/cobinning/filtered_contigs/{sample}.fasta.gz", params: min_length=config["cobining_min_contig_length"], log: @@ -29,33 +29,59 @@ rule filter_contigs: " -Xmx{resources.java_mem}G 2> {log} " -localrules: - combine_contigs, +def get_samples_of_bingroup(wildcards): + + samples_of_group= sampleTable.query(f'BinGroup=="{wildcards.bingroup}"').index.tolist() + + return samples_of_group + + +def get_filtered_contigs_of_bingroup(wildcards): + + samples_of_group = get_samples_of_bingroup(wildcards) + + if len(samples_of_group) < 5: + raise ValueError( + f"Bin group {wildcards.bingroup} has {len(samples_of_group)} less than 5 samples." + "For cobinning we reccomend at least 5 samples per bin group." 
+ "Adapt the sample.tsv to set BinGroup of size [5- 1000]" + ) + + return expand(rules.filter_contigs.output[0], sample=samples_of_group) + + +def get_bams_of_bingroup(wildcards): + + samples_of_group = get_samples_of_bingroup(wildcards) + + return expand( + "Intermediate/cobinning/{bingroup}/bams/{sample}.sorted.bam", + sample=samples_of_group, + bingroup=wildcards.bingroup, + ) rule combine_contigs: input: - #Trigers rerun if contigs change - flag=expand("{sample}/{sample}_contigs.fasta", sample=SAMPLES), - fasta=ancient(expand(rules.filter_contigs.output[0], sample=SAMPLES)), + fasta= get_filtered_contigs_of_bingroup, output: - "Cobinning/combined_contigs.fasta.gz", + "Intermediate/cobinning/{bingroup}/combined_contigs.fasta.gz", log: - "logs/cobinning/combine_contigs.log", + "logs/cobinning/{bingroup}/combine_contigs.log", params: seperator=config["cobinning_separator"], - samples=SAMPLES, + samples=get_samples_of_bingroup, threads: 1 run: import gzip as gz - with gz.open(output[0], "wt") as fout: + with gz.open(output[0], "wb") as fout: for sample, input_fasta in zip(params.samples, input.fasta): - with open(input_fasta) as fin: + with gz.open(input_fasta, "rb") as fin: for line in fin: # if line is a header add sample name - if line[0] == ">": - line = f">{sample}{params.seperator}" + line[1:] + if line[0] == ord('>'): + line = f">{sample}{params.seperator}".encode() + line[1:] # write each line to the combined file fout.write(line) @@ -64,16 +90,16 @@ rule minimap_index: input: contigs=rules.combine_contigs.output, output: - mmi=temp("Cobinning/combined_contigs.mmi"), + mmi=temp("Intermediate/cobinning/{bingroup}/combined_contigs.mmi"), params: index_size="12G", resources: mem=config["mem"], # limited num of fatnodes (>200g) - threads: 3 + threads: config["simplejob_threads"], log: - "logs/cobinning/vamb/index.log", + "logs/cobinning/{bingroup}/minimap_index.log", benchmark: - "logs/benchmarks/cobinning/mminimap_index.tsv" + "logs/benchmarks/cobinning/{bingroup}/mminimap_index.tsv" conda: "../envs/minimap.yaml" shell: @@ -84,13 +110,13 @@ rule samtools_dict: input: contigs=rules.combine_contigs.output, output: - dict="Cobinning/combined_contigs.dict", + dict="Intermediate/cobinning/{bingroup}/combined_contigs.dict", resources: mem=config["simplejob_mem"], - ttime=config["runtime"]["simplejob"], + time=config["runtime"]["simplejob"], threads: 1 log: - "logs/cobinning/samtools_dict.log", + "logs/cobinning/{bingroup}/samtools_dict.log", conda: "../envs/minimap.yaml" shell: @@ -100,39 +126,40 @@ rule samtools_dict: rule minimap: input: fq=get_quality_controlled_reads, - mmi="Cobinning/combined_contigs.mmi", - dict="Cobinning/combined_contigs.dict", + mmi="Intermediate/cobinning/{bingroup}/combined_contigs.mmi", + dict="Intermediate/cobinning/{bingroup}/combined_contigs.dict", output: - bam=temp("Cobinning/mapping/{sample}.unsorted.bam"), + bam=temp("Intermediate/cobinning/{bingroup}/bams/{sample}.unsorted.bam"), threads: config["threads"] resources: mem=config["mem"], log: - "logs/cobinning/mapping/{sample}.minimap.log", + "logs/cobinning/{bingroup}/mapping/minimap/{sample}.log", benchmark: - "logs/benchmarks/cobinning/mminimap/{sample}.tsv" + "logs/benchmarks/cobinning/{bingroup}/mminimap/{sample}.tsv" conda: "../envs/minimap.yaml" shell: """minimap2 -t {threads} -ax sr {input.mmi} {input.fq} | grep -v "^@" | cat {input.dict} - | samtools view -F 3584 -b - > {output.bam} 2>{log}""" + # samtools filters out secondary alignments ruleorder: sort_bam > minimap rule sort_bam: input: - 
"Cobinning/mapping/{sample}.unsorted.bam", + "Intermediate/cobinning/{bingroup}/bams/{sample}.unsorted.bam", output: - "Cobinning/mapping/{sample}.sorted.bam", + "Intermediate/cobinning/{bingroup}/bams/{sample}.sorted.bam", params: - prefix="Cobinning/mapping/tmp.{sample}", + prefix="Intermediate/cobinning/{bingroup}/bams/tmp.{sample}", threads: 2 resources: - mem=config["simplejob_mem"], - time=int(config["runtime"]["simplejob"]), + mem_mb=config["simplejob_mem"] *1000, + time_min=int(config["runtime"]["simplejob"]*60), log: - "logs/cobinning/mapping/{sample}.sortbam.log", + "logs/cobinning/{bingroup}/mapping/sortbam/{sample}.log", conda: "../envs/minimap.yaml" shell: @@ -141,20 +168,26 @@ rule sort_bam: rule summarize_bam_contig_depths: input: - bam=expand(rules.sort_bam.output, sample=SAMPLES), + bams=get_bams_of_bingroup, output: - "Cobinning/vamb/coverage.jgi.tsv", + "Intermediate/cobinning/{bingroup}/coverage.jgi.tsv", log: - "logs/cobinning/vamb/combine_coverage.log", + "logs/cobinning/{bingroup}/combine_coverage.log", conda: "../envs/metabat.yaml" - threads: config["threads"] + threads: config["threads"] # multithreaded trough OMP_NUM_THREADS + benchmark: + "logs/benchmarks/cobinning/{bingroup}/summarize_bam_contig_depths.tsv" resources: - mem=config["mem"], + mem_mb=config["mem"]*1000, + time_min = config["runtime"]["long"]*60 + params: + minid = config["cobinning_readmapping_id"] *100 shell: "jgi_summarize_bam_contig_depths " + " --percentIdentity {params.minid} " " --outputDepth {output} " - " {input.bam} &> {log} " + " {input.bams} &> {log} " localrules: @@ -163,11 +196,13 @@ localrules: rule convert_jgi2vamb_coverage: input: - "Cobinning/vamb/coverage.jgi.tsv", + rules.summarize_bam_contig_depths.output, output: - "Cobinning/vamb/coverage.tsv", + "Intermediate/cobinning/{bingroup}/coverage.tsv", log: - "logs/cobinning/vamb/convert_jgi2vamb_coverage.log", + "logs/cobinning/{bingroup}/convert_jgi2vamb_coverage.log", + benchmark: + "logs/benchmarks/cobinning/{bingroup}/convert_jgi2vamb_coverage.tsv" threads: 1 script: "../scripts/convert_jgi2vamb_coverage.py" @@ -175,20 +210,20 @@ rule convert_jgi2vamb_coverage: rule run_vamb: input: - coverage="Cobinning/vamb/coverage.tsv", + coverage="Intermediate/cobinning/{bingroup}/coverage.tsv", fasta=rules.combine_contigs.output, output: - directory("Cobinning/vamb/clustering"), + directory("Intermediate/cobinning/{bingroup}/vamb_output"), conda: "../envs/vamb.yaml" threads: config["threads"] resources: - mem=config["mem"], - time=config["runtime"]["long"], + mem_mb=config["mem"]*1000, + time_min=config["runtime"]["long"]*60, log: - "logs/cobinning/vamb/run_vamb.log", + "logs/cobinning/run_vamb/{bingroup}.log", benchmark: - "logs/benchmarks/vamb/run_vamb.tsv" + "logs/benchmarks/run_vamb/{bingroup}.tsv" params: mincontig=config["cobining_min_contig_length"], # min contig length for binning minfasta=config["cobining_min_bin_size"], # min bin size for output @@ -212,17 +247,18 @@ localrules: rule parse_vamb_output: input: - rules.run_vamb.output, + expand(rules.run_vamb.output, bingroup=sampleTable.BinGroup.unique()), output: - renamed_clusters="Cobinning/vamb/clusters.tsv.gz", + renamed_clusters="Binning/vamb/vamb_clusters.tsv.gz", cluster_atributions=expand(vamb_cluster_attribution_path, sample=SAMPLES), log: "logs/cobinning/vamb_parse_output.log", params: separator=config["cobinning_separator"], fasta_extension=".fna", - output_path=lambda wc: vamb_cluster_attribution_path, + output_path=lambda wc: vamb_cluster_attribution_path, # path 
with {sample} to replace samples=SAMPLES, + bingroups = sampleTable.BinGroup.unique() conda: "../envs/fasta.yaml" script: @@ -231,8 +267,7 @@ rule parse_vamb_output: rule vamb: input: - "Cobinning/vamb/clustering", - "Cobinning/vamb/clusters.tsv.gz", + "Binning/vamb/vamb_clusters.tsv.gz", expand("{sample}/binning/vamb/cluster_attribution.tsv", sample=SAMPLES), diff --git a/workflow/rules/get_fasta_of_bins.py b/workflow/rules/get_fasta_of_bins.py deleted file mode 100755 index 49bbde9f..00000000 --- a/workflow/rules/get_fasta_of_bins.py +++ /dev/null @@ -1,60 +0,0 @@ -import argparse -import os, sys -import shutil -import warnings - -import pandas as pd -from Bio import SeqIO - - -def get_fasta_of_bins(cluster_attribution, contigs, out_folder): - """ - Creates individual fasta files for each bin using the contigs fasta and the cluster attribution. - - input: - - cluster attribution file: tab seperated file of "contig_fasta_header bin" - - contigs: fasta file of contigs - - out_prefix: output_prefix for bin fastas {out_folder}/{binid}.fasta - """ - # create outdir - if os.path.exists(out_folder): - shutil.rmtree(out_folder) - os.makedirs(out_folder) - - CA = pd.read_csv(cluster_attribution, header=None, index_col=1, sep="\t") - - assert CA.shape[1] == 1, "File should have only two columns " + cluster_attribution - CA = CA.iloc[:, 0] - CA.index = CA.index.astype("str") - # exclude cluster 0 which is unclustered at least for metabat - CA = CA.loc[CA != "0"] - - contigs = SeqIO.to_dict(SeqIO.parse(contigs, "fasta")) - - for id in CA.index.unique(): - bin_contig_names = CA.loc[id] - out_file = os.path.join(out_folder, "{id}.fasta".format(id=id)) - if type(bin_contig_names) == str: - warnings.warn("single contig bin Bin: " + out_file) - bin_contig_names = [bin_contig_names] - bin_contigs = [contigs[c] for c in bin_contig_names] - SeqIO.write(bin_contigs, out_file, "fasta") - - -if __name__ == "__main__": - if "snakemake" not in globals(): - p = argparse.ArgumentParser() - p.add_argument("--cluster-attribution") - p.add_argument("--contigs") - p.add_argument("--out-folder") - args = vars(p.parse_args()) - get_fasta_of_bins(**args) - else: - with open(snakemake.log[0], "w") as log: - sys.stderr = sys.stdout = log - - get_fasta_of_bins( - snakemake.input.cluster_attribution, - snakemake.input.contigs, - snakemake.output[0], - ) diff --git a/workflow/rules/sample_table.smk b/workflow/rules/sample_table.smk index e3ec6f36..ff0e05a3 100644 --- a/workflow/rules/sample_table.smk +++ b/workflow/rules/sample_table.smk @@ -1,8 +1,8 @@ - - -from atlas.sample_table import load_sample_table +from atlas.sample_table import load_sample_table, validate_bingroup_size sampleTable = load_sample_table() +validate_bingroup_size(sampleTable, config, logger) + def io_params_for_tadpole(io, key="in"): """This function generates the input flag needed for bbwrap/tadpole for all cases @@ -182,4 +182,17 @@ def get_quality_controlled_reads(wildcards, include_se=False): def get_assembly(wildcards): - return "{sample}/assembly/{sample}_contigs.fasta".format(sample=wildcards.sample ) + """ + Returns Assembly file for a given sample. 
+ + """ + + Header= "Assembly" + try: + return get_files_from_sampleTable(wildcards.sample, Header) + + except FileNotInSampleTableException: + # return files as named by atlas pipeline + + return "{sample}/{sample}_contigs.fasta".format(sample=wildcards.sample ) + diff --git a/workflow/rules/semibin.smk b/workflow/rules/semibin.smk index 59106929..761033c2 100644 --- a/workflow/rules/semibin.smk +++ b/workflow/rules/semibin.smk @@ -2,13 +2,14 @@ rule semibin_generate_data_multi: input: fasta=rules.combine_contigs.output, - bams=expand(rules.sort_bam.output, sample=SAMPLES), + bams=get_bams_of_bingroup, output: - expand( - "Cobinning/SemiBin/samples/{sample}/{files}", - sample=SAMPLES, - files=["data.csv", "data_split.csv"], - ), + directory("Intermediate/cobinning/{bingroup}/semibin/data_multi"), + # expand( + # "Cobinning/SemiBin/samples/{sample}/{files}", + # sample=SAMPLES, + # files=["data.csv", "data_split.csv"], + # ), conda: "../envs/semibin.yaml" threads: config["threads"] @@ -16,17 +17,17 @@ rule semibin_generate_data_multi: mem=config["mem"], time=config["runtime"]["default"], log: - "logs/semibin/generate_data_multi.log", + "logs/semibin/{bingroup}/generate_data_multi.log", benchmark: - "logs/benchmarks/semibin/generate_data_multi.tsv" + "logs/benchmarks/semibin/{bingroup}/generate_data_multi.tsv" params: - output_dir="Cobinning/SemiBin", + # output_dir="Cobinning/SemiBin", separator=config["cobinning_separator"], shell: "SemiBin generate_sequence_features_multi" " --input-fasta {input.fasta} " " --input-bam {input.bams} " - " --output {params.output_dir} " + " --output {output} " " --threads {threads} " " --separator {params.separator} " " 2> {log}" @@ -34,13 +35,12 @@ rule semibin_generate_data_multi: rule semibin_train: input: - "{sample}/{sample}_contigs.fasta", - fasta=rules.filter_contigs.output, - bams=expand(rules.sort_bam.output, sample=SAMPLES), - data="Cobinning/SemiBin/samples/{sample}/data.csv", - data_split="Cobinning/SemiBin/samples/{sample}/data_split.csv", + flag=get_assembly, + fasta_sample=rules.filter_contigs.output[0], + bams=get_bams_of_bingroup, + data_folder=rules.semibin_generate_data_multi.output[0], output: - "Cobinning/SemiBin/{sample}/model.h5", + "Intermediate/cobinning/{bingroup}/semibin/models/{sample}/model.h5", conda: "../envs/semibin.yaml" threads: config["threads"] @@ -48,31 +48,65 @@ rule semibin_train: mem=config["mem"], time=config["runtime"]["default"], log: - "logs/semibin/train/{sample}.log", + "logs/semibin/{bingroup}/train/{sample}.log", benchmark: - "logs/benchmarks/semibin/train/{sample}.tsv" + "logs/benchmarks/semibin/{bingroup}/train/{sample}.tsv" params: output_dir=lambda wc, output: os.path.dirname(output[0]), + data=lambda wc, input: Path(input.data_folder) + / "samples" + / wc.sample + / "data.csv", + data_split=lambda wc, input: Path(input.data_folder) + / "samples" + / wc.sample + / "data_split.csv", extra=config["semibin_train_extra"], shell: "SemiBin train_self " " --output {params.output_dir} " " --threads {threads} " - " --data {input.data} " - " --data-split {input.data_split} " + " --data {params.data} " + " --data-split {params.data_split} " " {params.extra} " " 2> {log}" +def semibin_input(wildcards): + + bingroup_of_sample = sampleTable.loc[wildcards.sample, "BinGroup"] + samples_of_bingroup = sampleTable.query( + f'BinGroup=="{bingroup_of_sample}"' + ).index.tolist() + + assert len(samples_of_bingroup) > 1 + + mapping = dict( + fasta=rules.filter_contigs.output[0].format(**wildcards), + bams=expand( + 
"Intermediate/cobinning/{bingroup}/bams/{sample}.sorted.bam", + sample=samples_of_bingroup, + bingroup=bingroup_of_sample, + ), + data_folder=rules.semibin_generate_data_multi.output[0].format( + bingroup=bingroup_of_sample, **wildcards + ), + model=rules.semibin_train.output[0].format( + bingroup=bingroup_of_sample, **wildcards + ), + ) + + return mapping + + rule run_semibin: input: - "{sample}/{sample}_contigs.fasta", - fasta=rules.filter_contigs.output, - bams=expand(rules.sort_bam.output, sample=SAMPLES), - data="Cobinning/SemiBin/samples/{sample}/data.csv", - model=rules.semibin_train.output[0], + unpack(semibin_input), output: - directory("Cobinning/SemiBin/{sample}/output_recluster_bins/"), + # contains no info to bingroup + directory( + "Intermediate/cobinning/semibin_output/{sample}/output_recluster_bins/" + ), conda: "../envs/semibin.yaml" threads: config["threads"] @@ -84,7 +118,11 @@ rule run_semibin: benchmark: "logs/benchmarks/semibin/bin/{sample}.tsv" params: - output_dir="Cobinning/SemiBin/{sample}/", + output_dir=lambda wc, output: os.path.dirname(output[0]), + data=lambda wc, input: Path(input.data_folder) + / "samples" + / wc.sample + / "data.csv", min_bin_kbs=int(config["cobining_min_bin_size"] / 1000), extra=config["semibin_options"], shell: @@ -92,7 +130,7 @@ rule run_semibin: " --input-fasta {input.fasta} " " --output {params.output_dir} " " --threads {threads} " - " --data {input.data} " + " --data {params.data} " " --model {input.model} " " --minfasta-kbs {params.min_bin_kbs}" " {params.extra} " @@ -103,6 +141,9 @@ localrules: parse_semibin_output, +ruleorder: parse_semibin_output > get_unique_cluster_attribution + + rule parse_semibin_output: input: rules.run_semibin.output[0], @@ -120,5 +161,4 @@ rule parse_semibin_output: rule semibin: input: - expand("Cobinning/SemiBin/{sample}/output_recluster_bins/", sample=SAMPLES), expand("{sample}/binning/SemiBin/cluster_attribution.tsv", sample=SAMPLES), diff --git a/workflow/scripts/combine_busco.py b/workflow/scripts/combine_busco.py index ed91e96b..eea83ad2 100644 --- a/workflow/scripts/combine_busco.py +++ b/workflow/scripts/combine_busco.py @@ -52,7 +52,7 @@ def main(samples, completeness_files, bin_table): failed_genomes = df.index[df.Dataset.str.lower().str.contains("run failed")] if len(failed_genomes) > 0: - logging.warn( + logging.warning( "Following genomes didn't pass BUSCO. I ignore them, because " "I think theas means they are too bad to be quantified:\n" f"{failed_genomes}" diff --git a/workflow/scripts/get_fasta_of_bins.py b/workflow/scripts/get_fasta_of_bins.py new file mode 100644 index 00000000..ff8c65d7 --- /dev/null +++ b/workflow/scripts/get_fasta_of_bins.py @@ -0,0 +1,109 @@ +#! 
/usr/bin/env python + + +import sys, os +import logging, traceback + +logging.basicConfig( + filename=snakemake.log[0], + level=logging.DEBUG, + format="%(asctime)s %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", +) + + +def handle_exception(exc_type, exc_value, exc_traceback): + if issubclass(exc_type, KeyboardInterrupt): + sys.__excepthook__(exc_type, exc_value, exc_traceback) + return + + logging.error( + "".join( + [ + "Uncaught exception: ", + *traceback.format_exception(exc_type, exc_value, exc_traceback), + ] + ) + ) + + +# Install exception handler +sys.excepthook = handle_exception + + +# start of script +import argparse +import os, sys +import shutil +import warnings + +import pandas as pd +from Bio import SeqIO + + +def get_fasta_of_bins(cluster_attribution, contigs_file, out_folder): + """ + Creates individual fasta files for each bin using the contigs fasta and the cluster attribution. + + input: + - cluster attribution file: tab seperated file of "contig_fasta_header bin" + - contigs: fasta file of contigs + - out_prefix: output_prefix for bin fastas {out_folder}/{binid}.fasta + """ + # create outdir + if os.path.exists(out_folder): + shutil.rmtree(out_folder) + os.makedirs(out_folder) + + CA = pd.read_csv(cluster_attribution, header=None, sep="\t", dtype=str) + + assert CA.shape[1] == 2, "File should have only two columns " + cluster_attribution + + CA.columns = ["Contig", "Bin"] + + # # assert that Contig is unique + # assert CA.Contig.is_unique, ( + # f"First column of file {cluster_attribution} should be contigs, hence unique" + # f"I got\n{CA.head()}" + # ) + + logging.info(f"index fasta file {contigs_file} for fast access") + contig_fasta_dict = SeqIO.index(str(contigs_file), "fasta") + + assert len(contig_fasta_dict) > 0, "No contigs in your fasta" + + unique_bins = CA.Bin.unique() + + assert len(unique_bins) >= 1, "No bins found" + + for binid in unique_bins: + bin_contig_names = CA.loc[CA.Bin == binid, "Contig"].tolist() + out_file = os.path.join(out_folder, f"{binid}.fasta") + + assert ( + len(bin_contig_names) >= 1 + ), f"No contigs found for bin {binid} in {cluster_attribution}" + + if len(bin_contig_names) == 1: + warnings.warn(f"single contig bin Bin : {binid} {bin_contig_names}") + + logging.debug(f"Found {len(bin_contig_names)} contigs {bin_contig_names}") + + fasta_contigs = [contig_fasta_dict[c] for c in bin_contig_names] + SeqIO.write(fasta_contigs, out_file, "fasta") + + +if __name__ == "__main__": + if "snakemake" not in globals(): + p = argparse.ArgumentParser() + p.add_argument("--cluster-attribution") + p.add_argument("--contigs") + p.add_argument("--out-folder") + args = vars(p.parse_args()) + get_fasta_of_bins(**args) + else: + get_fasta_of_bins( + snakemake.input.cluster_attribution, + snakemake.input.contigs, + snakemake.output[0], + ) diff --git a/workflow/scripts/parse_vamb.py b/workflow/scripts/parse_vamb.py index 1bbe6225..2386e9c3 100644 --- a/workflow/scripts/parse_vamb.py +++ b/workflow/scripts/parse_vamb.py @@ -3,7 +3,7 @@ logging.basicConfig( filename=snakemake.log[0], - level=logging.DEBUG, + level=logging.INFO, format="%(asctime)s %(message)s", datefmt="%Y-%m-%d %H:%M:%S", ) @@ -29,61 +29,100 @@ def handle_exception(exc_type, exc_value, exc_traceback): import pandas as pd +from pathlib import Path from utils.utils import gen_names_for_range from utils.fasta import parse_fasta_headers -bin_dir = os.path.join(snakemake.input[0], "bins") + fasta_extension = snakemake.params.fasta_extension separator = snakemake.params.separator +# path with 
{sample} to replace cluster_output_path = snakemake.params.output_path -vamb_cluster_file = os.path.join(snakemake.input[0], "clusters.tsv") + +# cluster.tsv.gz file for all samples of all bingroups output_culsters = snakemake.output.renamed_clusters -big_bins = [] +all_clusters = [] -for file in os.listdir(bin_dir): - bin_name, extension = os.path.splitext(file) - logging.debug(f"Found file {bin_name} with extension {extension}") +for i in range(len(list(snakemake.input))): + vamb_folder = Path(snakemake.input[i]) - if extension == fasta_extension: - big_bins.append(bin_name) + bingroup = snakemake.params.bingroups[i] + logging.info(f"Parse vamb output for bingroup {bingroup}") -logging.info( - f"Found {len(big_bins)} bins created by Vamb (above size limit)\n" - f"E.g. {big_bins[:5]}" -) + # path to the bins folder + bin_dir = vamb_folder / "bins" + vamb_cluster_file = vamb_folder / "clusters.tsv" + + # Get a list of binds that are big enough. Not all bins in the vamb_cluster_file, pass the size filter + big_bins = [] + + for file in os.listdir(bin_dir): + bin_name, extension = os.path.splitext(file) + logging.debug(f"Found file {bin_name} with extension {extension}") -logging.info(f"Load vamb cluster file {vamb_cluster_file}") -clusters_contigs = pd.read_table(vamb_cluster_file, header=None) + if extension == fasta_extension: + big_bins.append(bin_name) -clusters_contigs.columns = ["OriginalName", "Contig"] + logging.info( + f"Found {len(big_bins)} bins created by Vamb (above size limit)\n" + f"E.g. {big_bins[:5]}" + ) + + logging.info(f"Load vamb cluster file {vamb_cluster_file}") + clusters_contigs = pd.read_table(vamb_cluster_file, header=None) + clusters_contigs.columns = ["OriginalName", "Contig"] + + # split contigs by separator. This is mainly done for compatibility with SemiBin + clusters = clusters_contigs.Contig.str.rsplit(separator, n=1, expand=True) + clusters.columns = ["Sample", "Contig"] + + # get number of BinID given by vamb, prefix with bingroup + clusters["BinId"] = ( + bingroup + + clusters_contigs.OriginalName.str.rsplit(separator, n=1, expand=True)[1] + ) + # Add information if the bin is large enough + clusters["OriginalName"] = clusters_contigs.OriginalName + clusters["Large_enough"] = clusters.OriginalName.isin(big_bins) -clusters = clusters_contigs.Contig.str.rsplit(separator, n=1, expand=True) -clusters.columns = ["Sample", "Contig"] + # Add information about the bingroup + clusters["BinGroup"] = bingroup -clusters["BinID"] = clusters_contigs.OriginalName.str.rsplit( - separator, n=1, expand=True -)[1] -clusters["OriginalName"] = clusters_contigs.OriginalName + all_clusters.append(clusters) -clusters["Large_enough"] = clusters.OriginalName.isin(big_bins) + del clusters_contigs + + +logging.info(f"Concatenate clusters of all bingroups") +clusters = pd.concat(all_clusters, axis=0) + + +n_bins = ( + clusters.query("Large_enough").groupby(["BinGroup", "Sample"])["BinId"].nunique() +) +logging.info( + f"Number of bins per sample and bingroup passing the size filter:\n{n_bins}" +) + + +clusters["SampleBin"] = clusters.Sample + "_vamb_" + clusters.BinId +clusters.loc[~clusters.Large_enough, "SampleBin"] = "" -del clusters_contigs logging.info(f"Write reformated table to {output_culsters}") clusters.to_csv(output_culsters, sep="\t", index=False) +# filter for following clusters = clusters.query("Large_enough") -clusters["SampleBin"] = clusters.Sample + "_vamb_" + clusters.BinID - logging.info(f"Write cluster_attribution for samples") for sample, cl in 
clusters.groupby("Sample"): sample_output_path = cluster_output_path.format(sample=sample)
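The contig-renaming scheme shared by the `combine_contigs` rule and `parse_vamb.py` can be summarised in a small, self-contained sketch. It assumes only the default `cobinning_separator` (":") set in `make_config.py`; the sample and contig names are invented for the example.

# Sketch of the contig-name round trip used for co-binning.
# combine_contigs prefixes every contig header with "<sample><separator>", and
# parse_vamb.py recovers sample and contig with a single right-split.
separator = ":"  # default cobinning_separator

sample, original_header = "sample1", "sample1_contig_42"  # made-up names
combined_header = f"{sample}{separator}{original_header}"

recovered_sample, recovered_contig = combined_header.rsplit(separator, maxsplit=1)
assert recovered_sample == sample
assert recovered_contig == original_header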