diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..3087ba4 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,15 @@ +include *.md +include LICENSE + +include bin/graphbin + +recursive-include graphbin/ * +recursive-include graphbin/bidirectionalmap * +recursive-include graphbin/labelpropagation * + +global-exclude graphbin/ *.pyc +global-exclude graphbin/__pycache__ *.pyc +global-exclude graphbin/bidirectionalmap *.pyc +global-exclude graphbin/bidirectionalmap/__pycache__ *.pyc +global-exclude graphbin/labelpropagation *.pyc +global-exclude graphbin/labelpropagation/__pycache__ *.pyc diff --git a/bin/graphbin b/bin/graphbin new file mode 100755 index 0000000..9614b4f --- /dev/null +++ b/bin/graphbin @@ -0,0 +1,132 @@ +#!/usr/bin/env python3 + +"""graphbin: Refined binning of metagenomic contigs using assembly graphs.""" + +import argparse +import os +import sys + +from graphbin import ( + graphbin_Options, + graphbin_Canu, + graphbin_Flye, + graphbin_MEGAHIT, + graphbin_Miniasm, + graphbin_SGA, + graphbin_SPAdes, +) + +__author__ = "Vijini Mallawaarachchi, Anuradha Wickramarachchi, and Yu Lin" +__copyright__ = "Copyright 2020, GraphBin Project" +__credits__ = "Benjamin Kaehler and Gavin Huttley" +__license__ = "GPL" +__version__ = "1.1" +__maintainer__ = "Vijini Mallawaarachchi" +__email__ = "vijini.mallawaarachchi@anu.edu.au" +__status__ = "Stable Release" + + +def run(args): + RUNNER = { + "canu": graphbin_Canu.run, + "flye": graphbin_Flye.run, + "megahit": graphbin_MEGAHIT.run, + "miniasm": graphbin_Miniasm.run, + "sga": graphbin_SGA.run, + "spades": graphbin_SPAdes.run, + } + RUNNER[args.assembler](args) + + +def main(): + parser = graphbin_Options.PARSER + parser.add_argument( + "--assembler", + type=str, + help="name of the assembler used (SPAdes, SGA or MEGAHIT). GraphBin supports Flye, Canu and Miniasm long-read assemblies as well.", + ) + parser.add_argument( + "--paths", + default=None, + required=False, + help="path to the contigs.paths file, only SPAdes need", + ) + + args = parser.parse_args() + + if args.version: + print("GraphBin version %s" % __version__) + sys.exit(0) + + # Validation of inputs + # --------------------------------------------------- + # Check assembler type + if not ( + args.assembler.lower() == "spades" + or args.assembler.lower() == "sga" + or args.assembler.lower() == "megahit" + or args.assembler.lower() == "flye" + or args.assembler.lower() == "canu" + or args.assembler.lower() == "miniasm" + ): + print( + "\nPlease make sure to provide the correct assembler type (SPAdes, SGA or MEGAHIT). GraphBin supports Flye, Canu and Miniasm long-read assemblies as well." + ) + print("Exiting GraphBin...\nBye...!\n") + sys.exit(1) + + # Check assembly graph file + if not os.path.isfile(args.graph): + print("\nFailed to open the assembly graph file.") + print("Exiting GraphBin...\nBye...!\n") + sys.exit(1) + + # Check if paths files is provided when the assembler type is SPAdes + if args.assembler.lower() == "spades" and args.paths is None: + print("\nPlease make sure to provide the path to the contigs.paths file.") + print("Exiting GraphBin...\nBye...!\n") + sys.exit(1) + + # Check contigs.paths file for SPAdes + if args.assembler.lower() == "spades" and not os.path.isfile(args.paths): + print("\nFailed to open the contigs.paths file.") + print("Exiting GraphBin...\nBye...!\n") + sys.exit(1) + + # Check the file with the initial binning output + if not os.path.isfile(args.binned): + print("\nFailed to open the file with the initial binning output.") + print("Exiting GraphBin...\nBye...!\n") + sys.exit(1) + + # Handle for missing trailing forwardslash in output folder path + if args.output[-1:] != "/": + args.output = args.output + "/" + + # Create output folder if it does not exist + os.makedirs(args.output, exist_ok=True) + + # Validate prefix + if args.prefix != "": + if not args.prefix.endswith("_"): + args.prefix = args.prefix + "_" + + # Validate max_iteration + if args.max_iteration <= 0: + print("\nPlease enter a valid number for max_iteration") + print("Exiting GraphBin...\nBye...!\n") + sys.exit(1) + + # Validate diff_threshold + if args.diff_threshold < 0: + print("\nPlease enter a valid number for diff_threshold") + print("Exiting GraphBin...\nBye...!\n") + sys.exit(1) + + # Run GraphBin + # --------------------------------------------------- + run(args) + + +if __name__ == "__main__": + main() diff --git a/graphbin.py b/graphbin.py deleted file mode 100644 index 63c51af..0000000 --- a/graphbin.py +++ /dev/null @@ -1,206 +0,0 @@ -#!/usr/bin/env python3 - -"""graphbin.py: Refined binning of metagenomic contigs using assembly graphs.""" - -import argparse -import os -import sys -import subprocess - -__author__ = "Vijini Mallawaarachchi, Anuradha Wickramarachchi, and Yu Lin" -__copyright__ = "Copyright 2020, GraphBin Project" -__credits__ = "Benjamin Kaehler and Gavin Huttley" -__license__ = "GPL" -__version__ = "1.1" -__maintainer__ = "Vijini Mallawaarachchi" -__email__ = "vijini.mallawaarachchi@anu.edu.au" -__status__ = "Stable Release" - -parser = argparse.ArgumentParser(description="""GraphBin Help. GraphBin is a metagenomic contig binning tool -that makes use of the contig connectivity information from the assembly graph to bin contigs. It utilizes the -binning result of an existing binning tool and a label propagation algorithm to correct mis-binned contigs -and predict the labels of contigs which are discarded due to short length.""") - -parser.add_argument("--assembler", - required=True, - type=str, - help="name of the assembler used (SPAdes, SGA or MEGAHIT). GraphBin supports Flye, Canu and Miniasm long-read assemblies as well.") - -parser.add_argument("--graph", - required=True, - type=str, - help="path to the assembly graph file") - -parser.add_argument("--paths", - required=False, - type=str, - help="path to the contigs.paths file") - -parser.add_argument("--binned", - required=True, - type=str, - help="path to the .csv file with the initial binning output from an existing tool") - -parser.add_argument("--output", - required=True, - type=str, - help="path to the output folder") - -parser.add_argument("--prefix", - required=False, - type=str, - default='', - help="prefix for the output file") - -parser.add_argument("--max_iteration", - required=False, - type=int, - default=100, - help="maximum number of iterations for label propagation algorithm. [default: 100]") - -parser.add_argument("--diff_threshold", - required=False, - type=float, - default=0.1, - help="difference threshold for label propagation algorithm. [default: 0.1]") - -args = vars(parser.parse_args()) - - -assembler = args["assembler"] -assembly_graph_file = args["graph"] -contig_paths = args["paths"] -contig_bins_file = args["binned"] -output_path = args["output"] -prefix = args["prefix"] -max_iteration = args["max_iteration"] -diff_threshold = args["diff_threshold"] - - -# Validation of inputs -#--------------------------------------------------- - -# Check assembler type -if not (assembler.lower() == "spades" or assembler.lower() == "sga" or assembler.lower() == "megahit" or assembler.lower() == "flye" or assembler.lower() == "canu" or assembler.lower() == "miniasm"): - print("\nPlease make sure to provide the correct assembler type (SPAdes, SGA or MEGAHIT). GraphBin supports Flye, Canu and Miniasm long-read assemblies as well.") - print("Exiting GraphBin...\nBye...!\n") - sys.exit(1) - -# Check assembly graph file -if not os.path.isfile(assembly_graph_file): - print("\nFailed to open the assembly graph file.") - print("Exiting GraphBin...\nBye...!\n") - sys.exit(1) - -# Check if paths files is provided when the assembler type is SPAdes -if assembler.lower() == "spades" and contig_paths is None: - print("\nPlease make sure to provide the path to the contigs.paths file.") - print("Exiting GraphBin...\nBye...!\n") - sys.exit(1) - -# Check contigs.paths file for SPAdes -if assembler.lower() == "spades" and not os.path.isfile(contig_paths): - print("\nFailed to open the contigs.paths file.") - print("Exiting GraphBin...\nBye...!\n") - sys.exit(1) - -# Check the file with the initial binning output -if not os.path.isfile(contig_bins_file): - print("\nFailed to open the file with the initial binning output.") - print("Exiting GraphBin...\nBye...!\n") - sys.exit(1) - -# Handle for missing trailing forwardslash in output folder path -if output_path[-1:] != "/": - output_path = output_path + "/" - -# Create output folder if it does not exist -if not os.path.isdir(output_path): - subprocess.run("mkdir -p "+output_path, shell=True) - -# Validate prefix -if args["prefix"] != '': - if args["prefix"].endswith("_"): - prefix = args["prefix"] - else: - prefix = args["prefix"]+"_" -else: - prefix = '' - -# Validate max_iteration -if args["max_iteration"] <= 0: - print("\nPlease enter a valid number for max_iteration") - print("Exiting GraphBin...\nBye...!\n") - sys.exit(1) - -# Validate diff_threshold -if args["diff_threshold"] < 0: - print("\nPlease enter a valid number for diff_threshold") - print("Exiting GraphBin...\nBye...!\n") - sys.exit(1) - - -# Run GraphBin -#--------------------------------------------------- -if assembler.lower() == "spades": - cmdGraphBin = """python "{0}src/graphbin_SPAdes.py" --graph "{1}" --paths "{2}" --binned "{3}" --output "{4}" --prefix "{5}" --max_iteration "{6}" --diff_threshold "{7}" """.format( - os.path.dirname(__file__), - assembly_graph_file, - contig_paths, - contig_bins_file, - output_path, - prefix, - max_iteration, - diff_threshold) - -elif assembler.lower() == "sga": - cmdGraphBin = """python "{0}src/graphbin_SGA.py" --graph "{1}" --binned "{2}" --output "{3}" --prefix "{4}" --max_iteration "{5}" --diff_threshold "{6}" """.format( - os.path.dirname(__file__), - assembly_graph_file, - contig_bins_file, - output_path, - prefix, - max_iteration, - diff_threshold) - -elif assembler.lower() == "megahit": - cmdGraphBin = """python "{0}src/graphbin_MEGAHIT.py" --graph "{1}" --binned "{2}" --output "{3}" --prefix "{4}" --max_iteration "{5}" --diff_threshold "{6}" """.format( - os.path.dirname(__file__), - assembly_graph_file, - contig_bins_file, - output_path, - prefix, - max_iteration, - diff_threshold) - -elif assembler.lower() == "flye": - cmdGraphBin = """python "{0}src/graphbin_Flye.py" --graph "{1}" --binned "{2}" --output "{3}" --prefix "{4}" --max_iteration "{5}" --diff_threshold "{6}" """.format( - os.path.dirname(__file__), - assembly_graph_file, - contig_bins_file, - output_path, - prefix, - max_iteration, - diff_threshold) - -elif assembler.lower() == "canu": - cmdGraphBin = """python "{0}src/graphbin_Canu.py" --graph "{1}" --binned "{2}" --output "{3}" --prefix "{4}" --max_iteration "{5}" --diff_threshold "{6}" """.format( - os.path.dirname(__file__), - assembly_graph_file, - contig_bins_file, - output_path, - prefix, - max_iteration, - diff_threshold) - -elif assembler.lower() == "miniasm": - cmdGraphBin = """python "{0}src/graphbin_Miniasm.py" --graph "{1}" --binned "{2}" --output "{3}" --prefix "{4}" --max_iteration "{5}" --diff_threshold "{6}" """.format( - os.path.dirname(__file__), - assembly_graph_file, - contig_bins_file, - output_path, - prefix, - max_iteration, - diff_threshold) - -os.system(cmdGraphBin) diff --git a/src/bidirectionalmap/bidirectionalmap.py b/graphbin/bidirectionalmap/bidirectionalmap.py similarity index 100% rename from src/bidirectionalmap/bidirectionalmap.py rename to graphbin/bidirectionalmap/bidirectionalmap.py diff --git a/graphbin/graphbin_Canu.py b/graphbin/graphbin_Canu.py new file mode 100755 index 0000000..c19bd78 --- /dev/null +++ b/graphbin/graphbin_Canu.py @@ -0,0 +1,525 @@ +#!/usr/bin/env python3 + +"""graphbin_Canu.py: Refined binning of metagenomic contigs using Canu assembly graphs. + +GraphBin is a metagenomic contig binning tool that makes use of the contig +connectivity information from the assembly graph to bin contigs. It utilizes +the binning result of an existing binning tool and a label propagation algorithm +to correct mis-binned contigs and predict the labels of contigs which are +discarded due to short length. + +graphbin_Canu.py makes use of the assembly graphs produced by Canu long read assembler. +""" + +import sys +import os +import subprocess +import csv +import operator +import time +import argparse +import re +import logging + +from igraph import * +from graphbin.labelpropagation.labelprop import LabelProp +from graphbin.bidirectionalmap.bidirectionalmap import BidirectionalMap +from graphbin.graphbin_Func import getClosestLabelledVertices +from graphbin.graphbin_Options import PARSER + +# Sample command +# ------------------------------------------------------------------- +# python graphbin_Canu.py --graph /path/to/graph_file.asqg +# --binned /path/to/binning_result.csv +# --output /path/to/output_folder +# ------------------------------------------------------------------- + + +def run(args): + # Setup logger + #----------------------- + + logger = logging.getLogger('GraphBin 1.1') + logger.setLevel(logging.INFO) + formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') + consoleHeader = logging.StreamHandler() + consoleHeader.setFormatter(formatter) + logger.addHandler(consoleHeader) + + start_time = time.time() + + + + assembly_graph_file = args.graph + contig_bins_file = args.binned + output_path = args.output + prefix = args.prefix + max_iteration = args.max_iteration + diff_threshold = args.diff_threshold + + + # Setup output path for log file + #--------------------------------------------------- + + fileHandler = logging.FileHandler(output_path+"/"+prefix+"graphbin.log") + fileHandler.setLevel(logging.INFO) + fileHandler.setFormatter(formatter) + logger.addHandler(fileHandler) + + logger.info("Welcome to GraphBin: Refined Binning of Metagenomic Contigs using Assembly Graphs.") + logger.info("This version of GraphBin makes use of the assembly graph produced by Canu which is a long reads assembler based on the OLC approach.") + + logger.info("Assembly graph file: "+assembly_graph_file) + logger.info("Existing binning output file: "+contig_bins_file) + logger.info("Final binning output file: "+output_path) + logger.info("Maximum number of iterations: "+str(max_iteration)) + logger.info("Difference threshold: "+str(diff_threshold)) + + logger.info("GraphBin started") + + + # Get the number of bins from the initial binning result + #-------------------------------------------------------- + + n_bins = 0 + + try: + all_bins_list = [] + + with open(contig_bins_file) as csvfile: + readCSV = csv.reader(csvfile, delimiter=',') + for row in readCSV: + all_bins_list.append(row[1]) + + bins_list = list(set(all_bins_list)) + bins_list.sort() + + n_bins = len(bins_list) + logger.info("Number of bins available in the binning result: "+str(n_bins)) + except: + logger.error("Please make sure that the correct path to the binning result file is provided and it is having the correct format.") + logger.info("Exiting GraphBin... Bye...!") + sys.exit(1) + + + logger.info("Constructing the assembly graph") + + # Get the links from the .gfa file + #----------------------------------- + + my_map = BidirectionalMap() + + node_count = 0 + + nodes = [] + + links = [] + + try: + # Get contig connections from .gfa file + with open(assembly_graph_file) as file: + line = file.readline() + + while line != "": + + # Count the number of contigs + if "S" in line: + strings = line.split("\t") + my_node = strings[1] + my_map[node_count] = my_node + nodes.append(my_node) + node_count += 1 + + # Identify lines with link information + elif "L" in line: + + link = [] + strings = line.split("\t") + + if strings[1] != strings[3]: + start = strings[1] + end = strings[3] + link.append(start) + link.append(end) + links.append(link) + + line = file.readline() + + except: + logger.error("Please make sure that the correct path to the assembly graph file is provided.") + logger.info("Exiting GraphBin... Bye...!") + sys.exit(1) + + contigs_map = my_map + contigs_map_rev = my_map.inverse + + logger.info("Total number of contigs available: "+str(node_count)) + + + ## Construct the assembly graph + #------------------------------- + + try: + + # Create the graph + assembly_graph = Graph() + + # Create list of edges + edge_list = [] + + # Add vertices + assembly_graph.add_vertices(node_count) + + # Name vertices + for i in range(len(assembly_graph.vs)): + assembly_graph.vs[i]["id"]= i + assembly_graph.vs[i]["label"]= str(contigs_map[i]) + + # Iterate links + for link in links: + # Remove self loops + if link[0] != link[1]: + # Add edge to list of edges + edge_list.append((contigs_map_rev[link[0]], contigs_map_rev[link[1]])) + + # Add edges to the graph + assembly_graph.add_edges(edge_list) + assembly_graph.simplify(multiple=True, loops=False, combine_edges=None) + + except: + logger.error("Please make sure that the correct path to the assembly graph file is provided.") + logger.info("Exiting GraphBin... Bye...!") + sys.exit(1) + + logger.info("Total number of edges in the assembly graph: "+str(len(edge_list))) + + + # Get initial binning result + #---------------------------- + + logger.info("Obtaining the initial binning result") + + bins = [[] for x in range(n_bins)] + + try: + with open(contig_bins_file) as contig_bins: + readCSV = csv.reader(contig_bins, delimiter=',') + for row in readCSV: + contig_num = contigs_map_rev[row[0]] + + bin_num = int(row[1])-1 + bins[bin_num].append(contig_num) + + for i in range(n_bins): + bins[i].sort() + + except: + logger.error("Please make sure that you have provided the correct assembler type and the correct path to the binning result file in the correct format.") + logger.info("Exiting GraphBin... Bye...!") + sys.exit(1) + + + # Remove labels of ambiguous vertices + #------------------------------------- + + logger.info("Determining ambiguous vertices") + + remove_labels = [] + + neighbours_have_same_label_list = [] + + for b in range(n_bins): + + for i in bins[b]: + + my_bin = b + + dist = {} + + # Get set of closest labelled vertices with distance = 1 + closest_neighbours = assembly_graph.neighbors(i, mode=ALL) + + # Determine whether all the closest labelled vertices have the same label as its own + neighbours_have_same_label = True + + neighbours_binned = False + + for neighbour in closest_neighbours: + for k in range(n_bins): + if neighbour in bins[k]: + neighbours_binned = True + if k != my_bin: + neighbours_have_same_label = False + break + + if not neighbours_have_same_label: + remove_labels.append(i) + elif neighbours_binned: + neighbours_have_same_label_list.append(i) + + for i in remove_labels: + for n in range(n_bins): + if i in bins[n]: + bins[n].remove(i) + + # Further remove labels of ambiguous vertices + binned_contigs = [] + + for n in range(n_bins): + binned_contigs = sorted(binned_contigs+bins[n]) + + for b in range(n_bins): + + for i in bins[b]: + + if i not in neighbours_have_same_label_list: + + my_bin = b + + # Get set of closest labelled vertices + closest_neighbours = getClosestLabelledVertices(assembly_graph, i, binned_contigs) + + if len(closest_neighbours) > 0: + + # Determine whether all the closest labelled vertices have the same label as its own + neighbours_have_same_label = True + + for neighbour in closest_neighbours: + for k in range(n_bins): + if neighbour in bins[k]: + if k != my_bin: + neighbours_have_same_label = False + break + + if not neighbours_have_same_label and i not in remove_labels: + remove_labels.append(i) + + remove_labels.sort() + + logger.info("Removing labels of ambiguous vertices") + + # Remove labels of ambiguous vertices + for i in remove_labels: + for n in range(n_bins): + if i in bins[n]: + bins[n].remove(i) + + logger.info("Obtaining refined binning result") + + + # Get vertices which are not isolated and not in components without any labels + #----------------------------------------------------------------------------- + + logger.info("Deteremining vertices which are not isolated and not in components without any labels") + + non_isolated = [] + + for i in range(node_count): + + if i not in non_isolated and i in binned_contigs: + + component = [] + component.append(i) + length = len(component) + neighbours = assembly_graph.neighbors(i, mode=ALL) + + for neighbor in neighbours: + if neighbor not in component: + component.append(neighbor) + + component = list(set(component)) + + while length!= len(component): + + length = len(component) + + for j in component: + + neighbours = assembly_graph.neighbors(j, mode=ALL) + + for neighbor in neighbours: + if neighbor not in component: + component.append(neighbor) + + labelled = False + for j in component: + if j in binned_contigs: + labelled = True + break + + if labelled: + for j in component: + if j not in non_isolated: + non_isolated.append(j) + + logger.info("Number of non-isolated contigs: "+str(len(non_isolated))) + + + # Run label propagation + #----------------------- + + data = [] + + for contig in range(node_count): + + # Consider vertices that are not isolated + + if contig in non_isolated: + line = [] + line.append(contig) + + assigned = False + + for i in range(n_bins): + if contig in bins[i]: + line.append(i+1) + assigned = True + + if not assigned: + line.append(0) + + neighbours = assembly_graph.neighbors(contig, mode=ALL) + + neighs = [] + + for neighbour in neighbours: + n = [] + n.append(neighbour) + n.append(1.0) + neighs.append(n) + + line.append(neighs) + + data.append(line) + + lp = LabelProp() + + lp.load_data_from_mem(data) + + logger.info("Starting label propagation with eps="+str(diff_threshold)+" and max_iteration="+str(max_iteration)) + + ans = lp.run(diff_threshold, max_iteration, show_log=True, clean_result=False) + ans.sort() + + logger.info("Obtaining Label Propagation result") + + for l in ans: + for i in range(n_bins): + if l[1]==i+1 and l[0] not in bins[i]: + bins[i].append(l[0]) + + + # Remove labels of ambiguous vertices + #------------------------------------- + + logger.info("Determining ambiguous vertices") + + remove_labels = [] + + for b in range(n_bins): + + for i in bins[b]: + + my_bin = b + + closest_neighbours = assembly_graph.neighbors(i, mode=ALL) + + # Determine whether all the closest labelled vertices have the same label as its own + neighbours_have_same_label = True + + for neighbour in closest_neighbours: + for k in range(n_bins): + if neighbour in bins[k]: + if k != my_bin: + neighbours_have_same_label = False + break + + if not neighbours_have_same_label: + remove_labels.append(i) + + remove_labels.sort() + + logger.info("Removing labels of ambiguous vertices") + + # Remove labels of ambiguous vertices + for i in remove_labels: + for n in range(n_bins): + if i in bins[n]: + bins[n].remove(i) + + elapsed_time = time.time() - start_time + + logger.info("Obtaining the Final Refined Binning result") + + for i in range(n_bins): + bins[i].sort() + + # Print elapsed time for the process + logger.info("Elapsed time: "+str(elapsed_time)+" seconds") + + + # Write result to output file + #----------------------------- + + logger.info("Writing the Final Binning result to file") + + output_bins = [] + + for i in range(node_count): + for k in range(n_bins): + if i in bins[k]: + line = [] + line.append(str(contigs_map[i])) + line.append(k+1) + output_bins.append(line) + + output_file = output_path + prefix + 'graphbin_output.csv' + + with open(output_file, mode='w') as out_file: + output_writer = csv.writer(out_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) + + for row in output_bins: + output_writer.writerow(row) + + logger.info("Final binning results can be found at "+output_file) + + + unbinned_contigs = [] + + for i in range(node_count): + if i in remove_labels or i not in non_isolated: + line = [] + line.append(str(contigs_map[i])) + unbinned_contigs.append(line) + + if len(unbinned_contigs)!=0: + unbinned_file = output_path + prefix + 'graphbin_unbinned.csv' + + with open(unbinned_file, mode='w') as out_file: + output_writer = csv.writer(out_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) + + for row in unbinned_contigs: + output_writer.writerow(row) + + logger.info("Unbinned contigs can be found at "+unbinned_file) + + + # Exit program + #-------------- + + logger.info("Thank you for using GraphBin! Bye...!") + + logger.removeHandler(fileHandler) + logger.removeHandler(consoleHeader) + + +def main(): + # Setup argument parser + #----------------------- + ap = PARSER + args = ap.parse_args() + run(args) + + +if __name__ == "__main__": + main() diff --git a/graphbin/graphbin_Flye.py b/graphbin/graphbin_Flye.py new file mode 100755 index 0000000..72898be --- /dev/null +++ b/graphbin/graphbin_Flye.py @@ -0,0 +1,524 @@ +#!/usr/bin/env python3 + +"""graphbin_Flye.py: Refined binning of metagenomic contigs using Flye assembly graphs. + +GraphBin is a metagenomic contig binning tool that makes use of the contig +connectivity information from the assembly graph to bin contigs. It utilizes +the binning result of an existing binning tool and a label propagation algorithm +to correct mis-binned contigs and predict the labels of contigs which are +discarded due to short length. + +graphbin_Flye.py makes use of the assembly graphs produced by Flye long read assembler. +""" + +import sys +import os +import subprocess +import csv +import operator +import time +import argparse +import re +import logging + +from igraph import * +from graphbin.labelpropagation.labelprop import LabelProp +from graphbin.bidirectionalmap.bidirectionalmap import BidirectionalMap +from graphbin.graphbin_Func import getClosestLabelledVertices +from graphbin.graphbin_Options import PARSER + + +# Sample command +# ------------------------------------------------------------------- +# python graphbin_Flye.py --graph /path/to/graph_file.asqg +# --binned /path/to/binning_result.csv +# --output /path/to/output_folder +# ------------------------------------------------------------------- + + +def run(args): + # Setup logger + #----------------------- + logger = logging.getLogger('GraphBin 1.1') + logger.setLevel(logging.INFO) + formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') + consoleHeader = logging.StreamHandler() + consoleHeader.setFormatter(formatter) + logger.addHandler(consoleHeader) + + start_time = time.time() + + + assembly_graph_file = args.graph + contig_bins_file = args.binned + output_path = args.output + prefix = args.prefix + max_iteration = args.max_iteration + diff_threshold = args.diff_threshold + + + # Setup output path for log file + #--------------------------------------------------- + + fileHandler = logging.FileHandler(output_path+"/"+prefix+"graphbin.log") + fileHandler.setLevel(logging.INFO) + fileHandler.setFormatter(formatter) + logger.addHandler(fileHandler) + + logger.info("Welcome to GraphBin: Refined Binning of Metagenomic Contigs using Assembly Graphs.") + logger.info("This version of GraphBin makes use of the assembly graph produced by Flye which is a long reads assembler based on the de Bruijn graph approach.") + + logger.info("Assembly graph file: "+assembly_graph_file) + logger.info("Existing binning output file: "+contig_bins_file) + logger.info("Final binning output file: "+output_path) + logger.info("Maximum number of iterations: "+str(max_iteration)) + logger.info("Difference threshold: "+str(diff_threshold)) + + logger.info("GraphBin started") + + + # Get the number of bins from the initial binning result + #-------------------------------------------------------- + + n_bins = 0 + + try: + all_bins_list = [] + + with open(contig_bins_file) as csvfile: + readCSV = csv.reader(csvfile, delimiter=',') + for row in readCSV: + all_bins_list.append(row[1]) + + bins_list = list(set(all_bins_list)) + bins_list.sort() + + n_bins = len(bins_list) + logger.info("Number of bins available in the binning result: "+str(n_bins)) + except: + logger.error("Please make sure that the correct path to the binning result file is provided and it is having the correct format.") + logger.info("Exiting GraphBin... Bye...!") + sys.exit(1) + + + logger.info("Constructing the assembly graph") + + # Get the links from the .gfa file + #----------------------------------- + + my_map = BidirectionalMap() + + node_count = 0 + + nodes = [] + + links = [] + + try: + # Get contig connections from .gfa file + with open(assembly_graph_file) as file: + line = file.readline() + + while line != "": + + # Count the number of contigs + if "S" in line: + strings = line.split("\t") + my_node = strings[1] + my_map[node_count] = my_node + nodes.append(my_node) + node_count += 1 + + # Identify lines with link information + elif "L" in line: + + link = [] + strings = line.split("\t") + + if strings[1] != strings[3]: + start = strings[1] + end = strings[3] + link.append(start) + link.append(end) + links.append(link) + + line = file.readline() + + except: + logger.error("Please make sure that the correct path to the assembly graph file is provided.") + logger.info("Exiting GraphBin... Bye...!") + sys.exit(1) + + contigs_map = my_map + contigs_map_rev = my_map.inverse + + logger.info("Total number of contigs available: "+str(node_count)) + + + ## Construct the assembly graph + #------------------------------- + + try: + + # Create the graph + assembly_graph = Graph() + + # Create list of edges + edge_list = [] + + # Add vertices + assembly_graph.add_vertices(node_count) + + # Name vertices + for i in range(len(assembly_graph.vs)): + assembly_graph.vs[i]["id"]= i + assembly_graph.vs[i]["label"]= str(contigs_map[i]) + + # Iterate links + for link in links: + # Remove self loops + if link[0] != link[1]: + # Add edge to list of edges + edge_list.append((contigs_map_rev[link[0]], contigs_map_rev[link[1]])) + + # Add edges to the graph + assembly_graph.add_edges(edge_list) + assembly_graph.simplify(multiple=True, loops=False, combine_edges=None) + + except: + logger.error("Please make sure that the correct path to the assembly graph file is provided.") + logger.info("Exiting GraphBin... Bye...!") + sys.exit(1) + + logger.info("Total number of edges in the assembly graph: "+str(len(edge_list))) + + + # Get initial binning result + #---------------------------- + + logger.info("Obtaining the initial binning result") + + bins = [[] for x in range(n_bins)] + + try: + with open(contig_bins_file) as contig_bins: + readCSV = csv.reader(contig_bins, delimiter=',') + for row in readCSV: + contig_num = contigs_map_rev[row[0]] + + bin_num = int(row[1])-1 + bins[bin_num].append(contig_num) + + for i in range(n_bins): + bins[i].sort() + + except: + logger.error("Please make sure that you have provided the correct assembler type and the correct path to the binning result file in the correct format.") + logger.info("Exiting GraphBin... Bye...!") + sys.exit(1) + + + # Remove labels of ambiguous vertices + #------------------------------------- + + logger.info("Determining ambiguous vertices") + + remove_labels = [] + + neighbours_have_same_label_list = [] + + for b in range(n_bins): + + for i in bins[b]: + + my_bin = b + + dist = {} + + # Get set of closest labelled vertices with distance = 1 + closest_neighbours = assembly_graph.neighbors(i, mode=ALL) + + # Determine whether all the closest labelled vertices have the same label as its own + neighbours_have_same_label = True + + neighbours_binned = False + + for neighbour in closest_neighbours: + for k in range(n_bins): + if neighbour in bins[k]: + neighbours_binned = True + if k != my_bin: + neighbours_have_same_label = False + break + + if not neighbours_have_same_label: + remove_labels.append(i) + elif neighbours_binned: + neighbours_have_same_label_list.append(i) + + for i in remove_labels: + for n in range(n_bins): + if i in bins[n]: + bins[n].remove(i) + + # Further remove labels of ambiguous vertices + binned_contigs = [] + + for n in range(n_bins): + binned_contigs = sorted(binned_contigs+bins[n]) + + for b in range(n_bins): + + for i in bins[b]: + + if i not in neighbours_have_same_label_list: + + my_bin = b + + # Get set of closest labelled vertices + closest_neighbours = getClosestLabelledVertices(assembly_graph, i, binned_contigs) + + if len(closest_neighbours) > 0: + + # Determine whether all the closest labelled vertices have the same label as its own + neighbours_have_same_label = True + + for neighbour in closest_neighbours: + for k in range(n_bins): + if neighbour in bins[k]: + if k != my_bin: + neighbours_have_same_label = False + break + + if not neighbours_have_same_label and i not in remove_labels: + remove_labels.append(i) + + remove_labels.sort() + + logger.info("Removing labels of ambiguous vertices") + + # Remove labels of ambiguous vertices + for i in remove_labels: + for n in range(n_bins): + if i in bins[n]: + bins[n].remove(i) + + logger.info("Obtaining refined binning result") + + + # Get vertices which are not isolated and not in components without any labels + #----------------------------------------------------------------------------- + + logger.info("Deteremining vertices which are not isolated and not in components without any labels") + + non_isolated = [] + + for i in range(node_count): + + if i not in non_isolated and i in binned_contigs: + + component = [] + component.append(i) + length = len(component) + neighbours = assembly_graph.neighbors(i, mode=ALL) + + for neighbor in neighbours: + if neighbor not in component: + component.append(neighbor) + + component = list(set(component)) + + while length!= len(component): + + length = len(component) + + for j in component: + + neighbours = assembly_graph.neighbors(j, mode=ALL) + + for neighbor in neighbours: + if neighbor not in component: + component.append(neighbor) + + labelled = False + for j in component: + if j in binned_contigs: + labelled = True + break + + if labelled: + for j in component: + if j not in non_isolated: + non_isolated.append(j) + + logger.info("Number of non-isolated contigs: "+str(len(non_isolated))) + + + # Run label propagation + #----------------------- + + data = [] + + for contig in range(node_count): + + # Consider vertices that are not isolated + + if contig in non_isolated: + line = [] + line.append(contig) + + assigned = False + + for i in range(n_bins): + if contig in bins[i]: + line.append(i+1) + assigned = True + + if not assigned: + line.append(0) + + neighbours = assembly_graph.neighbors(contig, mode=ALL) + + neighs = [] + + for neighbour in neighbours: + n = [] + n.append(neighbour) + n.append(1.0) + neighs.append(n) + + line.append(neighs) + + data.append(line) + + lp = LabelProp() + + lp.load_data_from_mem(data) + + logger.info("Starting label propagation with eps="+str(diff_threshold)+" and max_iteration="+str(max_iteration)) + + ans = lp.run(diff_threshold, max_iteration, show_log=True, clean_result=False) + ans.sort() + + logger.info("Obtaining Label Propagation result") + + for l in ans: + for i in range(n_bins): + if l[1]==i+1 and l[0] not in bins[i]: + bins[i].append(l[0]) + + + # Remove labels of ambiguous vertices + #------------------------------------- + + logger.info("Determining ambiguous vertices") + + remove_labels = [] + + for b in range(n_bins): + + for i in bins[b]: + + my_bin = b + + closest_neighbours = assembly_graph.neighbors(i, mode=ALL) + + # Determine whether all the closest labelled vertices have the same label as its own + neighbours_have_same_label = True + + for neighbour in closest_neighbours: + for k in range(n_bins): + if neighbour in bins[k]: + if k != my_bin: + neighbours_have_same_label = False + break + + if not neighbours_have_same_label: + remove_labels.append(i) + + remove_labels.sort() + + logger.info("Removing labels of ambiguous vertices") + + # Remove labels of ambiguous vertices + for i in remove_labels: + for n in range(n_bins): + if i in bins[n]: + bins[n].remove(i) + + elapsed_time = time.time() - start_time + + logger.info("Obtaining the Final Refined Binning result") + + for i in range(n_bins): + bins[i].sort() + + # Print elapsed time for the process + logger.info("Elapsed time: "+str(elapsed_time)+" seconds") + + + # Write result to output file + #----------------------------- + + logger.info("Writing the Final Binning result to file") + + output_bins = [] + + for i in range(node_count): + for k in range(n_bins): + if i in bins[k]: + line = [] + line.append(str(contigs_map[i])) + line.append(k+1) + output_bins.append(line) + + output_file = output_path + prefix + 'graphbin_output.csv' + + with open(output_file, mode='w') as out_file: + output_writer = csv.writer(out_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) + + for row in output_bins: + output_writer.writerow(row) + + logger.info("Final binning results can be found at "+output_file) + + + unbinned_contigs = [] + + for i in range(node_count): + if i in remove_labels or i not in non_isolated: + line = [] + line.append(str(contigs_map[i])) + unbinned_contigs.append(line) + + if len(unbinned_contigs)!=0: + unbinned_file = output_path + prefix + 'graphbin_unbinned.csv' + + with open(unbinned_file, mode='w') as out_file: + output_writer = csv.writer(out_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) + + for row in unbinned_contigs: + output_writer.writerow(row) + + logger.info("Unbinned contigs can be found at "+unbinned_file) + + + # Exit program + #-------------- + + logger.info("Thank you for using GraphBin! Bye...!") + + logger.removeHandler(fileHandler) + logger.removeHandler(consoleHeader) + + +def main(): + # Setup argument parser + #----------------------- + ap = PARSER + args = ap.parse_args() + run(args) + + +if __name__ == "__main__": + main() diff --git a/graphbin/graphbin_Func.py b/graphbin/graphbin_Func.py new file mode 100644 index 0000000..a963e3f --- /dev/null +++ b/graphbin/graphbin_Func.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python3 + + +def getClosestLabelledVertices(graph, node, binned_contigs): + # Remove labels of ambiguous vertices + #------------------------------------- + + queu_l = [graph.neighbors(node, mode='ALL')] + visited_l = [node] + labelled = [] + + while len(queu_l) > 0: + active_level = queu_l.pop(0) + is_finish = False + visited_l += active_level + + for n in active_level: + if n in binned_contigs: + is_finish = True + labelled.append(n) + if is_finish: + return labelled + else: + temp = [] + for n in active_level: + temp += graph.neighbors(n, mode='ALL') + temp = list(set(temp)) + temp2 = [] + + for n in temp: + if n not in visited_l: + temp2.append(n) + if len(temp2) > 0: + queu_l.append(temp2) + return labelled diff --git a/graphbin/graphbin_MEGAHIT.py b/graphbin/graphbin_MEGAHIT.py new file mode 100755 index 0000000..c63670f --- /dev/null +++ b/graphbin/graphbin_MEGAHIT.py @@ -0,0 +1,527 @@ +#!/usr/bin/env python3 + +"""graphbin_MEGAHIT.py: Refined binning of metagenomic contigs using MEGAHIT assembly graphs. + +GraphBin is a metagenomic contig binning tool that makes use of the contig +connectivity information from the assembly graph to bin contigs. It utilizes +the binning result of an existing binning tool and a label propagation algorithm +to correct mis-binned contigs and predict the labels of contigs which are +discarded due to short length. + +graphbin_MEGAHIT.py makes use of the assembly graphs produced by MEGAHIT assembler. +""" + +import sys +import os +import subprocess +import csv +import operator +import time +import argparse +import re +import logging + +from igraph import * +from graphbin.labelpropagation.labelprop import LabelProp +from graphbin.bidirectionalmap.bidirectionalmap import BidirectionalMap +from graphbin.graphbin_Func import getClosestLabelledVertices +from graphbin.graphbin_Options import PARSER + +# Sample command +# ------------------------------------------------------------------- +# python graphbin_MEGAHIT.py --graph /path/to/graph_file.gfa +# --binned /path/to/binning_result.csv +# --output /path/to/output_folder +# ------------------------------------------------------------------- + + +def run(args): + # Setup logger + #----------------------- + + logger = logging.getLogger('GraphBin 1.1') + logger.setLevel(logging.INFO) + formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') + consoleHeader = logging.StreamHandler() + consoleHeader.setFormatter(formatter) + logger.addHandler(consoleHeader) + + start_time = time.time() + + + assembly_graph_file = args.graph + contig_bins_file = args.binned + output_path = args.output + prefix = args.prefix + max_iteration = args.max_iteration + diff_threshold = args.diff_threshold + + + # Setup output path for log file + #--------------------------------------------------- + + fileHandler = logging.FileHandler(output_path+"/"+prefix+"graphbin.log") + fileHandler.setLevel(logging.INFO) + fileHandler.setFormatter(formatter) + logger.addHandler(fileHandler) + + logger.info("Welcome to GraphBin: Refined Binning of Metagenomic Contigs using Assembly Graphs.") + logger.info("This version of GraphBin makes use of the assembly graph produced by MEGAHIT which is based on the de Bruijn graph approach.") + + logger.info("Assembly graph file: "+assembly_graph_file) + logger.info("Existing binning output file: "+contig_bins_file) + logger.info("Final binning output file: "+output_path) + logger.info("Maximum number of iterations: "+str(max_iteration)) + logger.info("Difference threshold: "+str(diff_threshold)) + + logger.info("GraphBin started") + + + # Get the number of bins from the initial binning result + #-------------------------------------------------------- + + n_bins = 0 + + try: + all_bins_list = [] + + with open(contig_bins_file) as csvfile: + readCSV = csv.reader(csvfile, delimiter=',') + for row in readCSV: + all_bins_list.append(row[1]) + + bins_list = list(set(all_bins_list)) + bins_list.sort() + + n_bins = len(bins_list) + logger.info("Number of bins available in the initial binning result: "+str(n_bins)) + except: + logger.error("Please make sure that the correct path to the initial binning result file is provided and it is having the correct format.") + logger.info("Exiting GraphBin... Bye...!") + sys.exit(1) + + + logger.info("Constructing the assembly graph") + + + ## Construct the assembly graph + #------------------------------- + + node_count = 0 + + links = [] + + my_map = BidirectionalMap() + + try: + + # Get links from .gfa file + with open(assembly_graph_file) as file: + + line = file.readline() + + while line != "": + + # Identify lines with link information + if line.startswith("L"): + link = [] + + strings = line.split("\t") + + start_1 = 'NODE_' + end_1 = '_length' + + link1 = int(re.search('%s(.*)%s' % (start_1, end_1), strings[1]).group(1)) + + start_2 = 'NODE_' + end_2 = '_length' + + link2 = int(re.search('%s(.*)%s' % (start_2, end_2), strings[3]).group(1)) + + link.append(link1) + link.append(link2) + links.append(link) + + elif line.startswith("S"): + strings = line.split() + + start = 'NODE_' + end = '_length' + + contig_num = int(re.search('%s(.*)%s' % (start, end), strings[1]).group(1)) + + my_map[node_count] = int(contig_num) + + node_count += 1 + + line = file.readline() + + + logger.info("Total number of contigs available: "+str(node_count)) + + contigs_map = my_map + contigs_map_rev = my_map.inverse + + # Create graph + assembly_graph = Graph() + + # Add vertices + assembly_graph.add_vertices(node_count) + + # Create list of edges + edge_list = [] + + for i in range(node_count): + assembly_graph.vs[i]["id"]= i + assembly_graph.vs[i]["label"]= str(contigs_map[i]) + + # Iterate links + for link in links: + # Remove self loops + if link[0] != link[1]: + # Add edge to list of edges + edge_list.append((contigs_map_rev[link[0]], contigs_map_rev[link[1]])) + + # Add edges to the graph + assembly_graph.add_edges(edge_list) + assembly_graph.simplify(multiple=True, loops=False, combine_edges=None) + + except: + logger.error("Please make sure that the correct path to the assembly graph file is provided.") + logger.info("Exiting GraphBin... Bye...!") + sys.exit(1) + + logger.info("Total number of edges in the assembly graph: "+str(len(edge_list))) + + + # Get initial binning result + #---------------------------- + + logger.info("Obtaining the initial binning result") + + bins = [[] for x in range(n_bins)] + + try: + with open(contig_bins_file) as contig_bins: + readCSV = csv.reader(contig_bins, delimiter=',') + for row in readCSV: + start = 'NODE_' + end = '' + contig_num = contigs_map_rev[int(re.search('%s(.*)%s' % (start, end), row[0]).group(1))] + + bin_num = int(row[1])-1 + bins[bin_num].append(contig_num) + + for i in range(n_bins): + bins[i].sort() + + except: + logger.error("Please make sure that you have provided the correct assembler type and the correct path to the binning result file in the correct format.") + logger.info("Exiting GraphBin... Bye...!") + sys.exit(1) + + + # Remove labels of ambiguous vertices + #------------------------------------- + + logger.info("Determining ambiguous vertices") + + remove_labels = [] + + neighbours_have_same_label_list = [] + + for b in range(n_bins): + + for i in bins[b]: + + my_bin = b + + dist = {} + + # Get set of closest labelled vertices with distance = 1 + closest_neighbours = assembly_graph.neighbors(i, mode=ALL) + + # Determine whether all the closest labelled vertices have the same label as its own + neighbours_have_same_label = True + + neighbours_binned = False + + for neighbour in closest_neighbours: + for k in range(n_bins): + if neighbour in bins[k]: + neighbours_binned = True + if k != my_bin: + neighbours_have_same_label = False + break + + if not neighbours_have_same_label: + remove_labels.append(i) + elif neighbours_binned: + neighbours_have_same_label_list.append(i) + + for i in remove_labels: + for n in range(n_bins): + if i in bins[n]: + bins[n].remove(i) + + # Further remove labels of ambiguous vertices + binned_contigs = [] + + for n in range(n_bins): + binned_contigs = sorted(binned_contigs+bins[n]) + + for b in range(n_bins): + + for i in bins[b]: + + if i not in neighbours_have_same_label_list: + + my_bin = b + + # Get set of closest labelled vertices + closest_neighbours = getClosestLabelledVertices(assembly_graph, i, binned_contigs) + + if len(closest_neighbours) > 0: + + # Determine whether all the closest labelled vertices have the same label as its own + neighbours_have_same_label = True + + for neighbour in closest_neighbours: + for k in range(n_bins): + if neighbour in bins[k]: + if k != my_bin: + neighbours_have_same_label = False + break + + if not neighbours_have_same_label and i not in remove_labels: + remove_labels.append(i) + + remove_labels.sort() + + logger.info("Removing labels of ambiguous vertices") + + # Remove labels of ambiguous vertices + for i in remove_labels: + for n in range(n_bins): + if i in bins[n]: + bins[n].remove(i) + + logger.info("Obtaining refined binning result") + + + # Get vertices which are not isolated and not in components without any labels + #----------------------------------------------------------------------------- + + logger.info("Deteremining vertices which are not isolated and not in components without any labels") + + non_isolated = [] + + for i in range(node_count): + + if i not in non_isolated and i in binned_contigs: + + component = [] + component.append(i) + length = len(component) + neighbours = assembly_graph.neighbors(i, mode=ALL) + + for neighbor in neighbours: + if neighbor not in component: + component.append(neighbor) + + component = list(set(component)) + + while length!= len(component): + + length = len(component) + + for j in component: + + neighbours = assembly_graph.neighbors(j, mode=ALL) + + for neighbor in neighbours: + if neighbor not in component: + component.append(neighbor) + + labelled = False + for j in component: + if j in binned_contigs: + labelled = True + break + + if labelled: + for j in component: + if j not in non_isolated: + non_isolated.append(j) + + logger.info("Number of non-isolated contigs: "+str(len(non_isolated))) + + + # Run label propagation + #----------------------- + + data = [] + + for contig in range(node_count): + + # Consider vertices that are not isolated + + if contig in non_isolated: + line = [] + line.append(contig) + + assigned = False + + for i in range(n_bins): + if contig in bins[i]: + line.append(i+1) + assigned = True + + if not assigned: + line.append(0) + + neighbours = assembly_graph.neighbors(contig, mode=ALL) + + neighs = [] + + for neighbour in neighbours: + n = [] + n.append(neighbour) + n.append(1.0) + neighs.append(n) + + line.append(neighs) + + data.append(line) + + lp = LabelProp() + + lp.load_data_from_mem(data) + + logger.info("Starting label propagation with eps="+str(diff_threshold)+" and max_iteration="+str(max_iteration)) + + ans = lp.run(diff_threshold, max_iteration, show_log=True, clean_result=False) + ans.sort() + + logger.info("Obtaining Label Propagation result") + + for l in ans: + for i in range(n_bins): + if l[1]==i+1 and l[0] not in bins[i]: + bins[i].append(l[0]) + + + # Remove labels of ambiguous vertices + #------------------------------------- + + logger.info("Determining ambiguous vertices") + + remove_labels = [] + + for b in range(n_bins): + + for i in bins[b]: + + my_bin = b + + closest_neighbours = assembly_graph.neighbors(i, mode=ALL) + + # Determine whether all the closest labelled vertices have the same label as its own + neighbours_have_same_label = True + + for neighbour in closest_neighbours: + for k in range(n_bins): + if neighbour in bins[k]: + if k != my_bin: + neighbours_have_same_label = False + break + + if not neighbours_have_same_label: + remove_labels.append(i) + + remove_labels.sort() + + logger.info("Removing labels of ambiguous vertices") + + # Remove labels of ambiguous vertices + for i in remove_labels: + for n in range(n_bins): + if i in bins[n]: + bins[n].remove(i) + + elapsed_time = time.time() - start_time + + logger.info("Obtaining the Final Refined Binning result") + + for i in range(n_bins): + bins[i].sort() + + # Print elapsed time for the process + logger.info("Elapsed time: "+str(elapsed_time)+" seconds") + + + # Write result to output file + #----------------------------- + + logger.info("Writing the Final Binning result to file") + + output_bins = [] + + for i in range(node_count): + for k in range(n_bins): + if i in bins[k]: + line = [] + line.append("NODE_"+str(contigs_map[i])) + line.append(k+1) + output_bins.append(line) + + output_file = output_path + prefix + 'graphbin_output.csv' + + with open(output_file, mode='w') as out_file: + output_writer = csv.writer(out_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) + + for row in output_bins: + output_writer.writerow(row) + + logger.info("Final binning results can be found at "+output_file) + + + unbinned_contigs = [] + + for i in range(node_count): + if i in remove_labels or i not in non_isolated: + line = [] + line.append("NODE_"+str(contigs_map[i])) + unbinned_contigs.append(line) + + if len(unbinned_contigs)!=0: + unbinned_file = output_path + prefix + 'graphbin_unbinned.csv' + + with open(unbinned_file, mode='w') as out_file: + output_writer = csv.writer(out_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) + + for row in unbinned_contigs: + output_writer.writerow(row) + + logger.info("Unbinned contigs can be found at "+unbinned_file) + + + # Exit program + #-------------- + + logger.info("Thank you for using GraphBin! Bye...!") + + logger.removeHandler(fileHandler) + logger.removeHandler(consoleHeader) + + +def main(): + # Setup argument parser + #----------------------- + ap = PARSER + args = ap.parse_args() + run(args) + + +if __name__ == '__main__': + main() diff --git a/graphbin/graphbin_Miniasm.py b/graphbin/graphbin_Miniasm.py new file mode 100755 index 0000000..412f321 --- /dev/null +++ b/graphbin/graphbin_Miniasm.py @@ -0,0 +1,525 @@ +#!/usr/bin/env python3 + +"""graphbin_Miniasm.py: Refined binning of metagenomic contigs using Miniasm assembly graphs. + +GraphBin is a metagenomic contig binning tool that makes use of the contig +connectivity information from the assembly graph to bin contigs. It utilizes +the binning result of an existing binning tool and a label propagation algorithm +to correct mis-binned contigs and predict the labels of contigs which are +discarded due to short length. + +graphbin_Miniasm.py makes use of the assembly graphs produced by Miniasm long read assembler. +""" + +import sys +import os +import subprocess +import csv +import operator +import time +import argparse +import re +import logging + +from igraph import * +from graphbin.labelpropagation.labelprop import LabelProp +from graphbin.bidirectionalmap.bidirectionalmap import BidirectionalMap +from graphbin.graphbin_Func import getClosestLabelledVertices +from graphbin.graphbin_Options import PARSER + + +# Sample command +# ------------------------------------------------------------------- +# python graphbin_Miniasm.py --graph /path/to/graph_file.asqg +# --binned /path/to/binning_result.csv +# --output /path/to/output_folder +# ------------------------------------------------------------------- + + +def run(args): + # Setup logger + #----------------------- + + logger = logging.getLogger('GraphBin 1.1') + logger.setLevel(logging.INFO) + formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') + consoleHeader = logging.StreamHandler() + consoleHeader.setFormatter(formatter) + logger.addHandler(consoleHeader) + + start_time = time.time() + + assembly_graph_file = args.graph + contig_bins_file = args.binned + output_path = args.output + prefix = args.prefix + max_iteration = args.max_iteration + diff_threshold = args.diff_threshold + + + # Setup output path for log file + #--------------------------------------------------- + + fileHandler = logging.FileHandler(output_path+"/"+prefix+"graphbin.log") + fileHandler.setLevel(logging.INFO) + fileHandler.setFormatter(formatter) + logger.addHandler(fileHandler) + + logger.info("Welcome to GraphBin: Refined Binning of Metagenomic Contigs using Assembly Graphs.") + logger.info("This version of GraphBin makes use of the assembly graph produced by Miniasm.") + + logger.info("Assembly graph file: "+assembly_graph_file) + logger.info("Existing binning output file: "+contig_bins_file) + logger.info("Final binning output file: "+output_path) + logger.info("Maximum number of iterations: "+str(max_iteration)) + logger.info("Difference threshold: "+str(diff_threshold)) + + logger.info("GraphBin started") + + + # Get the number of bins from the initial binning result + #-------------------------------------------------------- + + n_bins = 0 + + try: + all_bins_list = [] + + with open(contig_bins_file) as csvfile: + readCSV = csv.reader(csvfile, delimiter=',') + for row in readCSV: + all_bins_list.append(row[1]) + + bins_list = list(set(all_bins_list)) + bins_list.sort() + + n_bins = len(bins_list) + logger.info("Number of bins available in the binning result: "+str(n_bins)) + except: + logger.error("Please make sure that the correct path to the binning result file is provided and it is having the correct format.") + logger.info("Exiting GraphBin... Bye...!") + sys.exit(1) + + + logger.info("Constructing the assembly graph") + + # Get the links from the .gfa file + #----------------------------------- + + my_map = BidirectionalMap() + + node_count = 0 + + nodes = [] + + links = [] + + try: + # Get contig connections from .gfa file + with open(assembly_graph_file) as file: + line = file.readline() + + while line != "": + + # Count the number of contigs + if "S" in line: + strings = line.split("\t") + my_node = strings[1] + my_map[node_count] = my_node + nodes.append(my_node) + node_count += 1 + + # Identify lines with link information + elif "L" in line: + + link = [] + strings = line.split("\t") + + if strings[1] != strings[3]: + start = strings[1] + end = strings[3] + link.append(start) + link.append(end) + links.append(link) + + line = file.readline() + + except: + logger.error("Please make sure that the correct path to the assembly graph file is provided.") + logger.info("Exiting GraphBin... Bye...!") + sys.exit(1) + + contigs_map = my_map + contigs_map_rev = my_map.inverse + + logger.info("Total number of contigs available: "+str(node_count)) + + + ## Construct the assembly graph + #------------------------------- + + try: + + # Create the graph + assembly_graph = Graph() + + # Create list of edges + edge_list = [] + + # Add vertices + assembly_graph.add_vertices(node_count) + + # Name vertices + for i in range(len(assembly_graph.vs)): + assembly_graph.vs[i]["id"]= i + assembly_graph.vs[i]["label"]= str(contigs_map[i]) + + # Iterate links + for link in links: + # Remove self loops + if link[0] != link[1]: + # Add edge to list of edges + edge_list.append((contigs_map_rev[link[0]], contigs_map_rev[link[1]])) + + # Add edges to the graph + assembly_graph.add_edges(edge_list) + assembly_graph.simplify(multiple=True, loops=False, combine_edges=None) + + except: + logger.error("Please make sure that the correct path to the assembly graph file is provided.") + logger.info("Exiting GraphBin... Bye...!") + sys.exit(1) + + logger.info("Total number of edges in the assembly graph: "+str(len(edge_list))) + + + # Get initial binning result + #---------------------------- + + logger.info("Obtaining the initial binning result") + + bins = [[] for x in range(n_bins)] + + try: + with open(contig_bins_file) as contig_bins: + readCSV = csv.reader(contig_bins, delimiter=',') + for row in readCSV: + contig_num = contigs_map_rev[row[0]] + + bin_num = int(row[1])-1 + bins[bin_num].append(contig_num) + + for i in range(n_bins): + bins[i].sort() + + except: + logger.error("Please make sure that you have provided the correct assembler type and the correct path to the binning result file in the correct format.") + logger.info("Exiting GraphBin... Bye...!") + sys.exit(1) + + + # Remove labels of ambiguous vertices + #------------------------------------- + + + logger.info("Determining ambiguous vertices") + + remove_labels = [] + + neighbours_have_same_label_list = [] + + for b in range(n_bins): + + for i in bins[b]: + + my_bin = b + + dist = {} + + # Get set of closest labelled vertices with distance = 1 + closest_neighbours = assembly_graph.neighbors(i, mode=ALL) + + # Determine whether all the closest labelled vertices have the same label as its own + neighbours_have_same_label = True + + neighbours_binned = False + + for neighbour in closest_neighbours: + for k in range(n_bins): + if neighbour in bins[k]: + neighbours_binned = True + if k != my_bin: + neighbours_have_same_label = False + break + + if not neighbours_have_same_label: + remove_labels.append(i) + elif neighbours_binned: + neighbours_have_same_label_list.append(i) + + for i in remove_labels: + for n in range(n_bins): + if i in bins[n]: + bins[n].remove(i) + + # Further remove labels of ambiguous vertices + binned_contigs = [] + + for n in range(n_bins): + binned_contigs = sorted(binned_contigs+bins[n]) + + for b in range(n_bins): + + for i in bins[b]: + + if i not in neighbours_have_same_label_list: + + my_bin = b + + # Get set of closest labelled vertices + closest_neighbours = getClosestLabelledVertices(assembly_graph, i, binned_contigs) + + if len(closest_neighbours) > 0: + + # Determine whether all the closest labelled vertices have the same label as its own + neighbours_have_same_label = True + + for neighbour in closest_neighbours: + for k in range(n_bins): + if neighbour in bins[k]: + if k != my_bin: + neighbours_have_same_label = False + break + + if not neighbours_have_same_label and i not in remove_labels: + remove_labels.append(i) + + remove_labels.sort() + + logger.info("Removing labels of ambiguous vertices") + + # Remove labels of ambiguous vertices + for i in remove_labels: + for n in range(n_bins): + if i in bins[n]: + bins[n].remove(i) + + logger.info("Obtaining refined binning result") + + + # Get vertices which are not isolated and not in components without any labels + #----------------------------------------------------------------------------- + + logger.info("Deteremining vertices which are not isolated and not in components without any labels") + + non_isolated = [] + + for i in range(node_count): + + if i not in non_isolated and i in binned_contigs: + + component = [] + component.append(i) + length = len(component) + neighbours = assembly_graph.neighbors(i, mode=ALL) + + for neighbor in neighbours: + if neighbor not in component: + component.append(neighbor) + + component = list(set(component)) + + while length!= len(component): + + length = len(component) + + for j in component: + + neighbours = assembly_graph.neighbors(j, mode=ALL) + + for neighbor in neighbours: + if neighbor not in component: + component.append(neighbor) + + labelled = False + for j in component: + if j in binned_contigs: + labelled = True + break + + if labelled: + for j in component: + if j not in non_isolated: + non_isolated.append(j) + + logger.info("Number of non-isolated contigs: "+str(len(non_isolated))) + + + # Run label propagation + #----------------------- + + data = [] + + for contig in range(node_count): + + # Consider vertices that are not isolated + + if contig in non_isolated: + line = [] + line.append(contig) + + assigned = False + + for i in range(n_bins): + if contig in bins[i]: + line.append(i+1) + assigned = True + + if not assigned: + line.append(0) + + neighbours = assembly_graph.neighbors(contig, mode=ALL) + + neighs = [] + + for neighbour in neighbours: + n = [] + n.append(neighbour) + n.append(1.0) + neighs.append(n) + + line.append(neighs) + + data.append(line) + + lp = LabelProp() + + lp.load_data_from_mem(data) + + logger.info("Starting label propagation with eps="+str(diff_threshold)+" and max_iteration="+str(max_iteration)) + + ans = lp.run(diff_threshold, max_iteration, show_log=True, clean_result=False) + ans.sort() + + logger.info("Obtaining Label Propagation result") + + for l in ans: + for i in range(n_bins): + if l[1]==i+1 and l[0] not in bins[i]: + bins[i].append(l[0]) + + + # Remove labels of ambiguous vertices + #------------------------------------- + + logger.info("Determining ambiguous vertices") + + remove_labels = [] + + for b in range(n_bins): + + for i in bins[b]: + + my_bin = b + + closest_neighbours = assembly_graph.neighbors(i, mode=ALL) + + # Determine whether all the closest labelled vertices have the same label as its own + neighbours_have_same_label = True + + for neighbour in closest_neighbours: + for k in range(n_bins): + if neighbour in bins[k]: + if k != my_bin: + neighbours_have_same_label = False + break + + if not neighbours_have_same_label: + remove_labels.append(i) + + remove_labels.sort() + + logger.info("Removing labels of ambiguous vertices") + + # Remove labels of ambiguous vertices + for i in remove_labels: + for n in range(n_bins): + if i in bins[n]: + bins[n].remove(i) + + elapsed_time = time.time() - start_time + + logger.info("Obtaining the Final Refined Binning result") + + for i in range(n_bins): + bins[i].sort() + + # Print elapsed time for the process + logger.info("Elapsed time: "+str(elapsed_time)+" seconds") + + + # Write result to output file + #----------------------------- + + logger.info("Writing the Final Binning result to file") + + output_bins = [] + + for i in range(node_count): + for k in range(n_bins): + if i in bins[k]: + line = [] + line.append(str(contigs_map[i])) + line.append(k+1) + output_bins.append(line) + + output_file = output_path + prefix + 'graphbin_output.csv' + + with open(output_file, mode='w') as out_file: + output_writer = csv.writer(out_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) + + for row in output_bins: + output_writer.writerow(row) + + logger.info("Final binning results can be found at "+output_file) + + + unbinned_contigs = [] + + for i in range(node_count): + if i in remove_labels or i not in non_isolated: + line = [] + line.append(str(contigs_map[i])) + unbinned_contigs.append(line) + + if len(unbinned_contigs)!=0: + unbinned_file = output_path + prefix + 'graphbin_unbinned.csv' + + with open(unbinned_file, mode='w') as out_file: + output_writer = csv.writer(out_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) + + for row in unbinned_contigs: + output_writer.writerow(row) + + logger.info("Unbinned contigs can be found at "+unbinned_file) + + + # Exit program + #-------------- + + logger.info("Thank you for using GraphBin! Bye...!") + + logger.removeHandler(fileHandler) + logger.removeHandler(consoleHeader) + + +def main(): + # Setup argument parser + #----------------------- + ap = PARSER + args = ap.parse_args() + run(args) + + +if __name__ == "__main__": + main() diff --git a/graphbin/graphbin_Options.py b/graphbin/graphbin_Options.py new file mode 100644 index 0000000..0295fd8 --- /dev/null +++ b/graphbin/graphbin_Options.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python3 + +import argparse + +PARSER = argparse.ArgumentParser( + description="""GraphBin Help. GraphBin is a metagenomic contig binning tool + that makes use of the contig connectivity information from the assembly graph to bin contigs. It utilizes the + binning result of an existing binning tool and a label propagation algorithm to correct mis-binned contigs + and predict the labels of contigs which are discarded due to short length.""", + prog="graphbin", +) + +PARSER.add_argument("--version", default=False, action="store_true") + +PARSER.add_argument("--graph", type=str, help="path to the assembly graph file") + +PARSER.add_argument( + "--binned", + type=str, + help="path to the .csv file with the initial binning output from an existing tool", +) + +PARSER.add_argument("--output", type=str, help="path to the output folder") + +PARSER.add_argument("--prefix", type=str, default="", help="prefix for the output file") + +PARSER.add_argument( + "--max_iteration", + type=int, + default=100, + help="maximum number of iterations for label propagation algorithm. [default: 100]", +) + +PARSER.add_argument( + "--diff_threshold", + type=float, + default=0.1, + help="difference threshold for label propagation algorithm. [default: 0.1]", +) diff --git a/graphbin/graphbin_SGA.py b/graphbin/graphbin_SGA.py new file mode 100755 index 0000000..f60bfac --- /dev/null +++ b/graphbin/graphbin_SGA.py @@ -0,0 +1,516 @@ +#!/usr/bin/env python3 + +"""graphbin_SGA.py: Refined binning of metagenomic contigs using SGA assembly graphs. + +GraphBin is a metagenomic contig binning tool that makes use of the contig +connectivity information from the assembly graph to bin contigs. It utilizes +the binning result of an existing binning tool and a label propagation algorithm +to correct mis-binned contigs and predict the labels of contigs which are +discarded due to short length. + +graphbin_SGA.py makes use of the assembly graphs produced by SGA (String Graph Assembler). +""" + +import sys +import os +import subprocess +import csv +import operator +import time +import argparse +import re +import logging + +from igraph import * +from graphbin.labelpropagation.labelprop import LabelProp +from graphbin.bidirectionalmap.bidirectionalmap import BidirectionalMap +from graphbin.graphbin_Func import getClosestLabelledVertices +from graphbin.graphbin_Options import PARSER + + +# Sample command +# ------------------------------------------------------------------- +# python graphbin_SGA.py --graph /path/to/graph_file.asqg +# --binned /path/to/binning_result.csv +# --output /path/to/output_folder +# ------------------------------------------------------------------- + + +def run(args): + # Setup logger + #----------------------- + + logger = logging.getLogger('GraphBin 1.1') + logger.setLevel(logging.INFO) + formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') + consoleHeader = logging.StreamHandler() + consoleHeader.setFormatter(formatter) + logger.addHandler(consoleHeader) + + start_time = time.time() + + assembly_graph_file = args.graph + contig_bins_file = args.binned + output_path = args.output + prefix = args.prefix + max_iteration = args.max_iteration + diff_threshold = args.diff_threshold + + # Setup output path for log file + #--------------------------------------------------- + + fileHandler = logging.FileHandler(output_path+"/"+prefix+"graphbin.log") + fileHandler.setLevel(logging.INFO) + fileHandler.setFormatter(formatter) + logger.addHandler(fileHandler) + + logger.info("Welcome to GraphBin: Refined Binning of Metagenomic Contigs using Assembly Graphs.") + logger.info("This version of GraphBin makes use of the assembly graph produced by SGA which is based on the OLC (more recent string graph) approach.") + + logger.info("Assembly graph file: "+assembly_graph_file) + logger.info("Existing binning output file: "+contig_bins_file) + logger.info("Final binning output file: "+output_path) + logger.info("Maximum number of iterations: "+str(max_iteration)) + logger.info("Difference threshold: "+str(diff_threshold)) + + logger.info("GraphBin started") + + + # Get the number of bins from the initial binning result + #-------------------------------------------------------- + + n_bins = 0 + + try: + all_bins_list = [] + + with open(contig_bins_file) as csvfile: + readCSV = csv.reader(csvfile, delimiter=',') + for row in readCSV: + all_bins_list.append(row[1]) + + bins_list = list(set(all_bins_list)) + bins_list.sort() + + n_bins = len(bins_list) + logger.info("Number of bins available in the binning result: "+str(n_bins)) + except: + logger.error("Please make sure that the correct path to the binning result file is provided and it is having the correct format.") + logger.info("Exiting GraphBin... Bye...!") + sys.exit(1) + + + logger.info("Constructing the assembly graph") + + # Get the links from the .asqg file + #----------------------------------- + + links = [] + + my_map = BidirectionalMap() + + node_count = 0 + + try: + # Get contig connections from .asqg file + with open(assembly_graph_file) as file: + line = file.readline() + + while line != "": + + # Count the number of contigs + if "VT" in line: + start = 'contig-' + end = '' + contig_num = int(re.search('%s(.*)%s' % (start, end), str(line.split()[1])).group(1)) + my_map[node_count] = contig_num + node_count += 1 + + # Identify lines with link information + elif "ED" in line: + link = [] + strings = line.split("\t")[1].split() + link.append(int(strings[0][7:])) + link.append(int(strings[1][7:])) + links.append(link) + line = file.readline() + + except: + logger.error("Please make sure that the correct path to the assembly graph file is provided.") + logger.info("Exiting GraphBin... Bye...!") + sys.exit(1) + + contigs_map = my_map + contigs_map_rev = my_map.inverse + + logger.info("Total number of contigs available: "+str(node_count)) + + + ## Construct the assembly graph + #------------------------------- + + try: + + # Create the graph + assembly_graph = Graph() + + # Create list of edges + edge_list = [] + + # Add vertices + assembly_graph.add_vertices(node_count) + + # Name vertices + for i in range(len(assembly_graph.vs)): + assembly_graph.vs[i]["id"]= i + assembly_graph.vs[i]["label"]= str(contigs_map[i]) + + # Iterate links + for link in links: + # Remove self loops + if link[0] != link[1]: + # Add edge to list of edges + edge_list.append((contigs_map_rev[link[0]], contigs_map_rev[link[1]])) + + # Add edges to the graph + assembly_graph.add_edges(edge_list) + assembly_graph.simplify(multiple=True, loops=False, combine_edges=None) + + except: + logger.error("Please make sure that the correct path to the assembly graph file is provided.") + logger.info("Exiting GraphBin... Bye...!") + sys.exit(1) + + logger.info("Total number of edges in the assembly graph: "+str(len(edge_list))) + + + # Get initial binning result + #---------------------------- + + logger.info("Obtaining the initial binning result") + + bins = [[] for x in range(n_bins)] + + try: + with open(contig_bins_file) as contig_bins: + readCSV = csv.reader(contig_bins, delimiter=',') + for row in readCSV: + start = 'contig-' + end = '' + contig_num = contigs_map_rev[int(re.search('%s(.*)%s' % (start, end), row[0]).group(1))] + + bin_num = int(row[1])-1 + bins[bin_num].append(contig_num) + + for i in range(n_bins): + bins[i].sort() + + except: + logger.error("Please make sure that you have provided the correct assembler type and the correct path to the binning result file in the correct format.") + logger.info("Exiting GraphBin... Bye...!") + sys.exit(1) + + # Remove labels of ambiguous vertices + #------------------------------------- + + logger.info("Determining ambiguous vertices") + + remove_labels = [] + + neighbours_have_same_label_list = [] + + for b in range(n_bins): + + for i in bins[b]: + + my_bin = b + + dist = {} + + # Get set of closest labelled vertices with distance = 1 + closest_neighbours = assembly_graph.neighbors(i, mode=ALL) + + # Determine whether all the closest labelled vertices have the same label as its own + neighbours_have_same_label = True + + neighbours_binned = False + + for neighbour in closest_neighbours: + for k in range(n_bins): + if neighbour in bins[k]: + neighbours_binned = True + if k != my_bin: + neighbours_have_same_label = False + break + + if not neighbours_have_same_label: + remove_labels.append(i) + elif neighbours_binned: + neighbours_have_same_label_list.append(i) + + for i in remove_labels: + for n in range(n_bins): + if i in bins[n]: + bins[n].remove(i) + + # Further remove labels of ambiguous vertices + binned_contigs = [] + + for n in range(n_bins): + binned_contigs = sorted(binned_contigs+bins[n]) + + for b in range(n_bins): + + for i in bins[b]: + + if i not in neighbours_have_same_label_list: + + my_bin = b + + # Get set of closest labelled vertices + closest_neighbours = getClosestLabelledVertices(assembly_graph, i, binned_contigs) + + if len(closest_neighbours) > 0: + + # Determine whether all the closest labelled vertices have the same label as its own + neighbours_have_same_label = True + + for neighbour in closest_neighbours: + for k in range(n_bins): + if neighbour in bins[k]: + if k != my_bin: + neighbours_have_same_label = False + break + + if not neighbours_have_same_label and i not in remove_labels: + remove_labels.append(i) + + remove_labels.sort() + + logger.info("Removing labels of ambiguous vertices") + + # Remove labels of ambiguous vertices + for i in remove_labels: + for n in range(n_bins): + if i in bins[n]: + bins[n].remove(i) + + logger.info("Obtaining refined binning result") + + + # Get vertices which are not isolated and not in components without any labels + #----------------------------------------------------------------------------- + + logger.info("Deteremining vertices which are not isolated and not in components without any labels") + + non_isolated = [] + + for i in range(node_count): + + if i not in non_isolated and i in binned_contigs: + + component = [] + component.append(i) + length = len(component) + neighbours = assembly_graph.neighbors(i, mode=ALL) + + for neighbor in neighbours: + if neighbor not in component: + component.append(neighbor) + + component = list(set(component)) + + while length!= len(component): + + length = len(component) + + for j in component: + + neighbours = assembly_graph.neighbors(j, mode=ALL) + + for neighbor in neighbours: + if neighbor not in component: + component.append(neighbor) + + labelled = False + for j in component: + if j in binned_contigs: + labelled = True + break + + if labelled: + for j in component: + if j not in non_isolated: + non_isolated.append(j) + + logger.info("Number of non-isolated contigs: "+str(len(non_isolated))) + + + # Run label propagation + #----------------------- + + data = [] + + for contig in range(node_count): + + # Consider vertices that are not isolated + + if contig in non_isolated: + line = [] + line.append(contig) + + assigned = False + + for i in range(n_bins): + if contig in bins[i]: + line.append(i+1) + assigned = True + + if not assigned: + line.append(0) + + neighbours = assembly_graph.neighbors(contig, mode=ALL) + + neighs = [] + + for neighbour in neighbours: + n = [] + n.append(neighbour) + n.append(1.0) + neighs.append(n) + + line.append(neighs) + + data.append(line) + + lp = LabelProp() + + lp.load_data_from_mem(data) + + logger.info("Starting label propagation with eps="+str(diff_threshold)+" and max_iteration="+str(max_iteration)) + + ans = lp.run(diff_threshold, max_iteration, show_log=True, clean_result=False) + ans.sort() + + logger.info("Obtaining Label Propagation result") + + for l in ans: + for i in range(n_bins): + if l[1]==i+1 and l[0] not in bins[i]: + bins[i].append(l[0]) + + + # Remove labels of ambiguous vertices + #------------------------------------- + + logger.info("Determining ambiguous vertices") + + remove_labels = [] + + for b in range(n_bins): + + for i in bins[b]: + + my_bin = b + + closest_neighbours = assembly_graph.neighbors(i, mode=ALL) + + # Determine whether all the closest labelled vertices have the same label as its own + neighbours_have_same_label = True + + for neighbour in closest_neighbours: + for k in range(n_bins): + if neighbour in bins[k]: + if k != my_bin: + neighbours_have_same_label = False + break + + if not neighbours_have_same_label: + remove_labels.append(i) + + remove_labels.sort() + + logger.info("Removing labels of ambiguous vertices") + + # Remove labels of ambiguous vertices + for i in remove_labels: + for n in range(n_bins): + if i in bins[n]: + bins[n].remove(i) + + elapsed_time = time.time() - start_time + + logger.info("Obtaining the Final Refined Binning result") + + for i in range(n_bins): + bins[i].sort() + + # Print elapsed time for the process + logger.info("Elapsed time: "+str(elapsed_time)+" seconds") + + + # Write result to output file + #----------------------------- + + logger.info("Writing the Final Binning result to file") + + output_bins = [] + + for i in range(node_count): + for k in range(n_bins): + if i in bins[k]: + line = [] + line.append("contig-"+str(contigs_map[i])) + line.append(k+1) + output_bins.append(line) + + output_file = output_path + prefix + 'graphbin_output.csv' + + with open(output_file, mode='w') as out_file: + output_writer = csv.writer(out_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) + + for row in output_bins: + output_writer.writerow(row) + + logger.info("Final binning results can be found at "+output_file) + + + unbinned_contigs = [] + + for i in range(node_count): + if i in remove_labels or i not in non_isolated: + line = [] + line.append("contig-"+str(contigs_map[i])) + unbinned_contigs.append(line) + + if len(unbinned_contigs)!=0: + unbinned_file = output_path + prefix + 'graphbin_unbinned.csv' + + with open(unbinned_file, mode='w') as out_file: + output_writer = csv.writer(out_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) + + for row in unbinned_contigs: + output_writer.writerow(row) + + logger.info("Unbinned contigs can be found at "+unbinned_file) + + + # Exit program + #-------------- + + logger.info("Thank you for using GraphBin! Bye...!") + + logger.removeHandler(fileHandler) + logger.removeHandler(consoleHeader) + + +def main(): + # Setup argument parser + #----------------------- + ap = PARSER + args = ap.parser_args() + run(args) + + +if __name__ == "__main__": + main() diff --git a/graphbin/graphbin_SPAdes.py b/graphbin/graphbin_SPAdes.py new file mode 100755 index 0000000..0c1d7e3 --- /dev/null +++ b/graphbin/graphbin_SPAdes.py @@ -0,0 +1,583 @@ +#!/usr/bin/env python3 + +"""graphbin_SPAdes.py: Refined binning of metagenomic contigs using SPAdes assembly graphs. + +GraphBin is a metagenomic contig binning tool that makes use of the contig +connectivity information from the assembly graph to bin contigs. It utilizes +the binning result of an existing binning tool and a label propagation algorithm +to correct mis-binned contigs and predict the labels of contigs which are +discarded due to short length. + +graphbin_SPAdes.py makes use of the assembly graphs produced by SPAdes. +""" + +import sys +import os +import subprocess +import csv +import operator +import time +import argparse +import re +import logging + +from igraph import * +from collections import defaultdict + +from graphbin.labelpropagation.labelprop import LabelProp +from graphbin.bidirectionalmap.bidirectionalmap import BidirectionalMap +from graphbin.graphbin_Func import getClosestLabelledVertices +from graphbin.graphbin_Options import PARSER + +# Sample command +# ------------------------------------------------------------------- +# python graphbin_SPAdes.py --graph /path/to/graph_file.gfa +# --paths /path/to/paths_file.paths +# --binned /path/to/binning_result.csv +# --output /path/to/output_folder +# ------------------------------------------------------------------- + + +def run(args): + # Setup logger + #----------------------- + logger = logging.getLogger('GraphBin 1.1') + logger.setLevel(logging.INFO) + formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') + consoleHeader = logging.StreamHandler() + consoleHeader.setFormatter(formatter) + logger.addHandler(consoleHeader) + + start_time = time.time() + + assembly_graph_file = args.graph + contig_paths = args.paths + contig_bins_file = args.binned + output_path = args.output + prefix = args.prefix + max_iteration = args.max_iteration + diff_threshold = args.diff_threshold + + # Setup output path for log file + #--------------------------------------------------- + + fileHandler = logging.FileHandler(output_path+"/"+prefix+"graphbin.log") + fileHandler.setLevel(logging.INFO) + fileHandler.setFormatter(formatter) + logger.addHandler(fileHandler) + + logger.info("Welcome to GraphBin: Refined Binning of Metagenomic Contigs using Assembly Graphs.") + logger.info("This version of GraphBin makes use of the assembly graph produced by SPAdes which is based on the de Bruijn graph approach.") + + logger.info("Input arguments:") + logger.info("Assembly graph file: "+assembly_graph_file) + logger.info("Contig paths file: "+contig_paths) + logger.info("Existing binning output file: "+contig_bins_file) + logger.info("Final binning output file: "+output_path) + logger.info("Maximum number of iterations: "+str(max_iteration)) + logger.info("Difference threshold: "+str(diff_threshold)) + + logger.info("GraphBin started") + + + # Get the number of bins from the initial binning result + #--------------------------------------------------- + + n_bins = 0 + + try: + all_bins_list = [] + + with open(contig_bins_file) as csvfile: + readCSV = csv.reader(csvfile, delimiter=',') + for row in readCSV: + all_bins_list.append(row[1]) + + bins_list = list(set(all_bins_list)) + bins_list.sort() + + n_bins = len(bins_list) + logger.info("Number of bins available in the initial binning result: "+str(n_bins)) + except: + logger.error("Please make sure that the correct path to the initial binning result file is provided and it is having the correct format.") + logger.info("Exiting GraphBin... Bye...!") + sys.exit(1) + + + logger.info("Constructing the assembly graph") + + # Get contig paths from contigs.paths + #------------------------------------- + + paths = {} + segment_contigs = {} + node_count = 0 + + my_map = BidirectionalMap() + + current_contig_num = "" + + try: + with open(contig_paths) as file: + name = file.readline() + path = file.readline() + + while name != "" and path != "": + + while ";" in path: + path = path[:-2]+","+file.readline() + + start = 'NODE_' + end = '_length_' + contig_num = str(int(re.search('%s(.*)%s' % (start, end), name).group(1))) + + segments = path.rstrip().split(",") + + if current_contig_num != contig_num: + my_map[node_count] = int(contig_num) + current_contig_num = contig_num + node_count += 1 + + if contig_num not in paths: + paths[contig_num] = [segments[0], segments[-1]] + + for segment in segments: + if segment not in segment_contigs: + segment_contigs[segment] = set([contig_num]) + else: + segment_contigs[segment].add(contig_num) + + name = file.readline() + path = file.readline() + + except: + logger.error("Please make sure that the correct path to the contig paths file is provided.") + logger.info("Exiting GraphBin... Bye...!") + sys.exit(1) + + contigs_map = my_map + contigs_map_rev = my_map.inverse + + logger.info("Total number of contigs available: "+str(node_count)) + + links = [] + links_map = defaultdict(set) + + + ## Construct the assembly graph + #------------------------------- + + try: + # Get links from assembly_graph_with_scaffolds.gfa + with open(assembly_graph_file) as file: + line = file.readline() + + while line != "": + + # Identify lines with link information + if "L" in line: + strings = line.split("\t") + f1, f2 = strings[1]+strings[2], strings[3]+strings[4] + links_map[f1].add(f2) + links_map[f2].add(f1) + links.append(strings[1]+strings[2]+" "+strings[3]+strings[4]) + line = file.readline() + + + # Create graph + assembly_graph = Graph() + + # Add vertices + assembly_graph.add_vertices(node_count) + + # Create list of edges + edge_list = [] + + # Name vertices + for i in range(node_count): + assembly_graph.vs[i]["id"]= i + assembly_graph.vs[i]["label"]= str(contigs_map[i]) + + for i in range(len(paths)): + segments = paths[str(contigs_map[i])] + + start = segments[0] + start_rev = "" + + if start.endswith("+"): + start_rev = start[:-1]+"-" + else: + start_rev = start[:-1]+"+" + + end = segments[1] + end_rev = "" + + if end.endswith("+"): + end_rev = end[:-1]+"-" + else: + end_rev = end[:-1]+"+" + + new_links = [] + + if start in links_map: + new_links.extend(list(links_map[start])) + if start_rev in links_map: + new_links.extend(list(links_map[start_rev])) + if end in links_map: + new_links.extend(list(links_map[end])) + if end_rev in links_map: + new_links.extend(list(links_map[end_rev])) + + for new_link in new_links: + if new_link in segment_contigs: + for contig in segment_contigs[new_link]: + if i!=contigs_map_rev[int(contig)]: + # Add edge to list of edges + edge_list.append((i,contigs_map_rev[int(contig)])) + + # Add edges to the graph + assembly_graph.add_edges(edge_list) + assembly_graph.simplify(multiple=True, loops=False, combine_edges=None) + + except: + logger.error("Please make sure that the correct path to the assembly graph file is provided.") + logger.info("Exiting GraphBin... Bye...!") + sys.exit(1) + + logger.info("Total number of edges in the assembly graph: "+str(len(edge_list))) + + + # Get initial binning result + #---------------------------- + + logger.info("Obtaining the initial binning result") + + bins = [[] for x in range(n_bins)] + + try: + with open(contig_bins_file) as contig_bins: + readCSV = csv.reader(contig_bins, delimiter=',') + for row in readCSV: + start = 'NODE_' + end = '' + contig_num = contigs_map_rev[int(re.search('%s(.*)%s' % (start, end), row[0]).group(1))] + + bin_num = int(row[1])-1 + bins[bin_num].append(contig_num) + + for i in range(n_bins): + bins[i].sort() + + except: + logger.error("Please make sure that you have provided the correct assembler type and the correct path to the binning result file in the correct format.") + logger.info("Exiting GraphBin... Bye...!") + sys.exit(1) + + + # Remove labels of ambiguous vertices + #------------------------------------- + + logger.info("Determining ambiguous vertices") + + remove_labels = [] + + neighbours_have_same_label_list = [] + + for b in range(n_bins): + + for i in bins[b]: + + my_bin = b + + dist = {} + + # Get set of closest labelled vertices with distance = 1 + closest_neighbours = assembly_graph.neighbors(i, mode=ALL) + + # Determine whether all the closest labelled vertices have the same label as its own + neighbours_have_same_label = True + + neighbours_binned = False + + for neighbour in closest_neighbours: + for k in range(n_bins): + if neighbour in bins[k]: + neighbours_binned = True + if k != my_bin: + neighbours_have_same_label = False + break + + if not neighbours_have_same_label: + remove_labels.append(i) + elif neighbours_binned: + neighbours_have_same_label_list.append(i) + + for i in remove_labels: + for n in range(n_bins): + if i in bins[n]: + bins[n].remove(i) + + # Further remove labels of ambiguous vertices + binned_contigs = [] + + for n in range(n_bins): + binned_contigs = sorted(binned_contigs+bins[n]) + + for b in range(n_bins): + + for i in bins[b]: + + if i not in neighbours_have_same_label_list: + + my_bin = b + + # Get set of closest labelled vertices + closest_neighbours = getClosestLabelledVertices(assembly_graph, i, binned_contigs) + + if len(closest_neighbours) > 0: + + # Determine whether all the closest labelled vertices have the same label as its own + neighbours_have_same_label = True + + for neighbour in closest_neighbours: + for k in range(n_bins): + if neighbour in bins[k]: + if k != my_bin: + neighbours_have_same_label = False + break + + if not neighbours_have_same_label and i not in remove_labels: + remove_labels.append(i) + + remove_labels.sort() + + logger.info("Removing labels of ambiguous vertices") + + # Remove labels of ambiguous vertices + for i in remove_labels: + for n in range(n_bins): + if i in bins[n]: + bins[n].remove(i) + + logger.info("Obtaining the refined binning result") + + + # Get vertices which are not isolated and not in components without any labels + #----------------------------------------------------------------------------- + + logger.info("Deteremining vertices which are not isolated and not in components without any labels") + + non_isolated = [] + + for i in range(node_count): + + if i not in non_isolated and i in binned_contigs: + + component = [] + component.append(i) + length = len(component) + neighbours = assembly_graph.neighbors(i, mode=ALL) + + for neighbor in neighbours: + if neighbor not in component: + component.append(neighbor) + + component = list(set(component)) + + while length!= len(component): + + length = len(component) + + for j in component: + + neighbours = assembly_graph.neighbors(j, mode=ALL) + + for neighbor in neighbours: + if neighbor not in component: + component.append(neighbor) + + labelled = False + for j in component: + if j in binned_contigs: + labelled = True + break + + if labelled: + for j in component: + if j not in non_isolated: + non_isolated.append(j) + + logger.info("Number of non-isolated contigs: "+str(len(non_isolated))) + + + # Run label propagation + #----------------------- + + data = [] + + for contig in range(node_count): + + # Consider vertices that are not isolated + + if contig in non_isolated: + line = [] + line.append(contig) + + assigned = False + + for i in range(n_bins): + if contig in bins[i]: + line.append(i+1) + assigned = True + + if not assigned: + line.append(0) + + neighbours = assembly_graph.neighbors(contig, mode=ALL) + + neighs = [] + + for neighbour in neighbours: + n = [] + n.append(neighbour) + n.append(1.0) + neighs.append(n) + + line.append(neighs) + + data.append(line) + + lp = LabelProp() + + lp.load_data_from_mem(data) + + logger.info("Starting label propagation with eps="+str(diff_threshold)+" and max_iteration="+str(max_iteration)) + + ans = lp.run(diff_threshold, max_iteration, show_log=True, clean_result=False) + ans.sort() + + logger.info("Obtaining Label Propagation result") + + for l in ans: + for i in range(n_bins): + if l[1]==i+1 and l[0] not in bins[i]: + bins[i].append(l[0]) + + + # Remove labels of ambiguous vertices + #------------------------------------- + + logger.info("Determining ambiguous vertices") + + remove_labels = [] + + for b in range(n_bins): + + for i in bins[b]: + + my_bin = b + + closest_neighbours = assembly_graph.neighbors(i, mode=ALL) + + # Determine whether all the closest labelled vertices have the same label as its own + neighbours_have_same_label = True + + for neighbour in closest_neighbours: + for k in range(n_bins): + if neighbour in bins[k]: + if k != my_bin: + neighbours_have_same_label = False + break + + if not neighbours_have_same_label: + remove_labels.append(i) + + remove_labels.sort() + + logger.info("Removing labels of ambiguous vertices") + + # Remove labels of ambiguous vertices + for i in remove_labels: + for n in range(n_bins): + if i in bins[n]: + bins[n].remove(i) + + elapsed_time = time.time() - start_time + + logger.info("Obtaining the Final Refined Binning result") + + for i in range(n_bins): + bins[i].sort() + + # Print elapsed time for the process + logger.info("Elapsed time: "+str(elapsed_time)+" seconds") + + + # Write result to output file + #----------------------------- + + logger.info("Writing the Final Binning result to file") + + output_bins = [] + + for i in range(node_count): + for k in range(n_bins): + if i in bins[k]: + line = [] + line.append("NODE_"+str(contigs_map[i])) + line.append(k+1) + output_bins.append(line) + + output_file = output_path + prefix + 'graphbin_output.csv' + + with open(output_file, mode='w') as out_file: + output_writer = csv.writer(out_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) + + for row in output_bins: + output_writer.writerow(row) + + logger.info("Final binning results can be found at "+output_file) + + + unbinned_contigs = [] + + for i in range(node_count): + if i in remove_labels or i not in non_isolated: + line = [] + line.append("NODE_"+str(contigs_map[i])) + unbinned_contigs.append(line) + + if len(unbinned_contigs)!=0: + unbinned_file = output_path + prefix + 'graphbin_unbinned.csv' + + with open(unbinned_file, mode='w') as out_file: + output_writer = csv.writer(out_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) + + for row in unbinned_contigs: + output_writer.writerow(row) + + logger.info("Unbinned contigs can be found at "+unbinned_file) + + + # Exit program + #-------------- + + logger.info("Thank you for using GraphBin! Bye...!") + + logger.removeHandler(fileHandler) + logger.removeHandler(consoleHeader) + + +def main(): + # Setup argument parser + #--------------------------------------------------- + ap = PARSER + ap.add_argument( + "--paths", type=str, help="path to the contigs.paths file" + ) + args = ap.parse_args() + run(args) + + +if __name__ == "__main__": + main() diff --git a/src/labelpropagation/labelprop.py b/graphbin/labelpropagation/labelprop.py similarity index 100% rename from src/labelpropagation/labelprop.py rename to graphbin/labelpropagation/labelprop.py diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..6919089 --- /dev/null +++ b/setup.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python3 + +import os +from setuptools import setup + +with open("README.md") as f: + long_description = f.read() + +packages = ["graphbin"] +package_data = { + "graphbin": [ + "graphbin/*.py", + "graphbin/bidirectionalmap/*.py", + "graphbin/labelpropagation/*.py", + ] +} +data_files = [(".", ["LICENSE", "README.md"])] + +setup( + name='graphbin', + version="1.1", + author="Vijini Mallawaarachchi, Anuradha Wickramarachchi, and Yu Lin", + author_email="vijini.mallawaarachchi@anu.edu.au", + packages=packages, + package_data=package_data, + data_files=data_files, + include_package_data=True, + scripts=['bin/graphbin'], + url='https://github.com/Vini2/graphbin', + license='GPLv2', + long_description_content_type="text/markdown", + long_description=long_description, + description='Refined binning of metagenomic contigs using assembly graphs.', + classifiers=[ + 'Development Status :: 5 - Production/Stable', + 'License :: OSI Approved :: GNU General Public License v2 (GPLv2)', + 'Natural Language :: English', + 'Programming Language :: Python :: 3.6', + 'Topic :: Scientific/Engineering :: Bio-Informatics', + ], + install_requires=[ + "python-igraph", + "setuptools"], + zip_safe=False) diff --git a/src/graphbin_Canu.py b/src/graphbin_Canu.py deleted file mode 100644 index 5778a46..0000000 --- a/src/graphbin_Canu.py +++ /dev/null @@ -1,554 +0,0 @@ -#!/usr/bin/python3 - -"""graphbin_Canu.py: Refined binning of metagenomic contigs using Canu assembly graphs. - -GraphBin is a metagenomic contig binning tool that makes use of the contig -connectivity information from the assembly graph to bin contigs. It utilizes -the binning result of an existing binning tool and a label propagation algorithm -to correct mis-binned contigs and predict the labels of contigs which are -discarded due to short length. - -graphbin_Canu.py makes use of the assembly graphs produced by Canu long read assembler. -""" - -import sys -import os -import subprocess -import csv -import operator -import time -import argparse -import re -import logging - -from igraph import * -from labelpropagation.labelprop import LabelProp -from bidirectionalmap.bidirectionalmap import BidirectionalMap - -# Sample command -# ------------------------------------------------------------------- -# python graphbin_Canu.py --graph /path/to/graph_file.asqg -# --binned /path/to/binning_result.csv -# --output /path/to/output_folder -# ------------------------------------------------------------------- - - -# Setup logger -#----------------------- - -logger = logging.getLogger('GraphBin 1.1') -logger.setLevel(logging.INFO) -formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') -consoleHeader = logging.StreamHandler() -consoleHeader.setFormatter(formatter) -logger.addHandler(consoleHeader) - -start_time = time.time() - - -# Setup argument parser -#----------------------- - -ap = argparse.ArgumentParser() - -ap.add_argument("--graph", required=True, help="path to the assembly graph file") -ap.add_argument("--binned", required=True, help="path to the .csv file with the initial binning output from an existing tool") -ap.add_argument("--output", required=True, help="path to the output folder") -ap.add_argument("--prefix", required=False, help="prefix for the output file") -ap.add_argument("--max_iteration", required=False, type=int, help="maximum number of iterations for label propagation algorithm. [default: 100]") -ap.add_argument("--diff_threshold", required=False, type=float, help="difference threshold for label propagation algorithm. [default: 0.1]") - -args = vars(ap.parse_args()) - -assembly_graph_file = args["graph"] -contig_bins_file = args["binned"] -output_path = args["output"] -prefix = args["prefix"] -max_iteration = args["max_iteration"] -diff_threshold = args["diff_threshold"] - - -# Setup output path for log file -#--------------------------------------------------- - -fileHandler = logging.FileHandler(output_path+"/"+prefix+"graphbin.log") -fileHandler.setLevel(logging.INFO) -fileHandler.setFormatter(formatter) -logger.addHandler(fileHandler) - -logger.info("Welcome to GraphBin: Refined Binning of Metagenomic Contigs using Assembly Graphs.") -logger.info("This version of GraphBin makes use of the assembly graph produced by Canu which is a long reads assembler based on the OLC approach.") - -logger.info("Assembly graph file: "+assembly_graph_file) -logger.info("Existing binning output file: "+contig_bins_file) -logger.info("Final binning output file: "+output_path) -logger.info("Maximum number of iterations: "+str(max_iteration)) -logger.info("Difference threshold: "+str(diff_threshold)) - -logger.info("GraphBin started") - - -# Get the number of bins from the initial binning result -#-------------------------------------------------------- - -n_bins = 0 - -try: - all_bins_list = [] - - with open(contig_bins_file) as csvfile: - readCSV = csv.reader(csvfile, delimiter=',') - for row in readCSV: - all_bins_list.append(row[1]) - - bins_list = list(set(all_bins_list)) - bins_list.sort() - - n_bins = len(bins_list) - logger.info("Number of bins available in the binning result: "+str(n_bins)) -except: - logger.error("Please make sure that the correct path to the binning result file is provided and it is having the correct format.") - logger.info("Exiting GraphBin... Bye...!") - sys.exit(1) - - -logger.info("Constructing the assembly graph") - -# Get the links from the .gfa file -#----------------------------------- - -my_map = BidirectionalMap() - -node_count = 0 - -nodes = [] - -links = [] - -try: - # Get contig connections from .gfa file - with open(assembly_graph_file) as file: - line = file.readline() - - while line != "": - - # Count the number of contigs - if "S" in line: - strings = line.split("\t") - my_node = strings[1] - my_map[node_count] = my_node - nodes.append(my_node) - node_count += 1 - - # Identify lines with link information - elif "L" in line: - - link = [] - strings = line.split("\t") - - if strings[1] != strings[3]: - start = strings[1] - end = strings[3] - link.append(start) - link.append(end) - links.append(link) - - line = file.readline() - -except: - logger.error("Please make sure that the correct path to the assembly graph file is provided.") - logger.info("Exiting GraphBin... Bye...!") - sys.exit(1) - -contigs_map = my_map -contigs_map_rev = my_map.inverse - -logger.info("Total number of contigs available: "+str(node_count)) - - -## Construct the assembly graph -#------------------------------- - -try: - - # Create the graph - assembly_graph = Graph() - - # Create list of edges - edge_list = [] - - # Add vertices - assembly_graph.add_vertices(node_count) - - # Name vertices - for i in range(len(assembly_graph.vs)): - assembly_graph.vs[i]["id"]= i - assembly_graph.vs[i]["label"]= str(contigs_map[i]) - - # Iterate links - for link in links: - # Remove self loops - if link[0] != link[1]: - # Add edge to list of edges - edge_list.append((contigs_map_rev[link[0]], contigs_map_rev[link[1]])) - - # Add edges to the graph - assembly_graph.add_edges(edge_list) - assembly_graph.simplify(multiple=True, loops=False, combine_edges=None) - -except: - logger.error("Please make sure that the correct path to the assembly graph file is provided.") - logger.info("Exiting GraphBin... Bye...!") - sys.exit(1) - -logger.info("Total number of edges in the assembly graph: "+str(len(edge_list))) - - -# Get initial binning result -#---------------------------- - -logger.info("Obtaining the initial binning result") - -bins = [[] for x in range(n_bins)] - -try: - with open(contig_bins_file) as contig_bins: - readCSV = csv.reader(contig_bins, delimiter=',') - for row in readCSV: - contig_num = contigs_map_rev[row[0]] - - bin_num = int(row[1])-1 - bins[bin_num].append(contig_num) - - for i in range(n_bins): - bins[i].sort() - -except: - logger.error("Please make sure that you have provided the correct assembler type and the correct path to the binning result file in the correct format.") - logger.info("Exiting GraphBin... Bye...!") - sys.exit(1) - - -# Remove labels of ambiguous vertices -#------------------------------------- - -def getClosestLabelledVertices(graph, node, binned_contigs): - queu_l = [graph.neighbors(node, mode='ALL')] - visited_l = [node] - labelled = [] - - while len(queu_l) > 0: - active_level = queu_l.pop(0) - is_finish = False - visited_l += active_level - - for n in active_level: - if n in binned_contigs: - is_finish = True - labelled.append(n) - if is_finish: - return labelled - else: - temp = [] - for n in active_level: - temp += graph.neighbors(n, mode='ALL') - temp = list(set(temp)) - temp2 = [] - - for n in temp: - if n not in visited_l: - temp2.append(n) - if len(temp2) > 0: - queu_l.append(temp2) - return labelled - - -logger.info("Determining ambiguous vertices") - -remove_labels = [] - -neighbours_have_same_label_list = [] - -for b in range(n_bins): - - for i in bins[b]: - - my_bin = b - - dist = {} - - # Get set of closest labelled vertices with distance = 1 - closest_neighbours = assembly_graph.neighbors(i, mode=ALL) - - # Determine whether all the closest labelled vertices have the same label as its own - neighbours_have_same_label = True - - neighbours_binned = False - - for neighbour in closest_neighbours: - for k in range(n_bins): - if neighbour in bins[k]: - neighbours_binned = True - if k != my_bin: - neighbours_have_same_label = False - break - - if not neighbours_have_same_label: - remove_labels.append(i) - elif neighbours_binned: - neighbours_have_same_label_list.append(i) - -for i in remove_labels: - for n in range(n_bins): - if i in bins[n]: - bins[n].remove(i) - -# Further remove labels of ambiguous vertices -binned_contigs = [] - -for n in range(n_bins): - binned_contigs = sorted(binned_contigs+bins[n]) - -for b in range(n_bins): - - for i in bins[b]: - - if i not in neighbours_have_same_label_list: - - my_bin = b - - # Get set of closest labelled vertices - closest_neighbours = getClosestLabelledVertices(assembly_graph, i, binned_contigs) - - if len(closest_neighbours) > 0: - - # Determine whether all the closest labelled vertices have the same label as its own - neighbours_have_same_label = True - - for neighbour in closest_neighbours: - for k in range(n_bins): - if neighbour in bins[k]: - if k != my_bin: - neighbours_have_same_label = False - break - - if not neighbours_have_same_label and i not in remove_labels: - remove_labels.append(i) - -remove_labels.sort() - -logger.info("Removing labels of ambiguous vertices") - -# Remove labels of ambiguous vertices -for i in remove_labels: - for n in range(n_bins): - if i in bins[n]: - bins[n].remove(i) - -logger.info("Obtaining refined binning result") - - -# Get vertices which are not isolated and not in components without any labels -#----------------------------------------------------------------------------- - -logger.info("Deteremining vertices which are not isolated and not in components without any labels") - -non_isolated = [] - -for i in range(node_count): - - if i not in non_isolated and i in binned_contigs: - - component = [] - component.append(i) - length = len(component) - neighbours = assembly_graph.neighbors(i, mode=ALL) - - for neighbor in neighbours: - if neighbor not in component: - component.append(neighbor) - - component = list(set(component)) - - while length!= len(component): - - length = len(component) - - for j in component: - - neighbours = assembly_graph.neighbors(j, mode=ALL) - - for neighbor in neighbours: - if neighbor not in component: - component.append(neighbor) - - labelled = False - for j in component: - if j in binned_contigs: - labelled = True - break - - if labelled: - for j in component: - if j not in non_isolated: - non_isolated.append(j) - -logger.info("Number of non-isolated contigs: "+str(len(non_isolated))) - - -# Run label propagation -#----------------------- - -data = [] - -for contig in range(node_count): - - # Consider vertices that are not isolated - - if contig in non_isolated: - line = [] - line.append(contig) - - assigned = False - - for i in range(n_bins): - if contig in bins[i]: - line.append(i+1) - assigned = True - - if not assigned: - line.append(0) - - neighbours = assembly_graph.neighbors(contig, mode=ALL) - - neighs = [] - - for neighbour in neighbours: - n = [] - n.append(neighbour) - n.append(1.0) - neighs.append(n) - - line.append(neighs) - - data.append(line) - -lp = LabelProp() - -lp.load_data_from_mem(data) - -logger.info("Starting label propagation with eps="+str(diff_threshold)+" and max_iteration="+str(max_iteration)) - -ans = lp.run(diff_threshold, max_iteration, show_log=True, clean_result=False) -ans.sort() - -logger.info("Obtaining Label Propagation result") - -for l in ans: - for i in range(n_bins): - if l[1]==i+1 and l[0] not in bins[i]: - bins[i].append(l[0]) - - -# Remove labels of ambiguous vertices -#------------------------------------- - -logger.info("Determining ambiguous vertices") - -remove_labels = [] - -for b in range(n_bins): - - for i in bins[b]: - - my_bin = b - - closest_neighbours = assembly_graph.neighbors(i, mode=ALL) - - # Determine whether all the closest labelled vertices have the same label as its own - neighbours_have_same_label = True - - for neighbour in closest_neighbours: - for k in range(n_bins): - if neighbour in bins[k]: - if k != my_bin: - neighbours_have_same_label = False - break - - if not neighbours_have_same_label: - remove_labels.append(i) - -remove_labels.sort() - -logger.info("Removing labels of ambiguous vertices") - -# Remove labels of ambiguous vertices -for i in remove_labels: - for n in range(n_bins): - if i in bins[n]: - bins[n].remove(i) - -elapsed_time = time.time() - start_time - -logger.info("Obtaining the Final Refined Binning result") - -for i in range(n_bins): - bins[i].sort() - -# Print elapsed time for the process -logger.info("Elapsed time: "+str(elapsed_time)+" seconds") - - -# Write result to output file -#----------------------------- - -logger.info("Writing the Final Binning result to file") - -output_bins = [] - -for i in range(node_count): - for k in range(n_bins): - if i in bins[k]: - line = [] - line.append(str(contigs_map[i])) - line.append(k+1) - output_bins.append(line) - -output_file = output_path + prefix + 'graphbin_output.csv' - -with open(output_file, mode='w') as out_file: - output_writer = csv.writer(out_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) - - for row in output_bins: - output_writer.writerow(row) - -logger.info("Final binning results can be found at "+output_file) - - -unbinned_contigs = [] - -for i in range(node_count): - if i in remove_labels or i not in non_isolated: - line = [] - line.append(str(contigs_map[i])) - unbinned_contigs.append(line) - -if len(unbinned_contigs)!=0: - unbinned_file = output_path + prefix + 'graphbin_unbinned.csv' - - with open(unbinned_file, mode='w') as out_file: - output_writer = csv.writer(out_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) - - for row in unbinned_contigs: - output_writer.writerow(row) - - logger.info("Unbinned contigs can be found at "+unbinned_file) - - -# Exit program -#-------------- - -logger.info("Thank you for using GraphBin! Bye...!") - -logger.removeHandler(fileHandler) -logger.removeHandler(consoleHeader) \ No newline at end of file diff --git a/src/graphbin_Flye.py b/src/graphbin_Flye.py deleted file mode 100644 index 22156ad..0000000 --- a/src/graphbin_Flye.py +++ /dev/null @@ -1,554 +0,0 @@ -#!/usr/bin/python3 - -"""graphbin_Flye.py: Refined binning of metagenomic contigs using Flye assembly graphs. - -GraphBin is a metagenomic contig binning tool that makes use of the contig -connectivity information from the assembly graph to bin contigs. It utilizes -the binning result of an existing binning tool and a label propagation algorithm -to correct mis-binned contigs and predict the labels of contigs which are -discarded due to short length. - -graphbin_Flye.py makes use of the assembly graphs produced by Flye long read assembler. -""" - -import sys -import os -import subprocess -import csv -import operator -import time -import argparse -import re -import logging - -from igraph import * -from labelpropagation.labelprop import LabelProp -from bidirectionalmap.bidirectionalmap import BidirectionalMap - -# Sample command -# ------------------------------------------------------------------- -# python graphbin_Flye.py --graph /path/to/graph_file.asqg -# --binned /path/to/binning_result.csv -# --output /path/to/output_folder -# ------------------------------------------------------------------- - - -# Setup logger -#----------------------- - -logger = logging.getLogger('GraphBin 1.1') -logger.setLevel(logging.INFO) -formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') -consoleHeader = logging.StreamHandler() -consoleHeader.setFormatter(formatter) -logger.addHandler(consoleHeader) - -start_time = time.time() - - -# Setup argument parser -#----------------------- - -ap = argparse.ArgumentParser() - -ap.add_argument("--graph", required=True, help="path to the assembly graph file") -ap.add_argument("--binned", required=True, help="path to the .csv file with the initial binning output from an existing tool") -ap.add_argument("--output", required=True, help="path to the output folder") -ap.add_argument("--prefix", required=False, help="prefix for the output file") -ap.add_argument("--max_iteration", required=False, type=int, help="maximum number of iterations for label propagation algorithm. [default: 100]") -ap.add_argument("--diff_threshold", required=False, type=float, help="difference threshold for label propagation algorithm. [default: 0.1]") - -args = vars(ap.parse_args()) - -assembly_graph_file = args["graph"] -contig_bins_file = args["binned"] -output_path = args["output"] -prefix = args["prefix"] -max_iteration = args["max_iteration"] -diff_threshold = args["diff_threshold"] - - -# Setup output path for log file -#--------------------------------------------------- - -fileHandler = logging.FileHandler(output_path+"/"+prefix+"graphbin.log") -fileHandler.setLevel(logging.INFO) -fileHandler.setFormatter(formatter) -logger.addHandler(fileHandler) - -logger.info("Welcome to GraphBin: Refined Binning of Metagenomic Contigs using Assembly Graphs.") -logger.info("This version of GraphBin makes use of the assembly graph produced by Flye which is a long reads assembler based on the de Bruijn graph approach.") - -logger.info("Assembly graph file: "+assembly_graph_file) -logger.info("Existing binning output file: "+contig_bins_file) -logger.info("Final binning output file: "+output_path) -logger.info("Maximum number of iterations: "+str(max_iteration)) -logger.info("Difference threshold: "+str(diff_threshold)) - -logger.info("GraphBin started") - - -# Get the number of bins from the initial binning result -#-------------------------------------------------------- - -n_bins = 0 - -try: - all_bins_list = [] - - with open(contig_bins_file) as csvfile: - readCSV = csv.reader(csvfile, delimiter=',') - for row in readCSV: - all_bins_list.append(row[1]) - - bins_list = list(set(all_bins_list)) - bins_list.sort() - - n_bins = len(bins_list) - logger.info("Number of bins available in the binning result: "+str(n_bins)) -except: - logger.error("Please make sure that the correct path to the binning result file is provided and it is having the correct format.") - logger.info("Exiting GraphBin... Bye...!") - sys.exit(1) - - -logger.info("Constructing the assembly graph") - -# Get the links from the .gfa file -#----------------------------------- - -my_map = BidirectionalMap() - -node_count = 0 - -nodes = [] - -links = [] - -try: - # Get contig connections from .gfa file - with open(assembly_graph_file) as file: - line = file.readline() - - while line != "": - - # Count the number of contigs - if "S" in line: - strings = line.split("\t") - my_node = strings[1] - my_map[node_count] = my_node - nodes.append(my_node) - node_count += 1 - - # Identify lines with link information - elif "L" in line: - - link = [] - strings = line.split("\t") - - if strings[1] != strings[3]: - start = strings[1] - end = strings[3] - link.append(start) - link.append(end) - links.append(link) - - line = file.readline() - -except: - logger.error("Please make sure that the correct path to the assembly graph file is provided.") - logger.info("Exiting GraphBin... Bye...!") - sys.exit(1) - -contigs_map = my_map -contigs_map_rev = my_map.inverse - -logger.info("Total number of contigs available: "+str(node_count)) - - -## Construct the assembly graph -#------------------------------- - -try: - - # Create the graph - assembly_graph = Graph() - - # Create list of edges - edge_list = [] - - # Add vertices - assembly_graph.add_vertices(node_count) - - # Name vertices - for i in range(len(assembly_graph.vs)): - assembly_graph.vs[i]["id"]= i - assembly_graph.vs[i]["label"]= str(contigs_map[i]) - - # Iterate links - for link in links: - # Remove self loops - if link[0] != link[1]: - # Add edge to list of edges - edge_list.append((contigs_map_rev[link[0]], contigs_map_rev[link[1]])) - - # Add edges to the graph - assembly_graph.add_edges(edge_list) - assembly_graph.simplify(multiple=True, loops=False, combine_edges=None) - -except: - logger.error("Please make sure that the correct path to the assembly graph file is provided.") - logger.info("Exiting GraphBin... Bye...!") - sys.exit(1) - -logger.info("Total number of edges in the assembly graph: "+str(len(edge_list))) - - -# Get initial binning result -#---------------------------- - -logger.info("Obtaining the initial binning result") - -bins = [[] for x in range(n_bins)] - -try: - with open(contig_bins_file) as contig_bins: - readCSV = csv.reader(contig_bins, delimiter=',') - for row in readCSV: - contig_num = contigs_map_rev[row[0]] - - bin_num = int(row[1])-1 - bins[bin_num].append(contig_num) - - for i in range(n_bins): - bins[i].sort() - -except: - logger.error("Please make sure that you have provided the correct assembler type and the correct path to the binning result file in the correct format.") - logger.info("Exiting GraphBin... Bye...!") - sys.exit(1) - - -# Remove labels of ambiguous vertices -#------------------------------------- - -def getClosestLabelledVertices(graph, node, binned_contigs): - queu_l = [graph.neighbors(node, mode='ALL')] - visited_l = [node] - labelled = [] - - while len(queu_l) > 0: - active_level = queu_l.pop(0) - is_finish = False - visited_l += active_level - - for n in active_level: - if n in binned_contigs: - is_finish = True - labelled.append(n) - if is_finish: - return labelled - else: - temp = [] - for n in active_level: - temp += graph.neighbors(n, mode='ALL') - temp = list(set(temp)) - temp2 = [] - - for n in temp: - if n not in visited_l: - temp2.append(n) - if len(temp2) > 0: - queu_l.append(temp2) - return labelled - - -logger.info("Determining ambiguous vertices") - -remove_labels = [] - -neighbours_have_same_label_list = [] - -for b in range(n_bins): - - for i in bins[b]: - - my_bin = b - - dist = {} - - # Get set of closest labelled vertices with distance = 1 - closest_neighbours = assembly_graph.neighbors(i, mode=ALL) - - # Determine whether all the closest labelled vertices have the same label as its own - neighbours_have_same_label = True - - neighbours_binned = False - - for neighbour in closest_neighbours: - for k in range(n_bins): - if neighbour in bins[k]: - neighbours_binned = True - if k != my_bin: - neighbours_have_same_label = False - break - - if not neighbours_have_same_label: - remove_labels.append(i) - elif neighbours_binned: - neighbours_have_same_label_list.append(i) - -for i in remove_labels: - for n in range(n_bins): - if i in bins[n]: - bins[n].remove(i) - -# Further remove labels of ambiguous vertices -binned_contigs = [] - -for n in range(n_bins): - binned_contigs = sorted(binned_contigs+bins[n]) - -for b in range(n_bins): - - for i in bins[b]: - - if i not in neighbours_have_same_label_list: - - my_bin = b - - # Get set of closest labelled vertices - closest_neighbours = getClosestLabelledVertices(assembly_graph, i, binned_contigs) - - if len(closest_neighbours) > 0: - - # Determine whether all the closest labelled vertices have the same label as its own - neighbours_have_same_label = True - - for neighbour in closest_neighbours: - for k in range(n_bins): - if neighbour in bins[k]: - if k != my_bin: - neighbours_have_same_label = False - break - - if not neighbours_have_same_label and i not in remove_labels: - remove_labels.append(i) - -remove_labels.sort() - -logger.info("Removing labels of ambiguous vertices") - -# Remove labels of ambiguous vertices -for i in remove_labels: - for n in range(n_bins): - if i in bins[n]: - bins[n].remove(i) - -logger.info("Obtaining refined binning result") - - -# Get vertices which are not isolated and not in components without any labels -#----------------------------------------------------------------------------- - -logger.info("Deteremining vertices which are not isolated and not in components without any labels") - -non_isolated = [] - -for i in range(node_count): - - if i not in non_isolated and i in binned_contigs: - - component = [] - component.append(i) - length = len(component) - neighbours = assembly_graph.neighbors(i, mode=ALL) - - for neighbor in neighbours: - if neighbor not in component: - component.append(neighbor) - - component = list(set(component)) - - while length!= len(component): - - length = len(component) - - for j in component: - - neighbours = assembly_graph.neighbors(j, mode=ALL) - - for neighbor in neighbours: - if neighbor not in component: - component.append(neighbor) - - labelled = False - for j in component: - if j in binned_contigs: - labelled = True - break - - if labelled: - for j in component: - if j not in non_isolated: - non_isolated.append(j) - -logger.info("Number of non-isolated contigs: "+str(len(non_isolated))) - - -# Run label propagation -#----------------------- - -data = [] - -for contig in range(node_count): - - # Consider vertices that are not isolated - - if contig in non_isolated: - line = [] - line.append(contig) - - assigned = False - - for i in range(n_bins): - if contig in bins[i]: - line.append(i+1) - assigned = True - - if not assigned: - line.append(0) - - neighbours = assembly_graph.neighbors(contig, mode=ALL) - - neighs = [] - - for neighbour in neighbours: - n = [] - n.append(neighbour) - n.append(1.0) - neighs.append(n) - - line.append(neighs) - - data.append(line) - -lp = LabelProp() - -lp.load_data_from_mem(data) - -logger.info("Starting label propagation with eps="+str(diff_threshold)+" and max_iteration="+str(max_iteration)) - -ans = lp.run(diff_threshold, max_iteration, show_log=True, clean_result=False) -ans.sort() - -logger.info("Obtaining Label Propagation result") - -for l in ans: - for i in range(n_bins): - if l[1]==i+1 and l[0] not in bins[i]: - bins[i].append(l[0]) - - -# Remove labels of ambiguous vertices -#------------------------------------- - -logger.info("Determining ambiguous vertices") - -remove_labels = [] - -for b in range(n_bins): - - for i in bins[b]: - - my_bin = b - - closest_neighbours = assembly_graph.neighbors(i, mode=ALL) - - # Determine whether all the closest labelled vertices have the same label as its own - neighbours_have_same_label = True - - for neighbour in closest_neighbours: - for k in range(n_bins): - if neighbour in bins[k]: - if k != my_bin: - neighbours_have_same_label = False - break - - if not neighbours_have_same_label: - remove_labels.append(i) - -remove_labels.sort() - -logger.info("Removing labels of ambiguous vertices") - -# Remove labels of ambiguous vertices -for i in remove_labels: - for n in range(n_bins): - if i in bins[n]: - bins[n].remove(i) - -elapsed_time = time.time() - start_time - -logger.info("Obtaining the Final Refined Binning result") - -for i in range(n_bins): - bins[i].sort() - -# Print elapsed time for the process -logger.info("Elapsed time: "+str(elapsed_time)+" seconds") - - -# Write result to output file -#----------------------------- - -logger.info("Writing the Final Binning result to file") - -output_bins = [] - -for i in range(node_count): - for k in range(n_bins): - if i in bins[k]: - line = [] - line.append(str(contigs_map[i])) - line.append(k+1) - output_bins.append(line) - -output_file = output_path + prefix + 'graphbin_output.csv' - -with open(output_file, mode='w') as out_file: - output_writer = csv.writer(out_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) - - for row in output_bins: - output_writer.writerow(row) - -logger.info("Final binning results can be found at "+output_file) - - -unbinned_contigs = [] - -for i in range(node_count): - if i in remove_labels or i not in non_isolated: - line = [] - line.append(str(contigs_map[i])) - unbinned_contigs.append(line) - -if len(unbinned_contigs)!=0: - unbinned_file = output_path + prefix + 'graphbin_unbinned.csv' - - with open(unbinned_file, mode='w') as out_file: - output_writer = csv.writer(out_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) - - for row in unbinned_contigs: - output_writer.writerow(row) - - logger.info("Unbinned contigs can be found at "+unbinned_file) - - -# Exit program -#-------------- - -logger.info("Thank you for using GraphBin! Bye...!") - -logger.removeHandler(fileHandler) -logger.removeHandler(consoleHeader) \ No newline at end of file diff --git a/src/graphbin_MEGAHIT.py b/src/graphbin_MEGAHIT.py deleted file mode 100644 index 550dd9b..0000000 --- a/src/graphbin_MEGAHIT.py +++ /dev/null @@ -1,556 +0,0 @@ -#!/usr/bin/python3 - -"""graphbin_MEGAHIT.py: Refined binning of metagenomic contigs using MEGAHIT assembly graphs. - -GraphBin is a metagenomic contig binning tool that makes use of the contig -connectivity information from the assembly graph to bin contigs. It utilizes -the binning result of an existing binning tool and a label propagation algorithm -to correct mis-binned contigs and predict the labels of contigs which are -discarded due to short length. - -graphbin_MEGAHIT.py makes use of the assembly graphs produced by MEGAHIT assembler. -""" - -import sys -import os -import subprocess -import csv -import operator -import time -import argparse -import re -import logging - -from igraph import * -from labelpropagation.labelprop import LabelProp -from bidirectionalmap.bidirectionalmap import BidirectionalMap - -# Sample command -# ------------------------------------------------------------------- -# python graphbin_MEGAHIT.py --graph /path/to/graph_file.gfa -# --binned /path/to/binning_result.csv -# --output /path/to/output_folder -# ------------------------------------------------------------------- - - -# Setup logger -#----------------------- - -logger = logging.getLogger('GraphBin 1.1') -logger.setLevel(logging.INFO) -formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') -consoleHeader = logging.StreamHandler() -consoleHeader.setFormatter(formatter) -logger.addHandler(consoleHeader) - -start_time = time.time() - -# Setup argument parser -#----------------------- - -ap = argparse.ArgumentParser() - -ap.add_argument("--graph", required=True, help="path to the assembly graph file") -ap.add_argument("--binned", required=True, help="path to the .csv file with the initial binning output from an existing tool") -ap.add_argument("--output", required=False, help="path to the output folder") -ap.add_argument("--prefix", required=False, help="prefix for the output file") -ap.add_argument("--max_iteration", required=False, type=int, help="maximum number of iterations for label propagation algorithm. [default: 100]") -ap.add_argument("--diff_threshold", required=False, type=float, help="difference threshold for label propagation algorithm. [default: 0.1]") - -args = vars(ap.parse_args()) - -assembly_graph_file = args["graph"] -contig_bins_file = args["binned"] -output_path = args["output"] -prefix = args["prefix"] -max_iteration = args["max_iteration"] -diff_threshold = args["diff_threshold"] - - -# Setup output path for log file -#--------------------------------------------------- - -fileHandler = logging.FileHandler(output_path+"/"+prefix+"graphbin.log") -fileHandler.setLevel(logging.INFO) -fileHandler.setFormatter(formatter) -logger.addHandler(fileHandler) - -logger.info("Welcome to GraphBin: Refined Binning of Metagenomic Contigs using Assembly Graphs.") -logger.info("This version of GraphBin makes use of the assembly graph produced by MEGAHIT which is based on the de Bruijn graph approach.") - -logger.info("Assembly graph file: "+assembly_graph_file) -logger.info("Existing binning output file: "+contig_bins_file) -logger.info("Final binning output file: "+output_path) -logger.info("Maximum number of iterations: "+str(max_iteration)) -logger.info("Difference threshold: "+str(diff_threshold)) - -logger.info("GraphBin started") - - -# Get the number of bins from the initial binning result -#-------------------------------------------------------- - -n_bins = 0 - -try: - all_bins_list = [] - - with open(contig_bins_file) as csvfile: - readCSV = csv.reader(csvfile, delimiter=',') - for row in readCSV: - all_bins_list.append(row[1]) - - bins_list = list(set(all_bins_list)) - bins_list.sort() - - n_bins = len(bins_list) - logger.info("Number of bins available in the initial binning result: "+str(n_bins)) -except: - logger.error("Please make sure that the correct path to the initial binning result file is provided and it is having the correct format.") - logger.info("Exiting GraphBin... Bye...!") - sys.exit(1) - - -logger.info("Constructing the assembly graph") - - -## Construct the assembly graph -#------------------------------- - -node_count = 0 - -links = [] - -my_map = BidirectionalMap() - -try: - - # Get links from .gfa file - with open(assembly_graph_file) as file: - - line = file.readline() - - while line != "": - - # Identify lines with link information - if line.startswith("L"): - link = [] - - strings = line.split("\t") - - start_1 = 'NODE_' - end_1 = '_length' - - link1 = int(re.search('%s(.*)%s' % (start_1, end_1), strings[1]).group(1)) - - start_2 = 'NODE_' - end_2 = '_length' - - link2 = int(re.search('%s(.*)%s' % (start_2, end_2), strings[3]).group(1)) - - link.append(link1) - link.append(link2) - links.append(link) - - elif line.startswith("S"): - strings = line.split() - - start = 'NODE_' - end = '_length' - - contig_num = int(re.search('%s(.*)%s' % (start, end), strings[1]).group(1)) - - my_map[node_count] = int(contig_num) - - node_count += 1 - - line = file.readline() - - - logger.info("Total number of contigs available: "+str(node_count)) - - contigs_map = my_map - contigs_map_rev = my_map.inverse - - # Create graph - assembly_graph = Graph() - - # Add vertices - assembly_graph.add_vertices(node_count) - - # Create list of edges - edge_list = [] - - for i in range(node_count): - assembly_graph.vs[i]["id"]= i - assembly_graph.vs[i]["label"]= str(contigs_map[i]) - - # Iterate links - for link in links: - # Remove self loops - if link[0] != link[1]: - # Add edge to list of edges - edge_list.append((contigs_map_rev[link[0]], contigs_map_rev[link[1]])) - - # Add edges to the graph - assembly_graph.add_edges(edge_list) - assembly_graph.simplify(multiple=True, loops=False, combine_edges=None) - -except: - logger.error("Please make sure that the correct path to the assembly graph file is provided.") - logger.info("Exiting GraphBin... Bye...!") - sys.exit(1) - -logger.info("Total number of edges in the assembly graph: "+str(len(edge_list))) - - -# Get initial binning result -#---------------------------- - -logger.info("Obtaining the initial binning result") - -bins = [[] for x in range(n_bins)] - -try: - with open(contig_bins_file) as contig_bins: - readCSV = csv.reader(contig_bins, delimiter=',') - for row in readCSV: - start = 'NODE_' - end = '' - contig_num = contigs_map_rev[int(re.search('%s(.*)%s' % (start, end), row[0]).group(1))] - - bin_num = int(row[1])-1 - bins[bin_num].append(contig_num) - - for i in range(n_bins): - bins[i].sort() - -except: - logger.error("Please make sure that you have provided the correct assembler type and the correct path to the binning result file in the correct format.") - logger.info("Exiting GraphBin... Bye...!") - sys.exit(1) - - -# Remove labels of ambiguous vertices -#------------------------------------- - -def getClosestLabelledVertices(graph, node, binned_contigs): - queu_l = [graph.neighbors(node, mode='ALL')] - visited_l = [node] - labelled = [] - - while len(queu_l) > 0: - active_level = queu_l.pop(0) - is_finish = False - visited_l += active_level - - for n in active_level: - if n in binned_contigs: - is_finish = True - labelled.append(n) - if is_finish: - return labelled - else: - temp = [] - for n in active_level: - temp += graph.neighbors(n, mode='ALL') - temp = list(set(temp)) - temp2 = [] - - for n in temp: - if n not in visited_l: - temp2.append(n) - if len(temp2) > 0: - queu_l.append(temp2) - return labelled - - -logger.info("Determining ambiguous vertices") - -remove_labels = [] - -neighbours_have_same_label_list = [] - -for b in range(n_bins): - - for i in bins[b]: - - my_bin = b - - dist = {} - - # Get set of closest labelled vertices with distance = 1 - closest_neighbours = assembly_graph.neighbors(i, mode=ALL) - - # Determine whether all the closest labelled vertices have the same label as its own - neighbours_have_same_label = True - - neighbours_binned = False - - for neighbour in closest_neighbours: - for k in range(n_bins): - if neighbour in bins[k]: - neighbours_binned = True - if k != my_bin: - neighbours_have_same_label = False - break - - if not neighbours_have_same_label: - remove_labels.append(i) - elif neighbours_binned: - neighbours_have_same_label_list.append(i) - -for i in remove_labels: - for n in range(n_bins): - if i in bins[n]: - bins[n].remove(i) - -# Further remove labels of ambiguous vertices -binned_contigs = [] - -for n in range(n_bins): - binned_contigs = sorted(binned_contigs+bins[n]) - -for b in range(n_bins): - - for i in bins[b]: - - if i not in neighbours_have_same_label_list: - - my_bin = b - - # Get set of closest labelled vertices - closest_neighbours = getClosestLabelledVertices(assembly_graph, i, binned_contigs) - - if len(closest_neighbours) > 0: - - # Determine whether all the closest labelled vertices have the same label as its own - neighbours_have_same_label = True - - for neighbour in closest_neighbours: - for k in range(n_bins): - if neighbour in bins[k]: - if k != my_bin: - neighbours_have_same_label = False - break - - if not neighbours_have_same_label and i not in remove_labels: - remove_labels.append(i) - -remove_labels.sort() - -logger.info("Removing labels of ambiguous vertices") - -# Remove labels of ambiguous vertices -for i in remove_labels: - for n in range(n_bins): - if i in bins[n]: - bins[n].remove(i) - -logger.info("Obtaining refined binning result") - - -# Get vertices which are not isolated and not in components without any labels -#----------------------------------------------------------------------------- - -logger.info("Deteremining vertices which are not isolated and not in components without any labels") - -non_isolated = [] - -for i in range(node_count): - - if i not in non_isolated and i in binned_contigs: - - component = [] - component.append(i) - length = len(component) - neighbours = assembly_graph.neighbors(i, mode=ALL) - - for neighbor in neighbours: - if neighbor not in component: - component.append(neighbor) - - component = list(set(component)) - - while length!= len(component): - - length = len(component) - - for j in component: - - neighbours = assembly_graph.neighbors(j, mode=ALL) - - for neighbor in neighbours: - if neighbor not in component: - component.append(neighbor) - - labelled = False - for j in component: - if j in binned_contigs: - labelled = True - break - - if labelled: - for j in component: - if j not in non_isolated: - non_isolated.append(j) - -logger.info("Number of non-isolated contigs: "+str(len(non_isolated))) - - -# Run label propagation -#----------------------- - -data = [] - -for contig in range(node_count): - - # Consider vertices that are not isolated - - if contig in non_isolated: - line = [] - line.append(contig) - - assigned = False - - for i in range(n_bins): - if contig in bins[i]: - line.append(i+1) - assigned = True - - if not assigned: - line.append(0) - - neighbours = assembly_graph.neighbors(contig, mode=ALL) - - neighs = [] - - for neighbour in neighbours: - n = [] - n.append(neighbour) - n.append(1.0) - neighs.append(n) - - line.append(neighs) - - data.append(line) - -lp = LabelProp() - -lp.load_data_from_mem(data) - -logger.info("Starting label propagation with eps="+str(diff_threshold)+" and max_iteration="+str(max_iteration)) - -ans = lp.run(diff_threshold, max_iteration, show_log=True, clean_result=False) -ans.sort() - -logger.info("Obtaining Label Propagation result") - -for l in ans: - for i in range(n_bins): - if l[1]==i+1 and l[0] not in bins[i]: - bins[i].append(l[0]) - - -# Remove labels of ambiguous vertices -#------------------------------------- - -logger.info("Determining ambiguous vertices") - -remove_labels = [] - -for b in range(n_bins): - - for i in bins[b]: - - my_bin = b - - closest_neighbours = assembly_graph.neighbors(i, mode=ALL) - - # Determine whether all the closest labelled vertices have the same label as its own - neighbours_have_same_label = True - - for neighbour in closest_neighbours: - for k in range(n_bins): - if neighbour in bins[k]: - if k != my_bin: - neighbours_have_same_label = False - break - - if not neighbours_have_same_label: - remove_labels.append(i) - -remove_labels.sort() - -logger.info("Removing labels of ambiguous vertices") - -# Remove labels of ambiguous vertices -for i in remove_labels: - for n in range(n_bins): - if i in bins[n]: - bins[n].remove(i) - -elapsed_time = time.time() - start_time - -logger.info("Obtaining the Final Refined Binning result") - -for i in range(n_bins): - bins[i].sort() - -# Print elapsed time for the process -logger.info("Elapsed time: "+str(elapsed_time)+" seconds") - - -# Write result to output file -#----------------------------- - -logger.info("Writing the Final Binning result to file") - -output_bins = [] - -for i in range(node_count): - for k in range(n_bins): - if i in bins[k]: - line = [] - line.append("NODE_"+str(contigs_map[i])) - line.append(k+1) - output_bins.append(line) - -output_file = output_path + prefix + 'graphbin_output.csv' - -with open(output_file, mode='w') as out_file: - output_writer = csv.writer(out_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) - - for row in output_bins: - output_writer.writerow(row) - -logger.info("Final binning results can be found at "+output_file) - - -unbinned_contigs = [] - -for i in range(node_count): - if i in remove_labels or i not in non_isolated: - line = [] - line.append("NODE_"+str(contigs_map[i])) - unbinned_contigs.append(line) - -if len(unbinned_contigs)!=0: - unbinned_file = output_path + prefix + 'graphbin_unbinned.csv' - - with open(unbinned_file, mode='w') as out_file: - output_writer = csv.writer(out_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) - - for row in unbinned_contigs: - output_writer.writerow(row) - - logger.info("Unbinned contigs can be found at "+unbinned_file) - - -# Exit program -#-------------- - -logger.info("Thank you for using GraphBin! Bye...!") - -logger.removeHandler(fileHandler) -logger.removeHandler(consoleHeader) \ No newline at end of file diff --git a/src/graphbin_Miniasm.py b/src/graphbin_Miniasm.py deleted file mode 100644 index eb82489..0000000 --- a/src/graphbin_Miniasm.py +++ /dev/null @@ -1,554 +0,0 @@ -#!/usr/bin/python3 - -"""graphbin_Miniasm.py: Refined binning of metagenomic contigs using Miniasm assembly graphs. - -GraphBin is a metagenomic contig binning tool that makes use of the contig -connectivity information from the assembly graph to bin contigs. It utilizes -the binning result of an existing binning tool and a label propagation algorithm -to correct mis-binned contigs and predict the labels of contigs which are -discarded due to short length. - -graphbin_Miniasm.py makes use of the assembly graphs produced by Miniasm long read assembler. -""" - -import sys -import os -import subprocess -import csv -import operator -import time -import argparse -import re -import logging - -from igraph import * -from labelpropagation.labelprop import LabelProp -from bidirectionalmap.bidirectionalmap import BidirectionalMap - -# Sample command -# ------------------------------------------------------------------- -# python graphbin_Miniasm.py --graph /path/to/graph_file.asqg -# --binned /path/to/binning_result.csv -# --output /path/to/output_folder -# ------------------------------------------------------------------- - - -# Setup logger -#----------------------- - -logger = logging.getLogger('GraphBin 1.1') -logger.setLevel(logging.INFO) -formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') -consoleHeader = logging.StreamHandler() -consoleHeader.setFormatter(formatter) -logger.addHandler(consoleHeader) - -start_time = time.time() - - -# Setup argument parser -#----------------------- - -ap = argparse.ArgumentParser() - -ap.add_argument("--graph", required=True, help="path to the assembly graph file") -ap.add_argument("--binned", required=True, help="path to the .csv file with the initial binning output from an existing tool") -ap.add_argument("--output", required=True, help="path to the output folder") -ap.add_argument("--prefix", required=False, help="prefix for the output file") -ap.add_argument("--max_iteration", required=False, type=int, help="maximum number of iterations for label propagation algorithm. [default: 100]") -ap.add_argument("--diff_threshold", required=False, type=float, help="difference threshold for label propagation algorithm. [default: 0.1]") - -args = vars(ap.parse_args()) - -assembly_graph_file = args["graph"] -contig_bins_file = args["binned"] -output_path = args["output"] -prefix = args["prefix"] -max_iteration = args["max_iteration"] -diff_threshold = args["diff_threshold"] - - -# Setup output path for log file -#--------------------------------------------------- - -fileHandler = logging.FileHandler(output_path+"/"+prefix+"graphbin.log") -fileHandler.setLevel(logging.INFO) -fileHandler.setFormatter(formatter) -logger.addHandler(fileHandler) - -logger.info("Welcome to GraphBin: Refined Binning of Metagenomic Contigs using Assembly Graphs.") -logger.info("This version of GraphBin makes use of the assembly graph produced by Miniasm.") - -logger.info("Assembly graph file: "+assembly_graph_file) -logger.info("Existing binning output file: "+contig_bins_file) -logger.info("Final binning output file: "+output_path) -logger.info("Maximum number of iterations: "+str(max_iteration)) -logger.info("Difference threshold: "+str(diff_threshold)) - -logger.info("GraphBin started") - - -# Get the number of bins from the initial binning result -#-------------------------------------------------------- - -n_bins = 0 - -try: - all_bins_list = [] - - with open(contig_bins_file) as csvfile: - readCSV = csv.reader(csvfile, delimiter=',') - for row in readCSV: - all_bins_list.append(row[1]) - - bins_list = list(set(all_bins_list)) - bins_list.sort() - - n_bins = len(bins_list) - logger.info("Number of bins available in the binning result: "+str(n_bins)) -except: - logger.error("Please make sure that the correct path to the binning result file is provided and it is having the correct format.") - logger.info("Exiting GraphBin... Bye...!") - sys.exit(1) - - -logger.info("Constructing the assembly graph") - -# Get the links from the .gfa file -#----------------------------------- - -my_map = BidirectionalMap() - -node_count = 0 - -nodes = [] - -links = [] - -try: - # Get contig connections from .gfa file - with open(assembly_graph_file) as file: - line = file.readline() - - while line != "": - - # Count the number of contigs - if "S" in line: - strings = line.split("\t") - my_node = strings[1] - my_map[node_count] = my_node - nodes.append(my_node) - node_count += 1 - - # Identify lines with link information - elif "L" in line: - - link = [] - strings = line.split("\t") - - if strings[1] != strings[3]: - start = strings[1] - end = strings[3] - link.append(start) - link.append(end) - links.append(link) - - line = file.readline() - -except: - logger.error("Please make sure that the correct path to the assembly graph file is provided.") - logger.info("Exiting GraphBin... Bye...!") - sys.exit(1) - -contigs_map = my_map -contigs_map_rev = my_map.inverse - -logger.info("Total number of contigs available: "+str(node_count)) - - -## Construct the assembly graph -#------------------------------- - -try: - - # Create the graph - assembly_graph = Graph() - - # Create list of edges - edge_list = [] - - # Add vertices - assembly_graph.add_vertices(node_count) - - # Name vertices - for i in range(len(assembly_graph.vs)): - assembly_graph.vs[i]["id"]= i - assembly_graph.vs[i]["label"]= str(contigs_map[i]) - - # Iterate links - for link in links: - # Remove self loops - if link[0] != link[1]: - # Add edge to list of edges - edge_list.append((contigs_map_rev[link[0]], contigs_map_rev[link[1]])) - - # Add edges to the graph - assembly_graph.add_edges(edge_list) - assembly_graph.simplify(multiple=True, loops=False, combine_edges=None) - -except: - logger.error("Please make sure that the correct path to the assembly graph file is provided.") - logger.info("Exiting GraphBin... Bye...!") - sys.exit(1) - -logger.info("Total number of edges in the assembly graph: "+str(len(edge_list))) - - -# Get initial binning result -#---------------------------- - -logger.info("Obtaining the initial binning result") - -bins = [[] for x in range(n_bins)] - -try: - with open(contig_bins_file) as contig_bins: - readCSV = csv.reader(contig_bins, delimiter=',') - for row in readCSV: - contig_num = contigs_map_rev[row[0]] - - bin_num = int(row[1])-1 - bins[bin_num].append(contig_num) - - for i in range(n_bins): - bins[i].sort() - -except: - logger.error("Please make sure that you have provided the correct assembler type and the correct path to the binning result file in the correct format.") - logger.info("Exiting GraphBin... Bye...!") - sys.exit(1) - - -# Remove labels of ambiguous vertices -#------------------------------------- - -def getClosestLabelledVertices(graph, node, binned_contigs): - queu_l = [graph.neighbors(node, mode='ALL')] - visited_l = [node] - labelled = [] - - while len(queu_l) > 0: - active_level = queu_l.pop(0) - is_finish = False - visited_l += active_level - - for n in active_level: - if n in binned_contigs: - is_finish = True - labelled.append(n) - if is_finish: - return labelled - else: - temp = [] - for n in active_level: - temp += graph.neighbors(n, mode='ALL') - temp = list(set(temp)) - temp2 = [] - - for n in temp: - if n not in visited_l: - temp2.append(n) - if len(temp2) > 0: - queu_l.append(temp2) - return labelled - - -logger.info("Determining ambiguous vertices") - -remove_labels = [] - -neighbours_have_same_label_list = [] - -for b in range(n_bins): - - for i in bins[b]: - - my_bin = b - - dist = {} - - # Get set of closest labelled vertices with distance = 1 - closest_neighbours = assembly_graph.neighbors(i, mode=ALL) - - # Determine whether all the closest labelled vertices have the same label as its own - neighbours_have_same_label = True - - neighbours_binned = False - - for neighbour in closest_neighbours: - for k in range(n_bins): - if neighbour in bins[k]: - neighbours_binned = True - if k != my_bin: - neighbours_have_same_label = False - break - - if not neighbours_have_same_label: - remove_labels.append(i) - elif neighbours_binned: - neighbours_have_same_label_list.append(i) - -for i in remove_labels: - for n in range(n_bins): - if i in bins[n]: - bins[n].remove(i) - -# Further remove labels of ambiguous vertices -binned_contigs = [] - -for n in range(n_bins): - binned_contigs = sorted(binned_contigs+bins[n]) - -for b in range(n_bins): - - for i in bins[b]: - - if i not in neighbours_have_same_label_list: - - my_bin = b - - # Get set of closest labelled vertices - closest_neighbours = getClosestLabelledVertices(assembly_graph, i, binned_contigs) - - if len(closest_neighbours) > 0: - - # Determine whether all the closest labelled vertices have the same label as its own - neighbours_have_same_label = True - - for neighbour in closest_neighbours: - for k in range(n_bins): - if neighbour in bins[k]: - if k != my_bin: - neighbours_have_same_label = False - break - - if not neighbours_have_same_label and i not in remove_labels: - remove_labels.append(i) - -remove_labels.sort() - -logger.info("Removing labels of ambiguous vertices") - -# Remove labels of ambiguous vertices -for i in remove_labels: - for n in range(n_bins): - if i in bins[n]: - bins[n].remove(i) - -logger.info("Obtaining refined binning result") - - -# Get vertices which are not isolated and not in components without any labels -#----------------------------------------------------------------------------- - -logger.info("Deteremining vertices which are not isolated and not in components without any labels") - -non_isolated = [] - -for i in range(node_count): - - if i not in non_isolated and i in binned_contigs: - - component = [] - component.append(i) - length = len(component) - neighbours = assembly_graph.neighbors(i, mode=ALL) - - for neighbor in neighbours: - if neighbor not in component: - component.append(neighbor) - - component = list(set(component)) - - while length!= len(component): - - length = len(component) - - for j in component: - - neighbours = assembly_graph.neighbors(j, mode=ALL) - - for neighbor in neighbours: - if neighbor not in component: - component.append(neighbor) - - labelled = False - for j in component: - if j in binned_contigs: - labelled = True - break - - if labelled: - for j in component: - if j not in non_isolated: - non_isolated.append(j) - -logger.info("Number of non-isolated contigs: "+str(len(non_isolated))) - - -# Run label propagation -#----------------------- - -data = [] - -for contig in range(node_count): - - # Consider vertices that are not isolated - - if contig in non_isolated: - line = [] - line.append(contig) - - assigned = False - - for i in range(n_bins): - if contig in bins[i]: - line.append(i+1) - assigned = True - - if not assigned: - line.append(0) - - neighbours = assembly_graph.neighbors(contig, mode=ALL) - - neighs = [] - - for neighbour in neighbours: - n = [] - n.append(neighbour) - n.append(1.0) - neighs.append(n) - - line.append(neighs) - - data.append(line) - -lp = LabelProp() - -lp.load_data_from_mem(data) - -logger.info("Starting label propagation with eps="+str(diff_threshold)+" and max_iteration="+str(max_iteration)) - -ans = lp.run(diff_threshold, max_iteration, show_log=True, clean_result=False) -ans.sort() - -logger.info("Obtaining Label Propagation result") - -for l in ans: - for i in range(n_bins): - if l[1]==i+1 and l[0] not in bins[i]: - bins[i].append(l[0]) - - -# Remove labels of ambiguous vertices -#------------------------------------- - -logger.info("Determining ambiguous vertices") - -remove_labels = [] - -for b in range(n_bins): - - for i in bins[b]: - - my_bin = b - - closest_neighbours = assembly_graph.neighbors(i, mode=ALL) - - # Determine whether all the closest labelled vertices have the same label as its own - neighbours_have_same_label = True - - for neighbour in closest_neighbours: - for k in range(n_bins): - if neighbour in bins[k]: - if k != my_bin: - neighbours_have_same_label = False - break - - if not neighbours_have_same_label: - remove_labels.append(i) - -remove_labels.sort() - -logger.info("Removing labels of ambiguous vertices") - -# Remove labels of ambiguous vertices -for i in remove_labels: - for n in range(n_bins): - if i in bins[n]: - bins[n].remove(i) - -elapsed_time = time.time() - start_time - -logger.info("Obtaining the Final Refined Binning result") - -for i in range(n_bins): - bins[i].sort() - -# Print elapsed time for the process -logger.info("Elapsed time: "+str(elapsed_time)+" seconds") - - -# Write result to output file -#----------------------------- - -logger.info("Writing the Final Binning result to file") - -output_bins = [] - -for i in range(node_count): - for k in range(n_bins): - if i in bins[k]: - line = [] - line.append(str(contigs_map[i])) - line.append(k+1) - output_bins.append(line) - -output_file = output_path + prefix + 'graphbin_output.csv' - -with open(output_file, mode='w') as out_file: - output_writer = csv.writer(out_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) - - for row in output_bins: - output_writer.writerow(row) - -logger.info("Final binning results can be found at "+output_file) - - -unbinned_contigs = [] - -for i in range(node_count): - if i in remove_labels or i not in non_isolated: - line = [] - line.append(str(contigs_map[i])) - unbinned_contigs.append(line) - -if len(unbinned_contigs)!=0: - unbinned_file = output_path + prefix + 'graphbin_unbinned.csv' - - with open(unbinned_file, mode='w') as out_file: - output_writer = csv.writer(out_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) - - for row in unbinned_contigs: - output_writer.writerow(row) - - logger.info("Unbinned contigs can be found at "+unbinned_file) - - -# Exit program -#-------------- - -logger.info("Thank you for using GraphBin! Bye...!") - -logger.removeHandler(fileHandler) -logger.removeHandler(consoleHeader) \ No newline at end of file diff --git a/src/graphbin_SGA.py b/src/graphbin_SGA.py deleted file mode 100644 index b9f2b4e..0000000 --- a/src/graphbin_SGA.py +++ /dev/null @@ -1,548 +0,0 @@ -#!/usr/bin/python3 - -"""graphbin_SGA.py: Refined binning of metagenomic contigs using SGA assembly graphs. - -GraphBin is a metagenomic contig binning tool that makes use of the contig -connectivity information from the assembly graph to bin contigs. It utilizes -the binning result of an existing binning tool and a label propagation algorithm -to correct mis-binned contigs and predict the labels of contigs which are -discarded due to short length. - -graphbin_SGA.py makes use of the assembly graphs produced by SGA (String Graph Assembler). -""" - -import sys -import os -import subprocess -import csv -import operator -import time -import argparse -import re -import logging - -from igraph import * -from labelpropagation.labelprop import LabelProp -from bidirectionalmap.bidirectionalmap import BidirectionalMap - -# Sample command -# ------------------------------------------------------------------- -# python graphbin_SGA.py --graph /path/to/graph_file.asqg -# --binned /path/to/binning_result.csv -# --output /path/to/output_folder -# ------------------------------------------------------------------- - - -# Setup logger -#----------------------- - -logger = logging.getLogger('GraphBin 1.1') -logger.setLevel(logging.INFO) -formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') -consoleHeader = logging.StreamHandler() -consoleHeader.setFormatter(formatter) -logger.addHandler(consoleHeader) - -start_time = time.time() - - -# Setup argument parser -#----------------------- - -ap = argparse.ArgumentParser() - -ap.add_argument("--graph", required=True, help="path to the assembly graph file") -ap.add_argument("--binned", required=True, help="path to the .csv file with the initial binning output from an existing tool") -ap.add_argument("--output", required=True, help="path to the output folder") -ap.add_argument("--prefix", required=False, help="prefix for the output file") -ap.add_argument("--max_iteration", required=False, type=int, help="maximum number of iterations for label propagation algorithm. [default: 100]") -ap.add_argument("--diff_threshold", required=False, type=float, help="difference threshold for label propagation algorithm. [default: 0.1]") - -args = vars(ap.parse_args()) - -assembly_graph_file = args["graph"] -contig_bins_file = args["binned"] -output_path = args["output"] -prefix = args["prefix"] -max_iteration = args["max_iteration"] -diff_threshold = args["diff_threshold"] - - -# Setup output path for log file -#--------------------------------------------------- - -fileHandler = logging.FileHandler(output_path+"/"+prefix+"graphbin.log") -fileHandler.setLevel(logging.INFO) -fileHandler.setFormatter(formatter) -logger.addHandler(fileHandler) - -logger.info("Welcome to GraphBin: Refined Binning of Metagenomic Contigs using Assembly Graphs.") -logger.info("This version of GraphBin makes use of the assembly graph produced by SGA which is based on the OLC (more recent string graph) approach.") - -logger.info("Assembly graph file: "+assembly_graph_file) -logger.info("Existing binning output file: "+contig_bins_file) -logger.info("Final binning output file: "+output_path) -logger.info("Maximum number of iterations: "+str(max_iteration)) -logger.info("Difference threshold: "+str(diff_threshold)) - -logger.info("GraphBin started") - - -# Get the number of bins from the initial binning result -#-------------------------------------------------------- - -n_bins = 0 - -try: - all_bins_list = [] - - with open(contig_bins_file) as csvfile: - readCSV = csv.reader(csvfile, delimiter=',') - for row in readCSV: - all_bins_list.append(row[1]) - - bins_list = list(set(all_bins_list)) - bins_list.sort() - - n_bins = len(bins_list) - logger.info("Number of bins available in the binning result: "+str(n_bins)) -except: - logger.error("Please make sure that the correct path to the binning result file is provided and it is having the correct format.") - logger.info("Exiting GraphBin... Bye...!") - sys.exit(1) - - -logger.info("Constructing the assembly graph") - -# Get the links from the .asqg file -#----------------------------------- - -links = [] - -my_map = BidirectionalMap() - -node_count = 0 - -try: - # Get contig connections from .asqg file - with open(assembly_graph_file) as file: - line = file.readline() - - while line != "": - - # Count the number of contigs - if "VT" in line: - start = 'contig-' - end = '' - contig_num = int(re.search('%s(.*)%s' % (start, end), str(line.split()[1])).group(1)) - my_map[node_count] = contig_num - node_count += 1 - - # Identify lines with link information - elif "ED" in line: - link = [] - strings = line.split("\t")[1].split() - link.append(int(strings[0][7:])) - link.append(int(strings[1][7:])) - links.append(link) - line = file.readline() - -except: - logger.error("Please make sure that the correct path to the assembly graph file is provided.") - logger.info("Exiting GraphBin... Bye...!") - sys.exit(1) - -contigs_map = my_map -contigs_map_rev = my_map.inverse - -logger.info("Total number of contigs available: "+str(node_count)) - - -## Construct the assembly graph -#------------------------------- - -try: - - # Create the graph - assembly_graph = Graph() - - # Create list of edges - edge_list = [] - - # Add vertices - assembly_graph.add_vertices(node_count) - - # Name vertices - for i in range(len(assembly_graph.vs)): - assembly_graph.vs[i]["id"]= i - assembly_graph.vs[i]["label"]= str(contigs_map[i]) - - # Iterate links - for link in links: - # Remove self loops - if link[0] != link[1]: - # Add edge to list of edges - edge_list.append((contigs_map_rev[link[0]], contigs_map_rev[link[1]])) - - # Add edges to the graph - assembly_graph.add_edges(edge_list) - assembly_graph.simplify(multiple=True, loops=False, combine_edges=None) - -except: - logger.error("Please make sure that the correct path to the assembly graph file is provided.") - logger.info("Exiting GraphBin... Bye...!") - sys.exit(1) - -logger.info("Total number of edges in the assembly graph: "+str(len(edge_list))) - - -# Get initial binning result -#---------------------------- - -logger.info("Obtaining the initial binning result") - -bins = [[] for x in range(n_bins)] - -try: - with open(contig_bins_file) as contig_bins: - readCSV = csv.reader(contig_bins, delimiter=',') - for row in readCSV: - start = 'contig-' - end = '' - contig_num = contigs_map_rev[int(re.search('%s(.*)%s' % (start, end), row[0]).group(1))] - - bin_num = int(row[1])-1 - bins[bin_num].append(contig_num) - - for i in range(n_bins): - bins[i].sort() - -except: - logger.error("Please make sure that you have provided the correct assembler type and the correct path to the binning result file in the correct format.") - logger.info("Exiting GraphBin... Bye...!") - sys.exit(1) - - -# Remove labels of ambiguous vertices -#------------------------------------- - -def getClosestLabelledVertices(graph, node, binned_contigs): - queu_l = [graph.neighbors(node, mode='ALL')] - visited_l = [node] - labelled = [] - - while len(queu_l) > 0: - active_level = queu_l.pop(0) - is_finish = False - visited_l += active_level - - for n in active_level: - if n in binned_contigs: - is_finish = True - labelled.append(n) - if is_finish: - return labelled - else: - temp = [] - for n in active_level: - temp += graph.neighbors(n, mode='ALL') - temp = list(set(temp)) - temp2 = [] - - for n in temp: - if n not in visited_l: - temp2.append(n) - if len(temp2) > 0: - queu_l.append(temp2) - return labelled - - -logger.info("Determining ambiguous vertices") - -remove_labels = [] - -neighbours_have_same_label_list = [] - -for b in range(n_bins): - - for i in bins[b]: - - my_bin = b - - dist = {} - - # Get set of closest labelled vertices with distance = 1 - closest_neighbours = assembly_graph.neighbors(i, mode=ALL) - - # Determine whether all the closest labelled vertices have the same label as its own - neighbours_have_same_label = True - - neighbours_binned = False - - for neighbour in closest_neighbours: - for k in range(n_bins): - if neighbour in bins[k]: - neighbours_binned = True - if k != my_bin: - neighbours_have_same_label = False - break - - if not neighbours_have_same_label: - remove_labels.append(i) - elif neighbours_binned: - neighbours_have_same_label_list.append(i) - -for i in remove_labels: - for n in range(n_bins): - if i in bins[n]: - bins[n].remove(i) - -# Further remove labels of ambiguous vertices -binned_contigs = [] - -for n in range(n_bins): - binned_contigs = sorted(binned_contigs+bins[n]) - -for b in range(n_bins): - - for i in bins[b]: - - if i not in neighbours_have_same_label_list: - - my_bin = b - - # Get set of closest labelled vertices - closest_neighbours = getClosestLabelledVertices(assembly_graph, i, binned_contigs) - - if len(closest_neighbours) > 0: - - # Determine whether all the closest labelled vertices have the same label as its own - neighbours_have_same_label = True - - for neighbour in closest_neighbours: - for k in range(n_bins): - if neighbour in bins[k]: - if k != my_bin: - neighbours_have_same_label = False - break - - if not neighbours_have_same_label and i not in remove_labels: - remove_labels.append(i) - -remove_labels.sort() - -logger.info("Removing labels of ambiguous vertices") - -# Remove labels of ambiguous vertices -for i in remove_labels: - for n in range(n_bins): - if i in bins[n]: - bins[n].remove(i) - -logger.info("Obtaining refined binning result") - - -# Get vertices which are not isolated and not in components without any labels -#----------------------------------------------------------------------------- - -logger.info("Deteremining vertices which are not isolated and not in components without any labels") - -non_isolated = [] - -for i in range(node_count): - - if i not in non_isolated and i in binned_contigs: - - component = [] - component.append(i) - length = len(component) - neighbours = assembly_graph.neighbors(i, mode=ALL) - - for neighbor in neighbours: - if neighbor not in component: - component.append(neighbor) - - component = list(set(component)) - - while length!= len(component): - - length = len(component) - - for j in component: - - neighbours = assembly_graph.neighbors(j, mode=ALL) - - for neighbor in neighbours: - if neighbor not in component: - component.append(neighbor) - - labelled = False - for j in component: - if j in binned_contigs: - labelled = True - break - - if labelled: - for j in component: - if j not in non_isolated: - non_isolated.append(j) - -logger.info("Number of non-isolated contigs: "+str(len(non_isolated))) - - -# Run label propagation -#----------------------- - -data = [] - -for contig in range(node_count): - - # Consider vertices that are not isolated - - if contig in non_isolated: - line = [] - line.append(contig) - - assigned = False - - for i in range(n_bins): - if contig in bins[i]: - line.append(i+1) - assigned = True - - if not assigned: - line.append(0) - - neighbours = assembly_graph.neighbors(contig, mode=ALL) - - neighs = [] - - for neighbour in neighbours: - n = [] - n.append(neighbour) - n.append(1.0) - neighs.append(n) - - line.append(neighs) - - data.append(line) - -lp = LabelProp() - -lp.load_data_from_mem(data) - -logger.info("Starting label propagation with eps="+str(diff_threshold)+" and max_iteration="+str(max_iteration)) - -ans = lp.run(diff_threshold, max_iteration, show_log=True, clean_result=False) -ans.sort() - -logger.info("Obtaining Label Propagation result") - -for l in ans: - for i in range(n_bins): - if l[1]==i+1 and l[0] not in bins[i]: - bins[i].append(l[0]) - - -# Remove labels of ambiguous vertices -#------------------------------------- - -logger.info("Determining ambiguous vertices") - -remove_labels = [] - -for b in range(n_bins): - - for i in bins[b]: - - my_bin = b - - closest_neighbours = assembly_graph.neighbors(i, mode=ALL) - - # Determine whether all the closest labelled vertices have the same label as its own - neighbours_have_same_label = True - - for neighbour in closest_neighbours: - for k in range(n_bins): - if neighbour in bins[k]: - if k != my_bin: - neighbours_have_same_label = False - break - - if not neighbours_have_same_label: - remove_labels.append(i) - -remove_labels.sort() - -logger.info("Removing labels of ambiguous vertices") - -# Remove labels of ambiguous vertices -for i in remove_labels: - for n in range(n_bins): - if i in bins[n]: - bins[n].remove(i) - -elapsed_time = time.time() - start_time - -logger.info("Obtaining the Final Refined Binning result") - -for i in range(n_bins): - bins[i].sort() - -# Print elapsed time for the process -logger.info("Elapsed time: "+str(elapsed_time)+" seconds") - - -# Write result to output file -#----------------------------- - -logger.info("Writing the Final Binning result to file") - -output_bins = [] - -for i in range(node_count): - for k in range(n_bins): - if i in bins[k]: - line = [] - line.append("contig-"+str(contigs_map[i])) - line.append(k+1) - output_bins.append(line) - -output_file = output_path + prefix + 'graphbin_output.csv' - -with open(output_file, mode='w') as out_file: - output_writer = csv.writer(out_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) - - for row in output_bins: - output_writer.writerow(row) - -logger.info("Final binning results can be found at "+output_file) - - -unbinned_contigs = [] - -for i in range(node_count): - if i in remove_labels or i not in non_isolated: - line = [] - line.append("contig-"+str(contigs_map[i])) - unbinned_contigs.append(line) - -if len(unbinned_contigs)!=0: - unbinned_file = output_path + prefix + 'graphbin_unbinned.csv' - - with open(unbinned_file, mode='w') as out_file: - output_writer = csv.writer(out_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) - - for row in unbinned_contigs: - output_writer.writerow(row) - - logger.info("Unbinned contigs can be found at "+unbinned_file) - - -# Exit program -#-------------- - -logger.info("Thank you for using GraphBin! Bye...!") - -logger.removeHandler(fileHandler) -logger.removeHandler(consoleHeader) \ No newline at end of file diff --git a/src/graphbin_SPAdes.py b/src/graphbin_SPAdes.py deleted file mode 100644 index ea4d96e..0000000 --- a/src/graphbin_SPAdes.py +++ /dev/null @@ -1,613 +0,0 @@ -#!/usr/bin/python3 - -"""graphbin_SPAdes.py: Refined binning of metagenomic contigs using SPAdes assembly graphs. - -GraphBin is a metagenomic contig binning tool that makes use of the contig -connectivity information from the assembly graph to bin contigs. It utilizes -the binning result of an existing binning tool and a label propagation algorithm -to correct mis-binned contigs and predict the labels of contigs which are -discarded due to short length. - -graphbin_SPAdes.py makes use of the assembly graphs produced by SPAdes. -""" - -import sys -import os -import subprocess -import csv -import operator -import time -import argparse -import re -import logging - -from igraph import * -from collections import defaultdict -from labelpropagation.labelprop import LabelProp -from bidirectionalmap.bidirectionalmap import BidirectionalMap - -# Sample command -# ------------------------------------------------------------------- -# python graphbin_SPAdes.py --graph /path/to/graph_file.gfa -# --paths /path/to/paths_file.paths -# --binned /path/to/binning_result.csv -# --output /path/to/output_folder -# ------------------------------------------------------------------- - - -# Setup logger -#----------------------- - -logger = logging.getLogger('GraphBin 1.1') -logger.setLevel(logging.INFO) -formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') -consoleHeader = logging.StreamHandler() -consoleHeader.setFormatter(formatter) -logger.addHandler(consoleHeader) - -start_time = time.time() - - -# Setup argument parser -#--------------------------------------------------- - -ap = argparse.ArgumentParser() - -ap.add_argument("--graph", required=True, help="path to the assembly graph file") -ap.add_argument("--paths", required=True, help="path to the contigs.paths file") -ap.add_argument("--binned", required=True, help="path to the .csv file with the initial binning output from an existing tool") -ap.add_argument("--output", required=True, help="path to the output folder") -ap.add_argument("--prefix", required=False, help="prefix for the output file") -ap.add_argument("--max_iteration", required=False, type=int, help="maximum number of iterations for label propagation algorithm. [default: 100]") -ap.add_argument("--diff_threshold", required=False, type=float, help="difference threshold for label propagation algorithm. [default: 0.1]") - -args = vars(ap.parse_args()) - -assembly_graph_file = args["graph"] -contig_paths = args["paths"] -contig_bins_file = args["binned"] -output_path = args["output"] -prefix = args["prefix"] -max_iteration = args["max_iteration"] -diff_threshold = args["diff_threshold"] - - -# Setup output path for log file -#--------------------------------------------------- - -fileHandler = logging.FileHandler(output_path+"/"+prefix+"graphbin.log") -fileHandler.setLevel(logging.INFO) -fileHandler.setFormatter(formatter) -logger.addHandler(fileHandler) - -logger.info("Welcome to GraphBin: Refined Binning of Metagenomic Contigs using Assembly Graphs.") -logger.info("This version of GraphBin makes use of the assembly graph produced by SPAdes which is based on the de Bruijn graph approach.") - -logger.info("Input arguments:") -logger.info("Assembly graph file: "+assembly_graph_file) -logger.info("Contig paths file: "+contig_paths) -logger.info("Existing binning output file: "+contig_bins_file) -logger.info("Final binning output file: "+output_path) -logger.info("Maximum number of iterations: "+str(max_iteration)) -logger.info("Difference threshold: "+str(diff_threshold)) - -logger.info("GraphBin started") - - -# Get the number of bins from the initial binning result -#--------------------------------------------------- - -n_bins = 0 - -try: - all_bins_list = [] - - with open(contig_bins_file) as csvfile: - readCSV = csv.reader(csvfile, delimiter=',') - for row in readCSV: - all_bins_list.append(row[1]) - - bins_list = list(set(all_bins_list)) - bins_list.sort() - - n_bins = len(bins_list) - logger.info("Number of bins available in the initial binning result: "+str(n_bins)) -except: - logger.error("Please make sure that the correct path to the initial binning result file is provided and it is having the correct format.") - logger.info("Exiting GraphBin... Bye...!") - sys.exit(1) - - -logger.info("Constructing the assembly graph") - -# Get contig paths from contigs.paths -#------------------------------------- - -paths = {} -segment_contigs = {} -node_count = 0 - -my_map = BidirectionalMap() - -current_contig_num = "" - -try: - with open(contig_paths) as file: - name = file.readline() - path = file.readline() - - while name != "" and path != "": - - while ";" in path: - path = path[:-2]+","+file.readline() - - start = 'NODE_' - end = '_length_' - contig_num = str(int(re.search('%s(.*)%s' % (start, end), name).group(1))) - - segments = path.rstrip().split(",") - - if current_contig_num != contig_num: - my_map[node_count] = int(contig_num) - current_contig_num = contig_num - node_count += 1 - - if contig_num not in paths: - paths[contig_num] = [segments[0], segments[-1]] - - for segment in segments: - if segment not in segment_contigs: - segment_contigs[segment] = set([contig_num]) - else: - segment_contigs[segment].add(contig_num) - - name = file.readline() - path = file.readline() - -except: - logger.error("Please make sure that the correct path to the contig paths file is provided.") - logger.info("Exiting GraphBin... Bye...!") - sys.exit(1) - -contigs_map = my_map -contigs_map_rev = my_map.inverse - -logger.info("Total number of contigs available: "+str(node_count)) - -links = [] -links_map = defaultdict(set) - - -## Construct the assembly graph -#------------------------------- - -try: - # Get links from assembly_graph_with_scaffolds.gfa - with open(assembly_graph_file) as file: - line = file.readline() - - while line != "": - - # Identify lines with link information - if "L" in line: - strings = line.split("\t") - f1, f2 = strings[1]+strings[2], strings[3]+strings[4] - links_map[f1].add(f2) - links_map[f2].add(f1) - links.append(strings[1]+strings[2]+" "+strings[3]+strings[4]) - line = file.readline() - - - # Create graph - assembly_graph = Graph() - - # Add vertices - assembly_graph.add_vertices(node_count) - - # Create list of edges - edge_list = [] - - # Name vertices - for i in range(node_count): - assembly_graph.vs[i]["id"]= i - assembly_graph.vs[i]["label"]= str(contigs_map[i]) - - for i in range(len(paths)): - segments = paths[str(contigs_map[i])] - - start = segments[0] - start_rev = "" - - if start.endswith("+"): - start_rev = start[:-1]+"-" - else: - start_rev = start[:-1]+"+" - - end = segments[1] - end_rev = "" - - if end.endswith("+"): - end_rev = end[:-1]+"-" - else: - end_rev = end[:-1]+"+" - - new_links = [] - - if start in links_map: - new_links.extend(list(links_map[start])) - if start_rev in links_map: - new_links.extend(list(links_map[start_rev])) - if end in links_map: - new_links.extend(list(links_map[end])) - if end_rev in links_map: - new_links.extend(list(links_map[end_rev])) - - for new_link in new_links: - if new_link in segment_contigs: - for contig in segment_contigs[new_link]: - if i!=contigs_map_rev[int(contig)]: - # Add edge to list of edges - edge_list.append((i,contigs_map_rev[int(contig)])) - - # Add edges to the graph - assembly_graph.add_edges(edge_list) - assembly_graph.simplify(multiple=True, loops=False, combine_edges=None) - -except: - logger.error("Please make sure that the correct path to the assembly graph file is provided.") - logger.info("Exiting GraphBin... Bye...!") - sys.exit(1) - -logger.info("Total number of edges in the assembly graph: "+str(len(edge_list))) - - -# Get initial binning result -#---------------------------- - -logger.info("Obtaining the initial binning result") - -bins = [[] for x in range(n_bins)] - -try: - with open(contig_bins_file) as contig_bins: - readCSV = csv.reader(contig_bins, delimiter=',') - for row in readCSV: - start = 'NODE_' - end = '' - contig_num = contigs_map_rev[int(re.search('%s(.*)%s' % (start, end), row[0]).group(1))] - - bin_num = int(row[1])-1 - bins[bin_num].append(contig_num) - - for i in range(n_bins): - bins[i].sort() - -except: - logger.error("Please make sure that you have provided the correct assembler type and the correct path to the binning result file in the correct format.") - logger.info("Exiting GraphBin... Bye...!") - sys.exit(1) - - -# Remove labels of ambiguous vertices -#------------------------------------- - -def getClosestLabelledVertices(graph, node, binned_contigs): - queu_l = [graph.neighbors(node, mode='ALL')] - visited_l = [node] - labelled = [] - - while len(queu_l) > 0: - active_level = queu_l.pop(0) - is_finish = False - visited_l += active_level - - for n in active_level: - if n in binned_contigs: - is_finish = True - labelled.append(n) - if is_finish: - return labelled - else: - temp = [] - for n in active_level: - temp += graph.neighbors(n, mode='ALL') - temp = list(set(temp)) - temp2 = [] - - for n in temp: - if n not in visited_l: - temp2.append(n) - if len(temp2) > 0: - queu_l.append(temp2) - return labelled - - -logger.info("Determining ambiguous vertices") - -remove_labels = [] - -neighbours_have_same_label_list = [] - -for b in range(n_bins): - - for i in bins[b]: - - my_bin = b - - dist = {} - - # Get set of closest labelled vertices with distance = 1 - closest_neighbours = assembly_graph.neighbors(i, mode=ALL) - - # Determine whether all the closest labelled vertices have the same label as its own - neighbours_have_same_label = True - - neighbours_binned = False - - for neighbour in closest_neighbours: - for k in range(n_bins): - if neighbour in bins[k]: - neighbours_binned = True - if k != my_bin: - neighbours_have_same_label = False - break - - if not neighbours_have_same_label: - remove_labels.append(i) - elif neighbours_binned: - neighbours_have_same_label_list.append(i) - -for i in remove_labels: - for n in range(n_bins): - if i in bins[n]: - bins[n].remove(i) - -# Further remove labels of ambiguous vertices -binned_contigs = [] - -for n in range(n_bins): - binned_contigs = sorted(binned_contigs+bins[n]) - -for b in range(n_bins): - - for i in bins[b]: - - if i not in neighbours_have_same_label_list: - - my_bin = b - - # Get set of closest labelled vertices - closest_neighbours = getClosestLabelledVertices(assembly_graph, i, binned_contigs) - - if len(closest_neighbours) > 0: - - # Determine whether all the closest labelled vertices have the same label as its own - neighbours_have_same_label = True - - for neighbour in closest_neighbours: - for k in range(n_bins): - if neighbour in bins[k]: - if k != my_bin: - neighbours_have_same_label = False - break - - if not neighbours_have_same_label and i not in remove_labels: - remove_labels.append(i) - -remove_labels.sort() - -logger.info("Removing labels of ambiguous vertices") - -# Remove labels of ambiguous vertices -for i in remove_labels: - for n in range(n_bins): - if i in bins[n]: - bins[n].remove(i) - -logger.info("Obtaining the refined binning result") - - -# Get vertices which are not isolated and not in components without any labels -#----------------------------------------------------------------------------- - -logger.info("Deteremining vertices which are not isolated and not in components without any labels") - -non_isolated = [] - -for i in range(node_count): - - if i not in non_isolated and i in binned_contigs: - - component = [] - component.append(i) - length = len(component) - neighbours = assembly_graph.neighbors(i, mode=ALL) - - for neighbor in neighbours: - if neighbor not in component: - component.append(neighbor) - - component = list(set(component)) - - while length!= len(component): - - length = len(component) - - for j in component: - - neighbours = assembly_graph.neighbors(j, mode=ALL) - - for neighbor in neighbours: - if neighbor not in component: - component.append(neighbor) - - labelled = False - for j in component: - if j in binned_contigs: - labelled = True - break - - if labelled: - for j in component: - if j not in non_isolated: - non_isolated.append(j) - -logger.info("Number of non-isolated contigs: "+str(len(non_isolated))) - - -# Run label propagation -#----------------------- - -data = [] - -for contig in range(node_count): - - # Consider vertices that are not isolated - - if contig in non_isolated: - line = [] - line.append(contig) - - assigned = False - - for i in range(n_bins): - if contig in bins[i]: - line.append(i+1) - assigned = True - - if not assigned: - line.append(0) - - neighbours = assembly_graph.neighbors(contig, mode=ALL) - - neighs = [] - - for neighbour in neighbours: - n = [] - n.append(neighbour) - n.append(1.0) - neighs.append(n) - - line.append(neighs) - - data.append(line) - -lp = LabelProp() - -lp.load_data_from_mem(data) - -logger.info("Starting label propagation with eps="+str(diff_threshold)+" and max_iteration="+str(max_iteration)) - -ans = lp.run(diff_threshold, max_iteration, show_log=True, clean_result=False) -ans.sort() - -logger.info("Obtaining Label Propagation result") - -for l in ans: - for i in range(n_bins): - if l[1]==i+1 and l[0] not in bins[i]: - bins[i].append(l[0]) - - -# Remove labels of ambiguous vertices -#------------------------------------- - -logger.info("Determining ambiguous vertices") - -remove_labels = [] - -for b in range(n_bins): - - for i in bins[b]: - - my_bin = b - - closest_neighbours = assembly_graph.neighbors(i, mode=ALL) - - # Determine whether all the closest labelled vertices have the same label as its own - neighbours_have_same_label = True - - for neighbour in closest_neighbours: - for k in range(n_bins): - if neighbour in bins[k]: - if k != my_bin: - neighbours_have_same_label = False - break - - if not neighbours_have_same_label: - remove_labels.append(i) - -remove_labels.sort() - -logger.info("Removing labels of ambiguous vertices") - -# Remove labels of ambiguous vertices -for i in remove_labels: - for n in range(n_bins): - if i in bins[n]: - bins[n].remove(i) - -elapsed_time = time.time() - start_time - -logger.info("Obtaining the Final Refined Binning result") - -for i in range(n_bins): - bins[i].sort() - -# Print elapsed time for the process -logger.info("Elapsed time: "+str(elapsed_time)+" seconds") - - -# Write result to output file -#----------------------------- - -logger.info("Writing the Final Binning result to file") - -output_bins = [] - -for i in range(node_count): - for k in range(n_bins): - if i in bins[k]: - line = [] - line.append("NODE_"+str(contigs_map[i])) - line.append(k+1) - output_bins.append(line) - -output_file = output_path + prefix + 'graphbin_output.csv' - -with open(output_file, mode='w') as out_file: - output_writer = csv.writer(out_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) - - for row in output_bins: - output_writer.writerow(row) - -logger.info("Final binning results can be found at "+output_file) - - -unbinned_contigs = [] - -for i in range(node_count): - if i in remove_labels or i not in non_isolated: - line = [] - line.append("NODE_"+str(contigs_map[i])) - unbinned_contigs.append(line) - -if len(unbinned_contigs)!=0: - unbinned_file = output_path + prefix + 'graphbin_unbinned.csv' - - with open(unbinned_file, mode='w') as out_file: - output_writer = csv.writer(out_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) - - for row in unbinned_contigs: - output_writer.writerow(row) - - logger.info("Unbinned contigs can be found at "+unbinned_file) - - -# Exit program -#-------------- - -logger.info("Thank you for using GraphBin! Bye...!") - -logger.removeHandler(fileHandler) -logger.removeHandler(consoleHeader) \ No newline at end of file