diff --git a/GetOrganelleLib/assembly_parser.py b/GetOrganelleLib/assembly_parser.py index 24497da..245e7fd 100755 --- a/GetOrganelleLib/assembly_parser.py +++ b/GetOrganelleLib/assembly_parser.py @@ -37,11 +37,11 @@ def minimize(self, fun=None, x0=None, jac=None, method=None, bounds=None, constr import random from copy import deepcopy -major_version, minor_version = sys.version_info[:2] -if major_version == 2 and minor_version >= 7: +MAJOR_VERSION, MINOR_VERSION = sys.version_info[:2] +if MAJOR_VERSION == 2 and MINOR_VERSION >= 7: python_version = "2.7+" RecursionError = RuntimeError -elif major_version == 3 and minor_version >= 5: +elif MAJOR_VERSION == 3 and MINOR_VERSION >= 5: python_version = "3.5+" else: sys.stdout.write("Python version have to be 2.7+ or 3.5+") @@ -50,7 +50,7 @@ def minimize(self, fun=None, x0=None, jac=None, method=None, bounds=None, constr class ProcessingGraphFailed(Exception): - def __init__(self, value): + def __init__(self, value=""): self.value = value def __str__(self): @@ -3001,8 +3001,8 @@ def export_path(self, in_path): return Sequence(",".join(seq_names), "".join(seq_segments)) -class NaiveDeBruijnGraph(Assembly): - def __init__(self, fasta_file, kmer_len=55, circular="auto", circular_head_ends="(circular)"): +class NaiveKmerNodeGraph(Assembly): + def __init__(self, fasta_file, kmer_len=55, circular="auto", circular_head_ends="(circular)", single_chain=False): """ :param fasta_file: :param kmer_len: @@ -3010,7 +3010,7 @@ def __init__(self, fasta_file, kmer_len=55, circular="auto", circular_head_ends= :param circular_head_ends: :return: """ - super(NaiveDeBruijnGraph, self).__init__(overlap=kmer_len - 1) + super(NaiveKmerNodeGraph, self).__init__(overlap=kmer_len - 1) assert circular in ("auto", "yes", "no") assert kmer_len >= 3 and kmer_len % 2 == 1 self.__kmer = kmer_len # overlap is actually kmer_len - 1 @@ -3033,7 +3033,8 @@ def __init__(self, fasta_file, kmer_len=55, circular="auto", circular_head_ends= 
self.vertex_info[this_vertex] = this_v_info = Vertex(this_vertex, kmer_len, 1., this_kmer_seq) # record the connection as dict() rather than set() for counting self.vertex_info[this_vertex].connections = {True: {}, False: {}} - recorded_kmers[this_v_info.seq[False]] = this_vertex, not this_end + if not single_chain: + recorded_kmers[this_v_info.seq[False]] = this_vertex, not this_end if go_circle: # add connection between the first kmer and the last kmer if the seq is circular prev_kmer_seq = kmer_list[- 1] @@ -3045,7 +3046,8 @@ def __init__(self, fasta_file, kmer_len=55, circular="auto", circular_head_ends= recorded_kmers[prev_kmer_seq] = prev_vertex, prev_end = str(count_vertices), True self.vertex_info[prev_vertex] = prev_v_info = Vertex(prev_vertex, kmer_len, 0., prev_kmer_seq) self.vertex_info[prev_vertex].connections = {True: {}, False: {}} - recorded_kmers[prev_v_info.seq[False]] = prev_vertex, not prev_end + if not single_chain: + recorded_kmers[prev_v_info.seq[False]] = prev_vertex, not prev_end if (this_vertex, not this_end) not in self.vertex_info[prev_vertex].connections[prev_end]: self.vertex_info[prev_vertex].connections[prev_end][(this_vertex, not this_end)] = 0 self.vertex_info[prev_vertex].connections[prev_end][(this_vertex, not this_end)] += 1 @@ -3064,7 +3066,8 @@ def __init__(self, fasta_file, kmer_len=55, circular="auto", circular_head_ends= recorded_kmers[this_kmer_seq] = this_vertex, this_end = str(count_vertices), True self.vertex_info[this_vertex] = this_v_info = Vertex(this_vertex, kmer_len, 1., this_kmer_seq) self.vertex_info[this_vertex].connections = {True: {}, False: {}} - recorded_kmers[this_v_info.seq[False]] = this_vertex, not this_end + if not single_chain: + recorded_kmers[this_v_info.seq[False]] = this_vertex, not this_end # add the connection between this_kmer_seq and prev_kmer_seq prev_kmer_seq = kmer_list[go_to - 1] prev_vertex, prev_end = recorded_kmers[prev_kmer_seq] diff --git a/GetOrganelleLib/pipe_control_func.py 
b/GetOrganelleLib/pipe_control_func.py index 2cd99d0..67118a0 100755 --- a/GetOrganelleLib/pipe_control_func.py +++ b/GetOrganelleLib/pipe_control_func.py @@ -5,10 +5,10 @@ import os from multiprocessing import Pool -major_version, minor_version = sys.version_info[:2] -if major_version == 2 and minor_version >= 7: +MAJOR_VERSION, MINOR_VERSION = sys.version_info[:2] +if MAJOR_VERSION == 2 and MINOR_VERSION >= 7: python_version = "2.7+" -elif major_version == 3 and minor_version >= 5: +elif MAJOR_VERSION == 3 and MINOR_VERSION >= 5: python_version = "3.5+" else: sys.stdout.write("Python version have to be 2.7+ or 3.5+") @@ -74,6 +74,15 @@ def timed_log(log, output_base, prefix, log_level="NOTSET"): return log_timed +if MAJOR_VERSION == 2: + class TimeoutError(Exception): + def __init__(self, value=""): + self.value = value + + def __str__(self): + return repr(self.value) + + def set_time_limit(num, flag_str="'--time-limit'"): def wrap(func): def handle(sig_num, interrupted_stack_frame): diff --git a/GetOrganelleLib/versions.py b/GetOrganelleLib/versions.py index e249b5c..1fd547a 100644 --- a/GetOrganelleLib/versions.py +++ b/GetOrganelleLib/versions.py @@ -5,22 +5,26 @@ def get_versions(): versions = [ + {"number": "1.6.3a", + "features": [ + "1. Minor bugs fixes", + ], + "time": "2020-02-27 17:14 GMT-6"}, {"number": "1.6.3-beta", "features": [ - "1. log plastome info", - "2. get_organelle_from_assembly.py & disentangle_organelle_assembly.py: --max-multiplicity added", - "3. Assembly.estimate_copy_and_depth_precisely() modified: constraint_max_function() for --max-multiplicity", - "4. Assembly.tag_in_between() modified", - "5. Assembly.estimate_copy_and_depth_by_cov() modified: min average coverage limit", - "6. Assembly.processing_polymorphism():" + "1. get_organelle_from_assembly.py & disentangle_organelle_assembly.py: --max-multiplicity added", + "2. Assembly.estimate_copy_and_depth_precisely() modified: constraint_max_function() for --max-multiplicity", + "3. 
Assembly.tag_in_between() modified", + "4. Assembly.estimate_copy_and_depth_by_cov() modified: min average coverage limit", + "5. Assembly.processing_polymorphism():" " fix a bug when kmer-len repeats shared by two contigs; fix a bug that cause RuntimeError", - "7. Assembly: too many results due to palindromic repeats, problem solved", - "8. Utilities/reconstruct_graph_from_fasta.py & NaiveDeBruijnGraph added", - "9. Utilities/gfa_to_fasta.py, Utilities/fastg_to_gfa.py: description corrected", - "10. Assembly.parse_gfa(): compatibility increased", - "11. Utilities/gfa2fastg.py: compatibility increased", - "12. Assembly.estimate_copy_and_depth_precisely(): fix a bug on a rare case that multiplicities res are 4,8,4", - "13. README.md: updated", + "6. Assembly: too many results due to palindromic repeats, problem solved", + "7. Utilities/reconstruct_graph_from_fasta.py & NaiveKmerNodeGraph added", + "8. Utilities/gfa_to_fasta.py, Utilities/fastg_to_gfa.py: description corrected", + "9. Assembly.parse_gfa(): compatibility increased", + "10. Utilities/gfa2fastg.py: compatibility increased", + "11. Assembly.estimate_copy_and_depth_precisely(): fix a bug on a rare case that multiplicities res are 4,8,4", + "12. README.md: updated", ], "time": "2020-02-22 02:40 GMT-6"}, {"number": "1.6.2e", diff --git a/README.md b/README.md index d4edfcc..3139c8f 100644 --- a/README.md +++ b/README.md @@ -114,7 +114,7 @@ Perl is required for the wrapper of Bowtie2, but we assume that it was builtin i Bandage is a fantastic tool to view the assembly graph (`*.fastg`/`*.gfa`). If you have Bandage correctly configured and add the binary folder of Bandage (which is `Bandage.app/Contents/MacOS` for MacOS) to the $PATH, get_organelle_from_*.py would automatically generate the a png formatted image of the assembly graph. -If you installed python library psutil (version >= 3.0; pip install psutil), the memory cost of get_organelle_from_reads.py will be automatically logged. 
If you want to evaluate your results and plot the evaluation with `evaluate_assembly_using_mapping.py` and `round_statistics.py`, you have to further install python library matplotlib (pip install matplotlib). +If you have installed the Python library psutil (version >= 3.0; pip install -U psutil), the memory cost of get_organelle_from_reads.py will be logged automatically. If you want to evaluate your results and plot the evaluation with `evaluate_assembly_using_mapping.py` and `round_statistics.py`, you also need to install the Python library matplotlib (pip install matplotlib). ## How To diff --git a/Utilities/reconstruct_graph_from_fasta.py b/Utilities/reconstruct_graph_from_fasta.py index 47e5ea2..d0aaa9f 100755 --- a/Utilities/reconstruct_graph_from_fasta.py +++ b/Utilities/reconstruct_graph_from_fasta.py @@ -17,7 +17,7 @@ def get_options(): usage="reconstruct_graph_from_fasta.py -i fasta_file -o out.gfa") parser.add_option("-i", dest="input", help="Input fasta file.") - parser.add_option("-o", dest="output", + parser.add_option("-o", dest="output", default="", help="Output graph file. The output format is GFA by default, but FASTG only when " "indicated with postfix '.fastg'.") parser.add_option("-k", dest="kmer", default=55, type=int, @@ -26,8 +26,13 @@ def get_options(): help="Sequences in input fasta file are all circular (yes/no/auto). " "The auto mode enables detection by checking the existence of '(circular)' in " "the end of the header of each sequence. Default:%default") + parser.add_option("--single-chain", dest="single_chain", default=False, action="store_true", + help="The input sequence(s) was by default treated as DNA double-chain with its complementary " + "sequence. 
Choose this flag to turn off.") + parser.add_option("--out-kg", dest="out_kg", default="", + help="Output kmer node graph.") options, argv = parser.parse_args() - if not (options.output and options.input): + if not ((options.output or options.out_kg) and options.input): parser.print_help() sys.stdout.write("Insufficient arguments!\n") sys.exit() @@ -46,15 +51,20 @@ def main(): time_0 = time.time() options, argv = get_options() # detect postfix - de_burijn_graph = NaiveDeBruijnGraph(options.input, kmer_len=options.kmer, circular=options.circular) - assembly_graph = de_burijn_graph.generate_assembly_graph() - if options.output.endswith(".fastg"): - sys.stdout.warning("Fastg is not recommended!\n") - assembly_graph.write_to_fastg(options.output) - else: - assembly_graph.write_to_gfa(options.output) - # de_burijn_graph.write_to_gfa(options.output + ".db.gfa") - sys.stdout.write("Took " + "%.4f" % (time.time() - time_0) + "s in generating " + options.output + "\n") + kmer_node_graph = NaiveKmerNodeGraph(options.input, kmer_len=options.kmer, + circular=options.circular, single_chain=options.single_chain) + if options.output: + assembly_graph = kmer_node_graph.generate_assembly_graph() + if options.output.endswith(".fastg"): + sys.stdout.write("Fastg is not recommended!\n") # file objects have no warning() method, so emit the notice with write() + assembly_graph.write_to_fastg(options.output) + else: + assembly_graph.write_to_gfa(options.output) + if options.out_kg: + kmer_node_graph.write_to_gfa(options.out_kg) + sys.stdout.write("Took " + "%.4f" % (time.time() - time_0) + "s in generating " + + options.output * int(bool(options.output)) + ", " * int(bool(options.output and options.out_kg)) + + options.out_kg * int(bool(options.out_kg)) + "\n") if __name__ == '__main__': diff --git a/get_organelle_from_assembly.py b/get_organelle_from_assembly.py index d85f768..8326291 100755 --- a/get_organelle_from_assembly.py +++ b/get_organelle_from_assembly.py @@ -421,15 +421,6 @@ def get_options(description, version): return options, log_handler -if 
MAJOR_VERSION == 2: - class TimeoutError(Exception): - def __init__(self, value): - self.value = value - - def __str__(self): - return repr(self.value) - - def slim_spades_result(organelle_types, in_custom, ex_custom, graph_in, graph_out_base, verbose_log, log_handler, threads, which_slim, which_blast="", other_options="", resume=False, keep_temp=False): @@ -682,7 +673,7 @@ def disentangle_inside(fastg_f, tab_f, o_p, w_f, log_in, type_f=3., mode_in="emb raise e except RuntimeError as e: log_handler.info("Disentangling failed: RuntimeError: " + str(e).strip()) - except TimeoutError: + except TimeoutError as e: log_handler.info("Disentangling timeout. (see " + timeout_flag + " for more)") except ProcessingGraphFailed as e: log_handler.info("Disentangling failed: " + str(e).strip()) @@ -719,7 +710,7 @@ def disentangle_inside(fastg_f, tab_f, o_p, w_f, log_in, type_f=3., mode_in="emb if verbose: log_handler.exception("") log_handler.info("Disentangling failed: RuntimeError: " + str(e).strip()) - except TimeoutError: + except TimeoutError as e: log_handler.info("Disentangling timeout. (see " + timeout_flag + " for more)") except ProcessingGraphFailed as e: log_handler.info("Disentangling failed: " + str(e).strip()) diff --git a/get_organelle_from_reads.py b/get_organelle_from_reads.py index d03ca9a..32d41e6 100755 --- a/get_organelle_from_reads.py +++ b/get_organelle_from_reads.py @@ -903,15 +903,6 @@ def get_options(description, version): return options, log_handler, previous_attributes -if MAJOR_VERSION == 2: - class TimeoutError(Exception): - def __init__(self, value): - self.value = value - - def __str__(self): - return repr(self.value) - - def estimate_maximum_n_reads_using_mapping( twice_max_coverage, check_dir, original_fq_list, reads_paired, designed_maximum_n_reads, seed_files, organelle_types, target_genome_sizes,