From 75f80e2d077f2977a42b19cf5388f274886ab7e0 Mon Sep 17 00:00:00 2001 From: moshi <41852815+moshi4@users.noreply.github.com> Date: Fri, 29 Oct 2021 00:25:12 +0900 Subject: [PATCH] Fix clean code (#64) --- .github/workflows/ci.yml | 3 + README.md | 48 ++++++++------- src/fastdtlmapper/args.py | 21 +++++-- src/fastdtlmapper/goea/goea.py | 71 ++++++++++------------ src/fastdtlmapper/out_path.py | 2 +- src/fastdtlmapper/scripts/FastDTLgoea.py | 16 ++++- src/fastdtlmapper/scripts/FastDTLmapper.py | 34 +++++++---- src/fastdtlmapper/setup_binpath.py | 4 +- src/fastdtlmapper/util/time.py | 26 ++++++++ src/fastdtlmapper/util/tree.py | 5 +- tests/goea/test_goea.py | 11 ++-- tests/test_args.py | 2 +- tests/test_out_path.py | 6 +- 13 files changed, 158 insertions(+), 91 deletions(-) create mode 100644 src/fastdtlmapper/util/time.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5939d5e..5accc6e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -6,6 +6,9 @@ on: pull_request: branches: main paths: ["src/**", "tests/**", ".github/workflows/**"] + schedule: + # Scheduled Daily CI + - cron: "0 0 * * *" jobs: CI_black-flake8-pytest: diff --git a/README.md b/README.md index 8d0cd11..2585477 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# FastDTLmapper: Fast genome-wide DTL event mapper +# FastDTLmapper: Fast genome-wide DTL event mapper ![Python3](https://img.shields.io/badge/Language-Python_3.7_|_3.8_|_3.9-steelblue) ![OS](https://img.shields.io/badge/OS-Linux-steelblue) @@ -24,7 +24,7 @@ driving adaptive evolution, but it remains largely unexplored. Therefore, to investigate the relationship between gene gain/loss and adaptive evolution in the evolutionary process of organisms, I developed a software pipeline **FastDTLmapper** which automatically estimates and maps genome-wide gene gain/loss. -FastDTLmapper takes two inputs, 1. *Species tree (Newick format)* 2. *Genomic Protein CDSs (Fasta|Genbank format)*, +FastDTLmapper takes two inputs, 1. *Species tree (Newick format)* & 2. *Genomic Protein CDSs (Fasta|Genbank format)*, and performs genome-wide mapping of DTL(Duplication-Transfer-Loss) events by DTL reconciliation of species tree and gene trees. Additionally, FastDTLmapper can perform @@ -162,25 +162,25 @@ This is brief description of analysis pipeline. See [wiki](https://github.com/mo --dup_cost Duplication event cost (Default: 2) --los_cost Loss event cost (Default: 1) --trn_cost Transfer event cost (Default: 3) - --inflation OrthoFinder MCL inflation parameter (Default: 3.0) + --inflation OrthoFinder MCL inflation parameter (Default: 1.5) --timetree Use species tree as timetree in AnGST (Default: off) --rseed Number of random seed (Default: 0) -#### Timetree Option +- **Timetree Option** -If user set this option, input species tree must be ultrametric tree. ---timetree enable AnGST timetree option below (See [AnGST manual]() for details). -> If the branch lengths on the provided species tree represent times, -> AnGST can restrict the set of possible inferred gene transfers to -> only those between contemporaneous lineages + If user set this option, input species tree must be ultrametric tree. + --timetree enable AnGST timetree option below (See [AnGST manual]() for details). + > If the branch lengths on the provided species tree represent times, + > AnGST can restrict the set of possible inferred gene transfers to + > only those between contemporaneous lineages -#### Input Limitation +- **Input Limitation** -fasta or genbank files (--indir option) ->:warning: Following characters cannot be included in file name '_', '-', '|', '.' + fasta or genbank files (--indir option) + >:warning: Following characters cannot be included in file name '_', '-', '|', '.' -species tree file (--tree option) ->:warning: Species name in species tree must match fasta or genbank file name + species tree file (--tree option) + >:warning: Species name in species tree must match fasta or genbank file name ### Example Command @@ -190,23 +190,23 @@ Download example dataset: This dataset is identical to [example](https://github.com/moshi4/FastDTLmapper/tree/main/example) in this repository. -#### 1. Minimum test dataset +- **Minimum test dataset** -7 species, 100 CDS limited fasta dataset + 7 species, 100 CDS limited fasta dataset - FastDTLmapper -i example/minimum_dataset/fasta/ -t example/minimum_dataset/species_tree.nwk -o output_minimum + FastDTLmapper -i example/minimum_dataset/fasta/ -t example/minimum_dataset/species_tree.nwk -o output_minimum -#### 2. Mycoplasma dataset (Input Format = Fasta) +- **Mycoplasma dataset (Input Format = Fasta)** -7 Mycoplasma species, 500 ~ 1000 CDS fasta dataset + 7 Mycoplasma species, 500 ~ 1000 CDS fasta dataset - FastDTLmapper -i example/mycoplasma_dataset/fasta/ -t example/mycoplasma_dataset/species_tree.nwk -o output_mycoplasma_fasta + FastDTLmapper -i example/mycoplasma_dataset/fasta/ -t example/mycoplasma_dataset/species_tree.nwk -o output_mycoplasma_fasta -#### 3. Mycoplasma dataset (Input Format = Genbank) +- **Mycoplasma dataset (Input Format = Genbank)** -7 Mycoplasma species, 500 ~ 1000 CDS genbank dataset + 7 Mycoplasma species, 500 ~ 1000 CDS genbank dataset - FastDTLmapper -i example/mycoplasma_dataset/genbank/ -t example/mycoplasma_dataset/species_tree.nwk -o output_mycoplasma_genbank + FastDTLmapper -i example/mycoplasma_dataset/genbank/ -t example/mycoplasma_dataset/species_tree.nwk -o output_mycoplasma_genbank ## Output Contents @@ -256,6 +256,8 @@ This dataset is identical to [example](https://github.com/moshi4/FastDTLmapper/t ├── parallel_cmds/ -- Parallel run command log results └── run_config.log -- Program run config log file +See [wiki](https://github.com/moshi4/FastDTLmapper/wiki/1.2.-Output-Contents-(FastDTLmapper)) for output files details. + ## Further Analysis ### Plot Gain/Loss Map Figure diff --git a/src/fastdtlmapper/args.py b/src/fastdtlmapper/args.py index d4dd7e2..5d72128 100644 --- a/src/fastdtlmapper/args.py +++ b/src/fastdtlmapper/args.py @@ -6,6 +6,7 @@ import sys import time from dataclasses import dataclass +from enum import IntEnum, auto from pathlib import Path from typing import List, Optional, Union @@ -66,6 +67,18 @@ def unixtime_to_datestr(unixtime: float) -> str: return log_text +class RestartFrom(IntEnum): + """RestartFrom Enum Class""" + + ORTHO_FINDER = auto() + MAFFT = auto() + TRIMAL = auto() + IQTREE = auto() + TREERECS = auto() + ANGST = auto() + AGG_MAP = auto() + + def get_args(argv: Optional[List[str]] = None) -> Args: """Get arguments @@ -135,7 +148,7 @@ def get_args(argv: Optional[List[str]] = None) -> Args: default=default_trn_cost, metavar="", ) - default_inflation = 3.0 + default_inflation = 1.5 parser.add_argument( "--inflation", type=float, @@ -145,7 +158,7 @@ def get_args(argv: Optional[List[str]] = None) -> Args: ) parser.add_argument( "--timetree", - help="Use species tree as timetree", + help="Use species tree as timetree in AnGST (Default: off)", action="store_true", ) default_rseed = 0 @@ -163,8 +176,8 @@ def get_args(argv: Optional[List[str]] = None) -> Args: "--restart_from", type=int, help=argparse.SUPPRESS, - default=1, - choices=[1, 2, 3, 4, 5, 6, 7], + default=RestartFrom.ORTHO_FINDER, + choices=[rf.value for rf in RestartFrom], ) args = parser.parse_args(argv) diff --git a/src/fastdtlmapper/goea/goea.py b/src/fastdtlmapper/goea/goea.py index f5b21ff..0d2f4b5 100644 --- a/src/fastdtlmapper/goea/goea.py +++ b/src/fastdtlmapper/goea/goea.py @@ -58,41 +58,44 @@ def run(self, output_prefix: Path) -> List[Path]: def plot( self, goea_result_file: Path, - output_prefix: Path, + plot_outfile: Path, + over_or_under: str, + title: str = "", ) -> None: """Plot GOEA significant GOterms Args: goea_result_file (Path): GOEA result file path - output_prefix (Path): Output files prefix path + plot_outfile (Path): Output file path + over_or_under (str): "over" or "under" + title (str): Plot title """ - for goea_type in ("over", "under"): - # Extract goterm & pvalue - goterm2pvalue = self._extract_goterm2pvalue(goea_result_file, goea_type) - if len(goterm2pvalue) == 0: - continue + if over_or_under not in ("over", "under"): + raise ValueError("goea_type must be 'over' or 'under'") - # Plot color setting - goterm2hexcolor = {} - if self.plot_color: - # Set specified plot color - for goterm in goterm2pvalue.keys(): - goterm2hexcolor[goterm] = self.plot_color - else: - # Get hexcolor from pvalue for color plot - pvalue_abs_log10_list = [ - abs(math.log10(v)) for v in goterm2pvalue.values() - ] - pvalue_hexcolor_list = self._convert_hexcolor_gradient( - pvalue_abs_log10_list - ) - # Set yellow to red gradient plot color - for goterm, hexcolor in zip(goterm2pvalue.keys(), pvalue_hexcolor_list): - goterm2hexcolor[goterm] = hexcolor - - # Plot GOterm with gradient color - plot_outfile = Path(f"{output_prefix}_{goea_type}.{self.plot_format}") - self._color_plot(plot_outfile, goterm2hexcolor, goterm2pvalue) + # Extract goterm & pvalue + goterm2pvalue = self._extract_goterm2pvalue(goea_result_file, over_or_under) + if len(goterm2pvalue) == 0: + return + + # Plot color setting + goterm2hexcolor = {} + if self.plot_color: + # Set specified plot color + for goterm in goterm2pvalue.keys(): + goterm2hexcolor[goterm] = self.plot_color + else: + # Get hexcolor from pvalue for color plot + pvalue_abs_log10_list = [abs(math.log10(v)) for v in goterm2pvalue.values()] + pvalue_hexcolor_list = self._convert_hexcolor_gradient( + pvalue_abs_log10_list + ) + # Set yellow to red gradient plot color + for goterm, hexcolor in zip(goterm2pvalue.keys(), pvalue_hexcolor_list): + goterm2hexcolor[goterm] = hexcolor + + # Plot GOterm with gradient color + self._color_plot(plot_outfile, goterm2hexcolor, goterm2pvalue, title) def _extract_goterm2pvalue( self, @@ -181,6 +184,7 @@ def _color_plot( plot_outfile: Union[str, Path], goid2color: Dict[str, str], goid2pvalue: Dict[str, float] = {}, + title: str = "", ) -> None: """Plot GO DAG using self-defined GO color @@ -188,6 +192,7 @@ def _color_plot( plot_outfile (str): Output plot file path goid2color (Dict[str, str]): go id and hexcolor dict goid2pvalue (Dict[str, float], optional): go id and pvalue dict + title (str): Plot title """ # Get plot target GO DAG obodag = GODag(self.obo_file) @@ -214,16 +219,6 @@ def _color_plot( godag_plg_vars = GODagPltVars() godag_plg_vars.fmthdr = "{GO}" - # Plot title - title = Path(plot_outfile).with_suffix("").name.replace("_", " ") - title += " representation\n" - title += f"Top{self.plot_max_num} GOterm " - if self.use_adjusted_pvalue: - title += f"(BH adjusted P-value < {self.pvalue_thr})" - else: - title += f"(P-value < {self.pvalue_thr})" - title = f"\n{title}\n" - # Create plot obj & add plot color godagplot = GODagSmallPlot( godagsmall, abodag=obodag, GODagPltVars=godag_plg_vars, title=title diff --git a/src/fastdtlmapper/out_path.py b/src/fastdtlmapper/out_path.py index ed3e0d5..6960123 100644 --- a/src/fastdtlmapper/out_path.py +++ b/src/fastdtlmapper/out_path.py @@ -52,7 +52,7 @@ def __post_init__(self): self.result_summary_dir = self.goea_dir / "result_summary" self.result_summary_plot_dir = self.result_summary_dir / "significant_go_plot" - self.obo_file = self.goea_dir / "go-basic.obo" + self.obo_file = self.go_enrichment_dir / "go-basic.obo" self.og2go_association_file = self.go_enrichment_dir / "og2go_association.txt" self.significant_go_list_file = ( self.result_summary_dir / "significant_go_list.tsv" diff --git a/src/fastdtlmapper/scripts/FastDTLgoea.py b/src/fastdtlmapper/scripts/FastDTLgoea.py index b9642a8..8ed9485 100755 --- a/src/fastdtlmapper/scripts/FastDTLgoea.py +++ b/src/fastdtlmapper/scripts/FastDTLgoea.py @@ -224,7 +224,21 @@ def run_goatools_goea( goea_result_file_list = goea.run(output_prefix) # Plot GOEA significant GOterms for goea_result_file in goea_result_file_list: - goea.plot(goea_result_file, goea_result_file.with_suffix("")) + for over_or_under in ("over", "under"): + go_category = str(goea_result_file.with_suffix("")).split("_")[-1] + # Define title + title = f"{node_id} {gain_or_loss} {over_or_under} representation\n" + title += f"Top{plot_max_num} {go_category} GOterm " + if use_adjusted_pvalue: + title += f"(BH adjusted P-value < {pvalue_thr})" + else: + title += f"(P-value < {pvalue_thr})" + title = f"\n{title}\n" + + plot_outfile = Path( + f"{output_prefix}_{over_or_under}_{go_category}.{plot_format}" + ) + goea.plot(goea_result_file, plot_outfile, over_or_under, title) if __name__ == "__main__": diff --git a/src/fastdtlmapper/scripts/FastDTLmapper.py b/src/fastdtlmapper/scripts/FastDTLmapper.py index d8ca8a9..9526c2d 100755 --- a/src/fastdtlmapper/scripts/FastDTLmapper.py +++ b/src/fastdtlmapper/scripts/FastDTLmapper.py @@ -7,13 +7,14 @@ from typing import Dict, List, Optional from fastdtlmapper.angst import AngstEventMap, AngstTransferGene, NodeEvent -from fastdtlmapper.args import Args, get_args +from fastdtlmapper.args import Args, RestartFrom, get_args from fastdtlmapper.cmd import Cmd from fastdtlmapper.input_check import InputCheck from fastdtlmapper.out_path import OutPath from fastdtlmapper.reconcilation import Reconciliation as Rec from fastdtlmapper.setup_binpath import SetupBinpath from fastdtlmapper.util import UtilFasta, UtilGenbank, UtilTree +from fastdtlmapper.util.time import print_runtime def main(args: Optional[Args] = None): @@ -33,29 +34,27 @@ def main(args: Optional[Args] = None): # 00. Prepare analysis data format_user_tree(args.tree_file, outpath, cmd) format_user_fasta(args.indir, outpath.user_fasta_dir) - if args.restart_from <= 1: + if args.restart_from <= RestartFrom.ORTHO_FINDER: # 01. Grouping ortholog sequences using OrthoFinder orthofinder_run(outpath, cmd) - if args.restart_from <= 2: + if args.restart_from <= RestartFrom.MAFFT: # 02. Align each OG(Ortholog Group) sequences using mafft mafft_run(outpath, cmd) - if args.restart_from <= 3: + if args.restart_from <= RestartFrom.TRIMAL: # 03. Trim each OG alignment using trimal trimal_run(outpath, cmd) - if args.restart_from <= 4: + if args.restart_from <= RestartFrom.IQTREE: # 04. Reconstruct each OG gene tree using iqtree iqtree_run(outpath, cmd) - if args.restart_from <= 5: + if args.restart_from <= RestartFrom.TREERECS: # 05. Correct gene tree multifurcation using treerecs treerecs_run(outpath, cmd) - if args.restart_from <= 6: + if args.restart_from <= RestartFrom.ANGST: # 06. DTL reconciliation using AnGST angst_run(outpath, cmd) - if args.restart_from <= 7: + if args.restart_from <= RestartFrom.AGG_MAP: # 07. Aggregate and map DTL reconciliation result - group_id2all_node_event = aggregate_dtl_results(args, outpath) - output_aggregate_map_results(outpath, group_id2all_node_event) - output_aggregate_transfer_results(outpath) + aggregate_and_map(args, outpath) args.write_log(outpath.run_config_log_file) @@ -113,6 +112,7 @@ def orthofinder_run(outpath: OutPath, cmd: Cmd) -> None: shutil.copy(fasta_file, dtl_rec_group_dir) +@print_runtime def mafft_run(outpath: OutPath, cmd: Cmd) -> None: """Run mafft""" print("\n# 02. Align each OG(Ortholog Group) sequences using mafft") @@ -130,6 +130,7 @@ def mafft_run(outpath: OutPath, cmd: Cmd) -> None: ) +@print_runtime def trimal_run(outpath: OutPath, cmd: Cmd) -> None: """Run trimal""" print("\n# 03. Trim each OG alignment using trimal") @@ -153,6 +154,7 @@ def trimal_run(outpath: OutPath, cmd: Cmd) -> None: shutil.copy(aln_file, aln_trim_file) +@print_runtime def iqtree_run(outpath: OutPath, cmd: Cmd) -> None: """Run iqtree""" print("\n# 04. Reconstruct each OG gene tree using iqtree") @@ -184,6 +186,7 @@ def iqtree_run(outpath: OutPath, cmd: Cmd) -> None: ) +@print_runtime def treerecs_run(outpath: OutPath, cmd: Cmd) -> None: """Run treerecs""" print("\n# 05. Correct gene tree multifurcation using treerecs") @@ -213,6 +216,7 @@ def treerecs_run(outpath: OutPath, cmd: Cmd) -> None: UtilTree.unroot_tree(treerecs_outfile, treerecs_outfile) +@print_runtime def angst_run(outpath: OutPath, cmd: Cmd) -> None: """Run AnGST""" print("\n# 06. DTL reconciliation using AnGST") @@ -228,6 +232,14 @@ def angst_run(outpath: OutPath, cmd: Cmd) -> None: ) +@print_runtime +def aggregate_and_map(args: Args, outpath: OutPath): + """Run aggregate and map functions""" + group_id2all_node_event = aggregate_dtl_results(args, outpath) + output_aggregate_map_results(outpath, group_id2all_node_event) + output_aggregate_transfer_results(outpath) + + def aggregate_dtl_results(args: Args, outpath: OutPath) -> Dict[str, List[NodeEvent]]: """Aggregate DTL reconciliation result""" print("\n# 07. Aggregate and map DTL reconciliation result") diff --git a/src/fastdtlmapper/setup_binpath.py b/src/fastdtlmapper/setup_binpath.py index fb552f6..61ca237 100644 --- a/src/fastdtlmapper/setup_binpath.py +++ b/src/fastdtlmapper/setup_binpath.py @@ -49,7 +49,7 @@ def _add_bin_path(self, bin_path_list: List[Path]) -> None: def _bin_exists_check(self, bin_list: List[str]) -> None: """Check bin program exists""" bin_exists_check_flg = False - print("Required bin program exists check...") + print("Checking required bin programs...") for bin in bin_list: bin_path = shutil.which(bin) if bin_path: @@ -61,4 +61,4 @@ def _bin_exists_check(self, bin_list: List[str]) -> None: print("Required bin program not exist!!\n") exit(1) else: - print("All required bin programs exists.\n") + print("Check result = All OK!!\n") diff --git a/src/fastdtlmapper/util/time.py b/src/fastdtlmapper/util/time.py new file mode 100644 index 0000000..ab87fef --- /dev/null +++ b/src/fastdtlmapper/util/time.py @@ -0,0 +1,26 @@ +import datetime as dt +import functools +import time + + +def print_runtime(func): + """Print runtime decorator""" + + @functools.wraps(func) + def wrap(*args, **kargs): + start_time = time.time() + ret = func(*args, **kargs) + end_time = time.time() + + def unixtime_to_datestr(unixtime: float) -> str: + return dt.datetime.fromtimestamp(unixtime).strftime("%Y/%m/%y %H:%M:%S") + + elapsed_time_s = end_time - start_time + elapsed_time_h = elapsed_time_s / 3600 + print( + f"Finished: {unixtime_to_datestr(end_time)} " + + f"(Elapsed time = {elapsed_time_h:.3f}[h])" + ) + return ret + + return wrap diff --git a/src/fastdtlmapper/util/tree.py b/src/fastdtlmapper/util/tree.py index 62a6105..8f0a6f6 100644 --- a/src/fastdtlmapper/util/tree.py +++ b/src/fastdtlmapper/util/tree.py @@ -143,10 +143,9 @@ def multifurcate_zero_length_nodes( descendants_node_list = node.get_descendants() if len(descendants_node_list) <= 2: continue - descendants_total_dist = sum([n.dist for n in descendants_node_list]) - if descendants_total_dist == 0: + children_total_dist = sum([n.dist for n in node.children]) + if children_total_dist == 0: node.unroot() - # gene_tree.unroot() gene_tree_text_list.append(gene_tree.write(dist_formatter="%1.10f")) # Output unrooted gene trees diff --git a/tests/goea/test_goea.py b/tests/goea/test_goea.py index c763cd6..3bb700e 100644 --- a/tests/goea/test_goea.py +++ b/tests/goea/test_goea.py @@ -21,7 +21,8 @@ def test_goea_default_param_plot( output_prefix = tmp_path / "goea" goea_result_file_list = goea.run(output_prefix) for goea_result_file in goea_result_file_list: - goea.plot(goea_result_file, goea_result_file.with_suffix("")) + plot_outfile = goea_result_file.with_suffix(".png") + goea.plot(goea_result_file, plot_outfile, "over") # Check goea result file exists for goea_result_file in goea_result_file_list: assert goea_result_file.is_file() @@ -34,7 +35,7 @@ def test_goea_user_specified_param_plot( go_basic_obo_file: Path, tmp_path: Path, ): - """test GOEA default parameter plot (Only check run successfully)""" + """test GOEA user specified parameter plot (Only check run successfully)""" goea = GOEA( target_list_file=goea_target_gene_list_file, all_list_file=goea_all_gene_list_file, @@ -49,14 +50,16 @@ def test_goea_user_specified_param_plot( output_prefix = tmp_path / "goea" goea_result_file_list = goea.run(output_prefix) for goea_result_file in goea_result_file_list: - goea.plot(goea_result_file, goea_result_file.with_suffix("")) + plot_outfile = goea_result_file.with_suffix(".png") + goea.plot(goea_result_file, plot_outfile, "under", "Title") + plot_outfile.is_file() # Check goea result file exists for goea_result_file in goea_result_file_list: assert goea_result_file.is_file() @pytest.mark.skip(reason="This test depends on network status.") -def test_download_obo(tmp_path: Path) -> Path: +def test_download_obo(tmp_path: Path): """test go-basic.obo file downaload""" go_basic_obo_file = tmp_path / "go-basic.obo" GOEA.download_obo(go_basic_obo_file) diff --git a/tests/test_args.py b/tests/test_args.py index 69ac85a..44f34c3 100644 --- a/tests/test_args.py +++ b/tests/test_args.py @@ -18,7 +18,7 @@ def test_get_args_default_ok(): assert args.dup_cost == 2 assert args.los_cost == 1 assert args.trn_cost == 3 - assert args.inflation == 3.0 + assert args.inflation == 1.5 assert args.timetree is False assert args.rseed == 0 diff --git a/tests/test_out_path.py b/tests/test_out_path.py index ab75241..66f5ecf 100644 --- a/tests/test_out_path.py +++ b/tests/test_out_path.py @@ -22,9 +22,9 @@ def test_makedirs_goea_ok(tmp_path: Path): outpath = OutPath(tmp_path) # No error occur, after FastDTLmapper run OutPath(outpath.rootdir, goea_mode=True) - assert outpath.go_annotation_workdir - assert outpath.go_enrichment_dir - assert outpath.result_summary_plot_dir + assert outpath.go_annotation_workdir.is_dir() + assert outpath.go_enrichment_dir.is_dir() + assert outpath.result_summary_plot_dir.is_dir() def test_makedirs_goea_ng(tmp_path: Path):