Add dump_task_info_table (#235)

* add dump_task_info_table * implement make task info table * Update gokart/tree/task_info.py Co-authored-by: hirosassa <[email protected]> * move task_info_formatter * fix test * fix doc Co-authored-by: hirosassa <[email protected]>
m3dev · Aug 31, 2021 · 489e2af · 489e2af
1 parent 1b900df
commit 489e2af
Show file tree

Hide file tree

Showing 6 changed files with 300 additions and 106 deletions.
diff --git a/docs/task_information.rst b/docs/task_information.rst
@@ -1,13 +1,20 @@
 Task Information
 ================
 
-Task Tree
----------
+There are 5 ways to print the significant parameters and state of the task and its dependencies.
+
+* 1. Use the luigi module. See `luigi.tools.deps_tree module <https://luigi.readthedocs.io/en/stable/api/luigi.tools.deps_tree.html>`_ for details.
+* 2. ``task-info`` option of ``gokart.run()``.
+* 3. ``make_task_info_as_tree_str()`` will return significant parameters and dependency tree as str.
+* 4. ``make_task_info_as_table()`` will return significant parameters and dependent tasks as a pandas.DataFrame table.
+* 5. ``dump_task_info_table()`` will dump the result of ``make_task_info_as_table()`` to a file.
+
+
+This document will cover 2~5.
 
-There are two ways to print the significant parameters and state of the task and its dependencies in a tree format.
-One is to use luigi module. See `luigi.tools.deps_tree module <https://luigi.readthedocs.io/en/stable/api/luigi.tools.deps_tree.html>`_ for details.
-Another is to use ``task-info`` option which is implemented in gokart.
 
+2. task-info option of gokart.run()
+--------------------------------------------
 
 On CLI
 ~~~~~~
@@ -55,12 +62,35 @@ An example output is as follows:
        └─-(COMPLETE) TaskA[2549878535c070fb6c3cd4061bdbbcff](parameter={'workspace_directory': './resources/', 'local_temporary_directory': './resources/tmp/', 'param': 'called by TaskB'}, output=['./resources/output_of_task_a_2549878535c070fb6c3cd4061bdbbcff.pkl'], time=0.0009829998016357422s, task_log={})
 
 
-On Python
-~~~~~~~~~
 
-It use :func:`~gokart.tree.task_info.make_tree_info_string` in the following:
+3. make_task_info_as_tree_str()
+-----------------------------------------
+
+``gokart.tree.task_info.make_task_info_as_tree_str()`` will return a task dependency tree as a str.
+
+.. code:: python
+
+    from gokart.tree.task_info import make_task_info_as_tree_str
+
+    make_task_info_as_tree_str(task, details=False, abbr=True, ignore_task_names=None)
+    # Parameters
+    # ----------
+    # - task: TaskOnKart
+    #     Root task.
+    # - details: bool
+    #     Whether or not to output details.
+    # - abbr: bool
+    #     Whether or not to simplify tasks information that has already appeared.
+    # - ignore_task_names: Optional[List[str]]
+    #     List of task names to ignore.
+    # Returns
+    # -------
+    # - tree_info : str
+    #     Formatted task dependency tree.
 
 
+example
+
 .. code:: python
 
     import luigi
@@ -91,9 +121,6 @@ It use :func:`~gokart.tree.task_info.make_tree_info_string` in the following:
             self.dump(','.join(task))
 
 
-The more dependencies you have, the harder it is to grasp the task tree.
-
-
 .. code:: python
 
     task = TaskD(
@@ -106,7 +133,7 @@ The more dependencies you have, the harder it is to grasp the task tree.
             task2=TaskD(task1=TaskC(task=TaskA(param='foo')), task2=TaskC(task=TaskB(task=TaskA(param='bar'))))   # same task
         )
     )
-    print(gokart.make_tree_info_string(task))
+    print(gokart.make_task_info_as_tree_str(task))
 
 
 .. code:: sh
@@ -125,12 +152,63 @@ The more dependencies you have, the harder it is to grasp the task tree.
             └─- ...
 
 
-In task dependency tree output by `make_tree_info_string`, the sub-trees already shown in above will be omitted.
-We can disable this omission by passing ``False`` to ``abbr`` flag:
+In the above example, the sub-trees already shown are omitted.
+This can be disabled by passing ``False`` to the ``abbr`` flag:
+
+.. code:: python
+
+    print(make_task_info_as_tree_str(task, abbr=False))
+
+
+4. make_task_info_as_table()
+--------------------------------
+
+``gokart.tree.task_info.make_task_info_as_table()`` will return a table containing the information of significant parameters and dependent tasks as a pandas DataFrame.
+This table contains `task name`, `cache unique id`, `cache file path`, `task parameters`, `task processing time`, `completed flag`, and `task log`.
+
+.. code:: python
+
+    from gokart.tree.task_info import make_task_info_as_table
+
+    make_task_info_as_table(task, ignore_task_names)
+    # """Return a table containing information about dependent tasks.
+    #
+    # Parameters
+    # ----------
+    # - task: TaskOnKart
+    #     Root task.
+    # - ignore_task_names: Optional[List[str]]
+    #     List of task names to ignore.
+    # Returns
+    # -------
+    # - task_info_table : pandas.DataFrame 
+    #     Formatted task dependency table.
+    # """
+
+
+5. dump_task_info_table()
+-----------------------------------------
+
+``gokart.tree.task_info.dump_task_info_table()`` will dump the task_info table made by ``make_task_info_as_table()`` to a file.
 
 .. code:: python
 
-    print(make_tree_info_string(task, abbr=False))
+    from gokart.tree.task_info import dump_task_info_table
+
+    dump_task_info_table(task, task_info_dump_path, ignore_task_names)
+    # Parameters
+    # ----------
+    # - task: TaskOnKart
+    #     Root task.
+    # - task_info_dump_path: str
+    #     Output target file path. Path destination can be `local`, `S3`, or `GCS`.
+    #     File extension can be any type that gokart file processor accepts, including `csv`, `pickle`, or `txt`.
+    #     See `TaskOnKart.make_target module <https://gokart.readthedocs.io/en/latest/task_on_kart.html#taskonkart-make-target>` for details.
+    # - ignore_task_names: Optional[List[str]]
+    #     List of task names to ignore.
+    # Returns
+    # -------
+    # None
 
 
 
@@ -170,3 +248,4 @@ the output could be like:
 Delete Unnecessary Output Files
 --------------------------------
 To delete output files which are not necessary to run a task, add option ``--delete-unnecessary-output-files``. This option is supported only when a task outputs files in local storage not S3 for now.
+
diff --git a/gokart/__init__.py b/gokart/__init__.py
@@ -5,6 +5,6 @@
 from gokart.run import run
 from gokart.task import TaskOnKart
 from gokart.testing import test_run
-from gokart.tree.task_info import make_tree_info_string
+from gokart.tree.task_info import make_task_info_as_tree_str
 from gokart.utils import add_config
 from gokart.workspace_management import delete_local_unnecessary_outputs
diff --git a/gokart/info.py b/gokart/info.py
@@ -5,7 +5,7 @@
 import luigi
 
 from gokart.task import TaskOnKart
-from gokart.tree.task_info import make_tree_info_string
+from gokart.tree.task_info import make_task_info_as_tree_str
 
 logger = getLogger(__name__)
 
@@ -20,7 +20,7 @@ def make_tree_info(task: TaskOnKart,
     """
     Return a string representation of the tasks, their statuses/parameters in a dependency tree format
 
-    This function has moved to `gokart.tree.task_info.make_tree_info_string`.
+    This function has moved to `gokart.tree.task_info.make_task_info_as_tree_str`.
     This code is remained for backward compatibility.
 
     Parameters
@@ -38,7 +38,7 @@ def make_tree_info(task: TaskOnKart,
     - tree_info : str
         Formatted task dependency tree.
     """
-    return make_tree_info_string(task=task, details=details, abbr=abbr, ignore_task_names=ignore_task_names)
+    return make_task_info_as_tree_str(task=task, details=details, abbr=abbr, ignore_task_names=ignore_task_names)
 
 
 class tree_info(TaskOnKart):

diff --git a/gokart/tree/task_info.py b/gokart/tree/task_info.py
@@ -3,89 +3,14 @@
 from typing import List, Optional, Set
 
 import luigi
+import pandas as pd
 
+from gokart.target import make_target
 from gokart.task import TaskOnKart
+from gokart.tree.task_info_formatter import make_task_info_tree, make_tree_info, make_tree_info_table_list
 
 
-@dataclass
-class TaskInfo:
-    name: str
-    unique_id: str
-    output_paths: List[TaskOnKart]
-    params: dict
-    processing_time: str
-    is_complete: str
-    task_log: dict
-    children_task_infos: List['TaskInfo']
-
-    def get_task_id(self):
-        return f'{self.name}_{self.unique_id}'
-
-    def get_task_title(self):
-        return f'({self.is_complete}) {self.name}[{self.unique_id}]'
-
-    def get_task_detail(self):
-        return f'(parameter={self.params}, output={self.output_paths}, time={self.processing_time}, task_log={self.task_log})'
-
-
-def _make_task_info_tree(task: TaskOnKart, ignore_task_names: Optional[List[str]]) -> TaskInfo:
-    with warnings.catch_warnings():
-        warnings.filterwarnings(action='ignore', message='Task .* without outputs has no custom complete() method')
-        is_task_complete = task.complete()
-
-    name = task.__class__.__name__
-    unique_id = task.make_unique_id()
-    output_paths = [t.path() for t in luigi.task.flatten(task.output())]
-    params = task.get_info(only_significant=True)
-    processing_time = task.get_processing_time()
-    if type(processing_time) == float:
-        processing_time = str(processing_time) + 's'
-    is_complete = ('COMPLETE' if is_task_complete else 'PENDING')
-    task_log = dict(task.get_task_log())
-
-    children = luigi.task.flatten(task.requires())
-    children_task_infos: List[TaskInfo] = []
-    for child in children:
-        if ignore_task_names is None or child.__class__.__name__ not in ignore_task_names:
-            children_task_infos.append(_make_task_info_tree(child, ignore_task_names=ignore_task_names))
-    return TaskInfo(name=name,
-                    unique_id=unique_id,
-                    output_paths=output_paths,
-                    params=params,
-                    processing_time=processing_time,
-                    is_complete=is_complete,
-                    task_log=task_log,
-                    children_task_infos=children_task_infos)
-
-
-def _make_tree_info(task_info: TaskInfo, indent: str, last: bool, details: bool, abbr: bool, visited_tasks: Set[str]):
-    result = '\n' + indent
-    if last:
-        result += '└─-'
-        indent += '   '
-    else:
-        result += '|--'
-        indent += '|  '
-    result += task_info.get_task_title()
-
-    if abbr:
-        task_id = task_info.get_task_id()
-        if task_id not in visited_tasks:
-            visited_tasks.add(task_id)
-        else:
-            result += f'\n{indent}└─- ...'
-            return result
-
-    if details:
-        result += task_info.get_task_detail()
-
-    children = task_info.children_task_infos
-    for index, child in enumerate(children):
-        result += _make_tree_info(child, indent, (index + 1) == len(children), details=details, abbr=abbr, visited_tasks=visited_tasks)
-    return result
-
-
-def make_tree_info_string(task: TaskOnKart, details: bool = False, abbr: bool = True, ignore_task_names: Optional[List[str]] = None):
+def make_task_info_as_tree_str(task: TaskOnKart, details: bool = False, abbr: bool = True, ignore_task_names: Optional[List[str]] = None):
     """
     Return a string representation of the tasks, their statuses/parameters in a dependency tree format
 
@@ -104,6 +29,52 @@ def make_tree_info_string(task: TaskOnKart, details: bool = False, abbr: bool =
     - tree_info : str
         Formatted task dependency tree.
     """
-    task_info = _make_task_info_tree(task, ignore_task_names=ignore_task_names)
-    result = _make_tree_info(task_info=task_info, indent='', last=True, details=details, abbr=abbr, visited_tasks=set())
+    task_info = make_task_info_tree(task, ignore_task_names=ignore_task_names)
+    result = make_tree_info(task_info=task_info, indent='', last=True, details=details, abbr=abbr, visited_tasks=set())
     return result
+
+
+def make_task_info_as_table(task: TaskOnKart, ignore_task_names: Optional[List[str]] = None) -> pd.DataFrame:
+    """Return a table containing information about dependent tasks.
+
+    Parameters
+    ----------
+    - task: TaskOnKart
+        Root task.
+    - ignore_task_names: Optional[List[str]]
+        List of task names to ignore. Defaults to None.
+    Returns
+    -------
+    - task_info_table : pandas.DataFrame
+        Formatted task dependency table.
+    """
+    # Build the task-info tree first, then flatten it into table rows.
+    # A fresh visited set is handed to the formatter on every call.
+    task_info = make_task_info_tree(task, ignore_task_names=ignore_task_names)
+    table_rows = make_tree_info_table_list(task_info=task_info, visited_tasks=set())
+    return pd.DataFrame(table_rows)
+
+
+def dump_task_info_table(task: TaskOnKart, task_info_dump_path: str, ignore_task_names: Optional[List[str]] = None) -> None:
+    """Dump a table containing information about dependent tasks.
+
+    Parameters
+    ----------
+    - task: TaskOnKart
+        Root task.
+    - task_info_dump_path: str
+        Output target file path. Path destination can be `local`, `S3`, or `GCS`.
+        File extension can be any type that gokart file processor accepts, including `csv`, `pickle`, or `txt`.
+        See `TaskOnKart.make_target module <https://gokart.readthedocs.io/en/latest/task_on_kart.html#taskonkart-make-target>`_ for details.
+    - ignore_task_names: Optional[List[str]]
+        List of task names to ignore. Defaults to None.
+    Returns
+    -------
+    None
+    """
+    task_info_table = make_task_info_as_table(task=task, ignore_task_names=ignore_task_names)
+
+    # The root task's unique id is passed to make_target together with the requested path.
+    unique_id = task.make_unique_id()
+    task_info_target = make_target(file_path=task_info_dump_path, unique_id=unique_id)
+    task_info_target.dump(obj=task_info_table, lock_at_dump=False)