Cleaner report on completed stages (#68)
Signed-off-by: Jeremy Fowers <[email protected]>
Co-authored-by: Jeremy Fowers <[email protected]>
Co-authored-by: Jeremy Fowers <[email protected]>
3 people authored Jan 10, 2024
1 parent b44bbfa commit f8e8c1f
Showing 12 changed files with 262 additions and 113 deletions.
77 changes: 77 additions & 0 deletions docs/release_notes.md
@@ -8,6 +8,83 @@ We are tracking two types of major changes:

If you are creating the release notes for a new version, please see the [template](#template-version-majorminorpatch). Release notes should capture all of the significant changes since the last numbered package release.

# Version 1.1.0

This version focuses on improving the clarity of the telemetry reported.

## Users

### User Improvements

- ONNX files exported from PyTorch models now have a `torch_export_validity` key in their stats/report files that indicates whether the `torch.onnx.verification.find_mismatch()` API could find any issue with the exported ONNX file. Possible values:
- `valid`: passed verification.
- `invalid`: failed verification.
- `unverified`: turnkey was unable to complete the verification process.
- Stats and report CSV files now split `stages_completed` into separate per-stage status and duration values.
- Build, benchmark, and stage status values in the stats and report files now use the same terminology:

```python
class FunctionStatus(enum.Enum):
"""
Status values that are assigned to stages, builds, benchmarks, and other
functionality to help the user understand whether that function completed
successfully or not.
"""
# SUCCESSFUL means the stage/build/benchmark completed successfully.
SUCCESSFUL = "successful"
# ERROR means the stage/build/benchmark failed and threw some error that
# was caught by turnkey. You should proceed by looking at the build
# logs to see what happened.
ERROR = "error"
# KILLED means the build/benchmark failed because the system killed it. This can
# happen because of an out-of-memory (OOM), timeout, system shutdown, etc.
# You should proceed by re-running the build and keeping an eye on it to observe
# why it is being killed (e.g., watch the RAM utilization to diagnose an OOM).
KILLED = "killed"
# The NOT_STARTED status is applied to all stages/builds/benchmarks at startup.
# It will be replaced by one of the other status values if the stage/build/benchmark
# has a chance to start running.
# A value of NOT_STARTED in the report CSV indicates that the stage/build/benchmark
# never had a chance to start because turnkey exited before that functionality had
# a chance to start running.
NOT_STARTED = "not_started"
# INCOMPLETE indicates that a stage/build/benchmark started running and did not complete.
# Each stage, build, and benchmark are marked as INCOMPLETE when they start running.
# If you open the turnkey_stats.yaml file while the stage/build/benchmark
# is still running, the status will show as INCOMPLETE. If the stage/build/benchmark
# is killed without the chance to do any stats cleanup, the status will continue to
# show as INCOMPLETE in turnkey_stats.yaml.
# When the report CSV is created, any instance of an INCOMPLETE stage/build/benchmark
# status will be replaced by KILLED.
INCOMPLETE = "incomplete"
```

- The CLI help page for the `benchmark` command has been reorganized for clarity (try `turnkey benchmark -h`).
- The CLI now provides more helpful errors when the user provides arguments incorrectly.
- Fixed a bug where multi-cache reporting could repeat entries in the report CSV file.
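A consequence of the `INCOMPLETE` semantics above is a small report-time translation: any status still reading `incomplete` when the CSV is generated means the process died before it could clean up its stats. A minimal sketch of that rule (the function name is hypothetical, not turnkey's API):

```python
def status_for_report(raw_status: str) -> str:
    """Translate a raw stats-file status into its report-CSV value.

    A status still reading "incomplete" at report time means the
    stage/build/benchmark was killed before it could update its stats,
    so the report presents it as "killed".
    """
    return "killed" if raw_status == "incomplete" else raw_status
```

For example, `status_for_report("incomplete")` returns `"killed"`, while every other value passes through unchanged.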


## User Breaking Changes

- Previous turnkey caches are not compatible with this version and must be rebuilt.
- The status terminology changes documented above mean that stats/reports from pre-v1.1.0 builds are not directly comparable to post-v1.1.0 builds.

## Developers

### Developer Improvements

None

### Developer Breaking Changes

- `build.Status` and `filesystem.FunctionStatus` have both been removed and replaced with `build.FunctionStatus`, which is the union of those two enums.

# Version 1.0.0

This version focuses on cleaning up technical debt; most of the changes are not visible to users. It removes cumbersome requirements for developers, removes unused features to streamline the codebase, and clarifies some API naming schemes.
26 changes: 20 additions & 6 deletions src/turnkeyml/analyze/script.py
@@ -88,9 +88,13 @@ def set_status_on_exception(build_state: build.State, stats: fs.Stats):
# We get `state` when the build tool succeeds, so we can use that to identify
# whether the exception was thrown during build or benchmark
if not build_state:
stats.save_model_eval_stat(fs.Keys.BUILD_STATUS, fs.FunctionStatus.FAILED)
stats.save_model_eval_stat(
fs.Keys.BUILD_STATUS, build.FunctionStatus.ERROR.value
)
else:
stats.save_model_eval_stat(fs.Keys.BENCHMARK_STATUS, fs.FunctionStatus.FAILED)
stats.save_model_eval_stat(
fs.Keys.BENCHMARK_STATUS, build.FunctionStatus.ERROR.value
)


def explore_invocation(
@@ -248,6 +252,14 @@ def explore_invocation(

return

# Initialize build and benchmark status to "not started"
stats.save_model_eval_stat(
fs.Keys.BUILD_STATUS, build.FunctionStatus.NOT_STARTED.value
)
stats.save_model_eval_stat(
fs.Keys.BENCHMARK_STATUS, build.FunctionStatus.NOT_STARTED.value
)

build_state = None
perf = None
try:
@@ -257,7 +269,9 @@
# we will try to catch the exception and note it in the stats.
# If a concluded build still has a status of "running", this means
# there was an uncaught exception.
stats.save_model_eval_stat(fs.Keys.BUILD_STATUS, fs.FunctionStatus.RUNNING)
stats.save_model_eval_stat(
fs.Keys.BUILD_STATUS, build.FunctionStatus.INCOMPLETE.value
)

build_state = build_model(
model=model_info.model,
@@ -272,7 +286,7 @@
)

stats.save_model_eval_stat(
fs.Keys.BUILD_STATUS, fs.FunctionStatus.SUCCESSFUL
fs.Keys.BUILD_STATUS, build.FunctionStatus.SUCCESSFUL.value
)

model_to_benchmark = build_state.results[0]
@@ -294,7 +308,7 @@
rt_args_to_use = tracer_args.rt_args

stats.save_model_eval_stat(
fs.Keys.BENCHMARK_STATUS, fs.FunctionStatus.RUNNING
fs.Keys.BENCHMARK_STATUS, build.FunctionStatus.INCOMPLETE.value
)

model_handle = runtime_info["RuntimeClass"](
@@ -317,7 +331,7 @@
)

stats.save_model_eval_stat(
fs.Keys.BENCHMARK_STATUS, fs.FunctionStatus.SUCCESSFUL
fs.Keys.BENCHMARK_STATUS, build.FunctionStatus.SUCCESSFUL.value
)

invocation_info.status_message = "Model successfully benchmarked!"
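The `explore_invocation()` changes above give each evaluation's build and benchmark a common status lifecycle: initialized to `not_started`, marked `incomplete` just before running, then `successful` or `error`. A standalone sketch of that lifecycle (the helper name is hypothetical, and `save_stat` stands in for turnkey's `stats.save_model_eval_stat`):

```python
def run_with_status(key, fn, save_stat):
    """Run fn(), recording its lifecycle under `key` via save_stat(key, value)."""
    save_stat(key, "not_started")  # set for every function at startup
    save_stat(key, "incomplete")   # about to run; survives an uncaught crash
    try:
        result = fn()
    except Exception:
        save_stat(key, "error")    # failure was caught by the harness
        raise
    save_stat(key, "successful")
    return result
```

Because `incomplete` is written before the function runs, a build that is killed mid-flight leaves that value behind in the stats file, which is exactly what the report-CSV step later rewrites as `killed`.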
14 changes: 6 additions & 8 deletions src/turnkeyml/build/ignition.py
@@ -157,10 +157,7 @@ def validate_cached_model(
)
if build_conditions_changed:
# Show an error if build_name is not specified for different models on the same script
if (
state.uid == build.unique_id()
and state.build_status != build.Status.PARTIAL_BUILD
):
if state.uid == build.unique_id():
msg = (
"You are building multiple different models in the same script "
"without specifying a unique build_model(..., build_name=) for each build."
@@ -201,8 +198,9 @@ def validate_cached_model(
result.append(msg)
else:
if (
state.build_status == build.Status.FAILED_BUILD
or state.build_status == build.Status.BUILD_RUNNING
state.build_status == build.FunctionStatus.ERROR
or state.build_status == build.FunctionStatus.INCOMPLETE
or state.build_status == build.FunctionStatus.KILLED
) and turnkey_version == state.turnkey_version:
msg = (
"build_model() has detected that you already attempted building "
@@ -324,7 +322,7 @@ def load_or_make_state(

if (
model_type == build.ModelType.UNKNOWN
and state.build_status == build.Status.SUCCESSFUL_BUILD
and state.build_status == build.FunctionStatus.SUCCESSFUL
):
msg = (
"Model caching is disabled for successful builds against custom Sequences. "
@@ -335,7 +333,7 @@
return _begin_fresh_build(state_args, state_type)
elif (
model_type == build.ModelType.UNKNOWN
and state.build_status == build.Status.PARTIAL_BUILD
and state.build_status == build.FunctionStatus.INCOMPLETE
):
msg = (
f"Model {config.build_name} was partially built in a previous call to "
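The `validate_cached_model()` change above broadens the "previously failed build" warning from the two old statuses (`FAILED_BUILD`, `BUILD_RUNNING`) to the three unified ones. A standalone sketch of the predicate (names hypothetical, not turnkey's API):

```python
# Statuses after which retrying the same cached build is likely to fail again
FAILED_STATUSES = {"error", "incomplete", "killed"}

def should_warn_about_retry(build_status, cached_version, current_version):
    """True when a cached build previously failed under the same turnkey
    version, so a retry deserves a warning rather than a silent rebuild."""
    return build_status in FAILED_STATUSES and cached_version == current_version
```

The version check matters: a failure recorded by an older turnkey release is not treated as predictive, so the build is simply retried without the warning.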
62 changes: 40 additions & 22 deletions src/turnkeyml/build/stage.py
@@ -83,6 +83,8 @@ def __init__(
_name_is_file_safe(unique_name)

self.unique_name = unique_name
self.status_key = f"{fs.Keys.STAGE_STATUS}:{unique_name}"
self.duration_key = f"{fs.Keys.STAGE_DURATION}:{unique_name}"
self.monitor_message = monitor_message
self.progress = None
self.logfile_path = None
@@ -107,10 +109,10 @@ def fire_helper(self, state: build.State) -> Tuple[build.State, int]:
including in the event of an exception
"""

# Set the build status to BUILD_RUNNING to indicate that a Stage
# Set the build status to INCOMPLETE to indicate that a Stage
# started running. This allows us to test whether the Stage exited
# unexpectedly, before it was able to set FAILED_BUILD
state.build_status = build.Status.BUILD_RUNNING
# unexpectedly, before it was able to set ERROR
state.build_status = build.FunctionStatus.INCOMPLETE

self.logfile_path = os.path.join(
build.output_dir(state.cache_dir, state.config.build_name),
@@ -131,18 +133,18 @@ def fire_helper(self, state: build.State) -> Tuple[build.State, int]:
successful=False,
verbosity=state.monitor,
)
state.build_status = build.Status.FAILED_BUILD
state.build_status = build.FunctionStatus.ERROR
raise

else:
self.status_line(successful=True, verbosity=state.monitor)

# Stages should not set build.Status.SUCCESSFUL_BUILD, as that is
# reserved for Sequence.launch()
if state.build_status == build.Status.SUCCESSFUL_BUILD:
# Stages should not set build.FunctionStatus.SUCCESSFUL for the whole build,
# as that is reserved for Sequence.launch()
if state.build_status == build.FunctionStatus.SUCCESSFUL:
raise exp.StageError(
"TurnkeyML Stages are not allowed to set "
"`state.build_status == build.Status.SUCCESSFUL_BUILD`, "
"`state.build_status == build.FunctionStatus.SUCCESSFUL`, "
"however that has happened. If you are a plugin developer, "
"do not do this. If you are a user, please file an issue at "
"https://github.com/onnx/turnkeyml/issues."
@@ -266,9 +268,7 @@ def launch(self, state: build.State) -> build.State:
can include both Stages and Sequences (ie, sequences can be nested).
"""

if state.build_status == build.Status.NOT_STARTED:
state.build_status = build.Status.PARTIAL_BUILD
elif state.build_status == build.Status.SUCCESSFUL_BUILD:
if state.build_status == build.FunctionStatus.SUCCESSFUL:
msg = """
build_model() is running a build on a model that already built successfully, which
should not happen because the build should have loaded from cache or rebuilt from scratch.
@@ -280,46 +280,59 @@
# Collect telemetry for the build
stats = fs.Stats(state.cache_dir, state.config.build_name, state.evaluation_id)
stats.save_model_eval_stat(
fs.Keys.ALL_BUILD_STAGES,
fs.Keys.SELECTED_SEQUENCE_OF_STAGES,
self.get_names(),
)

# At the beginning of a sequence no stage has started
for stage in self.stages:
stats.save_model_eval_stat(
stage.status_key, build.FunctionStatus.NOT_STARTED.value
)
stats.save_model_eval_stat(stage.duration_key, "-")

# Run the build
start_time = time.time()
try:
for stage in self.stages:
# Set status as incomplete, since stage just started
stats.save_model_eval_stat(
stage.status_key, build.FunctionStatus.INCOMPLETE.value
)

# Collect telemetry about the stage
state.current_build_stage = stage.unique_name
start_time = time.time()

# Run the stage
state = stage.fire_helper(state)

# Collect telemetry about the stage
execution_time = time.time() - start_time

stats.save_model_eval_sub_stat(
parent_key=fs.Keys.COMPLETED_BUILD_STAGES,
key=stage.unique_name,
value=execution_time,
# Set status as successful
stats.save_model_eval_stat(
stage.status_key, build.FunctionStatus.SUCCESSFUL.value
)

except exp.StageError as e:
# Advance the cursor below the monitor so
# we can print an error message
stage_depth_in_sequence = self.get_depth() - self.get_names().index(
stage.unique_name
stage.unique_name # pylint: disable=undefined-loop-variable
)
stdout_lines_to_advance = stage_depth_in_sequence - 2
cursor_down = "\n" * stdout_lines_to_advance

print(cursor_down)

printing.log_error(e)

stats.save_model_eval_stat(
stage.status_key, build.FunctionStatus.ERROR.value
)

raise

else:
state.current_build_stage = None
state.build_status = build.Status.SUCCESSFUL_BUILD
state.build_status = build.FunctionStatus.SUCCESSFUL

# We use a deepcopy here because the Stage framework supports
# intermediate_results of any type, including model objects in memory.
@@ -329,6 +342,11 @@

return state

finally:
# Collect telemetry about the stage
execution_time = time.time() - start_time
stats.save_model_eval_stat(stage.duration_key, execution_time)

def status_line(self, successful, verbosity):
"""
This override of status_line simply propagates status_line()
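The `Sequence.launch()` changes above record a status and a duration for every stage, using the `f"{fs.Keys.STAGE_STATUS}:{unique_name}"` key pattern and a try/except/finally so the duration lands even when a stage fails. A minimal standalone sketch under those assumptions (`save_stat` stands in for `stats.save_model_eval_stat`):

```python
import time

def run_stages(stages, save_stat):
    """Run (name, fn) pairs in order, recording per-stage status and duration.

    Key naming mirrors the f"{STAGE_STATUS}:{name}" pattern in the diff.
    """
    # At the beginning of a sequence, no stage has started
    for name, _ in stages:
        save_stat(f"stage_status:{name}", "not_started")
        save_stat(f"stage_duration:{name}", "-")

    for name, fn in stages:
        # Mark incomplete so an abrupt kill leaves evidence in the stats file
        save_stat(f"stage_status:{name}", "incomplete")
        start = time.time()
        try:
            fn()
            save_stat(f"stage_status:{name}", "successful")
        except Exception:
            save_stat(f"stage_status:{name}", "error")
            raise
        finally:
            # Duration is recorded whether the stage succeeded or not
            save_stat(f"stage_duration:{name}", time.time() - start)
```

This is the pattern that lets the report split the old `stages_completed` field into separate status and duration columns per stage.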
2 changes: 1 addition & 1 deletion src/turnkeyml/build_api.py
@@ -105,7 +105,7 @@ def build_model(

# Return a cached build if possible, otherwise prepare the model State for
# a build
if state.build_status == build.Status.SUCCESSFUL_BUILD:
if state.build_status == build.FunctionStatus.SUCCESSFUL:
# Successful builds can be loaded from cache and returned with
# no additional steps
additional_msg = " (build_name auto-selected)" if config.auto_name else ""
