diff --git a/torchx/runner/events/__init__.py b/torchx/runner/events/__init__.py
index c8eb89d96..8fab92a10 100644
--- a/torchx/runner/events/__init__.py
+++ b/torchx/runner/events/__init__.py
@@ -20,7 +20,8 @@
 
 """
 
+import json
 import logging
 import time
 import traceback
 from types import TracebackType
@@ -123,6 +124,21 @@ def __exit__(
         ) // 1000
         if traceback_type:
             self._torchx_event.raw_exception = traceback.format_exc()
+            # Record where the exception was raised: the innermost frame of
+            # the traceback passed to __exit__ (non-None in this branch).
+            last_frame = traceback.extract_tb(traceback_type)[-1]
+            self._torchx_event.exception_source_location = json.dumps(
+                {
+                    "filename": last_frame.filename,
+                    "lineno": last_frame.lineno,
+                    "name": last_frame.name,
+                }
+            )
+        # Exception class name and message come from the __exit__ arguments.
+        if exec_type:
+            self._torchx_event.exception_type = exec_type.__name__
+        if exec_value:
+            self._torchx_event.exception_message = str(exec_value)
         record(self._torchx_event)
 
     def _generate_torchx_event(
diff --git a/torchx/runner/events/api.py b/torchx/runner/events/api.py
index 355c03f6c..f03815e75 100644
--- a/torchx/runner/events/api.py
+++ b/torchx/runner/events/api.py
@@ -52,6 +52,9 @@ class TorchxEvent:
     wall_time_usec: Optional[int] = None
     start_epoch_time_usec: Optional[int] = None
     workspace: Optional[str] = None
+    exception_type: Optional[str] = None
+    exception_message: Optional[str] = None
+    exception_source_location: Optional[str] = None
 
     def __str__(self) -> str:
         return self.serialize()