From f430d871994264764818aad35150db3658f291ee Mon Sep 17 00:00:00 2001 From: Aravindh Mahendran Date: Wed, 10 Jul 2024 02:33:58 -0700 Subject: [PATCH] File saving to track end of evaluation runs. This helps with extensions that might wait for some evals to finish before running an op. PiperOrigin-RevId: 650927891 --- kauldron/evals/__init__.py | 3 +++ kauldron/evals/eval_impl.py | 15 +++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/kauldron/evals/__init__.py b/kauldron/evals/__init__.py index be22429f..bcd702e5 100644 --- a/kauldron/evals/__init__.py +++ b/kauldron/evals/__init__.py @@ -21,6 +21,9 @@ # Lazy-import is import here as `run_strategies` is imported from kxm and # we do not want to trigger a full import. with _epy.lazy_api_imports(globals()): + from kauldron.evals.eval_impl import TRAIN_COMPLETE_FILENAME + from kauldron.evals.eval_impl import EVAL_COMPLETE_FILENAME + from kauldron.evals.evaluators import CollectionKeys from kauldron.evals.evaluators import Evaluator from kauldron.evals.evaluators import EvaluatorBase diff --git a/kauldron/evals/eval_impl.py b/kauldron/evals/eval_impl.py index 90398613..ce125c86 100644 --- a/kauldron/evals/eval_impl.py +++ b/kauldron/evals/eval_impl.py @@ -33,6 +33,7 @@ # XManager API do not have API for jobs within a work-unit to communicate, # so use files for communication. TRAIN_COMPLETE_FILENAME = 'train_complete.txt' +EVAL_COMPLETE_FILENAME = 'eval_{}_complete.txt' def continuous_eval( @@ -120,11 +121,25 @@ def continuous_eval( final_step = step + # All every_checkpoint_evals have been processed. Marks those as complete. 
+ if trainer.workdir.exists(): # `TrainEvaluator` does not have a workdir + for ev in every_checkpoint_evals: + epath.Path(trainer.workdir).joinpath( + EVAL_COMPLETE_FILENAME.format(ev.name) + ).touch() + logging.info('Running final evals...') for ev in last_checkpoint_evals: with tracker.catch_exception(name=ev.name, step=final_step): aux[ev.name] = ev.evaluate(state=state, step=final_step) + # All last_checkpoint_evals have been processed. Marks those as complete. + if trainer.workdir.exists(): # `TrainEvaluator` does not have a workdir + for ev in last_checkpoint_evals: + epath.Path(trainer.workdir).joinpath( + EVAL_COMPLETE_FILENAME.format(ev.name) + ).touch() + tracker.maybe_reraise() # Return the last aux