Skip to content

Commit

Permalink
File saving to track end of evaluation runs. This helps with extensions that might wait for some evals to finish before running an op.
Browse files Browse the repository at this point in the history

PiperOrigin-RevId: 650927891
  • Loading branch information
aravindhm authored and The kauldron Authors committed Jul 11, 2024
1 parent ec5067d commit f430d87
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 0 deletions.
3 changes: 3 additions & 0 deletions kauldron/evals/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@
# Lazy imports are used here because `run_strategies` is imported from kxm and
# we do not want to trigger a full import.
with _epy.lazy_api_imports(globals()):
from kauldron.evals.eval_impl import TRAIN_COMPLETE_FILENAME
from kauldron.evals.eval_impl import EVAL_COMPLETE_FILENAME

from kauldron.evals.evaluators import CollectionKeys
from kauldron.evals.evaluators import Evaluator
from kauldron.evals.evaluators import EvaluatorBase
Expand Down
15 changes: 15 additions & 0 deletions kauldron/evals/eval_impl.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
# The XManager API does not provide a way for jobs within a work-unit to
# communicate, so files are used for communication.
TRAIN_COMPLETE_FILENAME = 'train_complete.txt'
EVAL_COMPLETE_FILENAME = 'eval_{}_complete.txt'


def continuous_eval(
Expand Down Expand Up @@ -120,11 +121,25 @@ def continuous_eval(

final_step = step

# All every_checkpoint_evals have been processed. Mark them as complete.
if trainer.workdir.exists(): # `TrainEvaluator` do not have a workdir
for ev in every_checkpoint_evals:
epath.Path(trainer.workdir).joinpath(
EVAL_COMPLETE_FILENAME.format(ev.name)
).touch()

logging.info('Running final evals...')
for ev in last_checkpoint_evals:
with tracker.catch_exception(name=ev.name, step=final_step):
aux[ev.name] = ev.evaluate(state=state, step=final_step)

# All last_checkpoint_evals have been processed. Mark them as complete.
if trainer.workdir.exists(): # `TrainEvaluator` do not have a workdir
for ev in last_checkpoint_evals:
epath.Path(trainer.workdir).joinpath(
EVAL_COMPLETE_FILENAME.format(ev.name)
).touch()

tracker.maybe_reraise()

# Return the last aux
Expand Down

0 comments on commit f430d87

Please sign in to comment.