# Copyright (c) Facebook, Inc. and its affiliates.
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import json

import torch

from egg.core import Callback, Interaction


class BestStatsTracker(Callback):
    def __init__(self):
        super().__init__()

        # TRAIN
        self.best_train_acc, self.best_train_loss, self.best_train_epoch = (
            -float("inf"),
            float("inf"),
            -1,
        )
        self.last_train_acc, self.last_train_loss, self.last_train_epoch = 0.0, 0.0, 0
        # last_train_* stats are useful for runs that end before the final epoch

    def on_epoch_end(self, _loss, logs: Interaction, epoch: int):
        if logs.aux["acc"].mean().item() > self.best_train_acc:
            self.best_train_acc = logs.aux["acc"].mean().item()
            self.best_train_epoch = epoch
            self.best_train_loss = _loss

        self.last_train_acc = logs.aux["acc"].mean().item()
        self.last_train_epoch = epoch
        self.last_train_loss = _loss

    def on_train_end(self):
        is_distributed = self.trainer.distributed_context.is_distributed
        is_leader = self.trainer.distributed_context.is_leader
        if (not is_distributed) or (is_distributed and is_leader):
            train_stats = dict(
                mode="train",
                epoch=self.best_train_epoch,
                acc=self.best_train_acc,
                loss=self.best_train_loss,
            )
            print(json.dumps(train_stats), flush=True)
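

# For reference, BestStatsTracker.on_train_end prints a single JSON line (on the
# leader process when training is distributed); an illustrative, hypothetical
# example of that line:
#   {"mode": "train", "epoch": 12, "acc": 0.93, "loss": 0.21}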


class VisionModelSaver(Callback):
    """A callback that stores vision module(s) in trainer's checkpoint_dir, if any."""

    def __init__(self):
        super().__init__()

    def save_vision_model(self, epoch=""):
        is_distributed = self.trainer.distributed_context.is_distributed
        is_leader = self.trainer.distributed_context.is_leader
        if hasattr(self.trainer, "checkpoint_path"):
            if self.trainer.checkpoint_path and (
                (not is_distributed) or (is_distributed and is_leader)
            ):
                self.trainer.checkpoint_path.mkdir(exist_ok=True, parents=True)
                if is_distributed:
                    # in distributed training the game is wrapped in
                    # DistributedDataParallel, so we unwrap it to reach the vision module
                    vision_module = self.trainer.game.module.vision_module
                else:
                    vision_module = self.trainer.game.vision_module
                torch.save(
                    vision_module.encoder.state_dict(),
                    self.trainer.checkpoint_path
                    / f"vision_module{epoch if epoch else '_final'}.pt",
                )

    def on_epoch_end(self, loss: float, logs: Interaction, epoch: int):
        self.save_vision_model(epoch=epoch)

    def on_train_end(self):
        self.save_vision_model()
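

# Usage sketch (an assumption, not part of the original training flow): an encoder
# checkpoint written by VisionModelSaver can later be restored with plain PyTorch,
# e.g.
#   state = torch.load(checkpoint_path / "vision_module_final.pt")
#   encoder.load_state_dict(state)  # `encoder` is a hypothetical module with a matching architecture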


class DistributedSamplerEpochSetter(Callback):
    """A callback that sets the right epoch of a DistributedSampler instance."""

    def __init__(self):
        super().__init__()

    def on_epoch_begin(self, epoch):
        # just being cautious here, given that non-distributed jobs probably
        # won't have a DistributedSampler set
        if self.trainer.distributed_context.is_distributed:
            self.trainer.train_data.sampler.set_epoch(epoch)
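

# Usage sketch (an assumption based on egg.core's Trainer API; the game, optimizer
# and data loader names below are hypothetical placeholders): these callbacks are
# meant to be handed to the trainer via its `callbacks` argument.
#
#   import egg.core as core
#
#   trainer = core.Trainer(
#       game=game,
#       optimizer=optimizer,
#       train_data=train_loader,
#       callbacks=[
#           BestStatsTracker(),
#           VisionModelSaver(),
#           DistributedSamplerEpochSetter(),
#       ],
#   )
#   trainer.train(n_epochs=10)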