mlcommons · priyakasimbeg · Feb 28, 2024 · Feb 28, 2024 · Feb 28, 2024 · Feb 28, 2024
@@ -327,7 +327,7 @@ def _eval_model_on_split(self,
                            global_step: int = 0) -> Dict[str, float]:
     """Run a full evaluation of the model."""
     del global_step
-    if model_state is not None:
+    if model_state is not None and len(model_state) > 0:
       # Sync batch statistics across replicas before evaluating.
       model_state = self.sync_batch_stats(model_state)
 

@@ -47,7 +47,8 @@ def init_model_fn(
     variables = model_init_fn({'params': params_rng, 'dropout': dropout_rng},
                               *fake_input_batch)
 
-    model_state = variables['batch_stats']
+    model_state = variables[
+        'batch_stats'] if not self.layernorm_everywhere else {}
     params = variables['params']
     self._param_shapes = param_utils.jax_param_shapes(params)
     self._param_types = param_utils.jax_param_types(self._param_shapes)

@@ -121,7 +121,7 @@ def predict_step(self,
                    max_decode_len: int,
                    beam_size: int = 4) -> spec.Tensor:
     """Predict translation with fast decoding beam search on a batch."""
-    config = models.TransformerConfig(deterministic=True, decode=True)
+    config = replace(self._eval_model.config, decode=True)
     # Prepare transformer fast-decoder call for beam search: for beam search, we
     # need to set up our decoder model to handle a batch size equal to
     # batch_size * beam_size, where each batch item's data is expanded in-place