From 3a14236643916ecc5112f23560874cca083f2044 Mon Sep 17 00:00:00 2001
From: Huanxing
Date: Wed, 8 May 2024 20:37:34 +0800
Subject: [PATCH] Fix wrong all_gather for mixtral finetune (#965)

Co-authored-by: ccrhx4
---
 optimum/habana/transformers/models/mixtral/modeling_mixtral.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/optimum/habana/transformers/models/mixtral/modeling_mixtral.py b/optimum/habana/transformers/models/mixtral/modeling_mixtral.py
index 3e22f49629..dd80a4e243 100644
--- a/optimum/habana/transformers/models/mixtral/modeling_mixtral.py
+++ b/optimum/habana/transformers/models/mixtral/modeling_mixtral.py
@@ -367,7 +367,7 @@ def gaudi_mixtral_block_sparse_moe_forward(self, hidden_states: torch.Tensor) ->
     # router_logits: (batch * sequence_length, n_experts)
     router_logits = self.gate(hidden_states)
 
-    if is_deepspeed_available():
+    if is_deepspeed_available() and (not self.training):
         from deepspeed import comm as dist
 
         if dist.is_initialized():
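
For context, below is a minimal sketch of the control flow this patch produces, written against the names that appear in the diff (self.gate, self.training, is_deepspeed_available, deepspeed.comm). It is not the exact optimum-habana code: the function name route_tokens and the elided gather body are illustrative assumptions. The point of the fix is that the DeepSpeed branch is now entered only at inference time, so during finetuning each rank keeps its own local router logits and gradients flow normally through self.gate.

import torch
from transformers.integrations.deepspeed import is_deepspeed_available


def route_tokens(self, hidden_states: torch.Tensor) -> torch.Tensor:
    # router_logits: (batch * sequence_length, n_experts)
    router_logits = self.gate(hidden_states)

    # After the patch, the DeepSpeed collective path is skipped while training,
    # so finetuning routes with per-rank logits; only inference takes this path.
    if is_deepspeed_available() and (not self.training):
        from deepspeed import comm as dist

        if dist.is_initialized():
            # inference-only gather of router logits across ranks
            # (details elided here; see modeling_mixtral.py for the real body)
            ...

    return router_logits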