diff --git a/tencentpretrain/mpu/layers.py b/tencentpretrain/mpu/layers.py index 5fe834e1..7b9f252d 100644 --- a/tencentpretrain/mpu/layers.py +++ b/tencentpretrain/mpu/layers.py @@ -36,8 +36,6 @@ from .utils import split_tensor_along_last_dim from .utils import VocabUtility #from megatron import get_args -import deepspeed.runtime.activation_checkpointing.checkpointing as ds_checkpointing - _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS = { "tensor_model_parallel": False, @@ -83,6 +81,7 @@ def maybe_copy(attribute): def _initialize_affine_weight_gpu(weight, init_method, partition_dim, stride=1): """Initialize affine weight for model parallel on GPU.""" + import deepspeed.runtime.activation_checkpointing.checkpointing as ds_checkpointing set_tensor_model_parallel_attributes( tensor=weight, is_parallel=True, dim=partition_dim, stride=stride )