diff --git a/benchmarks/bench_linear_float8.py b/benchmarks/bench_linear_float8.py index 8020ccc..5f8e4f9 100644 --- a/benchmarks/bench_linear_float8.py +++ b/benchmarks/bench_linear_float8.py @@ -14,11 +14,9 @@ import torch import torch.utils.benchmark as benchmark -from float8_experimental.float8_linear import TensorScalingType +from float8_experimental.float8_linear import Float8Linear, TensorScalingType from float8_experimental.float8_linear_utils import ( - get_float8_linear, linear_requires_sync, - LinearType, sync_float8_amax_and_scale_history, ) from float8_experimental.float8_tensor import ScaledMMConfig @@ -69,7 +67,6 @@ class Experiment: dtype: torch.dtype compiled: bool use_fast_accum: bool - linear_type: str scaling_repr: str # 3 Times since we are calculating forward backward @@ -98,7 +95,6 @@ def main( n_limit: Optional[int] = None, fast_accum_filter: Optional[bool] = None, shape_name_filter: Optional[str] = None, - linear_type_filter: Optional[str] = None, scaling_type_x: str = "delayed", scaling_type_w: str = "delayed", scaling_type_dL_dY: str = "delayed", @@ -123,44 +119,28 @@ def main( use_fast_accum = [fast_accum_filter] else: use_fast_accum = [True, False] - if linear_type_filter is not None: - linear_types = [linear_type_filter] - else: - linear_types = ["delayed", "dynamic"] if shape_name_filter is not None: k = shape_name_filter name_to_shapes_70b = {k: name_to_shapes_70b[k]} experiment_list: List[Experiment] = [] dtype = torch.bfloat16 - for idx, (fast_accum, (name, (K, N)), linear_type) in enumerate( - tqdm(list(product(use_fast_accum, name_to_shapes_70b.items(), linear_types))) + for idx, (fast_accum, (name, (K, N))) in enumerate( + tqdm(list(product(use_fast_accum, name_to_shapes_70b.items()))) ): if n_limit is not None and idx >= n_limit: break linear_ref = torch.nn.Linear(K, N, bias=input_bias).to( device=device, dtype=dtype ) - linear_type_enum = ( - LinearType.DELAYED if linear_type == "delayed" else LinearType.DYNAMIC - ) - if linear_type == "delayed": - linear_float8 = get_float8_linear( - linear_type_enum, - copy.deepcopy(linear_ref), - emulate=False, - scaling_type_x=scaling_type_x, - scaling_type_w=scaling_type_w, - scaling_type_dL_dY=scaling_type_dL_dY, - ) - scaling_repr = linear_float8.scaling_repr() - else: - linear_float8 = get_float8_linear( - linear_type_enum, - copy.deepcopy(linear_ref), - emulate=False, - ) - scaling_repr = None + linear_float8 = Float8Linear.from_float( + copy.deepcopy(linear_ref), + emulate=False, + scaling_type_x=scaling_type_x, + scaling_type_w=scaling_type_w, + scaling_type_dL_dY=scaling_type_dL_dY, + ) + scaling_repr = linear_float8.scaling_repr() if fast_accum: linear_float8.forward_config = ScaledMMConfig(False, True, False) @@ -172,19 +152,10 @@ def main( input_tensor = torch.randn(M, K, device=device, dtype=dtype, requires_grad=True) ref_forw_backward = lambda: linear_ref(input_tensor).sum().backward() - if linear_type_enum == LinearType.DELAYED: - - def float8_forw_backward(): - if linear_requires_sync( - linear_type_enum, scaling_type_x, scaling_type_w, scaling_type_dL_dY - ): - sync_float8_amax_and_scale_history(linear_float8) - linear_float8(input_tensor).sum().backward() - - else: - - def float8_forw_backward(): - linear_float8(input_tensor).sum().backward() + def float8_forw_backward(): + if linear_requires_sync(scaling_type_x, scaling_type_w, scaling_type_dL_dY): + sync_float8_amax_and_scale_history(linear_float8) + linear_float8(input_tensor).sum().backward() def n_times(n, fn, *args, **kwargs): def wrapper(*args, **kwargs): @@ -224,7 +195,6 @@ def wrapper(*args, **kwargs): dtype, compile, use_fast_accum=fast_accum, - linear_type=linear_type, scaling_repr=scaling_repr, ) print(experiment) @@ -237,7 +207,6 @@ def wrapper(*args, **kwargs): "M", "K", "N", - "linear_type", "scaling_repr", "ref_dtype", "compiled", @@ -257,7 +226,6 @@ def wrapper(*args, **kwargs): experiment.shape[0], experiment.shape[1], experiment.shape[2], - experiment.linear_type, experiment.scaling_repr, experiment.dtype, experiment.compiled, @@ -287,7 +255,6 @@ def wrapper(*args, **kwargs): [ "name", "shape", - "linear_type", "scaling_repr", "compiled", "use_fast_accum", @@ -311,7 +278,6 @@ def invoke_main() -> None: parser.add_argument("-n", "--n_limit", type=int, required=False) parser.add_argument("--fast_accum_filter", type=bool, required=False) parser.add_argument("--shape_name_filter", type=str, required=False) - parser.add_argument("--linear_type_filter", type=str, required=False) parser.add_argument("--scaling_type_x", type=str, required=False) parser.add_argument("--scaling_type_w", type=str, required=False) parser.add_argument("--scaling_type_dL_dY", type=str, required=False) @@ -330,7 +296,6 @@ def invoke_main() -> None: args.n_limit, args.fast_accum_filter, args.shape_name_filter, - args.linear_type_filter, **kwargs, ) diff --git a/benchmarks/bench_multi_gpu.py b/benchmarks/bench_multi_gpu.py index 8c54b3d..12a1ddb 100644 --- a/benchmarks/bench_multi_gpu.py +++ b/benchmarks/bench_multi_gpu.py @@ -14,7 +14,7 @@ import torch.multiprocessing as mp import torch.nn as nn import torch.utils.benchmark as benchmark -from float8_experimental.float8_linear import Float8Linear +from float8_experimental.float8_linear import Float8Linear, TensorScalingType from float8_experimental.float8_linear_utils import ( swap_linear_with_float8_linear, sync_float8_amax_and_scale_history, @@ -65,7 +65,13 @@ def get_model(K, N, is_fp8, base_dtype=torch.float32): modules.append(nn.ReLU()) m = nn.Sequential(*modules) if is_fp8: - swap_linear_with_float8_linear(m, Float8Linear, emulate=False) + swap_linear_with_float8_linear( + m, + emulate=False, + scaling_type_x=TensorScalingType.DELAYED, + scaling_type_w=TensorScalingType.DELAYED, + scaling_type_dL_dY=TensorScalingType.DELAYED, + ) return m diff --git a/benchmarks/profile_linear_float8.py b/benchmarks/profile_linear_float8.py index c5d7c44..1ef5478 100644 --- a/benchmarks/profile_linear_float8.py +++ b/benchmarks/profile_linear_float8.py @@ -18,11 +18,9 @@ import torch import torch.nn as nn import torch.nn.functional as F -from float8_experimental.float8_dynamic_linear import Float8DynamicLinear from float8_experimental.float8_linear import Float8Linear, TensorScalingType from float8_experimental.float8_linear_utils import ( linear_requires_sync, - LinearType, swap_linear_with_float8_linear, sync_float8_amax_and_scale_history, ) @@ -206,19 +204,25 @@ def profile_function( def main( profile_path_prefix: Path, compile: bool = True, - linear_type: str = "dynamic", - scaling_type_x: str = "delayed", - scaling_type_w: str = "delayed", - scaling_type_dL_dY: str = "delayed", + scaling_type_x: str = "dynamic", + scaling_type_w: str = "dynamic", + scaling_type_dL_dY: str = "dynamic", model_type: str = "linear", dtype_filter: str = "both", ): assert model_type in ("linear", "ln_linear", "norm_ffn_norm"), "unsupported" assert dtype_filter in ("both", "float8", "bfloat16") - print(f"Compile is set to | {compile}") - print(f"Using Linear type: | {linear_type}") - print(f"model_type is set to | {model_type}") + scaling_type_x = TensorScalingType(scaling_type_x) + scaling_type_w = TensorScalingType(scaling_type_w) + scaling_type_dL_dY = TensorScalingType(scaling_type_dL_dY) + scaling_repr = "_".join( + [s.short_str() for s in (scaling_type_x, scaling_type_w, scaling_type_dL_dY)] + ) + + print(f"Compile is set to | {compile}") + print(f"model_type is set to | {model_type}") + print(f"scaling_repr is set to | {scaling_repr}") device = "cuda" ref_dtype = torch.bfloat16 @@ -249,21 +253,14 @@ def main( m_ref = m_ref.to(device).to(ref_dtype) - linear_type = LinearType[linear_type.upper()] - linear_cls = ( - Float8Linear if linear_type is LinearType.DELAYED else Float8DynamicLinear - ) - extra_kwargs = {} - scaling_type_x = TensorScalingType(scaling_type_x) - scaling_type_w = TensorScalingType(scaling_type_w) - scaling_type_dL_dY = TensorScalingType(scaling_type_dL_dY) - if linear_type is LinearType.DELAYED: - extra_kwargs["scaling_type_x"] = scaling_type_x - extra_kwargs["scaling_type_w"] = scaling_type_w - extra_kwargs["scaling_type_dL_dY"] = scaling_type_dL_dY + extra_kwargs = { + "scaling_type_x": scaling_type_x, + "scaling_type_w": scaling_type_w, + "scaling_type_dL_dY": scaling_type_dL_dY, + } m_float8 = copy.deepcopy(m_ref) - swap_linear_with_float8_linear(m_float8, linear_cls, **extra_kwargs) + swap_linear_with_float8_linear(m_float8, **extra_kwargs) def ref_forw_backward(x): out = m_ref(x) @@ -281,9 +278,7 @@ def float8_forw_backward_wrapper(x): # inspection of the fw+bw torch.compile without the scale # syncing code # TODO(future): make this better - if linear_requires_sync( - linear_type, scaling_type_x, scaling_type_w, scaling_type_dL_dY - ): + if linear_requires_sync(scaling_type_x, scaling_type_w, scaling_type_dL_dY): with record_function("scale_amax_and_scales"): sync_amax_history(m_float8) out = float8_forw(x) @@ -345,7 +340,9 @@ def float8_forw_backward_wrapper(x): if dtype_filter != "bfloat16": # Profile Float8 Model print("profiling float8") - float8_suffix = f"_{model_type}_float8_compile_{compile}_{linear_type}.json" + float8_suffix = ( + f"_{model_type}_float8_compile_{compile}_{scaling_repr}.json" + ) float8_path = profile_path_prefix + float8_suffix profile_config = ProfileConfig( float8_path, diff --git a/float8_experimental/float8_dynamic_linear.py b/float8_experimental/float8_dynamic_utils.py similarity index 72% rename from float8_experimental/float8_dynamic_linear.py rename to float8_experimental/float8_dynamic_utils.py index 763a521..f48424c 100644 --- a/float8_experimental/float8_dynamic_linear.py +++ b/float8_experimental/float8_dynamic_utils.py @@ -53,64 +53,6 @@ def backward(ctx, gradY): return fp8_tensor, None -class Float8DynamicLinear(torch.nn.Linear): - """ - A wrapper around a `torch.nn.Linear` module which does fp8 compute. By on the fly - conversion to fp8 of the input and weight tensors. - """ - - def __init__(self, **super_kwargs): - super().__init__(**super_kwargs) - - def forward(self, input: torch.Tensor) -> torch.Tensor: - x_fp8 = cast_to_float8_e4m3_dynamic(input, self.forward_config) - if isinstance(self.weight, Float8Tensor): # cast by FSDP - w_fp8 = self.weight - else: - w_fp8 = cast_to_float8_e4m3_dynamic(self.weight, self.forward_config) - y = torch.nn.functional.linear(x_fp8, w_fp8, self.bias) - y = cast_to_float8_e5m2_dynamic_bw(y, self.backward_config) - return y - - @classmethod - def from_float(cls, mod, emulate: bool = False) -> "Float8DynamicLinear": - """ - Create an nn.Linear with fp8 compute from a regular nn.Linear - - Args: - mod (torch.nn.Linear): nn.Linear to convert - emulate (bool): whether to emulate fp8 matmul logic in float32 - """ - with torch.device("meta"): - super_kwargs = { - "in_features": mod.in_features, - "out_features": mod.out_features, - "bias": False, - } - new_mod = cls(**super_kwargs) - - new_mod.forward_config = ScaledMMConfig( - emulate=emulate, - use_fast_accum=not bool(emulate), - fp8_output=False, - pad_inner_dim=config.pad_inner_dim, - ) - new_mod.backward_config = ScaledMMConfig( - emulate=emulate, - use_fast_accum=False, - fp8_output=False, - pad_inner_dim=config.pad_inner_dim, - ) - if config.enable_fsdp_fp8_all_gather: - new_mod.weight = nn.Parameter( - WeightWithDynamicFloat8CastTensor(mod.weight, new_mod.forward_config) - ) - else: - new_mod.weight = mod.weight - new_mod.bias = mod.bias - return new_mod - - def cast_to_float8_e4m3_dynamic( inpt_tensor: torch.Tensor, mm_config: ScaledMMConfig, reduce_amax: bool = False ) -> Float8Tensor: diff --git a/float8_experimental/float8_linear.py b/float8_experimental/float8_linear.py index 90c207f..664a03a 100644 --- a/float8_experimental/float8_linear.py +++ b/float8_experimental/float8_linear.py @@ -16,7 +16,7 @@ import torch -from float8_experimental.float8_dynamic_linear import ( +from float8_experimental.float8_dynamic_utils import ( cast_to_float8_e4m3_dynamic, cast_to_float8_e5m2_dynamic_bw, WeightWithDynamicFloat8CastTensor, @@ -402,8 +402,8 @@ def forward(self, input: torch.Tensor) -> torch.Tensor: def scaling_repr(self): # add scaling settings without using too many characters - # example: "x:del,w:del,dldy:dyn" - return f"x:{self.scaling_type_x.short_str()},w:{self.scaling_type_w.short_str()},dldy:{self.scaling_type_dL_dY.short_str()}" + # example: "x_del_w_del_dldy_dyn" + return f"x_{self.scaling_type_x.short_str()}_w_{self.scaling_type_w.short_str()}_dldy_{self.scaling_type_dL_dY.short_str()}" def extra_repr(self): s = f'{super().extra_repr()}, scaling="{self.scaling_repr()}"' diff --git a/float8_experimental/float8_linear_utils.py b/float8_experimental/float8_linear_utils.py index b1a17e4..945f7a6 100644 --- a/float8_experimental/float8_linear_utils.py +++ b/float8_experimental/float8_linear_utils.py @@ -11,7 +11,6 @@ import torch import torch.distributed as dist import torch.nn as nn -from float8_experimental.float8_dynamic_linear import Float8DynamicLinear from float8_experimental.float8_linear import Float8Linear, TensorScalingType from float8_experimental.float8_utils import ( @@ -25,51 +24,13 @@ log.addHandler(logging.NullHandler()) -class LinearType(Enum): - DELAYED = auto() - DYNAMIC = auto() - - -def get_float8_linear( - linear_type: LinearType, - linear_ref: torch.nn.Linear, - emulate: bool = False, - scaling_type_x: TensorScalingType = TensorScalingType.DELAYED, - scaling_type_w: TensorScalingType = TensorScalingType.DELAYED, - scaling_type_dL_dY: TensorScalingType = TensorScalingType.DELAYED, -): - """Returns a Float8Linear module of the given type, initialized from linear_ref. - Args: - linear_type: The type of Float8Linear to return. - linear_ref: The linear module to initialize from. - emulate: Whether to emulate the fp8 matmul logic in float32. - scaling_type_x: delayed vs dynamic scaling for `x`. - scaling_type_w: delayed vs dynamic scaling for `w`. - scaling_type_dL_dY: delayed vs dynamic scaling for `dL_dY`. - """ - if linear_type is LinearType.DYNAMIC: - return Float8DynamicLinear.from_float( - copy.deepcopy(linear_ref), emulate=emulate - ) - else: - assert linear_type is LinearType.DELAYED - return Float8Linear.from_float( - copy.deepcopy(linear_ref), - emulate=emulate, - scaling_type_x=scaling_type_x, - scaling_type_w=scaling_type_w, - scaling_type_dL_dY=scaling_type_dL_dY, - ) - - def linear_requires_sync( - linear_type: LinearType, scaling_type_x: TensorScalingType = TensorScalingType.DELAYED, scaling_type_w: TensorScalingType = TensorScalingType.DELAYED, scaling_type_dL_dY: TensorScalingType = TensorScalingType.DELAYED, ): """Returns whether the given linear_type requires sync before forward.""" - return linear_type is LinearType.DELAYED and any( + return any( [ scaling_type_x is TensorScalingType.DELAYED, scaling_type_w is TensorScalingType.DELAYED, @@ -186,7 +147,6 @@ def post_order_traversal( def swap_linear_with_float8_linear( module: nn.Module, - module_cls: Union[Type[Float8Linear], Type[Float8DynamicLinear]], *, skip_fqn_list: Optional[List[str]] = None, emulate: bool = False, @@ -196,12 +156,10 @@ def swap_linear_with_float8_linear( scaling_type_dL_dY: TensorScalingType = TensorScalingType.DYNAMIC, ) -> Optional[nn.Module]: """ - Swaps `torch.nn.Linear` in `module` with `Float8Linear` or `Float8DynamicLinear`. + Swaps `torch.nn.Linear` in `module` with `Float8Linear`. Args: module: Module to modify. - module_cls: `Float8Linear` or `Float8DynamicLinear`. - from_float_func: Function that accepts a linear layer and returns a new type of linear layer. skip_fqn_list: If specified, a list of module FQNs to skip. emulate: If True, emulation is used instead of hardware accelerated gemm linear_layer_filter: If specified, only the linear layers @@ -213,16 +171,13 @@ def swap_linear_with_float8_linear( Returns: nn.Module: The modified module with swapped linear layers. """ - if module_cls is Float8DynamicLinear: - from_float = lambda m: module_cls.from_float(m, emulate=emulate) - else: - from_float = lambda m: module_cls.from_float( - m, - emulate=emulate, - scaling_type_x=scaling_type_x, - scaling_type_w=scaling_type_w, - scaling_type_dL_dY=scaling_type_dL_dY, - ) + from_float = lambda m: Float8Linear.from_float( + m, + emulate=emulate, + scaling_type_x=scaling_type_x, + scaling_type_w=scaling_type_w, + scaling_type_dL_dY=scaling_type_dL_dY, + ) return swap_linear_layers( module, from_float, diff --git a/float8_experimental/float8_tensor_parallel.py b/float8_experimental/float8_tensor_parallel.py index fac0201..7c012f6 100644 --- a/float8_experimental/float8_tensor_parallel.py +++ b/float8_experimental/float8_tensor_parallel.py @@ -1,6 +1,6 @@ import torch import torch.nn as nn -from float8_experimental.float8_dynamic_linear import ( +from float8_experimental.float8_dynamic_utils import ( cast_to_float8_e4m3_dynamic, cast_to_float8_e5m2_dynamic_bw, ) @@ -21,7 +21,6 @@ # creating the DTensor. # NOTE: This only works and tested with the dynamic scaling -# (Float8DynamicLinear and Float8Linear with dynamic scaling for all tensors) def _float8_linear_supports_float8_allgather(m): @@ -71,12 +70,11 @@ def _prepare_output_fn(output_layouts, use_local_output, mod, outputs, device_me return outputs.to_local() if use_local_output else outputs def _apply(self, module: nn.Module, device_mesh: DeviceMesh) -> nn.Module: - from float8_experimental.float8_dynamic_linear import Float8DynamicLinear from float8_experimental.float8_linear import Float8Linear - if not isinstance(module, (Float8DynamicLinear, Float8Linear)): + if not isinstance(module, Float8Linear): raise ValueError( - f"Expecting module to be Float8DynamicLinear or Float8Linear but found {type(module)}" + f"Expecting module to be Float8Linear but found {type(module)}" ) elif isinstance( module, Float8Linear @@ -122,12 +120,11 @@ def _prepare_output_fn(output_layouts, use_local_output, mod, outputs, device_me return outputs.to_local() if use_local_output else outputs def _apply(self, module: nn.Module, device_mesh: DeviceMesh) -> nn.Module: - from float8_experimental.float8_dynamic_linear import Float8DynamicLinear from float8_experimental.float8_linear import Float8Linear - if not isinstance(module, (Float8DynamicLinear, Float8Linear)): + if not isinstance(module, Float8Linear): raise ValueError( - f"Expecting module to be Float8DynamicLinear or Float8Linear but found {type(module)}" + f"Expecting module to be Float8Linear but found {type(module)}" ) elif isinstance( module, Float8Linear @@ -147,7 +144,7 @@ class PrepareFloat8ModuleInput(PrepareModuleInput): # float8_dtype (torch.dtype, optional): control what float8 dtype to cast to when prepare the module input, # we currently only support torch.float8_e4m3fn. default: torch.float8_e4m3fn # fwd_config_submodule_fqn (str, optional): the fqn of the submodule that contains the forward config used - # for the float8 cast. If not specified, we will search for the Float8DynamicLinear in the submodules + # for the float8 cast. If not specified, we will search for the Float8Linear in the submodules # and use the forward config from that module, in this case all module's forward config must be # the same. @@ -204,24 +201,23 @@ def _prepare_input_arg(self, input, mesh, input_layout, desired_layout): return input def _apply(self, module: nn.Module, device_mesh: DeviceMesh) -> nn.Module: - from float8_experimental.float8_dynamic_linear import Float8DynamicLinear from float8_experimental.float8_linear import Float8Linear fwd_linear_config = None if self.fwd_config_submodule_fqn is not None: fwd_linear = module.get_submodule(self.fwd_config_submodule_fqn) - assert isinstance(fwd_linear, (Float8DynamicLinear, Float8Linear)) + assert isinstance(fwd_linear, Float8Linear) fwd_linear_config = fwd_linear.forward_config else: # search for ScaledMM configs for all the submodules and make sure they are the same for mod in module.modules(): - if isinstance(mod, (Float8DynamicLinear, Float8Linear)): + if isinstance(mod, Float8Linear): if fwd_linear_config is None: fwd_linear_config = mod.forward_config else: assert ( fwd_linear_config == mod.forward_config - ), "All the Float8DynamicLinear and Float8Linear modules should have same forward config!" + ), "All the Float8Linear modules should have same forward config!" self.fwd_linear_config = fwd_linear_config super()._apply(module, device_mesh) diff --git a/test/test_base.py b/test/test_base.py index 754e656..b2ee071 100644 --- a/test/test_base.py +++ b/test/test_base.py @@ -3,6 +3,7 @@ # # This source code is licensed under the BSD 3-Clause license found in the # LICENSE file in the root directory of this source tree. +import copy import io import itertools import random @@ -15,13 +16,10 @@ import torch import torch.nn as nn -from float8_experimental.float8_dynamic_linear import Float8DynamicLinear from float8_experimental.float8_linear import Float8Linear, TensorScalingType from float8_experimental.float8_linear_utils import ( filter_out_small_unaligned_layers, - get_float8_linear, linear_requires_sync, - LinearType, swap_linear_with_float8_linear, sync_float8_amax_and_scale_history, ) @@ -149,24 +147,20 @@ def _test_linear_impl( self, x, m_ref, - linear_type: LinearType, emulate: bool, scaling_type_x: TensorScalingType = TensorScalingType.DELAYED, scaling_type_w: TensorScalingType = TensorScalingType.DELAYED, scaling_type_dL_dY: TensorScalingType = TensorScalingType.DELAYED, ): - m_fp8 = get_float8_linear( - linear_type, - m_ref, + m_fp8 = Float8Linear.from_float( + copy.deepcopy(m_ref), emulate, scaling_type_x, scaling_type_w, scaling_type_dL_dY, ) for _ in range(2): - if linear_requires_sync( - linear_type, scaling_type_x, scaling_type_w, scaling_type_dL_dY - ): + if linear_requires_sync(scaling_type_x, scaling_type_w, scaling_type_dL_dY): sync_float8_amax_and_scale_history(m_fp8) y_fp8 = m_fp8(x) y_fp8.sum().backward() @@ -184,9 +178,7 @@ def _test_linear_impl( torch.testing.assert_close(m_ref.bias.grad, m_fp8.bias.grad) # verify all of the amax buffers got updated - if linear_requires_sync( - linear_type, scaling_type_x, scaling_type_w, scaling_type_dL_dY - ): + if linear_requires_sync(scaling_type_x, scaling_type_w, scaling_type_dL_dY): # only check buffers that are actually used, based on per-tensor # scaling settings amax_buffer_names = [] @@ -231,7 +223,6 @@ def _test_linear_impl( @pytest.mark.parametrize("emulate", [True, False] if is_H100 else [True]) @pytest.mark.parametrize("x_shape", [(16, 16), (2, 16, 16), (3, 2, 16, 16)]) - @pytest.mark.parametrize("linear_type", [LinearType.DELAYED, LinearType.DYNAMIC]) @pytest.mark.parametrize( "scaling_type_x", [TensorScalingType.DELAYED, TensorScalingType.DYNAMIC] ) @@ -245,7 +236,6 @@ def _test_linear_impl( def test_linear_nobias( self, x_shape, - linear_type: LinearType, emulate: bool, scaling_type_x: TensorScalingType, scaling_type_w: TensorScalingType, @@ -260,25 +250,11 @@ def test_linear_nobias( f"CUDA capability {torch.cuda.get_device_capability()} < (9.0)" ) pytest.skip() - if linear_type is LinearType.DYNAMIC: - # Only test one combination of scaling types, as they are a no-op - # for Float8DynamicLinear. It would be cleaner to split into two - # tests, but IMO not worth it since Float8DynamicLinear will be - # deleted soon - is_all_dynamic = ( - scaling_type_x is TensorScalingType.DYNAMIC - and scaling_type_w is TensorScalingType.DYNAMIC - and scaling_type_dL_dY is TensorScalingType.DYNAMIC - ) - if not is_all_dynamic: - pytest.skip() - x = torch.randn(*x_shape, device="cuda") m_ref = nn.Linear(16, 32, bias=False, device="cuda") self._test_linear_impl( x, m_ref, - linear_type, emulate, scaling_type_x, scaling_type_w, @@ -287,7 +263,6 @@ def test_linear_nobias( @pytest.mark.parametrize("emulate", [True, False] if is_H100 else [True]) @pytest.mark.parametrize("x_shape", [(16, 16), (2, 16, 16), (3, 2, 16, 16)]) - @pytest.mark.parametrize("linear_type", [LinearType.DELAYED, LinearType.DYNAMIC]) @pytest.mark.parametrize( "scaling_type_x", [TensorScalingType.DELAYED, TensorScalingType.DYNAMIC] ) @@ -304,7 +279,6 @@ def test_linear_nobias( def test_linear_bias( self, x_shape, - linear_type: LinearType, scaling_type_x: TensorScalingType, scaling_type_w: TensorScalingType, scaling_type_dL_dY: TensorScalingType, @@ -320,25 +294,11 @@ def test_linear_bias( f"CUDA capability {torch.cuda.get_device_capability()} < (9.0)" ) pytest.skip() - if linear_type is LinearType.DYNAMIC: - # Only test one combination of scaling types, as they are a no-op - # for Float8DynamicLinear. It would be cleaner to split into two - # tests, but IMO not worth it since Float8DynamicLinear will be - # deleted soon - is_all_dynamic = ( - scaling_type_x is TensorScalingType.DYNAMIC - and scaling_type_w is TensorScalingType.DYNAMIC - and scaling_type_dL_dY is TensorScalingType.DYNAMIC - ) - if not is_all_dynamic: - pytest.skip() - x = torch.randn(*x_shape, device="cuda", dtype=linear_dtype) m_ref = nn.Linear(16, 32, bias=True, device="cuda", dtype=linear_dtype) self._test_linear_impl( x, m_ref, - linear_type, emulate, scaling_type_x, scaling_type_w, @@ -346,14 +306,12 @@ def test_linear_bias( ) @pytest.mark.parametrize("emulate", [True, False] if is_H100 else [True]) - @pytest.mark.parametrize("linear_type", [LinearType.DELAYED, LinearType.DYNAMIC]) @pytest.mark.parametrize( "linear_dtype", [torch.float16, torch.bfloat16, torch.float32] ) @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available") def test_autocast_outputs( self, - linear_type: LinearType, emulate: bool, linear_dtype: torch.dtype, ): @@ -368,49 +326,56 @@ def test_autocast_outputs( pytest.skip() m_ref = nn.Linear(32, 16, device="cuda", dtype=linear_dtype) - m = get_float8_linear(linear_type, m_ref, emulate) + kwargs = { + "scaling_type_x": TensorScalingType.DELAYED, + "scaling_type_w": TensorScalingType.DELAYED, + "scaling_type_dL_dY": TensorScalingType.DELAYED, + } + m = Float8Linear.from_float(copy.deepcopy(m_ref), emulate, **kwargs) # autocast off x = torch.randn(16, 32, device="cuda", dtype=linear_dtype) - if linear_requires_sync(linear_type): + if linear_requires_sync(**kwargs): sync_float8_amax_and_scale_history(m) y = m(x) assert y.dtype == linear_dtype, f"y.dtype is {y.dtype}, expected {linear_dtype}" # autocast on with torch.autocast("cuda"): - if linear_requires_sync(linear_type): + if linear_requires_sync(**kwargs): sync_float8_amax_and_scale_history(m) y = m(x) assert y.dtype == torch.half, f"y.dtype is {y.dtype}, expected {torch.half}" with torch.autocast("cuda", dtype=torch.bfloat16): - if linear_requires_sync(linear_type): + if linear_requires_sync(**kwargs): sync_float8_amax_and_scale_history(m) y = m(x) assert ( y.dtype == torch.bfloat16 ), f"y.dtype is {y.dtype}, expected {torch.bfloat16}" - @pytest.mark.parametrize("linear_type", [LinearType.DELAYED, LinearType.DYNAMIC]) @pytest.mark.parametrize( "linear_dtype", [torch.float16, torch.bfloat16, torch.float32] ) @pytest.mark.parametrize("emulate", [True, False] if is_H100 else [True]) @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available") - def test_type_cast( - self, linear_type: LinearType, linear_dtype: torch.dtype, emulate: bool - ): + def test_type_cast(self, linear_dtype: torch.dtype, emulate: bool): emulate = ( not torch.cuda.is_available() or torch.cuda.get_device_capability() < (9, 0) ) m = nn.Linear(32, 16, device="cuda", dtype=linear_dtype) - m = get_float8_linear(linear_type, m, emulate) + kwargs = { + "scaling_type_x": TensorScalingType.DYNAMIC, + "scaling_type_w": TensorScalingType.DYNAMIC, + "scaling_type_dL_dY": TensorScalingType.DYNAMIC, + } + m = Float8Linear.from_float(copy.deepcopy(m), emulate, **kwargs) # Cast the module to dtype m = m.to(dtype=linear_dtype) - if linear_requires_sync(linear_type): + if linear_requires_sync(**kwargs): # Check amax buffer types for key in [ "fp8_amax_x", @@ -429,18 +394,21 @@ def test_type_cast( # autocast off x = torch.randn(16, 32, device="cuda", dtype=linear_dtype) - sync_float8_amax_and_scale_history(m) + if linear_requires_sync(**kwargs): + sync_float8_amax_and_scale_history(m) y = m(x) assert y.dtype == linear_dtype, f"y.dtype is {y.dtype}, expected {linear_dtype}" # autocast on with torch.autocast("cuda"): - sync_float8_amax_and_scale_history(m) + if linear_requires_sync(**kwargs): + sync_float8_amax_and_scale_history(m) y = m(x) assert y.dtype == torch.half, f"y.dtype is {y.dtype}, expected {torch.half}" with torch.autocast("cuda", dtype=torch.bfloat16): - sync_float8_amax_and_scale_history(m) + if linear_requires_sync(**kwargs): + sync_float8_amax_and_scale_history(m) y = m(x) assert ( y.dtype == torch.bfloat16 @@ -448,9 +416,8 @@ def test_type_cast( def test_repr(self): m = nn.Linear(32, 16) - m = get_float8_linear( - LinearType.DELAYED, - m, + m = Float8Linear.from_float( + copy.deepcopy(m), emulate=True, scaling_type_x=TensorScalingType.DYNAMIC, scaling_type_w=TensorScalingType.DELAYED, @@ -633,26 +600,22 @@ def test_small_amax_float16(self, float8_dtype): class TestFloat8LinearUtils(unittest.TestCase): def test_swap_root_linear(self): - for module_cls, emulate in itertools.product( - [Float8Linear, Float8DynamicLinear], [True, False] - ): + for emulate in [True, False]: module = nn.Linear(3, 3) - module = swap_linear_with_float8_linear(module, module_cls, emulate=emulate) - self.assertIsInstance(module, module_cls) + module = swap_linear_with_float8_linear(module, emulate=emulate) + self.assertIsInstance(module, Float8Linear) self.assertEqual(module.forward_config.emulate, emulate) self.assertEqual(module.backward_config.emulate, emulate) def test_swap_root_linear_with_children_raises(self): - for module_cls, emulate in itertools.product( - [Float8Linear, Float8DynamicLinear], [True, False] - ): + for emulate in [True, False]: module = nn.Linear(3, 3) module.child = nn.Sequential(nn.Linear(3, 3)) with self.assertRaisesRegex( AssertionError, "Does not support a root nn.Linear with children", ): - swap_linear_with_float8_linear(module, module_cls, emulate=emulate) + swap_linear_with_float8_linear(module, emulate=emulate) def test_swap_submodule_linears(self): class MLP(nn.Module): @@ -661,16 +624,14 @@ def __init__(self, dim: int): self.lin1 = nn.Linear(dim, 4 * dim) self.lin2 = nn.Linear(4 * dim, dim) - for module_cls, emulate in itertools.product( - [Float8Linear, Float8DynamicLinear], [True, False] - ): + for emulate in [True, False]: model = nn.Sequential(MLP(3), nn.Linear(3, 3), MLP(3)) - model = swap_linear_with_float8_linear(model, module_cls, emulate=emulate) - self.assertIsInstance(model[0].lin1, module_cls) - self.assertIsInstance(model[0].lin2, module_cls) - self.assertIsInstance(model[1], module_cls) - self.assertIsInstance(model[2].lin1, module_cls) - self.assertIsInstance(model[2].lin2, module_cls) + model = swap_linear_with_float8_linear(model, emulate=emulate) + self.assertIsInstance(model[0].lin1, Float8Linear) + self.assertIsInstance(model[0].lin2, Float8Linear) + self.assertIsInstance(model[1], Float8Linear) + self.assertIsInstance(model[2].lin1, Float8Linear) + self.assertIsInstance(model[2].lin2, Float8Linear) def test_swap_linears_with_filters(self): class MLP(nn.Module): @@ -679,27 +640,24 @@ def __init__(self, dim: int): self.lin1 = nn.Linear(dim, 4 * dim) self.lin2 = nn.Linear(4 * dim, 4 * dim) - for module_cls, emulate in itertools.product( - [Float8Linear, Float8DynamicLinear], [True, False] - ): + for emulate in [True, False]: model = nn.Sequential(MLP(8), nn.Linear(32, 32), MLP(40)) # filter out the linear layers whose shape is smaller than 32 or non-divisible by 16. model = swap_linear_with_float8_linear( model, - module_cls, emulate=emulate, linear_layer_filter=filter_out_small_unaligned_layers(32), ) # in_features=8, out_features=32, 8 is less than 32. - self.assertNotIsInstance(model[0].lin1, module_cls) + self.assertNotIsInstance(model[0].lin1, Float8Linear) # in_features=32, out_features=32, - self.assertIsInstance(model[0].lin2, module_cls) + self.assertIsInstance(model[0].lin2, Float8Linear) # in_features=32, out_features=32, - self.assertIsInstance(model[1], module_cls) + self.assertIsInstance(model[1], Float8Linear) # in_features=40, out_features=160, 40 is not divisible by 16. - self.assertNotIsInstance(model[2].lin1, module_cls) + self.assertNotIsInstance(model[2].lin1, Float8Linear) # in_features=160, out_features=160, - self.assertIsInstance(model[2].lin2, module_cls) + self.assertIsInstance(model[2].lin2, Float8Linear) def test_swap_submodule_linears_with_skip(self): class MLP(nn.Module): @@ -708,20 +666,18 @@ def __init__(self, dim: int): self.lin1 = nn.Linear(dim, 4 * dim) self.lin2 = nn.Linear(4 * dim, dim) - for module_cls, emulate in itertools.product( - [Float8Linear, Float8DynamicLinear], [True, False] - ): + for emulate in [True, False]: model = nn.Sequential(MLP(3), nn.Linear(3, 3), MLP(3)) skip_fqn_list = ["2", "0.lin2"] model = swap_linear_with_float8_linear( - model, module_cls, emulate=emulate, skip_fqn_list=skip_fqn_list + model, emulate=emulate, skip_fqn_list=skip_fqn_list ) - self.assertIsInstance(model[0].lin1, module_cls) - self.assertNotIsInstance(model[0].lin2, module_cls) + self.assertIsInstance(model[0].lin1, Float8Linear) + self.assertNotIsInstance(model[0].lin2, Float8Linear) self.assertIsInstance(model[0].lin2, nn.Linear) - self.assertIsInstance(model[1], module_cls) - self.assertNotIsInstance(model[2].lin2, module_cls) - self.assertNotIsInstance(model[2].lin2, module_cls) + self.assertIsInstance(model[1], Float8Linear) + self.assertNotIsInstance(model[2].lin2, Float8Linear) + self.assertNotIsInstance(model[2].lin2, Float8Linear) self.assertIsInstance(model[2].lin1, nn.Linear) self.assertIsInstance(model[2].lin2, nn.Linear) diff --git a/test/test_compile.py b/test/test_compile.py index 834d126..5a6e003 100644 --- a/test/test_compile.py +++ b/test/test_compile.py @@ -16,8 +16,6 @@ from float8_experimental.float8_linear import Float8Linear, TensorScalingType from float8_experimental.float8_linear_utils import ( get_float8_layers, - get_float8_linear, - LinearType, swap_linear_with_float8_linear, sync_float8_amax_and_scale_history, ) @@ -34,7 +32,6 @@ def _test_compile_base( backend: str, fullgraph: bool, emulate: bool, - linear_type: LinearType, scaling_type_x, scaling_type_w, scaling_type_dL_dY, @@ -48,8 +45,12 @@ def _test_compile_base( x = torch.randn(*x_shape, device="cuda", dtype=linear_dtype) m_ref = nn.Linear(16, 32, bias=True, device="cuda", dtype=linear_dtype) - m_fp8 = get_float8_linear( - linear_type, m_ref, emulate, scaling_type_x, scaling_type_w, scaling_type_dL_dY + m_fp8 = Float8Linear.from_float( + copy.deepcopy(m_ref), + emulate, + scaling_type_x, + scaling_type_w, + scaling_type_dL_dY, ) m_fp8 = torch.compile(m_fp8, backend=backend, fullgraph=fullgraph) @@ -66,7 +67,6 @@ def _test_compile_base( @pytest.mark.parametrize("fullgraph", [True]) -@pytest.mark.parametrize("linear_type", [LinearType.DELAYED, LinearType.DYNAMIC]) @pytest.mark.parametrize( "scaling_type_x", [TensorScalingType.DELAYED, TensorScalingType.DYNAMIC] ) @@ -82,30 +82,16 @@ def _test_compile_base( def test_eager_only( fullgraph, emulate: bool, - linear_type: bool, scaling_type_x: TensorScalingType, scaling_type_w: TensorScalingType, scaling_type_dL_dY: TensorScalingType, dtype: torch.dtype, ): - if linear_type is LinearType.DYNAMIC: - # Only test one combination of scaling types, as they are a no-op - # for Float8DynamicLinear. It would be cleaner to split into two - # tests, but IMO not worth it since Float8DynamicLinear will be - # deleted soon - is_all_dynamic = ( - scaling_type_x is TensorScalingType.DYNAMIC - and scaling_type_w is TensorScalingType.DYNAMIC - and scaling_type_dL_dY is TensorScalingType.DYNAMIC - ) - if not is_all_dynamic: - pytest.skip() torch._dynamo.reset() _test_compile_base( "eager", fullgraph, emulate, - linear_type, scaling_type_x, scaling_type_w, scaling_type_dL_dY, @@ -115,7 +101,6 @@ def test_eager_only( @pytest.mark.parametrize("fullgraph", [True]) @pytest.mark.parametrize("emulate", [False, True] if is_H100 else [True]) -@pytest.mark.parametrize("linear_type", [LinearType.DELAYED, LinearType.DYNAMIC]) @pytest.mark.parametrize( "scaling_type_x", [TensorScalingType.DELAYED, TensorScalingType.DYNAMIC] ) @@ -130,30 +115,16 @@ def test_eager_only( def test_aot_eager( fullgraph, emulate: bool, - linear_type: bool, scaling_type_x: TensorScalingType, scaling_type_w: TensorScalingType, scaling_type_dL_dY: TensorScalingType, dtype: torch.dtype, ): - if linear_type is LinearType.DYNAMIC: - # Only test one combination of scaling types, as they are a no-op - # for Float8DynamicLinear. It would be cleaner to split into two - # tests, but IMO not worth it since Float8DynamicLinear will be - # deleted soon - is_all_dynamic = ( - scaling_type_x is TensorScalingType.DYNAMIC - and scaling_type_w is TensorScalingType.DYNAMIC - and scaling_type_dL_dY is TensorScalingType.DYNAMIC - ) - if not is_all_dynamic: - pytest.skip() torch._dynamo.reset() _test_compile_base( "aot_eager", fullgraph, emulate, - linear_type, scaling_type_x, scaling_type_w, scaling_type_dL_dY, @@ -163,7 +134,6 @@ def test_aot_eager( @pytest.mark.parametrize("fullgraph", [True]) @pytest.mark.parametrize("emulate", [False]) -@pytest.mark.parametrize("linear_type", [LinearType.DELAYED, LinearType.DYNAMIC]) @pytest.mark.parametrize( "scaling_type_x", [TensorScalingType.DELAYED, TensorScalingType.DYNAMIC] ) @@ -178,30 +148,16 @@ def test_aot_eager( def test_inductor( fullgraph, emulate: bool, - linear_type: bool, scaling_type_x: TensorScalingType, scaling_type_w: TensorScalingType, scaling_type_dL_dY: TensorScalingType, dtype: torch.dtype, ): - if linear_type is LinearType.DYNAMIC: - # Only test one combination of scaling types, as they are a no-op - # for Float8DynamicLinear. It would be cleaner to split into two - # tests, but IMO not worth it since Float8DynamicLinear will be - # deleted soon - is_all_dynamic = ( - scaling_type_x is TensorScalingType.DYNAMIC - and scaling_type_w is TensorScalingType.DYNAMIC - and scaling_type_dL_dY is TensorScalingType.DYNAMIC - ) - if not is_all_dynamic: - pytest.skip() torch._dynamo.reset() _test_compile_base( "inductor", fullgraph, emulate, - linear_type, scaling_type_x, scaling_type_w, scaling_type_dL_dY, @@ -301,7 +257,6 @@ def test_sync_amax_func(): ) float8_mod = swap_linear_with_float8_linear( module, - Float8Linear, scaling_type_x=TensorScalingType.DELAYED, scaling_type_w=TensorScalingType.DELAYED, scaling_type_dL_dY=TensorScalingType.DELAYED, @@ -337,7 +292,6 @@ def test_sync_amax_func_cuda_graph_success(): ).to("cuda") swap_linear_with_float8_linear( my_module, - Float8Linear, scaling_type_x=TensorScalingType.DELAYED, scaling_type_w=TensorScalingType.DELAYED, scaling_type_dL_dY=TensorScalingType.DELAYED, diff --git a/test/test_dtensor.py b/test/test_dtensor.py index 24a5e58..6506ee7 100644 --- a/test/test_dtensor.py +++ b/test/test_dtensor.py @@ -14,10 +14,7 @@ import torch.nn as nn import torch.nn.functional as F -from float8_experimental.float8_dynamic_linear import ( - Float8DynamicLinear, - NoopFwToFloat8E5M2Bw, -) +from float8_experimental.float8_dynamic_utils import NoopFwToFloat8E5M2Bw from float8_experimental.float8_linear import Float8Linear, TensorScalingType from float8_experimental.float8_linear_utils import swap_linear_with_float8_linear from float8_experimental.float8_tensor import Float8Tensor, ScaledMMConfig @@ -171,37 +168,28 @@ def test_dtensor_fp8_autograd(mesh: DeviceMesh, size=16): def _test_fp8_mlp_tensor_parallelism_base( - mesh: DeviceMesh, size=16, compile: bool = False, use_float8_linear: bool = False + mesh: DeviceMesh, size=16, compile: bool = False ): device = mesh.device_type - # TODO(future): delete Float8DynamicLinear from this test once all the - # code is unified - float8_cls = Float8Linear if use_float8_linear else Float8DynamicLinear - extra_kwargs = {} - if use_float8_linear: - # For now, just use Float8Linear with dynamic scaling, which is the - # same behavior as Float8Linear. - # TODO(future): add support for float8 all-gather with delayed scaling - # for activations and gradients. - extra_kwargs = { - "scaling_type_x": TensorScalingType.DYNAMIC, - "scaling_type_w": TensorScalingType.DYNAMIC, - "scaling_type_dL_dY": TensorScalingType.DYNAMIC, - } + # For now, just use Float8Linear with dynamic scaling, which is the + # same behavior as Float8Linear. + # TODO(future): add support for float8 all-gather with delayed scaling + # for activations and gradients. + extra_kwargs = { + "scaling_type_x": TensorScalingType.DYNAMIC, + "scaling_type_w": TensorScalingType.DYNAMIC, + "scaling_type_dL_dY": TensorScalingType.DYNAMIC, + } toy_model = ToyModel().to(device) toy_model_fp8 = swap_linear_with_float8_linear( - toy_model, float8_cls, emulate=True, **extra_kwargs + toy_model, emulate=True, **extra_kwargs ) tp_model = copy.deepcopy(toy_model) - tp_model = swap_linear_with_float8_linear( - tp_model, float8_cls, emulate=True, **extra_kwargs - ) + tp_model = swap_linear_with_float8_linear(tp_model, emulate=True, **extra_kwargs) sp_model = copy.deepcopy(toy_model) - sp_model = swap_linear_with_float8_linear( - sp_model, float8_cls, emulate=True, **extra_kwargs - ) + sp_model = swap_linear_with_float8_linear(sp_model, emulate=True, **extra_kwargs) # vanilla TP tp_model = parallelize_module( @@ -232,9 +220,7 @@ def _test_fp8_mlp_tensor_parallelism_base( # PrepareFloat8ModuleInput with specific submodule fqn sp_model2 = copy.deepcopy(toy_model) - sp_model2 = swap_linear_with_float8_linear( - sp_model2, Float8DynamicLinear, emulate=True, **extra_kwargs - ) + sp_model2 = swap_linear_with_float8_linear(sp_model2, emulate=True, **extra_kwargs) sp_model2 = parallelize_module( sp_model2, @@ -287,27 +273,11 @@ def _test_fp8_mlp_tensor_parallelism_base( def test_fp8_mlp_tensor_parallelism_eager(mesh: DeviceMesh, size=16): - _test_fp8_mlp_tensor_parallelism_base( - mesh, size, compile=False, use_float8_linear=False - ) - - -def test_fp8_mlp_tensor_parallelism_eager_float8_linear(mesh: DeviceMesh, size=16): - _test_fp8_mlp_tensor_parallelism_base( - mesh, size, compile=False, use_float8_linear=True - ) + _test_fp8_mlp_tensor_parallelism_base(mesh, size, compile=False) def test_fp8_mlp_tensor_parallelism_compile(mesh: DeviceMesh, size=16): - _test_fp8_mlp_tensor_parallelism_base( - mesh, size, compile=True, use_float8_linear=False - ) - - -def test_fp8_mlp_tensor_parallelism_compile_float8_linear(mesh: DeviceMesh, size=16): - _test_fp8_mlp_tensor_parallelism_base( - mesh, size, compile=True, use_float8_linear=True - ) + _test_fp8_mlp_tensor_parallelism_base(mesh, size, compile=True) if __name__ == "__main__": @@ -321,9 +291,7 @@ def test_fp8_mlp_tensor_parallelism_compile_float8_linear(mesh: DeviceMesh, size test_dtensor_cast_to_fp8, test_dtensor_fp8_autograd, test_fp8_mlp_tensor_parallelism_eager, - test_fp8_mlp_tensor_parallelism_eager_float8_linear, test_fp8_mlp_tensor_parallelism_compile, - test_fp8_mlp_tensor_parallelism_compile_float8_linear, ] for test in tqdm(tests, desc="Running tests"): diff --git a/test/test_fsdp.py b/test/test_fsdp.py index 031b40d..79bba19 100644 --- a/test/test_fsdp.py +++ b/test/test_fsdp.py @@ -24,7 +24,6 @@ from float8_experimental.float8_linear import Float8Linear, TensorScalingType from float8_experimental.float8_linear_utils import ( linear_requires_sync, - LinearType, swap_linear_with_float8_linear, sync_float8_amax_and_scale_history, ) @@ -84,7 +83,6 @@ def fsdp_main(rank, world_size, args): # with weights. swap_linear_with_float8_linear( model_fp8, - Float8Linear, emulate=False, scaling_type_w=scaling_type_w, ) @@ -133,7 +131,6 @@ def forward_backward(model, optim, is_fp8, i): y_local = model(ref_input_local[i]) y_local.backward(ref_grad_local[i]) if is_fp8 and linear_requires_sync( - LinearType.DELAYED, TensorScalingType.DYNAMIC, scaling_type_w, TensorScalingType.DYNAMIC, diff --git a/test/test_fsdp2/test_fsdp2_common.py b/test/test_fsdp2/test_fsdp2_common.py index 4f20fd5..c20e8cc 100644 --- a/test/test_fsdp2/test_fsdp2_common.py +++ b/test/test_fsdp2/test_fsdp2_common.py @@ -7,7 +7,6 @@ import torch.distributed as dist import torch.nn as nn from float8_experimental.float8_linear import Float8Linear -from float8_experimental.float8_linear_utils import sync_float8_amax_and_scale_history def check_parity_no_mp( @@ -17,7 +16,6 @@ def check_parity_no_mp( fsdp_model: nn.Module, fsdp_optim: torch.optim.Optimizer, local_inp: torch.Tensor, - module_cls: Type, ): for iter_idx in range(10): losses: List[torch.Tensor] = [] @@ -29,8 +27,7 @@ def check_parity_no_mp( for param in model.parameters(): dist.all_reduce(param.grad) param.grad.div_(dist.get_world_size()) - if module_cls is Float8Linear: - sync_float8_amax_and_scale_history(model) + # TODO(future): add amax syncing once delayed scaling is supported optim.step() test_cls.assertEqual(losses[0], losses[1]) @@ -43,7 +40,6 @@ def check_parity_bf16_mp( fsdp_model: nn.Module, fsdp_optim: torch.optim.Optimizer, local_inp: torch.Tensor, - module_cls: Type, ): for iter_idx in range(10): losses: List[torch.Tensor] = [] @@ -62,8 +58,7 @@ def check_parity_bf16_mp( param_bf16.grad.div_(dist.get_world_size()) param_fp32.grad = param_bf16.grad.float() param_bf16.grad = None - if module_cls is Float8Linear: - sync_float8_amax_and_scale_history(model) + # TODO(future): add amax syncing once delayed scaling is supported optim.step() for param_fp32, param_bf16 in zip( ref_model.parameters(), ref_model_bf16.parameters() diff --git a/test/test_fsdp2/test_fsdp2_eager.py b/test/test_fsdp2/test_fsdp2_eager.py index 5e4dc8f..57123cd 100644 --- a/test/test_fsdp2/test_fsdp2_eager.py +++ b/test/test_fsdp2/test_fsdp2_eager.py @@ -8,10 +8,7 @@ import torch._dynamo.testing import torch.distributed as dist import torch.nn as nn -from float8_experimental.float8_dynamic_linear import ( - Float8DynamicLinear, - WeightWithDynamicFloat8CastTensor, -) +from float8_experimental.float8_dynamic_utils import WeightWithDynamicFloat8CastTensor from float8_experimental.float8_linear import Float8Linear, TensorScalingType from float8_experimental.float8_linear_utils import swap_linear_with_float8_linear from test_fsdp2_common import ( @@ -76,16 +73,11 @@ def get_local_inp(self, dtype: torch.dtype = torch.float32): dist.broadcast(global_inp, src=0) return global_inp.view(self.world_size, -1)[self.rank].view(16, 16) - def swap_linear_with_dynamic( - self, module: nn.Module, use_float8_linear=False, **kwargs: Any - ) -> nn.Module: - if use_float8_linear: - kwargs["scaling_type_x"] = TensorScalingType.DYNAMIC - kwargs["scaling_type_w"] = TensorScalingType.DYNAMIC - kwargs["scaling_type_dL_dY"] = TensorScalingType.DYNAMIC - return swap_linear_with_float8_linear(module, Float8Linear, **kwargs) - else: - return swap_linear_with_float8_linear(module, Float8DynamicLinear, **kwargs) + def swap_linear_with_dynamic(self, module: nn.Module, **kwargs: Any) -> nn.Module: + kwargs["scaling_type_x"] = TensorScalingType.DYNAMIC + kwargs["scaling_type_w"] = TensorScalingType.DYNAMIC + kwargs["scaling_type_dL_dY"] = TensorScalingType.DYNAMIC + return swap_linear_with_float8_linear(module, **kwargs) class TestFloat8MultiProcess(FSDPTest, TestFloat8Common): @@ -95,16 +87,10 @@ def world_size(self) -> int: @skip_if_lt_x_gpu(2) def test_transformer_parity_dynamic(self): - for enable_fsdp_fp8_all_gather, use_float8_linear in itertools.product( - [False, True], [False, True] - ): - self._test_transformer_parity_dynamic( - enable_fsdp_fp8_all_gather, use_float8_linear - ) + for enable_fsdp_fp8_all_gather in [False, True]: + self._test_transformer_parity_dynamic(enable_fsdp_fp8_all_gather) - def _test_transformer_parity_dynamic( - self, enable_fsdp_fp8_all_gather: bool, use_float8_linear: bool - ): + def _test_transformer_parity_dynamic(self, enable_fsdp_fp8_all_gather: bool): # NOTE: Weight-tying does not compose with fp8 all-gather because the # embedding weight and output linear weight are tied but only the # latter uses fp8 compute. With fp8 all-gather, FSDP would pre-cast to @@ -112,9 +98,9 @@ def _test_transformer_parity_dynamic( weight_tying = not enable_fsdp_fp8_all_gather module = self.init_transformer(weight_tying=weight_tying) ref_module = copy.deepcopy(module) - ref_module = self.swap_linear_with_dynamic(ref_module, use_float8_linear).cuda() + ref_module = self.swap_linear_with_dynamic(ref_module).cuda() with set_enable_fsdp_fp8_all_gather(enable_fsdp_fp8_all_gather): - module = self.swap_linear_with_dynamic(module, use_float8_linear) + module = self.swap_linear_with_dynamic(module) for submodule in module.modules(): if isinstance(submodule, TransformerBlock): fully_shard(submodule) @@ -124,24 +110,15 @@ def _test_transformer_parity_dynamic( local_inp = torch.randint( 0, ref_module.tok_embeddings.weight.size(0), (16, 16), device="cuda" ) - # TODO(future): change Float8DynamicLinear to module_cls below, and - # ensure there is no amax syncing for all-dynamic - check_parity_no_mp( - self, ref_module, ref_optim, module, optim, local_inp, Float8DynamicLinear - ) + check_parity_no_mp(self, ref_module, ref_optim, module, optim, local_inp) @skip_if_lt_x_gpu(2) def test_transformer_memory(self): """Tests peak active memory in the forward and backward passes.""" - # for enable_fsdp_fp8_all_gather in [False, True]: - for enable_fsdp_fp8_all_gather, use_float8_linear in itertools.product( - [False, True], [False, True] - ): - self._test_transformer_memory(enable_fsdp_fp8_all_gather, use_float8_linear) - - def _test_transformer_memory( - self, enable_fsdp_fp8_all_gather: bool, use_float8_linear: bool - ): + for enable_fsdp_fp8_all_gather in [False, True]: + self._test_transformer_memory(enable_fsdp_fp8_all_gather) + + def _test_transformer_memory(self, enable_fsdp_fp8_all_gather: bool): torch.manual_seed(42) # Pre-run a linear forward (gemm and bias) and backward (gemm) to # allocate the cuBLAS workspaces before measuring the memory usage @@ -164,9 +141,7 @@ def _test_transformer_memory( # Emulate the fp8 matmul to bypass the scaled matmul op's divisibility # requirement to use a smaller activation size with set_enable_fsdp_fp8_all_gather(enable_fsdp_fp8_all_gather): - model = self.swap_linear_with_dynamic( - model, emulate=True, use_float8_linear=use_float8_linear - ) + model = self.swap_linear_with_dynamic(model, emulate=True) model_unsharded_numel = sum(p.numel() for p in model.parameters()) model_sharded_numel = (model_unsharded_numel + 1) // 2 block_lin_weight_numel = 0 @@ -267,21 +242,19 @@ class TestFloat8MultiThread(FSDPTestMultiThread, TestFloat8Common): def world_size(self) -> int: return 2 - def _test_weight_subclass_dynamic(self, use_float8_linear): - float8_cls = Float8Linear if use_float8_linear else Float8DynamicLinear - extra_kwargs = {} - if use_float8_linear: - extra_kwargs["scaling_type_x"] = TensorScalingType.DYNAMIC - extra_kwargs["scaling_type_w"] = TensorScalingType.DYNAMIC - extra_kwargs["scaling_type_dL_dY"] = TensorScalingType.DYNAMIC - pass + @unittest.skipIf(not TEST_CUDA, "no cuda") + def test_weight_subclass_dynamic(self): + extra_kwargs = { + "scaling_type_x": TensorScalingType.DYNAMIC, + "scaling_type_w": TensorScalingType.DYNAMIC, + "scaling_type_dL_dY": TensorScalingType.DYNAMIC, + } tensor_cls = WeightWithDynamicFloat8CastTensor # Check for a single FSDP paramter group module_fp32 = self.init_single_module() with set_enable_fsdp_fp8_all_gather(True): module = swap_linear_with_float8_linear( module_fp32, - float8_cls, emulate=True, **extra_kwargs, ) @@ -297,7 +270,6 @@ def _test_weight_subclass_dynamic(self, use_float8_linear): with set_enable_fsdp_fp8_all_gather(True): module = swap_linear_with_float8_linear( module, - float8_cls, emulate=True, **extra_kwargs, ) @@ -313,14 +285,7 @@ def _test_weight_subclass_dynamic(self, use_float8_linear): self.assertIsInstance(param.to_local(), tensor_cls) @unittest.skipIf(not TEST_CUDA, "no cuda") - def test_weight_subclass_float8_dynamic_linear(self): - self._test_weight_subclass_dynamic(use_float8_linear=False) - - @unittest.skipIf(not TEST_CUDA, "no cuda") - def test_weight_subclass_float8_linear(self): - self._test_weight_subclass_dynamic(use_float8_linear=True) - - def _test_fp8_fp32_all_gather_dynamic_comm_size(self, use_float8_linear): + def test_fp8_fp32_all_gather_dynamic_comm_size(self): """ Tests that fp8 all-gather with dynamic scaling communicates the expected number of bytes. @@ -354,7 +319,7 @@ def get_expected_all_gather_size(module: nn.Module): module_fp32 = self.init_single_module() ref_module = copy.deepcopy(module_fp32) with set_enable_fsdp_fp8_all_gather(True): - module = self.swap_linear_with_dynamic(module_fp32, use_float8_linear) + module = self.swap_linear_with_dynamic(module_fp32) fully_shard(module) local_inp = self.get_local_inp() expected_all_gather_size = get_expected_all_gather_size(ref_module) @@ -398,30 +363,18 @@ def get_expected_all_gather_size(module: nn.Module): [s for s in expected_all_gather_sizes for _ in range(self.world_size)], ) - @unittest.skipIf(not TEST_CUDA, "no cuda") - def test_fp8_fp32_all_gather_float8_dynamic_linear_comm_size(self): - self._test_fp8_fp32_all_gather_dynamic_comm_size(use_float8_linear=False) - - @unittest.skipIf(not TEST_CUDA, "no cuda") - def test_fp8_fp32_all_gather_float8_linear_comm_size(self): - self._test_fp8_fp32_all_gather_dynamic_comm_size(use_float8_linear=True) - @unittest.skipIf(not TEST_CUDA, "no cuda") def test_fp32_fp8_single_module_parity(self): """ Tests numeric parity for fp32 parameters with fp8 computation with a single module/FSDP communication group. """ - for enable_fsdp_fp8_all_gather, use_float8_linear in itertools.product( - [False, True], [False, True] - ): + for enable_fsdp_fp8_all_gather in [False, True]: module_fp32 = self.init_single_module() - ref_module = self.swap_linear_with_dynamic( - copy.deepcopy(module_fp32), use_float8_linear - ) + ref_module = self.swap_linear_with_dynamic(copy.deepcopy(module_fp32)) ref_module = ref_module.cuda() with set_enable_fsdp_fp8_all_gather(enable_fsdp_fp8_all_gather): - module = self.swap_linear_with_dynamic(module_fp32, use_float8_linear) + module = self.swap_linear_with_dynamic(module_fp32) fully_shard(module) ref_optim = torch.optim.Adam(ref_module.parameters(), lr=1e-2) optim = torch.optim.Adam(module.parameters(), lr=1e-2, foreach=True) @@ -433,7 +386,6 @@ def test_fp32_fp8_single_module_parity(self): module, optim, local_inp, - Float8DynamicLinear, ) @unittest.skipIf(not TEST_CUDA, "no cuda") @@ -442,16 +394,12 @@ def test_fp32_fp8_multi_module_parity(self): Tests numeric parity for fp32 parameters with fp8 computation with multiple modules/FSDP communication groups. """ - for enable_fsdp_fp8_all_gather, use_float8_linear in itertools.product( - [False, True], [False, True] - ): + for enable_fsdp_fp8_all_gather in [False, True]: module = self.init_multi_module() ref_module = copy.deepcopy(module) - ref_module = self.swap_linear_with_dynamic( - ref_module, use_float8_linear - ).cuda() + ref_module = self.swap_linear_with_dynamic(ref_module).cuda() with set_enable_fsdp_fp8_all_gather(enable_fsdp_fp8_all_gather): - module = self.swap_linear_with_dynamic(module, use_float8_linear) + module = self.swap_linear_with_dynamic(module) for submodule in module: fully_shard(submodule) fully_shard(module) @@ -465,7 +413,6 @@ def test_fp32_fp8_multi_module_parity(self): module, optim, local_inp, - Float8DynamicLinear, ) @unittest.skipIf(not TEST_CUDA, "no cuda") @@ -482,13 +429,10 @@ def test_bf16_mp_fp8_dynamic_multi_parity(self): ref_module_bf16 = copy.deepcopy(module).to(torch.bfloat16) ref_module_bf16 = swap_linear_with_float8_linear( ref_module_bf16, - Float8DynamicLinear, emulate=True, ) ref_module_fp32 = copy.deepcopy(module).cuda() - module = swap_linear_with_float8_linear( - module, Float8DynamicLinear, emulate=True - ) + module = swap_linear_with_float8_linear(module, emulate=True) mp_policy = MixedPrecisionPolicy(param_dtype=torch.bfloat16) for mlp in module: fully_shard(mlp, mp_policy=mp_policy) @@ -501,7 +445,6 @@ def test_bf16_mp_fp8_dynamic_multi_parity(self): module, torch.optim.Adam(module.parameters(), lr=1e-2, foreach=True), self.get_local_inp(torch.bfloat16), - Float8DynamicLinear, ) diff --git a/test/test_fsdp_compile.py b/test/test_fsdp_compile.py index cc44934..715db29 100644 --- a/test/test_fsdp_compile.py +++ b/test/test_fsdp_compile.py @@ -51,7 +51,6 @@ def get_model(K, N, is_fp8, emulate, base_dtype=torch.float32): ) swap_linear_with_float8_linear( m, - Float8Linear, emulate=emulate, scaling_type_x=TensorScalingType.DELAYED, scaling_type_w=TensorScalingType.DELAYED, diff --git a/test/test_inference_flows.py b/test/test_inference_flows.py index c92b752..1dd09d9 100644 --- a/test/test_inference_flows.py +++ b/test/test_inference_flows.py @@ -193,7 +193,6 @@ def test_fp8_save_and_load(self, dtype: torch.dtype): fp8_mlp.reset_parameters() swap_linear_with_float8_linear( fp8_mlp, - Float8Linear, scaling_type_x=TensorScalingType.DYNAMIC, scaling_type_w=TensorScalingType.DYNAMIC, scaling_type_dL_dY=TensorScalingType.DYNAMIC, @@ -218,7 +217,6 @@ def test_fp8_save_and_load(self, dtype: torch.dtype): new_fp8_mlp = FeedForward().to(dtype=dtype) swap_linear_with_float8_linear( new_fp8_mlp, - Float8Linear, scaling_type_x=TensorScalingType.DYNAMIC, scaling_type_w=TensorScalingType.DYNAMIC, scaling_type_dL_dY=TensorScalingType.DYNAMIC, diff --git a/test/test_numerics_integration.py b/test/test_numerics_integration.py index 1d571de..401d0fd 100644 --- a/test/test_numerics_integration.py +++ b/test/test_numerics_integration.py @@ -14,11 +14,9 @@ import torch import torch.nn as nn import torch.nn.functional as F -from float8_experimental.float8_dynamic_linear import Float8DynamicLinear from float8_experimental.float8_linear import Float8Linear, TensorScalingType from float8_experimental.float8_linear_utils import ( linear_requires_sync, - LinearType, swap_linear_with_float8_linear, sync_float8_amax_and_scale_history, ) @@ -85,32 +83,14 @@ class TestFloat8NumericsIntegrationTest: @pytest.mark.parametrize( "scaling_type_dL_dY", [TensorScalingType.DELAYED, TensorScalingType.DYNAMIC] ) - @pytest.mark.parametrize("linear_cls", [Float8Linear, Float8DynamicLinear]) @pytest.mark.skipif(not is_H100, reason="requires H100 GPU") @pytest.mark.skipif(IS_ROCM, reason="test doesn't currently work on the ROCm stack") def test_encoder_fw_bw( self, - linear_cls, scaling_type_x: TensorScalingType, scaling_type_w: TensorScalingType, scaling_type_dL_dY: TensorScalingType, ): - linear_type = ( - LinearType.DELAYED if linear_cls == Float8Linear else LinearType.DYNAMIC - ) - if linear_type is LinearType.DYNAMIC: - # Only test one combination of scaling types, as they are a no-op - # for Float8DynamicLinear. It would be cleaner to split into two - # tests, but IMO not worth it since Float8DynamicLinear will be - # deleted soon - is_all_dynamic = ( - scaling_type_x is TensorScalingType.DYNAMIC - and scaling_type_w is TensorScalingType.DYNAMIC - and scaling_type_dL_dY is TensorScalingType.DYNAMIC - ) - if not is_all_dynamic: - pytest.skip() - # TODO(later): maybe add float16 back if it becomes important data_dtype = torch.bfloat16 @@ -130,7 +110,6 @@ def test_encoder_fw_bw( model_fp8 = copy.deepcopy(model_ref) swap_linear_with_float8_linear( model_fp8, - linear_cls, emulate=False, scaling_type_x=scaling_type_x, scaling_type_w=scaling_type_w, @@ -156,17 +135,13 @@ def test_encoder_fw_bw( model_ref_out = model_ref(data2) model_ref_out.sum().backward() - if linear_requires_sync( - linear_type, scaling_type_x, scaling_type_w, scaling_type_dL_dY - ): + if linear_requires_sync(scaling_type_x, scaling_type_w, scaling_type_dL_dY): sync_float8_amax_and_scale_history(model_fp8) model_fp8(data1).sum().backward() # zero out grads without stepping, since we just want to compare grads # of the second datum optim_fp8.zero_grad() - if linear_requires_sync( - linear_type, scaling_type_x, scaling_type_w, scaling_type_dL_dY - ): + if linear_requires_sync(scaling_type_x, scaling_type_w, scaling_type_dL_dY): sync_float8_amax_and_scale_history(model_fp8) model_fp8_out = model_fp8(data2) model_fp8_out.sum().backward()