# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.

from typing import Tuple
import math
import os
import pathlib
import pytest
import torch

import transformer_engine as te
import transformer_engine_torch as tex
from transformer_engine.pytorch.fp8 import FP8GlobalStateManager
from transformer_engine.common.recipe import Float8BlockScaling
from transformer_engine.pytorch.constants import TE_DType
from transformer_engine.pytorch.tensor.float8_blockwise_tensor import (
    Float8BlockQuantizer,
    Float8BlockwiseQTensor,
)
from references.blockwise_quantizer_reference import (
    BlockwiseQuantizerReference,
    QuantizeResult,
)
from test_float8_current_scaling_exact import (
    TestFP8RecipeLinearBase,
    TestFP8RecipeLayerNormLinearBase,
)
import logging

# Read the env variable NVTE_TEST_BLOCK_CURRENT_SCALING_EXACT_TENSOR_DUMP_DIR
# to override the default tensor dump directory.
TENSOR_DUMP_DIR = pathlib.Path(__file__).resolve().parent.parent.parent / "tensor_dumps"
tensor_dump_dir_env = os.getenv("NVTE_TEST_BLOCK_CURRENT_SCALING_EXACT_TENSOR_DUMP_DIR")
if tensor_dump_dir_env is not None:
    TENSOR_DUMP_DIR = pathlib.Path(tensor_dump_dir_env)

recipe_available, reason_for_no_recipe = FP8GlobalStateManager.is_fp8_block_scaling_available()


class GetRecipes:

    @staticmethod
    def none():
        return None

    @staticmethod
    def fp8_blockwise():
        # Return the default config.
        return Float8BlockScaling()


# FP8 blockwise current scaling
@pytest.mark.skipif(not recipe_available, reason=reason_for_no_recipe)
class TestFP8BlockScalingRecipeLinear(TestFP8RecipeLinearBase):

    @staticmethod
    def setup_class(cls) -> None:
        # Configure RNG
        seed = 1234
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)

    @pytest.mark.parametrize(
        "batch_size, hidden_size, out_size",
        [
            (16, 256, 128),
        ],
    )
    @pytest.mark.parametrize("dtype", [torch.bfloat16], ids=["bf16"])
    @pytest.mark.parametrize(
        "recipe1, recipe2",
        [
            (GetRecipes.none, GetRecipes.fp8_blockwise),
        ],
    )
    def test_fp8_current_scaling_with_linear_module(
        self,
        recipe1,
        recipe2,
        batch_size,
        hidden_size,
        out_size,
        dtype,
        use_bias=False,
    ):
        fp8_zero_tolerance_tensor_dumps_recipe2 = None
        # Check the tensor dump dir: if it exists, read files to get y, dgrad,
        # wgrad, and bgrad. If any of the four tensors is missing, the dump
        # stays None.
        tensor_map = self._check_golden_tensor_dumps(
            TENSOR_DUMP_DIR, recipe2, (batch_size, hidden_size, out_size), dtype, use_bias
        )
        if tensor_map is not None:
            fp8_zero_tolerance_tensor_dumps_recipe2 = tensor_map

        assert recipe1 == GetRecipes.none, "Only None recipe is supported for recipe1"

        self.compare_recipe(
            recipe1,
            recipe2,
            batch_size,
            hidden_size,
            out_size,
            use_bias,
            seed=torch.initial_seed(),
            dtype=dtype,
            y_error=0.5,
            dgrad_error=1,
            wgrad_error=1,
            bgrad_error=0.5,
            recipe1_golden_tensors=None,
            recipe2_golden_tensors=fp8_zero_tolerance_tensor_dumps_recipe2,
        )


@pytest.mark.skipif(not recipe_available, reason=reason_for_no_recipe)
class TestFP8BlockScalingRecipeLayerNormLinear(TestFP8RecipeLayerNormLinearBase):

    @staticmethod
    def setup_class(cls) -> None:
        # Configure RNG
        seed = 1234
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)

    @pytest.mark.parametrize(
        "batch_size, hidden_size, out_size",
        [
            (16, 256, 128),
        ],
    )
    @pytest.mark.parametrize("dtype", [torch.bfloat16], ids=["bf16"])
    @pytest.mark.parametrize(
        "recipe1, recipe2",
        [
            (GetRecipes.none, GetRecipes.fp8_blockwise),
        ],
    )
    def test_fp8_current_scaling_with_layernorm_linear_module(
        self,
        recipe1,
        recipe2,
        batch_size,
        hidden_size,
        out_size,
        dtype,
        use_bias=False,
    ):
        fp8_zero_tolerance_tensor_dumps_recipe2 = None
        # Check the tensor dump dir: if it exists, read files to get y, dgrad,
        # wgrad, and bgrad. If any of the four tensors is missing, the dump
        # stays None.
        tensor_map = self._check_golden_tensor_dumps(
            TENSOR_DUMP_DIR,
            recipe2,
            (batch_size, hidden_size, out_size),
            dtype,
            use_bias,
            "LayerNorm",
        )
        if tensor_map is not None:
            fp8_zero_tolerance_tensor_dumps_recipe2 = tensor_map

        self.compare_recipe(
            recipe1,
            recipe2,
            batch_size,
            hidden_size,
            out_size,
            use_bias,
            seed=torch.initial_seed(),
            dtype=dtype,
            y_error=0.9,
            ln_out_error=0.5,
            dgrad_error=1,
            wgrad_error=1,
            bgrad_error=0.5,
            recipe1_golden_tensors=None,
            recipe2_golden_tensors=fp8_zero_tolerance_tensor_dumps_recipe2,
        )
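

# ---------------------------------------------------------------------------
# Usage notes (a sketch, not part of the test suite).
#
# To run these tests against a pre-generated set of golden tensor dumps,
# point the override env variable at the dump directory (the path below is
# hypothetical):
#
#   NVTE_TEST_BLOCK_CURRENT_SCALING_EXACT_TENSOR_DUMP_DIR=/path/to/tensor_dumps \
#       pytest -v <this file>
#
# Outside the test harness, the recipe under test is enabled the same way as
# any other TE recipe. A minimal sketch, assuming a GPU with FP8 block
# scaling support (sizes mirror the parametrization above):
#
#   import torch
#   import transformer_engine.pytorch as te_pt
#   from transformer_engine.common.recipe import Float8BlockScaling
#
#   linear = te_pt.Linear(256, 128, bias=False).cuda()
#   x = torch.randn(16, 256, device="cuda", dtype=torch.bfloat16)
#   with te_pt.fp8_autocast(enabled=True, fp8_recipe=Float8BlockScaling()):
#       y = linear(x)
# ---------------------------------------------------------------------------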