# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.

from typing import Tuple
import math
import os
import pathlib
import pytest
import torch

import transformer_engine as te
import transformer_engine_torch as tex
from transformer_engine.pytorch.fp8 import FP8GlobalStateManager
from transformer_engine.common.recipe import Float8BlockScaling
from transformer_engine.pytorch.constants import TE_DType
from transformer_engine.pytorch.tensor.float8_blockwise_tensor import (
    Float8BlockQuantizer,
    Float8BlockwiseQTensor,
)
from references.blockwise_quantizer_reference import (
    BlockwiseQuantizerReference,
    QuantizeResult,
)
from test_float8_current_scaling_exact import (
    TestFP8RecipeLinearBase,
    TestFP8RecipeLayerNormLinearBase,
)
import logging

# Read the env variable NVTE_TEST_BLOCK_CURRENT_SCALING_EXACT_TENSOR_DUMP_DIR
# to override the default tensor dump directory.
TENSOR_DUMP_DIR = pathlib.Path(__file__).resolve().parent.parent.parent / "tensor_dumps"
tensor_dump_dir_env = os.getenv("NVTE_TEST_BLOCK_CURRENT_SCALING_EXACT_TENSOR_DUMP_DIR")
if tensor_dump_dir_env is not None:
    TENSOR_DUMP_DIR = pathlib.Path(tensor_dump_dir_env)

recipe_available, reason_for_no_recipe = FP8GlobalStateManager.is_fp8_block_scaling_available()


class GetRecipes:

    @staticmethod
    def none():
        return None

    @staticmethod
    def fp8_blockwise():
        # Return the default config.
        return Float8BlockScaling()


# FP8 blockwise current scaling
@pytest.mark.skipif(not recipe_available, reason=reason_for_no_recipe)
class TestFP8BlockScalingRecipeLinear(TestFP8RecipeLinearBase):

    @staticmethod
    def setup_class(cls) -> None:
        # Configure RNG
        seed = 1234
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)

    @pytest.mark.parametrize(
        "batch_size, hidden_size, out_size",
        [
            (16, 256, 128),
        ],
    )
    @pytest.mark.parametrize("dtype", [torch.bfloat16], ids=["bf16"])
    @pytest.mark.parametrize(
        "recipe1, recipe2",
        [
            (GetRecipes.none, GetRecipes.fp8_blockwise),
        ],
    )
    def test_fp8_current_scaling_with_linear_module(
        self,
        recipe1,
        recipe2,
        batch_size,
        hidden_size,
        out_size,
        dtype,
        use_bias=False,
    ):
        fp8_zero_tolerance_tensor_dumps_recipe2 = None
        # Check the tensor dump dir: if it exists, read files to get y, dgrad,
        # wgrad, and bgrad. If any of the four tensors is missing, the dump
        # stays None.
        tensor_map = self._check_golden_tensor_dumps(
            TENSOR_DUMP_DIR, recipe2, (batch_size, hidden_size, out_size), dtype, use_bias
        )
        if tensor_map is not None:
            fp8_zero_tolerance_tensor_dumps_recipe2 = tensor_map

        assert recipe1 == GetRecipes.none, "Only None recipe is supported for recipe1"

        self.compare_recipe(
            recipe1,
            recipe2,
            batch_size,
            hidden_size,
            out_size,
            use_bias,
            seed=torch.initial_seed(),
            dtype=dtype,
            y_error=0.5,
            dgrad_error=1,
            wgrad_error=1,
            bgrad_error=0.5,
            recipe1_golden_tensors=None,
            recipe2_golden_tensors=fp8_zero_tolerance_tensor_dumps_recipe2,
        )


@pytest.mark.skipif(not recipe_available, reason=reason_for_no_recipe)
class TestFP8BlockScalingRecipeLayerNormLinear(TestFP8RecipeLayerNormLinearBase):

    @staticmethod
    def setup_class(cls) -> None:
        # Configure RNG
        seed = 1234
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)

    @pytest.mark.parametrize(
        "batch_size, hidden_size, out_size",
        [
            (16, 256, 128),
        ],
    )
    @pytest.mark.parametrize("dtype", [torch.bfloat16], ids=["bf16"])
    @pytest.mark.parametrize(
        "recipe1, recipe2",
        [
            (GetRecipes.none, GetRecipes.fp8_blockwise),
        ],
    )
    def test_fp8_current_scaling_with_layernorm_linear_module(
        self,
        recipe1,
        recipe2,
        batch_size,
        hidden_size,
        out_size,
        dtype,
        use_bias=False,
    ):
        fp8_zero_tolerance_tensor_dumps_recipe2 = None
        # Check the tensor dump dir: if it exists, read files to get y, dgrad,
        # wgrad, and bgrad. If any of the four tensors is missing, the dump
        # stays None.
        tensor_map = self._check_golden_tensor_dumps(
            TENSOR_DUMP_DIR,
            recipe2,
            (batch_size, hidden_size, out_size),
            dtype,
            use_bias,
            "LayerNorm",
        )
        if tensor_map is not None:
            fp8_zero_tolerance_tensor_dumps_recipe2 = tensor_map

        self.compare_recipe(
            recipe1,
            recipe2,
            batch_size,
            hidden_size,
            out_size,
            use_bias,
            seed=torch.initial_seed(),
            dtype=dtype,
            y_error=0.9,
            ln_out_error=0.5,
            dgrad_error=1,
            wgrad_error=1,
            bgrad_error=0.5,
            recipe1_golden_tensors=None,
            recipe2_golden_tensors=fp8_zero_tolerance_tensor_dumps_recipe2,
        )
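

# ---------------------------------------------------------------------------
# Usage notes (a sketch, not part of the test suite).
#
# To run these tests against a pre-generated set of golden tensor dumps,
# point the override env variable at the dump directory (the path below is
# hypothetical):
#
#   NVTE_TEST_BLOCK_CURRENT_SCALING_EXACT_TENSOR_DUMP_DIR=/path/to/tensor_dumps \
#       pytest -v <this file>
#
# Outside the test harness, the recipe under test is enabled the same way as
# any other TE recipe. A minimal sketch, assuming a GPU with FP8 block
# scaling support (sizes mirror the parametrization above):
#
#   import torch
#   import transformer_engine.pytorch as te_pt
#   from transformer_engine.common.recipe import Float8BlockScaling
#
#   linear = te_pt.Linear(256, 128, bias=False).cuda()
#   x = torch.randn(16, 256, device="cuda", dtype=torch.bfloat16)
#   with te_pt.fp8_autocast(enabled=True, fp8_recipe=Float8BlockScaling()):
#       y = linear(x)
# ---------------------------------------------------------------------------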