Unverified commit 2645eaec authored by Paweł Gadziński, committed by GitHub

[Pytorch] NVIDIA-DL-Framework-Inspect support – part 3 – tests (#1612)



* tests drop
Signed-off-by: Pawel Gadzinski <pgadzinski@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci



* fix
Signed-off-by: Pawel Gadzinski <pgadzinski@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci



* move dir
Signed-off-by: Pawel Gadzinski <pgadzinski@nvidia.com>

* tests fix
Signed-off-by: Pawel Gadzinski <pgadzinski@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci



* fix
Signed-off-by: Pawel Gadzinski <pgadzinski@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci



* fix
Signed-off-by: Pawel Gadzinski <pgadzinski@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci



* fix
Signed-off-by: Pawel Gadzinski <pgadzinski@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci



---------
Signed-off-by: Pawel Gadzinski <pgadzinski@nvidia.com>
Signed-off-by: Przemek Tredak <ptredak@nvidia.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Przemek Tredak <ptredak@nvidia.com>
Co-authored-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>
parent 1d903f5e
@@ -49,7 +49,7 @@ def fake_quantize(tensor: torch.Tensor, fp8_format: tex.DType, out=None):
             fp8_dtype = tex.DType.kFloat8E5M2
         amax = tensor.abs().max().float()
         one = torch.ones(1, device=tensor.device)
-        scale = _default_sf_compute(amax, one, fp8_max)
+        scale = _default_sf_compute(amax, one, fp8_max, 0)
         quantizer = Float8Quantizer(scale, amax, fp8_dtype)
     else:
...
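The extra trailing argument above is assumed to be `_default_sf_compute`'s `margin` parameter (an exponent backoff on the computed scale). As a minimal standalone sketch of the usual delayed-scaling formula — not TE's implementation — the scale maps `amax` to the format's maximum, divided by `2**margin`:

```python
import torch

def default_sf_sketch(amax: torch.Tensor, fp8_max: float, margin: int = 0) -> torch.Tensor:
    """Illustrative per-tensor scale: amax * scale ~= fp8_max / 2**margin."""
    scale = (fp8_max / amax) / (2.0**margin)
    # Fall back to 1.0 when amax is zero or non-finite.
    return torch.where(torch.isfinite(scale), scale, torch.ones_like(scale))

print(default_sf_sketch(torch.tensor([3.5]), fp8_max=448.0))  # 448 is the E4M3 max
```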
@@ -120,7 +120,6 @@ class LogFp8TensorStats(BaseLogTensorStats):
         if not rowwise:
             return  # tensor was already seen rowwise in the other gemm
-        tensor = tensor._data
         options = (
             config.get("start_step", None),
             config.get("end_step", None),
...
@@ -15,6 +15,7 @@ import transformer_engine_torch as tex
 from transformer_engine.pytorch.tensor import Quantizer
 from transformer_engine.pytorch.tensor.float8_tensor import (
     Float8Tensor,
+    Float8Quantizer,
     Float8CurrentScalingQuantizer,
 )
 from transformer_engine.debug.features.api import TEConfigAPIMapper
@@ -39,7 +40,7 @@ def per_tensor_cast(
     }, "[NVTORCH INSPECT ERROR] Only 2 FP8 types: E4M3 and E5M2 are supported in TE."
     tensor = tensor.contiguous()
-    quantizer = Float8CurrentScalingQuantizer(fp8_dtype)
+    quantizer = Float8CurrentScalingQuantizer(fp8_dtype, device=tensor.device)
     if out is not None:
         quantizer.update_quantized(tensor, out)
@@ -118,7 +119,7 @@ class PerTensorScaling(TEConfigAPIMapper):
             if key not in ["gemm", "tensor"]:
                 raise ValueError(f'[NVTORCH INSPECT ERROR] Unexpected key in config: "{key}".')
-        assert isinstance(default_quantizer, Float8CurrentScalingQuantizer), (
+        assert isinstance(default_quantizer, Float8Quantizer), (
             f"[NVTORCH INSPECT ERROR] Feature={self.__class__.__name__}, API=process_tensor: "
             "Per-tensor current scaling can be used only within `DelayedScaling` recipe autocast."
             f" {layer_name}"
...
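The corrected assert reflects that under the `DelayedScaling` recipe the module hands this feature a `Float8Quantizer`, which the feature then replaces with a current-scaling one. A hedged sketch of that flow (reading the FP8 dtype off `default_quantizer.dtype` is an assumption here, not confirmed by the diff):

```python
from transformer_engine.pytorch.tensor.float8_tensor import (
    Float8Quantizer,
    Float8CurrentScalingQuantizer,
)

def swap_to_current_scaling(default_quantizer, tensor):
    # Only a DelayedScaling-recipe Float8Quantizer may be swapped out.
    assert isinstance(default_quantizer, Float8Quantizer)
    return Float8CurrentScalingQuantizer(default_quantizer.dtype, device=tensor.device)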
@@ -96,7 +96,10 @@ STATS = {
     "max": (torch.max, lambda buffers: max(_get(buffers, "max"))),
     "sum": (torch.sum, lambda buffers: sum(_get(buffers, "sum"))),
     "mean": (torch.mean, lambda buffers: sum(_get(buffers, "sum")) / sum(_get(buffers, "numel"))),
-    "numel": (lambda x: x.numel(), lambda buffers: sum(_get(buffers, "numel"))),
+    "numel": (
+        lambda x: x.numel() if hasattr(x, "numel") else x.get_data_tensors()[0].numel(),
+        lambda buffers: sum(_get(buffers, "numel")),
+    ),
     "l1_norm": (lambda x: torch.norm(x, p=1), lambda buffers: sum(_get(buffers, "l1_norm"))),
     "l2_norm_square": (
         lambda x: torch.sum(x**2),
@@ -137,7 +140,7 @@ STATS = {
         - min(_get(buffers, "dynamic_range_bottom")),
     ),
     "underflows%": (
-        lambda x: (x == 0).sum() / x.numel() * 100,
+        lambda x: (x.get_data_tensors()[0] == 0).sum() / x.get_data_tensors()[0].numel() * 100,
         lambda buffers: 100 * sum(_get(buffers, "underflows_num")) / sum(_get(buffers, "numel")),
     ),
 }
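Both `numel` and `underflows%` now handle quantized tensors that lack a plain elementwise interface by reading the raw data through `get_data_tensors()[0]`. The two-level shape of each `STATS` entry — a per-shard stat plus a reducer over the collected per-shard buffers — can be sketched standalone as below; `_get(buffers, name)` is inlined as a plain list, so this is illustrative rather than the module's code:

```python
import torch

# Each entry: (per-shard stat, reducer over the per-shard buffers).
SKETCH = {
    "numel": (lambda x: x.numel(), lambda buffers: sum(buffers)),
    "mean": (
        lambda x: (x.sum().item(), x.numel()),
        lambda buffers: sum(s for s, _ in buffers) / sum(n for _, n in buffers),
    ),
}

shards = [torch.ones(4), torch.full((2,), 4.0)]
per_shard = [SKETCH["mean"][0](s) for s in shards]
print(SKETCH["mean"][1](per_shard))  # (4 + 8) / 6 = 2.0
```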
@@ -18,6 +18,7 @@ import transformer_engine_torch as tex
 from transformer_engine.pytorch.tensor.quantized_tensor import (
     QuantizedTensor,
     Quantizer,
+    QuantizedTensorBase,
     prepare_for_saving,
     restore_from_saved,
 )
@@ -299,6 +300,7 @@ class DebugQuantizer(Quantizer):
                 iteration=self.iteration,
                 dtype=dtype,
             )
+            if dtype is not None:
                 if columnwise_gemm_tensor.dtype != dtype:
                     raise ValueError("Dtype does not match the output of the modify_tensor call")
         if self.rowwise_tensor_plan == API_CALL_MODIFY:
@@ -311,6 +313,7 @@ class DebugQuantizer(Quantizer):
                 iteration=self.iteration,
                 dtype=dtype,
             )
+            if dtype is not None:
                 if rowwise_gemm_tensor.dtype != dtype:
                     raise ValueError("Dtype does not match the output of the modify_tensor call")
@@ -332,6 +335,7 @@ class DebugQuantizer(Quantizer):
             quantizer=self,
             layer_name=self.layer_name,
             tensor_name=self.tensor_name,
+            original_tensor=tensor,
         )

     def process_gemm_output(self, tensor: torch.Tensor):
@@ -456,7 +460,7 @@ class DebugQuantizer(Quantizer):
         return False


-class DebugQuantizedTensor:
+class DebugQuantizedTensor(QuantizedTensorBase):
     """
     Class containing quantized tensors after debug. Depending on configuration
     it can contain one or two different objects. These objects can be accessed by the method
@@ -470,6 +474,7 @@ class DebugQuantizedTensor:
         quantizer,
         layer_name=None,
         tensor_name=None,
+        original_tensor=None,
     ):
         self.rowwise_gemm_tensor = rowwise_gemm_tensor
@@ -477,6 +482,7 @@ class DebugQuantizedTensor:
         self.quantizer = quantizer
         self._layer_name = layer_name
         self._tensor_name = tensor_name
+        self._original_tensor = original_tensor

     def prepare_for_saving(self):
         """Prepare for saving method override"""
@@ -524,5 +530,5 @@ class DebugQuantizedTensor:
         """Size of the tensor."""
         return self.rowwise_gemm_tensor.size()

-    def update_usage(self, rowwise_usage: bool, columnwise_usage: bool):
+    def update_usage(self, rowwise_usage: bool = None, columnwise_usage: bool = None):
         """Update usage of the tensor."""
@@ -1239,12 +1239,18 @@ def gather_along_first_dim(
         final_quantizer = (
             None if not needs_quantized_gemm(inp, rowwise=True) else quantizer.parent_quantizer
         )
+        # Temporary fix for TP communication of Float8BlockwiseQTensorBase
+        if isinstance(rowwise, Float8BlockwiseQTensorBase):
+            rowwise = inp._original_tensor
         rowwise_total = gather_along_first_dim(rowwise, process_group, False, final_quantizer)[0]
         out_obj.rowwise_gemm_tensor = rowwise_total
         if rowwise is not columnwise:
             final_quantizer_columnwise = (
                 None if not needs_quantized_gemm(inp, rowwise=False) else quantizer.parent_quantizer
             )
+            # Temporary fix for TP communication of Float8BlockwiseQTensorBase
+            if isinstance(columnwise, Float8BlockwiseQTensorBase):
+                columnwise = inp._original_tensor
             columnwise_total, _ = gather_along_first_dim(
                 columnwise, process_group, False, final_quantizer_columnwise
             )
...
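This workaround keeps tensor-parallel gathers working for blockwise FP8: since `Float8BlockwiseQTensorBase` cannot yet be gathered directly, the code falls back to the high-precision tensor stashed in `_original_tensor`, gathers that, and lets `final_quantizer` requantize the result. A minimal sketch of the communication primitive the fallback relies on (an all-gather along the first dimension, assuming an initialized process group):

```python
import torch
import torch.distributed as dist

def gather_first_dim(t: torch.Tensor, group=None) -> torch.Tensor:
    """All-gather a plain (unquantized) tensor along dim 0."""
    world_size = dist.get_world_size(group)
    out = torch.empty(
        (world_size * t.shape[0], *t.shape[1:]), dtype=t.dtype, device=t.device
    )
    dist.all_gather_into_tensor(out, t.contiguous(), group=group)
    return out
```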
@@ -1057,7 +1057,12 @@ class TransformerEngineBaseModule(torch.nn.Module, ABC):
             if (
                 isinstance(
                     grad_output_.get_tensor(True),
-                    (QuantizedTensor, Float8TensorBase, MXFP8TensorBase),
+                    (
+                        QuantizedTensor,
+                        Float8TensorBase,
+                        MXFP8TensorBase,
+                        Float8BlockwiseQTensorBase,
+                    ),
                 )
                 and ctx.use_bias
             ):
...
@@ -193,6 +193,7 @@ class _LayerNormLinear(torch.autograd.Function):
         # or if a gather of ln_out must be in high precision.
         with_quantized_norm = (
             fp8
+            and not debug
             and not return_layernorm_output
             and not return_layernorm_output_gathered
             and not force_hp_blockwise_ln_out_gather
...