"transformer_engine/pytorch/quantization.py" did not exist on "7530b768d8d6203dfb94fb202d74aa7a4e58377a"
Unverified commit 1d1d3233, authored by Paweł Gadziński, committed by GitHub

[PyTorch Debug] Fixed the empty tensor bug in statistics computation (#1843)



* fixed the bug
Signed-off-by: Pawel Gadzinski <pgadzinski@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci



* fix
Signed-off-by: Pawel Gadzinski <pgadzinski@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci



* lint fix
Signed-off-by: Pawel Gadzinski <pgadzinski@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci



* fix
Signed-off-by: Pawel Gadzinski <pgadzinski@nvidia.com>

* test change
Signed-off-by: Pawel Gadzinski <pgadzinski@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci



---------
Signed-off-by: Pawel Gadzinski <pgadzinski@nvidia.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
parent 964c2ed2
@@ -34,6 +34,6 @@ def test_debug_distributed(feature_dirs):
     test_path = TEST_ROOT / "run_distributed.py"
     test_cmd = LAUNCH_CMD + [str(test_path), f"--feature_dirs={feature_dirs[0]}"]
-    result = subprocess.run(test_cmd, env=os.environ, capture_output=True, check=False)
+    result = subprocess.run(test_cmd, env=os.environ, check=False, text=True)
     if result.returncode != 0:
-        raise AssertionError(result.stderr.decode())
+        raise AssertionError(f"torchrun exited with {result.returncode}")
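With capture_output removed, the torchrun child writes straight to the parent's console, so distributed-test output appears live instead of being buffered; the assertion then only reports the exit code, since result.stderr is no longer populated. A minimal sketch of the behavioral difference, independent of this test harness:

import subprocess
import sys

# capture_output=True buffers the child's stdout/stderr into the result;
# without text=True they come back as bytes and must be decoded manually.
captured = subprocess.run(
    [sys.executable, "-c", "print('hello from child')"],
    capture_output=True,
    check=False,
)
print(captured.stdout.decode())

# Without capture_output, the child inherits the parent's streams and its
# output shows up live; nothing is captured, so result.stdout is None.
streamed = subprocess.run(
    [sys.executable, "-c", "print('hello from child')"],
    check=False,
    text=True,
)
assert streamed.stdout is None
print(f"child exited with {streamed.returncode}")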
@@ -262,6 +262,18 @@ def _get_tensors():
     return x, weight
 
 
+LOGGING_CONFIG = """logging_config:
+  enabled: True
+  layers:
+    layer_types: [linear]
+  transformer_engine:
+    LogTensorStats:
+      enabled: True
+      tensors: [activation, gradient, weight, output, wgrad, dgrad]
+      stats: [min, max, mean, std, l1_norm, l2_norm, cur_amax, dynamic_range]
+"""
+
+
 DISABLE_FP8_CONFIG = Template(
     """disable_fp8_config:
   enabled: True
@@ -275,6 +287,24 @@ DISABLE_FP8_CONFIG = Template(
 )
 
 
+@create_config_file
+def run_logging_zero_numel_tensor(feature_dirs, **kwargs):
+    kwargs["config_file"].write(LOGGING_CONFIG)
+    kwargs["config_file"].flush()
+    _init_debug(kwargs["config_file"].name, kwargs["log_dir"], feature_dirs)
+    x, weight = _get_tensors()
+    x1 = x[:0, :]
+    model = _init_model(weight)
+    _ = _run_forward_backward(x1, model)
+    _ = _run_forward_backward(x, model)
+
+
+def test_logging_zero_numel_tensor(feature_dirs):
+    run_logging_zero_numel_tensor(feature_dirs)
+
+
 @pytest.mark.parametrize("fprop_fp8", all_boolean)
 @pytest.mark.parametrize("dgrad_fp8", all_boolean)
 @pytest.mark.parametrize("wgrad_fp8", all_boolean)
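For reference, x[:0, :] keeps every column but zero rows, producing a valid tensor with numel() == 0; running a forward/backward pass on it is exactly the case that used to crash the stats collection. A quick standalone illustration in plain PyTorch:

import torch

x = torch.randn(16, 64)
x_empty = x[:0, :]       # zero rows, all columns

print(x_empty.shape)     # torch.Size([0, 64]) -- a valid, empty tensor
print(x_empty.numel())   # 0

# Reductions over an empty tensor are where naive statistics code breaks:
print(x_empty.mean())    # tensor(nan)
try:
    x_empty.max()        # max() with no reduction dim raises on empty input
except RuntimeError as err:
    print("max() failed:", err)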
@@ -85,6 +85,13 @@ class _Buffer:
         if self.modified[0] and not self.reduce_within_microbatch:
             return
 
+        if (
+            tensor.numel() == 0
+            if hasattr(tensor, "numel")
+            else all((t is None or t.numel() == 0) for t in tensor.get_data_tensors())
+        ):
+            return
+
         # save stats for tensor to tmp buffer
         for stat_name in self.stats_to_compute:
             fn, _ = STATS[stat_name]
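The guard is a single conditional expression doing type dispatch: a plain torch.Tensor reports emptiness via numel(), while objects without numel() are treated as quantized wrappers that expose get_data_tensors(). A standalone sketch of the same logic; FakeQuantizedTensor is a hypothetical stand-in for illustration, not a Transformer Engine class:

import torch

class FakeQuantizedTensor:
    """Hypothetical stand-in for a quantized tensor whose storage is
    only reachable through get_data_tensors()."""

    def __init__(self, *tensors):
        self._tensors = tensors

    def get_data_tensors(self):
        return self._tensors

def is_empty(tensor):
    # Same shape as the committed guard: prefer numel() when available,
    # otherwise inspect every underlying data tensor.
    return (
        tensor.numel() == 0
        if hasattr(tensor, "numel")
        else all((t is None or t.numel() == 0) for t in tensor.get_data_tensors())
    )

print(is_empty(torch.zeros(0, 8)))                          # True
print(is_empty(torch.zeros(2, 8)))                          # False
print(is_empty(FakeQuantizedTensor(None, torch.empty(0))))  # True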
@@ -17,6 +17,8 @@ def _compute_dynamic_range_top(tensor):
     """Computes the log2 of the amax of the tensor"""
     tensor_abs = tensor.abs()
     tensor_abs = tensor_abs[tensor_abs != 0]
+    if tensor_abs.numel() == 0:
+        return torch.inf
     amax = tensor_abs.max().float()
     if not amax.all():
         amax = torch.tensor(1, device=tensor.device).to(torch.float)
@@ -125,7 +127,7 @@ STATS = {
         lambda buffers: min(_get(buffers, "dynamic_range_bottom")),
     ),
     "underflows_num": (
-        lambda x: (x._data == 0).sum(),
+        lambda x: (x.get_data_tensors()[0] == 0).sum(),
         lambda buffers: sum(_get(buffers, "underflows_num")),
     ),
     "std": (
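The dynamic-range stat is the gap between the log2 of the largest and smallest nonzero magnitudes, so once zeros are masked out an all-zero or zero-numel tensor leaves nothing to reduce over; returning torch.inf sidesteps the max()-on-empty error. A minimal sketch mirroring the patched top-end computation (the function name here is illustrative):

import torch

def dynamic_range_top(tensor):
    # log2 of the amax over nonzero magnitudes; inf when there are none.
    tensor_abs = tensor.abs()
    tensor_abs = tensor_abs[tensor_abs != 0]  # drop exact zeros
    if tensor_abs.numel() == 0:
        # Empty or all-zero input: max() would raise, so bail out early.
        return torch.inf
    amax = tensor_abs.max().float()
    return torch.log2(amax)

print(dynamic_range_top(torch.tensor([0.5, 4.0, 0.0])))  # tensor(2.), log2 of 4
print(dynamic_range_top(torch.zeros(3)))                 # inf
print(dynamic_range_top(torch.empty(0)))                 # inf

The underflows_num hunk above makes the matching accessor change: the private x._data attribute is replaced by the public get_data_tensors()[0], consistent with the buffer guard.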