Unverified commit dac098d8 authored by jberchtold-nvidia, committed by GitHub

[JAX] Fix distributed Layernorm test failure (#1734)



Fix distributed layernorm test failure
Signed-off-by: Jeremy Berchtold <jberchtold@nvidia.com>
parent 2f61c401
@@ -78,7 +78,7 @@ class TestDistributedLayernorm:
         if fp8_recipe == recipe.MXFP8BlockScaling() and "dp" in mesh_axes:
             other_bytes = 384  # required for small scale shapes that require padding
         if fp8_recipe == recipe.Float8CurrentScaling():
-            allreduce_total_bytes += 4  # 1 * FP32 for the amax reduction
+            allreduce_total_bytes += jax_dtype.itemsize  # 1 * dtype for the amax reduction
         return generate_collectives_count(
             allreduce=allreduce_total_bytes * int(is_dp_enabled), allgather=0, other=other_bytes
         )
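The test hunk above now sizes the expected amax allreduce by the input dtype instead of hard-coding 4 bytes of FP32. A minimal sketch of that accounting, assuming jax_dtype is a NumPy dtype object as in the surrounding test (the helper name below is hypothetical):

    import numpy as np
    import jax.numpy as jnp

    def expected_amax_allreduce_bytes(jax_dtype) -> int:
        # Under current scaling, the per-device amax is all-reduced in the
        # input dtype; the cast to FP32 happens after the reduction, so the
        # collective moves one element of the input dtype, not one FP32 value.
        return np.dtype(jax_dtype).itemsize

    expected_amax_allreduce_bytes(jnp.bfloat16)  # 2, matches jax_dtype.itemsize
    expected_amax_allreduce_bytes(jnp.float32)   # 4, the old hard-coded value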
@@ -614,7 +614,7 @@ def _quantize_dbias_impl(
         # Globally reduce amax across all devices for current scaling so we have a single global scale.
         # This differs from the PyTorch implementation which uses a local amax and scale per-device and persists this
         # until the tensor is dequantized (e.g. in the GEMM).
-        amax = jnp.amax(jnp.abs(x), keepdims=True)
+        amax = jnp.amax(jnp.abs(x), keepdims=True).astype(jnp.float32)
         scale = compute_scale_from_amax(amax, quantizer.q_dtype)
     if isinstance(quantizer, DelayedScaleQuantizer):
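For context, current scaling derives one global scale from the reduced amax, and the astype(jnp.float32) above keeps that scale math in FP32 even when x is BF16 or FP16. A hedged sketch of the scale computation follows; the real compute_scale_from_amax in Transformer Engine may differ (e.g. in margin handling or the zero-amax case):

    import jax.numpy as jnp

    def compute_scale_from_amax_sketch(amax: jnp.ndarray, q_dtype) -> jnp.ndarray:
        # Largest representable magnitude of the FP8 target format
        # (e.g. 448 for float8_e4m3fn, 57344 for float8_e5m2).
        fp8_max = jnp.asarray(jnp.finfo(q_dtype).max, dtype=jnp.float32)
        amax = amax.astype(jnp.float32)
        # The scale maps the observed range onto the FP8 range (x_q = x * scale);
        # guard against amax == 0 to avoid an infinite scale.
        return jnp.where(amax > 0.0, fp8_max / amax, jnp.ones_like(amax))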