Merge branch 'develop_v2.4'

1e018a45 · wenjh · 4ef4eae6 · 40a4d896 · 1e018a45 · 1e018a45
Commit 1e018a45 authored Jul 03, 2025 by wenjh
2 changed files
--- a/tests/pytorch/distributed/run_cast_master_weights_to_fp8.py
+++ b/tests/pytorch/distributed/run_cast_master_weights_to_fp8.py
@@ -570,7 +570,7 @@ def _test_cast_master_weights_to_fp8(quantization, dp_group):
    mock_groups = [dist.new_group(ranks=[i]) for i in range(world_size)]
    mock_group = mock_groups[rank]
-    linear_kwargs = {"params_dtype": torch.bfloat16, "bias": False, "fuse_wgrad_accumulation": True}
+    linear_kwargs = {"params_dtype": torch.bfloat16, "bias": False, "fuse_wgrad_accumulation": False}
    # Create model with FP8 weights
    with te.fp8.fp8_model_init(

--- a/transformer_engine/common/recipe/fp8_block_scaling.cu
+++ b/transformer_engine/common/recipe/fp8_block_scaling.cu
@@ -248,7 +248,7 @@ __global__ void __launch_bounds__(kThreadsPerBlock)
  using transformer_engine::Vec;
  static_assert(sizeof(OType) == 1);
-  constexpr int kNumOutputElemsPerBank = 4 / sizeof(OType);
+  constexpr int kNumOutputElemsPerBank = 2 / sizeof(OType);
  constexpr int kThreadsPerWarp = 32;
  constexpr int kLoopsPerRow = kTileDim64 / kThreadsPerWarp;
  constexpr int kNumWarps = kThreadsPerBlock / kThreadsPerWarp;