Commit 40a4d896 authored by wenjh

Fix kernel crash on block_len=64


Signed-off-by: wenjh <wenjh@sugon.com>
parent b944277c
@@ -570,7 +570,7 @@ def _test_cast_master_weights_to_fp8(quantization, dp_group):
     mock_groups = [dist.new_group(ranks=[i]) for i in range(world_size)]
     mock_group = mock_groups[rank]
-    linear_kwargs = {"params_dtype": torch.bfloat16, "bias": False, "fuse_wgrad_accumulation": True}
+    linear_kwargs = {"params_dtype": torch.bfloat16, "bias": False, "fuse_wgrad_accumulation": False}
     # Create model with FP8 weights
     with te.fp8.fp8_model_init(
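A minimal sketch of what the flipped flag controls, assuming te is transformer_engine.pytorch and the test's layer is te.Linear (the feature sizes below are illustrative, not taken from the test):

import torch
import transformer_engine.pytorch as te

linear_kwargs = {"params_dtype": torch.bfloat16, "bias": False,
                 "fuse_wgrad_accumulation": False}
layer = te.Linear(128, 128, **linear_kwargs)

# With fuse_wgrad_accumulation=True, Transformer Engine accumulates weight
# gradients directly into a preallocated FP32 buffer exposed as
# weight.main_grad instead of populating weight.grad, so the test would
# also have to attach that buffer, e.g.:
# layer.weight.main_grad = torch.zeros_like(layer.weight, dtype=torch.float32)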
@@ -248,7 +248,7 @@ __global__ void __launch_bounds__(kThreadsPerBlock)
   using transformer_engine::Vec;
   static_assert(sizeof(OType) == 1);
-  constexpr int kNumOutputElemsPerBank = 4 / sizeof(OType);
+  constexpr int kNumOutputElemsPerBank = 2 / sizeof(OType);
   constexpr int kThreadsPerWarp = 32;
   constexpr int kLoopsPerRow = kTileDim64 / kThreadsPerWarp;
   constexpr int kNumWarps = kThreadsPerBlock / kThreadsPerWarp;
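To make the constant change concrete, here is the surrounding constexpr arithmetic redone in plain Python; sizeof(OType) is 1 per the static_assert above, and reading kNumOutputElemsPerBank as "FP8 outputs packed into one 4-byte shared-memory bank" is an assumption for illustration, not taken from the kernel:

SIZEOF_OTYPE = 1            # 1-byte FP8 output type, per the static_assert
K_TILE_DIM_64 = 64          # tile edge on the block_len == 64 path
K_THREADS_PER_WARP = 32

old_elems_per_bank = 4 // SIZEOF_OTYPE               # 4 before the fix
new_elems_per_bank = 2 // SIZEOF_OTYPE               # 2 after the fix
loops_per_row = K_TILE_DIM_64 // K_THREADS_PER_WARP  # 2: a warp covers a 64-wide row in two passes

# Under the old packing a 64-element row spans 64 // 4 = 16 banks, so two warp
# lanes land on every bank; halving the packing spreads the row over
# 64 // 2 = 32 banks, one per lane.
print(K_TILE_DIM_64 // old_elems_per_bank)   # 16
print(K_TILE_DIM_64 // new_elems_per_bank)   # 32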