Fix autotune configs for int8 GEMM NT and wgrad kernels

Signed-off-by: wenjh <wenjh@sugon.com>

Fix autotune configs for int8 GEMM NT and wgrad kernels
Signed-off-by: wenjh <wenjh@sugon.com>
5fcf30ba · wenjh · 9fe13a33 · 5fcf30ba · 5fcf30ba
Commit 5fcf30ba authored Jul 09, 2025 by wenjh
2 changed files
--- a/transformer_engine/pytorch/triton/blockwise_int8_gemm_nt.py
+++ b/transformer_engine/pytorch/triton/blockwise_int8_gemm_nt.py
@@ -50,8 +50,8 @@ def get_full_tuning_space():
 @triton.autotune(
    configs= get_full_tuning_space() if tuning_full_space else [
        # triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 2, 'kpack':2}, num_stages=2, num_warps=8),
-        triton.Config({'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 2,}, num_stages=1, num_warps=4, enable_mmacfuse=2),
+        # triton.Config({'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 2,}, num_stages=1, num_warps=4, enable_mmacfuse=2),
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 8,}, num_stages=1, num_warps=4, enable_mmacfuse=2),
+        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': blockwise_fp8_block_len, 'BLOCK_SIZE_K': blockwise_fp8_block_len, 'GROUP_SIZE_M': 8,}, num_stages=1, num_warps=4, enable_mmacfuse=2),
    ],
    key=['M', 'N', 'K'],
    # reset_to_zero=['c_ptr']

--- a/transformer_engine/pytorch/triton/blockwise_int8_gemm_nt_wgrad.py
+++ b/transformer_engine/pytorch/triton/blockwise_int8_gemm_nt_wgrad.py
@@ -50,8 +50,8 @@ def get_full_tuning_space():
 @triton.autotune(
    configs= get_full_tuning_space() if tuning_full_space else [
        # triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 2, 'kpack':2}, num_stages=2, num_warps=8),
-        triton.Config({'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 2,}, num_stages=1, num_warps=4, enable_mmacfuse=2),
+        # triton.Config({'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 2,}, num_stages=1, num_warps=4, enable_mmacfuse=2),
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 8,}, num_stages=1, num_warps=4, enable_mmacfuse=2),
+        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': blockwise_fp8_block_len, 'BLOCK_SIZE_K': blockwise_fp8_block_len, 'GROUP_SIZE_M': 8,}, num_stages=1, num_warps=4, enable_mmacfuse=2),
    ],
    key=['M', 'N', 'K'],
    # reset_to_zero=['c_ptr']