[Core] Add launch bounds to swizzle kernels (#2076)

Add launch bounds to the swizzle kernels; allocate the scale-inverse buffers with torch.empty instead of torch.zeros.

Signed-off-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>

[Core] Add launch bounds to swizzle kernels (#2076)
Add launch bounds to the swizzle kernels; allocate the scale-inverse buffers with torch.empty instead of torch.zeros.

Signed-off-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>
12065ac2 · Kirthi Shankar Sivamani · GitHub · a169e9e7 · 12065ac2 · 12065ac2
Unverified Commit 12065ac2 authored Aug 14, 2025 by Kirthi Shankar Sivamani Committed by GitHub Aug 14, 2025
Showing with 8 additions and 8 deletions

transformer_engine/common/swizzle/swizzle.cu transformer_engine/common/swizzle/swizzle.cu +6 -6

transformer_engine/pytorch/tensor/mxfp8_tensor.py transformer_engine/pytorch/tensor/mxfp8_tensor.py +2 -2

No files found.
--- a/transformer_engine/common/swizzle/swizzle.cu
+++ b/transformer_engine/common/swizzle/swizzle.cu
@@ -145,9 +145,9 @@ __device__ void swizzle_col_scaling_kernel_impl(const void* input, void* output,
 }
 template <typename LType, int SF_TILE_DIM_M, int SF_TILE_DIM_K>
-__global__ void swizzle_col_scaling_kernel(const void* input, void* output, const int M,
+__global__ void __launch_bounds__(TB_DIM* TB_DIM)
-                                           const int K, const int original_M,
+    swizzle_col_scaling_kernel(const void* input, void* output, const int M, const int K,
-                                           const int original_K) {
+                               const int original_M, const int original_K) {
  swizzle_col_scaling_kernel_impl<LType, SF_TILE_DIM_M, SF_TILE_DIM_K>(
      input, output, M, K, original_M, original_K, blockIdx.x, blockIdx.y, gridDim.x, gridDim.y);
 }
@@ -238,9 +238,9 @@ __device__ void swizzle_row_scaling_kernel_impl(const void* input, void* output,
 }
 template <typename LType, int SF_TILE_DIM_M, int SF_TILE_DIM_K>
-__global__ void swizzle_row_scaling_kernel(const void* input, void* output, const int M,
+__global__ void __launch_bounds__(TB_DIM* TB_DIM)
-                                           const int K, const int original_M,
+    swizzle_row_scaling_kernel(const void* input, void* output, const int M, const int K,
-                                           const int original_K) {
+                               const int original_M, const int original_K) {
  swizzle_row_scaling_kernel_impl<LType, SF_TILE_DIM_M, SF_TILE_DIM_K>(
      input, output, M, K, original_M, original_K, blockIdx.x, blockIdx.y, gridDim.x, gridDim.y);
 }

--- a/transformer_engine/pytorch/tensor/mxfp8_tensor.py
+++ b/transformer_engine/pytorch/tensor/mxfp8_tensor.py
@@ -100,7 +100,7 @@ class MXFP8Quantizer(Quantizer):
        # Allocate FP8 data
        data = torch.empty(shape, dtype=torch.uint8, device=device)
-        scale_inv = torch.zeros(
+        scale_inv = torch.empty(
            round_up_to_nearest_multiple(math.prod(shape[:-1]), 128),
            round_up_to_nearest_multiple(shape[-1] // MXFP8_BLOCK_SCALING_SIZE, 4),
            dtype=torch.uint8,
@@ -112,7 +112,7 @@ class MXFP8Quantizer(Quantizer):
        columnwise_scale_inv = None
        if self.columnwise_usage:
            columnwise_data = torch.empty_like(data)
-            columnwise_scale_inv = torch.zeros(
+            columnwise_scale_inv = torch.empty(
                round_up_to_nearest_multiple(math.prod(shape[:-1]) // MXFP8_BLOCK_SCALING_SIZE, 4),
                round_up_to_nearest_multiple(shape[-1], 128),
                dtype=torch.uint8,