[Common] Fuse pre-swizzling into grouped MXFP8 quantization kernel (#2630)

* Added GEMM-ready preswizzling option

  Signed-off-by: Oleg Goncharov <ogoncharov@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

  for more information, see https://pre-commit.ci

---------

Signed-off-by: Oleg Goncharov <ogoncharov@nvidia.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>

[Common] Fuse pre-swizzling into grouped MXFP8 quantization kernel (#2630)
* Added GEMM-ready preswizzling option

  Signed-off-by: Oleg Goncharov <ogoncharov@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

  for more information, see https://pre-commit.ci

---------

Signed-off-by: Oleg Goncharov <ogoncharov@nvidia.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>
93d51c82 · Oleg Goncharov · GitHub · c4175fca · 93d51c82 · 93d51c82
Unverified Commit 93d51c82 authored Feb 12, 2026 by Oleg Goncharov Committed by GitHub Feb 12, 2026
Showing with 142 additions and 109 deletions

transformer_engine/common/cast/mxfp8/group_quantize_mxfp8.cuh +135 -109

transformer_engine/common/common.h +7 -0

No files found.
--- a/transformer_engine/common/cast/mxfp8/group_quantize_mxfp8.cuh
+++ b/transformer_engine/common/cast/mxfp8/group_quantize_mxfp8.cuh
@@ -21,6 +21,7 @@
 #include "../../util/ptx.cuh"
 #include "../../utils.cuh"
 #include "../core/common.cuh"
+#include "swizzle.cuh"
 namespace transformer_engine {
 namespace dispatch {
@@ -231,7 +232,7 @@ __device__ __forceinline__ void fence_acquire_tensormap(const CUtensorMap *tenso
 template <bool IS_DBIAS, bool IS_DACT, bool IS_ACT, typename ParamOP,
          float (*OP)(float, const ParamOP &), typename IType, typename OType, bool ROWWISE_SCALING,
-          bool COLWISE_SCALING>
+          bool COLWISE_SCALING, bool WITH_GEMM_SWIZZLED_SCALES>
 __global__ void __launch_bounds__(THREADS_PER_CHUNK) group_quantize_mxfp8_kernel(
    const __grid_constant__ CUtensorMap tensor_map_input_static,
    const __grid_constant__ CUtensorMap tensor_map_act_input_static,
@@ -250,6 +251,8 @@ __global__ void __launch_bounds__(THREADS_PER_CHUNK) group_quantize_mxfp8_kernel
  using IType2 = typename ptx::FPx2<IType>;
  using OType2 = typename ptx::FPx2<OType>;
+  using transformer_engine::dispatch::mxfp8::swizzle::gemm_swizzled_scale_idx;
  if constexpr (NO_ACTIVATIONS) {
    if (noop != nullptr && noop[0] == 1.0f) {
      return;
@@ -475,8 +478,14 @@ __global__ void __launch_bounds__(THREADS_PER_CHUNK) group_quantize_mxfp8_kernel
      const size_t global_scales_offset_Y = scales_offset_Y_colwise + stage;
      const size_t global_scales_offset_X = scales_offset_X_colwise;
-      const size_t scale_idx =
-          global_scales_offset_Y * scale_stride_colwise + global_scales_offset_X;
+      size_t scale_idx = 0;
+      if constexpr (WITH_GEMM_SWIZZLED_SCALES) {
+        scale_idx = gemm_swizzled_scale_idx(global_scales_offset_X, global_scales_offset_Y,
+                                            DIVUP(rows, static_cast<size_t>(128)));
+      } else {
+        scale_idx = global_scales_offset_Y * scale_stride_colwise + global_scales_offset_X;
+      }
      scales_colwise[scale_idx] = biased_exponent;
      const float block_scale_inverse = ptx::exp2f_rcp(biased_exponent);
@@ -602,7 +611,14 @@ __global__ void __launch_bounds__(THREADS_PER_CHUNK) group_quantize_mxfp8_kernel
          ptx::float_to_e8m0(thread_amax * Quantized_Limits<OType>::max_norm_rcp);
      const int stage_scales_offset_Y = scales_offset_Y_rowwise + stage_offset_Y;
      const int stage_scales_offset_X = scales_offset_X_rowwise;
-      const int scale_idx = stage_scales_offset_Y * scale_stride_rowwise + stage_scales_offset_X;
+      size_t scale_idx = 0;
+      if constexpr (WITH_GEMM_SWIZZLED_SCALES) {
+        scale_idx = gemm_swizzled_scale_idx(stage_scales_offset_Y, stage_scales_offset_X,
+                                            DIVUP(cols, static_cast<size_t>(128)));
+      } else {
+        scale_idx = stage_scales_offset_Y * scale_stride_rowwise + stage_scales_offset_X;
+      }
      scales_rowwise[scale_idx] = biased_exponent;
      const float block_scale_inverse = ptx::exp2f_rcp(biased_exponent);
@@ -803,6 +819,8 @@ void group_quantize(const GroupedTensor *input, const GroupedTensor *activations
  const dim3 grid(blocks);
  const size_t block_size = THREADS_PER_CHUNK;
+  const bool with_gemm_swizzled_scales = output->with_gemm_swizzled_scales;
  // Logical shape of a tensor with varying all dims is [1, M*K]
  if (shape_rep != ShapeRepresentation::VARYING_BOTH_DIMS) {
    NVTE_CHECK(first_logical_dim % 128 == 0,
@@ -848,6 +866,8 @@ void group_quantize(const GroupedTensor *input, const GroupedTensor *activations
      input->dtype(), IType,
      TRANSFORMER_ENGINE_TYPE_SWITCH_FP8ONLY(
          output->dtype(), OType,
+          TRANSFORMER_ENGINE_SWITCH_CONDITION(
+              with_gemm_swizzled_scales, WITH_GEMM_SWIZZLED_SCALES,
              alignas(64) CUtensorMap tensor_map_input{};
              alignas(64) CUtensorMap tensor_map_act_input{};
@@ -857,8 +877,9 @@ void group_quantize(const GroupedTensor *input, const GroupedTensor *activations
              constexpr size_t input_type_bit_size = TypeInfo<IType>::size;
              constexpr size_t output_type_bit_size = TypeInfo<OType>::size;
-          create_2D_tensor_map(tensor_map_input, input->data, first_logical_dim, last_logical_dim,
+              create_2D_tensor_map(tensor_map_input, input->data, first_logical_dim,
-                               BUFF_DIM_Y, BUFF_DIM_X, last_logical_dim, 0, input_type_bit_size);
+                                   last_logical_dim, BUFF_DIM_Y, BUFF_DIM_X, last_logical_dim, 0,
+                                   input_type_bit_size);
              if constexpr (IS_DACT) {
                create_2D_tensor_map(tensor_map_act_input, activations->data, first_logical_dim,
@@ -897,22 +918,26 @@ void group_quantize(const GroupedTensor *input, const GroupedTensor *activations
              const size_t dshmem_size = in_mem + out_mem + TMA_SHMEM_ALIGNMENT;
-          auto kernel = group_quantize_mxfp8_kernel<IS_DBIAS, IS_DACT, IS_ACT, ParamOP, OP, IType,
+              auto kernel =
-                                                    OType, true, true>;
+                  group_quantize_mxfp8_kernel<IS_DBIAS, IS_DACT, IS_ACT, ParamOP, OP, IType, OType,
+                                              true, true, WITH_GEMM_SWIZZLED_SCALES>;
              switch (scaling_type) {
                case ScalingType::ROWWISE: {
-              kernel = group_quantize_mxfp8_kernel<IS_DBIAS, IS_DACT, IS_ACT, ParamOP, OP, IType,
+                  kernel =
-                                                   OType, true, false>;
+                      group_quantize_mxfp8_kernel<IS_DBIAS, IS_DACT, IS_ACT, ParamOP, OP, IType,
+                                                  OType, true, false, WITH_GEMM_SWIZZLED_SCALES>;
                  break;
                }
                case ScalingType::COLWISE: {
-              kernel = group_quantize_mxfp8_kernel<IS_DBIAS, IS_DACT, IS_ACT, ParamOP, OP, IType,
+                  kernel =
-                                                   OType, false, true>;
+                      group_quantize_mxfp8_kernel<IS_DBIAS, IS_DACT, IS_ACT, ParamOP, OP, IType,
+                                                  OType, false, true, WITH_GEMM_SWIZZLED_SCALES>;
                  break;
                }
                case ScalingType::BIDIMENSIONAL: {
-              kernel = group_quantize_mxfp8_kernel<IS_DBIAS, IS_DACT, IS_ACT, ParamOP, OP, IType,
+                  kernel =
-                                                   OType, true, true>;
+                      group_quantize_mxfp8_kernel<IS_DBIAS, IS_DACT, IS_ACT, ParamOP, OP, IType,
+                                                  OType, true, true, WITH_GEMM_SWIZZLED_SCALES>;
                  break;
                }
              }
@@ -933,13 +958,13 @@ void group_quantize(const GroupedTensor *input, const GroupedTensor *activations
                update_tma_descriptors<IType, OType><<<num_tensors, 32, 0, stream>>>(
                    tensor_map_input, tensor_map_act_input, tensor_map_output_rowwise,
                    tensor_map_output_colwise, input_dptr, act_input_dptr, output_rowwise_dptr,
-                output_colwise_dptr, shape_rep, num_tensors, first_logical_dim, last_logical_dim,
+                    output_colwise_dptr, shape_rep, num_tensors, first_logical_dim,
-                offsets_ptr, first_dims_ptr, last_dims_ptr, use_rowwise_scaling,
+                    last_logical_dim, offsets_ptr, first_dims_ptr, last_dims_ptr,
-                use_colwise_scaling, IS_DACT);
+                    use_rowwise_scaling, use_colwise_scaling, IS_DACT);
              }
-          NVTE_CHECK_CUDA(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize,
+              NVTE_CHECK_CUDA(cudaFuncSetAttribute(
-                                               dshmem_size));
+                  kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, dshmem_size));
              kernel<<<grid, block_size, dshmem_size, stream>>>(
                  tensor_map_input, tensor_map_act_input, tensor_map_output_rowwise,
@@ -953,6 +978,7 @@ void group_quantize(const GroupedTensor *input, const GroupedTensor *activations
              NVTE_CHECK_CUDA(cudaGetLastError()););  // NOLINT(*)
      );                                              // NOLINT(*)
+  );                                                  // NOLINT(*)
 }
 }  // namespace mxfp8

--- a/transformer_engine/common/common.h
+++ b/transformer_engine/common/common.h
@@ -335,6 +335,12 @@ struct GroupedTensor {
  NVTEGroupedTensor nvte_tensor;
+  /*! \brief Whether scaling factors are in format expected by GEMM
+   *
+   *  Only meaningful for MXFP8 and NVFP4.
+   */
+  bool with_gemm_swizzled_scales = false;
  GroupedTensor(NVTEScalingMode scaling_mode, size_t num_tensors)
      : data(),
        columnwise_data(),
@@ -401,6 +407,7 @@ struct GroupedTensor {
    num_tensors = 0;
    scaling_mode = NVTE_DELAYED_TENSOR_SCALING;
    nvte_tensor = 0;
+    with_gemm_swizzled_scales = false;
  }
 };