Merge tag 'v0.8.2' into v0.8.2-dev

469e903b · zhuwenwen · 389ebcf7 · 25f560a6 · 469e903b · 469e903b
Commit 469e903b authored Mar 28, 2025 by zhuwenwen
20 changed files
--- a/csrc/moe/moe_wna16.cu
+++ b/csrc/moe/moe_wna16.cu
+
+#include <torch/all.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <cuda_runtime.h>
+
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+#include "moe_wna16_utils.h"
+
+#define DIVIDE(x, size) (((x) + (size) - 1) / (size))
+
+template <typename scalar_t, int bit, int GROUPS>  // bit: quantized weight width; GROUPS: scale groups per K-block
+__global__ void moe_wna16_gemm_kernel(
+    const scalar_t* __restrict__ input, scalar_t* __restrict__ output,
+    // quantized expert weights: packed values, per-group scales, packed zero points (read only if has_zp)
+    const uint32_t* __restrict__ qweight, const scalar_t* __restrict__ scales,
+    const uint32_t* __restrict__ qzeros,
+    // routing metadata: one expert id per blockIdx.x, BLOCK_SIZE_M sorted token slots per blockIdx.x
+    const float* __restrict__ topk_weights,
+    const int32_t* __restrict__ sorted_token_ids,
+    const int32_t* __restrict__ expert_ids,
+    const int32_t* __restrict__ num_tokens_post_pad,
+    // problem sizes, tile sizes chosen by the launcher, and feature switches
+    uint16_t num_experts, uint16_t group_size, uint16_t top_k, uint32_t size_m,
+    uint32_t size_n, uint32_t size_k, uint16_t BLOCK_SIZE_M,
+    uint16_t BLOCK_SIZE_N, uint16_t BLOCK_SIZE_K, bool has_zp,
+    bool mul_topk_weight) {
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 800
+  if constexpr (std::is_same<scalar_t, nv_bfloat16>::value) {
+    return;
+  } else {
+#endif
+    // Each block handles BLOCK_SIZE_M token slots (blockIdx.x), BLOCK_SIZE_N output columns (blockIdx.y) and one K-slice (blockIdx.z).
+    using Dtype = ScalarType<scalar_t>;
+    using scalar_t2 = typename ScalarType<scalar_t>::scalar_t2;
+    // token blocks starting at or beyond the padded token count have nothing to do
+    if (blockIdx.x * BLOCK_SIZE_M >= num_tokens_post_pad[0]) return;
+    // one output column per thread; offset_k is the first k of this block's K-slice
+    const int32_t offset_n = blockIdx.y * BLOCK_SIZE_N + threadIdx.x;
+    const int32_t offset_k = blockIdx.z * BLOCK_SIZE_K;
+
+    const int32_t expert_id = expert_ids[blockIdx.x];
+
+    int32_t num_valid_tokens = 0;
+    extern __shared__ uint16_t block_input_tmp[];  // dynamic shared memory, indexed as [m * BLOCK_SIZE_K + k]
+    scalar_t* block_input = reinterpret_cast<scalar_t*>(block_input_tmp);
+    scalar_t2* block_input_half2 = reinterpret_cast<scalar_t2*>(block_input);
+
+    // load BLOCK_SIZE_M * BLOCK_SIZE_K into shared memory
+    for (int m = 0; m < BLOCK_SIZE_M; m++) {
+      const int32_t offset_m = blockIdx.x * BLOCK_SIZE_M + m;
+      const int32_t token_index = sorted_token_ids[offset_m];
+      if (token_index / top_k >= size_m) break;  // slot maps past the last input row (presumably padding)
+
+      num_valid_tokens = m + 1;
+      if (blockIdx.z == 0 && offset_n < size_n)  // the first K-slice block zeroes the output element
+        output[token_index * size_n + offset_n] = Dtype::int2num(0);
+      // NOTE(review): nothing visible here orders this store against atomicAdd from blockIdx.z > 0 blocks - confirm
+      if (expert_id != -1) {  // blocks with expert_id == -1 only zero the output and return below
+        int k_per_thread = DIVIDE(BLOCK_SIZE_K, BLOCK_SIZE_N);
+        for (int i = 0; i < k_per_thread; i++) {
+          int k = BLOCK_SIZE_N * i + threadIdx.x;
+          if (k >= BLOCK_SIZE_K) break;
+          if (offset_k + k >= size_k) break;
+
+          // load input to shared memory
+          // use a special layout to fit the layout of dequanted-weight
+          int origin_k;
+          if constexpr (bit == 4) {
+            // [0, 4, 1, 5, 2, 6, 3, 7]
+            int8_t order = (threadIdx.x % 2) * 4 + ((threadIdx.x % 8) / 2);
+            origin_k = BLOCK_SIZE_N * i + threadIdx.x / 8 * 8 + order;
+          } else {
+            // [0, 2, 1, 3]
+            int8_t order = (threadIdx.x % 2) * 2 + ((threadIdx.x % 4) / 2);
+            origin_k = BLOCK_SIZE_N * i + threadIdx.x / 4 * 4 + order;
+          }
+          // turn the permuted in-slice k into a flat index into input (row = token_index / top_k)
+          origin_k += token_index / top_k * size_k + blockIdx.z * BLOCK_SIZE_K;
+          block_input[m * BLOCK_SIZE_K + k] = input[origin_k];
+        }
+      }
+    }
+
+    if (expert_id == -1) return;  // depends on blockIdx.x only, so the whole block exits before the barrier
+    __syncthreads();
+    if (threadIdx.x >= BLOCK_SIZE_N || offset_n >= size_n) return;
+
+    float res[64];  // assume BLOCK_SIZE_M <= 64
+    scalar_t2 res2;
+    scalar_t2 scale_f2;
+    scalar_t2 qzero_f2;
+
+    // note that (size_n * size_k * expert_id) may greater than 2 ** 31
+    constexpr int8_t pack_factor = 32 / bit;
+    const uint64_t expert_offset = ((uint64_t)size_n) * size_k * expert_id;
+    const uint32_t* expert_qweight = qweight + expert_offset / pack_factor;
+    const scalar_t* expert_scales = scales + expert_offset / group_size;
+    const uint32_t* expert_qzeros =
+        qzeros + expert_offset / group_size / pack_factor;
+
+    // load 4*int32 one time: 4 int32 = 128 bit = 1 float4
+    // weight would be loaded in loop
+    uint32_t expert_qweight_tmp[4];
+    float4* expert_qweight_tmp_float4 =
+        reinterpret_cast<float4*>(expert_qweight_tmp);
+
+    // load all required scales one time
+    scalar_t expert_scales_groups[GROUPS];
+    int scales_offset_tmp =
+        (offset_n * size_k + offset_k) / group_size / GROUPS;  // index in units of GROUPS scales
+    if constexpr (GROUPS == 1) {
+      *expert_scales_groups = expert_scales[scales_offset_tmp];
+    } else if constexpr (GROUPS == 2) {  // wider cases fetch all GROUPS scales with one float/float2/float4 load
+      float* expert_scales_groups_tmp =
+          reinterpret_cast<float*>(expert_scales_groups);
+      *expert_scales_groups_tmp =
+          reinterpret_cast<const float*>(expert_scales)[scales_offset_tmp];
+    } else if constexpr (GROUPS == 4) {
+      float2* expert_scales_groups_tmp =
+          reinterpret_cast<float2*>(expert_scales_groups);
+      *expert_scales_groups_tmp =
+          reinterpret_cast<const float2*>(expert_scales)[scales_offset_tmp];
+    } else if constexpr (GROUPS == 8) {
+      float4* expert_scales_groups_tmp =
+          reinterpret_cast<float4*>(expert_scales_groups);
+      *expert_scales_groups_tmp =
+          reinterpret_cast<const float4*>(expert_scales)[scales_offset_tmp];
+    }
+
+    // load all required qzeros one time
+    uint8_t expert_qzeros_groups[GROUPS];
+    if (!has_zp) {  // no zero points supplied: use the fixed value 8 (4-bit) or 128 (8-bit)
+      if constexpr (bit == 4) {
+        qzero_f2 = Dtype::num2num2(Dtype::int2num(8));
+      } else {
+        qzero_f2 = Dtype::num2num2(Dtype::int2num(128));
+      }
+    } else {
+      int qzeros_offset_tmp =
+          (offset_n / (8 / bit)) * (size_k / group_size / GROUPS) +
+          offset_k / group_size / GROUPS;  // index in units of GROUPS bytes
+      if constexpr (GROUPS == 1) {
+        uint8_t* expert_qzeros_groups_tmp =
+            reinterpret_cast<uint8_t*>(expert_qzeros_groups);
+        *expert_qzeros_groups_tmp =
+            reinterpret_cast<const uint8_t*>(expert_qzeros)[qzeros_offset_tmp];
+      } else if constexpr (GROUPS == 2) {
+        uint16_t* expert_qzeros_groups_tmp =
+            reinterpret_cast<uint16_t*>(expert_qzeros_groups);
+        *expert_qzeros_groups_tmp =
+            reinterpret_cast<const uint16_t*>(expert_qzeros)[qzeros_offset_tmp];
+      } else if constexpr (GROUPS == 4) {
+        uint32_t* expert_qzeros_groups_tmp =
+            reinterpret_cast<uint32_t*>(expert_qzeros_groups);
+        *expert_qzeros_groups_tmp =
+            reinterpret_cast<const uint32_t*>(expert_qzeros)[qzeros_offset_tmp];
+      } else if constexpr (GROUPS == 8) {
+        uint64_t* expert_qzeros_groups_tmp =
+            reinterpret_cast<uint64_t*>(expert_qzeros_groups);
+        *expert_qzeros_groups_tmp =
+            reinterpret_cast<const uint64_t*>(expert_qzeros)[qzeros_offset_tmp];
+      }
+    }
+    // main loop: each iteration consumes one packed uint32 (pack_factor weights) of this thread's column
+    for (int tmp_k = 0; tmp_k < BLOCK_SIZE_K / pack_factor; tmp_k++) {
+      int k = offset_k + tmp_k * pack_factor;
+      if (k >= size_k) break;
+      const int32_t weight_offset = offset_n * size_k + k;
+      // every 4th iteration refills the 4-word weight cache with one 128-bit load
+      if (tmp_k % 4 == 0) {
+        *expert_qweight_tmp_float4 = reinterpret_cast<const float4*>(
+            expert_qweight)[weight_offset / pack_factor / 4];
+      }
+      // first packed word of a quantization group: refresh the scale (and zero point if has_zp)
+      if (tmp_k % (group_size / pack_factor) == 0) {
+        scalar_t scale_f =
+            expert_scales_groups[tmp_k / (group_size / pack_factor)];
+        scale_f2 = Dtype::num2num2(scale_f);
+
+        if (has_zp) {
+          uint8_t qzero =
+              expert_qzeros_groups[tmp_k / (group_size / pack_factor)];
+          if constexpr (bit == 4) {
+            qzero = (qzero >> ((threadIdx.x % 2) * 4)) & 0xF;  // two 4-bit zero points per byte; thread parity picks one
+          }
+          qzero_f2 = Dtype::num2num2(Dtype::int2num(qzero));
+        }
+      }
+
+      scalar_t2 weight_half2[16 / bit];
+      dequant<scalar_t2, bit>(expert_qweight_tmp[tmp_k % 4], weight_half2);
+
+      for (int m = 0; m < num_valid_tokens; m++) {
+        res2 = {};
+
+#pragma unroll
+        for (int i = 0; i < 16 / bit; i++) {
+          int32_t offset_input = m * BLOCK_SIZE_K / 2 + tmp_k * (16 / bit) + i;  // index in scalar_t2 units
+          res2 = __hfma2(__hmul2(__hsub2(weight_half2[i], qzero_f2), scale_f2),
+                         block_input_half2[offset_input], res2);
+        }
+        // fold the two lanes of res2 into the float accumulator for token m
+        if (tmp_k == 0) {
+          res[m] = Dtype::num2float(res2.x) + Dtype::num2float(res2.y);
+        } else {
+          res[m] += Dtype::num2float(res2.x) + Dtype::num2float(res2.y);
+        }
+      }
+    }
+    // optionally scale by the routing weight, then add this K-slice's partial sum into output
+    for (int m = 0; m < num_valid_tokens; ++m) {
+      const int32_t token_index =
+          sorted_token_ids[blockIdx.x * BLOCK_SIZE_M + m];
+      if (mul_topk_weight) {
+        res[m] *= topk_weights[token_index];
+      }
+      atomicAdd(&output[token_index * size_n + offset_n],
+                Dtype::float2num(res[m]));
+    }
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 800
+  }
+#endif
+}
+
+// Host-side launcher for moe_wna16_gemm_kernel.
+//
+// Launch layout:
+//   blockDim = (BLOCK_SIZE_N, 1, 1)            one thread per output column
+//   gridDim  = (num_token_blocks,
+//               ceil(size_n / BLOCK_SIZE_N),
+//               ceil(size_k / BLOCK_SIZE_K))
+//   dynamic shared memory = BLOCK_SIZE_M * BLOCK_SIZE_K elements of scalar_t
+//
+// The kernel instantiation is chosen from `bit` (4 or 8) and
+// GROUPS = BLOCK_SIZE_K / group_size (1, 2, 4 or 8); the caller is expected
+// to have validated both. The launch is enqueued on the current CUDA stream
+// and is asynchronous; only launch-configuration errors are reported here.
+template <typename scalar_t>
+void run_moe_wna16_gemm(const scalar_t* input, scalar_t* output,
+                        const uint32_t* b_qweight, const scalar_t* b_scales,
+                        const uint32_t* b_qzeros, const float* topk_weights,
+                        const int32_t* sorted_token_ids,
+                        const int32_t* expert_ids,
+                        const int32_t* num_tokens_post_pad, int num_experts,
+                        int group_size, int num_token_blocks, int top_k,
+                        int size_m, int size_n, int size_k, int BLOCK_SIZE_M,
+                        int BLOCK_SIZE_N, int BLOCK_SIZE_K, int bit,
+                        bool has_zp, bool mul_topk_weight) {
+  // A grid with a zero dimension is an invalid launch configuration; with
+  // nothing to compute there is nothing to launch.
+  if (num_token_blocks <= 0 || size_n <= 0 || size_k <= 0) return;
+
+  dim3 blockDim, gridDim;
+  blockDim.x = BLOCK_SIZE_N;
+  blockDim.y = 1;
+  blockDim.z = 1;
+  gridDim.x = num_token_blocks;
+  gridDim.y = DIVIDE(size_n, BLOCK_SIZE_N);
+  gridDim.z = DIVIDE(size_k, BLOCK_SIZE_K);
+
+  // Default instantiation covers bit == 4 with one group per K-block.
+  const int groups = BLOCK_SIZE_K / group_size;
+  auto kernel = moe_wna16_gemm_kernel<scalar_t, 4, 1>;
+  if (bit == 4) {
+    if (groups == 2) {
+      kernel = moe_wna16_gemm_kernel<scalar_t, 4, 2>;
+    } else if (groups == 4) {
+      kernel = moe_wna16_gemm_kernel<scalar_t, 4, 4>;
+    } else if (groups == 8) {
+      kernel = moe_wna16_gemm_kernel<scalar_t, 4, 8>;
+    }
+  } else {
+    if (groups == 1) {
+      kernel = moe_wna16_gemm_kernel<scalar_t, 8, 1>;
+    } else if (groups == 2) {
+      kernel = moe_wna16_gemm_kernel<scalar_t, 8, 2>;
+    } else if (groups == 4) {
+      kernel = moe_wna16_gemm_kernel<scalar_t, 8, 4>;
+    } else if (groups == 8) {
+      kernel = moe_wna16_gemm_kernel<scalar_t, 8, 8>;
+    }
+  }
+
+  // 2 bytes per activation (half / bfloat16).
+  const int shared_mem_size = BLOCK_SIZE_M * BLOCK_SIZE_K * 2;
+  // Dynamic shared memory above 48 KB has to be requested explicitly,
+  // otherwise the launch is rejected.
+  if (shared_mem_size > 48 * 1024) {
+    C10_CUDA_CHECK(cudaFuncSetAttribute(
+        reinterpret_cast<const void*>(kernel),
+        cudaFuncAttributeMaxDynamicSharedMemorySize, shared_mem_size));
+  }
+
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  kernel<<<gridDim, blockDim, shared_mem_size, stream>>>(
+      input, output, b_qweight, b_scales, b_qzeros, topk_weights,
+      sorted_token_ids, expert_ids, num_tokens_post_pad, num_experts,
+      group_size, top_k, size_m, size_n, size_k, BLOCK_SIZE_M, BLOCK_SIZE_N,
+      BLOCK_SIZE_K, has_zp, mul_topk_weight);
+  // Surface bad launch configurations instead of silently leaving `output`
+  // untouched.
+  C10_CUDA_KERNEL_LAUNCH_CHECK();
+}
+
+// Torch op entry point for the 4/8-bit-weight, 16-bit-activation MoE GEMM.
+//
+// Sizes are taken from the tensors: size_m = input.size(0),
+// size_k = input.size(1), num_experts = b_qweight.size(0),
+// size_n = b_qweight.size(1), group_size = size_k / b_scales.size(2).
+// b_qzeros and topk_weights are optional; when absent the kernel is told so
+// through has_zp / mul_topk_weight and receives a null pointer.
+//
+// Raises (TORCH_CHECK) when bit is not 4 or 8, a block size is not positive,
+// BLOCK_SIZE_M > 64, the group size cannot be derived, size_k is not a
+// multiple of BLOCK_SIZE_K, BLOCK_SIZE_K is not a multiple of group_size,
+// BLOCK_SIZE_K / group_size is not in {1, 2, 4, 8}, or input is neither
+// float16 nor bfloat16. Returns `output`, which the kernel writes in place.
+torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output,
+                             torch::Tensor b_qweight, torch::Tensor b_scales,
+                             std::optional<torch::Tensor> b_qzeros,
+                             std::optional<torch::Tensor> topk_weights,
+                             torch::Tensor sorted_token_ids,
+                             torch::Tensor expert_ids,
+                             torch::Tensor num_tokens_post_pad, int64_t top_k,
+                             int64_t BLOCK_SIZE_M, int64_t BLOCK_SIZE_N,
+                             int64_t BLOCK_SIZE_K, int64_t bit) {
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
+
+  // Validate the scalar arguments before they are used as divisors.
+  TORCH_CHECK(bit == 4 || bit == 8, "bit must be 4 or 8");
+  TORCH_CHECK(BLOCK_SIZE_M > 0 && BLOCK_SIZE_N > 0 && BLOCK_SIZE_K > 0,
+              "BLOCK_SIZE_M, BLOCK_SIZE_N and BLOCK_SIZE_K must be positive");
+  TORCH_CHECK(BLOCK_SIZE_M <= 64, "BLOCK_SIZE_M must less or equal to 64");
+
+  const int num_experts = b_qweight.size(0);
+  const int size_m = input.size(0);
+  const int size_n = b_qweight.size(1);
+  const int size_k = input.size(1);
+
+  const int64_t num_groups = b_scales.size(2);
+  TORCH_CHECK(num_groups > 0,
+              "b_scales must have a non-empty last dimension");
+  const int group_size = size_k / num_groups;
+  TORCH_CHECK(group_size > 0,
+              "size_k must be at least the number of scale groups");
+
+  const int groups_per_block_row = BLOCK_SIZE_K / group_size;
+  TORCH_CHECK(size_k % BLOCK_SIZE_K == 0,
+              "size_k must divisible by BLOCK_SIZE_K");
+  TORCH_CHECK(BLOCK_SIZE_K % group_size == 0,
+              "BLOCK_SIZE_K must divisible by group_size");
+  TORCH_CHECK(groups_per_block_row == 1 || groups_per_block_row == 2 ||
+                  groups_per_block_row == 4 || groups_per_block_row == 8,
+              "BLOCK_SIZE_K // group_size must be one of [1, 2, 4, 8]");
+
+  // Number of sorted token slots to cover; capped when there are few tokens.
+  int64_t EM = sorted_token_ids.size(0);
+  if (size_m <= BLOCK_SIZE_M) {
+    EM = min(EM, size_m * BLOCK_SIZE_M * top_k);
+  }
+  const int num_token_blocks = (EM + BLOCK_SIZE_M - 1) / BLOCK_SIZE_M;
+
+  // Optional inputs: pass nullptr (never an indeterminate pointer) when the
+  // tensor is absent; the kernel does not read them in that case.
+  const uint32_t* b_qzeros_ptr = nullptr;
+  if (b_qzeros.has_value())
+    b_qzeros_ptr = (const uint32_t*)b_qzeros.value().data_ptr<uint8_t>();
+  const float* topk_weights_ptr = nullptr;
+  if (topk_weights.has_value())
+    topk_weights_ptr = (const float*)topk_weights.value().data_ptr();
+
+  if (input.scalar_type() == at::ScalarType::Half) {
+    run_moe_wna16_gemm<half>(
+        (const half*)input.data_ptr<at::Half>(),
+        (half*)output.data_ptr<at::Half>(),
+        (const uint32_t*)b_qweight.data_ptr<uint8_t>(),
+        (const half*)b_scales.data_ptr<at::Half>(), b_qzeros_ptr,
+        topk_weights_ptr, sorted_token_ids.data_ptr<int32_t>(),
+        expert_ids.data_ptr<int32_t>(), num_tokens_post_pad.data_ptr<int32_t>(),
+        num_experts, group_size, num_token_blocks, top_k, size_m, size_n,
+        size_k, BLOCK_SIZE_M, BLOCK_SIZE_N, BLOCK_SIZE_K, bit,
+        b_qzeros.has_value(), topk_weights.has_value());
+  } else if (input.scalar_type() == at::ScalarType::BFloat16) {
+    run_moe_wna16_gemm<nv_bfloat16>(
+        (const nv_bfloat16*)input.data_ptr<at::BFloat16>(),
+        (nv_bfloat16*)output.data_ptr<at::BFloat16>(),
+        (const uint32_t*)b_qweight.data_ptr<uint8_t>(),
+        (const nv_bfloat16*)b_scales.data_ptr<at::BFloat16>(), b_qzeros_ptr,
+        topk_weights_ptr, sorted_token_ids.data_ptr<int32_t>(),
+        expert_ids.data_ptr<int32_t>(), num_tokens_post_pad.data_ptr<int32_t>(),
+        num_experts, group_size, num_token_blocks, top_k, size_m, size_n,
+        size_k, BLOCK_SIZE_M, BLOCK_SIZE_N, BLOCK_SIZE_K, bit,
+        b_qzeros.has_value(), topk_weights.has_value());
+  } else {
+    TORCH_CHECK(false, "moe_wna16_gemm only supports bfloat16 and float16");
+  }
+  return output;
+}
--- a/csrc/moe/moe_wna16_utils.h
+++ b/csrc/moe/moe_wna16_utils.h
+
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+
+template <typename scalar_t>
+class ScalarType {};  // primary template is intentionally empty; only the explicit specializations carry members
+
+// ScalarType<half>: uniform names for the fp16 conversion intrinsics so that
+// templated kernels can be written once for half and nv_bfloat16.
+template <>
+class ScalarType<half> {
+ public:
+  using scalar_t = half;
+  using scalar_t2 = half2;
+
+  // half -> float.
+  static __device__ float inline num2float(const half x) {
+    return __half2float(x);
+  }
+
+  // Broadcast one half into both lanes of a half2.
+  static __device__ half2 inline num2num2(const half x) {
+    return __half2half2(x);
+  }
+
+  // Pack two halves into a half2 (x1 -> low lane, x2 -> high lane).
+  static __device__ half2 inline nums2num2(const half x1, const half x2) {
+    return __halves2half2(x1, x2);
+  }
+
+  // float -> half.
+  static __host__ __device__ half inline float2num(const float x) {
+    return __float2half(x);
+  }
+
+  // Integer -> half, round-to-nearest. The parameter is an int to match
+  // __int2half_rn; it used to be declared float, which forced a needless
+  // int -> float -> int round trip at every call.
+  static __host__ __device__ half inline int2num(const int x) {
+    return __int2half_rn(x);
+  }
+
+  // half2 -> float2, lane by lane.
+  static __host__ __device__ float2 inline num22float2(const half2 x) {
+    return __half22float2(x);
+  }
+
+  // float2 -> half2, lane by lane, round-to-nearest.
+  static __host__ __device__ half2 inline float22num2(const float2 x) {
+    return __float22half2_rn(x);
+  }
+};
+
+// ScalarType<nv_bfloat16>: uniform names for the bf16 conversion intrinsics.
+// The member functions are only declared in device compilation passes for
+// compute capability 8.0 or newer; in every other pass just the two type
+// aliases exist.
+template <>
+class ScalarType<nv_bfloat16> {
+ public:
+  using scalar_t = nv_bfloat16;
+  using scalar_t2 = nv_bfloat162;
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
+  // bfloat16 -> float.
+  static __device__ float inline num2float(const nv_bfloat16 x) {
+    return __bfloat162float(x);
+  }
+
+  // Broadcast one bfloat16 into both lanes of a bfloat162.
+  static __device__ nv_bfloat162 inline num2num2(const nv_bfloat16 x) {
+    return __bfloat162bfloat162(x);
+  }
+
+  // Pack two bfloat16 values into a bfloat162 (x1 -> low lane, x2 -> high).
+  static __device__ nv_bfloat162 inline nums2num2(const nv_bfloat16 x1,
+                                                  const nv_bfloat16 x2) {
+    return __halves2bfloat162(x1, x2);
+  }
+
+  // float -> bfloat16.
+  static __host__ __device__ nv_bfloat16 inline float2num(const float x) {
+    return __float2bfloat16(x);
+  }
+
+  // Integer -> bfloat16, round-to-nearest. The parameter is an int to match
+  // __int2bfloat16_rn; it used to be declared float, which forced a needless
+  // int -> float -> int round trip at every call.
+  static __host__ __device__ nv_bfloat16 inline int2num(const int x) {
+    return __int2bfloat16_rn(x);
+  }
+
+  // bfloat162 -> float2, lane by lane.
+  static __host__ __device__ float2 inline num22float2(const nv_bfloat162 x) {
+    return __bfloat1622float2(x);
+  }
+
+  // float2 -> bfloat162, lane by lane, round-to-nearest.
+  static __host__ __device__ nv_bfloat162 inline float22num2(const float2 x) {
+    return __float22bfloat162_rn(x);
+  }
+#endif
+};
+
+template <int lut>  // lut: truth-table immediate of the PTX lop3 instruction
+__device__ inline int lop3(int a, int b, int c) {  // returns the bitwise function of (a, b, c) selected by lut
+  int res;
+  asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n"
+               : "=r"(res)
+               : "r"(a), "r"(b), "r"(c), "n"(lut));  // "n": lut is passed as a compile-time immediate
+  return res;
+}
+
+template <int start_byte, int mask>  // start_byte: second source word; mask: byte selector (both immediates)
+__device__ inline uint32_t prmt(uint32_t a) {  // byte permute over the pair {a, start_byte} via PTX prmt.b32
+  uint32_t res;
+  asm volatile("prmt.b32 %0, %1, %2, %3;\n"
+               : "=r"(res)
+               : "r"(a), "n"(start_byte), "n"(mask));  // operand order: source a, source b, selector
+  return res;
+}
+
+template <typename scalar_t2, int bit>
+__device__ inline void dequant(int q, scalar_t2* res) {}  // unspecialized fallback: leaves res untouched
+
+template <>
+__device__ inline void dequant<half2, 4>(int q, half2* res) {  // 8 packed 4-bit values in q -> res[0..3]
+  const int LO = 0x000f000f;  // bits 0-3 of each 16-bit half
+  const int HI = 0x00f000f0;  // bits 4-7 of each 16-bit half
+  const int EX = 0x64006400;  // fp16 1024.0 in both lanes
+  const int SUB = 0x64006400;  // fp16 1024.0: removes the bias OR-ed in through EX
+  const int MUL = 0x2c002c00;  // fp16 1/16: HI nibbles sit 4 bits up, i.e. scaled by 16
+  const int ADD = 0xd400d400;  // fp16 -64.0 = -(1024 / 16)
+  // with lut (0xf0 & 0xcc) | 0xaa, lop3 computes (q & mask) | EX: fp16 1024 + masked bits per lane
+  int lo0 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
+  int hi0 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
+  q >>= 8;
+  int lo1 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
+  int hi1 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
+  // res[i] = (nibble i, nibble i + 4) of the original q, low lane first
+  res[0] = __hsub2(*reinterpret_cast<half2*>(&lo0),
+                   *reinterpret_cast<const half2*>(&SUB));
+  res[1] = __hfma2(*reinterpret_cast<half2*>(&hi0),
+                   *reinterpret_cast<const half2*>(&MUL),
+                   *reinterpret_cast<const half2*>(&ADD));
+  res[2] = __hsub2(*reinterpret_cast<half2*>(&lo1),
+                   *reinterpret_cast<const half2*>(&SUB));
+  res[3] = __hfma2(*reinterpret_cast<half2*>(&hi1),
+                   *reinterpret_cast<const half2*>(&MUL),
+                   *reinterpret_cast<const half2*>(&ADD));
+}
+
+template <>
+__device__ inline void dequant<half2, 8>(int q, half2* res) {  // 4 packed 8-bit values in q -> res[0..1]
+  static constexpr uint32_t mask_for_elt_01 = 0x5250;  // selector: bytes 0 and 2 of q, each below a 0x64 byte
+  static constexpr uint32_t mask_for_elt_23 = 0x5351;  // selector: bytes 1 and 3 of q, each below a 0x64 byte
+  static constexpr uint32_t start_byte_for_fp16 = 0x64646464;  // 0x64 as high byte gives fp16 1024 + byte
+  // each 16-bit lane of lo / hi becomes 0x64XX, XX being one byte of q
+  uint32_t lo = prmt<start_byte_for_fp16, mask_for_elt_01>(q);
+  uint32_t hi = prmt<start_byte_for_fp16, mask_for_elt_23>(q);
+
+  static constexpr uint32_t I8s_TO_F16s_MAGIC_NUM = 0x64006400;  // fp16 1024.0 in both lanes
+  // subtracting 1024 leaves the byte value: res[0] = (byte 0, byte 2), res[1] = (byte 1, byte 3)
+  res[0] = __hsub2(*reinterpret_cast<half2*>(&lo),
+                   *reinterpret_cast<const half2*>(&I8s_TO_F16s_MAGIC_NUM));
+  res[1] = __hsub2(*reinterpret_cast<half2*>(&hi),
+                   *reinterpret_cast<const half2*>(&I8s_TO_F16s_MAGIC_NUM));
+}
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
+template <>
+__device__ inline void dequant<nv_bfloat162, 4>(int q, nv_bfloat162* res) {  // 8 packed 4-bit values -> res[0..3]
+  static constexpr uint32_t MASK = 0x000f000f;  // bits 0-3 of each 16-bit half
+  static constexpr uint32_t EX = 0x43004300;  // bf16 128.0 in both lanes
+  // with lut (0xf0 & 0xcc) | 0xaa, lop3 computes (q & MASK) | EX: bf16 128 + nibble per lane
+  int lo0 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX);
+  q >>= 4;
+  int hi0 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX);
+  q >>= 4;
+  int lo1 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX);
+  q >>= 4;
+  int hi1 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX);
+
+  static constexpr uint32_t MUL = 0x3F803F80;  // bf16 1.0 in both lanes
+  static constexpr uint32_t ADD = 0xC300C300;  // bf16 -128.0: removes the bias OR-ed in through EX
+  // res[i] = (nibble i, nibble i + 4) of the original q, low lane first
+  res[0] = __hfma2(*reinterpret_cast<nv_bfloat162*>(&lo0),
+                   *reinterpret_cast<const nv_bfloat162*>(&MUL),
+                   *reinterpret_cast<const nv_bfloat162*>(&ADD));
+  res[1] = __hfma2(*reinterpret_cast<nv_bfloat162*>(&hi0),
+                   *reinterpret_cast<const nv_bfloat162*>(&MUL),
+                   *reinterpret_cast<const nv_bfloat162*>(&ADD));
+  res[2] = __hfma2(*reinterpret_cast<nv_bfloat162*>(&lo1),
+                   *reinterpret_cast<const nv_bfloat162*>(&MUL),
+                   *reinterpret_cast<const nv_bfloat162*>(&ADD));
+  res[3] = __hfma2(*reinterpret_cast<nv_bfloat162*>(&hi1),
+                   *reinterpret_cast<const nv_bfloat162*>(&MUL),
+                   *reinterpret_cast<const nv_bfloat162*>(&ADD));
+}
+
+template <>
+__device__ inline void dequant<nv_bfloat162, 8>(int q, nv_bfloat162* res) {  // 4 packed 8-bit values -> res[0..1]
+  float fp32_intermediates[4];
+  uint32_t* fp32_intermediates_casted =
+      reinterpret_cast<uint32_t*>(fp32_intermediates);
+  // 0x4B000000 is fp32 8388608.0 (2^23); placing a byte in its low mantissa bits yields 2^23 + byte
+  static constexpr uint32_t fp32_base = 0x4B000000;
+  fp32_intermediates_casted[0] = __byte_perm(q, fp32_base, 0x7650);  // byte 0 of q
+  fp32_intermediates_casted[1] = __byte_perm(q, fp32_base, 0x7652);  // byte 2 of q
+  fp32_intermediates_casted[2] = __byte_perm(q, fp32_base, 0x7651);  // byte 1 of q
+  fp32_intermediates_casted[3] = __byte_perm(q, fp32_base, 0x7653);  // byte 3 of q
+  // remove the 2^23 bias so each float holds the plain byte value
+  fp32_intermediates[0] -= 8388608.f;
+  fp32_intermediates[1] -= 8388608.f;
+  fp32_intermediates[2] -= 8388608.f;
+  fp32_intermediates[3] -= 8388608.f;
+  // keep the upper 16 bits of each float as bf16: res[0] = (byte 0, byte 2), res[1] = (byte 1, byte 3)
+  uint32_t* bf16_result_ptr = reinterpret_cast<uint32_t*>(res);
+  bf16_result_ptr[0] = __byte_perm(fp32_intermediates_casted[0],
+                                   fp32_intermediates_casted[1], 0x7632);
+  bf16_result_ptr[1] = __byte_perm(fp32_intermediates_casted[2],
+                                   fp32_intermediates_casted[3], 0x7632);
+}
+#endif
--- a/csrc/moe/torch_bindings.cpp
+++ b/csrc/moe/torch_bindings.cpp
@@ -40,6 +40,16 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
  m.impl("sgl_moe_align_block_size", torch::kCUDA, &sgl_moe_align_block_size);

 #ifndef USE_ROCM
+  m.def(
+      "moe_wna16_gemm(Tensor input, Tensor! output, Tensor b_qweight, "
+      "Tensor b_scales, Tensor? b_qzeros, "
+      "Tensor? topk_weights, Tensor sorted_token_ids, "
+      "Tensor expert_ids, Tensor num_tokens_post_pad, "
+      "int top_k, int BLOCK_SIZE_M, int BLOCK_SIZE_N, int BLOCK_SIZE_K, "
+      "int bit) -> Tensor");
+
+  m.impl("moe_wna16_gemm", torch::kCUDA, &moe_wna16_gemm);
+
  m.def(
      "marlin_gemm_moe(Tensor! a, Tensor! b_q_weights, Tensor! sorted_ids, "
      "Tensor! topk_weights, Tensor! topk_ids, Tensor! b_scales, Tensor! "
@@ -50,6 +60,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
      "int moe_block_size, bool replicate_input, bool apply_weights)"
      " -> Tensor");
  // conditionally compiled so impl registration is in source file
+
 #endif
 }


--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -292,10 +292,25 @@ torch::Tensor ggml_mul_mat_vec_a8(torch::Tensor W, torch::Tensor X,
 torch::Tensor ggml_mul_mat_a8(torch::Tensor W, torch::Tensor X, int64_t type,
                              int64_t row);

+torch::Tensor ggml_moe_a8(torch::Tensor X, torch::Tensor W,
+                          torch::Tensor sorted_token_ids,
+                          torch::Tensor expert_ids,
+                          torch::Tensor num_tokens_post_padded, int64_t type,
+                          int64_t row, int64_t top_k, int64_t tokens);
+
+int64_t ggml_moe_get_block_size(int64_t type);
+
 #ifndef USE_ROCM
+
+bool cutlass_scaled_mm_supports_fp4(int64_t cuda_device_capability);
 bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability);
 bool cutlass_scaled_mm_supports_block_fp8(int64_t cuda_device_capability);

+void cutlass_scaled_fp4_mm(torch::Tensor& D, torch::Tensor const& A,
+                           torch::Tensor const& B, torch::Tensor const& A_sf,
+                           torch::Tensor const& B_sf,
+                           torch::Tensor const& alpha);
+
 void cutlass_scaled_mm(torch::Tensor& out, torch::Tensor const& a,
                       torch::Tensor const& b, torch::Tensor const& a_scales,
                       torch::Tensor const& b_scales,
@@ -377,7 +392,6 @@ void causal_conv1d_fwd(const at::Tensor& x, const at::Tensor& weight,
                       const std::optional<at::Tensor>& has_initial_state,
                       bool silu_activation, int64_t pad_slot_id);

-#ifndef USE_ROCM
 using fptr_t = int64_t;
 fptr_t init_custom_ar(const std::vector<int64_t>& fake_ipc_ptrs,
                      torch::Tensor& rank_data, int64_t rank, bool full_nvlink);
@@ -391,4 +405,9 @@ get_graph_buffer_ipc_meta(fptr_t _fa);
 void register_graph_buffers(fptr_t _fa,
                            const std::vector<std::vector<int64_t>>& handles,
                            const std::vector<std::vector<int64_t>>& offsets);
-#endif
+
+std::tuple<int64_t, torch::Tensor> allocate_shared_buffer_and_handle(
+    int64_t size);
+    int64_t open_mem_handle(torch::Tensor& mem_handle);
+    void free_shared_buffer(int64_t buffer);
+                              
--- a/csrc/prepare_inputs/advance_step.cu
+++ b/csrc/prepare_inputs/advance_step.cu
@@ -274,7 +274,7 @@ void advance_step_flashinfer(
  cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev);
  cudaDeviceGetAttribute(&threads, cudaDevAttrMaxThreadsPerBlock, dev);

-  int block_tables_stride = block_tables.stride(0);
+  [[maybe_unused]] int block_tables_stride = block_tables.stride(0);
  TORCH_CHECK((blocks * threads > num_queries),
              "multi-step: not enough threads to map to num_queries = ",
              num_queries, " block_tables.stride(0) = ", block_tables.stride(0),

--- a/csrc/quantization/cutlass_w8a8/c3x/cutlass_gemm_caller.cuh
+++ b/csrc/quantization/cutlass_w8a8/c3x/cutlass_gemm_caller.cuh
@@ -16,6 +16,7 @@
 #include "cutlass/gemm/kernel/gemm_universal.hpp"
 #include "cutlass/epilogue/collective/collective_builder.hpp"
 #include "cutlass/gemm/collective/collective_builder.hpp"
+#include "cutlass/util/packed_stride.hpp"

 #include "core/math.hpp"
 #include "cutlass_extensions/common.hpp"
@@ -30,12 +31,18 @@ static inline cute::Shape<int, int, int, int> get_problem_shape(
 }

 template <typename GemmKernel>
-void cutlass_gemm_caller(torch::Device device,
-                         cute::Shape<int, int, int, int> prob_shape,
-                         typename GemmKernel::MainloopArguments mainloop_args,
-                         typename GemmKernel::EpilogueArguments epilogue_args) {
+void cutlass_gemm_caller(
+    torch::Device device, cute::Shape<int, int, int, int> prob_shape,
+    typename GemmKernel::MainloopArguments mainloop_args,
+    typename GemmKernel::EpilogueArguments epilogue_args,
+    typename GemmKernel::TileSchedulerArguments scheduler = {}) {
+  cutlass::KernelHardwareInfo hw_info;
  typename GemmKernel::Arguments args{cutlass::gemm::GemmUniversalMode::kGemm,
-                                      prob_shape, mainloop_args, epilogue_args};
+                                      prob_shape,
+                                      mainloop_args,
+                                      epilogue_args,
+                                      hw_info,
+                                      scheduler};

  // Launch the CUTLASS GEMM kernel.
  using GemmOp = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
@@ -58,22 +65,28 @@ void cutlass_gemm_caller(torch::Tensor& out, torch::Tensor const& a,
                         torch::Tensor const& b,
                         EpilogueArgs&&... epilogue_params) {
  using ElementAB = typename Gemm::ElementAB;
+  using ElementC = typename Gemm::ElementC;
  using ElementD = typename Gemm::ElementD;
  using GemmKernel = typename Gemm::GemmKernel;

-  int64_t lda = a.stride(0);
-  int64_t ldb = b.stride(1);
-  int64_t ldc = out.stride(0);
-
-  using StrideA = cute::Stride<int64_t, cute::Int<1>, int64_t>;
-  using StrideB = cute::Stride<int64_t, cute::Int<1>, int64_t>;
-  using StrideC = typename Gemm::StrideC;
-
-  StrideA a_stride{lda, cute::Int<1>{}, 0};
-  StrideB b_stride{ldb, cute::Int<1>{}, 0};
-  StrideC c_stride{ldc, cute::Int<1>{}, cute::Int<0>{}};
+  using StrideA = typename Gemm::GemmKernel::StrideA;
+  using StrideB = typename Gemm::GemmKernel::StrideB;
+  using StrideC = typename Gemm::GemmKernel::StrideC;
+  using StrideD = StrideC;
+  using StrideAux = StrideC;

  typename GemmKernel::ProblemShape prob_shape = get_problem_shape(a, b);
+  auto [M, N, K, L] = prob_shape;
+
+  StrideA a_stride =
+      cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(M, K, L));
+  StrideB b_stride =
+      cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(N, K, L));
+  StrideC c_stride =
+      cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L));
+  StrideD d_stride =
+      cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L));
+  StrideAux aux_stride = d_stride;

  auto a_ptr = static_cast<ElementAB*>(a.data_ptr());
  auto b_ptr = static_cast<ElementAB*>(b.data_ptr());
@@ -81,10 +94,11 @@ void cutlass_gemm_caller(torch::Tensor& out, torch::Tensor const& a,
                                                       b_stride};

  auto c_ptr = static_cast<ElementD*>(out.data_ptr());
+  // auto d_ptr = static_cast<ElementC*>(out.data_ptr());
  typename GemmKernel::EpilogueArguments epilogue_args{
      Gemm::Epilogue::prepare_args(
          std::forward<EpilogueArgs>(epilogue_params)...),
-      c_ptr, c_stride, c_ptr, c_stride};
+      c_ptr, c_stride, c_ptr, d_stride};

  cutlass_gemm_caller<GemmKernel>(a.device(), prob_shape, mainloop_args,
                                  epilogue_args);

--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm.cuh
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm.cuh
@@ -40,12 +40,7 @@ struct cutlass_3x_gemm {
      typename std::conditional<std::is_same_v<ElementAB, int8_t>, int32_t,
                                float>::type;

-  using EpilogueDescriptor =
-      cutlass::epilogue::collective::detail::EpilogueDescriptor<
-          TileShape, cutlass::epilogue::collective::EpilogueTileAuto, ElementD,
-          ElementD, EpilogueSchedule>;
-
-  using Epilogue = Epilogue_<ElementAcc, ElementD, EpilogueDescriptor>;
+  using Epilogue = Epilogue_<ElementAcc, ElementD, TileShape>;

  using StrideD = Stride<int64_t, Int<1>, Int<0>>;
  using ElementC = void;
@@ -88,4 +83,65 @@ struct cutlass_3x_gemm {
  struct GemmKernel : public KernelType {};
 };

+template <typename ElementAB_, typename ElementD_,
+          template <typename, typename, typename> typename Epilogue_,
+          typename TileShape, typename ClusterShape, typename KernelSchedule,
+          typename EpilogueSchedule>
+struct cutlass_3x_gemm_sm100 {  // type bundle that assembles a CUTLASS GemmUniversal kernel for arch tag Sm100
+  using ElementAB = ElementAB_;
+  using LayoutA = cutlass::layout::RowMajor;
+  static constexpr int AlignmentA =
+      128 / cutlass::sizeof_bits<ElementAB>::value;  // number of elements in 128 bits
+
+  using LayoutB = cutlass::layout::ColumnMajor;
+  static constexpr int AlignmentB =
+      128 / cutlass::sizeof_bits<ElementAB>::value;  // number of elements in 128 bits
+
+  using ElementC = void;  // presumably means "no C source operand" to the epilogue builder - confirm against CUTLASS
+  using LayoutC = cutlass::layout::RowMajor;
+  static constexpr int AlignmentC =
+      128 / cutlass::sizeof_bits<ElementD_>::value;  // derived from the D element type since ElementC is void
+
+  using ElementD = ElementD_;
+  using LayoutD = cutlass::layout::RowMajor;
+  static constexpr int AlignmentD = AlignmentC;
+
+  using ElementAcc =
+      typename std::conditional<std::is_same_v<ElementAB, int8_t>, int32_t,
+                                float>::type;  // int32 for int8 inputs, float otherwise
+  using Epilogue = Epilogue_<ElementAcc, ElementD, TileShape>;
+  // NOTE(review): ElementAcc only feeds Epilogue_; both collective builders below use ElementAccumulator (float) - confirm for int8
+  // MMA type
+  using ElementAccumulator = float;
+
+  // Epilogue types (ElementBias, ElementAux, LayoutAux and ElementAmax are not referenced again inside this struct)
+  using ElementBias = cutlass::half_t;
+  using ElementCompute = float;
+  using ElementAux = ElementD;
+  using LayoutAux = LayoutD;
+  using ElementAmax = float;
+
+  using EVTCompute = typename Epilogue::EVTCompute;  // fusion type supplied by the Epilogue_ template argument
+
+  using CollectiveEpilogue =
+      typename cutlass::epilogue::collective::CollectiveBuilder<
+          cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, TileShape,
+          ClusterShape, cutlass::epilogue::collective::EpilogueTileAuto,
+          ElementAccumulator, ElementCompute, ElementC, LayoutC, AlignmentC,
+          ElementD, LayoutD, AlignmentD, EpilogueSchedule,
+          EVTCompute>::CollectiveOp;
+
+  using CollectiveMainloop =
+      typename cutlass::gemm::collective::CollectiveBuilder<
+          cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, ElementAB,
+          LayoutA, AlignmentA, ElementAB, LayoutB, AlignmentB,
+          ElementAccumulator, TileShape, ClusterShape,
+          cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(
+              sizeof(typename CollectiveEpilogue::SharedStorage))>,  // passes the epilogue's shared-storage size to the stage-count policy
+          KernelSchedule>::CollectiveOp;
+
+  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
+      Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue, void>;  // last argument void: presumably the default tile scheduler - confirm
+};
+
 }  // namespace vllm
--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh
@@ -22,8 +22,9 @@ namespace vllm {

 using namespace cute;

-template <typename OutType, int GroupSizeM_, int GroupSizeN_, int GroupSizeK_,
-          int TileSizeM_ = 128, class ClusterShape = Shape<_1, _2, _1>>
+template <typename SchedulerType, typename OutType, int GroupSizeM_,
+          int GroupSizeN_, int GroupSizeK_, int TileSizeM_ = 128,
+          class ClusterShape = Shape<_1, _2, _1>>
 struct cutlass_3x_gemm_fp8_blockwise {
  using GroupSizeM = Int<GroupSizeM_>;
  using GroupSizeN = Int<GroupSizeN_>;
@@ -84,7 +85,7 @@ struct cutlass_3x_gemm_fp8_blockwise {

  using KernelType = enable_sm90_or_later<cutlass::gemm::kernel::GemmUniversal<
      Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue,
-      cutlass::gemm::PersistentScheduler>>;
+      SchedulerType>>;

  struct GemmKernel : public KernelType {};

@@ -150,8 +151,24 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,
  typename GemmKernel::EpilogueArguments epilogue_args{
      {}, c_ptr, c_stride, c_ptr, c_stride};

+  typename GemmKernel::TileSchedulerArguments scheduler;
+
+  static constexpr bool UsesStreamKScheduler =
+      cute::is_same_v<typename GemmKernel::TileSchedulerTag,
+                      cutlass::gemm::StreamKScheduler>;
+
+  if constexpr (UsesStreamKScheduler) {
+    using DecompositionMode = typename cutlass::gemm::kernel::detail::
+        PersistentTileSchedulerSm90StreamKParams::DecompositionMode;
+    using ReductionMode = typename cutlass::gemm::kernel::detail::
+        PersistentTileSchedulerSm90StreamKParams::ReductionMode;
+
+    scheduler.decomposition_mode = DecompositionMode::StreamK;
+    scheduler.reduction_mode = ReductionMode::Nondeterministic;
+  }
+
  c3x::cutlass_gemm_caller<GemmKernel>(a.device(), prob_shape, mainloop_args,
-                                       epilogue_args);
+                                       epilogue_args, scheduler);
 }

 template <typename OutType>
@@ -160,9 +177,18 @@ void cutlass_gemm_blockwise_sm90_fp8_dispatch(torch::Tensor& out,
                                              torch::Tensor const& b,
                                              torch::Tensor const& a_scales,
                                              torch::Tensor const& b_scales) {
-  cutlass_gemm_caller_blockwise<
-      cutlass_3x_gemm_fp8_blockwise<OutType, 1, 128, 128>>(out, a, b, a_scales,
-                                                           b_scales);
+  auto k = a.size(1);
+  auto n = b.size(1);
+
+  if (k > 3 * n) {
+    cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
+        cutlass::gemm::StreamKScheduler, OutType, 1, 128, 128>>(
+        out, a, b, a_scales, b_scales);
+  } else {
+    cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
+        cutlass::gemm::PersistentScheduler, OutType, 1, 128, 128>>(
+        out, a, b, a_scales, b_scales);
+  }
 }

 }  // namespace vllm
\ No newline at end of file
--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_kernels.hpp
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_kernels.hpp
@@ -30,4 +30,10 @@ void cutlass_scaled_mm_blockwise_sm90_fp8(torch::Tensor& out,
                                          torch::Tensor const& a_scales,
                                          torch::Tensor const& b_scales);

+void cutlass_scaled_mm_sm100_fp8(torch::Tensor& out, torch::Tensor const& a,
+                                 torch::Tensor const& b,
+                                 torch::Tensor const& a_scales,
+                                 torch::Tensor const& b_scales,
+                                 std::optional<torch::Tensor> const& bias);
+
 }  // namespace vllm
--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu
+#include "scaled_mm_kernels.hpp"
+#include "scaled_mm_sm100_fp8_dispatch.cuh"
+#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp"
+
+namespace vllm {
+
+// FP8 scaled-mm entry point for SM100: chooses the epilogue depending on
+// whether a bias tensor was supplied, then forwards to the dtype dispatcher.
+void cutlass_scaled_mm_sm100_fp8(torch::Tensor& out, torch::Tensor const& a,
+                                 torch::Tensor const& b,
+                                 torch::Tensor const& a_scales,
+                                 torch::Tensor const& b_scales,
+                                 std::optional<torch::Tensor> const& bias) {
+  TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous());
+
+  // No bias supplied: plain scaled epilogue.
+  if (!bias) {
+    cutlass_scaled_mm_sm100_fp8_epilogue<c3x::ScaledEpilogue>(
+        out, a, b, a_scales, b_scales);
+    return;
+  }
+
+  // Bias path: the bias dtype has to agree with the output dtype.
+  TORCH_CHECK(bias->dtype() == out.dtype(),
+              "currently bias dtype must match output dtype ", out.dtype());
+  cutlass_scaled_mm_sm100_fp8_epilogue<c3x::ScaledEpilogueBias>(
+      out, a, b, a_scales, b_scales, *bias);
+}
+
+}  // namespace vllm
--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh
+#pragma once
+
+#include "scaled_mm.cuh"
+#include "cutlass_gemm_caller.cuh"
+
+/**
+ * This file defines Gemm kernel configurations for SM100 (fp8) based on the
+ * Gemm shape.
+ */
+
+namespace vllm {
+
+using c3x::cutlass_gemm_caller;
+
+// Default SM100 FP8 kernel configuration: fixed tile and cluster shapes
+// combined with the "Auto" mainloop and epilogue schedule tags.
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue>
+struct sm100_fp8_config_default {
+  static_assert(std::is_same_v<InType, cutlass::float_e4m3_t>);
+
+  // Shapes
+  using ClusterShape = Shape<_2, _2, _1>;
+  using TileShape = Shape<_256, _128, _64>;
+
+  // Schedules
+  using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
+  using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
+
+  // Fully assembled GEMM type consumed by the dispatcher.
+  using Cutlass3xGemm =
+      cutlass_3x_gemm_sm100<InType, OutType, Epilogue, TileShape, ClusterShape,
+                            KernelSchedule, EpilogueSchedule>;
+};
+
+// Launches the default SM100 FP8 GEMM configuration for the given
+// input/output element types, forwarding the epilogue arguments unchanged.
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue,
+          typename... EpilogueArgs>
+inline void cutlass_gemm_sm100_fp8_dispatch(torch::Tensor& out,
+                                            torch::Tensor const& a,
+                                            torch::Tensor const& b,
+                                            EpilogueArgs&&... args) {
+  static_assert(std::is_same_v<InType, cutlass::float_e4m3_t>);
+  TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn);
+  TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn);
+
+  // Only one configuration exists so far; resolve it in two steps.
+  using DefaultConfig = sm100_fp8_config_default<InType, OutType, Epilogue>;
+  using DefaultGemm = typename DefaultConfig::Cutlass3xGemm;
+
+  cutlass_gemm_caller<DefaultGemm>(out, a, b,
+                                   std::forward<EpilogueArgs>(args)...);
+}
+
+// Selects the output element type (bfloat16 or float16) from `out` and
+// dispatches the FP8 GEMM with the requested epilogue. Any other output
+// dtype fails the float16 check below.
+template <template <typename, typename, typename> typename Epilogue,
+          typename... EpilogueArgs>
+void cutlass_scaled_mm_sm100_fp8_epilogue(torch::Tensor& out,
+                                          torch::Tensor const& a,
+                                          torch::Tensor const& b,
+                                          EpilogueArgs&&... epilogue_args) {
+  TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn);
+  TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn);
+
+  // bfloat16 output.
+  if (out.dtype() == torch::kBFloat16) {
+    cutlass_gemm_sm100_fp8_dispatch<cutlass::float_e4m3_t, cutlass::bfloat16_t,
+                                    Epilogue>(
+        out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
+    return;
+  }
+
+  // Otherwise the output must be float16.
+  TORCH_CHECK(out.dtype() == torch::kFloat16);
+  cutlass_gemm_sm100_fp8_dispatch<cutlass::float_e4m3_t, cutlass::half_t,
+                                  Epilogue>(
+      out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
+}
+
+}  // namespace vllm
\ No newline at end of file
--- a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu
+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu
+#include <cudaTypedefs.h>
+#include "c3x/scaled_mm_kernels.hpp"
+
+#include "cuda_utils.h"
+
+/*
+   This file defines quantized GEMM operations using the CUTLASS 3.x API, for
+   NVIDIA GPUs with sm100 (Blackwell).
+*/
+
+#if defined ENABLE_SCALED_MM_SM100 && ENABLE_SCALED_MM_SM100
+
+// Blackwell (sm100) entry point for scaled fp8 GEMM.
+//
+// Accepts float32 scale tensors that hold either a single value or one
+// value per row of `a` / per column of `b`, and only fp8 (e4m3) `a`.
+// Anything else is rejected before dispatching to
+// vllm::cutlass_scaled_mm_sm100_fp8, which writes the result into `c`.
+void cutlass_scaled_mm_sm100(torch::Tensor& c, torch::Tensor const& a,
+                             torch::Tensor const& b,
+                             torch::Tensor const& a_scales,
+                             torch::Tensor const& b_scales,
+                             std::optional<torch::Tensor> const& bias) {
+  TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
+  TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
+
+  // Scale tensors with any other element count (e.g. block scaling) have no
+  // kernel on this path.
+  TORCH_CHECK(
+      (a_scales.numel() == 1 || a_scales.numel() == a.size(0)) &&
+          (b_scales.numel() == 1 || b_scales.numel() == b.size(1)),
+      "Currently, block scaled fp8 gemm is not implemented for Blackwell");
+
+  // Standard per-tensor/per-token/per-channel scaling
+  TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous());
+  TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn,
+              "Currently, only fp8 gemm is implemented for Blackwell");
+  vllm::cutlass_scaled_mm_sm100_fp8(c, a, b, a_scales, b_scales, bias);
+}
+
+#endif
--- a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu
+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu
 #include <cudaTypedefs.h>
 #include "c3x/scaled_mm_kernels.hpp"

-#include "core/math.hpp"
+#include "cuda_utils.h"

 /*
   This file defines quantized GEMM operations using the CUTLASS 3.x API, for
-   NVIDIA GPUs with sm90a (Hopper) or later.
+   NVIDIA GPUs with sm90a (Hopper).
 */

+#if defined ENABLE_SCALED_MM_SM90 && ENABLE_SCALED_MM_SM90
+
 void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a,
                            torch::Tensor const& b,
                            torch::Tensor const& a_scales,
@@ -33,7 +35,8 @@ void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a,
    auto make_group_shape = [](torch::Tensor const& x,
                               torch::Tensor const& s) -> GroupShape {
      TORCH_CHECK(s.dim() == 2, "cutlass_scaled_mm group scales must be 2D");
-      return {ceil_div(x.size(0), s.size(0)), ceil_div(x.size(1), s.size(1))};
+      return {cuda_utils::ceil_div(x.size(0), s.size(0)),
+              cuda_utils::ceil_div(x.size(1), s.size(1))};
    };

    GroupShape a_scale_group_shape = make_group_shape(a, a_scales);
@@ -70,3 +73,5 @@ void cutlass_scaled_mm_azp_sm90(torch::Tensor& out, torch::Tensor const& a,
  vllm::cutlass_scaled_mm_azp_sm90_int8(out, a, b, a_scales, b_scales, azp_adj,
                                        azp, bias);
 }
+
+#endif
--- a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
@@ -23,7 +23,7 @@ void cutlass_scaled_mm_sm89(torch::Tensor& c, torch::Tensor const& a,
                            torch::Tensor const& b_scales,
                            std::optional<torch::Tensor> const& bias);

-#if defined ENABLE_SCALED_MM_C3X && ENABLE_SCALED_MM_C3X
+#if defined ENABLE_SCALED_MM_SM90 && ENABLE_SCALED_MM_SM90
 void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a,
                            torch::Tensor const& b,
                            torch::Tensor const& a_scales,
@@ -31,6 +31,14 @@ void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a,
                            std::optional<torch::Tensor> const& bias);
 #endif

+#if defined ENABLE_SCALED_MM_SM100 && ENABLE_SCALED_MM_SM100
+void cutlass_scaled_mm_sm100(torch::Tensor& c, torch::Tensor const& a,
+                             torch::Tensor const& b,
+                             torch::Tensor const& a_scales,
+                             torch::Tensor const& b_scales,
+                             std::optional<torch::Tensor> const& bias);
+#endif
+
 void cutlass_scaled_mm_azp_sm75(torch::Tensor& c, torch::Tensor const& a,
                                torch::Tensor const& b,
                                torch::Tensor const& a_scales,
@@ -55,7 +63,7 @@ void cutlass_scaled_mm_azp_sm89(torch::Tensor& c, torch::Tensor const& a,
                                std::optional<torch::Tensor> const& azp,
                                std::optional<torch::Tensor> const& bias);

-#if defined CUDA_VERSION && CUDA_VERSION >= 12000
+#if defined ENABLE_SCALED_MM_SM90 && ENABLE_SCALED_MM_SM90
 void cutlass_scaled_mm_azp_sm90(torch::Tensor& c, torch::Tensor const& a,
                                torch::Tensor const& b,
                                torch::Tensor const& a_scales,
@@ -86,7 +94,7 @@ bool cutlass_scaled_mm_supports_block_fp8(int64_t cuda_device_capability) {
  // and at least SM90 (Hopper)

 #if defined CUDA_VERSION
-  if (cuda_device_capability >= 90) {
+  if (cuda_device_capability >= 90 && cuda_device_capability < 100) {
    return CUDA_VERSION >= 12000;
  }
 #endif
@@ -116,11 +124,18 @@ void cutlass_scaled_mm(torch::Tensor& c, torch::Tensor const& a,

  at::cuda::OptionalCUDAGuard const device_guard(device_of(a));
  int32_t version_num = get_sm_version_num();
-  // Hopper
+
+#if defined ENABLE_SCALED_MM_SM100 && ENABLE_SCALED_MM_SM100
+  if (version_num >= 100) {
+    cutlass_scaled_mm_sm100(c, a, b, a_scales, b_scales, bias);
+    return;
+  }
+#endif

  // Guard against compilation issues for sm90 kernels
-#if defined ENABLE_SCALED_MM_C3X && ENABLE_SCALED_MM_C3X
-  if (version_num >= 90) {
+#if defined ENABLE_SCALED_MM_SM90 && ENABLE_SCALED_MM_SM90
+  if (version_num >= 90 && version_num < 100) {
+    // Hopper
    cutlass_scaled_mm_sm90(c, a, b, a_scales, b_scales, bias);
    return;
  }
@@ -194,7 +209,7 @@ void cutlass_scaled_mm_azp(torch::Tensor& c, torch::Tensor const& a,

  int32_t version_num = get_sm_version_num();

-#if defined ENABLE_SCALED_MM_C3X && ENABLE_SCALED_MM_C3X
+#if defined ENABLE_SCALED_MM_SM90 && ENABLE_SCALED_MM_SM90
  if (version_num >= 90) {
    cutlass_scaled_mm_azp_sm90(c, a, b, a_scales, b_scales, azp_adj, azp, bias);
    return;

--- a/csrc/quantization/fp4/nvfp4_quant_kernels.cu
+++ b/csrc/quantization/fp4/nvfp4_quant_kernels.cu
@@ -348,10 +348,7 @@ void scaled_fp4_quant_sm100a(torch::Tensor const& output,
  auto sf_out = static_cast<int32_t*>(output_sf.data_ptr());
  auto output_ptr = static_cast<int64_t*>(output.data_ptr());
  at::cuda::CUDAGuard device_guard{(char)input.get_device()};
-  auto stream = at::cuda::getStreamFromPool(false, input.get_device());
-  if (stream == nullptr) {
-    std::cerr << "Warning: Null CUDA stream" << std::endl;
-  }
+  auto stream = at::cuda::getCurrentCUDAStream(input.get_device());

  // We don't support e8m0 scales at this moment.
  bool useUE8M0 = false;

--- a/csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu
+++ b/csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <torch/all.h>
+
+#if defined ENABLE_NVFP4 && ENABLE_NVFP4
+// Forward declaration of the SM100a NVFP4 GEMM. It is only declared when the
+// build defines ENABLE_NVFP4 to a non-zero value.
+void cutlass_scaled_fp4_mm_sm100a(torch::Tensor& D, torch::Tensor const& A,
+                                  torch::Tensor const& B,
+                                  torch::Tensor const& A_sf,
+                                  torch::Tensor const& B_sf,
+                                  torch::Tensor const& alpha);
+#endif
+
+// Public NVFP4 scaled-mm entry point. Forwards to the SM100a implementation
+// when the build enables ENABLE_NVFP4; otherwise raises a not-implemented
+// error.
+void cutlass_scaled_fp4_mm(torch::Tensor& D, torch::Tensor const& A,
+                           torch::Tensor const& B, torch::Tensor const& A_sf,
+                           torch::Tensor const& B_sf,
+                           torch::Tensor const& alpha) {
+#if defined ENABLE_NVFP4 && ENABLE_NVFP4
+  cutlass_scaled_fp4_mm_sm100a(D, A, B, A_sf, B_sf, alpha);
+#else
+  TORCH_CHECK_NOT_IMPLEMENTED(false,
+                              "No compiled nvfp4 mm kernel, vLLM should "
+                              "be compiled using CUDA 12.8 and target "
+                              "compute capability 100 or above.");
+#endif
+}
+
+// Reports whether the NVFP4 scaled-mm path can be used: requires a device
+// capability of at least 100 and a CUDA runtime version of at least 12080.
+// Returns false when the runtime version cannot be queried.
+bool cutlass_scaled_mm_supports_fp4(int64_t cuda_device_capability) {
+  // Zero-initialised so a failed query can never satisfy the version check.
+  int runtimeVersion = 0;
+  if (cudaRuntimeGetVersion(&runtimeVersion) != cudaSuccess) {
+    return false;
+  }
+  return cuda_device_capability >= 100 && runtimeVersion >= 12080;
+}
\ No newline at end of file
--- a/csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu
+++ b/csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <torch/all.h>
+
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+
+#include "cutlass_extensions/common.hpp"
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/gemm/collective/collective_builder.hpp"
+#include "cutlass/epilogue/collective/collective_builder.hpp"
+#include "cutlass/gemm/device/gemm_universal_adapter.h"
+#include "cutlass/gemm/kernel/gemm_universal.hpp"
+
+#include "cutlass/util/packed_stride.hpp"
+
+using namespace cute;
+
+#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
+// Kernel Perf config
+// Tile/cluster shape table keyed on the GEMM output element type T; consumed
+// by Fp4GemmSm100<T>. Only the explicit specializations below are defined.
+template <typename T>
+struct KernelTraits;
+
+// float output: 128x128x256 for both the MMA tile and the per-SM tile,
+// 1x1x1 cluster.
+template <>
+struct KernelTraits<float> {
+  using MmaTileShape = Shape<_128, _128, _256>;
+  using ClusterShape = Shape<_1, _1, _1>;
+  using PerSmTileShape_MNK = Shape<_128, _128, _256>;
+};
+
+// half output: 256x256x256 MMA tile, 4x4x1 cluster, 128x256x256 per-SM tile.
+template <>
+struct KernelTraits<cutlass::half_t> {
+  using MmaTileShape = Shape<_256, _256, _256>;
+  using ClusterShape = Shape<_4, _4, _1>;
+  using PerSmTileShape_MNK = Shape<_128, _256, _256>;
+};
+
+// bfloat16 output: same shapes as the half specialization.
+template <>
+struct KernelTraits<cutlass::bfloat16_t> {
+  using MmaTileShape = Shape<_256, _256, _256>;
+  using ClusterShape = Shape<_4, _4, _1>;
+  using PerSmTileShape_MNK = Shape<_128, _256, _256>;
+};
+
+// Bundles the CUTLASS type configuration for the FP4 block-scaled GEMM on
+// cutlass::arch::Sm100. T is the element type of the C/D (output) matrices;
+// the tile and cluster shapes are looked up in KernelTraits<T>.
+template <typename T>
+struct Fp4GemmSm100 {
+  // A matrix configuration
+  using ElementA = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
+  using LayoutATag = cutlass::layout::RowMajor;
+  static constexpr int AlignmentA = 32;
+
+  // B matrix configuration
+  using ElementB = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
+  using LayoutBTag = cutlass::layout::ColumnMajor;
+  static constexpr int AlignmentB = 32;
+
+  // C/D matrix configuration
+  using ElementD = T;
+  using ElementC = T;
+  using LayoutCTag = cutlass::layout::RowMajor;
+  using LayoutDTag = cutlass::layout::RowMajor;
+  // Alignment expressed as the number of elements that fit in 128 bits.
+  static constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;
+  static constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;
+  // Kernel functional config
+  using ElementAccumulator = float;
+  using ArchTag = cutlass::arch::Sm100;
+  using OperatorClass = cutlass::arch::OpClassBlockScaledTensorOp;
+
+  // Kernel Perf config
+  using MmaTileShape = typename KernelTraits<T>::MmaTileShape;
+  using ClusterShape = typename KernelTraits<T>::ClusterShape;
+  using PerSmTileShape_MNK = typename KernelTraits<T>::PerSmTileShape_MNK;
+
+  // Epilogue is built first because the mainloop's stage count below is
+  // carved out around the epilogue's SharedStorage size.
+  using CollectiveEpilogue =
+      typename cutlass::epilogue::collective::CollectiveBuilder<
+          ArchTag, OperatorClass, PerSmTileShape_MNK, ClusterShape,
+          cutlass::epilogue::collective::EpilogueTileAuto, ElementAccumulator,
+          ElementAccumulator, ElementC, LayoutCTag, AlignmentC, ElementD,
+          LayoutDTag, AlignmentD,
+          cutlass::epilogue::collective::EpilogueScheduleAuto>::CollectiveOp;
+
+  using CollectiveMainloop =
+      typename cutlass::gemm::collective::CollectiveBuilder<
+          ArchTag, OperatorClass, ElementA, LayoutATag, AlignmentA, ElementB,
+          LayoutBTag, AlignmentB, ElementAccumulator, MmaTileShape,
+          ClusterShape,
+          cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(
+              sizeof(typename CollectiveEpilogue::SharedStorage))>,
+          cutlass::gemm::collective::KernelScheduleAuto>::CollectiveOp;
+
+  // Kernel and device-level adapter types, followed by the stride/layout
+  // aliases derived from them.
+  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
+      Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue, void>;
+  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
+  using StrideA = typename Gemm::GemmKernel::StrideA;
+  using LayoutA = decltype(cute::make_layout(make_shape(0, 0, 0), StrideA{}));
+  using LayoutSFA = typename Gemm::GemmKernel::CollectiveMainloop::LayoutSFA;
+  using StrideB = typename Gemm::GemmKernel::StrideB;
+  using LayoutB = decltype(cute::make_layout(make_shape(0, 0, 0), StrideB{}));
+  using LayoutSFB = typename Gemm::GemmKernel::CollectiveMainloop::LayoutSFB;
+  using StrideC = typename Gemm::GemmKernel::StrideC;
+  using LayoutC = decltype(cute::make_layout(make_shape(0, 0, 0), StrideC{}));
+  using StrideD = typename Gemm::GemmKernel::StrideD;
+  using LayoutD = decltype(cute::make_layout(make_shape(0, 0, 0), StrideD{}));
+};
+
+// Builds the CUTLASS `Gemm::Arguments` for GEMM configuration T from the
+// operand tensors and the problem size (M, N, K).
+//
+// D is used as both the C source and the D destination pointer; A_sf/B_sf
+// are the scale-factor tensors and alpha is passed to the epilogue by
+// pointer. No tensor validation happens here.
+template <typename T>
+typename T::Gemm::Arguments args_from_options(
+    at::Tensor& D, at::Tensor const& A, at::Tensor const& B,
+    at::Tensor const& A_sf, at::Tensor const& B_sf, at::Tensor const& alpha,
+    int64_t M, int64_t N, int64_t K) {
+  using ElementA = typename T::Gemm::ElementA;
+  using ElementB = typename T::Gemm::ElementB;
+  using ElementSFA = cutlass::float_ue4m3_t;
+  using ElementSFB = cutlass::float_ue4m3_t;
+  using ElementD = typename T::Gemm::ElementD;
+  using ElementCompute = float;
+  using StrideA = typename T::StrideA;
+  using StrideB = typename T::StrideB;
+  using StrideD = typename T::StrideD;
+  using Sm100BlkScaledConfig =
+      typename T::Gemm::GemmKernel::CollectiveMainloop::Sm100BlkScaledConfig;
+
+  // The problem size is narrowed to int for the CUTLASS shape arguments.
+  int m = static_cast<int>(M);
+  int n = static_cast<int>(N);
+  int k = static_cast<int>(K);
+  // Packed strides with a batch extent of 1.
+  auto stride_A = cutlass::make_cute_packed_stride(StrideA{}, {m, k, 1});
+  auto stride_B = cutlass::make_cute_packed_stride(StrideB{}, {n, k, 1});
+  auto stride_D = cutlass::make_cute_packed_stride(StrideD{}, {m, n, 1});
+
+  // Scale-factor layouts are derived from the full (m, n, k, 1) problem shape.
+  auto layout_SFA = Sm100BlkScaledConfig::tile_atom_to_shape_SFA(
+      cute::make_shape(m, n, k, 1));
+  auto layout_SFB = Sm100BlkScaledConfig::tile_atom_to_shape_SFB(
+      cute::make_shape(m, n, k, 1));
+
+  typename T::Gemm::Arguments arguments{
+      cutlass::gemm::GemmUniversalMode::kGemm,
+      {m, n, k, 1},
+      {// Mainloop arguments
+       static_cast<ElementA const*>(A.data_ptr()), stride_A,
+       static_cast<ElementB const*>(B.data_ptr()), stride_B,
+       static_cast<ElementSFA const*>(A_sf.data_ptr()), layout_SFA,
+       static_cast<ElementSFB const*>(B_sf.data_ptr()), layout_SFB},
+      {     // Epilogue arguments
+       {},  // epilogue.thread
+       // C and D both alias the output tensor's storage.
+       static_cast<ElementD const*>(D.data_ptr()),
+       stride_D,
+       static_cast<ElementD*>(D.data_ptr()),
+       stride_D}};
+  // alpha is supplied through a pointer rather than as a host scalar.
+  auto& fusion_args = arguments.epilogue.thread;
+  fusion_args.alpha_ptr = static_cast<ElementCompute const*>(alpha.data_ptr());
+  return arguments;
+}
+
+// Instantiates the FP4 GEMM for output element type T and launches it on
+// `stream`.
+//
+// D receives the result; A/B are the FP4 operands, A_sf/B_sf their scale
+// factors and alpha the epilogue scale. m, n, k give the problem size.
+// The workspace is allocated on A's device. Each CUTLASS status is passed
+// to CUTLASS_CHECK.
+template <typename T>
+void runGemm(at::Tensor& D, at::Tensor const& A, at::Tensor const& B,
+             at::Tensor const& A_sf, at::Tensor const& B_sf,
+             at::Tensor const& alpha, int64_t m, int64_t n, int64_t k,
+             cudaStream_t stream) {
+  typename Fp4GemmSm100<T>::Gemm gemm;
+
+  auto arguments =
+      args_from_options<Fp4GemmSm100<T>>(D, A, B, A_sf, B_sf, alpha, m, n, k);
+
+  size_t workspace_size = Fp4GemmSm100<T>::Gemm::get_workspace_size(arguments);
+  auto const workspace_options =
+      torch::TensorOptions().dtype(torch::kUInt8).device(A.device());
+  auto workspace = torch::empty(workspace_size, workspace_options);
+
+  CUTLASS_CHECK(gemm.can_implement(arguments));
+
+  // initialize() stores the kernel parameters built from `arguments`; the
+  // stream-only run() overload launches with those stored parameters.
+  // run(arguments, workspace, stream) would initialize a second time.
+  CUTLASS_CHECK(gemm.initialize(arguments, workspace.data_ptr(), stream));
+
+  CUTLASS_CHECK(gemm.run(stream));
+}
+#else
+// Fallback used when CUTLASS_ARCH_MMA_SM100_SUPPORTED is not defined: keeps
+// the same signature as the real runGemm but always raises an error.
+template <typename T>
+void runGemm(at::Tensor& D, at::Tensor const& A, at::Tensor const& B,
+             at::Tensor const& A_sf, at::Tensor const& B_sf,
+             at::Tensor const& alpha, int64_t m, int64_t n, int64_t k,
+             cudaStream_t stream) {
+  TORCH_CHECK(false,
+              "Unsupported CUTLASS version. Set VLLM_CUTLASS_SRC_DIR to "
+              "a CUTLASS 3.8 source directory to enable support.");
+}
+#endif  // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
+
+// Tensor-argument validation helpers. `x` is the tensor, `st` the expected
+// scalar type and `m` a label that is included in the error message.
+#define CHECK_TYPE(x, st, m) \
+  TORCH_CHECK(x.scalar_type() == st, ": Inconsistency of Tensor type:", m)
+#define CHECK_TH_CUDA(x, m) \
+  TORCH_CHECK(x.is_cuda(), m, ": must be a CUDA tensor")
+#define CHECK_CONTIGUOUS(x, m) \
+  TORCH_CHECK(x.is_contiguous(), m, ": must be contiguous")
+// Runs all three checks. Wrapped in do { } while (0) so the macro expands to
+// a single statement and stays correct inside an unbraced if/else.
+#define CHECK_INPUT(x, st, m) \
+  do {                        \
+    CHECK_TH_CUDA(x, m);      \
+    CHECK_CONTIGUOUS(x, m);   \
+    CHECK_TYPE(x, st, m);     \
+  } while (0)
+
+// Scalar types expected by the input checks below: the FP4 operands are
+// checked as Byte tensors and the scale factors as Float8_e4m3fn tensors.
+constexpr auto FLOAT4_E2M1X2 = at::ScalarType::Byte;
+constexpr auto SF_DTYPE = at::ScalarType::Float8_e4m3fn;
+
+// NVFP4 scaled matrix multiply for SM100a.
+//
+// A and B are 2-D Byte tensors with matching second dimension; the logical
+// k is twice that dimension. A_sf and B_sf are Float8_e4m3fn scale-factor
+// matrices that must already be padded to (round_up(m, 128) x
+// round_up(k / 16, 4)) and (round_up(n, 128) x round_up(k / 16, 4)).
+// alpha is a Float tensor. D is the output: a contiguous CUDA (m x n) matrix
+// on A's device with dtype Half, BFloat16 or Float.
+//
+// All inputs must be contiguous CUDA tensors; any violation raises through
+// TORCH_CHECK before a kernel is launched.
+void cutlass_scaled_fp4_mm_sm100a(torch::Tensor& D, torch::Tensor const& A,
+                                  torch::Tensor const& B,
+                                  torch::Tensor const& A_sf,
+                                  torch::Tensor const& B_sf,
+                                  torch::Tensor const& alpha) {
+  CHECK_INPUT(A, FLOAT4_E2M1X2, "a");
+  CHECK_INPUT(B, FLOAT4_E2M1X2, "b");
+
+  CHECK_INPUT(A_sf, SF_DTYPE, "scale_a");
+  CHECK_INPUT(B_sf, SF_DTYPE, "scale_b");
+
+  CHECK_INPUT(alpha, at::ScalarType::Float, "alpha");
+
+  TORCH_CHECK(A.dim() == 2, "a must be a matrix");
+  TORCH_CHECK(B.dim() == 2, "b must be a matrix");
+  TORCH_CHECK(A.sizes()[1] == B.sizes()[1],
+              "a and b shapes cannot be multiplied (", A.sizes()[0], "x",
+              A.sizes()[1], " and ", B.sizes()[0], "x", B.sizes()[1], ")");
+
+  auto const m = A.sizes()[0];
+  auto const n = B.sizes()[0];
+  // The logical k is twice the stored second dimension of A.
+  auto const k = A.sizes()[1] * 2;
+
+  // The GEMM writes D through a packed (m x n) stride, so a tensor of any
+  // other shape, layout or device would be written out of bounds.
+  CHECK_TH_CUDA(D, "out");
+  CHECK_CONTIGUOUS(D, "out");
+  TORCH_CHECK(D.device() == A.device(),
+              "out must be on the same device as a");
+  TORCH_CHECK(D.dim() == 2, "out must be a matrix");
+  TORCH_CHECK(D.sizes()[0] == m && D.sizes()[1] == n,
+              "out must have shape (", m, "x", n, "), but got a shape (",
+              D.sizes()[0], "x", D.sizes()[1], ")");
+
+  constexpr int alignment = 32;
+  TORCH_CHECK(k % alignment == 0, "Expected k to be divisible by ", alignment,
+              ", but got a shape: (", A.sizes()[0], "x", A.sizes()[1],
+              "), k: ", k, ".");
+  TORCH_CHECK(n % alignment == 0, "Expected n to be divisible by ", alignment,
+              ", but got b shape: (", B.sizes()[0], "x", B.sizes()[1], ").");
+
+  auto round_up = [](int x, int y) { return (x + y - 1) / y * y; };
+  int rounded_m = round_up(m, 128);
+  int rounded_n = round_up(n, 128);
+  // Since k is divisible by 32 (alignment), k / 16 is guaranteed to be an
+  // integer.
+  int rounded_k = round_up(k / 16, 4);
+
+  TORCH_CHECK(A_sf.dim() == 2, "scale_a must be a matrix");
+  TORCH_CHECK(B_sf.dim() == 2, "scale_b must be a matrix");
+  TORCH_CHECK(A_sf.sizes()[1] == B_sf.sizes()[1],
+              "scale_a and scale_b shapes cannot be multiplied (",
+              A_sf.sizes()[0], "x", A_sf.sizes()[1], " and ", B_sf.sizes()[0],
+              "x", B_sf.sizes()[1], ")");
+  TORCH_CHECK(A_sf.sizes()[0] == rounded_m && A_sf.sizes()[1] == rounded_k,
+              "scale_a must be padded and swizzled to a shape (", rounded_m,
+              "x", rounded_k, "), but got a shape (", A_sf.sizes()[0], "x",
+              A_sf.sizes()[1], ")");
+  TORCH_CHECK(B_sf.sizes()[0] == rounded_n && B_sf.sizes()[1] == rounded_k,
+              "scale_b must be padded and swizzled to a shape (", rounded_n,
+              "x", rounded_k, "), but got a shape (", B_sf.sizes()[0], "x",
+              B_sf.sizes()[1], ")");
+
+  auto out_dtype = D.dtype();
+  // Make A's device current and launch on its current stream.
+  at::cuda::CUDAGuard device_guard{(char)A.get_device()};
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream(A.get_device());
+
+  // The output dtype selects the kernel instantiation.
+  if (out_dtype == at::ScalarType::Half) {
+    runGemm<cutlass::half_t>(D, A, B, A_sf, B_sf, alpha, m, n, k, stream);
+  } else if (out_dtype == at::ScalarType::BFloat16) {
+    runGemm<cutlass::bfloat16_t>(D, A, B, A_sf, B_sf, alpha, m, n, k, stream);
+  } else if (out_dtype == at::ScalarType::Float) {
+    runGemm<float>(D, A, B, A_sf, B_sf, alpha, m, n, k, stream);
+  } else {
+    TORCH_CHECK(false, "Unsupported output data type of nvfp4 mm");
+  }
+}
--- a/csrc/quantization/fp8/amd/hip_float8.h
+++ b/csrc/quantization/fp8/amd/hip_float8.h
-#pragma once
-
-#ifdef __HIPCC__
-  #include <hip/hip_runtime.h>
-#else
-  #include <type_traits>
-  #include <stdint.h>
-  #include <math.h>
-  #include <iostream>
-#endif
-
-#include "hip_float8_impl.h"
-
-struct alignas(1) hip_fp8 {
-  struct from_bits_t {};
-  HIP_FP8_HOST_DEVICE static constexpr from_bits_t from_bits() {
-    return from_bits_t();
-  }
-  uint8_t data;
-
-  hip_fp8() = default;
-  HIP_FP8_HOST_DEVICE constexpr hip_fp8(const hip_fp8&) = default;
-  HIP_FP8_HOST_DEVICE constexpr hip_fp8(uint8_t v) = delete;
-  explicit HIP_FP8_HOST_DEVICE constexpr hip_fp8(uint8_t v, from_bits_t)
-      : data(v) {}
-
-#ifdef __HIP__MI300__
-  // NOTE: ON-DEVICE... always optimal bias
-  explicit HIP_FP8_DEVICE hip_fp8(float v)
-      : data(hip_fp8_impl::to_fp8_from_fp32(v)) {}
-
-  explicit HIP_FP8_DEVICE hip_fp8(_Float16 v)
-      : hip_fp8(static_cast<float>(v)) {}
-
-  // Host only implementation using s/w simulation
-  explicit HIP_FP8_HOST
-#else   // __HIP__MI300__
-  // both Host and DEVICE for non-MI300 using s/w simulation
-  explicit HIP_FP8_HOST_DEVICE
-#endif  // __HIP__MI300__
-  hip_fp8(float v) {
-    data = hip_fp8_impl::to_float8<4, 3, float, true /*negative_zero_nan*/,
-                                   true /*clip*/>(v);
-  }
-
-  explicit HIP_FP8_HOST_DEVICE hip_fp8(double v)
-      : hip_fp8(static_cast<float>(v)) {}
-
-#ifdef __HIP__MI300__
-  // upcast using device specific intrinsic
-  explicit inline HIP_FP8_DEVICE operator float() const {
-    float fval;
-    uint32_t i32val = static_cast<uint32_t>(data);
-
-    // upcast
-    asm volatile("v_cvt_f32_fp8 %0, %1 src0_sel:BYTE_0"
-                 : "=v"(fval)
-                 : "v"(i32val));
-
-    return fval;
-  }
-
-  explicit inline HIP_FP8_HOST operator float() const
-#else   // __HIP__MI300__
-  explicit inline HIP_FP8_HOST_DEVICE operator float() const
-#endif  // __HIP__MI300__
-  {
-    return hip_fp8_impl::from_float8<4, 3, float, true /*negative_zero_nan*/>(
-        data);
-  }
-};
-
-namespace std {
-inline hip_fp8 sin(hip_fp8 a) { return hip_fp8(sinf(float(a))); }
-inline hip_fp8 cos(hip_fp8 a) { return hip_fp8(cosf(float(a))); }
-HIP_FP8_HOST_DEVICE constexpr hip_fp8 real(const hip_fp8& a) { return a; }
-}  // namespace std
-
-// Special operator overloading
-inline std::ostream& operator<<(std::ostream& os, const hip_fp8& f8) {
-  return os << float(f8);
-}
-
-// all + operator overloading with mixed types
-// mixed types, always converts to f32, does computation in f32, and returns
-// float
-inline HIP_FP8_HOST_DEVICE float operator+(const float fa, hip_fp8 b) {
-  return (fa + float(b));
-}
-
-inline HIP_FP8_HOST_DEVICE float operator+(hip_fp8 a, const float fb) {
-  return (float(a) + fb);
-}
-
-inline HIP_FP8_HOST_DEVICE hip_fp8 operator+(hip_fp8 a, hip_fp8 b) {
-  return hip_fp8(float(a) + float(b));
-}
-
-inline HIP_FP8_HOST_DEVICE hip_fp8& operator+=(hip_fp8& a, hip_fp8 b) {
-  return a = hip_fp8(float(a) + float(b));
-}
-
-// overloading multiplication, always returns float,
-inline HIP_FP8_HOST_DEVICE float operator*(hip_fp8 a, hip_fp8 b) {
-  return float(a) * float(b);
-}
-
-inline HIP_FP8_HOST_DEVICE float operator*(float a, hip_fp8 b) {
-  return (a * float(b));
-}
-
-inline HIP_FP8_HOST_DEVICE float operator*(hip_fp8 a, float b) {
-  return (float(a) * b);
-}
-
-inline HIP_FP8_HOST_DEVICE float operator*(int32_t a, hip_fp8 b) {
-  return ((float)a * float(b));
-}
-
-inline HIP_FP8_HOST_DEVICE float operator*(double a, hip_fp8 b) {
-  return ((float)a * float(b));
-}
-
-// overloading for compare
-inline HIP_FP8_HOST_DEVICE bool operator==(hip_fp8 a, hip_fp8 b) {
-  return (a.data == b.data);
-}
-inline HIP_FP8_HOST_DEVICE bool operator!=(hip_fp8 a, hip_fp8 b) {
-  return (a.data != b.data);
-}
-
-inline HIP_FP8_HOST_DEVICE bool operator>=(hip_fp8 a, hip_fp8 b) {
-  return static_cast<float>(a) >= static_cast<float>(b);
-}
-inline HIP_FP8_HOST_DEVICE bool operator>(hip_fp8 a, hip_fp8 b) {
-  return static_cast<float>(a) > static_cast<float>(b);
-}
--- a/csrc/quantization/fp8/amd/hip_float8_impl.h
+++ b/csrc/quantization/fp8/amd/hip_float8_impl.h
-#pragma once
-
-#if defined(__HIPCC__) && defined(__gfx942__)
-  #define __HIP__MI300__
-#endif
-
-#ifdef __HIPCC__
-  #define HIP_FP8_HOST_DEVICE __host__ __device__
-  #define HIP_FP8_HOST __host__
-  #define HIP_FP8_DEVICE __device__
-#else
-  #define HIP_FP8_HOST_DEVICE
-  #define HIP_FP8_HOST
-  #define HIP_FP8_DEVICE
-#endif
-
-namespace hip_fp8_impl {
-
-#ifdef __HIP__MI300__
-HIP_FP8_DEVICE uint8_t to_fp8_from_fp32(float v) {
-  uint8_t i8data;
-  union {
-    float fval;
-    uint32_t i32val;
-    uint8_t i8val[4];  // NOTE: not endian independent
-  } val;
-
-  uint32_t ival = 0;
-  val.fval = v;
-
-  if ((val.i32val & 0x7F800000) !=
-      0x7F800000) {  /// propagate NAN/INF, no clipping
-    val.fval = __builtin_amdgcn_fmed3f(val.fval, 240.0, -240.0);
-  }
-
-  ival = __builtin_amdgcn_cvt_pk_fp8_f32(val.fval, val.fval, ival,
-                                         false);  // false -> WORD0
-  val.i32val = ival;
-  i8data = val.i8val[0];
-
-  return i8data;
-}
-#endif  // __HIP__MI300__
-
-HIP_FP8_HOST inline int clz(uint32_t x) { return __builtin_clz(x); }
-#if defined(__HIPCC__) || defined(__CUDA_ARCH__)
-HIP_FP8_DEVICE inline int clz(uint32_t x) { return __clz(x); }
-#endif
-
-template <int we, int wm, typename T, bool negative_zero_nan, bool clip>
-HIP_FP8_HOST_DEVICE uint8_t to_float8(T _x, bool stoch = false,
-                                      uint32_t rng = 0) {
-#ifdef __HIPCC__
-  constexpr bool is_half = std::is_same<T, _Float16>::value;
-#else
-  constexpr bool is_half = false;
-#endif
-  constexpr bool is_float = std::is_same<T, float>::value;
-  static_assert(wm + we == 7, "wm+we==7");
-  static_assert(is_half || is_float, "Only half and float can be cast to f8");
-
-  const int mfmt = (sizeof(T) == 4) ? 23 : 10;
-  uint32_t x;
-  if (sizeof(T) == 4) {
-    x = reinterpret_cast<uint32_t&>(_x);
-  } else {
-    x = reinterpret_cast<uint16_t&>(_x);
-  }
-
-  uint32_t head, mantissa;
-  int exponent, bias;
-  uint32_t sign;
-
-  if (sizeof(T) == 4) {
-    head = x & 0xFF800000;
-    mantissa = x & 0x7FFFFF;
-    exponent = (head >> 23) & 0xFF;
-    sign = head >> 31;
-    bias = 127;
-  } else {
-    head = x & 0xFC00;
-    mantissa = x & 0x3FF;
-    exponent = (head >> 10) & 0x1F;
-    sign = head >> 15;
-    bias = 15;
-  }
-
-  uint32_t signed_inf = (sign << 7) + (((1 << we) - 1) << wm);
-
-  // Deal with inf and NaNs
-  if (negative_zero_nan) {
-    if (sizeof(T) == 4) {
-      if ((x & 0x7F800000) == 0x7F800000) {
-        return 0x80;
-      }
-    } else {
-      // if(__hisinf(x) || __hisnan(x))
-      if ((x & 0x7C00) == 0x7C00) {
-        return 0x80;
-      }
-    }
-  } else {
-    if (sizeof(T) == 4) {
-      if ((x & 0x7F800000) == 0x7F800000) {
-        return signed_inf + (mantissa != 0 ? 1 : 0);
-      }
-    } else {
-      if ((x & 0x7C00) == 0x7C00) {
-        return signed_inf + (mantissa != 0 ? 1 : 0);
-      }
-    }
-  }
-  if (x == 0) {
-    return 0;
-  }
-
-  // First need to check if it is normal or denorm as there is a difference of
-  // implicit 1 Then need to adjust the exponent to align with the F8 exponent,
-  // in the meanwhile, shift The mantissa. Then for stochastic rounding, add rng
-  // to mantissa and truncate. And for RNE, no need to add rng. Then probably
-  // need to check whether there is carry and adjust exponent and mantissa again
-
-  // For IEEE bias mode, the bias is 2^(k-1) -1 where k is the width of exponent
-  // bits
-  const int f8_bias = (1 << (we - 1)) - 1 + (negative_zero_nan ? 1 : 0);
-  const int f8_denormal_act_exponent =
-      1 - f8_bias;  // actual exponent of f8 denormal
-  // act_exponent is the actual exponent of fp32/fp16 (after subtracting bias)
-  // f8_exponent is the converted f8 exponent with bias encoding
-  // exponent_diff is the diff between fp32/fp16 exponent and f8 exponent,
-  // the difference needs to be adjusted and mantissa shifted
-  int act_exponent, f8_exponent, exponent_diff;
-
-  if (exponent == 0) {  // fp32/fp16 is in denormal.
-    /* fp32 denormal is below 2^-127 so it is usually not a concern here, we
-mostly concern fp16 here. In this case, f8 is usually in denormal. But there
-could be exceptions. fp16 denormal has exponent bias 15 while bf8 with NANOO has
-exponent bias 16. It means that there are some numbers in fp16 denormal but they
-are bf8 (NANOO) normals - smallest bf8 (NANOO) normal is 2^-15. fp16 numbers
-where exponent==0 (actual exponent -14) and highest bit of mantissa is 1 are bf8
-(NANOO) normal. In this case, the fp16 mantissa should be shift left by 1  */
-    act_exponent = exponent - bias + 1;
-    exponent_diff =
-        f8_denormal_act_exponent -
-        act_exponent;  // actual exponent is exponent-bias+1 as it is denormal
-  } else {             // fp32/fp16 is normal with implicit 1
-    act_exponent = exponent - bias;
-    if (act_exponent <= f8_denormal_act_exponent) {
-      /* This is the case where fp32/fp16 is normal but it is in f8 denormal
-range. For example fp8 nanoo mode, denormal exponent is -7, but if the
-fp32/fp16 actual exponent is -7, it is actually larger due to the implicit 1,
-Therefore it needs to be adjust to -6 and mantissa shift right by 1.
-So for fp32/fp16, exponent -8 is the cut point to convert to fp8 nanoo */
-      exponent_diff = f8_denormal_act_exponent - act_exponent;
-    } else {              // both fp32/fp16 and f8 are in normal range
-      exponent_diff = 0;  // exponent_diff=0 does not mean there is no
-                          // difference for this case, act_exponent could be
-                          // larger. Just that it does not need shift mantissa
-    }
-    mantissa += (1 << mfmt);  // Add the implicit 1 into mantissa
-  }
-
-  bool midpoint = (mantissa & ((1 << (mfmt - wm + exponent_diff)) - 1)) ==
-                  static_cast<uint32_t>(1 << (mfmt - wm + exponent_diff - 1));
-  /* This part is a bit tricky. The judgment of whether it is a tie needs to be
- done before we shift right as shift right could rip off some residual part
- and make something not midpoint look like midpoint. For example, the fp16
- number 0x1002 (0 00100 0000000010), it is larger than midpoint, but after
- shift right by 4 bits, it would look like midpoint.
-*/
-
-  if (exponent_diff > 0) {
-    mantissa >>= exponent_diff;
-  } else if (exponent_diff == -1) {
-    mantissa <<= -exponent_diff;
-  }
-  bool implicit_one = mantissa & (1 << mfmt);
-  // if there is no implicit 1, it  means the f8 is denormal and need to adjust
-  // to denorm exponent
-  f8_exponent = (act_exponent + exponent_diff) /*actual f8 exponent*/ +
-                f8_bias - (implicit_one ? 0 : 1);
-
-  // Now we have the exponent and mantissa adjusted
-  uint32_t drop_mask = (1 << (mfmt - wm)) - 1;
-  bool odd = mantissa & (1 << (mfmt - wm));  // if the least significant bit
-                                             // that is not truncated is 1
-  mantissa +=
-      (stoch ? rng : (midpoint ? (odd ? mantissa : mantissa - 1) : mantissa)) &
-      drop_mask;
-
-  // Now we deal with overflow
-  if (f8_exponent == 0) {
-    if ((1 << mfmt) & mantissa) {
-      f8_exponent = 1;  // denormal overflow to become normal, promote exponent
-    }
-  } else {
-    if ((1 << (mfmt + 1)) & mantissa) {
-      mantissa >>= 1;
-      f8_exponent++;
-    }
-  }
-
-  mantissa >>= (mfmt - wm);
-
-  // above range: quantize to maximum possible float of the same sign
-  const int max_exp = (1 << we) - (negative_zero_nan ? 1 : 2);
-  if (f8_exponent > max_exp) {
-    if (clip) {
-      mantissa = (1 << wm) - 1;
-      f8_exponent = max_exp;
-    } else {
-      return signed_inf;
-    }
-  }
-
-  if (f8_exponent == 0 && mantissa == 0) {
-    return negative_zero_nan ? 0 : (sign << 7);
-  }
-  mantissa &= (1 << wm) - 1;
-  return (sign << 7) | (f8_exponent << wm) | mantissa;
-}
-
-template <int we, int wm, typename T = float, bool negative_zero_nan = true>
-inline HIP_FP8_HOST_DEVICE T from_float8(uint8_t x) {
-#ifdef __HIPCC__
-  constexpr bool is_half = std::is_same<T, _Float16>::value;
-#else
-  constexpr bool is_half = false;
-#endif
-  constexpr bool is_float = std::is_same<T, float>::value;
-  static_assert(is_half || is_float, "only half and float are supported");
-
-  constexpr int weo = is_half ? 5 : 8;
-  constexpr int wmo = is_half ? 10 : (is_float ? 23 : 7);
-
-  T fInf, fNegInf, fNaN, fNeg0;
-
-#ifdef __HIPCC__
-  if (is_half) {
-    const uint16_t ihInf = 0x7C00;
-    const uint16_t ihNegInf = 0xFC00;
-    const uint16_t ihNaN = 0x7C01;
-    const uint16_t ihNeg0 = 0x8000;
-    fInf = reinterpret_cast<const _Float16&>(ihInf);
-    fNegInf = reinterpret_cast<const _Float16&>(ihNegInf);
-    fNaN = reinterpret_cast<const _Float16&>(ihNaN);
-    fNeg0 = reinterpret_cast<const _Float16&>(ihNeg0);
-  } else
-#endif
-      if (is_float) {
-    const uint32_t ifInf = 0x7F800000;
-    const uint32_t ifNegInf = 0xFF800000;
-    const uint32_t ifNaN = 0x7F800001;
-    const uint32_t ifNeg0 = 0x80000000;
-    fInf = reinterpret_cast<const float&>(ifInf);
-    fNegInf = reinterpret_cast<const float&>(ifNegInf);
-    fNaN = reinterpret_cast<const float&>(ifNaN);
-    fNeg0 = reinterpret_cast<const float&>(ifNeg0);
-  }
-
-  if (x == 0) {
-    return 0;
-  }
-
-  uint32_t sign = x >> 7;
-  uint32_t mantissa = x & ((1 << wm) - 1);
-  int exponent = (x & 0x7F) >> wm;
-  if (negative_zero_nan) {
-    if (x == 0x80) {
-      return fNaN;
-    }
-  } else {
-    if (x == 0x80) {
-      return fNeg0;
-    }
-    if (exponent == ((1 << we) - 1)) {
-      return (mantissa == 0) ? (sign ? fNegInf : fInf) : fNaN;
-    }
-  }
-  typename std::conditional<sizeof(T) == 2, uint16_t, uint32_t>::type retval;
-  if (we == 5 && is_half && !negative_zero_nan) {
-    retval = x << 8;
-    return reinterpret_cast<const T&>(retval);
-  }
-
-  const int exp_low_cutoff =
-      (1 << (weo - 1)) - (1 << (we - 1)) + 1 - (negative_zero_nan ? 1 : 0);
-
-  // subnormal input
-  if (exponent == 0) {
-    // guaranteed mantissa!=0 since cases 0x0 and 0x80 are handled above
-    int sh = 1 + clz(mantissa) - (32 - wm);
-    mantissa <<= sh;
-    exponent += 1 - sh;
-    mantissa &= ((1 << wm) - 1);
-  }
-  exponent += exp_low_cutoff - 1;
-  mantissa <<= wmo - wm;
-
-  // subnormal output (occurs when T=half, we=5, negative_zero_nan=true)
-  if (exponent <= 0) {
-    mantissa |= 1 << wmo;
-    mantissa >>= 1 - exponent;
-    exponent = 0;
-  }
-
-  if (sizeof(T) == 2) {
-    retval = (sign << 15) | (exponent << 10) | mantissa;
-  } else {
-    retval = (sign << 31) | (exponent << 23) | mantissa;
-  }
-  return reinterpret_cast<const T&>(retval);
-}
-
-}  // namespace hip_fp8_impl
--- a/csrc/quantization/fp8/amd/quant_utils.cuh
+++ b/csrc/quantization/fp8/amd/quant_utils.cuh
 #pragma once
-#include "hip_float8.h"
+#include <hip/hip_fp8.h>

 #include <hip/hip_fp16.h>
 #include <hip/hip_bf16.h>
 #include <hip/hip_bfloat16.h>

-#include "../../../attention/dtype_fp8.cuh"
-#include "../../../attention/dtype_float32.cuh"
-#include "../../../attention/dtype_bfloat16.cuh"
+#include "../../../attention/attention_dtypes.h"

 namespace vllm {
 #ifdef USE_ROCM
@@ -15,6 +13,40 @@ namespace vllm {
 namespace fp8 {
  #ifdef ENABLE_FP8

+// Primary cvt_c10 template: a fallback that ignores `r` and returns a
+// value-initialized fp8_type. The explicit specializations of cvt_c10 carry
+// out the actual float -> fp8 conversion on ROCm.
+template <typename fp8_type>
+__device__ __forceinline__ fp8_type cvt_c10(float const r) {
+  return fp8_type{};
+}
+
+// float -> c10::Float8_e4m3fn (OCP e4m3).
+// __hip_fp8_e4m3 and the HIP_FP8_TYPE_OCP macro (provided by hip_fp8.h) both
+// first appear in ROCm 6.3. VLLM_DISPATCH_FP8_TYPES on ROCm instantiates the
+// OCP and the FNUZ kernels alike, so when the macro is not set this
+// specialization must fall back to something that does not rely on the
+// ROCm 6.3 feature; that keeps ROCm 6.2 or newer compiling.
+template <>
+__device__ __forceinline__ c10::Float8_e4m3fn cvt_c10(float const r) {
+    #if HIP_FP8_TYPE_OCP
+  // HW cvt path: produce the raw fp8 byte, then wrap it via the from_bits tag.
+  const auto raw_bits =
+      __hip_cvt_float_to_fp8(r, __hip_fp8_e4m3::__default_saturation,
+                             __hip_fp8_e4m3::__default_interpret);
+  return c10::Float8_e4m3fn(raw_bits, c10::Float8_e4m3fn::from_bits());
+    #else
+  // Cast implemented by pytorch: bit manipulation rather than the HW cvt,
+  // which is the faster option when available (ROCm 6.3 or newer).
+  return static_cast<c10::Float8_e4m3fn>(r);
+    #endif
+}
+
+// float -> c10::Float8_e4m3fnuz: converts through __hip_cvt_float_to_fp8 using
+// the __hip_fp8_e4m3_fnuz default saturation and interpretation, then wraps
+// the resulting raw byte via the from_bits tag.
+template <>
+__device__ __forceinline__ c10::Float8_e4m3fnuz cvt_c10(float const r) {
+  const auto raw_bits =
+      __hip_cvt_float_to_fp8(r, __hip_fp8_e4m3_fnuz::__default_saturation,
+                             __hip_fp8_e4m3_fnuz::__default_interpret);
+  return c10::Float8_e4m3fnuz(raw_bits, c10::Float8_e4m3fnuz::from_bits());
+}
+
 template <typename Tout, typename Tin>
 __inline__ __device__ Tout vec_conversion(const Tin& x) {
  return x;
@@ -26,40 +58,31 @@ __inline__ __device__ Tout scaled_vec_conversion(const Tin& x,
  return x;
 }

+// HIP fp8 types used by the vec_conversion / scaled_vec_conversion
+// specializations: the FNUZ e4m3 types unless HIP_FP8_TYPE_OCP is set, in
+// which case the OCP e4m3 types are selected.
+    #if !HIP_FP8_TYPE_OCP
+using fp8_type = __hip_fp8_e4m3_fnuz;
+using fp8x2_type = __hip_fp8x2_e4m3_fnuz;
+    #else
+using fp8_type = __hip_fp8_e4m3;
+using fp8x2_type = __hip_fp8x2_e4m3;
+    #endif
+
 // fp8 -> half
 template <>
 __inline__ __device__ uint16_t
 vec_conversion<uint16_t, uint8_t>(const uint8_t& a) {
-  hip_fp8 f8{a, hip_fp8::from_bits()};
-  __half_raw res;
-  res.data = static_cast<float>(f8);
-  return res.x;
+  return __hip_cvt_fp8_to_halfraw(a, fp8_type::__default_interpret).x;
 }

 // fp8x2 -> half2
 template <>
 __inline__ __device__ uint32_t
 vec_conversion<uint32_t, uint16_t>(const uint16_t& a) {
-    #if defined(__HIP__MI300__) && \
-        defined(__HIP_FP8_EXPERIMENTAL_BULK_CONVERT__)
-  const auto& f2 = __builtin_amdgcn_cvt_pk_f32_fp8(a, 0);
  union {
    __half2_raw h2r;
    uint32_t ui32;
  } tmp;
-  tmp.h2r.x.data = f2[0];
-  tmp.h2r.y.data = f2[1];
+  tmp.h2r = __hip_cvt_fp8x2_to_halfraw2(a, fp8_type::__default_interpret);
  return tmp.ui32;
-    #else
-  union {
-    uint16_t u16[2];
-    uint32_t u32;
-  } tmp;
-
-  tmp.u16[0] = vec_conversion<uint16_t, uint8_t>(static_cast<uint8_t>(a));
-  tmp.u16[1] = vec_conversion<uint16_t, uint8_t>(static_cast<uint8_t>(a >> 8U));
-  return tmp.u32;
-    #endif
 }

 // fp8x4 -> half2x2
@@ -92,9 +115,9 @@ using __nv_bfloat16 = __hip_bfloat16;
 template <>
 __inline__ __device__ __nv_bfloat16
 vec_conversion<__nv_bfloat16, uint8_t>(const uint8_t& a) {
-  hip_fp8 f8{a, hip_fp8::from_bits()};
-  float f{f8};
-  return __float2bfloat16(f);
+  fp8_type f8;
+  f8.__x = a;
+  return __float2bfloat16(static_cast<float>(f8));
 }

 using __nv_bfloat162 = __hip_bfloat162;
@@ -136,27 +159,18 @@ __inline__ __device__ bf16_8_t vec_conversion<bf16_8_t, uint2>(const uint2& a) {
 // fp8 -> float
 template <>
 __inline__ __device__ float vec_conversion<float, uint8_t>(const uint8_t& a) {
-  hip_fp8 fp8{a, hip_fp8::from_bits()};
-  return static_cast<float>(fp8);
+  fp8_type f8;
+  f8.__x = a;
+  return static_cast<float>(f8);
 }

 // fp8x2 -> float2
 template <>
 __inline__ __device__ float2
 vec_conversion<float2, uint16_t>(const uint16_t& a) {
-    #if defined(__HIP__MI300__) && \
-        defined(__HIP_FP8_EXPERIMENTAL_BULK_CONVERT__)
-  float2 res;
-  const auto& f2 = __builtin_amdgcn_cvt_pk_f32_fp8(a, 0);
-  res.x = f2[0];
-  res.y = f2[1];
-  return res;
-    #else
-  float2 res;
-  res.x = vec_conversion<float, uint8_t>(static_cast<uint8_t>(a));
-  res.y = vec_conversion<float, uint8_t>(static_cast<uint8_t>(a >> 8U));
-  return res;
-    #endif
+  fp8x2_type f8x2;
+  f8x2.__x = a;
+  return static_cast<float2>(f8x2);
 }

 // fp8x4 -> float4
@@ -169,6 +183,15 @@ vec_conversion<Float4_, uint32_t>(const uint32_t& a) {
  return res;
 }

+// fp8x4 -> float4 (built-in vector type)
+template <>
+__inline__ __device__ float4
+vec_conversion<float4, uint32_t>(const uint32_t& a) {
+  // Delegate to the Float4_ conversion, then flatten its nested x/y members
+  // in order into a float4.
+  const Float4_ nested = vec_conversion<Float4_, uint32_t>(a);
+  return make_float4(nested.x.x, nested.x.y, nested.y.x, nested.y.y);
+}
+
 // fp8x8 -> float8
 template <>
 __inline__ __device__ Float8_ vec_conversion<Float8_, uint2>(const uint2& a) {
@@ -189,33 +212,36 @@ __inline__ __device__ uint8_t
 vec_conversion<uint8_t, uint16_t>(const uint16_t& a) {
  __half_raw tmp;
  tmp.x = a;
+  return __hip_cvt_halfraw_to_fp8(tmp, fp8_type::__default_saturation,
+                                  fp8_type::__default_interpret);
+}

-  hip_fp8 f8{static_cast<float>(tmp.data)};
-  return f8.data;
+// half2 -> fp8x2
+template <>
+__inline__ __device__ uint16_t
+vec_conversion<uint16_t, uint32_t>(const uint32_t& a) {
+  // View the packed 32-bit input as a __half2_raw for the HIP converter.
+  union {
+    __half2_raw h2r;
+    uint32_t ui32;
+  } packed;
+  packed.ui32 = a;
+  return __hip_cvt_halfraw2_to_fp8x2(packed.h2r,
+                                     fp8_type::__default_saturation,
+                                     fp8_type::__default_interpret);
 }

 // bf16 -> fp8
 template <>
 __inline__ __device__ uint8_t
 vec_conversion<uint8_t, __nv_bfloat16>(const __nv_bfloat16& a) {
-  hip_fp8 res{__bfloat162float(a)};
-  return res.data;
+  return __hip_cvt_float_to_fp8(__bfloat162float(a),
+                                fp8_type::__default_saturation,
+                                fp8_type::__default_interpret);
 }

 // float -> fp8
 template <>
 __inline__ __device__ uint8_t vec_conversion<uint8_t, float>(const float& a) {
-  hip_fp8 f8(a);
-  return f8.data;
-}
-
-// fp8x4 -> float4
-template <>
-__inline__ __device__ float4
-vec_conversion<float4, uint32_t>(const uint32_t& a) {
-  Float4_ tmp = vec_conversion<Float4_, uint32_t>(a);
-  float4 res = make_float4(tmp.x.x, tmp.x.y, tmp.y.x, tmp.y.y);
-  return res;
+  return __hip_cvt_float_to_fp8(a, fp8_type::__default_saturation,
+                                fp8_type::__default_interpret);
 }

 // float2 -> half2
@@ -307,90 +333,22 @@ vec_conversion<bf16_8_t, Float8_>(const Float8_& a) {

 */

-// fp8 -> half
-template <>
-__inline__ __device__ uint16_t
-scaled_vec_conversion<uint16_t, uint8_t>(const uint8_t& a, const float scale) {
-  hip_fp8 f8{a, hip_fp8::from_bits()};
-  __half_raw res;
-  res.data = static_cast<float>(f8) * scale;
-  return res.x;
-}
-
-// fp8x2 -> half2
-template <>
-__inline__ __device__ uint32_t scaled_vec_conversion<uint32_t, uint16_t>(
-    const uint16_t& a, const float scale) {
-    #if defined(__HIP__MI300__) && \
-        defined(__HIP_FP8_EXPERIMENTAL_BULK_CONVERT__)
-  const auto& f2 = __builtin_amdgcn_cvt_pk_f32_fp8(a, 0);
-  union {
-    __half2_raw h2r;
-    uint32_t ui32;
-  } tmp;
-  tmp.h2r.x.data = f2[0] * scale;
-  tmp.h2r.y.data = f2[1] * scale;
-  return tmp.ui32;
-    #else
-  union {
-    uint16_t u16[2];
-    uint32_t u32;
-  } tmp;
-
-  tmp.u16[0] =
-      scaled_vec_conversion<uint16_t, uint8_t>(static_cast<uint8_t>(a), scale);
-  tmp.u16[1] = scaled_vec_conversion<uint16_t, uint8_t>(
-      static_cast<uint8_t>(a >> 8U), scale);
-  return tmp.u32;
-    #endif
-}
-
-// fp8x4 -> half2x2
-template <>
-__inline__ __device__ uint2
-scaled_vec_conversion<uint2, uint32_t>(const uint32_t& a, const float scale) {
-  union {
-    uint2 u32x2;
-    uint32_t u32[2];
-  } tmp;
-  tmp.u32[0] = scaled_vec_conversion<uint32_t, uint16_t>((uint16_t)a, scale);
-  tmp.u32[1] =
-      scaled_vec_conversion<uint32_t, uint16_t>((uint16_t)(a >> 16U), scale);
-  return tmp.u32x2;
-}
-
-// fp8x8 -> half2x4
-template <>
-__inline__ __device__ uint4
-scaled_vec_conversion<uint4, uint2>(const uint2& a, const float scale) {
-  union {
-    uint4 u64x2;
-    uint2 u64[2];
-  } tmp;
-  tmp.u64[0] = scaled_vec_conversion<uint2, uint32_t>(a.x, scale);
-  tmp.u64[1] = scaled_vec_conversion<uint2, uint32_t>(a.y, scale);
-  return tmp.u64x2;
-}
-
 using __nv_bfloat16 = __hip_bfloat16;

 // fp8 -> __nv_bfloat16
 template <>
 __inline__ __device__ __nv_bfloat16
-scaled_vec_conversion<__nv_bfloat16, uint8_t>(const uint8_t& a,
-                                              const float scale) {
-  hip_fp8 f8{a, hip_fp8::from_bits()};
-  float f{f8};
-  return __float2bfloat16(f * scale);
+scaled_vec_conversion<__nv_bfloat16, uint8_t>(const uint8_t& a, float scale) {
+  fp8_type f8;
+  f8.__x = a;
+  return __float2bfloat16(static_cast<float>(f8) * scale);
 }

-using __nv_bfloat162 = __hip_bfloat162;
-
 // fp8x2 -> __nv_bfloat162
 template <>
 __inline__ __device__ __nv_bfloat162
 scaled_vec_conversion<__nv_bfloat162, uint16_t>(const uint16_t& a,
-                                                const float scale) {
+                                                float scale) {
  __nv_bfloat162 res;
  res.x = scaled_vec_conversion<__nv_bfloat16, uint8_t>((uint8_t)a, scale);
  res.y =
@@ -400,8 +358,8 @@ scaled_vec_conversion<__nv_bfloat162, uint16_t>(const uint16_t& a,

 // fp8x4 -> bf16_4_t
 template <>
-__inline__ __device__ bf16_4_t scaled_vec_conversion<bf16_4_t, uint32_t>(
-    const uint32_t& a, const float scale) {
+__inline__ __device__ bf16_4_t
+scaled_vec_conversion<bf16_4_t, uint32_t>(const uint32_t& a, float scale) {
  bf16_4_t res;
  res.x = scaled_vec_conversion<__nv_bfloat162, uint16_t>((uint16_t)a, scale);
  res.y = scaled_vec_conversion<__nv_bfloat162, uint16_t>((uint16_t)(a >> 16U),
@@ -412,7 +370,7 @@ __inline__ __device__ bf16_4_t scaled_vec_conversion<bf16_4_t, uint32_t>(
 // fp8x8 -> bf16_8_t
 template <>
 __inline__ __device__ bf16_8_t
-scaled_vec_conversion<bf16_8_t, uint2>(const uint2& a, const float scale) {
+scaled_vec_conversion<bf16_8_t, uint2>(const uint2& a, float scale) {
  bf16_4_t tmp1, tmp2;
  tmp1 = scaled_vec_conversion<bf16_4_t, uint32_t>(a.x, scale);
  tmp2 = scaled_vec_conversion<bf16_4_t, uint32_t>(a.y, scale);
@@ -427,29 +385,19 @@ scaled_vec_conversion<bf16_8_t, uint2>(const uint2& a, const float scale) {
 // fp8 -> float
 template <>
 __inline__ __device__ float scaled_vec_conversion<float, uint8_t>(
-    const uint8_t& a, const float scale) {
-  hip_fp8 fp8{a, hip_fp8::from_bits()};
-  return static_cast<float>(fp8) * scale;
+    const uint8_t& a, float scale) {
+  fp8_type f8;
+  f8.__x = a;
+  return static_cast<float>(f8) * scale;
 }

 // fp8x2 -> float2
 template <>
 __inline__ __device__ float2
-scaled_vec_conversion<float2, uint16_t>(const uint16_t& a, const float scale) {
-    #if defined(__HIP__MI300__) && \
-        defined(__HIP_FP8_EXPERIMENTAL_BULK_CONVERT__)
-  float2 res;
-  const auto& f2 = __builtin_amdgcn_cvt_pk_f32_fp8(a, 0);
-  res.x = f2[0] * scale;
-  res.y = f2[1] * scale;
-  return res;
-    #else
-  float2 res;
-  res.x = scaled_vec_conversion<float, uint8_t>(static_cast<uint8_t>(a), scale);
-  res.y = scaled_vec_conversion<float, uint8_t>(static_cast<uint8_t>(a >> 8U),
-                                                scale);
-  return res;
-    #endif
+scaled_vec_conversion<float2, uint16_t>(const uint16_t& a, float scale) {
+  fp8x2_type f8x2;
+  f8x2.__x = a;
+  return static_cast<float2>(f8x2) * scale;
 }

 // fp8x4 -> float4
@@ -462,10 +410,18 @@ scaled_vec_conversion<Float4_, uint32_t>(const uint32_t& a, const float scale) {
  return res;
 }

+// fp8x4 -> float4 (built-in vector type), scaled
+template <>
+__inline__ __device__ float4
+scaled_vec_conversion<float4, uint32_t>(const uint32_t& a, float scale) {
+  // Delegate to the scaled Float4_ conversion, then flatten its nested x/y
+  // members in order into a float4.
+  const Float4_ nested = scaled_vec_conversion<Float4_, uint32_t>(a, scale);
+  return make_float4(nested.x.x, nested.x.y, nested.y.x, nested.y.y);
+}
+
 // fp8x8 -> float8
 template <>
 __inline__ __device__ Float8_
-scaled_vec_conversion<Float8_, uint2>(const uint2& a, const float scale) {
+scaled_vec_conversion<Float8_, uint2>(const uint2& a, float scale) {
  Float4_ tmp1, tmp2;
  tmp1 = scaled_vec_conversion<Float4_, uint32_t>(a.x, scale);
  tmp2 = scaled_vec_conversion<Float4_, uint32_t>(a.y, scale);
@@ -477,44 +433,184 @@ scaled_vec_conversion<Float8_, uint2>(const uint2& a, const float scale) {
  return res;
 }

-/* Quantize(HP / scale) => FP8 */
+// fp8 -> half (returned as raw 16-bit storage), scaled
+template <>
+__inline__ __device__ uint16_t
+scaled_vec_conversion<uint16_t, uint8_t>(const uint8_t& a, float scale) {
+  // Do the scaled conversion in float first, then store the result through
+  // __half_raw::data and hand back the raw bits via __half_raw::x.
+  const float scaled = scaled_vec_conversion<float, uint8_t>(a, scale);
+  __half_raw out;
+  out.data = scaled;
+  return out.x;
+}

-// TODO(Hai): vectorized to add
+// fp8x2 -> half2
+// Converts the two packed fp8 values in `a` to a __half2_raw, multiplies both
+// elements by `scale`, and returns the pair as its raw 32-bit image.
+template <>
+__inline__ __device__ uint32_t
+scaled_vec_conversion<uint32_t, uint16_t>(const uint16_t& a, float scale) {
+  union {
+    __half2_raw h2r;
+    uint32_t ui32;
+  } tmp;
+  tmp.h2r = __hip_cvt_fp8x2_to_halfraw2(a, fp8_type::__default_interpret);
+  tmp.h2r.x.data *= scale;
+  tmp.h2r.y.data *= scale;
+  return tmp.ui32;
+}
+
+// fp8x4 -> half2x2, scaled
+template <>
+__inline__ __device__ uint2
+scaled_vec_conversion<uint2, uint32_t>(const uint32_t& a, float scale) {
+  // Split the input into two fp8x2 pieces; each is converted by the
+  // fp8x2 -> half2 specialization and stored in order.
+  const uint16_t first_pair = (uint16_t)a;
+  const uint16_t second_pair = (uint16_t)(a >> 16U);
+  union {
+    uint32_t u32[2];
+    uint2 u32x2;
+  } out;
+  out.u32[0] = scaled_vec_conversion<uint32_t, uint16_t>(first_pair, scale);
+  out.u32[1] = scaled_vec_conversion<uint32_t, uint16_t>(second_pair, scale);
+  return out.u32x2;
+}
+
+// fp8x8 -> half2x4, scaled
+template <>
+__inline__ __device__ uint4 scaled_vec_conversion<uint4, uint2>(const uint2& a,
+                                                                float scale) {
+  // a.x and a.y are each converted by the fp8x4 -> half2x2 specialization;
+  // the two uint2 results are returned together as one uint4.
+  union {
+    uint2 halves[2];
+    uint4 whole;
+  } out;
+  out.halves[0] = scaled_vec_conversion<uint2, uint32_t>(a.x, scale);
+  out.halves[1] = scaled_vec_conversion<uint2, uint32_t>(a.y, scale);
+  return out.whole;
+}

 // half -> fp8
 template <>
 __inline__ __device__ uint8_t
-scaled_vec_conversion<uint8_t, uint16_t>(const uint16_t& a, const float scale) {
+scaled_vec_conversion<uint8_t, uint16_t>(const uint16_t& a, float scale) {
  __half_raw tmp;
  tmp.x = a;
+  tmp.data /= scale;
+  return __hip_cvt_halfraw_to_fp8(tmp, fp8_type::__default_saturation,
+                                  fp8_type::__default_interpret);
+}

-  hip_fp8 f8{static_cast<float>(tmp.data) / scale};
-  return f8.data;
+// halfx2 -> fp8x2: both half elements are divided by `scale` and then
+// converted with the fp8_type default saturation and interpretation.
+template <>
+__inline__ __device__ uint16_t
+scaled_vec_conversion<uint16_t, uint32_t>(const uint32_t& a, float scale) {
+  // View the packed 32-bit input as a __half2_raw.
+  union {
+    __half2_raw h2r;
+    uint32_t ui32;
+  } packed;
+  packed.ui32 = a;
+  packed.h2r.x.data /= scale;
+  packed.h2r.y.data /= scale;
+  return __hip_cvt_halfraw2_to_fp8x2(packed.h2r,
+                                     fp8_type::__default_saturation,
+                                     fp8_type::__default_interpret);
+}
+
+// half2x2 -> fp8x4
+template <>
+__inline__ __device__ uint32_t
+scaled_vec_conversion<uint32_t, uint2>(const uint2& a, float scale) {
+  // a.x and a.y are each converted by the halfx2 -> fp8x2 specialization;
+  // the two 16-bit results fill consecutive slots of the returned word.
+  union {
+    uint32_t ui32;
+    uint16_t ui16[2];
+  } out;
+  out.ui16[0] = scaled_vec_conversion<uint16_t, uint32_t>(a.x, scale);
+  out.ui16[1] = scaled_vec_conversion<uint16_t, uint32_t>(a.y, scale);
+  return out.ui32;
+}
+
+// half2x4 -> fp8x8
+template <>
+__inline__ __device__ uint2 scaled_vec_conversion<uint2, uint4>(const uint4& a,
+                                                                float scale) {
+  // View the uint4 input as two uint2 pieces; each piece is converted by the
+  // half2x2 -> fp8x4 specialization.
+  union {
+    uint4 whole;
+    uint2 halves[2];
+  } in;
+  in.whole = a;
+  uint2 out;
+  out.x = scaled_vec_conversion<uint32_t, uint2>(in.halves[0], scale);
+  out.y = scaled_vec_conversion<uint32_t, uint2>(in.halves[1], scale);
+  return out;
 }

 // bf16 -> fp8
 template <>
 __inline__ __device__ uint8_t scaled_vec_conversion<uint8_t, __nv_bfloat16>(
-    const __nv_bfloat16& a, const float scale) {
-  hip_fp8 res{__bfloat162float(a) / scale};
-  return res.data;
+    const __nv_bfloat16& a, float scale) {
+  return __hip_cvt_float_to_fp8(__bfloat162float(a) / scale,
+                                fp8_type::__default_saturation,
+                                fp8_type::__default_interpret);
+}
+
+// bf16x2 -> fp8x2
+template <>
+__inline__ __device__ uint16_t scaled_vec_conversion<uint16_t, __nv_bfloat162>(
+    const __nv_bfloat162& a, float scale) {
+  // Each element goes through the scalar bf16 -> fp8 specialization; the two
+  // bytes fill consecutive slots of the returned 16-bit value.
+  union {
+    uint16_t ui16;
+    uint8_t ui8[2];
+  } out;
+  out.ui8[0] = scaled_vec_conversion<uint8_t, __nv_bfloat16>(a.x, scale);
+  out.ui8[1] = scaled_vec_conversion<uint8_t, __nv_bfloat16>(a.y, scale);
+  return out.ui16;
+}
+
+// bf16x4 -> fp8x4
+template <>
+__inline__ __device__ uint32_t
+scaled_vec_conversion<uint32_t, bf16_4_t>(const bf16_4_t& a, float scale) {
+  // a.x and a.y are each converted by the bf16x2 -> fp8x2 specialization;
+  // the two 16-bit results fill consecutive slots of the returned word.
+  union {
+    uint32_t ui32;
+    uint16_t ui16[2];
+  } out;
+  out.ui16[0] = scaled_vec_conversion<uint16_t, __nv_bfloat162>(a.x, scale);
+  out.ui16[1] = scaled_vec_conversion<uint16_t, __nv_bfloat162>(a.y, scale);
+  return out.ui32;
+}
+
+// bf16x8 -> fp8x8
+template <>
+__inline__ __device__ uint2
+scaled_vec_conversion<uint2, bf16_8_t>(const bf16_8_t& a, float scale) {
+  // Regroup the four members of `a` into two bf16_4_t values and convert each
+  // with the bf16x4 -> fp8x4 specialization.
+  const bf16_4_t first_half = {a.x, a.y};
+  const bf16_4_t second_half = {a.z, a.w};
+  uint2 out;
+  out.x = scaled_vec_conversion<uint32_t, bf16_4_t>(first_half, scale);
+  out.y = scaled_vec_conversion<uint32_t, bf16_4_t>(second_half, scale);
+  return out;
 }

 // float -> fp8
 template <>
 __inline__ __device__ uint8_t
-scaled_vec_conversion<uint8_t, float>(const float& a, const float scale) {
-  hip_fp8 f8(a / scale);
-  return f8.data;
+scaled_vec_conversion<uint8_t, float>(const float& a, float scale) {
+  return __hip_cvt_float_to_fp8(a / scale, fp8_type::__default_saturation,
+                                fp8_type::__default_interpret);
 }

-// fp8x4 -> float4
+// floatx2 -> fp8x2
 template <>
-__inline__ __device__ float4
-scaled_vec_conversion<float4, uint32_t>(const uint32_t& a, const float scale) {
-  Float4_ tmp = scaled_vec_conversion<Float4_, uint32_t>(a, scale);
-  float4 res = make_float4(tmp.x.x, tmp.x.y, tmp.y.x, tmp.y.y);
-  return res;
+__inline__ __device__ uint16_t
+scaled_vec_conversion<uint16_t, float2>(const float2& a, float scale) {
+  return __hip_cvt_float2_to_fp8x2(a / scale, fp8_type::__default_saturation,
+                                   fp8_type::__default_interpret);
+}
+
+// floatx4 -> fp8x4
+template <>
+__inline__ __device__ uint32_t
+scaled_vec_conversion<uint32_t, float4>(const float4& a, float scale) {
+  // Regroup the four lanes into two float2 values and convert each with the
+  // floatx2 -> fp8x2 specialization; the two 16-bit results fill consecutive
+  // slots of the returned word.
+  const float2 first_pair = {a.x, a.y};
+  const float2 second_pair = {a.z, a.w};
+  union {
+    uint32_t ui32;
+    uint16_t ui16[2];
+  } out;
+  out.ui16[0] = scaled_vec_conversion<uint16_t, float2>(first_pair, scale);
+  out.ui16[1] = scaled_vec_conversion<uint16_t, float2>(second_pair, scale);
+  return out.ui32;
 }
  #endif  // ENABLE_FP8