[Kernel] some optimizations for dense marlin and moe marlin (#16850)

Signed-off-by: Jinzhen Lin <linjinzhen@hotmail.com>

[Kernel] some optimizations for dense marlin and moe marlin (#16850)
Signed-off-by: Jinzhen Lin <linjinzhen@hotmail.com>
1d0c9d6b · Jinzhen Lin · GitHub · f62cad64 · 1d0c9d6b · 1d0c9d6b
Unverified Commit 1d0c9d6b authored May 06, 2025 by Jinzhen Lin Committed by GitHub May 05, 2025
20 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -301,8 +301,52 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  # are not supported by Machete yet.
  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
  if (MARLIN_ARCHS)
+
+    #
+    # For the Marlin kernels we automatically generate sources for various
+    # preselected input type pairs and schedules.
+    # Generate sources:
+    set(MARLIN_GEN_SCRIPT
+      ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/gptq_marlin/generate_kernels.py)
+    file(MD5 ${MARLIN_GEN_SCRIPT} MARLIN_GEN_SCRIPT_HASH)
+
+    message(STATUS "Marlin generation script hash: ${MARLIN_GEN_SCRIPT_HASH}")
+    message(STATUS "Last run Marlin generate script hash: $CACHE{MARLIN_GEN_SCRIPT_HASH}")
+
+    # Re-run the generator only when the script differs from the one used by
+    # the last successful run; that run's hash is stored in the cache below.
+    # Both STREQUAL operands are quoted so that an unset cache entry expands to
+    # an empty string instead of dropping an argument from the condition.
+    #
+    # execute_process() does not go through a shell, so the inherited
+    # PYTHONPATH has to be forwarded with $ENV{...}; a bare `$PYTHONPATH`
+    # would be handed to the interpreter as literal text.
+    #
+    # The exit code and the captured output use distinct variables so the
+    # success check below never depends on assignment order.
+    if (NOT DEFINED CACHE{MARLIN_GEN_SCRIPT_HASH}
+        OR NOT "$CACHE{MARLIN_GEN_SCRIPT_HASH}" STREQUAL "${MARLIN_GEN_SCRIPT_HASH}")
+      execute_process(
+        COMMAND ${CMAKE_COMMAND} -E env
+        "PYTHONPATH=$ENV{PYTHONPATH}"
+          ${Python_EXECUTABLE} ${MARLIN_GEN_SCRIPT}
+        RESULT_VARIABLE marlin_generation_result
+        OUTPUT_VARIABLE marlin_generation_output
+        OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log
+        ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log
+      )
+
+      if (NOT marlin_generation_result EQUAL 0)
+        message(FATAL_ERROR "Marlin generation failed."
+                            " Result: \"${marlin_generation_result}\""
+                            "\nCheck the log for details: "
+                            "${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log")
+      else()
+        # Remember which script version produced the current kernel sources.
+        set(MARLIN_GEN_SCRIPT_HASH ${MARLIN_GEN_SCRIPT_HASH}
+            CACHE STRING "Last run Marlin generate script hash" FORCE)
+        message(STATUS "Marlin generation completed successfully.")
+      endif()
+    else()
+      message(STATUS "Marlin generation script has not changed, skipping generation.")
+    endif()
+
+    # Pick up the generated kernel_*.cu files (the glob runs after generation)
+    # and compile them for the Marlin-capable architectures only.
+    file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/kernel_*.cu")
+    set_gencode_flags_for_srcs(
+      SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}"
+      CUDA_ARCHS "${MARLIN_ARCHS}")
+
+    list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})
+
    set(MARLIN_SRCS
-       "csrc/quantization/fp8/fp8_marlin.cu"
       "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
       "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
       "csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
@@ -644,7 +688,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
        OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH})
      execute_process(
        COMMAND ${CMAKE_COMMAND} -E env
-        PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH
+        PYTHONPATH=$PYTHONPATH
          ${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT}
        RESULT_VARIABLE moe_marlin_generation_result
        OUTPUT_VARIABLE moe_marlin_generation_output

--- a/csrc/moe/marlin_moe_wna16/.gitignore
+++ b/csrc/moe/marlin_moe_wna16/.gitignore
+kernel_*.cu
\ No newline at end of file
--- a/csrc/moe/marlin_moe_wna16/generate_kernels.py
+++ b/csrc/moe/marlin_moe_wna16/generate_kernels.py
@@ -25,15 +25,13 @@ TEMPLATE = ("template __global__ void Marlin<"
            "{{thread_k_blocks}}, "
            "{{'true' if m_block_size_8 else 'false'}}, "
            "{{stages}}, "
-            "{{'true' if has_act_order else 'false'}}, "
-            "{{'true' if has_zp else 'false'}}, "
            "{{group_blocks}}, "
            "{{'true' if is_zp_float else 'false'}}>"
            "( MARLIN_KERNEL_PARAMS );")

 # int8 with zero point case (vllm::kU8) is also supported,
 # we don't add it to reduce wheel size.
-SCALAR_TYPES = ["vllm::kU4", "vllm::kU4B8", "vllm::kU8B128"]
+SCALAR_TYPES = ["vllm::kU4", "vllm::kU4B8", "vllm::kU8B128", "vllm::kFE4M3fn"]
 THREAD_CONFIGS = [(128, 128, 256), (64, 256, 256), (64, 128, 128)]

 THREAD_M_BLOCKS = [0.5, 1, 2, 3, 4]
@@ -52,21 +50,29 @@ def remove_old_kernels():

 def generate_new_kernels():
    for scalar_type, dtype in itertools.product(SCALAR_TYPES, DTYPES):
-        has_zp = "B" not in scalar_type
        all_template_str_list = []

        for group_blocks, m_blocks, thread_configs in itertools.product(
                GROUP_BLOCKS, THREAD_M_BLOCKS, THREAD_CONFIGS):

-            has_act_order = group_blocks == 0
-            if has_zp and has_act_order:
+            # act order case only support gptq-int4 and gptq-int8
+            if group_blocks == 0 and scalar_type not in [
+                    "vllm::kU4B8", "vllm::kU8B128"
+            ]:
                continue
            if thread_configs[2] == 256:
+                # for small batch (m_blocks == 1), we only need (128, 128, 256)
+                # for large batch (m_blocks > 1), we only need (64, 256, 256)
                if m_blocks <= 1 and thread_configs[0] != 128:
                    continue
                if m_blocks > 1 and thread_configs[0] != 64:
                    continue

+            # we only support channelwise quantization and group_size == 128
+            # for fp8
+            if scalar_type == "vllm::kFE4M3fn" and group_blocks not in [-1, 8]:
+                continue
+
            k_blocks = thread_configs[0] // 16
            n_blocks = thread_configs[1] // 16
            threads = thread_configs[2]
@@ -82,8 +88,6 @@ def generate_new_kernels():
                thread_k_blocks=k_blocks,
                m_block_size_8=m_blocks == 0.5,
                stages="pipe_stages",
-                has_act_order=has_act_order,
-                has_zp=has_zp,
                group_blocks=group_blocks,
                is_zp_float=False,
            )

--- a/csrc/moe/marlin_moe_wna16/kernel.h
+++ b/csrc/moe/marlin_moe_wna16/kernel.h
@@ -18,7 +18,7 @@
      const float *__restrict__ topk_weights_ptr, int top_k,                \
      bool mul_topk_weights, bool is_ep, int num_groups, int prob_m,        \
      int prob_n, int prob_k, int *locks, bool use_atomic_add,              \
-      bool use_fp32_reduce
+      bool use_fp32_reduce, int max_shared_mem

 namespace MARLIN_NAMESPACE_NAME {
 template <typename scalar_t,  // compute dtype, half or nv_float16
@@ -33,11 +33,9 @@ template <typename scalar_t,  // compute dtype, half or nv_float16
                                      // only works when thread_m_blocks == 1
          const int stages,  // number of stages for the async global->shared
                             // fetch pipeline
-          const bool has_act_order,  // whether act_order is enabled
-          const bool has_zp,         // whether zero-points are enabled
-          const int group_blocks,    // number of consecutive 16x16 blocks
-                                     // with a separate quantization scale
-          const bool is_zp_float     // is zero point of float16 type?
+          const int group_blocks,  // number of consecutive 16x16 blocks
+                                   // with a separate quantization scale
+          const bool is_zp_float   // is zero point of float16 type?
          >
 __global__ void Marlin(MARLIN_KERNEL_PARAMS);


--- a/csrc/moe/marlin_moe_wna16/marlin_template.h
+++ b/csrc/moe/marlin_moe_wna16/marlin_template.h
--- a/csrc/moe/marlin_moe_wna16/ops.cu
+++ b/csrc/moe/marlin_moe_wna16/ops.cu
--- a/csrc/quantization/gptq_marlin/.gitignore
+++ b/csrc/quantization/gptq_marlin/.gitignore
+kernel_*.cu
\ No newline at end of file
--- a/csrc/quantization/gptq_marlin/dequant.h
+++ b/csrc/quantization/gptq_marlin/dequant.h
+
+#include "marlin_dtypes.cuh"
+
+namespace MARLIN_NAMESPACE_NAME {
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800
+// Lookup-table based 3-input logical operation; explicitly used for
+// dequantization as the compiler does not seem to automatically recognize it in
+// all cases.
+//
+// `lut` is passed as the immediate truth-table operand of the PTX `lop3.b32`
+// instruction. Callers in this file build it by writing the desired boolean
+// expression over the constants 0xf0 (for a), 0xcc (for b) and 0xaa (for c),
+// e.g. `(0xf0 & 0xcc) | 0xaa` for `(a & b) | c`.
+// Returns the bitwise result of that function applied to a, b and c.
+template <int lut>
+__device__ inline int lop3(int a, int b, int c) {
+  int res;
+  asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n"
+               : "=r"(res)
+               : "r"(a), "r"(b), "r"(c), "n"(lut));
+  return res;
+}
+
+// Constructs destination register by taking bytes from 2 sources (based on
+// mask)
+//
+// Emits PTX `prmt.b32` with `a` as the first source, the compile-time
+// constant `start_byte` as the second source and the compile-time constant
+// `mask` as the byte selector. The 8-bit dequant specializations below use it
+// to interleave bytes of the packed weights with a constant byte.
+template <int start_byte, int mask>
+__device__ inline uint32_t prmt(uint32_t a) {
+  uint32_t res;
+  asm volatile("prmt.b32 %0, %1, %2, %3;\n"
+               : "=r"(res)
+               : "r"(a), "n"(start_byte), "n"(mask));
+  return res;
+}
+
+// Primary declaration of the weight dequantization routine. Only the explicit
+// specializations below are defined, one per (packed 2-element compute type,
+// weight ScalarType id) pair. Each specialization reads the packed word `q`
+// and writes frag_b[0] and frag_b[1].
+template <typename scalar_t2, vllm::ScalarTypeId w_type_id>
+__device__ inline void dequant(int q, scalar_t2* frag_b);
+
+//
+// Efficiently dequantize 4bit values packed in an int32 value into a full
+// B-fragment of 4 fp16 values. We mostly follow the strategy in the link below,
+// with some small changes:
+// - FP16:
+// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L215-L287
+// - BF16:
+// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L327-L385
+//
+// uint4 with a fused bias of 8 (kU4B8) -> fp16.
+// LO selects bits 0-3 and HI bits 4-7 of each 16-bit half of `q`; OR-ing with
+// EX (0x6400 == fp16 1024.0) places the nibble in the low mantissa bits, so
+// `lo` encodes 1024 + x and `hi` encodes 1024 + 16 * x.
+template <>
+__device__ inline void dequant<half2, vllm::kU4B8.id()>(int q, half2* frag_b) {
+  const int LO = 0x000f000f;
+  const int HI = 0x00f000f0;
+  const int EX = 0x64006400;
+  // Guarantee that the `(a & b) | c` operations are LOP3s.
+  // clang-format off
+  int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
+  int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
+  // clang-format on
+  // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
+  // directly into `SUB` and `ADD`.
+  // SUB = 1032.0 (1024 + 8), MUL = 1/16, ADD = -72.0 (-(1024 / 16) - 8).
+  const int SUB = 0x64086408;
+  const int MUL = 0x2c002c00;
+  const int ADD = 0xd480d480;
+  // (1024 + x) - 1032 == x - 8
+  frag_b[0] = __hsub2(*reinterpret_cast<half2*>(&lo),
+                      *reinterpret_cast<const half2*>(&SUB));
+  // (1024 + 16 * x) / 16 - 72 == x - 8
+  frag_b[1] = __hfma2(*reinterpret_cast<half2*>(&hi),
+                      *reinterpret_cast<const half2*>(&MUL),
+                      *reinterpret_cast<const half2*>(&ADD));
+}
+
+// Plain unsigned uint4 (kU4) -> fp16. Same bit trick as the kU4B8
+// specialization, but no zero point is folded into the constants.
+template <>
+__device__ inline void dequant<half2, vllm::kU4.id()>(int q, half2* frag_b) {
+  const int LO = 0x000f000f;
+  const int HI = 0x00f000f0;
+  const int EX = 0x64006400;
+  // Guarantee that the `(a & b) | c` operations are LOP3s.
+  // clang-format off
+  int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
+  int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
+  // clang-format on
+  // Outputs stay unsigned here: `SUB` (1024.0) and `ADD` (-64.0) only remove
+  // the offset introduced by `EX`; no `-8` zero point is fused in, unlike the
+  // kU4B8 specialization.
+  const int SUB = 0x64006400;
+  const int MUL = 0x2c002c00;
+  const int ADD = 0xd400d400;
+  // (1024 + x) - 1024 == x
+  frag_b[0] = __hsub2(*reinterpret_cast<half2*>(&lo),
+                      *reinterpret_cast<const half2*>(&SUB));
+  // (1024 + 16 * x) / 16 - 64 == x
+  frag_b[1] = __hfma2(*reinterpret_cast<half2*>(&hi),
+                      *reinterpret_cast<const half2*>(&MUL),
+                      *reinterpret_cast<const half2*>(&ADD));
+}
+
+// uint4 with a fused bias of 8 (kU4B8) -> bf16.
+// EX (0x4300 == bf16 128.0) has a mantissa step of 1 at that magnitude, so
+// OR-ing a nibble into the low 4 bits yields 128 + x. The second pair of
+// nibbles is brought into position by shifting `q` right by 4.
+template <>
+__device__ inline void dequant<nv_bfloat162, vllm::kU4B8.id()>(
+    int q, nv_bfloat162* frag_b) {
+  static constexpr uint32_t MASK = 0x000f000f;
+  static constexpr uint32_t EX = 0x43004300;
+
+  // Guarantee that the `(a & b) | c` operations are LOP3s.
+  // clang-format off
+  int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
+  q >>= 4;
+  int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
+  // clang-format on
+
+  // MUL = 1.0, ADD = -136.0, i.e. (128 + x) * 1 - 136 == x - 8.
+  static constexpr uint32_t MUL = 0x3F803F80;
+  static constexpr uint32_t ADD = 0xC308C308;
+
+  frag_b[0] = __hfma2(*reinterpret_cast<nv_bfloat162*>(&lo),
+                      *reinterpret_cast<const nv_bfloat162*>(&MUL),
+                      *reinterpret_cast<const nv_bfloat162*>(&ADD));
+  frag_b[1] = __hfma2(*reinterpret_cast<nv_bfloat162*>(&hi),
+                      *reinterpret_cast<const nv_bfloat162*>(&MUL),
+                      *reinterpret_cast<const nv_bfloat162*>(&ADD));
+}
+
+// Plain unsigned uint4 (kU4) -> bf16. Identical to the kU4B8 bf16
+// specialization except that ADD only removes the 128 offset from EX.
+template <>
+__device__ inline void dequant<nv_bfloat162, vllm::kU4.id()>(
+    int q, nv_bfloat162* frag_b) {
+  static constexpr uint32_t MASK = 0x000f000f;
+  static constexpr uint32_t EX = 0x43004300;
+
+  // Guarantee that the `(a & b) | c` operations are LOP3s.
+  // clang-format off
+  int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
+  q >>= 4;
+  int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
+  // clang-format on
+
+  // MUL = 1.0, ADD = -128.0, i.e. (128 + x) * 1 - 128 == x.
+  static constexpr uint32_t MUL = 0x3F803F80;
+  static constexpr uint32_t ADD = 0xC300C300;
+
+  frag_b[0] = __hfma2(*reinterpret_cast<nv_bfloat162*>(&lo),
+                      *reinterpret_cast<const nv_bfloat162*>(&MUL),
+                      *reinterpret_cast<const nv_bfloat162*>(&ADD));
+  frag_b[1] = __hfma2(*reinterpret_cast<nv_bfloat162*>(&hi),
+                      *reinterpret_cast<const nv_bfloat162*>(&MUL),
+                      *reinterpret_cast<const nv_bfloat162*>(&ADD));
+}
+
+//
+// Fast Int8ToFp16/Int8ToBf16: Efficiently dequantize 8bit int values to fp16 or
+// bf16 Reference:
+// - FP16:
+// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L53-L85
+// - BF16:
+// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L125-L175
+//
+// uint8 with a fused bias of 128 (kU8B128) -> fp16.
+// Each prmt interleaves two bytes of `q` with the constant byte 0x64, giving
+// fp16 bit patterns 0x64XX == 1024 + XX. Selector 0x5250 picks bytes 0 and 2
+// of `q`, selector 0x5351 picks bytes 1 and 3.
+template <>
+__device__ inline void dequant<half2, vllm::kU8B128.id()>(int q,
+                                                          half2* frag_b) {
+  static constexpr uint32_t mask_for_elt_01 = 0x5250;
+  static constexpr uint32_t mask_for_elt_23 = 0x5351;
+  static constexpr uint32_t start_byte_for_fp16 = 0x64646464;
+
+  uint32_t lo = prmt<start_byte_for_fp16, mask_for_elt_01>(q);
+  uint32_t hi = prmt<start_byte_for_fp16, mask_for_elt_23>(q);
+
+  // 0x6480 == fp16 1152.0 == 1024 + 128: removes the construction offset and
+  // the 128 bias in a single subtraction.
+  static constexpr uint32_t I8s_TO_F16s_MAGIC_NUM = 0x64806480;
+
+  frag_b[0] = __hsub2(*reinterpret_cast<half2*>(&lo),
+                      *reinterpret_cast<const half2*>(&I8s_TO_F16s_MAGIC_NUM));
+  frag_b[1] = __hsub2(*reinterpret_cast<half2*>(&hi),
+                      *reinterpret_cast<const half2*>(&I8s_TO_F16s_MAGIC_NUM));
+}
+
+// Plain unsigned uint8 (kU8) -> fp16. Same byte interleave as the kU8B128
+// specialization; only the subtracted constant differs.
+template <>
+__device__ inline void dequant<half2, vllm::kU8.id()>(int q, half2* frag_b) {
+  static constexpr uint32_t mask_for_elt_01 = 0x5250;
+  static constexpr uint32_t mask_for_elt_23 = 0x5351;
+  static constexpr uint32_t start_byte_for_fp16 = 0x64646464;
+
+  uint32_t lo = prmt<start_byte_for_fp16, mask_for_elt_01>(q);
+  uint32_t hi = prmt<start_byte_for_fp16, mask_for_elt_23>(q);
+
+  // 0x6400 == fp16 1024.0: removes only the construction offset (no bias).
+  static constexpr uint32_t I8s_TO_F16s_MAGIC_NUM = 0x64006400;
+
+  frag_b[0] = __hsub2(*reinterpret_cast<half2*>(&lo),
+                      *reinterpret_cast<const half2*>(&I8s_TO_F16s_MAGIC_NUM));
+  frag_b[1] = __hsub2(*reinterpret_cast<half2*>(&hi),
+                      *reinterpret_cast<const half2*>(&I8s_TO_F16s_MAGIC_NUM));
+}
+
+// uint8 with a fused bias of 128 (kU8B128) -> bf16, via fp32 intermediates.
+// Each byte of `q` is placed in the low byte of 0x4B000000 (fp32 2^23), which
+// makes the float equal to 8388608 + byte. After removing the offset, the
+// upper 16 bits of each fp32 are taken as the bf16 result.
+template <>
+__device__ inline void dequant<nv_bfloat162, vllm::kU8B128.id()>(
+    int q, nv_bfloat162* frag_b) {
+  float fp32_intermediates[4];
+  uint32_t* fp32_intermediates_casted =
+      reinterpret_cast<uint32_t*>(fp32_intermediates);
+
+  static constexpr uint32_t fp32_base = 0x4B000000;
+  // Selector low nibble picks the byte of `q`: 0, 2, 1, 3 in that order, so
+  // intermediates [0], [1] hold bytes 0 and 2 and [2], [3] hold bytes 1 and 3.
+  fp32_intermediates_casted[0] = __byte_perm(q, fp32_base, 0x7650);
+  fp32_intermediates_casted[1] = __byte_perm(q, fp32_base, 0x7652);
+  fp32_intermediates_casted[2] = __byte_perm(q, fp32_base, 0x7651);
+  fp32_intermediates_casted[3] = __byte_perm(q, fp32_base, 0x7653);
+
+  // 8388736 == 2^23 + 128: removes the construction offset and the 128 bias.
+  fp32_intermediates[0] -= 8388736.f;
+  fp32_intermediates[1] -= 8388736.f;
+  fp32_intermediates[2] -= 8388736.f;
+  fp32_intermediates[3] -= 8388736.f;
+
+  // Selector 0x7632 packs the upper two bytes of each of the two inputs into
+  // one 32-bit word, i.e. two bf16 values.
+  uint32_t* bf16_result_ptr = reinterpret_cast<uint32_t*>(frag_b);
+  bf16_result_ptr[0] = __byte_perm(fp32_intermediates_casted[0],
+                                   fp32_intermediates_casted[1], 0x7632);
+  bf16_result_ptr[1] = __byte_perm(fp32_intermediates_casted[2],
+                                   fp32_intermediates_casted[3], 0x7632);
+}
+
+// Plain unsigned uint8 (kU8) -> bf16, via fp32 intermediates. Same scheme as
+// the kU8B128 bf16 specialization; only the subtracted constant differs.
+template <>
+__device__ inline void dequant<nv_bfloat162, vllm::kU8.id()>(
+    int q, nv_bfloat162* frag_b) {
+  float fp32_intermediates[4];
+  uint32_t* fp32_intermediates_casted =
+      reinterpret_cast<uint32_t*>(fp32_intermediates);
+
+  static constexpr uint32_t fp32_base = 0x4B000000;
+  // Each byte of `q` becomes the low byte of fp32 2^23, i.e. 8388608 + byte.
+  fp32_intermediates_casted[0] = __byte_perm(q, fp32_base, 0x7650);
+  fp32_intermediates_casted[1] = __byte_perm(q, fp32_base, 0x7652);
+  fp32_intermediates_casted[2] = __byte_perm(q, fp32_base, 0x7651);
+  fp32_intermediates_casted[3] = __byte_perm(q, fp32_base, 0x7653);
+
+  // 8388608 == 2^23: removes only the construction offset (no bias).
+  fp32_intermediates[0] -= 8388608.f;
+  fp32_intermediates[1] -= 8388608.f;
+  fp32_intermediates[2] -= 8388608.f;
+  fp32_intermediates[3] -= 8388608.f;
+
+  // Keep the upper 16 bits of each fp32 as the bf16 result.
+  uint32_t* bf16_result_ptr = reinterpret_cast<uint32_t*>(frag_b);
+  bf16_result_ptr[0] = __byte_perm(fp32_intermediates_casted[0],
+                                   fp32_intermediates_casted[1], 0x7632);
+  bf16_result_ptr[1] = __byte_perm(fp32_intermediates_casted[2],
+                                   fp32_intermediates_casted[3], 0x7632);
+}
+
+// FP8 (E4M3) -> fp16.
+// The sign bit of each FP8 byte is kept in place, while its exponent and
+// mantissa bits are shifted right by the difference in exponent width so they
+// line up with the fp16 fields. The resulting value is then multiplied by a
+// power of two to compensate for the different exponent biases.
+template <>
+__device__ inline void dequant<half2, vllm::kFE4M3fn.id()>(int q,
+                                                           half2* frag_b) {
+  // Constants for FP8 (E4M3) and FP16 formats
+  constexpr int FP8_EXPONENT = 4, FP8_MANTISSA = 3, FP16_EXPONENT = 5;
+  constexpr int RIGHT_SHIFT = FP16_EXPONENT - FP8_EXPONENT;
+
+  // Calculate MASK for extracting mantissa and exponent
+  constexpr int MASK1 = 0x80000000;
+  constexpr int MASK2 = MASK1 >> (FP8_EXPONENT + FP8_MANTISSA);
+  constexpr int MASK3 = MASK2 & 0x7fffffff;
+  constexpr int MASK = MASK3 | (MASK3 >> 16);
+  // Final MASK value: 0x7F007F00
+
+  // Extract and shift FP8 values to FP16 format
+  // Out1 converts the upper byte of each 16-bit half of `q`; Out2 first
+  // shifts `q` left by 8 so the lower bytes take that position.
+  int Out1 = (q & 0x80008000) | ((q & MASK) >> RIGHT_SHIFT);
+  int Out2 = ((q << 8) & 0x80008000) | (((q << 8) & MASK) >> RIGHT_SHIFT);
+
+  // Construct and apply exponent bias
+  // BIAS_OFFSET == 16 - 8 == 8, so bias_reg holds 2^8 in both lanes.
+  constexpr int BIAS_OFFSET =
+      (1 << (FP16_EXPONENT - 1)) - (1 << (FP8_EXPONENT - 1));
+  const half2 bias_reg = __float2half2_rn(float(1 << BIAS_OFFSET));
+
+  // Convert to half2 and apply bias
+  // Note: reverse indexing is intentional because weights are permuted
+  frag_b[1] = __hmul2(*reinterpret_cast<const half2*>(&Out1), bias_reg);
+  frag_b[0] = __hmul2(*reinterpret_cast<const half2*>(&Out2), bias_reg);
+}
+
+// FP8 (E4M3) -> bf16.
+// Same approach as the fp16 specialization: keep the sign bit, shift exponent
+// and mantissa right by the exponent-width difference (4 bits here), then
+// multiply by a power of two to compensate for the exponent bias difference.
+template <>
+__device__ inline void dequant<nv_bfloat162, vllm::kFE4M3fn.id()>(
+    int q, nv_bfloat162* frag_b) {
+  // Constants for FP8 (E4M3) and BF16 formats
+  constexpr int FP8_EXPONENT = 4, FP8_MANTISSA = 3, BF16_EXPONENT = 8;
+  constexpr int RIGHT_SHIFT = BF16_EXPONENT - FP8_EXPONENT;
+
+  // Calculate MASK for extracting mantissa and exponent
+  constexpr int MASK1 = 0x80000000;
+  constexpr int MASK2 = MASK1 >> (FP8_EXPONENT + FP8_MANTISSA);
+  constexpr int MASK3 = MASK2 & 0x7fffffff;
+  constexpr int MASK = MASK3 | (MASK3 >> 16);
+  // Final MASK value: 0x7F007F00
+
+  // Extract and shift FP8 values to BF16 format
+  // Out1 converts the upper byte of each 16-bit half of `q`; Out2 first
+  // shifts `q` left by 8 so the lower bytes take that position.
+  int Out1 = (q & 0x80008000) | ((q & MASK) >> RIGHT_SHIFT);
+  int Out2 = ((q << 8) & 0x80008000) | (((q << 8) & MASK) >> RIGHT_SHIFT);
+
+  // Construct and apply exponent bias
+  // BIAS_OFFSET == 128 - 8 == 120, too large for an integer shift, so the
+  // scale 2^120 is built directly from its fp32 bit pattern below.
+  constexpr int BIAS_OFFSET =
+      (1 << (BF16_EXPONENT - 1)) - (1 << (FP8_EXPONENT - 1));
+  // Add 127 (float exponent bias) to BIAS_OFFSET and shift to float exponent
+  // position
+  constexpr uint32_t BIAS = (BIAS_OFFSET + 127) << 23;
+  const nv_bfloat162 bias_reg =
+      __float2bfloat162_rn(*reinterpret_cast<const float*>(&BIAS));
+
+  // Convert to bfloat162 and apply bias
+  // Note: reverse indexing is intentional because weights are permuted
+  frag_b[1] = __hmul2(*reinterpret_cast<const nv_bfloat162*>(&Out1), bias_reg);
+  frag_b[0] = __hmul2(*reinterpret_cast<const nv_bfloat162*>(&Out2), bias_reg);
+}
+
+#endif
+
+}  // namespace MARLIN_NAMESPACE_NAME
--- a/csrc/quantization/gptq_marlin/generate_kernels.py
+++ b/csrc/quantization/gptq_marlin/generate_kernels.py
+# SPDX-License-Identifier: Apache-2.0
+import glob
+import itertools
+import os
+import subprocess
+
+import jinja2
+
+FILE_HEAD = """
+// auto generated by generate.py
+// clang-format off
+
+#include "kernel.h"
+#include "marlin_template.h"
+
+namespace MARLIN_NAMESPACE_NAME {
+""".strip()
+
+TEMPLATE = ("template __global__ void Marlin<"
+            "{{scalar_t}}, "
+            "{{w_type_id}}, "
+            "{{threads}}, "
+            "{{thread_m_blocks}}, "
+            "{{thread_n_blocks}}, "
+            "{{thread_k_blocks}}, "
+            "{{'true' if m_block_size_8 else 'false'}}, "
+            "{{stages}}, "
+            "{{group_blocks}}, "
+            "{{'true' if is_zp_float else 'false'}}>"
+            "( MARLIN_KERNEL_PARAMS );")
+
+# int8 with zero point case (vllm::kU8) is also supported,
+# we don't add it to reduce wheel size.
+SCALAR_TYPES = ["vllm::kU4", "vllm::kU4B8", "vllm::kU8B128", "vllm::kFE4M3fn"]
+THREAD_CONFIGS = [(128, 128, 256), (64, 256, 256), (64, 128, 128),
+                  (128, 64, 128)]
+
+THREAD_M_BLOCKS = [0.5, 1, 2, 3, 4]
+# group_blocks:
+#   = 0 : act order case
+#   = -1 : channelwise quantization
+#   > 0 : group_size=16*group_blocks
+GROUP_BLOCKS = [0, -1, 2, 4, 8]
+DTYPES = ["fp16", "bf16"]
+
+
+def remove_old_kernels():
+    for filename in glob.glob(os.path.dirname(__file__) + "/kernel_*.cu"):
+        subprocess.call(["rm", "-f", filename])
+
+
+def generate_new_kernels():
+    for scalar_type, dtype in itertools.product(SCALAR_TYPES, DTYPES):
+        all_template_str_list = []
+
+        for group_blocks, m_blocks, thread_configs in itertools.product(
+                GROUP_BLOCKS, THREAD_M_BLOCKS, THREAD_CONFIGS):
+
+            # act order case only support gptq-int4 and gptq-int8
+            if group_blocks == 0 and scalar_type not in [
+                    "vllm::kU4B8", "vllm::kU8B128"
+            ]:
+                continue
+            if thread_configs[2] == 256:
+                # for small batch (m_blocks == 1), we only need (128, 128, 256)
+                # for large batch (m_blocks > 1), we only need (64, 256, 256)
+                if m_blocks <= 1 and thread_configs[0] != 128:
+                    continue
+                if m_blocks > 1 and thread_configs[0] != 64:
+                    continue
+
+            # we only support channelwise quantization and group_size == 128
+            # for fp8
+            if scalar_type == "vllm::kFE4M3fn" and group_blocks not in [-1, 8]:
+                continue
+
+            k_blocks = thread_configs[0] // 16
+            n_blocks = thread_configs[1] // 16
+            threads = thread_configs[2]
+
+            c_dtype = "half" if dtype == "fp16" else "nv_bfloat16"
+
+            is_zp_float_list = [False]
+            if dtype == "fp16" and scalar_type == "vllm::kU4" and \
+                    group_blocks == 4:
+                # HQQ (is_zp_float = true) only supports
+                # 4bit quantization and fp16
+                is_zp_float_list.append(True)
+
+            for is_zp_float in is_zp_float_list:
+                template_str = jinja2.Template(TEMPLATE).render(
+                    scalar_t=c_dtype,
+                    w_type_id=scalar_type + ".id()",
+                    threads=threads,
+                    thread_m_blocks=max(m_blocks, 1),
+                    thread_n_blocks=n_blocks,
+                    thread_k_blocks=k_blocks,
+                    m_block_size_8=m_blocks == 0.5,
+                    stages="pipe_stages",
+                    group_blocks=group_blocks,
+                    is_zp_float=is_zp_float,
+                )
+
+                all_template_str_list.append(template_str)
+
+        file_content = FILE_HEAD + "\n\n"
+        file_content += "\n\n".join(all_template_str_list) + "\n\n}\n"
+        filename = f"kernel_{dtype}_{scalar_type[6:].lower()}.cu"
+
+        with open(os.path.join(os.path.dirname(__file__), filename), "w") as f:
+            f.write(file_content)
+
+
+# Script entry point: drop the kernels from any earlier run first so that
+# files for combinations that are no longer generated do not linger.
+if __name__ == "__main__":
+    remove_old_kernels()
+    generate_new_kernels()
--- a/csrc/quantization/gptq_marlin/gptq_marlin.cu
+++ b/csrc/quantization/gptq_marlin/gptq_marlin.cu
--- a/csrc/quantization/gptq_marlin/kernel.h
+++ b/csrc/quantization/gptq_marlin/kernel.h
+
+#ifndef MARLIN_NAMESPACE_NAME
+  #define MARLIN_NAMESPACE_NAME marlin
+#endif
+
+#include "marlin.cuh"
+#include "marlin_dtypes.cuh"
+#include "core/scalar_type.hpp"
+
+// Parameter list shared by the `Marlin` kernel declaration below and by the
+// explicit instantiations emitted by generate_kernels.py, which expand it as
+// `( MARLIN_KERNEL_PARAMS )`.
+#define MARLIN_KERNEL_PARAMS                                                 \
+  const int4 *__restrict__ A, const int4 *__restrict__ B,                    \
+      int4 *__restrict__ C, int4 *__restrict__ C_tmp,                        \
+      const int4 *__restrict__ scales_ptr, const int4 *__restrict__ zp_ptr,  \
+      const int *__restrict__ g_idx, int num_groups, int prob_m, int prob_n, \
+      int prob_k, int lda, int *locks, bool use_atomic_add,                  \
+      bool use_fp32_reduce, int max_shared_mem
+
+namespace MARLIN_NAMESPACE_NAME {
+// Declaration of the Marlin kernel template. The generated kernel_*.cu files
+// provide explicit instantiations with template arguments in exactly this
+// order.
+template <typename scalar_t,  // compute dtype, half or nv_bfloat16
+          const vllm::ScalarTypeId w_type_id,  // weight ScalarType id
+          const int threads,          // number of threads in a threadblock
+          const int thread_m_blocks,  // number of 16x16 blocks in the m
+                                      // dimension (batchsize) of the
+                                      // threadblock
+          const int thread_n_blocks,  // same for n dimension (output)
+          const int thread_k_blocks,  // same for k dimension (reduction)
+          const bool m_block_size_8,  // whether m_block_size == 8
+                                      // only works when thread_m_blocks == 1
+          const int stages,  // number of stages for the async global->shared
+                             // fetch pipeline
+          const int group_blocks,  // number of consecutive 16x16 blocks
+                                   // with a separate quantization scale
+          const bool is_zp_float   // is zero point of float16 type?
+          >
+__global__ void Marlin(MARLIN_KERNEL_PARAMS);
+
+}
--- a/csrc/quantization/gptq_marlin/marlin_template.h
+++ b/csrc/quantization/gptq_marlin/marlin_template.h
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@@ -291,12 +291,11 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {

  // gptq_marlin Optimized Quantized GEMM for GPTQ.
  ops.def(
-      "gptq_marlin_gemm(Tensor a, Tensor b_q_weight, Tensor b_scales, "
-      "Tensor b_zeros, Tensor g_idx, Tensor perm, Tensor workspace, "
-      "int b_q_type, "
+      "gptq_marlin_gemm(Tensor a, Tensor? c_or_none, Tensor b_q_weight, "
+      "Tensor b_scales, Tensor? b_zeros_or_none, Tensor? g_idx_or_none, "
+      "Tensor? perm_or_none, Tensor workspace, int b_q_type, "
      "SymInt size_m, SymInt size_n, SymInt size_k, bool is_k_full, "
-      "bool has_zp, bool use_atomic_add, bool use_fp32_reduce, "
-      "bool is_zp_float) -> Tensor",
+      "bool use_atomic_add, bool use_fp32_reduce, bool is_zp_float) -> Tensor",
      {stride_tag});
  // conditionally compiled so impl registration is in source file

@@ -341,14 +340,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  ops.def("ggml_moe_get_block_size", &ggml_moe_get_block_size);

 #ifndef USE_ROCM
-  // fp8_marlin Optimized Quantized GEMM for FP8 weight-only.
-  ops.def(
-      "fp8_marlin_gemm(Tensor a, Tensor b_q_weight, Tensor b_scales, "
-      "Tensor! workspace, int num_bits, SymInt size_m, SymInt size_n, "
-      "SymInt size_k) -> Tensor",
-      {stride_tag});
-  // conditionally compiled so impl registration is in source file
-
  // marlin_qqq_gemm for QQQ.
  ops.def(
      "marlin_qqq_gemm(Tensor a, Tensor b_q_weight, "

--- a/tests/kernels/moe/test_moe.py
+++ b/tests/kernels/moe/test_moe.py
@@ -11,19 +11,20 @@ from transformers import MixtralConfig
 from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock

 import vllm.model_executor.layers.fused_moe  # noqa
-from tests.kernels.utils import (opcheck, stack_and_dev, torch_moe,
-                                 torch_moe_single)
+from tests.kernels.utils import opcheck, stack_and_dev, torch_moe
 from vllm.model_executor.layers.fused_moe import fused_moe
 from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
 from vllm.model_executor.layers.fused_moe.moe_torch_iterative import (
    fused_moe as iterative_moe)
+from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
+    marlin_quant_fp8_torch)
 from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
    awq_marlin_quantize, marlin_quantize)
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
    quantize_weights)
 from vllm.model_executor.models.mixtral import MixtralMoE
 from vllm.platforms import current_platform
-from vllm.scalar_type import scalar_types
+from vllm.scalar_type import ScalarType, scalar_types

 NUM_EXPERTS = [8, 64]
 EP_SIZE = [1, 4]
@@ -285,7 +286,7 @@ def test_mixtral_moe(dtype: torch.dtype, padding: bool, use_rocm_aiter: bool,
                                   atol=mixtral_moe_tol[dtype])


-@pytest.mark.parametrize("m", [1, 33, 123])
+@pytest.mark.parametrize("m", [1, 123, 666])
 @pytest.mark.parametrize("n", [128, 1024])
 @pytest.mark.parametrize("k", [256, 2048])
 @pytest.mark.parametrize("e", [4, 12])
@@ -294,8 +295,10 @@ def test_mixtral_moe(dtype: torch.dtype, padding: bool, use_rocm_aiter: bool,
 @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
 @pytest.mark.parametrize("group_size", [-1, 32, 128])
 @pytest.mark.parametrize("act_order", [True, False])
-@pytest.mark.parametrize("num_bits", [4, 8])
-@pytest.mark.parametrize("has_zp", [True, False])
+@pytest.mark.parametrize("quant_type", [
+    scalar_types.uint4, scalar_types.uint8b128, scalar_types.uint4b8,
+    scalar_types.float8_e4m3fn
+])
 @pytest.mark.parametrize("is_k_full", [True, False])
 @pytest.mark.skipif(current_platform.is_rocm(), reason="Skip for rocm")
 def test_fused_marlin_moe(
@@ -308,14 +311,22 @@ def test_fused_marlin_moe(
    dtype: torch.dtype,
    group_size: int,
    act_order: bool,
-    num_bits: int,
-    has_zp: bool,
+    quant_type: ScalarType,
    is_k_full: bool,
 ):
-    current_platform.seed_everything(7)
+    torch.cuda.manual_seed(0)
+    has_zp = quant_type in [scalar_types.uint4, scalar_types.uint8]
+
+    if quant_type == scalar_types.float8_e4m3fn:
+        if group_size not in [-1, 128]:
+            return
+        if act_order:
+            return

    # Filter act_order
    if act_order:
+        if quant_type == scalar_types.float8_e4m3fn:
+            return
        if group_size == -1:
            return
        if group_size in (k, n):
@@ -326,17 +337,9 @@ def test_fused_marlin_moe(
        if not is_k_full:
            return

-    if has_zp:
-        # we don't build kernel for int8 with zero
-        if num_bits == 8:
-            return
-        quant_type = scalar_types.uint4 if num_bits == 4 else scalar_types.uint8
-    else:
-        quant_type = scalar_types.uint4b8 \
-                if num_bits == 4 else scalar_types.uint8b128
    a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
-    w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
-    w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10
+    w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 20
+    w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 20

    if ep_size > 1:
        local_e = e // ep_size
@@ -364,17 +367,23 @@ def test_fused_marlin_moe(
            qweight1_l.append(qweight1)
            scales1_l.append(scales1)
            zeros1_l.append(zeros1)
-        else:
+        elif quant_type != scalar_types.float8_e4m3fn:
            test_perm = torch.randperm(k)
-            quant_res = marlin_quantize(w1[i].transpose(1, 0), quant_type,
-                                        group_size, act_order, test_perm)
-            w_ref1, qweight1, scales1, g_idx1, sort_indices1, _ = quant_res
+            w_ref1, qweight1, scales1, g_idx1, sort_indices1, _ = \
+                marlin_quantize(w1[i].transpose(1, 0), quant_type,
+                                group_size, act_order, test_perm)

            w_ref1_l.append(w_ref1.T)
            qweight1_l.append(qweight1)
            scales1_l.append(scales1)
            g_idx1_l.append(g_idx1)
            sort_indices1_l.append(sort_indices1)
+        else:
+            w_ref1, qweight1, scales1 = marlin_quant_fp8_torch(
+                w1[i], group_size)
+            w_ref1_l.append(w_ref1.T)
+            qweight1_l.append(qweight1)
+            scales1_l.append(scales1)

    w_ref1 = stack_and_dev(w_ref1_l)
    qweight1 = stack_and_dev(qweight1_l).contiguous()
@@ -399,17 +408,23 @@ def test_fused_marlin_moe(
            qweight2_l.append(qweight2)
            scales2_l.append(scales2)
            zeros2_l.append(zeros2)
-        else:
+        elif quant_type != scalar_types.float8_e4m3fn:
            test_perm = torch.randperm(n)
-            quant_res = marlin_quantize(w2[i].transpose(1, 0), quant_type,
-                                        group_size, act_order, test_perm)
-            w_ref2, qweight2, scales2, g_idx2, sort_indices2, _ = quant_res
+            w_ref2, qweight2, scales2, g_idx2, sort_indices2, _ = \
+                marlin_quantize(w2[i].transpose(1, 0), quant_type,
+                                group_size, act_order, test_perm)

            w_ref2_l.append(w_ref2.T)
            qweight2_l.append(qweight2)
            scales2_l.append(scales2)
            g_idx2_l.append(g_idx2)
            sort_indices2_l.append(sort_indices2)
+        else:
+            w_ref2, qweight2, scales2 = marlin_quant_fp8_torch(
+                w2[i], group_size)
+            w_ref2_l.append(w_ref2.T)
+            qweight2_l.append(qweight2)
+            scales2_l.append(scales2)

    w_ref2 = stack_and_dev(w_ref2_l)
    qweight2 = stack_and_dev(qweight2_l).contiguous()
@@ -442,102 +457,10 @@ def test_fused_marlin_moe(
        sort_indices2=sort_indices2,
        w1_zeros=zeros1,
        w2_zeros=zeros2,
-        num_bits=num_bits,
+        quant_type_id=quant_type.id,
        is_k_full=is_k_full)

-    torch.testing.assert_close(marlin_output, torch_output, atol=2e-2, rtol=0)
-
-
-@pytest.mark.skip("This test is here for the sake of debugging, "
-                  "don't run it in automated tests.")
-@pytest.mark.parametrize("m", [1, 33, 123])
-@pytest.mark.parametrize("n", [128, 1024])
-@pytest.mark.parametrize("k", [256, 2048])
-@pytest.mark.parametrize("e", [4, 12])
-@pytest.mark.parametrize("topk", [2, 3])
-@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
-@pytest.mark.parametrize("group_size", [-1, 32, 128])
-@pytest.mark.parametrize("act_order", [True, False])
-@pytest.mark.parametrize("num_bits", [4, 8])
-@pytest.mark.parametrize("has_zp", [True, False])
-@pytest.mark.parametrize("is_k_full", [True, False])
-def test_single_marlin_moe_multiply(m: int, n: int, k: int, e: int, topk: int,
-                                    dtype: torch.dtype, group_size: int,
-                                    act_order: bool, num_bits: int,
-                                    has_zp: bool, is_k_full: bool):
-    # Filter act_order
-    if act_order:
-        if group_size == -1:
-            return
-        if group_size in (k, n):
-            return
-        if has_zp:
-            return
-    else:
-        if not is_k_full:
-            return
-
-    if has_zp:
-        quant_type = scalar_types.uint4 if num_bits == 4 else scalar_types.uint8
-    else:
-        quant_type = scalar_types.uint4b8 \
-                if num_bits == 4 else scalar_types.uint8b128
-    a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
-    w = torch.randn((e, n, k), device="cuda", dtype=dtype) / 10
-
-    w_ref_l = []
-    qweight_l = []
-    scales_l = []
-    zeros_l = []
-    g_idx_l = []
-    sort_indices_l = []
-
-    for i in range(w.shape[0]):
-        if has_zp:
-            w_ref, qweight, scales, zeros = awq_marlin_quantize(
-                w[i].transpose(1, 0), quant_type, group_size)
-
-            w_ref_l.append(w_ref.T)
-            qweight_l.append(qweight)
-            scales_l.append(scales)
-            zeros_l.append(zeros)
-        else:
-            test_perm = torch.randperm(k)
-            w_ref, qweight, scales, g_idx, sort_indices, _ = marlin_quantize(
-                w[i].transpose(1, 0), quant_type, group_size, act_order,
-                test_perm)
-
-            w_ref_l.append(w_ref.T)
-            qweight_l.append(qweight)
-            scales_l.append(scales)
-            g_idx_l.append(g_idx)
-            sort_indices_l.append(sort_indices)
-
-    w_ref = stack_and_dev(w_ref_l)
-    qweight = stack_and_dev(qweight_l).contiguous()
-    scales = stack_and_dev(scales_l)
-    g_idx = stack_and_dev(g_idx_l) if g_idx_l else None
-    zeros = stack_and_dev(zeros_l) if zeros_l else None
-    sort_indices = stack_and_dev(sort_indices_l) if sort_indices_l else None
-
-    score = torch.randn((m, e), device="cuda", dtype=dtype)
-    marlin_output = torch.ops.vllm.single_marlin_moe(
-        a,
-        qweight,
-        scales,
-        score,
-        topk,
-        renormalize=False,
-        g_idx=g_idx,
-        sort_indices=sort_indices,
-        w_zeros=zeros,
-        num_bits=num_bits,
-        is_k_full=is_k_full,
-    )
-
-    torch_output = torch_moe_single(a, w_ref, score, topk)
-
-    torch.testing.assert_close(marlin_output, torch_output, atol=2e-2, rtol=0)
+    torch.testing.assert_close(marlin_output, torch_output, atol=5e-2, rtol=0)


 def test_moe_align_block_size_opcheck():

--- a/tests/kernels/quantization/test_awq_marlin.py
+++ b/tests/kernels/quantization/test_awq_marlin.py
-# SPDX-License-Identifier: Apache-2.0
-"""Test AWQ with fused MoE Marlin kernels.
-
-Run `pytest tests/kernels/test_awq_marlin.py`.
-"""
-import pytest
-import torch
-
-import vllm.model_executor.layers.fused_moe  # noqa
-from tests.kernels.utils import (compute_max_diff, stack_and_dev, torch_moe,
-                                 torch_moe_single)
-from vllm import _custom_ops as ops
-from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
-from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
-    awq_marlin_quantize)
-from vllm.scalar_type import scalar_types
-
-NUM_EXPERTS = [8, 64]
-TOP_KS = [2, 6]
-GROUP_SIZES = [-1, 32, 128]
-
-
-@pytest.mark.parametrize("m", [1, 33, 64, 222])
-@pytest.mark.parametrize("n", [128, 2048])
-@pytest.mark.parametrize("k", [128, 1024])
-@pytest.mark.parametrize("e", NUM_EXPERTS)
-@pytest.mark.parametrize("topk", TOP_KS)
-@pytest.mark.parametrize("group_size", GROUP_SIZES)
-@pytest.mark.skipif(not (ops.supports_moe_ops
-                         and hasattr(torch.ops._moe_C, "marlin_gemm_moe")),
-                    reason="Marlin is not supported on this GPU type.")
-def test_fused_marlin_moe_awq(
-    m: int,
-    n: int,
-    k: int,
-    e: int,
-    topk: int,
-    group_size: int,
-):
-    torch.manual_seed(7)
-
-    num_bits = 4
-    quant_type = scalar_types.uint4
-    dtype = torch.float16
-    a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
-    w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
-    w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10
-
-    w_ref1_l = []
-    qweights1_l = []
-    scales1_l = []
-    zp1_l = []
-
-    for i in range(w1.shape[0]):
-        w_ref1, qweight1, scales1, zp1 = awq_marlin_quantize(
-            w1[i].transpose(1, 0), quant_type, group_size)
-        w_ref1_l.append(w_ref1)
-        qweights1_l.append(qweight1)
-        scales1_l.append(scales1)
-        zp1_l.append(zp1)
-
-    w_ref1 = stack_and_dev(w_ref1_l)
-    qweight1 = stack_and_dev(qweights1_l).contiguous()
-    scales1 = stack_and_dev(scales1_l)
-    zp1 = stack_and_dev(zp1_l)
-
-    w_ref2_l = []
-    qweights2_l = []
-    scales2_l = []
-    zp2_l = []
-
-    for i in range(w2.shape[0]):
-        w_ref2, qweight2, scales2, zp2 = awq_marlin_quantize(
-            w2[i].transpose(1, 0), quant_type, group_size)
-        w_ref2_l.append(w_ref2)
-        qweights2_l.append(qweight2)
-        scales2_l.append(scales2)
-        zp2_l.append(zp2)
-
-    w_ref2 = stack_and_dev(w_ref2_l)
-    qweight2 = stack_and_dev(qweights2_l).contiguous()
-    scales2 = stack_and_dev(scales2_l)
-    zp2 = stack_and_dev(zp2_l)
-
-    score = torch.randn((m, e), device="cuda", dtype=dtype)
-
-    topk_weights, topk_ids, token_expert_indices = fused_topk(
-        a, score, topk, False)
-    marlin_output = torch.ops.vllm.fused_marlin_moe(
-        a,
-        qweight1,
-        qweight2,
-        scales1,
-        scales2,
-        score,
-        topk_weights,
-        topk_ids,
-        w1_zeros=zp1,
-        w2_zeros=zp2,
-        num_bits=num_bits,
-    )
-
-    torch_output = torch_moe(a, w_ref1.transpose(1, 2), w_ref2.transpose(1, 2),
-                             score, topk, None)
-
-    assert compute_max_diff(marlin_output, torch_output) < 4e-2
-
-
-@pytest.mark.skip("This test is here for the sake of debugging, "
-                  "don't run it in automated tests.")
-@pytest.mark.parametrize("m", [64, 512, 222, 33, 1])
-@pytest.mark.parametrize("n", [128, 2048, 256, 1024])
-@pytest.mark.parametrize("k", [128, 1024, 512])
-@pytest.mark.parametrize("e", [8, 64])
-@pytest.mark.parametrize("topk", [2, 6])
-@pytest.mark.parametrize("group_size", [-1, 32, 64, 128])
-def test_single_marlin_moe_multiply_awq(
-    m: int,
-    n: int,
-    k: int,
-    e: int,
-    topk: int,
-    group_size: int,
-):
-    torch.manual_seed(7)
-
-    num_bits = 4
-    quant_type = scalar_types.uint4
-    dtype = torch.float16
-    a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
-    w = torch.randn((e, n, k), device="cuda", dtype=dtype) / 10
-
-    w_ref_l = []
-    qweights_l = []
-    scales_l = []
-    zp_l = []
-
-    for i in range(w.shape[0]):
-        w_ref, qweight, scales, zp = awq_marlin_quantize(
-            w[i].transpose(1, 0), quant_type, group_size)
-        w_ref_l.append(w_ref)
-        qweights_l.append(qweight)
-        scales_l.append(scales)
-        zp_l.append(zp)
-
-    w_ref = stack_and_dev(w_ref_l)
-    qweight = stack_and_dev(qweights_l).contiguous()
-    scales = stack_and_dev(scales_l).contiguous()
-    zp = stack_and_dev(zp_l).contiguous()
-
-    score = torch.randn((m, e), device="cuda", dtype=dtype)
-
-    marlin_output = torch.ops.vllm.single_marlin_moe(a,
-                                                     qweight,
-                                                     scales,
-                                                     score,
-                                                     topk,
-                                                     renormalize=False,
-                                                     w_zeros=zp,
-                                                     num_bits=num_bits)
-
-    torch_output = torch_moe_single(a, w_ref.transpose(1, 2), score, topk)
-
-    assert compute_max_diff(marlin_output, torch_output) < 1e-2
--- a/tests/kernels/quantization/test_marlin_gemm.py
+++ b/tests/kernels/quantization/test_marlin_gemm.py
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
--- a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
--- a/vllm/model_executor/layers/quantization/awq_marlin.py
+++ b/vllm/model_executor/layers/quantization/awq_marlin.py
@@ -22,9 +22,10 @@ from vllm.model_executor.layers.quantization.utils import replace_parameter
 from vllm.model_executor.layers.quantization.utils.marlin_utils import (
    apply_awq_marlin_linear, awq_to_marlin_zero_points, check_marlin_supported,
    check_marlin_supports_layer, check_moe_marlin_supports_layer,
-    marlin_make_empty_g_idx, marlin_make_workspace, marlin_moe_permute_scales,
-    marlin_permute_scales, moe_awq_to_marlin_zero_points,
-    verify_marlin_supported, verify_marlin_supports_shape)
+    marlin_make_empty_g_idx, marlin_make_workspace_new,
+    marlin_moe_permute_scales, marlin_permute_scales,
+    moe_awq_to_marlin_zero_points, verify_marlin_supported,
+    verify_marlin_supports_shape)
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.model_executor.parameter import (GroupQuantScaleParameter,
                                           PackedvLLMParameter)
@@ -267,8 +268,7 @@ class AWQMarlinLinearMethod(LinearMethodBase):
                                          requires_grad=False)

        # Allocate marlin workspace
-        layer.workspace = marlin_make_workspace(
-            layer.output_size_per_partition, device)
+        layer.workspace = marlin_make_workspace_new(device)

        # Repack weights from AWQ format to marlin format.
        marlin_qweight = ops.awq_marlin_repack(
@@ -322,6 +322,9 @@ class AWQMoEMethod(FusedMoEMethodBase):

    def __init__(self, quant_config: AWQMarlinConfig):
        self.quant_config = quant_config
+        if self.quant_config.weight_bits != 4:
+            raise ValueError("AWQMoEMethod only supports 4bit now.")
+        self.quant_type = scalar_types.uint4

    def create_weights(self, layer: torch.nn.Module, num_experts: int,
                       hidden_size: int, intermediate_size_per_partition: int,
@@ -396,11 +399,7 @@ class AWQMoEMethod(FusedMoEMethodBase):
        set_weight_attrs(w2_qzeros, extra_weight_attrs)

        device = layer.w13_qweight.device
-        sms = torch.cuda.get_device_properties(device).multi_processor_count
-        layer.workspace = torch.zeros((sms * 4, ),
-                                      dtype=torch.int,
-                                      device=device,
-                                      requires_grad=False)
+        layer.workspace = marlin_make_workspace_new(device, 4)

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        num_experts = layer.w13_qweight.shape[0]
@@ -511,10 +510,9 @@ class AWQMoEMethod(FusedMoEMethodBase):
            router_logits,
            topk_weights,
            topk_ids,
+            quant_type_id=self.quant_type.id,
            global_num_experts=global_num_experts,
            expert_map=expert_map,
            w1_zeros=layer.w13_qzeros,
            w2_zeros=layer.w2_qzeros,
-            workspace=layer.workspace,
-            num_bits=self.quant_config.weight_bits,
-        )
+            workspace=layer.workspace)
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py