修复awq的triton支持，和moe模型的接口bug，以及awq_moe_marlin的接口相关问题，以及解决一些w4a16的精度问题

b129793f · chenyue3 · 8900b622 · b129793f · b129793f · b129793f
Commit b129793f authored Feb 03, 2026 by chenyue3
11 changed files
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -1540,15 +1540,12 @@ def awq_marlin_moe_repack(
 ) -> torch.Tensor:
    num_experts = b_q_weight.shape[0]
    assert size_k % 16 == 0
-    output = torch.empty(
-        (num_experts, size_k // 16, size_n * (num_bits // 2)),
-        device=b_q_weight.device,
-        dtype=b_q_weight.dtype,
-    )
+    output = torch.empty((num_experts, size_k // 16, size_n * (num_bits // 2)),
+                         device=b_q_weight.device,
+                         dtype=b_q_weight.dtype)
    for e in range(num_experts):
-        output[e] = torch.ops._C.awq_marlin_repack(
-            b_q_weight[e], size_k, size_n, num_bits, is_a_8bit
-        )
+        output[e] = op.awq_marlin_repack(b_q_weight[e], size_k,
+                                                   size_n, num_bits)
    return output



--- a/vllm/model_executor/layers/fused_moe/fused_marlin_moe_w4a16.py
+++ b/vllm/model_executor/layers/fused_moe/fused_marlin_moe_w4a16.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Fused MoE utilities for GPTQ."""
+import functools
+from typing import Optional
+
+import torch
+try:
+    import lightop
+except Exception:
+    print("INFO: Please install lightop if you want to infer awq of marlin.\n") 
+
+import vllm.envs as envs
+import vllm._custom_ops as ops
+from vllm.model_executor.layers.fused_moe.fused_moe import (
+    moe_align_block_size, try_get_optimal_moe_config)
+from vllm.model_executor.layers.quantization.utils.marlin_utils import (
+    marlin_make_workspace_new, maybe_warn_marlin_atomic_add)
+from vllm.scalar_type import ScalarType, scalar_types
+from vllm.utils import direct_register_custom_op
+from vllm.model_executor.layers.fused_moe.fused_moe import get_moe_cache
+def get_scalar_type(num_bits: int, has_zp: bool):
+    if has_zp:
+        return scalar_types.uint4 if num_bits == 4 else scalar_types.uint8
+    else:
+        return scalar_types.uint4b8 if num_bits == 4 else scalar_types.uint8b128
+
+def fused_marlin_moe_int4(
+                     hidden_states: torch.Tensor, # 32, 7168 
+                     w1: torch.Tensor, # 256, 512, 7168 --> 32*8, 512 --> 32*8, 256
+                     w2: torch.Tensor, # 256, 256, 7168
+                     w1_scale_zero: torch.Tensor,
+                     w2_scale_zero: torch.Tensor,
+                     gating_output: torch.Tensor,
+                     topk_weights: torch.Tensor,
+                     topk_ids: torch.Tensor,
+                     global_num_experts: int = -1,      
+                     expert_map: Optional[torch.Tensor] = None,
+                     g_idx1: Optional[torch.Tensor] = None,
+                     g_idx2: Optional[torch.Tensor] = None,
+                     sort_indices1: Optional[torch.Tensor] = None,
+                     sort_indices2: Optional[torch.Tensor] = None,
+                     w1_zeros: Optional[torch.Tensor] = None,
+                     w2_zeros: Optional[torch.Tensor] = None,
+                     workspace: Optional[torch.Tensor] = None,
+                     num_bits: int = 4,
+                     is_k_full: bool = True,
+                     inplace: bool = False) -> torch.Tensor:
+    """
+    This function computes a Mixture of Experts (MoE) layer using two sets of
+    weights, w1 and w2, and top-k gating mechanism.
+
+    Parameters:
+    - hidden_states (torch.Tensor): The input tensor to the MoE layer.
+    - w1 (torch.Tensor): The first set of expert weights.
+    - w2 (torch.Tensor): The second set of expert weights.
+    - w1_scale (torch.Tensor): Scale to be used for w1.
+    - w2_scale (torch.Tensor): Scale to be used for w2.
+    - gating_output (torch.Tensor): The output of the gating operation
+        (before softmax).
+    - g_idx1 (Optional[torch.Tensor]): The first set of act_order indices.
+    - g_idx2 (Optional[torch.Tensor]): The second set of act_order indices.
+    - sort_indices1 (Optional[torch.Tensor]): The first act_order input
+        permutation.
+    - sort_indices2 (Optional[torch.Tensor]): The second act_order input
+        permutation.
+    - topk_weights (torch.Tensor): Top-k weights.
+    - topk_ids (torch.Tensor): Indices of topk-k elements.
+    - w1_zeros (Optional[torch.Tensor]): Optional zero points to be used for w1.
+    - w2_zeros (Optional[torch.Tensor]): Optional zero points to be used for w2.
+    - num_bits (bool): The number of bits in expert weights quantization.
+
+    Returns:
+    - torch.Tensor: The output tensor after applying the MoE layer.
+    """
+    # quant_type = ScalarType.from_id(quant_type_id)
+    # assert quant_type in [
+    #     scalar_types.uint4, scalar_types.uint8b128, scalar_types.uint4b8,
+    #     scalar_types.float8_e4m3fn, scalar_types.float4_e2m1f
+    # ]
+
+    # bit4_scalar_types = [
+    #     scalar_types.uint4, scalar_types.uint4b8, scalar_types.float4_e2m1f
+    # ]
+    # num_bits = 4 if quant_type in bit4_scalar_types else 8
+
+    # Check constraints.
+    assert hidden_states.shape[0] == gating_output.shape[
+        0], "Number of tokens mismatch"
+    assert hidden_states.shape[
+        1] == w1.shape[1] * 16, "Hidden size mismatch w1"
+    assert hidden_states.shape[1] == w2.shape[2] // (
+        num_bits // 2), "Hidden size mismatch w2"
+    assert hidden_states.is_contiguous(), "Hidden_states must be contiguous"
+    assert w1.is_contiguous(), "Expert weights1 must be contiguous"
+    assert w2.is_contiguous(), "Expert weights2 must be contiguous"
+    assert hidden_states.dtype in [torch.float16, torch.bfloat16]
+    # assert num_bits in [4]
+    assert num_bits in [4]
+
+    num_tokens, K = hidden_states.shape # 32, 7168
+    E = w1.shape[0] # 256
+    N = w2.shape[1] * 16 # 256
+    topk = topk_ids.shape[1] # 8
+    
+    #暂时固定为16384
+    #CHUNK_SIZE = 16384
+    CHUNK_SIZE = envs.VLLM_FUSED_MOE_CHUNK_SIZE
+    M = min(num_tokens, CHUNK_SIZE)
+
+    if workspace is None:
+        sms = torch.cuda.get_device_properties(device='cuda').multi_processor_count
+        workspace = torch.zeros(sms * 3,
+                                dtype=torch.int,
+                                device=hidden_states.device,
+                                requires_grad=False)
+
+    scalar_type1 = get_scalar_type(num_bits, w1_zeros is not None)
+    scalar_type2 = get_scalar_type(num_bits, w2_zeros is not None)
+    if global_num_experts == -1:
+        global_num_experts = E
+    intermediate_cache2 = torch.empty( 
+        (M * topk, N),
+        device=hidden_states.device,
+        dtype=hidden_states.dtype,
+    )
+    if envs.VLLM_USE_GLOBAL_CACHE13:
+        intermediate_cache13 = get_moe_cache(topk, N, K, device=hidden_states.device, dtype=hidden_states.dtype)
+    else:    
+        intermediate_cache13 = torch.empty(
+            (M * topk * max(2 * N, K), ),
+            device=hidden_states.device,
+            dtype=hidden_states.dtype,
+        )
+    intermediate_cache1 = intermediate_cache13[:M * topk * 2 * N]
+    intermediate_cache1 = intermediate_cache1.view(-1, 2 * N)
+    intermediate_cache3 = intermediate_cache13[:M * topk * K] 
+    intermediate_cache3 = intermediate_cache3.view(-1, K) 
+
+    use_atomic_add = hidden_states.dtype == torch.half or \
+        torch.cuda.get_device_capability(hidden_states.device)[0] >= 9
+
+    if inplace:
+        out_hidden_states = hidden_states
+    else:
+        out_hidden_states = torch.empty_like(hidden_states)
+
+    for chunk in range((num_tokens // CHUNK_SIZE) + 1):
+
+        begin_chunk_idx, end_chunk_idx = (chunk * CHUNK_SIZE,
+                                          min((chunk + 1) * CHUNK_SIZE,
+                                              num_tokens))
+        curr_hidden_states = hidden_states[begin_chunk_idx:end_chunk_idx]
+        tokens_in_chunk, _ = curr_hidden_states.size()
+
+        if tokens_in_chunk == 0:
+            break
+        intermediate_cache3 = intermediate_cache3.view(-1, K) 
+        if tokens_in_chunk < CHUNK_SIZE and chunk > 0:
+            intermediate_cache1 = intermediate_cache1[:tokens_in_chunk * topk, :]
+            intermediate_cache2 = intermediate_cache2[:tokens_in_chunk * topk, :]
+            intermediate_cache3 = intermediate_cache3[:tokens_in_chunk * topk, :]
+            M = tokens_in_chunk
+
+        # Select block_size_m
+        for block_size_m in [16, 32, 48, 64, 80]:
+            if M * topk / E / block_size_m < 0.9:
+                break
+
+        curr_topk_ids = topk_ids[begin_chunk_idx:end_chunk_idx]
+        curr_topk_weights = topk_weights[begin_chunk_idx:end_chunk_idx]
+
+        sorted_token_ids, expert_ids, num_tokens_post_padded = moe_align_block_size(curr_topk_ids, block_size_m, global_num_experts, expert_map)
+
+        intermediate_cache1 = lightop.moe_marlin_w4a16(
+            curr_hidden_states, 
+            intermediate_cache1,
+            w1,
+            w1_scale_zero,
+            g_idx1,
+            sort_indices1,
+            workspace,
+            sorted_token_ids,
+            expert_ids,
+            num_tokens_post_padded,
+            curr_topk_weights,
+            block_size_m,
+            topk,
+            False,
+            expert_map is not None,
+            M, 
+            2 * N,
+            K,
+            is_k_full,
+            use_atomic_add,
+            True,
+            False
+        )
+
+        torch.ops._C.silu_and_mul(intermediate_cache2, intermediate_cache1)
+
+        intermediate_cache3 = lightop.moe_marlin_w4a16(
+            intermediate_cache2,
+            intermediate_cache3,
+            w2,
+            w2_scale_zero,
+            g_idx2,
+            sort_indices2,
+            workspace,
+            sorted_token_ids,
+            expert_ids,
+            num_tokens_post_padded,
+            curr_topk_weights,
+            block_size_m,
+            1,
+            True,
+            expert_map is not None,
+            M * topk,
+            K,
+            N,
+            is_k_full,
+            use_atomic_add,
+            True,
+            False
+            ).view(-1, topk, K)
+        
+        ops.moe_sum(intermediate_cache3.view(*intermediate_cache3.shape), out_hidden_states[begin_chunk_idx:end_chunk_idx])
+
+    return out_hidden_states
+
+def fused_marlin_moe_int4_fake(
+                     hidden_states: torch.Tensor, # 32, 7168 
+                     w1: torch.Tensor, # 256, 512, 7168 --> 32*8, 512 --> 32*8, 256
+                     w2: torch.Tensor, # 256, 256, 7168
+                     w1_scale_zero: torch.Tensor,
+                     w2_scale_zero: torch.Tensor,
+                     gating_output: torch.Tensor,
+                     topk_weights: torch.Tensor,
+                     topk_ids: torch.Tensor,
+                     global_num_experts: int = -1,      
+                     expert_map: Optional[torch.Tensor] = None,
+                     g_idx1: Optional[torch.Tensor] = None,
+                     g_idx2: Optional[torch.Tensor] = None,
+                     sort_indices1: Optional[torch.Tensor] = None,
+                     sort_indices2: Optional[torch.Tensor] = None,
+                     w1_zeros: Optional[torch.Tensor] = None,
+                     w2_zeros: Optional[torch.Tensor] = None,
+                     workspace: Optional[torch.Tensor] = None,
+                     num_bits: int = 4,
+                     is_k_full: bool = True,
+                     inplace: bool = False) -> torch.Tensor:
+    return torch.empty_like(hidden_states)
+
+
+direct_register_custom_op(
+    op_name="fused_marlin_moe_int4",
+    op_func=fused_marlin_moe_int4,
+    mutates_args=[],
+    fake_impl=fused_marlin_moe_int4_fake,
+)
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -234,163 +234,6 @@ def write_zeros_to_output(
    c_mask = token_mask[:, None] & (offs_cn[None, :] < N)
    tl.store(c_ptrs, accumulator, mask=c_mask)

-@triton.jit
-def fused_moe_kernel_awq(
-        # Pointers to matrices
-        a_ptr, # [4, 7168] 
-        b_ptr, # [256, 512, 3584]
-        c_ptr, # (8, 8, 512)
-        b_scale_ptr, # (256, 512, 56)
-        b_zp_ptr, # (256, 256, 56)
-        topk_weights_ptr, 
-        sorted_token_ids_ptr, # [0, 1, 2, 3, 4]
-        expert_ids_ptr,
-        num_tokens_post_padded_ptr,
-        # Matrix dimensions
-        N: tl.constexpr,
-        K: tl.constexpr,
-        EM, # pading后的总索引长度
-        num_valid_tokens, # 有效索引的上限
-        # The stride variables represent how much to increase the ptr by when
-        # moving by 1 element in a particular dimension. E.g. `stride_am` is
-        # how much to increase `a_ptr` by to get the element one row down
-        # (A has M rows).
-        stride_am,
-        stride_ak,
-        stride_be,
-        stride_bk, #1
-        stride_bn,
-        stride_cm,
-        stride_cn,
-        stride_bse,
-        stride_bsk,#1
-        stride_bsn,
-        stride_bze,
-        stride_bzk,
-        stride_bzn,
-        block_k_diviable: tl.constexpr,
-        group_size: tl.constexpr, # 128
-        # Meta-parameters
-        BLOCK_SIZE_M: tl.constexpr,
-        BLOCK_SIZE_N: tl.constexpr,
-        BLOCK_SIZE_K: tl.constexpr,
-        GROUP_SIZE_M: tl.constexpr,
-        MUL_ROUTED_WEIGHT: tl.constexpr,
-        top_k: tl.constexpr,
-        compute_type: tl.constexpr,
-        has_zp: tl.constexpr,
-        use_int4_w4a16: tl.constexpr,
-        use_int8_w8a16: tl.constexpr):
-    pid = tl.program_id(axis=0)
-    num_pid_m = tl.cdiv(EM, BLOCK_SIZE_M)
-    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
-    num_pid_in_group = GROUP_SIZE_M * num_pid_n
-    group_id = pid // num_pid_in_group
-    first_pid_m = group_id * GROUP_SIZE_M
-    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
-    pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m)
-    pid_n = (pid % num_pid_in_group) // group_size_m
-
-    num_tokens_post_padded = tl.load(num_tokens_post_padded_ptr)
-    if pid_m * BLOCK_SIZE_M >= num_tokens_post_padded:
-        return
-    offs_token_id = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
-    offs_token = tl.load(sorted_token_ids_ptr + offs_token_id) # [block_m]
-    token_mask = offs_token < num_valid_tokens
-
-    offs_bn = (pid_n * BLOCK_SIZE_N +
-               tl.arange(0, BLOCK_SIZE_N)) % N # [block_n]
-    offs_k = tl.arange(0, BLOCK_SIZE_K) # 0, 1, 2, ...... , 127 # # [block_k]
-    offs_k2 = tl.arange(0, BLOCK_SIZE_K // 2) # 0, 1, 2, ...... , 127 # # [block_k]
-    a_ptrs = a_ptr + (offs_token[:, None] // top_k * stride_am +
-                      offs_k[None, :] * stride_ak) # [block_m, block_k] 
-
-    off_experts = tl.load(expert_ids_ptr + pid_m)
-
-    if use_int4_w4a16:
-        # [0, 1, 2, ...... , 126, 127] --> [0, 0, 1, 1 ...... , 63, 63] 
-        # [128, 129, 130, ...... , 254, 255] --> [64, 64, 65, 65 ...... , 127, 127] 
-        
-        # b_ptrs = b_ptr + off_experts * stride_be + \
-        #     (offs_k[:, None] // 2) * stride_bk + offs_bn[None, :] * stride_bn
-        b_ptrs = b_ptr + off_experts * stride_be + \
-            offs_bn[:, None] * stride_bn + (offs_k2[None, :]) * stride_bk 
-        # tl.device_print("stride_bn",stride_bsn)>1
-        # tl.device_print("stride_bk",stride_bk)=1
-        b_shifter = (offs_k[:, None] % 2) * 4  # 0, 4
-    elif use_int8_w8a16:
-        b_ptrs = b_ptr + off_experts * stride_be + \
-            offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn
-
-    if not has_zp and use_int4_w4a16:
-        b_zp_num = 8
-    if not has_zp and use_int8_w8a16:
-        b_zp_num = 128
-    elif has_zp and use_int4_w4a16:
-        b_zp_shifter = (offs_bn[None, :] % 2) * 4 #  0, 4
-
-    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
-        if not block_k_diviable:
-            k_mask = offs_k[:, None] < K - k * BLOCK_SIZE_K
-            k_other = 0.0
-        else:
-            k_mask = None
-            k_other = None
-
-        a = tl.load(a_ptrs,
-                    mask=token_mask[:, None] &
-                    (offs_k[None, :] < K - k * BLOCK_SIZE_K),
-                    other=0.0)
-        b = tl.load(b_ptrs)
-        if use_int4_w4a16:
-            b = tl.interleave(b, b)
-            b= b.trans()
-            b = (b >> b_shifter) & 0xF
-
-        b_scale_ptrs = b_scale_ptr + off_experts * stride_bse + \
-            offs_bn[None, :] * stride_bsk + \
-            ((offs_k[:, None] + BLOCK_SIZE_K * k) // group_size) * stride_bsn
-        qzeros_scles = tl.load(b_scale_ptrs, mask=k_mask, other=k_other)
-
-        scales_int16 = tl.cast(qzeros_scles,tl.uint16)
-        b_scale = tl.cast(scales_int16,tl.float16,bitcast=True)
-        # tl.device_print("b_scale dequant",b_scale)
-
-        mid = qzeros_scles >> 16
-        # b_zp = tl.cast(mid,tl.float16,bitcast=False)
-        b_zp = tl.cast(mid,tl.float16)
-        # b_zp = tl.cast(zeros_int16,tl.float16,bitcast=False)
-
-        # tl.device_print("bzp",b_zp)
-
-        # We accumulate along the K dimension.
-        b = ((b - b_zp) * b_scale).to(tl.float16)
-        accumulator = tl.dot(a, b, acc=accumulator)
-
-        # Advance the ptrs to the next K block.
-        a_ptrs += BLOCK_SIZE_K * stride_ak
-        if use_int4_w4a16:
-            b_ptrs += (BLOCK_SIZE_K // 2) * stride_bk
-        else:
-            b_ptrs += BLOCK_SIZE_K * stride_bk
-
-    if MUL_ROUTED_WEIGHT:
-        moe_weight = tl.load(topk_weights_ptr + offs_token,
-                             mask=token_mask,
-                             other=0)
-        accumulator = accumulator * moe_weight[:, None]
-
-    accumulator = accumulator.to(compute_type)
-    # -----------------------------------------------------------
-    # Write back the block of the output
-    offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    c_ptrs = c_ptr + stride_cm * offs_token[:, None] + stride_cn * offs_cn[
-        None, :]
-    c_mask = token_mask[:, None] & (offs_cn[None, :] < N)
-    tl.store(c_ptrs, accumulator, mask=c_mask)
-
-
 @triton.jit
 def fused_moe_kernel_gptq_awq(
    # Pointers to matrices
@@ -993,98 +836,59 @@ def invoke_fused_moe_wna16_triton_kernel(
        triton.cdiv(EM, META["BLOCK_SIZE_M"])
        * triton.cdiv(B.size(1), META["BLOCK_SIZE_N"]),
    )
-    config = config.copy()
-    config.update(
-        get_moe_wna16_block_config(
-            config=config,
-            use_moe_wna16_cuda=False,
-            num_valid_tokens=num_tokens,
-            size_k=A.size(1),
-            size_n=B.size(1),
-            num_experts=B.size(1),
-            group_size=block_shape[1],
-            real_top_k=top_k,
-            block_size_m=config["BLOCK_SIZE_M"],
-        )
+    # config = config.copy()
+    # config.update(
+    #     get_moe_wna16_block_config(
+    #         config=config,
+    #         use_moe_wna16_cuda=False,
+    #         num_valid_tokens=num_tokens,
+    #         size_k=A.size(1),
+    #         size_n=B.size(1),
+    #         num_experts=B.size(1),
+    #         group_size=block_shape[1],
+    #         real_top_k=top_k,
+    #         block_size_m=config["BLOCK_SIZE_M"],
+    #     )
+    # )
+
+    fused_moe_kernel_gptq_awq[grid](
+        A,
+        B,
+        C,
+        B_scale,
+        B_zp,
+        topk_weights,
+        sorted_token_ids,
+        expert_ids,
+        num_tokens_post_padded,
+        B.size(1),
+        A.size(1),
+        EM,
+        num_tokens,
+        A.stride(0),
+        A.stride(1),
+        B.stride(0),
+        B.stride(2),
+        B.stride(1),
+        C.stride(1),
+        C.stride(2),
+        B_scale.stride(0),
+        B_scale.stride(2),
+        B_scale.stride(1),
+        B_zp.stride(0) if B_zp is not None else 0,
+        B_zp.stride(2) if B_zp is not None else 0,
+        B_zp.stride(1) if B_zp is not None else 0,
+        block_k_diviable=A.size(1) % config["BLOCK_SIZE_K"] == 0,
+        group_size=block_shape[1],
+        MUL_ROUTED_WEIGHT=mul_routed_weight,
+        top_k=top_k,
+        compute_type=compute_type,
+        has_zp=B_zp is not None,
+        use_int4_w4a16=use_int4_w4a16,
+        use_int8_w8a16=use_int8_w8a16,
+        **config,
    )

-    if os.environ.get('AWQ_MOE_SZ') == '1':
-        fused_moe_kernel_awq[grid](
-            A,
-            B,
-            C,
-            B_scale,
-            B_zp,
-            topk_weights,
-            sorted_token_ids,
-            expert_ids,
-            num_tokens_post_padded,
-            B.size(1),
-            A.size(1),
-            EM,
-            topk_ids.numel(),
-            A.stride(0),
-            A.stride(1),
-            B.stride(0),
-            B.stride(2),
-            B.stride(1),
-            C.stride(1),
-            C.stride(2),
-            B_scale.stride(0),
-            B_scale.stride(2),
-            B_scale.stride(1),
-            B_zp.stride(0) if B_zp is not None else 0,
-            B_zp.stride(2) if B_zp is not None else 0,
-            B_zp.stride(1) if B_zp is not None else 0,
-            block_k_diviable=A.size(1) % config["BLOCK_SIZE_K"] == 0,
-            group_size=block_shape[1],
-            MUL_ROUTED_WEIGHT=mul_routed_weight,
-            top_k=top_k,
-            compute_type=compute_type,
-            has_zp=B_zp is not None,
-            use_int4_w4a16=use_int4_w4a16,
-            use_int8_w8a16=use_int8_w8a16,
-            **config,
-        )
-    else:
-        fused_moe_kernel_gptq_awq[grid](
-            A,
-            B,
-            C,
-            B_scale,
-            B_zp,
-            topk_weights,
-            sorted_token_ids,
-            expert_ids,
-            num_tokens_post_padded,
-            B.size(1),
-            A.size(1),
-            EM,
-            num_tokens,
-            A.stride(0),
-            A.stride(1),
-            B.stride(0),
-            B.stride(2),
-            B.stride(1),
-            C.stride(1),
-            C.stride(2),
-            B_scale.stride(0),
-            B_scale.stride(2),
-            B_scale.stride(1),
-            B_zp.stride(0) if B_zp is not None else 0,
-            B_zp.stride(2) if B_zp is not None else 0,
-            B_zp.stride(1) if B_zp is not None else 0,
-            block_k_diviable=A.size(1) % config["BLOCK_SIZE_K"] == 0,
-            group_size=block_shape[1],
-            MUL_ROUTED_WEIGHT=mul_routed_weight,
-            top_k=top_k,
-            compute_type=compute_type,
-            has_zp=B_zp is not None,
-            use_int4_w4a16=use_int4_w4a16,
-            use_int8_w8a16=use_int8_w8a16,
-            **config,
-        )
-

 def invoke_fused_moe_triton_kernel(
    A: torch.Tensor,
@@ -1150,7 +954,7 @@ def invoke_fused_moe_triton_kernel(
    HAS_BIAS = B_bias is not None

    # config = config.copy()
-    # config["SPLIT_K"] = 1
+    #config["SPLIT_K"] = 1
    # BLOCK_SIZE_K = config.pop("BLOCK_SIZE_K")
    # if block_shape is not None:
    #     BLOCK_SIZE_K = min(BLOCK_SIZE_K, min(block_shape[0], block_shape[1]))
@@ -1218,9 +1022,11 @@ def dispatch_fused_moe_kernel(
    use_int8_w8a8: bool,
    use_int8_w8a16: bool,
    use_int4_w4a16: bool,
+    use_int4_w4a8: bool,
    per_channel_quant: bool,
    block_shape: list[int] | None = None,
    B_bias: torch.Tensor | None = None,
+    use_nn_moe: bool | None = False,
 ) -> None:
    assert topk_weights is not None or not mul_routed_weight
    assert topk_weights is None or topk_weights.stride(1) == 1
@@ -1614,7 +1420,7 @@ def get_default_config(
            "BLOCK_SIZE_N": 64,
            "BLOCK_SIZE_K": 32,
            "GROUP_SIZE_M": 8,
-            "SPLIT_K": 1,
+            
        }
        return config

@@ -1628,7 +1434,6 @@ def get_default_config(
            "BLOCK_SIZE_N": block_shape[0],
            "BLOCK_SIZE_K": block_shape[1],
            "GROUP_SIZE_M": 32,
-            "SPLIT_K": 1,
            "num_warps": 4,
            "num_stages": 3 if not current_platform.is_rocm() else 2,
        }
@@ -1652,7 +1457,7 @@ def get_default_config(
            "BLOCK_SIZE_N": 32,
            "BLOCK_SIZE_K": 64,
            "GROUP_SIZE_M": 1,
-            "SPLIT_K": 1,
+
        }
    else:
        config = {
@@ -1660,7 +1465,7 @@ def get_default_config(
            "BLOCK_SIZE_N": 64,
            "BLOCK_SIZE_K": 32,
            "GROUP_SIZE_M": 8,
-            "SPLIT_K": 1,
+
        }

    if use_nn_moe:

--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -1672,7 +1672,7 @@ class FusedMoE(CustomOp):
                else:
                    topk_weights, topk_ids = ops.moe_fused_gate(
                        router_logits,
-                        e_score_correction_bias=self.e_score_correction_bias,
+                        self.e_score_correction_bias,
                        num_expert_group=self.num_expert_group,
                        topk_group=self.topk_group,
                        topk=self.top_k,

--- a/vllm/model_executor/layers/quantization/awq_marlin.py
+++ b/vllm/model_executor/layers/quantization/awq_marlin.py
@@ -14,7 +14,7 @@ from vllm.model_executor.layers.fused_moe.config import (
    FusedMoEConfig,
    FusedMoEQuantConfig,
 )
-from vllm.model_executor.layers.fused_moe.fused_marlin_moe import fused_marlin_moe
+from vllm.model_executor.layers.fused_moe.fused_marlin_moe_w4a16 import fused_marlin_moe_int4
 from vllm.model_executor.layers.fused_moe.fused_moe_router import FusedMoERouter
 from vllm.model_executor.layers.fused_moe.layer import (
    FusedMoE,
@@ -50,6 +50,7 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils import (
    moe_awq_to_marlin_zero_points,
    verify_marlin_supported,
    verify_marlin_supports_shape,
+    awq_marlin_moe_permute_sz
 )
 from vllm.model_executor.layers.quantization.utils.quant_utils import is_layer_skipped
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
@@ -793,19 +794,15 @@ class AWQMarlinMoEMethod(FusedMoEMethodBase):
            use_fused_gate=use_fused_gate,
        )

-        return fused_marlin_moe(
+        return fused_marlin_moe_int4(
            x,
            layer.w13_qweight,
            layer.w2_qweight,
-            getattr(layer, "w13_bias", None),
-            getattr(layer, "w2_bias", None),
            layer.w13_scales,
            layer.w2_scales,
            router_logits,
            topk_weights,
            topk_ids,
-            input_global_scale1=getattr(layer, "w13_input_global_scale", None),
-            input_global_scale2=getattr(layer, "w2_input_global_scale", None),
            # quant_type_id=self.quant_type.id,
            # apply_router_weight_on_input=layer.apply_router_weight_on_input,
            global_num_experts=layer.global_num_experts,
@@ -813,6 +810,5 @@ class AWQMarlinMoEMethod(FusedMoEMethodBase):
            w1_zeros=layer.w13_qzeros,
            w2_zeros=layer.w2_qzeros,
            workspace=layer.workspace,
-            input_dtype=self.input_dtype,
            num_bits=4,
        )
--- a/vllm/model_executor/layers/quantization/awq_triton.py
+++ b/vllm/model_executor/layers/quantization/awq_triton.py
@@ -114,21 +114,10 @@ def awq_dequantize_kernel(


 @triton.jit
-def awq_gemm_kernel(
-    a_ptr,
-    b_ptr,
-    c_ptr,
-    zeros_ptr,
-    scales_ptr,
-    M,
-    N,
-    K,
-    group_size,
-    BLOCK_SIZE_M: tl.constexpr,
-    BLOCK_SIZE_N: tl.constexpr,
-    BLOCK_SIZE_K: tl.constexpr,
-    SPLIT_K: tl.constexpr,
-):
+def awq_gemm_kernel(a_ptr, b_ptr, c_ptr, zeros_ptr, scales_ptr, M, N, K,
+                    group_size, BLOCK_SIZE_M: tl.constexpr,
+                    BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
+                    GROUP_SIZE_M: tl.constexpr,SPLIT_K: tl.constexpr):
    pid = tl.program_id(axis=0)
    pid_z = tl.program_id(1)
    
@@ -154,17 +143,18 @@ def awq_gemm_kernel(
    # (BLOCK_SIZE_M, BLOCK_SIZE_N))
    # accumulator = accumulator & 0x0
    # accumulator = accumulator.to(accumulator_dtype)
-    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=accumulator_dtype)
+    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N),
+                           dtype=tl.float32)

    # Create reverse AWQ order as tensor: [0, 4, 1, 5, 2, 6, 3, 7]
    # that will map given indices to the correct order.
-    reverse_awq_order_tensor = (
-        (tl.arange(0, 2) * 4)[None, :] + tl.arange(0, 4)[:, None]
-    ).reshape(8)
+    shifts = ((tl.arange(0, 2) * 16)[None, :] +
+                                (tl.arange(0, 4) * 4)[:, None]).reshape(1,8)

    # Create the necessary shifts to use to unpack.
-    shifts = reverse_awq_order_tensor * 4
-    shifts = tl.broadcast_to(shifts[None, :], (BLOCK_SIZE_K * (BLOCK_SIZE_N // 8), 8))
+    # shifts = reverse_awq_order_tensor * 4
+    shifts = tl.broadcast_to(shifts,
+                             (BLOCK_SIZE_K * (BLOCK_SIZE_N // 8), 8))
    shifts = tl.reshape(shifts, (BLOCK_SIZE_K, BLOCK_SIZE_N))

    # Offsets and masks.
@@ -202,10 +192,8 @@ def awq_gemm_kernel(
        b =tl.join(b, b).reshape(b.shape[:-1] + [2 * b.shape[-1]])

        # Dequantize b.
-        offsets_szk = (
-            BLOCK_SIZE_K * SPLIT_K * k + pid_z * BLOCK_SIZE_K
-        ) // group_size 
-        offsets_szk = offsets_szk + (tl.arange(0, BLOCK_SIZE_K) // group_size) 
+        offsets_szk = (BLOCK_SIZE_K * SPLIT_K * k + pid_z * BLOCK_SIZE_K) // group_size 
+        offsets_szk = offsets_szk + (tl.arange(0,BLOCK_SIZE_K) // group_size) 
        offsets_z = (N_8) * offsets_szk[:, None] + offsets_bzn[None, :]
        masks_zk = offsets_szk < K // group_size
        masks_z = masks_zk[:, None] & masks_bzn[None, :]
@@ -307,17 +295,12 @@ def awq_dequantize_triton(
 # qzeros  - [K // G, N // 8]
 # scales  - [K // G, N]
 # split_k_iters - parallelism along K-dimension, int, power of 2.
-def awq_gemm_triton(
-    input: torch.Tensor,
-    qweight: torch.Tensor,
-    scales: torch.Tensor,
-    qzeros: torch.Tensor,
-    split_k_iters: int,
-    block_size_m: int = 32,
-    block_size_n: int = 32,
-    block_size_k: int = 32,
-    config = None,
-) -> torch.Tensor:
+def awq_gemm_triton(input: torch.Tensor,
+                    qweight: torch.Tensor,
+                    scales: torch.Tensor,
+                    qzeros: torch.Tensor,
+                    split_k_iters: int,
+                    config=None) -> torch.Tensor:
    M, K = input.shape
    N = qweight.shape[1] * 8
    group_size = qweight.shape[0] // qzeros.shape[0]
@@ -332,37 +315,27 @@ def awq_gemm_triton(
    assert group_size in AWQ_TRITON_SUPPORTED_GROUP_SIZES or group_size == K

    grid = lambda META: (
-        triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]),
-        split_k_iters,
+        triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(N, META['BLOCK_SIZE_N']),
+        META['SPLIT_K'],
    )
-    
    if config is None:
        config= {'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8,'SPLIT_K': 8}
        if M >256:
        #print("INFO:this size not found in json.")
            config= {'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8,'SPLIT_K': 1}
-
-    result = torch.zeros((split_k_iters, M, N), dtype=scales.dtype, device=input.device)
+        
+    result = torch.zeros((M, N), dtype=scales.dtype, device=input.device)

    # A = input, B = qweight, C = result
    # A = M x K, B = K x N, C = M x N
-    awq_gemm_kernel[grid](
-        input,
-        qweight,
-        result,
-        qzeros,
-        scales,
-        M,
-        N,
-        K,
-        group_size,
-        BLOCK_SIZE_M=block_size_m,
-        BLOCK_SIZE_N=block_size_n,
-        BLOCK_SIZE_K=block_size_k,
-        SPLIT_K=split_k_iters,
-        **config,
-    )
-
-    result = result.sum(0)
-
+    awq_gemm_kernel[grid](input,
+                        qweight,
+                        result,
+                        qzeros,
+                        scales,
+                        M,
+                        N,
+                        K,
+                        group_size,
+                        **config)
    return result
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -1161,7 +1161,6 @@ class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod):
            global_num_experts=layer.global_num_experts,
            expert_map=layer.expert_map,
            quant_config=self.moe_quant_config,
-            use_fused_gate=use_fused_gate,
            use_nn_moe=False,
        )

@@ -1756,6 +1755,9 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
        router: FusedMoERouter,
        x: torch.Tensor,
        router_logits: torch.Tensor,
+        use_nn_moe: bool | None = False,
+        use_fused_gate: bool | None = False,
+        **_
    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
        from vllm.model_executor.layers.fused_moe import fused_experts

@@ -1776,6 +1778,7 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
            global_num_experts=layer.global_num_experts,
            expert_map=layer.expert_map,
            quant_config=self.moe_quant_config,
+            use_nn_moe=False,
        )

    @property

--- a/vllm/model_executor/layers/quantization/moe_wna16.py
+++ b/vllm/model_executor/layers/quantization/moe_wna16.py
@@ -233,12 +233,6 @@ class MoeWNA16Method(FusedMoEMethodBase):
    def __init__(self, quant_config: MoeWNA16Config, moe: "FusedMoEConfig") -> None:
        super().__init__(moe)
        self.quant_config = quant_config
-        self.use_w4a16_moe_sz = os.environ.get('AWQ_MOE_SZ') == '1'
-        self.use_w4a16_cuda = 0
-        self.use_moe_lmslim = 0
-        if self.use_w4a16_moe_sz:
-            self.use_w4a16_cuda = os.environ['W4A16_MOE_CUDA'] == '1'
-            self.use_moe_lmslim = os.environ['W4A16_MOE_LMSLIM'] == "1"

    def create_weights(
        self,
@@ -397,46 +391,7 @@ class MoeWNA16Method(FusedMoEMethodBase):
            router_logits=router_logits,
            use_fused_gate=use_fused_gate,
        )
-      
-        # TODO @yangql
-        if self.use_moe_lmslim:
-            return fused_experts_w4a16(
-                    x,
-                    layer.w13_qweight,
-                    layer.w2_qweight,
-                    topk_weights=topk_weights,
-                    topk_ids=topk_ids,
-                    inplace=True,
-                    activation=activation,         
-                    apply_router_weight_on_input=apply_router_weight_on_input,
-                    use_int4_w4a16=True,
-                    global_num_experts=global_num_experts,
-                    expert_map=expert_map,
-                    w1_scale=layer.w13_scales,
-                    w2_scale=layer.w2_scales,
-                    block_shape=[0, layer.group_size])
-        
-        if self.use_w4a16_cuda:
-            m = topk_ids.shape[0]
-            if m <= 512:
-                return fused_experts_cuda(x,
-                         layer.w13_qweight,
-                         layer.w2_qweight,
-                         topk_weights,
-                         topk_ids,
-                         inplace=True,
-                         use_fp8_w8a8=False,
-                         use_int4_w4a16=True,
-                         use_int8_w8a16=False,
-                         w1_scale=layer.w13_scales,
-                         w2_scale=layer.w2_scales,
-                         w1_zp=None,
-                         w2_zp=None,
-                         a1_scale=None,
-                         a2_scale=None,
-                         block_shape=[0, layer.group_size],
-                         expert_map=expert_map)
-
+             

        return fused_experts(
            x,

--- a/vllm/model_executor/layers/quantization/utils/marlin_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils.py
@@ -228,8 +228,8 @@ def check_marlin_supports_layer(layer: LinearBase, group_size: int) -> bool:
    return False

 def check_moe_marlin_supports_layer(layer: LinearBase, group_size: int) -> bool:
-    if current_platform.is_rocm():
-        return False
+    # if current_platform.is_rocm():
+    #     return False
    hidden_size = layer.hidden_size
    intermediate_size_per_partition = layer.intermediate_size_per_partition
    # apply_router_weight_on_input is not supported for moe marlin
@@ -359,7 +359,7 @@ def marlin_permute_scales(

 def marlin_permute_bias(s: torch.Tensor) -> torch.Tensor:
    origin_shape = s.shape
-    _, scale_perm_single = get_scale_perms()
+    scale_perm_single = get_scale_perms()
    s = s.reshape((-1, len(scale_perm_single)))[:, scale_perm_single]
    return s.reshape(*origin_shape).contiguous()

@@ -392,7 +392,7 @@ def marlin_zero_points(
    # Permute zero-points in a similar way to scales, but do not use the
    # "single" permutation, since zero-points are applied on every MMA
    # 和 scale 使用一致的重排逻辑，将[128, 128](fp16) B矩阵中 每个[16, 16]计算块中的对应位置的 zero值 放到一起 
-    scale_perm, _ = get_scale_perms()
+    scale_perm = get_scale_perms()
    zp = zp.reshape((-1, len(scale_perm)))[:, scale_perm]

    # uint4 混排

--- a/vllm/model_executor/model_loader/utils.py
+++ b/vllm/model_executor/model_loader/utils.py
@@ -252,8 +252,6 @@ def _get_model_architecture(model_config: ModelConfig) -> tuple[type[nn.Module],
                    
        # awq相关配置
        try:
-            # if os.getenv('AWQ_MOE_SZ') == None:
-            #     os.environ['AWQ_MOE_SZ'] = '1'
            if os.getenv('AWQ_PAD') == None and (torch.cuda.get_device_properties(torch.cuda.current_device()).multi_processor_count == 120):
                os.environ['AWQ_PAD'] = '1'
        except Exception as e:

--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -1437,7 +1437,6 @@ class DeepseekV2ForCausalLM(
            os.environ['LLAMA_NN'] = '0'
            os.environ['LM_NN'] = '0'

-        self.use_w4a16_moe_sz = os.environ.get('AWQ_MOE_SZ') == '1'
        self.config = config
        self.quant_config = quant_config