[FEATURE] 接入Aiter MoE W8A8

fd2a4660 · lixh6 · 3842b316 · fd2a4660 · fd2a4660 · fd2a4660
Commit fd2a4660 authored Apr 15, 2026 by lixh6
4 changed files
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -167,6 +167,7 @@ if TYPE_CHECKING:
    VLLM_MOE_USE_DEEP_GEMM: bool = True
    VLLM_USE_DEEP_GEMM_E8M0: bool = True
    VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES: bool = True
+    VLLM_USE_AITER_MOE_W8A8: bool = True
    VLLM_DEEP_GEMM_WARMUP: Literal[
        "skip",
        "full",
@@ -1287,6 +1288,9 @@ environment_variables: dict[str, Callable[[], Any]] = {
    "VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES": lambda: bool(
        int(os.getenv("VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES", "1"))
    ),
+    "VLLM_USE_AITER_MOE_W8A8": lambda: bool(
+        int(os.getenv("VLLM_USE_AITER_MOE_W8A8", "1"))
+    ),
    # DeepGemm JITs the kernels on-demand. The warmup attempts to make DeepGemm
    # JIT all the required kernels before model execution so there is no
    # JIT'ing in the hot-path. However, this warmup increases the engine

--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -6,7 +6,11 @@ import functools
 import json
 import os
 import math
-
+import sys
+import aiter
+from vllm._aiter_ops import rocm_aiter_ops
+from aiter.moe import get_aiter_moe_config, aiter_moe, MoeQuantType, MoeSolutionType
+from aiter.ops.shuffle import moe_layout_shuffle_gemm1, moe_layout_shuffle_gemm2

 from collections.abc import Callable
 from typing import Any, Callable, Dict, List, Optional
@@ -1858,6 +1862,42 @@ def fused_experts_impl(
        cache13 = torch.empty(M * top_k_num * max(N, K if not use_nn_moe else w2.shape[2]), device=hidden_states.device, dtype=hidden_states.dtype)

    if use_int8_w8a8 or use_fp8_w8a8:
+        if envs.VLLM_USE_AITER_MOE_W8A8==True:
+            K_input = hidden_states.size(1)
+            actual_N2 = N // 2
+            quant_type = MoeQuantType.W8A8
+            status, moe_config = get_aiter_moe_config(
+                M=num_tokens,
+                E=global_num_experts,
+                N1=N,
+                N2=actual_N2,
+                K=K_input,
+                top_k=top_k_num,
+                block_size=0,
+                dtype=hidden_states.dtype,
+                quant_type=quant_type,
+            )
+            
+            output = aiter_moe(
+                hidden_states=hidden_states,
+                w1=w1,
+                w2=w2,
+                topk_weights=topk_weights,
+                topk_ids=topk_ids,
+                moe_config=moe_config, 
+                w1_scale=w1_scale,
+                w2_scale=w2_scale,
+                w1_zp=w1_zp,
+                w2_zp=w2_zp,
+                a1_scale=a1_scale,
+                a2_scale=a2_scale,
+                block_shape=None,
+                global_num_experts=global_num_experts,
+                expert_map=expert_map,
+                activation=activation,
+            )
+            return output
+        else:
            return fused_experts_impl_int8(hidden_states=hidden_states,
                                        w1=w1,
                                        w2=w2,

--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe_marlin.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe_marlin.py
@@ -26,6 +26,14 @@ from vllm.model_executor.layers.fused_moe import (
    FusedMoEPrepareAndFinalize,
    FusedMoeWeightScaleSupported,
 )
+
+import aiter
+from aiter.test_common import checkAllclose, perftest
+from aiter.ops.shuffle import moe_layout_shuffle_gemm1, moe_layout_shuffle_gemm2
+from aiter.fused_moe import fused_topk, torch_moe
+from aiter import dtypes, ActivationType
+from aiter.moe import get_aiter_moe_config, aiter_moe, MoeSolutionType, MoeQuantType
+
 try:
    from lmslim.layers.fused_moe.fuse_moe_int8_marlin import fused_experts_impl_int8_marlin
    from lmslim.layers.fused_moe.fuse_moe_fp8_marlin import fused_experts_impl_fp8_marlin
@@ -369,7 +377,24 @@ class CompressedTensorsW8A8Int8MarlinMoEMethod(CompressedTensorsMarlinMoEMethod)
        layer.w13_input_scale = None
        layer.w2_input_scale = None

+    def shuffle_w8a8(self, weight_data):  # Cast weight_data to int8 and return the result of moe_layout_shuffle_gemm2 on it.
+        w_i8 = weight_data.to(torch.int8)  # the shuffle call below is always given an int8 tensor
+        return moe_layout_shuffle_gemm2(w_i8)  # NOTE(review): process_weights_after_loading calls this for both w13 and w2; moe_layout_shuffle_gemm1 is imported but not used here - confirm gemm2 is intended for w13
+
    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        if envs.VLLM_USE_AITER_MOE_W8A8==True:
+            E, N13, K = layer.w13_weight.shape
+            _, K_w2, N2 = layer.w2_weight.shape 
+
+            layer.w13_weight_scale = Parameter(layer.w13_weight_scale.data, requires_grad=False)
+            layer.w2_weight_scale = Parameter(layer.w2_weight_scale.data, requires_grad=False)
+
+            shuffled_w13 = self.shuffle_w8a8(layer.w13_weight)
+            layer.w13_weight = Parameter(shuffled_w13.view(E, N13, K), requires_grad=False)
+            shuffled_w2 = self.shuffle_w8a8(layer.w2_weight)
+            layer.w2_weight = Parameter(shuffled_w2.view(E, N2, K), requires_grad=False)
+            
+        else:
            w1_marlin_list = []
            for ii in range(layer.w13_weight.shape[0]):
                if not self.use_deepep:
@@ -405,7 +430,45 @@ class CompressedTensorsW8A8Int8MarlinMoEMethod(CompressedTensorsMarlinMoEMethod)
        routed_scaling_factor: Optional[float] = 1.0,
    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
        from vllm.model_executor.layers.fused_moe import fused_experts
+        if envs.VLLM_USE_AITER_MOE_W8A8==True:
+            m_flat = x.view(-1, x.shape[-1])
+            M = m_flat.shape[0]
+            E = layer.w13_weight.size(0)
+            K = x.size(-1)
+            N1 = layer.w13_weight.size(1)
+            topk = topk_ids.size(1)
+            w1_input = layer.w13_weight.view(E, N1, K)
+            w2_input = layer.w2_weight.view(E, K, N1 // 2)
+
+            _, moe_cfg = get_aiter_moe_config(
+                M=M,
+                E=E,
+                N1=N1,
+                N2=N1 // 2,
+                K=K,
+                top_k=topk,
+                block_size=0,
+                dtype=x.dtype,
+                quant_type=MoeQuantType.W8A8,
+            )

+            output = aiter_moe(
+                hidden_states=x,
+                w1=w1_input,
+                w2=w2_input,
+                topk_weights=topk_weights,
+                topk_ids=topk_ids,
+                moe_config=moe_cfg,
+                w1_scale=layer.w13_weight_scale,
+                w2_scale=layer.w2_weight_scale,
+                a1_scale=getattr(layer, "w13_input_scale", None),
+                a2_scale=getattr(layer, "w2_input_scale", None),
+                global_num_experts=E,
+                expert_map=getattr(layer, "expert_map", None),
+                activation=getattr(layer, "activation", "silu")
+            )
+            return output
+        else:
            return fused_experts_impl_int8_marlin(
                hidden_states=x,
                w1=layer.w13_weight,

--- a/vllm/model_executor/layers/sparse_attn_indexer.py
+++ b/vllm/model_executor/layers/sparse_attn_indexer.py
@@ -30,6 +30,7 @@ elif current_platform.is_xpu():
    from vllm._ipex_ops import ipex_ops as ops

 logger = init_logger(__name__)
+_GLOBAL_LOGITS_BUFFERS = {}

 @maybe_transfer_kv_layer
 def sparse_attn_indexer(
@@ -50,7 +51,21 @@ def sparse_attn_indexer(
    # careful! this will be None in dummy run
    attn_metadata = get_forward_context().attn_metadata
    fp8_dtype = current_platform.fp8_dtype()
+    if q_fp8.dtype == fp8_dtype:
+        MAX_ELEMENTS = 65536 * 65536
+    elif q_fp8.dtype in (torch.bfloat16, torch.float16):
+        MAX_ELEMENTS = 16384 * 32768
+    else:
+        MAX_ELEMENTS = 16384 * 32768 

+    device = q_fp8.device
+    if device not in _GLOBAL_LOGITS_BUFFERS or _GLOBAL_LOGITS_BUFFERS[device].numel() < MAX_ELEMENTS:
+        _GLOBAL_LOGITS_BUFFERS[device] = torch.empty(
+            MAX_ELEMENTS, 
+            dtype=torch.float32, 
+            device=device
+        )
+    logits_buffer = _GLOBAL_LOGITS_BUFFERS[device]
    # assert isinstance(attn_metadata, dict)
    if not isinstance(attn_metadata, dict):
        # Reserve workspace for indexer during profiling run
@@ -75,7 +90,7 @@ def sparse_attn_indexer(
        )
    attn_metadata = attn_metadata[layer_name]
    assert isinstance(attn_metadata, DeepseekV32IndexerMetadata)
-    slot_mapping = attn_metadata.slot_mapping
+    slot_mapping = attn_metadata.slot_mapping[:attn_metadata.num_kv_actual_tokens]
    has_decode = attn_metadata.num_decodes > 0
    has_prefill = attn_metadata.num_prefills > 0
    num_decode_tokens = attn_metadata.num_decode_tokens
@@ -116,14 +131,6 @@ def sparse_attn_indexer(
                    chunk.block_table,
                    chunk.cu_seq_lens,
                )
-
-                logits = fp8_mqa_logits(
-                    q_fp8[chunk.token_start : chunk.token_end],
-                    (k_fp8, k_scale.view(torch.float32).flatten()),
-                    weights[chunk.token_start : chunk.token_end],
-                    chunk.cu_seqlen_ks,
-                    chunk.cu_seqlen_ke,
-                )
            elif get_gcn_arch_name() == "gfx938":
                k_fp8 = k_fp8_full[: chunk.total_seq_lens]
                k_scale = k_scale_full[: chunk.total_seq_lens]
@@ -134,19 +141,6 @@ def sparse_attn_indexer(
                    chunk.block_table,
                    chunk.cu_seq_lens,
                )
-                logits = op.mqa_logits(
-                    q_fp8[chunk.token_start:chunk.token_end],  
-                    k_fp8, 
-                    weights[chunk.token_start:chunk.token_end], 
-                    chunk.cu_seqlen_ks, 
-                    chunk.cu_seqlen_ke,
-                    q_fp8[chunk.token_start:chunk.token_end].shape[0],
-                    k_fp8.shape[0],
-                    q_fp8.shape[1],
-                    q_fp8.shape[2],
-                    k_scale.view(torch.float32).flatten(),
-                    True
-                    )
            else:
                k_fp8 = k_fp8_full[: chunk.total_seq_lens]
                k_scale = k_scale_full[: chunk.total_seq_lens]   
@@ -156,44 +150,115 @@ def sparse_attn_indexer(
                    chunk.block_table,
                    chunk.cu_seq_lens,
                )        
-                logits = op.mqa_logits(
-                    q_fp8[chunk.token_start:chunk.token_end],  
+
+            q_all = q_fp8[chunk.token_start:chunk.token_end]
+            weights_all = weights[chunk.token_start:chunk.token_end]
+            ks_all = chunk.cu_seqlen_ks
+            ke_all = chunk.cu_seqlen_ke
+            
+            num_q = q_all.shape[0]
+            num_k = k_fp8.shape[0]
+
+            is_q_fp16_bf16 = q_all.dtype in (torch.float16, torch.bfloat16)
+            align_size = 128 if is_q_fp16_bf16 else 1
+            
+            kv_seq_len_aligned = (num_k + align_size - 1) // align_size * align_size
+
+            current_capacity = logits_buffer.numel()
+            MAX_Q_CHUNK = current_capacity // max(1, kv_seq_len_aligned)
+            if align_size > 1:
+                MAX_Q_CHUNK = (MAX_Q_CHUNK // align_size) * align_size
+            MAX_Q_CHUNK = max(1, MAX_Q_CHUNK)
+
+            slices = []
+
+            for start_idx in range(0, num_q, MAX_Q_CHUNK):
+                end_idx = min(start_idx + MAX_Q_CHUNK, num_q)
+                slices.append((start_idx, end_idx))
+
+            for q_start, q_end in slices:
+                if q_end <= q_start:
+                    continue
+                    
+                q_slice = q_all[q_start:q_end]
+                weights_slice = weights_all[q_start:q_end]
+
+                ks_slice = ks_all[q_start:q_end]
+                ke_slice = ke_all[q_start:q_end]
+
+                q_len = q_end - q_start
+                q_seq_len_aligned = (q_len + align_size - 1) // align_size * align_size
+
+                required_size = q_seq_len_aligned * kv_seq_len_aligned
+                logits_slice_view = logits_buffer[:required_size].view(q_seq_len_aligned, kv_seq_len_aligned)
+
+                if not current_platform.is_rocm():
+                    logits_slice = fp8_mqa_logits(
+                        q_slice,
+                        (k_fp8, k_scale.view(torch.float32).flatten()),
+                        weights_slice,
+                        ks_slice,
+                        ke_slice,
+                    )
+                elif get_gcn_arch_name() == "gfx938":
+                    op.mqa_logits(
+                        q_slice,  
+                        k_fp8, 
+                        weights_slice, 
+                        ks_slice, 
+                        ke_slice,
+                        q_slice.shape[0], 
+                        k_fp8.shape[0],
+                        q_slice.shape[1],
+                        q_slice.shape[2],
+                        k_scale.view(torch.float32).flatten(),
+                        True,
+                        logits_slice_view 
+                    )
+                    logits_slice = logits_slice_view[:q_len, :num_k]
+                else:
+                    op.mqa_logits(
+                        q_slice,  
                        k_fp8, 
-                    weights[chunk.token_start:chunk.token_end].to(torch.float32), 
-                    chunk.cu_seqlen_ks, 
-                    chunk.cu_seqlen_ke,
-                    q_fp8[chunk.token_start:chunk.token_end].shape[0],
+                        weights_slice.to(torch.float32), 
+                        ks_slice, 
+                        ke_slice,
+                        q_slice.shape[0],
                        k_fp8.shape[0],
-                    q_fp8.shape[1],
-                    q_fp8.shape[2],
+                        q_slice.shape[1],
+                        q_slice.shape[2],
                        None,
-                    True
+                        True,
+                        logits_slice_view 
                    )
-            num_rows = logits.shape[0]
+                    logits_slice = logits_slice_view[:q_len, :num_k]

-            topk_indices = topk_indices_buffer[
-                chunk.token_start : chunk.token_end, :topk_tokens
+                num_rows_slice = logits_slice.shape[0]
+                
+                topk_indices_slice = topk_indices_buffer[
+                    chunk.token_start + q_start : chunk.token_start + q_end, :topk_tokens
                ]
+                
                if not envs.USE_LIGHTOP_TOPK:
                    torch.ops._C.top_k_per_row_prefill(
-                    logits,
-                    chunk.cu_seqlen_ks,
-                    chunk.cu_seqlen_ke,
-                    topk_indices,
-                    num_rows,
-                    logits.stride(0),
-                    logits.stride(1),
+                        logits_slice,
+                        ks_slice,
+                        ke_slice,
+                        topk_indices_slice,
+                        num_rows_slice,
+                        logits_slice.stride(0),
+                        logits_slice.stride(1),
                        topk_tokens,
                    )
                else:
                    op.top_k_per_row_prefill(
-                    logits,
-                    chunk.cu_seqlen_ks,
-                    chunk.cu_seqlen_ke,
-                    topk_indices,
-                    num_rows,
-                    logits.stride(0),
-                    logits.stride(1),
+                        logits_slice,
+                        ks_slice,
+                        ke_slice,
+                        topk_indices_slice,
+                        num_rows_slice,
+                        logits_slice.stride(0),
+                        logits_slice.stride(1),
                        topk_tokens,
                    )

@@ -424,5 +489,3 @@ class SparseAttnIndexer(CustomOp):
                self.max_total_seq_len,
                self.topk_indices_buffer,
            )
\ No newline at end of file
-        
-