Merge branch 'v0.15.1-dev-lxh' into 'v0.15.1-dev'

Fix：修复调用Triton MoE gemm时缺失的参数，对齐接口。

See merge request dcutoolkit/deeplearing/vllm!476

Merge branch 'v0.15.1-dev-lxh' into 'v0.15.1-dev'
Fix：修复调用Triton MoE gemm时缺失的参数，对齐接口。 See merge request dcutoolkit/deeplearing/vllm!476
f7461a96 · zhangqha · 02a1e691 · 3b9aa746 · f7461a96 · f7461a96
Commit f7461a96 authored Mar 11, 2026 by zhangqha
Show whitespace changes
Inline Side-by-side

Showing with 2 additions and 2 deletions

csrc/moe/moe_fused_gate.cu csrc/moe/moe_fused_gate.cu +1 -1

vllm/model_executor/layers/fused_moe/fused_moe.py vllm/model_executor/layers/fused_moe/fused_moe.py +1 -1

No files found.
--- a/csrc/moe/moe_fused_gate.cu
+++ b/csrc/moe/moe_fused_gate.cu
@@ -72,7 +72,7 @@ __device__ inline bool cmp_eq(const T& a, const T& b) {
 static constexpr int SIZE_WARP = 32;
 static constexpr int WARPS_PER_CTA = 6;
 // static constexpr int MAX_VPT = 32;  // maximum VPT we support, > params.VPT = num_expert / num_expert_group
-static constexpr int MAX_VPT = 128; // Extend MAX_VPT from 32 to 128 to accommodate large-scale MoE models (e.g., GLM-4V-quantized model).
+static constexpr int MAX_VPT = 256; // Extend MAX_VPT from 32 to 256 to accommodate large-scale MoE models (e.g., GLM-5-quantized model).

 // Create an alias for Array using AlignedArray
 template <typename T, int N>

--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -9,7 +9,7 @@ import math


 from collections.abc import Callable
-from typing import Any
+from typing import Any, Callable, Dict, List, Optional

 import torch