qwen3-vl-235b-a22b moe_nn=0问题修改-ai

11cbe065 · guanyu1 · a4df8463 · 11cbe065 · 11cbe065 · 11cbe065
Commit 11cbe065 authored Jan 30, 2026 by guanyu1
3 changed files
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -1726,6 +1726,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
    lambda: (os.environ.get("VLLM_USE_NN", "True").lower() in
             ("true", "1")),
+    # MoE weight layout: "1" = NN layout, "0" = default layout. Returned as the
+    # raw string (not a bool); must propagate to workers for MoE weight loading.
+    "MOE_NN": lambda: os.environ.get("MOE_NN", "1"),
    # Enable two batch overlap.
    "VLLM_ENABLE_TBO":
    lambda: bool(int(os.getenv("VLLM_ENABLE_TBO", "0"))),

--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -994,6 +994,12 @@ def invoke_fused_moe_wna16_triton_kernel(
        * triton.cdiv(B.size(1), META["BLOCK_SIZE_N"]),
    )
    config = config.copy()
+    # Some configs (or older config files) may include SPLIT_K, which the
+    # current Triton kernels in this file do not accept as a tl.constexpr:
+    #   KeyError: 'Keyword argument SPLIT_K was specified but unrecognised'
+    # num_ldmatrixes is dropped as well, presumably for the same reason.
+    config.pop("SPLIT_K", None)
+    config.pop("num_ldmatrixes", None)
    config.update(
        get_moe_wna16_block_config(
            config=config,
@@ -1149,8 +1155,13 @@ def invoke_fused_moe_triton_kernel(
    )
    HAS_BIAS = B_bias is not None
-    # config = config.copy()
+    config = config.copy()
-    # config["SPLIT_K"] = 1
+    # Some configs (or older config files) may include SPLIT_K, which the
+    # current Triton kernels in this file do not accept as a tl.constexpr:
+    #   KeyError: 'Keyword argument SPLIT_K was specified but unrecognised'
+    # num_ldmatrixes is dropped as well, presumably for the same reason.
+    config.pop("SPLIT_K", None)
+    config.pop("num_ldmatrixes", None)
    # BLOCK_SIZE_K = config.pop("BLOCK_SIZE_K")
    # if block_shape is not None:
    #     BLOCK_SIZE_K = min(BLOCK_SIZE_K, min(block_shape[0], block_shape[1]))

--- a/vllm/model_executor/models/qwen3_vl_moe.py
+++ b/vllm/model_executor/models/qwen3_vl_moe.py
@@ -242,8 +242,22 @@ class Qwen3MoeLLMModel(Qwen3MoeModel):
                    if is_pp_missing_parameter(name_mapped, self):
                        continue
                    if is_fused_expert:
-                        loaded_weight = loaded_weight.transpose(-1, -2)  # no bias
+                        hidden_size = self.config.hidden_size
                        if "experts.gate_up_proj" in name:
+                            # For some checkpoints, fused expert weights are
+                            # stored in NN layout (in_features, out_features).
+                            # vLLM's fused MoE loader expects the checkpoint
+                            # weights in HF/torch Linear layout
+                            # (out_features, in_features). Detect and transpose
+                            # if needed.
+                            if loaded_weight.shape[-2] == hidden_size:
+                                loaded_weight = loaded_weight.transpose(-1, -2)
+                            elif loaded_weight.shape[-1] != hidden_size:
+                                raise ValueError(
+                                    "Unexpected gate_up_proj expert weight shape "
+                                    f"{tuple(loaded_weight.shape)}; expected last two dims "
+                                    f"to contain hidden_size={hidden_size}."
+                                )
                            loaded_weight = loaded_weight.chunk(2, dim=-2)
                            success_w1 = self.load_fused_expert_weights(
                                name_mapped,
@@ -262,6 +276,14 @@ class Qwen3MoeLLMModel(Qwen3MoeModel):
                            success = success_w1 and success_w3
                        else:
                            # down_proj
+                            if loaded_weight.shape[-1] == hidden_size:
+                                loaded_weight = loaded_weight.transpose(-1, -2)
+                            elif loaded_weight.shape[-2] != hidden_size:
+                                raise ValueError(
+                                    "Unexpected down_proj expert weight shape "
+                                    f"{tuple(loaded_weight.shape)}; expected last two dims "
+                                    f"to contain hidden_size={hidden_size}."
+                                )
                            success = self.load_fused_expert_weights(
                                name_mapped,
                                params_dict,