Commit 11cbe065 authored by guanyu1
Browse files

qwen3-vl-235b-a22b moe_nn=0问题修改-ai (fix for the MOE_NN=0 issue on qwen3-vl-235b-a22b)

parent a4df8463
...@@ -1726,6 +1726,10 @@ environment_variables: dict[str, Callable[[], Any]] = { ...@@ -1726,6 +1726,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
lambda: (os.environ.get("VLLM_USE_NN", "True").lower() in lambda: (os.environ.get("VLLM_USE_NN", "True").lower() in
("true", "1")), ("true", "1")),
# Controls whether MoE weights use the NN layout (1) or the default layout (0).
# This needs to propagate to workers for correct MoE weight loading.
"MOE_NN": lambda: os.environ.get("MOE_NN", "1"),
# Enable two batch overlap. # Enable two batch overlap.
"VLLM_ENABLE_TBO": "VLLM_ENABLE_TBO":
lambda: bool(int(os.getenv("VLLM_ENABLE_TBO", "0"))), lambda: bool(int(os.getenv("VLLM_ENABLE_TBO", "0"))),
......
...@@ -994,6 +994,12 @@ def invoke_fused_moe_wna16_triton_kernel( ...@@ -994,6 +994,12 @@ def invoke_fused_moe_wna16_triton_kernel(
* triton.cdiv(B.size(1), META["BLOCK_SIZE_N"]), * triton.cdiv(B.size(1), META["BLOCK_SIZE_N"]),
) )
config = config.copy() config = config.copy()
# Some configs (or older config files) may include SPLIT_K, but the
# current Triton kernels in this file do not accept it as a tl.constexpr.
# Passing it will raise:
# KeyError: 'Keyword argument SPLIT_K was specified but unrecognised'
config.pop("SPLIT_K", None)
config.pop("num_ldmatrixes", None)
config.update( config.update(
get_moe_wna16_block_config( get_moe_wna16_block_config(
config=config, config=config,
...@@ -1149,8 +1155,13 @@ def invoke_fused_moe_triton_kernel( ...@@ -1149,8 +1155,13 @@ def invoke_fused_moe_triton_kernel(
) )
HAS_BIAS = B_bias is not None HAS_BIAS = B_bias is not None
# config = config.copy() config = config.copy()
# config["SPLIT_K"] = 1 # Some configs (or older config files) may include SPLIT_K, but the
# current Triton kernels in this file do not accept it as a tl.constexpr.
# Passing it will raise:
# KeyError: 'Keyword argument SPLIT_K was specified but unrecognised'
config.pop("SPLIT_K", None)
config.pop("num_ldmatrixes", None)
# BLOCK_SIZE_K = config.pop("BLOCK_SIZE_K") # BLOCK_SIZE_K = config.pop("BLOCK_SIZE_K")
# if block_shape is not None: # if block_shape is not None:
# BLOCK_SIZE_K = min(BLOCK_SIZE_K, min(block_shape[0], block_shape[1])) # BLOCK_SIZE_K = min(BLOCK_SIZE_K, min(block_shape[0], block_shape[1]))
......
...@@ -242,8 +242,22 @@ class Qwen3MoeLLMModel(Qwen3MoeModel): ...@@ -242,8 +242,22 @@ class Qwen3MoeLLMModel(Qwen3MoeModel):
if is_pp_missing_parameter(name_mapped, self): if is_pp_missing_parameter(name_mapped, self):
continue continue
if is_fused_expert: if is_fused_expert:
loaded_weight = loaded_weight.transpose(-1, -2) # no bias hidden_size = self.config.hidden_size
if "experts.gate_up_proj" in name: if "experts.gate_up_proj" in name:
# For some checkpoints, fused expert weights are
# stored in NN layout (in_features, out_features).
# vLLM's fused MoE loader expects the checkpoint
# weights in HF/torch Linear layout
# (out_features, in_features). Detect and transpose
# if needed.
if loaded_weight.shape[-2] == hidden_size:
loaded_weight = loaded_weight.transpose(-1, -2)
elif loaded_weight.shape[-1] != hidden_size:
raise ValueError(
"Unexpected gate_up_proj expert weight shape "
f"{tuple(loaded_weight.shape)}; expected last two dims "
f"to contain hidden_size={hidden_size}."
)
loaded_weight = loaded_weight.chunk(2, dim=-2) loaded_weight = loaded_weight.chunk(2, dim=-2)
success_w1 = self.load_fused_expert_weights( success_w1 = self.load_fused_expert_weights(
name_mapped, name_mapped,
...@@ -262,6 +276,14 @@ class Qwen3MoeLLMModel(Qwen3MoeModel): ...@@ -262,6 +276,14 @@ class Qwen3MoeLLMModel(Qwen3MoeModel):
success = success_w1 and success_w3 success = success_w1 and success_w3
else: else:
# down_proj # down_proj
if loaded_weight.shape[-1] == hidden_size:
loaded_weight = loaded_weight.transpose(-1, -2)
elif loaded_weight.shape[-2] != hidden_size:
raise ValueError(
"Unexpected down_proj expert weight shape "
f"{tuple(loaded_weight.shape)}; expected last two dims "
f"to contain hidden_size={hidden_size}."
)
success = self.load_fused_expert_weights( success = self.load_fused_expert_weights(
name_mapped, name_mapped,
params_dict, params_dict,
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment