Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
c8c3935b
Unverified
Commit
c8c3935b
authored
Mar 04, 2026
by
Raghavan
Committed by
GitHub
Mar 04, 2026
Browse files
[Bugfix][Model] Fix FP8 k_scale/v_scale not loaded for Qwen3-MoE (#35656)
Signed-off-by:
raghavan
<
oneraghavan@gmail.com
>
parent
bb6888b8
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
129 additions
and
36 deletions
+129
-36
tests/model_executor/test_weight_utils.py
tests/model_executor/test_weight_utils.py
+116
-0
vllm/model_executor/models/qwen3_moe.py
vllm/model_executor/models/qwen3_moe.py
+6
-18
vllm/model_executor/models/qwen3_vl_moe.py
vllm/model_executor/models/qwen3_vl_moe.py
+7
-18
No files found.
tests/model_executor/test_weight_utils.py
View file @
c8c3935b
...
@@ -11,6 +11,7 @@ from huggingface_hub.utils import LocalEntryNotFoundError
...
@@ -11,6 +11,7 @@ from huggingface_hub.utils import LocalEntryNotFoundError
from
vllm.model_executor.model_loader.weight_utils
import
(
from
vllm.model_executor.model_loader.weight_utils
import
(
download_weights_from_hf
,
download_weights_from_hf
,
enable_hf_transfer
,
enable_hf_transfer
,
maybe_remap_kv_scale_name
,
)
)
...
@@ -61,6 +62,121 @@ def test_download_weights_from_hf():
...
@@ -61,6 +62,121 @@ def test_download_weights_from_hf():
)
)
class
TestMaybeRemapKvScaleName
:
"""Tests for maybe_remap_kv_scale_name covering all checkpoint formats."""
PARAMS_DICT
=
{
"model.layers.0.self_attn.attn.k_scale"
:
None
,
"model.layers.0.self_attn.attn.v_scale"
:
None
,
"model.layers.0.self_attn.attn.q_scale"
:
None
,
"model.layers.0.self_attn.qkv_proj.weight"
:
None
,
}
def
test_qkv_proj_k_scale
(
self
):
"""Qwen3-MoE / llm-compressor format: qkv_proj.k_scale -> attn.k_scale
Regression test for https://github.com/vllm-project/vllm/issues/25047"""
result
=
maybe_remap_kv_scale_name
(
"model.layers.0.self_attn.qkv_proj.k_scale"
,
self
.
PARAMS_DICT
)
assert
result
==
"model.layers.0.self_attn.attn.k_scale"
def
test_qkv_proj_v_scale
(
self
):
"""Qwen3-MoE / llm-compressor format: qkv_proj.v_scale -> attn.v_scale
Regression test for https://github.com/vllm-project/vllm/issues/25047"""
result
=
maybe_remap_kv_scale_name
(
"model.layers.0.self_attn.qkv_proj.v_scale"
,
self
.
PARAMS_DICT
)
assert
result
==
"model.layers.0.self_attn.attn.v_scale"
def
test_modelopt_k_proj_k_scale
(
self
):
"""ModelOpt format: k_proj.k_scale -> attn.k_scale"""
result
=
maybe_remap_kv_scale_name
(
"model.layers.0.self_attn.k_proj.k_scale"
,
self
.
PARAMS_DICT
)
assert
result
==
"model.layers.0.self_attn.attn.k_scale"
def
test_modelopt_v_proj_v_scale
(
self
):
"""ModelOpt format: v_proj.v_scale -> attn.v_scale"""
result
=
maybe_remap_kv_scale_name
(
"model.layers.0.self_attn.v_proj.v_scale"
,
self
.
PARAMS_DICT
)
assert
result
==
"model.layers.0.self_attn.attn.v_scale"
def
test_deprecated_kv_scale
(
self
):
"""Old format: kv_scale -> attn.k_scale (deprecated)"""
result
=
maybe_remap_kv_scale_name
(
"model.layers.0.self_attn.kv_scale"
,
self
.
PARAMS_DICT
)
assert
result
==
"model.layers.0.self_attn.attn.k_scale"
def
test_default_bare_k_scale
(
self
):
"""Default format: .k_scale -> .attn.k_scale"""
result
=
maybe_remap_kv_scale_name
(
"model.layers.0.self_attn.k_scale"
,
self
.
PARAMS_DICT
)
assert
result
==
"model.layers.0.self_attn.attn.k_scale"
def
test_non_scale_name_unchanged
(
self
):
"""Non-scale names should be returned unchanged."""
name
=
"model.layers.0.self_attn.qkv_proj.weight"
result
=
maybe_remap_kv_scale_name
(
name
,
self
.
PARAMS_DICT
)
assert
result
==
name
def
test_nvfp4_modelopt_k_proj_k_scale
(
self
):
"""ModelOpt NVFP4 format (e.g. nvidia/Qwen3-30B-A3B-NVFP4):
k_proj.k_scale -> attn.k_scale.
Validates that NVFP4 checkpoints are not broken by this change."""
result
=
maybe_remap_kv_scale_name
(
"model.layers.0.self_attn.k_proj.k_scale"
,
self
.
PARAMS_DICT
)
assert
result
==
"model.layers.0.self_attn.attn.k_scale"
def
test_nvfp4_modelopt_v_proj_v_scale
(
self
):
"""ModelOpt NVFP4 format (e.g. nvidia/Qwen3-30B-A3B-NVFP4):
v_proj.v_scale -> attn.v_scale.
Validates that NVFP4 checkpoints are not broken by this change."""
result
=
maybe_remap_kv_scale_name
(
"model.layers.0.self_attn.v_proj.v_scale"
,
self
.
PARAMS_DICT
)
assert
result
==
"model.layers.0.self_attn.attn.v_scale"
def
test_qwen3_vl_moe_qkv_proj_k_scale
(
self
):
"""Qwen3-VL-MoE uses the same fused qkv_proj naming as Qwen3-MoE.
Regression test for qwen3_vl_moe.py fix (same bug as #25047)."""
result
=
maybe_remap_kv_scale_name
(
"model.layers.0.self_attn.qkv_proj.k_scale"
,
self
.
PARAMS_DICT
)
assert
result
==
"model.layers.0.self_attn.attn.k_scale"
def
test_qwen3_vl_moe_qkv_proj_v_scale
(
self
):
"""Qwen3-VL-MoE uses the same fused qkv_proj naming as Qwen3-MoE.
Regression test for qwen3_vl_moe.py fix (same bug as #25047)."""
result
=
maybe_remap_kv_scale_name
(
"model.layers.0.self_attn.qkv_proj.v_scale"
,
self
.
PARAMS_DICT
)
assert
result
==
"model.layers.0.self_attn.attn.v_scale"
def
test_nvfp4_weight_scale_not_remapped
(
self
):
"""NVFP4 weight_scale should not be touched by remap (not a kv scale)."""
name
=
"model.layers.0.self_attn.k_proj.weight_scale"
result
=
maybe_remap_kv_scale_name
(
name
,
self
.
PARAMS_DICT
)
assert
result
==
name
def
test_nvfp4_input_scale_not_remapped
(
self
):
"""NVFP4 input_scale should not be touched by remap (not a kv scale)."""
name
=
"model.layers.0.self_attn.k_proj.input_scale"
result
=
maybe_remap_kv_scale_name
(
name
,
self
.
PARAMS_DICT
)
assert
result
==
name
def
test_missing_target_returns_none
(
self
):
"""If remapped name not in params_dict, return None."""
empty_params
:
dict
[
str
,
None
]
=
{}
result
=
maybe_remap_kv_scale_name
(
"model.layers.0.self_attn.qkv_proj.k_scale"
,
empty_params
)
assert
result
is
None
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
test_hf_transfer_auto_activation
()
test_hf_transfer_auto_activation
()
test_download_weights_from_hf
()
test_download_weights_from_hf
()
vllm/model_executor/models/qwen3_moe.py
View file @
c8c3935b
...
@@ -535,10 +535,6 @@ class Qwen3MoeModel(nn.Module):
...
@@ -535,10 +535,6 @@ class Qwen3MoeModel(nn.Module):
ignore_suffixes
=
(
ignore_suffixes
=
(
".bias"
,
".bias"
,
"_bias"
,
"_bias"
,
".k_scale"
,
"_k_scale"
,
".v_scale"
,
"_v_scale"
,
".weight_scale"
,
".weight_scale"
,
"_weight_scale"
,
"_weight_scale"
,
".input_scale"
,
".input_scale"
,
...
@@ -562,6 +558,10 @@ class Qwen3MoeModel(nn.Module):
...
@@ -562,6 +558,10 @@ class Qwen3MoeModel(nn.Module):
weight_loader
(
param
,
loaded_weight
)
weight_loader
(
param
,
loaded_weight
)
loaded_params
.
add
(
scale_name
)
loaded_params
.
add
(
scale_name
)
continue
continue
if
"scale"
in
name
or
"zero_point"
in
name
:
name
=
maybe_remap_kv_scale_name
(
name
,
params_dict
)
if
name
is
None
:
continue
for
param_name
,
weight_name
,
shard_id
in
stacked_params_mapping
:
for
param_name
,
weight_name
,
shard_id
in
stacked_params_mapping
:
# Skip non-stacked layers and experts (experts handled below).
# Skip non-stacked layers and experts (experts handled below).
if
weight_name
not
in
name
:
if
weight_name
not
in
name
:
...
@@ -654,20 +654,8 @@ class Qwen3MoeModel(nn.Module):
...
@@ -654,20 +654,8 @@ class Qwen3MoeModel(nn.Module):
# Skip layers on other devices.
# Skip layers on other devices.
if
is_pp_missing_parameter
(
name
,
self
):
if
is_pp_missing_parameter
(
name
,
self
):
continue
continue
# Remapping the name of FP8 kv-scale.
if
name
not
in
params_dict
:
if
name
.
endswith
(
"kv_scale"
):
continue
remapped_kv_scale_name
=
name
.
replace
(
".kv_scale"
,
".attn.kv_scale"
)
if
remapped_kv_scale_name
not
in
params_dict
:
logger
.
warning_once
(
"Found kv scale in the checkpoint (e.g. %s), but not found the expected name in the model (e.g. %s). kv-scale is not loaded."
,
# noqa: E501
name
,
remapped_kv_scale_name
,
)
continue
else
:
name
=
remapped_kv_scale_name
param
=
params_dict
[
name
]
param
=
params_dict
[
name
]
weight_loader
=
getattr
(
weight_loader
=
getattr
(
param
,
"weight_loader"
,
default_weight_loader
param
,
"weight_loader"
,
default_weight_loader
...
...
vllm/model_executor/models/qwen3_vl_moe.py
View file @
c8c3935b
...
@@ -172,10 +172,6 @@ class Qwen3MoeLLMModel(Qwen3MoeModel):
...
@@ -172,10 +172,6 @@ class Qwen3MoeLLMModel(Qwen3MoeModel):
ignore_suffixes
=
(
ignore_suffixes
=
(
".bias"
,
".bias"
,
"_bias"
,
"_bias"
,
".k_scale"
,
"_k_scale"
,
".v_scale"
,
"_v_scale"
,
".weight_scale"
,
".weight_scale"
,
"_weight_scale"
,
"_weight_scale"
,
".input_scale"
,
".input_scale"
,
...
@@ -191,6 +187,11 @@ class Qwen3MoeLLMModel(Qwen3MoeModel):
...
@@ -191,6 +187,11 @@ class Qwen3MoeLLMModel(Qwen3MoeModel):
]
]
num_experts
=
self
.
config
.
num_experts
num_experts
=
self
.
config
.
num_experts
for
name
,
loaded_weight
in
weights
:
for
name
,
loaded_weight
in
weights
:
if
"scale"
in
name
or
"zero_point"
in
name
:
name
=
maybe_remap_kv_scale_name
(
name
,
params_dict
)
if
name
is
None
:
continue
for
param_name
,
weight_name
,
shard_id
in
stacked_params_mapping
:
for
param_name
,
weight_name
,
shard_id
in
stacked_params_mapping
:
if
"experts.gate_up_proj"
in
name
or
"experts.down_proj"
in
name
:
if
"experts.gate_up_proj"
in
name
or
"experts.down_proj"
in
name
:
is_fused_expert
=
True
is_fused_expert
=
True
...
@@ -305,20 +306,8 @@ class Qwen3MoeLLMModel(Qwen3MoeModel):
...
@@ -305,20 +306,8 @@ class Qwen3MoeLLMModel(Qwen3MoeModel):
# Skip layers on other devices.
# Skip layers on other devices.
if
is_pp_missing_parameter
(
name
,
self
):
if
is_pp_missing_parameter
(
name
,
self
):
continue
continue
# Remapping the name of FP8 kv-scale.
if
name
not
in
params_dict
:
if
name
.
endswith
(
"kv_scale"
):
continue
remapped_kv_scale_name
=
name
.
replace
(
".kv_scale"
,
".attn.kv_scale"
)
if
remapped_kv_scale_name
not
in
params_dict
:
logger
.
warning_once
(
"Found kv scale in the checkpoint (e.g. %s), but not found the expected name in the model (e.g. %s). kv-scale is not loaded."
,
# noqa: E501
name
,
remapped_kv_scale_name
,
)
continue
else
:
name
=
remapped_kv_scale_name
param
=
params_dict
[
name
]
param
=
params_dict
[
name
]
weight_loader
=
getattr
(
weight_loader
=
getattr
(
param
,
"weight_loader"
,
default_weight_loader
param
,
"weight_loader"
,
default_weight_loader
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment