Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
bf184a66
Unverified
Commit
bf184a66
authored Jan 07, 2026 by roikoren755
Committed by GitHub Jan 07, 2026
Browse files
Enable quantized attention in NemotronH models (#31898)
Signed-off-by: Roi Koren <roik@nvidia.com>
parent
30399cc7
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing 2 changed files with 4 additions and 0 deletions
+4
-0
vllm/model_executor/model_loader/weight_utils.py
vllm/model_executor/model_loader/weight_utils.py
+3
-0
vllm/model_executor/models/nemotron_h.py
vllm/model_executor/models/nemotron_h.py
+1
-0
No files found.
vllm/model_executor/model_loader/weight_utils.py
View file @
bf184a66
...
@@ -1153,6 +1153,9 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> str | None:
...
@@ -1153,6 +1153,9 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> str | None:
# Qwen3 MoE format: .self_attn.qkqkv_proj.{k,v}_scale ->
# Qwen3 MoE format: .self_attn.qkqkv_proj.{k,v}_scale ->
# .self_attn.attn.{k,v}_scale
# .self_attn.attn.{k,v}_scale
(
r
"\.self_attn\.qkqkv_proj\.([kv])_scale$"
,
r
".self_attn.attn.\1_scale"
),
(
r
"\.self_attn\.qkqkv_proj\.([kv])_scale$"
,
r
".self_attn.attn.\1_scale"
),
# NemotronH format: .mixer.{k,v}_proj.{k,v}_scale ->
# .mixer.attn.{k,v}_scale
(
r
"\.mixer\.[kv]_proj\.([kv])_scale$"
,
r
".mixer.attn.\1_scale"
),
# Default format: .{k,v}_scale -> .attn.{k,v}_scale
# Default format: .{k,v}_scale -> .attn.{k,v}_scale
(
r
"\.([kv])_scale$"
,
r
".attn.\1_scale"
),
(
r
"\.([kv])_scale$"
,
r
".attn.\1_scale"
),
]
]
...
...
vllm/model_executor/models/nemotron_h.py
View file @
bf184a66
...
@@ -483,6 +483,7 @@ class NemotronHAttention(nn.Module):
...
@@ -483,6 +483,7 @@ class NemotronHAttention(nn.Module):
self
.
scaling
,
self
.
scaling
,
num_kv_heads
=
self
.
num_kv_heads
,
num_kv_heads
=
self
.
num_kv_heads
,
cache_config
=
cache_config
,
cache_config
=
cache_config
,
quant_config
=
quant_config
,
prefix
=
f
"
{
prefix
}
.attn"
,
prefix
=
f
"
{
prefix
}
.attn"
,
)
)
...
...
Write
Preview
Markdown
is supported
0%
Try again or attach a new file.
Attach a file
Cancel
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please register or sign in to comment