Unverified commit bf184a66, authored by roikoren755 and committed by GitHub
Browse files

Enable quantized attention in NemotronH models (#31898)


Signed-off-by: Roi Koren <roik@nvidia.com>
parent 30399cc7
...@@ -1153,6 +1153,9 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> str | None: ...@@ -1153,6 +1153,9 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> str | None:
# Qwen3 MoE format: .self_attn.qkqkv_proj.{k,v}_scale -> # Qwen3 MoE format: .self_attn.qkqkv_proj.{k,v}_scale ->
# .self_attn.attn.{k,v}_scale # .self_attn.attn.{k,v}_scale
(r"\.self_attn\.qkqkv_proj\.([kv])_scale$", r".self_attn.attn.\1_scale"), (r"\.self_attn\.qkqkv_proj\.([kv])_scale$", r".self_attn.attn.\1_scale"),
# NemotronH format: .mixer.{k,v}_proj.{k,v}_scale ->
# .mixer.attn.{k,v}_scale
(r"\.mixer\.[kv]_proj\.([kv])_scale$", r".mixer.attn.\1_scale"),
# Default format: .{k,v}_scale -> .attn.{k,v}_scale # Default format: .{k,v}_scale -> .attn.{k,v}_scale
(r"\.([kv])_scale$", r".attn.\1_scale"), (r"\.([kv])_scale$", r".attn.\1_scale"),
] ]
......
...@@ -483,6 +483,7 @@ class NemotronHAttention(nn.Module): ...@@ -483,6 +483,7 @@ class NemotronHAttention(nn.Module):
self.scaling, self.scaling,
num_kv_heads=self.num_kv_heads, num_kv_heads=self.num_kv_heads,
cache_config=cache_config, cache_config=cache_config,
quant_config=quant_config,
prefix=f"{prefix}.attn", prefix=f"{prefix}.attn",
) )
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment