Enable quantized attention in NemotronH models (#31898)
Signed-off-by: Roi Koren <roik@nvidia.com>
This commit is contained in:
@@ -1153,6 +1153,9 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> str | None:
         # Qwen3 MoE format: .self_attn.qkqkv_proj.{k,v}_scale ->
         # .self_attn.attn.{k,v}_scale
         (r"\.self_attn\.qkqkv_proj\.([kv])_scale$", r".self_attn.attn.\1_scale"),
+        # NemotronH format: .mixer.{k,v}_proj.{k,v}_scale ->
+        # .mixer.attn.{k,v}_scale
+        (r"\.mixer\.[kv]_proj\.([kv])_scale$", r".mixer.attn.\1_scale"),
         # Default format: .{k,v}_scale -> .attn.{k,v}_scale
         (r"\.([kv])_scale$", r".attn.\1_scale"),
     ]
|||||||
@@ -483,6 +483,7 @@ class NemotronHAttention(nn.Module):
             self.scaling,
             num_kv_heads=self.num_kv_heads,
             cache_config=cache_config,
+            quant_config=quant_config,
             prefix=f"{prefix}.attn",
         )
Reference in New Issue
Block a user