From bf184a66218beae5355c1e9784074c21b896fe0f Mon Sep 17 00:00:00 2001
From: roikoren755 <26850796+roikoren755@users.noreply.github.com>
Date: Wed, 7 Jan 2026 19:37:19 +0200
Subject: [PATCH] Enable quantized attention in NemotronH models (#31898)

Signed-off-by: Roi Koren
---
 vllm/model_executor/model_loader/weight_utils.py | 3 +++
 vllm/model_executor/models/nemotron_h.py         | 1 +
 2 files changed, 4 insertions(+)

diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index d55b61f27..02f10eb2a 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -1153,6 +1153,9 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> str | None:
         # Qwen3 MoE format: .self_attn.qkqkv_proj.{k,v}_scale ->
         # .self_attn.attn.{k,v}_scale
         (r"\.self_attn\.qkqkv_proj\.([kv])_scale$", r".self_attn.attn.\1_scale"),
+        # NemotronH format: .mixer.{k,v}_proj.{k,v}_scale ->
+        # .mixer.attn.{k,v}_scale
+        (r"\.mixer\.[kv]_proj\.([kv])_scale$", r".mixer.attn.\1_scale"),
         # Default format: .{k,v}_scale -> .attn.{k,v}_scale
         (r"\.([kv])_scale$", r".attn.\1_scale"),
     ]
diff --git a/vllm/model_executor/models/nemotron_h.py b/vllm/model_executor/models/nemotron_h.py
index c3275dd3c..aff1d5fd4 100644
--- a/vllm/model_executor/models/nemotron_h.py
+++ b/vllm/model_executor/models/nemotron_h.py
@@ -483,6 +483,7 @@ class NemotronHAttention(nn.Module):
             self.scaling,
             num_kv_heads=self.num_kv_heads,
             cache_config=cache_config,
+            quant_config=quant_config,
             prefix=f"{prefix}.attn",
         )

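Note (not part of the patch): a minimal sketch of how the new remapping pattern behaves. The `remap` helper and the `backbone.layers.3.mixer...` parameter names below are illustrative stand-ins, not the real vLLM code; the actual `maybe_remap_kv_scale_name` additionally checks that the remapped name exists in `params_dict` before returning it.

import re

# Pattern order mirrors the list in maybe_remap_kv_scale_name: the new
# NemotronH rule must fire before the generic ".{k,v}_scale" fallback,
# otherwise ".mixer.k_proj.k_scale" would be rewritten to the wrong
# name ".mixer.k_proj.attn.k_scale".
PATTERNS = [
    # NemotronH format: .mixer.{k,v}_proj.{k,v}_scale -> .mixer.attn.{k,v}_scale
    (r"\.mixer\.[kv]_proj\.([kv])_scale$", r".mixer.attn.\1_scale"),
    # Default format: .{k,v}_scale -> .attn.{k,v}_scale
    (r"\.([kv])_scale$", r".attn.\1_scale"),
]

def remap(name: str) -> str:
    """Simplified stand-in for maybe_remap_kv_scale_name (illustrative only)."""
    for pattern, replacement in PATTERNS:
        if re.search(pattern, name):
            return re.sub(pattern, replacement, name)
    return name

# Hypothetical NemotronH checkpoint parameter names; the KV-cache scales
# now map onto the fused attention module's parameters.
assert remap("backbone.layers.3.mixer.k_proj.k_scale") == "backbone.layers.3.mixer.attn.k_scale"
assert remap("backbone.layers.3.mixer.v_proj.v_scale") == "backbone.layers.3.mixer.attn.v_scale"

The nemotron_h.py hunk is the other half of the same fix: forwarding quant_config into the Attention constructor is what makes the attention layer quantization-aware in the first place, so that the attn.k_scale / attn.v_scale parameters exist for the remapped checkpoint names to load into.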