diff --git a/tests/model_executor/test_weight_utils.py b/tests/model_executor/test_weight_utils.py index 6dc120ddb..93535ae0a 100644 --- a/tests/model_executor/test_weight_utils.py +++ b/tests/model_executor/test_weight_utils.py @@ -11,6 +11,7 @@ from huggingface_hub.utils import LocalEntryNotFoundError from vllm.model_executor.model_loader.weight_utils import ( download_weights_from_hf, enable_hf_transfer, + maybe_remap_kv_scale_name, ) @@ -61,6 +62,121 @@ def test_download_weights_from_hf(): ) +class TestMaybeRemapKvScaleName: + """Tests for maybe_remap_kv_scale_name covering all checkpoint formats.""" + + PARAMS_DICT = { + "model.layers.0.self_attn.attn.k_scale": None, + "model.layers.0.self_attn.attn.v_scale": None, + "model.layers.0.self_attn.attn.q_scale": None, + "model.layers.0.self_attn.qkv_proj.weight": None, + } + + def test_qkv_proj_k_scale(self): + """Qwen3-MoE / llm-compressor format: qkv_proj.k_scale -> attn.k_scale + Regression test for https://github.com/vllm-project/vllm/issues/25047""" + result = maybe_remap_kv_scale_name( + "model.layers.0.self_attn.qkv_proj.k_scale", self.PARAMS_DICT + ) + assert result == "model.layers.0.self_attn.attn.k_scale" + + def test_qkv_proj_v_scale(self): + """Qwen3-MoE / llm-compressor format: qkv_proj.v_scale -> attn.v_scale + Regression test for https://github.com/vllm-project/vllm/issues/25047""" + result = maybe_remap_kv_scale_name( + "model.layers.0.self_attn.qkv_proj.v_scale", self.PARAMS_DICT + ) + assert result == "model.layers.0.self_attn.attn.v_scale" + + def test_modelopt_k_proj_k_scale(self): + """ModelOpt format: k_proj.k_scale -> attn.k_scale""" + result = maybe_remap_kv_scale_name( + "model.layers.0.self_attn.k_proj.k_scale", self.PARAMS_DICT + ) + assert result == "model.layers.0.self_attn.attn.k_scale" + + def test_modelopt_v_proj_v_scale(self): + """ModelOpt format: v_proj.v_scale -> attn.v_scale""" + result = maybe_remap_kv_scale_name( + "model.layers.0.self_attn.v_proj.v_scale", self.PARAMS_DICT + ) + assert result == "model.layers.0.self_attn.attn.v_scale" + + def test_deprecated_kv_scale(self): + """Old format: kv_scale -> attn.k_scale (deprecated)""" + result = maybe_remap_kv_scale_name( + "model.layers.0.self_attn.kv_scale", self.PARAMS_DICT + ) + assert result == "model.layers.0.self_attn.attn.k_scale" + + def test_default_bare_k_scale(self): + """Default format: .k_scale -> .attn.k_scale""" + result = maybe_remap_kv_scale_name( + "model.layers.0.self_attn.k_scale", self.PARAMS_DICT + ) + assert result == "model.layers.0.self_attn.attn.k_scale" + + def test_non_scale_name_unchanged(self): + """Non-scale names should be returned unchanged.""" + name = "model.layers.0.self_attn.qkv_proj.weight" + result = maybe_remap_kv_scale_name(name, self.PARAMS_DICT) + assert result == name + + def test_nvfp4_modelopt_k_proj_k_scale(self): + """ModelOpt NVFP4 format (e.g. nvidia/Qwen3-30B-A3B-NVFP4): + k_proj.k_scale -> attn.k_scale. + Validates that NVFP4 checkpoints are not broken by this change.""" + result = maybe_remap_kv_scale_name( + "model.layers.0.self_attn.k_proj.k_scale", self.PARAMS_DICT + ) + assert result == "model.layers.0.self_attn.attn.k_scale" + + def test_nvfp4_modelopt_v_proj_v_scale(self): + """ModelOpt NVFP4 format (e.g. nvidia/Qwen3-30B-A3B-NVFP4): + v_proj.v_scale -> attn.v_scale. + Validates that NVFP4 checkpoints are not broken by this change.""" + result = maybe_remap_kv_scale_name( + "model.layers.0.self_attn.v_proj.v_scale", self.PARAMS_DICT + ) + assert result == "model.layers.0.self_attn.attn.v_scale" + + def test_qwen3_vl_moe_qkv_proj_k_scale(self): + """Qwen3-VL-MoE uses the same fused qkv_proj naming as Qwen3-MoE. + Regression test for qwen3_vl_moe.py fix (same bug as #25047).""" + result = maybe_remap_kv_scale_name( + "model.layers.0.self_attn.qkv_proj.k_scale", self.PARAMS_DICT + ) + assert result == "model.layers.0.self_attn.attn.k_scale" + + def test_qwen3_vl_moe_qkv_proj_v_scale(self): + """Qwen3-VL-MoE uses the same fused qkv_proj naming as Qwen3-MoE. + Regression test for qwen3_vl_moe.py fix (same bug as #25047).""" + result = maybe_remap_kv_scale_name( + "model.layers.0.self_attn.qkv_proj.v_scale", self.PARAMS_DICT + ) + assert result == "model.layers.0.self_attn.attn.v_scale" + + def test_nvfp4_weight_scale_not_remapped(self): + """NVFP4 weight_scale should not be touched by remap (not a kv scale).""" + name = "model.layers.0.self_attn.k_proj.weight_scale" + result = maybe_remap_kv_scale_name(name, self.PARAMS_DICT) + assert result == name + + def test_nvfp4_input_scale_not_remapped(self): + """NVFP4 input_scale should not be touched by remap (not a kv scale).""" + name = "model.layers.0.self_attn.k_proj.input_scale" + result = maybe_remap_kv_scale_name(name, self.PARAMS_DICT) + assert result == name + + def test_missing_target_returns_none(self): + """If remapped name not in params_dict, return None.""" + empty_params: dict[str, None] = {} + result = maybe_remap_kv_scale_name( + "model.layers.0.self_attn.qkv_proj.k_scale", empty_params + ) + assert result is None + + if __name__ == "__main__": test_hf_transfer_auto_activation() test_download_weights_from_hf() diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py index f9da9248e..95bb83a6b 100644 --- a/vllm/model_executor/models/qwen3_moe.py +++ b/vllm/model_executor/models/qwen3_moe.py @@ -535,10 +535,6 @@ class Qwen3MoeModel(nn.Module): ignore_suffixes = ( ".bias", "_bias", - ".k_scale", - "_k_scale", - ".v_scale", - "_v_scale", ".weight_scale", "_weight_scale", ".input_scale", @@ -562,6 +558,10 @@ class Qwen3MoeModel(nn.Module): weight_loader(param, loaded_weight) loaded_params.add(scale_name) continue + if "scale" in name or "zero_point" in name: + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue for param_name, weight_name, shard_id in stacked_params_mapping: # Skip non-stacked layers and experts (experts handled below). if weight_name not in name: @@ -654,20 +654,8 @@ class Qwen3MoeModel(nn.Module): # Skip layers on other devices. if is_pp_missing_parameter(name, self): continue - # Remapping the name of FP8 kv-scale. - if name.endswith("kv_scale"): - remapped_kv_scale_name = name.replace( - ".kv_scale", ".attn.kv_scale" - ) - if remapped_kv_scale_name not in params_dict: - logger.warning_once( - "Found kv scale in the checkpoint (e.g. %s), but not found the expected name in the model (e.g. %s). kv-scale is not loaded.", # noqa: E501 - name, - remapped_kv_scale_name, - ) - continue - else: - name = remapped_kv_scale_name + if name not in params_dict: + continue param = params_dict[name] weight_loader = getattr( param, "weight_loader", default_weight_loader diff --git a/vllm/model_executor/models/qwen3_vl_moe.py b/vllm/model_executor/models/qwen3_vl_moe.py index e6fc7d409..65f661695 100644 --- a/vllm/model_executor/models/qwen3_vl_moe.py +++ b/vllm/model_executor/models/qwen3_vl_moe.py @@ -172,10 +172,6 @@ class Qwen3MoeLLMModel(Qwen3MoeModel): ignore_suffixes = ( ".bias", "_bias", - ".k_scale", - "_k_scale", - ".v_scale", - "_v_scale", ".weight_scale", "_weight_scale", ".input_scale", @@ -191,6 +187,11 @@ class Qwen3MoeLLMModel(Qwen3MoeModel): ] num_experts = self.config.num_experts for name, loaded_weight in weights: + if "scale" in name or "zero_point" in name: + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue + for param_name, weight_name, shard_id in stacked_params_mapping: if "experts.gate_up_proj" in name or "experts.down_proj" in name: is_fused_expert = True @@ -305,20 +306,8 @@ class Qwen3MoeLLMModel(Qwen3MoeModel): # Skip layers on other devices. if is_pp_missing_parameter(name, self): continue - # Remapping the name of FP8 kv-scale. - if name.endswith("kv_scale"): - remapped_kv_scale_name = name.replace( - ".kv_scale", ".attn.kv_scale" - ) - if remapped_kv_scale_name not in params_dict: - logger.warning_once( - "Found kv scale in the checkpoint (e.g. %s), but not found the expected name in the model (e.g. %s). kv-scale is not loaded.", # noqa: E501 - name, - remapped_kv_scale_name, - ) - continue - else: - name = remapped_kv_scale_name + if name not in params_dict: + continue param = params_dict[name] weight_loader = getattr( param, "weight_loader", default_weight_loader