From 771913e4a024945a31c9a4ad607b81993704582c Mon Sep 17 00:00:00 2001 From: Vadim Gimpelson <156319763+vadiklyutiy@users.noreply.github.com> Date: Fri, 3 Apr 2026 04:45:57 +0400 Subject: [PATCH] [Bugfix] Fix NVFP4+MTP crash: force unquantized mtp.fc for Qwen3.5 (#38832) Signed-off-by: Vadim Gimpelson --- vllm/model_executor/models/qwen3_5_mtp.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/qwen3_5_mtp.py b/vllm/model_executor/models/qwen3_5_mtp.py index 0eca47492..e49806365 100644 --- a/vllm/model_executor/models/qwen3_5_mtp.py +++ b/vllm/model_executor/models/qwen3_5_mtp.py @@ -75,13 +75,22 @@ class Qwen3_5MultiTokenPredictor(nn.Module): config.hidden_size, ) + # Workaround: mtp.fc is stored as BF16 in NVFP4 checkpoints but is + # missing from hf_quant_config.json exclude_modules. Force unquantized. + # Ref: https://github.com/vllm-project/vllm/pull/38650 + # Ref: https://github.com/NVIDIA/Model-Optimizer/pull/1124 + fc_quant = ( + None + if (quant_config and quant_config.get_name() == "modelopt_fp4") + else quant_config + ) self.fc = ColumnParallelLinear( self.config.hidden_size * 2, self.config.hidden_size, gather_output=True, bias=False, return_bias=False, - quant_config=quant_config, + quant_config=fc_quant, prefix=f"{prefix}.fc", )