From 771913e4a024945a31c9a4ad607b81993704582c Mon Sep 17 00:00:00 2001
From: Vadim Gimpelson <156319763+vadiklyutiy@users.noreply.github.com>
Date: Fri, 3 Apr 2026 04:45:57 +0400
Subject: [PATCH] [Bugfix] Fix NVFP4+MTP crash: force unquantized mtp.fc for
 Qwen3.5 (#38832)

Signed-off-by: Vadim Gimpelson <vadim.gimpelson@gmail.com>
---
 vllm/model_executor/models/qwen3_5_mtp.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/qwen3_5_mtp.py b/vllm/model_executor/models/qwen3_5_mtp.py
index 0eca47492..e49806365 100644
--- a/vllm/model_executor/models/qwen3_5_mtp.py
+++ b/vllm/model_executor/models/qwen3_5_mtp.py
@@ -75,13 +75,22 @@ class Qwen3_5MultiTokenPredictor(nn.Module):
             config.hidden_size,
         )
 
+        # Workaround: NVFP4 checkpoints store mtp.fc in BF16 but omit it from
+        # hf_quant_config.json's exclude_modules, so force it unquantized.
+        # Ref: https://github.com/vllm-project/vllm/pull/38650
+        # Ref: https://github.com/NVIDIA/Model-Optimizer/pull/1124
+        fc_quant = (
+            None
+            if (quant_config and quant_config.get_name() == "modelopt_fp4")
+            else quant_config
+        )
         self.fc = ColumnParallelLinear(
             self.config.hidden_size * 2,
             self.config.hidden_size,
             gather_output=True,
             bias=False,
             return_bias=False,
-            quant_config=quant_config,
+            quant_config=fc_quant,
             prefix=f"{prefix}.fc",
         )