From f79d9dce16781b403713b8813df04d57c053bdbf Mon Sep 17 00:00:00 2001 From: Fadi Arafeh <115173828+fadara01@users.noreply.github.com> Date: Fri, 6 Feb 2026 11:59:20 +0000 Subject: [PATCH] [CPU][BugFix] Fix loading of w8a8int models with bias (#33582) Signed-off-by: Fadi Arafeh --- .../kernels/mixed_precision/dynamic_4bit.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/dynamic_4bit.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/dynamic_4bit.py index bc7076c36..3dfe06f1b 100644 --- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/dynamic_4bit.py +++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/dynamic_4bit.py @@ -86,9 +86,14 @@ class Dynamic4bitLinearKernel(MPLinearKernel): ) # Float32 & Bfloat16 variants requires float32 scales scales = scales.view(-1, 1) # Channel-wise scales if layer.bias is not None: - layer.bias = layer.bias.to( - torch.float32 - ) # Float32 & Bfloat16 variants requires float32 bias + # Float32 & Bfloat16 variants require float32 bias + replace_parameter( + layer, + "bias", + torch.nn.Parameter( + layer.bias.to(torch.float32), requires_grad=False + ), + ) else: # KleidiAI kernel requires bfloat16 scales with groupwise scheme scales = scales.to(torch.bfloat16)