From 545d18d81bf11761e51c2b11a006573c2ae366c1 Mon Sep 17 00:00:00 2001 From: LoganJane <42287016+LoganJane@users.noreply.github.com> Date: Wed, 11 Mar 2026 17:48:05 +0800 Subject: [PATCH] [Bugfix] Support other quantization methods in glm41v (#36321) Signed-off-by: g00887675/loganJane Co-authored-by: g00887675/loganJane Co-authored-by: Isotr0py --- vllm/model_executor/models/glm4_1v.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index ff76a26bb..4722b6e3d 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -63,6 +63,9 @@ from vllm.model_executor.layers.linear import ( RowParallelLinear, ) from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.quantization.compressed_tensors import ( + compressed_tensors, +) from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.rotary_embedding.common import ( ApplyRotaryEmb, @@ -280,7 +283,9 @@ class Glm4vVisionAttention(nn.Module): bias=False, quant_config=quant_config, # Change qkv prefix to align with GLM-4.5V-FP8 quantization cfg - prefix=f"{prefix}.qkv_proj" if quant_config else f"{prefix}.qkv", + prefix=f"{prefix}.qkv_proj" + if isinstance(quant_config, compressed_tensors.CompressedTensorsConfig) + else f"{prefix}.qkv", disable_tp=use_data_parallel, ) self.proj = RowParallelLinear(