[Bugfix] Support other quantization methods in glm41v (#36321)
Signed-off-by: g00887675/loganJane <g00887675/loganJane73@hotmail.com>
Co-authored-by: g00887675/loganJane <g00887675/loganJane73@hotmail.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
@@ -63,6 +63,9 @@ from vllm.model_executor.layers.linear import (
     RowParallelLinear,
 )
 from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.layers.quantization.compressed_tensors import (
+    compressed_tensors,
+)
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.rotary_embedding.common import (
     ApplyRotaryEmb,
@@ -280,7 +283,9 @@ class Glm4vVisionAttention(nn.Module):
             bias=False,
             quant_config=quant_config,
             # Change qkv prefix to align with GLM-4.5V-FP8 quantization cfg
-            prefix=f"{prefix}.qkv_proj" if quant_config else f"{prefix}.qkv",
+            prefix=f"{prefix}.qkv_proj"
+            if isinstance(quant_config, compressed_tensors.CompressedTensorsConfig)
+            else f"{prefix}.qkv",
             disable_tp=use_data_parallel,
         )
         self.proj = RowParallelLinear(
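For context, the fix narrows the prefix rewrite: previously any non-None quant_config renamed the vision attention's fused QKV projection to "qkv_proj", which broke weight loading for quantization methods whose checkpoints keep the original "qkv" name. Below is a minimal sketch of the new selection logic pulled out of the layer constructor; the helper name select_qkv_prefix is illustrative only, not vLLM API, and it assumes vLLM's compressed_tensors module as imported in the diff:

from vllm.model_executor.layers.quantization.compressed_tensors import (
    compressed_tensors,
)

def select_qkv_prefix(prefix: str, quant_config) -> str:
    # GLM-4.5V-FP8 checkpoints are quantized with compressed-tensors and
    # name the fused projection "<prefix>.qkv_proj"; any other config,
    # or no quantization at all (quant_config is None, so isinstance
    # returns False), keeps the original "<prefix>.qkv" name.
    if isinstance(quant_config, compressed_tensors.CompressedTensorsConfig):
        return f"{prefix}.qkv_proj"
    return f"{prefix}.qkv"

Under the old "if quant_config" check, any other quantization config (say, AWQ or GPTQ) would also have triggered the rename, and the loader would then look for "qkv_proj.*" tensors that do not exist in those checkpoints. Keying on the concrete CompressedTensorsConfig type limits the rename to the one checkpoint layout that actually uses it.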