[Quantization] fix attention quantization of gpt_oss model (#27334)
Signed-off-by: xuebwang-amd <xuebwang@amd.com>
@@ -190,14 +190,25 @@ class Mxfp4Config(QuantizationConfig):
                 fused_mapping=self.packed_modules_mapping,
             ):
                 return UnquantizedLinearMethod()
-            raise NotImplementedError("Mxfp4 linear layer is not implemented")
+            # TODO: Add support for MXFP4 Linear Method.
+            # MXFP4 LinearMethod is available in AMD-Quark, refer to that implementation
+            # if you are interested in enabling MXFP4 here.
+            logger.warning_once(
+                "MXFP4 linear layer is not implemented - falling back to "
+                "UnquantizedLinearMethod."
+            )
+            return UnquantizedLinearMethod()
         elif isinstance(layer, FusedMoE):
             if current_platform.is_xpu():
                 return IpexMxfp4MoEMethod(layer.moe_config)
             else:
                 return Mxfp4MoEMethod(layer.moe_config)
         elif isinstance(layer, Attention):
-            raise NotImplementedError("Mxfp4 attention layer is not implemented")
+            # TODO: Add support for MXFP4 Attention.
+            logger.warning_once(
+                "MXFP4 attention layer is not implemented. "
+                "Skipping quantization for this layer."
+            )
+            return None
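With this hunk, requesting MXFP4 for layer types that have no MXFP4 kernel no longer raises NotImplementedError: dense linear layers fall back to UnquantizedLinearMethod, attention layers are left unquantized (None is returned), and only FusedMoE layers keep an MXFP4 method. A minimal usage sketch of the resulting behavior, assuming vLLM's public LLM API; the model name and prompt are illustrative and not taken from this commit:

# Sketch only: loading a gpt-oss MXFP4 checkpoint after this fix. Attention and
# dense linear layers now fall back to the unquantized path with a one-time
# warning instead of raising, while the MoE experts still use the MXFP4 method.
from vllm import LLM

llm = LLM(model="openai/gpt-oss-20b", quantization="mxfp4")  # model name is illustrative
out = llm.generate(["Hello"])
print(out[0].outputs[0].text)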
@@ -198,6 +198,7 @@ class TransformerBlock(torch.nn.Module):
     def __init__(
         self,
         vllm_config: VllmConfig,
+        quant_config: QuantizationConfig,
         prefix: str = "",
     ):
         super().__init__()
@@ -207,7 +208,10 @@ class TransformerBlock(torch.nn.Module):

         self.layer_idx = extract_layer_index(prefix)
         self.attn = OAIAttention(
-            config, prefix=f"{prefix}.attn", cache_config=cache_config
+            config,
+            prefix=f"{prefix}.attn",
+            quant_config=quant_config,
+            cache_config=cache_config,
         )
         self.mlp = MLPBlock(vllm_config, self.layer_idx, prefix=f"{prefix}.mlp")
         self.input_layernorm = RMSNorm(config.hidden_size, eps=1e-5)
@@ -243,6 +247,7 @@ class GptOssModel(nn.Module):
     ):
         super().__init__()
         self.config = vllm_config.model_config.hf_config
+        self.quant_config = vllm_config.quant_config
         self.parallel_config = vllm_config.parallel_config
         self.config.hidden_size = self.config.hidden_size
         self.embedding = VocabParallelEmbedding(
@@ -254,6 +259,7 @@ class GptOssModel(nn.Module):
             lambda prefix: TransformerBlock(
                 vllm_config,
                 prefix=prefix,
+                quant_config=self.quant_config,
             ),
             prefix=f"{prefix}.layers",
         )
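Taken together, the gpt_oss.py hunks above thread the quantization config from GptOssModel through TransformerBlock into OAIAttention, so the attention module is constructed with a quant_config and can resolve its own quantization method rather than being built without one. A self-contained sketch of that wiring pattern follows; only the argument name quant_config and the constructor-threading shape come from the diff, and the classes below are toy stand-ins rather than vLLM's implementation:

# Toy sketch of the constructor threading shown above (not vLLM code).
class StubQuantConfig:
    def get_quant_method(self, layer, prefix: str):
        # A real config (e.g. Mxfp4Config) dispatches on the layer type here.
        return None  # None means "leave this layer unquantized"

class ToyAttention:
    def __init__(self, prefix: str, quant_config: StubQuantConfig):
        # Because the config is passed in, the layer can pick its quant method.
        self.quant_method = quant_config.get_quant_method(self, prefix)

class ToyBlock:
    def __init__(self, prefix: str, quant_config: StubQuantConfig):
        self.attn = ToyAttention(f"{prefix}.attn", quant_config)

class ToyModel:
    def __init__(self, quant_config: StubQuantConfig):
        self.quant_config = quant_config
        self.layers = [ToyBlock(f"layers.{i}", self.quant_config) for i in range(2)]

model = ToyModel(StubQuantConfig())
assert model.layers[0].attn.quant_method is None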
@@ -645,7 +651,7 @@ class GptOssModel(nn.Module):


 class GptOssForCausalLM(nn.Module, SupportsPP, SupportsEagle3, SupportsLoRA):
-    packed_modules_mapping = {"qkv": ["q_proj", "k_proj", "v_proj"]}
+    packed_modules_mapping = {"qkv_proj": ["q_proj", "k_proj", "v_proj"]}

     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_substr={
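The packed_modules_mapping key has to be the name of the fused module the model actually builds ("qkv_proj", not "qkv"), because helpers such as is_layer_skipped in the MXFP4 hunk above receive this mapping as fused_mapping and use it to translate a fused layer prefix into the per-projection names listed in a checkpoint's ignore list. A toy sketch of that kind of lookup; toy_is_layer_skipped is a simplified illustration, not vLLM's helper:

# Illustrative only: with the corrected key, a fused "qkv_proj" prefix expands
# to the per-projection names, so an ignore list written in terms of
# q_proj/k_proj/v_proj is honored.
def toy_is_layer_skipped(prefix, ignored_layers, fused_mapping):
    fused_name = prefix.split(".")[-1]
    if fused_name in fused_mapping:
        shards = fused_mapping[fused_name]
        return all(
            prefix.replace(fused_name, shard) in ignored_layers for shard in shards
        )
    return prefix in ignored_layers

mapping = {"qkv_proj": ["q_proj", "k_proj", "v_proj"]}
ignored = {
    "model.layers.0.attn.q_proj",
    "model.layers.0.attn.k_proj",
    "model.layers.0.attn.v_proj",
}
assert toy_is_layer_skipped("model.layers.0.attn.qkv_proj", ignored, mapping)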