[XPU] enable fp8 online streaming quantization (#30944)
Signed-off-by: Yan Ma <yan.ma@intel.com>
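For context on the title: "online streaming quantization" means the FP8 scales are derived from live tensors at runtime rather than loaded from a pre-quantized checkpoint. A minimal sketch of dynamic per-tensor FP8 quantization in PyTorch, assuming `float8_e4m3fn` as the target dtype; the helper name `quantize_fp8_dynamic` is illustrative and not part of this commit:

```python
import torch

def quantize_fp8_dynamic(x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
    """Per-tensor dynamic (online) FP8 quantization sketch."""
    finfo = torch.finfo(torch.float8_e4m3fn)
    # Map the tensor's live absolute maximum onto the representable FP8 range.
    scale = x.abs().max().clamp(min=1e-12).float() / finfo.max
    x_fp8 = (x.float() / scale).clamp(finfo.min, finfo.max).to(torch.float8_e4m3fn)
    return x_fp8, scale

# Example: quantize a bf16 weight on the fly; dequantize with x_fp8.float() * scale.
w_fp8, w_scale = quantize_fp8_dynamic(torch.randn(128, 256, dtype=torch.bfloat16))
```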
@@ -124,11 +124,13 @@ def get_fp8_moe_backend(
     block_quant: bool,
     moe_parallel_config: FusedMoEParallelConfig,
     with_lora_support: bool,
-) -> Fp8MoeBackend:
+) -> Fp8MoeBackend | None:
     """
     Select the primary FP8 MoE backend
     Note: Shape-specific fallbacks may still occur at runtime.
     """
+    if current_platform.is_xpu():
+        return None
     if with_lora_support:
         return Fp8MoeBackend.TRITON
     # Prefer FlashInfer backends on supported GPUs; allow SM90 and SM100.
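With this first hunk the selector can return `None`, which XPU uses to opt out of the CUDA-oriented backend enum entirely. A hedged sketch of the caller-side contract (everything except `get_fp8_moe_backend` and `Fp8MoeBackend` is an assumption for illustration):

```python
# Assumed caller-side handling of the new `Fp8MoeBackend | None` contract.
backend = get_fp8_moe_backend(
    block_quant=False,
    moe_parallel_config=parallel_config,  # assumed to be in scope
    with_lora_support=False,
)
if backend is None:
    # XPU: no primary backend is selected; the XPU-specific MoE method
    # (registered in the second hunk) performs quantization itself.
    pass
elif backend is Fp8MoeBackend.TRITON:
    pass  # e.g. chosen whenever LoRA support is requested
```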
@@ -292,6 +294,13 @@ class Fp8Config(QuantizationConfig):
                 return UnquantizedLinearMethod()
             return XPUFp8LinearMethod(fp8_config)
         elif isinstance(layer, FusedMoE):
+            if is_layer_skipped(
+                prefix=prefix,
+                ignored_layers=self.ignored_layers,
+                fused_mapping=self.packed_modules_mapping,
+            ):
+                return UnquantizedFusedMoEMethod(layer.moe_config)

+            return XPUFp8MoEMethod(fp8_config, layer)
         elif isinstance(layer, Attention):
             return Fp8KVCacheMethod(self)
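The second hunk wires XPU into `Fp8Config.get_quant_method`: skipped layers stay unquantized, dense linears go to `XPUFp8LinearMethod`, MoE layers to the new `XPUFp8MoEMethod`, and attention keeps FP8 KV-cache handling. A condensed sketch of the resulting dispatch, with the surrounding `LinearBase` guard inferred from context and the non-XPU branches omitted (assumes the file's own imports):

```python
# Condensed dispatch sketch (XPU path only; not the verbatim source).
def get_quant_method(self, layer, prefix):
    if isinstance(layer, LinearBase):
        if is_layer_skipped(prefix=prefix,
                            ignored_layers=self.ignored_layers,
                            fused_mapping=self.packed_modules_mapping):
            return UnquantizedLinearMethod()        # explicitly ignored layers
        return XPUFp8LinearMethod(fp8_config)       # online FP8 for dense linears
    elif isinstance(layer, FusedMoE):
        if is_layer_skipped(prefix=prefix,
                            ignored_layers=self.ignored_layers,
                            fused_mapping=self.packed_modules_mapping):
            return UnquantizedFusedMoEMethod(layer.moe_config)
        return XPUFp8MoEMethod(fp8_config, layer)   # new in this commit
    elif isinstance(layer, Attention):
        return Fp8KVCacheMethod(self)               # FP8 KV cache path
    return None
```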
@@ -1107,7 +1116,8 @@ class Fp8MoEMethod(FusedMoEMethodBase):
         routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None,
     ) -> mk.FusedMoEPrepareAndFinalize | None:
         if (
-            self.rocm_aiter_moe_enabled
+            current_platform.is_xpu()
+            or self.rocm_aiter_moe_enabled
             or self.use_marlin
             or self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM
         ):
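The last hunk adds XPU to the early-out condition in `Fp8MoEMethod`'s prepare/finalize factory, alongside the ROCm AITER, Marlin, and FlashInfer TRT-LLM paths. The body below the condition is cut off in the hunk, but given the `mk.FusedMoEPrepareAndFinalize | None` return type it presumably short-circuits to `None`; a sketch, with the `return None` being an assumption:

```python
if (
    current_platform.is_xpu()  # new: XPU manages its own MoE input prep
    or self.rocm_aiter_moe_enabled
    or self.use_marlin
    or self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM
):
    return None  # assumed: these backends skip modular-kernel prepare/finalize
```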