[XPU] enable fp8 online streaming quantization (#30944)

Signed-off-by: Yan Ma <yan.ma@intel.com>
Author: Yan Ma
Date: 2025-12-20 21:45:27 +08:00
Committed by: GitHub
Parent: 1501a4070e
Commit: 560ae9638c
2 changed files with 29 additions and 107 deletions


@@ -124,11 +124,13 @@ def get_fp8_moe_backend(
     block_quant: bool,
     moe_parallel_config: FusedMoEParallelConfig,
     with_lora_support: bool,
-) -> Fp8MoeBackend:
+) -> Fp8MoeBackend | None:
     """
     Select the primary FP8 MoE backend
     Note: Shape-specific fallbacks may still occur at runtime.
     """
+    if current_platform.is_xpu():
+        return None
     if with_lora_support:
         return Fp8MoeBackend.TRITON
     # Prefer FlashInfer backends on supported GPUs; allow SM90 and SM100.
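
With this hunk, get_fp8_moe_backend() returns None on XPU instead of picking one of the CUDA-oriented backends, so callers must now handle the None case. A minimal sketch of the resulting dispatch pattern, assuming a hypothetical caller named select_moe_impl (not part of this diff); presumably the None branch falls through to the online streaming quantization path this commit enables:

from enum import Enum


class Fp8MoeBackend(Enum):
    # Trimmed to two members for illustration; the real enum has more.
    TRITON = "triton"
    FLASHINFER = "flashinfer"


def select_moe_impl(backend: Fp8MoeBackend | None) -> str:
    # On XPU the selector returns None, and the caller falls back to
    # the platform-default FP8 MoE path rather than a CUDA backend.
    if backend is None:
        return "xpu-default-fp8-moe"
    return f"{backend.value}-fp8-moe"


print(select_moe_impl(None))  # -> xpu-default-fp8-moe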
@@ -292,6 +294,13 @@ class Fp8Config(QuantizationConfig):
                 return UnquantizedLinearMethod()
             return XPUFp8LinearMethod(fp8_config)
         elif isinstance(layer, FusedMoE):
+            if is_layer_skipped(
+                prefix=prefix,
+                ignored_layers=self.ignored_layers,
+                fused_mapping=self.packed_modules_mapping,
+            ):
+                return UnquantizedFusedMoEMethod(layer.moe_config)
+            return XPUFp8MoEMethod(fp8_config, layer)
         elif isinstance(layer, Attention):
             return Fp8KVCacheMethod(self)
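
The new FusedMoE branch mirrors the existing LinearBase handling: layers whose prefix appears in the config's ignored_layers keep an unquantized method, and everything else gets XPUFp8MoEMethod. A self-contained sketch of that skip-check pattern, using a simplified stand-in for vLLM's is_layer_skipped helper (the real one also resolves fused-module mappings via fused_mapping):

def is_layer_skipped(prefix: str, ignored_layers: list[str]) -> bool:
    # Simplified stand-in: match a layer prefix against the ignore list.
    return any(prefix.startswith(ignored) for ignored in ignored_layers)


def pick_moe_method(prefix: str, ignored_layers: list[str]) -> str:
    # Mirrors the control flow added in the hunk above.
    if is_layer_skipped(prefix, ignored_layers):
        return "UnquantizedFusedMoEMethod"
    return "XPUFp8MoEMethod"


assert pick_moe_method("model.layers.0.mlp.experts", ["model.layers.0"]) \
    == "UnquantizedFusedMoEMethod"
assert pick_moe_method("model.layers.1.mlp.experts", ["model.layers.0"]) \
    == "XPUFp8MoEMethod"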
@@ -1107,7 +1116,8 @@ class Fp8MoEMethod(FusedMoEMethodBase):
         routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None,
     ) -> mk.FusedMoEPrepareAndFinalize | None:
         if (
-            self.rocm_aiter_moe_enabled
+            current_platform.is_xpu()
+            or self.rocm_aiter_moe_enabled
             or self.use_marlin
             or self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM
         ):
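
Adding current_platform.is_xpu() to this guard makes XPU take the same early exit as the ROCm AITER, Marlin, and FlashInfer TensorRT-LLM paths: the method returns None and no modular-kernel prepare/finalize object is built. A toy model of the guard; the attribute names follow the diff, but the class itself is illustrative, not the vLLM implementation:

class PrepareFinalizeGuard:
    def __init__(self, is_xpu: bool, rocm_aiter: bool, marlin: bool):
        self.is_xpu = is_xpu
        self.rocm_aiter_moe_enabled = rocm_aiter
        self.use_marlin = marlin

    def maybe_make_prepare_finalize(self) -> object | None:
        # XPU now joins the platforms that skip building a
        # prepare/finalize object entirely.
        if self.is_xpu or self.rocm_aiter_moe_enabled or self.use_marlin:
            return None
        return object()  # stand-in for mk.FusedMoEPrepareAndFinalize


assert PrepareFinalizeGuard(True, False, False).maybe_make_prepare_finalize() is None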