[XPU] support MLA model on Intel GPU (#37143)

Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
This commit is contained in:
Kunshang Ji
2026-03-25 17:43:42 +08:00
committed by GitHub
parent 189ddefbfd
commit 14771f7150
4 changed files with 15 additions and 12 deletions

View File

@@ -1059,6 +1059,10 @@ except ImportError:
"MLA models using TRITON_MLA will require flash_attn. "
"AITER_MLA backends use aiter kernels instead."
)
elif current_platform.is_xpu():
from vllm._xpu_ops import xpu_ops as ops
flash_attn_varlen_func = ops.flash_attn_varlen_func # type: ignore[no-redef]
def dynamic_per_batched_tensor_quant(

View File

@@ -165,6 +165,16 @@ class QuantFP8(CustomOp):
# Fallback to CUDA implementation
return self.forward_cuda(x, scale, scale_ub)
def forward_xpu(
self,
x: torch.Tensor,
scale: torch.Tensor | None = None,
scale_ub: torch.Tensor | None = None,
use_triton: bool = False,
) -> tuple[torch.Tensor, torch.Tensor]:
# XPU can use same code path as CUDA.
return self.forward_cuda(x, scale, scale_ub, use_triton)
def forward_native(
self,
x: torch.Tensor,