[XPU] support MLA model on Intel GPU (#37143)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
This commit is contained in:
@@ -1059,6 +1059,10 @@ except ImportError:
|
||||
"MLA models using TRITON_MLA will require flash_attn. "
|
||||
"AITER_MLA backends use aiter kernels instead."
|
||||
)
|
||||
elif current_platform.is_xpu():
|
||||
from vllm._xpu_ops import xpu_ops as ops
|
||||
|
||||
flash_attn_varlen_func = ops.flash_attn_varlen_func # type: ignore[no-redef]
|
||||
|
||||
|
||||
def dynamic_per_batched_tensor_quant(
|
||||
|
||||
@@ -165,6 +165,16 @@ class QuantFP8(CustomOp):
|
||||
# Fallback to CUDA implementation
|
||||
return self.forward_cuda(x, scale, scale_ub)
|
||||
|
||||
def forward_xpu(
|
||||
self,
|
||||
x: torch.Tensor,
|
||||
scale: torch.Tensor | None = None,
|
||||
scale_ub: torch.Tensor | None = None,
|
||||
use_triton: bool = False,
|
||||
) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
# XPU can use same code path as CUDA.
|
||||
return self.forward_cuda(x, scale, scale_ub, use_triton)
|
||||
|
||||
def forward_native(
|
||||
self,
|
||||
x: torch.Tensor,
|
||||
|
||||
Reference in New Issue
Block a user