From b1b5e045dfdcb4ca31931e05fafa8ea15823d740 Mon Sep 17 00:00:00 2001
From: Yan Ma <yan.ma@intel.com>
Date: Mon, 23 Feb 2026 21:06:44 +0800
Subject: [PATCH] [XPU] allow TORCH_SDPA/TRITON_ATTN as XPU vit Backend
 (#35010)

Signed-off-by: Yan Ma <yan.ma@intel.com>
---
 .../layers/attention/mm_encoder_attention.py      | 15 +++++++++++----
 vllm/platforms/xpu.py                             |  1 +
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/vllm/model_executor/layers/attention/mm_encoder_attention.py b/vllm/model_executor/layers/attention/mm_encoder_attention.py
index 1e9c714ea..e59806abb 100644
--- a/vllm/model_executor/layers/attention/mm_encoder_attention.py
+++ b/vllm/model_executor/layers/attention/mm_encoder_attention.py
@@ -249,7 +249,14 @@ class MMEncoderAttention(CustomOp):
         cu_seqlens: torch.Tensor | None = None,
         max_seqlen: torch.Tensor | None = None,  # Only used for Flash Attention
     ) -> torch.Tensor:
-        assert self.is_flash_attn_backend, (
-            "XPU only supports FLASH_ATTN for vision attention."
-        )
-        return self._forward_fa(query, key, value, cu_seqlens, max_seqlen)
+        if self.attn_backend == AttentionBackendEnum.FLASH_ATTN:
+            return self._forward_fa(query, key, value, cu_seqlens, max_seqlen)
+        elif self.attn_backend == AttentionBackendEnum.TRITON_ATTN:
+            return self._forward_triton(query, key, value, cu_seqlens, max_seqlen)
+        elif self.attn_backend == AttentionBackendEnum.TORCH_SDPA:
+            return self._forward_sdpa(query, key, value, cu_seqlens)
+        else:
+            raise ValueError(
+                f"Unsupported multi-modal encoder attention backend for XPU: "
+                f"{self.attn_backend}."
+            )
diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py
index 8daa2d47f..5ce3cfba8 100644
--- a/vllm/platforms/xpu.py
+++ b/vllm/platforms/xpu.py
@@ -89,6 +89,7 @@ class XPUPlatform(Platform):
     def get_supported_vit_attn_backends(cls) -> list["AttentionBackendEnum"]:
         return [
             AttentionBackendEnum.FLASH_ATTN,
+            AttentionBackendEnum.TRITON_ATTN,
             AttentionBackendEnum.TORCH_SDPA,
         ]