[Hardware][Intel GPU] Add v1 Intel GPU support with Flash attention backend. (#19560)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
@@ -14,10 +14,12 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
 from vllm.attention.layer import Attention
 from vllm.attention.ops.merge_attn_states import merge_attn_states
 from vllm.attention.utils.fa_utils import (flash_attn_supports_fp8,
-                                           get_flash_attn_version)
+                                           flash_attn_varlen_func,
+                                           get_flash_attn_version,
+                                           get_scheduler_metadata,
+                                           reshape_and_cache_flash)
 from vllm.config import VllmConfig, get_layers_from_vllm_config
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
 from vllm.utils import cdiv
 from vllm.v1.attention.backends.utils import (
     AttentionMetadataBuilder, CommonAttentionMetadata, get_kv_cache_layout,
@@ -28,10 +30,6 @@ from vllm.v1.worker.block_table import BlockTable
 if TYPE_CHECKING:
     from vllm.v1.worker.gpu_model_runner import GPUModelRunner
 
-if current_platform.is_cuda():
-    from vllm.vllm_flash_attn import (flash_attn_varlen_func,
-                                      get_scheduler_metadata)
-
 logger = init_logger(__name__)
 
 
@@ -443,7 +441,7 @@ class FlashAttentionImpl(AttentionImpl):
         # and value[:num_actual_tokens] because the reshape_and_cache_flash
         # op uses the slot_mapping's shape to determine the number of
         # actual tokens.
-        torch.ops._C_cache_ops.reshape_and_cache_flash(
+        reshape_and_cache_flash(
             key,
             value,
             key_cache,
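
The core of the change is that the v1 FlashAttention backend no longer imports its helpers from the CUDA-only vllm.vllm_flash_attn module or calls torch.ops._C_cache_ops directly; instead it imports flash_attn_varlen_func, get_scheduler_metadata, and reshape_and_cache_flash from vllm.attention.utils.fa_utils, which can bind those names per platform. Below is a minimal sketch of that dispatch pattern, not the actual fa_utils source; it assumes the platform object exposes is_xpu() alongside is_cuda(), and the intel_xpu_flash_attn module name in the XPU branch is a hypothetical placeholder.

# Illustrative sketch of a platform-dispatching fa_utils module; not the real
# vLLM code. The key idea: the same helper names resolve on both CUDA and XPU,
# so the v1 FlashAttention backend can import them unconditionally.
import torch

from vllm.platforms import current_platform

if current_platform.is_cuda():
    # NVIDIA path: re-export helpers from the bundled flash-attention extension.
    from vllm.vllm_flash_attn import (flash_attn_varlen_func,
                                      get_scheduler_metadata)

    def reshape_and_cache_flash(*args, **kwargs):
        # Forward to the custom CUDA cache op the backend used to call directly.
        return torch.ops._C_cache_ops.reshape_and_cache_flash(*args, **kwargs)

elif current_platform.is_xpu():
    # Intel GPU path: bind the same names to XPU implementations.
    # `intel_xpu_flash_attn` is a placeholder, not the real module name.
    from intel_xpu_flash_attn import (flash_attn_varlen_func,
                                      get_scheduler_metadata,
                                      reshape_and_cache_flash)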