[XPU] support Triton Attention backend on Intel GPU (#24149)

Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>

Author: Kunshang Ji
Date: 2025-09-04 20:41:08 +08:00
Committed by: GitHub
Parent commit: 2b30afa442
This commit: 16ded21eeb
5 changed files with 49 additions and 15 deletions

@@ -242,10 +242,9 @@ class ipex_ops:
         k_scale_float: float = 1.0,
         v_scale_float: float = 1.0,
     ) -> None:
-        assert kv_cache_dtype == "auto"
-        # TODO: support FP8 kv cache.
         ipex.llm.modules.PagedAttention.reshape_and_cache_flash(
-            key, value, key_cache, value_cache, slot_mapping)
+            key, value, key_cache, value_cache, slot_mapping, kv_cache_dtype,
+            k_scale_float, v_scale_float)

     @staticmethod
     def flash_attn_varlen_func(
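
For context, the sketch below shows the updated IPEX call in isolation, mirroring the new argument list from the diff. It is only a hedged illustration: it assumes intel_extension_for_pytorch is installed with an Intel GPU ("xpu") device available, and the tensor shapes, the flash-style cache layout, and the "auto" dtype string are assumptions rather than values taken from the actual vLLM call site.

# Hedged sketch of the updated call, not the vLLM call site itself.
# Assumes an Intel GPU ("xpu") and intel_extension_for_pytorch; shapes
# and the "auto" dtype string are illustrative assumptions.
import torch
import intel_extension_for_pytorch as ipex

num_tokens, num_heads, head_size = 16, 8, 64
num_blocks, block_size = 4, 16

# New key/value states for the current tokens.
key = torch.randn(num_tokens, num_heads, head_size, device="xpu")
value = torch.randn_like(key)

# Flash-style paged KV cache: [num_blocks, block_size, num_heads, head_size].
key_cache = torch.empty(num_blocks, block_size, num_heads, head_size,
                        device="xpu")
value_cache = torch.empty_like(key_cache)

# Flat slot index into the paged cache for each token.
slot_mapping = torch.arange(num_tokens, dtype=torch.long, device="xpu")

# With this commit, the dtype string and the float scales are forwarded to
# IPEX instead of being blocked by the removed `assert kv_cache_dtype ==
# "auto"`, paving the way for FP8 KV caches.
ipex.llm.modules.PagedAttention.reshape_and_cache_flash(
    key, value, key_cache, value_cache, slot_mapping, "auto", 1.0, 1.0)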