[XPU][9/N] clean up existing ipex code/doc (#34111)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
This commit is contained in:
@@ -53,7 +53,7 @@ if hasattr(torch.ops._xpu_C, "int4_gemm_w4a16"):
|
||||
return torch.empty((M, N), dtype=input.dtype, device=input.device)
|
||||
|
||||
|
||||
class ipex_ops:
|
||||
class xpu_ops:
|
||||
@staticmethod
|
||||
def flash_attn_varlen_func(
|
||||
q: torch.Tensor,
|
||||
@@ -73,7 +73,7 @@ class ipex_ops:
|
||||
cu_seqlens_k: torch.Tensor | None = None,
|
||||
# passed in qwen vl
|
||||
dropout_p: float = 0.0,
|
||||
# The following parameters are not used in ipex kernel currently,
|
||||
# The following parameters are not used in xpu kernel currently,
|
||||
# we keep API compatible to CUDA's.
|
||||
scheduler_metadata=None,
|
||||
fa_version: int = 2,
|
||||
@@ -153,6 +153,6 @@ class ipex_ops:
|
||||
sm_margin=0, # Can be tuned if some SMs are used for communication
|
||||
) -> None:
|
||||
logger.warning_once(
|
||||
"get_scheduler_metadata is not implemented for ipex_ops, returning None."
|
||||
"get_scheduler_metadata is not implemented for xpu_ops, returning None."
|
||||
)
|
||||
return None
|
||||
@@ -160,7 +160,7 @@ def get_mxfp4_backend(with_lora_support: bool) -> Mxfp4Backend:
|
||||
logger.info_once("Using Triton backend")
|
||||
return Mxfp4Backend.TRITON
|
||||
elif current_platform.is_xpu():
|
||||
logger.info_once("Using ipex marlin backend on XPU")
|
||||
logger.info_once("Using xpu backend on XPU")
|
||||
return Mxfp4Backend.MARLIN
|
||||
elif current_platform.is_rocm() and has_triton_kernels():
|
||||
logger.info_once("Using Triton backend")
|
||||
|
||||
@@ -20,7 +20,7 @@ from vllm.v1.worker.workspace import current_workspace_manager
|
||||
if current_platform.is_cuda_alike():
|
||||
from vllm import _custom_ops as ops
|
||||
elif current_platform.is_xpu():
|
||||
from vllm._ipex_ops import ipex_ops as ops
|
||||
from vllm._xpu_ops import xpu_ops as ops
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
@@ -345,7 +345,6 @@ class CpuPlatform(Platform):
|
||||
ld_preload_str += pytorch_libgomp_so
|
||||
os.environ["LD_PRELOAD"] = ld_preload_str
|
||||
|
||||
# To hint IPEX uses shared memory based AllReduce
|
||||
os.environ["LOCAL_WORLD_SIZE"] = str(
|
||||
vllm_config.parallel_config.tensor_parallel_size
|
||||
)
|
||||
|
||||
@@ -23,12 +23,11 @@ if current_platform.is_cuda():
|
||||
|
||||
elif current_platform.is_xpu():
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm._xpu_ops import xpu_ops
|
||||
|
||||
reshape_and_cache_flash = ops.reshape_and_cache_flash
|
||||
from vllm._ipex_ops import ipex_ops
|
||||
|
||||
flash_attn_varlen_func = ipex_ops.flash_attn_varlen_func # type: ignore[assignment]
|
||||
get_scheduler_metadata = ipex_ops.get_scheduler_metadata # type: ignore[assignment]
|
||||
flash_attn_varlen_func = xpu_ops.flash_attn_varlen_func # type: ignore[assignment]
|
||||
get_scheduler_metadata = xpu_ops.get_scheduler_metadata # type: ignore[assignment]
|
||||
elif current_platform.is_rocm():
|
||||
try:
|
||||
from flash_attn import flash_attn_varlen_func # type: ignore[no-redef]
|
||||
@@ -153,7 +152,7 @@ def is_flash_attn_varlen_func_available() -> bool:
|
||||
|
||||
Platform-specific sources:
|
||||
- CUDA: vllm.vllm_flash_attn.flash_attn_varlen_func
|
||||
- XPU: ipex_ops.flash_attn_varlen_func
|
||||
- XPU: xpu_ops.flash_attn_varlen_func
|
||||
- ROCm: upstream flash_attn.flash_attn_varlen_func (if available)
|
||||
|
||||
Note: This is separate from the AITER flash attention backend (rocm_aiter_fa.py)
|
||||
|
||||
@@ -9,7 +9,7 @@ from vllm.platforms import current_platform
|
||||
if current_platform.is_cuda_alike():
|
||||
from vllm import _custom_ops as ops
|
||||
elif current_platform.is_xpu():
|
||||
from vllm._ipex_ops import ipex_ops as ops # type: ignore[no-redef]
|
||||
from vllm._xpu_ops import xpu_ops as ops # type: ignore[no-redef]
|
||||
|
||||
|
||||
class PagedAttention:
|
||||
|
||||
Reference in New Issue
Block a user