diff --git a/docker/Dockerfile.cpu b/docker/Dockerfile.cpu
index 98f99d089..063d3e6e4 100644
--- a/docker/Dockerfile.cpu
+++ b/docker/Dockerfile.cpu
@@ -134,7 +134,6 @@ WORKDIR /vllm-workspace
 # Copy test requirements
 COPY requirements/test.in requirements/cpu-test.in
 
-# TODO: Update to 2.9.0 when there is a new build for intel_extension_for_pytorch for that version
 RUN \
     sed -i '/mamba_ssm/d' requirements/cpu-test.in && \
     remove_packages_not_supported_on_aarch64() { \
diff --git a/docs/getting_started/installation/gpu.xpu.inc.md b/docs/getting_started/installation/gpu.xpu.inc.md
index 7e9c6a2b9..d8b84ace2 100644
--- a/docs/getting_started/installation/gpu.xpu.inc.md
+++ b/docs/getting_started/installation/gpu.xpu.inc.md
@@ -6,10 +6,11 @@ vLLM initially supports basic model inference and serving on Intel GPU platform.
 # --8<-- [start:requirements]
 - Supported Hardware: Intel Data Center GPU, Intel ARC GPU
-- OneAPI requirements: oneAPI 2025.1
+- OneAPI requirements: oneAPI 2025.3
+- Dependency: [vllm-xpu-kernels](https://github.com/vllm-project/vllm-xpu-kernels), a package providing all the custom kernels vLLM needs on Intel GPU platforms
 - Python: 3.12
 
 !!! warning
-    The provided IPEX whl is Python3.12 specific so this version is a MUST.
+    The provided vllm-xpu-kernels wheel is specific to Python 3.12, so this Python version is a MUST.
 
 # --8<-- [end:requirements]
 # --8<-- [start:set-up-using-python]
@@ -24,7 +25,7 @@ Currently, there are no pre-built XPU wheels.
 # --8<-- [end:pre-built-wheels]
 # --8<-- [start:build-wheel-from-source]
 
-- First, install required [driver](https://dgpu-docs.intel.com/driver/installation.html#installing-gpu-drivers) and [Intel OneAPI](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) 2025.1 or later.
+- First, install required [driver](https://dgpu-docs.intel.com/driver/installation.html#installing-gpu-drivers) and [Intel OneAPI](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) 2025.3 or later.
 - Second, install Python packages for vLLM XPU backend building:
 
 ```bash
@@ -37,7 +38,7 @@ pip install -v -r requirements/xpu.txt
 - Then, build and install vLLM XPU backend:
 
 ```bash
-VLLM_TARGET_DEVICE=xpu python setup.py install
+VLLM_TARGET_DEVICE=xpu pip install --no-build-isolation -e . -v
 ```
 
 # --8<-- [end:build-wheel-from-source]
diff --git a/tests/quantization/test_cpu_wna16.py b/tests/quantization/test_cpu_wna16.py
index 56b9c39b0..6c8a8f3d5 100644
--- a/tests/quantization/test_cpu_wna16.py
+++ b/tests/quantization/test_cpu_wna16.py
@@ -17,7 +17,7 @@ DTYPE = ["bfloat16"]
 
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", DTYPE)
-def test_ipex_quant(vllm_runner, model, dtype):
+def test_cpu_quant(vllm_runner, model, dtype):
     with vllm_runner(model, dtype=dtype) as llm:
         output = llm.generate_greedy(["The capital of France is"], max_tokens=32)
         assert output
diff --git a/tests/quantization/test_ipex_quant.py b/tests/quantization/test_ipex_quant.py
deleted file mode 100644
index 4f3c52df6..000000000
--- a/tests/quantization/test_ipex_quant.py
+++ /dev/null
@@ -1,32 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Test model set-up and inference for quantized HF models supported
-on the CPU/GPU backend using IPEX (including AWQ/GPTQ).
-
-Validating the configuration and printing results for manual checking.
-
-Run `pytest tests/quantization/test_ipex_quant.py`.
-""" - -import pytest - -from vllm.platforms import current_platform - -MODELS = [ - "AMead10/Llama-3.2-1B-Instruct-AWQ", - "shuyuej/Llama-3.2-1B-Instruct-GPTQ", # with g_idx -] -DTYPE = ["bfloat16"] - - -@pytest.mark.skipif( - not current_platform.is_cpu() and not current_platform.is_xpu(), - reason="only supports Intel CPU/XPU backend.", -) -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", DTYPE) -def test_ipex_quant(vllm_runner, model, dtype): - with vllm_runner(model, dtype=dtype, enforce_eager=True) as llm: - output = llm.generate_greedy(["The capital of France is"], max_tokens=4) - assert output - print(output) diff --git a/vllm/_ipex_ops.py b/vllm/_xpu_ops.py similarity index 96% rename from vllm/_ipex_ops.py rename to vllm/_xpu_ops.py index 22133eaef..e40b18f81 100644 --- a/vllm/_ipex_ops.py +++ b/vllm/_xpu_ops.py @@ -53,7 +53,7 @@ if hasattr(torch.ops._xpu_C, "int4_gemm_w4a16"): return torch.empty((M, N), dtype=input.dtype, device=input.device) -class ipex_ops: +class xpu_ops: @staticmethod def flash_attn_varlen_func( q: torch.Tensor, @@ -73,7 +73,7 @@ class ipex_ops: cu_seqlens_k: torch.Tensor | None = None, # passed in qwen vl dropout_p: float = 0.0, - # The following parameters are not used in ipex kernel currently, + # The following parameters are not used in xpu kernel currently, # we keep API compatible to CUDA's. scheduler_metadata=None, fa_version: int = 2, @@ -153,6 +153,6 @@ class ipex_ops: sm_margin=0, # Can be tuned if some SMs are used for communication ) -> None: logger.warning_once( - "get_scheduler_metadata is not implemented for ipex_ops, returning None." + "get_scheduler_metadata is not implemented for xpu_ops, returning None." ) return None diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index 13199124b..75501076a 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -160,7 +160,7 @@ def get_mxfp4_backend(with_lora_support: bool) -> Mxfp4Backend: logger.info_once("Using Triton backend") return Mxfp4Backend.TRITON elif current_platform.is_xpu(): - logger.info_once("Using ipex marlin backend on XPU") + logger.info_once("Using xpu backend on XPU") return Mxfp4Backend.MARLIN elif current_platform.is_rocm() and has_triton_kernels(): logger.info_once("Using Triton backend") diff --git a/vllm/model_executor/layers/sparse_attn_indexer.py b/vllm/model_executor/layers/sparse_attn_indexer.py index bd063de74..538860ca6 100644 --- a/vllm/model_executor/layers/sparse_attn_indexer.py +++ b/vllm/model_executor/layers/sparse_attn_indexer.py @@ -20,7 +20,7 @@ from vllm.v1.worker.workspace import current_workspace_manager if current_platform.is_cuda_alike(): from vllm import _custom_ops as ops elif current_platform.is_xpu(): - from vllm._ipex_ops import ipex_ops as ops + from vllm._xpu_ops import xpu_ops as ops logger = init_logger(__name__) diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index 60180b272..3edc83b15 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -345,7 +345,6 @@ class CpuPlatform(Platform): ld_preload_str += pytorch_libgomp_so os.environ["LD_PRELOAD"] = ld_preload_str - # To hint IPEX uses shared memory based AllReduce os.environ["LOCAL_WORLD_SIZE"] = str( vllm_config.parallel_config.tensor_parallel_size ) diff --git a/vllm/v1/attention/backends/fa_utils.py b/vllm/v1/attention/backends/fa_utils.py index ccf52aff2..3150ad9a5 100644 --- a/vllm/v1/attention/backends/fa_utils.py +++ 
b/vllm/v1/attention/backends/fa_utils.py @@ -23,12 +23,11 @@ if current_platform.is_cuda(): elif current_platform.is_xpu(): from vllm import _custom_ops as ops + from vllm._xpu_ops import xpu_ops reshape_and_cache_flash = ops.reshape_and_cache_flash - from vllm._ipex_ops import ipex_ops - - flash_attn_varlen_func = ipex_ops.flash_attn_varlen_func # type: ignore[assignment] - get_scheduler_metadata = ipex_ops.get_scheduler_metadata # type: ignore[assignment] + flash_attn_varlen_func = xpu_ops.flash_attn_varlen_func # type: ignore[assignment] + get_scheduler_metadata = xpu_ops.get_scheduler_metadata # type: ignore[assignment] elif current_platform.is_rocm(): try: from flash_attn import flash_attn_varlen_func # type: ignore[no-redef] @@ -153,7 +152,7 @@ def is_flash_attn_varlen_func_available() -> bool: Platform-specific sources: - CUDA: vllm.vllm_flash_attn.flash_attn_varlen_func - - XPU: ipex_ops.flash_attn_varlen_func + - XPU: xpu_ops.flash_attn_varlen_func - ROCm: upstream flash_attn.flash_attn_varlen_func (if available) Note: This is separate from the AITER flash attention backend (rocm_aiter_fa.py) diff --git a/vllm/v1/attention/ops/paged_attn.py b/vllm/v1/attention/ops/paged_attn.py index 73995fc93..896e929b5 100644 --- a/vllm/v1/attention/ops/paged_attn.py +++ b/vllm/v1/attention/ops/paged_attn.py @@ -9,7 +9,7 @@ from vllm.platforms import current_platform if current_platform.is_cuda_alike(): from vllm import _custom_ops as ops elif current_platform.is_xpu(): - from vllm._ipex_ops import ipex_ops as ops # type: ignore[no-redef] + from vllm._xpu_ops import xpu_ops as ops # type: ignore[no-redef] class PagedAttention:
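
For context, the call sites touched above (sparse_attn_indexer.py, fa_utils.py, paged_attn.py) all resolve their `ops` alias through the same platform switch. A minimal sketch of that post-rename pattern, assembled only from the hunks in this diff (running it requires a vLLM build for the matching platform):

```python
# Sketch of the platform-conditional import pattern this diff converges on.
from vllm.platforms import current_platform

if current_platform.is_cuda_alike():
    # CUDA/ROCm builds keep using the compiled custom ops.
    from vllm import _custom_ops as ops
elif current_platform.is_xpu():
    # vllm/_ipex_ops.py is renamed to vllm/_xpu_ops.py and the class
    # ipex_ops to xpu_ops; XPU call sites alias it to `ops`.
    from vllm._xpu_ops import xpu_ops as ops  # type: ignore[no-redef]
```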