diff --git a/docker/Dockerfile.cpu b/docker/Dockerfile.cpu
index 98f99d089..063d3e6e4 100644
--- a/docker/Dockerfile.cpu
+++ b/docker/Dockerfile.cpu
@@ -134,7 +134,6 @@ WORKDIR /vllm-workspace
 # Copy test requirements
 COPY requirements/test.in requirements/cpu-test.in
 
-# TODO: Update to 2.9.0 when there is a new build for intel_extension_for_pytorch for that version
 RUN \
     sed -i '/mamba_ssm/d' requirements/cpu-test.in && \
     remove_packages_not_supported_on_aarch64() { \
diff --git a/docs/getting_started/installation/gpu.xpu.inc.md b/docs/getting_started/installation/gpu.xpu.inc.md
index 7e9c6a2b9..d8b84ace2 100644
--- a/docs/getting_started/installation/gpu.xpu.inc.md
+++ b/docs/getting_started/installation/gpu.xpu.inc.md
@@ -6,10 +6,11 @@ vLLM initially supports basic model inference and serving on Intel GPU platform.
 # --8<-- [start:requirements]
 - Supported Hardware: Intel Data Center GPU, Intel ARC GPU
-- OneAPI requirements: oneAPI 2025.1
+- OneAPI requirements: oneAPI 2025.3
+- Dependency: [vllm-xpu-kernels](https://github.com/vllm-project/vllm-xpu-kernels), a package providing all the custom kernels vLLM needs on Intel GPU platforms
 - Python: 3.12
 
 !!! warning
-    The provided IPEX whl is Python3.12 specific so this version is a MUST.
+    The provided vllm-xpu-kernels wheel is specific to Python 3.12, so this Python version is a MUST.
 
 # --8<-- [end:requirements]
 # --8<-- [start:set-up-using-python]
@@ -24,7 +25,7 @@ Currently, there are no pre-built XPU wheels.
 # --8<-- [end:pre-built-wheels]
 # --8<-- [start:build-wheel-from-source]
 
-- First, install required [driver](https://dgpu-docs.intel.com/driver/installation.html#installing-gpu-drivers) and [Intel OneAPI](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) 2025.1 or later.
+- First, install required [driver](https://dgpu-docs.intel.com/driver/installation.html#installing-gpu-drivers) and [Intel OneAPI](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) 2025.3 or later.
 - Second, install Python packages for vLLM XPU backend building:
 
 ```bash
@@ -37,7 +38,7 @@ pip install -v -r requirements/xpu.txt
 - Then, build and install vLLM XPU backend:
 
 ```bash
-VLLM_TARGET_DEVICE=xpu python setup.py install
+VLLM_TARGET_DEVICE=xpu pip install --no-build-isolation -e . -v
 ```
 
 # --8<-- [end:build-wheel-from-source]
diff --git a/tests/quantization/test_cpu_wna16.py b/tests/quantization/test_cpu_wna16.py
index 56b9c39b0..6c8a8f3d5 100644
--- a/tests/quantization/test_cpu_wna16.py
+++ b/tests/quantization/test_cpu_wna16.py
@@ -17,7 +17,7 @@ DTYPE = ["bfloat16"]
 
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", DTYPE)
-def test_ipex_quant(vllm_runner, model, dtype):
+def test_cpu_quant(vllm_runner, model, dtype):
     with vllm_runner(model, dtype=dtype) as llm:
         output = llm.generate_greedy(["The capital of France is"], max_tokens=32)
         assert output
diff --git a/tests/quantization/test_ipex_quant.py b/tests/quantization/test_ipex_quant.py
deleted file mode 100644
index 4f3c52df6..000000000
--- a/tests/quantization/test_ipex_quant.py
+++ /dev/null
@@ -1,32 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Test model set-up and inference for quantized HF models supported
-on the CPU/GPU backend using IPEX (including AWQ/GPTQ).
-
-Validating the configuration and printing results for manual checking.
-
-Run `pytest tests/quantization/test_ipex_quant.py`.
-""" - -import pytest - -from vllm.platforms import current_platform - -MODELS = [ - "AMead10/Llama-3.2-1B-Instruct-AWQ", - "shuyuej/Llama-3.2-1B-Instruct-GPTQ", # with g_idx -] -DTYPE = ["bfloat16"] - - -@pytest.mark.skipif( - not current_platform.is_cpu() and not current_platform.is_xpu(), - reason="only supports Intel CPU/XPU backend.", -) -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", DTYPE) -def test_ipex_quant(vllm_runner, model, dtype): - with vllm_runner(model, dtype=dtype, enforce_eager=True) as llm: - output = llm.generate_greedy(["The capital of France is"], max_tokens=4) - assert output - print(output) diff --git a/vllm/_ipex_ops.py b/vllm/_xpu_ops.py similarity index 96% rename from vllm/_ipex_ops.py rename to vllm/_xpu_ops.py index 22133eaef..e40b18f81 100644 --- a/vllm/_ipex_ops.py +++ b/vllm/_xpu_ops.py @@ -53,7 +53,7 @@ if hasattr(torch.ops._xpu_C, "int4_gemm_w4a16"): return torch.empty((M, N), dtype=input.dtype, device=input.device) -class ipex_ops: +class xpu_ops: @staticmethod def flash_attn_varlen_func( q: torch.Tensor, @@ -73,7 +73,7 @@ class ipex_ops: cu_seqlens_k: torch.Tensor | None = None, # passed in qwen vl dropout_p: float = 0.0, - # The following parameters are not used in ipex kernel currently, + # The following parameters are not used in xpu kernel currently, # we keep API compatible to CUDA's. scheduler_metadata=None, fa_version: int = 2, @@ -153,6 +153,6 @@ class ipex_ops: sm_margin=0, # Can be tuned if some SMs are used for communication ) -> None: logger.warning_once( - "get_scheduler_metadata is not implemented for ipex_ops, returning None." + "get_scheduler_metadata is not implemented for xpu_ops, returning None." ) return None diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index 13199124b..75501076a 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -160,7 +160,7 @@ def get_mxfp4_backend(with_lora_support: bool) -> Mxfp4Backend: logger.info_once("Using Triton backend") return Mxfp4Backend.TRITON elif current_platform.is_xpu(): - logger.info_once("Using ipex marlin backend on XPU") + logger.info_once("Using xpu backend on XPU") return Mxfp4Backend.MARLIN elif current_platform.is_rocm() and has_triton_kernels(): logger.info_once("Using Triton backend") diff --git a/vllm/model_executor/layers/sparse_attn_indexer.py b/vllm/model_executor/layers/sparse_attn_indexer.py index bd063de74..538860ca6 100644 --- a/vllm/model_executor/layers/sparse_attn_indexer.py +++ b/vllm/model_executor/layers/sparse_attn_indexer.py @@ -20,7 +20,7 @@ from vllm.v1.worker.workspace import current_workspace_manager if current_platform.is_cuda_alike(): from vllm import _custom_ops as ops elif current_platform.is_xpu(): - from vllm._ipex_ops import ipex_ops as ops + from vllm._xpu_ops import xpu_ops as ops logger = init_logger(__name__) diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index 60180b272..3edc83b15 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -345,7 +345,6 @@ class CpuPlatform(Platform): ld_preload_str += pytorch_libgomp_so os.environ["LD_PRELOAD"] = ld_preload_str - # To hint IPEX uses shared memory based AllReduce os.environ["LOCAL_WORLD_SIZE"] = str( vllm_config.parallel_config.tensor_parallel_size ) diff --git a/vllm/v1/attention/backends/fa_utils.py b/vllm/v1/attention/backends/fa_utils.py index ccf52aff2..3150ad9a5 100644 --- a/vllm/v1/attention/backends/fa_utils.py +++ 
b/vllm/v1/attention/backends/fa_utils.py @@ -23,12 +23,11 @@ if current_platform.is_cuda(): elif current_platform.is_xpu(): from vllm import _custom_ops as ops + from vllm._xpu_ops import xpu_ops reshape_and_cache_flash = ops.reshape_and_cache_flash - from vllm._ipex_ops import ipex_ops - - flash_attn_varlen_func = ipex_ops.flash_attn_varlen_func # type: ignore[assignment] - get_scheduler_metadata = ipex_ops.get_scheduler_metadata # type: ignore[assignment] + flash_attn_varlen_func = xpu_ops.flash_attn_varlen_func # type: ignore[assignment] + get_scheduler_metadata = xpu_ops.get_scheduler_metadata # type: ignore[assignment] elif current_platform.is_rocm(): try: from flash_attn import flash_attn_varlen_func # type: ignore[no-redef] @@ -153,7 +152,7 @@ def is_flash_attn_varlen_func_available() -> bool: Platform-specific sources: - CUDA: vllm.vllm_flash_attn.flash_attn_varlen_func - - XPU: ipex_ops.flash_attn_varlen_func + - XPU: xpu_ops.flash_attn_varlen_func - ROCm: upstream flash_attn.flash_attn_varlen_func (if available) Note: This is separate from the AITER flash attention backend (rocm_aiter_fa.py) diff --git a/vllm/v1/attention/ops/paged_attn.py b/vllm/v1/attention/ops/paged_attn.py index 73995fc93..896e929b5 100644 --- a/vllm/v1/attention/ops/paged_attn.py +++ b/vllm/v1/attention/ops/paged_attn.py @@ -9,7 +9,7 @@ from vllm.platforms import current_platform if current_platform.is_cuda_alike(): from vllm import _custom_ops as ops elif current_platform.is_xpu(): - from vllm._ipex_ops import ipex_ops as ops # type: ignore[no-redef] + from vllm._xpu_ops import xpu_ops as ops # type: ignore[no-redef] class PagedAttention:
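
For context, the call sites touched above (sparse_attn_indexer.py, fa_utils.py, paged_attn.py) all resolve their `ops` alias through the same platform switch. A minimal sketch of that post-rename pattern, assembled only from the hunks in this diff (running it requires a vLLM build for the matching platform):

```python
# Sketch of the platform-conditional import pattern this diff converges on.
from vllm.platforms import current_platform

if current_platform.is_cuda_alike():
    # CUDA/ROCm builds keep using the compiled custom ops.
    from vllm import _custom_ops as ops
elif current_platform.is_xpu():
    # vllm/_ipex_ops.py is renamed to vllm/_xpu_ops.py and the class
    # ipex_ops to xpu_ops; XPU call sites alias it to `ops`.
    from vllm._xpu_ops import xpu_ops as ops  # type: ignore[no-redef]
```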