diff --git a/.buildkite/hardware_tests/cpu.yaml b/.buildkite/hardware_tests/cpu.yaml
index 5c181943c..501fc4d28 100644
--- a/.buildkite/hardware_tests/cpu.yaml
+++ b/.buildkite/hardware_tests/cpu.yaml
@@ -3,7 +3,6 @@ depends_on: []
 steps:
 - label: CPU-Kernel Tests
   depends_on: []
-  soft_fail: true
   device: intel_cpu
   no_plugin: true
   source_file_dependencies:
@@ -23,7 +22,6 @@ steps:
 
 - label: CPU-Compatibility Tests
   depends_on: []
-  soft_fail: true
   device: intel_cpu
   no_plugin: true
   source_file_dependencies:
@@ -37,7 +35,6 @@ steps:
 
 - label: CPU-Language Generation and Pooling Model Tests
   depends_on: []
-  soft_fail: true
   device: intel_cpu
   no_plugin: true
   source_file_dependencies:
@@ -53,7 +50,6 @@ steps:
 
 - label: CPU-Quantization Model Tests
   depends_on: []
-  soft_fail: true
   device: intel_cpu
   no_plugin: true
   source_file_dependencies:
@@ -73,7 +69,6 @@ steps:
 
 - label: CPU-Distributed Tests
   depends_on: []
-  soft_fail: true
   device: intel_cpu
   no_plugin: true
   source_file_dependencies:
@@ -92,7 +87,6 @@ steps:
 
 - label: CPU-Multi-Modal Model Tests %N
   depends_on: []
-  soft_fail: true
   device: intel_cpu
   no_plugin: true
   source_file_dependencies:
@@ -107,7 +101,6 @@ steps:
 
 - label: "Arm CPU Test"
   depends_on: []
-  soft_fail: true
   device: arm_cpu
   no_plugin: true
   commands:
diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp
index d011ff038..15b254662 100644
--- a/csrc/cpu/torch_bindings.cpp
+++ b/csrc/cpu/torch_bindings.cpp
@@ -126,6 +126,12 @@ void cpu_fused_moe(torch::Tensor& output, const torch::Tensor& input,
                    const torch::Tensor& topk_id, const bool skip_weighted,
                    const std::string& act, const std::string& isa);
 
+void compute_slot_mapping_kernel_impl(const torch::Tensor query_start_loc,
+                                      const torch::Tensor positions,
+                                      const torch::Tensor block_table,
+                                      torch::Tensor slot_mapping,
+                                      const int64_t block_size);
+
 TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
   // vLLM custom ops
 
@@ -334,6 +340,12 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
       " Tensor! out, Tensor query, Tensor kv_cache,"
       " float scale, Tensor block_tables, Tensor seq_lens) -> ()");
   ops.impl("mla_decode_kvcache", torch::kCPU, &mla_decode_kvcache);
+
+  ops.def(
+      "compute_slot_mapping_kernel_impl(Tensor query_start_loc, Tensor "
+      "positions, Tensor block_table, Tensor(a3!) slot_mapping, SymInt "
+      "block_size) -> ()",
+      &compute_slot_mapping_kernel_impl);
 }
 
 REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
diff --git a/csrc/cpu/utils.cpp b/csrc/cpu/utils.cpp
index e2812fe57..3c133a0c5 100644
--- a/csrc/cpu/utils.cpp
+++ b/csrc/cpu/utils.cpp
@@ -189,3 +189,38 @@ ScratchPadManager* ScratchPadManager::get_scratchpad_manager() {
   return &manager;
 }
 }  // namespace cpu_utils
+
+// CPU fallback for the Triton slot-mapping kernel: for each request, map each
+// token's position to a physical KV-cache slot via the request's block table.
+void compute_slot_mapping_kernel_impl(const torch::Tensor query_start_loc,
+                                      const torch::Tensor positions,
+                                      const torch::Tensor block_table,
+                                      torch::Tensor slot_mapping,
+                                      const int64_t block_size) {
+  const int32_t req_num = query_start_loc.size(0) - 1;
+  const int64_t block_table_stride = block_table.stride(0);
+
+  // NOTE: data_ptr() returns void*, which does not implicitly convert to a
+  // typed pointer in C++ -- the templated data_ptr<T>() accessor is required
+  // (and also checks the tensor dtype at runtime).
+  const int32_t* __restrict__ query_start_loc_ptr =
+      query_start_loc.data_ptr<int32_t>();
+  const int64_t* __restrict__ positions_ptr = positions.data_ptr<int64_t>();
+  const int32_t* __restrict__ blocktable_ptr = block_table.data_ptr<int32_t>();
+  int64_t* __restrict__ slot_mapping_ptr = slot_mapping.data_ptr<int64_t>();
+
+#pragma omp parallel for
+  for (int32_t req_idx = 0; req_idx < req_num; ++req_idx) {
+    int32_t token_start_idx = query_start_loc_ptr[req_idx];
+    int32_t token_end_idx = query_start_loc_ptr[req_idx + 1];
+    int32_t token_num = token_end_idx - token_start_idx;
+    const int64_t* __restrict__ curr_position_ptr =
+        positions_ptr + token_start_idx;
+    int64_t* __restrict__ curr_slot_mapping_ptr =
+        slot_mapping_ptr + token_start_idx;
+    const int32_t* __restrict__ curr_block_table_ptr =
+        blocktable_ptr + req_idx * block_table_stride;
+
+    for (int32_t token_idx = 0; token_idx < token_num; ++token_idx) {
+      int64_t token_position = curr_position_ptr[token_idx];
+      int64_t block_id = curr_block_table_ptr[token_position / block_size];
+      curr_slot_mapping_ptr[token_idx] =
+          block_id * block_size + token_position % block_size;
+    }
+  }
+}
diff --git a/docker/Dockerfile.cpu b/docker/Dockerfile.cpu
index 5f819acc6..abae5d1be 100644
--- a/docker/Dockerfile.cpu
+++ b/docker/Dockerfile.cpu
@@ -161,7 +161,7 @@ RUN ln -s /usr/bin/clangd-14 /usr/bin/clangd
 
 # install development dependencies (for testing)
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install -e tests/vllm_test_utils
+    uv pip install --no-build-isolation -e tests/vllm_test_utils
 
 RUN --mount=type=cache,target=/root/.cache/uv \
     --mount=type=cache,target=/root/.cache/ccache \
diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py
index 0350a2a8b..426b9aa71 100644
--- a/vllm/model_executor/layers/quantization/awq_marlin.py
+++ b/vllm/model_executor/layers/quantization/awq_marlin.py
@@ -309,7 +309,7 @@ class AWQMarlinConfig(QuantizationConfig):
         group_size = quant_config.get("group_size")
         zero_point = quant_config.get("zero_point")
 
-        if not (current_platform.is_cuda_alike() or current_platform.is_cpu()):
+        if not current_platform.is_cuda_alike():
             return False
 
         if quant_method != "awq":
diff --git a/vllm/utils/cpu_triton_utils.py b/vllm/utils/cpu_triton_utils.py
new file mode 100644
index 000000000..d956dde8b
--- /dev/null
+++ b/vllm/utils/cpu_triton_utils.py
@@ -0,0 +1,47 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""
+Contains replacement functions to fallback Triton usages in CPU backend
+"""
+
+from collections.abc import Callable
+
+import torch
+
+
+class _FuncWrapper:
+    def __init__(self, func: Callable) -> None:
+        self.func = func
+
+    def __getitem__(self, *args, **kwargs) -> Callable:
+        return self.func
+
+
+# For _compute_slot_mapping_kernel in vllm/v1/worker/block_table.py
+def _compute_slot_mapping_kernel_impl(
+    num_tokens: int,
+    max_num_tokens: int,
+    query_start_loc: torch.Tensor,  # [num_reqs + 1], int32
+    positions: torch.Tensor,  # [num_tokens], int64
+    block_table: torch.Tensor,  # [max_num_reqs, max_num_blocks_per_req], int32
+    block_table_stride: int,  # max_num_blocks_per_req
+    block_size: int,
+    slot_mapping: torch.Tensor,  # [max_num_tokens], int64
+    TOTAL_CP_WORLD_SIZE: int,
+    TOTAL_CP_RANK: int,
+    CP_KV_CACHE_INTERLEAVE_SIZE: int,
+    PAD_ID: int,
+    BLOCK_SIZE: int,
+) -> None:
+    assert TOTAL_CP_WORLD_SIZE == 1, "Context Parallelism is not supported on CPU."
+    torch.ops._C.compute_slot_mapping_kernel_impl(
+        query_start_loc,
+        positions,
+        block_table,
+        slot_mapping,
+        block_size,
+    )
+
+
+compute_slot_mapping_kernel = _FuncWrapper(_compute_slot_mapping_kernel_impl)
diff --git a/vllm/v1/worker/cpu_model_runner.py b/vllm/v1/worker/cpu_model_runner.py
index 4015b7722..d66aac90a 100644
--- a/vllm/v1/worker/cpu_model_runner.py
+++ b/vllm/v1/worker/cpu_model_runner.py
@@ -6,6 +6,7 @@ from typing import Any
 import torch
 import torch.nn as nn
 
+import vllm.utils.cpu_triton_utils as cpu_tl
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.model_executor.model_loader import get_model
@@ -28,6 +29,7 @@ class CPUModelRunner(GPUModelRunner):
         self.cascade_attn_enabled = False
 
         self._postprocess_tensors()
+        self._postprocess_triton()
 
     def _postprocess_tensors(self) -> None:
         # Note: replace device tensors with cpu tensors
@@ -52,6 +54,13 @@ class CPUModelRunner(GPUModelRunner):
                 if isinstance(v, CpuGpuBuffer):
                     v.gpu = v.cpu
 
+    def _postprocess_triton(self) -> None:
+        import vllm.v1.worker.block_table
+
+        vllm.v1.worker.block_table._compute_slot_mapping_kernel = (
+            cpu_tl.compute_slot_mapping_kernel
+        )
+
     @instrument(span_name="Loading (CPU)")
     def load_model(self, load_dummy_weights: bool = False) -> None:
         if load_dummy_weights: