[Hardware][CPU] Support chunked-prefill and prefix-caching on CPU (#10355)

Signed-off-by: jiang1.li <jiang1.li@intel.com>
This commit is contained in:
Li, Jiang
2024-11-20 18:57:39 +08:00
committed by GitHub
parent d5b28447e0
commit 63f1fde277
8 changed files with 558 additions and 368 deletions

View File

@@ -53,11 +53,6 @@ class CpuPlatform(Platform):
cache_config = vllm_config.cache_config
if cache_config.enable_prefix_caching:
logger.warning(
"Prefix caching is not supported on CPU, disable it.")
cache_config.enable_prefix_caching = False
kv_cache_space = envs.VLLM_CPU_KVCACHE_SPACE
if kv_cache_space >= 0:
@@ -74,10 +69,12 @@ class CpuPlatform(Platform):
f" {kv_cache_space}, expect a positive integer value.")
scheduler_config = vllm_config.scheduler_config
if scheduler_config.chunked_prefill_enabled:
logger.warning(
"Chunked prefill is not supported on CPU, disable it.")
scheduler_config.chunked_prefill_enabled = False
if ((scheduler_config.chunked_prefill_enabled
or cache_config.enable_prefix_caching)
and model_config.dtype == torch.half):
logger.warning("Chunked-prefill on the CPU backend only does not"
" support fp16 for now, cast to bf16.")
model_config.dtype = torch.bfloat16
parallel_config = vllm_config.parallel_config
if (parallel_config.distributed_executor_backend is not None