Improve enable chunked_prefill & prefix_caching logic. (#26623)

Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
Signed-off-by: wang.yuqi <noooop@126.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
This commit is contained in:
wang.yuqi
2025-11-28 14:05:48 +08:00
committed by GitHub
parent 37b15e97e8
commit f4b76056ee
11 changed files with 456 additions and 133 deletions

View File

@@ -119,11 +119,12 @@ class EngineCore:
# Setup scheduler.
Scheduler = vllm_config.scheduler_config.get_scheduler_cls()
- if len(kv_cache_config.kv_cache_groups) == 0:
+ if len(kv_cache_config.kv_cache_groups) == 0: # noqa: SIM102
  # Encoder models without KV cache don't support
  # chunked prefill. But do SSM models?
- logger.info("Disabling chunked prefill for model without KVCache")
- vllm_config.scheduler_config.enable_chunked_prefill = False
+ if vllm_config.scheduler_config.enable_chunked_prefill:
+ logger.warning("Disabling chunked prefill for model without KVCache")
+ vllm_config.scheduler_config.enable_chunked_prefill = False
scheduler_block_size = (
vllm_config.cache_config.block_size