[V1] Implement sliding window attention in kv_cache_manager (#14097)

Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-04-01 15:33:17 +08:00
parent c7e63aa4d8
commit 3a5f0afcd2
15 changed files with 662 additions and 158 deletions
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1116,8 +1116,7 @@ class CacheConfig:
        is_attention_free: Whether the model is attention-free.
        num_gpu_blocks_override: Number of GPU blocks to use. This overrides the
            profiled num_gpu_blocks if specified. Does nothing if None.
-        sliding_window: Sliding window size for the KV cache. Can not work with
-            prefix caching enabled.
+        sliding_window: Sliding window size for the KV cache.
        enable_prefix_caching: Whether to enable prefix caching.
        cpu_offload_gb: Size of the CPU offload buffer in GiB.
    """