[Core] Support Local Chunked Attention for Hybrid KV Cache (#19351)

Signed-off-by: Lucia Fang <fanglu@fb.com>
Signed-off-by: Lu Fang <fanglu@meta.com>
Signed-off-by: Lu Fang <fanglu@fb.com>
Co-authored-by: Lu Fang <fanglu@meta.com>
2025-07-19 11:48:38 +08:00
parent 466e878f2a
commit 9a9fda1423
9 changed files with 351 additions and 19 deletions
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -4722,6 +4722,13 @@ class VllmConfig:
            if self.kv_events_config is not None:
                # Hybrid KV cache manager is not compatible with KV events.
                self.scheduler_config.disable_hybrid_kv_cache_manager = True
+            if self.model_config is not None and \
+                self.model_config.attention_chunk_size is not None and \
+                self.speculative_config is not None and \
+                self.speculative_config.use_eagle():
+                # Hybrid KV cache manager is not yet supported with chunked
+                # local attention + eagle.
+                self.scheduler_config.disable_hybrid_kv_cache_manager = True

    def update_sizes_for_sequence_parallelism(self,
                                              possible_sizes: list) -> list: