[Core] Support Local Chunked Attention for Hybrid KV Cache (#19351)
Signed-off-by: Lucia Fang <fanglu@fb.com> Signed-off-by: Lu Fang <fanglu@meta.com> Signed-off-by: Lu Fang <fanglu@fb.com> Co-authored-by: Lu Fang <fanglu@meta.com>
This commit is contained in:
@@ -4722,6 +4722,13 @@ class VllmConfig:
|
||||
if self.kv_events_config is not None:
|
||||
# Hybrid KV cache manager is not compatible with KV events.
|
||||
self.scheduler_config.disable_hybrid_kv_cache_manager = True
|
||||
if self.model_config is not None and \
|
||||
self.model_config.attention_chunk_size is not None and \
|
||||
self.speculative_config is not None and \
|
||||
self.speculative_config.use_eagle():
|
||||
# Hybrid KV cache manager is not yet supported with chunked
|
||||
# local attention + eagle.
|
||||
self.scheduler_config.disable_hybrid_kv_cache_manager = True
|
||||
|
||||
def update_sizes_for_sequence_parallelism(self,
|
||||
possible_sizes: list) -> list:
|
||||
|
||||
Reference in New Issue
Block a user