Enable prefix caching with full cuda graphs (#19617)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
@@ -4495,7 +4495,6 @@ class VllmConfig:
                 "full_cuda_graph is not supported with "
                 "cascade attention. Disabling cascade attention.")
             self.model_config.disable_cascade_attn = True
-            self.cache_config.enable_prefix_caching = False
 
         if (self.kv_events_config is not None
                 and self.kv_events_config.enable_kv_cache_events
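With the forced `enable_prefix_caching = False` removed, enabling full CUDA graphs no longer turns prefix caching off (only cascade attention is still disabled). Below is a minimal sketch of requesting both features together, assuming a vLLM release that includes this change; the `enable_prefix_caching` and `compilation_config` constructor arguments exist in recent vLLM, but passing `full_cuda_graph` as a dict key here is an assumption about this version, and the model name is only an example.

# Sketch: prefix caching together with full CUDA graphs (assumes a vLLM
# build containing #19617; argument spelling is an assumption).
from vllm import LLM, SamplingParams

llm = LLM(
    model="facebook/opt-125m",              # example model
    enable_prefix_caching=True,             # no longer forced off by full_cuda_graph
    compilation_config={"full_cuda_graph": True},
)

# Two prompts sharing a long common prefix: with prefix caching enabled,
# the shared KV blocks should be reused across the two requests.
prefix = "You are a helpful assistant. " * 20
outputs = llm.generate(
    [prefix + "What is CUDA?", prefix + "What is a KV cache?"],
    SamplingParams(max_tokens=32),
)
for out in outputs:
    print(out.outputs[0].text)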