[Perf] Disable chunked local attention by default with llama4 (#21761)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
This commit is contained in:
@@ -4769,12 +4769,23 @@ class VllmConfig:
|
|||||||
# Hybrid KV cache manager is not compatible with KV events.
|
# Hybrid KV cache manager is not compatible with KV events.
|
||||||
self.scheduler_config.disable_hybrid_kv_cache_manager = True
|
self.scheduler_config.disable_hybrid_kv_cache_manager = True
|
||||||
if self.model_config is not None and \
|
if self.model_config is not None and \
|
||||||
self.model_config.attention_chunk_size is not None and \
|
self.model_config.attention_chunk_size is not None:
|
||||||
self.speculative_config is not None and \
|
if self.speculative_config is not None and \
|
||||||
self.speculative_config.use_eagle():
|
self.speculative_config.use_eagle():
|
||||||
# Hybrid KV cache manager is not yet supported with chunked
|
# Hybrid KV cache manager is not yet supported with chunked
|
||||||
# local attention + eagle.
|
# local attention + eagle.
|
||||||
self.scheduler_config.disable_hybrid_kv_cache_manager = True
|
self.scheduler_config.disable_hybrid_kv_cache_manager = True
|
||||||
|
elif \
|
||||||
|
not envs.VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE:
|
||||||
|
logger.warning(
|
||||||
|
"There is a latency regression when using chunked local"
|
||||||
|
" attention with the hybrid KV cache manager. Disabling"
|
||||||
|
" it, by default. To enable it, set the environment "
|
||||||
|
"VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE=1."
|
||||||
|
)
|
||||||
|
# Hybrid KV cache manager has a latency regression with chunked
|
||||||
|
# local attention, so disable it unless explicitly allowed.
|
||||||
|
self.scheduler_config.disable_hybrid_kv_cache_manager = True
|
||||||
|
|
||||||
def update_sizes_for_sequence_parallelism(self,
|
def update_sizes_for_sequence_parallelism(self,
|
||||||
possible_sizes: list) -> list:
|
possible_sizes: list) -> list:
|
||||||
|
|||||||
12
vllm/envs.py
12
vllm/envs.py
@@ -143,6 +143,7 @@ if TYPE_CHECKING:
|
|||||||
VLLM_USE_CUDNN_PREFILL: bool = False
|
VLLM_USE_CUDNN_PREFILL: bool = False
|
||||||
VLLM_ENABLE_CUDAGRAPH_GC: bool = False
|
VLLM_ENABLE_CUDAGRAPH_GC: bool = False
|
||||||
VLLM_LOOPBACK_IP: str = ""
|
VLLM_LOOPBACK_IP: str = ""
|
||||||
|
VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE: bool = False
|
||||||
|
|
||||||
|
|
||||||
def get_default_cache_root():
|
def get_default_cache_root():
|
||||||
@@ -991,6 +992,17 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
|||||||
# The default value is "VLLM".
|
# The default value is "VLLM".
|
||||||
"VLLM_PROCESS_NAME_PREFIX":
|
"VLLM_PROCESS_NAME_PREFIX":
|
||||||
lambda: os.getenv("VLLM_PROCESS_NAME_PREFIX", "VLLM"),
|
lambda: os.getenv("VLLM_PROCESS_NAME_PREFIX", "VLLM"),
|
||||||
|
|
||||||
|
# Allow chunked local attention with hybrid kv cache manager.
|
||||||
|
# Currently using the Hybrid KV cache manager with chunked local attention
|
||||||
|
# in the Llama4 models (the only models currently using chunked local attn)
|
||||||
|
# causes a latency regression. For this reason, we disable it by default.
|
||||||
|
# This flag is used to allow users to enable it if they want to (to save on
|
||||||
|
# kv-cache memory usage and enable longer contexts).
|
||||||
|
# TODO(lucas): Remove this flag once latency regression is resolved.
|
||||||
|
"VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE":
|
||||||
|
lambda: bool(int(os.getenv(\
|
||||||
|
"VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE", "0"))),
|
||||||
}
|
}
|
||||||
|
|
||||||
# --8<-- [end:env-vars-definition]
|
# --8<-- [end:env-vars-definition]
|
||||||
|
|||||||
Reference in New Issue
Block a user