[Perf] Disable chunked local attention by default with llama4 (#21761)

Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
This commit is contained in:
Lucas Wilkinson
2025-07-28 18:49:04 -04:00
committed by GitHub
parent 89ac266b26
commit 8aa1485fcf
2 changed files with 29 additions and 6 deletions

View File

@@ -143,6 +143,7 @@ if TYPE_CHECKING:
VLLM_USE_CUDNN_PREFILL: bool = False
VLLM_ENABLE_CUDAGRAPH_GC: bool = False
VLLM_LOOPBACK_IP: str = ""
VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE: bool = False
def get_default_cache_root():
@@ -991,6 +992,17 @@ environment_variables: dict[str, Callable[[], Any]] = {
# The default value is "VLLM".
"VLLM_PROCESS_NAME_PREFIX":
lambda: os.getenv("VLLM_PROCESS_NAME_PREFIX", "VLLM"),
# Allow chunked local attention with the hybrid KV cache manager.
# Currently, using the hybrid KV cache manager with chunked local attention
# in the Llama4 models (currently the only models using chunked local
# attention) causes a latency regression, so it is disabled by default.
# This flag lets users opt in anyway (to save KV-cache memory and enable
# longer contexts).
# TODO(lucas): Remove this flag once the latency regression is resolved.
"VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE":
lambda: bool(int(os.getenv(\
"VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE", "0"))),
}
# --8<-- [end:env-vars-definition]