[Perf] add packed recurrent fast path for decode (#36596)

Signed-off-by: hdj <1293066020@qq.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
2026-03-12 19:01:57 +08:00
parent 06e0bc21d2
commit 9e19f8338b
5 changed files with 402 additions and 4 deletions
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -96,6 +96,7 @@ if TYPE_CHECKING:
    VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False
    VLLM_SKIP_P2P_CHECK: bool = False
    VLLM_DISABLED_KERNELS: list[str] = []
+    VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE: bool = True
    VLLM_DISABLE_PYNCCL: bool = False
    VLLM_USE_OINK_OPS: bool = False
    VLLM_ROCM_USE_AITER: bool = False
@@ -899,6 +900,9 @@ environment_variables: dict[str, Callable[[], Any]] = {
    "VLLM_DISABLED_KERNELS": lambda: []
    if "VLLM_DISABLED_KERNELS" not in os.environ
    else os.environ["VLLM_DISABLED_KERNELS"].split(","),
+    "VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE": lambda: bool(
+        int(os.getenv("VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE", "1"))
+    ),
    # Disable pynccl (using torch.distributed instead)
    "VLLM_DISABLE_PYNCCL": lambda: (
        os.getenv("VLLM_DISABLE_PYNCCL", "False").lower() in ("true", "1")