[Perf] add packed recurrent fast path for decode (#36596)

Signed-off-by: hdj <1293066020@qq.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
This commit is contained in:
caozuoba
2026-03-12 19:01:57 +08:00
committed by GitHub
parent 06e0bc21d2
commit 9e19f8338b
5 changed files with 402 additions and 4 deletions

View File

@@ -96,6 +96,7 @@ if TYPE_CHECKING:
VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False
VLLM_SKIP_P2P_CHECK: bool = False
VLLM_DISABLED_KERNELS: list[str] = []
VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE: bool = True
VLLM_DISABLE_PYNCCL: bool = False
VLLM_USE_OINK_OPS: bool = False
VLLM_ROCM_USE_AITER: bool = False
@@ -899,6 +900,9 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_DISABLED_KERNELS": lambda: []
if "VLLM_DISABLED_KERNELS" not in os.environ
else os.environ["VLLM_DISABLED_KERNELS"].split(","),
"VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE": lambda: bool(
int(os.getenv("VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE", "1"))
),
# Disable pynccl (using torch.distributed instead)
"VLLM_DISABLE_PYNCCL": lambda: (
os.getenv("VLLM_DISABLE_PYNCCL", "False").lower() in ("true", "1")