[Perf] add packed recurrent fast path for decode (#36596)
Signed-off-by: hdj <1293066020@qq.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
This commit is contained in:
@@ -96,6 +96,7 @@ if TYPE_CHECKING:
     VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False
     VLLM_SKIP_P2P_CHECK: bool = False
     VLLM_DISABLED_KERNELS: list[str] = []
+    VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE: bool = True
     VLLM_DISABLE_PYNCCL: bool = False
     VLLM_USE_OINK_OPS: bool = False
     VLLM_ROCM_USE_AITER: bool = False
@@ -899,6 +900,9 @@ environment_variables: dict[str, Callable[[], Any]] = {
     "VLLM_DISABLED_KERNELS": lambda: []
     if "VLLM_DISABLED_KERNELS" not in os.environ
     else os.environ["VLLM_DISABLED_KERNELS"].split(","),
+    "VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE": lambda: bool(
+        int(os.getenv("VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE", "1"))
+    ),
     # Disable pynccl (using torch.distributed instead)
     "VLLM_DISABLE_PYNCCL": lambda: (
         os.getenv("VLLM_DISABLE_PYNCCL", "False").lower() in ("true", "1")
|
||||
Reference in New Issue
Block a user