In-Tree AMD Zen CPU Backend via zentorch [1/N] (#35970)
Signed-off-by: Lalithnarayan C <Lalithnarayan.C@amd.com>
Signed-off-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
Co-authored-by: Chinmay-Kulkarni-AMD <Chinmay.Kulkarni@amd.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
@@ -51,6 +51,7 @@ if TYPE_CHECKING:
     VLLM_CPU_OMP_THREADS_BIND: str = "auto"
     VLLM_CPU_NUM_OF_RESERVED_CPU: int | None = None
     VLLM_CPU_SGL_KERNEL: bool = False
+    VLLM_ZENTORCH_WEIGHT_PREPACK: bool = True
     VLLM_XLA_CACHE_PATH: str = os.path.join(VLLM_CACHE_ROOT, "xla_cache")
     VLLM_XLA_CHECK_RECOMPILATION: bool = False
     VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE: Literal["auto", "nccl", "shm"] = "auto"
@@ -709,6 +710,11 @@ environment_variables: dict[str, Callable[[], Any]] = {
     else None,
     # (CPU backend only) whether to use SGL kernels, optimized for small batch.
     "VLLM_CPU_SGL_KERNEL": lambda: bool(int(os.getenv("VLLM_CPU_SGL_KERNEL", "0"))),
+    # (Zen CPU backend) eagerly prepack weights into ZenDNN blocked layout
+    # at model load time. Eliminates per-inference layout conversion overhead.
+    "VLLM_ZENTORCH_WEIGHT_PREPACK": lambda: bool(
+        int(os.getenv("VLLM_ZENTORCH_WEIGHT_PREPACK", "1"))
+    ),
     # If the env var is set, Ray Compiled Graph uses the specified
     # channel type to communicate between workers belonging to
     # different pipeline-parallel stages.
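Note for reviewers unfamiliar with the envs.py pattern: entries in environment_variables are lambdas that are resolved lazily when the name is accessed as an attribute of vllm.envs, so the process environment must be set before the value is first read. A minimal sketch of how a Zen CPU code path could consult the new flag, assuming the usual vllm.envs lazy-lookup behavior (nothing below is code from this PR):

    import os

    # Opting out of eager prepacking must happen before vllm.envs evaluates
    # the lambda, which reads os.getenv with a default of "1".
    os.environ["VLLM_ZENTORCH_WEIGHT_PREPACK"] = "0"

    import vllm.envs as envs

    # Evaluates bool(int(os.getenv("VLLM_ZENTORCH_WEIGHT_PREPACK", "1")))
    print(envs.VLLM_ZENTORCH_WEIGHT_PREPACK)  # False here

    if envs.VLLM_ZENTORCH_WEIGHT_PREPACK:
        # Prepack weights into ZenDNN blocked layout once at load time
        # (the actual prepacking lands elsewhere in this PR series).
        pass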
@@ -1768,6 +1774,7 @@ def compile_factors() -> dict[str, object]:
         "VLLM_V1_OUTPUT_PROC_CHUNK_SIZE",
         "VLLM_CPU_KVCACHE_SPACE",
         "VLLM_CPU_MOE_PREPACK",
+        "VLLM_ZENTORCH_WEIGHT_PREPACK",
         "VLLM_TEST_FORCE_LOAD_FORMAT",
         "VLLM_ENABLE_CUDA_COMPATIBILITY",
         "VLLM_CUDA_COMPATIBILITY_PATH",
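Adding the variable to compile_factors() means toggling it changes the set of factors that compilation caching is keyed on, so cached artifacts built with one prepack setting are not silently reused with the other. A rough illustration of how such a factor dict could feed a cache key; the hashing shown is illustrative only, not vLLM's actual cache-key computation:

    import hashlib
    import json

    import vllm.envs as envs

    # compile_factors() gathers env vars that can change compiled behavior.
    factors = envs.compile_factors()

    # Illustrative key: flipping VLLM_ZENTORCH_WEIGHT_PREPACK changes it.
    key = hashlib.sha256(
        json.dumps(factors, sort_keys=True, default=str).encode()
    ).hexdigest()
    print(key[:16])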