[Kernel] Add env variable to force flashinfer backend to enable tensor cores (#9497)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com> Co-authored-by: Chih-Chieh Yang <chih.chieh.yang@ibm.com> Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>
This commit is contained in:
@@ -32,6 +32,7 @@ if TYPE_CHECKING:
|
||||
VLLM_ATTENTION_BACKEND: Optional[str] = None
|
||||
VLLM_USE_FLASHINFER_SAMPLER: bool = False
|
||||
VLLM_USE_FLASHINFER_REJECTION_SAMPLER: bool = False
|
||||
VLLM_FLASHINFER_FORCE_TENSOR_CORES: bool = False
|
||||
VLLM_PP_LAYER_PARTITION: Optional[str] = None
|
||||
VLLM_CPU_KVCACHE_SPACE: int = 0
|
||||
VLLM_CPU_OMP_THREADS_BIND: str = ""
|
||||
@@ -286,6 +287,11 @@ environment_variables: Dict[str, Callable[[], Any]] = {
|
||||
"VLLM_USE_FLASHINFER_SAMPLER":
|
||||
lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_SAMPLER", "0"))),
|
||||
|
||||
# If set, vllm will force flashinfer to use tensor cores;
|
||||
# otherwise will use heuristic based on model architecture.
|
||||
"VLLM_FLASHINFER_FORCE_TENSOR_CORES":
|
||||
lambda: bool(int(os.getenv("VLLM_FLASHINFER_FORCE_TENSOR_CORES", "0"))),
|
||||
|
||||
# Pipeline stage partition strategy
|
||||
"VLLM_PP_LAYER_PARTITION":
|
||||
lambda: os.getenv("VLLM_PP_LAYER_PARTITION", None),
|
||||
|
||||
Reference in New Issue
Block a user