[Kernel] Add Exllama as a backend for compressed-tensors (#9395)
This commit is contained in:
@@ -66,6 +66,7 @@ if TYPE_CHECKING:
|
||||
VLLM_SKIP_P2P_CHECK: bool = False
|
||||
VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1: bool = False
|
||||
VLLM_TORCH_COMPILE_LEVEL: int = 0
|
||||
VLLM_DISABLED_KERNELS: List[str] = []
|
||||
|
||||
|
||||
def get_default_cache_root():
|
||||
@@ -430,6 +431,14 @@ environment_variables: Dict[str, Callable[[], Any]] = {
|
||||
"VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1":
|
||||
lambda: os.environ.get("VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1", "0"
|
||||
) == "1",
|
||||
|
||||
# List of quantization kernels that should be disabled, used for testing
|
||||
# and performance comparisons. Currently only affects MPLinearKernel
|
||||
# selection
|
||||
# (kernels: MacheteLinearKernel, MarlinLinearKernel, ExllamaLinearKernel); comma-separated
|
||||
"VLLM_DISABLED_KERNELS":
|
||||
lambda: [] if "VLLM_DISABLED_KERNELS" not in os.environ else os.environ[
|
||||
"VLLM_DISABLED_KERNELS"].split(","),
|
||||
}
|
||||
|
||||
# end-env-vars-definition
|
||||
|
||||
Reference in New Issue
Block a user