[Kernel] Add Exllama as a backend for compressed-tensors (#9395)

2024-10-17 09:48:26 -04:00
parent dbfa8d31d5
commit e312e52b44
7 changed files with 173 additions and 16 deletions
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -66,6 +66,7 @@ if TYPE_CHECKING:
    VLLM_SKIP_P2P_CHECK: bool = False
    VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1: bool = False
    VLLM_TORCH_COMPILE_LEVEL: int = 0
+    VLLM_DISABLED_KERNELS: List[str] = []


 def get_default_cache_root():
@@ -430,6 +431,14 @@ environment_variables: Dict[str, Callable[[], Any]] = {
    "VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1":
    lambda: os.environ.get("VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1", "0"
                           ) == "1",
+
+    # Comma-separated list of quantization kernels that should be disabled,
+    # used for testing and performance comparisons. Currently only affects
+    # MPLinearKernel selection
+    # (kernels: MacheteLinearKernel, MarlinLinearKernel, ExllamaLinearKernel)
+    "VLLM_DISABLED_KERNELS":
+    lambda: [] if "VLLM_DISABLED_KERNELS" not in os.environ else os.environ[
+        "VLLM_DISABLED_KERNELS"].split(","),
 }

 # end-env-vars-definition