[Kernel] optimize performance of gptq marlin kernel when n is small (#14138)

Signed-off-by: Jinzhen Lin <linjinzhen@hotmail.com>
This commit is contained in:
Jinzhen Lin
2025-03-08 00:53:38 +08:00
committed by GitHub
parent 58abe35455
commit d0feea31c7
6 changed files with 99 additions and 24 deletions

View File

@@ -95,6 +95,7 @@ if TYPE_CHECKING:
VLLM_DP_SIZE: int = 1
VLLM_DP_MASTER_IP: str = ""
VLLM_DP_MASTER_PORT: int = 0
VLLM_MARLIN_USE_ATOMIC_ADD: bool = False
def get_default_cache_root():
@@ -630,6 +631,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
# Whether to use S3 path for model loading in CI via RunAI Streamer
"VLLM_CI_USE_S3":
lambda: os.environ.get("VLLM_CI_USE_S3", "0") == "1",
# Whether to use an atomicAdd-based reduction in the gptq/awq marlin kernels.
"VLLM_MARLIN_USE_ATOMIC_ADD":
lambda: os.environ.get("VLLM_MARLIN_USE_ATOMIC_ADD", "0") == "1",
}
# end-env-vars-definition