[Kernel] optimize performance of gptq marlin kernel when n is small (#14138)

Signed-off-by: Jinzhen Lin <linjinzhen@hotmail.com>
This commit is contained in:
Jinzhen Lin
2025-03-08 00:53:38 +08:00
committed by GitHub
parent 58abe35455
commit d0feea31c7
6 changed files with 99 additions and 24 deletions

View File

@@ -95,6 +95,7 @@ if TYPE_CHECKING:
VLLM_DP_SIZE: int = 1
VLLM_DP_MASTER_IP: str = ""
VLLM_DP_MASTER_PORT: int = 0
VLLM_MARLIN_USE_ATOMIC_ADD: bool = False
def get_default_cache_root():
@@ -630,6 +631,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
# Whether to use S3 path for model loading in CI via RunAI Streamer
"VLLM_CI_USE_S3":
lambda: os.environ.get("VLLM_CI_USE_S3", "0") == "1",
# Whether to use an atomicAdd-based reduction in the gptq/awq marlin kernels.
"VLLM_MARLIN_USE_ATOMIC_ADD":
lambda: os.environ.get("VLLM_MARLIN_USE_ATOMIC_ADD", "0") == "1",
}
# end-env-vars-definition