[Kernel] Support Microsoft Runtime Kernel Lib for our Low Precision Computation - BitBLAS (#6036)

Signed-off-by: xinyuxiao <xinyuxiao2024@gmail.com>
Co-authored-by: xinyuxiao <xinyuxiao2024@gmail.com>
2025-04-22 16:01:36 +08:00
parent c4ab9f3e71
commit 8d32dc603d
15 changed files with 1864 additions and 7 deletions
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -750,7 +750,8 @@ class ModelConfig:
        optimized_quantization_methods = [
            "fp8", "marlin", "modelopt", "gptq_marlin_24", "gptq_marlin",
            "awq_marlin", "fbgemm_fp8", "compressed_tensors",
-            "compressed-tensors", "experts_int8", "quark", "nvfp4"
+            "compressed-tensors", "experts_int8", "quark", "nvfp4", "bitblas",
+            "gptq_bitblas"
        ]
        if self.quantization is not None:
            self.quantization = self.quantization.lower()