[Kernel] add triton fused moe kernel for gptq/awq (#12185)

Jinzhen Lin
2025-01-29 22:07:09 +08:00
committed by GitHub
parent b02fd288b2
commit 27b78c73ca
4 changed files with 874 additions and 55 deletions

@@ -26,7 +26,8 @@ QUANTIZATION_METHODS: List[str] = [
     "experts_int8",
     "neuron_quant",
     "ipex",
-    "quark"
+    "quark",
+    "moe_wna16"
 ]
 # The customized quantization methods which will be added to this dict.
@@ -94,6 +95,7 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
     from .ipex_quant import IPEXConfig
     from .marlin import MarlinConfig
     from .modelopt import ModelOptFp8Config
+    from .moe_wna16 import MoeWNA16Config
     from .neuron_quant import NeuronQuantConfig
     from .qqq import QQQConfig
     from .tpu_int8 import Int8TpuConfig
@@ -121,7 +123,8 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
         "experts_int8": ExpertsInt8Config,
         "neuron_quant": NeuronQuantConfig,
         "ipex": IPEXConfig,
-        "quark": QuarkConfig
+        "quark": QuarkConfig,
+        "moe_wna16": MoeWNA16Config,
     }
     # Update the `method_to_config` with customized quantization methods.
     method_to_config.update(_CUSTOMIZED_METHOD_TO_QUANT_CONFIG)
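
The hunks above register the new "moe_wna16" quantization method, which this commit backs with a fused Triton MoE kernel for GPTQ/AWQ weights. A minimal usage sketch follows; only the quantization string "moe_wna16" comes from this diff, and the checkpoint name is hypothetical (any GPTQ- or AWQ-quantized MoE model would be the intended target):

    # Hedged sketch: selecting the newly registered "moe_wna16" method through
    # vLLM's offline inference API. The model path below is illustrative.
    from vllm import LLM, SamplingParams

    llm = LLM(
        model="some-org/mixtral-gptq-4bit",  # hypothetical GPTQ MoE checkpoint
        quantization="moe_wna16",            # name added to QUANTIZATION_METHODS above
    )
    params = SamplingParams(temperature=0.0, max_tokens=32)
    outputs = llm.generate(["What is a fused MoE kernel?"], params)
    print(outputs[0].outputs[0].text)

Passing quantization="moe_wna16" routes config resolution through get_quantization_config, which now returns MoeWNA16Config via the method_to_config mapping shown in the last hunk.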