[Kernel] add triton fused moe kernel for gptq/awq (#12185)
This commit is contained in:
@@ -26,7 +26,8 @@ QUANTIZATION_METHODS: List[str] = [
|
||||
"experts_int8",
|
||||
"neuron_quant",
|
||||
"ipex",
|
||||
- "quark"
|
||||
+ "quark",
|
||||
+ "moe_wna16"
|
||||
]
|
||||
|
||||
# The customized quantization methods which will be added to this dict.
|
||||
@@ -94,6 +95,7 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
|
||||
from .ipex_quant import IPEXConfig
|
||||
from .marlin import MarlinConfig
|
||||
from .modelopt import ModelOptFp8Config
|
||||
+ from .moe_wna16 import MoeWNA16Config
|
||||
from .neuron_quant import NeuronQuantConfig
|
||||
from .qqq import QQQConfig
|
||||
from .tpu_int8 import Int8TpuConfig
|
||||
@@ -121,7 +123,8 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
|
||||
"experts_int8": ExpertsInt8Config,
|
||||
"neuron_quant": NeuronQuantConfig,
|
||||
"ipex": IPEXConfig,
|
||||
- "quark": QuarkConfig
|
||||
+ "quark": QuarkConfig,
|
||||
+ "moe_wna16": MoeWNA16Config,
|
||||
}
|
||||
# Update the `method_to_config` with customized quantization methods.
|
||||
method_to_config.update(_CUSTOMIZED_METHOD_TO_QUANT_CONFIG)
|
||||
|
||||
Reference in New Issue
Block a user