[Kernel] W8A16 Int8 inside FusedMoE (#7415)

commit 7fc23be81c (parent e837b624f2)
Author: Mor Zusman
Date:   2024-08-16 20:06:51 +03:00
Committed by: GitHub
15 changed files with 412 additions and 136 deletions
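
The commit title describes the substance of the change: running the expert GEMMs inside FusedMoE with int8 weights and fp16 activations (W8A16), dequantizing per output channel on the fly. As a rough illustration only, here is a numpy sketch of symmetric per-channel W8A16 quantization; the helper names (`quantize_w8`, `w8a16_matmul`) are hypothetical, and the real kernel fuses the dequantize into the MoE GEMM rather than materializing fp16 weights:

```python
import numpy as np

def quantize_w8(w_fp16: np.ndarray):
    # Symmetric per-output-channel scale: map the max magnitude to 127.
    scale = np.abs(w_fp16).max(axis=1, keepdims=True) / 127.0
    scale = np.maximum(scale, 1e-8)  # guard against all-zero rows
    w_int8 = np.clip(np.round(w_fp16 / scale), -128, 127).astype(np.int8)
    return w_int8, scale.astype(np.float16)

def w8a16_matmul(x_fp16: np.ndarray, w_int8: np.ndarray, scale: np.ndarray):
    # Dequantize to fp16 and multiply; a fused kernel would do this per
    # tile inside the expert GEMM instead of building w_fp16 up front.
    w_fp16 = w_int8.astype(np.float16) * scale
    return x_fp16 @ w_fp16.T
```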

@@ -243,7 +243,8 @@ class ModelConfig:
         rocm_supported_quantization = ["gptq", "squeezellm", "fp8"]
         optimized_quantization_methods = [
             "fp8", "marlin", "gptq_marlin_24", "gptq_marlin", "awq_marlin",
-            "fbgemm_fp8", "compressed_tensors", "compressed-tensors"
+            "fbgemm_fp8", "compressed_tensors", "compressed-tensors",
+            "experts_int8"
         ]
         tpu_supported_quantization = ["tpu_int8"]
         if self.quantization is not None:
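
With the whitelist entry in place, the new method should be selectable like any other vLLM quantization flag. A minimal usage sketch, assuming `experts_int8` is passed through the standard `quantization` argument (the model name is just a placeholder MoE checkpoint):

```python
from vllm import LLM

# Placeholder MoE model; "experts_int8" is the method name added above.
llm = LLM(model="mistralai/Mixtral-8x7B-Instruct-v0.1",
          quantization="experts_int8")
print(llm.generate("The capital of France is")[0].outputs[0].text)
```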