[Kernel] W8A16 Int8 inside FusedMoE (#7415)

commit 7fc23be81c (parent e837b624f2)
Author: Mor Zusman
Date:   2024-08-16 20:06:51 +03:00
Committed by: GitHub
15 changed files with 412 additions and 136 deletions
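
The commit title describes the substance of the change: running the expert GEMMs inside FusedMoE with int8 weights and fp16 activations (W8A16), dequantizing per output channel on the fly. As a rough illustration only, here is a numpy sketch of symmetric per-channel W8A16 quantization; the helper names (`quantize_w8`, `w8a16_matmul`) are hypothetical, and the real kernel fuses the dequantize into the MoE GEMM rather than materializing fp16 weights:

```python
import numpy as np

def quantize_w8(w_fp16: np.ndarray):
    # Symmetric per-output-channel scale: map the max magnitude to 127.
    scale = np.abs(w_fp16).max(axis=1, keepdims=True) / 127.0
    scale = np.maximum(scale, 1e-8)  # guard against all-zero rows
    w_int8 = np.clip(np.round(w_fp16 / scale), -128, 127).astype(np.int8)
    return w_int8, scale.astype(np.float16)

def w8a16_matmul(x_fp16: np.ndarray, w_int8: np.ndarray, scale: np.ndarray):
    # Dequantize to fp16 and multiply; a fused kernel would do this per
    # tile inside the expert GEMM instead of building w_fp16 up front.
    w_fp16 = w_int8.astype(np.float16) * scale
    return x_fp16 @ w_fp16.T
```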

@@ -243,7 +243,8 @@ class ModelConfig:
         rocm_supported_quantization = ["gptq", "squeezellm", "fp8"]
         optimized_quantization_methods = [
             "fp8", "marlin", "gptq_marlin_24", "gptq_marlin", "awq_marlin",
-            "fbgemm_fp8", "compressed_tensors", "compressed-tensors"
+            "fbgemm_fp8", "compressed_tensors", "compressed-tensors",
+            "experts_int8"
         ]
         tpu_supported_quantization = ["tpu_int8"]
         if self.quantization is not None:
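
With the whitelist entry in place, the new method should be selectable like any other vLLM quantization flag. A minimal usage sketch, assuming `experts_int8` is passed through the standard `quantization` argument (the model name is just a placeholder MoE checkpoint):

```python
from vllm import LLM

# Placeholder MoE model; "experts_int8" is the method name added above.
llm = LLM(model="mistralai/Mixtral-8x7B-Instruct-v0.1",
          quantization="experts_int8")
print(llm.generate("The capital of France is")[0].outputs[0].text)
```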