[Feature][Hardware][Amd] Add fp8 Linear Layer for Rocm (#7210)

Charlie Fu
2024-08-16 12:06:30 -05:00
committed by GitHub
parent ec724a725e
commit e837b624f2
7 changed files with 164 additions and 49 deletions


@@ -240,7 +240,7 @@ class ModelConfig:
     def _verify_quantization(self) -> None:
         supported_quantization = [*QUANTIZATION_METHODS]
-        rocm_supported_quantization = ["gptq", "squeezellm"]
+        rocm_supported_quantization = ["gptq", "squeezellm", "fp8"]
         optimized_quantization_methods = [
             "fp8", "marlin", "gptq_marlin_24", "gptq_marlin", "awq_marlin",
             "fbgemm_fp8", "compressed_tensors", "compressed-tensors"