[AMD] Add support for GGUF quantization on ROCm (#10254)

This commit is contained in:
kliuae
2024-11-23 13:14:49 +08:00
committed by GitHub
parent 02a43f82a9
commit 7c25fe45a6
11 changed files with 234 additions and 211 deletions

View File

@@ -387,7 +387,7 @@ class ModelConfig:
supported_quantization = QUANTIZATION_METHODS
rocm_supported_quantization = [
"awq", "gptq", "fp8", "compressed_tensors", "compressed-tensors",
-        "fbgemm_fp8"
+        "fbgemm_fp8", "gguf"
]
optimized_quantization_methods = [
"fp8", "marlin", "modelopt", "gptq_marlin_24", "gptq_marlin",