[Quantization][FP8] Add support for FP8 models with input_scale for output projection and QK quantization (#15734)

Signed-off-by: Randall Smith <Randall.Smith@amd.com>
Signed-off-by: Luka Govedič <lgovedic@redhat.com>
Co-authored-by: Luka Govedič <lgovedic@redhat.com>
Author: rasmith
Date: 2025-04-25 02:45:02 -05:00
Committed by: GitHub
Parent: 6aae216b4e
Commit: a41351f363
8 changed files with 105 additions and 20 deletions
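
Per the title, this commit teaches the FP8 path to consume checkpoints that ship a precomputed input_scale for the attention output projection and for Q/K quantization. As background for the diff below, here is a minimal sketch of static per-tensor scale-based quantization; the names are illustrative and this is not vLLM's kernel code (float16 merely stands in for the 8-bit float cast so the rounding loss is visible):

    import numpy as np

    # Largest finite value representable in the e4m3 FP8 format.
    FP8_E4M3_MAX = 448.0

    def quantize(x: np.ndarray, input_scale: float) -> np.ndarray:
        # Divide by the static scale so the tensor fits the e4m3 range,
        # then clamp to the finite maximum.
        q = np.clip(x / input_scale, -FP8_E4M3_MAX, FP8_E4M3_MAX)
        return q.astype(np.float16)  # stand-in for the real fp8 cast

    def dequantize(q: np.ndarray, input_scale: float) -> np.ndarray:
        return q.astype(np.float32) * input_scale

    x = np.random.randn(4, 8).astype(np.float32)
    # A static per-tensor scale like this is what a checkpoint's
    # input_scale entry would carry, derived offline from calibration.
    input_scale = float(np.abs(x).max()) / FP8_E4M3_MAX
    err = np.abs(x - dequantize(quantize(x, input_scale), input_scale)).max()
    print(f"max round-trip error: {err:.2e}")

Loading the scale from the checkpoint (rather than computing it at runtime) keeps quantization deterministic and matches the scale the model was calibrated with.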


@@ -3767,6 +3767,17 @@ class VllmConfig:
             return quant_config
         return None

+    @staticmethod
+    def get_quantization_config(
+            model_config: ModelConfig,
+            load_config: LoadConfig) -> Optional[QuantizationConfig]:
+        import copy
+
+        # The underscore-prefixed version of this method modifies the
+        # model_config object, so deepcopy first to avoid mutating the
+        # caller's config.
+        return VllmConfig._get_quantization_config(copy.deepcopy(model_config),
+                                                   load_config)
+
     def with_hf_config(
         self,
         hf_config: PretrainedConfig,
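
The new public wrapper exists because, as the in-diff comment notes, the private helper mutates the model_config it receives. The following self-contained sketch shows the same defensive deep-copy pattern; the Config class and helper here are illustrative stand-ins, not vLLM's actual types:

    import copy
    from dataclasses import dataclass, field

    @dataclass
    class Config:
        quant: dict = field(default_factory=lambda: {"method": "fp8"})

    def _get_quantization_config(cfg: Config) -> dict:
        # Side effect: mutates the object it was handed.
        cfg.quant["resolved"] = True
        return cfg.quant

    def get_quantization_config(cfg: Config) -> dict:
        # Deep copy so the mutation stays local to a working copy
        # and the caller's config is left untouched.
        return _get_quantization_config(copy.deepcopy(cfg))

    cfg = Config()
    get_quantization_config(cfg)
    assert "resolved" not in cfg.quant  # caller's config unchanged

An alternative would be to remove the mutation inside the private helper itself; the wrapper instead isolates callers from the side effect without changing the helper's behavior.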