[Quantization][FP8] Add support for FP8 models with input_scale for output projection and QK quantization (#15734)

Signed-off-by: Randall Smith <Randall.Smith@amd.com>
Signed-off-by: Luka Govedič <lgovedic@redhat.com>
Co-authored-by: Luka Govedič <lgovedic@redhat.com>
Author: rasmith
Date: 2025-04-25 02:45:02 -05:00
Committed by: GitHub
Parent: 6aae216b4e
Commit: a41351f363
8 changed files with 105 additions and 20 deletions
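
Per the title, this commit teaches the FP8 path to consume checkpoints that ship a precomputed input_scale for the attention output projection and for Q/K quantization. As background for the diff below, here is a minimal sketch of static per-tensor scale-based quantization; the names are illustrative and this is not vLLM's kernel code (float16 merely stands in for the 8-bit float cast so the rounding loss is visible):

    import numpy as np

    # Largest finite value representable in the e4m3 FP8 format.
    FP8_E4M3_MAX = 448.0

    def quantize(x: np.ndarray, input_scale: float) -> np.ndarray:
        # Divide by the static scale so the tensor fits the e4m3 range,
        # then clamp to the finite maximum.
        q = np.clip(x / input_scale, -FP8_E4M3_MAX, FP8_E4M3_MAX)
        return q.astype(np.float16)  # stand-in for the real fp8 cast

    def dequantize(q: np.ndarray, input_scale: float) -> np.ndarray:
        return q.astype(np.float32) * input_scale

    x = np.random.randn(4, 8).astype(np.float32)
    # A static per-tensor scale like this is what a checkpoint's
    # input_scale entry would carry, derived offline from calibration.
    input_scale = float(np.abs(x).max()) / FP8_E4M3_MAX
    err = np.abs(x - dequantize(quantize(x, input_scale), input_scale)).max()
    print(f"max round-trip error: {err:.2e}")

Loading the scale from the checkpoint (rather than computing it at runtime) keeps quantization deterministic and matches the scale the model was calibrated with.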


@@ -3767,6 +3767,17 @@ class VllmConfig:
             return quant_config
         return None

+    @staticmethod
+    def get_quantization_config(
+            model_config: ModelConfig,
+            load_config: LoadConfig) -> Optional[QuantizationConfig]:
+        import copy
+
+        # The underscore-prefixed version of this method modifies the
+        # model_config object, so deepcopy first to avoid mutating the
+        # caller's config.
+        return VllmConfig._get_quantization_config(copy.deepcopy(model_config),
+                                                   load_config)
+
     def with_hf_config(
         self,
         hf_config: PretrainedConfig,
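
The new public wrapper exists because, as the in-diff comment notes, the private helper mutates the model_config it receives. The following self-contained sketch shows the same defensive deep-copy pattern; the Config class and helper here are illustrative stand-ins, not vLLM's actual types:

    import copy
    from dataclasses import dataclass, field

    @dataclass
    class Config:
        quant: dict = field(default_factory=lambda: {"method": "fp8"})

    def _get_quantization_config(cfg: Config) -> dict:
        # Side effect: mutates the object it was handed.
        cfg.quant["resolved"] = True
        return cfg.quant

    def get_quantization_config(cfg: Config) -> dict:
        # Deep copy so the mutation stays local to a working copy
        # and the caller's config is left untouched.
        return _get_quantization_config(copy.deepcopy(cfg))

    cfg = Config()
    get_quantization_config(cfg)
    assert "resolved" not in cfg.quant  # caller's config unchanged

An alternative would be to remove the mutation inside the private helper itself; the wrapper instead isolates callers from the side effect without changing the helper's behavior.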