[Quantization][FP8] Add support for FP8 models with input_scale for output projection and QK quantization (#15734)
Signed-off-by: Randall Smith <Randall.Smith@amd.com>
Signed-off-by: Luka Govedič <lgovedic@redhat.com>
Co-authored-by: Luka Govedič <lgovedic@redhat.com>
```diff
@@ -3767,6 +3767,17 @@ class VllmConfig:
             return quant_config
         return None
 
+    @staticmethod
+    def get_quantization_config(
+            model_config: ModelConfig,
+            load_config: LoadConfig) -> Optional[QuantizationConfig]:
+        import copy
+
+        # For some reason, the _ version of this modifies the model_config
+        # # object, so using deepcopy to avoid this problem.
+        return VllmConfig._get_quantization_config(copy.deepcopy(model_config),
+                                                   load_config)
+
     def with_hf_config(
         self,
         hf_config: PretrainedConfig,
```
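The new public wrapper exists because the private `_get_quantization_config` mutates the `ModelConfig` it receives; deep-copying the config before delegating keeps the caller's object intact. Below is a minimal, hypothetical sketch of that same copy-before-mutate pattern. The simplified `ModelConfig` stand-in and the hard-coded "fp8" value are assumptions for illustration, not vLLM's actual resolution logic.

```python
import copy
from dataclasses import dataclass
from typing import Optional


@dataclass
class ModelConfig:
    # Hypothetical stand-in for vLLM's ModelConfig; only the field needed
    # to illustrate the mutation problem is modeled here.
    quantization: Optional[str] = None


def _get_quantization_config(model_config: ModelConfig) -> Optional[str]:
    # Stand-in for the private helper: resolving the quantization method
    # mutates the config it receives as a side effect.
    if model_config.quantization is None:
        model_config.quantization = "fp8"
    return model_config.quantization


def get_quantization_config(model_config: ModelConfig) -> Optional[str]:
    # Public wrapper mirroring the pattern in the diff: deep-copy the
    # config first so the caller's object is left untouched.
    return _get_quantization_config(copy.deepcopy(model_config))


cfg = ModelConfig()
print(get_quantization_config(cfg))  # "fp8"
print(cfg.quantization)              # None -- the original was not modified
```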