[Model] [Quantization] Support deepseek_v3 w8a8 fp8 block-wise quantization (#11523)

Signed-off-by: mgoin <michael@neuralmagic.com>
Signed-off-by: simon-mo <simon.mo@hey.com>
Signed-off-by: simon-mo <xmo@berkeley.edu>
Co-authored-by: simon-mo <simon.mo@hey.com>
Co-authored-by: simon-mo <xmo@berkeley.edu>
Co-authored-by: HandH1998 <1335248067@qq.com>
This commit is contained in:
Michael Goin
2024-12-26 18:33:30 -05:00
committed by GitHub
parent 720b10fdc6
commit 2072924d14
8 changed files with 931 additions and 70 deletions

View File

@@ -161,7 +161,7 @@ class ModelConfig:
override default pooling config for the pooling model.
logits_processor_pattern: Optional regex pattern specifying valid
logits processor qualified names that can be passed with the
`logits_processors` extra completion argument. Defaults to None,
`logits_processors` extra completion argument. Defaults to None,
which allows no processors.
generation_config: Configuration parameter file for generation.
"""
@@ -364,7 +364,7 @@ class ModelConfig:
def maybe_pull_model_tokenizer_for_s3(self, model: str,
tokenizer: str) -> None:
"""
Pull the model config or tokenizer to a temporary
Pull the model config or tokenizer to a temporary
directory in case of S3.
Args:
@@ -866,14 +866,14 @@ class ModelConfig:
def get_diff_sampling_param(self) -> Dict[str, Any]:
"""
This method returns a dictionary containing the parameters
that differ from the default sampling parameters, but only
if `generation_config` is set. If `generation_config` is not
This method returns a dictionary containing the parameters
that differ from the default sampling parameters, but only
if `generation_config` is set. If `generation_config` is not
set, an empty dictionary is returned.
Returns:
Dict[str, Any]: A dictionary with the differing sampling
parameters if `generation_config` is set, otherwise an
Dict[str, Any]: A dictionary with the differing sampling
parameters if `generation_config` is set, otherwise an
empty dictionary.
"""
if self.generation_config is None: