[Model] [Quantization] Support deepseek_v3 w8a8 fp8 block-wise quantization (#11523)

Signed-off-by: mgoin <michael@neuralmagic.com>
Signed-off-by: simon-mo <simon.mo@hey.com>
Signed-off-by: simon-mo <xmo@berkeley.edu>
Co-authored-by: simon-mo <simon.mo@hey.com>
Co-authored-by: simon-mo <xmo@berkeley.edu>
Co-authored-by: HandH1998 <1335248067@qq.com>
Authored by Michael Goin on 2024-12-26 18:33:30 -05:00, committed by GitHub
parent 720b10fdc6
commit 2072924d14
8 changed files with 931 additions and 70 deletions

@@ -328,6 +328,15 @@ class PackedvLLMParameter(ModelWeightParameter):
            marlin_tile_size=self.marlin_tile_size)


class BlockQuantScaleParameter(_ColumnvLLMParameter, RowvLLMParameter):
    """
    Parameter class for weight scales loaded for weights with
    block-wise quantization. Uses both column and row parallelism.
    """
    pass


def permute_param_layout_(param: BasevLLMParameter, input_dim: int,
                          output_dim: int, **kwargs) -> BasevLLMParameter:
    """
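For context on what `BlockQuantScaleParameter` stores: in block-wise w8a8 fp8 quantization, the weight matrix is tiled into fixed-size blocks and each block gets its own scale, rather than one scale per tensor or per channel. The sketch below illustrates that scale computation only; it is a hypothetical illustration using numpy, not the PR's actual kernels, and `FP8_MAX = 448.0` assumes the float8_e4m3 format's maximum representable value.

```python
import numpy as np

FP8_MAX = 448.0  # assumed max representable value of float8_e4m3

def blockwise_quantize(weight: np.ndarray, block: int = 128):
    """Illustrative block-wise quantization of a 2-D weight.

    Each (block x block) tile gets one scale = amax(tile) / FP8_MAX, so the
    scaled tile fits the fp8 range. Returns the scaled weight (kept in float32
    here as a stand-in for fp8) and the per-block scale grid -- the grid is
    what a BlockQuantScaleParameter-style tensor would hold.
    """
    rows, cols = weight.shape
    n_r = (rows + block - 1) // block  # number of block rows (ceil division)
    n_c = (cols + block - 1) // block  # number of block columns
    scales = np.empty((n_r, n_c), dtype=np.float32)
    q = np.empty_like(weight, dtype=np.float32)
    for i in range(n_r):
        for j in range(n_c):
            tile = weight[i * block:(i + 1) * block, j * block:(j + 1) * block]
            s = np.abs(tile).max() / FP8_MAX
            s = s if s > 0 else 1.0  # avoid dividing an all-zero tile by 0
            scales[i, j] = s
            q[i * block:(i + 1) * block, j * block:(j + 1) * block] = tile / s
    return q, scales
```

Dequantization multiplies each tile by its scale, which also suggests why the scale parameter needs both column and row sharding: when the weight is split across tensor-parallel ranks along either dimension, the scale grid must be split the same way so each rank keeps the scales for its own tiles.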