[Model] [Quantization] Support deepseek_v3 w8a8 fp8 block-wise quantization (#11523)

Signed-off-by: mgoin <michael@neuralmagic.com>
Signed-off-by: simon-mo <simon.mo@hey.com>
Signed-off-by: simon-mo <xmo@berkeley.edu>
Co-authored-by: simon-mo <simon.mo@hey.com>
Co-authored-by: simon-mo <xmo@berkeley.edu>
Co-authored-by: HandH1998 <1335248067@qq.com>
Author: Michael Goin
Date: 2024-12-26 18:33:30 -05:00
Committed by: GitHub
Parent commit: 720b10fdc6
Commit: 2072924d14
8 changed files with 931 additions and 70 deletions
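
For context, a minimal sketch of what w8a8 fp8 block-wise weight quantization means here, assuming the 128x128 weight blocks that DeepSeek-V3 checkpoints ship with; quantize_block_fp8 is an illustrative helper, not vLLM's kernel API:

import torch

FP8_MAX = torch.finfo(torch.float8_e4m3fn).max  # 448.0 for e4m3fn

def quantize_block_fp8(weight: torch.Tensor, block: int = 128):
    """Quantize a 2-D weight to fp8 with one float32 scale per block x block tile."""
    n, k = weight.shape
    qweight = torch.empty(n, k, dtype=torch.float8_e4m3fn)
    scales = torch.empty((n + block - 1) // block, (k + block - 1) // block)
    for i in range(scales.shape[0]):
        for j in range(scales.shape[1]):
            tile = weight[i * block:(i + 1) * block, j * block:(j + 1) * block]
            # One scale per tile, chosen so the tile's max maps to the fp8 max.
            scale = tile.abs().amax().clamp(min=1e-12) / FP8_MAX
            scales[i, j] = scale
            qweight[i * block:(i + 1) * block,
                    j * block:(j + 1) * block] = (tile / scale).to(torch.float8_e4m3fn)
    return qweight, scales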


@@ -29,6 +29,7 @@ class FusedMoeWeightScaleSupported(Enum):
     TENSOR = "tensor"
     CHANNEL = "channel"
     GROUP = "group"
+    BLOCK = "block"


 class FusedMoEMethodBase(QuantizeMethodBase):
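
The new BLOCK member extends the supported scale granularities. As an illustration (the block size b and group size g are assumptions; DeepSeek-V3 block-quantized checkpoints use 128 for both), each granularity implies a different scale-tensor shape for a weight of shape (N, K):

# Scale shapes implied by each granularity for a weight of shape (N, K).
N, K, g, b = 4096, 2048, 128, 128
tensor_scale_shape = (1,)              # TENSOR: one scale for the whole tensor
channel_scale_shape = (N, 1)           # CHANNEL: one scale per output row
group_scale_shape = (N, K // g)        # GROUP: one scale per g-wide input group
block_scale_shape = (N // b, K // b)   # BLOCK: one scale per b x b tile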
@@ -199,6 +200,7 @@ class FusedMoE(torch.nn.Module):
                         get_tensor_model_parallel_world_size())
         self.top_k = top_k
         self.num_experts = num_experts
+        assert intermediate_size % self.tp_size == 0
         self.intermediate_size_per_partition = intermediate_size // self.tp_size
         self.reduce_results = reduce_results
         self.renormalize = renormalize
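
The added assert makes the tensor-parallel split fail fast: each rank owns an equal slice of the expert intermediate dimension, so intermediate_size must divide evenly by the TP world size. A minimal sketch of the invariant (the helper name is illustrative):

def partition_intermediate(intermediate_size: int, tp_size: int) -> int:
    # Guard the even split before computing the per-rank slice.
    assert intermediate_size % tp_size == 0, (
        f"intermediate_size {intermediate_size} is not divisible by "
        f"tp_size {tp_size}")
    return intermediate_size // tp_size

# e.g. a MoE intermediate size of 2048 split over 8 GPUs:
assert partition_intermediate(2048, 8) == 256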
@@ -398,7 +400,10 @@ class FusedMoE(torch.nn.Module):
                 loaded_weight=loaded_weight,
                 expert_data=expert_data,
                 tp_rank=tp_rank)
-        elif quant_method == FusedMoeWeightScaleSupported.GROUP.value:
+        elif quant_method in [
+                FusedMoeWeightScaleSupported.GROUP.value,
+                FusedMoeWeightScaleSupported.BLOCK.value,
+        ]:
             self._load_model_weight_or_group_weight_scale(
                 shard_id=shard_id,
                 shard_dim=shard_dim,
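
BLOCK scales can reuse the GROUP loading path because, in both cases, the scale tensor has an axis that shrinks proportionally with the sharded weight dimension, so the loader can narrow it exactly like the weight itself. A hedged sketch of that narrowing (names are illustrative, not vLLM's internal loader):

import torch

def load_sharded_scale(full_scale: torch.Tensor, shard_dim: int,
                       tp_rank: int, tp_size: int) -> torch.Tensor:
    # Take this rank's contiguous slice of the scale tensor along the
    # same dimension on which the weight is sharded.
    shard_size = full_scale.shape[shard_dim] // tp_size
    return full_scale.narrow(shard_dim, tp_rank * shard_size, shard_size)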