[ Misc ] Apply MoE Refactor to Deepseekv2 To Support Fp8 (#6417)
@@ -377,7 +377,10 @@ class Fp8MoEMethod(FusedMoEMethodBase):
               x: torch.Tensor,
               router_logits: torch.Tensor,
               top_k: int,
-              renormalize: bool = True) -> torch.Tensor:
+              renormalize: bool = True,
+              use_grouped_topk: bool = False,
+              num_expert_group: Optional[int] = None,
+              topk_group: Optional[int] = None) -> torch.Tensor:

         return fused_moe(x,
                          layer.w13_weight,
@@ -390,7 +393,10 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                          w1_scale=layer.w13_scale,
                          w2_scale=layer.w2_scale,
                          a1_scale=layer.a13_scale,
-                         a2_scale=layer.a2_scale)
+                         a2_scale=layer.a2_scale,
+                         use_grouped_topk=use_grouped_topk,
+                         num_expert_group=num_expert_group,
+                         topk_group=topk_group)


 class Fp8KVCacheMethod(QuantizeMethodBase):
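The new arguments default to `use_grouped_topk=False` and `None` for the group settings, so existing FP8 MoE models are unaffected; DeepSeek-V2 passes its group configuration through to the fused kernel. For context, the sketch below shows how grouped top-k routing of this kind typically works: experts are partitioned into `num_expert_group` groups, only the `topk_group` best-scoring groups stay eligible, and the usual top-k is then taken over the surviving experts. This is an illustrative sketch only, not vLLM's fused implementation; the helper name `grouped_topk` and its tensor shapes are assumptions for the example.

# Illustrative sketch (not vLLM's fused kernel) of grouped top-k routing,
# using the parameter names introduced in this diff.
import torch


def grouped_topk(router_logits: torch.Tensor,
                 top_k: int,
                 num_expert_group: int,
                 topk_group: int,
                 renormalize: bool = True):
    # router_logits: [num_tokens, num_experts]; assumes
    # num_experts % num_expert_group == 0.
    num_tokens, num_experts = router_logits.shape
    scores = torch.softmax(router_logits, dim=-1)

    # Score each group by its best expert: [num_tokens, num_expert_group].
    group_scores = scores.view(num_tokens, num_expert_group, -1).max(dim=-1).values

    # Keep only the top `topk_group` groups; mask experts in all other groups.
    group_idx = torch.topk(group_scores, k=topk_group, dim=-1).indices
    group_mask = torch.zeros_like(group_scores)
    group_mask.scatter_(1, group_idx, 1)
    expert_mask = group_mask.unsqueeze(-1).expand(
        num_tokens, num_expert_group, num_experts // num_expert_group
    ).reshape(num_tokens, num_experts)
    masked_scores = scores.masked_fill(expert_mask == 0, 0.0)

    # Standard top-k over the surviving experts.
    topk_weights, topk_ids = torch.topk(masked_scores, k=top_k, dim=-1)
    if renormalize:
        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
    return topk_weights, topk_ids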