[Model] Adding support for MSFT Phi-3.5-MoE (#7729)

Co-authored-by: Your Name <you@example.com>
Co-authored-by: Zeqi Lin <zelin@microsoft.com>
Co-authored-by: Zeqi Lin <Zeqi.Lin@microsoft.com>
Wenxiang
2024-08-31 03:42:57 +08:00
committed by GitHub
parent 2684efc467
commit 1248e8506a
13 changed files with 1254 additions and 81 deletions


@@ -1,4 +1,4 @@
-from typing import Any, Dict, List, Optional
+from typing import Any, Callable, Dict, List, Optional
 
 import torch
 from torch.nn import Module
@@ -468,15 +468,18 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                 requires_grad=False)
         return
 
-    def apply(self,
-              layer: torch.nn.Module,
-              x: torch.Tensor,
-              router_logits: torch.Tensor,
-              top_k: int,
-              renormalize: bool,
-              use_grouped_topk: bool,
-              topk_group: Optional[int] = None,
-              num_expert_group: Optional[int] = None) -> torch.Tensor:
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        router_logits: torch.Tensor,
+        top_k: int,
+        renormalize: bool,
+        use_grouped_topk: bool,
+        topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
+        custom_routing_function: Optional[Callable] = None,
+    ) -> torch.Tensor:
 
         from vllm.model_executor.layers.fused_moe import fused_experts
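
For context, the new custom_routing_function hook lets a model replace the default top-k routing with its own logic (Phi-3.5-MoE plugs its routing function in through this path). Below is a minimal sketch of a conforming callable; the (hidden_states, gating_output, topk, renormalize) -> (topk_weights, topk_ids) contract is inferred from the call site in this diff, and the function name is illustrative, not taken from the PR.

from typing import Tuple

import torch


def softmax_topk_routing(
    hidden_states: torch.Tensor,   # per-token activations; unused by this router
    gating_output: torch.Tensor,   # [num_tokens, num_experts] router logits
    topk: int,
    renormalize: bool,
) -> Tuple[torch.Tensor, torch.Tensor]:
    # Softmax over the expert dimension, then keep the top-k experts per token.
    scores = torch.softmax(gating_output, dim=-1)
    topk_weights, topk_ids = torch.topk(scores, topk, dim=-1)
    if renormalize:
        # Rescale the kept weights so they sum to 1 for each token.
        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
    return topk_weights, topk_ids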
@@ -487,7 +490,8 @@ class Fp8MoEMethod(FusedMoEMethodBase):
             top_k=top_k,
             renormalize=renormalize,
             topk_group=topk_group,
-            num_expert_group=num_expert_group)
+            num_expert_group=num_expert_group,
+            custom_routing_function=custom_routing_function)
         return fused_experts(x,
                              layer.w13_weight,
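
A hypothetical usage sketch, showing how a model definition could hand such a router to the fused MoE layer via the new keyword. The FusedMoE constructor arguments below are abbreviated and illustrative (the expert counts match Phi-3.5-MoE's 16 experts with 2 active per token, but they are assumptions, not values copied from the PR):

from vllm.model_executor.layers.fused_moe import FusedMoE

moe = FusedMoE(
    num_experts=16,          # assumed Phi-3.5-MoE config: 16 experts
    top_k=2,                 # 2 experts activated per token
    hidden_size=4096,        # illustrative model dimensions
    intermediate_size=6400,
    renormalize=False,
    custom_routing_function=softmax_topk_routing,  # router sketched above
)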