[BugFix] Fix quantization for all other methods (#11547)

2024-12-27 01:23:29 -05:00
parent 1b875a0ef3
commit 2339d59f92
6 changed files with 52 additions and 22 deletions
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -41,9 +41,20 @@ class FusedMoEMethodBase(QuantizeMethodBase):
        raise NotImplementedError

    @abstractmethod
-    def apply(self, layer: torch.nn.Module, x: torch.Tensor,
-              router_logits: torch.Tensor, top_k: int, renormalize: bool,
-              use_grouped_topk: bool) -> torch.Tensor:
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        router_logits: torch.Tensor,
+        top_k: int,
+        renormalize: bool,
+        use_grouped_topk: bool = False,
+        topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
+        custom_routing_function: Optional[Callable] = None,
+        scoring_func: str = "softmax",
+        e_score_correction_bias: Optional[torch.Tensor] = None
+    ) -> torch.Tensor:
        raise NotImplementedError


@@ -79,7 +90,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
        router_logits: torch.Tensor,
        top_k: int,
        renormalize: bool,
-        use_grouped_topk: bool,
+        use_grouped_topk: bool = False,
        topk_group: Optional[int] = None,
        num_expert_group: Optional[int] = None,
        custom_routing_function: Optional[Callable] = None,
--- a/vllm/model_executor/layers/quantization/awq_marlin.py
+++ b/vllm/model_executor/layers/quantization/awq_marlin.py
@@ -440,11 +440,13 @@ class AWQMoEMethod(FusedMoEMethodBase):
        x: torch.Tensor,
        router_logits: torch.Tensor,
        top_k: int,
-        renormalize: bool = True,
+        renormalize: bool,
        use_grouped_topk: bool = False,
-        num_expert_group: Optional[int] = None,
        topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
        custom_routing_function: Optional[Callable] = None,
+        scoring_func: str = "softmax",
+        e_score_correction_bias: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        topk_weights, topk_ids = FusedMoE.select_experts(
            hidden_states=x,
@@ -454,7 +456,9 @@ class AWQMoEMethod(FusedMoEMethodBase):
            renormalize=renormalize,
            topk_group=topk_group,
            num_expert_group=num_expert_group,
-            custom_routing_function=custom_routing_function)
+            custom_routing_function=custom_routing_function,
+            scoring_func=scoring_func,
+            e_score_correction_bias=e_score_correction_bias)

        return torch.ops.vllm.fused_marlin_moe(
            x,
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -203,13 +203,14 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
        x: torch.Tensor,
        router_logits: torch.Tensor,
        top_k: int,
-        renormalize: bool = True,
+        renormalize: bool,
        use_grouped_topk: bool = False,
-        num_expert_group: Optional[int] = None,
        topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
        custom_routing_function: Optional[Callable] = None,
+        scoring_func: str = "softmax",
+        e_score_correction_bias: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
-
        from vllm.model_executor.layers.fused_moe import fused_experts

        topk_weights, topk_ids = FusedMoE.select_experts(
@@ -220,7 +221,9 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
            renormalize=renormalize,
            topk_group=topk_group,
            num_expert_group=num_expert_group,
-            custom_routing_function=custom_routing_function)
+            custom_routing_function=custom_routing_function,
+            scoring_func=scoring_func,
+            e_score_correction_bias=e_score_correction_bias)

        return fused_experts(x,
                             layer.w13_weight,
@@ -476,12 +479,15 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
        x: torch.Tensor,
        router_logits: torch.Tensor,
        top_k: int,
-        renormalize: bool = True,
+        renormalize: bool,
        use_grouped_topk: bool = False,
-        num_expert_group: Optional[int] = None,
        topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
        custom_routing_function: Optional[Callable] = None,
+        scoring_func: str = "softmax",
+        e_score_correction_bias: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
+
        topk_weights, topk_ids = FusedMoE.select_experts(
            hidden_states=x,
            router_logits=router_logits,
@@ -490,7 +496,9 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
            renormalize=renormalize,
            topk_group=topk_group,
            num_expert_group=num_expert_group,
-            custom_routing_function=custom_routing_function)
+            custom_routing_function=custom_routing_function,
+            scoring_func=scoring_func,
+            e_score_correction_bias=e_score_correction_bias)

        return torch.ops.vllm.fused_marlin_moe(
            x,
--- a/vllm/model_executor/layers/quantization/experts_int8.py
+++ b/vllm/model_executor/layers/quantization/experts_int8.py
@@ -99,11 +99,13 @@ class ExpertsInt8MoEMethod(FusedMoEMethodBase):
        x: torch.Tensor,
        router_logits: torch.Tensor,
        top_k: int,
-        renormalize: bool = True,
+        renormalize: bool,
        use_grouped_topk: bool = False,
-        num_expert_group: Optional[int] = None,
        topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
        custom_routing_function: Optional[Callable] = None,
+        scoring_func: str = "softmax",
+        e_score_correction_bias: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        from vllm.model_executor.layers.fused_moe import fused_experts

@@ -115,7 +117,9 @@ class ExpertsInt8MoEMethod(FusedMoEMethodBase):
            renormalize=renormalize,
            topk_group=topk_group,
            num_expert_group=num_expert_group,
-            custom_routing_function=custom_routing_function)
+            custom_routing_function=custom_routing_function,
+            scoring_func=scoring_func,
+            e_score_correction_bias=e_score_correction_bias)

        return fused_experts(x,
                             layer.w13_weight,
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -601,14 +601,13 @@ class Fp8MoEMethod(FusedMoEMethodBase):
        router_logits: torch.Tensor,
        top_k: int,
        renormalize: bool,
-        use_grouped_topk: bool,
+        use_grouped_topk: bool = False,
        topk_group: Optional[int] = None,
        num_expert_group: Optional[int] = None,
        custom_routing_function: Optional[Callable] = None,
        scoring_func: str = "softmax",
        e_score_correction_bias: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
-
        from vllm.model_executor.layers.fused_moe import fused_experts

        topk_weights, topk_ids = FusedMoE.select_experts(
--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -532,11 +532,13 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
        x: torch.Tensor,
        router_logits: torch.Tensor,
        top_k: int,
-        renormalize: bool = True,
+        renormalize: bool,
        use_grouped_topk: bool = False,
-        num_expert_group: Optional[int] = None,
        topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
        custom_routing_function: Optional[Callable] = None,
+        scoring_func: str = "softmax",
+        e_score_correction_bias: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        # The input must currently be float16
        orig_dtype = x.dtype
@@ -550,7 +552,9 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
            renormalize=renormalize,
            topk_group=topk_group,
            num_expert_group=num_expert_group,
-            custom_routing_function=None)
+            custom_routing_function=custom_routing_function,
+            scoring_func=scoring_func,
+            e_score_correction_bias=e_score_correction_bias)

        return torch.ops.vllm.fused_marlin_moe(
            x,