[BugFix] Fix quantization for all other methods (#11547)
Some checks failed
Create Release / Create Release (push) Has been cancelled

This commit is contained in:
Robert Shaw
2024-12-27 01:23:29 -05:00
committed by GitHub
parent 1b875a0ef3
commit 2339d59f92
6 changed files with 52 additions and 22 deletions

View File

@@ -440,11 +440,13 @@ class AWQMoEMethod(FusedMoEMethodBase):
         x: torch.Tensor,
         router_logits: torch.Tensor,
         top_k: int,
-        renormalize: bool = True,
+        renormalize: bool,
         use_grouped_topk: bool = False,
-        num_expert_group: Optional[int] = None,
         topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
         custom_routing_function: Optional[Callable] = None,
+        scoring_func: str = "softmax",
+        e_score_correction_bias: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        topk_weights, topk_ids = FusedMoE.select_experts(
            hidden_states=x,
@@ -454,7 +456,9 @@ class AWQMoEMethod(FusedMoEMethodBase):
            renormalize=renormalize,
            topk_group=topk_group,
            num_expert_group=num_expert_group,
-            custom_routing_function=custom_routing_function)
+            custom_routing_function=custom_routing_function,
+            scoring_func=scoring_func,
+            e_score_correction_bias=e_score_correction_bias)
        return torch.ops.vllm.fused_marlin_moe(
            x,