[Kernel] Delegate construction of FusedMoEQuantConfig to FusedMoEMethodBase subclasses (#22537)

Signed-off-by: Bill Nell <bnell@redhat.com>
2025-09-17 19:43:31 -04:00
parent e6585ddb45
commit 5963b98b46
68 changed files with 2698 additions and 2526 deletions
--- a/tests/kernels/moe/test_block_int8.py
+++ b/tests/kernels/moe/test_block_int8.py
@@ -4,12 +4,12 @@
 import pytest
 import torch

-from tests.kernels.moe.utils import make_test_weights
+from tests.kernels.moe.utils import make_test_quant_config
 from tests.kernels.quant_utils import (native_per_token_group_quant_int8,
                                       native_w8a8_block_matmul)
 from vllm.config import VllmConfig, set_current_vllm_config
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.fused_moe import fused_moe
+from vllm.model_executor.layers.fused_moe import fused_experts, fused_topk
 from vllm.platforms import current_platform

 if current_platform.get_device_capability() < (7, 0):
@@ -50,7 +50,7 @@ MNK_FACTORS = [
    (2048, 128, 128),
    (2048, 1024, 7168),
    (2048, 4096, 512),
-    (2048, 4096, 7168),
+    (2048, 4096, 4096),
 ]

 E = [8, 24]
@@ -117,31 +117,28 @@ def test_w8a8_block_int8_fused_moe(M, N, K, E, topk, block_size, dtype, seed):

    a = torch.randn((M, K), dtype=dtype) / 10
    score = torch.randn((M, E), dtype=dtype)
+    topk_weights, topk_ids, _ = fused_topk(a, score.float(), topk, False)

-    (_, w1, w1_s, _), (_, w2, w2_s,
-                       _) = make_test_weights(E,
-                                              N,
-                                              K,
-                                              dtype,
-                                              torch.int8,
-                                              per_act_token_quant=False,
-                                              block_shape=block_size)
+    w1, w2, quant_config = make_test_quant_config(
+        E,
+        N,
+        K,
+        dtype,
+        quant_dtype=torch.int8,
+        per_act_token_quant=False,
+        block_shape=block_size,
+    )

    # Set the context to avoid lots of warning spam.
    with set_current_vllm_config(vllm_config):
-        out = fused_moe(
-            a,
-            w1,
-            w2,
-            score,
-            topk,
-            renormalize=False,
-            use_int8_w8a8=True,
-            w1_scale=w1_s,
-            w2_scale=w2_s,
-            block_shape=block_size,
-        )
-        ref_out = torch_w8a8_block_int8_moe(a, w1, w2, w1_s, w2_s, score, topk,
+        out = fused_experts(a,
+                            w1,
+                            w2,
+                            topk_weights,
+                            topk_ids,
+                            quant_config=quant_config)
+        ref_out = torch_w8a8_block_int8_moe(a, w1, w2, quant_config.w1_scale,
+                                            quant_config.w2_scale, score, topk,
                                            block_size)

    # Check results