[Feature]: Remove Chunking From FusedMoE (#34086)

Signed-off-by: SouthWest7 <am1ao@qq.com>
Signed-off-by: Southwest <1403572259@qq.com>
Signed-off-by: southwest <am1ao@qq.com>
Signed-off-by: Xinan Miao <1403572259@qq.com>
Co-authored-by: SouthWest7 <am1ao@qq.com>
2026-03-13 02:24:38 +08:00
parent c973ecdead
commit 2cdf92228c
28 changed files with 152 additions and 523 deletions
--- a/tests/kernels/moe/test_moe.py
+++ b/tests/kernels/moe/test_moe.py
@@ -287,7 +287,6 @@ def run_moe_test(
@pytest.mark.parametrize("ep_size", EP_SIZE)
@pytest.mark.parametrize("dtype", [torch.bfloat16])
@pytest.mark.parametrize("padding", [True, False])
-@pytest.mark.parametrize("chunk_size", [8192])
 def test_fused_moe(
    m: int,
    n: int,
@@ -297,14 +296,11 @@ def test_fused_moe(
    ep_size: int,
    dtype: torch.dtype,
    padding: bool,
-    chunk_size: int,
    monkeypatch,
    workspace_init,
 ):
    set_random_seed(7)

-    monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", str(chunk_size))
-
    #
    # Setup test data
    #
@@ -398,12 +394,12 @@ def test_fused_moe(
        )


-def test_fused_moe_int64_overflow(monkeypatch, workspace_init):
+def test_fused_moe_int64_overflow(workspace_init):
    """Regression test for int32 overflow in stride*offset products.

-    When chunking is disabled and M is large, stride_cm * offs_token can
-    exceed int32 max. Verifies the offs_token int64 cast (fix for #34413)
-    prevents overflow and produces correct results.
+    With large M, stride_cm * offs_token can exceed int32 max. Verifies
+    the offs_token int64 cast (fix for #34413) prevents overflow and
+    produces correct results.

    Reproduces the scenario from PR #34279.
    """
@@ -417,9 +413,6 @@ def test_fused_moe_int64_overflow(monkeypatch, workspace_init):
    m, n, k, e, topk = 100000, 2048, 1024, 8, 6
    dtype = torch.bfloat16

-    # Disable chunking to expose the overflow-prone code path
-    monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "10000000")
-
    a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
    w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
    w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10
@@ -452,7 +445,6 @@ def test_fused_moe_int64_overflow(monkeypatch, workspace_init):
@pytest.mark.parametrize("topk", TOP_KS_SMALL)
@pytest.mark.parametrize("dtype", [torch.bfloat16])
@pytest.mark.parametrize("padding", [True, False])
-@pytest.mark.parametrize("chunk_size", [8192])
 def test_naive_block_assignment_moe(
    m: int,
    n: int,
@@ -461,14 +453,11 @@ def test_naive_block_assignment_moe(
    topk: int,
    dtype: torch.dtype,
    padding: bool,
-    chunk_size: int,
    monkeypatch,
    workspace_init,
 ):
    set_random_seed(7)

-    monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", str(chunk_size))
-
    #
    # Setup test data
    #