[Feature]: Remove Chunking From FusedMoE (#34086)

Signed-off-by: SouthWest7 <am1ao@qq.com>
Signed-off-by: Southwest <1403572259@qq.com>
Signed-off-by: southwest <am1ao@qq.com>
Signed-off-by: Xinan Miao <1403572259@qq.com>
Co-authored-by: SouthWest7 <am1ao@qq.com>
This commit is contained in:
Xinan Miao
2026-03-13 02:24:38 +08:00
committed by GitHub
parent c973ecdead
commit 2cdf92228c
28 changed files with 152 additions and 523 deletions

View File

@@ -287,7 +287,6 @@ def run_moe_test(
@pytest.mark.parametrize("ep_size", EP_SIZE)
@pytest.mark.parametrize("dtype", [torch.bfloat16])
@pytest.mark.parametrize("padding", [True, False])
@pytest.mark.parametrize("chunk_size", [8192])
def test_fused_moe(
m: int,
n: int,
@@ -297,14 +296,11 @@ def test_fused_moe(
ep_size: int,
dtype: torch.dtype,
padding: bool,
chunk_size: int,
monkeypatch,
workspace_init,
):
set_random_seed(7)
monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", str(chunk_size))
#
# Setup test data
#
@@ -398,12 +394,12 @@ def test_fused_moe(
)
def test_fused_moe_int64_overflow(monkeypatch, workspace_init):
def test_fused_moe_int64_overflow(workspace_init):
"""Regression test for int32 overflow in stride*offset products.
When chunking is disabled and M is large, stride_cm * offs_token can
exceed int32 max. Verifies the offs_token int64 cast (fix for #34413)
prevents overflow and produces correct results.
With large M, stride_cm * offs_token can exceed int32 max. Verifies
the offs_token int64 cast (fix for #34413) prevents overflow and
produces correct results.
Reproduces the scenario from PR #34279.
"""
@@ -417,9 +413,6 @@ def test_fused_moe_int64_overflow(monkeypatch, workspace_init):
m, n, k, e, topk = 100000, 2048, 1024, 8, 6
dtype = torch.bfloat16
# Disable chunking to expose the overflow-prone code path
monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "10000000")
a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10
@@ -452,7 +445,6 @@ def test_fused_moe_int64_overflow(monkeypatch, workspace_init):
@pytest.mark.parametrize("topk", TOP_KS_SMALL)
@pytest.mark.parametrize("dtype", [torch.bfloat16])
@pytest.mark.parametrize("padding", [True, False])
@pytest.mark.parametrize("chunk_size", [8192])
def test_naive_block_assignment_moe(
m: int,
n: int,
@@ -461,14 +453,11 @@ def test_naive_block_assignment_moe(
topk: int,
dtype: torch.dtype,
padding: bool,
chunk_size: int,
monkeypatch,
workspace_init,
):
set_random_seed(7)
monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", str(chunk_size))
#
# Setup test data
#