[Feature]: Remove Chunking From FusedMoE (#34086)
Signed-off-by: SouthWest7 <am1ao@qq.com> Signed-off-by: Southwest <1403572259@qq.com> Signed-off-by: southwest <am1ao@qq.com> Signed-off-by: Xinan Miao <1403572259@qq.com> Co-authored-by: SouthWest7 <am1ao@qq.com>
This commit is contained in:
@@ -287,7 +287,6 @@ def run_moe_test(
|
||||
@pytest.mark.parametrize("ep_size", EP_SIZE)
|
||||
@pytest.mark.parametrize("dtype", [torch.bfloat16])
|
||||
@pytest.mark.parametrize("padding", [True, False])
|
||||
@pytest.mark.parametrize("chunk_size", [8192])
|
||||
def test_fused_moe(
|
||||
m: int,
|
||||
n: int,
|
||||
@@ -297,14 +296,11 @@ def test_fused_moe(
|
||||
ep_size: int,
|
||||
dtype: torch.dtype,
|
||||
padding: bool,
|
||||
chunk_size: int,
|
||||
monkeypatch,
|
||||
workspace_init,
|
||||
):
|
||||
set_random_seed(7)
|
||||
|
||||
monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", str(chunk_size))
|
||||
|
||||
#
|
||||
# Setup test data
|
||||
#
|
||||
@@ -398,12 +394,12 @@ def test_fused_moe(
|
||||
)
|
||||
|
||||
|
||||
def test_fused_moe_int64_overflow(monkeypatch, workspace_init):
|
||||
def test_fused_moe_int64_overflow(workspace_init):
|
||||
"""Regression test for int32 overflow in stride*offset products.
|
||||
|
||||
When chunking is disabled and M is large, stride_cm * offs_token can
|
||||
exceed int32 max. Verifies the offs_token int64 cast (fix for #34413)
|
||||
prevents overflow and produces correct results.
|
||||
With large M, stride_cm * offs_token can exceed int32 max. Verifies
|
||||
the offs_token int64 cast (fix for #34413) prevents overflow and
|
||||
produces correct results.
|
||||
|
||||
Reproduces the scenario from PR #34279.
|
||||
"""
|
||||
@@ -417,9 +413,6 @@ def test_fused_moe_int64_overflow(monkeypatch, workspace_init):
|
||||
m, n, k, e, topk = 100000, 2048, 1024, 8, 6
|
||||
dtype = torch.bfloat16
|
||||
|
||||
# Disable chunking to expose the overflow-prone code path
|
||||
monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "10000000")
|
||||
|
||||
a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
|
||||
w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
|
||||
w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10
|
||||
@@ -452,7 +445,6 @@ def test_fused_moe_int64_overflow(monkeypatch, workspace_init):
|
||||
@pytest.mark.parametrize("topk", TOP_KS_SMALL)
|
||||
@pytest.mark.parametrize("dtype", [torch.bfloat16])
|
||||
@pytest.mark.parametrize("padding", [True, False])
|
||||
@pytest.mark.parametrize("chunk_size", [8192])
|
||||
def test_naive_block_assignment_moe(
|
||||
m: int,
|
||||
n: int,
|
||||
@@ -461,14 +453,11 @@ def test_naive_block_assignment_moe(
|
||||
topk: int,
|
||||
dtype: torch.dtype,
|
||||
padding: bool,
|
||||
chunk_size: int,
|
||||
monkeypatch,
|
||||
workspace_init,
|
||||
):
|
||||
set_random_seed(7)
|
||||
|
||||
monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", str(chunk_size))
|
||||
|
||||
#
|
||||
# Setup test data
|
||||
#
|
||||
|
||||
Reference in New Issue
Block a user