[BugFix] Partial revert of #29558 (DeepEP HT + PIECEWISE CG support) (#30910)

Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
Signed-off-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
This commit is contained in:
Lucas Wilkinson
2025-12-18 02:50:15 -05:00
committed by GitHub
parent aa7e836055
commit 30bb19a760
2 changed files with 14 additions and 74 deletions

View File

@@ -233,24 +233,6 @@ def test_splitting_ops_dynamic():
assert config.compilation_config.cudagraph_mode == CUDAGraphMode.PIECEWISE
def test_moe_splitting_ops_deepep_ht_piecewise():
    """DeepEP high-throughput with dp>1 must put the MoE ops in splitting_ops.

    Covers the non-inductor, non-attn-fusion configuration: the config is
    built with the ``deepep_high_throughput`` all2all backend, a data
    parallel size of 8 and ``VLLM_COMPILE`` mode, and the resolved
    ``splitting_ops`` is expected to be populated and to contain both MoE
    forward ops.
    """
    parallel = ParallelConfig(
        all2all_backend="deepep_high_throughput",
        data_parallel_size=8,
    )
    compilation = CompilationConfig(mode=CompilationMode.VLLM_COMPILE)
    config = VllmConfig(
        parallel_config=parallel,
        compilation_config=compilation,
    )

    resolved_ops = config.compilation_config.splitting_ops
    assert resolved_ops is not None
    # Both the plain and the shared-expert MoE forward ops must be present.
    for moe_op in ("vllm::moe_forward", "vllm::moe_forward_shared"):
        assert moe_op in resolved_ops
def test_moe_splitting_ops_deepep_ht_inductor_partition():
# Inductor partition case: user-provided splitting_ops should be
# preserved and MoE ops should be appended for DeepEP HT with dp>1.
@@ -277,26 +259,6 @@ def test_moe_splitting_ops_deepep_ht_inductor_partition():
]
def test_moe_splitting_ops_deepep_ht_attn_fusion_no_inductor():
    """Attn-fusion without inductor partition must stay non-piecewise.

    Even with the ``deepep_high_throughput`` all2all backend and a data
    parallel size of 8, a config that enables ``fuse_attn_quant`` and asks
    for PIECEWISE cudagraphs is expected to end up with an empty
    ``splitting_ops`` list and a ``FULL`` cudagraph mode, i.e. no MoE ops
    are added and piecewise compilation is not re-enabled.
    """
    parallel = ParallelConfig(
        all2all_backend="deepep_high_throughput",
        data_parallel_size=8,
    )
    compilation = CompilationConfig(
        mode=CompilationMode.VLLM_COMPILE,
        pass_config={"fuse_attn_quant": True, "eliminate_noops": True},
        custom_ops=["+quant_fp8"],
        cudagraph_mode=CUDAGraphMode.PIECEWISE,
    )
    config = VllmConfig(
        parallel_config=parallel,
        compilation_config=compilation,
    )

    resolved = config.compilation_config
    assert resolved.splitting_ops == []
    assert resolved.cudagraph_mode == CUDAGraphMode.FULL
def test_should_split():
import torch