[MoE Refactor] Migrate Unquantized to Full Oracle Flow (#36286)
Signed-off-by: Yifan Zong <yzong@redhat.com>
Signed-off-by: Robert Shaw <robshaw@redhat.com>
Signed-off-by: yzong-rh <yzong@redhat.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
Co-authored-by: Robert Shaw <robshaw@redhat.com>
This commit is contained in:
@@ -1664,7 +1664,7 @@ def test_unquantized_bf16_flashinfer_trtllm_backend(
|
||||
intermediate_size_per_partition=n,
|
||||
num_local_experts=e,
|
||||
num_logical_experts=e,
|
||||
activation="silu",
|
||||
activation=MoEActivation.SILU,
|
||||
device="cuda",
|
||||
moe_parallel_config=FusedMoEParallelConfig.make_no_parallel(),
|
||||
in_dtype=dtype,
|
||||
@@ -1695,13 +1695,25 @@ def test_unquantized_bf16_flashinfer_trtllm_backend(
|
||||
layer.topk_group = 1
|
||||
layer.intermediate_size_per_partition = n
|
||||
layer.ep_rank = 0
|
||||
layer.activation = "silu"
|
||||
layer.activation = MoEActivation.SILU
|
||||
layer.e_score_correction_bias = None
|
||||
layer.routing_method_type = RoutingMethodType.Renormalize
|
||||
layer.expert_map = None
|
||||
layer.apply_router_weight_on_input = False
|
||||
layer.routed_scaling_factor = None
|
||||
layer.shared_experts = None
|
||||
layer._maybe_init_expert_routing_tables = lambda: None
|
||||
|
||||
quant_method.process_weights_after_loading(layer)
|
||||
|
||||
trtllm_output = quant_method.forward_monolithic_cuda(
|
||||
assert quant_method.moe_kernel is not None, (
|
||||
"moe_kernel should be set after process_weights_after_loading"
|
||||
)
|
||||
assert quant_method.supports_internal_mk, (
|
||||
"supports_internal_mk should be True after setup"
|
||||
)
|
||||
|
||||
trtllm_output = quant_method.apply_monolithic(
|
||||
layer=layer,
|
||||
x=a,
|
||||
router_logits=router_logits,
|
||||
|
||||
@@ -24,7 +24,7 @@ from vllm.platforms import current_platform
|
||||
],
|
||||
)
|
||||
@patch(
|
||||
"vllm.model_executor.layers.fused_moe.oracle.unquantized.has_flashinfer",
|
||||
"vllm.utils.flashinfer.has_flashinfer",
|
||||
return_value=False,
|
||||
)
|
||||
@patch(
|
||||
@@ -54,13 +54,29 @@ def test_select_default_backend_by_platform(
|
||||
# Set only the specified platform to True
|
||||
getattr(mock_platform, platform_method).return_value = True
|
||||
|
||||
with (
|
||||
patch.object(current_platform, "is_cuda", return_value=False),
|
||||
patch.object(current_platform, "is_rocm", return_value=False),
|
||||
patch.object(current_platform, "is_cpu", return_value=False),
|
||||
patch.object(current_platform, "is_xpu", return_value=False),
|
||||
patch.object(current_platform, "is_tpu", return_value=False),
|
||||
patch.object(current_platform, "is_out_of_tree", return_value=False),
|
||||
patch.object(current_platform, platform_method, return_value=True),
|
||||
):
|
||||
moe_config = make_dummy_moe_config()
|
||||
selected_backend = select_unquantized_moe_backend(
|
||||
moe_config=moe_config,
|
||||
use_dp=False,
|
||||
selected_backend, expert_cls = select_unquantized_moe_backend(
|
||||
moe_config=moe_config
|
||||
)
|
||||
|
||||
assert selected_backend == expected_backend
|
||||
if expected_backend in [
|
||||
UnquantizedMoeBackend.CPU,
|
||||
UnquantizedMoeBackend.OOT,
|
||||
UnquantizedMoeBackend.TPU,
|
||||
]:
|
||||
assert expert_cls is None
|
||||
else:
|
||||
assert expert_cls is not None
|
||||
|
||||
|
||||
@patch(
|
||||
@@ -87,88 +103,90 @@ def test_select_rocm_aiter_backend(mock_aiter_enabled, mock_has_flashinfer):
|
||||
mock_platform.is_out_of_tree.return_value = False
|
||||
|
||||
moe_config = make_dummy_moe_config()
|
||||
selected_backend = select_unquantized_moe_backend(
|
||||
selected_backend, expert_cls = select_unquantized_moe_backend(
|
||||
moe_config=moe_config,
|
||||
use_dp=False,
|
||||
)
|
||||
|
||||
assert selected_backend == UnquantizedMoeBackend.AITER
|
||||
assert expert_cls is not None
|
||||
|
||||
|
||||
@patch(
|
||||
"vllm.model_executor.layers.fused_moe.oracle.unquantized.has_flashinfer",
|
||||
return_value=True,
|
||||
)
|
||||
@patch(
|
||||
"vllm.model_executor.layers.fused_moe.oracle.unquantized.is_supported_config_trtllm_bf16",
|
||||
"vllm.model_executor.layers.fused_moe.experts.trtllm_bf16_moe.TrtLlmBf16Experts.is_supported_config",
|
||||
return_value=(True, None),
|
||||
)
|
||||
@pytest.mark.skipif(
|
||||
not current_platform.is_cuda(), reason="Only supported on NVIDIA platforms."
|
||||
)
|
||||
def test_select_cuda_flashinfer_trtllm_backend(
|
||||
mock_has_flashinfer, mock_is_supported_trtllm, monkeypatch
|
||||
):
|
||||
def test_select_cuda_flashinfer_trtllm_backend(mock_is_supported_trtllm, monkeypatch):
|
||||
"""Test CUDA backend selection when FlashInfer TRTLLM is available and enabled."""
|
||||
with patch(
|
||||
"vllm.model_executor.layers.fused_moe.oracle.unquantized.current_platform"
|
||||
) as mock_platform:
|
||||
# Set as CUDA platform
|
||||
mock_platform.is_cuda.return_value = True
|
||||
mock_platform.is_rocm.return_value = False
|
||||
mock_platform.is_cpu.return_value = False
|
||||
mock_platform.is_xpu.return_value = False
|
||||
mock_platform.is_tpu.return_value = False
|
||||
mock_platform.is_out_of_tree.return_value = False
|
||||
|
||||
with (
|
||||
patch.object(current_platform, "is_cuda", return_value=True),
|
||||
patch.object(current_platform, "is_rocm", return_value=False),
|
||||
patch.object(current_platform, "is_cpu", return_value=False),
|
||||
patch.object(current_platform, "is_xpu", return_value=False),
|
||||
patch.object(current_platform, "is_tpu", return_value=False),
|
||||
patch.object(current_platform, "is_out_of_tree", return_value=False),
|
||||
patch.object(current_platform, "has_device_capability", return_value=True),
|
||||
):
|
||||
monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP16", "1")
|
||||
|
||||
moe_config = make_dummy_moe_config()
|
||||
# TRTLLM requires EP and does not support DP
|
||||
moe_config.moe_parallel_config.use_ep = True
|
||||
moe_config.moe_parallel_config.use_dp = False
|
||||
|
||||
selected_backend = select_unquantized_moe_backend(
|
||||
moe_config=moe_config,
|
||||
use_dp=False,
|
||||
selected_backend, experts_cls = select_unquantized_moe_backend(
|
||||
moe_config=moe_config
|
||||
)
|
||||
|
||||
assert selected_backend == UnquantizedMoeBackend.FLASHINFER_TRTLLM
|
||||
assert experts_cls is not None
|
||||
|
||||
|
||||
@patch(
|
||||
"vllm.model_executor.layers.fused_moe.oracle.unquantized.has_flashinfer",
|
||||
"vllm.utils.flashinfer.has_flashinfer",
|
||||
return_value=True,
|
||||
)
|
||||
@patch(
|
||||
"vllm.model_executor.layers.fused_moe.oracle.unquantized.is_supported_config_trtllm_bf16",
|
||||
"vllm.model_executor.layers.fused_moe.experts.trtllm_bf16_moe.TrtLlmBf16Experts.is_supported_config",
|
||||
return_value=(False, None),
|
||||
)
|
||||
@patch(
|
||||
"vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.FlashInferExperts.is_supported_config",
|
||||
return_value=(True, None),
|
||||
)
|
||||
@pytest.mark.skipif(
|
||||
not current_platform.is_cuda(), reason="Only supported on NVIDIA platforms."
|
||||
)
|
||||
def test_select_cuda_flashinfer_cutlass_backend(
|
||||
mock_has_flashinfer, mock_is_supported_trtllm, monkeypatch
|
||||
mock_has_flashinfer,
|
||||
mock_is_supported_trtllm,
|
||||
mock_is_supported_cutlass,
|
||||
monkeypatch,
|
||||
):
|
||||
"""Test CUDA backend selection when FlashInfer TRTLLM is not available
|
||||
and FlashInfer CUTLASS is available."""
|
||||
with patch(
|
||||
"vllm.model_executor.layers.fused_moe.oracle.unquantized.current_platform"
|
||||
) as mock_platform:
|
||||
# Set as CUDA platform with Hopper capability
|
||||
mock_platform.is_cuda.return_value = True
|
||||
mock_platform.is_rocm.return_value = False
|
||||
mock_platform.is_cpu.return_value = False
|
||||
mock_platform.is_xpu.return_value = False
|
||||
mock_platform.is_tpu.return_value = False
|
||||
mock_platform.is_out_of_tree.return_value = False
|
||||
mock_platform.has_device_capability.return_value = True # SM90+
|
||||
|
||||
with (
|
||||
patch.object(current_platform, "is_cuda", return_value=True),
|
||||
patch.object(current_platform, "is_rocm", return_value=False),
|
||||
patch.object(current_platform, "is_cpu", return_value=False),
|
||||
patch.object(current_platform, "is_xpu", return_value=False),
|
||||
patch.object(current_platform, "is_tpu", return_value=False),
|
||||
patch.object(current_platform, "is_out_of_tree", return_value=False),
|
||||
patch.object(current_platform, "has_device_capability", return_value=True),
|
||||
):
|
||||
# Enable FlashInfer via env var
|
||||
monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP16", "1")
|
||||
|
||||
moe_config = make_dummy_moe_config()
|
||||
# CUTLASS requires EP and does not support DP
|
||||
moe_config.moe_parallel_config.use_ep = True
|
||||
moe_config.moe_parallel_config.use_dp = False
|
||||
|
||||
selected_backend = select_unquantized_moe_backend(
|
||||
moe_config=moe_config,
|
||||
use_dp=False, # CUTLASS doesn't support DP
|
||||
selected_backend, experts_cls = select_unquantized_moe_backend(
|
||||
moe_config=moe_config
|
||||
)
|
||||
|
||||
assert selected_backend == UnquantizedMoeBackend.FLASHINFER_CUTLASS
|
||||
assert experts_cls is not None
|
||||
|
||||
Reference in New Issue
Block a user