[CI Fix] Pin deepep and pplx tags in tools/ep_kernels/, gate multigpu tests (#23568)
Signed-off-by: mgoin <mgoin64@gmail.com>
This commit is contained in:
@@ -23,6 +23,7 @@ from vllm.utils import has_deep_ep, has_deep_gemm
|
||||
from vllm.utils.deep_gemm import (is_blackwell_deep_gemm_e8m0_used,
|
||||
is_deep_gemm_supported)
|
||||
|
||||
from ...utils import multi_gpu_test
|
||||
from .parallel_utils import ProcessGroupInfo, parallel_launch
|
||||
from .utils import make_test_weights
|
||||
|
||||
@@ -370,6 +371,7 @@ NUM_EXPERTS = [32]
|
||||
@pytest.mark.parametrize("num_experts", NUM_EXPERTS)
|
||||
@pytest.mark.parametrize("topk", TOPKS)
|
||||
@pytest.mark.parametrize("world_dp_size", [(2, 1)])
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
@requires_deep_ep
|
||||
@requires_deep_gemm
|
||||
@pytest.mark.skipif(is_blackwell_deep_gemm_e8m0_used(),
|
||||
@@ -427,6 +429,7 @@ USE_FP8_DISPATCH = [False]
|
||||
@pytest.mark.parametrize("use_fp8_dispatch", USE_FP8_DISPATCH)
|
||||
@pytest.mark.parametrize("block_size", [[128, 128]])
|
||||
@pytest.mark.parametrize("world_dp_size", [(2, 1)])
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
@requires_deep_ep
|
||||
@requires_deep_gemm
|
||||
@pytest.mark.skipif(is_blackwell_deep_gemm_e8m0_used(),
|
||||
|
||||
@@ -24,6 +24,7 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import (
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import has_deep_ep
|
||||
|
||||
from ...utils import multi_gpu_test
|
||||
from .parallel_utils import ProcessGroupInfo, parallel_launch
|
||||
|
||||
if has_deep_ep():
|
||||
@@ -411,6 +412,7 @@ DTYPES = [torch.bfloat16, torch.float8_e4m3fn]
|
||||
@pytest.mark.parametrize("topk", [6])
|
||||
@pytest.mark.parametrize("world_dp_size", [(2, 1)])
|
||||
@pytest.mark.parametrize("per_act_token_quant", [False, True])
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
@requires_deep_ep
|
||||
def test_deep_ep_moe(
|
||||
dtype: torch.dtype,
|
||||
@@ -459,6 +461,7 @@ USE_FP8_DISPATCH = [True, False]
|
||||
@pytest.mark.parametrize("topk", [6])
|
||||
@pytest.mark.parametrize("world_dp_size", [(2, 1)])
|
||||
@pytest.mark.parametrize("use_fp8_dispatch", USE_FP8_DISPATCH)
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
@requires_deep_ep
|
||||
def test_low_latency_deep_ep_moe(dtype: torch.dtype, mnk: tuple[int, int, int],
|
||||
num_experts: int, topk: int,
|
||||
|
||||
@@ -16,6 +16,7 @@ from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
|
||||
from vllm.utils import has_deep_ep, has_deep_gemm, has_pplx
|
||||
from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe
|
||||
|
||||
from ...utils import multi_gpu_test
|
||||
from .modular_kernel_tools.common import (Config, RankTensors, WeightTensors,
|
||||
reference_moe_impl,
|
||||
run_modular_kernel)
|
||||
@@ -162,6 +163,7 @@ def is_nyi_config(config: Config) -> bool:
|
||||
product(MK_MULTI_GPU_PREPARE_FINALIZE_TYPES, MK_FUSED_EXPERT_TYPES))
|
||||
@pytest.mark.parametrize("fused_moe_chunk_size", FUSED_MOE_CHUNK_SIZEs)
|
||||
@pytest.mark.parametrize("world_size", [2])
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
@meets_multi_gpu_requirements
|
||||
def test_modular_kernel_combinations_multigpu(
|
||||
k: int, n: int, e: int, dtype: torch.dtype,
|
||||
|
||||
@@ -17,6 +17,7 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import (
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import cdiv
|
||||
|
||||
from ...utils import multi_gpu_test
|
||||
from .parallel_utils import ProcessGroupInfo, parallel_launch
|
||||
|
||||
try:
|
||||
@@ -247,6 +248,7 @@ def _pplx_moe(
|
||||
@pytest.mark.parametrize("per_out_ch", [True, False])
|
||||
@pytest.mark.parametrize("world_dp_size", [[2, 1]]) #, [4, 2]])
|
||||
@pytest.mark.parametrize("use_internode", [False])
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
@pytest.mark.skipif(
|
||||
(lambda x: x is None or not ops.cutlass_group_gemm_supported(x.to_int()))(
|
||||
current_platform.get_device_capability()),
|
||||
|
||||
@@ -37,6 +37,7 @@ from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import round_up
|
||||
|
||||
from ...utils import multi_gpu_test
|
||||
from .parallel_utils import ProcessGroupInfo, parallel_launch
|
||||
|
||||
requires_pplx = pytest.mark.skipif(
|
||||
@@ -452,6 +453,7 @@ def _pplx_prepare_finalize(
|
||||
@pytest.mark.parametrize("use_internode", [False])
|
||||
@pytest.mark.optional
|
||||
@requires_pplx
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
def test_pplx_prepare_finalize_slow(
|
||||
mnk: tuple[int, int, int],
|
||||
e: int,
|
||||
@@ -740,6 +742,7 @@ def _pplx_moe(
|
||||
@pytest.mark.parametrize("use_internode", [False])
|
||||
@pytest.mark.optional
|
||||
@requires_pplx
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
def test_pplx_moe_slow(
|
||||
mnk: tuple[int, int, int],
|
||||
e: int,
|
||||
@@ -880,6 +883,7 @@ def _pplx_test_loop(pgi: ProcessGroupInfo, dp_size: int, use_internode: bool,
|
||||
@pytest.mark.parametrize("world_dp_size", [[2, 1]])
|
||||
@pytest.mark.parametrize("use_internode", [False])
|
||||
@requires_pplx
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
def test_pplx_prepare_finalize(
|
||||
world_dp_size: tuple[int, int],
|
||||
use_internode: bool,
|
||||
@@ -893,6 +897,7 @@ def test_pplx_prepare_finalize(
|
||||
@pytest.mark.parametrize("world_dp_size", [[2, 1]])
|
||||
@pytest.mark.parametrize("use_internode", [False])
|
||||
@requires_pplx
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
def test_pplx_moe(
|
||||
world_dp_size: tuple[int, int],
|
||||
use_internode: bool,
|
||||
|
||||
Reference in New Issue
Block a user