Add an optional 2-device B200 Buildkite step that runs the AsyncTP
correctness tests, and set VLLM_DISABLED_KERNELS in
test_async_tp_pass_correctness via pytest's monkeypatch fixture.

diff --git a/.buildkite/test_areas/compile.yaml b/.buildkite/test_areas/compile.yaml
index 51b9fdc8b..f9eccdcbb 100644
--- a/.buildkite/test_areas/compile.yaml
+++ b/.buildkite/test_areas/compile.yaml
@@ -36,6 +36,16 @@ steps:
   - export VLLM_TEST_CLEAN_GPU_MEMORY=1
   - pytest -v -s tests/compile/correctness_e2e/test_async_tp.py
 
+- label: AsyncTP Correctness Tests (B200)
+  timeout_in_minutes: 50
+  working_dir: "/vllm-workspace/"
+  device: b200
+  optional: true
+  num_devices: 2
+  commands:
+  - export VLLM_TEST_CLEAN_GPU_MEMORY=1
+  - pytest -v -s tests/compile/correctness_e2e/test_async_tp.py
+
 - label: Distributed Compile Unit Tests (2xH100)
   timeout_in_minutes: 20
   working_dir: "/vllm-workspace/"
diff --git a/tests/compile/correctness_e2e/test_async_tp.py b/tests/compile/correctness_e2e/test_async_tp.py
index cf9c75d91..3539e4d5a 100644
--- a/tests/compile/correctness_e2e/test_async_tp.py
+++ b/tests/compile/correctness_e2e/test_async_tp.py
@@ -31,7 +31,12 @@ def test_async_tp_pass_correctness(
     distributed_backend: str,
     eager_mode: bool,
     num_gpus_available: int,
+    monkeypatch,
 ):
+    # Disable FlashInfer FP8 scaled_mm kernel as it is incompatible with
+    # async TP patterns. No-op on H100 (kernel requires CC >= 100).
+    monkeypatch.setenv("VLLM_DISABLED_KERNELS", "FlashInferFP8ScaledMMLinearKernel")
+
     model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
     model_info.check_transformers_version(on_fail="skip")
     model_info.check_available_online(on_fail="skip")