diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index ab8bf9d23..c5db1ca83 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -388,9 +388,7 @@ steps: - label: V1 Test e2e + engine # 65min timeout_in_minutes: 90 mirror_hardwares: [amdexperimental, amdproduction] - # The test uses 4 GPUs, but we schedule it on 8-GPU machines for stability. - # See discussion here: https://github.com/vllm-project/vllm/pull/31040 - agent_pool: mi325_8 + agent_pool: mi325_1 optional: true # grade: Blocking source_file_dependencies: @@ -402,6 +400,34 @@ steps: - pytest -v -s v1/e2e - pytest -v -s v1/engine +- label: V1 Test e2e (2 GPUs) # 65min + timeout_in_minutes: 90 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_2 + optional: true + # grade: Blocking + source_file_dependencies: + - vllm/ + - tests/v1 + commands: + # Only run tests that need exactly 2 GPUs + - pytest -v -s v1/e2e/test_spec_decode.py -k "tensor_parallelism" + +- label: V1 Test e2e (4 GPUs) # 65min + timeout_in_minutes: 90 + mirror_hardwares: [amdexperimental, amdproduction] + # The test uses 4 GPUs and now runs on the 4-GPU pool; it was previously scheduled on 8-GPU machines for stability.
+ # See discussion here: https://github.com/vllm-project/vllm/pull/31040 + agent_pool: mi325_4 + optional: true + # grade: Blocking + source_file_dependencies: + - vllm/ + - tests/v1 + commands: + # Only run tests that need 4 GPUs + - pytest -v -s v1/e2e/test_spec_decode.py -k "eagle_correctness_heavy" + - label: V1 Test entrypoints # 35min timeout_in_minutes: 50 mirror_hardwares: [amdexperimental, amdproduction, amdtentative] diff --git a/.buildkite/test_areas/engine.yaml b/.buildkite/test_areas/engine.yaml index 19cd91370..b5b3eeb6d 100644 --- a/.buildkite/test_areas/engine.yaml +++ b/.buildkite/test_areas/engine.yaml @@ -14,7 +14,7 @@ steps: commands: - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py -- label: V1 e2e + engine +- label: V1 e2e + engine (1 GPU) timeout_in_minutes: 45 source_file_dependencies: - vllm/ @@ -36,3 +36,35 @@ steps: commands: - pytest -v -s v1/e2e - pytest -v -s v1/engine + +- label: V1 e2e (2 GPUs) + timeout_in_minutes: 60 # TODO: Fix timeout after we have more confidence in the test stability + optional: true + num_devices: 2 + source_file_dependencies: + - vllm/ + - tests/v1/e2e + commands: + # Only run tests that need exactly 2 GPUs + - pytest -v -s v1/e2e/test_spec_decode.py -k "tensor_parallelism" + mirror: + amd: + device: mi325_2 + depends_on: + - image-build-amd + +- label: V1 e2e (4 GPUs) + timeout_in_minutes: 60 # TODO: Fix timeout after we have more confidence in the test stability + optional: true + num_devices: 4 + source_file_dependencies: + - vllm/ + - tests/v1/e2e + commands: + # Only run tests that need 4 GPUs + - pytest -v -s v1/e2e/test_spec_decode.py -k "eagle_correctness_heavy" + mirror: + amd: + device: mi325_4 + depends_on: + - image-build-amd diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py index 7f2db19a0..4c90df5f4 100644 --- a/tests/v1/e2e/test_spec_decode.py +++ b/tests/v1/e2e/test_spec_decode.py @@ -630,7 +630,7 @@ def 
test_eagle_correctness_medium( False, "auto", 0.8, - marks=multi_gpu_marks(num_gpus=4), + marks=[*multi_gpu_marks(num_gpus=4), large_gpu_mark(min_gb=40)], id="llama4_eagle", ), pytest.param(