diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 062de8f0f..052c85c22 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -549,7 +549,7 @@ steps: - tests/samplers - tests/conftest.py commands: - - pytest -v -s -m samplers + - pytest -v -s samplers - label: LoRA Test %N # 20min each timeout_in_minutes: 30 @@ -2177,7 +2177,7 @@ steps: - tests/samplers - tests/conftest.py commands: - - pytest -v -s -m samplers + - pytest -v -s samplers - label: LoRA Test %N # 20min each timeout_in_minutes: 30 diff --git a/.buildkite/test_areas/samplers.yaml b/.buildkite/test_areas/samplers.yaml index cc84d2a48..2052a3798 100644 --- a/.buildkite/test_areas/samplers.yaml +++ b/.buildkite/test_areas/samplers.yaml @@ -18,4 +18,4 @@ steps: depends_on: - image-build-amd commands: - - pytest -v -s -m samplers + - pytest -v -s samplers diff --git a/tests/samplers/test_beam_search.py b/tests/samplers/test_beam_search.py index b2df9af6f..aef7eec09 100644 --- a/tests/samplers/test_beam_search.py +++ b/tests/samplers/test_beam_search.py @@ -9,6 +9,26 @@ import pytest from transformers import AutoModelForSeq2SeqLM from vllm.assets.audio import AudioAsset +from vllm.platforms import current_platform + +# Extra engine kwargs needed for numerically deterministic beam search. +# On ROCm, floating-point reductions in attention and GEMM kernels are +# non-associative and sensitive to batch geometry, so we: +# async_scheduling=False – deterministic batch composition +# enforce_eager=True – no CUDA-graph padding changing effective size +# enable_prefix_caching=False – avoid prefix-sharing side effects +# max_num_seqs=1 – fixed batch size across runs +# On other platforms these are not needed and the dict is empty. +EXTRA_ENGINE_KWARGS: dict = ( + dict( + async_scheduling=False, + enforce_eager=True, + enable_prefix_caching=False, + max_num_seqs=1, + ) + if current_platform.is_rocm() + else {} +) # FIXME(zhuohan): The test can not pass if we: # 1. Increase max_tokens to 256. @@ -25,6 +45,7 @@ MODELS = ["TinyLlama/TinyLlama-1.1B-Chat-v1.0"] @pytest.mark.parametrize("max_tokens", MAX_TOKENS) @pytest.mark.parametrize("beam_width", BEAM_WIDTHS) def test_beam_search_single_input( + monkeypatch, hf_runner, vllm_runner, example_prompts, @@ -33,13 +54,16 @@ def test_beam_search_single_input( max_tokens: int, beam_width: int, ) -> None: + if current_platform.is_rocm(): + monkeypatch.setenv("VLLM_ROCM_USE_SKINNY_GEMM", "0") + example_prompts = example_prompts[:1] with hf_runner(model, dtype=dtype) as hf_model: hf_outputs = hf_model.generate_beam_search( example_prompts, beam_width, max_tokens ) - with vllm_runner(model, dtype=dtype) as vllm_model: + with vllm_runner(model, dtype=dtype, **EXTRA_ENGINE_KWARGS) as vllm_model: vllm_outputs = vllm_model.generate_beam_search( example_prompts, beam_width, max_tokens ) @@ -66,6 +90,7 @@ def test_beam_search_single_input( @pytest.mark.parametrize("max_tokens", MAX_TOKENS) @pytest.mark.parametrize("beam_width", BEAM_WIDTHS) def test_beam_search_with_concurrency_limit( + monkeypatch, hf_runner, vllm_runner, example_prompts, @@ -74,21 +99,29 @@ def test_beam_search_with_concurrency_limit( max_tokens: int, beam_width: int, ) -> None: + if current_platform.is_rocm(): + monkeypatch.setenv("VLLM_ROCM_USE_SKINNY_GEMM", "0") + # example_prompts[1]&[3]&[7] fails due to unknown reason even without # concurrency limit. skip them for now. example_prompts = example_prompts[:8] concurrency_limit = 2 assert len(example_prompts) > concurrency_limit - with vllm_runner(model, dtype=dtype) as vllm_model: + with vllm_runner(model, dtype=dtype, **EXTRA_ENGINE_KWARGS) as vllm_model: outputs_with_limit = vllm_model.generate_beam_search( - example_prompts, beam_width, max_tokens, concurrency_limit=concurrency_limit + example_prompts, + beam_width, + max_tokens, + concurrency_limit=concurrency_limit, ) outputs_without_limit = [] for i in range(0, len(example_prompts), concurrency_limit): outputs_without_limit.extend( vllm_model.generate_beam_search( - example_prompts[i : i + concurrency_limit], beam_width, max_tokens + example_prompts[i : i + concurrency_limit], + beam_width, + max_tokens, ) ) @@ -118,6 +151,7 @@ def test_beam_search_with_concurrency_limit( @pytest.mark.parametrize("max_tokens", MAX_TOKENS) @pytest.mark.parametrize("beam_width", MM_BEAM_WIDTHS) def test_beam_search_passes_multimodal_data( + monkeypatch, hf_runner, vllm_runner, dtype: str, @@ -125,6 +159,9 @@ def test_beam_search_passes_multimodal_data( beam_width: int, ) -> None: """Ensure that beam search passes multimodal data through correctly.""" + if current_platform.is_rocm(): + monkeypatch.setenv("VLLM_ROCM_USE_SKINNY_GEMM", "0") + # NOTE - this test is primarily to check that mm data is passed to beams # correctly. As such, we just need to check one extra modality to make # sure things pass through properly. @@ -145,7 +182,7 @@ def test_beam_search_passes_multimodal_data( audios=audios, ) - with vllm_runner(model, dtype=dtype) as vllm_model: + with vllm_runner(model, dtype=dtype, **EXTRA_ENGINE_KWARGS) as vllm_model: vllm_outputs = vllm_model.generate_beam_search( prompts, beam_width=beam_width,