diff --git a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py index a22abe73e..fad5f593b 100644 --- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py +++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py @@ -13,9 +13,10 @@ import os from contextlib import contextmanager import lm_eval -import numpy as np import yaml +from vllm.platforms import current_platform + DEFAULT_RTOL = 0.08 @@ -63,6 +64,9 @@ def launch_lm_eval(eval_config, tp_size): "allow_deprecated_quantization=True," ) + if current_platform.is_rocm() and "Nemotron-3" in eval_config["model_name"]: + model_args += "attention_backend=TRITON_ATTN" + env_vars = eval_config.get("env_vars", None) with scoped_env_vars(env_vars): results = lm_eval.simple_evaluate( @@ -102,6 +106,8 @@ def test_lm_eval_correctness_param(config_filename, tp_size): f"ground_truth={ground_truth:.3f} | " f"measured={measured_value:.3f} | rtol={rtol}" ) - success = success and np.isclose(ground_truth, measured_value, rtol=rtol) + + min_acceptable = ground_truth * (1 - rtol) + success = success and measured_value >= min_acceptable assert success diff --git a/.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh b/.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh index e875ac466..d587f26ae 100644 --- a/.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh +++ b/.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh @@ -24,7 +24,7 @@ if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH: BACKENDS=("allgather_reducescatter") # Disable MOE padding for ROCm since it is causing eplb to fail export VLLM_ROCM_MOE_PADDING=0 - PLATFORM_ARGS=("--no-async-scheduling") + PLATFORM_ARGS=("--no-async-scheduling" "--attention-backend=TRITON_ATTN") echo "Disabled async scheduling for ROCm platform due to issues with spec decode." else # Non-ROCm platform (CUDA/other) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index ad11f3764..9e10a00db 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -529,7 +529,7 @@ steps: commands: - pip install tensorizer # for tensorizer test # for basic - - python3 basic/offline_inference/chat.py + - python3 basic/offline_inference/chat.py --attention-backend TRITON_ATTN - python3 basic/offline_inference/generate.py --model facebook/opt-125m - python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 - python3 basic/offline_inference/classify.py @@ -2208,7 +2208,7 @@ steps: commands: - pip install tensorizer # for tensorizer test # for basic - - python3 basic/offline_inference/chat.py + - python3 basic/offline_inference/chat.py --attention-backend TRITON_ATTN - python3 basic/offline_inference/generate.py --model facebook/opt-125m - python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 - python3 basic/offline_inference/classify.py diff --git a/tests/entrypoints/openai/test_tensorizer_entrypoint.py b/tests/entrypoints/openai/test_tensorizer_entrypoint.py index 3cb64d50a..9ac9106db 100644 --- a/tests/entrypoints/openai/test_tensorizer_entrypoint.py +++ b/tests/entrypoints/openai/test_tensorizer_entrypoint.py @@ -15,6 +15,7 @@ from vllm.model_executor.model_loader.tensorizer import ( tensorize_lora_adapter, tensorize_vllm_model, ) +from vllm.platforms import current_platform from ...utils import RemoteOpenAIServer @@ -74,6 +75,8 @@ def server(model_uri, tensorize_model_and_lora): MODEL_NAME, "--enable-lora", ] + if current_platform.is_rocm(): + args += ["--attention-backend", "TRITON_ATTN"] model_dir = os.path.dirname(model_uri) with RemoteOpenAIServer(model_dir, args) as remote_server: diff --git a/tests/models/language/pooling_mteb_test/test_gte.py b/tests/models/language/pooling_mteb_test/test_gte.py index f87fd832a..0c35d66c3 100644 --- a/tests/models/language/pooling_mteb_test/test_gte.py +++ b/tests/models/language/pooling_mteb_test/test_gte.py @@ -8,6 +8,7 @@ from tests.models.utils import ( EmbedModelInfo, RerankModelInfo, ) +from vllm.platforms import current_platform from .mteb_embed_utils import mteb_test_embed_models from .mteb_score_utils import mteb_test_rerank_models @@ -142,4 +143,9 @@ def test_embed_models_correctness( @pytest.mark.parametrize("model_info", RERANK_MODELS) def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None: - mteb_test_rerank_models(vllm_runner, model_info) + vllm_extra_kwargs = {} + if current_platform.is_rocm(): + vllm_extra_kwargs["attention_backend"] = "TRITON_ATTN" + mteb_test_rerank_models( + vllm_runner, model_info, vllm_extra_kwargs=vllm_extra_kwargs + ) diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py index c4b82b93e..979aa96af 100644 --- a/tests/models/multimodal/generation/test_common.py +++ b/tests/models/multimodal/generation/test_common.py @@ -173,6 +173,9 @@ VLM_TEST_SETTINGS = { marks=[ pytest.mark.core_model, ], + vllm_runner_kwargs={"attention_backend": "TRITON_ATTN"} + if current_platform.is_rocm() + else {}, ), "ultravox": VLMTestInfo( models=["fixie-ai/ultravox-v0_5-llama-3_2-1b"], diff --git a/tests/test_regression.py b/tests/test_regression.py index 2fc0308ff..ac82206f7 100644 --- a/tests/test_regression.py +++ b/tests/test_regression.py @@ -13,6 +13,7 @@ import pytest import torch from vllm import LLM, SamplingParams +from vllm.platforms import current_platform @pytest.mark.skip(reason="In V1, we reject tokens > max_seq_len") @@ -65,7 +66,8 @@ def test_model_from_modelscope(monkeypatch: pytest.MonkeyPatch): # Don't use HF_TOKEN for ModelScope repos, otherwise it will fail # with 400 Client Error: Bad Request. m.setenv("HF_TOKEN", "") - llm = LLM(model="qwen/Qwen1.5-0.5B-Chat") + attn_backend = "TRITON_ATTN" if current_platform.is_rocm() else "auto" + llm = LLM(model="qwen/Qwen1.5-0.5B-Chat", attention_backend=attn_backend) prompts = [ "Hello, my name is", diff --git a/tests/v1/e2e/test_kv_sharing_fast_prefill.py b/tests/v1/e2e/test_kv_sharing_fast_prefill.py index f895fb72e..92b4d4532 100644 --- a/tests/v1/e2e/test_kv_sharing_fast_prefill.py +++ b/tests/v1/e2e/test_kv_sharing_fast_prefill.py @@ -91,6 +91,7 @@ def test_kv_sharing_fast_prefill( compilation_config=compilation_config, seed=SEED, kv_sharing_fast_prefill=kv_sharing_fast_prefill, + attention_backend="TRITON_ATTN", ) responses = llm.generate(prompts, sampling_params) check_answers( diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py index 3988070ca..8fdca83a2 100644 --- a/tests/v1/e2e/test_spec_decode.py +++ b/tests/v1/e2e/test_spec_decode.py @@ -732,11 +732,13 @@ def test_mtp_correctness( method, model_name, tp_size = model_setup _skip_if_insufficient_gpus_for_tp(tp_size) + attn_backend = "TRITON_ATTN" if current_platform.is_rocm() else "auto" ref_llm = LLM( model=model_name, max_model_len=2048, tensor_parallel_size=tp_size, trust_remote_code=True, + attention_backend=attn_backend, ) ref_outputs = ref_llm.chat(test_prompts, sampling_config) evaluate_llm_for_gsm8k( @@ -756,6 +758,7 @@ def test_mtp_correctness( "max_model_len": 2048, }, max_model_len=2048, + attention_backend=attn_backend, ) evaluate_llm_for_gsm8k( spec_llm, expected_accuracy_threshold=expected_accuracy_threshold diff --git a/tests/v1/sample/test_logprobs.py b/tests/v1/sample/test_logprobs.py index df2fac85e..d029a6ce0 100644 --- a/tests/v1/sample/test_logprobs.py +++ b/tests/v1/sample/test_logprobs.py @@ -42,9 +42,7 @@ SAMPLE_PROMPT = BatchLogprobsComposition.SAMPLE_PROMPT # Force LLM instances into an identical, deterministic execution # mode so the test isolates spec-decode correctness only: ROCM_DETERMINISM_KWARGS: dict = ( - dict( - max_num_seqs=1, - ) + dict(max_num_seqs=1, attention_backend="TRITON_ATTN") if current_platform.is_rocm() else {} )