[ROCm][CI] Prep Tests For Change To ROCM_ATTN As New Default Backend On ROCm (#36025)
Signed-off-by: Micah Williamson <micah.williamson@amd.com>
This commit is contained in:
@@ -13,9 +13,10 @@ import os
|
||||
from contextlib import contextmanager
|
||||
|
||||
import lm_eval
|
||||
import numpy as np
|
||||
import yaml
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
DEFAULT_RTOL = 0.08
|
||||
|
||||
|
||||
@@ -63,6 +64,9 @@ def launch_lm_eval(eval_config, tp_size):
|
||||
"allow_deprecated_quantization=True,"
|
||||
)
|
||||
|
||||
if current_platform.is_rocm() and "Nemotron-3" in eval_config["model_name"]:
|
||||
model_args += "attention_backend=TRITON_ATTN"
|
||||
|
||||
env_vars = eval_config.get("env_vars", None)
|
||||
with scoped_env_vars(env_vars):
|
||||
results = lm_eval.simple_evaluate(
|
||||
@@ -102,6 +106,8 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
|
||||
f"ground_truth={ground_truth:.3f} | "
|
||||
f"measured={measured_value:.3f} | rtol={rtol}"
|
||||
)
|
||||
success = success and np.isclose(ground_truth, measured_value, rtol=rtol)
|
||||
|
||||
min_acceptable = ground_truth * (1 - rtol)
|
||||
success = success and measured_value >= min_acceptable
|
||||
|
||||
assert success
|
||||
|
||||
@@ -24,7 +24,7 @@ if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:
|
||||
BACKENDS=("allgather_reducescatter")
|
||||
# Disable MOE padding for ROCm since it is causing eplb to fail
|
||||
export VLLM_ROCM_MOE_PADDING=0
|
||||
PLATFORM_ARGS=("--no-async-scheduling")
|
||||
PLATFORM_ARGS=("--no-async-scheduling" "--attention-backend=TRITON_ATTN")
|
||||
echo "Disabled async scheduling for ROCm platform due to issues with spec decode."
|
||||
else
|
||||
# Non-ROCm platform (CUDA/other)
|
||||
|
||||
@@ -529,7 +529,7 @@ steps:
|
||||
commands:
|
||||
- pip install tensorizer # for tensorizer test
|
||||
# for basic
|
||||
- python3 basic/offline_inference/chat.py
|
||||
- python3 basic/offline_inference/chat.py --attention-backend TRITON_ATTN
|
||||
- python3 basic/offline_inference/generate.py --model facebook/opt-125m
|
||||
- python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
|
||||
- python3 basic/offline_inference/classify.py
|
||||
@@ -2208,7 +2208,7 @@ steps:
|
||||
commands:
|
||||
- pip install tensorizer # for tensorizer test
|
||||
# for basic
|
||||
- python3 basic/offline_inference/chat.py
|
||||
- python3 basic/offline_inference/chat.py --attention-backend TRITON_ATTN
|
||||
- python3 basic/offline_inference/generate.py --model facebook/opt-125m
|
||||
- python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
|
||||
- python3 basic/offline_inference/classify.py
|
||||
|
||||
Reference in New Issue
Block a user