[ROCm][CI][Bugfix] Fixing the Multi-Modal Models Test (Extended) 1 group (#30013)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
This commit is contained in:
@@ -2,6 +2,8 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Pytest configuration for vLLM tests."""
|
||||
|
||||
import warnings
|
||||
|
||||
import torch
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
@@ -14,6 +16,20 @@ def pytest_configure(config):
|
||||
if not current_platform.is_rocm():
|
||||
return
|
||||
|
||||
skip_patterns = ["test_granite_speech.py"]
|
||||
if any(pattern in str(arg) for arg in config.args for pattern in skip_patterns):
|
||||
# Skip disabling SDP for Granite Speech tests on ROCm
|
||||
return
|
||||
|
||||
# Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers
|
||||
# accuracy issues
|
||||
# TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace
|
||||
torch.backends.cuda.enable_flash_sdp(False)
|
||||
torch.backends.cuda.enable_mem_efficient_sdp(False)
|
||||
torch.backends.cuda.enable_math_sdp(True)
|
||||
warnings.warn(
|
||||
"ROCm: Disabled flash_sdp and mem_efficient_sdp, enabled math_sdp "
|
||||
"to avoid HuggingFace Transformers accuracy issues",
|
||||
UserWarning,
|
||||
stacklevel=1,
|
||||
)
|
||||
|
||||
@@ -403,12 +403,13 @@ VLM_TEST_SETTINGS = {
|
||||
# So, we need to reduce the number of tokens for the test to pass.
|
||||
max_tokens=8,
|
||||
num_logprobs=10,
|
||||
auto_cls=AutoModelForCausalLM,
|
||||
marks=[large_gpu_mark(min_gb=32)],
|
||||
),
|
||||
"glm4_1v": VLMTestInfo(
|
||||
models=["zai-org/GLM-4.1V-9B-Thinking"],
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||
prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>",
|
||||
prompt_formatter=lambda img_prompt: f"[gMASK]<|user|>\n{img_prompt}<|assistant|>\n", # noqa: E501
|
||||
img_idx_to_prompt=lambda idx: "<|begin_of_image|><|image|><|end_of_image|>",
|
||||
video_idx_to_prompt=lambda idx: "<|begin_of_video|><|video|><|end_of_video|>",
|
||||
max_model_len=2048,
|
||||
@@ -423,6 +424,7 @@ VLM_TEST_SETTINGS = {
|
||||
models=["zai-org/GLM-4.1V-9B-Thinking"],
|
||||
# GLM4.1V requires video metadata to be included in the input
|
||||
test_type=VLMTestType.CUSTOM_INPUTS,
|
||||
prompt_formatter=lambda vid_prompt: f"[gMASK]<|user|>\n{vid_prompt}<|assistant|>\n", # noqa: E501
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
@@ -737,7 +739,13 @@ VLM_TEST_SETTINGS = {
|
||||
max_model_len=8192,
|
||||
max_num_seqs=2,
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
marks=[large_gpu_mark(min_gb=48)],
|
||||
marks=[
|
||||
large_gpu_mark(min_gb=48),
|
||||
pytest.mark.skipif(
|
||||
current_platform.is_rocm(),
|
||||
reason="Model produces a vector of <UNK> output in HF on ROCm",
|
||||
),
|
||||
],
|
||||
),
|
||||
"qwen_vl": VLMTestInfo(
|
||||
models=["Qwen/Qwen-VL"],
|
||||
|
||||
@@ -8,6 +8,7 @@ from transformers import AutoModelForSpeechSeq2Seq
|
||||
|
||||
from vllm.logprobs import SampleLogprobs
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
from ....conftest import AudioTestAssets, HfRunner, PromptAudioInput, VllmRunner
|
||||
from ...registry import HF_EXAMPLE_MODELS
|
||||
@@ -34,6 +35,12 @@ audio_lora_path = MODEL_NAME
|
||||
models = [MODEL_NAME]
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def set_attention_backend_for_rocm(monkeypatch):
    """Select the Triton attention backend for every test when on ROCm.

    Declared with ``autouse=True``, so it is applied without being requested
    explicitly. On non-ROCm platforms the environment is left untouched.

    Args:
        monkeypatch: pytest's ``monkeypatch`` fixture, used to set the
            ``VLLM_ATTENTION_BACKEND`` environment variable.
    """
    # Nothing to override unless the current platform reports ROCm.
    if not current_platform.is_rocm():
        return

    monkeypatch.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN")
|
||||
|
||||
|
||||
def run_test(
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
@@ -111,8 +118,12 @@ def run_test(
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", models)
|
||||
@pytest.mark.parametrize("dtype", ["bfloat16"])
|
||||
@pytest.mark.parametrize("max_model_len", [2048])
|
||||
@pytest.mark.parametrize(
|
||||
"dtype", ["float16"] if current_platform.is_rocm() else ["bfloat16"]
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"max_model_len", [512] if current_platform.is_rocm() else [2048]
|
||||
)
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [10])
|
||||
def test_models(
|
||||
|
||||
@@ -15,6 +15,7 @@ from transformers import AutoProcessor
|
||||
from vllm import SamplingParams, TextPrompt, TokensPrompt
|
||||
from vllm.logprobs import Logprob, SampleLogprobs
|
||||
from vllm.multimodal import MultiModalDataBuiltins
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
from ....utils import VLLM_PATH, large_gpu_test
|
||||
from ...utils import check_logprobs_close
|
||||
@@ -165,6 +166,15 @@ def load_outputs_w_logprobs(filename: "StrPath") -> OutputsLogprobs:
|
||||
def test_chat(
|
||||
vllm_runner, max_model_len: int, model: str, dtype: str, local_asset_server
|
||||
) -> None:
|
||||
if (
|
||||
model == MISTRAL_SMALL_3_1_ID
|
||||
and max_model_len == 65536
|
||||
and current_platform.is_rocm()
|
||||
):
|
||||
pytest.skip(
|
||||
"OOM on ROCm: 24B model with 65536 context length exceeds GPU memory"
|
||||
)
|
||||
|
||||
EXPECTED_CHAT_LOGPROBS = load_outputs_w_logprobs(FIXTURE_LOGPROBS_CHAT[model])
|
||||
with vllm_runner(
|
||||
model,
|
||||
|
||||
@@ -140,7 +140,7 @@ def video_with_metadata_glm4_1v():
|
||||
metadata = VIDEO_ASSETS[0].metadata
|
||||
question = "Describe the video."
|
||||
video_prompt = "<|begin_of_video|><|video|><|end_of_video|>"
|
||||
formatted_prompt = f"<|user|>\n{video_prompt}{question}<|assistant|>\n"
|
||||
formatted_prompt = f"[gMASK]<|user|>\n{video_prompt}{question}<|assistant|>\n"
|
||||
|
||||
scales = [0.1, 0.2, 0.25]
|
||||
video_input = [
|
||||
|
||||
@@ -25,6 +25,7 @@ from transformers import (
|
||||
from transformers.video_utils import VideoMetadata
|
||||
|
||||
from vllm.logprobs import SampleLogprobs
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils.collection_utils import is_list_of
|
||||
|
||||
from .....conftest import HfRunner, ImageAsset, ImageTestAssets
|
||||
@@ -366,6 +367,40 @@ def gemma3_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOut
|
||||
|
||||
def glm4v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
"""Patches and returns an instance of the HfRunner to use for GLM4V."""
|
||||
if current_platform.is_rocm():
|
||||
import types
|
||||
|
||||
config = hf_model.model.config
|
||||
if hasattr(config, "num_layers") and not hasattr(config, "num_hidden_layers"):
|
||||
config.num_hidden_layers = config.num_layers
|
||||
config.output_hidden_states = True
|
||||
|
||||
def patched_prepare_cache(
|
||||
self, generation_config, model_kwargs, *args, **kwargs
|
||||
):
|
||||
model_kwargs["past_key_values"] = None
|
||||
model_kwargs["use_cache"] = False
|
||||
return model_kwargs
|
||||
|
||||
hf_model.model._prepare_cache_for_generation = types.MethodType(
|
||||
patched_prepare_cache, hf_model.model
|
||||
)
|
||||
original_generate = hf_model.model.generate
|
||||
|
||||
def patched_generate(*args, **kwargs):
|
||||
kwargs["output_hidden_states"] = True
|
||||
kwargs["return_dict_in_generate"] = True
|
||||
return original_generate(*args, **kwargs)
|
||||
|
||||
hf_model.model.generate = patched_generate
|
||||
original_forward = hf_model.model.forward
|
||||
|
||||
def patched_forward(*args, **kwargs):
|
||||
kwargs["output_hidden_states"] = True
|
||||
return original_forward(*args, **kwargs)
|
||||
|
||||
hf_model.model.forward = patched_forward
|
||||
|
||||
hf_processor = hf_model.processor
|
||||
|
||||
def processor(*args, text="", images=None, **kwargs):
|
||||
@@ -406,7 +441,15 @@ def glm4_1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
if videos is not None and is_list_of(videos, tuple):
|
||||
# If videos is a list of tuples, we assume each tuple contains
|
||||
# (video_array, metadata) as in the case of GLM4.1V.
|
||||
video_metadata = [[VideoMetadata(**video[1])] for video in videos]
|
||||
# Filter out 'do_sample_frames' as it's not a valid VideoMetadata arg
|
||||
video_metadata = [
|
||||
[
|
||||
VideoMetadata(
|
||||
**{k: v for k, v in video[1].items() if k != "do_sample_frames"}
|
||||
)
|
||||
]
|
||||
for video in videos
|
||||
]
|
||||
videos = [[video[0]] for video in videos]
|
||||
else:
|
||||
video_metadata = None
|
||||
|
||||
Reference in New Issue
Block a user