VLLM_USE_TRITON_FLASH_ATTN V0 variable deprecation (#27611)

Signed-off-by: Andreas Karatzas <akaratza@amd.com>
Signed-off-by: Andreas Karatzas <Andreas.Karatzas@amd.com>
This commit is contained in:
Andreas Karatzas
2025-11-11 20:34:36 -06:00
committed by GitHub
parent 7f829be7d3
commit 9f0247cfa4
15 changed files with 12 additions and 1588 deletions

View File

@@ -5,7 +5,6 @@ image, embedding, and video support for different VLMs in vLLM.
"""
import math
import os
from collections import defaultdict
from pathlib import PosixPath
@@ -38,13 +37,6 @@ from .vlm_utils.types import (
VLMTestType,
)
# This hack is needed for phi3v & paligemma models
# ROCm Triton FA can run into shared memory issues with these models,
# use other backends in the meantime
# FIXME (mattwong, gshtrasb, hongxiayan)
if current_platform.is_rocm():
os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"
COMMON_BROADCAST_SETTINGS = {
"test_type": VLMTestType.IMAGE,
"dtype": "half",