Convert formatting to use ruff instead of yapf + isort (#26247)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
Harry Mellor
2025-10-05 15:06:22 +01:00
committed by GitHub
parent 17edd8a807
commit d6953beb91
1508 changed files with 115244 additions and 94146 deletions

View File

@@ -15,16 +15,18 @@ from vllm.utils import STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL, STR_INVALID_VAL
@pytest.fixture(autouse=True)
def clear_cache():
"""Clear lru cache to ensure each test case runs without caching.
"""
"""Clear lru cache to ensure each test case runs without caching."""
_cached_get_attn_backend.cache_clear()
# Define MLA and non-MLA backends separately
DEVICE_MLA_BACKENDS = {
"cuda": [
"TRITON_MLA", "FLASHMLA", "FLASHINFER_MLA", "FLASH_ATTN_MLA",
"CUTLASS_MLA"
"TRITON_MLA",
"FLASHMLA",
"FLASHINFER_MLA",
"FLASH_ATTN_MLA",
"CUTLASS_MLA",
],
"hip": ["TRITON_MLA", "ROCM_AITER_MLA"],
"cpu": [],
@@ -40,7 +42,7 @@ DEVICE_MLA_BLOCK_SIZES = {
"cuda": [16, 64], # CUDA supports both standard and extended block sizes
"hip": [16, 1], # HIP requires special handling for block_size=1
# "cpu": [16] # CPU uses fixed block size from test cases
"cpu": [] # FIXME(woosuk): Temporarily disable CPU tests
"cpu": [], # FIXME(woosuk): Temporarily disable CPU tests
}
@@ -48,12 +50,13 @@ def generate_params():
params = []
for use_mla in [True, False]:
for device in ["cuda", "hip", "cpu"]:
backends = DEVICE_MLA_BACKENDS[
device] if use_mla else DEVICE_REGULAR_ATTN_BACKENDS[device]
backends = (
DEVICE_MLA_BACKENDS[device]
if use_mla
else DEVICE_REGULAR_ATTN_BACKENDS[device]
)
for name in backends:
block_sizes = DEVICE_MLA_BLOCK_SIZES[device] if use_mla else [
16
]
block_sizes = DEVICE_MLA_BLOCK_SIZES[device] if use_mla else [16]
for block_size in block_sizes:
params.append(
pytest.param(
@@ -61,14 +64,13 @@ def generate_params():
name,
use_mla,
block_size,
id=
f"{device}_{name}_mla_{str(use_mla)[0]}_blks{block_size}"
))
id=f"{device}_{name}_mla_{str(use_mla)[0]}_blks{block_size}",
)
)
return params
@pytest.mark.parametrize("device, name, use_mla, block_size",
generate_params())
@pytest.mark.parametrize("device, name, use_mla, block_size", generate_params())
def test_env(
device: str,
name: str,
@@ -83,14 +85,12 @@ def test_env(
m.setenv("VLLM_MLA_DISABLE", "1" if use_mla else "0")
if device == "cpu":
with patch("vllm.attention.selector.current_platform",
CpuPlatform()):
with patch("vllm.attention.selector.current_platform", CpuPlatform()):
backend = get_attn_backend(16, torch.float16, None, block_size)
assert backend.get_name() == "TORCH_SDPA"
elif device == "hip":
with patch("vllm.attention.selector.current_platform",
RocmPlatform()):
with patch("vllm.attention.selector.current_platform", RocmPlatform()):
if use_mla:
# ROCm MLA backend logic:
# - TRITON_MLA: supported when block_size != 1
@@ -101,44 +101,33 @@ def test_env(
if name == "TRITON_MLA" and block_size == 1:
# TRITON_MLA doesn't support block_size == 1
with pytest.raises(ValueError) as exc_info:
get_attn_backend(16,
torch.float16,
None,
block_size,
use_mla=use_mla)
assert f"The selected backend, {name}" in str(
exc_info.value)
get_attn_backend(
16, torch.float16, None, block_size, use_mla=use_mla
)
assert f"The selected backend, {name}" in str(exc_info.value)
elif name == "ROCM_AITER_MLA" and block_size != 1:
# ROCM_AITER_MLA only supports block_size == 1
with pytest.raises(ValueError) as exc_info:
get_attn_backend(16,
torch.float16,
None,
block_size,
use_mla=use_mla)
assert f"The selected backend, {name}" in str(
exc_info.value)
get_attn_backend(
16, torch.float16, None, block_size, use_mla=use_mla
)
assert f"The selected backend, {name}" in str(exc_info.value)
else:
# Valid backend-block_size combination
backend = get_attn_backend(16,
torch.float16,
None,
block_size,
use_mla=use_mla)
backend = get_attn_backend(
16, torch.float16, None, block_size, use_mla=use_mla
)
expected = name
assert backend.get_name() == expected
else:
backend = get_attn_backend(16,
torch.float16,
None,
block_size,
use_mla=use_mla)
backend = get_attn_backend(
16, torch.float16, None, block_size, use_mla=use_mla
)
expected = "TRITON_ATTN"
assert backend.get_name() == expected
elif device == "cuda":
with patch("vllm.attention.selector.current_platform",
CudaPlatform()):
with patch("vllm.attention.selector.current_platform", CudaPlatform()):
if use_mla:
# CUDA MLA backend logic:
# - CUTLASS_MLA: only supported with block_size == 128
@@ -152,28 +141,23 @@ def test_env(
if name == "CUTLASS_MLA":
if block_size != 128:
# CUTLASS_MLA only supports block_size == 128
pytest.skip(
"CUTLASS_MLA only supports block_size 128")
pytest.skip("CUTLASS_MLA only supports block_size 128")
else:
backend = get_attn_backend(16,
torch.float16,
None,
block_size,
use_mla=use_mla)
backend = get_attn_backend(
16, torch.float16, None, block_size, use_mla=use_mla
)
expected = "CUTLASS_MLA"
assert backend.get_name() == expected
elif name == "FLASHINFER_MLA":
if block_size not in [32, 64]:
# FlashInfer MLA only supports block_size 32 or 64
pytest.skip(
"FlashInfer MLA only supports block_size 32 "
"or 64")
"FlashInfer MLA only supports block_size 32 or 64"
)
else:
backend = get_attn_backend(16,
torch.float16,
None,
block_size,
use_mla=use_mla)
backend = get_attn_backend(
16, torch.float16, None, block_size, use_mla=use_mla
)
expected = "FLASHINFER_MLA"
assert backend.get_name() == expected
elif name == "FLASHMLA":
@@ -182,58 +166,47 @@ def test_env(
pytest.skip("FlashMLA only supports block_size 64")
else:
from vllm.v1.attention.backends.mla.flashmla import ( # noqa: E501
is_flashmla_supported)
is_flashmla_supported,
)
is_supported, _ = is_flashmla_supported()
if not is_supported:
pytest.skip(
"FlashMLA not supported on this platform")
pytest.skip("FlashMLA not supported on this platform")
else:
backend = get_attn_backend(16,
torch.float16,
None,
block_size,
use_mla=use_mla)
backend = get_attn_backend(
16, torch.float16, None, block_size, use_mla=use_mla
)
expected = name
assert backend.get_name() == expected
elif name == "FLASH_ATTN_MLA":
backend = get_attn_backend(16,
torch.float16,
None,
block_size,
use_mla=use_mla)
backend = get_attn_backend(
16, torch.float16, None, block_size, use_mla=use_mla
)
expected = "FLASH_ATTN_MLA"
assert backend.get_name() == expected
else:
# TRITON_MLA or other fallback
backend = get_attn_backend(16,
torch.float16,
None,
block_size,
use_mla=use_mla)
backend = get_attn_backend(
16, torch.float16, None, block_size, use_mla=use_mla
)
expected = "TRITON_MLA"
assert backend.get_name() == expected
elif name == "FLASHINFER":
backend = get_attn_backend(16,
torch.float16,
None,
block_size,
use_mla=use_mla)
backend = get_attn_backend(
16, torch.float16, None, block_size, use_mla=use_mla
)
expected = "FLASHINFER"
assert backend.get_name() == expected
elif name == "XFORMERS":
backend = get_attn_backend(32,
torch.float16,
None,
block_size,
use_mla=use_mla)
backend = get_attn_backend(
32, torch.float16, None, block_size, use_mla=use_mla
)
expected = "XFORMERS"
assert backend.get_name() == expected
elif name == "FLASH_ATTN":
backend = get_attn_backend(32,
torch.float16,
None,
block_size,
use_mla=use_mla)
backend = get_attn_backend(
32, torch.float16, None, block_size, use_mla=use_mla
)
expected = "FLASH_ATTN"
assert backend.get_name() == expected
@@ -248,14 +221,12 @@ def test_fp32_fallback(
m.setenv("VLLM_USE_V1", "1")
if device == "cpu":
with patch("vllm.attention.selector.current_platform",
CpuPlatform()):
with patch("vllm.attention.selector.current_platform", CpuPlatform()):
backend = get_attn_backend(16, torch.float32, None, 16)
assert backend.get_name() == "TORCH_SDPA"
elif device == "cuda":
with patch("vllm.attention.selector.current_platform",
CudaPlatform()):
with patch("vllm.attention.selector.current_platform", CudaPlatform()):
backend = get_attn_backend(16, torch.float32, None, 16)
assert backend.get_name() == "FLEX_ATTENTION"
@@ -265,16 +236,16 @@ def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
# TODO: When testing for v1, pipe in `use_v1` as an argument to
# get_attn_backend
pytest.skip("Skipping as current backend selector does not " \
"handle fallbacks when a backend is set via env var.")
pytest.skip(
"Skipping as current backend selector does not "
"handle fallbacks when a backend is set via env var."
)
with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL)
# Unsupported CUDA arch
monkeypatch.setattr(torch.cuda,
"get_device_capability",
lambda _=None: (7, 5))
monkeypatch.setattr(torch.cuda, "get_device_capability", lambda _=None: (7, 5))
backend = get_attn_backend(16, torch.float16, None, 16)
assert backend.get_name() != STR_FLASH_ATTN_VAL
@@ -295,17 +266,17 @@ def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
# flash-attn is not installed
import sys
original_module = sys.modules.get('vllm_flash_attn')
monkeypatch.setitem(sys.modules, 'vllm_flash_attn', None)
original_module = sys.modules.get("vllm_flash_attn")
monkeypatch.setitem(sys.modules, "vllm_flash_attn", None)
backend = get_attn_backend(16, torch.float16, None, 16)
assert backend.get_name() != STR_FLASH_ATTN_VAL
# Restore the original module if it existed
if original_module is not None:
monkeypatch.setitem(sys.modules, 'vllm_flash_attn',
original_module)
monkeypatch.setitem(sys.modules, "vllm_flash_attn", original_module)
else:
monkeypatch.delitem(sys.modules, 'vllm_flash_attn', raising=False)
monkeypatch.delitem(sys.modules, "vllm_flash_attn", raising=False)
# Unsupported head size
backend = get_attn_backend(17, torch.float16, None, 16)
@@ -314,8 +285,10 @@ def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
def test_invalid_env(monkeypatch: pytest.MonkeyPatch):
"""Test that invalid attention backend names raise ValueError."""
with monkeypatch.context() as m, patch(
"vllm.attention.selector.current_platform", CudaPlatform()):
with (
monkeypatch.context() as m,
patch("vllm.attention.selector.current_platform", CudaPlatform()),
):
m.setenv("VLLM_USE_V1", "1")
m.setenv(STR_BACKEND_ENV_VAR, STR_INVALID_VAL)