[V0 Deprecation] Remove LLMEngine (#25033)
Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai>
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
@@ -69,28 +69,20 @@ def generate_params():

 @pytest.mark.parametrize("device, name, use_mla, block_size",
                          generate_params())
-@pytest.mark.parametrize("use_v1", [True, False])
 def test_env(
     device: str,
     name: str,
     use_mla: bool,
     block_size: int,
-    use_v1: bool,
     monkeypatch: pytest.MonkeyPatch,
 ):
     """Test attention backend selection with valid device-backend pairs."""
     with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
+        m.setenv("VLLM_USE_V1", "1")
         m.setenv(STR_BACKEND_ENV_VAR, name)
         m.setenv("VLLM_MLA_DISABLE", "1" if use_mla else "0")

-        if name == "FLASHINFER" and not use_v1:
-            pytest.skip("FlashInfer backend is only available on V1 engine")
-
         if device == "cpu":
-            if not use_v1:
-                pytest.skip("CPU backend only supports V1")
-
             with patch("vllm.attention.selector.current_platform",
                        CpuPlatform()):
                 backend = get_attn_backend(16, torch.float16, None, block_size,
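For orientation, here is a minimal, self-contained sketch of the selection pattern this hunk leaves in place: VLLM_USE_V1 is pinned to "1" instead of being parametrized, the platform is patched, and the backend name is read back. Only names visible in the diff are reused (get_attn_backend, CpuPlatform, the vllm.attention.selector.current_platform patch target); the import paths and the reading of the positional arguments as head size, dtype, KV-cache dtype, block size, and a final flag are assumptions, not taken from this change.

# Illustrative sketch only; argument meanings and import paths are assumed.
import pytest
import torch
from unittest.mock import patch

from vllm.attention.selector import get_attn_backend  # assumed import path
from vllm.platforms.cpu import CpuPlatform             # assumed import path


def selected_cpu_backend_name(monkeypatch: pytest.MonkeyPatch,
                              block_size: int = 16) -> str:
    """Return the attention backend name chosen for a CPU platform."""
    with monkeypatch.context() as m:
        # With V0 removed, the tests always run against the V1 engine.
        m.setenv("VLLM_USE_V1", "1")
        with patch("vllm.attention.selector.current_platform", CpuPlatform()):
            backend = get_attn_backend(16, torch.float16, None, block_size,
                                       False)
    return backend.get_name()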
@@ -137,7 +129,7 @@ def test_env(
                                                    block_size,
                                                    False,
                                                    use_mla=use_mla)
-                        expected = f"{name}_VLLM_V1" if use_v1 else name
+                        expected = f"{name}_VLLM_V1"
                         assert backend.get_name() == expected
                 else:
                     backend = get_attn_backend(16,
@@ -146,7 +138,7 @@ def test_env(
                                                block_size,
                                                False,
                                                use_mla=use_mla)
-                    expected = "TRITON_ATTN_VLLM_V1" if use_v1 else "ROCM_FLASH"
+                    expected = "TRITON_ATTN_VLLM_V1"
                     assert backend.get_name() == expected

         elif device == "cuda":
@@ -163,11 +155,7 @@ def test_env(
                     # - TRITON_MLA: fallback for other cases

                     if name == "CUTLASS_MLA":
-                        if not use_v1:
-                            # CUTLASS_MLA only supported on V1 engine
-                            pytest.skip(
-                                "CUTLASS_MLA only supported on V1 engine")
-                        elif block_size != 128:
+                        if block_size != 128:
                             # CUTLASS_MLA only supports block_size == 128
                             pytest.skip(
                                 "CUTLASS_MLA only supports block_size 128")
@@ -181,11 +169,7 @@ def test_env(
                             expected = "CUTLASS_MLA_VLLM_V1"
                             assert backend.get_name() == expected
                     elif name == "FLASHINFER_MLA":
-                        if not use_v1:
-                            # FlashInfer MLA only supported on V1 engine
-                            pytest.skip(
-                                "FlashInfer MLA only supported on V1 engine")
-                        elif block_size not in [32, 64]:
+                        if block_size not in [32, 64]:
                             # FlashInfer MLA only supports block_size 32 or 64
                             pytest.skip(
                                 "FlashInfer MLA only supports block_size 32 "
@@ -217,23 +201,17 @@ def test_env(
                                                        block_size,
                                                        False,
                                                        use_mla=use_mla)
-                            expected = f"{name}_VLLM_V1" if use_v1 else name
+                            expected = f"{name}_VLLM_V1"
                             assert backend.get_name() == expected
                     elif name == "FLASH_ATTN_MLA":
-                        if not use_v1:
-                            # FlashAttention MLA only supported on V1 engine
-                            pytest.skip(
-                                "FlashAttention MLA only supported on V1 engine"
-                            )
-                        else:
-                            backend = get_attn_backend(16,
-                                                       torch.float16,
-                                                       None,
-                                                       block_size,
-                                                       False,
-                                                       use_mla=use_mla)
-                            expected = "FLASH_ATTN_MLA"
-                            assert backend.get_name() == expected
+                        backend = get_attn_backend(16,
+                                                   torch.float16,
+                                                   None,
+                                                   block_size,
+                                                   False,
+                                                   use_mla=use_mla)
+                        expected = "FLASH_ATTN_MLA"
+                        assert backend.get_name() == expected
                     else:
                         # TRITON_MLA or other fallback
                         backend = get_attn_backend(16,
@@ -242,8 +220,7 @@ def test_env(
                                                    block_size,
                                                    False,
                                                    use_mla=use_mla)
-                        expected = ("TRITON_MLA_VLLM_V1"
-                                    if use_v1 else "TRITON_MLA")
+                        expected = "TRITON_MLA_VLLM_V1"
                         assert backend.get_name() == expected
                 elif name == "FLASHINFER":
                     backend = get_attn_backend(16,
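As a reading aid for the CUDA MLA hunks above (not part of the commit), the skip conditions and expected names they encode can be condensed into a small table: CUTLASS_MLA requires block_size 128, FLASHINFER_MLA requires block_size 32 or 64, and FLASH_ATTN_MLA and the TRITON_MLA fallback report "FLASH_ATTN_MLA" and "TRITON_MLA_VLLM_V1" respectively.

# Sketch summarizing the constraints enforced via pytest.skip in the hunks above.
MLA_BLOCK_SIZE_CONSTRAINTS = {
    "CUTLASS_MLA": {128},        # "CUTLASS_MLA only supports block_size 128"
    "FLASHINFER_MLA": {32, 64},  # "FlashInfer MLA only supports block_size 32 or 64"
    # FLASH_ATTN_MLA and the TRITON_MLA fallback carry no block-size skip here.
}


def mla_case_should_skip(name: str, block_size: int) -> bool:
    """Return True when the parametrized case would be skipped by test_env."""
    allowed = MLA_BLOCK_SIZE_CONSTRAINTS.get(name)
    return allowed is not None and block_size not in allowed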
@@ -252,7 +229,7 @@ def test_env(
                                                block_size,
                                                False,
                                                use_mla=use_mla)
-                    expected = "FLASHINFER_VLLM_V1" if use_v1 else name
+                    expected = "FLASHINFER_VLLM_V1"
                     assert backend.get_name() == expected
                 else:
                     backend = get_attn_backend(32,
@@ -261,36 +238,30 @@ def test_env(
                                                block_size,
                                                False,
                                                use_mla=use_mla)
-                    expected = "FLASH_ATTN_VLLM_V1" if use_v1 else name
+                    expected = "FLASH_ATTN_VLLM_V1"
                     assert backend.get_name() == expected

-                if use_v1:
-                    backend = get_attn_backend(16,
-                                               torch.float16,
-                                               None,
-                                               block_size,
-                                               False,
-                                               use_mla=use_mla)
-                    assert backend.get_name() == "FLEX_ATTENTION", (
-                        "Should fallback to FlexAttention if head size is "
-                        "not supported by FlashAttention")
+                backend = get_attn_backend(16,
+                                           torch.float16,
+                                           None,
+                                           block_size,
+                                           False,
+                                           use_mla=use_mla)
+                assert backend.get_name() == "FLEX_ATTENTION", (
+                    "Should fallback to FlexAttention if head size is "
+                    "not supported by FlashAttention")


 @pytest.mark.parametrize("device", ["cpu", "cuda"])
-@pytest.mark.parametrize("use_v1", [True, False])
 def test_fp32_fallback(
     device: str,
-    use_v1: bool,
     monkeypatch: pytest.MonkeyPatch,
 ):
     """Test attention backend selection with fp32."""
     with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
+        m.setenv("VLLM_USE_V1", "1")

         if device == "cpu":
-            if not use_v1:
-                pytest.skip("CPU backend only supports V1")
-
             with patch("vllm.attention.selector.current_platform",
                        CpuPlatform()):
                 backend = get_attn_backend(16, torch.float32, None, 16, False)
@@ -300,8 +271,7 @@ def test_fp32_fallback(
             with patch("vllm.attention.selector.current_platform",
                        CudaPlatform()):
                 backend = get_attn_backend(16, torch.float32, None, 16, False)
-                assert (backend.get_name() == "FLEX_ATTENTION"
-                        if use_v1 else "XFORMERS")
+                assert backend.get_name() == "FLEX_ATTENTION"


 def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
@@ -357,12 +327,11 @@ def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
         assert backend.get_name() != STR_FLASH_ATTN_VAL


-@pytest.mark.parametrize("use_v1", [True, False])
-def test_invalid_env(use_v1: bool, monkeypatch: pytest.MonkeyPatch):
+def test_invalid_env(monkeypatch: pytest.MonkeyPatch):
     """Test that invalid attention backend names raise ValueError."""
     with monkeypatch.context() as m, patch(
             "vllm.attention.selector.current_platform", CudaPlatform()):
-        m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
+        m.setenv("VLLM_USE_V1", "1")
         m.setenv(STR_BACKEND_ENV_VAR, STR_INVALID_VAL)

         # Should raise ValueError for invalid backend
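Putting the last hunk together, the simplified test_invalid_env reads roughly as below. Everything up to the comment is visible in the diff and uses the test module's own imports (pytest, patch, CudaPlatform, STR_BACKEND_ENV_VAR, STR_INVALID_VAL, torch, get_attn_backend); the final pytest.raises block is an assumed completion of the elided tail, inferred from the docstring and the "# Should raise ValueError" comment rather than taken from the commit.

# Reconstruction sketch; the pytest.raises tail is an assumption, not diff text.
def test_invalid_env(monkeypatch: pytest.MonkeyPatch):
    """Test that invalid attention backend names raise ValueError."""
    with monkeypatch.context() as m, patch(
            "vllm.attention.selector.current_platform", CudaPlatform()):
        m.setenv("VLLM_USE_V1", "1")
        m.setenv(STR_BACKEND_ENV_VAR, STR_INVALID_VAL)

        # Should raise ValueError for invalid backend
        with pytest.raises(ValueError):  # assumed: selection rejects the name
            get_attn_backend(16, torch.float16, None, 16, False)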