[V0 Deprecation] Remove VLLM_USE_V1 from tests (#26341)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
@@ -20,7 +20,6 @@ def test_cascade_attention(example_system_message, monkeypatch, attn_backend):
|
||||
)
|
||||
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
|
||||
|
||||
llm = LLM(model="Qwen/Qwen2-1.5B-Instruct")
|
||||
|
||||
@@ -32,7 +32,7 @@ model_config = {
|
||||
@pytest.mark.parametrize("seed", [1])
|
||||
@pytest.mark.parametrize("disable_hybrid_kv_cache_manager", [True, False])
|
||||
def test_sliding_window_retrieval(
|
||||
monkeypatch, model, batch_size, seed, disable_hybrid_kv_cache_manager
|
||||
model, batch_size, seed, disable_hybrid_kv_cache_manager
|
||||
):
|
||||
"""
|
||||
The test does a bunch of assignments "x1 = 10\nx2 = 33\n..." and then
|
||||
@@ -40,39 +40,34 @@ def test_sliding_window_retrieval(
|
||||
If we tell it upfront which we are going to be looking for, then
|
||||
it answers correctly (mostly).
|
||||
"""
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
test_config = model_config[model]
|
||||
|
||||
test_config = model_config[model]
|
||||
llm = LLM(
|
||||
model=model, disable_hybrid_kv_cache_manager=disable_hybrid_kv_cache_manager
|
||||
)
|
||||
sampling_params = SamplingParams(temperature=0.0, max_tokens=100)
|
||||
|
||||
llm = LLM(
|
||||
model=model, disable_hybrid_kv_cache_manager=disable_hybrid_kv_cache_manager
|
||||
)
|
||||
sampling_params = SamplingParams(temperature=0.0, max_tokens=100)
|
||||
prompts, answer, indices = prep_prompts(batch_size, ln_range=test_config.ln_range)
|
||||
|
||||
prompts, answer, indices = prep_prompts(
|
||||
batch_size, ln_range=test_config.ln_range
|
||||
)
|
||||
check_length(prompts, llm, test_config.sliding_window)
|
||||
|
||||
check_length(prompts, llm, test_config.sliding_window)
|
||||
# Fresh generation
|
||||
responses = llm.generate(prompts, sampling_params)
|
||||
check_answers(
|
||||
indices,
|
||||
answer,
|
||||
[response.outputs[0].text for response in responses],
|
||||
accept_rate=1.0,
|
||||
)
|
||||
|
||||
# Fresh generation
|
||||
responses = llm.generate(prompts, sampling_params)
|
||||
check_answers(
|
||||
indices,
|
||||
answer,
|
||||
[response.outputs[0].text for response in responses],
|
||||
accept_rate=1.0,
|
||||
)
|
||||
|
||||
# Re-generate with the same prompts to test prefix caching
|
||||
responses = llm.generate(prompts, sampling_params)
|
||||
check_answers(
|
||||
indices,
|
||||
answer,
|
||||
[response.outputs[0].text for response in responses],
|
||||
accept_rate=1.0,
|
||||
)
|
||||
# Re-generate with the same prompts to test prefix caching
|
||||
responses = llm.generate(prompts, sampling_params)
|
||||
check_answers(
|
||||
indices,
|
||||
answer,
|
||||
[response.outputs[0].text for response in responses],
|
||||
accept_rate=1.0,
|
||||
)
|
||||
|
||||
|
||||
def check_length(prompts: list[str], llm: LLM, sliding_window: int):
|
||||
|
||||
@@ -81,8 +81,6 @@ def test_kv_sharing_fast_prefill(
|
||||
)
|
||||
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
|
||||
# Make scheduling deterministic for reproducibility
|
||||
m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
|
||||
|
||||
|
||||
@@ -13,7 +13,6 @@ Covers:
|
||||
5) Multiple stop conditions
|
||||
"""
|
||||
|
||||
import os
|
||||
from typing import Optional, Union
|
||||
|
||||
import pytest
|
||||
@@ -161,9 +160,6 @@ MIN_TOKENS_TEST_CASES = [
|
||||
@pytest.fixture(scope="module")
|
||||
def llm_v1():
|
||||
"""Create V1 LLM instance for testing"""
|
||||
# Ensure V1 engine is used
|
||||
os.environ["VLLM_USE_V1"] = "1"
|
||||
|
||||
llm = LLM(
|
||||
model=TEST_MODEL,
|
||||
tensor_parallel_size=1,
|
||||
@@ -503,6 +499,6 @@ if __name__ == "__main__":
|
||||
|
||||
Usage:
|
||||
cd vllm/
|
||||
VLLM_USE_V1=1 python -m pytest tests/v1/e2e/test_min_tokens.py -v
|
||||
python -m pytest tests/v1/e2e/test_min_tokens.py -v
|
||||
"""
|
||||
pytest.main([__file__, "-v"])
|
||||
|
||||
@@ -301,7 +301,6 @@ def test_mtp_correctness(
|
||||
model_setup: (method, model_name, tp_size)
|
||||
"""
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
m.setenv("VLLM_MLA_DISABLE", "1")
|
||||
|
||||
method, model_name, tp_size = model_setup
|
||||
|
||||
Reference in New Issue
Block a user