[V0 Deprecation] Remove VLLM_USE_V1 from tests (#26341)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
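Every hunk in this commit applies the same mechanical change: the tests no longer pin VLLM_USE_V1=1 through pytest's monkeypatch before constructing the runner, and the now-unused monkeypatch parameter is dropped from the test signatures, since the flag is being retired as part of the V0 deprecation. For context, a minimal, self-contained sketch of the pinning pattern being removed (hypothetical test, not part of this diff):

import os

import pytest


def test_env_pin_pattern(monkeypatch: pytest.MonkeyPatch) -> None:
    # The pattern removed by this commit: pin an env var for a scoped block.
    monkeypatch.delenv("VLLM_USE_V1", raising=False)
    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_V1", "1")
        assert os.environ["VLLM_USE_V1"] == "1"
    # Once the context exits, the pinned value is rolled back automatically.
    assert "VLLM_USE_V1" not in os.environ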
@@ -42,7 +42,6 @@ MAX_NUM_REQS = [16, 1024]
 @pytest.mark.parametrize("max_num_seqs", MAX_NUM_REQS)
 def test_basic(
     vllm_runner: type[VllmRunner],
-    monkeypatch: pytest.MonkeyPatch,
     model: str,
     max_tokens: int,
     tensor_parallel_size: int,
@@ -55,23 +54,20 @@ def test_basic(
     )
     example_prompts = [prompt]
 
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-
-        with vllm_runner(
-            model,
-            # Note: max_num_batched_tokens == 1024 is needed here to
-            # actually test chunked prompt
-            max_num_batched_tokens=1024,
-            max_model_len=8192,
-            gpu_memory_utilization=0.7,
-            max_num_seqs=max_num_seqs,
-            tensor_parallel_size=tensor_parallel_size,
-        ) as vllm_model:
-            vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-            output = vllm_outputs[0][1]
+    with vllm_runner(
+        model,
+        # Note: max_num_batched_tokens == 1024 is needed here to
+        # actually test chunked prompt
+        max_num_batched_tokens=1024,
+        max_model_len=8192,
+        gpu_memory_utilization=0.7,
+        max_num_seqs=max_num_seqs,
+        tensor_parallel_size=tensor_parallel_size,
+    ) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+        output = vllm_outputs[0][1]
 
-        assert "1024" in output or "0, 1" in output
+    assert "1024" in output or "0, 1" in output
 
 
 @pytest.mark.skip(reason="Temporarily disabled due to timeout")
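The in-line comment about max_num_batched_tokens == 1024 carries the key reasoning for this test: chunked prefill is only exercised when the prompt is longer than the per-step token budget, so the budget is deliberately kept small. A toy calculation with assumed numbers (the real prompt length depends on the tokenizer, so these values are illustrative only):

import math

# Assumed values for illustration: a prompt that enumerates a long number
# sequence easily exceeds a 1024-token per-step budget.
prompt_len_tokens = 5000
max_num_batched_tokens = 1024

# The prefill is then split across multiple scheduler steps ("chunks").
prefill_chunks = math.ceil(prompt_len_tokens / max_num_batched_tokens)
print(prefill_chunks)  # -> 5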
@@ -82,7 +78,6 @@ def test_basic(
 @pytest.mark.parametrize("max_num_seqs", [16])
 def test_phi3(
     vllm_runner: type[VllmRunner],
-    monkeypatch: pytest.MonkeyPatch,
     max_tokens: int,
     max_num_seqs: int,
 ) -> None:
@@ -99,18 +94,15 @@ def test_phi3(
     # test head dim = 96
     model = "microsoft/Phi-3-mini-128k-instruct"
 
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-
-        with vllm_runner(
-            model, max_num_batched_tokens=256, max_num_seqs=max_num_seqs
-        ) as vllm_model:
-            vllm_outputs = vllm_model.generate_greedy(prompts, max_tokens)
-            # vllm_outputs is a list of tuples whose first element is the token id
-            # and the second element is the output (including the prompt).
-            for output, answer in zip(vllm_outputs, answers):
-                generated_text = output[1]
-                assert answer in generated_text
+    with vllm_runner(
+        model, max_num_batched_tokens=256, max_num_seqs=max_num_seqs
+    ) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(prompts, max_tokens)
+        # vllm_outputs is a list of tuples whose first element is the token id
+        # and the second element is the output (including the prompt).
+        for output, answer in zip(vllm_outputs, answers):
+            generated_text = output[1]
+            assert answer in generated_text
 
 
 TP_SIZE_8 = 8
@@ -123,7 +115,6 @@ TP_SIZE_8 = 8
 )
 def test_gemma3_27b_with_text_input_and_tp(
     vllm_runner: type[VllmRunner],
-    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
     model = "google/gemma-3-27b-it"
     max_tokens = 16
@@ -140,21 +131,18 @@ def test_gemma3_27b_with_text_input_and_tp(
         " but in rising every time we fall.",
     ]
 
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-
-        with vllm_runner(
-            model,
-            max_num_batched_tokens=256,
-            max_num_seqs=max_num_seqs,
-            tensor_parallel_size=tensor_parallel_size,
-        ) as vllm_model:
-            vllm_outputs = vllm_model.generate_greedy(prompts, max_tokens)
-            # vllm_outputs is a list of tuples whose first element is the token id
-            # and the second element is the output (including the prompt).
-            for output, answer in zip(vllm_outputs, answers):
-                generated_text = output[1]
-                assert answer in generated_text
+    with vllm_runner(
+        model,
+        max_num_batched_tokens=256,
+        max_num_seqs=max_num_seqs,
+        tensor_parallel_size=tensor_parallel_size,
+    ) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(prompts, max_tokens)
+        # vllm_outputs is a list of tuples whose first element is the token id
+        # and the second element is the output (including the prompt).
+        for output, answer in zip(vllm_outputs, answers):
+            generated_text = output[1]
+            assert answer in generated_text
 
 
 @pytest.mark.skipif(
@@ -162,7 +150,6 @@ def test_gemma3_27b_with_text_input_and_tp(
 )
 def test_w8a8_quantization(
     vllm_runner: type[VllmRunner],
-    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
     model = "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"
     max_tokens = 5
@@ -176,18 +163,15 @@ def test_w8a8_quantization(
     )
     example_prompts = [prompt]
 
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-
-        with vllm_runner(
-            model,
-            max_num_batched_tokens=64,
-            max_model_len=4096,
-            gpu_memory_utilization=0.7,
-            max_num_seqs=max_num_seqs,
-            tensor_parallel_size=tensor_parallel_size,
-        ) as vllm_model:
-            vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-            output = vllm_outputs[0][1]
+    with vllm_runner(
+        model,
+        max_num_batched_tokens=64,
+        max_model_len=4096,
+        gpu_memory_utilization=0.7,
+        max_num_seqs=max_num_seqs,
+        tensor_parallel_size=tensor_parallel_size,
+    ) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+        output = vllm_outputs[0][1]
 
-        assert "1024" in output or "0, 1" in output
+    assert "1024" in output or "0, 1" in output
@@ -86,7 +86,6 @@ GPU_UTIL = 0.9
 @pytest.mark.parametrize("params", TEST_PARAMS)
 def test_perf(
     vllm_runner: type[VllmRunner],
-    monkeypatch: pytest.MonkeyPatch,
     params: TestParams,
 ) -> None:
     tokenizer = get_tokenizer(
@@ -107,48 +106,45 @@ def test_perf(
         )
     )
 
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-
-        sampling_params = SamplingParams(
-            max_tokens=params.decode_len, temperature=1.0, min_p=0.0
-        )
+    sampling_params = SamplingParams(
+        max_tokens=params.decode_len, temperature=1.0, min_p=0.0
+    )
 
-        with vllm_runner(
-            params.model,
-            max_num_batched_tokens=MAX_MODEL_LEN,
-            max_model_len=MAX_MODEL_LEN,
-            max_num_seqs=MAX_NUM_SEQS,
-            gpu_memory_utilization=GPU_UTIL,
-            enforce_eager=False,
-            tensor_parallel_size=1,
-        ) as vllm_model:
-            print(" -- Warmup / Compile")
-            for i in range(NUM_WARMUPS):
-                _ = vllm_model.generate(prompts, sampling_params)
+    with vllm_runner(
+        params.model,
+        max_num_batched_tokens=MAX_MODEL_LEN,
+        max_model_len=MAX_MODEL_LEN,
+        max_num_seqs=MAX_NUM_SEQS,
+        gpu_memory_utilization=GPU_UTIL,
+        enforce_eager=False,
+        tensor_parallel_size=1,
+    ) as vllm_model:
+        print(" -- Warmup / Compile")
+        for i in range(NUM_WARMUPS):
+            _ = vllm_model.generate(prompts, sampling_params)
 
-            print(" -- Benchmarking... ")
-            times = []
-            for i in range(NUM_RUNS):
-                start_time = time.time()
-                _ = vllm_model.generate(prompts, sampling_params)
-                times.append(time.time() - start_time)
+        print(" -- Benchmarking... ")
+        times = []
+        for i in range(NUM_RUNS):
+            start_time = time.time()
+            _ = vllm_model.generate(prompts, sampling_params)
+            times.append(time.time() - start_time)
 
-            avg_time = sum(times) / len(times)
+        avg_time = sum(times) / len(times)
 
-            print(" -- avg_time = {}".format(avg_time))
-            print(
-                " -- expected_avg_time = {} with err_tol = {}".format(
-                    params.expected_avg_time, params.err_tol
-                )
-            )
-            diff = avg_time - params.expected_avg_time
-            ok = diff < params.err_tol
-            if diff < -params.err_tol:
-                print(
-                    " !! WARNING !! Performance has improved by {}, "
-                    "it may be necessary to fine-tune the "
-                    "expected_avg_time = {}".format(-diff, params.expected_avg_time)
-                )
+        print(" -- avg_time = {}".format(avg_time))
+        print(
+            " -- expected_avg_time = {} with err_tol = {}".format(
+                params.expected_avg_time, params.err_tol
+            )
+        )
+        diff = avg_time - params.expected_avg_time
+        ok = diff < params.err_tol
+        if diff < -params.err_tol:
+            print(
+                " !! WARNING !! Performance has improved by {}, "
+                "it may be necessary to fine-tune the "
+                "expected_avg_time = {}".format(-diff, params.expected_avg_time)
+            )
 
-            assert ok, " !! ERROR !! Regression detected"
+        assert ok, " !! ERROR !! Regression detected"
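For reference, the pass/fail logic of the perf test above is a one-sided tolerance check on the mean latency. A standalone sketch of the same arithmetic (hypothetical helper name and example values, not part of the diff):

def check_latency(avg_time: float, expected_avg_time: float, err_tol: float) -> bool:
    """Return True unless avg_time regressed past expected_avg_time + err_tol."""
    diff = avg_time - expected_avg_time
    if diff < -err_tol:
        # Faster than the baseline by more than the tolerance: the recorded
        # expected_avg_time is probably stale and could be tightened.
        print(f" !! WARNING !! Performance has improved by {-diff}")
    return diff < err_tol


# Example: a 20 ms slowdown against a 1.00 s baseline passes a 50 ms tolerance.
assert check_latency(avg_time=1.02, expected_avg_time=1.00, err_tol=0.05)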