[V0 Deprecation] Remove VLLM_USE_V1 from tests (#26341)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung
2025-10-07 23:42:31 +08:00
committed by GitHub
parent c0a7b89d8e
commit 1e4ecca1d0
51 changed files with 817 additions and 1275 deletions

View File

@@ -86,7 +86,6 @@ GPU_UTIL = 0.9
@pytest.mark.parametrize("params", TEST_PARAMS)
def test_perf(
vllm_runner: type[VllmRunner],
monkeypatch: pytest.MonkeyPatch,
params: TestParams,
) -> None:
tokenizer = get_tokenizer(
@@ -107,48 +106,45 @@ def test_perf(
)
)
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
sampling_params = SamplingParams(
max_tokens=params.decode_len, temperature=1.0, min_p=0.0
)
sampling_params = SamplingParams(
max_tokens=params.decode_len, temperature=1.0, min_p=0.0
)
with vllm_runner(
params.model,
max_num_batched_tokens=MAX_MODEL_LEN,
max_model_len=MAX_MODEL_LEN,
max_num_seqs=MAX_NUM_SEQS,
gpu_memory_utilization=GPU_UTIL,
enforce_eager=False,
tensor_parallel_size=1,
) as vllm_model:
print(" -- Warmup / Compile")
for i in range(NUM_WARMUPS):
_ = vllm_model.generate(prompts, sampling_params)
with vllm_runner(
params.model,
max_num_batched_tokens=MAX_MODEL_LEN,
max_model_len=MAX_MODEL_LEN,
max_num_seqs=MAX_NUM_SEQS,
gpu_memory_utilization=GPU_UTIL,
enforce_eager=False,
tensor_parallel_size=1,
) as vllm_model:
print(" -- Warmup / Compile")
for i in range(NUM_WARMUPS):
_ = vllm_model.generate(prompts, sampling_params)
print(" -- Benchmarking... ")
times = []
for i in range(NUM_RUNS):
start_time = time.time()
_ = vllm_model.generate(prompts, sampling_params)
times.append(time.time() - start_time)
print(" -- Benchmarking... ")
times = []
for i in range(NUM_RUNS):
start_time = time.time()
_ = vllm_model.generate(prompts, sampling_params)
times.append(time.time() - start_time)
avg_time = sum(times) / len(times)
avg_time = sum(times) / len(times)
print(" -- avg_time = {}".format(avg_time))
print(
" -- expected_avg_time = {} with err_tol = {}".format(
params.expected_avg_time, params.err_tol
)
print(" -- avg_time = {}".format(avg_time))
print(
" -- expected_avg_time = {} with err_tol = {}".format(
params.expected_avg_time, params.err_tol
)
)
diff = avg_time - params.expected_avg_time
ok = diff < params.err_tol
if diff < -params.err_tol:
print(
" !! WARNING !! Performance has improved by {}, "
"it may be necessary to fine-tune the "
"expected_avg_time = {}".format(-diff, params.expected_avg_time)
)
diff = avg_time - params.expected_avg_time
ok = diff < params.err_tol
if diff < -params.err_tol:
print(
" !! WARNING !! Performance has improved by {}, "
"it may be necessary to fine-tune the "
"expected_avg_time = {}".format(-diff, params.expected_avg_time)
)
assert ok, " !! ERROR !! Regression detected"
assert ok, " !! ERROR !! Regression detected"