Convert formatting to use ruff instead of yapf + isort (#26247)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
@@ -4,6 +4,7 @@
|
||||
|
||||
Run `pytest tests/v1/tpu/test_perf.py`.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import time
|
||||
@@ -37,7 +38,6 @@ TEST_PARAMS = [
|
||||
# open(/dev/vfio/0): Device or resource busy: Device or resource busy;
|
||||
# Couldn't open iommu group /dev/vfio/0
|
||||
# => Investigate
|
||||
|
||||
# TestParams(
|
||||
# model="Qwen/Qwen2.5-1.5B-Instruct",
|
||||
# num_prompts=1,
|
||||
@@ -59,16 +59,14 @@ TEST_PARAMS = [
|
||||
num_prompts=64,
|
||||
prefix_len=500,
|
||||
decode_len=50,
|
||||
|
||||
# commit id: ccb246776d93ef105904a8ec015b3587240a1183
|
||||
# tpu: v5lite (old vllm CI/CD)
|
||||
# expected_avg_time=1.4,
|
||||
# err_tol=0.30,
|
||||
|
||||
# (This is the active CI/CD instance)
|
||||
# commit id: ccb246776d93ef105904a8ec015b3587240a1183
|
||||
# tpu: v6e (current vllm CI/CD)
|
||||
expected_avg_time=1.7, # measured with VLLM_XLA_CACHE_PATH=
|
||||
expected_avg_time=1.7, # measured with VLLM_XLA_CACHE_PATH=
|
||||
err_tol=0.20,
|
||||
),
|
||||
]
|
||||
@@ -81,44 +79,50 @@ MAX_NUM_SEQS = 32
|
||||
GPU_UTIL = 0.9
|
||||
|
||||
|
||||
@pytest.mark.skipif(not current_platform.is_tpu(),
|
||||
reason="This is a basic performance test for TPU only")
|
||||
@pytest.mark.skipif(
|
||||
not current_platform.is_tpu(),
|
||||
reason="This is a basic performance test for TPU only",
|
||||
)
|
||||
@pytest.mark.parametrize("params", TEST_PARAMS)
|
||||
def test_perf(
|
||||
vllm_runner: type[VllmRunner],
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
params: TestParams,
|
||||
) -> None:
|
||||
tokenizer = get_tokenizer(params.model,
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=True)
|
||||
tokenizer = get_tokenizer(
|
||||
params.model, tokenizer_mode="auto", trust_remote_code=True
|
||||
)
|
||||
|
||||
prompts = []
|
||||
for i in range(params.num_prompts):
|
||||
prefix_token_ids = np.random.randint(0,
|
||||
tokenizer.vocab_size,
|
||||
size=params.prefix_len).tolist()
|
||||
prefix_token_ids = np.random.randint(
|
||||
0, tokenizer.vocab_size, size=params.prefix_len
|
||||
).tolist()
|
||||
prompt = tokenizer.decode(prefix_token_ids)
|
||||
prompts.append(prompt)
|
||||
|
||||
print(
|
||||
"-- Running: num_prompts = {} prefix_len = {} decode_len = {}".format(
|
||||
len(prompts), params.prefix_len, params.decode_len))
|
||||
len(prompts), params.prefix_len, params.decode_len
|
||||
)
|
||||
)
|
||||
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
|
||||
sampling_params = SamplingParams(max_tokens=params.decode_len,
|
||||
temperature=1.0,
|
||||
min_p=0.0)
|
||||
sampling_params = SamplingParams(
|
||||
max_tokens=params.decode_len, temperature=1.0, min_p=0.0
|
||||
)
|
||||
|
||||
with vllm_runner(params.model,
|
||||
max_num_batched_tokens=MAX_MODEL_LEN,
|
||||
max_model_len=MAX_MODEL_LEN,
|
||||
max_num_seqs=MAX_NUM_SEQS,
|
||||
gpu_memory_utilization=GPU_UTIL,
|
||||
enforce_eager=False,
|
||||
tensor_parallel_size=1) as vllm_model:
|
||||
with vllm_runner(
|
||||
params.model,
|
||||
max_num_batched_tokens=MAX_MODEL_LEN,
|
||||
max_model_len=MAX_MODEL_LEN,
|
||||
max_num_seqs=MAX_NUM_SEQS,
|
||||
gpu_memory_utilization=GPU_UTIL,
|
||||
enforce_eager=False,
|
||||
tensor_parallel_size=1,
|
||||
) as vllm_model:
|
||||
print(" -- Warmup / Compile")
|
||||
for i in range(NUM_WARMUPS):
|
||||
_ = vllm_model.generate(prompts, sampling_params)
|
||||
@@ -133,14 +137,18 @@ def test_perf(
|
||||
avg_time = sum(times) / len(times)
|
||||
|
||||
print(" -- avg_time = {}".format(avg_time))
|
||||
print(" -- expected_avg_time = {} with err_tol = {}".format(
|
||||
params.expected_avg_time, params.err_tol))
|
||||
print(
|
||||
" -- expected_avg_time = {} with err_tol = {}".format(
|
||||
params.expected_avg_time, params.err_tol
|
||||
)
|
||||
)
|
||||
diff = avg_time - params.expected_avg_time
|
||||
ok = diff < params.err_tol
|
||||
if diff < -params.err_tol:
|
||||
print(" !! WARNING !! Performance has improved by {}, "
|
||||
"it may be necessary to fine-tune the "
|
||||
"expected_avg_time = {}".format(
|
||||
-diff, params.expected_avg_time))
|
||||
print(
|
||||
" !! WARNING !! Performance has improved by {}, "
|
||||
"it may be necessary to fine-tune the "
|
||||
"expected_avg_time = {}".format(-diff, params.expected_avg_time)
|
||||
)
|
||||
|
||||
assert ok, " !! ERROR !! Regression detected"
|
||||
|
||||
Reference in New Issue
Block a user