Convert formatting to use ruff instead of yapf + isort (#26247)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Author: Harry Mellor
Date: 2025-10-05 15:06:22 +01:00
Committed by: GitHub
Parent: 17edd8a807
Commit: d6953beb91

1508 changed files with 115244 additions and 94146 deletions
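
The pyproject.toml changes themselves are not shown in this excerpt. As a rough sketch of what such a migration usually looks like (illustrative only, not the exact configuration added by this commit; the line length, rule selection, and quote style below are assumptions), the yapf and isort sections are replaced by a single ruff configuration:

    # Hypothetical pyproject.toml excerpt: ruff handles both code formatting
    # and import sorting, replacing yapf + isort.
    [tool.ruff]
    line-length = 88          # ruff's default; matches the 88-column wrapping in the hunks below

    [tool.ruff.lint]
    select = ["E", "F", "I"]  # pycodestyle, pyflakes, and isort-style import sorting

    [tool.ruff.format]
    quote-style = "double"    # explains the '...' -> "..." string changes below

With a configuration along these lines, running "ruff format ." and "ruff check --fix ." produces the kind of rewrapping shown in the hunks that follow.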


@@ -54,19 +54,22 @@ def default_server_args():
     ]


-@pytest.fixture(scope="module",
-                params=[
-                    "",
-                    "--enable-chunked-prefill",
-                    "--disable-frontend-multiprocessing",
-                    f"--show-hidden-metrics-for-version={PREV_MINOR_VERSION}",
-                ])
+@pytest.fixture(
+    scope="module",
+    params=[
+        "",
+        "--enable-chunked-prefill",
+        "--disable-frontend-multiprocessing",
+        f"--show-hidden-metrics-for-version={PREV_MINOR_VERSION}",
+    ],
+)
 def server(use_v1, default_server_args, request):
     if request.param:
         default_server_args.append(request.param)
-    env_dict = dict(VLLM_USE_V1='1' if use_v1 else '0')
-    with RemoteOpenAIServer(MODEL_NAME, default_server_args,
-                            env_dict=env_dict) as remote_server:
+    env_dict = dict(VLLM_USE_V1="1" if use_v1 else "0")
+    with RemoteOpenAIServer(
+        MODEL_NAME, default_server_args, env_dict=env_dict
+    ) as remote_server:
         yield remote_server
@@ -87,30 +90,36 @@ _NUM_GENERATION_TOKENS_PER_REQUEST = 10
 # {metric_family: [(suffix, expected_value)]}
 EXPECTED_VALUES = {
     "vllm:time_to_first_token_seconds": [("_count", _NUM_REQUESTS)],
-    "vllm:time_per_output_token_seconds":
-    [("_count", _NUM_REQUESTS * (_NUM_GENERATION_TOKENS_PER_REQUEST - 1))],
+    "vllm:time_per_output_token_seconds": [
+        ("_count", _NUM_REQUESTS * (_NUM_GENERATION_TOKENS_PER_REQUEST - 1))
+    ],
     "vllm:e2e_request_latency_seconds": [("_count", _NUM_REQUESTS)],
     "vllm:request_queue_time_seconds": [("_count", _NUM_REQUESTS)],
     "vllm:request_inference_time_seconds": [("_count", _NUM_REQUESTS)],
     "vllm:request_prefill_time_seconds": [("_count", _NUM_REQUESTS)],
     "vllm:request_decode_time_seconds": [("_count", _NUM_REQUESTS)],
-    "vllm:request_prompt_tokens":
-    [("_sum", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST),
-     ("_count", _NUM_REQUESTS)],
-    "vllm:request_generation_tokens":
-    [("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
-     ("_count", _NUM_REQUESTS)],
+    "vllm:request_prompt_tokens": [
+        ("_sum", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST),
+        ("_count", _NUM_REQUESTS),
+    ],
+    "vllm:request_generation_tokens": [
+        ("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
+        ("_count", _NUM_REQUESTS),
+    ],
     "vllm:request_params_n": [("_count", _NUM_REQUESTS)],
     "vllm:request_params_max_tokens": [
         ("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
-        ("_count", _NUM_REQUESTS)
+        ("_count", _NUM_REQUESTS),
     ],
-    "vllm:iteration_tokens_total":
-    [("_sum", _NUM_REQUESTS *
-      (_NUM_PROMPT_TOKENS_PER_REQUEST + _NUM_GENERATION_TOKENS_PER_REQUEST)),
-     ("_count", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST)],
-    "vllm:prompt_tokens": [("_total",
-                            _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)],
+    "vllm:iteration_tokens_total": [
+        (
+            "_sum",
+            _NUM_REQUESTS
+            * (_NUM_PROMPT_TOKENS_PER_REQUEST + _NUM_GENERATION_TOKENS_PER_REQUEST),
+        ),
+        ("_count", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
+    ],
+    "vllm:prompt_tokens": [("_total", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)],
     "vllm:generation_tokens": [
         ("_total", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)
     ],
@@ -119,14 +128,16 @@ EXPECTED_VALUES = {
 @pytest.mark.asyncio
-async def test_metrics_counts(server: RemoteOpenAIServer,
-                              client: openai.AsyncClient, use_v1: bool):
+async def test_metrics_counts(
+    server: RemoteOpenAIServer, client: openai.AsyncClient, use_v1: bool
+):
     for _ in range(_NUM_REQUESTS):
         # sending a request triggers the metrics to be logged.
         await client.completions.create(
             model=MODEL_NAME,
             prompt=_TOKENIZED_PROMPT,
-            max_tokens=_NUM_GENERATION_TOKENS_PER_REQUEST)
+            max_tokens=_NUM_GENERATION_TOKENS_PER_REQUEST,
+        )

     response = requests.get(server.url_for("metrics"))
     print(response.text)
@@ -134,9 +145,10 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
     # Loop over all expected metric_families
     for metric_family, suffix_values_list in EXPECTED_VALUES.items():
-        if ((use_v1 and metric_family not in EXPECTED_METRICS_V1)
-                or (not server.show_hidden_metrics
-                    and metric_family in HIDDEN_DEPRECATED_METRICS)):
+        if (use_v1 and metric_family not in EXPECTED_METRICS_V1) or (
+            not server.show_hidden_metrics
+            and metric_family in HIDDEN_DEPRECATED_METRICS
+        ):
             continue

         found_metric = False
@@ -160,14 +172,15 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
                             assert sample.value == expected_value, (
                                 f"{metric_name_w_suffix} expected value of "
                                 f"{expected_value} did not match found value "
-                                f"{sample.value}")
+                                f"{sample.value}"
+                            )
                             break
                     assert found_suffix, (
                         f"Did not find {metric_name_w_suffix} in prom endpoint"
                     )
                 break
-        assert found_metric, (f"Did not find {metric_family} in prom endpoint")
+        assert found_metric, f"Did not find {metric_family} in prom endpoint"


 EXPECTED_METRICS = [
@@ -290,30 +303,30 @@ HIDDEN_DEPRECATED_METRICS: list[str] = [
 @pytest.mark.asyncio
-async def test_metrics_exist(server: RemoteOpenAIServer,
-                             client: openai.AsyncClient, use_v1: bool):
+async def test_metrics_exist(
+    server: RemoteOpenAIServer, client: openai.AsyncClient, use_v1: bool
+):
     # sending a request triggers the metrics to be logged.
-    await client.completions.create(model=MODEL_NAME,
-                                    prompt="Hello, my name is",
-                                    max_tokens=5,
-                                    temperature=0.0)
+    await client.completions.create(
+        model=MODEL_NAME, prompt="Hello, my name is", max_tokens=5, temperature=0.0
+    )

     response = requests.get(server.url_for("metrics"))
     assert response.status_code == HTTPStatus.OK

-    for metric in (EXPECTED_METRICS_V1 if use_v1 else EXPECTED_METRICS):
-        if (metric in HIDDEN_DEPRECATED_METRICS
-                and not server.show_hidden_metrics):
+    for metric in EXPECTED_METRICS_V1 if use_v1 else EXPECTED_METRICS:
+        if metric in HIDDEN_DEPRECATED_METRICS and not server.show_hidden_metrics:
             continue
         assert metric in response.text


 @pytest.mark.asyncio
-async def test_abort_metrics_reset(server: RemoteOpenAIServer,
-                                   client: openai.AsyncClient, use_v1: bool):
-
-    running_requests, waiting_requests, kv_cache_usage = (
-        _get_running_metrics_from_api(server, use_v1))
+async def test_abort_metrics_reset(
+    server: RemoteOpenAIServer, client: openai.AsyncClient, use_v1: bool
+):
+    running_requests, waiting_requests, kv_cache_usage = _get_running_metrics_from_api(
+        server, use_v1
+    )

     # Expect no running requests or kvcache usage
     assert running_requests == 0
@@ -328,15 +341,18 @@ async def test_abort_metrics_reset(server: RemoteOpenAIServer,
                 model=MODEL_NAME,
                 prompt=_TOKENIZED_PROMPT,
                 max_tokens=100,  # Long generation to give time to abort
-                temperature=0.0))
+                temperature=0.0,
+            )
+        )
         tasks.append(task)

     # Wait a bit for requests to start processing
     await asyncio.sleep(0.5)

     # Check that we have running requests
-    running_requests, waiting_requests, kv_cache_usage = (
-        _get_running_metrics_from_api(server, use_v1))
+    running_requests, waiting_requests, kv_cache_usage = _get_running_metrics_from_api(
+        server, use_v1
+    )

     # Expect running requests and kvcache usage
     assert running_requests > 0
@@ -355,17 +371,18 @@ async def test_abort_metrics_reset(server: RemoteOpenAIServer,
     # Verify running and waiting requests counts and KV cache usage are zero
     running_requests_after, waiting_requests_after, kv_cache_usage_after = (
-        _get_running_metrics_from_api(server, use_v1))
+        _get_running_metrics_from_api(server, use_v1)
+    )

-    assert running_requests_after == 0,\
-        (f"Expected 0 running requests after abort, got "
-         f"{running_requests_after}")
-    assert waiting_requests_after == 0,\
-        (f"Expected 0 waiting requests after abort, got "
-         f"{waiting_requests_after}")
-    assert kv_cache_usage_after == 0,\
-        (f"Expected 0% KV cache usage after abort, got "
-         f"{kv_cache_usage_after}")
+    assert running_requests_after == 0, (
+        f"Expected 0 running requests after abort, got {running_requests_after}"
+    )
+    assert waiting_requests_after == 0, (
+        f"Expected 0 waiting requests after abort, got {waiting_requests_after}"
+    )
+    assert kv_cache_usage_after == 0, (
+        f"Expected 0% KV cache usage after abort, got {kv_cache_usage_after}"
+    )


 def _get_running_metrics_from_api(server: RemoteOpenAIServer, use_v1: bool):
@@ -377,8 +394,9 @@ def _get_running_metrics_from_api(server: RemoteOpenAIServer, use_v1: bool):
     # Verify running and waiting requests counts and KV cache usage are zero
     running_requests, waiting_requests, kv_cache_usage = None, None, None

-    kv_cache_usage_metric = ("vllm:kv_cache_usage_perc"
-                             if use_v1 else "vllm:gpu_cache_usage_perc")
+    kv_cache_usage_metric = (
+        "vllm:kv_cache_usage_perc" if use_v1 else "vllm:gpu_cache_usage_perc"
+    )

     for family in text_string_to_metric_families(response.text):
         if family.name == "vllm:num_requests_running":
@@ -411,28 +429,31 @@ def test_metrics_exist_run_batch(use_v1: bool):
     port = "8001"
     server_url = f"http://{base_url}:{port}"

-    with tempfile.NamedTemporaryFile(
-            "w") as input_file, tempfile.NamedTemporaryFile(
-                "r") as output_file:
+    with (
+        tempfile.NamedTemporaryFile("w") as input_file,
+        tempfile.NamedTemporaryFile("r") as output_file,
+    ):
         input_file.write(input_batch)
         input_file.flush()

-        proc = subprocess.Popen([
-            sys.executable,
-            "-m",
-            "vllm.entrypoints.openai.run_batch",
-            "-i",
-            input_file.name,
-            "-o",
-            output_file.name,
-            "--model",
-            "intfloat/multilingual-e5-small",
-            "--enable-metrics",
-            "--url",
-            base_url,
-            "--port",
-            port,
-        ],
-                                env={"VLLM_USE_V1": "1"})
+        proc = subprocess.Popen(
+            [
+                sys.executable,
+                "-m",
+                "vllm.entrypoints.openai.run_batch",
+                "-i",
+                input_file.name,
+                "-o",
+                output_file.name,
+                "--model",
+                "intfloat/multilingual-e5-small",
+                "--enable-metrics",
+                "--url",
+                base_url,
+                "--port",
+                port,
+            ],
+            env={"VLLM_USE_V1": "1"},
+        )

         def is_server_up(url):
             try: