Convert formatting to use ruff instead of yapf + isort (#26247)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Harry Mellor authored on 2025-10-05 15:06:22 +01:00; committed by GitHub
parent 17edd8a807
commit d6953beb91
1508 changed files with 115244 additions and 94146 deletions
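
The migration itself is mostly configuration plus a mechanical rewrite of the whole tree, which is what produces the large file count above. The project's actual pyproject.toml and pre-commit changes are not part of this excerpt; the snippet below is only a hedged sketch of the kind of ruff configuration that typically replaces yapf + isort, with placeholder values rather than the settings this commit adopts.

[tool.ruff]
line-length = 88  # placeholder value, not necessarily the limit used here

[tool.ruff.lint]
# the "I" (isort) rule group takes over import sorting from the isort tool
select = ["E", "F", "I"]

[tool.ruff.format]
# ruff format replaces yapf as the code formatter
docstring-code-format = true

With a configuration along these lines, running "ruff check --fix" and then "ruff format" over the repository sorts imports and reformats every file in one pass, which is the kind of mechanical rewrite the hunks below show.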

View File

@@ -48,20 +48,23 @@ def run_test(model_name, more_args=None):
     measured_value = results["results"][TASK][FILTER]
     assert model_name in EXPECTED_VALUES, (
-        f"Cannot find the expected value for the model {model_name=}")
+        f"Cannot find the expected value for the model {model_name=}"
+    )
     expected_value = EXPECTED_VALUES[model_name]

-    assert (measured_value - RTOL < expected_value
-            and measured_value + RTOL > expected_value
-            ), f"Expected: {expected_value} | Measured: {measured_value}"
+    assert (
+        measured_value - RTOL < expected_value
+        and measured_value + RTOL > expected_value
+    ), f"Expected: {expected_value} | Measured: {measured_value}"


 # TODO: [AlexM] Fix it with new CI/CD tests
-TPU_TP_TEST_STR = ""  #"tensor_parallel_size=4"
+TPU_TP_TEST_STR = ""  # "tensor_parallel_size=4"


-@pytest.mark.skipif(not current_platform.is_cuda()
-                    and not current_platform.is_tpu(),
-                    reason="V1 is currently only supported on CUDA and TPU")
+@pytest.mark.skipif(
+    not current_platform.is_cuda() and not current_platform.is_tpu(),
+    reason="V1 is currently only supported on CUDA and TPU",
+)
 @pytest.mark.parametrize("model", MODEL_NAMES)
 def test_lm_eval_accuracy_v1_engine(model, monkeypatch: pytest.MonkeyPatch):
     """Run with the V1 Engine."""
@@ -82,12 +85,14 @@ def test_lm_eval_accuracy_v1_engine(model, monkeypatch: pytest.MonkeyPatch):
         run_test(model, more_args)


-@pytest.mark.skipif(not current_platform.is_cuda()
-                    and not current_platform.is_tpu(),
-                    reason="V1 is currently only supported on CUDA and TPU")
+@pytest.mark.skipif(
+    not current_platform.is_cuda() and not current_platform.is_tpu(),
+    reason="V1 is currently only supported on CUDA and TPU",
+)
 @pytest.mark.parametrize("model", FP8_KV_MODEL_NAMES)
 def test_lm_eval_accuracy_v1_engine_fp8_kv_cache(
-        model, monkeypatch: pytest.MonkeyPatch):
+    model, monkeypatch: pytest.MonkeyPatch
+):
     """Run with the V1 Engine."""

     with monkeypatch.context() as m:

View File

@@ -14,9 +14,7 @@ from ..openai.test_vision import TEST_IMAGE_ASSETS
 def text_llm():
     # pytest caches the fixture so we use weakref.proxy to
     # enable garbage collection
-    llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct",
-              enforce_eager=True,
-              seed=0)
+    llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct", enforce_eager=True, seed=0)

     yield weakref.proxy(llm)
@@ -28,14 +26,8 @@ def text_llm():
 def test_chat(text_llm):
     prompt1 = "Explain the concept of entropy."
     messages = [
-        {
-            "role": "system",
-            "content": "You are a helpful assistant"
-        },
-        {
-            "role": "user",
-            "content": prompt1
-        },
+        {"role": "system", "content": "You are a helpful assistant"},
+        {"role": "user", "content": prompt1},
     ]
     outputs = text_llm.chat(messages)
     assert len(outputs) == 1
@@ -46,25 +38,13 @@ def test_multi_chat(text_llm):
     prompt2 = "Explain what among us is."

     conversation1 = [
-        {
-            "role": "system",
-            "content": "You are a helpful assistant"
-        },
-        {
-            "role": "user",
-            "content": prompt1
-        },
+        {"role": "system", "content": "You are a helpful assistant"},
+        {"role": "user", "content": prompt1},
     ]

     conversation2 = [
-        {
-            "role": "system",
-            "content": "You are a helpful assistant"
-        },
-        {
-            "role": "user",
-            "content": prompt2
-        },
+        {"role": "system", "content": "You are a helpful assistant"},
+        {"role": "user", "content": prompt2},
     ]

     messages = [conversation1, conversation2]
@@ -94,26 +74,22 @@ def vision_llm():
     cleanup_dist_env_and_memory()


-@pytest.mark.parametrize("image_urls",
-                         [[TEST_IMAGE_ASSETS[0], TEST_IMAGE_ASSETS[1]]],
-                         indirect=True)
+@pytest.mark.parametrize(
+    "image_urls", [[TEST_IMAGE_ASSETS[0], TEST_IMAGE_ASSETS[1]]], indirect=True
+)
 def test_chat_multi_image(vision_llm, image_urls: list[str]):
-    messages = [{
-        "role":
-        "user",
-        "content": [
-            *({
-                "type": "image_url",
-                "image_url": {
-                    "url": image_url
-                }
-            } for image_url in image_urls),
-            {
-                "type": "text",
-                "text": "What's in this image?"
-            },
-        ],
-    }]
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                *(
+                    {"type": "image_url", "image_url": {"url": image_url}}
+                    for image_url in image_urls
+                ),
+                {"type": "text", "text": "What's in this image?"},
+            ],
+        }
+    ]

     outputs = vision_llm.chat(messages)
     assert len(outputs) >= 0
@@ -124,14 +100,8 @@ def test_llm_chat_tokenization_no_double_bos(text_llm):
     Check we get a single BOS token for llama chat.
     """
     messages = [
-        {
-            "role": "system",
-            "content": "You are a helpful assistant"
-        },
-        {
-            "role": "user",
-            "content": "Hello!"
-        },
+        {"role": "system", "content": "You are a helpful assistant"},
+        {"role": "user", "content": "Hello!"},
     ]
     outputs = text_llm.chat(messages)
     assert len(outputs) == 1
@@ -167,14 +137,8 @@ def thinking_llm():
@pytest.mark.parametrize("enable_thinking", [True, False])
def test_chat_extra_kwargs(thinking_llm, enable_thinking):
messages = [
{
"role": "system",
"content": "You are a helpful assistant"
},
{
"role": "user",
"content": "What is 1+1?"
},
{"role": "system", "content": "You are a helpful assistant"},
{"role": "user", "content": "What is 1+1?"},
]
outputs = thinking_llm.chat(

View File

@@ -23,9 +23,11 @@ def test_collective_rpc(tp_size, backend, monkeypatch):
         return self.rank

     monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
-    llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct",
-              enforce_eager=True,
-              load_format="dummy",
-              tensor_parallel_size=tp_size,
-              distributed_executor_backend=backend)
+    llm = LLM(
+        model="meta-llama/Llama-3.2-1B-Instruct",
+        enforce_eager=True,
+        load_format="dummy",
+        tensor_parallel_size=tp_size,
+        distributed_executor_backend=backend,
+    )
     assert llm.collective_rpc(echo_rank) == list(range(tp_size))

View File

@@ -29,11 +29,13 @@ TOKEN_IDS = [
 def llm():
     # pytest caches the fixture so we use weakref.proxy to
     # enable garbage collection
-    llm = LLM(model=MODEL_NAME,
-              max_num_batched_tokens=4096,
-              tensor_parallel_size=1,
-              gpu_memory_utilization=0.10,
-              enforce_eager=True)
+    llm = LLM(
+        model=MODEL_NAME,
+        max_num_batched_tokens=4096,
+        tensor_parallel_size=1,
+        gpu_memory_utilization=0.10,
+        enforce_eager=True,
+    )

     yield weakref.proxy(llm)
@@ -81,7 +83,8 @@ def test_max_model_len():
     outputs = llm.generate(PROMPTS, sampling_params)
     for output in outputs:
         num_total_tokens = len(output.prompt_token_ids) + len(
-            output.outputs[0].token_ids)
+            output.outputs[0].token_ids
+        )
         # Total tokens must not exceed max_model_len + 1 (the last token can be
         # generated with the context length equal to the max model length)
         # It can be less if generation finishes due to other reasons (e.g., EOS)

View File

@@ -16,9 +16,8 @@ def test_gpu_memory_utilization():
     # makes sure gpu_memory_utilization is per-instance limit,
     # not a global limit
     llms = [
-        LLM(model="facebook/opt-125m",
-            gpu_memory_utilization=0.3,
-            enforce_eager=True) for i in range(3)
+        LLM(model="facebook/opt-125m", gpu_memory_utilization=0.3, enforce_eager=True)
+        for i in range(3)
     ]
     for llm in llms:
         outputs = llm.generate(prompts, sampling_params)

View File

@@ -8,12 +8,12 @@ from vllm import LLM
 def test_empty_prompt():
     llm = LLM(model="openai-community/gpt2", enforce_eager=True)
-    with pytest.raises(ValueError, match='decoder prompt cannot be empty'):
+    with pytest.raises(ValueError, match="decoder prompt cannot be empty"):
         llm.generate([""])


 @pytest.mark.skip_v1
 def test_out_of_vocab_token():
     llm = LLM(model="openai-community/gpt2", enforce_eager=True)
-    with pytest.raises(ValueError, match='out of vocabulary'):
+    with pytest.raises(ValueError, match="out of vocabulary"):
         llm.generate({"prompt_token_ids": [999999]})