Convert formatting to use ruff instead of yapf + isort (#26247)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-05 15:06:22 +01:00
parent 17edd8a807
commit d6953beb91
1508 changed files with 115244 additions and 94146 deletions
--- a/tests/entrypoints/llm/test_accuracy.py
+++ b/tests/entrypoints/llm/test_accuracy.py
@@ -48,20 +48,23 @@ def run_test(model_name, more_args=None):

    measured_value = results["results"][TASK][FILTER]
    assert model_name in EXPECTED_VALUES, (
-        f"Cannot find the expected value for the model {model_name=}")
+        f"Cannot find the expected value for the model {model_name=}"
+    )
    expected_value = EXPECTED_VALUES[model_name]
-    assert (measured_value - RTOL < expected_value
-            and measured_value + RTOL > expected_value
-            ), f"Expected: {expected_value} |  Measured: {measured_value}"
+    assert (
+        measured_value - RTOL < expected_value
+        and measured_value + RTOL > expected_value
+    ), f"Expected: {expected_value} |  Measured: {measured_value}"


 # TODO: [AlexM] Fix it with new CI/CD tests
-TPU_TP_TEST_STR = ""  #"tensor_parallel_size=4"
+TPU_TP_TEST_STR = ""  # "tensor_parallel_size=4"


-@pytest.mark.skipif(not current_platform.is_cuda()
-                    and not current_platform.is_tpu(),
-                    reason="V1 is currently only supported on CUDA and TPU")
+@pytest.mark.skipif(
+    not current_platform.is_cuda() and not current_platform.is_tpu(),
+    reason="V1 is currently only supported on CUDA and TPU",
+)
@pytest.mark.parametrize("model", MODEL_NAMES)
 def test_lm_eval_accuracy_v1_engine(model, monkeypatch: pytest.MonkeyPatch):
    """Run with the V1 Engine."""
@@ -82,12 +85,14 @@ def test_lm_eval_accuracy_v1_engine(model, monkeypatch: pytest.MonkeyPatch):
        run_test(model, more_args)


-@pytest.mark.skipif(not current_platform.is_cuda()
-                    and not current_platform.is_tpu(),
-                    reason="V1 is currently only supported on CUDA and TPU")
+@pytest.mark.skipif(
+    not current_platform.is_cuda() and not current_platform.is_tpu(),
+    reason="V1 is currently only supported on CUDA and TPU",
+)
@pytest.mark.parametrize("model", FP8_KV_MODEL_NAMES)
 def test_lm_eval_accuracy_v1_engine_fp8_kv_cache(
-        model, monkeypatch: pytest.MonkeyPatch):
+    model, monkeypatch: pytest.MonkeyPatch
+):
    """Run with the V1 Engine."""

    with monkeypatch.context() as m:
--- a/tests/entrypoints/llm/test_chat.py
+++ b/tests/entrypoints/llm/test_chat.py
@@ -14,9 +14,7 @@ from ..openai.test_vision import TEST_IMAGE_ASSETS
 def text_llm():
    # pytest caches the fixture so we use weakref.proxy to
    # enable garbage collection
-    llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct",
-              enforce_eager=True,
-              seed=0)
+    llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct", enforce_eager=True, seed=0)

    yield weakref.proxy(llm)

@@ -28,14 +26,8 @@ def text_llm():
 def test_chat(text_llm):
    prompt1 = "Explain the concept of entropy."
    messages = [
-        {
-            "role": "system",
-            "content": "You are a helpful assistant"
-        },
-        {
-            "role": "user",
-            "content": prompt1
-        },
+        {"role": "system", "content": "You are a helpful assistant"},
+        {"role": "user", "content": prompt1},
    ]
    outputs = text_llm.chat(messages)
    assert len(outputs) == 1
@@ -46,25 +38,13 @@ def test_multi_chat(text_llm):
    prompt2 = "Explain what among us is."

    conversation1 = [
-        {
-            "role": "system",
-            "content": "You are a helpful assistant"
-        },
-        {
-            "role": "user",
-            "content": prompt1
-        },
+        {"role": "system", "content": "You are a helpful assistant"},
+        {"role": "user", "content": prompt1},
    ]

    conversation2 = [
-        {
-            "role": "system",
-            "content": "You are a helpful assistant"
-        },
-        {
-            "role": "user",
-            "content": prompt2
-        },
+        {"role": "system", "content": "You are a helpful assistant"},
+        {"role": "user", "content": prompt2},
    ]

    messages = [conversation1, conversation2]
@@ -94,26 +74,22 @@ def vision_llm():
    cleanup_dist_env_and_memory()


-@pytest.mark.parametrize("image_urls",
-                         [[TEST_IMAGE_ASSETS[0], TEST_IMAGE_ASSETS[1]]],
-                         indirect=True)
+@pytest.mark.parametrize(
+    "image_urls", [[TEST_IMAGE_ASSETS[0], TEST_IMAGE_ASSETS[1]]], indirect=True
+)
 def test_chat_multi_image(vision_llm, image_urls: list[str]):
-    messages = [{
-        "role":
-        "user",
-        "content": [
-            *({
-                "type": "image_url",
-                "image_url": {
-                    "url": image_url
-                }
-            } for image_url in image_urls),
-            {
-                "type": "text",
-                "text": "What's in this image?"
-            },
-        ],
-    }]
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                *(
+                    {"type": "image_url", "image_url": {"url": image_url}}
+                    for image_url in image_urls
+                ),
+                {"type": "text", "text": "What's in this image?"},
+            ],
+        }
+    ]
    outputs = vision_llm.chat(messages)
    assert len(outputs) >= 0

@@ -124,14 +100,8 @@ def test_llm_chat_tokenization_no_double_bos(text_llm):
    Check we get a single BOS token for llama chat.
    """
    messages = [
-        {
-            "role": "system",
-            "content": "You are a helpful assistant"
-        },
-        {
-            "role": "user",
-            "content": "Hello!"
-        },
+        {"role": "system", "content": "You are a helpful assistant"},
+        {"role": "user", "content": "Hello!"},
    ]
    outputs = text_llm.chat(messages)
    assert len(outputs) == 1
@@ -167,14 +137,8 @@ def thinking_llm():
@pytest.mark.parametrize("enable_thinking", [True, False])
 def test_chat_extra_kwargs(thinking_llm, enable_thinking):
    messages = [
-        {
-            "role": "system",
-            "content": "You are a helpful assistant"
-        },
-        {
-            "role": "user",
-            "content": "What is 1+1?"
-        },
+        {"role": "system", "content": "You are a helpful assistant"},
+        {"role": "user", "content": "What is 1+1?"},
    ]

    outputs = thinking_llm.chat(
--- a/tests/entrypoints/llm/test_collective_rpc.py
+++ b/tests/entrypoints/llm/test_collective_rpc.py
@@ -23,9 +23,11 @@ def test_collective_rpc(tp_size, backend, monkeypatch):
        return self.rank

    monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
-    llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct",
-              enforce_eager=True,
-              load_format="dummy",
-              tensor_parallel_size=tp_size,
-              distributed_executor_backend=backend)
+    llm = LLM(
+        model="meta-llama/Llama-3.2-1B-Instruct",
+        enforce_eager=True,
+        load_format="dummy",
+        tensor_parallel_size=tp_size,
+        distributed_executor_backend=backend,
+    )
    assert llm.collective_rpc(echo_rank) == list(range(tp_size))
--- a/tests/entrypoints/llm/test_generate.py
+++ b/tests/entrypoints/llm/test_generate.py
@@ -29,11 +29,13 @@ TOKEN_IDS = [
 def llm():
    # pytest caches the fixture so we use weakref.proxy to
    # enable garbage collection
-    llm = LLM(model=MODEL_NAME,
-              max_num_batched_tokens=4096,
-              tensor_parallel_size=1,
-              gpu_memory_utilization=0.10,
-              enforce_eager=True)
+    llm = LLM(
+        model=MODEL_NAME,
+        max_num_batched_tokens=4096,
+        tensor_parallel_size=1,
+        gpu_memory_utilization=0.10,
+        enforce_eager=True,
+    )

    yield weakref.proxy(llm)

@@ -81,7 +83,8 @@ def test_max_model_len():
    outputs = llm.generate(PROMPTS, sampling_params)
    for output in outputs:
        num_total_tokens = len(output.prompt_token_ids) + len(
-            output.outputs[0].token_ids)
+            output.outputs[0].token_ids
+        )
        # Total tokens must not exceed max_model_len + 1 (the last token can be
        # generated with the context length equal to the max model length)
        # It can be less if generation finishes due to other reasons (e.g., EOS)
--- a/tests/entrypoints/llm/test_gpu_utilization.py
+++ b/tests/entrypoints/llm/test_gpu_utilization.py
@@ -16,9 +16,8 @@ def test_gpu_memory_utilization():
    # makes sure gpu_memory_utilization is per-instance limit,
    # not a global limit
    llms = [
-        LLM(model="facebook/opt-125m",
-            gpu_memory_utilization=0.3,
-            enforce_eager=True) for i in range(3)
+        LLM(model="facebook/opt-125m", gpu_memory_utilization=0.3, enforce_eager=True)
+        for i in range(3)
    ]
    for llm in llms:
        outputs = llm.generate(prompts, sampling_params)
--- a/tests/entrypoints/llm/test_prompt_validation.py
+++ b/tests/entrypoints/llm/test_prompt_validation.py
@@ -8,12 +8,12 @@ from vllm import LLM

 def test_empty_prompt():
    llm = LLM(model="openai-community/gpt2", enforce_eager=True)
-    with pytest.raises(ValueError, match='decoder prompt cannot be empty'):
+    with pytest.raises(ValueError, match="decoder prompt cannot be empty"):
        llm.generate([""])


@pytest.mark.skip_v1
 def test_out_of_vocab_token():
    llm = LLM(model="openai-community/gpt2", enforce_eager=True)
-    with pytest.raises(ValueError, match='out of vocabulary'):
+    with pytest.raises(ValueError, match="out of vocabulary"):
        llm.generate({"prompt_token_ids": [999999]})