Convert formatting to use ruff instead of yapf + isort (#26247)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-05 15:06:22 +01:00
parent 17edd8a807
commit d6953beb91
1508 changed files with 115244 additions and 94146 deletions
--- a/tests/v1/engine/test_async_llm.py
+++ b/tests/v1/engine/test_async_llm.py
@@ -21,16 +21,16 @@ from vllm.v1.engine.async_llm import AsyncLLM
 from vllm.v1.metrics.loggers import LoggingStatLogger

 if not current_platform.is_cuda():
-    pytest.skip(reason="V1 currently only supported on CUDA.",
-                allow_module_level=True)
+    pytest.skip(reason="V1 currently only supported on CUDA.", allow_module_level=True)

 TEXT_ENGINE_ARGS = AsyncEngineArgs(
    model="meta-llama/Llama-3.2-1B-Instruct",
    enforce_eager=True,
 )

-VISION_ENGINE_ARGS = AsyncEngineArgs(model="Qwen/Qwen2-VL-2B-Instruct",
-                                     enforce_eager=True)
+VISION_ENGINE_ARGS = AsyncEngineArgs(
+    model="Qwen/Qwen2-VL-2B-Instruct", enforce_eager=True
+)

 TEXT_PROMPT = "Hello my name is Robert and"

@@ -38,12 +38,11 @@ VISION_PROMPT_TEMPLATE = (
    "<|im_start|>system\nYou are a helpful assistant.<|im_end|>"
    "\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>"
    "What is in the image?<|im_end|>\n"
-    "<|im_start|>assistant\n")
+    "<|im_start|>assistant\n"
+)
 VISION_PROMPT = {
    "prompt": VISION_PROMPT_TEMPLATE,
-    "multi_modal_data": {
-        "image": ImageAsset("stop_sign").pil_image
-    },
+    "multi_modal_data": {"image": ImageAsset("stop_sign").pil_image},
 }


@@ -70,10 +69,9 @@ async def generate(
        n=n,
        prompt_logprobs=prompt_logprobs,
    )
-    async for out in engine.generate(request_id=request_id,
-                                     prompt=prompt,
-                                     sampling_params=sampling_params):
-
+    async for out in engine.generate(
+        request_id=request_id, prompt=prompt, sampling_params=sampling_params
+    ):
        num_tokens = sum(len(output.token_ids) for output in out.outputs)
        if output_kind == RequestOutputKind.DELTA:
            count += num_tokens
@@ -89,7 +87,8 @@ async def generate(


@pytest.mark.parametrize(
-    "output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY])
+    "output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]
+)
@pytest.mark.parametrize(
    "engine_args,prompt",
    [(TEXT_ENGINE_ARGS, TEXT_PROMPT), (VISION_ENGINE_ARGS, VISION_PROMPT)],
@@ -121,25 +120,29 @@ async def test_load(
        for request_id in request_ids:
            tasks.append(
                asyncio.create_task(
-                    generate(engine, request_id, prompt, output_kind,
-                             NUM_EXPECTED_TOKENS)))
+                    generate(
+                        engine, request_id, prompt, output_kind, NUM_EXPECTED_TOKENS
+                    )
+                )
+            )

        # Confirm that we got all the EXPECTED tokens from the requests.
-        done, pending = await asyncio.wait(tasks,
-                                           return_when=asyncio.FIRST_EXCEPTION)
+        done, pending = await asyncio.wait(tasks, return_when=asyncio.FIRST_EXCEPTION)
        for task in pending:
            task.cancel()
        for task in done:
            num_generated_tokens, request_id = await task
            assert num_generated_tokens == NUM_EXPECTED_TOKENS, (
                f"{request_id} generated {num_generated_tokens} but "
-                f"expected {NUM_EXPECTED_TOKENS}")
+                f"expected {NUM_EXPECTED_TOKENS}"
+            )

        assert not engine.output_processor.has_unfinished_requests()


@pytest.mark.parametrize(
-    "output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY])
+    "output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]
+)
@pytest.mark.parametrize(
    "engine_args,prompt",
    [(TEXT_ENGINE_ARGS, TEXT_PROMPT), (VISION_ENGINE_ARGS, VISION_PROMPT)],
@@ -151,7 +154,6 @@ async def test_abort(
    engine_args: AsyncEngineArgs,
    prompt: PromptType,
 ):
-
    with monkeypatch.context() as m, ExitStack() as after:
        m.setenv("VLLM_USE_V1", "1")

@@ -170,14 +172,17 @@ async def test_abort(
        # Create concurrent requests.
        tasks: list[asyncio.Task] = []
        for idx, request_id in enumerate(request_ids):
-            max_tokens = (NUM_EXPECTED_TOKENS_LONG if
-                          (idx
-                           in REQUEST_IDS_TO_ABORT) else NUM_EXPECTED_TOKENS)
+            max_tokens = (
+                NUM_EXPECTED_TOKENS_LONG
+                if (idx in REQUEST_IDS_TO_ABORT)
+                else NUM_EXPECTED_TOKENS
+            )
            n = 3 if idx in PARALLEL_SAMPLE_REQ_IDS else 1
            tasks.append(
                asyncio.create_task(
-                    generate(engine, request_id, prompt, output_kind,
-                             max_tokens, n)))
+                    generate(engine, request_id, prompt, output_kind, max_tokens, n)
+                )
+            )

        # API server cancels requests when they disconnect.
        for idx in REQUEST_IDS_TO_ABORT:
@@ -197,7 +202,8 @@ async def test_abort(
                expected_tokens = NUM_EXPECTED_TOKENS * n
                assert num_generated_tokens == expected_tokens, (
                    f"{request_id} generated {num_generated_tokens} but "
-                    f"expected {expected_tokens}")
+                    f"expected {expected_tokens}"
+                )

        # Make sure all aborted requests were really aborted.
        assert not engine.output_processor.has_unfinished_requests()
@@ -205,21 +211,21 @@ async def test_abort(
        # Confirm we can do another generation.
        request_id = f"request-{REQUEST_IDS_TO_ABORT[0]}"
        task = asyncio.create_task(
-            generate(engine, request_id, prompt, output_kind,
-                     NUM_EXPECTED_TOKENS))
+            generate(engine, request_id, prompt, output_kind, NUM_EXPECTED_TOKENS)
+        )
        num_generated_tokens, request_id = await task
        assert num_generated_tokens == NUM_EXPECTED_TOKENS
        assert not engine.output_processor.has_unfinished_requests()


@pytest.mark.parametrize(
-    "output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY])
+    "output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]
+)
@pytest.mark.asyncio
 async def test_multi_abort(
    monkeypatch: pytest.MonkeyPatch,
    output_kind: RequestOutputKind,
 ):
-
    with monkeypatch.context() as m, ExitStack() as after:
        m.setenv("VLLM_USE_V1", "1")

@@ -238,14 +244,19 @@ async def test_multi_abort(
        # Create concurrent requests.
        tasks: list[asyncio.Task] = []
        for idx, request_id in enumerate(request_ids):
-            max_tokens = (NUM_EXPECTED_TOKENS_LONG if
-                          (idx
-                           in REQUEST_IDS_TO_ABORT) else NUM_EXPECTED_TOKENS)
+            max_tokens = (
+                NUM_EXPECTED_TOKENS_LONG
+                if (idx in REQUEST_IDS_TO_ABORT)
+                else NUM_EXPECTED_TOKENS
+            )
            n = 3 if idx in PARALLEL_SAMPLE_REQ_IDS else 1
            tasks.append(
                asyncio.create_task(
-                    generate(engine, request_id, TEXT_PROMPT, output_kind,
-                             max_tokens, n)))
+                    generate(
+                        engine, request_id, TEXT_PROMPT, output_kind, max_tokens, n
+                    )
+                )
+            )

        # Let requests start
        await asyncio.sleep(0.5)
@@ -261,25 +272,26 @@ async def test_multi_abort(
        for idx, result in enumerate(results):
            if idx in REQUEST_IDS_TO_ABORT:
                # Aborted requests should return partial results
-                assert isinstance(
-                    result, tuple
-                ), f"Request {idx} should have completed with partial results"
+                assert isinstance(result, tuple), (
+                    f"Request {idx} should have completed with partial results"
+                )
                num_generated_tokens, request_id = result
                # Should have generated some tokens before abort
                assert num_generated_tokens > 0, (
-                    f"Aborted request "
-                    f"{request_id} should have generated some tokens")
+                    f"Aborted request {request_id} should have generated some tokens"
+                )
            else:
                # Non-aborted requests should complete normally
-                assert isinstance(
-                    result,
-                    tuple), f"Request {idx} should have completed successfully"
+                assert isinstance(result, tuple), (
+                    f"Request {idx} should have completed successfully"
+                )
                num_generated_tokens, request_id = result
                n = 3 if idx in PARALLEL_SAMPLE_REQ_IDS else 1
                expected_tokens = NUM_EXPECTED_TOKENS * n
                assert num_generated_tokens == expected_tokens, (
                    f"{request_id} generated {num_generated_tokens} but "
-                    f"expected {expected_tokens}")
+                    f"expected {expected_tokens}"
+                )

        # Make sure all aborted requests were cleaned up
        assert not engine.output_processor.has_unfinished_requests()
@@ -297,7 +309,6 @@ async def test_finished_flag(
    engine_args: AsyncEngineArgs,
    prompt: PromptType,
 ):
-
    with monkeypatch.context() as m, ExitStack() as after:
        m.setenv("VLLM_USE_V1", "1")

@@ -314,9 +325,9 @@ async def test_finished_flag(
        )
        outputs = [
            out
-            async for out in engine.generate(request_id="request-33",
-                                             prompt=prompt,
-                                             sampling_params=sampling_params)
+            async for out in engine.generate(
+                request_id="request-33", prompt=prompt, sampling_params=sampling_params
+            )
        ]

        # Assert only the last output has the finished flag set
@@ -329,9 +340,9 @@ async def test_finished_flag(
    [(TEXT_ENGINE_ARGS, TEXT_PROMPT), (VISION_ENGINE_ARGS, VISION_PROMPT)],
 )
@pytest.mark.asyncio
-async def test_mid_stream_cancellation(monkeypatch: pytest.MonkeyPatch,
-                                       engine_args: AsyncEngineArgs,
-                                       prompt: PromptType):
+async def test_mid_stream_cancellation(
+    monkeypatch: pytest.MonkeyPatch, engine_args: AsyncEngineArgs, prompt: PromptType
+):
    """Test that requests can be cancelled mid-stream."""
    with monkeypatch.context() as m, ExitStack() as after:
        m.setenv("VLLM_USE_V1", "1")
@@ -358,7 +369,9 @@ async def test_mid_stream_cancellation(monkeypatch: pytest.MonkeyPatch,
                        RequestOutputKind.DELTA,
                        NUM_TOKENS,
                        cancel_after=NUM_EXPECTED_TOKENS,
-                    )))
+                    )
+                )
+            )

        # Wait for all tasks to complete
        results = await asyncio.gather(*tasks)
@@ -367,7 +380,8 @@ async def test_mid_stream_cancellation(monkeypatch: pytest.MonkeyPatch,
        for num_generated_tokens, request_id in results:
            assert num_generated_tokens == NUM_EXPECTED_TOKENS, (
                f"{request_id} generated {num_generated_tokens} tokens but "
-                f"expected to cancel after {NUM_EXPECTED_TOKENS}")
+                f"expected to cancel after {NUM_EXPECTED_TOKENS}"
+            )

        # Make sure no requests are left hanging
        assert not engine.output_processor.has_unfinished_requests()
@@ -375,15 +389,16 @@ async def test_mid_stream_cancellation(monkeypatch: pytest.MonkeyPatch,
        # Confirm we can reuse the request id after the cancellations.
        request_id = request_ids[0]
        task = asyncio.create_task(
-            generate(engine, request_id, prompt, RequestOutputKind.DELTA,
-                     NUM_EXPECTED_TOKENS))
+            generate(
+                engine, request_id, prompt, RequestOutputKind.DELTA, NUM_EXPECTED_TOKENS
+            )
+        )
        num_generated_tokens, request_id = await task
        assert num_generated_tokens == NUM_EXPECTED_TOKENS
        assert not engine.output_processor.has_unfinished_requests()


 class MockLoggingStatLogger(LoggingStatLogger):
-
    def __init__(self, vllm_config: VllmConfig, engine_index: int = 0):
        super().__init__(vllm_config, engine_index)
        self.log = MagicMock()
@@ -410,8 +425,7 @@ async def test_customize_loggers(monkeypatch):

        stat_loggers = engine.logger_manager.per_engine_logger_dict
        assert len(stat_loggers) == 1
-        assert len(
-            stat_loggers[0]) == 2  # LoggingStatLogger + MockLoggingStatLogger
+        assert len(stat_loggers[0]) == 2  # LoggingStatLogger + MockLoggingStatLogger
        stat_loggers[0][0].log.assert_called_once()


@@ -424,24 +438,30 @@ async def test_dp_rank_argument(monkeypatch: pytest.MonkeyPatch):
            engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS)
        after.callback(engine.shutdown)

-        sampling_params = SamplingParams(max_tokens=100,
-                                         output_kind=RequestOutputKind.DELTA,
-                                         temperature=1.0,
-                                         seed=33)
+        sampling_params = SamplingParams(
+            max_tokens=100,
+            output_kind=RequestOutputKind.DELTA,
+            temperature=1.0,
+            seed=33,
+        )

        # Test with valid DP rank.
-        async for _ in engine.generate(request_id="request-34",
-                                       prompt=TEXT_PROMPT,
-                                       sampling_params=sampling_params,
-                                       data_parallel_rank=0):
+        async for _ in engine.generate(
+            request_id="request-34",
+            prompt=TEXT_PROMPT,
+            sampling_params=sampling_params,
+            data_parallel_rank=0,
+        ):
            pass

        # Test with out-of-range DP rank.
        with pytest.raises(ValueError):
-            async for _ in engine.generate(request_id="request-35",
-                                           prompt=TEXT_PROMPT,
-                                           sampling_params=sampling_params,
-                                           data_parallel_rank=1):
+            async for _ in engine.generate(
+                request_id="request-35",
+                prompt=TEXT_PROMPT,
+                sampling_params=sampling_params,
+                data_parallel_rank=1,
+            ):
                pass


@@ -465,10 +485,14 @@ async def test_check_health(monkeypatch: pytest.MonkeyPatch):
        await engine.check_health()

        # Test 2: Mock the errored property to simulate a dead engine
-        with patch.object(type(engine),
-                          'errored',
-                          new_callable=lambda: property(lambda self: True)
-                          ), pytest.raises(EngineDeadError):
+        with (
+            patch.object(
+                type(engine),
+                "errored",
+                new_callable=lambda: property(lambda self: True),
+            ),
+            pytest.raises(EngineDeadError),
+        ):
            await engine.check_health()

        # Test 3: Verify healthy engine still works after mock
@@ -476,7 +500,8 @@ async def test_check_health(monkeypatch: pytest.MonkeyPatch):


@pytest.mark.parametrize(
-    "output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY])
+    "output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]
+)
@pytest.mark.asyncio
 async def test_abort_final_output(
    monkeypatch: pytest.MonkeyPatch,
@@ -504,8 +529,8 @@ async def test_abort_final_output(

        outputs: list[RequestOutput] = []
        generated = asyncio.create_task(
-            collect_outputs(engine, request_id, TEXT_PROMPT, sampling_params,
-                            outputs))
+            collect_outputs(engine, request_id, TEXT_PROMPT, sampling_params, outputs)
+        )

        # Let it generate some tokens
        await asyncio.sleep(0.5)
@@ -525,14 +550,13 @@ async def test_abort_final_output(
        assert final_output.outputs[0].stop_reason is None

        # Verify num_cached_tokens is set correctly
-        assert hasattr(final_output, 'num_cached_tokens')
+        assert hasattr(final_output, "num_cached_tokens")
        assert final_output.num_cached_tokens >= 0

        # If we got intermediate outputs, verify they are consistent
        if output_kind == RequestOutputKind.DELTA:
            # For DELTA, sum all intermediate tokens should <= final tokens
-            token_count = sum(
-                len(output.outputs[0].token_ids) for output in outputs)
+            token_count = sum(len(output.outputs[0].token_ids) for output in outputs)
            assert token_count > 0
            # This would ordinarily be 0, but could end up > 0 if the
            # final abort is coalesced with another chunk in the output queue.
@@ -554,9 +578,9 @@ async def collect_outputs(
 ) -> Optional[RequestOutput]:
    """Helper to collect outputs and return the final one."""
    final_output: Optional[RequestOutput] = None
-    async for output in engine.generate(request_id=request_id,
-                                        prompt=prompt,
-                                        sampling_params=sampling_params):
+    async for output in engine.generate(
+        request_id=request_id, prompt=prompt, sampling_params=sampling_params
+    ):
        if not output.finished:
            outputs_list.append(output)
        final_output = output