Convert formatting to use ruff instead of yapf + isort (#26247)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-05 15:06:22 +01:00
parent 17edd8a807
commit d6953beb91
1508 changed files with 115244 additions and 94146 deletions
--- a/tests/evals/gpt_oss/__init__.py
+++ b/tests/evals/gpt_oss/__init__.py
@@ -1,2 +1,2 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
--- a/tests/evals/gpt_oss/conftest.py
+++ b/tests/evals/gpt_oss/conftest.py
@@ -8,11 +8,9 @@ Pytest configuration for GPT-OSS evaluation tests.
 def pytest_addoption(parser):
    """Add command line options for pytest."""
    parser.addoption("--model", action="store", help="Model name to evaluate")
-    parser.addoption("--metric",
-                     action="store",
-                     type=float,
-                     help="Expected metric threshold")
-    parser.addoption("--server-args",
-                     action="store",
-                     default="",
-                     help="Additional server arguments")
+    parser.addoption(
+        "--metric", action="store", type=float, help="Expected metric threshold"
+    )
+    parser.addoption(
+        "--server-args", action="store", default="", help="Additional server arguments"
+    )
--- a/tests/evals/gpt_oss/test_gpqa_correctness.py
+++ b/tests/evals/gpt_oss/test_gpqa_correctness.py
@@ -25,9 +25,19 @@ def run_gpqa_eval(model_name: str, base_url: str) -> float:

    # Build the command to run the evaluation
    cmd = [
-        sys.executable, "-m", "gpt_oss.evals", "--eval", "gpqa", "--model",
-        model_name, "--reasoning-effort", "low", "--base-url", base_url,
-        "--n-threads", "200"
+        sys.executable,
+        "-m",
+        "gpt_oss.evals",
+        "--eval",
+        "gpqa",
+        "--model",
+        model_name,
+        "--reasoning-effort",
+        "low",
+        "--base-url",
+        base_url,
+        "--n-threads",
+        "200",
    ]

    try:
@@ -37,7 +47,8 @@ def run_gpqa_eval(model_name: str, base_url: str) -> float:
            text=True,
            capture_output=True,
            timeout=1800,  # 30 minute timeout
-            env={"OPENAI_API_KEY": "dummy"})
+            env={"OPENAI_API_KEY": "dummy"},
+        )

        print("Evaluation process output:\n", result.stdout)

@@ -48,14 +59,16 @@ def run_gpqa_eval(model_name: str, base_url: str) -> float:

        # If we still can't find it, raise an error
        raise ValueError(
-            f"Could not parse score from evaluation output:\n{result.stdout}")
+            f"Could not parse score from evaluation output:\n{result.stdout}"
+        )

    except subprocess.TimeoutExpired as e:
        raise RuntimeError("Evaluation timed out") from e
    except subprocess.CalledProcessError as e:
        raise RuntimeError(
            f"Evaluation failed with exit code {e.returncode}:\n"
-            f"stdout: {e.stdout}\nstderr: {e.stderr}") from e
+            f"stdout: {e.stdout}\nstderr: {e.stderr}"
+        ) from e


 def test_gpqa_correctness(request):
@@ -72,17 +85,20 @@ def test_gpqa_correctness(request):
        server_args = server_args_str.split()

    # Add standard server arguments
-    server_args.extend([
-        "--trust-remote-code",
-    ])
+    server_args.extend(
+        [
+            "--trust-remote-code",
+        ]
+    )

    print(f"Starting GPQA evaluation for model: {model_name}")
    print(f"Expected metric threshold: {expected_metric}")
    print(f"Server args: {' '.join(server_args)}")

    # Launch server and run evaluation
-    with RemoteOpenAIServer(model_name, server_args,
-                            max_wait_seconds=1800) as remote_server:
+    with RemoteOpenAIServer(
+        model_name, server_args, max_wait_seconds=1800
+    ) as remote_server:
        base_url = remote_server.url_for("v1")
        print(f"Server started at: {base_url}")

@@ -96,6 +112,7 @@ def test_gpqa_correctness(request):
        # Verify metric is within tolerance
        assert measured_metric >= expected_metric - TOL, (
            f"GPQA metric too low: {measured_metric:.4f} < "
-            f"{expected_metric:.4f} - {TOL:.4f} = {expected_metric - TOL:.4f}")
+            f"{expected_metric:.4f} - {TOL:.4f} = {expected_metric - TOL:.4f}"
+        )

        print(f"✅ GPQA test passed for {model_name}")