Convert formatting to use ruff instead of yapf + isort (#26247)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
@@ -1,2 +1,2 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
@@ -8,11 +8,9 @@ Pytest configuration for GPT-OSS evaluation tests.
 def pytest_addoption(parser):
 """Add command line options for pytest."""
 parser.addoption("--model", action="store", help="Model name to evaluate")
-parser.addoption("--metric",
-action="store",
-type=float,
-help="Expected metric threshold")
-parser.addoption("--server-args",
-action="store",
-default="",
-help="Additional server arguments")
+parser.addoption(
+"--metric", action="store", type=float, help="Expected metric threshold"
+)
+parser.addoption(
+"--server-args", action="store", default="", help="Additional server arguments"
+)
|
||||
|
||||
@@ -25,9 +25,19 @@ def run_gpqa_eval(model_name: str, base_url: str) -> float:
 
 # Build the command to run the evaluation
 cmd = [
-sys.executable, "-m", "gpt_oss.evals", "--eval", "gpqa", "--model",
-model_name, "--reasoning-effort", "low", "--base-url", base_url,
-"--n-threads", "200"
+sys.executable,
+"-m",
+"gpt_oss.evals",
+"--eval",
+"gpqa",
+"--model",
+model_name,
+"--reasoning-effort",
+"low",
+"--base-url",
+base_url,
+"--n-threads",
+"200",
 ]
 
 try:
|
||||
@@ -37,7 +47,8 @@ def run_gpqa_eval(model_name: str, base_url: str) -> float:
 text=True,
 capture_output=True,
 timeout=1800, # 30 minute timeout
-env={"OPENAI_API_KEY": "dummy"})
+env={"OPENAI_API_KEY": "dummy"},
+)
 
 print("Evaluation process output:\n", result.stdout)
|
||||
|
||||
@@ -48,14 +59,16 @@ def run_gpqa_eval(model_name: str, base_url: str) -> float:
 
 # If we still can't find it, raise an error
 raise ValueError(
-f"Could not parse score from evaluation output:\n{result.stdout}")
+f"Could not parse score from evaluation output:\n{result.stdout}"
+)
 
 except subprocess.TimeoutExpired as e:
 raise RuntimeError("Evaluation timed out") from e
 except subprocess.CalledProcessError as e:
 raise RuntimeError(
 f"Evaluation failed with exit code {e.returncode}:\n"
-f"stdout: {e.stdout}\nstderr: {e.stderr}") from e
+f"stdout: {e.stdout}\nstderr: {e.stderr}"
+) from e
 
 
 def test_gpqa_correctness(request):
|
||||
@@ -72,17 +85,20 @@ def test_gpqa_correctness(request):
 server_args = server_args_str.split()
 
 # Add standard server arguments
-server_args.extend([
-"--trust-remote-code",
-])
+server_args.extend(
+[
+"--trust-remote-code",
+]
+)
 
 print(f"Starting GPQA evaluation for model: {model_name}")
 print(f"Expected metric threshold: {expected_metric}")
 print(f"Server args: {' '.join(server_args)}")
 
 # Launch server and run evaluation
-with RemoteOpenAIServer(model_name, server_args,
-max_wait_seconds=1800) as remote_server:
+with RemoteOpenAIServer(
+model_name, server_args, max_wait_seconds=1800
+) as remote_server:
 base_url = remote_server.url_for("v1")
 print(f"Server started at: {base_url}")
|
||||
|
||||
@@ -96,6 +112,7 @@ def test_gpqa_correctness(request):
 # Verify metric is within tolerance
 assert measured_metric >= expected_metric - TOL, (
 f"GPQA metric too low: {measured_metric:.4f} < "
-f"{expected_metric:.4f} - {TOL:.4f} = {expected_metric - TOL:.4f}")
+f"{expected_metric:.4f} - {TOL:.4f} = {expected_metric - TOL:.4f}"
+)
 
 print(f"✅ GPQA test passed for {model_name}")
|
||||
|
||||
Reference in New Issue
Block a user