[CI/Build] Revive skipped reward models e2e test (#31665)

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>

Author: Isotr0py
Date: 2026-01-05 10:33:46 +08:00
Committed by: GitHub
Parent: da436f868a
Commit: 367856de14

2 changed files with 64 additions and 4 deletions

tests/models/fixtures/qwen2_5_math_prm_reward_step.json (new file)

@@ -0,0 +1 @@
+[[[0.0006361007690429688, 0.99951171875], [0.81884765625, 0.1812744140625], [0.025543212890625, 0.974609375], [0.0004382133483886719, 0.99951171875]]]
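The fixture stores golden HF reference outputs for the PRM test: one entry per prompt, and one two-class probability pair per reasoning step. A minimal sanity-check sketch, not part of the commit; reading index 1 as the "step is correct" probability is an assumption about Qwen2.5-Math-PRM's label order, not something the diff states:

import json

# Path taken from FIXTURE_REWARD_RESULT in the test file below.
with open("tests/models/fixtures/qwen2_5_math_prm_reward_step.json") as f:
    golden = json.load(f)

for prompt_scores in golden:          # one list per prompt
    for step_probs in prompt_scores:  # one [class0, class1] pair per step
        assert len(step_probs) == 2
        # Each pair looks like a softmax output, so it should sum to ~1.
        assert abs(sum(step_probs) - 1.0) < 1e-2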

tests/models/language/pooling/… (reward models e2e test)

@@ -1,5 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import json
+from typing import TYPE_CHECKING
 
 import pytest
 import torch
@@ -9,7 +11,18 @@ from transformers import AutoModel
 from vllm.platforms import current_platform
 
 from ....conftest import HfRunner
 from ...utils import check_transformers_version
+from ....utils import VLLM_PATH
+from ...registry import HF_EXAMPLE_MODELS
+
+if TYPE_CHECKING:
+    from _typeshed import StrPath
+
+FIXTURES_PATH = VLLM_PATH / "tests/models/fixtures"
+assert FIXTURES_PATH.exists()
+FIXTURE_REWARD_RESULT = {
+    "Qwen/Qwen2.5-Math-PRM-7B": FIXTURES_PATH / "qwen2_5_math_prm_reward_step.json",
+}
 
 
 @pytest.fixture
@@ -60,6 +73,16 @@ def step_reward_patch_hf_model(hf_model: HfRunner):
     return hf_model
 
 
+def dump_reward_outputs(outputs: list[list[float]], filename: "StrPath"):
+    with open(filename, "w", encoding="utf-8") as f:
+        json.dump(outputs, f)
+
+
+def load_reward_outputs(filename: "StrPath") -> list[list[float]]:
+    with open(filename, encoding="utf-8") as f:
+        return json.load(f)
+
+
 @pytest.mark.parametrize(
     "model",
     [
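Hedged usage sketch for the two helpers above (the temp path is illustrative): dump_reward_outputs serializes the HF reference scores to JSON and load_reward_outputs reads them back, so a round trip should be lossless for plain floats:

outputs = [[0.1, 0.9], [0.8, 0.2]]
dump_reward_outputs(outputs, "/tmp/reward_step.json")
assert load_reward_outputs("/tmp/reward_step.json") == outputs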
@@ -77,9 +100,8 @@ def test_prm_models(
     model: str,
     dtype: str,
 ) -> None:
-    check_transformers_version(
-        "Qwen/Qwen2.5-Math-PRM-7B", max_transformers_version="4.53.2"
-    )
+    model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
+    model_info.check_transformers_version(on_fail="skip")
 
     if current_platform.is_cpu():
         pytest.skip("CPU only supports V1")
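The hard-coded transformers version pin is replaced by a registry lookup. A sketch of the pattern, assuming check_transformers_version(on_fail="skip") skips the test rather than failing it when the installed transformers version falls outside the range declared in HF_EXAMPLE_MODELS (inferred from the names, not verified here):

model_info = HF_EXAMPLE_MODELS.find_hf_info("Qwen/Qwen2.5-Math-PRM-7B")
# Skips instead of failing when the version requirement is unmet.
model_info.check_transformers_version(on_fail="skip")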
@@ -91,9 +113,46 @@ def test_prm_models(
         hf_model = step_reward_patch_hf_model(hf_model)
         hf_outputs = hf_model.reward(math_step_prompts)
+        dump_reward_outputs(
+            hf_outputs,
+            FIXTURE_REWARD_RESULT[model],
+        )
 
     # check logits difference
     for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
         hf_output = torch.tensor(hf_output).float()
         vllm_output = torch.tensor(vllm_output).float()
 
         assert torch.allclose(hf_output, vllm_output, 1.5e-2)
+
+
+@pytest.mark.parametrize(
+    "model",
+    [
+        pytest.param(
+            "Qwen/Qwen2.5-Math-PRM-7B",
+            marks=[pytest.mark.core_model, pytest.mark.cpu_model],
+        ),
+    ],
+)
+@pytest.mark.parametrize("dtype", ["half"])
+def test_prm_models_with_golden_outputs(
+    vllm_runner,
+    math_step_prompts,
+    model: str,
+    dtype: str,
+) -> None:
+    if not FIXTURE_REWARD_RESULT.get(model):
+        pytest.skip(f"No available golden outputs for {model}.")
+
+    with vllm_runner(model, max_model_len=1024, dtype=dtype) as vllm_model:
+        vllm_outputs = vllm_model.reward(math_step_prompts)
+
+    golden_outputs = load_reward_outputs(FIXTURE_REWARD_RESULT[model])
+
+    # check logits difference
+    for golden_output, vllm_output in zip(golden_outputs, vllm_outputs):
+        golden_output = torch.tensor(golden_output).float()
+        vllm_output = torch.tensor(vllm_output).float()
+
+        assert torch.allclose(golden_output, vllm_output, 1.5e-2)
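A note on the tolerance used by both comparison loops: the positional 1.5e-2 in torch.allclose is rtol, with atol keeping its 1e-8 default, so each score must satisfy |golden - vllm| <= 1e-8 + 1.5e-2 * |vllm|. Illustrative check with made-up numbers:

import torch

golden = torch.tensor([0.9995, 0.1813])
vllm = torch.tensor([0.9990, 0.1820])
# Passes: both elements are within 1.5% relative tolerance of vllm.
assert torch.allclose(golden, vllm, 1.5e-2)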