diff --git a/tests/models/fixtures/qwen2_5_math_prm_reward_step.json b/tests/models/fixtures/qwen2_5_math_prm_reward_step.json
new file mode 100644
index 000000000..dc0f3010c
--- /dev/null
+++ b/tests/models/fixtures/qwen2_5_math_prm_reward_step.json
@@ -0,0 +1 @@
+[[[0.0006361007690429688, 0.99951171875], [0.81884765625, 0.1812744140625], [0.025543212890625, 0.974609375], [0.0004382133483886719, 0.99951171875]]]
\ No newline at end of file
diff --git a/tests/models/language/pooling/test_reward.py b/tests/models/language/pooling/test_reward.py
index c42186c7d..22e0539a9 100644
--- a/tests/models/language/pooling/test_reward.py
+++ b/tests/models/language/pooling/test_reward.py
@@ -1,5 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import json
+from typing import TYPE_CHECKING
 
 import pytest
 import torch
@@ -9,7 +11,22 @@ from transformers import AutoModel
 from vllm.platforms import current_platform
 
 from ....conftest import HfRunner
-from ...utils import check_transformers_version
+from ....utils import VLLM_PATH
+from ...registry import HF_EXAMPLE_MODELS
+
+if TYPE_CHECKING:
+    from _typeshed import StrPath
+
+
+# Directory containing the JSON fixture files referenced below.
+FIXTURES_PATH = VLLM_PATH / "tests/models/fixtures"
+assert FIXTURES_PATH.exists()
+# Model name -> JSON file holding that model's golden reward outputs.
+FIXTURE_REWARD_RESULT = {
+    "Qwen/Qwen2.5-Math-PRM-7B": FIXTURES_PATH.joinpath(
+        "qwen2_5_math_prm_reward_step.json"
+    ),
+}
 
 
 @pytest.fixture
@@ -60,6 +73,24 @@ def step_reward_patch_hf_model(hf_model: HfRunner):
     return hf_model
 
 
+def dump_reward_outputs(
+    outputs: list[list[list[float]]], filename: "StrPath"
+) -> None:
+    """Write reward outputs to ``filename`` as JSON, replacing any existing file.
+
+    ``outputs`` is nested as prompt -> step -> list of float scores, which is
+    the layout of the checked-in golden fixture.
+    """
+    with open(filename, "w", encoding="utf-8") as f:
+        json.dump(outputs, f)
+
+
+def load_reward_outputs(filename: "StrPath") -> list[list[list[float]]]:
+    """Read reward outputs from a JSON file written by ``dump_reward_outputs``."""
+    with open(filename, encoding="utf-8") as f:
+        return json.load(f)
+
+
 @pytest.mark.parametrize(
     "model",
     [
@@ -77,9 +100,8 @@ def test_prm_models(
     model: str,
     dtype: str,
 ) -> None:
-    check_transformers_version(
-        "Qwen/Qwen2.5-Math-PRM-7B", max_transformers_version="4.53.2"
-    )
+    model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
+    model_info.check_transformers_version(on_fail="skip")
 
     if current_platform.is_cpu():
         pytest.skip("CPU only supports V1")
@@ -91,9 +113,56 @@ def test_prm_models(
         hf_model = step_reward_patch_hf_model(hf_model)
         hf_outputs = hf_model.reward(math_step_prompts)
 
+    dump_reward_outputs(
+        hf_outputs,
+        FIXTURE_REWARD_RESULT[model],
+    )
+
     # check logits difference
     for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
         hf_output = torch.tensor(hf_output).float()
         vllm_output = torch.tensor(vllm_output).float()
 
         assert torch.allclose(hf_output, vllm_output, 1.5e-2)
+
+
+@pytest.mark.parametrize(
+    "model",
+    [
+        pytest.param(
+            "Qwen/Qwen2.5-Math-PRM-7B",
+            marks=[pytest.mark.core_model, pytest.mark.cpu_model],
+        ),
+    ],
+)
+@pytest.mark.parametrize("dtype", ["half"])
+def test_prm_models_with_golden_outputs(
+    vllm_runner,
+    math_step_prompts,
+    model: str,
+    dtype: str,
+) -> None:
+    """Compare vLLM reward outputs against checked-in golden outputs.
+
+    This test does not run an HF reference model; the expected values are
+    read from the JSON file registered in ``FIXTURE_REWARD_RESULT``.
+    """
+    fixture_path = FIXTURE_REWARD_RESULT.get(model)
+    if fixture_path is None:
+        pytest.skip(f"No available golden outputs for {model}.")
+
+    with vllm_runner(model, max_model_len=1024, dtype=dtype) as vllm_model:
+        vllm_outputs = vllm_model.reward(math_step_prompts)
+
+    golden_outputs = load_reward_outputs(fixture_path)
+
+    # zip() stops at the shorter input, so a missing or extra prompt would
+    # otherwise go unnoticed.
+    assert len(golden_outputs) == len(vllm_outputs)
+
+    # check logits difference
+    for golden_output, vllm_output in zip(golden_outputs, vllm_outputs):
+        golden_output = torch.tensor(golden_output).float()
+        vllm_output = torch.tensor(vllm_output).float()
+
+        assert torch.allclose(golden_output, vllm_output, 1.5e-2)