diff --git a/tests/models/fixtures/qwen2_5_math_prm_reward_step.json b/tests/models/fixtures/qwen2_5_math_prm_reward_step.json
new file mode 100644
index 000000000..dc0f3010c
--- /dev/null
+++ b/tests/models/fixtures/qwen2_5_math_prm_reward_step.json
@@ -0,0 +1 @@
+[[[0.0006361007690429688, 0.99951171875], [0.81884765625, 0.1812744140625], [0.025543212890625, 0.974609375], [0.0004382133483886719, 0.99951171875]]]
\ No newline at end of file
diff --git a/tests/models/language/pooling/test_reward.py b/tests/models/language/pooling/test_reward.py
index c42186c7d..22e0539a9 100644
--- a/tests/models/language/pooling/test_reward.py
+++ b/tests/models/language/pooling/test_reward.py
@@ -1,5 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import json
+from typing import TYPE_CHECKING
 
 import pytest
 import torch
@@ -9,7 +11,18 @@ from transformers import AutoModel
 from vllm.platforms import current_platform
 
 from ....conftest import HfRunner
-from ...utils import check_transformers_version
+from ....utils import VLLM_PATH
+from ...registry import HF_EXAMPLE_MODELS
+
+if TYPE_CHECKING:
+    from _typeshed import StrPath
+
+
+FIXTURES_PATH = VLLM_PATH / "tests/models/fixtures"
+assert FIXTURES_PATH.exists()
+FIXTURE_REWARD_RESULT = {
+    "Qwen/Qwen2.5-Math-PRM-7B": FIXTURES_PATH / "qwen2_5_math_prm_reward_step.json",
+}
 
 
 @pytest.fixture
@@ -60,6 +73,18 @@ def step_reward_patch_hf_model(hf_model: HfRunner):
     return hf_model
 
 
+def dump_reward_outputs(outputs: list[list[list[float]]], filename: "StrPath"):
+    """Serialize reward outputs to ``filename`` as JSON, overwriting it."""
+    with open(filename, "w", encoding="utf-8") as f:
+        json.dump(outputs, f)
+
+
+def load_reward_outputs(filename: "StrPath") -> list[list[list[float]]]:
+    """Load reward outputs stored as JSON in ``filename``."""
+    with open(filename, encoding="utf-8") as f:
+        return json.load(f)
+
+
 @pytest.mark.parametrize(
     "model",
     [
@@ -77,9 +102,8 @@ def test_prm_models(
     model: str,
     dtype: str,
 ) -> None:
-    check_transformers_version(
-        "Qwen/Qwen2.5-Math-PRM-7B", max_transformers_version="4.53.2"
-    )
+    model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
+    model_info.check_transformers_version(on_fail="skip")
 
     if current_platform.is_cpu():
         pytest.skip("CPU only supports V1")
@@ -91,9 +115,54 @@ def test_prm_models(
         hf_model = step_reward_patch_hf_model(hf_model)
         hf_outputs = hf_model.reward(math_step_prompts)
 
+    # Persist the HF reference outputs to the fixture file that
+    # test_prm_models_with_golden_outputs loads.
+    dump_reward_outputs(
+        hf_outputs,
+        FIXTURE_REWARD_RESULT[model],
+    )
+
     # check logits difference
     for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
         hf_output = torch.tensor(hf_output).float()
         vllm_output = torch.tensor(vllm_output).float()
 
         assert torch.allclose(hf_output, vllm_output, 1.5e-2)
+
+
+@pytest.mark.parametrize(
+    "model",
+    [
+        pytest.param(
+            "Qwen/Qwen2.5-Math-PRM-7B",
+            marks=[pytest.mark.core_model, pytest.mark.cpu_model],
+        ),
+    ],
+)
+@pytest.mark.parametrize("dtype", ["half"])
+def test_prm_models_with_golden_outputs(
+    vllm_runner,
+    math_step_prompts,
+    model: str,
+    dtype: str,
+) -> None:
+    """Compare vLLM reward outputs against the stored golden fixture."""
+    fixture_path = FIXTURE_REWARD_RESULT.get(model)
+    if fixture_path is None:
+        pytest.skip(f"No available golden outputs for {model}.")
+
+    with vllm_runner(model, max_model_len=1024, dtype=dtype) as vllm_model:
+        vllm_outputs = vllm_model.reward(math_step_prompts)
+
+    golden_outputs = load_reward_outputs(fixture_path)
+
+    # zip() stops at the shorter input, so a truncated or empty fixture
+    # would otherwise let the loop below pass without comparing everything.
+    assert len(golden_outputs) == len(vllm_outputs)
+
+    # check logits difference
+    for golden_output, vllm_output in zip(golden_outputs, vllm_outputs):
+        golden_output = torch.tensor(golden_output).float()
+        vllm_output = torch.tensor(vllm_output).float()
+
+        assert torch.allclose(golden_output, vllm_output, rtol=1.5e-2)