[Model] Systematic support for fp32 head, pooling models part (#23810)

Signed-off-by: wang.yuqi <noooop@126.com>
2025-09-09 22:29:50 +08:00
parent a55cf41a09
commit 19332c0479
14 changed files with 166 additions and 61 deletions
--- a/tests/models/language/pooling/mteb_utils.py
+++ b/tests/models/language/pooling/mteb_utils.py
@@ -9,6 +9,7 @@ import mteb
 import numpy as np
 import pytest
 import requests
+import torch

 from tests.models.utils import (EmbedModelInfo, RerankModelInfo,
                                check_embeddings_close)
@@ -165,16 +166,19 @@ def mteb_test_embed_models(hf_runner,
                           vllm_extra_kwargs=None,
                           hf_model_callback=None,
                           atol=MTEB_EMBED_TOL):
+    # A model family has many models with the same architecture,
+    # and we don't need to test each one.
    if not model_info.enable_test:
-        # A model family has many models with the same architecture,
-        # and we don't need to test each one.
        pytest.skip("Skipping test.")

-    example_prompts = ["The chef prepared a delicious meal."]
+    # Test embed_dims, isnan and whether to use normalize
+    example_prompts = ["The chef prepared a delicious meal." * 1000]

+    # Allow vllm to test using the given dtype, such as float32
    vllm_extra_kwargs = vllm_extra_kwargs or {}
    vllm_extra_kwargs["dtype"] = model_info.dtype

+    # Allow vllm to test using hf_overrides
    if model_info.hf_overrides is not None:
        vllm_extra_kwargs["hf_overrides"] = model_info.hf_overrides

@@ -186,21 +190,32 @@ def mteb_test_embed_models(hf_runner,

        model_config = vllm_model.llm.llm_engine.model_config

+        # Confirm whether vllm is using the correct architecture
        if model_info.architecture:
            assert model_info.architecture in model_config.architectures
+
+        # Confirm whether vllm uses the correct default_pooling_type, which
+        # relates to whether chunked prefill and prefix caching are enabled
        assert (model_config._model_info.default_pooling_type ==
                model_info.default_pooling_type)

        vllm_main_score = run_mteb_embed_task(VllmMtebEncoder(vllm_model),
                                              MTEB_EMBED_TASKS)
        vllm_dtype = vllm_model.llm.llm_engine.model_config.dtype
-        vllm_outputs = vllm_model.embed(example_prompts)

+        # Test embed_dims, isnan and whether to use normalize
+        vllm_outputs = vllm_model.embed(example_prompts,
+                                        truncate_prompt_tokens=-1)
+        assert not torch.any(torch.isnan(torch.tensor(vllm_outputs)))
+
+    # Accelerate mteb test by setting
+    # SentenceTransformers mteb score to a constant
    if model_info.mteb_score is None:
        with hf_runner(model_info.name,
                       is_sentence_transformer=True,
                       dtype="float32") as hf_model:

+            # e.g. setting default parameters for the encode method of hf_runner
            if hf_model_callback is not None:
                hf_model_callback(hf_model)

@@ -299,14 +314,16 @@ def mteb_test_rerank_models(hf_runner,
                            hf_model_callback=None,
                            vllm_mteb_encoder=VllmMtebEncoder,
                            atol=MTEB_RERANK_TOL):
+    # A model family has many models with the same architecture,
+    # and we don't need to test each one.
    if not model_info.enable_test:
-        # A model family has many models with the same architecture,
-        # and we don't need to test each one.
        pytest.skip("Skipping test.")

+    # Allow vllm to test using the given dtype, such as float32
    vllm_extra_kwargs = vllm_extra_kwargs or {}
    vllm_extra_kwargs["dtype"] = model_info.dtype

+    # Allow vllm to test using hf_overrides
    if model_info.hf_overrides is not None:
        vllm_extra_kwargs["hf_overrides"] = model_info.hf_overrides

@@ -319,9 +336,15 @@ def mteb_test_rerank_models(hf_runner,

        model_config = vllm_model.llm.llm_engine.model_config

+        # Confirm whether vllm is using the correct architecture
        if model_info.architecture:
            assert (model_info.architecture in model_config.architectures)
+
+        # Score API is only enabled for num_labels == 1
        assert model_config.hf_config.num_labels == 1
+
+        # Confirm whether vllm uses the correct default_pooling_type, which
+        # relates to whether chunked prefill and prefix caching are enabled
        assert (model_config._model_info.default_pooling_type ==
                model_info.default_pooling_type)

@@ -330,6 +353,8 @@ def mteb_test_rerank_models(hf_runner,
                                          languages=MTEB_RERANK_LANGS)
        vllm_dtype = model_config.dtype

+    # Accelerate mteb test by setting
+    # SentenceTransformers mteb score to a constant
    if model_info.mteb_score is None:
        st_main_score, st_dtype = mteb_test_rerank_models_hf(
            hf_runner, model_info.name, hf_model_callback)