[Model][2/N] Improve all pooling task | Support multi-vector retrieval (#25370)

Signed-off-by: wang.yuqi <noooop@126.com>
2025-10-15 19:14:41 +08:00
parent d4d1a6024f
commit f54f85129e
41 changed files with 786 additions and 399 deletions
--- a/tests/models/language/pooling/test_multi_vector_retrieval.py
+++ b/tests/models/language/pooling/test_multi_vector_retrieval.py
@@ -0,0 +1,45 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+import torch
+from transformers import AutoModel
+
+from tests.models.utils import check_embeddings_close
+
+
+@pytest.mark.parametrize(
+    "model",
+    ["BAAI/bge-m3"],
+)
+@pytest.mark.parametrize("dtype", ["half"])
+@torch.inference_mode
+def test_embed_models(hf_runner, vllm_runner, example_prompts, model: str, dtype: str):
+    with vllm_runner(
+        model,
+        runner="pooling",
+        max_model_len=None,
+    ) as vllm_model:
+        vllm_outputs = vllm_model.token_embed(example_prompts)
+
+    with hf_runner(
+        model,
+        auto_cls=AutoModel,
+    ) as hf_model:
+        tokenizer = hf_model.tokenizer
+        hf_outputs = []
+        for prompt in example_prompts:
+            inputs = tokenizer([prompt], return_tensors="pt")
+            inputs = hf_model.wrap_device(inputs)
+            output = hf_model.model(**inputs)
+            embedding = output.last_hidden_state[0].float()
+            # normal
+            hf_outputs.append(embedding.cpu())
+
+    for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
+        check_embeddings_close(
+            embeddings_0_lst=hf_output,
+            embeddings_1_lst=vllm_output,
+            name_0="hf",
+            name_1="vllm",
+            tol=1e-2,
+        )
--- a/tests/models/language/pooling/test_pooler_config_init_behaviour.py
+++ b/tests/models/language/pooling/test_pooler_config_init_behaviour.py
@@ -93,7 +93,7 @@ def test_embed_models_using_normalize(
    ],
 )
@pytest.mark.parametrize("dtype", ["half"])
-def test_reward_models_using_softmax(
+def test_reward_models_using_activation(
    hf_runner,
    vllm_runner,
    example_prompts,
@@ -104,22 +104,64 @@ def test_reward_models_using_softmax(
        model,
        max_model_len=1024,
        dtype=dtype,
-        pooler_config=PoolerConfig(softmax=False),
+        pooler_config=PoolerConfig(activation=False),
    ) as vllm_model:
-        wo_softmax = vllm_model.encode(example_prompts)
+        wo_activation = vllm_model.reward(example_prompts)

    with vllm_runner(
-        model, max_model_len=1024, dtype=dtype, pooler_config=PoolerConfig(softmax=True)
+        model,
+        max_model_len=1024,
+        dtype=dtype,
+        pooler_config=PoolerConfig(activation=True),
    ) as vllm_model:
-        w_softmax = vllm_model.encode(example_prompts)
+        w_activation = vllm_model.reward(example_prompts)

-    for wo, w in zip(wo_softmax, w_softmax):
+    for wo, w in zip(wo_activation, w_activation):
        wo = torch.tensor(wo)
        w = torch.tensor(w)

        assert not torch.allclose(wo, w, atol=1e-2), (
-            "pooler_config softmax is not working"
+            "pooler_config activation is not working"
        )
        assert torch.allclose(softmax(wo), w, atol=1e-2), (
-            "w_softmax should be close to softmax(wo_softmax)."
+            "w_activation should be close to activation(wo_activation)."
+        )
+
+
+@pytest.mark.parametrize(
+    "model",
+    [
+        "intfloat/multilingual-e5-small",
+    ],
+)
+@pytest.mark.parametrize("dtype", ["half"])
+def test_multi_vector_retrieval_models_using_normalize(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+) -> None:
+    with vllm_runner(
+        model,
+        max_model_len=512,
+        dtype=dtype,
+        pooler_config=PoolerConfig(normalize=False),
+    ) as vllm_model:
+        wo_normalize = vllm_model.token_embed(example_prompts)
+
+    with vllm_runner(
+        model,
+        max_model_len=512,
+        dtype=dtype,
+        pooler_config=PoolerConfig(normalize=True),
+    ) as vllm_model:
+        w_normalize = vllm_model.token_embed(example_prompts)
+
+    for wo, w in zip(wo_normalize, w_normalize):
+        assert not torch.allclose(wo, w, atol=1e-2), (
+            "pooler_config normalize is not working"
+        )
+        assert torch.allclose(F.normalize(wo, p=2, dim=-1), w, atol=1e-2), (
+            "w_normal should be close to normal(wo_normal)."
        )
--- a/tests/models/language/pooling/test_token_classification.py
+++ b/tests/models/language/pooling/test_token_classification.py
@@ -19,7 +19,7 @@ def test_bert_models(
    dtype: str,
 ) -> None:
    with vllm_runner(model, max_model_len=None, dtype=dtype) as vllm_model:
-        vllm_outputs = vllm_model.encode(example_prompts)
+        vllm_outputs = vllm_model.token_classify(example_prompts)

    with hf_runner(
        model, dtype=dtype, auto_cls=AutoModelForTokenClassification
@@ -50,7 +50,7 @@ def test_modernbert_models(
    dtype: str,
 ) -> None:
    with vllm_runner(model, max_model_len=None, dtype=dtype) as vllm_model:
-        vllm_outputs = vllm_model.encode(example_prompts)
+        vllm_outputs = vllm_model.token_classify(example_prompts)

    with hf_runner(
        model, dtype=dtype, auto_cls=AutoModelForTokenClassification