[Deprecate] Deprecate pooling multi-task support. (#37956)

Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
Signed-off-by: wang.yuqi <noooop@126.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
commit 1b6cb920e6 (parent 352b90c4a4)
Author:    wang.yuqi
Date:      2026-03-24 22:07:47 +08:00
Committer: GitHub
18 changed files with 566 additions and 66 deletions
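
What the deprecation means in practice, as a rough sketch (not part of the
diff): the pooling task is now pinned once per engine via
PoolerConfig(task=...), and each encode request is expected to ask for that
same task. The model name and task below are taken from the new tests; the
snippet is illustrative only.

# Illustrative only: mirrors the offline test added in this commit.
from vllm import LLM
from vllm.config import PoolerConfig

llm = LLM(
    model="intfloat/multilingual-e5-small",
    pooler_config=PoolerConfig(task="token_embed"),  # one task per engine
    enforce_eager=True,
)

# Matches the configured task: one per-token embedding matrix per prompt.
(output,) = llm.encode(
    "The chef prepared a delicious meal.",
    pooling_task="token_embed",
    use_tqdm=False,
)
print(output.outputs.data.shape)  # (num_tokens, hidden_size)

# Other tasks (e.g. "classify") are rejected, and "embed" only logs a
# deprecation warning, per the tests below.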


@@ -0,0 +1,75 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import logging
import weakref

import pytest

from vllm import LLM, PoolingRequestOutput
from vllm.config import PoolerConfig
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.platforms import current_platform
from vllm.tasks import PoolingTask

MODEL_NAME = "intfloat/multilingual-e5-small"

prompt = "The chef prepared a delicious meal."
prompt_token_ids = [0, 581, 21861, 133888, 10, 8, 150, 60744, 109911, 5, 2]

embedding_size = 384


@pytest.fixture(scope="module")
def llm():
    # ROCm: use the FLEX_ATTENTION backend, as it is the only attention
    # backend that supports encoder-only models on ROCm.
    attention_config = None
    if current_platform.is_rocm():
        attention_config = {"backend": "FLEX_ATTENTION"}

    # pytest caches the fixture, so we use weakref.proxy to
    # enable garbage collection
    llm = LLM(
        model=MODEL_NAME,
        pooler_config=PoolerConfig(task="token_embed"),
        max_num_batched_tokens=32768,
        tensor_parallel_size=1,
        gpu_memory_utilization=0.75,
        enforce_eager=True,
        seed=0,
        attention_config=attention_config,
    )
    assert embedding_size == llm.model_config.embedding_size

    yield weakref.proxy(llm)

    del llm
    cleanup_dist_env_and_memory()


@pytest.mark.skip_global_cleanup
def test_str_prompts(llm: LLM):
    outputs = llm.encode(prompt, pooling_task="token_embed", use_tqdm=False)

    assert len(outputs) == 1
    assert isinstance(outputs[0], PoolingRequestOutput)
    assert outputs[0].outputs.data.shape == (11, 384)


@pytest.mark.skip_global_cleanup
def test_token_ids_prompts(llm: LLM):
    outputs = llm.encode(
        [prompt_token_ids], pooling_task="token_embed", use_tqdm=False
    )

    assert len(outputs) == 1
    assert isinstance(outputs[0], PoolingRequestOutput)
    assert outputs[0].outputs.data.shape == (11, 384)


@pytest.mark.parametrize("task", ["embed", "classify", "token_classify"])
def test_unsupported_tasks(llm: LLM, task: PoolingTask, caplog_vllm):
    # "embed" is only deprecated (the request still runs and a warning is
    # logged); the other tasks are rejected outright.
    if task == "embed":
        with caplog_vllm.at_level(level=logging.WARNING, logger="vllm"):
            llm.encode(prompt, pooling_task=task, use_tqdm=False)

        assert "deprecated" in caplog_vllm.text
    else:
        err_msg = "Classification API is not supported by this model.+"
        with pytest.raises(ValueError, match=err_msg):
            llm.encode(prompt, pooling_task=task, use_tqdm=False)
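
The next file exercises the online path. For context, a rough sketch (not in
the diff) of starting an equivalent server outside pytest; the flags mirror
the fixture's args list below, and the use of the vllm CLI here is an
assumption; in the tests, RemoteOpenAIServer launches the API server itself.

# Illustrative only: start a pooling server with a fixed pooler task.
import subprocess

server_proc = subprocess.Popen(
    [
        "vllm", "serve", "intfloat/multilingual-e5-small",
        "--runner", "pooling",
        "--dtype", "bfloat16",
        "--enforce-eager",
        "--max-model-len", "512",
        "--pooler-config.task", "token_embed",
    ]
)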


@@ -0,0 +1,93 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import requests

from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.pooling.pooling.protocol import PoolingResponse

MODEL_NAME = "intfloat/multilingual-e5-small"
DTYPE = "bfloat16"

input_text = "The best thing about vLLM is that it supports many different models"
input_tokens = [
    0,
    581,
    2965,
    13580,
    1672,
    81,
    23708,
    594,
    83,
    450,
    442,
    8060,
    7,
    5941,
    12921,
    115774,
    2,
]


@pytest.fixture(scope="module")
def server():
    args = [
        "--runner",
        "pooling",
        "--dtype",
        DTYPE,
        "--enforce-eager",
        "--max-model-len",
        "512",
        "--pooler-config.task",
        "token_embed",
    ]

    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_pooling_token_embed(server: RemoteOpenAIServer, model_name: str):
    task = "token_embed"

    response = requests.post(
        server.url_for("pooling"),
        json={
            "model": model_name,
            "input": input_text,
            "encoding_format": "float",
            "task": task,
        },
    )
    poolings = PoolingResponse.model_validate(response.json())

    assert len(poolings.data) == 1
    assert len(poolings.data[0].data) == len(input_tokens)
    assert len(poolings.data[0].data[0]) == 384


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("task", ["embed", "classify", "token_classify", "plugin"])
async def test_pooling_not_supported(
    server: RemoteOpenAIServer, model_name: str, task: str
):
    response = requests.post(
        server.url_for("pooling"),
        json={
            "model": model_name,
            "input": "test",
            "encoding_format": "float",
            "task": task,
        },
    )

    # "embed" goes through the deprecation path, so only the other tasks are
    # expected to be rejected here.
    if task != "embed":
        assert response.json()["error"]["type"] == "BadRequestError"

        err_msg = f"Unsupported task: {task!r}"
        assert response.json()["error"]["message"].startswith(err_msg)
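
Outside pytest, the same request pattern looks roughly like this (not in the
diff); the base URL is an assumption for a locally running server, and the
response fields follow the PoolingResponse usage in the test above.

# Illustrative only: request per-token embeddings from the /pooling endpoint.
import requests

resp = requests.post(
    "http://localhost:8000/pooling",
    json={
        "model": "intfloat/multilingual-e5-small",
        "input": "The best thing about vLLM is that it supports many different models",
        "encoding_format": "float",
        "task": "token_embed",
    },
)
resp.raise_for_status()
item = resp.json()["data"][0]
print(len(item["data"]), len(item["data"][0]))  # num_tokens, hidden_size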