[CI] Split pooling from entrypoints Test (#24632)
Signed-off-by: wang.yuqi <noooop@126.com>
tests/entrypoints/pooling/llm/__init__.py (new file, 0 lines)
tests/entrypoints/pooling/llm/test_classify.py (new file, 69 lines)
@@ -0,0 +1,69 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import weakref

import pytest
import torch

from tests.models.utils import softmax
from vllm import LLM, PoolingParams
from vllm.distributed import cleanup_dist_env_and_memory

MODEL_NAME = "jason9693/Qwen2.5-1.5B-apeach"

prompts = ["The chef prepared a delicious meal."]


@pytest.fixture(scope="module")
def llm():
    # pytest caches the fixture so we use weakref.proxy to
    # enable garbage collection
    llm = LLM(model=MODEL_NAME,
              max_num_batched_tokens=32768,
              tensor_parallel_size=1,
              gpu_memory_utilization=0.75,
              enforce_eager=True,
              seed=0)

    yield weakref.proxy(llm)

    del llm

    cleanup_dist_env_and_memory()


@pytest.mark.skip_global_cleanup
def test_pooling_params(llm: LLM):

    def get_outputs(activation):
        outputs = llm.classify(
            prompts,
            pooling_params=PoolingParams(activation=activation),
            use_tqdm=False)
        return torch.tensor([x.outputs.probs for x in outputs])

    default = get_outputs(activation=None)
    w_activation = get_outputs(activation=True)
    wo_activation = get_outputs(activation=False)

    assert torch.allclose(default, w_activation,
                          atol=1e-2), "Default should use activation."
    assert not torch.allclose(
        w_activation, wo_activation,
        atol=1e-2), "wo_activation should not use activation."
    assert torch.allclose(
        softmax(wo_activation), w_activation, atol=1e-2
    ), "w_activation should be close to activation(wo_activation)."


def test_encode_api(llm: LLM):
    err_msg = "pooling_task must be one of.+"
    with pytest.raises(ValueError, match=err_msg):
        llm.encode(prompts, use_tqdm=False)


def test_score_api(llm: LLM):
    err_msg = "Score API is only enabled for num_labels == 1."
    with pytest.raises(ValueError, match=err_msg):
        llm.score("ping", "pong", use_tqdm=False)
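For context on the file above: the three assertions pin down that `activation=None` defaults to applying the activation, and that the activated output is simply softmax over the raw logits. A minimal standalone sketch of that relationship follows; it reuses the model name and prompt from the test but is illustrative, not part of this commit.

# Illustrative sketch only (not part of this commit): reproduce the
# classify activation relationship outside pytest.
import torch

from vllm import LLM, PoolingParams

llm = LLM(model="jason9693/Qwen2.5-1.5B-apeach", enforce_eager=True, seed=0)
prompt = ["The chef prepared a delicious meal."]

probs = torch.tensor(
    llm.classify(prompt, pooling_params=PoolingParams(activation=True),
                 use_tqdm=False)[0].outputs.probs)
logits = torch.tensor(
    llm.classify(prompt, pooling_params=PoolingParams(activation=False),
                 use_tqdm=False)[0].outputs.probs)

# Disabling the activation yields raw logits; applying softmax ourselves
# should recover the activated probabilities.
assert torch.allclose(torch.softmax(logits, dim=-1), probs, atol=1e-2)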
tests/entrypoints/pooling/llm/test_embedding.py (new file, 55 lines)
@@ -0,0 +1,55 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import weakref

import pytest
import torch
import torch.nn.functional as F

from vllm import LLM, PoolingParams
from vllm.distributed import cleanup_dist_env_and_memory

MODEL_NAME = "intfloat/multilingual-e5-small"

prompts = ["The chef prepared a delicious meal."]


@pytest.fixture(scope="module")
def llm():
    # pytest caches the fixture so we use weakref.proxy to
    # enable garbage collection
    llm = LLM(model=MODEL_NAME,
              max_num_batched_tokens=32768,
              tensor_parallel_size=1,
              gpu_memory_utilization=0.75,
              enforce_eager=True,
              seed=0)

    yield weakref.proxy(llm)

    del llm

    cleanup_dist_env_and_memory()


@pytest.mark.skip_global_cleanup
def test_pooling_params(llm: LLM):

    def get_outputs(normalize):
        outputs = llm.embed(prompts,
                            pooling_params=PoolingParams(normalize=normalize),
                            use_tqdm=False)
        return torch.tensor([x.outputs.embedding for x in outputs])

    default = get_outputs(normalize=None)
    w_normal = get_outputs(normalize=True)
    wo_normal = get_outputs(normalize=False)

    assert torch.allclose(default, w_normal,
                          atol=1e-2), "Default should use normal."
    assert not torch.allclose(w_normal, wo_normal,
                              atol=1e-2), "wo_normal should not use normal."
    assert torch.allclose(
        w_normal, F.normalize(wo_normal, p=2, dim=-1),
        atol=1e-2), "w_normal should be close to normal(wo_normal)."
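As the final assertion above shows, the `normalize` flag is plain L2 normalization of the pooled embedding. A hedged sketch of checking that property directly, reusing the test's model but otherwise illustrative:

# Illustrative sketch only (not part of this commit): normalize=True is
# equivalent to L2-normalizing the raw embedding yourself.
import torch
import torch.nn.functional as F

from vllm import LLM, PoolingParams

llm = LLM(model="intfloat/multilingual-e5-small", enforce_eager=True, seed=0)

raw = torch.tensor([
    o.outputs.embedding for o in llm.embed(
        ["The chef prepared a delicious meal."],
        pooling_params=PoolingParams(normalize=False),
        use_tqdm=False)
])
unit = F.normalize(raw, p=2, dim=-1)

# Unit-norm check: every normalized embedding has L2 norm 1.
assert torch.allclose(unit.norm(p=2, dim=-1), torch.ones(unit.shape[0]))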
tests/entrypoints/pooling/llm/test_encode.py (new file, 79 lines)
@@ -0,0 +1,79 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import weakref

import pytest

from vllm import LLM, PoolingParams
from vllm.distributed import cleanup_dist_env_and_memory

MODEL_NAME = "intfloat/multilingual-e5-small"

PROMPTS = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]

TOKEN_IDS = [
    # Using ID={0, 1, 2, 3} results in NaN values,
    # so we add this offset of 1000
    [1000],
    [1000, 1001],
    [1000, 1002, 1001],
    [1000, 1003, 1001, 1002],
]


@pytest.fixture(scope="module")
def llm():
    # pytest caches the fixture so we use weakref.proxy to
    # enable garbage collection
    llm = LLM(model=MODEL_NAME,
              max_num_batched_tokens=32768,
              tensor_parallel_size=1,
              gpu_memory_utilization=0.75,
              enforce_eager=True,
              seed=0)

    yield weakref.proxy(llm)

    del llm

    cleanup_dist_env_and_memory()


@pytest.mark.skip_global_cleanup
def test_multiple_pooling_params(llm: LLM):
    pooling_params = [
        PoolingParams(),
        PoolingParams(),
        PoolingParams(),
        PoolingParams(),
    ]

    # Multiple PoolingParams should be matched one-to-one with the prompts
    outputs = llm.encode(PROMPTS, pooling_params=pooling_params)
    assert len(PROMPTS) == len(outputs)

    # An exception is raised if the number of params
    # does not match the number of prompts
    with pytest.raises(ValueError):
        outputs = llm.encode(PROMPTS, pooling_params=pooling_params[:3])

    # A single PoolingParams should be applied to every prompt
    single_pooling_params = PoolingParams()
    outputs = llm.encode(PROMPTS, pooling_params=single_pooling_params)
    assert len(PROMPTS) == len(outputs)

    # If pooling_params is None, the default params should be applied
    outputs = llm.encode(PROMPTS, pooling_params=None)
    assert len(PROMPTS) == len(outputs)


@pytest.mark.skip_global_cleanup
def test_right_side_truncation(llm: LLM):
    # Embedding models should truncate the end of the prompt
    tokenizer = llm.get_tokenizer()
    assert tokenizer.truncation_side == "right"
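The first test above covers the three accepted shapes for `pooling_params` (one instance per prompt, a single shared instance, or `None`). A short usage sketch of the per-prompt form, with the model name borrowed from the test and the output handling an illustrative assumption:

# Illustrative sketch only (not part of this commit): per-prompt
# PoolingParams passed to llm.encode.
from vllm import LLM, PoolingParams

llm = LLM(model="intfloat/multilingual-e5-small", enforce_eager=True, seed=0)
prompts = ["Hello, my name is", "The capital of France is"]

# The list length must equal the number of prompts, else ValueError.
outputs = llm.encode(prompts,
                     pooling_params=[PoolingParams(), PoolingParams()])
for prompt, output in zip(prompts, outputs):
    print(prompt, "->", tuple(output.outputs.data.shape))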
tests/entrypoints/pooling/llm/test_reward.py (new file, 56 lines)
@@ -0,0 +1,56 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import weakref

import pytest
import torch

from tests.models.utils import softmax
from vllm import LLM, PoolingParams
from vllm.distributed import cleanup_dist_env_and_memory

MODEL_NAME = "internlm/internlm2-1_8b-reward"

prompts = ["The chef prepared a delicious meal."]


@pytest.fixture(scope="module")
def llm():
    # pytest caches the fixture so we use weakref.proxy to
    # enable garbage collection
    llm = LLM(model=MODEL_NAME,
              max_num_batched_tokens=32768,
              tensor_parallel_size=1,
              gpu_memory_utilization=0.75,
              enforce_eager=True,
              trust_remote_code=True,
              seed=0)

    yield weakref.proxy(llm)

    del llm

    cleanup_dist_env_and_memory()


@pytest.mark.skip_global_cleanup
def test_pooling_params(llm: LLM):

    def get_outputs(softmax):
        outputs = llm.reward(prompts,
                             pooling_params=PoolingParams(softmax=softmax),
                             use_tqdm=False)
        return torch.cat([x.outputs.data for x in outputs])

    default = get_outputs(softmax=None)
    w_softmax = get_outputs(softmax=True)
    wo_softmax = get_outputs(softmax=False)

    assert torch.allclose(default, w_softmax,
                          atol=1e-2), "Default should use softmax."
    assert not torch.allclose(w_softmax, wo_softmax,
                              atol=1e-2), "wo_softmax should not use softmax."
    assert torch.allclose(
        softmax(wo_softmax), w_softmax,
        atol=1e-2), "w_softmax should be close to softmax(wo_softmax)."
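The `softmax` used in the final assertion is imported from `tests.models.utils`, which is outside this diff. A plausible stand-in, assuming the helper is a thin wrapper over `torch.softmax` on the last dimension; shown only so the assertion above reads clearly:

# Assumed stand-in for tests.models.utils.softmax (the real helper is
# not part of this diff).
import torch


def softmax(data: torch.Tensor) -> torch.Tensor:
    # Convert raw per-token reward scores into a distribution.
    return torch.softmax(data, dim=-1)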
tests/entrypoints/pooling/llm/test_score.py (new file, 59 lines)
@@ -0,0 +1,59 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import weakref

import pytest
import torch

from tests.models.utils import softmax
from vllm import LLM, PoolingParams
from vllm.distributed import cleanup_dist_env_and_memory

MODEL_NAME = "tomaarsen/Qwen3-Reranker-0.6B-seq-cls"


@pytest.fixture(scope="module")
def llm():
    # pytest caches the fixture so we use weakref.proxy to
    # enable garbage collection
    llm = LLM(model=MODEL_NAME,
              max_num_batched_tokens=32768,
              tensor_parallel_size=1,
              gpu_memory_utilization=0.75,
              enforce_eager=True,
              seed=0)

    yield weakref.proxy(llm)

    del llm

    cleanup_dist_env_and_memory()


@pytest.mark.skip_global_cleanup
def test_pooling_params(llm: LLM):

    def get_outputs(activation):
        text_1 = "What is the capital of France?"
        text_2 = "The capital of France is Paris."

        outputs = llm.score(
            text_1,
            text_2,
            pooling_params=PoolingParams(activation=activation),
            use_tqdm=False)
        return torch.tensor([x.outputs.score for x in outputs])

    default = get_outputs(activation=None)
    w_activation = get_outputs(activation=True)
    wo_activation = get_outputs(activation=False)

    assert torch.allclose(default, w_activation,
                          atol=1e-2), "Default should use activation."
    assert not torch.allclose(
        w_activation, wo_activation,
        atol=1e-2), "wo_activation should not use activation."
    assert torch.allclose(
        softmax(wo_activation), w_activation, atol=1e-2
    ), "w_activation should be close to activation(wo_activation)."
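Finally, a minimal sketch of the score API this file exercises, reusing the test's model and text pair; the comment on interpreting the score is an assumption, not part of the commit:

# Illustrative sketch only (not part of this commit): scoring a
# query/document pair with a seq-cls reranker.
from vllm import LLM, PoolingParams

llm = LLM(model="tomaarsen/Qwen3-Reranker-0.6B-seq-cls",
          enforce_eager=True, seed=0)

outputs = llm.score("What is the capital of France?",
                    "The capital of France is Paris.",
                    pooling_params=PoolingParams(activation=True),
                    use_tqdm=False)
print(outputs[0].outputs.score)  # higher means more relevant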