Convert formatting to use ruff instead of yapf + isort (#26247)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
@@ -6,8 +6,7 @@ from typing import Optional
|
||||
import pytest
|
||||
|
||||
from tests.conftest import HfRunner
|
||||
from tests.models.utils import (EmbedModelInfo, check_embeddings_close,
|
||||
matryoshka_fy)
|
||||
from tests.models.utils import EmbedModelInfo, check_embeddings_close, matryoshka_fy
|
||||
|
||||
|
||||
def run_embedding_correctness_test(
|
||||
@@ -29,12 +28,14 @@ def run_embedding_correctness_test(
|
||||
)
|
||||
|
||||
|
||||
def correctness_test_embed_models(hf_runner,
|
||||
vllm_runner,
|
||||
model_info: EmbedModelInfo,
|
||||
example_prompts,
|
||||
vllm_extra_kwargs=None,
|
||||
hf_model_callback=None):
|
||||
def correctness_test_embed_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
model_info: EmbedModelInfo,
|
||||
example_prompts,
|
||||
vllm_extra_kwargs=None,
|
||||
hf_model_callback=None,
|
||||
):
|
||||
pytest.skip("Debug only, ci prefers to use mteb test.")
|
||||
|
||||
# The example_prompts has ending "\n", for example:
|
||||
@@ -51,18 +52,16 @@ def correctness_test_embed_models(hf_runner,
|
||||
if model_info.hf_overrides is not None:
|
||||
vllm_extra_kwargs["hf_overrides"] = model_info.hf_overrides
|
||||
|
||||
with vllm_runner(model_info.name,
|
||||
runner="pooling",
|
||||
max_model_len=None,
|
||||
**vllm_extra_kwargs) as vllm_model:
|
||||
with vllm_runner(
|
||||
model_info.name, runner="pooling", max_model_len=None, **vllm_extra_kwargs
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.embed(example_prompts)
|
||||
|
||||
with hf_runner(
|
||||
model_info.name,
|
||||
dtype=model_info.hf_dtype,
|
||||
is_sentence_transformer=True,
|
||||
model_info.name,
|
||||
dtype=model_info.hf_dtype,
|
||||
is_sentence_transformer=True,
|
||||
) as hf_model:
|
||||
|
||||
if hf_model_callback is not None:
|
||||
hf_model_callback(hf_model)
|
||||
|
||||
|
||||
@@ -4,8 +4,7 @@ import pytest
|
||||
import torch
|
||||
from transformers import AutoModelForSequenceClassification
|
||||
|
||||
from tests.models.language.pooling.embed_utils import (
|
||||
run_embedding_correctness_test)
|
||||
from tests.models.language.pooling.embed_utils import run_embedding_correctness_test
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@@ -20,28 +19,27 @@ def test_classify_models(
|
||||
model: str,
|
||||
dtype: str,
|
||||
) -> None:
|
||||
|
||||
example_prompts = example_prompts * 2
|
||||
|
||||
with vllm_runner(model,
|
||||
max_model_len=512,
|
||||
dtype=dtype,
|
||||
enable_prefix_caching=True) as vllm_model:
|
||||
with vllm_runner(
|
||||
model, max_model_len=512, dtype=dtype, enable_prefix_caching=True
|
||||
) as vllm_model:
|
||||
cache_config = vllm_model.llm.llm_engine.cache_config
|
||||
assert cache_config.enable_prefix_caching
|
||||
vllm_outputs = vllm_model.classify(example_prompts)
|
||||
|
||||
with hf_runner(model,
|
||||
dtype=dtype,
|
||||
auto_cls=AutoModelForSequenceClassification) as hf_model:
|
||||
with hf_runner(
|
||||
model, dtype=dtype, auto_cls=AutoModelForSequenceClassification
|
||||
) as hf_model:
|
||||
hf_outputs = hf_model.classify(example_prompts)
|
||||
|
||||
for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
|
||||
hf_output = torch.tensor(hf_output)
|
||||
vllm_output = torch.tensor(vllm_output)
|
||||
|
||||
assert torch.allclose(hf_output, vllm_output,
|
||||
1e-3 if dtype == "float" else 1e-2)
|
||||
assert torch.allclose(
|
||||
hf_output, vllm_output, 1e-3 if dtype == "float" else 1e-2
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@@ -59,18 +57,18 @@ def test_embed_models(
|
||||
example_prompts = [str(s).strip() for s in example_prompts] * 2
|
||||
|
||||
with vllm_runner(
|
||||
model,
|
||||
runner="pooling",
|
||||
max_model_len=None,
|
||||
enable_prefix_caching=True,
|
||||
model,
|
||||
runner="pooling",
|
||||
max_model_len=None,
|
||||
enable_prefix_caching=True,
|
||||
) as vllm_model:
|
||||
cache_config = vllm_model.llm.llm_engine.cache_config
|
||||
assert cache_config.enable_prefix_caching
|
||||
vllm_outputs = vllm_model.embed(example_prompts)
|
||||
|
||||
with hf_runner(
|
||||
model,
|
||||
is_sentence_transformer=True,
|
||||
model,
|
||||
is_sentence_transformer=True,
|
||||
) as hf_model:
|
||||
run_embedding_correctness_test(hf_model, example_prompts, vllm_outputs)
|
||||
|
||||
@@ -81,13 +79,14 @@ def test_embed_models(
|
||||
"intfloat/e5-small",
|
||||
"Alibaba-NLP/gte-Qwen2-1.5B-instruct", # is_causal == False
|
||||
"papluca/xlm-roberta-base-language-detection",
|
||||
])
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
def test_non_causal_models(hf_runner, vllm_runner, example_prompts, model: str,
|
||||
dtype: str) -> None:
|
||||
with vllm_runner(model,
|
||||
max_model_len=512,
|
||||
dtype=dtype,
|
||||
enable_prefix_caching=True) as vllm_model:
|
||||
def test_non_causal_models(
|
||||
hf_runner, vllm_runner, example_prompts, model: str, dtype: str
|
||||
) -> None:
|
||||
with vllm_runner(
|
||||
model, max_model_len=512, dtype=dtype, enable_prefix_caching=True
|
||||
) as vllm_model:
|
||||
cache_config = vllm_model.llm.llm_engine.cache_config
|
||||
assert not cache_config.enable_prefix_caching
|
||||
|
||||
@@ -10,15 +10,17 @@ from vllm.platforms import current_platform
|
||||
@pytest.mark.parametrize(
|
||||
"model",
|
||||
[
|
||||
pytest.param("jason9693/Qwen2.5-1.5B-apeach",
|
||||
marks=[
|
||||
pytest.mark.core_model, pytest.mark.cpu_model,
|
||||
pytest.mark.slow_test
|
||||
]),
|
||||
pytest.param(
|
||||
"jason9693/Qwen2.5-1.5B-apeach",
|
||||
marks=[
|
||||
pytest.mark.core_model,
|
||||
pytest.mark.cpu_model,
|
||||
pytest.mark.slow_test,
|
||||
],
|
||||
),
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype",
|
||||
["half"] if current_platform.is_rocm() else ["float"])
|
||||
@pytest.mark.parametrize("dtype", ["half"] if current_platform.is_rocm() else ["float"])
|
||||
def test_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
@@ -35,9 +37,9 @@ def test_models(
|
||||
with vllm_runner(model, max_model_len=512, dtype=dtype) as vllm_model:
|
||||
vllm_outputs = vllm_model.classify(example_prompts)
|
||||
|
||||
with hf_runner(model,
|
||||
dtype=dtype,
|
||||
auto_cls=AutoModelForSequenceClassification) as hf_model:
|
||||
with hf_runner(
|
||||
model, dtype=dtype, auto_cls=AutoModelForSequenceClassification
|
||||
) as hf_model:
|
||||
hf_outputs = hf_model.classify(example_prompts)
|
||||
|
||||
# check logits difference
|
||||
@@ -48,5 +50,6 @@ def test_models(
|
||||
# the tolerance value of 1e-2 is selected based on the
|
||||
# half datatype tests in
|
||||
# tests/models/language/pooling/test_embedding.py
|
||||
assert torch.allclose(hf_output, vllm_output,
|
||||
1e-3 if dtype == "float" else 1e-2)
|
||||
assert torch.allclose(
|
||||
hf_output, vllm_output, 1e-3 if dtype == "float" else 1e-2
|
||||
)
|
||||
|
||||
@@ -18,20 +18,25 @@ from ...utils import check_embeddings_close
|
||||
# case won't pass because gte-Qwen2-1.5B-instruct will cache custom
|
||||
# model code with bidirectional attention.
|
||||
# [Decoder-only]
|
||||
pytest.param("BAAI/bge-multilingual-gemma2",
|
||||
marks=[pytest.mark.core_model, pytest.mark.slow_test]),
|
||||
pytest.param(
|
||||
"BAAI/bge-multilingual-gemma2",
|
||||
marks=[pytest.mark.core_model, pytest.mark.slow_test],
|
||||
),
|
||||
pytest.param(
|
||||
"intfloat/e5-mistral-7b-instruct",
|
||||
# CPU v1 doesn't support sliding window
|
||||
marks=[pytest.mark.core_model]),
|
||||
pytest.param("ssmits/Qwen2-7B-Instruct-embed-base",
|
||||
marks=[pytest.mark.cpu_model]),
|
||||
marks=[pytest.mark.core_model],
|
||||
),
|
||||
pytest.param(
|
||||
"ssmits/Qwen2-7B-Instruct-embed-base", marks=[pytest.mark.cpu_model]
|
||||
),
|
||||
# [Encoder-only]
|
||||
pytest.param(
|
||||
"BAAI/bge-base-en-v1.5",
|
||||
marks=[
|
||||
pytest.mark.core_model, pytest.mark.cpu_model,
|
||||
pytest.mark.slow_test
|
||||
pytest.mark.core_model,
|
||||
pytest.mark.cpu_model,
|
||||
pytest.mark.slow_test,
|
||||
],
|
||||
),
|
||||
pytest.param("sentence-transformers/all-MiniLM-L12-v2"),
|
||||
@@ -50,7 +55,6 @@ def test_models(
|
||||
model,
|
||||
monkeypatch,
|
||||
) -> None:
|
||||
|
||||
if model == "BAAI/bge-multilingual-gemma2" and current_platform.is_rocm():
|
||||
# ROCm Triton FA does not currently support sliding window attention
|
||||
# switch to use ROCm CK FA backend
|
||||
@@ -58,13 +62,14 @@ def test_models(
|
||||
|
||||
vllm_extra_kwargs = {}
|
||||
if model == "ssmits/Qwen2-7B-Instruct-embed-base":
|
||||
vllm_extra_kwargs["pooler_config"] = \
|
||||
PoolerConfig(pooling_type="MEAN", normalize=False)
|
||||
vllm_extra_kwargs["pooler_config"] = PoolerConfig(
|
||||
pooling_type="MEAN", normalize=False
|
||||
)
|
||||
|
||||
max_model_len: Optional[int] = 512
|
||||
if model in [
|
||||
"sentence-transformers/all-MiniLM-L12-v2",
|
||||
"sentence-transformers/stsb-roberta-base-v2"
|
||||
"sentence-transformers/all-MiniLM-L12-v2",
|
||||
"sentence-transformers/stsb-roberta-base-v2",
|
||||
]:
|
||||
max_model_len = None
|
||||
|
||||
@@ -79,10 +84,9 @@ def test_models(
|
||||
with hf_runner(model, is_sentence_transformer=True) as hf_model:
|
||||
hf_outputs = hf_model.encode(example_prompts)
|
||||
|
||||
with vllm_runner(model,
|
||||
runner="pooling",
|
||||
max_model_len=max_model_len,
|
||||
**vllm_extra_kwargs) as vllm_model:
|
||||
with vllm_runner(
|
||||
model, runner="pooling", max_model_len=max_model_len, **vllm_extra_kwargs
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.embed(example_prompts)
|
||||
|
||||
check_embeddings_close(
|
||||
|
||||
@@ -70,8 +70,9 @@ async def run_client_embeddings(
|
||||
|
||||
|
||||
def gritlm_instruction(instruction):
|
||||
return ("<|user|>\n" + instruction +
|
||||
"\n<|embed|>\n" if instruction else "<|embed|>\n")
|
||||
return (
|
||||
"<|user|>\n" + instruction + "\n<|embed|>\n" if instruction else "<|embed|>\n"
|
||||
)
|
||||
|
||||
|
||||
def get_test_data():
|
||||
@@ -80,7 +81,8 @@ def get_test_data():
|
||||
README.md in https://github.com/ContextualAI/gritlm
|
||||
"""
|
||||
q_instruction = gritlm_instruction(
|
||||
"Given a scientific paper title, retrieve the paper's abstract", )
|
||||
"Given a scientific paper title, retrieve the paper's abstract",
|
||||
)
|
||||
queries = [
|
||||
"Bitcoin: A Peer-to-Peer Electronic Cash System",
|
||||
"Generative Representational Instruction Tuning",
|
||||
@@ -114,9 +116,9 @@ def test_gritlm_offline_embedding(vllm_runner):
|
||||
queries, q_instruction, documents, d_instruction = get_test_data()
|
||||
|
||||
with vllm_runner(
|
||||
MODEL_NAME,
|
||||
runner="pooling",
|
||||
max_model_len=MAX_MODEL_LEN,
|
||||
MODEL_NAME,
|
||||
runner="pooling",
|
||||
max_model_len=MAX_MODEL_LEN,
|
||||
) as vllm_model:
|
||||
llm = vllm_model.llm
|
||||
|
||||
@@ -161,9 +163,9 @@ def test_gritlm_offline_generate(monkeypatch: pytest.MonkeyPatch, vllm_runner):
|
||||
input = "<|user|>\nWhat is the capital of France?\n<|assistant|>\n"
|
||||
|
||||
with vllm_runner(
|
||||
MODEL_NAME,
|
||||
runner="generate",
|
||||
max_model_len=MAX_MODEL_LEN,
|
||||
MODEL_NAME,
|
||||
runner="generate",
|
||||
max_model_len=MAX_MODEL_LEN,
|
||||
) as vllm_model:
|
||||
llm = vllm_model.llm
|
||||
|
||||
|
||||
@@ -21,16 +21,18 @@ def test_idefics_multimodal(
|
||||
"The future of AI is",
|
||||
]
|
||||
|
||||
with vllm_runner(model_name="HuggingFaceM4/Idefics3-8B-Llama3",
|
||||
runner="pooling",
|
||||
task="classify",
|
||||
convert="classify",
|
||||
load_format="dummy",
|
||||
max_model_len=512,
|
||||
enforce_eager=True,
|
||||
tensor_parallel_size=1,
|
||||
disable_log_stats=True,
|
||||
dtype="bfloat16") as vllm_model:
|
||||
with vllm_runner(
|
||||
model_name="HuggingFaceM4/Idefics3-8B-Llama3",
|
||||
runner="pooling",
|
||||
task="classify",
|
||||
convert="classify",
|
||||
load_format="dummy",
|
||||
max_model_len=512,
|
||||
enforce_eager=True,
|
||||
tensor_parallel_size=1,
|
||||
disable_log_stats=True,
|
||||
dtype="bfloat16",
|
||||
) as vllm_model:
|
||||
llm = vllm_model.get_llm()
|
||||
outputs = llm.classify(prompts)
|
||||
for output in outputs:
|
||||
@@ -38,19 +40,20 @@ def test_idefics_multimodal(
|
||||
|
||||
|
||||
def update_config(config):
|
||||
config.text_config.update({
|
||||
"architectures": ["Gemma3ForSequenceClassification"],
|
||||
"classifier_from_token": ["A", "B", "C", "D", "E"],
|
||||
"method":
|
||||
"no_post_processing",
|
||||
"id2label": {
|
||||
"A": "Chair",
|
||||
"B": "Couch",
|
||||
"C": "Table",
|
||||
"D": "Bed",
|
||||
"E": "Cupboard"
|
||||
},
|
||||
})
|
||||
config.text_config.update(
|
||||
{
|
||||
"architectures": ["Gemma3ForSequenceClassification"],
|
||||
"classifier_from_token": ["A", "B", "C", "D", "E"],
|
||||
"method": "no_post_processing",
|
||||
"id2label": {
|
||||
"A": "Chair",
|
||||
"B": "Couch",
|
||||
"C": "Table",
|
||||
"D": "Bed",
|
||||
"E": "Cupboard",
|
||||
},
|
||||
}
|
||||
)
|
||||
return config
|
||||
|
||||
|
||||
@@ -63,11 +66,10 @@ def test_gemma_multimodal(
|
||||
# switch to use ROCm CK FA backend
|
||||
monkeypatch.setenv("VLLM_USE_TRITON_FLASH_ATTN", "False")
|
||||
|
||||
messages = [{
|
||||
"role":
|
||||
"system",
|
||||
"content":
|
||||
"""
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": """
|
||||
You are a helpful assistant. You will be given a product description
|
||||
which may also include an image. Classify the following product into
|
||||
one of the categories:
|
||||
@@ -78,38 +80,39 @@ def test_gemma_multimodal(
|
||||
D = bed
|
||||
E = cupboard
|
||||
|
||||
You'll answer with exactly one letter (A, B, C, D, or E)."""
|
||||
}, {
|
||||
"role":
|
||||
"user",
|
||||
"content": [{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url":
|
||||
"https://upload.wikimedia.org/wikipedia/commons/c/c6/Set_of_fourteen_side_chairs_MET_DP110780.jpg"
|
||||
}
|
||||
}, {
|
||||
"type": "text",
|
||||
"text": "A fine 19th century piece of furniture."
|
||||
}]
|
||||
}]
|
||||
|
||||
with vllm_runner(model_name="google/gemma-3-4b-it",
|
||||
runner="pooling",
|
||||
task="classify",
|
||||
convert="classify",
|
||||
load_format="auto",
|
||||
hf_overrides=update_config,
|
||||
pooler_config=PoolerConfig(pooling_type="LAST"),
|
||||
max_model_len=512,
|
||||
enforce_eager=True,
|
||||
tensor_parallel_size=1,
|
||||
disable_log_stats=True,
|
||||
dtype="bfloat16") as vllm_model:
|
||||
You'll answer with exactly one letter (A, B, C, D, or E).""",
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": "https://upload.wikimedia.org/wikipedia/commons/c/c6/Set_of_fourteen_side_chairs_MET_DP110780.jpg"
|
||||
},
|
||||
},
|
||||
{"type": "text", "text": "A fine 19th century piece of furniture."},
|
||||
],
|
||||
},
|
||||
]
|
||||
|
||||
with vllm_runner(
|
||||
model_name="google/gemma-3-4b-it",
|
||||
runner="pooling",
|
||||
task="classify",
|
||||
convert="classify",
|
||||
load_format="auto",
|
||||
hf_overrides=update_config,
|
||||
pooler_config=PoolerConfig(pooling_type="LAST"),
|
||||
max_model_len=512,
|
||||
enforce_eager=True,
|
||||
tensor_parallel_size=1,
|
||||
disable_log_stats=True,
|
||||
dtype="bfloat16",
|
||||
) as vllm_model:
|
||||
llm = vllm_model.get_llm()
|
||||
prompts = llm.preprocess_chat(messages)
|
||||
|
||||
result = llm.classify(prompts)
|
||||
assert result[0].outputs.probs[0] > 0.95
|
||||
assert all(c < 0.05 for c in result[0].outputs.probs[1:])
|
||||
assert all(c < 0.05 for c in result[0].outputs.probs[1:])
|
||||
|
||||
@@ -20,14 +20,15 @@ def test_classify_models(
|
||||
with vllm_runner(model, max_model_len=512, dtype=dtype) as vllm_model:
|
||||
vllm_outputs = vllm_model.classify(example_prompts)
|
||||
|
||||
with hf_runner(model,
|
||||
dtype=dtype,
|
||||
auto_cls=AutoModelForSequenceClassification) as hf_model:
|
||||
with hf_runner(
|
||||
model, dtype=dtype, auto_cls=AutoModelForSequenceClassification
|
||||
) as hf_model:
|
||||
hf_outputs = hf_model.classify(example_prompts)
|
||||
|
||||
for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
|
||||
hf_output = torch.tensor(hf_output)
|
||||
vllm_output = torch.tensor(vllm_output)
|
||||
|
||||
assert torch.allclose(hf_output, vllm_output,
|
||||
1e-3 if dtype == "float" else 1e-2)
|
||||
assert torch.allclose(
|
||||
hf_output, vllm_output, 1e-3 if dtype == "float" else 1e-2
|
||||
)
|
||||
|
||||
@@ -7,10 +7,10 @@ from ...utils import EmbedModelInfo
|
||||
|
||||
MODELS = [
|
||||
EmbedModelInfo("nomic-ai/nomic-embed-text-v1"),
|
||||
#EmbedModelInfo("nomic-ai/nomic-embed-text-v1.5"),
|
||||
#EmbedModelInfo("nomic-ai/CodeRankEmbed"),
|
||||
# EmbedModelInfo("nomic-ai/nomic-embed-text-v1.5"),
|
||||
# EmbedModelInfo("nomic-ai/CodeRankEmbed"),
|
||||
EmbedModelInfo("nomic-ai/nomic-embed-text-v2-moe"),
|
||||
#EmbedModelInfo("Snowflake/snowflake-arctic-embed-m-long"),
|
||||
# EmbedModelInfo("Snowflake/snowflake-arctic-embed-m-long"),
|
||||
]
|
||||
|
||||
rope_theta = 1000
|
||||
@@ -21,23 +21,24 @@ max_model_len = int(original_max_position_embeddings * factor)
|
||||
|
||||
@pytest.mark.parametrize("model_info", MODELS)
|
||||
def test_default(model_info, vllm_runner):
|
||||
with vllm_runner(model_info.name, runner="pooling",
|
||||
max_model_len=None) as vllm_model:
|
||||
with vllm_runner(
|
||||
model_info.name, runner="pooling", max_model_len=None
|
||||
) as vllm_model:
|
||||
model_config = vllm_model.llm.llm_engine.model_config
|
||||
if model_info.name == "nomic-ai/nomic-embed-text-v2-moe":
|
||||
# For nomic-embed-text-v2-moe the length is set to 512
|
||||
# by sentence_bert_config.json.
|
||||
assert model_config.max_model_len == 512
|
||||
else:
|
||||
assert (
|
||||
model_config.max_model_len == original_max_position_embeddings)
|
||||
assert model_config.max_model_len == original_max_position_embeddings
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", MODELS)
|
||||
def test_set_max_model_len_legal(model_info, vllm_runner):
|
||||
# set max_model_len <= 512
|
||||
with vllm_runner(model_info.name, runner="pooling",
|
||||
max_model_len=256) as vllm_model:
|
||||
with vllm_runner(
|
||||
model_info.name, runner="pooling", max_model_len=256
|
||||
) as vllm_model:
|
||||
model_config = vllm_model.llm.llm_engine.model_config
|
||||
assert model_config.max_model_len == 256
|
||||
|
||||
@@ -46,13 +47,12 @@ def test_set_max_model_len_legal(model_info, vllm_runner):
|
||||
# For nomic-embed-text-v2-moe the length is set to 512
|
||||
# by sentence_bert_config.json.
|
||||
with pytest.raises(ValueError):
|
||||
with vllm_runner(model_info.name,
|
||||
runner="pooling",
|
||||
max_model_len=1024):
|
||||
with vllm_runner(model_info.name, runner="pooling", max_model_len=1024):
|
||||
pass
|
||||
else:
|
||||
with vllm_runner(model_info.name, runner="pooling",
|
||||
max_model_len=1024) as vllm_model:
|
||||
with vllm_runner(
|
||||
model_info.name, runner="pooling", max_model_len=1024
|
||||
) as vllm_model:
|
||||
model_config = vllm_model.llm.llm_engine.model_config
|
||||
assert model_config.max_model_len == 1024
|
||||
|
||||
@@ -61,17 +61,18 @@ def test_set_max_model_len_legal(model_info, vllm_runner):
|
||||
def test_set_max_model_len_illegal(model_info, vllm_runner):
|
||||
# set max_model_len > 2048
|
||||
with pytest.raises(ValueError):
|
||||
with vllm_runner(model_info.name, runner="pooling",
|
||||
max_model_len=4096):
|
||||
with vllm_runner(model_info.name, runner="pooling", max_model_len=4096):
|
||||
pass
|
||||
|
||||
# set max_model_len > 2048 by hf_overrides
|
||||
hf_overrides = {"max_model_len": 4096}
|
||||
with pytest.raises(ValueError):
|
||||
with vllm_runner(model_info.name,
|
||||
runner="pooling",
|
||||
max_model_len=None,
|
||||
hf_overrides=hf_overrides):
|
||||
with vllm_runner(
|
||||
model_info.name,
|
||||
runner="pooling",
|
||||
max_model_len=None,
|
||||
hf_overrides=hf_overrides,
|
||||
):
|
||||
pass
|
||||
|
||||
|
||||
@@ -82,16 +83,14 @@ def test_use_rope_scaling_legal(model_info, vllm_runner):
|
||||
"rope_scaling": {
|
||||
"rope_type": "yarn",
|
||||
"factor": factor,
|
||||
"original_max_position_embeddings":
|
||||
original_max_position_embeddings
|
||||
"original_max_position_embeddings": original_max_position_embeddings,
|
||||
},
|
||||
"max_model_len": max_model_len
|
||||
"max_model_len": max_model_len,
|
||||
}
|
||||
|
||||
with vllm_runner(model_info.name,
|
||||
runner="pooling",
|
||||
max_model_len=None,
|
||||
hf_overrides=hf_overrides):
|
||||
with vllm_runner(
|
||||
model_info.name, runner="pooling", max_model_len=None, hf_overrides=hf_overrides
|
||||
):
|
||||
pass
|
||||
|
||||
|
||||
@@ -102,16 +101,17 @@ def test_use_rope_scaling_illegal(model_info, vllm_runner):
|
||||
"rope_scaling": {
|
||||
"rope_type": "yarn",
|
||||
"factor": factor,
|
||||
"original_max_position_embeddings":
|
||||
original_max_position_embeddings
|
||||
}
|
||||
"original_max_position_embeddings": original_max_position_embeddings,
|
||||
},
|
||||
}
|
||||
# illegal max_model_len
|
||||
with pytest.raises(ValueError):
|
||||
with vllm_runner(model_info.name,
|
||||
runner="pooling",
|
||||
max_model_len=max_model_len + 1,
|
||||
hf_overrides=hf_overrides):
|
||||
with vllm_runner(
|
||||
model_info.name,
|
||||
runner="pooling",
|
||||
max_model_len=max_model_len + 1,
|
||||
hf_overrides=hf_overrides,
|
||||
):
|
||||
pass
|
||||
|
||||
hf_overrides = {
|
||||
@@ -119,15 +119,16 @@ def test_use_rope_scaling_illegal(model_info, vllm_runner):
|
||||
"rope_scaling": {
|
||||
"rope_type": "yarn",
|
||||
"factor": factor,
|
||||
"original_max_position_embeddings":
|
||||
original_max_position_embeddings
|
||||
"original_max_position_embeddings": original_max_position_embeddings,
|
||||
},
|
||||
"max_model_len": max_model_len + 1
|
||||
"max_model_len": max_model_len + 1,
|
||||
}
|
||||
# illegal max_model_len by hf_overrides
|
||||
with pytest.raises(ValueError):
|
||||
with vllm_runner(model_info.name,
|
||||
runner="pooling",
|
||||
max_model_len=None,
|
||||
hf_overrides=hf_overrides):
|
||||
with vllm_runner(
|
||||
model_info.name,
|
||||
runner="pooling",
|
||||
max_model_len=None,
|
||||
hf_overrides=hf_overrides,
|
||||
):
|
||||
pass
|
||||
|
||||
@@ -10,10 +10,7 @@ from vllm.config import PoolerConfig
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"model",
|
||||
[
|
||||
"jason9693/Qwen2.5-1.5B-apeach",
|
||||
"papluca/xlm-roberta-base-language-detection"
|
||||
],
|
||||
["jason9693/Qwen2.5-1.5B-apeach", "papluca/xlm-roberta-base-language-detection"],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
def test_classify_models_using_activation(
|
||||
@@ -23,30 +20,32 @@ def test_classify_models_using_activation(
|
||||
model: str,
|
||||
dtype: str,
|
||||
) -> None:
|
||||
|
||||
with vllm_runner(
|
||||
model,
|
||||
max_model_len=512,
|
||||
dtype=dtype,
|
||||
pooler_config=PoolerConfig(activation=False)) as vllm_model:
|
||||
model,
|
||||
max_model_len=512,
|
||||
dtype=dtype,
|
||||
pooler_config=PoolerConfig(activation=False),
|
||||
) as vllm_model:
|
||||
wo_activation_out = vllm_model.classify(example_prompts)
|
||||
|
||||
with vllm_runner(
|
||||
model,
|
||||
max_model_len=512,
|
||||
dtype=dtype,
|
||||
pooler_config=PoolerConfig(activation=True)) as vllm_model:
|
||||
model,
|
||||
max_model_len=512,
|
||||
dtype=dtype,
|
||||
pooler_config=PoolerConfig(activation=True),
|
||||
) as vllm_model:
|
||||
w_activation_out = vllm_model.classify(example_prompts)
|
||||
|
||||
for wo_activation, w_activation in zip(wo_activation_out,
|
||||
w_activation_out):
|
||||
for wo_activation, w_activation in zip(wo_activation_out, w_activation_out):
|
||||
wo_activation = torch.tensor(wo_activation)
|
||||
w_activation = torch.tensor(w_activation)
|
||||
|
||||
assert not torch.allclose(wo_activation, w_activation,
|
||||
atol=1e-2), "pooler_config is not working"
|
||||
assert torch.allclose(softmax(wo_activation), w_activation,
|
||||
1e-3 if dtype == "float" else 1e-2)
|
||||
assert not torch.allclose(wo_activation, w_activation, atol=1e-2), (
|
||||
"pooler_config is not working"
|
||||
)
|
||||
assert torch.allclose(
|
||||
softmax(wo_activation), w_activation, 1e-3 if dtype == "float" else 1e-2
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@@ -63,26 +62,28 @@ def test_embed_models_using_normalize(
|
||||
model: str,
|
||||
dtype: str,
|
||||
) -> None:
|
||||
|
||||
with vllm_runner(
|
||||
model,
|
||||
max_model_len=512,
|
||||
dtype=dtype,
|
||||
pooler_config=PoolerConfig(normalize=False)) as vllm_model:
|
||||
model,
|
||||
max_model_len=512,
|
||||
dtype=dtype,
|
||||
pooler_config=PoolerConfig(normalize=False),
|
||||
) as vllm_model:
|
||||
wo_normalize = torch.tensor(vllm_model.embed(example_prompts))
|
||||
|
||||
with vllm_runner(model,
|
||||
max_model_len=512,
|
||||
dtype=dtype,
|
||||
pooler_config=PoolerConfig(normalize=True)) as vllm_model:
|
||||
with vllm_runner(
|
||||
model,
|
||||
max_model_len=512,
|
||||
dtype=dtype,
|
||||
pooler_config=PoolerConfig(normalize=True),
|
||||
) as vllm_model:
|
||||
w_normalize = torch.tensor(vllm_model.embed(example_prompts))
|
||||
|
||||
assert not torch.allclose(
|
||||
wo_normalize, w_normalize,
|
||||
atol=1e-2), "pooler_config normalize is not working"
|
||||
assert not torch.allclose(wo_normalize, w_normalize, atol=1e-2), (
|
||||
"pooler_config normalize is not working"
|
||||
)
|
||||
assert torch.allclose(
|
||||
F.normalize(wo_normalize, p=2, dim=-1), w_normalize,
|
||||
atol=1e-2), "w_normal should be close to normal(wo_normal)."
|
||||
F.normalize(wo_normalize, p=2, dim=-1), w_normalize, atol=1e-2
|
||||
), "w_normal should be close to normal(wo_normal)."
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@@ -99,25 +100,26 @@ def test_reward_models_using_softmax(
|
||||
model: str,
|
||||
dtype: str,
|
||||
) -> None:
|
||||
|
||||
with vllm_runner(model,
|
||||
max_model_len=1024,
|
||||
dtype=dtype,
|
||||
pooler_config=PoolerConfig(softmax=False)) as vllm_model:
|
||||
with vllm_runner(
|
||||
model,
|
||||
max_model_len=1024,
|
||||
dtype=dtype,
|
||||
pooler_config=PoolerConfig(softmax=False),
|
||||
) as vllm_model:
|
||||
wo_softmax = vllm_model.encode(example_prompts)
|
||||
|
||||
with vllm_runner(model,
|
||||
max_model_len=1024,
|
||||
dtype=dtype,
|
||||
pooler_config=PoolerConfig(softmax=True)) as vllm_model:
|
||||
with vllm_runner(
|
||||
model, max_model_len=1024, dtype=dtype, pooler_config=PoolerConfig(softmax=True)
|
||||
) as vllm_model:
|
||||
w_softmax = vllm_model.encode(example_prompts)
|
||||
|
||||
for wo, w in zip(wo_softmax, w_softmax):
|
||||
wo = torch.tensor(wo)
|
||||
w = torch.tensor(w)
|
||||
|
||||
assert not torch.allclose(
|
||||
wo, w, atol=1e-2), "pooler_config softmax is not working"
|
||||
assert torch.allclose(
|
||||
softmax(wo), w,
|
||||
atol=1e-2), "w_softmax should be close to softmax(wo_softmax)."
|
||||
assert not torch.allclose(wo, w, atol=1e-2), (
|
||||
"pooler_config softmax is not working"
|
||||
)
|
||||
assert torch.allclose(softmax(wo), w, atol=1e-2), (
|
||||
"w_softmax should be close to softmax(wo_softmax)."
|
||||
)
|
||||
|
||||
@@ -16,10 +16,8 @@ from ...utils import check_transformers_version
|
||||
def math_step_prompts():
|
||||
# ruff: noqa: E501
|
||||
data = {
|
||||
"system":
|
||||
"Please reason step by step, and put your final answer within \\boxed{}. ",
|
||||
"query":
|
||||
"Sue lives in a fun neighborhood. One weekend, the neighbors decided to play a prank on Sue. On Friday morning, the neighbors placed 18 pink plastic flamingos out on Sue's front yard. On Saturday morning, the neighbors took back one third of the flamingos, painted them white, and put these newly painted white flamingos back out on Sue's front yard. Then, on Sunday morning, they added another 18 pink plastic flamingos to the collection. At noon on Sunday, how many more pink plastic flamingos were out than white plastic flamingos?",
|
||||
"system": "Please reason step by step, and put your final answer within \\boxed{}. ",
|
||||
"query": "Sue lives in a fun neighborhood. One weekend, the neighbors decided to play a prank on Sue. On Friday morning, the neighbors placed 18 pink plastic flamingos out on Sue's front yard. On Saturday morning, the neighbors took back one third of the flamingos, painted them white, and put these newly painted white flamingos back out on Sue's front yard. Then, on Sunday morning, they added another 18 pink plastic flamingos to the collection. At noon on Sunday, how many more pink plastic flamingos were out than white plastic flamingos?",
|
||||
"response": [
|
||||
"To find out how many more pink plastic flamingos were out than white plastic flamingos at noon on Sunday, we can break down the problem into steps. First, on Friday, the neighbors start with 18 pink plastic flamingos.",
|
||||
"On Saturday, they take back one third of the flamingos. Since there were 18 flamingos, (1/3 \\times 18 = 6) flamingos are taken back. So, they have (18 - 6 = 12) flamingos left in their possession. Then, they paint these 6 flamingos white and put them back out on Sue's front yard. Now, Sue has the original 12 pink flamingos plus the 6 new white ones. Thus, by the end of Saturday, Sue has (12 + 6 = 18) pink flamingos and 6 white flamingos.",
|
||||
@@ -27,16 +25,16 @@ def math_step_prompts():
|
||||
"To find the difference, subtract the number of white flamingos from the number of pink flamingos: (36 - 6 = 30). Therefore, at noon on Sunday, there were 30 more pink plastic flamingos out than white plastic flamingos. The answer is (\\boxed{30}).",
|
||||
],
|
||||
}
|
||||
answer = "<extra_0>".join(data['response']) + "<extra_0>"
|
||||
answer = "<extra_0>".join(data["response"]) + "<extra_0>"
|
||||
prompt = f"<im_start>system\n{data['system']}<im_end>\n<im_start>user\n{data['query']}<im_end>\n<im_start>assistant\n{answer}<im_end><|endoftext|>"
|
||||
return [prompt]
|
||||
|
||||
|
||||
def step_reward_patch_hf_model(hf_model: HfRunner):
|
||||
|
||||
# Patch the hf_runner to use the step reward function
|
||||
def make_step_rewards(logits: torch.Tensor,
|
||||
token_masks: torch.Tensor) -> list[list[float]]:
|
||||
def make_step_rewards(
|
||||
logits: torch.Tensor, token_masks: torch.Tensor
|
||||
) -> list[list[float]]:
|
||||
probabilities = F.softmax(logits, dim=-1)
|
||||
probabilities = probabilities * token_masks.unsqueeze(-1)
|
||||
|
||||
@@ -54,7 +52,7 @@ def step_reward_patch_hf_model(hf_model: HfRunner):
|
||||
outputs = hf_model.model(input_ids=input_ids)
|
||||
|
||||
step_sep_id = hf_model.tokenizer.encode("<extra_0>")[0]
|
||||
token_masks = (input_ids == step_sep_id)
|
||||
token_masks = input_ids == step_sep_id
|
||||
return make_step_rewards(outputs[0], token_masks)
|
||||
|
||||
hf_model.reward = reward # type: ignore[attr-defined]
|
||||
@@ -65,8 +63,10 @@ def step_reward_patch_hf_model(hf_model: HfRunner):
|
||||
@pytest.mark.parametrize(
|
||||
"model",
|
||||
[
|
||||
pytest.param("Qwen/Qwen2.5-Math-PRM-7B",
|
||||
marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
|
||||
pytest.param(
|
||||
"Qwen/Qwen2.5-Math-PRM-7B",
|
||||
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
|
||||
),
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
@@ -78,8 +78,9 @@ def test_prm_models(
|
||||
dtype: str,
|
||||
monkeypatch,
|
||||
) -> None:
|
||||
check_transformers_version("Qwen/Qwen2.5-Math-PRM-7B",
|
||||
max_transformers_version="4.53.2")
|
||||
check_transformers_version(
|
||||
"Qwen/Qwen2.5-Math-PRM-7B", max_transformers_version="4.53.2"
|
||||
)
|
||||
|
||||
if current_platform.is_cpu():
|
||||
pytest.skip("CPU only supports V1")
|
||||
|
||||
@@ -37,10 +37,9 @@ def test_cross_encoder_1_to_1(vllm_runner, hf_runner, model_name):
|
||||
with hf_runner(model_name, dtype=DTYPE, is_cross_encoder=True) as hf_model:
|
||||
hf_outputs = hf_model.predict([text_pair]).tolist()
|
||||
|
||||
with vllm_runner(model_name,
|
||||
runner="pooling",
|
||||
dtype=DTYPE,
|
||||
max_model_len=None) as vllm_model:
|
||||
with vllm_runner(
|
||||
model_name, runner="pooling", dtype=DTYPE, max_model_len=None
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.score(text_pair[0], text_pair[1])
|
||||
|
||||
assert len(vllm_outputs) == 1
|
||||
@@ -58,10 +57,9 @@ def test_cross_encoder_1_to_N(vllm_runner, hf_runner, model_name):
|
||||
with hf_runner(model_name, dtype=DTYPE, is_cross_encoder=True) as hf_model:
|
||||
hf_outputs = hf_model.predict(text_pairs).tolist()
|
||||
|
||||
with vllm_runner(model_name,
|
||||
runner="pooling",
|
||||
dtype=DTYPE,
|
||||
max_model_len=None) as vllm_model:
|
||||
with vllm_runner(
|
||||
model_name, runner="pooling", dtype=DTYPE, max_model_len=None
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.score(TEXTS_1[0], TEXTS_2)
|
||||
|
||||
assert len(vllm_outputs) == 2
|
||||
@@ -80,10 +78,9 @@ def test_cross_encoder_N_to_N(vllm_runner, hf_runner, model_name):
|
||||
with hf_runner(model_name, dtype=DTYPE, is_cross_encoder=True) as hf_model:
|
||||
hf_outputs = hf_model.predict(text_pairs).tolist()
|
||||
|
||||
with vllm_runner(model_name,
|
||||
runner="pooling",
|
||||
dtype=DTYPE,
|
||||
max_model_len=None) as vllm_model:
|
||||
with vllm_runner(
|
||||
model_name, runner="pooling", dtype=DTYPE, max_model_len=None
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.score(TEXTS_1, TEXTS_2)
|
||||
|
||||
assert len(vllm_outputs) == 2
|
||||
@@ -101,17 +98,15 @@ def emb_model_name(request):
|
||||
def test_embedding_1_to_1(vllm_runner, hf_runner, emb_model_name):
|
||||
text_pair = [TEXTS_1[0], TEXTS_2[0]]
|
||||
|
||||
with hf_runner(emb_model_name, dtype=DTYPE,
|
||||
is_sentence_transformer=True) as hf_model:
|
||||
with hf_runner(
|
||||
emb_model_name, dtype=DTYPE, is_sentence_transformer=True
|
||||
) as hf_model:
|
||||
hf_embeddings = hf_model.encode(text_pair)
|
||||
hf_outputs = [
|
||||
F.cosine_similarity(*map(torch.tensor, hf_embeddings), dim=0)
|
||||
]
|
||||
hf_outputs = [F.cosine_similarity(*map(torch.tensor, hf_embeddings), dim=0)]
|
||||
|
||||
with vllm_runner(emb_model_name,
|
||||
runner="pooling",
|
||||
dtype=DTYPE,
|
||||
max_model_len=None) as vllm_model:
|
||||
with vllm_runner(
|
||||
emb_model_name, runner="pooling", dtype=DTYPE, max_model_len=None
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.score(text_pair[0], text_pair[1])
|
||||
|
||||
assert len(vllm_outputs) == 1
|
||||
@@ -126,20 +121,18 @@ def test_embedding_1_to_N(vllm_runner, hf_runner, emb_model_name):
|
||||
[TEXTS_1[0], TEXTS_2[1]],
|
||||
]
|
||||
|
||||
with hf_runner(emb_model_name, dtype=DTYPE,
|
||||
is_sentence_transformer=True) as hf_model:
|
||||
hf_embeddings = [
|
||||
hf_model.encode(text_pair) for text_pair in text_pairs
|
||||
]
|
||||
with hf_runner(
|
||||
emb_model_name, dtype=DTYPE, is_sentence_transformer=True
|
||||
) as hf_model:
|
||||
hf_embeddings = [hf_model.encode(text_pair) for text_pair in text_pairs]
|
||||
hf_outputs = [
|
||||
F.cosine_similarity(*map(torch.tensor, pair), dim=0)
|
||||
for pair in hf_embeddings
|
||||
]
|
||||
|
||||
with vllm_runner(emb_model_name,
|
||||
runner="pooling",
|
||||
dtype=DTYPE,
|
||||
max_model_len=None) as vllm_model:
|
||||
with vllm_runner(
|
||||
emb_model_name, runner="pooling", dtype=DTYPE, max_model_len=None
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.score(TEXTS_1[0], TEXTS_2)
|
||||
|
||||
assert len(vllm_outputs) == 2
|
||||
@@ -155,20 +148,18 @@ def test_embedding_N_to_N(vllm_runner, hf_runner, emb_model_name):
|
||||
[TEXTS_1[1], TEXTS_2[1]],
|
||||
]
|
||||
|
||||
with hf_runner(emb_model_name, dtype=DTYPE,
|
||||
is_sentence_transformer=True) as hf_model:
|
||||
hf_embeddings = [
|
||||
hf_model.encode(text_pair) for text_pair in text_pairs
|
||||
]
|
||||
with hf_runner(
|
||||
emb_model_name, dtype=DTYPE, is_sentence_transformer=True
|
||||
) as hf_model:
|
||||
hf_embeddings = [hf_model.encode(text_pair) for text_pair in text_pairs]
|
||||
hf_outputs = [
|
||||
F.cosine_similarity(*map(torch.tensor, pair), dim=0)
|
||||
for pair in hf_embeddings
|
||||
]
|
||||
|
||||
with vllm_runner(emb_model_name,
|
||||
runner="pooling",
|
||||
dtype=DTYPE,
|
||||
max_model_len=None) as vllm_model:
|
||||
with vllm_runner(
|
||||
emb_model_name, runner="pooling", dtype=DTYPE, max_model_len=None
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.score(TEXTS_1, TEXTS_2)
|
||||
|
||||
assert len(vllm_outputs) == 2
|
||||
|
||||
@@ -21,9 +21,9 @@ def test_models(
|
||||
with vllm_runner(model, max_model_len=None, dtype=dtype) as vllm_model:
|
||||
vllm_outputs = vllm_model.encode(example_prompts)
|
||||
|
||||
with hf_runner(model,
|
||||
dtype=dtype,
|
||||
auto_cls=AutoModelForTokenClassification) as hf_model:
|
||||
with hf_runner(
|
||||
model, dtype=dtype, auto_cls=AutoModelForTokenClassification
|
||||
) as hf_model:
|
||||
tokenizer = hf_model.tokenizer
|
||||
hf_outputs = []
|
||||
for prompt in example_prompts:
|
||||
|
||||
@@ -20,51 +20,57 @@ calculus, each contributing unique perspectives that would shape this new
|
||||
field."""
|
||||
|
||||
|
||||
def test_smaller_truncation_size(vllm_runner,
|
||||
model_name=MODEL_NAME,
|
||||
input_str=input_str):
|
||||
|
||||
def test_smaller_truncation_size(
|
||||
vllm_runner, model_name=MODEL_NAME, input_str=input_str
|
||||
):
|
||||
truncate_prompt_tokens = 10
|
||||
|
||||
with vllm_runner(model_name, runner="pooling",
|
||||
max_model_len=max_model_len) as vllm_model:
|
||||
with vllm_runner(
|
||||
model_name, runner="pooling", max_model_len=max_model_len
|
||||
) as vllm_model:
|
||||
vllm_output = vllm_model.llm.embed(
|
||||
input_str, truncate_prompt_tokens=truncate_prompt_tokens)
|
||||
input_str, truncate_prompt_tokens=truncate_prompt_tokens
|
||||
)
|
||||
|
||||
prompt_tokens = vllm_output[0].prompt_token_ids
|
||||
|
||||
assert len(prompt_tokens) == truncate_prompt_tokens
|
||||
|
||||
|
||||
def test_max_truncation_size(vllm_runner,
|
||||
model_name=MODEL_NAME,
|
||||
input_str=input_str):
|
||||
def test_max_truncation_size(vllm_runner, model_name=MODEL_NAME, input_str=input_str):
|
||||
truncate_prompt_tokens = -1
|
||||
|
||||
with vllm_runner(model_name, runner="pooling",
|
||||
max_model_len=max_model_len) as vllm_model:
|
||||
with vllm_runner(
|
||||
model_name, runner="pooling", max_model_len=max_model_len
|
||||
) as vllm_model:
|
||||
vllm_output = vllm_model.llm.embed(
|
||||
input_str, truncate_prompt_tokens=truncate_prompt_tokens)
|
||||
input_str, truncate_prompt_tokens=truncate_prompt_tokens
|
||||
)
|
||||
|
||||
prompt_tokens = vllm_output[0].prompt_token_ids
|
||||
|
||||
assert len(prompt_tokens) == max_model_len
|
||||
|
||||
|
||||
def test_bigger_truncation_size(vllm_runner,
|
||||
model_name=MODEL_NAME,
|
||||
input_str=input_str):
|
||||
|
||||
def test_bigger_truncation_size(
|
||||
vllm_runner, model_name=MODEL_NAME, input_str=input_str
|
||||
):
|
||||
truncate_prompt_tokens = max_model_len + 1
|
||||
|
||||
with pytest.raises(ValueError), vllm_runner(
|
||||
model_name, runner="pooling",
|
||||
max_model_len=max_model_len) as vllm_model:
|
||||
|
||||
with (
|
||||
pytest.raises(ValueError),
|
||||
vllm_runner(
|
||||
model_name, runner="pooling", max_model_len=max_model_len
|
||||
) as vllm_model,
|
||||
):
|
||||
llm_output = vllm_model.llm.embed(
|
||||
input_str, truncate_prompt_tokens=truncate_prompt_tokens)
|
||||
input_str, truncate_prompt_tokens=truncate_prompt_tokens
|
||||
)
|
||||
|
||||
assert llm_output == f"""truncate_prompt_tokens value
|
||||
assert (
|
||||
llm_output
|
||||
== f"""truncate_prompt_tokens value
|
||||
({truncate_prompt_tokens}) is greater than
|
||||
max_model_len ({max_model_len}). Please, select
|
||||
a smaller truncation size."""
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user