Convert formatting to use ruff instead of yapf + isort (#26247)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
Harry Mellor
2025-10-05 15:06:22 +01:00
committed by GitHub
parent 17edd8a807
commit d6953beb91
1508 changed files with 115244 additions and 94146 deletions

View File

@@ -51,8 +51,9 @@ AITER_MODEL_LIST = [
pytest.param(
"google/gemma-1.1-2b-it", # gemma
marks=[
pytest.mark.core_model, pytest.mark.cpu_model,
pytest.mark.slow_test
pytest.mark.core_model,
pytest.mark.cpu_model,
pytest.mark.slow_test,
],
),
pytest.param(
@@ -65,8 +66,7 @@ AITER_MODEL_LIST = [
pytest.param(
"openbmb/MiniCPM3-4B",
# fused_moe not supported on CPU
marks=[pytest.mark.core_model,
large_gpu_mark(min_gb=32)],
marks=[pytest.mark.core_model, large_gpu_mark(min_gb=32)],
),
pytest.param(
"facebook/opt-125m", # opt
@@ -82,8 +82,9 @@ AITER_MODEL_LIST = [
pytest.param(
"Qwen/Qwen2.5-0.5B-Instruct", # qwen2
marks=[
pytest.mark.core_model, pytest.mark.cpu_model,
pytest.mark.slow_test
pytest.mark.core_model,
pytest.mark.cpu_model,
pytest.mark.slow_test,
],
),
pytest.param(
@@ -100,16 +101,25 @@ AITER_MODEL_LIST = [
marks=[pytest.mark.cpu_model],
),
pytest.param("swiss-ai/Apertus-8B-2509"), # apertus
])
],
)
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize(
"use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False])
"use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]
)
@pytest.mark.parametrize("use_prompt_embeds", [True, False])
def test_models(hf_runner, vllm_runner, example_prompts, model: str,
max_tokens: int, num_logprobs: int, use_rocm_aiter: bool,
use_prompt_embeds: bool, monkeypatch) -> None:
def test_models(
hf_runner,
vllm_runner,
example_prompts,
model: str,
max_tokens: int,
num_logprobs: int,
use_rocm_aiter: bool,
use_prompt_embeds: bool,
monkeypatch,
) -> None:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version(on_fail="skip")
@@ -125,34 +135,37 @@ def test_models(hf_runner, vllm_runner, example_prompts, model: str,
with hf_runner(model) as hf_model:
hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
prompt_embeds: Optional[list[torch.Tensor]] = ([] if use_prompt_embeds
else None)
prompt_embeds: Optional[list[torch.Tensor]] = [] if use_prompt_embeds else None
prompt_token_ids = []
for prompt in example_prompts:
token_ids = hf_model.tokenizer(prompt,
return_tensors="pt").input_ids.to(
hf_model.model.device)
token_ids = hf_model.tokenizer(prompt, return_tensors="pt").input_ids.to(
hf_model.model.device
)
prompt_token_ids.append(token_ids)
if prompt_embeds is not None:
prompt_embeds.append(hf_model.model.get_input_embeddings()(
token_ids).squeeze(0))
prompt_embeds.append(
hf_model.model.get_input_embeddings()(token_ids).squeeze(0)
)
with vllm_runner(
model,
tokenizer_name=model_info.tokenizer or model,
tokenizer_mode=model_info.tokenizer_mode,
trust_remote_code=model_info.trust_remote_code,
max_num_seqs=2,
enable_prompt_embeds=use_prompt_embeds,
model,
tokenizer_name=model_info.tokenizer or model,
tokenizer_mode=model_info.tokenizer_mode,
trust_remote_code=model_info.trust_remote_code,
max_num_seqs=2,
enable_prompt_embeds=use_prompt_embeds,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
if prompt_embeds is not None:
vllm_outputs_from_embeds = vllm_model.generate_greedy_logprobs(
prompt_embeds, max_tokens, num_logprobs)
prompt_embeds, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=hf_outputs,

View File

@@ -11,17 +11,17 @@ def test_dummy_loader(vllm_runner, monkeypatch, model: str) -> None:
with monkeypatch.context() as m:
m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
with vllm_runner(
model,
load_format="dummy",
model,
load_format="dummy",
) as llm:
if model == "google/gemma-3-4b-it":
normalizers = llm.llm.collective_rpc(
lambda self: self.model_runner.model.language_model.model.
normalizer.cpu().item())
lambda self: self.model_runner.model.language_model.model.normalizer.cpu().item()
)
config = llm.llm.llm_engine.model_config.hf_config.text_config
else:
normalizers = llm.llm.collective_rpc(
lambda self: self.model_runner.model.model.normalizer.cpu(
).item())
lambda self: self.model_runner.model.model.normalizer.cpu().item()
)
config = llm.llm.llm_engine.model_config.hf_config
assert np.allclose(normalizers, config.hidden_size**0.5, rtol=2e-3)

View File

@@ -26,11 +26,13 @@ def test_models(
) -> None:
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
with vllm_runner(model, dtype=dtype) as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,

View File

@@ -24,7 +24,7 @@ SSM_MODELS = [
"tiiuae/falcon-mamba-tiny-dev",
# mamba2-codestral in transformers is broken pending:
# https://github.com/huggingface/transformers/pull/40861
#"yujiepan/mamba2-codestral-v0.1-tiny-random",
# "yujiepan/mamba2-codestral-v0.1-tiny-random",
]
HYBRID_MODELS = [
@@ -65,7 +65,6 @@ def test_models(
max_tokens: int,
num_logprobs: int,
) -> None:
try:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
@@ -75,11 +74,13 @@ def test_models(
with hf_runner(model) as hf_model:
hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=hf_outputs,
@@ -109,13 +110,14 @@ def test_batching(
for_loop_outputs = []
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
for prompt in example_prompts:
single_output, = vllm_model.generate_greedy_logprobs([prompt],
max_tokens,
num_logprobs)
(single_output,) = vllm_model.generate_greedy_logprobs(
[prompt], max_tokens, num_logprobs
)
for_loop_outputs.append(single_output)
batched_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=for_loop_outputs,
@@ -134,8 +136,8 @@ def test_chunked_prefill_with_parallel_sampling(
max_tokens: int,
) -> None:
"""
Tests chunked prefill in conjunction with n > 1.
Tests chunked prefill in conjunction with n > 1.
In this case, prefill is populated with decoding tokens and
we test that it doesn't fail.
@@ -143,16 +145,13 @@ def test_chunked_prefill_with_parallel_sampling(
decoding steps inside a chunked prefill forward pass
(where we have both prefill and decode together)
"""
sampling_params = SamplingParams(n=3,
temperature=1,
seed=0,
max_tokens=max_tokens)
sampling_params = SamplingParams(n=3, temperature=1, seed=0, max_tokens=max_tokens)
with vllm_runner(
model,
enable_chunked_prefill=True,
# forces prefill chunks with decoding
max_num_batched_tokens=MAX_NUM_SEQS * 3,
max_num_seqs=MAX_NUM_SEQS,
model,
enable_chunked_prefill=True,
# forces prefill chunks with decoding
max_num_batched_tokens=MAX_NUM_SEQS * 3,
max_num_seqs=MAX_NUM_SEQS,
) as vllm_model:
vllm_model.generate(example_prompts, sampling_params)
@@ -170,10 +169,8 @@ def test_mamba_cache_cg_padding(
batch size. If it's not, a torch RuntimeError will be raised because
tensor dimensions aren't compatible.
"""
vllm_config = EngineArgs(model=model,
trust_remote_code=True).create_engine_config()
while len(example_prompts) == vllm_config.pad_for_cudagraph(
len(example_prompts)):
vllm_config = EngineArgs(model=model, trust_remote_code=True).create_engine_config()
while len(example_prompts) == vllm_config.pad_for_cudagraph(len(example_prompts)):
example_prompts.append(example_prompts[0])
try:
@@ -183,7 +180,8 @@ def test_mamba_cache_cg_padding(
pytest.fail(
"Couldn't run batch size which is not equal to a Cuda Graph "
"captured batch size. "
"Could be related to mamba cache not padded correctly")
"Could be related to mamba cache not padded correctly"
)
@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
@@ -205,8 +203,10 @@ def test_fail_upon_inc_requests_and_finished_requests_lt_available_blocks(
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
vllm_model.generate_greedy([example_prompts[0]] * 100, 10)
except ValueError:
pytest.fail("Hybrid inner state wasn't cleaned up properly between"
"steps finished requests registered unnecessarily ")
pytest.fail(
"Hybrid inner state wasn't cleaned up properly between"
"steps finished requests registered unnecessarily "
)
@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
@@ -215,10 +215,10 @@ def test_state_cleanup(
example_prompts,
model: str,
) -> None:
"""
"""
This test is for verifying that the Hybrid state is cleaned up between
steps.
If it's not cleaned, an error would be expected.
"""
try:
@@ -226,8 +226,10 @@ def test_state_cleanup(
for _ in range(10):
vllm_model.generate_greedy([example_prompts[0]] * 100, 1)
except ValueError:
pytest.fail("Hybrid inner state wasn't cleaned up between states, "
"could be related to finished_requests_ids")
pytest.fail(
"Hybrid inner state wasn't cleaned up between states, "
"could be related to finished_requests_ids"
)
@multi_gpu_test(num_gpus=2)
@@ -241,15 +243,19 @@ def test_distributed_correctness(
max_tokens: int,
num_logprobs: int,
) -> None:
with vllm_runner(model, tensor_parallel_size=1,
max_num_seqs=MAX_NUM_SEQS) as vllm_model:
with vllm_runner(
model, tensor_parallel_size=1, max_num_seqs=MAX_NUM_SEQS
) as vllm_model:
vllm_outputs_tp_1 = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
with vllm_runner(model, tensor_parallel_size=2,
max_num_seqs=MAX_NUM_SEQS) as vllm_model:
with vllm_runner(
model, tensor_parallel_size=2, max_num_seqs=MAX_NUM_SEQS
) as vllm_model:
vllm_outputs_tp_2 = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=vllm_outputs_tp_1,
@@ -271,7 +277,6 @@ def test_full_cuda_graph(
max_tokens: int,
num_logprobs: int,
) -> None:
try:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
@@ -281,11 +286,13 @@ def test_full_cuda_graph(
with hf_runner(model) as hf_model:
hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=hf_outputs,
@@ -298,8 +305,9 @@ def test_full_cuda_graph(
@pytest.mark.parametrize("model", FP32_STATE_MODELS)
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("cache_dtype_param",
["mamba_ssm_cache_dtype", "mamba_cache_dtype"])
@pytest.mark.parametrize(
"cache_dtype_param", ["mamba_ssm_cache_dtype", "mamba_cache_dtype"]
)
def test_fp32_cache_state(
hf_runner,
vllm_runner,
@@ -310,7 +318,6 @@ def test_fp32_cache_state(
num_logprobs: int,
cache_dtype_param: str,
) -> None:
try:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
@@ -320,13 +327,15 @@ def test_fp32_cache_state(
with hf_runner(model) as hf_model:
hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
with vllm_runner(model,
max_num_seqs=MAX_NUM_SEQS,
**{cache_dtype_param: "float32"}) as vllm_model:
with vllm_runner(
model, max_num_seqs=MAX_NUM_SEQS, **{cache_dtype_param: "float32"}
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=hf_outputs,
@@ -339,21 +348,23 @@ def test_fp32_cache_state(
# Helper functions for the APC tests
def _get_vllm_runner_params(model, max_model_len, tensor_parallel_size=1):
return {
'model_name': model,
'enable_prefix_caching': False,
'max_model_len': max_model_len,
'tensor_parallel_size': tensor_parallel_size,
'gpu_memory_utilization': 0.4
"model_name": model,
"enable_prefix_caching": False,
"max_model_len": max_model_len,
"tensor_parallel_size": tensor_parallel_size,
"gpu_memory_utilization": 0.4,
}
def _get_vLLM_output(vllm_runner,
kwargs,
prompts,
max_tokens,
num_logprobs,
num_repetitions=1,
vllm_model=None):
def _get_vLLM_output(
vllm_runner,
kwargs,
prompts,
max_tokens,
num_logprobs,
num_repetitions=1,
vllm_model=None,
):
outs = []
if vllm_model is None:
vllm_model = vllm_runner(**kwargs)
@@ -362,7 +373,8 @@ def _get_vLLM_output(vllm_runner,
vllm_output = vllm_model.generate_greedy(prompts, max_tokens)
else:
vllm_output = vllm_model.generate_greedy_logprobs(
prompts, max_tokens, num_logprobs)
prompts, max_tokens, num_logprobs
)
outs.append(vllm_output)
return outs, vllm_model
@@ -387,7 +399,6 @@ def test_apc_single_prompt(
num_logprobs: int,
tensor_parallel_size: int,
) -> None:
try:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
@@ -395,29 +406,33 @@ def test_apc_single_prompt(
except ValueError:
pass
compare_operator: Callable = check_logprobs_close \
if num_logprobs > 0 else check_outputs_equal # type: ignore
compare_operator: Callable = (
check_logprobs_close if num_logprobs > 0 else check_outputs_equal # type: ignore
)
MULTIPLE = 300
# Sample prompts.
generated_prompts = [MULTIPLE * example_prompts[0]]
max_model_len = max(
len(prompt) + max_tokens for prompt in generated_prompts)
max_model_len = max(len(prompt) + max_tokens for prompt in generated_prompts)
vllm_runner_kwargs = _get_vllm_runner_params(
model, max_model_len, tensor_parallel_size=tensor_parallel_size)
vllm_runner_kwargs['mamba_ssm_cache_dtype'] = "float32"
vllm_outputs_no_cache, _ = _get_vLLM_output(vllm_runner,
vllm_runner_kwargs,
generated_prompts, max_tokens,
num_logprobs)
model, max_model_len, tensor_parallel_size=tensor_parallel_size
)
vllm_runner_kwargs["mamba_ssm_cache_dtype"] = "float32"
vllm_outputs_no_cache, _ = _get_vLLM_output(
vllm_runner, vllm_runner_kwargs, generated_prompts, max_tokens, num_logprobs
)
vllm_runner_kwargs['enable_prefix_caching'] = True
vllm_outputs_cache_rep, _ = _get_vLLM_output(vllm_runner,
vllm_runner_kwargs,
generated_prompts, max_tokens,
num_logprobs, n_repetitions)
vllm_runner_kwargs["enable_prefix_caching"] = True
vllm_outputs_cache_rep, _ = _get_vLLM_output(
vllm_runner,
vllm_runner_kwargs,
generated_prompts,
max_tokens,
num_logprobs,
n_repetitions,
)
for r_idx, vllm_outputs_cache_itn in enumerate(vllm_outputs_cache_rep):
# In the first repetition, the caches are filled
@@ -450,7 +465,6 @@ def test_apc_single_prompt_block_align_alignment(
num_logprobs: int,
tensor_parallel_size: int,
) -> None:
try:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
@@ -458,30 +472,29 @@ def test_apc_single_prompt_block_align_alignment(
except ValueError:
pass
compare_operator: Callable = check_logprobs_close \
if num_logprobs > 0 else check_outputs_equal # type: ignore
compare_operator: Callable = (
check_logprobs_close if num_logprobs > 0 else check_outputs_equal # type: ignore
)
MULTIPLE = 300
# Sample prompts. This custom prompt is used, as it causes the most issues
generated_prompts = ["The president of the United States is " * MULTIPLE]
max_model_len = max(
len(prompt) + max_tokens for prompt in generated_prompts)
max_model_len = max(len(prompt) + max_tokens for prompt in generated_prompts)
vllm_runner_kwargs = _get_vllm_runner_params(
model, max_model_len, tensor_parallel_size=tensor_parallel_size)
vllm_runner_kwargs['mamba_ssm_cache_dtype'] = "float32"
model, max_model_len, tensor_parallel_size=tensor_parallel_size
)
vllm_runner_kwargs["mamba_ssm_cache_dtype"] = "float32"
vllm_outputs_no_cache, _ = _get_vLLM_output(vllm_runner,
vllm_runner_kwargs,
generated_prompts, max_tokens,
num_logprobs)
vllm_outputs_no_cache, _ = _get_vLLM_output(
vllm_runner, vllm_runner_kwargs, generated_prompts, max_tokens, num_logprobs
)
vllm_runner_kwargs['enable_prefix_caching'] = True
vllm_runner_kwargs["enable_prefix_caching"] = True
with vllm_runner(**vllm_runner_kwargs) as vllm_model:
# Retrieve the default mamba state block size
mamba_block_size = vllm_model.llm.llm_engine.cache_config. \
mamba_block_size
mamba_block_size = vllm_model.llm.llm_engine.cache_config.mamba_block_size
# In case the hybrid model does not have the
# "mamba_block_size" assume a fixed constant
@@ -489,18 +502,18 @@ def test_apc_single_prompt_block_align_alignment(
mamba_block_size = 512
mamba_block_size_multiplier = 10
for offsets in [
-3, 3, mamba_block_size // 4 + 3, mamba_block_size // 2 - 3
]:
vllm_runner_kwargs[
'max_num_batched_tokens'] = mamba_block_size_multiplier * \
mamba_block_size - offsets
vllm_outputs_cache_rep, _ = _get_vLLM_output(vllm_runner,
vllm_runner_kwargs,
generated_prompts,
max_tokens, num_logprobs,
n_repetitions)
for offsets in [-3, 3, mamba_block_size // 4 + 3, mamba_block_size // 2 - 3]:
vllm_runner_kwargs["max_num_batched_tokens"] = (
mamba_block_size_multiplier * mamba_block_size - offsets
)
vllm_outputs_cache_rep, _ = _get_vLLM_output(
vllm_runner,
vllm_runner_kwargs,
generated_prompts,
max_tokens,
num_logprobs,
n_repetitions,
)
# Check alignment of the output logits when using APC
for r_idx, vllm_outputs_cache_itn in enumerate(vllm_outputs_cache_rep):
@@ -534,7 +547,6 @@ def test_apc_multiple_prompts_all_cached_outputs(
num_logprobs: int,
tensor_parallel_size: int,
) -> None:
try:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
@@ -542,30 +554,34 @@ def test_apc_multiple_prompts_all_cached_outputs(
except ValueError:
pass
compare_operator: Callable = check_logprobs_close \
if num_logprobs > 0 else check_outputs_equal # type: ignore
compare_operator: Callable = (
check_logprobs_close if num_logprobs > 0 else check_outputs_equal # type: ignore
)
MULTIPLE = 300
# Sample prompts.
generated_prompts = [MULTIPLE * prompt for prompt in example_prompts]
max_model_len = max(
len(prompt) + max_tokens for prompt in generated_prompts)
max_model_len = max(len(prompt) + max_tokens for prompt in generated_prompts)
vllm_runner_kwargs = _get_vllm_runner_params(
model, max_model_len, tensor_parallel_size=tensor_parallel_size)
vllm_runner_kwargs['mamba_ssm_cache_dtype'] = "float32"
model, max_model_len, tensor_parallel_size=tensor_parallel_size
)
vllm_runner_kwargs["mamba_ssm_cache_dtype"] = "float32"
vllm_outputs_no_cache, _ = _get_vLLM_output(vllm_runner,
vllm_runner_kwargs,
generated_prompts, max_tokens,
num_logprobs)
vllm_outputs_no_cache, _ = _get_vLLM_output(
vllm_runner, vllm_runner_kwargs, generated_prompts, max_tokens, num_logprobs
)
vllm_runner_kwargs['enable_prefix_caching'] = True
vllm_outputs_cache_rep, _ = _get_vLLM_output(vllm_runner,
vllm_runner_kwargs,
generated_prompts, max_tokens,
num_logprobs, n_repetitions)
vllm_runner_kwargs["enable_prefix_caching"] = True
vllm_outputs_cache_rep, _ = _get_vLLM_output(
vllm_runner,
vllm_runner_kwargs,
generated_prompts,
max_tokens,
num_logprobs,
n_repetitions,
)
for r_idx, vllm_outputs_cache_itn in enumerate(vllm_outputs_cache_rep):
# In the first repetition, the caches are filled
@@ -598,7 +614,6 @@ def test_apc_multiple_prompts_block_align_alignment(
num_logprobs: int,
tensor_parallel_size: int,
) -> None:
try:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
@@ -606,34 +621,31 @@ def test_apc_multiple_prompts_block_align_alignment(
except ValueError:
pass
compare_operator: Callable = check_logprobs_close \
if num_logprobs > 0 else check_outputs_equal # type: ignore
compare_operator: Callable = (
check_logprobs_close if num_logprobs > 0 else check_outputs_equal # type: ignore
)
MULTIPLE = 300
# Sample prompts. This custom prompt is used, as it causes the most issues
prompt_text = "The president of the United States is "
prompt_offsets = [0, 3, 7, 13, 17, 22, 25, 31]
generated_prompts = [
prompt_text[offset:] * MULTIPLE for offset in prompt_offsets
]
generated_prompts = [prompt_text[offset:] * MULTIPLE for offset in prompt_offsets]
max_model_len = max(
len(prompt) + max_tokens for prompt in generated_prompts)
vllm_runner_kwargs = _get_vllm_runner_params(model, max_model_len,
tensor_parallel_size)
vllm_runner_kwargs['mamba_ssm_cache_dtype'] = "float32"
max_model_len = max(len(prompt) + max_tokens for prompt in generated_prompts)
vllm_runner_kwargs = _get_vllm_runner_params(
model, max_model_len, tensor_parallel_size
)
vllm_runner_kwargs["mamba_ssm_cache_dtype"] = "float32"
vllm_outputs_no_cache, _ = _get_vLLM_output(vllm_runner,
vllm_runner_kwargs,
generated_prompts, max_tokens,
num_logprobs)
vllm_outputs_no_cache, _ = _get_vLLM_output(
vllm_runner, vllm_runner_kwargs, generated_prompts, max_tokens, num_logprobs
)
vllm_runner_kwargs['enable_prefix_caching'] = True
vllm_runner_kwargs["enable_prefix_caching"] = True
with vllm_runner(**vllm_runner_kwargs) as vllm_model:
# Retrieve the default mamba state block size
mamba_block_size = vllm_model.llm.llm_engine.cache_config. \
mamba_block_size
mamba_block_size = vllm_model.llm.llm_engine.cache_config.mamba_block_size
# In case the hybrid model does not have the
# "mamba_block_size" assume a fixed constant
@@ -641,18 +653,18 @@ def test_apc_multiple_prompts_block_align_alignment(
mamba_block_size = 512
mamba_block_size_multiplier = 10
for offsets in [
-3, 3, mamba_block_size // 4 + 3, mamba_block_size // 2 - 3
]:
vllm_runner_kwargs[
'max_num_batched_tokens'] = mamba_block_size_multiplier * \
mamba_block_size - offsets
vllm_outputs_cache_rep, _ = _get_vLLM_output(vllm_runner,
vllm_runner_kwargs,
generated_prompts,
max_tokens, num_logprobs,
n_repetitions)
for offsets in [-3, 3, mamba_block_size // 4 + 3, mamba_block_size // 2 - 3]:
vllm_runner_kwargs["max_num_batched_tokens"] = (
mamba_block_size_multiplier * mamba_block_size - offsets
)
vllm_outputs_cache_rep, _ = _get_vLLM_output(
vllm_runner,
vllm_runner_kwargs,
generated_prompts,
max_tokens,
num_logprobs,
n_repetitions,
)
# Check alignment of the output logits when using APC
for r_idx, vllm_outputs_cache_itn in enumerate(vllm_outputs_cache_rep):
@@ -686,7 +698,6 @@ def test_apc_multiple_prompts_partial_cached_outputs(
num_logprobs: int,
tensor_parallel_size: int,
) -> None:
try:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
@@ -694,30 +705,30 @@ def test_apc_multiple_prompts_partial_cached_outputs(
except ValueError:
pass
compare_operator: Callable = check_logprobs_close \
if num_logprobs > 0 else check_outputs_equal # type: ignore
compare_operator: Callable = (
check_logprobs_close if num_logprobs > 0 else check_outputs_equal # type: ignore
)
MULTIPLE = 300
# Sample prompts.
generated_prompts = [MULTIPLE * prompt for prompt in example_prompts]
max_model_len = max(
len(prompt) + max_tokens for prompt in generated_prompts)
max_model_len = max(len(prompt) + max_tokens for prompt in generated_prompts)
vllm_runner_kwargs = _get_vllm_runner_params(
model, max_model_len, tensor_parallel_size=tensor_parallel_size)
vllm_runner_kwargs['mamba_ssm_cache_dtype'] = "float32"
model, max_model_len, tensor_parallel_size=tensor_parallel_size
)
vllm_runner_kwargs["mamba_ssm_cache_dtype"] = "float32"
vllm_outputs_no_cache, _ = _get_vLLM_output(vllm_runner,
vllm_runner_kwargs,
generated_prompts, max_tokens,
num_logprobs)
vllm_outputs_no_cache, _ = _get_vLLM_output(
vllm_runner, vllm_runner_kwargs, generated_prompts, max_tokens, num_logprobs
)
# Cache only part of all the prompts
vllm_runner_kwargs['enable_prefix_caching'] = True
vllm_runner_kwargs["enable_prefix_caching"] = True
vllm_outputs_partial_cache, vllm_model = _get_vLLM_output(
vllm_runner, vllm_runner_kwargs, generated_prompts[:3], max_tokens,
num_logprobs)
vllm_runner, vllm_runner_kwargs, generated_prompts[:3], max_tokens, num_logprobs
)
compare_operator(
outputs_0_lst=vllm_outputs_no_cache[0][:3],
@@ -726,13 +737,15 @@ def test_apc_multiple_prompts_partial_cached_outputs(
name_1="vllm_partial_cache",
)
vllm_outputs_cache_rep, _ = _get_vLLM_output(vllm_runner,
vllm_runner_kwargs,
generated_prompts,
max_tokens,
num_logprobs,
n_repetitions,
vllm_model=vllm_model)
vllm_outputs_cache_rep, _ = _get_vLLM_output(
vllm_runner,
vllm_runner_kwargs,
generated_prompts,
max_tokens,
num_logprobs,
n_repetitions,
vllm_model=vllm_model,
)
for r_idx, vllm_outputs_cache_itn in enumerate(vllm_outputs_cache_rep):
# In the first repetition, the caches are filled

View File

@@ -6,7 +6,9 @@ import json
import pytest
from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import (
MistralToolCall, MistralToolParser)
MistralToolCall,
MistralToolParser,
)
from vllm.sampling_params import SamplingParams
from vllm.transformers_utils.tokenizer import MistralTokenizer
@@ -33,136 +35,114 @@ SYMBOLIC_LANG_PROMPTS = [
]
# for function calling
TOOLS = [{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"city": {
"type":
"string",
"description":
"The city to find the weather for, e.g. 'San Francisco'"
TOOLS = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"city": {
"type": "string",
"description": "The city to find the weather for, e.g. 'San Francisco'",
},
"state": {
"type": "string",
"description": "the two-letter abbreviation for the state that the city is"
" in, e.g. 'CA' which would mean 'California'",
},
"unit": {
"type": "string",
"description": "The unit to fetch the temperature in",
"enum": ["celsius", "fahrenheit"],
},
},
"state": {
"type":
"string",
"description":
"the two-letter abbreviation for the state that the city is"
" in, e.g. 'CA' which would mean 'California'"
},
"unit": {
"type": "string",
"description": "The unit to fetch the temperature in",
"enum": ["celsius", "fahrenheit"]
}
"required": ["city", "state", "unit"],
},
"required": ["city", "state", "unit"]
}
},
},
}, {
"type": "function",
"function": {
"name": "rewrite",
"description": "Rewrites text",
"parameters": {
"type": "object",
"required": [],
"properties": {
"text": {
"type": "string",
"description": "The input text to rewrite."
}
}
}
}
}]
{
"type": "function",
"function": {
"name": "rewrite",
"description": "Rewrites text",
"parameters": {
"type": "object",
"required": [],
"properties": {
"text": {
"type": "string",
"description": "The input text to rewrite.",
}
},
},
},
},
]
MSGS = [
{"role": "system", "content": "You are an assistant."},
{
"role": "system",
"content": "You are an assistant."
},
{
"role":
"user",
"content":
"Could you please rewrite the below article? \n\n My English needs improvving, maybe I make errors." # noqa
},
{
"role":
"assistant",
"content":
"",
"tool_calls": [{
"id": "bbc5b7ede",
"type": "function",
"function": {
"name":
"rewrite",
"arguments":
'{\"text\":\"My English needs improvving, maybe I make errors.\"}' # noqa
}
}]
},
{
"role": "tool",
"content":
"{\"action\":\"rewrite\",\"outcome\":\"My English needs improving, maybe I make errors.\"}", # noqa
"tool_call_id": "bbc5b7ede",
"name": "rewrite"
"role": "user",
"content": "Could you please rewrite the below article? \n\n My English needs improvving, maybe I make errors.", # noqa
},
{
"role": "assistant",
"content": "---\n\nMy English needs improving, maybe I make errors"
"content": "",
"tool_calls": [
{
"id": "bbc5b7ede",
"type": "function",
"function": {
"name": "rewrite",
"arguments": '{"text":"My English needs improvving, maybe I make errors."}', # noqa
},
}
],
},
{
"role":
"user",
"content": ("Can you tell me what the temperate"
" will be in Dallas, in fahrenheit?")
}
"role": "tool",
"content": '{"action":"rewrite","outcome":"My English needs improving, maybe I make errors."}', # noqa
"tool_call_id": "bbc5b7ede",
"name": "rewrite",
},
{
"role": "assistant",
"content": "---\n\nMy English needs improving, maybe I make errors",
},
{
"role": "user",
"content": (
"Can you tell me what the temperate will be in Dallas, in fahrenheit?"
),
},
]
SAMPLE_JSON_SCHEMA = {
"type": "object",
"properties": {
"name": {
"type": "string"
},
"age": {
"type": "integer"
},
"name": {"type": "string"},
"age": {"type": "integer"},
"skills": {
"type": "array",
"items": {
"type": "string",
"maxLength": 10
},
"minItems": 3
"items": {"type": "string", "maxLength": 10},
"minItems": 3,
},
"work_history": {
"type": "array",
"items": {
"type": "object",
"properties": {
"company": {
"type": "string"
},
"duration": {
"type": "number"
},
"position": {
"type": "string"
}
"company": {"type": "string"},
"duration": {"type": "number"},
"position": {"type": "string"},
},
"required": ["company", "position"]
}
}
"required": ["company", "position"],
},
},
},
"required": ["name", "age", "skills", "work_history"]
"required": ["name", "age", "skills", "work_history"],
}
@@ -170,17 +150,25 @@ SAMPLE_JSON_SCHEMA = {
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(hf_runner, vllm_runner, example_prompts, model: str,
dtype: str, max_tokens: int, num_logprobs: int) -> None:
def test_models(
hf_runner,
vllm_runner,
example_prompts,
model: str,
dtype: str,
max_tokens: int,
num_logprobs: int,
) -> None:
# TODO(sang): Sliding window should be tested separately.
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
with vllm_runner(model, dtype=dtype,
tokenizer_mode="mistral") as vllm_model:
with vllm_runner(model, dtype=dtype, tokenizer_mode="mistral") as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=hf_outputs,
@@ -194,27 +182,35 @@ def test_models(hf_runner, vllm_runner, example_prompts, model: str,
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
def test_mistral_format(vllm_runner, example_prompts, model: str, dtype: str,
max_tokens: int, num_logprobs: int) -> None:
def test_mistral_format(
vllm_runner,
example_prompts,
model: str,
dtype: str,
max_tokens: int,
num_logprobs: int,
) -> None:
with vllm_runner(
model,
dtype=dtype,
tokenizer_mode="mistral",
load_format="mistral",
config_format="mistral",
model,
dtype=dtype,
tokenizer_mode="mistral",
load_format="mistral",
config_format="mistral",
) as mistral_format_model:
mistral_format_outputs = mistral_format_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
with vllm_runner(
model,
dtype=dtype,
tokenizer_mode="auto",
load_format="safetensors",
config_format="hf",
model,
dtype=dtype,
tokenizer_mode="auto",
load_format="safetensors",
config_format="hf",
) as hf_format_model:
hf_format_outputs = hf_format_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=hf_format_outputs,
@@ -226,34 +222,35 @@ def test_mistral_format(vllm_runner, example_prompts, model: str, dtype: str,
@pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
def test_mistral_symbolic_languages(vllm_runner, model: str,
dtype: str) -> None:
with vllm_runner(model,
dtype=dtype,
max_model_len=8192,
tokenizer_mode="mistral",
config_format="mistral",
load_format="mistral") as vllm_model:
def test_mistral_symbolic_languages(vllm_runner, model: str, dtype: str) -> None:
with vllm_runner(
model,
dtype=dtype,
max_model_len=8192,
tokenizer_mode="mistral",
config_format="mistral",
load_format="mistral",
) as vllm_model:
for prompt in SYMBOLIC_LANG_PROMPTS:
msg = {"role": "user", "content": prompt}
outputs = vllm_model.llm.chat([msg],
sampling_params=SAMPLING_PARAMS)
outputs = vllm_model.llm.chat([msg], sampling_params=SAMPLING_PARAMS)
assert "<EFBFBD>" not in outputs[0].outputs[0].text.strip()
@pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
def test_mistral_function_calling(vllm_runner, model: str, dtype: str) -> None:
with vllm_runner(model,
dtype=dtype,
tokenizer_mode="mistral",
config_format="mistral",
load_format="mistral") as vllm_model:
with vllm_runner(
model,
dtype=dtype,
tokenizer_mode="mistral",
config_format="mistral",
load_format="mistral",
) as vllm_model:
msgs = copy.deepcopy(MSGS)
outputs = vllm_model.llm.chat(msgs,
tools=TOOLS,
sampling_params=SAMPLING_PARAMS)
outputs = vllm_model.llm.chat(
msgs, tools=TOOLS, sampling_params=SAMPLING_PARAMS
)
tokenizer = vllm_model.llm.get_tokenizer()
tool_parser = MistralToolParser(tokenizer)
@@ -265,10 +262,11 @@ def test_mistral_function_calling(vllm_runner, model: str, dtype: str) -> None:
assert parsed_message.tools_called
assert MistralToolCall.is_valid_id(parsed_message.tool_calls[0].id)
assert parsed_message.tool_calls[
0].function.name == "get_current_weather"
assert parsed_message.tool_calls[
0].function.arguments == '{"city": "Dallas", "state": "TX", "unit": "fahrenheit"}' # noqa
assert parsed_message.tool_calls[0].function.name == "get_current_weather"
assert (
parsed_message.tool_calls[0].function.arguments
== '{"city": "Dallas", "state": "TX", "unit": "fahrenheit"}'
) # noqa
assert parsed_message.content is None
@@ -297,17 +295,10 @@ def test_mistral_function_call_nested_json():
"city": "Dallas",
"state": "TX",
"unit": "fahrenheit",
"sub_dict": {
"foo": "bar",
"inner": {
"x": 1,
"y": 2
}
},
"sub_dict": {"foo": "bar", "inner": {"x": 1, "y": 2}},
}
model_output = (
f"{parser.bot_token}get_current_weather{json.dumps(args_dict)}")
model_output = f"{parser.bot_token}get_current_weather{json.dumps(args_dict)}"
parsed = parser.extract_tool_calls(model_output, None)

View File

@@ -15,62 +15,56 @@ MODELS = [
def test_phimoe_routing_function():
from vllm.model_executor.models.phimoe import phimoe_routing_function
test_case = {
0: {
"hidden_states":
torch.tensor([1, 2, 3, 4, 5, 6, 7, 8],
dtype=torch.float32,
requires_grad=False).view(4, 2),
"gating_output":
torch.tensor([0.1, 0.2, 0.3, 0.4],
dtype=torch.float32,
requires_grad=False),
"topk":
2,
"renormalize":
False,
"hidden_states": torch.tensor(
[1, 2, 3, 4, 5, 6, 7, 8], dtype=torch.float32, requires_grad=False
).view(4, 2),
"gating_output": torch.tensor(
[0.1, 0.2, 0.3, 0.4], dtype=torch.float32, requires_grad=False
),
"topk": 2,
"renormalize": False,
},
1: {
"hidden_states":
torch.tensor([1, 2, 3, 4, 5, 6, 7, 8],
dtype=torch.float32,
requires_grad=False).view(4, 2),
"gating_output":
torch.tensor([0.4, 0.2, 0.3, 0.4],
dtype=torch.float32,
requires_grad=False),
"topk":
2,
"renormalize":
False,
}
"hidden_states": torch.tensor(
[1, 2, 3, 4, 5, 6, 7, 8], dtype=torch.float32, requires_grad=False
).view(4, 2),
"gating_output": torch.tensor(
[0.4, 0.2, 0.3, 0.4], dtype=torch.float32, requires_grad=False
),
"topk": 2,
"renormalize": False,
},
}
ground_truth = {
0: {
"topk_weights":
torch.tensor([1., 1.], dtype=torch.float32, requires_grad=False),
"topk_ids":
torch.tensor([3, 2], dtype=torch.long, requires_grad=False),
"topk_weights": torch.tensor(
[1.0, 1.0], dtype=torch.float32, requires_grad=False
),
"topk_ids": torch.tensor([3, 2], dtype=torch.long, requires_grad=False),
},
1: {
"topk_weights":
torch.tensor([0.5, 1.], dtype=torch.float32, requires_grad=False),
"topk_ids":
torch.tensor([0, 3], dtype=torch.long, requires_grad=False),
}
"topk_weights": torch.tensor(
[0.5, 1.0], dtype=torch.float32, requires_grad=False
),
"topk_ids": torch.tensor([0, 3], dtype=torch.long, requires_grad=False),
},
}
for test_id in test_case:
topk_weights, topk_ids = phimoe_routing_function(**test_case[test_id])
assert torch.allclose(topk_weights,
ground_truth[test_id]["topk_weights"])
assert torch.allclose(topk_weights, ground_truth[test_id]["topk_weights"])
assert torch.equal(topk_ids, ground_truth[test_id]["topk_ids"])
@pytest.mark.skipif(condition=current_platform.is_cpu(),
reason="This test takes a lot time to run on CPU, "
"and vllm CI's disk space is not enough for this model.")
@pytest.mark.skipif(
condition=current_platform.is_cpu(),
reason="This test takes a lot time to run on CPU, "
"and vllm CI's disk space is not enough for this model.",
)
@large_gpu_test(min_gb=80)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@@ -87,11 +81,13 @@ def test_models(
) -> None:
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
with vllm_runner(model, dtype=dtype) as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,

View File

@@ -8,8 +8,7 @@ import torch
from datasets import load_dataset
import tests.ci_envs as ci_envs
from tests.models.utils import (GenerateModelInfo,
TokensTextLogprobsPromptLogprobs)
from tests.models.utils import GenerateModelInfo, TokensTextLogprobsPromptLogprobs
from vllm.logprobs import Logprob
# See #24485
@@ -18,13 +17,14 @@ MAX_LENGTH = 1024
@torch.inference_mode
def wikitext_ppl_test(hf_runner,
vllm_runner,
model_info: GenerateModelInfo,
max_length=MAX_LENGTH,
vllm_extra_kwargs=None,
atol=PPL_TOL):
def wikitext_ppl_test(
hf_runner,
vllm_runner,
model_info: GenerateModelInfo,
max_length=MAX_LENGTH,
vllm_extra_kwargs=None,
atol=PPL_TOL,
):
# A model family has many models with the same architecture,
# and we don't need to test each one.
if not ci_envs.VLLM_CI_NO_SKIP and not model_info.enable_test:
@@ -44,15 +44,16 @@ def wikitext_ppl_test(hf_runner,
if ci_envs.VLLM_CI_HEAD_DTYPE is not None:
if "hf_overrides" not in vllm_extra_kwargs:
vllm_extra_kwargs["hf_overrides"] = {}
vllm_extra_kwargs["hf_overrides"][
"head_dtype"] = ci_envs.VLLM_CI_HEAD_DTYPE
vllm_extra_kwargs["hf_overrides"]["head_dtype"] = ci_envs.VLLM_CI_HEAD_DTYPE
with vllm_runner(model_info.name,
gpu_memory_utilization=0.7,
max_model_len=max_length,
max_num_seqs=1,
enforce_eager=True,
**vllm_extra_kwargs) as vllm_model:
with vllm_runner(
model_info.name,
gpu_memory_utilization=0.7,
max_model_len=max_length,
max_num_seqs=1,
enforce_eager=True,
**vllm_extra_kwargs,
) as vllm_model:
# Use max_num_seqs=1 to avoid OOM,
# and avoid batch different requests together.
@@ -60,7 +61,7 @@ def wikitext_ppl_test(hf_runner,
# Confirm whether vllm is using the correct architecture
if model_info.architecture:
assert (model_info.architecture in model_config.architectures)
assert model_info.architecture in model_config.architectures
max_length = min(model_config.max_model_len - 1, max_length)
stride = max_length
@@ -74,12 +75,14 @@ def wikitext_ppl_test(hf_runner,
end_loc = min(begin_loc + max_length, n_tokens)
chunks.append(tokens[begin_loc:end_loc])
outputs = vllm_model.generate_greedy_logprobs(prompts=chunks,
max_tokens=1,
num_logprobs=None,
num_prompt_logprobs=0,
use_tqdm=False)
nll_sum = torch.tensor(0., dtype=torch.float32, device="cpu")
outputs = vllm_model.generate_greedy_logprobs(
prompts=chunks,
max_tokens=1,
num_logprobs=None,
num_prompt_logprobs=0,
use_tqdm=False,
)
nll_sum = torch.tensor(0.0, dtype=torch.float32, device="cpu")
n_tokens = 0
for output in outputs:
output = cast(TokensTextLogprobsPromptLogprobs, output)
@@ -94,7 +97,8 @@ def wikitext_ppl_test(hf_runner,
token_log_probs.append(token_log_prob)
neg_log_likelihood = -torch.tensor(
token_log_probs, dtype=torch.float32, device="cpu").sum()
token_log_probs, dtype=torch.float32, device="cpu"
).sum()
nll_sum += neg_log_likelihood
n_tokens += len(token_log_probs)
vllm_ppl = float(torch.exp(nll_sum / n_tokens))
@@ -104,14 +108,13 @@ def wikitext_ppl_test(hf_runner,
# Accelerate ppl test by setting Transformers ppl score to a constant
if model_info.hf_ppl is None:
with hf_runner(
model_info.name,
dtype=ci_envs.VLLM_CI_HF_DTYPE or model_info.hf_dtype,
model_info.name,
dtype=ci_envs.VLLM_CI_HF_DTYPE or model_info.hf_dtype,
) as hf_model:
nll_sum = torch.tensor(0., dtype=torch.float32, device="cpu")
nll_sum = torch.tensor(0.0, dtype=torch.float32, device="cpu")
n_tokens = 0
for chunk in chunks:
inputs = hf_model.wrap_device(
{"input_ids": torch.tensor([chunk])})
inputs = hf_model.wrap_device({"input_ids": torch.tensor([chunk])})
input_ids = inputs["input_ids"]
outputs = hf_model.model(input_ids, labels=input_ids)
neg_log_likelihood = outputs.loss

View File

@@ -6,8 +6,7 @@ from typing import Optional
import pytest
from tests.conftest import HfRunner
from tests.models.utils import (EmbedModelInfo, check_embeddings_close,
matryoshka_fy)
from tests.models.utils import EmbedModelInfo, check_embeddings_close, matryoshka_fy
def run_embedding_correctness_test(
@@ -29,12 +28,14 @@ def run_embedding_correctness_test(
)
def correctness_test_embed_models(hf_runner,
vllm_runner,
model_info: EmbedModelInfo,
example_prompts,
vllm_extra_kwargs=None,
hf_model_callback=None):
def correctness_test_embed_models(
hf_runner,
vllm_runner,
model_info: EmbedModelInfo,
example_prompts,
vllm_extra_kwargs=None,
hf_model_callback=None,
):
pytest.skip("Debug only, ci prefers to use mteb test.")
# The example_prompts has ending "\n", for example:
@@ -51,18 +52,16 @@ def correctness_test_embed_models(hf_runner,
if model_info.hf_overrides is not None:
vllm_extra_kwargs["hf_overrides"] = model_info.hf_overrides
with vllm_runner(model_info.name,
runner="pooling",
max_model_len=None,
**vllm_extra_kwargs) as vllm_model:
with vllm_runner(
model_info.name, runner="pooling", max_model_len=None, **vllm_extra_kwargs
) as vllm_model:
vllm_outputs = vllm_model.embed(example_prompts)
with hf_runner(
model_info.name,
dtype=model_info.hf_dtype,
is_sentence_transformer=True,
model_info.name,
dtype=model_info.hf_dtype,
is_sentence_transformer=True,
) as hf_model:
if hf_model_callback is not None:
hf_model_callback(hf_model)

View File

@@ -4,8 +4,7 @@ import pytest
import torch
from transformers import AutoModelForSequenceClassification
from tests.models.language.pooling.embed_utils import (
run_embedding_correctness_test)
from tests.models.language.pooling.embed_utils import run_embedding_correctness_test
@pytest.mark.parametrize(
@@ -20,28 +19,27 @@ def test_classify_models(
model: str,
dtype: str,
) -> None:
example_prompts = example_prompts * 2
with vllm_runner(model,
max_model_len=512,
dtype=dtype,
enable_prefix_caching=True) as vllm_model:
with vllm_runner(
model, max_model_len=512, dtype=dtype, enable_prefix_caching=True
) as vllm_model:
cache_config = vllm_model.llm.llm_engine.cache_config
assert cache_config.enable_prefix_caching
vllm_outputs = vllm_model.classify(example_prompts)
with hf_runner(model,
dtype=dtype,
auto_cls=AutoModelForSequenceClassification) as hf_model:
with hf_runner(
model, dtype=dtype, auto_cls=AutoModelForSequenceClassification
) as hf_model:
hf_outputs = hf_model.classify(example_prompts)
for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
hf_output = torch.tensor(hf_output)
vllm_output = torch.tensor(vllm_output)
assert torch.allclose(hf_output, vllm_output,
1e-3 if dtype == "float" else 1e-2)
assert torch.allclose(
hf_output, vllm_output, 1e-3 if dtype == "float" else 1e-2
)
@pytest.mark.parametrize(
@@ -59,18 +57,18 @@ def test_embed_models(
example_prompts = [str(s).strip() for s in example_prompts] * 2
with vllm_runner(
model,
runner="pooling",
max_model_len=None,
enable_prefix_caching=True,
model,
runner="pooling",
max_model_len=None,
enable_prefix_caching=True,
) as vllm_model:
cache_config = vllm_model.llm.llm_engine.cache_config
assert cache_config.enable_prefix_caching
vllm_outputs = vllm_model.embed(example_prompts)
with hf_runner(
model,
is_sentence_transformer=True,
model,
is_sentence_transformer=True,
) as hf_model:
run_embedding_correctness_test(hf_model, example_prompts, vllm_outputs)
@@ -81,13 +79,14 @@ def test_embed_models(
"intfloat/e5-small",
"Alibaba-NLP/gte-Qwen2-1.5B-instruct", # is_causal == False
"papluca/xlm-roberta-base-language-detection",
])
],
)
@pytest.mark.parametrize("dtype", ["half"])
def test_non_causal_models(hf_runner, vllm_runner, example_prompts, model: str,
dtype: str) -> None:
with vllm_runner(model,
max_model_len=512,
dtype=dtype,
enable_prefix_caching=True) as vllm_model:
def test_non_causal_models(
hf_runner, vllm_runner, example_prompts, model: str, dtype: str
) -> None:
with vllm_runner(
model, max_model_len=512, dtype=dtype, enable_prefix_caching=True
) as vllm_model:
cache_config = vllm_model.llm.llm_engine.cache_config
assert not cache_config.enable_prefix_caching

View File

@@ -10,15 +10,17 @@ from vllm.platforms import current_platform
@pytest.mark.parametrize(
"model",
[
pytest.param("jason9693/Qwen2.5-1.5B-apeach",
marks=[
pytest.mark.core_model, pytest.mark.cpu_model,
pytest.mark.slow_test
]),
pytest.param(
"jason9693/Qwen2.5-1.5B-apeach",
marks=[
pytest.mark.core_model,
pytest.mark.cpu_model,
pytest.mark.slow_test,
],
),
],
)
@pytest.mark.parametrize("dtype",
["half"] if current_platform.is_rocm() else ["float"])
@pytest.mark.parametrize("dtype", ["half"] if current_platform.is_rocm() else ["float"])
def test_models(
hf_runner,
vllm_runner,
@@ -35,9 +37,9 @@ def test_models(
with vllm_runner(model, max_model_len=512, dtype=dtype) as vllm_model:
vllm_outputs = vllm_model.classify(example_prompts)
with hf_runner(model,
dtype=dtype,
auto_cls=AutoModelForSequenceClassification) as hf_model:
with hf_runner(
model, dtype=dtype, auto_cls=AutoModelForSequenceClassification
) as hf_model:
hf_outputs = hf_model.classify(example_prompts)
# check logits difference
@@ -48,5 +50,6 @@ def test_models(
# the tolerance value of 1e-2 is selected based on the
# half datatype tests in
# tests/models/language/pooling/test_embedding.py
assert torch.allclose(hf_output, vllm_output,
1e-3 if dtype == "float" else 1e-2)
assert torch.allclose(
hf_output, vllm_output, 1e-3 if dtype == "float" else 1e-2
)

View File

@@ -18,20 +18,25 @@ from ...utils import check_embeddings_close
# case won't pass because gte-Qwen2-1.5B-instruct will cache custom
# model code with bidirectional attention.
# [Decoder-only]
pytest.param("BAAI/bge-multilingual-gemma2",
marks=[pytest.mark.core_model, pytest.mark.slow_test]),
pytest.param(
"BAAI/bge-multilingual-gemma2",
marks=[pytest.mark.core_model, pytest.mark.slow_test],
),
pytest.param(
"intfloat/e5-mistral-7b-instruct",
# CPU v1 doesn't support sliding window
marks=[pytest.mark.core_model]),
pytest.param("ssmits/Qwen2-7B-Instruct-embed-base",
marks=[pytest.mark.cpu_model]),
marks=[pytest.mark.core_model],
),
pytest.param(
"ssmits/Qwen2-7B-Instruct-embed-base", marks=[pytest.mark.cpu_model]
),
# [Encoder-only]
pytest.param(
"BAAI/bge-base-en-v1.5",
marks=[
pytest.mark.core_model, pytest.mark.cpu_model,
pytest.mark.slow_test
pytest.mark.core_model,
pytest.mark.cpu_model,
pytest.mark.slow_test,
],
),
pytest.param("sentence-transformers/all-MiniLM-L12-v2"),
@@ -50,7 +55,6 @@ def test_models(
model,
monkeypatch,
) -> None:
if model == "BAAI/bge-multilingual-gemma2" and current_platform.is_rocm():
# ROCm Triton FA does not currently support sliding window attention
# switch to use ROCm CK FA backend
@@ -58,13 +62,14 @@ def test_models(
vllm_extra_kwargs = {}
if model == "ssmits/Qwen2-7B-Instruct-embed-base":
vllm_extra_kwargs["pooler_config"] = \
PoolerConfig(pooling_type="MEAN", normalize=False)
vllm_extra_kwargs["pooler_config"] = PoolerConfig(
pooling_type="MEAN", normalize=False
)
max_model_len: Optional[int] = 512
if model in [
"sentence-transformers/all-MiniLM-L12-v2",
"sentence-transformers/stsb-roberta-base-v2"
"sentence-transformers/all-MiniLM-L12-v2",
"sentence-transformers/stsb-roberta-base-v2",
]:
max_model_len = None
@@ -79,10 +84,9 @@ def test_models(
with hf_runner(model, is_sentence_transformer=True) as hf_model:
hf_outputs = hf_model.encode(example_prompts)
with vllm_runner(model,
runner="pooling",
max_model_len=max_model_len,
**vllm_extra_kwargs) as vllm_model:
with vllm_runner(
model, runner="pooling", max_model_len=max_model_len, **vllm_extra_kwargs
) as vllm_model:
vllm_outputs = vllm_model.embed(example_prompts)
check_embeddings_close(

View File

@@ -70,8 +70,9 @@ async def run_client_embeddings(
def gritlm_instruction(instruction):
return ("<|user|>\n" + instruction +
"\n<|embed|>\n" if instruction else "<|embed|>\n")
return (
"<|user|>\n" + instruction + "\n<|embed|>\n" if instruction else "<|embed|>\n"
)
def get_test_data():
@@ -80,7 +81,8 @@ def get_test_data():
README.md in https://github.com/ContextualAI/gritlm
"""
q_instruction = gritlm_instruction(
"Given a scientific paper title, retrieve the paper's abstract", )
"Given a scientific paper title, retrieve the paper's abstract",
)
queries = [
"Bitcoin: A Peer-to-Peer Electronic Cash System",
"Generative Representational Instruction Tuning",
@@ -114,9 +116,9 @@ def test_gritlm_offline_embedding(vllm_runner):
queries, q_instruction, documents, d_instruction = get_test_data()
with vllm_runner(
MODEL_NAME,
runner="pooling",
max_model_len=MAX_MODEL_LEN,
MODEL_NAME,
runner="pooling",
max_model_len=MAX_MODEL_LEN,
) as vllm_model:
llm = vllm_model.llm
@@ -161,9 +163,9 @@ def test_gritlm_offline_generate(monkeypatch: pytest.MonkeyPatch, vllm_runner):
input = "<|user|>\nWhat is the capital of France?\n<|assistant|>\n"
with vllm_runner(
MODEL_NAME,
runner="generate",
max_model_len=MAX_MODEL_LEN,
MODEL_NAME,
runner="generate",
max_model_len=MAX_MODEL_LEN,
) as vllm_model:
llm = vllm_model.llm

View File

@@ -21,16 +21,18 @@ def test_idefics_multimodal(
"The future of AI is",
]
with vllm_runner(model_name="HuggingFaceM4/Idefics3-8B-Llama3",
runner="pooling",
task="classify",
convert="classify",
load_format="dummy",
max_model_len=512,
enforce_eager=True,
tensor_parallel_size=1,
disable_log_stats=True,
dtype="bfloat16") as vllm_model:
with vllm_runner(
model_name="HuggingFaceM4/Idefics3-8B-Llama3",
runner="pooling",
task="classify",
convert="classify",
load_format="dummy",
max_model_len=512,
enforce_eager=True,
tensor_parallel_size=1,
disable_log_stats=True,
dtype="bfloat16",
) as vllm_model:
llm = vllm_model.get_llm()
outputs = llm.classify(prompts)
for output in outputs:
@@ -38,19 +40,20 @@ def test_idefics_multimodal(
def update_config(config):
config.text_config.update({
"architectures": ["Gemma3ForSequenceClassification"],
"classifier_from_token": ["A", "B", "C", "D", "E"],
"method":
"no_post_processing",
"id2label": {
"A": "Chair",
"B": "Couch",
"C": "Table",
"D": "Bed",
"E": "Cupboard"
},
})
config.text_config.update(
{
"architectures": ["Gemma3ForSequenceClassification"],
"classifier_from_token": ["A", "B", "C", "D", "E"],
"method": "no_post_processing",
"id2label": {
"A": "Chair",
"B": "Couch",
"C": "Table",
"D": "Bed",
"E": "Cupboard",
},
}
)
return config
@@ -63,11 +66,10 @@ def test_gemma_multimodal(
# switch to use ROCm CK FA backend
monkeypatch.setenv("VLLM_USE_TRITON_FLASH_ATTN", "False")
messages = [{
"role":
"system",
"content":
"""
messages = [
{
"role": "system",
"content": """
You are a helpful assistant. You will be given a product description
which may also include an image. Classify the following product into
one of the categories:
@@ -78,38 +80,39 @@ def test_gemma_multimodal(
D = bed
E = cupboard
You'll answer with exactly one letter (A, B, C, D, or E)."""
}, {
"role":
"user",
"content": [{
"type": "image_url",
"image_url": {
"url":
"https://upload.wikimedia.org/wikipedia/commons/c/c6/Set_of_fourteen_side_chairs_MET_DP110780.jpg"
}
}, {
"type": "text",
"text": "A fine 19th century piece of furniture."
}]
}]
with vllm_runner(model_name="google/gemma-3-4b-it",
runner="pooling",
task="classify",
convert="classify",
load_format="auto",
hf_overrides=update_config,
pooler_config=PoolerConfig(pooling_type="LAST"),
max_model_len=512,
enforce_eager=True,
tensor_parallel_size=1,
disable_log_stats=True,
dtype="bfloat16") as vllm_model:
You'll answer with exactly one letter (A, B, C, D, or E).""",
},
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {
"url": "https://upload.wikimedia.org/wikipedia/commons/c/c6/Set_of_fourteen_side_chairs_MET_DP110780.jpg"
},
},
{"type": "text", "text": "A fine 19th century piece of furniture."},
],
},
]
with vllm_runner(
model_name="google/gemma-3-4b-it",
runner="pooling",
task="classify",
convert="classify",
load_format="auto",
hf_overrides=update_config,
pooler_config=PoolerConfig(pooling_type="LAST"),
max_model_len=512,
enforce_eager=True,
tensor_parallel_size=1,
disable_log_stats=True,
dtype="bfloat16",
) as vllm_model:
llm = vllm_model.get_llm()
prompts = llm.preprocess_chat(messages)
result = llm.classify(prompts)
assert result[0].outputs.probs[0] > 0.95
assert all(c < 0.05 for c in result[0].outputs.probs[1:])
assert all(c < 0.05 for c in result[0].outputs.probs[1:])

View File

@@ -20,14 +20,15 @@ def test_classify_models(
with vllm_runner(model, max_model_len=512, dtype=dtype) as vllm_model:
vllm_outputs = vllm_model.classify(example_prompts)
with hf_runner(model,
dtype=dtype,
auto_cls=AutoModelForSequenceClassification) as hf_model:
with hf_runner(
model, dtype=dtype, auto_cls=AutoModelForSequenceClassification
) as hf_model:
hf_outputs = hf_model.classify(example_prompts)
for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
hf_output = torch.tensor(hf_output)
vllm_output = torch.tensor(vllm_output)
assert torch.allclose(hf_output, vllm_output,
1e-3 if dtype == "float" else 1e-2)
assert torch.allclose(
hf_output, vllm_output, 1e-3 if dtype == "float" else 1e-2
)

View File

@@ -7,10 +7,10 @@ from ...utils import EmbedModelInfo
MODELS = [
EmbedModelInfo("nomic-ai/nomic-embed-text-v1"),
#EmbedModelInfo("nomic-ai/nomic-embed-text-v1.5"),
#EmbedModelInfo("nomic-ai/CodeRankEmbed"),
# EmbedModelInfo("nomic-ai/nomic-embed-text-v1.5"),
# EmbedModelInfo("nomic-ai/CodeRankEmbed"),
EmbedModelInfo("nomic-ai/nomic-embed-text-v2-moe"),
#EmbedModelInfo("Snowflake/snowflake-arctic-embed-m-long"),
# EmbedModelInfo("Snowflake/snowflake-arctic-embed-m-long"),
]
rope_theta = 1000
@@ -21,23 +21,24 @@ max_model_len = int(original_max_position_embeddings * factor)
@pytest.mark.parametrize("model_info", MODELS)
def test_default(model_info, vllm_runner):
with vllm_runner(model_info.name, runner="pooling",
max_model_len=None) as vllm_model:
with vllm_runner(
model_info.name, runner="pooling", max_model_len=None
) as vllm_model:
model_config = vllm_model.llm.llm_engine.model_config
if model_info.name == "nomic-ai/nomic-embed-text-v2-moe":
# For nomic-embed-text-v2-moe the length is set to 512
# by sentence_bert_config.json.
assert model_config.max_model_len == 512
else:
assert (
model_config.max_model_len == original_max_position_embeddings)
assert model_config.max_model_len == original_max_position_embeddings
@pytest.mark.parametrize("model_info", MODELS)
def test_set_max_model_len_legal(model_info, vllm_runner):
# set max_model_len <= 512
with vllm_runner(model_info.name, runner="pooling",
max_model_len=256) as vllm_model:
with vllm_runner(
model_info.name, runner="pooling", max_model_len=256
) as vllm_model:
model_config = vllm_model.llm.llm_engine.model_config
assert model_config.max_model_len == 256
@@ -46,13 +47,12 @@ def test_set_max_model_len_legal(model_info, vllm_runner):
# For nomic-embed-text-v2-moe the length is set to 512
# by sentence_bert_config.json.
with pytest.raises(ValueError):
with vllm_runner(model_info.name,
runner="pooling",
max_model_len=1024):
with vllm_runner(model_info.name, runner="pooling", max_model_len=1024):
pass
else:
with vllm_runner(model_info.name, runner="pooling",
max_model_len=1024) as vllm_model:
with vllm_runner(
model_info.name, runner="pooling", max_model_len=1024
) as vllm_model:
model_config = vllm_model.llm.llm_engine.model_config
assert model_config.max_model_len == 1024
@@ -61,17 +61,18 @@ def test_set_max_model_len_legal(model_info, vllm_runner):
def test_set_max_model_len_illegal(model_info, vllm_runner):
# set max_model_len > 2048
with pytest.raises(ValueError):
with vllm_runner(model_info.name, runner="pooling",
max_model_len=4096):
with vllm_runner(model_info.name, runner="pooling", max_model_len=4096):
pass
# set max_model_len > 2048 by hf_overrides
hf_overrides = {"max_model_len": 4096}
with pytest.raises(ValueError):
with vllm_runner(model_info.name,
runner="pooling",
max_model_len=None,
hf_overrides=hf_overrides):
with vllm_runner(
model_info.name,
runner="pooling",
max_model_len=None,
hf_overrides=hf_overrides,
):
pass
@@ -82,16 +83,14 @@ def test_use_rope_scaling_legal(model_info, vllm_runner):
"rope_scaling": {
"rope_type": "yarn",
"factor": factor,
"original_max_position_embeddings":
original_max_position_embeddings
"original_max_position_embeddings": original_max_position_embeddings,
},
"max_model_len": max_model_len
"max_model_len": max_model_len,
}
with vllm_runner(model_info.name,
runner="pooling",
max_model_len=None,
hf_overrides=hf_overrides):
with vllm_runner(
model_info.name, runner="pooling", max_model_len=None, hf_overrides=hf_overrides
):
pass
@@ -102,16 +101,17 @@ def test_use_rope_scaling_illegal(model_info, vllm_runner):
"rope_scaling": {
"rope_type": "yarn",
"factor": factor,
"original_max_position_embeddings":
original_max_position_embeddings
}
"original_max_position_embeddings": original_max_position_embeddings,
},
}
# illegal max_model_len
with pytest.raises(ValueError):
with vllm_runner(model_info.name,
runner="pooling",
max_model_len=max_model_len + 1,
hf_overrides=hf_overrides):
with vllm_runner(
model_info.name,
runner="pooling",
max_model_len=max_model_len + 1,
hf_overrides=hf_overrides,
):
pass
hf_overrides = {
@@ -119,15 +119,16 @@ def test_use_rope_scaling_illegal(model_info, vllm_runner):
"rope_scaling": {
"rope_type": "yarn",
"factor": factor,
"original_max_position_embeddings":
original_max_position_embeddings
"original_max_position_embeddings": original_max_position_embeddings,
},
"max_model_len": max_model_len + 1
"max_model_len": max_model_len + 1,
}
# illegal max_model_len by hf_overrides
with pytest.raises(ValueError):
with vllm_runner(model_info.name,
runner="pooling",
max_model_len=None,
hf_overrides=hf_overrides):
with vllm_runner(
model_info.name,
runner="pooling",
max_model_len=None,
hf_overrides=hf_overrides,
):
pass

View File

@@ -10,10 +10,7 @@ from vllm.config import PoolerConfig
@pytest.mark.parametrize(
"model",
[
"jason9693/Qwen2.5-1.5B-apeach",
"papluca/xlm-roberta-base-language-detection"
],
["jason9693/Qwen2.5-1.5B-apeach", "papluca/xlm-roberta-base-language-detection"],
)
@pytest.mark.parametrize("dtype", ["half"])
def test_classify_models_using_activation(
@@ -23,30 +20,32 @@ def test_classify_models_using_activation(
model: str,
dtype: str,
) -> None:
with vllm_runner(
model,
max_model_len=512,
dtype=dtype,
pooler_config=PoolerConfig(activation=False)) as vllm_model:
model,
max_model_len=512,
dtype=dtype,
pooler_config=PoolerConfig(activation=False),
) as vllm_model:
wo_activation_out = vllm_model.classify(example_prompts)
with vllm_runner(
model,
max_model_len=512,
dtype=dtype,
pooler_config=PoolerConfig(activation=True)) as vllm_model:
model,
max_model_len=512,
dtype=dtype,
pooler_config=PoolerConfig(activation=True),
) as vllm_model:
w_activation_out = vllm_model.classify(example_prompts)
for wo_activation, w_activation in zip(wo_activation_out,
w_activation_out):
for wo_activation, w_activation in zip(wo_activation_out, w_activation_out):
wo_activation = torch.tensor(wo_activation)
w_activation = torch.tensor(w_activation)
assert not torch.allclose(wo_activation, w_activation,
atol=1e-2), "pooler_config is not working"
assert torch.allclose(softmax(wo_activation), w_activation,
1e-3 if dtype == "float" else 1e-2)
assert not torch.allclose(wo_activation, w_activation, atol=1e-2), (
"pooler_config is not working"
)
assert torch.allclose(
softmax(wo_activation), w_activation, 1e-3 if dtype == "float" else 1e-2
)
@pytest.mark.parametrize(
@@ -63,26 +62,28 @@ def test_embed_models_using_normalize(
model: str,
dtype: str,
) -> None:
with vllm_runner(
model,
max_model_len=512,
dtype=dtype,
pooler_config=PoolerConfig(normalize=False)) as vllm_model:
model,
max_model_len=512,
dtype=dtype,
pooler_config=PoolerConfig(normalize=False),
) as vllm_model:
wo_normalize = torch.tensor(vllm_model.embed(example_prompts))
with vllm_runner(model,
max_model_len=512,
dtype=dtype,
pooler_config=PoolerConfig(normalize=True)) as vllm_model:
with vllm_runner(
model,
max_model_len=512,
dtype=dtype,
pooler_config=PoolerConfig(normalize=True),
) as vllm_model:
w_normalize = torch.tensor(vllm_model.embed(example_prompts))
assert not torch.allclose(
wo_normalize, w_normalize,
atol=1e-2), "pooler_config normalize is not working"
assert not torch.allclose(wo_normalize, w_normalize, atol=1e-2), (
"pooler_config normalize is not working"
)
assert torch.allclose(
F.normalize(wo_normalize, p=2, dim=-1), w_normalize,
atol=1e-2), "w_normal should be close to normal(wo_normal)."
F.normalize(wo_normalize, p=2, dim=-1), w_normalize, atol=1e-2
), "w_normal should be close to normal(wo_normal)."
@pytest.mark.parametrize(
@@ -99,25 +100,26 @@ def test_reward_models_using_softmax(
model: str,
dtype: str,
) -> None:
with vllm_runner(model,
max_model_len=1024,
dtype=dtype,
pooler_config=PoolerConfig(softmax=False)) as vllm_model:
with vllm_runner(
model,
max_model_len=1024,
dtype=dtype,
pooler_config=PoolerConfig(softmax=False),
) as vllm_model:
wo_softmax = vllm_model.encode(example_prompts)
with vllm_runner(model,
max_model_len=1024,
dtype=dtype,
pooler_config=PoolerConfig(softmax=True)) as vllm_model:
with vllm_runner(
model, max_model_len=1024, dtype=dtype, pooler_config=PoolerConfig(softmax=True)
) as vllm_model:
w_softmax = vllm_model.encode(example_prompts)
for wo, w in zip(wo_softmax, w_softmax):
wo = torch.tensor(wo)
w = torch.tensor(w)
assert not torch.allclose(
wo, w, atol=1e-2), "pooler_config softmax is not working"
assert torch.allclose(
softmax(wo), w,
atol=1e-2), "w_softmax should be close to softmax(wo_softmax)."
assert not torch.allclose(wo, w, atol=1e-2), (
"pooler_config softmax is not working"
)
assert torch.allclose(softmax(wo), w, atol=1e-2), (
"w_softmax should be close to softmax(wo_softmax)."
)

View File

@@ -16,10 +16,8 @@ from ...utils import check_transformers_version
def math_step_prompts():
# ruff: noqa: E501
data = {
"system":
"Please reason step by step, and put your final answer within \\boxed{}. ",
"query":
"Sue lives in a fun neighborhood. One weekend, the neighbors decided to play a prank on Sue. On Friday morning, the neighbors placed 18 pink plastic flamingos out on Sue's front yard. On Saturday morning, the neighbors took back one third of the flamingos, painted them white, and put these newly painted white flamingos back out on Sue's front yard. Then, on Sunday morning, they added another 18 pink plastic flamingos to the collection. At noon on Sunday, how many more pink plastic flamingos were out than white plastic flamingos?",
"system": "Please reason step by step, and put your final answer within \\boxed{}. ",
"query": "Sue lives in a fun neighborhood. One weekend, the neighbors decided to play a prank on Sue. On Friday morning, the neighbors placed 18 pink plastic flamingos out on Sue's front yard. On Saturday morning, the neighbors took back one third of the flamingos, painted them white, and put these newly painted white flamingos back out on Sue's front yard. Then, on Sunday morning, they added another 18 pink plastic flamingos to the collection. At noon on Sunday, how many more pink plastic flamingos were out than white plastic flamingos?",
"response": [
"To find out how many more pink plastic flamingos were out than white plastic flamingos at noon on Sunday, we can break down the problem into steps. First, on Friday, the neighbors start with 18 pink plastic flamingos.",
"On Saturday, they take back one third of the flamingos. Since there were 18 flamingos, (1/3 \\times 18 = 6) flamingos are taken back. So, they have (18 - 6 = 12) flamingos left in their possession. Then, they paint these 6 flamingos white and put them back out on Sue's front yard. Now, Sue has the original 12 pink flamingos plus the 6 new white ones. Thus, by the end of Saturday, Sue has (12 + 6 = 18) pink flamingos and 6 white flamingos.",
@@ -27,16 +25,16 @@ def math_step_prompts():
"To find the difference, subtract the number of white flamingos from the number of pink flamingos: (36 - 6 = 30). Therefore, at noon on Sunday, there were 30 more pink plastic flamingos out than white plastic flamingos. The answer is (\\boxed{30}).",
],
}
answer = "<extra_0>".join(data['response']) + "<extra_0>"
answer = "<extra_0>".join(data["response"]) + "<extra_0>"
prompt = f"<im_start>system\n{data['system']}<im_end>\n<im_start>user\n{data['query']}<im_end>\n<im_start>assistant\n{answer}<im_end><|endoftext|>"
return [prompt]
def step_reward_patch_hf_model(hf_model: HfRunner):
# Patch the hf_runner to use the step reward function
def make_step_rewards(logits: torch.Tensor,
token_masks: torch.Tensor) -> list[list[float]]:
def make_step_rewards(
logits: torch.Tensor, token_masks: torch.Tensor
) -> list[list[float]]:
probabilities = F.softmax(logits, dim=-1)
probabilities = probabilities * token_masks.unsqueeze(-1)
@@ -54,7 +52,7 @@ def step_reward_patch_hf_model(hf_model: HfRunner):
outputs = hf_model.model(input_ids=input_ids)
step_sep_id = hf_model.tokenizer.encode("<extra_0>")[0]
token_masks = (input_ids == step_sep_id)
token_masks = input_ids == step_sep_id
return make_step_rewards(outputs[0], token_masks)
hf_model.reward = reward # type: ignore[attr-defined]
@@ -65,8 +63,10 @@ def step_reward_patch_hf_model(hf_model: HfRunner):
@pytest.mark.parametrize(
"model",
[
pytest.param("Qwen/Qwen2.5-Math-PRM-7B",
marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
pytest.param(
"Qwen/Qwen2.5-Math-PRM-7B",
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
),
],
)
@pytest.mark.parametrize("dtype", ["half"])
@@ -78,8 +78,9 @@ def test_prm_models(
dtype: str,
monkeypatch,
) -> None:
check_transformers_version("Qwen/Qwen2.5-Math-PRM-7B",
max_transformers_version="4.53.2")
check_transformers_version(
"Qwen/Qwen2.5-Math-PRM-7B", max_transformers_version="4.53.2"
)
if current_platform.is_cpu():
pytest.skip("CPU only supports V1")

View File

@@ -37,10 +37,9 @@ def test_cross_encoder_1_to_1(vllm_runner, hf_runner, model_name):
with hf_runner(model_name, dtype=DTYPE, is_cross_encoder=True) as hf_model:
hf_outputs = hf_model.predict([text_pair]).tolist()
with vllm_runner(model_name,
runner="pooling",
dtype=DTYPE,
max_model_len=None) as vllm_model:
with vllm_runner(
model_name, runner="pooling", dtype=DTYPE, max_model_len=None
) as vllm_model:
vllm_outputs = vllm_model.score(text_pair[0], text_pair[1])
assert len(vllm_outputs) == 1
@@ -58,10 +57,9 @@ def test_cross_encoder_1_to_N(vllm_runner, hf_runner, model_name):
with hf_runner(model_name, dtype=DTYPE, is_cross_encoder=True) as hf_model:
hf_outputs = hf_model.predict(text_pairs).tolist()
with vllm_runner(model_name,
runner="pooling",
dtype=DTYPE,
max_model_len=None) as vllm_model:
with vllm_runner(
model_name, runner="pooling", dtype=DTYPE, max_model_len=None
) as vllm_model:
vllm_outputs = vllm_model.score(TEXTS_1[0], TEXTS_2)
assert len(vllm_outputs) == 2
@@ -80,10 +78,9 @@ def test_cross_encoder_N_to_N(vllm_runner, hf_runner, model_name):
with hf_runner(model_name, dtype=DTYPE, is_cross_encoder=True) as hf_model:
hf_outputs = hf_model.predict(text_pairs).tolist()
with vllm_runner(model_name,
runner="pooling",
dtype=DTYPE,
max_model_len=None) as vllm_model:
with vllm_runner(
model_name, runner="pooling", dtype=DTYPE, max_model_len=None
) as vllm_model:
vllm_outputs = vllm_model.score(TEXTS_1, TEXTS_2)
assert len(vllm_outputs) == 2
@@ -101,17 +98,15 @@ def emb_model_name(request):
def test_embedding_1_to_1(vllm_runner, hf_runner, emb_model_name):
text_pair = [TEXTS_1[0], TEXTS_2[0]]
with hf_runner(emb_model_name, dtype=DTYPE,
is_sentence_transformer=True) as hf_model:
with hf_runner(
emb_model_name, dtype=DTYPE, is_sentence_transformer=True
) as hf_model:
hf_embeddings = hf_model.encode(text_pair)
hf_outputs = [
F.cosine_similarity(*map(torch.tensor, hf_embeddings), dim=0)
]
hf_outputs = [F.cosine_similarity(*map(torch.tensor, hf_embeddings), dim=0)]
with vllm_runner(emb_model_name,
runner="pooling",
dtype=DTYPE,
max_model_len=None) as vllm_model:
with vllm_runner(
emb_model_name, runner="pooling", dtype=DTYPE, max_model_len=None
) as vllm_model:
vllm_outputs = vllm_model.score(text_pair[0], text_pair[1])
assert len(vllm_outputs) == 1
@@ -126,20 +121,18 @@ def test_embedding_1_to_N(vllm_runner, hf_runner, emb_model_name):
[TEXTS_1[0], TEXTS_2[1]],
]
with hf_runner(emb_model_name, dtype=DTYPE,
is_sentence_transformer=True) as hf_model:
hf_embeddings = [
hf_model.encode(text_pair) for text_pair in text_pairs
]
with hf_runner(
emb_model_name, dtype=DTYPE, is_sentence_transformer=True
) as hf_model:
hf_embeddings = [hf_model.encode(text_pair) for text_pair in text_pairs]
hf_outputs = [
F.cosine_similarity(*map(torch.tensor, pair), dim=0)
for pair in hf_embeddings
]
with vllm_runner(emb_model_name,
runner="pooling",
dtype=DTYPE,
max_model_len=None) as vllm_model:
with vllm_runner(
emb_model_name, runner="pooling", dtype=DTYPE, max_model_len=None
) as vllm_model:
vllm_outputs = vllm_model.score(TEXTS_1[0], TEXTS_2)
assert len(vllm_outputs) == 2
@@ -155,20 +148,18 @@ def test_embedding_N_to_N(vllm_runner, hf_runner, emb_model_name):
[TEXTS_1[1], TEXTS_2[1]],
]
with hf_runner(emb_model_name, dtype=DTYPE,
is_sentence_transformer=True) as hf_model:
hf_embeddings = [
hf_model.encode(text_pair) for text_pair in text_pairs
]
with hf_runner(
emb_model_name, dtype=DTYPE, is_sentence_transformer=True
) as hf_model:
hf_embeddings = [hf_model.encode(text_pair) for text_pair in text_pairs]
hf_outputs = [
F.cosine_similarity(*map(torch.tensor, pair), dim=0)
for pair in hf_embeddings
]
with vllm_runner(emb_model_name,
runner="pooling",
dtype=DTYPE,
max_model_len=None) as vllm_model:
with vllm_runner(
emb_model_name, runner="pooling", dtype=DTYPE, max_model_len=None
) as vllm_model:
vllm_outputs = vllm_model.score(TEXTS_1, TEXTS_2)
assert len(vllm_outputs) == 2

View File

@@ -21,9 +21,9 @@ def test_models(
with vllm_runner(model, max_model_len=None, dtype=dtype) as vllm_model:
vllm_outputs = vllm_model.encode(example_prompts)
with hf_runner(model,
dtype=dtype,
auto_cls=AutoModelForTokenClassification) as hf_model:
with hf_runner(
model, dtype=dtype, auto_cls=AutoModelForTokenClassification
) as hf_model:
tokenizer = hf_model.tokenizer
hf_outputs = []
for prompt in example_prompts:

View File

@@ -20,51 +20,57 @@ calculus, each contributing unique perspectives that would shape this new
field."""
def test_smaller_truncation_size(vllm_runner,
model_name=MODEL_NAME,
input_str=input_str):
def test_smaller_truncation_size(
vllm_runner, model_name=MODEL_NAME, input_str=input_str
):
truncate_prompt_tokens = 10
with vllm_runner(model_name, runner="pooling",
max_model_len=max_model_len) as vllm_model:
with vllm_runner(
model_name, runner="pooling", max_model_len=max_model_len
) as vllm_model:
vllm_output = vllm_model.llm.embed(
input_str, truncate_prompt_tokens=truncate_prompt_tokens)
input_str, truncate_prompt_tokens=truncate_prompt_tokens
)
prompt_tokens = vllm_output[0].prompt_token_ids
assert len(prompt_tokens) == truncate_prompt_tokens
def test_max_truncation_size(vllm_runner,
model_name=MODEL_NAME,
input_str=input_str):
def test_max_truncation_size(vllm_runner, model_name=MODEL_NAME, input_str=input_str):
truncate_prompt_tokens = -1
with vllm_runner(model_name, runner="pooling",
max_model_len=max_model_len) as vllm_model:
with vllm_runner(
model_name, runner="pooling", max_model_len=max_model_len
) as vllm_model:
vllm_output = vllm_model.llm.embed(
input_str, truncate_prompt_tokens=truncate_prompt_tokens)
input_str, truncate_prompt_tokens=truncate_prompt_tokens
)
prompt_tokens = vllm_output[0].prompt_token_ids
assert len(prompt_tokens) == max_model_len
def test_bigger_truncation_size(vllm_runner,
model_name=MODEL_NAME,
input_str=input_str):
def test_bigger_truncation_size(
vllm_runner, model_name=MODEL_NAME, input_str=input_str
):
truncate_prompt_tokens = max_model_len + 1
with pytest.raises(ValueError), vllm_runner(
model_name, runner="pooling",
max_model_len=max_model_len) as vllm_model:
with (
pytest.raises(ValueError),
vllm_runner(
model_name, runner="pooling", max_model_len=max_model_len
) as vllm_model,
):
llm_output = vllm_model.llm.embed(
input_str, truncate_prompt_tokens=truncate_prompt_tokens)
input_str, truncate_prompt_tokens=truncate_prompt_tokens
)
assert llm_output == f"""truncate_prompt_tokens value
assert (
llm_output
== f"""truncate_prompt_tokens value
({truncate_prompt_tokens}) is greater than
max_model_len ({max_model_len}). Please, select
a smaller truncation size."""
)

View File

@@ -12,8 +12,7 @@ import requests
import torch
import tests.ci_envs as ci_envs
from tests.models.utils import (EmbedModelInfo, RerankModelInfo,
check_embeddings_close)
from tests.models.utils import EmbedModelInfo, RerankModelInfo, check_embeddings_close
# Most embedding models on the STS12 task (See #17175):
# - Model implementation and minor changes in tensor dtype
@@ -30,7 +29,6 @@ MTEB_RERANK_TOL = 2e-3
class VllmMtebEncoder(mteb.Encoder):
def __init__(self, vllm_model):
super().__init__()
self.llm = vllm_model
@@ -53,8 +51,7 @@ class VllmMtebEncoder(mteb.Encoder):
def predict(
self,
sentences: list[tuple[str, str,
Optional[str]]], # query, corpus, prompt
sentences: list[tuple[str, str, Optional[str]]], # query, corpus, prompt
*args,
**kwargs,
) -> np.ndarray:
@@ -64,17 +61,15 @@ class VllmMtebEncoder(mteb.Encoder):
queries = [s[0] for s in sentences]
corpus = [s[1] for s in sentences]
outputs = self.llm.score(queries,
corpus,
truncate_prompt_tokens=-1,
use_tqdm=False)
outputs = self.llm.score(
queries, corpus, truncate_prompt_tokens=-1, use_tqdm=False
)
scores = np.array(outputs)
scores = scores[np.argsort(r)]
return scores
class OpenAIClientMtebEncoder(mteb.Encoder):
def __init__(self, model_name: str, client):
super().__init__()
self.model_name = model_name
@@ -87,8 +82,9 @@ class OpenAIClientMtebEncoder(mteb.Encoder):
r = self.rng.permutation(len(sentences))
sentences = [sentences[i] for i in r]
embeddings = self.client.embeddings.create(model=self.model_name,
input=sentences)
embeddings = self.client.embeddings.create(
model=self.model_name, input=sentences
)
outputs = [d.embedding for d in embeddings.data]
embeds = np.array(outputs)
embeds = embeds[np.argsort(r)]
@@ -96,7 +92,6 @@ class OpenAIClientMtebEncoder(mteb.Encoder):
class ScoreClientMtebEncoder(mteb.Encoder):
def __init__(self, model_name: str, url):
super().__init__()
self.model_name = model_name
@@ -105,8 +100,7 @@ class ScoreClientMtebEncoder(mteb.Encoder):
def predict(
self,
sentences: list[tuple[str, str,
Optional[str]]], # query, corpus, prompt
sentences: list[tuple[str, str, Optional[str]]], # query, corpus, prompt
*args,
**kwargs,
) -> np.ndarray:
@@ -122,27 +116,30 @@ class ScoreClientMtebEncoder(mteb.Encoder):
return scores
def get_score(self, query, corpus):
response = requests.post(self.url,
json={
"model": self.model_name,
"text_1": query,
"text_2": corpus,
"truncate_prompt_tokens": -1,
}).json()
return response['data'][0]["score"]
response = requests.post(
self.url,
json={
"model": self.model_name,
"text_1": query,
"text_2": corpus,
"truncate_prompt_tokens": -1,
},
).json()
return response["data"][0]["score"]
class RerankClientMtebEncoder(ScoreClientMtebEncoder):
def get_score(self, query, corpus):
response = requests.post(self.url,
json={
"model": self.model_name,
"query": query,
"documents": [corpus],
"truncate_prompt_tokens": -1,
}).json()
return response['results'][0]["relevance_score"]
response = requests.post(
self.url,
json={
"model": self.model_name,
"query": query,
"documents": [corpus],
"truncate_prompt_tokens": -1,
},
).json()
return response["results"][0]["relevance_score"]
def run_mteb_embed_task(encoder, tasks):
@@ -161,12 +158,14 @@ def run_mteb_embed_task(encoder, tasks):
return main_score
def mteb_test_embed_models(hf_runner,
vllm_runner,
model_info: EmbedModelInfo,
vllm_extra_kwargs=None,
hf_model_callback=None,
atol=MTEB_EMBED_TOL):
def mteb_test_embed_models(
hf_runner,
vllm_runner,
model_info: EmbedModelInfo,
vllm_extra_kwargs=None,
hf_model_callback=None,
atol=MTEB_EMBED_TOL,
):
# A model family has many models with the same architecture,
# and we don't need to test each one.
if not ci_envs.VLLM_CI_NO_SKIP and not model_info.enable_test:
@@ -187,15 +186,15 @@ def mteb_test_embed_models(hf_runner,
if ci_envs.VLLM_CI_HEAD_DTYPE is not None:
if "hf_overrides" not in vllm_extra_kwargs:
vllm_extra_kwargs["hf_overrides"] = {}
vllm_extra_kwargs["hf_overrides"][
"head_dtype"] = ci_envs.VLLM_CI_HEAD_DTYPE
with vllm_runner(model_info.name,
runner="pooling",
max_model_len=None,
enforce_eager=True,
**vllm_extra_kwargs) as vllm_model:
vllm_extra_kwargs["hf_overrides"]["head_dtype"] = ci_envs.VLLM_CI_HEAD_DTYPE
with vllm_runner(
model_info.name,
runner="pooling",
max_model_len=None,
enforce_eager=True,
**vllm_extra_kwargs,
) as vllm_model:
model_config = vllm_model.llm.llm_engine.model_config
# Confirm whether vllm is using the correct architecture
@@ -204,28 +203,29 @@ def mteb_test_embed_models(hf_runner,
# Confirm whether vllm uses the correct default_pooling_type, which
# relates to whether chunked prefill and prefix caching are enabled
assert (model_config._model_info.default_pooling_type ==
model_info.default_pooling_type)
assert (
model_config._model_info.default_pooling_type
== model_info.default_pooling_type
)
vllm_main_score = run_mteb_embed_task(VllmMtebEncoder(vllm_model),
MTEB_EMBED_TASKS)
vllm_main_score = run_mteb_embed_task(
VllmMtebEncoder(vllm_model), MTEB_EMBED_TASKS
)
vllm_dtype = vllm_model.llm.llm_engine.model_config.dtype
head_dtype = model_config.head_dtype
# Test embed_dims, isnan and whether to use normalize
vllm_outputs = vllm_model.embed(example_prompts,
truncate_prompt_tokens=-1)
vllm_outputs = vllm_model.embed(example_prompts, truncate_prompt_tokens=-1)
assert not torch.any(torch.isnan(torch.tensor(vllm_outputs)))
# Accelerate mteb test by setting
# SentenceTransformers mteb score to a constant
if model_info.mteb_score is None:
with hf_runner(
model_info.name,
is_sentence_transformer=True,
dtype=ci_envs.VLLM_CI_HF_DTYPE or model_info.hf_dtype,
model_info.name,
is_sentence_transformer=True,
dtype=ci_envs.VLLM_CI_HF_DTYPE or model_info.hf_dtype,
) as hf_model:
# e.g. setting default parameters for the encode method of hf_runner
if hf_model_callback is not None:
hf_model_callback(hf_model)
@@ -247,8 +247,7 @@ def mteb_test_embed_models(hf_runner,
st_dtype = "Constant"
print("Model:", model_info.name)
print("VLLM:", f"dtype:{vllm_dtype}", f"head_dtype:{head_dtype}",
vllm_main_score)
print("VLLM:", f"dtype:{vllm_dtype}", f"head_dtype:{head_dtype}", vllm_main_score)
print("SentenceTransformers:", st_dtype, st_main_score)
print("Difference:", st_main_score - vllm_main_score)
@@ -282,26 +281,21 @@ def run_mteb_rerank(cross_encoder, tasks, languages):
top_k=10,
save_predictions=True,
output_folder=f"{results_folder}/stage2",
previous_results=
f"{results_folder}/stage1/NFCorpus_{subset}_predictions.json",
previous_results=f"{results_folder}/stage1/NFCorpus_{subset}_predictions.json",
encode_kwargs={"show_progress_bar": False},
)
main_score = results[0].scores["test"][0]["main_score"]
return main_score
def mteb_test_rerank_models_hf(hf_runner,
model_name,
hf_dtype="float32",
hf_model_callback=None):
with hf_runner(model_name, is_cross_encoder=True,
dtype=hf_dtype) as hf_model:
def mteb_test_rerank_models_hf(
hf_runner, model_name, hf_dtype="float32", hf_model_callback=None
):
with hf_runner(model_name, is_cross_encoder=True, dtype=hf_dtype) as hf_model:
original_predict = hf_model.predict
def _predict(
sentences: list[tuple[str, str,
Optional[str]]], # query, corpus, prompt
sentences: list[tuple[str, str, Optional[str]]], # query, corpus, prompt
*args,
**kwargs,
):
@@ -315,20 +309,22 @@ def mteb_test_rerank_models_hf(hf_runner,
if hf_model_callback is not None:
hf_model_callback(hf_model)
st_main_score = run_mteb_rerank(hf_model,
tasks=MTEB_RERANK_TASKS,
languages=MTEB_RERANK_LANGS)
st_main_score = run_mteb_rerank(
hf_model, tasks=MTEB_RERANK_TASKS, languages=MTEB_RERANK_LANGS
)
st_dtype = next(hf_model.model.model.parameters()).dtype
return st_main_score, st_dtype
def mteb_test_rerank_models(hf_runner,
vllm_runner,
model_info: RerankModelInfo,
vllm_extra_kwargs=None,
hf_model_callback=None,
vllm_mteb_encoder=VllmMtebEncoder,
atol=MTEB_RERANK_TOL):
def mteb_test_rerank_models(
hf_runner,
vllm_runner,
model_info: RerankModelInfo,
vllm_extra_kwargs=None,
hf_model_callback=None,
vllm_mteb_encoder=VllmMtebEncoder,
atol=MTEB_RERANK_TOL,
):
# A model family has many models with the same architecture,
# and we don't need to test each one.
if not ci_envs.VLLM_CI_NO_SKIP and not model_info.enable_test:
@@ -346,33 +342,37 @@ def mteb_test_rerank_models(hf_runner,
if ci_envs.VLLM_CI_HEAD_DTYPE is not None:
if "hf_overrides" not in vllm_extra_kwargs:
vllm_extra_kwargs["hf_overrides"] = {}
vllm_extra_kwargs["hf_overrides"][
"head_dtype"] = ci_envs.VLLM_CI_HEAD_DTYPE
with vllm_runner(model_info.name,
runner="pooling",
max_model_len=None,
max_num_seqs=8,
enforce_eager=True,
**vllm_extra_kwargs) as vllm_model:
vllm_extra_kwargs["hf_overrides"]["head_dtype"] = ci_envs.VLLM_CI_HEAD_DTYPE
with vllm_runner(
model_info.name,
runner="pooling",
max_model_len=None,
max_num_seqs=8,
enforce_eager=True,
**vllm_extra_kwargs,
) as vllm_model:
model_config = vllm_model.llm.llm_engine.model_config
# Confirm whether vllm is using the correct architecture
if model_info.architecture:
assert (model_info.architecture in model_config.architectures)
assert model_info.architecture in model_config.architectures
# Score API is only enabled for num_labels == 1
assert model_config.hf_config.num_labels == 1
# Confirm whether vllm uses the correct default_pooling_type, which
# relates to whether chunked prefill and prefix caching are enabled
assert (model_config._model_info.default_pooling_type ==
model_info.default_pooling_type)
assert (
model_config._model_info.default_pooling_type
== model_info.default_pooling_type
)
vllm_main_score = run_mteb_rerank(vllm_mteb_encoder(vllm_model),
tasks=MTEB_RERANK_TASKS,
languages=MTEB_RERANK_LANGS)
vllm_main_score = run_mteb_rerank(
vllm_mteb_encoder(vllm_model),
tasks=MTEB_RERANK_TASKS,
languages=MTEB_RERANK_LANGS,
)
vllm_dtype = model_config.dtype
head_dtype = model_config.head_dtype
@@ -380,14 +380,14 @@ def mteb_test_rerank_models(hf_runner,
# SentenceTransformers mteb score to a constant
if model_info.mteb_score is None:
st_main_score, st_dtype = mteb_test_rerank_models_hf(
hf_runner, model_info.name, model_info.hf_dtype, hf_model_callback)
hf_runner, model_info.name, model_info.hf_dtype, hf_model_callback
)
else:
st_main_score = model_info.mteb_score
st_dtype = "Constant"
print("Model:", model_info.name)
print("VLLM:", f"dtype:{vllm_dtype}", f"head_dtype:{head_dtype}",
vllm_main_score)
print("VLLM:", f"dtype:{vllm_dtype}", f"head_dtype:{head_dtype}", vllm_main_score)
print("SentenceTransformers:", st_dtype, st_main_score)
print("Difference:", st_main_score - vllm_main_score)

View File

@@ -2,67 +2,76 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from tests.models.language.pooling.embed_utils import (
correctness_test_embed_models)
from tests.models.utils import (CLSPoolingEmbedModelInfo,
CLSPoolingRerankModelInfo, EmbedModelInfo,
LASTPoolingEmbedModelInfo, RerankModelInfo)
from tests.models.language.pooling.embed_utils import correctness_test_embed_models
from tests.models.utils import (
CLSPoolingEmbedModelInfo,
CLSPoolingRerankModelInfo,
EmbedModelInfo,
LASTPoolingEmbedModelInfo,
RerankModelInfo,
)
from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
MODELS = [
########## BertModel
CLSPoolingEmbedModelInfo("BAAI/bge-base-en",
architecture="BertModel",
mteb_score=0.779336792,
enable_test=True),
CLSPoolingEmbedModelInfo("BAAI/bge-base-zh",
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo("BAAI/bge-small-en",
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo("BAAI/bge-small-zh",
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo("BAAI/bge-large-en",
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo("BAAI/bge-large-zh",
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo("BAAI/bge-large-zh-noinstruct",
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo("BAAI/bge-base-en-v1.5",
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo("BAAI/bge-base-zh-v1.5",
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo("BAAI/bge-small-en-v1.5",
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo("BAAI/bge-small-zh-v1.5",
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo("BAAI/bge-large-en-v1.5",
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo("BAAI/bge-large-zh-v1.5",
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo(
"BAAI/bge-base-en",
architecture="BertModel",
mteb_score=0.779336792,
enable_test=True,
),
CLSPoolingEmbedModelInfo(
"BAAI/bge-base-zh", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"BAAI/bge-small-en", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"BAAI/bge-small-zh", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"BAAI/bge-large-en", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"BAAI/bge-large-zh", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"BAAI/bge-large-zh-noinstruct", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"BAAI/bge-base-en-v1.5", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"BAAI/bge-base-zh-v1.5", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"BAAI/bge-small-en-v1.5", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"BAAI/bge-small-zh-v1.5", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"BAAI/bge-large-en-v1.5", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"BAAI/bge-large-zh-v1.5", architecture="BertModel", enable_test=False
),
########## XLMRobertaModel
CLSPoolingEmbedModelInfo("BAAI/bge-m3",
architecture="XLMRobertaModel",
mteb_score=0.787343078,
enable_test=True),
CLSPoolingEmbedModelInfo(
"BAAI/bge-m3",
architecture="XLMRobertaModel",
mteb_score=0.787343078,
enable_test=True,
),
########## Qwen2Model
LASTPoolingEmbedModelInfo("BAAI/bge-code-v1",
architecture="Qwen2Model",
mteb_score=0.75724465,
dtype="float32",
enable_test=True),
LASTPoolingEmbedModelInfo(
"BAAI/bge-code-v1",
architecture="Qwen2Model",
mteb_score=0.75724465,
dtype="float32",
enable_test=True,
),
]
RERANK_MODELS = [
@@ -71,33 +80,35 @@ RERANK_MODELS = [
"BAAI/bge-reranker-base",
architecture="XLMRobertaForSequenceClassification",
mteb_score=0.32398,
enable_test=True),
enable_test=True,
),
CLSPoolingRerankModelInfo(
"BAAI/bge-reranker-large",
architecture="XLMRobertaForSequenceClassification",
enable_test=False),
enable_test=False,
),
CLSPoolingRerankModelInfo(
"BAAI/bge-reranker-v2-m3",
architecture="XLMRobertaForSequenceClassification",
enable_test=False)
enable_test=False,
),
]
@pytest.mark.parametrize("model_info", MODELS)
def test_embed_models_mteb(hf_runner, vllm_runner,
model_info: EmbedModelInfo) -> None:
def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo) -> None:
mteb_test_embed_models(hf_runner, vllm_runner, model_info)
@pytest.mark.parametrize("model_info", MODELS)
def test_embed_models_correctness(hf_runner, vllm_runner,
model_info: EmbedModelInfo,
example_prompts) -> None:
correctness_test_embed_models(hf_runner, vllm_runner, model_info,
example_prompts)
def test_embed_models_correctness(
hf_runner, vllm_runner, model_info: EmbedModelInfo, example_prompts
) -> None:
correctness_test_embed_models(hf_runner, vllm_runner, model_info, example_prompts)
@pytest.mark.parametrize("model_info", RERANK_MODELS)
def test_rerank_models_mteb(hf_runner, vllm_runner,
model_info: RerankModelInfo) -> None:
def test_rerank_models_mteb(
hf_runner, vllm_runner, model_info: RerankModelInfo
) -> None:
mteb_test_rerank_models(hf_runner, vllm_runner, model_info)

View File

@@ -8,53 +8,50 @@ import torch
from tests.conftest import HfRunner
from tests.models.language.pooling_mteb_test.mteb_utils import (
VllmMtebEncoder, mteb_test_rerank_models)
VllmMtebEncoder,
mteb_test_rerank_models,
)
from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo
RERANK_MODELS = [
LASTPoolingRerankModelInfo("BAAI/bge-reranker-v2-gemma",
architecture="GemmaForSequenceClassification",
mteb_score=0.33757,
hf_overrides={
"architectures":
["GemmaForSequenceClassification"],
"classifier_from_token": ["Yes"],
"method":
"no_post_processing",
}),
LASTPoolingRerankModelInfo(
"BAAI/bge-reranker-v2-gemma",
architecture="GemmaForSequenceClassification",
mteb_score=0.33757,
hf_overrides={
"architectures": ["GemmaForSequenceClassification"],
"classifier_from_token": ["Yes"],
"method": "no_post_processing",
},
),
]
PROMPT = "Given a query A and a passage B, determine whether the passage contains an answer to the query by providing a prediction of either 'Yes' or 'No'." # noqa: E501
class GemmaRerankerHfRunner(HfRunner):
def __init__(self,
model_name: str,
dtype: str = "auto",
*args: Any,
**kwargs: Any) -> None:
def __init__(
self, model_name: str, dtype: str = "auto", *args: Any, **kwargs: Any
) -> None:
from transformers import AutoModelForCausalLM, AutoTokenizer
super().__init__(model_name, dtype, auto_cls=AutoModelForCausalLM)
self.tokenizer = AutoTokenizer.from_pretrained(model_name,
padding_side='left')
self.tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
self.yes_loc = self.tokenizer.convert_tokens_to_ids("Yes")
@torch.no_grad()
def predict(self, prompts: list[list[str]], *args,
**kwargs) -> torch.Tensor:
def predict(self, prompts: list[list[str]], *args, **kwargs) -> torch.Tensor:
def get_inputs(pairs, tokenizer, prompt=None):
if prompt is None:
prompt = PROMPT
sep = "\n"
prompt_inputs = tokenizer(prompt,
return_tensors=None,
add_special_tokens=False)["input_ids"]
sep_inputs = tokenizer(sep,
return_tensors=None,
add_special_tokens=False)["input_ids"]
prompt_inputs = tokenizer(
prompt, return_tensors=None, add_special_tokens=False
)["input_ids"]
sep_inputs = tokenizer(sep, return_tensors=None, add_special_tokens=False)[
"input_ids"
]
inputs = []
for query, passage in pairs:
query_inputs = tokenizer(
@@ -78,8 +75,7 @@ class GemmaRerankerHfRunner(HfRunner):
return_token_type_ids=False,
add_special_tokens=False,
)
item["input_ids"] = item[
"input_ids"] + sep_inputs + prompt_inputs
item["input_ids"] = item["input_ids"] + sep_inputs + prompt_inputs
item["attention_mask"] = [1] * len(item["input_ids"])
inputs.append(item)
return tokenizer.pad(
@@ -95,14 +91,19 @@ class GemmaRerankerHfRunner(HfRunner):
inputs = inputs.to(self.model.device)
_n_tokens = inputs["input_ids"].shape[1]
logits = self.model(**inputs, return_dict=True).logits
_scores = (logits[:, -1,
self.yes_loc].view(-1, ).float().sigmoid())
_scores = (
logits[:, -1, self.yes_loc]
.view(
-1,
)
.float()
.sigmoid()
)
scores.append(_scores[0].item())
return torch.Tensor(scores)
class GemmaMtebEncoder(VllmMtebEncoder):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.query_template = "A: {query}\n"
@@ -110,12 +111,10 @@ class GemmaMtebEncoder(VllmMtebEncoder):
def predict(
self,
sentences: list[tuple[str, str,
Optional[str]]], # query, corpus, prompt
sentences: list[tuple[str, str, Optional[str]]], # query, corpus, prompt
*args,
**kwargs,
) -> np.ndarray:
_sentences = []
for query, corpus, prompt in sentences:
query = self.query_template.format(query=query)
@@ -127,8 +126,9 @@ class GemmaMtebEncoder(VllmMtebEncoder):
@pytest.mark.parametrize("model_info", RERANK_MODELS)
def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None:
mteb_test_rerank_models(GemmaRerankerHfRunner,
vllm_runner,
model_info,
vllm_mteb_encoder=GemmaMtebEncoder)
mteb_test_rerank_models(
GemmaRerankerHfRunner,
vllm_runner,
model_info,
vllm_mteb_encoder=GemmaMtebEncoder,
)

View File

@@ -2,22 +2,30 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from tests.models.utils import (CLSPoolingRerankModelInfo,
LASTPoolingRerankModelInfo, RerankModelInfo)
from tests.models.utils import (
CLSPoolingRerankModelInfo,
LASTPoolingRerankModelInfo,
RerankModelInfo,
)
from .mteb_utils import mteb_test_rerank_models
RERANK_MODELS = [
CLSPoolingRerankModelInfo("cross-encoder/ms-marco-TinyBERT-L-2-v2",
mteb_score=0.32898,
architecture="BertForSequenceClassification"),
LASTPoolingRerankModelInfo("tomaarsen/Qwen3-Reranker-0.6B-seq-cls",
mteb_score=0.25736,
architecture="Qwen3ForSequenceClassification")
CLSPoolingRerankModelInfo(
"cross-encoder/ms-marco-TinyBERT-L-2-v2",
mteb_score=0.32898,
architecture="BertForSequenceClassification",
),
LASTPoolingRerankModelInfo(
"tomaarsen/Qwen3-Reranker-0.6B-seq-cls",
mteb_score=0.25736,
architecture="Qwen3ForSequenceClassification",
),
]
@pytest.mark.parametrize("model_info", RERANK_MODELS)
def test_rerank_models_mteb(hf_runner, vllm_runner,
model_info: RerankModelInfo) -> None:
def test_rerank_models_mteb(
hf_runner, vllm_runner, model_info: RerankModelInfo
) -> None:
mteb_test_rerank_models(hf_runner, vllm_runner, model_info)

View File

@@ -3,74 +3,93 @@
import pytest
from tests.models.language.pooling.embed_utils import (
correctness_test_embed_models)
from tests.models.utils import (CLSPoolingEmbedModelInfo,
CLSPoolingRerankModelInfo, EmbedModelInfo,
LASTPoolingEmbedModelInfo, RerankModelInfo)
from tests.models.language.pooling.embed_utils import correctness_test_embed_models
from tests.models.utils import (
CLSPoolingEmbedModelInfo,
CLSPoolingRerankModelInfo,
EmbedModelInfo,
LASTPoolingEmbedModelInfo,
RerankModelInfo,
)
from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
MODELS = [
########## BertModel
CLSPoolingEmbedModelInfo("thenlper/gte-large",
mteb_score=0.76807651,
architecture="BertModel",
enable_test=True),
CLSPoolingEmbedModelInfo("thenlper/gte-base",
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo("thenlper/gte-small",
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo("thenlper/gte-large-zh",
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo("thenlper/gte-base-zh",
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo("thenlper/gte-small-zh",
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo(
"thenlper/gte-large",
mteb_score=0.76807651,
architecture="BertModel",
enable_test=True,
),
CLSPoolingEmbedModelInfo(
"thenlper/gte-base", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"thenlper/gte-small", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"thenlper/gte-large-zh", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"thenlper/gte-base-zh", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"thenlper/gte-small-zh", architecture="BertModel", enable_test=False
),
########### NewModel
# These three architectures are almost the same, but not exactly the same.
# For example,
# - whether to use token_type_embeddings
# - whether to use context expansion
# So only test one (the most widely used) model
CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-multilingual-base",
architecture="GteNewModel",
mteb_score=0.775074696,
hf_overrides={"architectures": ["GteNewModel"]},
enable_test=True),
CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-base-en-v1.5",
architecture="GteNewModel",
hf_overrides={"architectures": ["GteNewModel"]},
enable_test=False),
CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-large-en-v1.5",
architecture="GteNewModel",
hf_overrides={"architectures": ["GteNewModel"]},
enable_test=False),
CLSPoolingEmbedModelInfo(
"Alibaba-NLP/gte-multilingual-base",
architecture="GteNewModel",
mteb_score=0.775074696,
hf_overrides={"architectures": ["GteNewModel"]},
enable_test=True,
),
CLSPoolingEmbedModelInfo(
"Alibaba-NLP/gte-base-en-v1.5",
architecture="GteNewModel",
hf_overrides={"architectures": ["GteNewModel"]},
enable_test=False,
),
CLSPoolingEmbedModelInfo(
"Alibaba-NLP/gte-large-en-v1.5",
architecture="GteNewModel",
hf_overrides={"architectures": ["GteNewModel"]},
enable_test=False,
),
########### Qwen2ForCausalLM
LASTPoolingEmbedModelInfo("Alibaba-NLP/gte-Qwen2-1.5B-instruct",
mteb_score=0.758473459018872,
architecture="Qwen2ForCausalLM",
enable_test=True),
LASTPoolingEmbedModelInfo(
"Alibaba-NLP/gte-Qwen2-1.5B-instruct",
mteb_score=0.758473459018872,
architecture="Qwen2ForCausalLM",
enable_test=True,
),
########## ModernBertModel
CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-modernbert-base",
mteb_score=0.748193353,
architecture="ModernBertModel",
enable_test=True),
CLSPoolingEmbedModelInfo(
"Alibaba-NLP/gte-modernbert-base",
mteb_score=0.748193353,
architecture="ModernBertModel",
enable_test=True,
),
########## Qwen3ForCausalLM
LASTPoolingEmbedModelInfo("Qwen/Qwen3-Embedding-0.6B",
mteb_score=0.771163695,
architecture="Qwen3ForCausalLM",
dtype="float32",
enable_test=True),
LASTPoolingEmbedModelInfo("Qwen/Qwen3-Embedding-4B",
architecture="Qwen3ForCausalLM",
dtype="float32",
enable_test=False),
LASTPoolingEmbedModelInfo(
"Qwen/Qwen3-Embedding-0.6B",
mteb_score=0.771163695,
architecture="Qwen3ForCausalLM",
dtype="float32",
enable_test=True,
),
LASTPoolingEmbedModelInfo(
"Qwen/Qwen3-Embedding-4B",
architecture="Qwen3ForCausalLM",
dtype="float32",
enable_test=False,
),
]
RERANK_MODELS = [
@@ -79,31 +98,32 @@ RERANK_MODELS = [
"Alibaba-NLP/gte-reranker-modernbert-base",
mteb_score=0.33386,
architecture="ModernBertForSequenceClassification",
enable_test=True),
enable_test=True,
),
CLSPoolingRerankModelInfo(
"Alibaba-NLP/gte-multilingual-reranker-base",
mteb_score=0.33062,
architecture="GteNewForSequenceClassification",
hf_overrides={"architectures": ["GteNewForSequenceClassification"]},
enable_test=True),
enable_test=True,
),
]
@pytest.mark.parametrize("model_info", MODELS)
def test_embed_models_mteb(hf_runner, vllm_runner,
model_info: EmbedModelInfo) -> None:
def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo) -> None:
mteb_test_embed_models(hf_runner, vllm_runner, model_info)
@pytest.mark.parametrize("model_info", MODELS)
def test_embed_models_correctness(hf_runner, vllm_runner,
model_info: EmbedModelInfo,
example_prompts) -> None:
correctness_test_embed_models(hf_runner, vllm_runner, model_info,
example_prompts)
def test_embed_models_correctness(
hf_runner, vllm_runner, model_info: EmbedModelInfo, example_prompts
) -> None:
correctness_test_embed_models(hf_runner, vllm_runner, model_info, example_prompts)
@pytest.mark.parametrize("model_info", RERANK_MODELS)
def test_rerank_models_mteb(hf_runner, vllm_runner,
model_info: RerankModelInfo) -> None:
def test_rerank_models_mteb(
hf_runner, vllm_runner, model_info: RerankModelInfo
) -> None:
mteb_test_rerank_models(hf_runner, vllm_runner, model_info)

View File

@@ -2,50 +2,55 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from tests.models.language.pooling.embed_utils import (
correctness_test_embed_models)
from tests.models.language.pooling.embed_utils import correctness_test_embed_models
from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo
from .mteb_utils import mteb_test_embed_models
MODELS = [
########## BertModel
CLSPoolingEmbedModelInfo("intfloat/e5-small",
architecture="BertModel",
mteb_score=0.742285423,
enable_test=True),
CLSPoolingEmbedModelInfo("intfloat/e5-base",
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo("intfloat/e5-large",
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo("intfloat/multilingual-e5-small",
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo(
"intfloat/e5-small",
architecture="BertModel",
mteb_score=0.742285423,
enable_test=True,
),
CLSPoolingEmbedModelInfo(
"intfloat/e5-base", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"intfloat/e5-large", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"intfloat/multilingual-e5-small", architecture="BertModel", enable_test=False
),
########## XLMRobertaModel
CLSPoolingEmbedModelInfo("intfloat/multilingual-e5-base",
architecture="XLMRobertaModel",
mteb_score=0.779325955,
enable_test=True),
CLSPoolingEmbedModelInfo("intfloat/multilingual-e5-large",
architecture="XLMRobertaModel",
enable_test=False),
CLSPoolingEmbedModelInfo("intfloat/multilingual-e5-large-instruct",
architecture="XLMRobertaModel",
enable_test=False),
CLSPoolingEmbedModelInfo(
"intfloat/multilingual-e5-base",
architecture="XLMRobertaModel",
mteb_score=0.779325955,
enable_test=True,
),
CLSPoolingEmbedModelInfo(
"intfloat/multilingual-e5-large",
architecture="XLMRobertaModel",
enable_test=False,
),
CLSPoolingEmbedModelInfo(
"intfloat/multilingual-e5-large-instruct",
architecture="XLMRobertaModel",
enable_test=False,
),
]
@pytest.mark.parametrize("model_info", MODELS)
def test_embed_models_mteb(hf_runner, vllm_runner,
model_info: EmbedModelInfo) -> None:
def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo) -> None:
mteb_test_embed_models(hf_runner, vllm_runner, model_info)
@pytest.mark.parametrize("model_info", MODELS)
def test_embed_models_correctness(hf_runner, vllm_runner,
model_info: EmbedModelInfo,
example_prompts) -> None:
correctness_test_embed_models(hf_runner, vllm_runner, model_info,
example_prompts)
def test_embed_models_correctness(
hf_runner, vllm_runner, model_info: EmbedModelInfo, example_prompts
) -> None:
correctness_test_embed_models(hf_runner, vllm_runner, model_info, example_prompts)

View File

@@ -5,60 +5,68 @@ from functools import partial
import pytest
from tests.models.language.pooling.embed_utils import (
check_embeddings_close, correctness_test_embed_models, matryoshka_fy)
from tests.models.utils import (CLSPoolingEmbedModelInfo,
CLSPoolingRerankModelInfo, EmbedModelInfo,
RerankModelInfo)
check_embeddings_close,
correctness_test_embed_models,
matryoshka_fy,
)
from tests.models.utils import (
CLSPoolingEmbedModelInfo,
CLSPoolingRerankModelInfo,
EmbedModelInfo,
RerankModelInfo,
)
from vllm import PoolingParams
from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
EMBEDDING_MODELS = [
CLSPoolingEmbedModelInfo("jinaai/jina-embeddings-v3",
mteb_score=0.824413164,
architecture="XLMRobertaModel",
is_matryoshka=True)
CLSPoolingEmbedModelInfo(
"jinaai/jina-embeddings-v3",
mteb_score=0.824413164,
architecture="XLMRobertaModel",
is_matryoshka=True,
)
]
RERANK_MODELS = [
CLSPoolingRerankModelInfo(
"jinaai/jina-reranker-v2-base-multilingual",
mteb_score=0.33643,
architecture="XLMRobertaForSequenceClassification")
architecture="XLMRobertaForSequenceClassification",
)
]
@pytest.mark.parametrize("model_info", EMBEDDING_MODELS)
def test_embed_models_mteb(hf_runner, vllm_runner,
model_info: EmbedModelInfo) -> None:
def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo) -> None:
def hf_model_callback(model):
model.encode = partial(model.encode, task="text-matching")
mteb_test_embed_models(hf_runner,
vllm_runner,
model_info,
hf_model_callback=hf_model_callback)
mteb_test_embed_models(
hf_runner, vllm_runner, model_info, hf_model_callback=hf_model_callback
)
@pytest.mark.parametrize("model_info", EMBEDDING_MODELS)
def test_embed_models_correctness(hf_runner, vllm_runner,
model_info: EmbedModelInfo,
example_prompts) -> None:
def test_embed_models_correctness(
hf_runner, vllm_runner, model_info: EmbedModelInfo, example_prompts
) -> None:
def hf_model_callback(model):
model.encode = partial(model.encode, task="text-matching")
correctness_test_embed_models(hf_runner,
vllm_runner,
model_info,
example_prompts,
hf_model_callback=hf_model_callback)
correctness_test_embed_models(
hf_runner,
vllm_runner,
model_info,
example_prompts,
hf_model_callback=hf_model_callback,
)
@pytest.mark.parametrize("model_info", RERANK_MODELS)
def test_rerank_models_mteb(hf_runner, vllm_runner,
model_info: RerankModelInfo) -> None:
def test_rerank_models_mteb(
hf_runner, vllm_runner, model_info: RerankModelInfo
) -> None:
mteb_test_rerank_models(hf_runner, vllm_runner, model_info)
@@ -81,32 +89,32 @@ def test_matryoshka(
example_prompts = [str(s).strip() for s in example_prompts]
with hf_runner(
model_info.name,
dtype=dtype,
is_sentence_transformer=True,
model_info.name,
dtype=dtype,
is_sentence_transformer=True,
) as hf_model:
hf_outputs = hf_model.encode(example_prompts, task="text-matching")
hf_outputs = matryoshka_fy(hf_outputs, dimensions)
with vllm_runner(model_info.name,
runner="pooling",
dtype=dtype,
max_model_len=None) as vllm_model:
with vllm_runner(
model_info.name, runner="pooling", dtype=dtype, max_model_len=None
) as vllm_model:
assert vllm_model.llm.llm_engine.model_config.is_matryoshka
matryoshka_dimensions = (
vllm_model.llm.llm_engine.model_config.matryoshka_dimensions)
vllm_model.llm.llm_engine.model_config.matryoshka_dimensions
)
assert matryoshka_dimensions is not None
if dimensions not in matryoshka_dimensions:
with pytest.raises(ValueError):
vllm_model.embed(
example_prompts,
pooling_params=PoolingParams(dimensions=dimensions))
example_prompts, pooling_params=PoolingParams(dimensions=dimensions)
)
else:
vllm_outputs = vllm_model.embed(
example_prompts,
pooling_params=PoolingParams(dimensions=dimensions))
example_prompts, pooling_params=PoolingParams(dimensions=dimensions)
)
check_embeddings_close(
embeddings_0_lst=hf_outputs,

View File

@@ -17,46 +17,45 @@ mxbai_rerank_hf_overrides = {
}
RERANK_MODELS = [
LASTPoolingRerankModelInfo("mixedbread-ai/mxbai-rerank-base-v2",
architecture="Qwen2ForSequenceClassification",
hf_overrides=mxbai_rerank_hf_overrides,
mteb_score=0.273,
enable_test=True),
LASTPoolingRerankModelInfo("mixedbread-ai/mxbai-rerank-large-v2",
architecture="Qwen2ForSequenceClassification",
hf_overrides=mxbai_rerank_hf_overrides,
enable_test=False)
LASTPoolingRerankModelInfo(
"mixedbread-ai/mxbai-rerank-base-v2",
architecture="Qwen2ForSequenceClassification",
hf_overrides=mxbai_rerank_hf_overrides,
mteb_score=0.273,
enable_test=True,
),
LASTPoolingRerankModelInfo(
"mixedbread-ai/mxbai-rerank-large-v2",
architecture="Qwen2ForSequenceClassification",
hf_overrides=mxbai_rerank_hf_overrides,
enable_test=False,
),
]
class MxbaiRerankerHfRunner(HfRunner):
def __init__(self,
model_name: str,
dtype: str = "auto",
*args: Any,
**kwargs: Any) -> None:
def __init__(
self, model_name: str, dtype: str = "auto", *args: Any, **kwargs: Any
) -> None:
from transformers import AutoModelForCausalLM, AutoTokenizer
super().__init__(model_name, dtype, auto_cls=AutoModelForCausalLM)
self.tokenizer = AutoTokenizer.from_pretrained(model_name,
padding_side='left')
self.tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
self.yes_loc = self.tokenizer.convert_tokens_to_ids("1")
self.no_loc = self.tokenizer.convert_tokens_to_ids("0")
def predict(self, prompts: list[list[str]], *args,
**kwargs) -> torch.Tensor:
def predict(self, prompts: list[list[str]], *args, **kwargs) -> torch.Tensor:
def process_inputs(pairs):
inputs = self.tokenizer(pairs,
padding=False,
truncation='longest_first',
return_attention_mask=False)
for i, ele in enumerate(inputs['input_ids']):
inputs['input_ids'][i] = ele
inputs = self.tokenizer.pad(inputs,
padding=True,
return_tensors="pt")
inputs = self.tokenizer(
pairs,
padding=False,
truncation="longest_first",
return_attention_mask=False,
)
for i, ele in enumerate(inputs["input_ids"]):
inputs["input_ids"][i] = ele
inputs = self.tokenizer.pad(inputs, padding=True, return_tensors="pt")
for key in inputs:
inputs[key] = inputs[key].to(self.model.device)
return inputs

View File

@@ -3,39 +3,42 @@
import pytest
from tests.models.language.pooling.embed_utils import (
correctness_test_embed_models)
from tests.models.language.pooling.embed_utils import correctness_test_embed_models
from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo
from .mteb_utils import mteb_test_embed_models
MODELS = [
CLSPoolingEmbedModelInfo("nomic-ai/nomic-embed-text-v1",
architecture="NomicBertModel",
mteb_score=0.737568559,
enable_test=True),
CLSPoolingEmbedModelInfo("nomic-ai/nomic-embed-text-v1.5",
architecture="NomicBertModel",
enable_test=False),
CLSPoolingEmbedModelInfo("nomic-ai/CodeRankEmbed",
architecture="NomicBertModel",
enable_test=False),
CLSPoolingEmbedModelInfo("nomic-ai/nomic-embed-text-v2-moe",
architecture="NomicBertModel",
mteb_score=0.715488912,
enable_test=True)
CLSPoolingEmbedModelInfo(
"nomic-ai/nomic-embed-text-v1",
architecture="NomicBertModel",
mteb_score=0.737568559,
enable_test=True,
),
CLSPoolingEmbedModelInfo(
"nomic-ai/nomic-embed-text-v1.5",
architecture="NomicBertModel",
enable_test=False,
),
CLSPoolingEmbedModelInfo(
"nomic-ai/CodeRankEmbed", architecture="NomicBertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"nomic-ai/nomic-embed-text-v2-moe",
architecture="NomicBertModel",
mteb_score=0.715488912,
enable_test=True,
),
]
@pytest.mark.parametrize("model_info", MODELS)
def test_embed_models_mteb(hf_runner, vllm_runner,
model_info: EmbedModelInfo) -> None:
def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo) -> None:
mteb_test_embed_models(hf_runner, vllm_runner, model_info)
@pytest.mark.parametrize("model_info", MODELS)
def test_embed_models_correctness(hf_runner, vllm_runner,
model_info: EmbedModelInfo,
example_prompts) -> None:
correctness_test_embed_models(hf_runner, vllm_runner, model_info,
example_prompts)
def test_embed_models_correctness(
hf_runner, vllm_runner, model_info: EmbedModelInfo, example_prompts
) -> None:
correctness_test_embed_models(hf_runner, vllm_runner, model_info, example_prompts)

View File

@@ -18,46 +18,45 @@ qwen3_reranker_hf_overrides = {
}
RERANK_MODELS = [
LASTPoolingRerankModelInfo("Qwen/Qwen3-Reranker-0.6B",
architecture="Qwen3ForSequenceClassification",
mteb_score=0.25736,
hf_overrides=qwen3_reranker_hf_overrides,
enable_test=True),
LASTPoolingRerankModelInfo("Qwen/Qwen3-Reranker-4B",
architecture="Qwen3ForSequenceClassification",
hf_overrides=qwen3_reranker_hf_overrides,
enable_test=False)
LASTPoolingRerankModelInfo(
"Qwen/Qwen3-Reranker-0.6B",
architecture="Qwen3ForSequenceClassification",
mteb_score=0.25736,
hf_overrides=qwen3_reranker_hf_overrides,
enable_test=True,
),
LASTPoolingRerankModelInfo(
"Qwen/Qwen3-Reranker-4B",
architecture="Qwen3ForSequenceClassification",
hf_overrides=qwen3_reranker_hf_overrides,
enable_test=False,
),
]
class Qwen3RerankerHfRunner(HfRunner):
def __init__(self,
model_name: str,
dtype: str = "auto",
*args: Any,
**kwargs: Any) -> None:
def __init__(
self, model_name: str, dtype: str = "auto", *args: Any, **kwargs: Any
) -> None:
from transformers import AutoModelForCausalLM, AutoTokenizer
super().__init__(model_name, dtype, auto_cls=AutoModelForCausalLM)
self.tokenizer = AutoTokenizer.from_pretrained(model_name,
padding_side='left')
self.tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
self.token_false_id = self.tokenizer.convert_tokens_to_ids("no")
self.token_true_id = self.tokenizer.convert_tokens_to_ids("yes")
def predict(self, prompts: list[list[str]], *args,
**kwargs) -> torch.Tensor:
def predict(self, prompts: list[list[str]], *args, **kwargs) -> torch.Tensor:
def process_inputs(pairs):
inputs = self.tokenizer(pairs,
padding=False,
truncation='longest_first',
return_attention_mask=False)
for i, ele in enumerate(inputs['input_ids']):
inputs['input_ids'][i] = ele
inputs = self.tokenizer.pad(inputs,
padding=True,
return_tensors="pt")
inputs = self.tokenizer(
pairs,
padding=False,
truncation="longest_first",
return_attention_mask=False,
)
for i, ele in enumerate(inputs["input_ids"]):
inputs["input_ids"][i] = ele
inputs = self.tokenizer.pad(inputs, padding=True, return_tensors="pt")
for key in inputs:
inputs[key] = inputs[key].to(self.model.device)
return inputs
@@ -82,20 +81,18 @@ class Qwen3RerankerHfRunner(HfRunner):
@pytest.mark.parametrize("model_info", RERANK_MODELS)
def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None:
mteb_test_rerank_models(Qwen3RerankerHfRunner, vllm_runner, model_info)
@pytest.mark.parametrize("model_info", RERANK_MODELS)
@multi_gpu_test(num_gpus=2)
def test_rerank_models_mteb_tp(vllm_runner,
model_info: RerankModelInfo) -> None:
def test_rerank_models_mteb_tp(vllm_runner, model_info: RerankModelInfo) -> None:
assert model_info.architecture == "Qwen3ForSequenceClassification"
vllm_extra_kwargs: dict[str, Any] = {
"tensor_parallel_size": 2,
}
mteb_test_rerank_models(Qwen3RerankerHfRunner, vllm_runner, model_info,
vllm_extra_kwargs)
mteb_test_rerank_models(
Qwen3RerankerHfRunner, vllm_runner, model_info, vllm_extra_kwargs
)

View File

@@ -3,62 +3,75 @@
import pytest
from tests.models.language.pooling.embed_utils import (
correctness_test_embed_models)
from tests.models.language.pooling.embed_utils import correctness_test_embed_models
from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo
from .mteb_utils import mteb_test_embed_models
MODELS = [
CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-xs",
is_matryoshka=False,
architecture="BertModel",
mteb_score=0.714927797,
enable_test=True),
CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-s",
is_matryoshka=False,
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-m",
is_matryoshka=False,
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-m-long",
is_matryoshka=False,
architecture="NomicBertModel",
mteb_score=0.681146831,
enable_test=True),
CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-l",
is_matryoshka=False,
architecture="BertModel",
enable_test=False),
CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-m-v1.5",
is_matryoshka=True,
architecture="BertModel",
mteb_score=0.649088363,
enable_test=True),
CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-l-v2.0",
is_matryoshka=True,
architecture="XLMRobertaModel",
mteb_score=0.712258299,
enable_test=True),
CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-m-v2.0",
is_matryoshka=True,
architecture="GteModel",
mteb_score=0.706622444,
enable_test=True),
CLSPoolingEmbedModelInfo(
"Snowflake/snowflake-arctic-embed-xs",
is_matryoshka=False,
architecture="BertModel",
mteb_score=0.714927797,
enable_test=True,
),
CLSPoolingEmbedModelInfo(
"Snowflake/snowflake-arctic-embed-s",
is_matryoshka=False,
architecture="BertModel",
enable_test=False,
),
CLSPoolingEmbedModelInfo(
"Snowflake/snowflake-arctic-embed-m",
is_matryoshka=False,
architecture="BertModel",
enable_test=False,
),
CLSPoolingEmbedModelInfo(
"Snowflake/snowflake-arctic-embed-m-long",
is_matryoshka=False,
architecture="NomicBertModel",
mteb_score=0.681146831,
enable_test=True,
),
CLSPoolingEmbedModelInfo(
"Snowflake/snowflake-arctic-embed-l",
is_matryoshka=False,
architecture="BertModel",
enable_test=False,
),
CLSPoolingEmbedModelInfo(
"Snowflake/snowflake-arctic-embed-m-v1.5",
is_matryoshka=True,
architecture="BertModel",
mteb_score=0.649088363,
enable_test=True,
),
CLSPoolingEmbedModelInfo(
"Snowflake/snowflake-arctic-embed-l-v2.0",
is_matryoshka=True,
architecture="XLMRobertaModel",
mteb_score=0.712258299,
enable_test=True,
),
CLSPoolingEmbedModelInfo(
"Snowflake/snowflake-arctic-embed-m-v2.0",
is_matryoshka=True,
architecture="GteModel",
mteb_score=0.706622444,
enable_test=True,
),
]
@pytest.mark.parametrize("model_info", MODELS)
def test_embed_models_mteb(hf_runner, vllm_runner,
model_info: EmbedModelInfo) -> None:
def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo) -> None:
mteb_test_embed_models(hf_runner, vllm_runner, model_info)
@pytest.mark.parametrize("model_info", MODELS)
def test_embed_models_correctness(hf_runner, vllm_runner,
model_info: EmbedModelInfo,
example_prompts) -> None:
correctness_test_embed_models(hf_runner, vllm_runner, model_info,
example_prompts)
def test_embed_models_correctness(
hf_runner, vllm_runner, model_info: EmbedModelInfo, example_prompts
) -> None:
correctness_test_embed_models(hf_runner, vllm_runner, model_info, example_prompts)

View File

@@ -2,8 +2,11 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from tests.models.utils import (CLSPoolingEmbedModelInfo, EmbedModelInfo,
LASTPoolingEmbedModelInfo)
from tests.models.utils import (
CLSPoolingEmbedModelInfo,
EmbedModelInfo,
LASTPoolingEmbedModelInfo,
)
from .mteb_utils import mteb_test_embed_models
@@ -15,15 +18,15 @@ ST_PROJECTOR_MODELS = [
mteb_score=0.688611955,
enable_test=True,
),
LASTPoolingEmbedModelInfo("google/embeddinggemma-300m",
architecture="Gemma3TextModel",
mteb_score=0.7473819294684156,
enable_test=True)
LASTPoolingEmbedModelInfo(
"google/embeddinggemma-300m",
architecture="Gemma3TextModel",
mteb_score=0.7473819294684156,
enable_test=True,
),
]
@pytest.mark.parametrize("model_info", ST_PROJECTOR_MODELS)
def test_embed_models_mteb(hf_runner, vllm_runner,
model_info: EmbedModelInfo) -> None:
def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo) -> None:
mteb_test_embed_models(hf_runner, vllm_runner, model_info)