Convert formatting to use ruff instead of yapf + isort (#26247)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
@@ -51,8 +51,9 @@ AITER_MODEL_LIST = [
|
||||
pytest.param(
|
||||
"google/gemma-1.1-2b-it", # gemma
|
||||
marks=[
|
||||
pytest.mark.core_model, pytest.mark.cpu_model,
|
||||
pytest.mark.slow_test
|
||||
pytest.mark.core_model,
|
||||
pytest.mark.cpu_model,
|
||||
pytest.mark.slow_test,
|
||||
],
|
||||
),
|
||||
pytest.param(
|
||||
@@ -65,8 +66,7 @@ AITER_MODEL_LIST = [
|
||||
pytest.param(
|
||||
"openbmb/MiniCPM3-4B",
|
||||
# fused_moe not supported on CPU
|
||||
marks=[pytest.mark.core_model,
|
||||
large_gpu_mark(min_gb=32)],
|
||||
marks=[pytest.mark.core_model, large_gpu_mark(min_gb=32)],
|
||||
),
|
||||
pytest.param(
|
||||
"facebook/opt-125m", # opt
|
||||
@@ -82,8 +82,9 @@ AITER_MODEL_LIST = [
|
||||
pytest.param(
|
||||
"Qwen/Qwen2.5-0.5B-Instruct", # qwen2
|
||||
marks=[
|
||||
pytest.mark.core_model, pytest.mark.cpu_model,
|
||||
pytest.mark.slow_test
|
||||
pytest.mark.core_model,
|
||||
pytest.mark.cpu_model,
|
||||
pytest.mark.slow_test,
|
||||
],
|
||||
),
|
||||
pytest.param(
|
||||
@@ -100,16 +101,25 @@ AITER_MODEL_LIST = [
|
||||
marks=[pytest.mark.cpu_model],
|
||||
),
|
||||
pytest.param("swiss-ai/Apertus-8B-2509"), # apertus
|
||||
])
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("max_tokens", [32])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
@pytest.mark.parametrize(
|
||||
"use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False])
|
||||
"use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]
|
||||
)
|
||||
@pytest.mark.parametrize("use_prompt_embeds", [True, False])
|
||||
def test_models(hf_runner, vllm_runner, example_prompts, model: str,
|
||||
max_tokens: int, num_logprobs: int, use_rocm_aiter: bool,
|
||||
use_prompt_embeds: bool, monkeypatch) -> None:
|
||||
|
||||
def test_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
use_rocm_aiter: bool,
|
||||
use_prompt_embeds: bool,
|
||||
monkeypatch,
|
||||
) -> None:
|
||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
model_info.check_transformers_version(on_fail="skip")
|
||||
@@ -125,34 +135,37 @@ def test_models(hf_runner, vllm_runner, example_prompts, model: str,
|
||||
|
||||
with hf_runner(model) as hf_model:
|
||||
hf_outputs = hf_model.generate_greedy_logprobs_limit(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
prompt_embeds: Optional[list[torch.Tensor]] = ([] if use_prompt_embeds
|
||||
else None)
|
||||
prompt_embeds: Optional[list[torch.Tensor]] = [] if use_prompt_embeds else None
|
||||
|
||||
prompt_token_ids = []
|
||||
for prompt in example_prompts:
|
||||
token_ids = hf_model.tokenizer(prompt,
|
||||
return_tensors="pt").input_ids.to(
|
||||
hf_model.model.device)
|
||||
token_ids = hf_model.tokenizer(prompt, return_tensors="pt").input_ids.to(
|
||||
hf_model.model.device
|
||||
)
|
||||
prompt_token_ids.append(token_ids)
|
||||
if prompt_embeds is not None:
|
||||
prompt_embeds.append(hf_model.model.get_input_embeddings()(
|
||||
token_ids).squeeze(0))
|
||||
prompt_embeds.append(
|
||||
hf_model.model.get_input_embeddings()(token_ids).squeeze(0)
|
||||
)
|
||||
|
||||
with vllm_runner(
|
||||
model,
|
||||
tokenizer_name=model_info.tokenizer or model,
|
||||
tokenizer_mode=model_info.tokenizer_mode,
|
||||
trust_remote_code=model_info.trust_remote_code,
|
||||
max_num_seqs=2,
|
||||
enable_prompt_embeds=use_prompt_embeds,
|
||||
model,
|
||||
tokenizer_name=model_info.tokenizer or model,
|
||||
tokenizer_mode=model_info.tokenizer_mode,
|
||||
trust_remote_code=model_info.trust_remote_code,
|
||||
max_num_seqs=2,
|
||||
enable_prompt_embeds=use_prompt_embeds,
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
if prompt_embeds is not None:
|
||||
vllm_outputs_from_embeds = vllm_model.generate_greedy_logprobs(
|
||||
prompt_embeds, max_tokens, num_logprobs)
|
||||
prompt_embeds, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
|
||||
@@ -11,17 +11,17 @@ def test_dummy_loader(vllm_runner, monkeypatch, model: str) -> None:
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
|
||||
with vllm_runner(
|
||||
model,
|
||||
load_format="dummy",
|
||||
model,
|
||||
load_format="dummy",
|
||||
) as llm:
|
||||
if model == "google/gemma-3-4b-it":
|
||||
normalizers = llm.llm.collective_rpc(
|
||||
lambda self: self.model_runner.model.language_model.model.
|
||||
normalizer.cpu().item())
|
||||
lambda self: self.model_runner.model.language_model.model.normalizer.cpu().item()
|
||||
)
|
||||
config = llm.llm.llm_engine.model_config.hf_config.text_config
|
||||
else:
|
||||
normalizers = llm.llm.collective_rpc(
|
||||
lambda self: self.model_runner.model.model.normalizer.cpu(
|
||||
).item())
|
||||
lambda self: self.model_runner.model.model.normalizer.cpu().item()
|
||||
)
|
||||
config = llm.llm.llm_engine.model_config.hf_config
|
||||
assert np.allclose(normalizers, config.hidden_size**0.5, rtol=2e-3)
|
||||
|
||||
@@ -26,11 +26,13 @@ def test_models(
|
||||
) -> None:
|
||||
with hf_runner(model, dtype=dtype) as hf_model:
|
||||
hf_outputs = hf_model.generate_greedy_logprobs_limit(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
with vllm_runner(model, dtype=dtype) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_outputs,
|
||||
|
||||
@@ -24,7 +24,7 @@ SSM_MODELS = [
|
||||
"tiiuae/falcon-mamba-tiny-dev",
|
||||
# mamba2-codestral in transformers is broken pending:
|
||||
# https://github.com/huggingface/transformers/pull/40861
|
||||
#"yujiepan/mamba2-codestral-v0.1-tiny-random",
|
||||
# "yujiepan/mamba2-codestral-v0.1-tiny-random",
|
||||
]
|
||||
|
||||
HYBRID_MODELS = [
|
||||
@@ -65,7 +65,6 @@ def test_models(
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
|
||||
try:
|
||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
@@ -75,11 +74,13 @@ def test_models(
|
||||
|
||||
with hf_runner(model) as hf_model:
|
||||
hf_outputs = hf_model.generate_greedy_logprobs_limit(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
@@ -109,13 +110,14 @@ def test_batching(
|
||||
for_loop_outputs = []
|
||||
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
|
||||
for prompt in example_prompts:
|
||||
single_output, = vllm_model.generate_greedy_logprobs([prompt],
|
||||
max_tokens,
|
||||
num_logprobs)
|
||||
(single_output,) = vllm_model.generate_greedy_logprobs(
|
||||
[prompt], max_tokens, num_logprobs
|
||||
)
|
||||
for_loop_outputs.append(single_output)
|
||||
|
||||
batched_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=for_loop_outputs,
|
||||
@@ -134,8 +136,8 @@ def test_chunked_prefill_with_parallel_sampling(
|
||||
max_tokens: int,
|
||||
) -> None:
|
||||
"""
|
||||
Tests chunked prefill in conjunction with n > 1.
|
||||
|
||||
Tests chunked prefill in conjunction with n > 1.
|
||||
|
||||
In this case, prefill is populated with decoding tokens and
|
||||
we test that it doesn't fail.
|
||||
|
||||
@@ -143,16 +145,13 @@ def test_chunked_prefill_with_parallel_sampling(
|
||||
decoding steps inside a chunked prefill forward pass
|
||||
(where we have both prefill and decode together)
|
||||
"""
|
||||
sampling_params = SamplingParams(n=3,
|
||||
temperature=1,
|
||||
seed=0,
|
||||
max_tokens=max_tokens)
|
||||
sampling_params = SamplingParams(n=3, temperature=1, seed=0, max_tokens=max_tokens)
|
||||
with vllm_runner(
|
||||
model,
|
||||
enable_chunked_prefill=True,
|
||||
# forces prefill chunks with decoding
|
||||
max_num_batched_tokens=MAX_NUM_SEQS * 3,
|
||||
max_num_seqs=MAX_NUM_SEQS,
|
||||
model,
|
||||
enable_chunked_prefill=True,
|
||||
# forces prefill chunks with decoding
|
||||
max_num_batched_tokens=MAX_NUM_SEQS * 3,
|
||||
max_num_seqs=MAX_NUM_SEQS,
|
||||
) as vllm_model:
|
||||
vllm_model.generate(example_prompts, sampling_params)
|
||||
|
||||
@@ -170,10 +169,8 @@ def test_mamba_cache_cg_padding(
|
||||
batch size. If it's not, a torch RuntimeError will be raised because
|
||||
tensor dimensions aren't compatible.
|
||||
"""
|
||||
vllm_config = EngineArgs(model=model,
|
||||
trust_remote_code=True).create_engine_config()
|
||||
while len(example_prompts) == vllm_config.pad_for_cudagraph(
|
||||
len(example_prompts)):
|
||||
vllm_config = EngineArgs(model=model, trust_remote_code=True).create_engine_config()
|
||||
while len(example_prompts) == vllm_config.pad_for_cudagraph(len(example_prompts)):
|
||||
example_prompts.append(example_prompts[0])
|
||||
|
||||
try:
|
||||
@@ -183,7 +180,8 @@ def test_mamba_cache_cg_padding(
|
||||
pytest.fail(
|
||||
"Couldn't run batch size which is not equal to a Cuda Graph "
|
||||
"captured batch size. "
|
||||
"Could be related to mamba cache not padded correctly")
|
||||
"Could be related to mamba cache not padded correctly"
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
|
||||
@@ -205,8 +203,10 @@ def test_fail_upon_inc_requests_and_finished_requests_lt_available_blocks(
|
||||
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
|
||||
vllm_model.generate_greedy([example_prompts[0]] * 100, 10)
|
||||
except ValueError:
|
||||
pytest.fail("Hybrid inner state wasn't cleaned up properly between"
|
||||
"steps finished requests registered unnecessarily ")
|
||||
pytest.fail(
|
||||
"Hybrid inner state wasn't cleaned up properly between"
|
||||
"steps finished requests registered unnecessarily "
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
|
||||
@@ -215,10 +215,10 @@ def test_state_cleanup(
|
||||
example_prompts,
|
||||
model: str,
|
||||
) -> None:
|
||||
"""
|
||||
"""
|
||||
This test is for verifying that the Hybrid state is cleaned up between
|
||||
steps.
|
||||
|
||||
|
||||
If it's not cleaned, an error would be expected.
|
||||
"""
|
||||
try:
|
||||
@@ -226,8 +226,10 @@ def test_state_cleanup(
|
||||
for _ in range(10):
|
||||
vllm_model.generate_greedy([example_prompts[0]] * 100, 1)
|
||||
except ValueError:
|
||||
pytest.fail("Hybrid inner state wasn't cleaned up between states, "
|
||||
"could be related to finished_requests_ids")
|
||||
pytest.fail(
|
||||
"Hybrid inner state wasn't cleaned up between states, "
|
||||
"could be related to finished_requests_ids"
|
||||
)
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
@@ -241,15 +243,19 @@ def test_distributed_correctness(
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
with vllm_runner(model, tensor_parallel_size=1,
|
||||
max_num_seqs=MAX_NUM_SEQS) as vllm_model:
|
||||
with vllm_runner(
|
||||
model, tensor_parallel_size=1, max_num_seqs=MAX_NUM_SEQS
|
||||
) as vllm_model:
|
||||
vllm_outputs_tp_1 = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
with vllm_runner(model, tensor_parallel_size=2,
|
||||
max_num_seqs=MAX_NUM_SEQS) as vllm_model:
|
||||
with vllm_runner(
|
||||
model, tensor_parallel_size=2, max_num_seqs=MAX_NUM_SEQS
|
||||
) as vllm_model:
|
||||
vllm_outputs_tp_2 = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=vllm_outputs_tp_1,
|
||||
@@ -271,7 +277,6 @@ def test_full_cuda_graph(
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
|
||||
try:
|
||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
@@ -281,11 +286,13 @@ def test_full_cuda_graph(
|
||||
|
||||
with hf_runner(model) as hf_model:
|
||||
hf_outputs = hf_model.generate_greedy_logprobs_limit(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
@@ -298,8 +305,9 @@ def test_full_cuda_graph(
|
||||
@pytest.mark.parametrize("model", FP32_STATE_MODELS)
|
||||
@pytest.mark.parametrize("max_tokens", [64])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
@pytest.mark.parametrize("cache_dtype_param",
|
||||
["mamba_ssm_cache_dtype", "mamba_cache_dtype"])
|
||||
@pytest.mark.parametrize(
|
||||
"cache_dtype_param", ["mamba_ssm_cache_dtype", "mamba_cache_dtype"]
|
||||
)
|
||||
def test_fp32_cache_state(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
@@ -310,7 +318,6 @@ def test_fp32_cache_state(
|
||||
num_logprobs: int,
|
||||
cache_dtype_param: str,
|
||||
) -> None:
|
||||
|
||||
try:
|
||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
@@ -320,13 +327,15 @@ def test_fp32_cache_state(
|
||||
|
||||
with hf_runner(model) as hf_model:
|
||||
hf_outputs = hf_model.generate_greedy_logprobs_limit(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
with vllm_runner(model,
|
||||
max_num_seqs=MAX_NUM_SEQS,
|
||||
**{cache_dtype_param: "float32"}) as vllm_model:
|
||||
with vllm_runner(
|
||||
model, max_num_seqs=MAX_NUM_SEQS, **{cache_dtype_param: "float32"}
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
@@ -339,21 +348,23 @@ def test_fp32_cache_state(
|
||||
# Helper functions for the APC tests
|
||||
def _get_vllm_runner_params(model, max_model_len, tensor_parallel_size=1):
|
||||
return {
|
||||
'model_name': model,
|
||||
'enable_prefix_caching': False,
|
||||
'max_model_len': max_model_len,
|
||||
'tensor_parallel_size': tensor_parallel_size,
|
||||
'gpu_memory_utilization': 0.4
|
||||
"model_name": model,
|
||||
"enable_prefix_caching": False,
|
||||
"max_model_len": max_model_len,
|
||||
"tensor_parallel_size": tensor_parallel_size,
|
||||
"gpu_memory_utilization": 0.4,
|
||||
}
|
||||
|
||||
|
||||
def _get_vLLM_output(vllm_runner,
|
||||
kwargs,
|
||||
prompts,
|
||||
max_tokens,
|
||||
num_logprobs,
|
||||
num_repetitions=1,
|
||||
vllm_model=None):
|
||||
def _get_vLLM_output(
|
||||
vllm_runner,
|
||||
kwargs,
|
||||
prompts,
|
||||
max_tokens,
|
||||
num_logprobs,
|
||||
num_repetitions=1,
|
||||
vllm_model=None,
|
||||
):
|
||||
outs = []
|
||||
if vllm_model is None:
|
||||
vllm_model = vllm_runner(**kwargs)
|
||||
@@ -362,7 +373,8 @@ def _get_vLLM_output(vllm_runner,
|
||||
vllm_output = vllm_model.generate_greedy(prompts, max_tokens)
|
||||
else:
|
||||
vllm_output = vllm_model.generate_greedy_logprobs(
|
||||
prompts, max_tokens, num_logprobs)
|
||||
prompts, max_tokens, num_logprobs
|
||||
)
|
||||
outs.append(vllm_output)
|
||||
|
||||
return outs, vllm_model
|
||||
@@ -387,7 +399,6 @@ def test_apc_single_prompt(
|
||||
num_logprobs: int,
|
||||
tensor_parallel_size: int,
|
||||
) -> None:
|
||||
|
||||
try:
|
||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
@@ -395,29 +406,33 @@ def test_apc_single_prompt(
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
compare_operator: Callable = check_logprobs_close \
|
||||
if num_logprobs > 0 else check_outputs_equal # type: ignore
|
||||
compare_operator: Callable = (
|
||||
check_logprobs_close if num_logprobs > 0 else check_outputs_equal # type: ignore
|
||||
)
|
||||
|
||||
MULTIPLE = 300
|
||||
|
||||
# Sample prompts.
|
||||
generated_prompts = [MULTIPLE * example_prompts[0]]
|
||||
|
||||
max_model_len = max(
|
||||
len(prompt) + max_tokens for prompt in generated_prompts)
|
||||
max_model_len = max(len(prompt) + max_tokens for prompt in generated_prompts)
|
||||
vllm_runner_kwargs = _get_vllm_runner_params(
|
||||
model, max_model_len, tensor_parallel_size=tensor_parallel_size)
|
||||
vllm_runner_kwargs['mamba_ssm_cache_dtype'] = "float32"
|
||||
vllm_outputs_no_cache, _ = _get_vLLM_output(vllm_runner,
|
||||
vllm_runner_kwargs,
|
||||
generated_prompts, max_tokens,
|
||||
num_logprobs)
|
||||
model, max_model_len, tensor_parallel_size=tensor_parallel_size
|
||||
)
|
||||
vllm_runner_kwargs["mamba_ssm_cache_dtype"] = "float32"
|
||||
vllm_outputs_no_cache, _ = _get_vLLM_output(
|
||||
vllm_runner, vllm_runner_kwargs, generated_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
vllm_runner_kwargs['enable_prefix_caching'] = True
|
||||
vllm_outputs_cache_rep, _ = _get_vLLM_output(vllm_runner,
|
||||
vllm_runner_kwargs,
|
||||
generated_prompts, max_tokens,
|
||||
num_logprobs, n_repetitions)
|
||||
vllm_runner_kwargs["enable_prefix_caching"] = True
|
||||
vllm_outputs_cache_rep, _ = _get_vLLM_output(
|
||||
vllm_runner,
|
||||
vllm_runner_kwargs,
|
||||
generated_prompts,
|
||||
max_tokens,
|
||||
num_logprobs,
|
||||
n_repetitions,
|
||||
)
|
||||
|
||||
for r_idx, vllm_outputs_cache_itn in enumerate(vllm_outputs_cache_rep):
|
||||
# In the first repetition, the caches are filled
|
||||
@@ -450,7 +465,6 @@ def test_apc_single_prompt_block_align_alignment(
|
||||
num_logprobs: int,
|
||||
tensor_parallel_size: int,
|
||||
) -> None:
|
||||
|
||||
try:
|
||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
@@ -458,30 +472,29 @@ def test_apc_single_prompt_block_align_alignment(
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
compare_operator: Callable = check_logprobs_close \
|
||||
if num_logprobs > 0 else check_outputs_equal # type: ignore
|
||||
compare_operator: Callable = (
|
||||
check_logprobs_close if num_logprobs > 0 else check_outputs_equal # type: ignore
|
||||
)
|
||||
|
||||
MULTIPLE = 300
|
||||
|
||||
# Sample prompts. This custom prompt is used, as it causes the most issues
|
||||
generated_prompts = ["The president of the United States is " * MULTIPLE]
|
||||
|
||||
max_model_len = max(
|
||||
len(prompt) + max_tokens for prompt in generated_prompts)
|
||||
max_model_len = max(len(prompt) + max_tokens for prompt in generated_prompts)
|
||||
vllm_runner_kwargs = _get_vllm_runner_params(
|
||||
model, max_model_len, tensor_parallel_size=tensor_parallel_size)
|
||||
vllm_runner_kwargs['mamba_ssm_cache_dtype'] = "float32"
|
||||
model, max_model_len, tensor_parallel_size=tensor_parallel_size
|
||||
)
|
||||
vllm_runner_kwargs["mamba_ssm_cache_dtype"] = "float32"
|
||||
|
||||
vllm_outputs_no_cache, _ = _get_vLLM_output(vllm_runner,
|
||||
vllm_runner_kwargs,
|
||||
generated_prompts, max_tokens,
|
||||
num_logprobs)
|
||||
vllm_outputs_no_cache, _ = _get_vLLM_output(
|
||||
vllm_runner, vllm_runner_kwargs, generated_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
vllm_runner_kwargs['enable_prefix_caching'] = True
|
||||
vllm_runner_kwargs["enable_prefix_caching"] = True
|
||||
with vllm_runner(**vllm_runner_kwargs) as vllm_model:
|
||||
# Retrieve the default mamba state block size
|
||||
mamba_block_size = vllm_model.llm.llm_engine.cache_config. \
|
||||
mamba_block_size
|
||||
mamba_block_size = vllm_model.llm.llm_engine.cache_config.mamba_block_size
|
||||
|
||||
# In case the hybrid model does not have the
|
||||
# "mamba_block_size" assume a fixed constant
|
||||
@@ -489,18 +502,18 @@ def test_apc_single_prompt_block_align_alignment(
|
||||
mamba_block_size = 512
|
||||
|
||||
mamba_block_size_multiplier = 10
|
||||
for offsets in [
|
||||
-3, 3, mamba_block_size // 4 + 3, mamba_block_size // 2 - 3
|
||||
]:
|
||||
|
||||
vllm_runner_kwargs[
|
||||
'max_num_batched_tokens'] = mamba_block_size_multiplier * \
|
||||
mamba_block_size - offsets
|
||||
vllm_outputs_cache_rep, _ = _get_vLLM_output(vllm_runner,
|
||||
vllm_runner_kwargs,
|
||||
generated_prompts,
|
||||
max_tokens, num_logprobs,
|
||||
n_repetitions)
|
||||
for offsets in [-3, 3, mamba_block_size // 4 + 3, mamba_block_size // 2 - 3]:
|
||||
vllm_runner_kwargs["max_num_batched_tokens"] = (
|
||||
mamba_block_size_multiplier * mamba_block_size - offsets
|
||||
)
|
||||
vllm_outputs_cache_rep, _ = _get_vLLM_output(
|
||||
vllm_runner,
|
||||
vllm_runner_kwargs,
|
||||
generated_prompts,
|
||||
max_tokens,
|
||||
num_logprobs,
|
||||
n_repetitions,
|
||||
)
|
||||
|
||||
# Check alignment of the output logits when using APC
|
||||
for r_idx, vllm_outputs_cache_itn in enumerate(vllm_outputs_cache_rep):
|
||||
@@ -534,7 +547,6 @@ def test_apc_multiple_prompts_all_cached_outputs(
|
||||
num_logprobs: int,
|
||||
tensor_parallel_size: int,
|
||||
) -> None:
|
||||
|
||||
try:
|
||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
@@ -542,30 +554,34 @@ def test_apc_multiple_prompts_all_cached_outputs(
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
compare_operator: Callable = check_logprobs_close \
|
||||
if num_logprobs > 0 else check_outputs_equal # type: ignore
|
||||
compare_operator: Callable = (
|
||||
check_logprobs_close if num_logprobs > 0 else check_outputs_equal # type: ignore
|
||||
)
|
||||
|
||||
MULTIPLE = 300
|
||||
|
||||
# Sample prompts.
|
||||
generated_prompts = [MULTIPLE * prompt for prompt in example_prompts]
|
||||
|
||||
max_model_len = max(
|
||||
len(prompt) + max_tokens for prompt in generated_prompts)
|
||||
max_model_len = max(len(prompt) + max_tokens for prompt in generated_prompts)
|
||||
vllm_runner_kwargs = _get_vllm_runner_params(
|
||||
model, max_model_len, tensor_parallel_size=tensor_parallel_size)
|
||||
vllm_runner_kwargs['mamba_ssm_cache_dtype'] = "float32"
|
||||
model, max_model_len, tensor_parallel_size=tensor_parallel_size
|
||||
)
|
||||
vllm_runner_kwargs["mamba_ssm_cache_dtype"] = "float32"
|
||||
|
||||
vllm_outputs_no_cache, _ = _get_vLLM_output(vllm_runner,
|
||||
vllm_runner_kwargs,
|
||||
generated_prompts, max_tokens,
|
||||
num_logprobs)
|
||||
vllm_outputs_no_cache, _ = _get_vLLM_output(
|
||||
vllm_runner, vllm_runner_kwargs, generated_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
vllm_runner_kwargs['enable_prefix_caching'] = True
|
||||
vllm_outputs_cache_rep, _ = _get_vLLM_output(vllm_runner,
|
||||
vllm_runner_kwargs,
|
||||
generated_prompts, max_tokens,
|
||||
num_logprobs, n_repetitions)
|
||||
vllm_runner_kwargs["enable_prefix_caching"] = True
|
||||
vllm_outputs_cache_rep, _ = _get_vLLM_output(
|
||||
vllm_runner,
|
||||
vllm_runner_kwargs,
|
||||
generated_prompts,
|
||||
max_tokens,
|
||||
num_logprobs,
|
||||
n_repetitions,
|
||||
)
|
||||
|
||||
for r_idx, vllm_outputs_cache_itn in enumerate(vllm_outputs_cache_rep):
|
||||
# In the first repetition, the caches are filled
|
||||
@@ -598,7 +614,6 @@ def test_apc_multiple_prompts_block_align_alignment(
|
||||
num_logprobs: int,
|
||||
tensor_parallel_size: int,
|
||||
) -> None:
|
||||
|
||||
try:
|
||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
@@ -606,34 +621,31 @@ def test_apc_multiple_prompts_block_align_alignment(
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
compare_operator: Callable = check_logprobs_close \
|
||||
if num_logprobs > 0 else check_outputs_equal # type: ignore
|
||||
compare_operator: Callable = (
|
||||
check_logprobs_close if num_logprobs > 0 else check_outputs_equal # type: ignore
|
||||
)
|
||||
|
||||
MULTIPLE = 300
|
||||
|
||||
# Sample prompts. This custom prompt is used, as it causes the most issues
|
||||
prompt_text = "The president of the United States is "
|
||||
prompt_offsets = [0, 3, 7, 13, 17, 22, 25, 31]
|
||||
generated_prompts = [
|
||||
prompt_text[offset:] * MULTIPLE for offset in prompt_offsets
|
||||
]
|
||||
generated_prompts = [prompt_text[offset:] * MULTIPLE for offset in prompt_offsets]
|
||||
|
||||
max_model_len = max(
|
||||
len(prompt) + max_tokens for prompt in generated_prompts)
|
||||
vllm_runner_kwargs = _get_vllm_runner_params(model, max_model_len,
|
||||
tensor_parallel_size)
|
||||
vllm_runner_kwargs['mamba_ssm_cache_dtype'] = "float32"
|
||||
max_model_len = max(len(prompt) + max_tokens for prompt in generated_prompts)
|
||||
vllm_runner_kwargs = _get_vllm_runner_params(
|
||||
model, max_model_len, tensor_parallel_size
|
||||
)
|
||||
vllm_runner_kwargs["mamba_ssm_cache_dtype"] = "float32"
|
||||
|
||||
vllm_outputs_no_cache, _ = _get_vLLM_output(vllm_runner,
|
||||
vllm_runner_kwargs,
|
||||
generated_prompts, max_tokens,
|
||||
num_logprobs)
|
||||
vllm_outputs_no_cache, _ = _get_vLLM_output(
|
||||
vllm_runner, vllm_runner_kwargs, generated_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
vllm_runner_kwargs['enable_prefix_caching'] = True
|
||||
vllm_runner_kwargs["enable_prefix_caching"] = True
|
||||
with vllm_runner(**vllm_runner_kwargs) as vllm_model:
|
||||
# Retrieve the default mamba state block size
|
||||
mamba_block_size = vllm_model.llm.llm_engine.cache_config. \
|
||||
mamba_block_size
|
||||
mamba_block_size = vllm_model.llm.llm_engine.cache_config.mamba_block_size
|
||||
|
||||
# In case the hybrid model does not have the
|
||||
# "mamba_block_size" assume a fixed constant
|
||||
@@ -641,18 +653,18 @@ def test_apc_multiple_prompts_block_align_alignment(
|
||||
mamba_block_size = 512
|
||||
|
||||
mamba_block_size_multiplier = 10
|
||||
for offsets in [
|
||||
-3, 3, mamba_block_size // 4 + 3, mamba_block_size // 2 - 3
|
||||
]:
|
||||
|
||||
vllm_runner_kwargs[
|
||||
'max_num_batched_tokens'] = mamba_block_size_multiplier * \
|
||||
mamba_block_size - offsets
|
||||
vllm_outputs_cache_rep, _ = _get_vLLM_output(vllm_runner,
|
||||
vllm_runner_kwargs,
|
||||
generated_prompts,
|
||||
max_tokens, num_logprobs,
|
||||
n_repetitions)
|
||||
for offsets in [-3, 3, mamba_block_size // 4 + 3, mamba_block_size // 2 - 3]:
|
||||
vllm_runner_kwargs["max_num_batched_tokens"] = (
|
||||
mamba_block_size_multiplier * mamba_block_size - offsets
|
||||
)
|
||||
vllm_outputs_cache_rep, _ = _get_vLLM_output(
|
||||
vllm_runner,
|
||||
vllm_runner_kwargs,
|
||||
generated_prompts,
|
||||
max_tokens,
|
||||
num_logprobs,
|
||||
n_repetitions,
|
||||
)
|
||||
|
||||
# Check alignment of the output logits when using APC
|
||||
for r_idx, vllm_outputs_cache_itn in enumerate(vllm_outputs_cache_rep):
|
||||
@@ -686,7 +698,6 @@ def test_apc_multiple_prompts_partial_cached_outputs(
|
||||
num_logprobs: int,
|
||||
tensor_parallel_size: int,
|
||||
) -> None:
|
||||
|
||||
try:
|
||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
@@ -694,30 +705,30 @@ def test_apc_multiple_prompts_partial_cached_outputs(
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
compare_operator: Callable = check_logprobs_close \
|
||||
if num_logprobs > 0 else check_outputs_equal # type: ignore
|
||||
compare_operator: Callable = (
|
||||
check_logprobs_close if num_logprobs > 0 else check_outputs_equal # type: ignore
|
||||
)
|
||||
|
||||
MULTIPLE = 300
|
||||
|
||||
# Sample prompts.
|
||||
generated_prompts = [MULTIPLE * prompt for prompt in example_prompts]
|
||||
|
||||
max_model_len = max(
|
||||
len(prompt) + max_tokens for prompt in generated_prompts)
|
||||
max_model_len = max(len(prompt) + max_tokens for prompt in generated_prompts)
|
||||
vllm_runner_kwargs = _get_vllm_runner_params(
|
||||
model, max_model_len, tensor_parallel_size=tensor_parallel_size)
|
||||
vllm_runner_kwargs['mamba_ssm_cache_dtype'] = "float32"
|
||||
model, max_model_len, tensor_parallel_size=tensor_parallel_size
|
||||
)
|
||||
vllm_runner_kwargs["mamba_ssm_cache_dtype"] = "float32"
|
||||
|
||||
vllm_outputs_no_cache, _ = _get_vLLM_output(vllm_runner,
|
||||
vllm_runner_kwargs,
|
||||
generated_prompts, max_tokens,
|
||||
num_logprobs)
|
||||
vllm_outputs_no_cache, _ = _get_vLLM_output(
|
||||
vllm_runner, vllm_runner_kwargs, generated_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
# Cache only part of all the prompts
|
||||
vllm_runner_kwargs['enable_prefix_caching'] = True
|
||||
vllm_runner_kwargs["enable_prefix_caching"] = True
|
||||
vllm_outputs_partial_cache, vllm_model = _get_vLLM_output(
|
||||
vllm_runner, vllm_runner_kwargs, generated_prompts[:3], max_tokens,
|
||||
num_logprobs)
|
||||
vllm_runner, vllm_runner_kwargs, generated_prompts[:3], max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
compare_operator(
|
||||
outputs_0_lst=vllm_outputs_no_cache[0][:3],
|
||||
@@ -726,13 +737,15 @@ def test_apc_multiple_prompts_partial_cached_outputs(
|
||||
name_1="vllm_partial_cache",
|
||||
)
|
||||
|
||||
vllm_outputs_cache_rep, _ = _get_vLLM_output(vllm_runner,
|
||||
vllm_runner_kwargs,
|
||||
generated_prompts,
|
||||
max_tokens,
|
||||
num_logprobs,
|
||||
n_repetitions,
|
||||
vllm_model=vllm_model)
|
||||
vllm_outputs_cache_rep, _ = _get_vLLM_output(
|
||||
vllm_runner,
|
||||
vllm_runner_kwargs,
|
||||
generated_prompts,
|
||||
max_tokens,
|
||||
num_logprobs,
|
||||
n_repetitions,
|
||||
vllm_model=vllm_model,
|
||||
)
|
||||
|
||||
for r_idx, vllm_outputs_cache_itn in enumerate(vllm_outputs_cache_rep):
|
||||
# In the first repetition, the caches are filled
|
||||
|
||||
@@ -6,7 +6,9 @@ import json
|
||||
import pytest
|
||||
|
||||
from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import (
|
||||
MistralToolCall, MistralToolParser)
|
||||
MistralToolCall,
|
||||
MistralToolParser,
|
||||
)
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.transformers_utils.tokenizer import MistralTokenizer
|
||||
|
||||
@@ -33,136 +35,114 @@ SYMBOLIC_LANG_PROMPTS = [
|
||||
]
|
||||
|
||||
# for function calling
|
||||
TOOLS = [{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "get_current_weather",
|
||||
"description": "Get the current weather in a given location",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"city": {
|
||||
"type":
|
||||
"string",
|
||||
"description":
|
||||
"The city to find the weather for, e.g. 'San Francisco'"
|
||||
TOOLS = [
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "get_current_weather",
|
||||
"description": "Get the current weather in a given location",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"city": {
|
||||
"type": "string",
|
||||
"description": "The city to find the weather for, e.g. 'San Francisco'",
|
||||
},
|
||||
"state": {
|
||||
"type": "string",
|
||||
"description": "the two-letter abbreviation for the state that the city is"
|
||||
" in, e.g. 'CA' which would mean 'California'",
|
||||
},
|
||||
"unit": {
|
||||
"type": "string",
|
||||
"description": "The unit to fetch the temperature in",
|
||||
"enum": ["celsius", "fahrenheit"],
|
||||
},
|
||||
},
|
||||
"state": {
|
||||
"type":
|
||||
"string",
|
||||
"description":
|
||||
"the two-letter abbreviation for the state that the city is"
|
||||
" in, e.g. 'CA' which would mean 'California'"
|
||||
},
|
||||
"unit": {
|
||||
"type": "string",
|
||||
"description": "The unit to fetch the temperature in",
|
||||
"enum": ["celsius", "fahrenheit"]
|
||||
}
|
||||
"required": ["city", "state", "unit"],
|
||||
},
|
||||
"required": ["city", "state", "unit"]
|
||||
}
|
||||
},
|
||||
},
|
||||
}, {
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "rewrite",
|
||||
"description": "Rewrites text",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"required": [],
|
||||
"properties": {
|
||||
"text": {
|
||||
"type": "string",
|
||||
"description": "The input text to rewrite."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}]
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "rewrite",
|
||||
"description": "Rewrites text",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"required": [],
|
||||
"properties": {
|
||||
"text": {
|
||||
"type": "string",
|
||||
"description": "The input text to rewrite.",
|
||||
}
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
]
|
||||
MSGS = [
|
||||
{"role": "system", "content": "You are an assistant."},
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are an assistant."
|
||||
},
|
||||
{
|
||||
"role":
|
||||
"user",
|
||||
"content":
|
||||
"Could you please rewrite the below article? \n\n My English needs improvving, maybe I make errors." # noqa
|
||||
},
|
||||
{
|
||||
"role":
|
||||
"assistant",
|
||||
"content":
|
||||
"",
|
||||
"tool_calls": [{
|
||||
"id": "bbc5b7ede",
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name":
|
||||
"rewrite",
|
||||
"arguments":
|
||||
'{\"text\":\"My English needs improvving, maybe I make errors.\"}' # noqa
|
||||
}
|
||||
}]
|
||||
},
|
||||
{
|
||||
"role": "tool",
|
||||
"content":
|
||||
"{\"action\":\"rewrite\",\"outcome\":\"My English needs improving, maybe I make errors.\"}", # noqa
|
||||
"tool_call_id": "bbc5b7ede",
|
||||
"name": "rewrite"
|
||||
"role": "user",
|
||||
"content": "Could you please rewrite the below article? \n\n My English needs improvving, maybe I make errors.", # noqa
|
||||
},
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "---\n\nMy English needs improving, maybe I make errors"
|
||||
"content": "",
|
||||
"tool_calls": [
|
||||
{
|
||||
"id": "bbc5b7ede",
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "rewrite",
|
||||
"arguments": '{"text":"My English needs improvving, maybe I make errors."}', # noqa
|
||||
},
|
||||
}
|
||||
],
|
||||
},
|
||||
{
|
||||
"role":
|
||||
"user",
|
||||
"content": ("Can you tell me what the temperate"
|
||||
" will be in Dallas, in fahrenheit?")
|
||||
}
|
||||
"role": "tool",
|
||||
"content": '{"action":"rewrite","outcome":"My English needs improving, maybe I make errors."}', # noqa
|
||||
"tool_call_id": "bbc5b7ede",
|
||||
"name": "rewrite",
|
||||
},
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "---\n\nMy English needs improving, maybe I make errors",
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": (
|
||||
"Can you tell me what the temperate will be in Dallas, in fahrenheit?"
|
||||
),
|
||||
},
|
||||
]
|
||||
|
||||
SAMPLE_JSON_SCHEMA = {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"name": {
|
||||
"type": "string"
|
||||
},
|
||||
"age": {
|
||||
"type": "integer"
|
||||
},
|
||||
"name": {"type": "string"},
|
||||
"age": {"type": "integer"},
|
||||
"skills": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string",
|
||||
"maxLength": 10
|
||||
},
|
||||
"minItems": 3
|
||||
"items": {"type": "string", "maxLength": 10},
|
||||
"minItems": 3,
|
||||
},
|
||||
"work_history": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"company": {
|
||||
"type": "string"
|
||||
},
|
||||
"duration": {
|
||||
"type": "number"
|
||||
},
|
||||
"position": {
|
||||
"type": "string"
|
||||
}
|
||||
"company": {"type": "string"},
|
||||
"duration": {"type": "number"},
|
||||
"position": {"type": "string"},
|
||||
},
|
||||
"required": ["company", "position"]
|
||||
}
|
||||
}
|
||||
"required": ["company", "position"],
|
||||
},
|
||||
},
|
||||
},
|
||||
"required": ["name", "age", "skills", "work_history"]
|
||||
"required": ["name", "age", "skills", "work_history"],
|
||||
}
|
||||
|
||||
|
||||
@@ -170,17 +150,25 @@ SAMPLE_JSON_SCHEMA = {
|
||||
@pytest.mark.parametrize("dtype", ["bfloat16"])
|
||||
@pytest.mark.parametrize("max_tokens", [64])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
def test_models(hf_runner, vllm_runner, example_prompts, model: str,
|
||||
dtype: str, max_tokens: int, num_logprobs: int) -> None:
|
||||
def test_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
# TODO(sang): Sliding window should be tested separately.
|
||||
with hf_runner(model, dtype=dtype) as hf_model:
|
||||
hf_outputs = hf_model.generate_greedy_logprobs_limit(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
with vllm_runner(model, dtype=dtype,
|
||||
tokenizer_mode="mistral") as vllm_model:
|
||||
with vllm_runner(model, dtype=dtype, tokenizer_mode="mistral") as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
@@ -194,27 +182,35 @@ def test_models(hf_runner, vllm_runner, example_prompts, model: str,
|
||||
@pytest.mark.parametrize("dtype", ["bfloat16"])
|
||||
@pytest.mark.parametrize("max_tokens", [64])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
def test_mistral_format(vllm_runner, example_prompts, model: str, dtype: str,
|
||||
max_tokens: int, num_logprobs: int) -> None:
|
||||
def test_mistral_format(
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
with vllm_runner(
|
||||
model,
|
||||
dtype=dtype,
|
||||
tokenizer_mode="mistral",
|
||||
load_format="mistral",
|
||||
config_format="mistral",
|
||||
model,
|
||||
dtype=dtype,
|
||||
tokenizer_mode="mistral",
|
||||
load_format="mistral",
|
||||
config_format="mistral",
|
||||
) as mistral_format_model:
|
||||
mistral_format_outputs = mistral_format_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
with vllm_runner(
|
||||
model,
|
||||
dtype=dtype,
|
||||
tokenizer_mode="auto",
|
||||
load_format="safetensors",
|
||||
config_format="hf",
|
||||
model,
|
||||
dtype=dtype,
|
||||
tokenizer_mode="auto",
|
||||
load_format="safetensors",
|
||||
config_format="hf",
|
||||
) as hf_format_model:
|
||||
hf_format_outputs = hf_format_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_format_outputs,
|
||||
@@ -226,34 +222,35 @@ def test_mistral_format(vllm_runner, example_prompts, model: str, dtype: str,
|
||||
|
||||
@pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["bfloat16"])
|
||||
def test_mistral_symbolic_languages(vllm_runner, model: str,
|
||||
dtype: str) -> None:
|
||||
with vllm_runner(model,
|
||||
dtype=dtype,
|
||||
max_model_len=8192,
|
||||
tokenizer_mode="mistral",
|
||||
config_format="mistral",
|
||||
load_format="mistral") as vllm_model:
|
||||
def test_mistral_symbolic_languages(vllm_runner, model: str, dtype: str) -> None:
|
||||
with vllm_runner(
|
||||
model,
|
||||
dtype=dtype,
|
||||
max_model_len=8192,
|
||||
tokenizer_mode="mistral",
|
||||
config_format="mistral",
|
||||
load_format="mistral",
|
||||
) as vllm_model:
|
||||
for prompt in SYMBOLIC_LANG_PROMPTS:
|
||||
msg = {"role": "user", "content": prompt}
|
||||
outputs = vllm_model.llm.chat([msg],
|
||||
sampling_params=SAMPLING_PARAMS)
|
||||
outputs = vllm_model.llm.chat([msg], sampling_params=SAMPLING_PARAMS)
|
||||
assert "<EFBFBD>" not in outputs[0].outputs[0].text.strip()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["bfloat16"])
|
||||
def test_mistral_function_calling(vllm_runner, model: str, dtype: str) -> None:
|
||||
with vllm_runner(model,
|
||||
dtype=dtype,
|
||||
tokenizer_mode="mistral",
|
||||
config_format="mistral",
|
||||
load_format="mistral") as vllm_model:
|
||||
|
||||
with vllm_runner(
|
||||
model,
|
||||
dtype=dtype,
|
||||
tokenizer_mode="mistral",
|
||||
config_format="mistral",
|
||||
load_format="mistral",
|
||||
) as vllm_model:
|
||||
msgs = copy.deepcopy(MSGS)
|
||||
outputs = vllm_model.llm.chat(msgs,
|
||||
tools=TOOLS,
|
||||
sampling_params=SAMPLING_PARAMS)
|
||||
outputs = vllm_model.llm.chat(
|
||||
msgs, tools=TOOLS, sampling_params=SAMPLING_PARAMS
|
||||
)
|
||||
|
||||
tokenizer = vllm_model.llm.get_tokenizer()
|
||||
tool_parser = MistralToolParser(tokenizer)
|
||||
@@ -265,10 +262,11 @@ def test_mistral_function_calling(vllm_runner, model: str, dtype: str) -> None:
|
||||
assert parsed_message.tools_called
|
||||
|
||||
assert MistralToolCall.is_valid_id(parsed_message.tool_calls[0].id)
|
||||
assert parsed_message.tool_calls[
|
||||
0].function.name == "get_current_weather"
|
||||
assert parsed_message.tool_calls[
|
||||
0].function.arguments == '{"city": "Dallas", "state": "TX", "unit": "fahrenheit"}' # noqa
|
||||
assert parsed_message.tool_calls[0].function.name == "get_current_weather"
|
||||
assert (
|
||||
parsed_message.tool_calls[0].function.arguments
|
||||
== '{"city": "Dallas", "state": "TX", "unit": "fahrenheit"}'
|
||||
) # noqa
|
||||
assert parsed_message.content is None
|
||||
|
||||
|
||||
@@ -297,17 +295,10 @@ def test_mistral_function_call_nested_json():
|
||||
"city": "Dallas",
|
||||
"state": "TX",
|
||||
"unit": "fahrenheit",
|
||||
"sub_dict": {
|
||||
"foo": "bar",
|
||||
"inner": {
|
||||
"x": 1,
|
||||
"y": 2
|
||||
}
|
||||
},
|
||||
"sub_dict": {"foo": "bar", "inner": {"x": 1, "y": 2}},
|
||||
}
|
||||
|
||||
model_output = (
|
||||
f"{parser.bot_token}get_current_weather{json.dumps(args_dict)}")
|
||||
model_output = f"{parser.bot_token}get_current_weather{json.dumps(args_dict)}"
|
||||
|
||||
parsed = parser.extract_tool_calls(model_output, None)
|
||||
|
||||
|
||||
@@ -15,62 +15,56 @@ MODELS = [
|
||||
|
||||
def test_phimoe_routing_function():
|
||||
from vllm.model_executor.models.phimoe import phimoe_routing_function
|
||||
|
||||
test_case = {
|
||||
0: {
|
||||
"hidden_states":
|
||||
torch.tensor([1, 2, 3, 4, 5, 6, 7, 8],
|
||||
dtype=torch.float32,
|
||||
requires_grad=False).view(4, 2),
|
||||
"gating_output":
|
||||
torch.tensor([0.1, 0.2, 0.3, 0.4],
|
||||
dtype=torch.float32,
|
||||
requires_grad=False),
|
||||
"topk":
|
||||
2,
|
||||
"renormalize":
|
||||
False,
|
||||
"hidden_states": torch.tensor(
|
||||
[1, 2, 3, 4, 5, 6, 7, 8], dtype=torch.float32, requires_grad=False
|
||||
).view(4, 2),
|
||||
"gating_output": torch.tensor(
|
||||
[0.1, 0.2, 0.3, 0.4], dtype=torch.float32, requires_grad=False
|
||||
),
|
||||
"topk": 2,
|
||||
"renormalize": False,
|
||||
},
|
||||
1: {
|
||||
"hidden_states":
|
||||
torch.tensor([1, 2, 3, 4, 5, 6, 7, 8],
|
||||
dtype=torch.float32,
|
||||
requires_grad=False).view(4, 2),
|
||||
"gating_output":
|
||||
torch.tensor([0.4, 0.2, 0.3, 0.4],
|
||||
dtype=torch.float32,
|
||||
requires_grad=False),
|
||||
"topk":
|
||||
2,
|
||||
"renormalize":
|
||||
False,
|
||||
}
|
||||
"hidden_states": torch.tensor(
|
||||
[1, 2, 3, 4, 5, 6, 7, 8], dtype=torch.float32, requires_grad=False
|
||||
).view(4, 2),
|
||||
"gating_output": torch.tensor(
|
||||
[0.4, 0.2, 0.3, 0.4], dtype=torch.float32, requires_grad=False
|
||||
),
|
||||
"topk": 2,
|
||||
"renormalize": False,
|
||||
},
|
||||
}
|
||||
|
||||
ground_truth = {
|
||||
0: {
|
||||
"topk_weights":
|
||||
torch.tensor([1., 1.], dtype=torch.float32, requires_grad=False),
|
||||
"topk_ids":
|
||||
torch.tensor([3, 2], dtype=torch.long, requires_grad=False),
|
||||
"topk_weights": torch.tensor(
|
||||
[1.0, 1.0], dtype=torch.float32, requires_grad=False
|
||||
),
|
||||
"topk_ids": torch.tensor([3, 2], dtype=torch.long, requires_grad=False),
|
||||
},
|
||||
1: {
|
||||
"topk_weights":
|
||||
torch.tensor([0.5, 1.], dtype=torch.float32, requires_grad=False),
|
||||
"topk_ids":
|
||||
torch.tensor([0, 3], dtype=torch.long, requires_grad=False),
|
||||
}
|
||||
"topk_weights": torch.tensor(
|
||||
[0.5, 1.0], dtype=torch.float32, requires_grad=False
|
||||
),
|
||||
"topk_ids": torch.tensor([0, 3], dtype=torch.long, requires_grad=False),
|
||||
},
|
||||
}
|
||||
|
||||
for test_id in test_case:
|
||||
topk_weights, topk_ids = phimoe_routing_function(**test_case[test_id])
|
||||
assert torch.allclose(topk_weights,
|
||||
ground_truth[test_id]["topk_weights"])
|
||||
assert torch.allclose(topk_weights, ground_truth[test_id]["topk_weights"])
|
||||
assert torch.equal(topk_ids, ground_truth[test_id]["topk_ids"])
|
||||
|
||||
|
||||
@pytest.mark.skipif(condition=current_platform.is_cpu(),
|
||||
reason="This test takes a lot time to run on CPU, "
|
||||
"and vllm CI's disk space is not enough for this model.")
|
||||
@pytest.mark.skipif(
|
||||
condition=current_platform.is_cpu(),
|
||||
reason="This test takes a lot time to run on CPU, "
|
||||
"and vllm CI's disk space is not enough for this model.",
|
||||
)
|
||||
@large_gpu_test(min_gb=80)
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["bfloat16"])
|
||||
@@ -87,11 +81,13 @@ def test_models(
|
||||
) -> None:
|
||||
with hf_runner(model, dtype=dtype) as hf_model:
|
||||
hf_outputs = hf_model.generate_greedy_logprobs_limit(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
with vllm_runner(model, dtype=dtype) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_outputs,
|
||||
|
||||
@@ -8,8 +8,7 @@ import torch
|
||||
from datasets import load_dataset
|
||||
|
||||
import tests.ci_envs as ci_envs
|
||||
from tests.models.utils import (GenerateModelInfo,
|
||||
TokensTextLogprobsPromptLogprobs)
|
||||
from tests.models.utils import GenerateModelInfo, TokensTextLogprobsPromptLogprobs
|
||||
from vllm.logprobs import Logprob
|
||||
|
||||
# See #24485
|
||||
@@ -18,13 +17,14 @@ MAX_LENGTH = 1024
|
||||
|
||||
|
||||
@torch.inference_mode
|
||||
def wikitext_ppl_test(hf_runner,
|
||||
vllm_runner,
|
||||
model_info: GenerateModelInfo,
|
||||
max_length=MAX_LENGTH,
|
||||
vllm_extra_kwargs=None,
|
||||
atol=PPL_TOL):
|
||||
|
||||
def wikitext_ppl_test(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
model_info: GenerateModelInfo,
|
||||
max_length=MAX_LENGTH,
|
||||
vllm_extra_kwargs=None,
|
||||
atol=PPL_TOL,
|
||||
):
|
||||
# A model family has many models with the same architecture,
|
||||
# and we don't need to test each one.
|
||||
if not ci_envs.VLLM_CI_NO_SKIP and not model_info.enable_test:
|
||||
@@ -44,15 +44,16 @@ def wikitext_ppl_test(hf_runner,
|
||||
if ci_envs.VLLM_CI_HEAD_DTYPE is not None:
|
||||
if "hf_overrides" not in vllm_extra_kwargs:
|
||||
vllm_extra_kwargs["hf_overrides"] = {}
|
||||
vllm_extra_kwargs["hf_overrides"][
|
||||
"head_dtype"] = ci_envs.VLLM_CI_HEAD_DTYPE
|
||||
vllm_extra_kwargs["hf_overrides"]["head_dtype"] = ci_envs.VLLM_CI_HEAD_DTYPE
|
||||
|
||||
with vllm_runner(model_info.name,
|
||||
gpu_memory_utilization=0.7,
|
||||
max_model_len=max_length,
|
||||
max_num_seqs=1,
|
||||
enforce_eager=True,
|
||||
**vllm_extra_kwargs) as vllm_model:
|
||||
with vllm_runner(
|
||||
model_info.name,
|
||||
gpu_memory_utilization=0.7,
|
||||
max_model_len=max_length,
|
||||
max_num_seqs=1,
|
||||
enforce_eager=True,
|
||||
**vllm_extra_kwargs,
|
||||
) as vllm_model:
|
||||
# Use max_num_seqs=1 to avoid OOM,
|
||||
# and avoid batch different requests together.
|
||||
|
||||
@@ -60,7 +61,7 @@ def wikitext_ppl_test(hf_runner,
|
||||
|
||||
# Confirm whether vllm is using the correct architecture
|
||||
if model_info.architecture:
|
||||
assert (model_info.architecture in model_config.architectures)
|
||||
assert model_info.architecture in model_config.architectures
|
||||
|
||||
max_length = min(model_config.max_model_len - 1, max_length)
|
||||
stride = max_length
|
||||
@@ -74,12 +75,14 @@ def wikitext_ppl_test(hf_runner,
|
||||
end_loc = min(begin_loc + max_length, n_tokens)
|
||||
chunks.append(tokens[begin_loc:end_loc])
|
||||
|
||||
outputs = vllm_model.generate_greedy_logprobs(prompts=chunks,
|
||||
max_tokens=1,
|
||||
num_logprobs=None,
|
||||
num_prompt_logprobs=0,
|
||||
use_tqdm=False)
|
||||
nll_sum = torch.tensor(0., dtype=torch.float32, device="cpu")
|
||||
outputs = vllm_model.generate_greedy_logprobs(
|
||||
prompts=chunks,
|
||||
max_tokens=1,
|
||||
num_logprobs=None,
|
||||
num_prompt_logprobs=0,
|
||||
use_tqdm=False,
|
||||
)
|
||||
nll_sum = torch.tensor(0.0, dtype=torch.float32, device="cpu")
|
||||
n_tokens = 0
|
||||
for output in outputs:
|
||||
output = cast(TokensTextLogprobsPromptLogprobs, output)
|
||||
@@ -94,7 +97,8 @@ def wikitext_ppl_test(hf_runner,
|
||||
token_log_probs.append(token_log_prob)
|
||||
|
||||
neg_log_likelihood = -torch.tensor(
|
||||
token_log_probs, dtype=torch.float32, device="cpu").sum()
|
||||
token_log_probs, dtype=torch.float32, device="cpu"
|
||||
).sum()
|
||||
nll_sum += neg_log_likelihood
|
||||
n_tokens += len(token_log_probs)
|
||||
vllm_ppl = float(torch.exp(nll_sum / n_tokens))
|
||||
@@ -104,14 +108,13 @@ def wikitext_ppl_test(hf_runner,
|
||||
# Accelerate ppl test by setting Transformers ppl score to a constant
|
||||
if model_info.hf_ppl is None:
|
||||
with hf_runner(
|
||||
model_info.name,
|
||||
dtype=ci_envs.VLLM_CI_HF_DTYPE or model_info.hf_dtype,
|
||||
model_info.name,
|
||||
dtype=ci_envs.VLLM_CI_HF_DTYPE or model_info.hf_dtype,
|
||||
) as hf_model:
|
||||
nll_sum = torch.tensor(0., dtype=torch.float32, device="cpu")
|
||||
nll_sum = torch.tensor(0.0, dtype=torch.float32, device="cpu")
|
||||
n_tokens = 0
|
||||
for chunk in chunks:
|
||||
inputs = hf_model.wrap_device(
|
||||
{"input_ids": torch.tensor([chunk])})
|
||||
inputs = hf_model.wrap_device({"input_ids": torch.tensor([chunk])})
|
||||
input_ids = inputs["input_ids"]
|
||||
outputs = hf_model.model(input_ids, labels=input_ids)
|
||||
neg_log_likelihood = outputs.loss
|
||||
|
||||
@@ -6,8 +6,7 @@ from typing import Optional
|
||||
import pytest
|
||||
|
||||
from tests.conftest import HfRunner
|
||||
from tests.models.utils import (EmbedModelInfo, check_embeddings_close,
|
||||
matryoshka_fy)
|
||||
from tests.models.utils import EmbedModelInfo, check_embeddings_close, matryoshka_fy
|
||||
|
||||
|
||||
def run_embedding_correctness_test(
|
||||
@@ -29,12 +28,14 @@ def run_embedding_correctness_test(
|
||||
)
|
||||
|
||||
|
||||
def correctness_test_embed_models(hf_runner,
|
||||
vllm_runner,
|
||||
model_info: EmbedModelInfo,
|
||||
example_prompts,
|
||||
vllm_extra_kwargs=None,
|
||||
hf_model_callback=None):
|
||||
def correctness_test_embed_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
model_info: EmbedModelInfo,
|
||||
example_prompts,
|
||||
vllm_extra_kwargs=None,
|
||||
hf_model_callback=None,
|
||||
):
|
||||
pytest.skip("Debug only, ci prefers to use mteb test.")
|
||||
|
||||
# The example_prompts has ending "\n", for example:
|
||||
@@ -51,18 +52,16 @@ def correctness_test_embed_models(hf_runner,
|
||||
if model_info.hf_overrides is not None:
|
||||
vllm_extra_kwargs["hf_overrides"] = model_info.hf_overrides
|
||||
|
||||
with vllm_runner(model_info.name,
|
||||
runner="pooling",
|
||||
max_model_len=None,
|
||||
**vllm_extra_kwargs) as vllm_model:
|
||||
with vllm_runner(
|
||||
model_info.name, runner="pooling", max_model_len=None, **vllm_extra_kwargs
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.embed(example_prompts)
|
||||
|
||||
with hf_runner(
|
||||
model_info.name,
|
||||
dtype=model_info.hf_dtype,
|
||||
is_sentence_transformer=True,
|
||||
model_info.name,
|
||||
dtype=model_info.hf_dtype,
|
||||
is_sentence_transformer=True,
|
||||
) as hf_model:
|
||||
|
||||
if hf_model_callback is not None:
|
||||
hf_model_callback(hf_model)
|
||||
|
||||
|
||||
@@ -4,8 +4,7 @@ import pytest
|
||||
import torch
|
||||
from transformers import AutoModelForSequenceClassification
|
||||
|
||||
from tests.models.language.pooling.embed_utils import (
|
||||
run_embedding_correctness_test)
|
||||
from tests.models.language.pooling.embed_utils import run_embedding_correctness_test
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@@ -20,28 +19,27 @@ def test_classify_models(
|
||||
model: str,
|
||||
dtype: str,
|
||||
) -> None:
|
||||
|
||||
example_prompts = example_prompts * 2
|
||||
|
||||
with vllm_runner(model,
|
||||
max_model_len=512,
|
||||
dtype=dtype,
|
||||
enable_prefix_caching=True) as vllm_model:
|
||||
with vllm_runner(
|
||||
model, max_model_len=512, dtype=dtype, enable_prefix_caching=True
|
||||
) as vllm_model:
|
||||
cache_config = vllm_model.llm.llm_engine.cache_config
|
||||
assert cache_config.enable_prefix_caching
|
||||
vllm_outputs = vllm_model.classify(example_prompts)
|
||||
|
||||
with hf_runner(model,
|
||||
dtype=dtype,
|
||||
auto_cls=AutoModelForSequenceClassification) as hf_model:
|
||||
with hf_runner(
|
||||
model, dtype=dtype, auto_cls=AutoModelForSequenceClassification
|
||||
) as hf_model:
|
||||
hf_outputs = hf_model.classify(example_prompts)
|
||||
|
||||
for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
|
||||
hf_output = torch.tensor(hf_output)
|
||||
vllm_output = torch.tensor(vllm_output)
|
||||
|
||||
assert torch.allclose(hf_output, vllm_output,
|
||||
1e-3 if dtype == "float" else 1e-2)
|
||||
assert torch.allclose(
|
||||
hf_output, vllm_output, 1e-3 if dtype == "float" else 1e-2
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@@ -59,18 +57,18 @@ def test_embed_models(
|
||||
example_prompts = [str(s).strip() for s in example_prompts] * 2
|
||||
|
||||
with vllm_runner(
|
||||
model,
|
||||
runner="pooling",
|
||||
max_model_len=None,
|
||||
enable_prefix_caching=True,
|
||||
model,
|
||||
runner="pooling",
|
||||
max_model_len=None,
|
||||
enable_prefix_caching=True,
|
||||
) as vllm_model:
|
||||
cache_config = vllm_model.llm.llm_engine.cache_config
|
||||
assert cache_config.enable_prefix_caching
|
||||
vllm_outputs = vllm_model.embed(example_prompts)
|
||||
|
||||
with hf_runner(
|
||||
model,
|
||||
is_sentence_transformer=True,
|
||||
model,
|
||||
is_sentence_transformer=True,
|
||||
) as hf_model:
|
||||
run_embedding_correctness_test(hf_model, example_prompts, vllm_outputs)
|
||||
|
||||
@@ -81,13 +79,14 @@ def test_embed_models(
|
||||
"intfloat/e5-small",
|
||||
"Alibaba-NLP/gte-Qwen2-1.5B-instruct", # is_causal == False
|
||||
"papluca/xlm-roberta-base-language-detection",
|
||||
])
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
def test_non_causal_models(hf_runner, vllm_runner, example_prompts, model: str,
|
||||
dtype: str) -> None:
|
||||
with vllm_runner(model,
|
||||
max_model_len=512,
|
||||
dtype=dtype,
|
||||
enable_prefix_caching=True) as vllm_model:
|
||||
def test_non_causal_models(
|
||||
hf_runner, vllm_runner, example_prompts, model: str, dtype: str
|
||||
) -> None:
|
||||
with vllm_runner(
|
||||
model, max_model_len=512, dtype=dtype, enable_prefix_caching=True
|
||||
) as vllm_model:
|
||||
cache_config = vllm_model.llm.llm_engine.cache_config
|
||||
assert not cache_config.enable_prefix_caching
|
||||
|
||||
@@ -10,15 +10,17 @@ from vllm.platforms import current_platform
|
||||
@pytest.mark.parametrize(
|
||||
"model",
|
||||
[
|
||||
pytest.param("jason9693/Qwen2.5-1.5B-apeach",
|
||||
marks=[
|
||||
pytest.mark.core_model, pytest.mark.cpu_model,
|
||||
pytest.mark.slow_test
|
||||
]),
|
||||
pytest.param(
|
||||
"jason9693/Qwen2.5-1.5B-apeach",
|
||||
marks=[
|
||||
pytest.mark.core_model,
|
||||
pytest.mark.cpu_model,
|
||||
pytest.mark.slow_test,
|
||||
],
|
||||
),
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype",
|
||||
["half"] if current_platform.is_rocm() else ["float"])
|
||||
@pytest.mark.parametrize("dtype", ["half"] if current_platform.is_rocm() else ["float"])
|
||||
def test_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
@@ -35,9 +37,9 @@ def test_models(
|
||||
with vllm_runner(model, max_model_len=512, dtype=dtype) as vllm_model:
|
||||
vllm_outputs = vllm_model.classify(example_prompts)
|
||||
|
||||
with hf_runner(model,
|
||||
dtype=dtype,
|
||||
auto_cls=AutoModelForSequenceClassification) as hf_model:
|
||||
with hf_runner(
|
||||
model, dtype=dtype, auto_cls=AutoModelForSequenceClassification
|
||||
) as hf_model:
|
||||
hf_outputs = hf_model.classify(example_prompts)
|
||||
|
||||
# check logits difference
|
||||
@@ -48,5 +50,6 @@ def test_models(
|
||||
# the tolerance value of 1e-2 is selected based on the
|
||||
# half datatype tests in
|
||||
# tests/models/language/pooling/test_embedding.py
|
||||
assert torch.allclose(hf_output, vllm_output,
|
||||
1e-3 if dtype == "float" else 1e-2)
|
||||
assert torch.allclose(
|
||||
hf_output, vllm_output, 1e-3 if dtype == "float" else 1e-2
|
||||
)
|
||||
|
||||
@@ -18,20 +18,25 @@ from ...utils import check_embeddings_close
|
||||
# case won't pass because gte-Qwen2-1.5B-instruct will cache custom
|
||||
# model code with bidirectional attention.
|
||||
# [Decoder-only]
|
||||
pytest.param("BAAI/bge-multilingual-gemma2",
|
||||
marks=[pytest.mark.core_model, pytest.mark.slow_test]),
|
||||
pytest.param(
|
||||
"BAAI/bge-multilingual-gemma2",
|
||||
marks=[pytest.mark.core_model, pytest.mark.slow_test],
|
||||
),
|
||||
pytest.param(
|
||||
"intfloat/e5-mistral-7b-instruct",
|
||||
# CPU v1 doesn't support sliding window
|
||||
marks=[pytest.mark.core_model]),
|
||||
pytest.param("ssmits/Qwen2-7B-Instruct-embed-base",
|
||||
marks=[pytest.mark.cpu_model]),
|
||||
marks=[pytest.mark.core_model],
|
||||
),
|
||||
pytest.param(
|
||||
"ssmits/Qwen2-7B-Instruct-embed-base", marks=[pytest.mark.cpu_model]
|
||||
),
|
||||
# [Encoder-only]
|
||||
pytest.param(
|
||||
"BAAI/bge-base-en-v1.5",
|
||||
marks=[
|
||||
pytest.mark.core_model, pytest.mark.cpu_model,
|
||||
pytest.mark.slow_test
|
||||
pytest.mark.core_model,
|
||||
pytest.mark.cpu_model,
|
||||
pytest.mark.slow_test,
|
||||
],
|
||||
),
|
||||
pytest.param("sentence-transformers/all-MiniLM-L12-v2"),
|
||||
@@ -50,7 +55,6 @@ def test_models(
|
||||
model,
|
||||
monkeypatch,
|
||||
) -> None:
|
||||
|
||||
if model == "BAAI/bge-multilingual-gemma2" and current_platform.is_rocm():
|
||||
# ROCm Triton FA does not currently support sliding window attention
|
||||
# switch to use ROCm CK FA backend
|
||||
@@ -58,13 +62,14 @@ def test_models(
|
||||
|
||||
vllm_extra_kwargs = {}
|
||||
if model == "ssmits/Qwen2-7B-Instruct-embed-base":
|
||||
vllm_extra_kwargs["pooler_config"] = \
|
||||
PoolerConfig(pooling_type="MEAN", normalize=False)
|
||||
vllm_extra_kwargs["pooler_config"] = PoolerConfig(
|
||||
pooling_type="MEAN", normalize=False
|
||||
)
|
||||
|
||||
max_model_len: Optional[int] = 512
|
||||
if model in [
|
||||
"sentence-transformers/all-MiniLM-L12-v2",
|
||||
"sentence-transformers/stsb-roberta-base-v2"
|
||||
"sentence-transformers/all-MiniLM-L12-v2",
|
||||
"sentence-transformers/stsb-roberta-base-v2",
|
||||
]:
|
||||
max_model_len = None
|
||||
|
||||
@@ -79,10 +84,9 @@ def test_models(
|
||||
with hf_runner(model, is_sentence_transformer=True) as hf_model:
|
||||
hf_outputs = hf_model.encode(example_prompts)
|
||||
|
||||
with vllm_runner(model,
|
||||
runner="pooling",
|
||||
max_model_len=max_model_len,
|
||||
**vllm_extra_kwargs) as vllm_model:
|
||||
with vllm_runner(
|
||||
model, runner="pooling", max_model_len=max_model_len, **vllm_extra_kwargs
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.embed(example_prompts)
|
||||
|
||||
check_embeddings_close(
|
||||
|
||||
@@ -70,8 +70,9 @@ async def run_client_embeddings(
|
||||
|
||||
|
||||
def gritlm_instruction(instruction):
|
||||
return ("<|user|>\n" + instruction +
|
||||
"\n<|embed|>\n" if instruction else "<|embed|>\n")
|
||||
return (
|
||||
"<|user|>\n" + instruction + "\n<|embed|>\n" if instruction else "<|embed|>\n"
|
||||
)
|
||||
|
||||
|
||||
def get_test_data():
|
||||
@@ -80,7 +81,8 @@ def get_test_data():
|
||||
README.md in https://github.com/ContextualAI/gritlm
|
||||
"""
|
||||
q_instruction = gritlm_instruction(
|
||||
"Given a scientific paper title, retrieve the paper's abstract", )
|
||||
"Given a scientific paper title, retrieve the paper's abstract",
|
||||
)
|
||||
queries = [
|
||||
"Bitcoin: A Peer-to-Peer Electronic Cash System",
|
||||
"Generative Representational Instruction Tuning",
|
||||
@@ -114,9 +116,9 @@ def test_gritlm_offline_embedding(vllm_runner):
|
||||
queries, q_instruction, documents, d_instruction = get_test_data()
|
||||
|
||||
with vllm_runner(
|
||||
MODEL_NAME,
|
||||
runner="pooling",
|
||||
max_model_len=MAX_MODEL_LEN,
|
||||
MODEL_NAME,
|
||||
runner="pooling",
|
||||
max_model_len=MAX_MODEL_LEN,
|
||||
) as vllm_model:
|
||||
llm = vllm_model.llm
|
||||
|
||||
@@ -161,9 +163,9 @@ def test_gritlm_offline_generate(monkeypatch: pytest.MonkeyPatch, vllm_runner):
|
||||
input = "<|user|>\nWhat is the capital of France?\n<|assistant|>\n"
|
||||
|
||||
with vllm_runner(
|
||||
MODEL_NAME,
|
||||
runner="generate",
|
||||
max_model_len=MAX_MODEL_LEN,
|
||||
MODEL_NAME,
|
||||
runner="generate",
|
||||
max_model_len=MAX_MODEL_LEN,
|
||||
) as vllm_model:
|
||||
llm = vllm_model.llm
|
||||
|
||||
|
||||
@@ -21,16 +21,18 @@ def test_idefics_multimodal(
|
||||
"The future of AI is",
|
||||
]
|
||||
|
||||
with vllm_runner(model_name="HuggingFaceM4/Idefics3-8B-Llama3",
|
||||
runner="pooling",
|
||||
task="classify",
|
||||
convert="classify",
|
||||
load_format="dummy",
|
||||
max_model_len=512,
|
||||
enforce_eager=True,
|
||||
tensor_parallel_size=1,
|
||||
disable_log_stats=True,
|
||||
dtype="bfloat16") as vllm_model:
|
||||
with vllm_runner(
|
||||
model_name="HuggingFaceM4/Idefics3-8B-Llama3",
|
||||
runner="pooling",
|
||||
task="classify",
|
||||
convert="classify",
|
||||
load_format="dummy",
|
||||
max_model_len=512,
|
||||
enforce_eager=True,
|
||||
tensor_parallel_size=1,
|
||||
disable_log_stats=True,
|
||||
dtype="bfloat16",
|
||||
) as vllm_model:
|
||||
llm = vllm_model.get_llm()
|
||||
outputs = llm.classify(prompts)
|
||||
for output in outputs:
|
||||
@@ -38,19 +40,20 @@ def test_idefics_multimodal(
|
||||
|
||||
|
||||
def update_config(config):
|
||||
config.text_config.update({
|
||||
"architectures": ["Gemma3ForSequenceClassification"],
|
||||
"classifier_from_token": ["A", "B", "C", "D", "E"],
|
||||
"method":
|
||||
"no_post_processing",
|
||||
"id2label": {
|
||||
"A": "Chair",
|
||||
"B": "Couch",
|
||||
"C": "Table",
|
||||
"D": "Bed",
|
||||
"E": "Cupboard"
|
||||
},
|
||||
})
|
||||
config.text_config.update(
|
||||
{
|
||||
"architectures": ["Gemma3ForSequenceClassification"],
|
||||
"classifier_from_token": ["A", "B", "C", "D", "E"],
|
||||
"method": "no_post_processing",
|
||||
"id2label": {
|
||||
"A": "Chair",
|
||||
"B": "Couch",
|
||||
"C": "Table",
|
||||
"D": "Bed",
|
||||
"E": "Cupboard",
|
||||
},
|
||||
}
|
||||
)
|
||||
return config
|
||||
|
||||
|
||||
@@ -63,11 +66,10 @@ def test_gemma_multimodal(
|
||||
# switch to use ROCm CK FA backend
|
||||
monkeypatch.setenv("VLLM_USE_TRITON_FLASH_ATTN", "False")
|
||||
|
||||
messages = [{
|
||||
"role":
|
||||
"system",
|
||||
"content":
|
||||
"""
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": """
|
||||
You are a helpful assistant. You will be given a product description
|
||||
which may also include an image. Classify the following product into
|
||||
one of the categories:
|
||||
@@ -78,38 +80,39 @@ def test_gemma_multimodal(
|
||||
D = bed
|
||||
E = cupboard
|
||||
|
||||
You'll answer with exactly one letter (A, B, C, D, or E)."""
|
||||
}, {
|
||||
"role":
|
||||
"user",
|
||||
"content": [{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url":
|
||||
"https://upload.wikimedia.org/wikipedia/commons/c/c6/Set_of_fourteen_side_chairs_MET_DP110780.jpg"
|
||||
}
|
||||
}, {
|
||||
"type": "text",
|
||||
"text": "A fine 19th century piece of furniture."
|
||||
}]
|
||||
}]
|
||||
|
||||
with vllm_runner(model_name="google/gemma-3-4b-it",
|
||||
runner="pooling",
|
||||
task="classify",
|
||||
convert="classify",
|
||||
load_format="auto",
|
||||
hf_overrides=update_config,
|
||||
pooler_config=PoolerConfig(pooling_type="LAST"),
|
||||
max_model_len=512,
|
||||
enforce_eager=True,
|
||||
tensor_parallel_size=1,
|
||||
disable_log_stats=True,
|
||||
dtype="bfloat16") as vllm_model:
|
||||
You'll answer with exactly one letter (A, B, C, D, or E).""",
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": "https://upload.wikimedia.org/wikipedia/commons/c/c6/Set_of_fourteen_side_chairs_MET_DP110780.jpg"
|
||||
},
|
||||
},
|
||||
{"type": "text", "text": "A fine 19th century piece of furniture."},
|
||||
],
|
||||
},
|
||||
]
|
||||
|
||||
with vllm_runner(
|
||||
model_name="google/gemma-3-4b-it",
|
||||
runner="pooling",
|
||||
task="classify",
|
||||
convert="classify",
|
||||
load_format="auto",
|
||||
hf_overrides=update_config,
|
||||
pooler_config=PoolerConfig(pooling_type="LAST"),
|
||||
max_model_len=512,
|
||||
enforce_eager=True,
|
||||
tensor_parallel_size=1,
|
||||
disable_log_stats=True,
|
||||
dtype="bfloat16",
|
||||
) as vllm_model:
|
||||
llm = vllm_model.get_llm()
|
||||
prompts = llm.preprocess_chat(messages)
|
||||
|
||||
result = llm.classify(prompts)
|
||||
assert result[0].outputs.probs[0] > 0.95
|
||||
assert all(c < 0.05 for c in result[0].outputs.probs[1:])
|
||||
assert all(c < 0.05 for c in result[0].outputs.probs[1:])
|
||||
|
||||
@@ -20,14 +20,15 @@ def test_classify_models(
|
||||
with vllm_runner(model, max_model_len=512, dtype=dtype) as vllm_model:
|
||||
vllm_outputs = vllm_model.classify(example_prompts)
|
||||
|
||||
with hf_runner(model,
|
||||
dtype=dtype,
|
||||
auto_cls=AutoModelForSequenceClassification) as hf_model:
|
||||
with hf_runner(
|
||||
model, dtype=dtype, auto_cls=AutoModelForSequenceClassification
|
||||
) as hf_model:
|
||||
hf_outputs = hf_model.classify(example_prompts)
|
||||
|
||||
for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
|
||||
hf_output = torch.tensor(hf_output)
|
||||
vllm_output = torch.tensor(vllm_output)
|
||||
|
||||
assert torch.allclose(hf_output, vllm_output,
|
||||
1e-3 if dtype == "float" else 1e-2)
|
||||
assert torch.allclose(
|
||||
hf_output, vllm_output, 1e-3 if dtype == "float" else 1e-2
|
||||
)
|
||||
|
||||
@@ -7,10 +7,10 @@ from ...utils import EmbedModelInfo
|
||||
|
||||
MODELS = [
|
||||
EmbedModelInfo("nomic-ai/nomic-embed-text-v1"),
|
||||
#EmbedModelInfo("nomic-ai/nomic-embed-text-v1.5"),
|
||||
#EmbedModelInfo("nomic-ai/CodeRankEmbed"),
|
||||
# EmbedModelInfo("nomic-ai/nomic-embed-text-v1.5"),
|
||||
# EmbedModelInfo("nomic-ai/CodeRankEmbed"),
|
||||
EmbedModelInfo("nomic-ai/nomic-embed-text-v2-moe"),
|
||||
#EmbedModelInfo("Snowflake/snowflake-arctic-embed-m-long"),
|
||||
# EmbedModelInfo("Snowflake/snowflake-arctic-embed-m-long"),
|
||||
]
|
||||
|
||||
rope_theta = 1000
|
||||
@@ -21,23 +21,24 @@ max_model_len = int(original_max_position_embeddings * factor)
|
||||
|
||||
@pytest.mark.parametrize("model_info", MODELS)
|
||||
def test_default(model_info, vllm_runner):
|
||||
with vllm_runner(model_info.name, runner="pooling",
|
||||
max_model_len=None) as vllm_model:
|
||||
with vllm_runner(
|
||||
model_info.name, runner="pooling", max_model_len=None
|
||||
) as vllm_model:
|
||||
model_config = vllm_model.llm.llm_engine.model_config
|
||||
if model_info.name == "nomic-ai/nomic-embed-text-v2-moe":
|
||||
# For nomic-embed-text-v2-moe the length is set to 512
|
||||
# by sentence_bert_config.json.
|
||||
assert model_config.max_model_len == 512
|
||||
else:
|
||||
assert (
|
||||
model_config.max_model_len == original_max_position_embeddings)
|
||||
assert model_config.max_model_len == original_max_position_embeddings
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", MODELS)
|
||||
def test_set_max_model_len_legal(model_info, vllm_runner):
|
||||
# set max_model_len <= 512
|
||||
with vllm_runner(model_info.name, runner="pooling",
|
||||
max_model_len=256) as vllm_model:
|
||||
with vllm_runner(
|
||||
model_info.name, runner="pooling", max_model_len=256
|
||||
) as vllm_model:
|
||||
model_config = vllm_model.llm.llm_engine.model_config
|
||||
assert model_config.max_model_len == 256
|
||||
|
||||
@@ -46,13 +47,12 @@ def test_set_max_model_len_legal(model_info, vllm_runner):
|
||||
# For nomic-embed-text-v2-moe the length is set to 512
|
||||
# by sentence_bert_config.json.
|
||||
with pytest.raises(ValueError):
|
||||
with vllm_runner(model_info.name,
|
||||
runner="pooling",
|
||||
max_model_len=1024):
|
||||
with vllm_runner(model_info.name, runner="pooling", max_model_len=1024):
|
||||
pass
|
||||
else:
|
||||
with vllm_runner(model_info.name, runner="pooling",
|
||||
max_model_len=1024) as vllm_model:
|
||||
with vllm_runner(
|
||||
model_info.name, runner="pooling", max_model_len=1024
|
||||
) as vllm_model:
|
||||
model_config = vllm_model.llm.llm_engine.model_config
|
||||
assert model_config.max_model_len == 1024
|
||||
|
||||
@@ -61,17 +61,18 @@ def test_set_max_model_len_legal(model_info, vllm_runner):
|
||||
def test_set_max_model_len_illegal(model_info, vllm_runner):
|
||||
# set max_model_len > 2048
|
||||
with pytest.raises(ValueError):
|
||||
with vllm_runner(model_info.name, runner="pooling",
|
||||
max_model_len=4096):
|
||||
with vllm_runner(model_info.name, runner="pooling", max_model_len=4096):
|
||||
pass
|
||||
|
||||
# set max_model_len > 2048 by hf_overrides
|
||||
hf_overrides = {"max_model_len": 4096}
|
||||
with pytest.raises(ValueError):
|
||||
with vllm_runner(model_info.name,
|
||||
runner="pooling",
|
||||
max_model_len=None,
|
||||
hf_overrides=hf_overrides):
|
||||
with vllm_runner(
|
||||
model_info.name,
|
||||
runner="pooling",
|
||||
max_model_len=None,
|
||||
hf_overrides=hf_overrides,
|
||||
):
|
||||
pass
|
||||
|
||||
|
||||
@@ -82,16 +83,14 @@ def test_use_rope_scaling_legal(model_info, vllm_runner):
|
||||
"rope_scaling": {
|
||||
"rope_type": "yarn",
|
||||
"factor": factor,
|
||||
"original_max_position_embeddings":
|
||||
original_max_position_embeddings
|
||||
"original_max_position_embeddings": original_max_position_embeddings,
|
||||
},
|
||||
"max_model_len": max_model_len
|
||||
"max_model_len": max_model_len,
|
||||
}
|
||||
|
||||
with vllm_runner(model_info.name,
|
||||
runner="pooling",
|
||||
max_model_len=None,
|
||||
hf_overrides=hf_overrides):
|
||||
with vllm_runner(
|
||||
model_info.name, runner="pooling", max_model_len=None, hf_overrides=hf_overrides
|
||||
):
|
||||
pass
|
||||
|
||||
|
||||
@@ -102,16 +101,17 @@ def test_use_rope_scaling_illegal(model_info, vllm_runner):
|
||||
"rope_scaling": {
|
||||
"rope_type": "yarn",
|
||||
"factor": factor,
|
||||
"original_max_position_embeddings":
|
||||
original_max_position_embeddings
|
||||
}
|
||||
"original_max_position_embeddings": original_max_position_embeddings,
|
||||
},
|
||||
}
|
||||
# illegal max_model_len
|
||||
with pytest.raises(ValueError):
|
||||
with vllm_runner(model_info.name,
|
||||
runner="pooling",
|
||||
max_model_len=max_model_len + 1,
|
||||
hf_overrides=hf_overrides):
|
||||
with vllm_runner(
|
||||
model_info.name,
|
||||
runner="pooling",
|
||||
max_model_len=max_model_len + 1,
|
||||
hf_overrides=hf_overrides,
|
||||
):
|
||||
pass
|
||||
|
||||
hf_overrides = {
|
||||
@@ -119,15 +119,16 @@ def test_use_rope_scaling_illegal(model_info, vllm_runner):
|
||||
"rope_scaling": {
|
||||
"rope_type": "yarn",
|
||||
"factor": factor,
|
||||
"original_max_position_embeddings":
|
||||
original_max_position_embeddings
|
||||
"original_max_position_embeddings": original_max_position_embeddings,
|
||||
},
|
||||
"max_model_len": max_model_len + 1
|
||||
"max_model_len": max_model_len + 1,
|
||||
}
|
||||
# illegal max_model_len by hf_overrides
|
||||
with pytest.raises(ValueError):
|
||||
with vllm_runner(model_info.name,
|
||||
runner="pooling",
|
||||
max_model_len=None,
|
||||
hf_overrides=hf_overrides):
|
||||
with vllm_runner(
|
||||
model_info.name,
|
||||
runner="pooling",
|
||||
max_model_len=None,
|
||||
hf_overrides=hf_overrides,
|
||||
):
|
||||
pass
|
||||
|
||||
@@ -10,10 +10,7 @@ from vllm.config import PoolerConfig
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"model",
|
||||
[
|
||||
"jason9693/Qwen2.5-1.5B-apeach",
|
||||
"papluca/xlm-roberta-base-language-detection"
|
||||
],
|
||||
["jason9693/Qwen2.5-1.5B-apeach", "papluca/xlm-roberta-base-language-detection"],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
def test_classify_models_using_activation(
|
||||
@@ -23,30 +20,32 @@ def test_classify_models_using_activation(
|
||||
model: str,
|
||||
dtype: str,
|
||||
) -> None:
|
||||
|
||||
with vllm_runner(
|
||||
model,
|
||||
max_model_len=512,
|
||||
dtype=dtype,
|
||||
pooler_config=PoolerConfig(activation=False)) as vllm_model:
|
||||
model,
|
||||
max_model_len=512,
|
||||
dtype=dtype,
|
||||
pooler_config=PoolerConfig(activation=False),
|
||||
) as vllm_model:
|
||||
wo_activation_out = vllm_model.classify(example_prompts)
|
||||
|
||||
with vllm_runner(
|
||||
model,
|
||||
max_model_len=512,
|
||||
dtype=dtype,
|
||||
pooler_config=PoolerConfig(activation=True)) as vllm_model:
|
||||
model,
|
||||
max_model_len=512,
|
||||
dtype=dtype,
|
||||
pooler_config=PoolerConfig(activation=True),
|
||||
) as vllm_model:
|
||||
w_activation_out = vllm_model.classify(example_prompts)
|
||||
|
||||
for wo_activation, w_activation in zip(wo_activation_out,
|
||||
w_activation_out):
|
||||
for wo_activation, w_activation in zip(wo_activation_out, w_activation_out):
|
||||
wo_activation = torch.tensor(wo_activation)
|
||||
w_activation = torch.tensor(w_activation)
|
||||
|
||||
assert not torch.allclose(wo_activation, w_activation,
|
||||
atol=1e-2), "pooler_config is not working"
|
||||
assert torch.allclose(softmax(wo_activation), w_activation,
|
||||
1e-3 if dtype == "float" else 1e-2)
|
||||
assert not torch.allclose(wo_activation, w_activation, atol=1e-2), (
|
||||
"pooler_config is not working"
|
||||
)
|
||||
assert torch.allclose(
|
||||
softmax(wo_activation), w_activation, 1e-3 if dtype == "float" else 1e-2
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@@ -63,26 +62,28 @@ def test_embed_models_using_normalize(
|
||||
model: str,
|
||||
dtype: str,
|
||||
) -> None:
|
||||
|
||||
with vllm_runner(
|
||||
model,
|
||||
max_model_len=512,
|
||||
dtype=dtype,
|
||||
pooler_config=PoolerConfig(normalize=False)) as vllm_model:
|
||||
model,
|
||||
max_model_len=512,
|
||||
dtype=dtype,
|
||||
pooler_config=PoolerConfig(normalize=False),
|
||||
) as vllm_model:
|
||||
wo_normalize = torch.tensor(vllm_model.embed(example_prompts))
|
||||
|
||||
with vllm_runner(model,
|
||||
max_model_len=512,
|
||||
dtype=dtype,
|
||||
pooler_config=PoolerConfig(normalize=True)) as vllm_model:
|
||||
with vllm_runner(
|
||||
model,
|
||||
max_model_len=512,
|
||||
dtype=dtype,
|
||||
pooler_config=PoolerConfig(normalize=True),
|
||||
) as vllm_model:
|
||||
w_normalize = torch.tensor(vllm_model.embed(example_prompts))
|
||||
|
||||
assert not torch.allclose(
|
||||
wo_normalize, w_normalize,
|
||||
atol=1e-2), "pooler_config normalize is not working"
|
||||
assert not torch.allclose(wo_normalize, w_normalize, atol=1e-2), (
|
||||
"pooler_config normalize is not working"
|
||||
)
|
||||
assert torch.allclose(
|
||||
F.normalize(wo_normalize, p=2, dim=-1), w_normalize,
|
||||
atol=1e-2), "w_normal should be close to normal(wo_normal)."
|
||||
F.normalize(wo_normalize, p=2, dim=-1), w_normalize, atol=1e-2
|
||||
), "w_normal should be close to normal(wo_normal)."
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@@ -99,25 +100,26 @@ def test_reward_models_using_softmax(
|
||||
model: str,
|
||||
dtype: str,
|
||||
) -> None:
|
||||
|
||||
with vllm_runner(model,
|
||||
max_model_len=1024,
|
||||
dtype=dtype,
|
||||
pooler_config=PoolerConfig(softmax=False)) as vllm_model:
|
||||
with vllm_runner(
|
||||
model,
|
||||
max_model_len=1024,
|
||||
dtype=dtype,
|
||||
pooler_config=PoolerConfig(softmax=False),
|
||||
) as vllm_model:
|
||||
wo_softmax = vllm_model.encode(example_prompts)
|
||||
|
||||
with vllm_runner(model,
|
||||
max_model_len=1024,
|
||||
dtype=dtype,
|
||||
pooler_config=PoolerConfig(softmax=True)) as vllm_model:
|
||||
with vllm_runner(
|
||||
model, max_model_len=1024, dtype=dtype, pooler_config=PoolerConfig(softmax=True)
|
||||
) as vllm_model:
|
||||
w_softmax = vllm_model.encode(example_prompts)
|
||||
|
||||
for wo, w in zip(wo_softmax, w_softmax):
|
||||
wo = torch.tensor(wo)
|
||||
w = torch.tensor(w)
|
||||
|
||||
assert not torch.allclose(
|
||||
wo, w, atol=1e-2), "pooler_config softmax is not working"
|
||||
assert torch.allclose(
|
||||
softmax(wo), w,
|
||||
atol=1e-2), "w_softmax should be close to softmax(wo_softmax)."
|
||||
assert not torch.allclose(wo, w, atol=1e-2), (
|
||||
"pooler_config softmax is not working"
|
||||
)
|
||||
assert torch.allclose(softmax(wo), w, atol=1e-2), (
|
||||
"w_softmax should be close to softmax(wo_softmax)."
|
||||
)
|
||||
|
||||
@@ -16,10 +16,8 @@ from ...utils import check_transformers_version
|
||||
def math_step_prompts():
|
||||
# ruff: noqa: E501
|
||||
data = {
|
||||
"system":
|
||||
"Please reason step by step, and put your final answer within \\boxed{}. ",
|
||||
"query":
|
||||
"Sue lives in a fun neighborhood. One weekend, the neighbors decided to play a prank on Sue. On Friday morning, the neighbors placed 18 pink plastic flamingos out on Sue's front yard. On Saturday morning, the neighbors took back one third of the flamingos, painted them white, and put these newly painted white flamingos back out on Sue's front yard. Then, on Sunday morning, they added another 18 pink plastic flamingos to the collection. At noon on Sunday, how many more pink plastic flamingos were out than white plastic flamingos?",
|
||||
"system": "Please reason step by step, and put your final answer within \\boxed{}. ",
|
||||
"query": "Sue lives in a fun neighborhood. One weekend, the neighbors decided to play a prank on Sue. On Friday morning, the neighbors placed 18 pink plastic flamingos out on Sue's front yard. On Saturday morning, the neighbors took back one third of the flamingos, painted them white, and put these newly painted white flamingos back out on Sue's front yard. Then, on Sunday morning, they added another 18 pink plastic flamingos to the collection. At noon on Sunday, how many more pink plastic flamingos were out than white plastic flamingos?",
|
||||
"response": [
|
||||
"To find out how many more pink plastic flamingos were out than white plastic flamingos at noon on Sunday, we can break down the problem into steps. First, on Friday, the neighbors start with 18 pink plastic flamingos.",
|
||||
"On Saturday, they take back one third of the flamingos. Since there were 18 flamingos, (1/3 \\times 18 = 6) flamingos are taken back. So, they have (18 - 6 = 12) flamingos left in their possession. Then, they paint these 6 flamingos white and put them back out on Sue's front yard. Now, Sue has the original 12 pink flamingos plus the 6 new white ones. Thus, by the end of Saturday, Sue has (12 + 6 = 18) pink flamingos and 6 white flamingos.",
|
||||
@@ -27,16 +25,16 @@ def math_step_prompts():
|
||||
"To find the difference, subtract the number of white flamingos from the number of pink flamingos: (36 - 6 = 30). Therefore, at noon on Sunday, there were 30 more pink plastic flamingos out than white plastic flamingos. The answer is (\\boxed{30}).",
|
||||
],
|
||||
}
|
||||
answer = "<extra_0>".join(data['response']) + "<extra_0>"
|
||||
answer = "<extra_0>".join(data["response"]) + "<extra_0>"
|
||||
prompt = f"<im_start>system\n{data['system']}<im_end>\n<im_start>user\n{data['query']}<im_end>\n<im_start>assistant\n{answer}<im_end><|endoftext|>"
|
||||
return [prompt]
|
||||
|
||||
|
||||
def step_reward_patch_hf_model(hf_model: HfRunner):
|
||||
|
||||
# Patch the hf_runner to use the step reward function
|
||||
def make_step_rewards(logits: torch.Tensor,
|
||||
token_masks: torch.Tensor) -> list[list[float]]:
|
||||
def make_step_rewards(
|
||||
logits: torch.Tensor, token_masks: torch.Tensor
|
||||
) -> list[list[float]]:
|
||||
probabilities = F.softmax(logits, dim=-1)
|
||||
probabilities = probabilities * token_masks.unsqueeze(-1)
|
||||
|
||||
@@ -54,7 +52,7 @@ def step_reward_patch_hf_model(hf_model: HfRunner):
|
||||
outputs = hf_model.model(input_ids=input_ids)
|
||||
|
||||
step_sep_id = hf_model.tokenizer.encode("<extra_0>")[0]
|
||||
token_masks = (input_ids == step_sep_id)
|
||||
token_masks = input_ids == step_sep_id
|
||||
return make_step_rewards(outputs[0], token_masks)
|
||||
|
||||
hf_model.reward = reward # type: ignore[attr-defined]
|
||||
@@ -65,8 +63,10 @@ def step_reward_patch_hf_model(hf_model: HfRunner):
|
||||
@pytest.mark.parametrize(
|
||||
"model",
|
||||
[
|
||||
pytest.param("Qwen/Qwen2.5-Math-PRM-7B",
|
||||
marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
|
||||
pytest.param(
|
||||
"Qwen/Qwen2.5-Math-PRM-7B",
|
||||
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
|
||||
),
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
@@ -78,8 +78,9 @@ def test_prm_models(
|
||||
dtype: str,
|
||||
monkeypatch,
|
||||
) -> None:
|
||||
check_transformers_version("Qwen/Qwen2.5-Math-PRM-7B",
|
||||
max_transformers_version="4.53.2")
|
||||
check_transformers_version(
|
||||
"Qwen/Qwen2.5-Math-PRM-7B", max_transformers_version="4.53.2"
|
||||
)
|
||||
|
||||
if current_platform.is_cpu():
|
||||
pytest.skip("CPU only supports V1")
|
||||
|
||||
@@ -37,10 +37,9 @@ def test_cross_encoder_1_to_1(vllm_runner, hf_runner, model_name):
|
||||
with hf_runner(model_name, dtype=DTYPE, is_cross_encoder=True) as hf_model:
|
||||
hf_outputs = hf_model.predict([text_pair]).tolist()
|
||||
|
||||
with vllm_runner(model_name,
|
||||
runner="pooling",
|
||||
dtype=DTYPE,
|
||||
max_model_len=None) as vllm_model:
|
||||
with vllm_runner(
|
||||
model_name, runner="pooling", dtype=DTYPE, max_model_len=None
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.score(text_pair[0], text_pair[1])
|
||||
|
||||
assert len(vllm_outputs) == 1
|
||||
@@ -58,10 +57,9 @@ def test_cross_encoder_1_to_N(vllm_runner, hf_runner, model_name):
|
||||
with hf_runner(model_name, dtype=DTYPE, is_cross_encoder=True) as hf_model:
|
||||
hf_outputs = hf_model.predict(text_pairs).tolist()
|
||||
|
||||
with vllm_runner(model_name,
|
||||
runner="pooling",
|
||||
dtype=DTYPE,
|
||||
max_model_len=None) as vllm_model:
|
||||
with vllm_runner(
|
||||
model_name, runner="pooling", dtype=DTYPE, max_model_len=None
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.score(TEXTS_1[0], TEXTS_2)
|
||||
|
||||
assert len(vllm_outputs) == 2
|
||||
@@ -80,10 +78,9 @@ def test_cross_encoder_N_to_N(vllm_runner, hf_runner, model_name):
|
||||
with hf_runner(model_name, dtype=DTYPE, is_cross_encoder=True) as hf_model:
|
||||
hf_outputs = hf_model.predict(text_pairs).tolist()
|
||||
|
||||
with vllm_runner(model_name,
|
||||
runner="pooling",
|
||||
dtype=DTYPE,
|
||||
max_model_len=None) as vllm_model:
|
||||
with vllm_runner(
|
||||
model_name, runner="pooling", dtype=DTYPE, max_model_len=None
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.score(TEXTS_1, TEXTS_2)
|
||||
|
||||
assert len(vllm_outputs) == 2
|
||||
@@ -101,17 +98,15 @@ def emb_model_name(request):
|
||||
def test_embedding_1_to_1(vllm_runner, hf_runner, emb_model_name):
|
||||
text_pair = [TEXTS_1[0], TEXTS_2[0]]
|
||||
|
||||
with hf_runner(emb_model_name, dtype=DTYPE,
|
||||
is_sentence_transformer=True) as hf_model:
|
||||
with hf_runner(
|
||||
emb_model_name, dtype=DTYPE, is_sentence_transformer=True
|
||||
) as hf_model:
|
||||
hf_embeddings = hf_model.encode(text_pair)
|
||||
hf_outputs = [
|
||||
F.cosine_similarity(*map(torch.tensor, hf_embeddings), dim=0)
|
||||
]
|
||||
hf_outputs = [F.cosine_similarity(*map(torch.tensor, hf_embeddings), dim=0)]
|
||||
|
||||
with vllm_runner(emb_model_name,
|
||||
runner="pooling",
|
||||
dtype=DTYPE,
|
||||
max_model_len=None) as vllm_model:
|
||||
with vllm_runner(
|
||||
emb_model_name, runner="pooling", dtype=DTYPE, max_model_len=None
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.score(text_pair[0], text_pair[1])
|
||||
|
||||
assert len(vllm_outputs) == 1
|
||||
@@ -126,20 +121,18 @@ def test_embedding_1_to_N(vllm_runner, hf_runner, emb_model_name):
|
||||
[TEXTS_1[0], TEXTS_2[1]],
|
||||
]
|
||||
|
||||
with hf_runner(emb_model_name, dtype=DTYPE,
|
||||
is_sentence_transformer=True) as hf_model:
|
||||
hf_embeddings = [
|
||||
hf_model.encode(text_pair) for text_pair in text_pairs
|
||||
]
|
||||
with hf_runner(
|
||||
emb_model_name, dtype=DTYPE, is_sentence_transformer=True
|
||||
) as hf_model:
|
||||
hf_embeddings = [hf_model.encode(text_pair) for text_pair in text_pairs]
|
||||
hf_outputs = [
|
||||
F.cosine_similarity(*map(torch.tensor, pair), dim=0)
|
||||
for pair in hf_embeddings
|
||||
]
|
||||
|
||||
with vllm_runner(emb_model_name,
|
||||
runner="pooling",
|
||||
dtype=DTYPE,
|
||||
max_model_len=None) as vllm_model:
|
||||
with vllm_runner(
|
||||
emb_model_name, runner="pooling", dtype=DTYPE, max_model_len=None
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.score(TEXTS_1[0], TEXTS_2)
|
||||
|
||||
assert len(vllm_outputs) == 2
|
||||
@@ -155,20 +148,18 @@ def test_embedding_N_to_N(vllm_runner, hf_runner, emb_model_name):
|
||||
[TEXTS_1[1], TEXTS_2[1]],
|
||||
]
|
||||
|
||||
with hf_runner(emb_model_name, dtype=DTYPE,
|
||||
is_sentence_transformer=True) as hf_model:
|
||||
hf_embeddings = [
|
||||
hf_model.encode(text_pair) for text_pair in text_pairs
|
||||
]
|
||||
with hf_runner(
|
||||
emb_model_name, dtype=DTYPE, is_sentence_transformer=True
|
||||
) as hf_model:
|
||||
hf_embeddings = [hf_model.encode(text_pair) for text_pair in text_pairs]
|
||||
hf_outputs = [
|
||||
F.cosine_similarity(*map(torch.tensor, pair), dim=0)
|
||||
for pair in hf_embeddings
|
||||
]
|
||||
|
||||
with vllm_runner(emb_model_name,
|
||||
runner="pooling",
|
||||
dtype=DTYPE,
|
||||
max_model_len=None) as vllm_model:
|
||||
with vllm_runner(
|
||||
emb_model_name, runner="pooling", dtype=DTYPE, max_model_len=None
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.score(TEXTS_1, TEXTS_2)
|
||||
|
||||
assert len(vllm_outputs) == 2
|
||||
|
||||
@@ -21,9 +21,9 @@ def test_models(
|
||||
with vllm_runner(model, max_model_len=None, dtype=dtype) as vllm_model:
|
||||
vllm_outputs = vllm_model.encode(example_prompts)
|
||||
|
||||
with hf_runner(model,
|
||||
dtype=dtype,
|
||||
auto_cls=AutoModelForTokenClassification) as hf_model:
|
||||
with hf_runner(
|
||||
model, dtype=dtype, auto_cls=AutoModelForTokenClassification
|
||||
) as hf_model:
|
||||
tokenizer = hf_model.tokenizer
|
||||
hf_outputs = []
|
||||
for prompt in example_prompts:
|
||||
|
||||
@@ -20,51 +20,57 @@ calculus, each contributing unique perspectives that would shape this new
|
||||
field."""
|
||||
|
||||
|
||||
def test_smaller_truncation_size(vllm_runner,
|
||||
model_name=MODEL_NAME,
|
||||
input_str=input_str):
|
||||
|
||||
def test_smaller_truncation_size(
|
||||
vllm_runner, model_name=MODEL_NAME, input_str=input_str
|
||||
):
|
||||
truncate_prompt_tokens = 10
|
||||
|
||||
with vllm_runner(model_name, runner="pooling",
|
||||
max_model_len=max_model_len) as vllm_model:
|
||||
with vllm_runner(
|
||||
model_name, runner="pooling", max_model_len=max_model_len
|
||||
) as vllm_model:
|
||||
vllm_output = vllm_model.llm.embed(
|
||||
input_str, truncate_prompt_tokens=truncate_prompt_tokens)
|
||||
input_str, truncate_prompt_tokens=truncate_prompt_tokens
|
||||
)
|
||||
|
||||
prompt_tokens = vllm_output[0].prompt_token_ids
|
||||
|
||||
assert len(prompt_tokens) == truncate_prompt_tokens
|
||||
|
||||
|
||||
def test_max_truncation_size(vllm_runner,
|
||||
model_name=MODEL_NAME,
|
||||
input_str=input_str):
|
||||
def test_max_truncation_size(vllm_runner, model_name=MODEL_NAME, input_str=input_str):
|
||||
truncate_prompt_tokens = -1
|
||||
|
||||
with vllm_runner(model_name, runner="pooling",
|
||||
max_model_len=max_model_len) as vllm_model:
|
||||
with vllm_runner(
|
||||
model_name, runner="pooling", max_model_len=max_model_len
|
||||
) as vllm_model:
|
||||
vllm_output = vllm_model.llm.embed(
|
||||
input_str, truncate_prompt_tokens=truncate_prompt_tokens)
|
||||
input_str, truncate_prompt_tokens=truncate_prompt_tokens
|
||||
)
|
||||
|
||||
prompt_tokens = vllm_output[0].prompt_token_ids
|
||||
|
||||
assert len(prompt_tokens) == max_model_len
|
||||
|
||||
|
||||
def test_bigger_truncation_size(vllm_runner,
|
||||
model_name=MODEL_NAME,
|
||||
input_str=input_str):
|
||||
|
||||
def test_bigger_truncation_size(
|
||||
vllm_runner, model_name=MODEL_NAME, input_str=input_str
|
||||
):
|
||||
truncate_prompt_tokens = max_model_len + 1
|
||||
|
||||
with pytest.raises(ValueError), vllm_runner(
|
||||
model_name, runner="pooling",
|
||||
max_model_len=max_model_len) as vllm_model:
|
||||
|
||||
with (
|
||||
pytest.raises(ValueError),
|
||||
vllm_runner(
|
||||
model_name, runner="pooling", max_model_len=max_model_len
|
||||
) as vllm_model,
|
||||
):
|
||||
llm_output = vllm_model.llm.embed(
|
||||
input_str, truncate_prompt_tokens=truncate_prompt_tokens)
|
||||
input_str, truncate_prompt_tokens=truncate_prompt_tokens
|
||||
)
|
||||
|
||||
assert llm_output == f"""truncate_prompt_tokens value
|
||||
assert (
|
||||
llm_output
|
||||
== f"""truncate_prompt_tokens value
|
||||
({truncate_prompt_tokens}) is greater than
|
||||
max_model_len ({max_model_len}). Please, select
|
||||
a smaller truncation size."""
|
||||
)
|
||||
|
||||
@@ -12,8 +12,7 @@ import requests
|
||||
import torch
|
||||
|
||||
import tests.ci_envs as ci_envs
|
||||
from tests.models.utils import (EmbedModelInfo, RerankModelInfo,
|
||||
check_embeddings_close)
|
||||
from tests.models.utils import EmbedModelInfo, RerankModelInfo, check_embeddings_close
|
||||
|
||||
# Most embedding models on the STS12 task (See #17175):
|
||||
# - Model implementation and minor changes in tensor dtype
|
||||
@@ -30,7 +29,6 @@ MTEB_RERANK_TOL = 2e-3
|
||||
|
||||
|
||||
class VllmMtebEncoder(mteb.Encoder):
|
||||
|
||||
def __init__(self, vllm_model):
|
||||
super().__init__()
|
||||
self.llm = vllm_model
|
||||
@@ -53,8 +51,7 @@ class VllmMtebEncoder(mteb.Encoder):
|
||||
|
||||
def predict(
|
||||
self,
|
||||
sentences: list[tuple[str, str,
|
||||
Optional[str]]], # query, corpus, prompt
|
||||
sentences: list[tuple[str, str, Optional[str]]], # query, corpus, prompt
|
||||
*args,
|
||||
**kwargs,
|
||||
) -> np.ndarray:
|
||||
@@ -64,17 +61,15 @@ class VllmMtebEncoder(mteb.Encoder):
|
||||
queries = [s[0] for s in sentences]
|
||||
corpus = [s[1] for s in sentences]
|
||||
|
||||
outputs = self.llm.score(queries,
|
||||
corpus,
|
||||
truncate_prompt_tokens=-1,
|
||||
use_tqdm=False)
|
||||
outputs = self.llm.score(
|
||||
queries, corpus, truncate_prompt_tokens=-1, use_tqdm=False
|
||||
)
|
||||
scores = np.array(outputs)
|
||||
scores = scores[np.argsort(r)]
|
||||
return scores
|
||||
|
||||
|
||||
class OpenAIClientMtebEncoder(mteb.Encoder):
|
||||
|
||||
def __init__(self, model_name: str, client):
|
||||
super().__init__()
|
||||
self.model_name = model_name
|
||||
@@ -87,8 +82,9 @@ class OpenAIClientMtebEncoder(mteb.Encoder):
|
||||
r = self.rng.permutation(len(sentences))
|
||||
sentences = [sentences[i] for i in r]
|
||||
|
||||
embeddings = self.client.embeddings.create(model=self.model_name,
|
||||
input=sentences)
|
||||
embeddings = self.client.embeddings.create(
|
||||
model=self.model_name, input=sentences
|
||||
)
|
||||
outputs = [d.embedding for d in embeddings.data]
|
||||
embeds = np.array(outputs)
|
||||
embeds = embeds[np.argsort(r)]
|
||||
@@ -96,7 +92,6 @@ class OpenAIClientMtebEncoder(mteb.Encoder):
|
||||
|
||||
|
||||
class ScoreClientMtebEncoder(mteb.Encoder):
|
||||
|
||||
def __init__(self, model_name: str, url):
|
||||
super().__init__()
|
||||
self.model_name = model_name
|
||||
@@ -105,8 +100,7 @@ class ScoreClientMtebEncoder(mteb.Encoder):
|
||||
|
||||
def predict(
|
||||
self,
|
||||
sentences: list[tuple[str, str,
|
||||
Optional[str]]], # query, corpus, prompt
|
||||
sentences: list[tuple[str, str, Optional[str]]], # query, corpus, prompt
|
||||
*args,
|
||||
**kwargs,
|
||||
) -> np.ndarray:
|
||||
@@ -122,27 +116,30 @@ class ScoreClientMtebEncoder(mteb.Encoder):
|
||||
return scores
|
||||
|
||||
def get_score(self, query, corpus):
|
||||
response = requests.post(self.url,
|
||||
json={
|
||||
"model": self.model_name,
|
||||
"text_1": query,
|
||||
"text_2": corpus,
|
||||
"truncate_prompt_tokens": -1,
|
||||
}).json()
|
||||
return response['data'][0]["score"]
|
||||
response = requests.post(
|
||||
self.url,
|
||||
json={
|
||||
"model": self.model_name,
|
||||
"text_1": query,
|
||||
"text_2": corpus,
|
||||
"truncate_prompt_tokens": -1,
|
||||
},
|
||||
).json()
|
||||
return response["data"][0]["score"]
|
||||
|
||||
|
||||
class RerankClientMtebEncoder(ScoreClientMtebEncoder):
|
||||
|
||||
def get_score(self, query, corpus):
|
||||
response = requests.post(self.url,
|
||||
json={
|
||||
"model": self.model_name,
|
||||
"query": query,
|
||||
"documents": [corpus],
|
||||
"truncate_prompt_tokens": -1,
|
||||
}).json()
|
||||
return response['results'][0]["relevance_score"]
|
||||
response = requests.post(
|
||||
self.url,
|
||||
json={
|
||||
"model": self.model_name,
|
||||
"query": query,
|
||||
"documents": [corpus],
|
||||
"truncate_prompt_tokens": -1,
|
||||
},
|
||||
).json()
|
||||
return response["results"][0]["relevance_score"]
|
||||
|
||||
|
||||
def run_mteb_embed_task(encoder, tasks):
|
||||
@@ -161,12 +158,14 @@ def run_mteb_embed_task(encoder, tasks):
|
||||
return main_score
|
||||
|
||||
|
||||
def mteb_test_embed_models(hf_runner,
|
||||
vllm_runner,
|
||||
model_info: EmbedModelInfo,
|
||||
vllm_extra_kwargs=None,
|
||||
hf_model_callback=None,
|
||||
atol=MTEB_EMBED_TOL):
|
||||
def mteb_test_embed_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
model_info: EmbedModelInfo,
|
||||
vllm_extra_kwargs=None,
|
||||
hf_model_callback=None,
|
||||
atol=MTEB_EMBED_TOL,
|
||||
):
|
||||
# A model family has many models with the same architecture,
|
||||
# and we don't need to test each one.
|
||||
if not ci_envs.VLLM_CI_NO_SKIP and not model_info.enable_test:
|
||||
@@ -187,15 +186,15 @@ def mteb_test_embed_models(hf_runner,
|
||||
if ci_envs.VLLM_CI_HEAD_DTYPE is not None:
|
||||
if "hf_overrides" not in vllm_extra_kwargs:
|
||||
vllm_extra_kwargs["hf_overrides"] = {}
|
||||
vllm_extra_kwargs["hf_overrides"][
|
||||
"head_dtype"] = ci_envs.VLLM_CI_HEAD_DTYPE
|
||||
|
||||
with vllm_runner(model_info.name,
|
||||
runner="pooling",
|
||||
max_model_len=None,
|
||||
enforce_eager=True,
|
||||
**vllm_extra_kwargs) as vllm_model:
|
||||
vllm_extra_kwargs["hf_overrides"]["head_dtype"] = ci_envs.VLLM_CI_HEAD_DTYPE
|
||||
|
||||
with vllm_runner(
|
||||
model_info.name,
|
||||
runner="pooling",
|
||||
max_model_len=None,
|
||||
enforce_eager=True,
|
||||
**vllm_extra_kwargs,
|
||||
) as vllm_model:
|
||||
model_config = vllm_model.llm.llm_engine.model_config
|
||||
|
||||
# Confirm whether vllm is using the correct architecture
|
||||
@@ -204,28 +203,29 @@ def mteb_test_embed_models(hf_runner,
|
||||
|
||||
# Confirm whether vllm uses the correct default_pooling_type, which
|
||||
# relates to whether chunked prefill and prefix caching are enabled
|
||||
assert (model_config._model_info.default_pooling_type ==
|
||||
model_info.default_pooling_type)
|
||||
assert (
|
||||
model_config._model_info.default_pooling_type
|
||||
== model_info.default_pooling_type
|
||||
)
|
||||
|
||||
vllm_main_score = run_mteb_embed_task(VllmMtebEncoder(vllm_model),
|
||||
MTEB_EMBED_TASKS)
|
||||
vllm_main_score = run_mteb_embed_task(
|
||||
VllmMtebEncoder(vllm_model), MTEB_EMBED_TASKS
|
||||
)
|
||||
vllm_dtype = vllm_model.llm.llm_engine.model_config.dtype
|
||||
head_dtype = model_config.head_dtype
|
||||
|
||||
# Test embed_dims, isnan and whether to use normalize
|
||||
vllm_outputs = vllm_model.embed(example_prompts,
|
||||
truncate_prompt_tokens=-1)
|
||||
vllm_outputs = vllm_model.embed(example_prompts, truncate_prompt_tokens=-1)
|
||||
assert not torch.any(torch.isnan(torch.tensor(vllm_outputs)))
|
||||
|
||||
# Accelerate mteb test by setting
|
||||
# SentenceTransformers mteb score to a constant
|
||||
if model_info.mteb_score is None:
|
||||
with hf_runner(
|
||||
model_info.name,
|
||||
is_sentence_transformer=True,
|
||||
dtype=ci_envs.VLLM_CI_HF_DTYPE or model_info.hf_dtype,
|
||||
model_info.name,
|
||||
is_sentence_transformer=True,
|
||||
dtype=ci_envs.VLLM_CI_HF_DTYPE or model_info.hf_dtype,
|
||||
) as hf_model:
|
||||
|
||||
# e.g. setting default parameters for the encode method of hf_runner
|
||||
if hf_model_callback is not None:
|
||||
hf_model_callback(hf_model)
|
||||
@@ -247,8 +247,7 @@ def mteb_test_embed_models(hf_runner,
|
||||
st_dtype = "Constant"
|
||||
|
||||
print("Model:", model_info.name)
|
||||
print("VLLM:", f"dtype:{vllm_dtype}", f"head_dtype:{head_dtype}",
|
||||
vllm_main_score)
|
||||
print("VLLM:", f"dtype:{vllm_dtype}", f"head_dtype:{head_dtype}", vllm_main_score)
|
||||
print("SentenceTransformers:", st_dtype, st_main_score)
|
||||
print("Difference:", st_main_score - vllm_main_score)
|
||||
|
||||
@@ -282,26 +281,21 @@ def run_mteb_rerank(cross_encoder, tasks, languages):
|
||||
top_k=10,
|
||||
save_predictions=True,
|
||||
output_folder=f"{results_folder}/stage2",
|
||||
previous_results=
|
||||
f"{results_folder}/stage1/NFCorpus_{subset}_predictions.json",
|
||||
previous_results=f"{results_folder}/stage1/NFCorpus_{subset}_predictions.json",
|
||||
encode_kwargs={"show_progress_bar": False},
|
||||
)
|
||||
main_score = results[0].scores["test"][0]["main_score"]
|
||||
return main_score
|
||||
|
||||
|
||||
def mteb_test_rerank_models_hf(hf_runner,
|
||||
model_name,
|
||||
hf_dtype="float32",
|
||||
hf_model_callback=None):
|
||||
with hf_runner(model_name, is_cross_encoder=True,
|
||||
dtype=hf_dtype) as hf_model:
|
||||
|
||||
def mteb_test_rerank_models_hf(
|
||||
hf_runner, model_name, hf_dtype="float32", hf_model_callback=None
|
||||
):
|
||||
with hf_runner(model_name, is_cross_encoder=True, dtype=hf_dtype) as hf_model:
|
||||
original_predict = hf_model.predict
|
||||
|
||||
def _predict(
|
||||
sentences: list[tuple[str, str,
|
||||
Optional[str]]], # query, corpus, prompt
|
||||
sentences: list[tuple[str, str, Optional[str]]], # query, corpus, prompt
|
||||
*args,
|
||||
**kwargs,
|
||||
):
|
||||
@@ -315,20 +309,22 @@ def mteb_test_rerank_models_hf(hf_runner,
|
||||
if hf_model_callback is not None:
|
||||
hf_model_callback(hf_model)
|
||||
|
||||
st_main_score = run_mteb_rerank(hf_model,
|
||||
tasks=MTEB_RERANK_TASKS,
|
||||
languages=MTEB_RERANK_LANGS)
|
||||
st_main_score = run_mteb_rerank(
|
||||
hf_model, tasks=MTEB_RERANK_TASKS, languages=MTEB_RERANK_LANGS
|
||||
)
|
||||
st_dtype = next(hf_model.model.model.parameters()).dtype
|
||||
return st_main_score, st_dtype
|
||||
|
||||
|
||||
def mteb_test_rerank_models(hf_runner,
|
||||
vllm_runner,
|
||||
model_info: RerankModelInfo,
|
||||
vllm_extra_kwargs=None,
|
||||
hf_model_callback=None,
|
||||
vllm_mteb_encoder=VllmMtebEncoder,
|
||||
atol=MTEB_RERANK_TOL):
|
||||
def mteb_test_rerank_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
model_info: RerankModelInfo,
|
||||
vllm_extra_kwargs=None,
|
||||
hf_model_callback=None,
|
||||
vllm_mteb_encoder=VllmMtebEncoder,
|
||||
atol=MTEB_RERANK_TOL,
|
||||
):
|
||||
# A model family has many models with the same architecture,
|
||||
# and we don't need to test each one.
|
||||
if not ci_envs.VLLM_CI_NO_SKIP and not model_info.enable_test:
|
||||
@@ -346,33 +342,37 @@ def mteb_test_rerank_models(hf_runner,
|
||||
if ci_envs.VLLM_CI_HEAD_DTYPE is not None:
|
||||
if "hf_overrides" not in vllm_extra_kwargs:
|
||||
vllm_extra_kwargs["hf_overrides"] = {}
|
||||
vllm_extra_kwargs["hf_overrides"][
|
||||
"head_dtype"] = ci_envs.VLLM_CI_HEAD_DTYPE
|
||||
|
||||
with vllm_runner(model_info.name,
|
||||
runner="pooling",
|
||||
max_model_len=None,
|
||||
max_num_seqs=8,
|
||||
enforce_eager=True,
|
||||
**vllm_extra_kwargs) as vllm_model:
|
||||
vllm_extra_kwargs["hf_overrides"]["head_dtype"] = ci_envs.VLLM_CI_HEAD_DTYPE
|
||||
|
||||
with vllm_runner(
|
||||
model_info.name,
|
||||
runner="pooling",
|
||||
max_model_len=None,
|
||||
max_num_seqs=8,
|
||||
enforce_eager=True,
|
||||
**vllm_extra_kwargs,
|
||||
) as vllm_model:
|
||||
model_config = vllm_model.llm.llm_engine.model_config
|
||||
|
||||
# Confirm whether vllm is using the correct architecture
|
||||
if model_info.architecture:
|
||||
assert (model_info.architecture in model_config.architectures)
|
||||
assert model_info.architecture in model_config.architectures
|
||||
|
||||
# Score API is only enabled for num_labels == 1
|
||||
assert model_config.hf_config.num_labels == 1
|
||||
|
||||
# Confirm whether vllm uses the correct default_pooling_type, which
|
||||
# relates to whether chunked prefill and prefix caching are enabled
|
||||
assert (model_config._model_info.default_pooling_type ==
|
||||
model_info.default_pooling_type)
|
||||
assert (
|
||||
model_config._model_info.default_pooling_type
|
||||
== model_info.default_pooling_type
|
||||
)
|
||||
|
||||
vllm_main_score = run_mteb_rerank(vllm_mteb_encoder(vllm_model),
|
||||
tasks=MTEB_RERANK_TASKS,
|
||||
languages=MTEB_RERANK_LANGS)
|
||||
vllm_main_score = run_mteb_rerank(
|
||||
vllm_mteb_encoder(vllm_model),
|
||||
tasks=MTEB_RERANK_TASKS,
|
||||
languages=MTEB_RERANK_LANGS,
|
||||
)
|
||||
vllm_dtype = model_config.dtype
|
||||
head_dtype = model_config.head_dtype
|
||||
|
||||
@@ -380,14 +380,14 @@ def mteb_test_rerank_models(hf_runner,
|
||||
# SentenceTransformers mteb score to a constant
|
||||
if model_info.mteb_score is None:
|
||||
st_main_score, st_dtype = mteb_test_rerank_models_hf(
|
||||
hf_runner, model_info.name, model_info.hf_dtype, hf_model_callback)
|
||||
hf_runner, model_info.name, model_info.hf_dtype, hf_model_callback
|
||||
)
|
||||
else:
|
||||
st_main_score = model_info.mteb_score
|
||||
st_dtype = "Constant"
|
||||
|
||||
print("Model:", model_info.name)
|
||||
print("VLLM:", f"dtype:{vllm_dtype}", f"head_dtype:{head_dtype}",
|
||||
vllm_main_score)
|
||||
print("VLLM:", f"dtype:{vllm_dtype}", f"head_dtype:{head_dtype}", vllm_main_score)
|
||||
print("SentenceTransformers:", st_dtype, st_main_score)
|
||||
print("Difference:", st_main_score - vllm_main_score)
|
||||
|
||||
|
||||
@@ -2,67 +2,76 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import pytest
|
||||
|
||||
from tests.models.language.pooling.embed_utils import (
|
||||
correctness_test_embed_models)
|
||||
from tests.models.utils import (CLSPoolingEmbedModelInfo,
|
||||
CLSPoolingRerankModelInfo, EmbedModelInfo,
|
||||
LASTPoolingEmbedModelInfo, RerankModelInfo)
|
||||
from tests.models.language.pooling.embed_utils import correctness_test_embed_models
|
||||
from tests.models.utils import (
|
||||
CLSPoolingEmbedModelInfo,
|
||||
CLSPoolingRerankModelInfo,
|
||||
EmbedModelInfo,
|
||||
LASTPoolingEmbedModelInfo,
|
||||
RerankModelInfo,
|
||||
)
|
||||
|
||||
from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
|
||||
|
||||
MODELS = [
|
||||
########## BertModel
|
||||
CLSPoolingEmbedModelInfo("BAAI/bge-base-en",
|
||||
architecture="BertModel",
|
||||
mteb_score=0.779336792,
|
||||
enable_test=True),
|
||||
CLSPoolingEmbedModelInfo("BAAI/bge-base-zh",
|
||||
architecture="BertModel",
|
||||
enable_test=False),
|
||||
CLSPoolingEmbedModelInfo("BAAI/bge-small-en",
|
||||
architecture="BertModel",
|
||||
enable_test=False),
|
||||
CLSPoolingEmbedModelInfo("BAAI/bge-small-zh",
|
||||
architecture="BertModel",
|
||||
enable_test=False),
|
||||
CLSPoolingEmbedModelInfo("BAAI/bge-large-en",
|
||||
architecture="BertModel",
|
||||
enable_test=False),
|
||||
CLSPoolingEmbedModelInfo("BAAI/bge-large-zh",
|
||||
architecture="BertModel",
|
||||
enable_test=False),
|
||||
CLSPoolingEmbedModelInfo("BAAI/bge-large-zh-noinstruct",
|
||||
architecture="BertModel",
|
||||
enable_test=False),
|
||||
CLSPoolingEmbedModelInfo("BAAI/bge-base-en-v1.5",
|
||||
architecture="BertModel",
|
||||
enable_test=False),
|
||||
CLSPoolingEmbedModelInfo("BAAI/bge-base-zh-v1.5",
|
||||
architecture="BertModel",
|
||||
enable_test=False),
|
||||
CLSPoolingEmbedModelInfo("BAAI/bge-small-en-v1.5",
|
||||
architecture="BertModel",
|
||||
enable_test=False),
|
||||
CLSPoolingEmbedModelInfo("BAAI/bge-small-zh-v1.5",
|
||||
architecture="BertModel",
|
||||
enable_test=False),
|
||||
CLSPoolingEmbedModelInfo("BAAI/bge-large-en-v1.5",
|
||||
architecture="BertModel",
|
||||
enable_test=False),
|
||||
CLSPoolingEmbedModelInfo("BAAI/bge-large-zh-v1.5",
|
||||
architecture="BertModel",
|
||||
enable_test=False),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"BAAI/bge-base-en",
|
||||
architecture="BertModel",
|
||||
mteb_score=0.779336792,
|
||||
enable_test=True,
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"BAAI/bge-base-zh", architecture="BertModel", enable_test=False
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"BAAI/bge-small-en", architecture="BertModel", enable_test=False
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"BAAI/bge-small-zh", architecture="BertModel", enable_test=False
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"BAAI/bge-large-en", architecture="BertModel", enable_test=False
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"BAAI/bge-large-zh", architecture="BertModel", enable_test=False
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"BAAI/bge-large-zh-noinstruct", architecture="BertModel", enable_test=False
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"BAAI/bge-base-en-v1.5", architecture="BertModel", enable_test=False
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"BAAI/bge-base-zh-v1.5", architecture="BertModel", enable_test=False
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"BAAI/bge-small-en-v1.5", architecture="BertModel", enable_test=False
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"BAAI/bge-small-zh-v1.5", architecture="BertModel", enable_test=False
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"BAAI/bge-large-en-v1.5", architecture="BertModel", enable_test=False
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"BAAI/bge-large-zh-v1.5", architecture="BertModel", enable_test=False
|
||||
),
|
||||
########## XLMRobertaModel
|
||||
CLSPoolingEmbedModelInfo("BAAI/bge-m3",
|
||||
architecture="XLMRobertaModel",
|
||||
mteb_score=0.787343078,
|
||||
enable_test=True),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"BAAI/bge-m3",
|
||||
architecture="XLMRobertaModel",
|
||||
mteb_score=0.787343078,
|
||||
enable_test=True,
|
||||
),
|
||||
########## Qwen2Model
|
||||
LASTPoolingEmbedModelInfo("BAAI/bge-code-v1",
|
||||
architecture="Qwen2Model",
|
||||
mteb_score=0.75724465,
|
||||
dtype="float32",
|
||||
enable_test=True),
|
||||
LASTPoolingEmbedModelInfo(
|
||||
"BAAI/bge-code-v1",
|
||||
architecture="Qwen2Model",
|
||||
mteb_score=0.75724465,
|
||||
dtype="float32",
|
||||
enable_test=True,
|
||||
),
|
||||
]
|
||||
|
||||
RERANK_MODELS = [
|
||||
@@ -71,33 +80,35 @@ RERANK_MODELS = [
|
||||
"BAAI/bge-reranker-base",
|
||||
architecture="XLMRobertaForSequenceClassification",
|
||||
mteb_score=0.32398,
|
||||
enable_test=True),
|
||||
enable_test=True,
|
||||
),
|
||||
CLSPoolingRerankModelInfo(
|
||||
"BAAI/bge-reranker-large",
|
||||
architecture="XLMRobertaForSequenceClassification",
|
||||
enable_test=False),
|
||||
enable_test=False,
|
||||
),
|
||||
CLSPoolingRerankModelInfo(
|
||||
"BAAI/bge-reranker-v2-m3",
|
||||
architecture="XLMRobertaForSequenceClassification",
|
||||
enable_test=False)
|
||||
enable_test=False,
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", MODELS)
|
||||
def test_embed_models_mteb(hf_runner, vllm_runner,
|
||||
model_info: EmbedModelInfo) -> None:
|
||||
def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo) -> None:
|
||||
mteb_test_embed_models(hf_runner, vllm_runner, model_info)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", MODELS)
|
||||
def test_embed_models_correctness(hf_runner, vllm_runner,
|
||||
model_info: EmbedModelInfo,
|
||||
example_prompts) -> None:
|
||||
correctness_test_embed_models(hf_runner, vllm_runner, model_info,
|
||||
example_prompts)
|
||||
def test_embed_models_correctness(
|
||||
hf_runner, vllm_runner, model_info: EmbedModelInfo, example_prompts
|
||||
) -> None:
|
||||
correctness_test_embed_models(hf_runner, vllm_runner, model_info, example_prompts)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", RERANK_MODELS)
|
||||
def test_rerank_models_mteb(hf_runner, vllm_runner,
|
||||
model_info: RerankModelInfo) -> None:
|
||||
def test_rerank_models_mteb(
|
||||
hf_runner, vllm_runner, model_info: RerankModelInfo
|
||||
) -> None:
|
||||
mteb_test_rerank_models(hf_runner, vllm_runner, model_info)
|
||||
|
||||
@@ -8,53 +8,50 @@ import torch
|
||||
|
||||
from tests.conftest import HfRunner
|
||||
from tests.models.language.pooling_mteb_test.mteb_utils import (
|
||||
VllmMtebEncoder, mteb_test_rerank_models)
|
||||
VllmMtebEncoder,
|
||||
mteb_test_rerank_models,
|
||||
)
|
||||
from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo
|
||||
|
||||
RERANK_MODELS = [
|
||||
LASTPoolingRerankModelInfo("BAAI/bge-reranker-v2-gemma",
|
||||
architecture="GemmaForSequenceClassification",
|
||||
mteb_score=0.33757,
|
||||
hf_overrides={
|
||||
"architectures":
|
||||
["GemmaForSequenceClassification"],
|
||||
"classifier_from_token": ["Yes"],
|
||||
"method":
|
||||
"no_post_processing",
|
||||
}),
|
||||
LASTPoolingRerankModelInfo(
|
||||
"BAAI/bge-reranker-v2-gemma",
|
||||
architecture="GemmaForSequenceClassification",
|
||||
mteb_score=0.33757,
|
||||
hf_overrides={
|
||||
"architectures": ["GemmaForSequenceClassification"],
|
||||
"classifier_from_token": ["Yes"],
|
||||
"method": "no_post_processing",
|
||||
},
|
||||
),
|
||||
]
|
||||
|
||||
PROMPT = "Given a query A and a passage B, determine whether the passage contains an answer to the query by providing a prediction of either 'Yes' or 'No'." # noqa: E501
|
||||
|
||||
|
||||
class GemmaRerankerHfRunner(HfRunner):
|
||||
|
||||
def __init__(self,
|
||||
model_name: str,
|
||||
dtype: str = "auto",
|
||||
*args: Any,
|
||||
**kwargs: Any) -> None:
|
||||
def __init__(
|
||||
self, model_name: str, dtype: str = "auto", *args: Any, **kwargs: Any
|
||||
) -> None:
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
super().__init__(model_name, dtype, auto_cls=AutoModelForCausalLM)
|
||||
self.tokenizer = AutoTokenizer.from_pretrained(model_name,
|
||||
padding_side='left')
|
||||
self.tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
|
||||
self.yes_loc = self.tokenizer.convert_tokens_to_ids("Yes")
|
||||
|
||||
@torch.no_grad()
|
||||
def predict(self, prompts: list[list[str]], *args,
|
||||
**kwargs) -> torch.Tensor:
|
||||
|
||||
def predict(self, prompts: list[list[str]], *args, **kwargs) -> torch.Tensor:
|
||||
def get_inputs(pairs, tokenizer, prompt=None):
|
||||
if prompt is None:
|
||||
prompt = PROMPT
|
||||
|
||||
sep = "\n"
|
||||
prompt_inputs = tokenizer(prompt,
|
||||
return_tensors=None,
|
||||
add_special_tokens=False)["input_ids"]
|
||||
sep_inputs = tokenizer(sep,
|
||||
return_tensors=None,
|
||||
add_special_tokens=False)["input_ids"]
|
||||
prompt_inputs = tokenizer(
|
||||
prompt, return_tensors=None, add_special_tokens=False
|
||||
)["input_ids"]
|
||||
sep_inputs = tokenizer(sep, return_tensors=None, add_special_tokens=False)[
|
||||
"input_ids"
|
||||
]
|
||||
inputs = []
|
||||
for query, passage in pairs:
|
||||
query_inputs = tokenizer(
|
||||
@@ -78,8 +75,7 @@ class GemmaRerankerHfRunner(HfRunner):
|
||||
return_token_type_ids=False,
|
||||
add_special_tokens=False,
|
||||
)
|
||||
item["input_ids"] = item[
|
||||
"input_ids"] + sep_inputs + prompt_inputs
|
||||
item["input_ids"] = item["input_ids"] + sep_inputs + prompt_inputs
|
||||
item["attention_mask"] = [1] * len(item["input_ids"])
|
||||
inputs.append(item)
|
||||
return tokenizer.pad(
|
||||
@@ -95,14 +91,19 @@ class GemmaRerankerHfRunner(HfRunner):
|
||||
inputs = inputs.to(self.model.device)
|
||||
_n_tokens = inputs["input_ids"].shape[1]
|
||||
logits = self.model(**inputs, return_dict=True).logits
|
||||
_scores = (logits[:, -1,
|
||||
self.yes_loc].view(-1, ).float().sigmoid())
|
||||
_scores = (
|
||||
logits[:, -1, self.yes_loc]
|
||||
.view(
|
||||
-1,
|
||||
)
|
||||
.float()
|
||||
.sigmoid()
|
||||
)
|
||||
scores.append(_scores[0].item())
|
||||
return torch.Tensor(scores)
|
||||
|
||||
|
||||
class GemmaMtebEncoder(VllmMtebEncoder):
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.query_template = "A: {query}\n"
|
||||
@@ -110,12 +111,10 @@ class GemmaMtebEncoder(VllmMtebEncoder):
|
||||
|
||||
def predict(
|
||||
self,
|
||||
sentences: list[tuple[str, str,
|
||||
Optional[str]]], # query, corpus, prompt
|
||||
sentences: list[tuple[str, str, Optional[str]]], # query, corpus, prompt
|
||||
*args,
|
||||
**kwargs,
|
||||
) -> np.ndarray:
|
||||
|
||||
_sentences = []
|
||||
for query, corpus, prompt in sentences:
|
||||
query = self.query_template.format(query=query)
|
||||
@@ -127,8 +126,9 @@ class GemmaMtebEncoder(VllmMtebEncoder):
|
||||
|
||||
@pytest.mark.parametrize("model_info", RERANK_MODELS)
|
||||
def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None:
|
||||
|
||||
mteb_test_rerank_models(GemmaRerankerHfRunner,
|
||||
vllm_runner,
|
||||
model_info,
|
||||
vllm_mteb_encoder=GemmaMtebEncoder)
|
||||
mteb_test_rerank_models(
|
||||
GemmaRerankerHfRunner,
|
||||
vllm_runner,
|
||||
model_info,
|
||||
vllm_mteb_encoder=GemmaMtebEncoder,
|
||||
)
|
||||
|
||||
@@ -2,22 +2,30 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import pytest
|
||||
|
||||
from tests.models.utils import (CLSPoolingRerankModelInfo,
|
||||
LASTPoolingRerankModelInfo, RerankModelInfo)
|
||||
from tests.models.utils import (
|
||||
CLSPoolingRerankModelInfo,
|
||||
LASTPoolingRerankModelInfo,
|
||||
RerankModelInfo,
|
||||
)
|
||||
|
||||
from .mteb_utils import mteb_test_rerank_models
|
||||
|
||||
RERANK_MODELS = [
|
||||
CLSPoolingRerankModelInfo("cross-encoder/ms-marco-TinyBERT-L-2-v2",
|
||||
mteb_score=0.32898,
|
||||
architecture="BertForSequenceClassification"),
|
||||
LASTPoolingRerankModelInfo("tomaarsen/Qwen3-Reranker-0.6B-seq-cls",
|
||||
mteb_score=0.25736,
|
||||
architecture="Qwen3ForSequenceClassification")
|
||||
CLSPoolingRerankModelInfo(
|
||||
"cross-encoder/ms-marco-TinyBERT-L-2-v2",
|
||||
mteb_score=0.32898,
|
||||
architecture="BertForSequenceClassification",
|
||||
),
|
||||
LASTPoolingRerankModelInfo(
|
||||
"tomaarsen/Qwen3-Reranker-0.6B-seq-cls",
|
||||
mteb_score=0.25736,
|
||||
architecture="Qwen3ForSequenceClassification",
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", RERANK_MODELS)
|
||||
def test_rerank_models_mteb(hf_runner, vllm_runner,
|
||||
model_info: RerankModelInfo) -> None:
|
||||
def test_rerank_models_mteb(
|
||||
hf_runner, vllm_runner, model_info: RerankModelInfo
|
||||
) -> None:
|
||||
mteb_test_rerank_models(hf_runner, vllm_runner, model_info)
|
||||
|
||||
@@ -3,74 +3,93 @@
|
||||
|
||||
import pytest
|
||||
|
||||
from tests.models.language.pooling.embed_utils import (
|
||||
correctness_test_embed_models)
|
||||
from tests.models.utils import (CLSPoolingEmbedModelInfo,
|
||||
CLSPoolingRerankModelInfo, EmbedModelInfo,
|
||||
LASTPoolingEmbedModelInfo, RerankModelInfo)
|
||||
from tests.models.language.pooling.embed_utils import correctness_test_embed_models
|
||||
from tests.models.utils import (
|
||||
CLSPoolingEmbedModelInfo,
|
||||
CLSPoolingRerankModelInfo,
|
||||
EmbedModelInfo,
|
||||
LASTPoolingEmbedModelInfo,
|
||||
RerankModelInfo,
|
||||
)
|
||||
|
||||
from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
|
||||
|
||||
MODELS = [
|
||||
########## BertModel
|
||||
CLSPoolingEmbedModelInfo("thenlper/gte-large",
|
||||
mteb_score=0.76807651,
|
||||
architecture="BertModel",
|
||||
enable_test=True),
|
||||
CLSPoolingEmbedModelInfo("thenlper/gte-base",
|
||||
architecture="BertModel",
|
||||
enable_test=False),
|
||||
CLSPoolingEmbedModelInfo("thenlper/gte-small",
|
||||
architecture="BertModel",
|
||||
enable_test=False),
|
||||
CLSPoolingEmbedModelInfo("thenlper/gte-large-zh",
|
||||
architecture="BertModel",
|
||||
enable_test=False),
|
||||
CLSPoolingEmbedModelInfo("thenlper/gte-base-zh",
|
||||
architecture="BertModel",
|
||||
enable_test=False),
|
||||
CLSPoolingEmbedModelInfo("thenlper/gte-small-zh",
|
||||
architecture="BertModel",
|
||||
enable_test=False),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"thenlper/gte-large",
|
||||
mteb_score=0.76807651,
|
||||
architecture="BertModel",
|
||||
enable_test=True,
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"thenlper/gte-base", architecture="BertModel", enable_test=False
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"thenlper/gte-small", architecture="BertModel", enable_test=False
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"thenlper/gte-large-zh", architecture="BertModel", enable_test=False
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"thenlper/gte-base-zh", architecture="BertModel", enable_test=False
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"thenlper/gte-small-zh", architecture="BertModel", enable_test=False
|
||||
),
|
||||
########### NewModel
|
||||
# These three architectures are almost the same, but not exactly the same.
|
||||
# For example,
|
||||
# - whether to use token_type_embeddings
|
||||
# - whether to use context expansion
|
||||
# So only test one (the most widely used) model
|
||||
CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-multilingual-base",
|
||||
architecture="GteNewModel",
|
||||
mteb_score=0.775074696,
|
||||
hf_overrides={"architectures": ["GteNewModel"]},
|
||||
enable_test=True),
|
||||
CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-base-en-v1.5",
|
||||
architecture="GteNewModel",
|
||||
hf_overrides={"architectures": ["GteNewModel"]},
|
||||
enable_test=False),
|
||||
CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-large-en-v1.5",
|
||||
architecture="GteNewModel",
|
||||
hf_overrides={"architectures": ["GteNewModel"]},
|
||||
enable_test=False),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"Alibaba-NLP/gte-multilingual-base",
|
||||
architecture="GteNewModel",
|
||||
mteb_score=0.775074696,
|
||||
hf_overrides={"architectures": ["GteNewModel"]},
|
||||
enable_test=True,
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"Alibaba-NLP/gte-base-en-v1.5",
|
||||
architecture="GteNewModel",
|
||||
hf_overrides={"architectures": ["GteNewModel"]},
|
||||
enable_test=False,
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"Alibaba-NLP/gte-large-en-v1.5",
|
||||
architecture="GteNewModel",
|
||||
hf_overrides={"architectures": ["GteNewModel"]},
|
||||
enable_test=False,
|
||||
),
|
||||
########### Qwen2ForCausalLM
|
||||
LASTPoolingEmbedModelInfo("Alibaba-NLP/gte-Qwen2-1.5B-instruct",
|
||||
mteb_score=0.758473459018872,
|
||||
architecture="Qwen2ForCausalLM",
|
||||
enable_test=True),
|
||||
LASTPoolingEmbedModelInfo(
|
||||
"Alibaba-NLP/gte-Qwen2-1.5B-instruct",
|
||||
mteb_score=0.758473459018872,
|
||||
architecture="Qwen2ForCausalLM",
|
||||
enable_test=True,
|
||||
),
|
||||
########## ModernBertModel
|
||||
CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-modernbert-base",
|
||||
mteb_score=0.748193353,
|
||||
architecture="ModernBertModel",
|
||||
enable_test=True),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"Alibaba-NLP/gte-modernbert-base",
|
||||
mteb_score=0.748193353,
|
||||
architecture="ModernBertModel",
|
||||
enable_test=True,
|
||||
),
|
||||
########## Qwen3ForCausalLM
|
||||
LASTPoolingEmbedModelInfo("Qwen/Qwen3-Embedding-0.6B",
|
||||
mteb_score=0.771163695,
|
||||
architecture="Qwen3ForCausalLM",
|
||||
dtype="float32",
|
||||
enable_test=True),
|
||||
LASTPoolingEmbedModelInfo("Qwen/Qwen3-Embedding-4B",
|
||||
architecture="Qwen3ForCausalLM",
|
||||
dtype="float32",
|
||||
enable_test=False),
|
||||
LASTPoolingEmbedModelInfo(
|
||||
"Qwen/Qwen3-Embedding-0.6B",
|
||||
mteb_score=0.771163695,
|
||||
architecture="Qwen3ForCausalLM",
|
||||
dtype="float32",
|
||||
enable_test=True,
|
||||
),
|
||||
LASTPoolingEmbedModelInfo(
|
||||
"Qwen/Qwen3-Embedding-4B",
|
||||
architecture="Qwen3ForCausalLM",
|
||||
dtype="float32",
|
||||
enable_test=False,
|
||||
),
|
||||
]
|
||||
|
||||
RERANK_MODELS = [
|
||||
@@ -79,31 +98,32 @@ RERANK_MODELS = [
|
||||
"Alibaba-NLP/gte-reranker-modernbert-base",
|
||||
mteb_score=0.33386,
|
||||
architecture="ModernBertForSequenceClassification",
|
||||
enable_test=True),
|
||||
enable_test=True,
|
||||
),
|
||||
CLSPoolingRerankModelInfo(
|
||||
"Alibaba-NLP/gte-multilingual-reranker-base",
|
||||
mteb_score=0.33062,
|
||||
architecture="GteNewForSequenceClassification",
|
||||
hf_overrides={"architectures": ["GteNewForSequenceClassification"]},
|
||||
enable_test=True),
|
||||
enable_test=True,
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", MODELS)
|
||||
def test_embed_models_mteb(hf_runner, vllm_runner,
|
||||
model_info: EmbedModelInfo) -> None:
|
||||
def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo) -> None:
|
||||
mteb_test_embed_models(hf_runner, vllm_runner, model_info)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", MODELS)
|
||||
def test_embed_models_correctness(hf_runner, vllm_runner,
|
||||
model_info: EmbedModelInfo,
|
||||
example_prompts) -> None:
|
||||
correctness_test_embed_models(hf_runner, vllm_runner, model_info,
|
||||
example_prompts)
|
||||
def test_embed_models_correctness(
|
||||
hf_runner, vllm_runner, model_info: EmbedModelInfo, example_prompts
|
||||
) -> None:
|
||||
correctness_test_embed_models(hf_runner, vllm_runner, model_info, example_prompts)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", RERANK_MODELS)
|
||||
def test_rerank_models_mteb(hf_runner, vllm_runner,
|
||||
model_info: RerankModelInfo) -> None:
|
||||
def test_rerank_models_mteb(
|
||||
hf_runner, vllm_runner, model_info: RerankModelInfo
|
||||
) -> None:
|
||||
mteb_test_rerank_models(hf_runner, vllm_runner, model_info)
|
||||
|
||||
@@ -2,50 +2,55 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import pytest
|
||||
|
||||
from tests.models.language.pooling.embed_utils import (
|
||||
correctness_test_embed_models)
|
||||
from tests.models.language.pooling.embed_utils import correctness_test_embed_models
|
||||
from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo
|
||||
|
||||
from .mteb_utils import mteb_test_embed_models
|
||||
|
||||
MODELS = [
|
||||
########## BertModel
|
||||
CLSPoolingEmbedModelInfo("intfloat/e5-small",
|
||||
architecture="BertModel",
|
||||
mteb_score=0.742285423,
|
||||
enable_test=True),
|
||||
CLSPoolingEmbedModelInfo("intfloat/e5-base",
|
||||
architecture="BertModel",
|
||||
enable_test=False),
|
||||
CLSPoolingEmbedModelInfo("intfloat/e5-large",
|
||||
architecture="BertModel",
|
||||
enable_test=False),
|
||||
CLSPoolingEmbedModelInfo("intfloat/multilingual-e5-small",
|
||||
architecture="BertModel",
|
||||
enable_test=False),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"intfloat/e5-small",
|
||||
architecture="BertModel",
|
||||
mteb_score=0.742285423,
|
||||
enable_test=True,
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"intfloat/e5-base", architecture="BertModel", enable_test=False
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"intfloat/e5-large", architecture="BertModel", enable_test=False
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"intfloat/multilingual-e5-small", architecture="BertModel", enable_test=False
|
||||
),
|
||||
########## XLMRobertaModel
|
||||
CLSPoolingEmbedModelInfo("intfloat/multilingual-e5-base",
|
||||
architecture="XLMRobertaModel",
|
||||
mteb_score=0.779325955,
|
||||
enable_test=True),
|
||||
CLSPoolingEmbedModelInfo("intfloat/multilingual-e5-large",
|
||||
architecture="XLMRobertaModel",
|
||||
enable_test=False),
|
||||
CLSPoolingEmbedModelInfo("intfloat/multilingual-e5-large-instruct",
|
||||
architecture="XLMRobertaModel",
|
||||
enable_test=False),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"intfloat/multilingual-e5-base",
|
||||
architecture="XLMRobertaModel",
|
||||
mteb_score=0.779325955,
|
||||
enable_test=True,
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"intfloat/multilingual-e5-large",
|
||||
architecture="XLMRobertaModel",
|
||||
enable_test=False,
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"intfloat/multilingual-e5-large-instruct",
|
||||
architecture="XLMRobertaModel",
|
||||
enable_test=False,
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", MODELS)
|
||||
def test_embed_models_mteb(hf_runner, vllm_runner,
|
||||
model_info: EmbedModelInfo) -> None:
|
||||
def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo) -> None:
|
||||
mteb_test_embed_models(hf_runner, vllm_runner, model_info)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", MODELS)
|
||||
def test_embed_models_correctness(hf_runner, vllm_runner,
|
||||
model_info: EmbedModelInfo,
|
||||
example_prompts) -> None:
|
||||
correctness_test_embed_models(hf_runner, vllm_runner, model_info,
|
||||
example_prompts)
|
||||
def test_embed_models_correctness(
|
||||
hf_runner, vllm_runner, model_info: EmbedModelInfo, example_prompts
|
||||
) -> None:
|
||||
correctness_test_embed_models(hf_runner, vllm_runner, model_info, example_prompts)
|
||||
|
||||
@@ -5,60 +5,68 @@ from functools import partial
|
||||
import pytest
|
||||
|
||||
from tests.models.language.pooling.embed_utils import (
|
||||
check_embeddings_close, correctness_test_embed_models, matryoshka_fy)
|
||||
from tests.models.utils import (CLSPoolingEmbedModelInfo,
|
||||
CLSPoolingRerankModelInfo, EmbedModelInfo,
|
||||
RerankModelInfo)
|
||||
check_embeddings_close,
|
||||
correctness_test_embed_models,
|
||||
matryoshka_fy,
|
||||
)
|
||||
from tests.models.utils import (
|
||||
CLSPoolingEmbedModelInfo,
|
||||
CLSPoolingRerankModelInfo,
|
||||
EmbedModelInfo,
|
||||
RerankModelInfo,
|
||||
)
|
||||
from vllm import PoolingParams
|
||||
|
||||
from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
|
||||
|
||||
EMBEDDING_MODELS = [
|
||||
CLSPoolingEmbedModelInfo("jinaai/jina-embeddings-v3",
|
||||
mteb_score=0.824413164,
|
||||
architecture="XLMRobertaModel",
|
||||
is_matryoshka=True)
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"jinaai/jina-embeddings-v3",
|
||||
mteb_score=0.824413164,
|
||||
architecture="XLMRobertaModel",
|
||||
is_matryoshka=True,
|
||||
)
|
||||
]
|
||||
|
||||
RERANK_MODELS = [
|
||||
CLSPoolingRerankModelInfo(
|
||||
"jinaai/jina-reranker-v2-base-multilingual",
|
||||
mteb_score=0.33643,
|
||||
architecture="XLMRobertaForSequenceClassification")
|
||||
architecture="XLMRobertaForSequenceClassification",
|
||||
)
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", EMBEDDING_MODELS)
|
||||
def test_embed_models_mteb(hf_runner, vllm_runner,
|
||||
model_info: EmbedModelInfo) -> None:
|
||||
|
||||
def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo) -> None:
|
||||
def hf_model_callback(model):
|
||||
model.encode = partial(model.encode, task="text-matching")
|
||||
|
||||
mteb_test_embed_models(hf_runner,
|
||||
vllm_runner,
|
||||
model_info,
|
||||
hf_model_callback=hf_model_callback)
|
||||
mteb_test_embed_models(
|
||||
hf_runner, vllm_runner, model_info, hf_model_callback=hf_model_callback
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", EMBEDDING_MODELS)
|
||||
def test_embed_models_correctness(hf_runner, vllm_runner,
|
||||
model_info: EmbedModelInfo,
|
||||
example_prompts) -> None:
|
||||
|
||||
def test_embed_models_correctness(
|
||||
hf_runner, vllm_runner, model_info: EmbedModelInfo, example_prompts
|
||||
) -> None:
|
||||
def hf_model_callback(model):
|
||||
model.encode = partial(model.encode, task="text-matching")
|
||||
|
||||
correctness_test_embed_models(hf_runner,
|
||||
vllm_runner,
|
||||
model_info,
|
||||
example_prompts,
|
||||
hf_model_callback=hf_model_callback)
|
||||
correctness_test_embed_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
model_info,
|
||||
example_prompts,
|
||||
hf_model_callback=hf_model_callback,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", RERANK_MODELS)
|
||||
def test_rerank_models_mteb(hf_runner, vllm_runner,
|
||||
model_info: RerankModelInfo) -> None:
|
||||
def test_rerank_models_mteb(
|
||||
hf_runner, vllm_runner, model_info: RerankModelInfo
|
||||
) -> None:
|
||||
mteb_test_rerank_models(hf_runner, vllm_runner, model_info)
|
||||
|
||||
|
||||
@@ -81,32 +89,32 @@ def test_matryoshka(
|
||||
example_prompts = [str(s).strip() for s in example_prompts]
|
||||
|
||||
with hf_runner(
|
||||
model_info.name,
|
||||
dtype=dtype,
|
||||
is_sentence_transformer=True,
|
||||
model_info.name,
|
||||
dtype=dtype,
|
||||
is_sentence_transformer=True,
|
||||
) as hf_model:
|
||||
hf_outputs = hf_model.encode(example_prompts, task="text-matching")
|
||||
hf_outputs = matryoshka_fy(hf_outputs, dimensions)
|
||||
|
||||
with vllm_runner(model_info.name,
|
||||
runner="pooling",
|
||||
dtype=dtype,
|
||||
max_model_len=None) as vllm_model:
|
||||
with vllm_runner(
|
||||
model_info.name, runner="pooling", dtype=dtype, max_model_len=None
|
||||
) as vllm_model:
|
||||
assert vllm_model.llm.llm_engine.model_config.is_matryoshka
|
||||
|
||||
matryoshka_dimensions = (
|
||||
vllm_model.llm.llm_engine.model_config.matryoshka_dimensions)
|
||||
vllm_model.llm.llm_engine.model_config.matryoshka_dimensions
|
||||
)
|
||||
assert matryoshka_dimensions is not None
|
||||
|
||||
if dimensions not in matryoshka_dimensions:
|
||||
with pytest.raises(ValueError):
|
||||
vllm_model.embed(
|
||||
example_prompts,
|
||||
pooling_params=PoolingParams(dimensions=dimensions))
|
||||
example_prompts, pooling_params=PoolingParams(dimensions=dimensions)
|
||||
)
|
||||
else:
|
||||
vllm_outputs = vllm_model.embed(
|
||||
example_prompts,
|
||||
pooling_params=PoolingParams(dimensions=dimensions))
|
||||
example_prompts, pooling_params=PoolingParams(dimensions=dimensions)
|
||||
)
|
||||
|
||||
check_embeddings_close(
|
||||
embeddings_0_lst=hf_outputs,
|
||||
|
||||
@@ -17,46 +17,45 @@ mxbai_rerank_hf_overrides = {
|
||||
}
|
||||
|
||||
RERANK_MODELS = [
|
||||
LASTPoolingRerankModelInfo("mixedbread-ai/mxbai-rerank-base-v2",
|
||||
architecture="Qwen2ForSequenceClassification",
|
||||
hf_overrides=mxbai_rerank_hf_overrides,
|
||||
mteb_score=0.273,
|
||||
enable_test=True),
|
||||
LASTPoolingRerankModelInfo("mixedbread-ai/mxbai-rerank-large-v2",
|
||||
architecture="Qwen2ForSequenceClassification",
|
||||
hf_overrides=mxbai_rerank_hf_overrides,
|
||||
enable_test=False)
|
||||
LASTPoolingRerankModelInfo(
|
||||
"mixedbread-ai/mxbai-rerank-base-v2",
|
||||
architecture="Qwen2ForSequenceClassification",
|
||||
hf_overrides=mxbai_rerank_hf_overrides,
|
||||
mteb_score=0.273,
|
||||
enable_test=True,
|
||||
),
|
||||
LASTPoolingRerankModelInfo(
|
||||
"mixedbread-ai/mxbai-rerank-large-v2",
|
||||
architecture="Qwen2ForSequenceClassification",
|
||||
hf_overrides=mxbai_rerank_hf_overrides,
|
||||
enable_test=False,
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
class MxbaiRerankerHfRunner(HfRunner):
|
||||
|
||||
def __init__(self,
|
||||
model_name: str,
|
||||
dtype: str = "auto",
|
||||
*args: Any,
|
||||
**kwargs: Any) -> None:
|
||||
def __init__(
|
||||
self, model_name: str, dtype: str = "auto", *args: Any, **kwargs: Any
|
||||
) -> None:
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
super().__init__(model_name, dtype, auto_cls=AutoModelForCausalLM)
|
||||
|
||||
self.tokenizer = AutoTokenizer.from_pretrained(model_name,
|
||||
padding_side='left')
|
||||
self.tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
|
||||
self.yes_loc = self.tokenizer.convert_tokens_to_ids("1")
|
||||
self.no_loc = self.tokenizer.convert_tokens_to_ids("0")
|
||||
|
||||
def predict(self, prompts: list[list[str]], *args,
|
||||
**kwargs) -> torch.Tensor:
|
||||
|
||||
def predict(self, prompts: list[list[str]], *args, **kwargs) -> torch.Tensor:
|
||||
def process_inputs(pairs):
|
||||
inputs = self.tokenizer(pairs,
|
||||
padding=False,
|
||||
truncation='longest_first',
|
||||
return_attention_mask=False)
|
||||
for i, ele in enumerate(inputs['input_ids']):
|
||||
inputs['input_ids'][i] = ele
|
||||
inputs = self.tokenizer.pad(inputs,
|
||||
padding=True,
|
||||
return_tensors="pt")
|
||||
inputs = self.tokenizer(
|
||||
pairs,
|
||||
padding=False,
|
||||
truncation="longest_first",
|
||||
return_attention_mask=False,
|
||||
)
|
||||
for i, ele in enumerate(inputs["input_ids"]):
|
||||
inputs["input_ids"][i] = ele
|
||||
inputs = self.tokenizer.pad(inputs, padding=True, return_tensors="pt")
|
||||
for key in inputs:
|
||||
inputs[key] = inputs[key].to(self.model.device)
|
||||
return inputs
|
||||
|
||||
@@ -3,39 +3,42 @@
|
||||
|
||||
import pytest
|
||||
|
||||
from tests.models.language.pooling.embed_utils import (
|
||||
correctness_test_embed_models)
|
||||
from tests.models.language.pooling.embed_utils import correctness_test_embed_models
|
||||
from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo
|
||||
|
||||
from .mteb_utils import mteb_test_embed_models
|
||||
|
||||
MODELS = [
|
||||
CLSPoolingEmbedModelInfo("nomic-ai/nomic-embed-text-v1",
|
||||
architecture="NomicBertModel",
|
||||
mteb_score=0.737568559,
|
||||
enable_test=True),
|
||||
CLSPoolingEmbedModelInfo("nomic-ai/nomic-embed-text-v1.5",
|
||||
architecture="NomicBertModel",
|
||||
enable_test=False),
|
||||
CLSPoolingEmbedModelInfo("nomic-ai/CodeRankEmbed",
|
||||
architecture="NomicBertModel",
|
||||
enable_test=False),
|
||||
CLSPoolingEmbedModelInfo("nomic-ai/nomic-embed-text-v2-moe",
|
||||
architecture="NomicBertModel",
|
||||
mteb_score=0.715488912,
|
||||
enable_test=True)
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"nomic-ai/nomic-embed-text-v1",
|
||||
architecture="NomicBertModel",
|
||||
mteb_score=0.737568559,
|
||||
enable_test=True,
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"nomic-ai/nomic-embed-text-v1.5",
|
||||
architecture="NomicBertModel",
|
||||
enable_test=False,
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"nomic-ai/CodeRankEmbed", architecture="NomicBertModel", enable_test=False
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"nomic-ai/nomic-embed-text-v2-moe",
|
||||
architecture="NomicBertModel",
|
||||
mteb_score=0.715488912,
|
||||
enable_test=True,
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", MODELS)
|
||||
def test_embed_models_mteb(hf_runner, vllm_runner,
|
||||
model_info: EmbedModelInfo) -> None:
|
||||
def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo) -> None:
|
||||
mteb_test_embed_models(hf_runner, vllm_runner, model_info)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", MODELS)
|
||||
def test_embed_models_correctness(hf_runner, vllm_runner,
|
||||
model_info: EmbedModelInfo,
|
||||
example_prompts) -> None:
|
||||
correctness_test_embed_models(hf_runner, vllm_runner, model_info,
|
||||
example_prompts)
|
||||
def test_embed_models_correctness(
|
||||
hf_runner, vllm_runner, model_info: EmbedModelInfo, example_prompts
|
||||
) -> None:
|
||||
correctness_test_embed_models(hf_runner, vllm_runner, model_info, example_prompts)
|
||||
|
||||
@@ -18,46 +18,45 @@ qwen3_reranker_hf_overrides = {
|
||||
}
|
||||
|
||||
RERANK_MODELS = [
|
||||
LASTPoolingRerankModelInfo("Qwen/Qwen3-Reranker-0.6B",
|
||||
architecture="Qwen3ForSequenceClassification",
|
||||
mteb_score=0.25736,
|
||||
hf_overrides=qwen3_reranker_hf_overrides,
|
||||
enable_test=True),
|
||||
LASTPoolingRerankModelInfo("Qwen/Qwen3-Reranker-4B",
|
||||
architecture="Qwen3ForSequenceClassification",
|
||||
hf_overrides=qwen3_reranker_hf_overrides,
|
||||
enable_test=False)
|
||||
LASTPoolingRerankModelInfo(
|
||||
"Qwen/Qwen3-Reranker-0.6B",
|
||||
architecture="Qwen3ForSequenceClassification",
|
||||
mteb_score=0.25736,
|
||||
hf_overrides=qwen3_reranker_hf_overrides,
|
||||
enable_test=True,
|
||||
),
|
||||
LASTPoolingRerankModelInfo(
|
||||
"Qwen/Qwen3-Reranker-4B",
|
||||
architecture="Qwen3ForSequenceClassification",
|
||||
hf_overrides=qwen3_reranker_hf_overrides,
|
||||
enable_test=False,
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
class Qwen3RerankerHfRunner(HfRunner):
|
||||
|
||||
def __init__(self,
|
||||
model_name: str,
|
||||
dtype: str = "auto",
|
||||
*args: Any,
|
||||
**kwargs: Any) -> None:
|
||||
def __init__(
|
||||
self, model_name: str, dtype: str = "auto", *args: Any, **kwargs: Any
|
||||
) -> None:
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
super().__init__(model_name, dtype, auto_cls=AutoModelForCausalLM)
|
||||
|
||||
self.tokenizer = AutoTokenizer.from_pretrained(model_name,
|
||||
padding_side='left')
|
||||
self.tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
|
||||
self.token_false_id = self.tokenizer.convert_tokens_to_ids("no")
|
||||
self.token_true_id = self.tokenizer.convert_tokens_to_ids("yes")
|
||||
|
||||
def predict(self, prompts: list[list[str]], *args,
|
||||
**kwargs) -> torch.Tensor:
|
||||
|
||||
def predict(self, prompts: list[list[str]], *args, **kwargs) -> torch.Tensor:
|
||||
def process_inputs(pairs):
|
||||
inputs = self.tokenizer(pairs,
|
||||
padding=False,
|
||||
truncation='longest_first',
|
||||
return_attention_mask=False)
|
||||
for i, ele in enumerate(inputs['input_ids']):
|
||||
inputs['input_ids'][i] = ele
|
||||
inputs = self.tokenizer.pad(inputs,
|
||||
padding=True,
|
||||
return_tensors="pt")
|
||||
inputs = self.tokenizer(
|
||||
pairs,
|
||||
padding=False,
|
||||
truncation="longest_first",
|
||||
return_attention_mask=False,
|
||||
)
|
||||
for i, ele in enumerate(inputs["input_ids"]):
|
||||
inputs["input_ids"][i] = ele
|
||||
inputs = self.tokenizer.pad(inputs, padding=True, return_tensors="pt")
|
||||
for key in inputs:
|
||||
inputs[key] = inputs[key].to(self.model.device)
|
||||
return inputs
|
||||
@@ -82,20 +81,18 @@ class Qwen3RerankerHfRunner(HfRunner):
|
||||
|
||||
@pytest.mark.parametrize("model_info", RERANK_MODELS)
|
||||
def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None:
|
||||
|
||||
mteb_test_rerank_models(Qwen3RerankerHfRunner, vllm_runner, model_info)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", RERANK_MODELS)
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
def test_rerank_models_mteb_tp(vllm_runner,
|
||||
model_info: RerankModelInfo) -> None:
|
||||
|
||||
def test_rerank_models_mteb_tp(vllm_runner, model_info: RerankModelInfo) -> None:
|
||||
assert model_info.architecture == "Qwen3ForSequenceClassification"
|
||||
|
||||
vllm_extra_kwargs: dict[str, Any] = {
|
||||
"tensor_parallel_size": 2,
|
||||
}
|
||||
|
||||
mteb_test_rerank_models(Qwen3RerankerHfRunner, vllm_runner, model_info,
|
||||
vllm_extra_kwargs)
|
||||
mteb_test_rerank_models(
|
||||
Qwen3RerankerHfRunner, vllm_runner, model_info, vllm_extra_kwargs
|
||||
)
|
||||
|
||||
@@ -3,62 +3,75 @@
|
||||
|
||||
import pytest
|
||||
|
||||
from tests.models.language.pooling.embed_utils import (
|
||||
correctness_test_embed_models)
|
||||
from tests.models.language.pooling.embed_utils import correctness_test_embed_models
|
||||
from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo
|
||||
|
||||
from .mteb_utils import mteb_test_embed_models
|
||||
|
||||
MODELS = [
|
||||
CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-xs",
|
||||
is_matryoshka=False,
|
||||
architecture="BertModel",
|
||||
mteb_score=0.714927797,
|
||||
enable_test=True),
|
||||
CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-s",
|
||||
is_matryoshka=False,
|
||||
architecture="BertModel",
|
||||
enable_test=False),
|
||||
CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-m",
|
||||
is_matryoshka=False,
|
||||
architecture="BertModel",
|
||||
enable_test=False),
|
||||
CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-m-long",
|
||||
is_matryoshka=False,
|
||||
architecture="NomicBertModel",
|
||||
mteb_score=0.681146831,
|
||||
enable_test=True),
|
||||
CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-l",
|
||||
is_matryoshka=False,
|
||||
architecture="BertModel",
|
||||
enable_test=False),
|
||||
CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-m-v1.5",
|
||||
is_matryoshka=True,
|
||||
architecture="BertModel",
|
||||
mteb_score=0.649088363,
|
||||
enable_test=True),
|
||||
CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-l-v2.0",
|
||||
is_matryoshka=True,
|
||||
architecture="XLMRobertaModel",
|
||||
mteb_score=0.712258299,
|
||||
enable_test=True),
|
||||
CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-m-v2.0",
|
||||
is_matryoshka=True,
|
||||
architecture="GteModel",
|
||||
mteb_score=0.706622444,
|
||||
enable_test=True),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"Snowflake/snowflake-arctic-embed-xs",
|
||||
is_matryoshka=False,
|
||||
architecture="BertModel",
|
||||
mteb_score=0.714927797,
|
||||
enable_test=True,
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"Snowflake/snowflake-arctic-embed-s",
|
||||
is_matryoshka=False,
|
||||
architecture="BertModel",
|
||||
enable_test=False,
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"Snowflake/snowflake-arctic-embed-m",
|
||||
is_matryoshka=False,
|
||||
architecture="BertModel",
|
||||
enable_test=False,
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"Snowflake/snowflake-arctic-embed-m-long",
|
||||
is_matryoshka=False,
|
||||
architecture="NomicBertModel",
|
||||
mteb_score=0.681146831,
|
||||
enable_test=True,
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"Snowflake/snowflake-arctic-embed-l",
|
||||
is_matryoshka=False,
|
||||
architecture="BertModel",
|
||||
enable_test=False,
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"Snowflake/snowflake-arctic-embed-m-v1.5",
|
||||
is_matryoshka=True,
|
||||
architecture="BertModel",
|
||||
mteb_score=0.649088363,
|
||||
enable_test=True,
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"Snowflake/snowflake-arctic-embed-l-v2.0",
|
||||
is_matryoshka=True,
|
||||
architecture="XLMRobertaModel",
|
||||
mteb_score=0.712258299,
|
||||
enable_test=True,
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"Snowflake/snowflake-arctic-embed-m-v2.0",
|
||||
is_matryoshka=True,
|
||||
architecture="GteModel",
|
||||
mteb_score=0.706622444,
|
||||
enable_test=True,
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", MODELS)
|
||||
def test_embed_models_mteb(hf_runner, vllm_runner,
|
||||
model_info: EmbedModelInfo) -> None:
|
||||
def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo) -> None:
|
||||
mteb_test_embed_models(hf_runner, vllm_runner, model_info)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", MODELS)
|
||||
def test_embed_models_correctness(hf_runner, vllm_runner,
|
||||
model_info: EmbedModelInfo,
|
||||
example_prompts) -> None:
|
||||
correctness_test_embed_models(hf_runner, vllm_runner, model_info,
|
||||
example_prompts)
|
||||
def test_embed_models_correctness(
|
||||
hf_runner, vllm_runner, model_info: EmbedModelInfo, example_prompts
|
||||
) -> None:
|
||||
correctness_test_embed_models(hf_runner, vllm_runner, model_info, example_prompts)
|
||||
|
||||
@@ -2,8 +2,11 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import pytest
|
||||
|
||||
from tests.models.utils import (CLSPoolingEmbedModelInfo, EmbedModelInfo,
|
||||
LASTPoolingEmbedModelInfo)
|
||||
from tests.models.utils import (
|
||||
CLSPoolingEmbedModelInfo,
|
||||
EmbedModelInfo,
|
||||
LASTPoolingEmbedModelInfo,
|
||||
)
|
||||
|
||||
from .mteb_utils import mteb_test_embed_models
|
||||
|
||||
@@ -15,15 +18,15 @@ ST_PROJECTOR_MODELS = [
|
||||
mteb_score=0.688611955,
|
||||
enable_test=True,
|
||||
),
|
||||
LASTPoolingEmbedModelInfo("google/embeddinggemma-300m",
|
||||
architecture="Gemma3TextModel",
|
||||
mteb_score=0.7473819294684156,
|
||||
enable_test=True)
|
||||
LASTPoolingEmbedModelInfo(
|
||||
"google/embeddinggemma-300m",
|
||||
architecture="Gemma3TextModel",
|
||||
mteb_score=0.7473819294684156,
|
||||
enable_test=True,
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", ST_PROJECTOR_MODELS)
|
||||
def test_embed_models_mteb(hf_runner, vllm_runner,
|
||||
model_info: EmbedModelInfo) -> None:
|
||||
|
||||
def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo) -> None:
|
||||
mteb_test_embed_models(hf_runner, vllm_runner, model_info)
|
||||
|
||||
Reference in New Issue
Block a user