[CI] Reorganization pooling_mteb_test (#31265)

Signed-off-by: wang.yuqi <noooop@126.com>
This commit is contained in:
wang.yuqi
2025-12-24 23:36:20 +08:00
committed by GitHub
parent 7cd288a4b3
commit 1ff67df182
18 changed files with 480 additions and 347 deletions

View File

@@ -0,0 +1,228 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import mteb
import numpy as np
import torch
from mteb.models import ModelMeta
from mteb.types import Array
from torch.utils.data import DataLoader
import tests.ci_envs as ci_envs
from tests.models.utils import (
EmbedModelInfo,
check_embeddings_close,
get_vllm_extra_kwargs,
)
# Tolerance rationale for most embedding models on the STS12 task (see #17175):
# - Model implementation details and minor changes in tensor dtype
#   result in score differences of less than 1e-4.
# - A different model results in differences of more than 1e-3.
# So 1e-4 is a good tolerance threshold.
# MTEB task names evaluated by the embedding test helper in this file.
MTEB_EMBED_TASKS = ["STS12"]
# Default value of the `atol` parameter of `mteb_test_embed_models`.
MTEB_EMBED_TOL = 1e-4
# Placeholder metadata assigned to `mteb_model_meta` on `MtebEmbedMixin`.
# Apart from `name`, `revision`, `framework` and `modalities`, every field
# is passed as None.
_empty_model_meta = ModelMeta(
loader=None,
name="vllm/model",
revision="1",
release_date=None,
languages=None,
framework=[],
similarity_fn_name=None,
n_parameters=None,
memory_usage_mb=None,
max_tokens=None,
embed_dim=None,
license=None,
open_weights=None,
public_training_code=None,
public_training_data=None,
use_instructions=None,
training_datasets=None,
modalities=["text"], # 'image' can be added to evaluate multimodal models
)
class MtebEmbedMixin(mteb.EncoderProtocol):
    """Common pieces of the MTEB embedding encoders defined in this file.

    Provides the placeholder model metadata and two cosine-similarity
    helpers; the subclasses in this file supply ``encode``.
    """

    mteb_model_meta = _empty_model_meta

    def similarity(
        self,
        embeddings1: np.ndarray,
        embeddings2: np.ndarray,
    ) -> np.ndarray:
        """Return the cosine similarity of every row pair across both inputs.

        Entry ``[i, j]`` of the result compares row ``i`` of ``embeddings1``
        with row ``j`` of ``embeddings2``.
        """
        # Row-wise L2 norms are kept as column vectors so that their product
        # broadcasts to one scale factor per (i, j) pair.
        lhs_norms = np.linalg.norm(embeddings1, axis=1, keepdims=True)
        rhs_norms = np.linalg.norm(embeddings2, axis=1, keepdims=True)
        scale = lhs_norms * rhs_norms.T
        dot_products = np.dot(embeddings1, embeddings2.T)
        return dot_products / scale

    def similarity_pairwise(
        self,
        embeddings1: Array,
        embeddings2: Array,
    ) -> Array:
        """Return the cosine similarity of matching rows of the two inputs.

        Entry ``i`` of the result compares row ``i`` of ``embeddings1`` with
        row ``i`` of ``embeddings2``.
        """
        lhs_norms = np.linalg.norm(embeddings1, axis=1, keepdims=True).flatten()
        rhs_norms = np.linalg.norm(embeddings2, axis=1, keepdims=True).flatten()
        # Element-wise product summed along each row is the per-row dot product.
        dot_products = np.sum(embeddings1 * embeddings2, axis=1)
        return dot_products / (lhs_norms * rhs_norms)
class VllmMtebEncoder(MtebEmbedMixin):
    """MTEB encoder that embeds text by calling ``embed`` on a vLLM model."""

    def __init__(self, vllm_model):
        self.llm = vllm_model
        # A fixed seed makes the shuffling done in ``encode`` reproducible.
        self.rng = np.random.default_rng(seed=42)

    def encode(
        self,
        inputs: DataLoader[mteb.types.BatchedInput],
        *args,
        **kwargs,
    ) -> np.ndarray:
        """Embed every text in ``inputs`` and return them in input order.

        The texts are submitted in a random order, in the hope of surfacing
        scheduling issues, and the embeddings are put back into the original
        order before being returned. Extra positional and keyword arguments
        are accepted but not used.
        """
        texts = []
        for batch in inputs:
            texts.extend(batch["text"])

        order = self.rng.permutation(len(texts))
        shuffled_texts = [texts[idx] for idx in order]

        raw_outputs = self.llm.embed(shuffled_texts, use_tqdm=False)

        # argsort of a permutation yields its inverse, which maps the
        # shuffled results back to their original positions.
        restore_order = np.argsort(order)
        return np.array(raw_outputs)[restore_order]
class OpenAIClientMtebEncoder(MtebEmbedMixin):
    """MTEB encoder that embeds text through ``client.embeddings.create``."""

    def __init__(self, model_name: str, client):
        self.model_name = model_name
        self.client = client
        # A fixed seed makes the shuffling done in ``encode`` reproducible.
        self.rng = np.random.default_rng(seed=42)

    def encode(
        self,
        inputs: DataLoader[mteb.types.BatchedInput],
        *args,
        **kwargs,
    ) -> np.ndarray:
        """Embed every text in ``inputs`` and return them in input order.

        The texts are sent in a random order, in the hope of surfacing
        scheduling issues, and the embeddings are put back into the original
        order before being returned. Extra positional and keyword arguments
        are accepted but not used.
        """
        texts = []
        for batch in inputs:
            texts.extend(batch["text"])

        order = self.rng.permutation(len(texts))
        shuffled_texts = [texts[idx] for idx in order]

        response = self.client.embeddings.create(
            model=self.model_name, input=shuffled_texts
        )
        vectors = [item.embedding for item in response.data]

        # argsort of a permutation yields its inverse, which maps the
        # shuffled results back to their original positions.
        restore_order = np.argsort(order)
        return np.array(vectors)[restore_order]
def run_mteb_embed_task(encoder: mteb.EncoderProtocol, tasks):
    """Evaluate ``encoder`` on the named MTEB tasks and return one main score.

    Only the first evaluation result is read: the value returned is the
    ``"main_score"`` of the first entry under its ``"test"`` scores.
    """
    selected_tasks = mteb.get_tasks(tasks=tasks)
    evaluation = mteb.evaluate(
        encoder,
        selected_tasks,
        cache=None,
        show_progress_bar=False,
    )
    first_result = evaluation[0]
    test_scores = first_result.scores["test"]
    return test_scores[0]["main_score"]
def mteb_test_embed_models(
    hf_runner,
    vllm_runner,
    model_info: EmbedModelInfo,
    vllm_extra_kwargs=None,
    hf_model_callback=None,
    atol=MTEB_EMBED_TOL,
):
    """Compare a vLLM embedding model's MTEB score with a reference score.

    The model is loaded through ``vllm_runner``; the config values recorded
    on ``model_info`` are checked, the MTEB embed tasks are run, and one long
    prompt is embedded to check for NaNs and the embedding size. The
    reference score is ``model_info.mteb_score`` when it is set; otherwise it
    is measured by running the same tasks through ``hf_runner`` in
    SentenceTransformers mode, in which case the embeddings of the long
    prompt are also compared against the vLLM ones.

    ``hf_model_callback``, if given, is called with the HF model before it is
    evaluated. The test fails when the reference score exceeds the vLLM score
    by ``atol`` or more.
    """
    vllm_extra_kwargs = get_vllm_extra_kwargs(model_info, vllm_extra_kwargs)

    # A single long prompt, reused for the NaN / embedding-size checks and
    # for the direct comparison with the HF embeddings.
    example_prompts = ["The chef prepared a delicious meal." * 1000]

    with vllm_runner(
        model_info.name,
        runner="pooling",
        max_model_len=model_info.max_model_len,
        **vllm_extra_kwargs,
    ) as vllm_model:
        model_config = vllm_model.llm.llm_engine.model_config

        # vLLM must have resolved the expected architecture.
        if model_info.architecture:
            assert model_info.architecture in model_config.architectures

        # The important model_config values must match whatever is recorded
        # on model_info; a value of None means "do not check".
        expected_pooling_type = model_info.pooling_type
        if expected_pooling_type is not None:
            assert model_config.pooler_config.pooling_type == expected_pooling_type

        for field_name in (
            "attn_type",
            "is_prefix_caching_supported",
            "is_chunked_prefill_supported",
        ):
            expected_value = getattr(model_info, field_name)
            if expected_value is not None:
                assert getattr(model_config, field_name) == expected_value

        vllm_main_score = run_mteb_embed_task(
            VllmMtebEncoder(vllm_model), MTEB_EMBED_TASKS
        )
        vllm_dtype = vllm_model.llm.llm_engine.model_config.dtype
        head_dtype = model_config.head_dtype

        # The long prompt must embed without NaNs and with the configured
        # embedding size.
        vllm_outputs = vllm_model.embed(example_prompts, truncate_prompt_tokens=-1)
        outputs_tensor = torch.tensor(vllm_outputs)
        assert not torch.any(torch.isnan(outputs_tensor))
        assert outputs_tensor.shape[-1] == model_config.embedding_size

    if model_info.mteb_score is not None:
        # A recorded reference score lets the test skip the
        # SentenceTransformers evaluation entirely.
        st_main_score = model_info.mteb_score
        st_dtype = "Constant"
    else:
        with hf_runner(
            model_info.name,
            is_sentence_transformer=True,
            dtype=ci_envs.VLLM_CI_HF_DTYPE or model_info.hf_dtype,
        ) as hf_model:
            # e.g. setting default parameters for the encode method of hf_runner
            if hf_model_callback is not None:
                hf_model_callback(hf_model)

            st_main_score = run_mteb_embed_task(hf_model, MTEB_EMBED_TASKS)
            st_dtype = next(hf_model.model.parameters()).dtype

            # The raw embeddings must also be close to the HF ones.
            hf_outputs = hf_model.encode(example_prompts)
            check_embeddings_close(
                embeddings_0_lst=hf_outputs,
                embeddings_1_lst=vllm_outputs,
                name_0="hf",
                name_1="vllm",
                tol=1e-2,
            )

    score_gap = st_main_score - vllm_main_score

    print("Model:", model_info.name)
    print("VLLM:", f"dtype:{vllm_dtype}", f"head_dtype:{head_dtype}", vllm_main_score)
    print("SentenceTransformers:", st_dtype, st_main_score)
    print("Difference:", score_gap)

    # We are not concerned that the vllm mteb results are better
    # than SentenceTransformers, so we only perform one-sided testing.
    assert score_gap < atol

View File

@@ -7,37 +7,24 @@ from pathlib import Path
import mteb
import numpy as np
import requests
import torch
from mteb.models import ModelMeta
from mteb.types import Array
from torch.utils.data import DataLoader
import tests.ci_envs as ci_envs
from tests.models.utils import (
EmbedModelInfo,
RerankModelInfo,
check_embeddings_close,
get_vllm_extra_kwargs,
)
template_home = (
Path(__file__).parent.parent.parent.parent.parent
/ "examples/pooling/score/template"
)
# Most embedding models on the STS12 task (See #17175):
# - Model implementation and minor changes in tensor dtype
# results in differences less than 1e-4
# - Different model results in differences more than 1e-3
# 1e-4 is a good tolerance threshold
MTEB_EMBED_TASKS = ["STS12"]
MTEB_EMBED_TOL = 1e-4
# See #19344
MTEB_RERANK_TASKS = ["NFCorpus"]
MTEB_RERANK_LANGS = ["eng"]
MTEB_RERANK_TOL = 2e-3
template_home = (
Path(__file__).parent.parent.parent.parent.parent
/ "examples/pooling/score/template"
)
_empty_model_meta = ModelMeta(
loader=None,
name="vllm/model",
@@ -60,84 +47,11 @@ _empty_model_meta = ModelMeta(
)
class VllmMtebEncoder(mteb.EncoderProtocol):
class MtebCrossEncoderMixin(mteb.CrossEncoderProtocol):
mteb_model_meta = _empty_model_meta
def __init__(self, vllm_model):
self.llm = vllm_model
self.rng = np.random.default_rng(seed=42)
def encode(
self,
inputs: DataLoader[mteb.types.BatchedInput],
*args,
**kwargs,
) -> np.ndarray:
# Hoping to discover potential scheduling
# issues by randomizing the order.
sentences = [text for batch in inputs for text in batch["text"]]
r = self.rng.permutation(len(sentences))
sentences = [sentences[i] for i in r]
outputs = self.llm.embed(sentences, use_tqdm=False)
embeds = np.array(outputs)
embeds = embeds[np.argsort(r)]
return embeds
def similarity(
self,
embeddings1: np.ndarray,
embeddings2: np.ndarray,
) -> np.ndarray:
# Cosine similarity
norm1 = np.linalg.norm(embeddings1, axis=1, keepdims=True)
norm2 = np.linalg.norm(embeddings2, axis=1, keepdims=True)
sim = np.dot(embeddings1, embeddings2.T) / (norm1 * norm2.T)
return sim
def similarity_pairwise(
self,
embeddings1: Array,
embeddings2: Array,
) -> Array:
# Cosine similarity
norm1 = np.linalg.norm(embeddings1, axis=1, keepdims=True)
norm2 = np.linalg.norm(embeddings2, axis=1, keepdims=True)
sim = np.sum(embeddings1 * embeddings2, axis=1) / (
norm1.flatten() * norm2.flatten()
)
return sim
class OpenAIClientMtebEncoder(VllmMtebEncoder):
def __init__(self, model_name: str, client):
self.model_name = model_name
self.client = client
self.rng = np.random.default_rng(seed=42)
def encode(
self,
inputs: DataLoader[mteb.types.BatchedInput],
*args,
**kwargs,
) -> np.ndarray:
# Hoping to discover potential scheduling
# issues by randomizing the order.
sentences = [text for batch in inputs for text in batch["text"]]
r = self.rng.permutation(len(sentences))
sentences = [sentences[i] for i in r]
embeddings = self.client.embeddings.create(
model=self.model_name, input=sentences
)
outputs = [d.embedding for d in embeddings.data]
embeds = np.array(outputs)
embeds = embeds[np.argsort(r)]
return embeds
class VllmMtebCrossEncoder(mteb.CrossEncoderProtocol):
mteb_model_meta = _empty_model_meta
class VllmMtebCrossEncoder(MtebCrossEncoderMixin):
def __init__(self, vllm_model):
self.llm = vllm_model
self.rng = np.random.default_rng(seed=42)
@@ -164,7 +78,7 @@ class VllmMtebCrossEncoder(mteb.CrossEncoderProtocol):
return scores
class ScoreClientMtebEncoder(mteb.CrossEncoderProtocol):
class ScoreClientMtebEncoder(MtebCrossEncoderMixin):
mteb_model_meta = _empty_model_meta
def __init__(self, model_name: str, url):
@@ -216,102 +130,6 @@ class RerankClientMtebEncoder(ScoreClientMtebEncoder):
return response["results"][0]["relevance_score"]
def run_mteb_embed_task(encoder: mteb.EncoderProtocol, tasks):
tasks = mteb.get_tasks(tasks=tasks)
results = mteb.evaluate(
encoder,
tasks,
cache=None,
show_progress_bar=False,
)
main_score = results[0].scores["test"][0]["main_score"]
return main_score
def mteb_test_embed_models(
hf_runner,
vllm_runner,
model_info: EmbedModelInfo,
vllm_extra_kwargs=None,
hf_model_callback=None,
atol=MTEB_EMBED_TOL,
):
vllm_extra_kwargs = get_vllm_extra_kwargs(model_info, vllm_extra_kwargs)
# Test embed_dims, isnan and whether to use normalize
example_prompts = ["The chef prepared a delicious meal." * 1000]
with vllm_runner(
model_info.name,
runner="pooling",
max_model_len=model_info.max_model_len,
**vllm_extra_kwargs,
) as vllm_model:
model_config = vllm_model.llm.llm_engine.model_config
# Confirm whether vllm is using the correct architecture
if model_info.architecture:
assert model_info.architecture in model_config.architectures
# Confirm whether vllm uses the correct default_pooling_type, which
# relates to whether chunked prefill and prefix caching are enabled
assert (
model_config._model_info.default_pooling_type
== model_info.default_pooling_type
)
vllm_main_score = run_mteb_embed_task(
VllmMtebEncoder(vllm_model), MTEB_EMBED_TASKS
)
vllm_dtype = vllm_model.llm.llm_engine.model_config.dtype
head_dtype = model_config.head_dtype
# Test embedding_size, isnan and whether to use normalize
vllm_outputs = vllm_model.embed(example_prompts, truncate_prompt_tokens=-1)
outputs_tensor = torch.tensor(vllm_outputs)
assert not torch.any(torch.isnan(outputs_tensor))
embedding_size = model_config.embedding_size
assert torch.tensor(vllm_outputs).shape[-1] == embedding_size
# Accelerate mteb test by setting
# SentenceTransformers mteb score to a constant
if model_info.mteb_score is None:
with hf_runner(
model_info.name,
is_sentence_transformer=True,
dtype=ci_envs.VLLM_CI_HF_DTYPE or model_info.hf_dtype,
) as hf_model:
# e.g. setting default parameters for the encode method of hf_runner
if hf_model_callback is not None:
hf_model_callback(hf_model)
st_main_score = run_mteb_embed_task(hf_model, MTEB_EMBED_TASKS)
st_dtype = next(hf_model.model.parameters()).dtype
# Check embeddings close to hf outputs
hf_outputs = hf_model.encode(example_prompts)
check_embeddings_close(
embeddings_0_lst=hf_outputs,
embeddings_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
tol=1e-2,
)
else:
st_main_score = model_info.mteb_score
st_dtype = "Constant"
print("Model:", model_info.name)
print("VLLM:", f"dtype:{vllm_dtype}", f"head_dtype:{head_dtype}", vllm_main_score)
print("SentenceTransformers:", st_dtype, st_main_score)
print("Difference:", st_main_score - vllm_main_score)
# We are not concerned that the vllm mteb results are better
# than SentenceTransformers, so we only perform one-sided testing.
assert st_main_score - vllm_main_score < atol
def run_mteb_rerank(cross_encoder: mteb.CrossEncoderProtocol, tasks, languages):
with tempfile.TemporaryDirectory() as prediction_folder:
bm25s = mteb.get_model("bm25s")
@@ -391,18 +209,28 @@ def mteb_test_rerank_models(
# Score API is only enabled for num_labels == 1
assert model_config.hf_config.num_labels == 1
# Confirm whether vllm uses the correct default_pooling_type, which
# relates to whether chunked prefill and prefix caching are enabled
assert (
model_config._model_info.default_pooling_type
== model_info.default_pooling_type
)
# Maybe load chat_template.
chat_template: str | None = None
if model_info.chat_template_name is not None:
chat_template = (template_home / model_info.chat_template_name).read_text()
vllm_model.chat_template = chat_template
# Confirm whether the important configs in model_config are correct.
if model_info.pooling_type is not None:
assert model_config.pooler_config.pooling_type == model_info.pooling_type
if model_info.attn_type is not None:
assert model_config.attn_type == model_info.attn_type
if model_info.is_prefix_caching_supported is not None:
assert (
model_config.is_prefix_caching_supported
== model_info.is_prefix_caching_supported
)
if model_info.is_chunked_prefill_supported is not None:
assert (
model_config.is_chunked_prefill_supported
== model_info.is_chunked_prefill_supported
)
vllm_main_score = run_mteb_rerank(
vllm_mteb_encoder(vllm_model),
tasks=MTEB_RERANK_TASKS,

View File

@@ -4,90 +4,94 @@ import pytest
from tests.models.language.pooling.embed_utils import correctness_test_embed_models
from tests.models.utils import (
CLSPoolingEmbedModelInfo,
CLSPoolingRerankModelInfo,
EmbedModelInfo,
LASTPoolingEmbedModelInfo,
RerankModelInfo,
)
from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
from .mteb_embed_utils import mteb_test_embed_models
from .mteb_score_utils import mteb_test_rerank_models
MODELS = [
########## BertModel
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"BAAI/bge-base-en",
architecture="BertModel",
mteb_score=0.779336792,
pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True,
),
CLSPoolingEmbedModelInfo(
"BAAI/bge-base-zh", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"BAAI/bge-small-en", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"BAAI/bge-small-zh", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"BAAI/bge-large-en", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"BAAI/bge-large-zh", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo("BAAI/bge-base-zh", architecture="BertModel", enable_test=False),
EmbedModelInfo("BAAI/bge-small-en", architecture="BertModel", enable_test=False),
EmbedModelInfo("BAAI/bge-small-zh", architecture="BertModel", enable_test=False),
EmbedModelInfo("BAAI/bge-large-en", architecture="BertModel", enable_test=False),
EmbedModelInfo("BAAI/bge-large-zh", architecture="BertModel", enable_test=False),
EmbedModelInfo(
"BAAI/bge-large-zh-noinstruct", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"BAAI/bge-base-en-v1.5", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"BAAI/bge-base-zh-v1.5", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"BAAI/bge-small-en-v1.5", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"BAAI/bge-small-zh-v1.5", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"BAAI/bge-large-en-v1.5", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"BAAI/bge-large-zh-v1.5", architecture="BertModel", enable_test=False
),
########## XLMRobertaModel
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"BAAI/bge-m3",
architecture="XLMRobertaModel",
mteb_score=0.787343078,
pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True,
),
########## Qwen2Model
LASTPoolingEmbedModelInfo(
EmbedModelInfo(
"BAAI/bge-code-v1",
architecture="Qwen2Model",
mteb_score=0.75724465,
dtype="float32",
pooling_type="LAST",
attn_type="decoder",
is_prefix_caching_supported=True,
is_chunked_prefill_supported=True,
enable_test=True,
),
]
RERANK_MODELS = [
########## XLMRobertaForSequenceClassification
CLSPoolingRerankModelInfo(
RerankModelInfo(
"BAAI/bge-reranker-base",
architecture="XLMRobertaForSequenceClassification",
mteb_score=0.32398,
pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True,
),
CLSPoolingRerankModelInfo(
RerankModelInfo(
"BAAI/bge-reranker-large",
architecture="XLMRobertaForSequenceClassification",
enable_test=False,
),
CLSPoolingRerankModelInfo(
RerankModelInfo(
"BAAI/bge-reranker-v2-m3",
architecture="XLMRobertaForSequenceClassification",
enable_test=False,

View File

@@ -9,14 +9,12 @@ import torch
from torch.utils.data import DataLoader
from tests.conftest import HfRunner
from tests.models.language.pooling_mteb_test.mteb_utils import (
VllmMtebCrossEncoder,
mteb_test_rerank_models,
)
from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo
from tests.models.utils import RerankModelInfo
from .mteb_score_utils import VllmMtebCrossEncoder, mteb_test_rerank_models
RERANK_MODELS = [
LASTPoolingRerankModelInfo(
RerankModelInfo(
"BAAI/bge-reranker-v2-gemma",
architecture="GemmaForSequenceClassification",
mteb_score=0.33757,
@@ -25,6 +23,10 @@ RERANK_MODELS = [
"classifier_from_token": ["Yes"],
"method": "no_post_processing",
},
pooling_type="LAST",
attn_type="decoder",
is_prefix_caching_supported=True,
is_chunked_prefill_supported=True,
),
]

View File

@@ -3,23 +3,29 @@
import pytest
from tests.models.utils import (
CLSPoolingRerankModelInfo,
LASTPoolingRerankModelInfo,
RerankModelInfo,
)
from .mteb_utils import mteb_test_rerank_models
from .mteb_score_utils import mteb_test_rerank_models
RERANK_MODELS = [
CLSPoolingRerankModelInfo(
RerankModelInfo(
"cross-encoder/ms-marco-TinyBERT-L-2-v2",
mteb_score=0.32898,
architecture="BertForSequenceClassification",
pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
),
LASTPoolingRerankModelInfo(
RerankModelInfo(
"tomaarsen/Qwen3-Reranker-0.6B-seq-cls",
mteb_score=0.25736,
architecture="Qwen3ForSequenceClassification",
pooling_type="LAST",
attn_type="decoder",
is_prefix_caching_supported=True,
is_chunked_prefill_supported=True,
),
]

View File

@@ -5,36 +5,32 @@ import pytest
from tests.models.language.pooling.embed_utils import correctness_test_embed_models
from tests.models.utils import (
CLSPoolingEmbedModelInfo,
CLSPoolingRerankModelInfo,
EmbedModelInfo,
LASTPoolingEmbedModelInfo,
RerankModelInfo,
)
from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
from .mteb_embed_utils import mteb_test_embed_models
from .mteb_score_utils import mteb_test_rerank_models
MODELS = [
########## BertModel
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"thenlper/gte-large",
mteb_score=0.76807651,
architecture="BertModel",
pooling_type="MEAN",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True,
),
CLSPoolingEmbedModelInfo(
"thenlper/gte-base", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"thenlper/gte-small", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo("thenlper/gte-base", architecture="BertModel", enable_test=False),
EmbedModelInfo("thenlper/gte-small", architecture="BertModel", enable_test=False),
EmbedModelInfo(
"thenlper/gte-large-zh", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"thenlper/gte-base-zh", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo("thenlper/gte-base-zh", architecture="BertModel", enable_test=False),
EmbedModelInfo(
"thenlper/gte-small-zh", architecture="BertModel", enable_test=False
),
########### NewModel
@@ -43,48 +39,64 @@ MODELS = [
# - whether to use token_type_embeddings
# - whether to use context expansion
# So only test one (the most widely used) model
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"Alibaba-NLP/gte-multilingual-base",
architecture="GteNewModel",
mteb_score=0.775074696,
hf_overrides={"architectures": ["GteNewModel"]},
pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True,
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"Alibaba-NLP/gte-base-en-v1.5",
architecture="GteNewModel",
hf_overrides={"architectures": ["GteNewModel"]},
enable_test=False,
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"Alibaba-NLP/gte-large-en-v1.5",
architecture="GteNewModel",
hf_overrides={"architectures": ["GteNewModel"]},
enable_test=False,
),
########### Qwen2ForCausalLM
LASTPoolingEmbedModelInfo(
EmbedModelInfo(
"Alibaba-NLP/gte-Qwen2-1.5B-instruct",
mteb_score=0.758473459018872,
architecture="Qwen2ForCausalLM",
pooling_type="LAST",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True,
),
########## ModernBertModel
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"Alibaba-NLP/gte-modernbert-base",
mteb_score=0.748193353,
architecture="ModernBertModel",
pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True,
),
########## Qwen3ForCausalLM
LASTPoolingEmbedModelInfo(
EmbedModelInfo(
"Qwen/Qwen3-Embedding-0.6B",
mteb_score=0.771163695,
architecture="Qwen3ForCausalLM",
dtype="float32",
pooling_type="LAST",
attn_type="decoder",
is_prefix_caching_supported=True,
is_chunked_prefill_supported=True,
enable_test=True,
),
LASTPoolingEmbedModelInfo(
EmbedModelInfo(
"Qwen/Qwen3-Embedding-4B",
architecture="Qwen3ForCausalLM",
dtype="float32",
@@ -93,18 +105,26 @@ MODELS = [
]
RERANK_MODELS = [
CLSPoolingRerankModelInfo(
RerankModelInfo(
# classifier_pooling: mean
"Alibaba-NLP/gte-reranker-modernbert-base",
mteb_score=0.33386,
architecture="ModernBertForSequenceClassification",
pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True,
),
CLSPoolingRerankModelInfo(
RerankModelInfo(
"Alibaba-NLP/gte-multilingual-reranker-base",
mteb_score=0.33062,
architecture="GteNewForSequenceClassification",
hf_overrides={"architectures": ["GteNewForSequenceClassification"]},
pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True,
),
]

View File

@@ -3,40 +3,44 @@
import pytest
from tests.models.language.pooling.embed_utils import correctness_test_embed_models
from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo
from tests.models.utils import EmbedModelInfo
from .mteb_utils import mteb_test_embed_models
from .mteb_embed_utils import mteb_test_embed_models
MODELS = [
########## BertModel
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"intfloat/e5-small",
architecture="BertModel",
mteb_score=0.742285423,
pooling_type="MEAN",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True,
),
CLSPoolingEmbedModelInfo(
"intfloat/e5-base", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"intfloat/e5-large", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo("intfloat/e5-base", architecture="BertModel", enable_test=False),
EmbedModelInfo("intfloat/e5-large", architecture="BertModel", enable_test=False),
EmbedModelInfo(
"intfloat/multilingual-e5-small", architecture="BertModel", enable_test=False
),
########## XLMRobertaModel
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"intfloat/multilingual-e5-base",
architecture="XLMRobertaModel",
mteb_score=0.779325955,
pooling_type="MEAN",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True,
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"intfloat/multilingual-e5-large",
architecture="XLMRobertaModel",
enable_test=False,
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"intfloat/multilingual-e5-large-instruct",
architecture="XLMRobertaModel",
enable_test=False,

View File

@@ -10,30 +10,37 @@ from tests.models.language.pooling.embed_utils import (
matryoshka_fy,
)
from tests.models.utils import (
CLSPoolingEmbedModelInfo,
CLSPoolingRerankModelInfo,
EmbedModelInfo,
RerankModelInfo,
)
from vllm import PoolingParams
from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
from .mteb_embed_utils import mteb_test_embed_models
from .mteb_score_utils import mteb_test_rerank_models
EMBEDDING_MODELS = [
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"jinaai/jina-embeddings-v3",
mteb_score=0.824413164,
architecture="XLMRobertaModel",
is_matryoshka=True,
pooling_type="MEAN",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
dtype="float32",
)
]
RERANK_MODELS = [
CLSPoolingRerankModelInfo(
RerankModelInfo(
"jinaai/jina-reranker-v2-base-multilingual",
mteb_score=0.33643,
architecture="XLMRobertaForSequenceClassification",
pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
)
]

View File

@@ -6,9 +6,9 @@ import pytest
import torch
from tests.conftest import HfRunner
from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo
from tests.models.utils import RerankModelInfo
from .mteb_utils import mteb_test_rerank_models
from .mteb_score_utils import mteb_test_rerank_models
mxbai_rerank_hf_overrides = {
"architectures": ["Qwen2ForSequenceClassification"],
@@ -17,14 +17,18 @@ mxbai_rerank_hf_overrides = {
}
RERANK_MODELS = [
LASTPoolingRerankModelInfo(
RerankModelInfo(
"mixedbread-ai/mxbai-rerank-base-v2",
architecture="Qwen2ForSequenceClassification",
hf_overrides=mxbai_rerank_hf_overrides,
mteb_score=0.273,
pooling_type="LAST",
attn_type="decoder",
is_prefix_caching_supported=True,
is_chunked_prefill_supported=True,
enable_test=True,
),
LASTPoolingRerankModelInfo(
RerankModelInfo(
"mixedbread-ai/mxbai-rerank-large-v2",
architecture="Qwen2ForSequenceClassification",
hf_overrides=mxbai_rerank_hf_overrides,

View File

@@ -3,29 +3,39 @@
import pytest
from tests.models.language.pooling_mteb_test.mteb_embed_utils import (
mteb_test_embed_models,
)
from tests.models.language.pooling_mteb_test.mteb_score_utils import (
mteb_test_rerank_models,
)
from tests.models.utils import (
EmbedModelInfo,
LASTPoolingEmbedModelInfo,
LASTPoolingRerankModelInfo,
RerankModelInfo,
)
from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
EMBEDDING_MODELS = [
LASTPoolingEmbedModelInfo(
EmbedModelInfo(
"nvidia/llama-nemotron-embed-1b-v2",
architecture="LlamaBidirectionalModel",
mteb_score=0.689164662128673,
pooling_type="MEAN",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
)
]
RERANK_MODELS = [
LASTPoolingRerankModelInfo(
RerankModelInfo(
"nvidia/llama-nemotron-rerank-1b-v2",
architecture="LlamaBidirectionalForSequenceClassification",
chat_template_name="nemotron-rerank.jinja",
mteb_score=0.33994,
pooling_type="MEAN",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
),
]

View File

@@ -4,30 +4,38 @@
import pytest
from tests.models.language.pooling.embed_utils import correctness_test_embed_models
from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo
from tests.models.utils import EmbedModelInfo
from .mteb_utils import mteb_test_embed_models
from .mteb_embed_utils import mteb_test_embed_models
MODELS = [
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"nomic-ai/nomic-embed-text-v1",
architecture="NomicBertModel",
mteb_score=0.737568559,
enable_test=True,
pooling_type="MEAN",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"nomic-ai/nomic-embed-text-v1.5",
architecture="NomicBertModel",
enable_test=False,
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"nomic-ai/CodeRankEmbed", architecture="NomicBertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"nomic-ai/nomic-embed-text-v2-moe",
architecture="NomicBertModel",
mteb_score=0.715488912,
enable_test=True,
pooling_type="MEAN",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
),
]

View File

@@ -6,10 +6,10 @@ import pytest
import torch
from tests.conftest import HfRunner
from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo
from tests.models.utils import RerankModelInfo
from tests.utils import multi_gpu_test
from .mteb_utils import mteb_test_rerank_models
from .mteb_score_utils import mteb_test_rerank_models
qwen3_reranker_hf_overrides = {
"architectures": ["Qwen3ForSequenceClassification"],
@@ -18,14 +18,18 @@ qwen3_reranker_hf_overrides = {
}
RERANK_MODELS = [
LASTPoolingRerankModelInfo(
RerankModelInfo(
"Qwen/Qwen3-Reranker-0.6B",
architecture="Qwen3ForSequenceClassification",
mteb_score=0.25736,
hf_overrides=qwen3_reranker_hf_overrides,
pooling_type="LAST",
attn_type="decoder",
is_prefix_caching_supported=True,
is_chunked_prefill_supported=True,
enable_test=True,
),
LASTPoolingRerankModelInfo(
RerankModelInfo(
"Qwen/Qwen3-Reranker-4B",
architecture="Qwen3ForSequenceClassification",
hf_overrides=qwen3_reranker_hf_overrides,

View File

@@ -4,62 +4,82 @@
import pytest
from tests.models.language.pooling.embed_utils import correctness_test_embed_models
from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo
from tests.models.utils import EmbedModelInfo
from .mteb_utils import mteb_test_embed_models
from .mteb_embed_utils import mteb_test_embed_models
MODELS = [
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"Snowflake/snowflake-arctic-embed-xs",
is_matryoshka=False,
architecture="BertModel",
mteb_score=0.714927797,
pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True,
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"Snowflake/snowflake-arctic-embed-s",
is_matryoshka=False,
architecture="BertModel",
enable_test=False,
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"Snowflake/snowflake-arctic-embed-m",
is_matryoshka=False,
architecture="BertModel",
enable_test=False,
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"Snowflake/snowflake-arctic-embed-m-long",
is_matryoshka=False,
architecture="NomicBertModel",
mteb_score=0.681146831,
pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True,
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"Snowflake/snowflake-arctic-embed-l",
is_matryoshka=False,
architecture="BertModel",
enable_test=False,
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"Snowflake/snowflake-arctic-embed-m-v1.5",
is_matryoshka=True,
architecture="BertModel",
mteb_score=0.649088363,
pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True,
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"Snowflake/snowflake-arctic-embed-l-v2.0",
is_matryoshka=True,
architecture="XLMRobertaModel",
mteb_score=0.712258299,
pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True,
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"Snowflake/snowflake-arctic-embed-m-v2.0",
is_matryoshka=True,
architecture="GteModel",
mteb_score=0.706622444,
pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True,
),
]

View File

@@ -3,25 +3,31 @@
import pytest
from tests.models.utils import (
CLSPoolingEmbedModelInfo,
EmbedModelInfo,
LASTPoolingEmbedModelInfo,
)
from .mteb_utils import mteb_test_embed_models
from .mteb_embed_utils import mteb_test_embed_models
# ST models with projector (Dense) layers
ST_PROJECTOR_MODELS = [
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"TencentBAC/Conan-embedding-v1",
architecture="BertModel",
mteb_score=0.688611955,
pooling_type="MEAN",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True,
),
LASTPoolingEmbedModelInfo(
EmbedModelInfo(
"google/embeddinggemma-300m",
architecture="Gemma3TextModel",
mteb_score=0.7473819294684156,
pooling_type="MEAN",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True,
dtype="float32",
),