[CI/Build] Reorganize models tests (#17459)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-01 14:03:08 +08:00
parent aa4502e7f3
commit afb4429b4f
65 changed files with 316 additions and 323 deletions
--- a/tests/models/language/pooling/init.py
+++ b/tests/models/language/pooling/init.py
--- a/tests/models/language/pooling/test_cls_models.py
+++ b/tests/models/language/pooling/test_cls_models.py
@@ -0,0 +1,52 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Compare the classification outputs of HF and vLLM models.
+
+Run `pytest tests/models/test_cls_models.py`.
+"""
+import pytest
+import torch
+from transformers import AutoModelForSequenceClassification
+
+from vllm.platforms import current_platform
+
+
+@pytest.mark.parametrize(
+    "model",
+    [
+        pytest.param("jason9693/Qwen2.5-1.5B-apeach",
+                     marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
+    ],
+)
+@pytest.mark.parametrize("dtype",
+                         ["half"] if current_platform.is_rocm() else ["float"])
+def test_classification_models(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    monkeypatch,
+) -> None:
+    if current_platform.is_rocm():
+        # ROCm Triton FA does not currently support sliding window attention
+        # switch to use ROCm CK FA backend
+        monkeypatch.setenv("VLLM_USE_TRITON_FLASH_ATTN", "False")
+
+    with vllm_runner(model, dtype=dtype) as vllm_model:
+        vllm_outputs = vllm_model.classify(example_prompts)
+
+    with hf_runner(model,
+                   dtype=dtype,
+                   auto_cls=AutoModelForSequenceClassification) as hf_model:
+        hf_outputs = hf_model.classify(example_prompts)
+
+    # check logits difference
+    for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
+        hf_output = torch.tensor(hf_output)
+        vllm_output = torch.tensor(vllm_output)
+
+        # the tolerance value of 1e-2 is selected based on the
+        # half datatype tests in
+        # tests/models/embedding/language/test_embedding.py
+        assert torch.allclose(hf_output, vllm_output,
+                              1e-3 if dtype == "float" else 1e-2)
--- a/tests/models/language/pooling/test_embedding.py
+++ b/tests/models/language/pooling/test_embedding.py
@@ -0,0 +1,82 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Compare the embedding outputs of HF and vLLM models.
+
+Run `pytest tests/models/embedding/language/test_embedding.py`.
+"""
+import pytest
+
+from vllm.config import PoolerConfig
+from vllm.platforms import current_platform
+
+from ...utils import check_embeddings_close
+
+
+@pytest.mark.parametrize(
+    "model",
+    [
+        # [Encoder-only]
+        pytest.param("BAAI/bge-base-en-v1.5",
+                     marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
+        pytest.param("sentence-transformers/all-MiniLM-L12-v2"),
+        pytest.param("intfloat/multilingual-e5-small"),
+        pytest.param("Alibaba-NLP/gte-Qwen2-7B-instruct"),
+        # [Decoder-only]
+        pytest.param("BAAI/bge-multilingual-gemma2",
+                     marks=[pytest.mark.core_model]),
+        pytest.param("intfloat/e5-mistral-7b-instruct",
+                     marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
+        pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct"),
+        pytest.param("ssmits/Qwen2-7B-Instruct-embed-base"),
+        # [Cross-Encoder]
+        pytest.param("sentence-transformers/stsb-roberta-base-v2"),
+    ],
+)
+@pytest.mark.parametrize("dtype", ["half"])
+def test_models(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    model,
+    dtype: str,
+    monkeypatch,
+) -> None:
+
+    if model == "BAAI/bge-multilingual-gemma2" and current_platform.is_rocm():
+        # ROCm Triton FA does not currently support sliding window attention
+        # switch to use ROCm CK FA backend
+        monkeypatch.setenv("VLLM_USE_TRITON_FLASH_ATTN", "False")
+
+    vllm_extra_kwargs = {}
+    if model == "ssmits/Qwen2-7B-Instruct-embed-base":
+        vllm_extra_kwargs["override_pooler_config"] = \
+            PoolerConfig(pooling_type="MEAN")
+
+    if model == "Alibaba-NLP/gte-Qwen2-1.5B-instruct":
+        vllm_extra_kwargs["hf_overrides"] = {"is_causal": True}
+
+    # The example_prompts has ending "\n", for example:
+    # "Write a short story about a robot that dreams for the first time.\n"
+    # sentence_transformers will strip the input texts, see:
+    # https://github.com/UKPLab/sentence-transformers/blob/v3.1.1/sentence_transformers/models/Transformer.py#L159
+    # This makes the input_ids different between hf_model and vllm_model.
+    # So we need to strip the input texts to avoid test failing.
+    example_prompts = [str(s).strip() for s in example_prompts]
+
+    with hf_runner(model, dtype=dtype,
+                   is_sentence_transformer=True) as hf_model:
+        hf_outputs = hf_model.encode(example_prompts)
+
+    with vllm_runner(model,
+                     task="embed",
+                     dtype=dtype,
+                     max_model_len=None,
+                     **vllm_extra_kwargs) as vllm_model:
+        vllm_outputs = vllm_model.encode(example_prompts)
+
+    check_embeddings_close(
+        embeddings_0_lst=hf_outputs,
+        embeddings_1_lst=vllm_outputs,
+        name_0="hf",
+        name_1="vllm",
+        tol=1e-2,
+    )
--- a/tests/models/language/pooling/test_gritlm.py
+++ b/tests/models/language/pooling/test_gritlm.py
@@ -0,0 +1,213 @@
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
+import importlib.util
+import math
+from array import array
+
+import openai
+import pytest
+from scipy.spatial.distance import cosine
+
+from vllm import LLM, SamplingParams
+from vllm.config import ModelConfig
+from vllm.utils import STR_BACKEND_ENV_VAR
+
+from ....utils import RemoteOpenAIServer
+
+# GritLM embedding implementation is only supported by XFormers backend.
+pytestmark = pytest.mark.skipif(not importlib.util.find_spec("xformers"),
+                                reason="GritLM requires XFormers")
+
+MODEL_NAME = "parasail-ai/GritLM-7B-vllm"
+MAX_MODEL_LEN = 4000
+
+
+def _arr(arr):
+    """
+    Convert a list of integers to an array of integers.
+    """
+    return array("i", arr)
+
+
+def test_find_array():
+    from vllm.model_executor.models.gritlm import GritLMPooler
+
+    model_config = ModelConfig(
+        MODEL_NAME,
+        task="embed",
+        tokenizer=MODEL_NAME,
+        tokenizer_mode="auto",
+        trust_remote_code=False,
+        dtype="bfloat16",
+        seed=0,
+    )
+    pooler = GritLMPooler(model_config=model_config)
+
+    arr = _arr([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
+
+    assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=0) == 3
+    assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=1) == 3
+    assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=5) == -1
+    assert pooler._find_array(arr, _arr([3, 5]), start_idx=0) == -1
+
+    with pytest.raises(ValueError):
+        pooler._find_array(arr, _arr([3, 4, 5]), start_idx=-1)
+
+
+def run_llm_encode(
+    llm: LLM,
+    queries: list[str],
+    instruction: str,
+) -> list[list[float]]:
+    outputs = llm.embed([instruction + q for q in queries])
+    return [output.outputs.embedding for output in outputs]
+
+
+async def run_client_embeddings(
+    client: openai.AsyncOpenAI,
+    queries: list[str],
+    instruction: str,
+) -> list[list[float]]:
+    outputs = await client.embeddings.create(
+        model=MODEL_NAME,
+        input=[instruction + q for q in queries],
+    )
+    return [data.embedding for data in outputs.data]
+
+
+def gritlm_instruction(instruction):
+    return ("<|user|>\n" + instruction +
+            "\n<|embed|>\n" if instruction else "<|embed|>\n")
+
+
+def get_test_data():
+    """
+    Grabbed this test data and the expected values from
+    README.md in https://github.com/ContextualAI/gritlm
+    """
+    q_instruction = gritlm_instruction(
+        "Given a scientific paper title, retrieve the paper's abstract", )
+    queries = [
+        "Bitcoin: A Peer-to-Peer Electronic Cash System",
+        "Generative Representational Instruction Tuning",
+    ]
+
+    d_instruction = gritlm_instruction("")
+    documents = [
+        # ruff: noqa: E501
+        "A purely peer-to-peer version of electronic cash would allow online payments to be sent directly from one party to another without going through a financial institution. Digital signatures provide part of the solution, but the main benefits are lost if a trusted third party is still required to prevent double-spending. We propose a solution to the double-spending problem using a peer-to-peer network. The network timestamps transactions by hashing them into an ongoing chain of hash-based proof-of-work, forming a record that cannot be changed without redoing the proof-of-work. The longest chain not only serves as proof of the sequence of events witnessed, but proof that it came from the largest pool of CPU power. As long as a majority of CPU power is controlled by nodes that are not cooperating to attack the network, they'll generate the longest chain and outpace attackers. The network itself requires minimal structure. Messages are broadcast on a best effort basis, and nodes can leave and rejoin the network at will, accepting the longest proof-of-work chain as proof of what happened while they were gone.",
+        "All text-based language problems can be reduced to either generation or embedding. Current models only perform well at one or the other. We introduce generative representational instruction tuning (GRIT) whereby a large language model is trained to handle both generative and embedding tasks by distinguishing between them through instructions. Compared to other open models, our resulting GritLM 7B sets a new state of the art on the Massive Text Embedding Benchmark (MTEB) and outperforms all models up to its size on a range of generative tasks. By scaling up further, GritLM 8X7B outperforms all open generative language models that we tried while still being among the best embedding models. Notably, we find that GRIT matches training on only generative or embedding data, thus we can unify both at no performance loss. Among other benefits, the unification via GRIT speeds up Retrieval-Augmented Generation (RAG) by > 60% for long documents, by no longer requiring separate retrieval and generation models. Models, code, etc. are freely available at https://github.com/ContextualAI/gritlm.",
+    ]
+
+    return queries, q_instruction, documents, d_instruction
+
+
+def validate_embed_output(q_rep: list[list[float]], d_rep: list[list[float]]):
+    cosine_sim_q0_d0 = 1 - cosine(q_rep[0], d_rep[0])
+    assert math.isclose(cosine_sim_q0_d0, 0.609, abs_tol=0.001)
+
+    cosine_sim_q0_d1 = 1 - cosine(q_rep[0], d_rep[1])
+    assert math.isclose(cosine_sim_q0_d1, 0.101, abs_tol=0.001)
+
+    cosine_sim_q1_d0 = 1 - cosine(q_rep[1], d_rep[0])
+    assert math.isclose(cosine_sim_q1_d0, 0.120, abs_tol=0.001)
+
+    cosine_sim_q1_d1 = 1 - cosine(q_rep[1], d_rep[1])
+    assert math.isclose(cosine_sim_q1_d1, 0.534, abs_tol=0.001)
+
+
+def test_gritlm_offline_embedding(monkeypatch: pytest.MonkeyPatch,
+                                  vllm_runner):
+    # GritLM embedding implementation is only supported by XFormers backend.
+    with monkeypatch.context() as m:
+        m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS")
+
+        queries, q_instruction, documents, d_instruction = get_test_data()
+
+        with vllm_runner(
+                MODEL_NAME,
+                task="embed",
+                max_model_len=MAX_MODEL_LEN,
+        ) as vllm_model:
+            llm = vllm_model.model
+
+            d_rep = run_llm_encode(
+                llm,
+                documents,
+                d_instruction,
+            )
+            q_rep = run_llm_encode(
+                llm,
+                queries,
+                q_instruction,
+            )
+
+        validate_embed_output(q_rep, d_rep)
+
+
+@pytest.mark.asyncio
+async def test_gritlm_api_server_embedding():
+    queries, q_instruction, documents, d_instruction = get_test_data()
+
+    # GritLM embedding implementation is only supported by XFormers backend.
+    args = ["--task", "embed", "--max_model_len", str(MAX_MODEL_LEN)]
+    env_dict = {STR_BACKEND_ENV_VAR: "XFORMERS"}
+
+    with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_dict) as server:
+        client_embedding = server.get_async_client()
+
+        d_rep = await run_client_embeddings(
+            client_embedding,
+            documents,
+            d_instruction,
+        )
+        q_rep = await run_client_embeddings(
+            client_embedding,
+            queries,
+            q_instruction,
+        )
+
+    validate_embed_output(q_rep, d_rep)
+
+
+def test_gritlm_offline_generate(monkeypatch: pytest.MonkeyPatch, vllm_runner):
+    # GritLM embedding implementation is only supported by XFormers backend.
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_V1", "0")
+        m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS")
+
+        input = "<|user|>\nWhat is the capital of France?\n<|assistant|>\n"
+
+        with vllm_runner(
+                MODEL_NAME,
+                task="generate",
+                max_model_len=MAX_MODEL_LEN,
+        ) as vllm_model:
+            llm = vllm_model.model
+
+            sampling_params = SamplingParams(temperature=0.0, max_tokens=256)
+            outputs = llm.generate(input, sampling_params=sampling_params)
+
+        assert outputs[0].outputs[0].text == "The capital of France is Paris."
+
+
+@pytest.mark.asyncio
+async def test_gritlm_api_server_generate():
+    input = "<|user|>\nWhat is the capital of France?\n<|assistant|>\n"
+
+    # GritLM embedding implementation is only supported by XFormers backend.
+    args = ["--task", "generate", "--max_model_len", str(MAX_MODEL_LEN)]
+    env_dict = {"VLLM_USE_V1": "0", STR_BACKEND_ENV_VAR: "XFORMERS"}
+
+    with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_dict) as server:
+        client_generate = server.get_async_client()
+
+        outputs = await client_generate.completions.create(
+            model=MODEL_NAME,
+            prompt=input,
+            max_tokens=256,
+            temperature=0.0,
+        )
+
+    assert outputs.choices[0].text == "The capital of France is Paris."
--- a/tests/models/language/pooling/test_jina.py
+++ b/tests/models/language/pooling/test_jina.py
@@ -0,0 +1,177 @@
+# SPDX-License-Identifier: Apache-2.0
+# ruff: noqa: E501
+"""Compare the scoring outputs of HF and vLLM models.
+
+Run `pytest tests/models/embedding/language/test_jina.py`.
+"""
+import math
+
+import pytest
+
+from vllm import PoolingParams
+
+from ...utils import check_embeddings_close, matryoshka_fy
+
+SCORING_MODELS = [
+    "jinaai/jina-reranker-v2-base-multilingual",  # Roberta
+]
+
+TEXTS_1 = ["Organic skincare products for sensitive skin"]
+
+TEXTS_2 = [
+    "Organic skincare for sensitive skin with aloe vera and chamomile.",
+    "New makeup trends focus on bold colors and innovative techniques",
+    "Bio-Hautpflege für empfindliche Haut mit Aloe Vera und Kamille",
+    "Neue Make-up-Trends setzen auf kräftige Farben und innovative Techniken",
+    "Cuidado de la piel orgánico para piel sensible con aloe vera y manzanilla",
+    "Las nuevas tendencias de maquillaje se centran en colores vivos y técnicas innovadoras",
+    "针对敏感肌专门设计的天然有机护肤产品",
+    "新的化妆趋势注重鲜艳的颜色和创新的技巧",
+    "敏感肌のために特別に設計された天然有機スキンケア製品",
+    "新しいメイクのトレンドは鮮やかな色と革新的な技術に焦点を当てています",
+]
+
+EMBEDDING_MODELS = [
+    "jinaai/jina-embeddings-v3",
+]
+
+EMBEDDING_PROMPTS = [
+    "Follow the white rabbit.",  # English
+    "Sigue al conejo blanco.",  # Spanish
+    "Suis le lapin blanc.",  # French
+    "跟着白兔走。",  # Chinese
+    "اتبع الأرنب الأبيض.",  # Arabic
+    "Folge dem weißen Kaninchen.",  # German
+]
+
+
+@pytest.fixture(scope="module", params=SCORING_MODELS)
+def model_name(request):
+    yield request.param
+
+
+@pytest.mark.parametrize("dtype", ["half"])
+def test_llm_1_to_1(vllm_runner, hf_runner, model_name, dtype: str):
+
+    text_pair = [TEXTS_1[0], TEXTS_2[0]]
+
+    with hf_runner(model_name, dtype=dtype, is_cross_encoder=True) as hf_model:
+        hf_outputs = hf_model.predict([text_pair]).tolist()
+
+    with vllm_runner(model_name, task="score", dtype=dtype,
+                     max_model_len=None) as vllm_model:
+        vllm_outputs = vllm_model.score(text_pair[0], text_pair[1])
+
+    assert len(vllm_outputs) == 1
+    assert len(hf_outputs) == 1
+
+    assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01)
+
+
+@pytest.mark.parametrize("dtype", ["half"])
+def test_llm_1_to_N(vllm_runner, hf_runner, model_name, dtype: str):
+
+    text_pairs = [[TEXTS_1[0], text] for text in TEXTS_2]
+
+    with hf_runner(model_name, dtype=dtype, is_cross_encoder=True) as hf_model:
+        hf_outputs = hf_model.predict(text_pairs).tolist()
+
+    with vllm_runner(model_name, task="score", dtype=dtype,
+                     max_model_len=None) as vllm_model:
+        vllm_outputs = vllm_model.score(TEXTS_1[0], TEXTS_2)
+
+    assert len(vllm_outputs) == 10
+    assert len(hf_outputs) == 10
+
+    assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01)
+    assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01)
+
+
+@pytest.fixture(scope="module", params=EMBEDDING_MODELS)
+def emb_model_name(request):
+    yield request.param
+
+
+def test_is_matryoshka(vllm_runner, emb_model_name):
+    with vllm_runner(emb_model_name, task="embed",
+                     max_model_len=None) as vllm_model:
+        assert vllm_model.model.llm_engine.model_config.is_matryoshka
+
+
+@pytest.mark.parametrize("model", EMBEDDING_MODELS)
+@pytest.mark.parametrize("dtype", ["half"])
+def test_embeddings(
+    hf_runner,
+    vllm_runner,
+    model,
+    dtype: str,
+    monkeypatch,
+) -> None:
+
+    example_prompts = EMBEDDING_PROMPTS
+
+    with hf_runner(
+            model,
+            dtype=dtype,
+            is_sentence_transformer=True,
+    ) as hf_model:
+        hf_outputs = hf_model.encode(example_prompts, task="text-matching")
+
+    with vllm_runner(model, task="embed", dtype=dtype,
+                     max_model_len=None) as vllm_model:
+        vllm_outputs = vllm_model.encode(example_prompts)
+
+    check_embeddings_close(
+        embeddings_0_lst=hf_outputs,
+        embeddings_1_lst=vllm_outputs,
+        name_0="hf",
+        name_1="vllm",
+        tol=1e-2,
+    )
+
+
+@pytest.mark.parametrize("model", EMBEDDING_MODELS)
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("dimensions", [16, 32])
+def test_matryoshka(
+    hf_runner,
+    vllm_runner,
+    model,
+    dtype: str,
+    dimensions: int,
+    monkeypatch,
+) -> None:
+
+    example_prompts = EMBEDDING_PROMPTS
+
+    with hf_runner(
+            model,
+            dtype=dtype,
+            is_sentence_transformer=True,
+    ) as hf_model:
+        hf_outputs = hf_model.encode(example_prompts, task="text-matching")
+        hf_outputs = matryoshka_fy(hf_outputs, dimensions)
+
+    with vllm_runner(model, task="embed", dtype=dtype,
+                     max_model_len=None) as vllm_model:
+        matryoshka_dimensions = (
+            vllm_model.model.llm_engine.model_config.matryoshka_dimensions)
+        assert matryoshka_dimensions is not None
+
+        if dimensions not in matryoshka_dimensions:
+            with pytest.raises(ValueError):
+                vllm_model.encode(
+                    example_prompts,
+                    pooling_params=PoolingParams(dimensions=dimensions))
+        else:
+            vllm_outputs = vllm_model.encode(
+                example_prompts,
+                pooling_params=PoolingParams(dimensions=dimensions))
+
+            check_embeddings_close(
+                embeddings_0_lst=hf_outputs,
+                embeddings_1_lst=vllm_outputs,
+                name_0="hf",
+                name_1="vllm",
+                tol=1e-2,
+            )
--- a/tests/models/language/pooling/test_scoring.py
+++ b/tests/models/language/pooling/test_scoring.py
@@ -0,0 +1,190 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Compare the scoring outputs of HF and vLLM models.
+
+Run `pytest tests/models/embedding/language/test_scoring.py`.
+"""
+import math
+
+import pytest
+import torch
+import torch.nn.functional as F
+
+MODELS = [
+    "cross-encoder/ms-marco-MiniLM-L-6-v2",  # Bert
+    "BAAI/bge-reranker-v2-m3",  # Roberta
+]
+
+EMBEDDING_MODELS = [
+    "sentence-transformers/all-MiniLM-L12-v2",
+]
+
+TEXTS_1 = [
+    "What is the capital of France?",
+    "What is the capital of Germany?",
+]
+
+TEXTS_2 = [
+    "The capital of France is Paris.",
+    "The capital of Germany is Berlin.",
+]
+
+
+@pytest.fixture(scope="module", params=MODELS)
+def model_name(request):
+    yield request.param
+
+
+@pytest.mark.parametrize("dtype", ["half"])
+def test_llm_1_to_1(vllm_runner, hf_runner, model_name, dtype: str):
+
+    text_pair = [TEXTS_1[0], TEXTS_2[0]]
+
+    with hf_runner(model_name, dtype=dtype, is_cross_encoder=True) as hf_model:
+        hf_outputs = hf_model.predict([text_pair]).tolist()
+
+    with vllm_runner(model_name, task="score", dtype=dtype,
+                     max_model_len=None) as vllm_model:
+        vllm_outputs = vllm_model.score(text_pair[0], text_pair[1])
+
+    assert len(vllm_outputs) == 1
+    assert len(hf_outputs) == 1
+
+    assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01)
+
+
+@pytest.mark.parametrize("dtype", ["half"])
+def test_llm_1_to_N(vllm_runner, hf_runner, model_name, dtype: str):
+
+    text_pairs = [
+        [TEXTS_1[0], TEXTS_2[0]],
+        [TEXTS_1[0], TEXTS_2[1]],
+    ]
+
+    with hf_runner(model_name, dtype=dtype, is_cross_encoder=True) as hf_model:
+        hf_outputs = hf_model.predict(text_pairs).tolist()
+
+    with vllm_runner(model_name, task="score", dtype=dtype,
+                     max_model_len=None) as vllm_model:
+        vllm_outputs = vllm_model.score(TEXTS_1[0], TEXTS_2)
+
+    assert len(vllm_outputs) == 2
+    assert len(hf_outputs) == 2
+
+    assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01)
+    assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01)
+
+
+@pytest.mark.parametrize("dtype", ["half"])
+def test_llm_N_to_N(vllm_runner, hf_runner, model_name, dtype: str):
+
+    text_pairs = [
+        [TEXTS_1[0], TEXTS_2[0]],
+        [TEXTS_1[1], TEXTS_2[1]],
+    ]
+
+    with hf_runner(model_name, dtype=dtype, is_cross_encoder=True) as hf_model:
+        hf_outputs = hf_model.predict(text_pairs).tolist()
+
+    with vllm_runner(model_name, task="score", dtype=dtype,
+                     max_model_len=None) as vllm_model:
+        vllm_outputs = vllm_model.score(TEXTS_1, TEXTS_2)
+
+    assert len(vllm_outputs) == 2
+    assert len(hf_outputs) == 2
+
+    assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01)
+    assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01)
+
+
+@pytest.fixture(scope="module", params=EMBEDDING_MODELS)
+def emb_model_name(request):
+    yield request.param
+
+
+@pytest.mark.parametrize("dtype", ["half"])
+def test_llm_1_to_1_embedding(vllm_runner, hf_runner, emb_model_name,
+                              dtype: str):
+
+    text_pair = [TEXTS_1[0], TEXTS_2[0]]
+
+    with hf_runner(emb_model_name, dtype=dtype,
+                   is_sentence_transformer=True) as hf_model:
+        hf_embeddings = hf_model.encode(text_pair)
+        hf_outputs = [
+            F.cosine_similarity(*map(torch.tensor, hf_embeddings), dim=0)
+        ]
+
+    with vllm_runner(emb_model_name,
+                     task="embed",
+                     dtype=dtype,
+                     max_model_len=None) as vllm_model:
+        vllm_outputs = vllm_model.score(text_pair[0], text_pair[1])
+
+    assert len(vllm_outputs) == 1
+    assert len(hf_outputs) == 1
+
+    assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01)
+
+
+@pytest.mark.parametrize("dtype", ["half"])
+def test_llm_1_to_N_embedding(vllm_runner, hf_runner, emb_model_name,
+                              dtype: str):
+
+    text_pairs = [
+        [TEXTS_1[0], TEXTS_2[0]],
+        [TEXTS_1[0], TEXTS_2[1]],
+    ]
+
+    with hf_runner(emb_model_name, dtype=dtype,
+                   is_sentence_transformer=True) as hf_model:
+        hf_embeddings = [
+            hf_model.encode(text_pair) for text_pair in text_pairs
+        ]
+        hf_outputs = [
+            F.cosine_similarity(*map(torch.tensor, pair), dim=0)
+            for pair in hf_embeddings
+        ]
+
+    with vllm_runner(emb_model_name,
+                     task="embed",
+                     dtype=dtype,
+                     max_model_len=None) as vllm_model:
+        vllm_outputs = vllm_model.score(TEXTS_1[0], TEXTS_2)
+
+    assert len(vllm_outputs) == 2
+    assert len(hf_outputs) == 2
+
+    assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01)
+    assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01)
+
+
+@pytest.mark.parametrize("dtype", ["half"])
+def test_llm_N_to_N_embedding(vllm_runner, hf_runner, emb_model_name,
+                              dtype: str):
+
+    text_pairs = [
+        [TEXTS_1[0], TEXTS_2[0]],
+        [TEXTS_1[1], TEXTS_2[1]],
+    ]
+
+    with hf_runner(emb_model_name, dtype=dtype,
+                   is_sentence_transformer=True) as hf_model:
+        hf_embeddings = [
+            hf_model.encode(text_pair) for text_pair in text_pairs
+        ]
+        hf_outputs = [
+            F.cosine_similarity(*map(torch.tensor, pair), dim=0)
+            for pair in hf_embeddings
+        ]
+
+    with vllm_runner(emb_model_name,
+                     task="embed",
+                     dtype=dtype,
+                     max_model_len=None) as vllm_model:
+        vllm_outputs = vllm_model.score(TEXTS_1, TEXTS_2)
+
+    assert len(vllm_outputs) == 2
+    assert len(hf_outputs) == 2
+
+    assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01)
+    assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01)
--- a/tests/models/language/pooling/test_snowflake_arctic_embed.py
+++ b/tests/models/language/pooling/test_snowflake_arctic_embed.py
@@ -0,0 +1,99 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Compare the embedding outputs of HF and vLLM models.
+
+Run `pytest tests/models/embedding/language/test_snowflake_arctic_embed.py`.
+"""
+import pytest
+
+from ...utils import EmbedModelInfo, check_embeddings_close
+
+EMBEDDING_PROMPTS = [
+    'what is snowflake?', 'Where can I get the best tacos?', 'The Data Cloud!',
+    'Mexico City of Course!'
+]
+
+MODELS = [
+    EmbedModelInfo("Snowflake/snowflake-arctic-embed-xs",
+                   is_matryoshka=False,
+                   architecture="BertModel",
+                   enable_test=True),
+    EmbedModelInfo("Snowflake/snowflake-arctic-embed-s",
+                   is_matryoshka=False,
+                   architecture="BertModel",
+                   enable_test=False),
+    EmbedModelInfo("Snowflake/snowflake-arctic-embed-m",
+                   is_matryoshka=False,
+                   architecture="BertModel",
+                   enable_test=False),
+    EmbedModelInfo("Snowflake/snowflake-arctic-embed-m-long",
+                   is_matryoshka=False,
+                   architecture="NomicBertModel",
+                   enable_test=True),
+    EmbedModelInfo("Snowflake/snowflake-arctic-embed-l",
+                   is_matryoshka=False,
+                   architecture="BertModel",
+                   enable_test=False),
+    EmbedModelInfo("Snowflake/snowflake-arctic-embed-m-v1.5",
+                   is_matryoshka=True,
+                   architecture="BertModel",
+                   enable_test=True),
+    EmbedModelInfo("Snowflake/snowflake-arctic-embed-l-v2.0",
+                   is_matryoshka=True,
+                   architecture="XLMRobertaModel",
+                   enable_test=True),
+    EmbedModelInfo("Snowflake/snowflake-arctic-embed-m-v2.0",
+                   is_matryoshka=True,
+                   architecture="GteModel",
+                   enable_test=True),
+]
+
+
+@pytest.mark.parametrize("model_info", MODELS)
+@pytest.mark.parametrize("dtype", ["half"])
+def test_models(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    model_info: EmbedModelInfo,
+    dtype: str,
+    monkeypatch,
+) -> None:
+    if not model_info.enable_test:
+        # A model family has many models with the same architecture,
+        # and we don't need to test each one.
+        pytest.skip("Skipping test.")
+
+    example_prompts = example_prompts + EMBEDDING_PROMPTS
+
+    vllm_extra_kwargs = {
+        "hf_overrides": {
+            "is_matryoshka": model_info.is_matryoshka
+        }
+    }
+
+    with hf_runner(model_info.name, dtype=dtype,
+                   is_sentence_transformer=True) as hf_model:
+        hf_outputs = hf_model.encode(example_prompts)
+
+    with vllm_runner(model_info.name,
+                     task="embed",
+                     dtype=dtype,
+                     max_model_len=None,
+                     **vllm_extra_kwargs) as vllm_model:
+
+        assert (vllm_model.model.llm_engine.model_config.is_matryoshka ==
+                model_info.is_matryoshka)
+
+        if model_info.architecture:
+            assert (model_info.architecture
+                    in vllm_model.model.llm_engine.model_config.architectures)
+
+        vllm_outputs = vllm_model.encode(example_prompts)
+
+    check_embeddings_close(
+        embeddings_0_lst=hf_outputs,
+        embeddings_1_lst=vllm_outputs,
+        name_0="hf",
+        name_1="vllm",
+        tol=1e-2,
+    )
--- a/tests/models/language/pooling/test_truncation_control.py
+++ b/tests/models/language/pooling/test_truncation_control.py
@@ -0,0 +1,69 @@
+# SPDX-License-Identifier: Apache-2.0
+import pytest
+
+MODEL_NAME = "sentence-transformers/all-MiniLM-L12-v2"
+max_model_len = 128
+
+input_str = """Immerse yourself in the enchanting chronicle of calculus, a 
+    mathematical domain that has radically transformed our comprehension of 
+    change and motion. Despite its roots in ancient civilizations, the 
+    formal birth of calculus predominantly occurred in the 17th century, 
+    primarily under the influential guidance of Sir Isaac Newton and Gottfried 
+    Wilhelm Leibniz. The earliest traces of calculus concepts are found in 
+    ancient Greek mathematics,most notably in the works of Eudoxus and 
+    Archimedes, around 300 BCE. They utilized the 'method of exhaustion'—a 
+    technique for computing areas and volumes through the use of finite sums. 
+    This methodology laid crucial foundational work for integral calculus. 
+    In the 17th century, both Newton and Leibniz independently pioneered 
+    calculus, each contributing unique perspectives that would shape this new 
+    field."""
+
+
+def test_smaller_truncation_size(vllm_runner,
+                                 model_name=MODEL_NAME,
+                                 input_str=input_str):
+
+    truncate_prompt_tokens = 10
+
+    with vllm_runner(model_name, task="embed",
+                     max_model_len=max_model_len) as vllm_model:
+        vllm_output = vllm_model.model.encode(
+            input_str, truncate_prompt_tokens=truncate_prompt_tokens)
+
+    prompt_tokens = vllm_output[0].prompt_token_ids
+
+    assert len(prompt_tokens) == truncate_prompt_tokens
+
+
+def test_max_truncation_size(vllm_runner,
+                             model_name=MODEL_NAME,
+                             input_str=input_str):
+    truncate_prompt_tokens = -1
+
+    with vllm_runner(model_name, task="embed",
+                     max_model_len=max_model_len) as vllm_model:
+        vllm_output = vllm_model.model.encode(
+            input_str, truncate_prompt_tokens=truncate_prompt_tokens)
+
+    prompt_tokens = vllm_output[0].prompt_token_ids
+
+    assert len(prompt_tokens) == max_model_len
+
+
+def test_bigger_truncation_size(vllm_runner,
+                                model_name=MODEL_NAME,
+                                input_str=input_str):
+
+    truncate_prompt_tokens = max_model_len + 1
+
+    with pytest.raises(ValueError), vllm_runner(
+            model_name, task="embed",
+            max_model_len=max_model_len) as vllm_model:
+
+        llm_output = vllm_model.model.encode(
+            input_str, truncate_prompt_tokens=truncate_prompt_tokens)
+
+        assert llm_output == f"""truncate_prompt_tokens value 
+                ({truncate_prompt_tokens}) is greater than 
+                max_model_len ({max_model_len}). Please, select 
+                a smaller truncation size."""