[CI] improve embed testing (#18747)

2025-05-28 15:16:35 +08:00
parent 0c492b7824
commit de65fc8e1e
13 changed files with 248 additions and 178 deletions
--- a/tests/models/language/pooling/mteb_utils.py
+++ b/tests/models/language/pooling/mteb_utils.py
@@ -80,18 +80,19 @@ def run_mteb_embed_task_st(model_name, tasks):
 def mteb_test_embed_models(hf_runner,
                           vllm_runner,
                           model_info: EmbedModelInfo,
-                           vllm_extra_kwargs=None):
+                           vllm_extra_kwargs=None,
+                           hf_model_callback=None):
    if not model_info.enable_test:
        # A model family has many models with the same architecture,
        # and we don't need to test each one.
        pytest.skip("Skipping test.")

    vllm_extra_kwargs = vllm_extra_kwargs or {}
+    vllm_extra_kwargs["dtype"] = model_info.dtype

    with vllm_runner(model_info.name,
                     task="embed",
                     max_model_len=None,
-                     dtype=model_info.dtype,
                     **vllm_extra_kwargs) as vllm_model:

        if model_info.architecture:
@@ -108,10 +109,14 @@ def mteb_test_embed_models(hf_runner,
    with set_default_torch_dtype(model_dtype) and hf_runner(
            model_info.name, is_sentence_transformer=True,
            dtype=model_dtype) as hf_model:
+
+        if hf_model_callback is not None:
+            hf_model_callback(hf_model)
+
        st_main_score = run_mteb_embed_task(hf_model, MTEB_EMBED_TASKS)

    print("VLLM:", vllm_dtype, vllm_main_score)
    print("SentenceTransformer:", model_dtype, st_main_score)
    print("Difference:", st_main_score - vllm_main_score)

-    assert st_main_score == pytest.approx(vllm_main_score, rel=MTEB_EMBED_TOL)
+    assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_EMBED_TOL)