[Frontend] Separate pooling APIs in offline inference (#11129)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-12-13 18:40:07 +08:00
parent f93bf2b189
commit eeec9e3390
21 changed files with 669 additions and 304 deletions
--- a/tests/models/embedding/language/test_scoring.py
+++ b/tests/models/embedding/language/test_scoring.py
@@ -42,7 +42,7 @@ def test_llm_1_to_1(vllm_runner, hf_runner, model_name, dtype: str):
    assert len(vllm_outputs) == 1
    assert len(hf_outputs) == 1

-    assert math.isclose(hf_outputs[0], vllm_outputs[0][0], rel_tol=0.01)
+    assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01)


@pytest.mark.parametrize("dtype", ["half"])
@@ -63,8 +63,8 @@ def test_llm_1_to_N(vllm_runner, hf_runner, model_name, dtype: str):
    assert len(vllm_outputs) == 2
    assert len(hf_outputs) == 2

-    assert math.isclose(hf_outputs[0], vllm_outputs[0][0], rel_tol=0.01)
-    assert math.isclose(hf_outputs[1], vllm_outputs[1][0], rel_tol=0.01)
+    assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01)
+    assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01)


@pytest.mark.parametrize("dtype", ["half"])
@@ -85,5 +85,5 @@ def test_llm_N_to_N(vllm_runner, hf_runner, model_name, dtype: str):
    assert len(vllm_outputs) == 2
    assert len(hf_outputs) == 2

-    assert math.isclose(hf_outputs[0], vllm_outputs[0][0], rel_tol=0.01)
-    assert math.isclose(hf_outputs[1], vllm_outputs[1][0], rel_tol=0.01)
+    assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01)
+    assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01)