[Frontend] Separate pooling APIs in offline inference (#11129)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung
2024-12-13 18:40:07 +08:00
committed by GitHub
parent f93bf2b189
commit eeec9e3390
21 changed files with 669 additions and 304 deletions

View File

@@ -42,7 +42,7 @@ def test_llm_1_to_1(vllm_runner, hf_runner, model_name, dtype: str):
assert len(vllm_outputs) == 1
assert len(hf_outputs) == 1
assert math.isclose(hf_outputs[0], vllm_outputs[0][0], rel_tol=0.01)
assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01)
@pytest.mark.parametrize("dtype", ["half"])
@@ -63,8 +63,8 @@ def test_llm_1_to_N(vllm_runner, hf_runner, model_name, dtype: str):
assert len(vllm_outputs) == 2
assert len(hf_outputs) == 2
assert math.isclose(hf_outputs[0], vllm_outputs[0][0], rel_tol=0.01)
assert math.isclose(hf_outputs[1], vllm_outputs[1][0], rel_tol=0.01)
assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01)
assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01)
@pytest.mark.parametrize("dtype", ["half"])
@@ -85,5 +85,5 @@ def test_llm_N_to_N(vllm_runner, hf_runner, model_name, dtype: str):
assert len(vllm_outputs) == 2
assert len(hf_outputs) == 2
assert math.isclose(hf_outputs[0], vllm_outputs[0][0], rel_tol=0.01)
assert math.isclose(hf_outputs[1], vllm_outputs[1][0], rel_tol=0.01)
assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01)
assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01)