Support embedding models in V1 (#16188)

Signed-off-by: Max de Bayser <mbayser@br.ibm.com> Signed-off-by: Max de Bayser <maxdebayser@gmail.com> Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com> Co-authored-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-06-19 01:36:33 -03:00
parent 4959915089
commit 799397ee4f
56 changed files with 889 additions and 281 deletions
--- a/tests/models/language/pooling/test_classification.py
+++ b/tests/models/language/pooling/test_classification.py
@@ -6,6 +6,14 @@ from transformers import AutoModelForSequenceClassification

 from vllm.platforms import current_platform

+# TODO: enable when float32 is supported by V1
+# @pytest.fixture(autouse=True)
+# def v1(run_with_both_engines):
+#     # Simple autouse wrapper to run both engines for each test
+#     # This can be promoted up to conftest.py to run for every
+#     # test in a package
+#     pass
+

@pytest.mark.parametrize(
    "model",
@@ -29,7 +37,7 @@ def test_models(
        # switch to use ROCm CK FA backend
        monkeypatch.setenv("VLLM_USE_TRITON_FLASH_ATTN", "False")

-    with vllm_runner(model, dtype=dtype) as vllm_model:
+    with vllm_runner(model, max_model_len=512, dtype=dtype) as vllm_model:
        vllm_outputs = vllm_model.classify(example_prompts)

    with hf_runner(model,