[Frontend] Re-enable running MaxSim on GPU (#38620)

Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
2026-04-03 00:03:13 +08:00
parent d9408ffba3
commit a9b4f07ba2
12 changed files with 207 additions and 54 deletions
--- a/tests/entrypoints/pooling/scoring/test_late_interaction_online.py
+++ b/tests/entrypoints/pooling/scoring/test_late_interaction_online.py
@@ -26,13 +26,18 @@ TEXTS_2 = [
 ]


-@pytest.fixture(scope="module")
-def server():
+@pytest.fixture(scope="module", params=[True, False])
+def server(request):
    args = [
        "--max-model-len",
        str(MAX_MODEL_LEN),
    ]

+    # Run the tests with pooling-score MaxSim on the worker side (GPU),
+    # a.k.a. flash-late-interaction, and again with it disabled.
+    if not request.param:
+        args += ["--no-enable-flash-late-interaction"]
+
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server