[Frontend] Re-enable running MaxSim on GPU (#38620)

Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
This commit is contained in:
wang.yuqi
2026-04-03 00:03:13 +08:00
committed by GitHub
parent d9408ffba3
commit a9b4f07ba2
12 changed files with 207 additions and 54 deletions

View File

@@ -26,13 +26,18 @@ TEXTS_2 = [
]
@pytest.fixture(scope="module", params=[True, False])
def server(request):
    """Start a remote server once per flash-late-interaction setting.

    The fixture is parametrized over ``[True, False]``, so every test that
    depends on it runs twice: once with only the base arguments
    (``request.param`` is ``True``) and once with
    ``--no-enable-flash-late-interaction`` appended (``request.param`` is
    ``False``).

    Args:
        request: pytest's fixture request; ``request.param`` carries the
            current parameter value.

    Yields:
        The ``RemoteOpenAIServer`` instance, kept alive inside its ``with``
        block until pytest tears the fixture down.
    """
    enable_flash_late_interaction = request.param

    args = [
        "--max-model-len",
        str(MAX_MODEL_LEN),
    ]

    # flash-late-interaction runs the pooling score MaxSim on the worker
    # side (GPU). No flag is passed for the True case, so that run relies on
    # the server default (presumably enabled -- TODO confirm); the False case
    # opts out explicitly so both code paths are exercised.
    if not enable_flash_late_interaction:
        args += ["--no-enable-flash-late-interaction"]

    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server