[Frontend] Re-enable running MaxSim on GPU (#38620)
Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
This commit is contained in:
@@ -26,13 +26,18 @@ TEXTS_2 = [
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def server():
|
||||
@pytest.fixture(scope="module", params=[True, False])
|
||||
def server(request):
|
||||
args = [
|
||||
"--max-model-len",
|
||||
str(MAX_MODEL_LEN),
|
||||
]
|
||||
|
||||
# Test running pooling score MaxSim on the worker side (GPU)
|
||||
# aka flash-late-interaction
|
||||
if not request.param:
|
||||
args += ["--no-enable-flash-late-interaction"]
|
||||
|
||||
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
Reference in New Issue
Block a user