[Frontend] API support for beam search for MQLLMEngine (#9117)

This commit is contained in:
Brendan Wong
2024-10-07 22:51:43 -07:00
committed by GitHub
parent e1faa2a598
commit 8c746226c9
8 changed files with 215 additions and 106 deletions

View File

@@ -9,6 +9,7 @@ from fastapi import Request
from vllm.config import ModelConfig
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.engine.multiprocessing.client import MQLLMEngineClient
from vllm.engine.protocol import EngineClient
from vllm.entrypoints.logger import RequestLogger
# yapf conflicts with isort for this block
@@ -150,15 +151,16 @@ class OpenAIServingCompletion(OpenAIServing):
log_tracing_disabled_warning()
if isinstance(sampling_params, BeamSearchParams):
if not isinstance(self.engine_client, AsyncLLMEngine):
raise ValueError(
"Beam search in the API server is only supported"
" with AsyncLLMEngine. please add "
"`--disable-frontend-multiprocessing` to "
"use beam search.")
assert isinstance(self.engine_client,
(AsyncLLMEngine,
MQLLMEngineClient)), \
"Beam search is only supported with" \
"AsyncLLMEngine and MQLLMEngineClient."
generator = self.engine_client.beam_search(
prompt_inputs["prompt_token_ids"], request_id_item,
sampling_params)
prompt_inputs["prompt_token_ids"],
request_id_item,
sampling_params,
)
else:
generator = self.engine_client.generate(
{