[Frontend] API support for beam search for MQLLMEngine (#9117)
This commit is contained in:
@@ -7,6 +7,7 @@ from typing import (Any, AsyncGenerator, Callable, Coroutine, Dict, Iterable,
|
||||
from weakref import ReferenceType
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.beam_search import BeamSearchSequence, create_sort_beams_key_function
|
||||
from vllm.config import (DecodingConfig, EngineConfig, LoRAConfig, ModelConfig,
|
||||
ParallelConfig, SchedulerConfig)
|
||||
from vllm.core.scheduler import SchedulerOutputs
|
||||
@@ -14,7 +15,6 @@ from vllm.engine.arg_utils import AsyncEngineArgs
|
||||
from vllm.engine.async_timeout import asyncio_timeout
|
||||
from vllm.engine.llm_engine import LLMEngine, SchedulerOutputState
|
||||
from vllm.engine.metrics_types import StatLoggerBase
|
||||
from vllm.entrypoints.llm import BeamSearchSequence
|
||||
from vllm.executor.executor_base import ExecutorAsyncBase
|
||||
from vllm.executor.gpu_executor import GPUExecutorAsync
|
||||
from vllm.executor.ray_utils import initialize_ray_cluster
|
||||
@@ -33,7 +33,7 @@ from vllm.sequence import ExecuteModelRequest
|
||||
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
||||
from vllm.usage.usage_lib import UsageContext
|
||||
from vllm.utils import (collect_from_async_generator, deprecate_kwargs,
|
||||
get_beam_search_score, random_uuid, weak_bind)
|
||||
random_uuid, weak_bind)
|
||||
|
||||
logger = init_logger(__name__)
|
||||
ENGINE_ITERATION_TIMEOUT_S = envs.VLLM_ENGINE_ITERATION_TIMEOUT_S
|
||||
@@ -1052,16 +1052,14 @@ class AsyncLLMEngine:
|
||||
temperature = params.temperature
|
||||
length_penalty = params.length_penalty
|
||||
|
||||
def sort_beams_key(x: BeamSearchSequence) -> float:
|
||||
return get_beam_search_score(x.tokens, x.cum_logprob,
|
||||
tokenizer.eos_token_id,
|
||||
length_penalty)
|
||||
|
||||
tokenizer = await self.get_tokenizer()
|
||||
tokenizedPrompt = prompt if isinstance(
|
||||
prompt, list) else tokenizer.encode(prompt)
|
||||
tokenizedLength = len(tokenizedPrompt)
|
||||
|
||||
sort_beams_key = create_sort_beams_key_function(
|
||||
tokenizer.eos_token_id, length_penalty)
|
||||
|
||||
beam_search_params = SamplingParams(logprobs=2 * beam_width,
|
||||
max_tokens=1,
|
||||
temperature=temperature)
|
||||
|
||||
Reference in New Issue
Block a user