[Frontend] Add tokenize/detokenize endpoints (#5054)
@@ -10,9 +10,10 @@ from vllm.config import ModelConfig
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                               CompletionRequest,
+                                              DetokenizeRequest,
                                               EmbeddingRequest, ErrorResponse,
                                               ModelCard, ModelList,
-                                              ModelPermission)
+                                              ModelPermission, TokenizeRequest)
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.sequence import Logprob
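The two names added to this import, TokenizeRequest and DetokenizeRequest, are defined in vllm/entrypoints/openai/protocol.py, which is not part of the hunks shown here. For orientation only, a minimal sketch of what such request/response models could look like, assuming plain pydantic models and illustrative field names (model, prompt, tokens); the actual definitions in protocol.py may differ:

# Sketch of the tokenize/detokenize request and response models.
# Field names and base classes are illustrative assumptions, not the
# authoritative definitions from vllm/entrypoints/openai/protocol.py.
from typing import List, Optional

from pydantic import BaseModel


class TokenizeRequest(BaseModel):
    model: Optional[str] = None
    prompt: str


class TokenizeResponse(BaseModel):
    tokens: List[int]
    count: int
    max_model_len: int


class DetokenizeRequest(BaseModel):
    model: Optional[str] = None
    tokens: List[int]


class DetokenizeResponse(BaseModel):
    prompt: str

Adding these two request types to the Union annotations below lets the shared _check_model and _validate_prompt_and_tokenize helpers accept tokenize/detokenize requests alongside the existing completion, chat, and embedding requests.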
@@ -99,8 +100,9 @@ class OpenAIServing:
         return json_str
 
     async def _check_model(
-        self, request: Union[CompletionRequest, ChatCompletionRequest,
-                             EmbeddingRequest]
+        self, request: Union[ChatCompletionRequest, CompletionRequest,
+                             DetokenizeRequest, EmbeddingRequest,
+                             TokenizeRequest]
     ) -> Optional[ErrorResponse]:
         if request.model in self.served_model_names:
             return None
@@ -126,7 +128,8 @@ class OpenAIServing:
     def _validate_prompt_and_tokenize(
         self,
         request: Union[ChatCompletionRequest, CompletionRequest,
-                       EmbeddingRequest],
+                       DetokenizeRequest, EmbeddingRequest,
+                       TokenizeRequest],
         prompt: Optional[str] = None,
         prompt_ids: Optional[List[int]] = None,
         truncate_prompt_tokens: Optional[Annotated[int,
@@ -174,6 +177,11 @@ class OpenAIServing:
                     f"generation. Please reduce the length of the input.", )
             return input_ids, input_text
 
+        # Note: TokenizeRequest and DetokenizeRequest doesn't have max_tokens
+        # and does not require model context length validation
+        if isinstance(request, (TokenizeRequest, DetokenizeRequest)):
+            return input_ids, input_text
+
         if request.max_tokens is None:
             if token_num >= self.max_model_len:
                 raise ValueError(
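The early return added above means tokenize/detokenize requests reuse the shared prompt validation but skip the max_tokens and context-length checks, since they only convert between text and token ids. A minimal client-side sketch of how the new endpoints might be exercised, assuming POST routes /tokenize and /detokenize and the illustrative field names from the sketch above (paths and payload shapes are assumptions, not taken from this diff):

# Hypothetical client calls against a running vLLM OpenAI-compatible server.
# Endpoint paths and JSON field names are assumptions for illustration.
import requests

BASE_URL = "http://localhost:8000"

# Tokenize: text in, token ids out.
tok = requests.post(
    f"{BASE_URL}/tokenize",
    json={"model": "my-model", "prompt": "Hello, world!"},
)
tok.raise_for_status()
token_ids = tok.json()["tokens"]
print("token ids:", token_ids)

# Detokenize: token ids in, text back out.
detok = requests.post(
    f"{BASE_URL}/detokenize",
    json={"model": "my-model", "tokens": token_ids},
)
detok.raise_for_status()
print("round-tripped prompt:", detok.json()["prompt"])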