[Frontend] Add tokenize/detokenize endpoints (#5054)

sasha0552
2024-06-26 16:54:22 +00:00
committed by GitHub
parent 5bfd1bbc98
commit c54269d967
5 changed files with 143 additions and 6 deletions
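
For context, this is roughly how a client exercises the two new endpoints once the server is running. The paths (/tokenize, /detokenize), the port, and the response field names below are assumptions inferred from the request class names in this diff, not taken from the commit itself:

    # Minimal client sketch (assumed: server at localhost:8000, endpoint
    # paths /tokenize and /detokenize, field names matching the new
    # TokenizeRequest/DetokenizeRequest models -- none shown in this diff).
    import requests

    BASE_URL = "http://localhost:8000"

    # Tokenize a prompt into token IDs.
    resp = requests.post(f"{BASE_URL}/tokenize",
                         json={"model": "my-model", "prompt": "Hello, world!"})
    token_ids = resp.json()["tokens"]

    # Detokenize the IDs back into text.
    resp = requests.post(f"{BASE_URL}/detokenize",
                         json={"model": "my-model", "tokens": token_ids})
    print(resp.json()["prompt"])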

vllm/entrypoints/openai/serving_engine.py

@@ -10,9 +10,10 @@ from vllm.config import ModelConfig
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                               CompletionRequest,
+                                              DetokenizeRequest,
                                               EmbeddingRequest, ErrorResponse,
                                               ModelCard, ModelList,
-                                              ModelPermission)
+                                              ModelPermission, TokenizeRequest)
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.sequence import Logprob
@@ -99,8 +100,9 @@ class OpenAIServing:
         return json_str
 
     async def _check_model(
-        self, request: Union[CompletionRequest, ChatCompletionRequest,
-                             EmbeddingRequest]
+        self, request: Union[ChatCompletionRequest, CompletionRequest,
+                             DetokenizeRequest, EmbeddingRequest,
+                             TokenizeRequest]
     ) -> Optional[ErrorResponse]:
         if request.model in self.served_model_names:
             return None
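
The TokenizeRequest and DetokenizeRequest models themselves are defined in vllm/entrypoints/openai/protocol.py, one of the other files changed by this commit but not shown here. A minimal sketch of plausible shapes, with all field names and types assumed rather than taken from the diff:

    # Sketch only; the actual definitions live in the protocol.py
    # part of this commit and may differ in fields and base class.
    from typing import List
    from pydantic import BaseModel

    class TokenizeRequest(BaseModel):   # assumed shape
        model: str
        prompt: str

    class DetokenizeRequest(BaseModel):  # assumed shape
        model: str
        tokens: List[int]

Both carry a model field, which is why _check_model can validate them against served_model_names exactly like the existing request types.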
@@ -126,7 +128,8 @@ class OpenAIServing:
     def _validate_prompt_and_tokenize(
         self,
         request: Union[ChatCompletionRequest, CompletionRequest,
-                       EmbeddingRequest],
+                       DetokenizeRequest, EmbeddingRequest,
+                       TokenizeRequest],
         prompt: Optional[str] = None,
         prompt_ids: Optional[List[int]] = None,
         truncate_prompt_tokens: Optional[Annotated[int,
@@ -174,6 +177,11 @@ class OpenAIServing:
                     f"generation. Please reduce the length of the input.", )
             return input_ids, input_text
 
+        # Note: TokenizeRequest and DetokenizeRequest don't have max_tokens
+        # and don't require model context length validation.
+        if isinstance(request, (TokenizeRequest, DetokenizeRequest)):
+            return input_ids, input_text
+
         if request.max_tokens is None:
             if token_num >= self.max_model_len:
                 raise ValueError(
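
The early return is needed for correctness, not just to skip redundant work: per the added comment, neither new request type defines a max_tokens field, so letting control reach the request.max_tokens access below it would raise AttributeError. A quick sketch of that property, using the assumed model shapes from the sketch above:

    # Assumed model shape; see the protocol.py sketch above.
    req = TokenizeRequest(model="my-model", prompt="hi")
    assert not hasattr(req, "max_tokens")  # no generation budget to validate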