[BugFix][Frontend] Use LoRA tokenizer in OpenAI APIs (#6227)

Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Author: Nick Hill
Date:   2024-07-18 00:13:30 -07:00
Committer: GitHub
Parent: 8a74c68bd1
Commit: e2fbaee725
16 changed files with 267 additions and 186 deletions

vllm/entrypoints/openai/serving_engine.py

@@ -5,6 +5,7 @@ from http import HTTPStatus
 from typing import Any, Dict, List, Optional, Tuple, Union
 
 from pydantic import Field
+from transformers import PreTrainedTokenizer
 from typing_extensions import Annotated
 
 from vllm.config import ModelConfig
@@ -19,7 +20,6 @@ from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sequence import Logprob
-from vllm.transformers_utils.tokenizer import get_tokenizer
 
 logger = init_logger(__name__)
@@ -52,14 +52,6 @@ class OpenAIServing:
         self.model_config = model_config
         self.max_model_len = model_config.max_model_len
-
-        # A separate tokenizer to map token IDs to strings.
-        self.tokenizer = get_tokenizer(
-            model_config.tokenizer,
-            tokenizer_mode=model_config.tokenizer_mode,
-            tokenizer_revision=model_config.tokenizer_revision,
-            trust_remote_code=model_config.trust_remote_code,
-            truncation_side="left")
 
         self.served_model_names = served_model_names
 
         self.lora_requests = []
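
With the serving-wide `self.tokenizer` removed above, each endpoint must fetch the tokenizer that matches the request's adapter at call time. A minimal sketch of that call-site pattern, assuming the async engine exposes a `get_tokenizer(lora_request)` accessor as implied by this commit's other call sites; the method name below is hypothetical:

```python
# Sketch only: per-request tokenizer lookup replacing the removed
# shared self.tokenizer attribute. `engine.get_tokenizer(lora_request)`
# is an assumption based on this commit's call sites.
from vllm.lora.request import LoRARequest


async def _tokenize_with_lora(self, request):
    _, adapter = self._maybe_get_adapter(request)
    # Only LoRA adapters carry their own tokenizer; prompt adapters
    # fall back to the base model's tokenizer.
    lora_request = adapter if isinstance(adapter, LoRARequest) else None
    tokenizer = await self.engine.get_tokenizer(lora_request)
    return await self._validate_prompt_and_tokenize(
        request, tokenizer, prompt=request.prompt)
```
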
@@ -154,7 +146,8 @@ class OpenAIServing:
def _maybe_get_adapter(
self, request: Union[CompletionRequest, ChatCompletionRequest,
EmbeddingRequest]
EmbeddingRequest, TokenizeRequest,
DetokenizeRequest]
) -> Tuple[Optional[str], Optional[Union[LoRARequest,
PromptAdapterRequest]]]:
if request.model in self.served_model_names:
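
For context, `_maybe_get_adapter` resolves the request's `model` field either to the base model (no adapter) or to a registered adapter, which is what later lets the matching LoRA tokenizer be retrieved. A simplified, self-contained restatement of that lookup; the loop over the registered LoRA requests and the exact contents of the returned name string are assumptions based on the surrounding diff:

```python
# Simplified sketch of adapter resolution: the request "model" field
# can name the served base model or a registered LoRA adapter.
from typing import List, Optional, Tuple

from vllm.lora.request import LoRARequest


def resolve_adapter(
        served_model_names: List[str], lora_requests: List[LoRARequest],
        model_name: str) -> Tuple[Optional[str], Optional[LoRARequest]]:
    if model_name in served_model_names:
        return None, None  # base model: no adapter attached
    for lora in lora_requests:
        if model_name == lora.lora_name:
            return lora.lora_name, lora
    # If _check_model ran earlier, this is unreachable (see diff above).
    raise ValueError(f"The model `{model_name}` does not exist.")
```
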
@@ -168,11 +161,12 @@ class OpenAIServing:
         # if _check_model has been called earlier, this will be unreachable
         raise ValueError(f"The model `{request.model}` does not exist.")
 
-    def _validate_prompt_and_tokenize(
+    async def _validate_prompt_and_tokenize(
         self,
         request: Union[ChatCompletionRequest, CompletionRequest,
                        DetokenizeRequest, EmbeddingRequest,
                        TokenizeRequest],
+        tokenizer: "PreTrainedTokenizer",
         prompt: Optional[str] = None,
         prompt_ids: Optional[List[int]] = None,
         truncate_prompt_tokens: Optional[Annotated[int,
@@ -181,7 +175,7 @@ class OpenAIServing:
     ) -> Tuple[List[int], str]:
         if not (prompt or prompt_ids):
             raise ValueError("Either prompt or prompt_ids should be provided.")
-        if (prompt and prompt_ids):
+        if prompt and prompt_ids:
             raise ValueError(
                 "Only one of prompt or prompt_ids should be provided.")
 
@@ -200,14 +194,14 @@ class OpenAIServing:
                 "truncation": True,
                 "max_length": truncate_prompt_tokens,
             })
-            input_ids = self.tokenizer(prompt, **tokenizer_kwargs).input_ids
+            input_ids = tokenizer(prompt, **tokenizer_kwargs).input_ids
         elif truncate_prompt_tokens is not None:
             input_ids = prompt_ids[-truncate_prompt_tokens:]
         else:
             input_ids = prompt_ids
 
-        input_text = prompt if prompt is not None else self.tokenizer.decode(
-            prompt_ids)
+        input_text = prompt if prompt is not None else tokenizer.decode(
+            input_ids)
 
         token_num = len(input_ids)
 
         # Note: EmbeddingRequest doesn't have max_tokens
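
The substantive fix in this hunk is that the echoed `input_text` is now decoded from the truncated `input_ids` with the per-request `tokenizer`, rather than from the full `prompt_ids` with the shared one. A toy, pure-Python illustration of the truncation semantics (the token id values are made up):

```python
# truncate_prompt_tokens keeps only the *last* N tokens of prompt_ids,
# so any text echoed back must be decoded from the truncated ids.
prompt_ids = [101, 7592, 2088, 2003, 2307, 102]  # hypothetical ids
truncate_prompt_tokens = 3

input_ids = prompt_ids[-truncate_prompt_tokens:]
assert input_ids == [2003, 2307, 102]
# Decoding prompt_ids here (the old behavior) would report text for
# tokens that were truncated away before reaching the engine.
```
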
@@ -245,7 +239,9 @@ class OpenAIServing:
         else:
             return input_ids, input_text
 
-    def _get_decoded_token(self, logprob: Logprob, token_id: int) -> str:
+    @staticmethod
+    def _get_decoded_token(logprob: Logprob, token_id: int,
+                           tokenizer: PreTrainedTokenizer) -> str:
         if logprob.decoded_token is not None:
             return logprob.decoded_token
-        return self.tokenizer.decode(token_id)
+        return tokenizer.decode(token_id)
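
Since `_get_decoded_token` no longer reads instance state, it becomes a `@staticmethod` that takes the tokenizer explicitly, so per-endpoint callers can pass whichever (possibly LoRA-specific) tokenizer the request resolved to. A hedged usage sketch; `tokenizer` is assumed to be a `PreTrainedTokenizer` already fetched for the current request, and the `Logprob` construction reflects its dataclass fields at this point in the codebase:

```python
# Sketch of the new call shape after this change.
from vllm.sequence import Logprob

logprob = Logprob(logprob=-0.25)  # decoded_token defaults to None
# Falls back to tokenizer.decode(token_id) since no cached text exists.
text = OpenAIServing._get_decoded_token(logprob, token_id=42,
                                        tokenizer=tokenizer)
```
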