[BugFix][Frontend] Use LoRA tokenizer in OpenAI APIs (#6227)
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
@@ -5,6 +5,7 @@ from http import HTTPStatus
 from typing import Any, Dict, List, Optional, Tuple, Union
 
 from pydantic import Field
+from transformers import PreTrainedTokenizer
 from typing_extensions import Annotated
 
 from vllm.config import ModelConfig
@@ -19,7 +20,6 @@ from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sequence import Logprob
-from vllm.transformers_utils.tokenizer import get_tokenizer
 
 logger = init_logger(__name__)
 
@@ -52,14 +52,6 @@ class OpenAIServing:
         self.model_config = model_config
         self.max_model_len = model_config.max_model_len
 
-        # A separate tokenizer to map token IDs to strings.
-        self.tokenizer = get_tokenizer(
-            model_config.tokenizer,
-            tokenizer_mode=model_config.tokenizer_mode,
-            tokenizer_revision=model_config.tokenizer_revision,
-            trust_remote_code=model_config.trust_remote_code,
-            truncation_side="left")
-
         self.served_model_names = served_model_names
 
         self.lora_requests = []
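The shared `self.tokenizer`, built once from the base model config, is removed here: a LoRA adapter can ship its own tokenizer (extra special tokens, a different vocab), so token IDs must be handled by the tokenizer that matches the request's adapter. A minimal sketch of the per-request pattern this commit moves toward, using only the `transformers` API; `get_request_tokenizer` and `_load_tokenizer` are hypothetical helper names for illustration, not vLLM functions:

```python
# Sketch only: resolve a tokenizer per request instead of sharing one
# instance. Both helper names below are hypothetical.
from functools import lru_cache
from typing import Optional

from transformers import AutoTokenizer, PreTrainedTokenizer


@lru_cache(maxsize=8)
def _load_tokenizer(name_or_path: str) -> PreTrainedTokenizer:
    # Cache loaded tokenizers so repeated requests stay cheap.
    return AutoTokenizer.from_pretrained(name_or_path, truncation_side="left")


def get_request_tokenizer(base_model: str,
                          lora_path: Optional[str]) -> PreTrainedTokenizer:
    # Prefer the adapter's own tokenizer files when the request targets a
    # LoRA; otherwise fall back to the base model's tokenizer.
    return _load_tokenizer(lora_path if lora_path is not None else base_model)
```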
@@ -154,7 +146,8 @@ class OpenAIServing:
 
     def _maybe_get_adapter(
         self, request: Union[CompletionRequest, ChatCompletionRequest,
-                             EmbeddingRequest]
+                             EmbeddingRequest, TokenizeRequest,
+                             DetokenizeRequest]
     ) -> Tuple[Optional[str], Optional[Union[LoRARequest,
                                              PromptAdapterRequest]]]:
         if request.model in self.served_model_names:
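`_maybe_get_adapter` now also accepts tokenize/detokenize requests, so those endpoints resolve LoRA adapters the same way completions do. A standalone sketch of the resolution logic with illustrative stand-in types, not vLLM's internals:

```python
# Illustrative stand-ins; the real method matches request.model against
# self.served_model_names and self.lora_requests.
from dataclasses import dataclass
from typing import List, Optional, Tuple


@dataclass
class LoRAInfo:
    lora_name: str
    lora_path: str


def maybe_get_adapter(
    model: str,
    served_model_names: List[str],
    lora_requests: List[LoRAInfo],
) -> Tuple[Optional[str], Optional[LoRAInfo]]:
    if model in served_model_names:
        return None, None  # Base model: no adapter involved.
    for lora in lora_requests:
        if model == lora.lora_name:
            return "lora", lora  # Request targets a served LoRA adapter.
    # Mirrors the ValueError raised in the hunk above.
    raise ValueError(f"The model `{model}` does not exist.")
```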
@@ -168,11 +161,12 @@ class OpenAIServing:
|
||||
# if _check_model has been called earlier, this will be unreachable
|
||||
raise ValueError(f"The model `{request.model}` does not exist.")
|
||||
|
||||
def _validate_prompt_and_tokenize(
|
||||
async def _validate_prompt_and_tokenize(
|
||||
self,
|
||||
request: Union[ChatCompletionRequest, CompletionRequest,
|
||||
DetokenizeRequest, EmbeddingRequest,
|
||||
TokenizeRequest],
|
||||
tokenizer: "PreTrainedTokenizer",
|
||||
prompt: Optional[str] = None,
|
||||
prompt_ids: Optional[List[int]] = None,
|
||||
truncate_prompt_tokens: Optional[Annotated[int,
|
||||
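The method becomes a coroutine and takes the tokenizer as an explicit parameter instead of reading `self.tokenizer`. A toy sketch of that shape under assumed names (`TinyServing` is not a vLLM class):

```python
import asyncio
from typing import List

from transformers import AutoTokenizer, PreTrainedTokenizer


class TinyServing:
    # Toy stand-in: the helper receives the tokenizer per call, so a
    # request bound to a LoRA adapter can pass the adapter's tokenizer.
    async def validate_and_tokenize(self, tokenizer: PreTrainedTokenizer,
                                    prompt: str) -> List[int]:
        return tokenizer(prompt).input_ids


tok = AutoTokenizer.from_pretrained("gpt2")
print(asyncio.run(TinyServing().validate_and_tokenize(tok, "Hello world")))
```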
@@ -181,7 +175,7 @@ class OpenAIServing:
     ) -> Tuple[List[int], str]:
         if not (prompt or prompt_ids):
             raise ValueError("Either prompt or prompt_ids should be provided.")
-        if (prompt and prompt_ids):
+        if prompt and prompt_ids:
             raise ValueError(
                 "Only one of prompt or prompt_ids should be provided.")
 
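Dropping the parentheses is purely stylistic; together the two checks enforce that exactly one of `prompt` / `prompt_ids` is supplied. A quick standalone illustration of the same logic:

```python
def check_inputs(prompt=None, prompt_ids=None):
    if not (prompt or prompt_ids):
        raise ValueError("Either prompt or prompt_ids should be provided.")
    if prompt and prompt_ids:
        raise ValueError(
            "Only one of prompt or prompt_ids should be provided.")


check_inputs(prompt="hello")        # ok
check_inputs(prompt_ids=[1, 2, 3])  # ok
# check_inputs() and check_inputs(prompt="hi", prompt_ids=[1]) both raise.
```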
@@ -200,14 +194,14 @@ class OpenAIServing:
                 "truncation": True,
                 "max_length": truncate_prompt_tokens,
             })
-            input_ids = self.tokenizer(prompt, **tokenizer_kwargs).input_ids
+            input_ids = tokenizer(prompt, **tokenizer_kwargs).input_ids
         elif truncate_prompt_tokens is not None:
             input_ids = prompt_ids[-truncate_prompt_tokens:]
         else:
             input_ids = prompt_ids
 
-        input_text = prompt if prompt is not None else self.tokenizer.decode(
-            prompt_ids)
+        input_text = prompt if prompt is not None else tokenizer.decode(
+            input_ids)
         token_num = len(input_ids)
 
         # Note: EmbeddingRequest doesn't have max_tokens
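Two fixes land in this hunk: tokenization and decoding now go through the passed-in `tokenizer`, and the decode fallback uses `input_ids` (post-truncation) rather than `prompt_ids`, so the returned text matches the tokens the model will actually see. A toy demonstration of why that distinction matters:

```python
# Stand-in vocabulary; any real tokenizer behaves analogously.
vocab = {1: "the", 2: "quick", 3: "brown", 4: "fox"}


def decode(ids):
    return " ".join(vocab[i] for i in ids)


prompt_ids = [1, 2, 3, 4]
truncate_prompt_tokens = 2
# truncation_side="left": keep only the last N tokens.
input_ids = prompt_ids[-truncate_prompt_tokens:]

print(decode(input_ids))   # "brown fox"            -- what the model sees
print(decode(prompt_ids))  # "the quick brown fox"  -- the pre-fix mismatch
```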
@@ -245,7 +239,9 @@ class OpenAIServing:
         else:
             return input_ids, input_text
 
-    def _get_decoded_token(self, logprob: Logprob, token_id: int) -> str:
+    @staticmethod
+    def _get_decoded_token(logprob: Logprob, token_id: int,
+                           tokenizer: PreTrainedTokenizer) -> str:
         if logprob.decoded_token is not None:
             return logprob.decoded_token
-        return self.tokenizer.decode(token_id)
+        return tokenizer.decode(token_id)
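`_get_decoded_token` no longer touches instance state, so it becomes a `@staticmethod` with the tokenizer passed in. A self-contained sketch of the new call shape; the `Logprob` dataclass here is an illustrative stand-in for `vllm.sequence.Logprob`:

```python
from dataclasses import dataclass
from typing import Optional

from transformers import AutoTokenizer, PreTrainedTokenizer


@dataclass
class Logprob:
    logprob: float
    decoded_token: Optional[str] = None


def get_decoded_token(logprob: Logprob, token_id: int,
                      tokenizer: PreTrainedTokenizer) -> str:
    if logprob.decoded_token is not None:
        return logprob.decoded_token  # Reuse text decoded upstream.
    return tokenizer.decode(token_id)  # Fall back to the request's tokenizer.


tok = AutoTokenizer.from_pretrained("gpt2")
print(get_decoded_token(Logprob(-0.1), 15496, tok))  # decodes token 15496
```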