[Perf] Optimize detokenizer python logic (#32975)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
This commit is contained in:
@@ -35,6 +35,9 @@ class IncrementalDetokenizer:
|
|||||||
def output_token_ids(self) -> list[int]:
|
def output_token_ids(self) -> list[int]:
|
||||||
return self.token_ids
|
return self.token_ids
|
||||||
|
|
||||||
|
def num_output_tokens(self) -> int:
|
||||||
|
return len(self.token_ids)
|
||||||
|
|
||||||
def update(self, new_token_ids: list[int], stop_terminated: bool) -> str | None:
|
def update(self, new_token_ids: list[int], stop_terminated: bool) -> str | None:
|
||||||
self.token_ids.extend(new_token_ids)
|
self.token_ids.extend(new_token_ids)
|
||||||
return None
|
return None
|
||||||
@@ -112,14 +115,12 @@ class BaseIncrementalDetokenizer(IncrementalDetokenizer, ABC):
|
|||||||
skipped_stop_token_id = None
|
skipped_stop_token_id = None
|
||||||
|
|
||||||
# 1) Detokenize the new token ids incrementally.
|
# 1) Detokenize the new token ids incrementally.
|
||||||
# TODO(woosuk): This method becomes very inefficient when the number of
|
|
||||||
# new_token_ids is more than 1. We need to optimize this.
|
|
||||||
stop_check_offset = len(self.output_text)
|
stop_check_offset = len(self.output_text)
|
||||||
for new_token_id in new_token_ids:
|
for new_token_id in new_token_ids:
|
||||||
self.token_ids.append(new_token_id)
|
self.token_ids.append(new_token_id)
|
||||||
self.output_text += self.decode_next(new_token_id)
|
self.output_text += self.decode_next(new_token_id)
|
||||||
# Support min_tokens, see https://github.com/vllm-project/vllm/pull/22014
|
# Support min_tokens, see https://github.com/vllm-project/vllm/pull/22014
|
||||||
if self.min_tokens and len(self.output_token_ids) <= self.min_tokens:
|
if self.min_tokens and self.num_output_tokens() <= self.min_tokens:
|
||||||
stop_check_offset = len(self.output_text)
|
stop_check_offset = len(self.output_text)
|
||||||
|
|
||||||
if skipped_stop_token_id is not None:
|
if skipped_stop_token_id is not None:
|
||||||
@@ -128,7 +129,7 @@ class BaseIncrementalDetokenizer(IncrementalDetokenizer, ABC):
|
|||||||
|
|
||||||
# 2) Evaluate stop strings.
|
# 2) Evaluate stop strings.
|
||||||
stop_string = None
|
stop_string = None
|
||||||
if self.stop and len(self.output_token_ids) > self.min_tokens:
|
if self.stop and self.num_output_tokens() > self.min_tokens:
|
||||||
stop = check_stop_strings(
|
stop = check_stop_strings(
|
||||||
output_text=self.output_text,
|
output_text=self.output_text,
|
||||||
new_char_count=len(self.output_text) - stop_check_offset,
|
new_char_count=len(self.output_text) - stop_check_offset,
|
||||||
@@ -295,6 +296,9 @@ class SlowIncrementalDetokenizer(BaseIncrementalDetokenizer):
|
|||||||
else (self.token_ids[self.prompt_len :])
|
else (self.token_ids[self.prompt_len :])
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def num_output_tokens(self) -> int:
|
||||||
|
return len(self.token_ids) - self.prompt_len
|
||||||
|
|
||||||
def decode_next(self, next_token_id: int) -> str:
|
def decode_next(self, next_token_id: int) -> str:
|
||||||
new_tokens, decoded_text, prefix_offset, read_offset = detokenize_incrementally(
|
new_tokens, decoded_text, prefix_offset, read_offset = detokenize_incrementally(
|
||||||
tokenizer=self.tokenizer,
|
tokenizer=self.tokenizer,
|
||||||
|
|||||||
@@ -292,7 +292,7 @@ class RequestState:
|
|||||||
if not (
|
if not (
|
||||||
finished
|
finished
|
||||||
or self.sent_tokens_offset == 0
|
or self.sent_tokens_offset == 0
|
||||||
or len(self.detokenizer.output_token_ids) - self.sent_tokens_offset
|
or self.detokenizer.num_output_tokens() - self.sent_tokens_offset
|
||||||
>= self.stream_interval
|
>= self.stream_interval
|
||||||
):
|
):
|
||||||
return None
|
return None
|
||||||
@@ -303,7 +303,7 @@ class RequestState:
|
|||||||
new_token_ids = self.detokenizer.output_token_ids[
|
new_token_ids = self.detokenizer.output_token_ids[
|
||||||
self.sent_tokens_offset :
|
self.sent_tokens_offset :
|
||||||
]
|
]
|
||||||
self.sent_tokens_offset = len(self.detokenizer.output_token_ids)
|
self.sent_tokens_offset = self.detokenizer.num_output_tokens()
|
||||||
|
|
||||||
external_req_id = self.external_req_id
|
external_req_id = self.external_req_id
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user