[Bugfix] vLLM produces invalid UTF-8 tokens and “�” (#28874)

Signed-off-by: John Calderon <jcalderon@nvidia.com>
Co-authored-by: Benjamin Chislett <bchislett@nvidia.com>
This commit is contained in:
John Calderon
2026-01-05 19:23:00 -05:00
committed by GitHub
parent 3c98c2d21b
commit 2f4e6548ef
3 changed files with 539 additions and 33 deletions

View File

@@ -2,6 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import itertools
from collections.abc import Iterable
from dataclasses import dataclass
from vllm.logger import init_logger
@@ -88,11 +89,16 @@ class LogprobsProcessor:
logprobs = logprobs_np.tolist()
token_ids = token_ids_np.tolist()
# Detokenize (non-incrementally).
decoded_tokens = (
NONES
if self.tokenizer is None
else (convert_ids_list_to_tokens(self.tokenizer, token_ids))
)
decoded_tokens: list[str] | Iterable[None]
if self.tokenizer is None:
decoded_tokens = NONES
else:
decoded_tokens_list = convert_ids_list_to_tokens(
self.tokenizer, token_ids
)
decoded_tokens = self._verify_tokens(
decoded_tokens_list=decoded_tokens_list, tokens=token_ids
)
# Sampler puts the sampled logprob in first.
sampled_token_logprob = logprobs[0]
@@ -126,37 +132,45 @@ class LogprobsProcessor:
token_ids, logprobs, ranks = prompt_logprobs_tensors
# Detokenize non-incrementally.
# Output is flat: [num_tok, num_lps] -> [num_tok * num_lps]
decoded_tokens = (
None
if self.tokenizer is None
else (
convert_ids_list_to_tokens(self.tokenizer, token_ids.flatten().tolist())
)
)
# Recover shapes.
num_prompt_tokens, num_logprobs = logprobs.shape
# Detokenize non-incrementally.
# Output is flat: [num_tok, num_lps] -> [num_tok * num_lps]
all_decoded_tokens: list[str] | None = (
None
if self.tokenizer is None
else convert_ids_list_to_tokens(
self.tokenizer, token_ids.flatten().tolist()
)
)
# Pythonize the torch tensors.
prompt_token_ranks = ranks.tolist()
prompt_logprobs = logprobs.tolist()
token_ids = token_ids.tolist()
token_ids_list = token_ids.tolist()
# Make Logprob for each position.
for pos in range(num_prompt_tokens):
# Handle flattening.
# Handle flattening and UTF-8 correction per position
offset = pos * num_logprobs
offset_end = offset + num_logprobs
decoded_tokens_for_pos = (
NONES if decoded_tokens is None else decoded_tokens[offset:offset_end]
)
decoded_tokens_for_pos: list[str] | Iterable[None]
if all_decoded_tokens is None:
decoded_tokens_for_pos = NONES
else:
# Extract decoded tokens for this position
decoded_tokens_slice = all_decoded_tokens[offset:offset_end]
# Apply UTF-8 correction within this position's token boundaries
decoded_tokens_for_pos = self._verify_tokens(
decoded_tokens_list=decoded_tokens_slice, tokens=token_ids_list[pos]
)
# Update with the Logprob container for this pos.
append_logprobs_for_next_position(
self.prompt_logprobs,
token_ids[pos],
token_ids_list[pos],
prompt_logprobs[pos],
decoded_tokens_for_pos,
prompt_token_ranks[pos],
@@ -182,6 +196,48 @@ class LogprobsProcessor:
self.prompt_logprobs = []
return plp
def _correct_decoded_token(self, idx: int, tokens: list[int]) -> str:
assert self.tokenizer is not None, "self.tokenizer should not be None"
# try with prev token id in same list
if idx > 0:
possible_decoded_token = self.tokenizer.decode(tokens[idx - 1 : idx + 1])
if not possible_decoded_token.endswith("<EFBFBD>"):
return possible_decoded_token
# try with previous logprob token id
if self.logprobs:
latest_token_id = next(iter(self.logprobs[-1]))
decode_ids = [latest_token_id]
if idx > 0:
decode_ids.extend(tokens[idx - 1 : idx + 1])
else:
decode_ids.extend(tokens[idx : idx + 1])
possible_decoded_token = self.tokenizer.decode(decode_ids)
if not possible_decoded_token.endswith("<EFBFBD>"):
return possible_decoded_token
# by default return empty string
return ""
def _verify_tokens(
self, decoded_tokens_list: list[str], tokens: list[int]
) -> list[str]:
corrected_decoded_token_map = dict()
for idx, text in enumerate(decoded_tokens_list):
if text.endswith("<EFBFBD>"):
# utf-8 char at the end means it's a potential unfinished byte sequence
# from byte fallback tokenization.
corrected_decoded_token_map[idx] = self._correct_decoded_token(
idx, tokens
)
for idx, text in corrected_decoded_token_map.items():
decoded_tokens_list[idx] = text
return decoded_tokens_list
def update_from_output(self, output: EngineCoreOutput) -> None:
if output.new_logprobs is not None:
self._update_sample_logprobs(output.new_logprobs)