[Bugfix] vLLM produces invalid UTF-8 tokens and “�” (#28874)

Signed-off-by: John Calderon <jcalderon@nvidia.com>
Co-authored-by: Benjamin Chislett <bchislett@nvidia.com>
This commit is contained in:
John Calderon
2026-01-05 19:23:00 -05:00
committed by GitHub
parent 3c98c2d21b
commit 2f4e6548ef
3 changed files with 539 additions and 33 deletions

View File

@@ -274,12 +274,28 @@ def _validate_logprobs(
# the logprob token id at this sequence position
decoded_token = pos_logprob_dict[lp_tok].decoded_token
ref_decoded_token = _ref_convert_id_to_token(dtv.tokenizer, lp_tok)
assert decoded_token == ref_decoded_token, (
f"Sampled logprob token id {lp_tok} decodes to"
f" {ref_decoded_token} but Logprob decoded"
f" token is {decoded_token} instead"
f" (at position {idx})"
)
# With UTF-8 correction logic, tokens ending with "�"
# (incomplete byte sequences) are corrected to either
# empty string or proper UTF-8 characters
if ref_decoded_token.endswith("�"):
# Token needs UTF-8 correction
assert not decoded_token.endswith("�"), (
f"Sampled logprob token id {lp_tok} decodes to"
f" '{ref_decoded_token}' (ends with replacement char)"
f" but corrected decoded token '{decoded_token}'"
f" still ends with replacement char"
f" (at position {idx}). UTF-8 correction should"
f" have removed it."
)
else:
# No correction needed, should match exactly
assert decoded_token == ref_decoded_token, (
f"Sampled logprob token id {lp_tok} decodes to"
f" {ref_decoded_token} but Logprob decoded"
f" token is {decoded_token} instead"
f" (at position {idx})"
)
ref_cumulative_logprob += pos_logprob_dict[sampled_token].logprob
# Assert that cumulative logprobs are correct
@@ -420,12 +436,28 @@ def _validate_logprobs(
# the logprob token id at this sequence position
decoded_token = pos_logprob_dict[plp_tok].decoded_token
ref_decoded_token = _ref_convert_id_to_token(dtv.tokenizer, plp_tok)
assert decoded_token == ref_decoded_token, (
f"Prompt logprob token id {plp_tok} decodes to"
f" {ref_decoded_token} but Logprob decoded"
f" token is {decoded_token} instead"
f" (at position {idx})"
)
# With UTF-8 correction logic, tokens ending with "�"
# (incomplete byte sequences) are corrected to either
# empty string or proper UTF-8 characters
if ref_decoded_token.endswith("�"):
# Token needs UTF-8 correction
assert not decoded_token.endswith("�"), (
f"Prompt logprob token id {plp_tok} decodes to"
f" '{ref_decoded_token}' (ends with replacement char)"
f" but corrected decoded token '{decoded_token}'"
f" still ends with replacement char"
f" (at position {idx}). UTF-8 correction should"
f" have removed it."
)
else:
# No correction needed, should match exactly
assert decoded_token == ref_decoded_token, (
f"Prompt logprob token id {plp_tok} decodes to"
f" {ref_decoded_token} but Logprob decoded"
f" token is {decoded_token} instead"
f" (at position {idx})"
)
else:
# Prompt logprobs disabled for this request
assert prompt_logprobs is None