[Bugfix] vLLM produces invalid UTF-8 tokens and “�” (#28874)

Signed-off-by: John Calderon <jcalderon@nvidia.com>
Co-authored-by: Benjamin Chislett <bchislett@nvidia.com>
This commit is contained in:
John Calderon
2026-01-05 19:23:00 -05:00
committed by GitHub
parent 3c98c2d21b
commit 2f4e6548ef
3 changed files with 539 additions and 33 deletions

View File

@@ -514,6 +514,424 @@ def test_logprobs_mode(logprobs_mode: LogprobsMode):
del llm
class TestCorrectDecodedToken:
    """Unit tests for the ``_correct_decoded_token`` method in LogprobsProcessor.

    This method handles UTF-8 decoding issues where incomplete byte sequences
    result in the Unicode replacement character "\ufffd" (U+FFFD). This commonly
    happens with byte-fallback tokenization when multi-byte UTF-8 characters
    are split across tokens.
    """

    @pytest.fixture
    def mock_tokenizer(self):
        """Create a mock tokenizer for testing."""
        from unittest.mock import Mock

        tokenizer = Mock()
        return tokenizer

    @pytest.fixture
    def processor_with_empty_logprobs(self, mock_tokenizer):
        """Create a LogprobsProcessor with empty logprobs."""
        from vllm.v1.engine.logprobs import LogprobsProcessor

        processor = LogprobsProcessor(
            tokenizer=mock_tokenizer,
            logprobs=[],
            prompt_logprobs=None,
            cumulative_logprob=0.0,
            num_logprobs=1,
            num_prompt_logprobs=None,
        )
        return processor

    @pytest.fixture
    def processor_with_previous_logprobs(self, mock_tokenizer):
        """Create a LogprobsProcessor with previous logprobs."""
        from vllm.v1.engine.logprobs import LogprobsProcessor

        processor = LogprobsProcessor(
            tokenizer=mock_tokenizer,
            logprobs=[{123: None}],  # Previous token ID is 123
            prompt_logprobs=None,
            cumulative_logprob=0.0,
            num_logprobs=1,
            num_prompt_logprobs=None,
        )
        return processor

    def test_correction_with_previous_token_in_list(
        self, processor_with_empty_logprobs
    ):
        """Test correction using previous token in the same list.

        Scenario: Token at idx=1 ends with U+FFFD, but when decoded with
        the previous token (idx=0), it forms a valid UTF-8 sequence.
        """
        processor = processor_with_empty_logprobs
        tokens = [100, 101, 102]  # token IDs
        # Mock tokenizer behavior:
        # - decode([102]) returns "\ufffd" (ends with replacement char)
        # - decode([101, 102]) returns "valid" (no replacement char)
        processor.tokenizer.decode.side_effect = lambda ids: (
            "valid" if ids == [101, 102] else "\ufffd"
        )
        result = processor._correct_decoded_token(2, tokens)
        assert result == "valid"
        processor.tokenizer.decode.assert_called_with([101, 102])

    def test_correction_with_previous_logprob_token(
        self, processor_with_previous_logprobs
    ):
        """Test correction using previous logprob token.

        Scenario: Cannot correct with previous token in list (idx=0),
        but can correct with previous logprob token.
        """
        processor = processor_with_previous_logprobs
        tokens = [100]  # single token

        # Mock tokenizer behavior:
        # - decode([100]) returns "\ufffd" (ends with replacement char)
        # - decode([123, 100]) returns ' "polarized"' (no replacement char)
        # Token 123 is from previous logprobs
        def mock_decode(ids):
            if ids == [123, 100]:
                return ' "polarized"'
            return "\ufffd"

        processor.tokenizer.decode.side_effect = mock_decode
        result = processor._correct_decoded_token(0, tokens)
        assert result == ' "polarized"'

    def test_correction_at_idx_zero_no_previous_logprobs(
        self, processor_with_empty_logprobs
    ):
        """Test correction at idx=0 with no previous logprobs.

        Scenario: First token in list, no previous logprobs available.
        Should return empty string as fallback.
        """
        processor = processor_with_empty_logprobs
        tokens = [100]
        # Mock tokenizer always returns "\ufffd"
        processor.tokenizer.decode.return_value = "\ufffd"
        result = processor._correct_decoded_token(0, tokens)
        assert result == ""

    def test_correction_at_idx_zero_with_previous_logprobs(
        self, processor_with_previous_logprobs
    ):
        """Test correction at idx=0 with previous logprobs available.

        Scenario: First token in list, but previous logprobs exist.
        Should try correction with previous logprob token.
        """
        processor = processor_with_previous_logprobs
        tokens = [200]

        # Mock tokenizer behavior
        def mock_decode(ids):
            if ids == [123, 200]:
                return "corrected"
            return "\ufffd"

        processor.tokenizer.decode.side_effect = mock_decode
        result = processor._correct_decoded_token(0, tokens)
        assert result == "corrected"

    def test_no_correction_needed_returns_fallback(
        self, processor_with_previous_logprobs
    ):
        """Test fallback to empty string when no correction works.

        Scenario: All correction attempts still end with U+FFFD.
        Should return empty string as final fallback.
        """
        processor = processor_with_previous_logprobs
        tokens = [100, 101, 102]
        # Mock tokenizer always returns text ending with "\ufffd"
        processor.tokenizer.decode.return_value = "still\ufffd"
        result = processor._correct_decoded_token(2, tokens)
        assert result == ""

    def test_middle_token_correction(self, processor_with_previous_logprobs):
        """Test correction for a token in the middle of the list.

        Scenario: Token at idx=5 in a longer list needs correction.
        """
        processor = processor_with_previous_logprobs
        tokens = [10, 20, 30, 40, 50, 60, 70, 80]

        # Mock tokenizer behavior for middle token
        def mock_decode(ids):
            if ids == [50, 60]:
                return "olar"
            return "\ufffd"

        processor.tokenizer.decode.side_effect = mock_decode
        result = processor._correct_decoded_token(5, tokens)
        assert result == "olar"

    def test_multiple_consecutive_replacement_chars(
        self, processor_with_previous_logprobs
    ):
        """Test handling of multiple consecutive replacement characters.

        Scenario: Sequence like ["\ufffd", "\ufffd", "p"] where the first
        two should become empty strings.
        """
        processor = processor_with_previous_logprobs
        # Test first replacement char
        tokens = [100, 101, 102]
        processor.tokenizer.decode.return_value = "still\ufffd"
        result1 = processor._correct_decoded_token(0, tokens)
        assert result1 == ""
        # Test second replacement char
        result2 = processor._correct_decoded_token(1, tokens)
        assert result2 == ""

    def test_correction_with_multibyte_utf8(self, processor_with_previous_logprobs):
        """Test correction involving multi-byte UTF-8 characters.

        Scenario: Byte-fallback tokenization splits multi-byte UTF-8
        characters (e.g., curly quotes, Chinese characters, emojis).
        Example from user: "\ufffd", "\ufffd" -> " \u201c", "\u201d"
        """
        processor = processor_with_previous_logprobs
        tokens = [200, 201]

        # Mock tokenizer behavior for multi-byte UTF-8 correction
        def mock_decode(ids):
            # When decoding first token (idx=0) with previous logprob token
            if ids == [123, 200]:
                return " \u201c"  # Space + left curly quote
            # When decoding second token (idx=1) with previous token in list
            elif ids == [200, 201]:
                return "\u201d"  # Right curly quote
            # When decoding second token (idx=1) with previous logprob + prev token
            elif ids == [123, 200, 201]:
                return " \u201c\u201d"  # Full sequence
            return "\ufffd"

        processor.tokenizer.decode.side_effect = mock_decode
        # First token correction (idx=0)
        # Will call decode([123, 200]) since idx=0 uses previous logprob token
        result1 = processor._correct_decoded_token(0, tokens)
        assert result1 == " \u201c"
        # Second token correction (idx=1)
        # Will call decode([200, 201]) since idx>0 uses previous token in list
        result2 = processor._correct_decoded_token(1, tokens)
        assert result2 == "\u201d"

    def test_real_world_opt125m_scenario(self, mock_tokenizer):
        """Test the real-world scenario from the user's bug report.

        User's example with facebook/opt-125m:
        Before: [" the", " term", " \ufffd", "\ufffd", "p", "olar", "ized",
                 "\ufffd", "\ufffd", ...]
        After:  [" the", " term", "", " \u201c", "p", "olar", "ized",
                 "", "\u201d", ...]
        """
        from vllm.v1.engine.logprobs import LogprobsProcessor

        # Simulate the sequence of tokens
        processor = LogprobsProcessor(
            tokenizer=mock_tokenizer,
            logprobs=[],
            prompt_logprobs=None,
            cumulative_logprob=0.0,
            num_logprobs=1,
            num_prompt_logprobs=None,
        )
        # Token IDs representing the problematic sequence
        tokens = [1, 2, 3, 4, 5, 6, 7, 8, 9]  # placeholder IDs

        # Mock decode behavior simulating the real scenario
        def mock_decode(ids):
            # Simulate cases where individual tokens decode to "\ufffd"
            # but combinations decode correctly
            if len(ids) == 1:
                if ids[0] == 3 or ids[0] == 4 or ids[0] == 8 or ids[0] == 9:
                    return "\ufffd"
            elif len(ids) == 2:
                if ids == [2, 3]:
                    return " term\ufffd"  # Still ends with U+FFFD, need more context
                elif ids == [3, 4]:
                    return " \u201c"  # Corrected to space + left curly quote
                elif ids == [7, 8]:
                    return "ized\ufffd"  # Still ends with U+FFFD
                elif ids == [8, 9]:
                    return "\u201d"  # Corrected to right curly quote
            elif len(ids) == 3:
                if ids == [1, 2, 3]:
                    return " the term\ufffd"  # Still ends with issue
                elif ids == [2, 3, 4]:
                    return " term \u201c"  # With all context
            return "normal_text"

        mock_tokenizer.decode.side_effect = mock_decode
        # Test token at index 2 (should fail to correct, return "")
        # Token 3 individually is "\ufffd"
        # decode([2, 3]) = " term\ufffd" (still ends with U+FFFD)
        # No previous logprobs, so fallback to ""
        result = processor._correct_decoded_token(2, tokens)
        assert result == ""
        # Test token at index 3 (should correct to " \u201c")
        # Token 4 individually is "\ufffd"
        # decode([3, 4]) = " \u201c" (corrected!)
        processor.logprobs = [{2: None}]  # Add previous logprob
        result = processor._correct_decoded_token(3, tokens)
        assert result == " \u201c"
def test_verify_tokens_integration():
    """Integration test for _verify_tokens with a real model.

    This test validates that _verify_tokens correctly identifies and
    corrects tokens ending with the replacement character "\ufffd" (U+FFFD).
    Uses facebook/opt-125m which is known to produce these issues.
    """
    runner = VllmRunner(
        "facebook/opt-125m",
        max_logprobs=0,
        enable_prefix_caching=False,
        gpu_memory_utilization=0.15,
        max_model_len=256,
    )
    # Use a prompt that triggers multi-byte UTF-8 issues
    # Based on user's example: "In this example,"
    test_prompts = ["In this example,"]
    sampling_params = SamplingParams(
        max_tokens=16,
        temperature=0,
        logprobs=0,
    )
    results = runner.llm.generate(test_prompts, sampling_params=sampling_params)
    # Verify that decoded tokens don't contain replacement characters
    for result in results:
        assert result.outputs[0].logprobs is not None
        for logprob_dict in result.outputs[0].logprobs:
            for token_id, logprob_info in logprob_dict.items():
                decoded_token = logprob_info.decoded_token
                # Decoded tokens should not end with the replacement character;
                # they should either be corrected or an empty string.
                assert not decoded_token.endswith("\ufffd"), (
                    f"Token {token_id} decoded to '{decoded_token}' which "
                    f"ends with replacement character"
                )
                # Decoded tokens should not be a lone replacement character
                assert decoded_token != "\ufffd", (
                    f"Token {token_id} is a lone replacement character"
                )
def test_utf8_edge_cases_with_real_model():
    """Test various UTF-8 edge cases with a real model.

    Tests prompts that are likely to trigger byte-fallback tokenization
    and multi-byte UTF-8 splitting.
    """
    runner = VllmRunner(
        "facebook/opt-125m",
        max_logprobs=1,
        enable_prefix_caching=False,
        gpu_memory_utilization=0.15,
        max_model_len=256,
    )
    # Prompts with various multi-byte UTF-8 characters
    test_prompts = [
        'Smart quotes: "Hello"',  # Curly quotes
        "Em dash — test",  # Em dash
        "Ellipsis… continues",  # Ellipsis
        "Chinese: 你好",  # Chinese characters
        "Emoji: 😀 🎉",  # Emojis
        'Mixed: "quoted" — with symbols',  # Mixed
    ]
    sampling_params = SamplingParams(
        max_tokens=10,
        temperature=0,
        logprobs=1,
    )
    results = runner.llm.generate(test_prompts, sampling_params=sampling_params)
    for i, result in enumerate(results):
        prompt = test_prompts[i]
        assert result.outputs[0].logprobs is not None
        # Check that no decoded tokens end with the replacement character
        for logprob_dict in result.outputs[0].logprobs:
            for token_id, logprob_info in logprob_dict.items():
                decoded_token = logprob_info.decoded_token
                assert not decoded_token.endswith("\ufffd"), (
                    f"Prompt: '{prompt}'\n"
                    f"Token {token_id} decoded to '{decoded_token}' which "
                    f"ends with replacement character"
                )
def test_correct_decoded_token_preserves_valid_tokens():
    """Test that valid tokens (not ending with U+FFFD) are not modified.

    The _correct_decoded_token method should only be called for tokens
    ending with "\ufffd", but this test verifies the broader _verify_tokens
    logic doesn't affect valid tokens.
    """
    runner = VllmRunner(
        "facebook/opt-125m",
        max_logprobs=2,
        enable_prefix_caching=False,
        gpu_memory_utilization=0.15,
        max_model_len=256,
    )
    # Simple prompt with standard ASCII characters
    test_prompts = ["Hello world, this is a test."]
    sampling_params = SamplingParams(
        max_tokens=10,
        temperature=0,
        logprobs=2,
    )
    results = runner.llm.generate(test_prompts, sampling_params=sampling_params)
    for result in results:
        assert result.outputs[0].logprobs is not None
        # All decoded tokens should be valid strings
        for logprob_dict in result.outputs[0].logprobs:
            for token_id, logprob_info in logprob_dict.items():
                decoded_token = logprob_info.decoded_token
                # Valid tokens should be strings (possibly empty if corrected)
                assert isinstance(decoded_token, str)
                # Should not contain the replacement character
                assert "\ufffd" not in decoded_token
@pytest.mark.parametrize("logprobs_mode", get_args(LogprobsMode))
@pytest.mark.parametrize(
"model_setup",