[Bugfix] vLLM produces invalid UTF-8 tokens and “�” (#28874)

Signed-off-by: John Calderon <jcalderon@nvidia.com>
Co-authored-by: Benjamin Chislett <bchislett@nvidia.com>
This commit is contained in:
John Calderon
2026-01-05 19:23:00 -05:00
committed by GitHub
parent 3c98c2d21b
commit 2f4e6548ef
3 changed files with 539 additions and 33 deletions

View File

@@ -514,6 +514,424 @@ def test_logprobs_mode(logprobs_mode: LogprobsMode):
del llm
class TestCorrectDecodedToken:
    """Unit tests for the ``_correct_decoded_token`` method in LogprobsProcessor.

    This method handles UTF-8 decoding issues where incomplete byte sequences
    result in the Unicode replacement character "\ufffd" (U+FFFD). This commonly
    happens with byte-fallback tokenization when multi-byte UTF-8 characters
    are split across tokens.
    """

    @pytest.fixture
    def mock_tokenizer(self):
        """Create a mock tokenizer for testing."""
        from unittest.mock import Mock

        tokenizer = Mock()
        return tokenizer

    @pytest.fixture
    def processor_with_empty_logprobs(self, mock_tokenizer):
        """Create a LogprobsProcessor with empty logprobs."""
        from vllm.v1.engine.logprobs import LogprobsProcessor

        processor = LogprobsProcessor(
            tokenizer=mock_tokenizer,
            logprobs=[],
            prompt_logprobs=None,
            cumulative_logprob=0.0,
            num_logprobs=1,
            num_prompt_logprobs=None,
        )
        return processor

    @pytest.fixture
    def processor_with_previous_logprobs(self, mock_tokenizer):
        """Create a LogprobsProcessor with previous logprobs."""
        from vllm.v1.engine.logprobs import LogprobsProcessor

        processor = LogprobsProcessor(
            tokenizer=mock_tokenizer,
            logprobs=[{123: None}],  # Previous token ID is 123
            prompt_logprobs=None,
            cumulative_logprob=0.0,
            num_logprobs=1,
            num_prompt_logprobs=None,
        )
        return processor

    def test_correction_with_previous_token_in_list(
        self, processor_with_empty_logprobs
    ):
        """Test correction using previous token in the same list.

        Scenario: Token at idx=1 ends with U+FFFD, but when decoded with
        the previous token (idx=0), it forms a valid UTF-8 sequence.
        """
        processor = processor_with_empty_logprobs
        tokens = [100, 101, 102]  # token IDs
        # Mock tokenizer behavior:
        # - decode([102]) returns "\ufffd" (ends with replacement char)
        # - decode([101, 102]) returns "valid" (no replacement char)
        processor.tokenizer.decode.side_effect = lambda ids: (
            "valid" if ids == [101, 102] else "\ufffd"
        )
        result = processor._correct_decoded_token(2, tokens)
        assert result == "valid"
        processor.tokenizer.decode.assert_called_with([101, 102])

    def test_correction_with_previous_logprob_token(
        self, processor_with_previous_logprobs
    ):
        """Test correction using previous logprob token.

        Scenario: Cannot correct with previous token in list (idx=0),
        but can correct with previous logprob token.
        """
        processor = processor_with_previous_logprobs
        tokens = [100]  # single token

        # Mock tokenizer behavior:
        # - decode([100]) returns "\ufffd" (ends with replacement char)
        # - decode([123, 100]) returns ' "polarized"' (no replacement char)
        # Token 123 is from previous logprobs
        def mock_decode(ids):
            if ids == [123, 100]:
                return ' "polarized"'
            return "\ufffd"

        processor.tokenizer.decode.side_effect = mock_decode
        result = processor._correct_decoded_token(0, tokens)
        assert result == ' "polarized"'

    def test_correction_at_idx_zero_no_previous_logprobs(
        self, processor_with_empty_logprobs
    ):
        """Test correction at idx=0 with no previous logprobs.

        Scenario: First token in list, no previous logprobs available.
        Should return empty string as fallback.
        """
        processor = processor_with_empty_logprobs
        tokens = [100]
        # Mock tokenizer always returns "\ufffd"
        processor.tokenizer.decode.return_value = "\ufffd"
        result = processor._correct_decoded_token(0, tokens)
        assert result == ""

    def test_correction_at_idx_zero_with_previous_logprobs(
        self, processor_with_previous_logprobs
    ):
        """Test correction at idx=0 with previous logprobs available.

        Scenario: First token in list, but previous logprobs exist.
        Should try correction with previous logprob token.
        """
        processor = processor_with_previous_logprobs
        tokens = [200]

        # Mock tokenizer behavior
        def mock_decode(ids):
            if ids == [123, 200]:
                return "corrected"
            return "\ufffd"

        processor.tokenizer.decode.side_effect = mock_decode
        result = processor._correct_decoded_token(0, tokens)
        assert result == "corrected"

    def test_no_correction_needed_returns_fallback(
        self, processor_with_previous_logprobs
    ):
        """Test fallback to empty string when no correction works.

        Scenario: All correction attempts still end with U+FFFD.
        Should return empty string as final fallback.
        """
        processor = processor_with_previous_logprobs
        tokens = [100, 101, 102]
        # Mock tokenizer always returns text ending with "\ufffd"
        processor.tokenizer.decode.return_value = "still\ufffd"
        result = processor._correct_decoded_token(2, tokens)
        assert result == ""

    def test_middle_token_correction(self, processor_with_previous_logprobs):
        """Test correction for a token in the middle of the list.

        Scenario: Token at idx=5 in a longer list needs correction.
        """
        processor = processor_with_previous_logprobs
        tokens = [10, 20, 30, 40, 50, 60, 70, 80]

        # Mock tokenizer behavior for middle token
        def mock_decode(ids):
            if ids == [50, 60]:
                return "olar"
            return "\ufffd"

        processor.tokenizer.decode.side_effect = mock_decode
        result = processor._correct_decoded_token(5, tokens)
        assert result == "olar"

    def test_multiple_consecutive_replacement_chars(
        self, processor_with_previous_logprobs
    ):
        """Test handling of multiple consecutive replacement characters.

        Scenario: Sequence like ["\ufffd", "\ufffd", "p"] where the first
        two should become empty strings.
        """
        processor = processor_with_previous_logprobs
        # Test first replacement char
        tokens = [100, 101, 102]
        processor.tokenizer.decode.return_value = "still\ufffd"
        result1 = processor._correct_decoded_token(0, tokens)
        assert result1 == ""
        # Test second replacement char
        result2 = processor._correct_decoded_token(1, tokens)
        assert result2 == ""

    def test_correction_with_multibyte_utf8(self, processor_with_previous_logprobs):
        """Test correction involving multi-byte UTF-8 characters.

        Scenario: Byte-fallback tokenization splits multi-byte UTF-8
        characters (e.g., curly quotes, Chinese characters, emojis).
        Example from user: "\ufffd", "\ufffd" -> " \u201c", "\u201d"
        """
        processor = processor_with_previous_logprobs
        tokens = [200, 201]

        # Mock tokenizer behavior for multi-byte UTF-8 correction
        def mock_decode(ids):
            # When decoding first token (idx=0) with previous logprob token
            if ids == [123, 200]:
                return " \u201c"  # Space + left curly quote
            # When decoding second token (idx=1) with previous token in list
            elif ids == [200, 201]:
                return "\u201d"  # Right curly quote
            # When decoding second token (idx=1) with previous logprob + prev token
            elif ids == [123, 200, 201]:
                return " \u201c\u201d"  # Full sequence
            return "\ufffd"

        processor.tokenizer.decode.side_effect = mock_decode
        # First token correction (idx=0)
        # Will call decode([123, 200]) since idx=0 uses previous logprob token
        result1 = processor._correct_decoded_token(0, tokens)
        assert result1 == " \u201c"
        # Second token correction (idx=1)
        # Will call decode([200, 201]) since idx>0 uses previous token in list
        result2 = processor._correct_decoded_token(1, tokens)
        assert result2 == "\u201d"

    def test_real_world_opt125m_scenario(self, mock_tokenizer):
        """Test the real-world scenario from the user's bug report.

        User's example with facebook/opt-125m:
        Before: [" the", " term", " \ufffd", "\ufffd", "p", "olar", "ized",
                 "\ufffd", "\ufffd", ...]
        After:  [" the", " term", "", " \u201c", "p", "olar", "ized",
                 "", "\u201d", ...]
        """
        from vllm.v1.engine.logprobs import LogprobsProcessor

        # Simulate the sequence of tokens
        processor = LogprobsProcessor(
            tokenizer=mock_tokenizer,
            logprobs=[],
            prompt_logprobs=None,
            cumulative_logprob=0.0,
            num_logprobs=1,
            num_prompt_logprobs=None,
        )
        # Token IDs representing the problematic sequence
        tokens = [1, 2, 3, 4, 5, 6, 7, 8, 9]  # placeholder IDs

        # Mock decode behavior simulating the real scenario
        def mock_decode(ids):
            # Simulate cases where individual tokens decode to "\ufffd"
            # but combinations decode correctly
            if len(ids) == 1:
                if ids[0] == 3 or ids[0] == 4 or ids[0] == 8 or ids[0] == 9:
                    return "\ufffd"
            elif len(ids) == 2:
                if ids == [2, 3]:
                    return " term\ufffd"  # Still ends with U+FFFD, need more context
                elif ids == [3, 4]:
                    return " \u201c"  # Corrected to space + left curly quote
                elif ids == [7, 8]:
                    return "ized\ufffd"  # Still ends with U+FFFD
                elif ids == [8, 9]:
                    return "\u201d"  # Corrected to right curly quote
            elif len(ids) == 3:
                if ids == [1, 2, 3]:
                    return " the term\ufffd"  # Still ends with issue
                elif ids == [2, 3, 4]:
                    return " term \u201c"  # With all context
            return "normal_text"

        mock_tokenizer.decode.side_effect = mock_decode
        # Test token at index 2 (should fail to correct, return "")
        # Token 3 individually is "\ufffd"
        # decode([2, 3]) = " term\ufffd" (still ends with U+FFFD)
        # No previous logprobs, so fallback to ""
        result = processor._correct_decoded_token(2, tokens)
        assert result == ""
        # Test token at index 3 (should correct to " \u201c")
        # Token 4 individually is "\ufffd"
        # decode([3, 4]) = " \u201c" (corrected!)
        processor.logprobs = [{2: None}]  # Add previous logprob
        result = processor._correct_decoded_token(3, tokens)
        assert result == " \u201c"
def test_verify_tokens_integration():
    """Integration test for _verify_tokens with a real model.

    This test validates that _verify_tokens correctly identifies and
    corrects tokens ending with the replacement character "\ufffd" (U+FFFD).
    Uses facebook/opt-125m which is known to produce these issues.
    """
    runner = VllmRunner(
        "facebook/opt-125m",
        max_logprobs=0,
        enable_prefix_caching=False,
        gpu_memory_utilization=0.15,
        max_model_len=256,
    )
    # Use a prompt that triggers multi-byte UTF-8 issues
    # Based on user's example: "In this example,"
    test_prompts = ["In this example,"]
    sampling_params = SamplingParams(
        max_tokens=16,
        temperature=0,
        logprobs=0,
    )
    results = runner.llm.generate(test_prompts, sampling_params=sampling_params)
    # Verify that decoded tokens don't contain replacement characters
    for result in results:
        assert result.outputs[0].logprobs is not None
        for logprob_dict in result.outputs[0].logprobs:
            for token_id, logprob_info in logprob_dict.items():
                decoded_token = logprob_info.decoded_token
                # Decoded tokens should not end with the replacement character;
                # they should either be corrected or an empty string.
                assert not decoded_token.endswith("\ufffd"), (
                    f"Token {token_id} decoded to '{decoded_token}' which "
                    f"ends with replacement character"
                )
                # Decoded tokens should not be a lone replacement character
                assert decoded_token != "\ufffd", (
                    f"Token {token_id} is a lone replacement character"
                )
def test_utf8_edge_cases_with_real_model():
    """Test various UTF-8 edge cases with a real model.

    Tests prompts that are likely to trigger byte-fallback tokenization
    and multi-byte UTF-8 splitting.
    """
    runner = VllmRunner(
        "facebook/opt-125m",
        max_logprobs=1,
        enable_prefix_caching=False,
        gpu_memory_utilization=0.15,
        max_model_len=256,
    )
    # Prompts with various multi-byte UTF-8 characters
    test_prompts = [
        'Smart quotes: "Hello"',  # Curly quotes
        "Em dash — test",  # Em dash
        "Ellipsis… continues",  # Ellipsis
        "Chinese: 你好",  # Chinese characters
        "Emoji: 😀 🎉",  # Emojis
        'Mixed: "quoted" — with symbols',  # Mixed
    ]
    sampling_params = SamplingParams(
        max_tokens=10,
        temperature=0,
        logprobs=1,
    )
    results = runner.llm.generate(test_prompts, sampling_params=sampling_params)
    for i, result in enumerate(results):
        prompt = test_prompts[i]
        assert result.outputs[0].logprobs is not None
        # Check that no decoded tokens end with the replacement character
        for logprob_dict in result.outputs[0].logprobs:
            for token_id, logprob_info in logprob_dict.items():
                decoded_token = logprob_info.decoded_token
                assert not decoded_token.endswith("\ufffd"), (
                    f"Prompt: '{prompt}'\n"
                    f"Token {token_id} decoded to '{decoded_token}' which "
                    f"ends with replacement character"
                )
def test_correct_decoded_token_preserves_valid_tokens():
    """Test that valid tokens (not ending with U+FFFD) are not modified.

    The _correct_decoded_token method should only be called for tokens
    ending with "\ufffd", but this test verifies the broader _verify_tokens
    logic doesn't affect valid tokens.
    """
    runner = VllmRunner(
        "facebook/opt-125m",
        max_logprobs=2,
        enable_prefix_caching=False,
        gpu_memory_utilization=0.15,
        max_model_len=256,
    )
    # Simple prompt with standard ASCII characters
    test_prompts = ["Hello world, this is a test."]
    sampling_params = SamplingParams(
        max_tokens=10,
        temperature=0,
        logprobs=2,
    )
    results = runner.llm.generate(test_prompts, sampling_params=sampling_params)
    for result in results:
        assert result.outputs[0].logprobs is not None
        # All decoded tokens should be valid strings
        for logprob_dict in result.outputs[0].logprobs:
            for token_id, logprob_info in logprob_dict.items():
                decoded_token = logprob_info.decoded_token
                # Valid tokens should be strings (possibly empty if corrected)
                assert isinstance(decoded_token, str)
                # Should not contain the replacement character
                assert "\ufffd" not in decoded_token
@pytest.mark.parametrize("logprobs_mode", get_args(LogprobsMode))
@pytest.mark.parametrize(
"model_setup",