diff --git a/vllm/v1/structured_output/backend_outlines.py b/vllm/v1/structured_output/backend_outlines.py index 34916079f..53c08dbc3 100644 --- a/vllm/v1/structured_output/backend_outlines.py +++ b/vllm/v1/structured_output/backend_outlines.py @@ -122,7 +122,12 @@ class OutlinesGrammar(StructuredOutputGrammar): Returns False if the FSM failed to advance. """ if self.guide.accepts_tokens(tokens): - # Advance cannot fail because we checked Guide.accepts_tokens() + # Advance can fail when the next state reached after advancing with + # the current tokens is a dead state. This is because Guide.accepts_tokens() + # only checks whether the current tokens can be accepted, + # whereas Guide.advance() additionally checks the next state + # after all tokens are accepted. + # The FSM must therefore be constructed without dead states. for t in tokens: self.guide.advance(t) self.num_processed_tokens += 1 diff --git a/vllm/v1/structured_output/utils.py b/vllm/v1/structured_output/utils.py index 3c98538f8..1419cdce1 100644 --- a/vllm/v1/structured_output/utils.py +++ b/vllm/v1/structured_output/utils.py @@ -226,7 +226,9 @@ def _reduced_vocabulary( # by this point. token_bytes = bytes(token_str) # type: ignore[arg-type] - elif "\ufffd" in token_str and not re_replacement_seq.match(token_str): + elif (token_str == "\ufffd" and token != "\ufffd") or ( + "\ufffd" in token_str and not re_replacement_seq.match(token_str) + ): # Handle tokens with invalid UTF-8 sequences. if re_llama_byte_token.match(token): # Llama-like tokenizers use <0xXX> for incomplete sequences.