[Bugfix] Fix byte fallback handling when using outlines (#31391)

Signed-off-by: Shinichi Hemmi <50256998+Alnusjaponica@users.noreply.github.com>
Co-authored-by: Kenichi Maehashi <maehashi@preferred.jp>
This commit is contained in:
Shinichi Hemmi
2026-01-21 04:48:08 +09:00
committed by GitHub
parent 7c5dedc247
commit 86c69dc54c
2 changed files with 9 additions and 2 deletions

View File

@@ -122,7 +122,12 @@ class OutlinesGrammar(StructuredOutputGrammar):
Returns False if the FSM failed to advance.
"""
if self.guide.accepts_tokens(tokens):
# Advance cannot fail because we checked Guide.accepts_tokens()
# Advancing can still fail if the state reached after consuming the
# current tokens is a dead state. Guide.accepts_tokens() only checks
# whether the current tokens can be accepted, whereas Guide.advance()
# additionally checks the state reached after all tokens are accepted.
# The FSM must therefore be prepared without dead states.
for t in tokens:
self.guide.advance(t)
self.num_processed_tokens += 1

View File

@@ -226,7 +226,9 @@ def _reduced_vocabulary(
# by this point.
token_bytes = bytes(token_str) # type: ignore[arg-type]
elif "\ufffd" in token_str and not re_replacement_seq.match(token_str):
elif (token_str == "\ufffd" and token != "\ufffd") or (
"\ufffd" in token_str and not re_replacement_seq.match(token_str)
):
# Handle tokens with invalid UTF-8 sequences.
if re_llama_byte_token.match(token):
# Llama-like tokenizers use <0xXX> for incomplete sequences.