[Bugfix] Fix byte fallback handling when using outlines (#31391)
Signed-off-by: Shinichi Hemmi <50256998+Alnusjaponica@users.noreply.github.com>
Co-authored-by: Kenichi Maehashi <maehashi@preferred.jp>
This commit is contained in:
@@ -122,7 +122,12 @@ class OutlinesGrammar(StructuredOutputGrammar):
|
||||
Returns False if the FSM failed to advance.
|
||||
"""
|
||||
if self.guide.accepts_tokens(tokens):
|
||||
# Advance cannot fail because we checked Guide.accepts_tokens()
|
||||
# Advance can fail when the next state reached after advancing with
|
||||
# the current tokens is a dead state. This is because Guide.accepts_tokens()
|
||||
# only checks whether the current tokens can be accepted,
|
||||
# whereas guide.advance() additionally checks the next state
|
||||
# after all tokens are accepted.
|
||||
# We need to be aware that the FSM must be prepared without dead states.
|
||||
for t in tokens:
|
||||
self.guide.advance(t)
|
||||
self.num_processed_tokens += 1
|
||||
|
||||
@@ -226,7 +226,9 @@ def _reduced_vocabulary(
|
||||
# by this point.
|
||||
token_bytes = bytes(token_str) # type: ignore[arg-type]
|
||||
|
||||
elif "\ufffd" in token_str and not re_replacement_seq.match(token_str):
|
||||
elif (token_str == "\ufffd" and token != "\ufffd") or (
|
||||
"\ufffd" in token_str and not re_replacement_seq.match(token_str)
|
||||
):
|
||||
# Handle tokens with invalid UTF-8 sequences.
|
||||
if re_llama_byte_token.match(token):
|
||||
# Llama-like tokenizers use <0xXX> for incomplete sequences.
|
||||
|
||||
Reference in New Issue
Block a user