[Bugfix] Fix Qwen-VL tokenizer implementation (#36140)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung
2026-03-06 00:07:19 +08:00
committed by GitHub
parent 176c799f4c
commit 7196348157
9 changed files with 118 additions and 66 deletions

View File

@@ -29,7 +29,8 @@ def test_tokenizer_like_protocol():
     _assert_tokenizer_like(tokenizer)

     tokenizer = get_tokenizer(
-        "mistralai/Mistral-7B-Instruct-v0.3", tokenizer_mode="mistral"
+        "mistralai/Mistral-7B-Instruct-v0.3",
+        tokenizer_mode="mistral",
     )
     assert isinstance(tokenizer, MistralTokenizer)
     _assert_tokenizer_like(tokenizer)
@@ -40,11 +41,20 @@ def test_tokenizer_like_protocol():
tokenizer = get_tokenizer("deepseek-ai/DeepSeek-V3", tokenizer_mode="deepseek_v32")
assert isinstance(tokenizer, HfTokenizer)
# Verify it's a fast tokenizer (required for FastIncrementalDetokenizer)
assert isinstance(tokenizer, PreTrainedTokenizerFast)
assert "DSV32" in tokenizer.__class__.__name__
_assert_tokenizer_like(tokenizer)
tokenizer = get_tokenizer(
"Qwen/Qwen-VL",
tokenizer_mode="qwen_vl",
trust_remote_code=True,
)
assert isinstance(tokenizer, HfTokenizer)
assert "WithoutImagePad" in tokenizer.__class__.__name__
@pytest.mark.parametrize("tokenizer_name", ["facebook/opt-125m", "gpt2"])
def test_tokenizer_revision(tokenizer_name: str):