[Bugfix] Fix Granite Vision / Don't use Siglip Pooling Head Nested Models by Default (#32299)

Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com>
This commit is contained in:
Alex Brooks
2026-01-20 20:11:52 -07:00
committed by GitHub
parent 7013e9ac8f
commit 27b81e010d
5 changed files with 83 additions and 16 deletions

View File

@@ -124,8 +124,10 @@ def _llava_vllm_to_hf_output(
if token_id != mm_token_id or output_ids[idx - 1] != mm_token_id
]
assert output_str[0] == " "
hf_output_str = output_str[1:]
# Most llava-based models emit a leading " " in output_str, but some
# (e.g., Granite Vision) do not, so strip it only when it is present.
hf_output_str = output_str[1:] if output_str[0] == " " else output_str
if hf_output_ids[-1] == eos_token_id:
hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)