[VLM] Avoid unnecessary tokenization (#12310)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung
2025-01-22 19:08:31 +08:00
committed by GitHub
parent 68ad4e3a8d
commit cd7b6f0857
9 changed files with 71 additions and 40 deletions

View File

@@ -315,13 +315,14 @@ class PixtralHFMultiModalProcessor(
hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargs,
) -> list[PromptReplacement]:
processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
hf_config = self.info.get_hf_config()
image_token_id = hf_config.image_token_index
tokenizer = self.info.get_tokenizer()
vocab = tokenizer.get_vocab()
processor = self.info.get_hf_processor()
image_token = processor.image_token
image_break_token = processor.image_break_token
image_end_token = processor.image_end_token
image_break_id = vocab[processor.image_break_token]
image_token_id = hf_config.image_token_index
image_end_id = vocab[processor.image_end_token]
vision_config = hf_config.vision_config
assert isinstance(vision_config, PixtralVisionConfig)
@@ -336,10 +337,10 @@ class PixtralHFMultiModalProcessor(
image_height=image_size.height,
)
tokens = ([image_token] * ncols + [image_break_token]) * nrows
tokens[-1] = image_end_token
tokens = ([image_token_id] * ncols + [image_break_id]) * nrows
tokens[-1] = image_end_id
return "".join(tokens)
return tokens
return [
PromptReplacement(