[VLM] Avoid unnecessary tokenization (#12310)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-01-22 19:08:31 +08:00
parent 68ad4e3a8d
commit cd7b6f0857
9 changed files with 71 additions and 40 deletions
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -315,13 +315,14 @@ class PixtralHFMultiModalProcessor(
        hf_processor_mm_kwargs: Mapping[str, object],
        out_mm_kwargs: MultiModalKwargs,
    ) -> list[PromptReplacement]:
+        processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
        hf_config = self.info.get_hf_config()
-        image_token_id = hf_config.image_token_index
+        tokenizer = self.info.get_tokenizer()
+        vocab = tokenizer.get_vocab()

-        processor = self.info.get_hf_processor()
-        image_token = processor.image_token
-        image_break_token = processor.image_break_token
-        image_end_token = processor.image_end_token
+        image_break_id = vocab[processor.image_break_token]
+        image_token_id = hf_config.image_token_index
+        image_end_id = vocab[processor.image_end_token]

        vision_config = hf_config.vision_config
        assert isinstance(vision_config, PixtralVisionConfig)
@@ -336,10 +337,10 @@ class PixtralHFMultiModalProcessor(
                image_height=image_size.height,
            )

-            tokens = ([image_token] * ncols + [image_break_token]) * nrows
-            tokens[-1] = image_end_token
+            tokens = ([image_token_id] * ncols + [image_break_id]) * nrows
+            tokens[-1] = image_end_id

-            return "".join(tokens)
+            return tokens

        return [
            PromptReplacement(