[1/2] Move InternVL-based processors (#37260)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung
2026-03-17 21:50:56 +08:00
committed by GitHub
parent 2660b9289c
commit f340324335
20 changed files with 3252 additions and 3099 deletions

View File

@@ -27,48 +27,16 @@ from vllm.multimodal.processing import (
PromptUpdate,
PromptUpdateDetails,
)
from vllm.transformers_utils.processors.nvlm_d import IMG_PAD, NVLMProcessor
from .intern_vit import InternVisionModel
from .internvl import (
BaseInternVLDummyInputsBuilder,
BaseInternVLMultiModalProcessor,
BaseInternVLProcessingInfo,
BaseInternVLProcessor,
InternVLChatModel,
)
# Placeholder token repeated once per image-feature position when expanding an
# image into the text prompt (see NVLMProcessor.get_image_repl below).
# NOTE(review): an import above also brings in a name IMG_PAD from
# vllm.transformers_utils.processors.nvlm_d — this looks like a diff artifact
# (old vs. new location of the constant); confirm which definition is canonical.
IMG_PAD = "<|vision_pad|>"
class NVLMProcessor(BaseInternVLProcessor):
    """InternVL-style HF processor specialized for NVLM-D prompts."""

    @property
    def image_token_id(self) -> int:
        """Vocabulary id of the image-padding placeholder token."""
        return self.tokenizer.get_vocab()[IMG_PAD]

    def get_image_repl(
        self,
        feature_size: int,
        num_patches: int | None,
    ) -> PromptUpdateDetails[str]:
        """Build the textual replacement for one image.

        Each tile gets a positional identifier tag followed by
        ``feature_size // num_patches`` copies of the pad token; the whole
        sequence is wrapped in ``<Image>``/``</Image>`` markers.
        """
        if num_patches is None:
            raise NotImplementedError("Embedding inputs are not supported")

        tags = [f"<tile_{i}>" for i in range(1, num_patches)]
        if self.use_thumbnail:
            tags.append("<tile_global_thumbnail>")

        pads_per_tile = feature_size // num_patches
        body = "".join(tag + IMG_PAD * pads_per_tile for tag in tags)

        # We include the start and end as well because "<Image><tile" is
        # tokenized as ["<Image", "><", "tile"], resulting in assertion error
        # when trying to find "<tile" as a subsequence of "<Image><tile"
        full_repl = "<Image>" + body + "</Image>"
        return PromptUpdateDetails.select_text(full_repl, IMG_PAD)
class NVLMProcessingInfo(BaseInternVLProcessingInfo):
def get_hf_processor(self, **kwargs: object) -> NVLMProcessor: