[2/3] Refactor InternVL-based processors (#37324)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung
2026-03-18 22:22:19 +08:00
committed by GitHub
parent 525f2eeb0b
commit 99267c23ca
18 changed files with 815 additions and 1199 deletions

View File

@@ -27,7 +27,8 @@ from vllm.multimodal.processing import (
PromptUpdate,
PromptUpdateDetails,
)
from vllm.transformers_utils.processors.nvlm_d import IMG_PAD, NVLMProcessor
from vllm.transformers_utils.processors.internvl import InternVLImageProcessor
from vllm.transformers_utils.processors.nvlm_d import NVLMProcessor
from .intern_vit import InternVisionModel
from .internvl import (
@@ -39,12 +40,33 @@ from .internvl import (
class NVLMProcessingInfo(BaseInternVLProcessingInfo):
# Build the InternVLImageProcessor used for this model.
# Caller-supplied kwargs are first passed through self.ctx.get_merged_mm_kwargs;
# any option still absent afterwards is filled in from the HF config.
def get_image_processor(self, **kwargs):
config = self.get_hf_config()
vision_config = config.vision_config
kwargs = self.ctx.get_merged_mm_kwargs(kwargs)
# setdefault only writes keys that are missing, so values already present
# in the merged kwargs take precedence over the config values below.
# image_size comes from the vision sub-config; the rest from the top-level config.
kwargs.setdefault("image_size", vision_config.image_size)
kwargs.setdefault("min_dynamic_patch", config.min_dynamic_patch)
kwargs.setdefault("max_dynamic_patch", config.max_dynamic_patch)
kwargs.setdefault("dynamic_image_size", config.dynamic_image_size)
kwargs.setdefault("use_thumbnail", config.use_thumbnail)
return InternVLImageProcessor(**kwargs)
# NOTE(review): the diff markers are missing in this view, so removed and added
# lines appear interleaved below (e.g. the `self.ctx.init_processor(` call is
# never closed before the next statement, and `**kwargs` sits between keyword
# arguments) — confirm against the actual commit which lines are current.
def get_hf_processor(self, **kwargs: object) -> NVLMProcessor:
return self.ctx.init_processor(
NVLMProcessor,
config=self.get_hf_config(),
config = self.get_hf_config()
vision_config = config.vision_config
# The image processor is built from the same kwargs; its image_size
# attribute is the value used for the sequence-length computation below.
image_processor = self.get_image_processor(**kwargs)
image_size = image_processor.image_size
patch_size = vision_config.patch_size
downsample_ratio = config.downsample_ratio
# (image_size // patch_size) squared, scaled by downsample_ratio squared,
# then truncated to int; passed to NVLMProcessor as image_seq_length.
image_seq_length = int((image_size // patch_size) ** 2 * (downsample_ratio**2))
return NVLMProcessor(
tokenizer=self.get_tokenizer(),
**kwargs,
image_processor=image_processor,
image_seq_length=image_seq_length,
)
@@ -117,9 +139,11 @@ class NVLMMultiModalProcessor(BaseInternVLMultiModalProcessor[NVLMProcessingInfo
if num_patches is not None:
assert isinstance(num_patches, int)
repl = hf_processor.get_image_repl(feature_size, num_patches)
repl = hf_processor.get_image_repl(num_patches, num_features=feature_size)
return PromptUpdateDetails.select_text(repl.full + "\n", IMG_PAD)
return PromptUpdateDetails.select_text(
repl.full + "\n", hf_processor.ctx_image_token
)
# See note in dummy data regarding why we have the extra newline
return [