[2/3] Refactor InternVL-based processors (#37324)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
@@ -27,7 +27,8 @@ from vllm.multimodal.processing import (
|
||||
PromptUpdate,
|
||||
PromptUpdateDetails,
|
||||
)
|
||||
-from vllm.transformers_utils.processors.nvlm_d import IMG_PAD, NVLMProcessor
|
||||
+from vllm.transformers_utils.processors.internvl import InternVLImageProcessor
|
||||
+from vllm.transformers_utils.processors.nvlm_d import NVLMProcessor
|
||||
|
||||
from .intern_vit import InternVisionModel
|
||||
from .internvl import (
|
||||
@@ -39,12 +40,33 @@ from .internvl import (
|
||||
|
||||
|
||||
class NVLMProcessingInfo(BaseInternVLProcessingInfo):
|
||||
+def get_image_processor(self, **kwargs):
|
||||
+config = self.get_hf_config()
|
||||
+vision_config = config.vision_config
|
||||
|
||||
+kwargs = self.ctx.get_merged_mm_kwargs(kwargs)
|
||||
+kwargs.setdefault("image_size", vision_config.image_size)
|
||||
+kwargs.setdefault("min_dynamic_patch", config.min_dynamic_patch)
|
||||
+kwargs.setdefault("max_dynamic_patch", config.max_dynamic_patch)
|
||||
+kwargs.setdefault("dynamic_image_size", config.dynamic_image_size)
|
||||
+kwargs.setdefault("use_thumbnail", config.use_thumbnail)
|
||||
|
||||
+return InternVLImageProcessor(**kwargs)
|
||||
|
||||
def get_hf_processor(self, **kwargs: object) -> NVLMProcessor:
|
||||
-return self.ctx.init_processor(
|
||||
-NVLMProcessor,
|
||||
-config=self.get_hf_config(),
|
||||
+config = self.get_hf_config()
|
||||
+vision_config = config.vision_config
|
||||
|
||||
+image_processor = self.get_image_processor(**kwargs)
|
||||
+image_size = image_processor.image_size
|
||||
+patch_size = vision_config.patch_size
|
||||
+downsample_ratio = config.downsample_ratio
|
||||
+image_seq_length = int((image_size // patch_size) ** 2 * (downsample_ratio**2))
|
||||
|
||||
+return NVLMProcessor(
|
||||
tokenizer=self.get_tokenizer(),
|
||||
-**kwargs,
|
||||
+image_processor=image_processor,
|
||||
+image_seq_length=image_seq_length,
|
||||
)
|
||||
|
||||
|
||||
@@ -117,9 +139,11 @@ class NVLMMultiModalProcessor(BaseInternVLMultiModalProcessor[NVLMProcessingInfo
|
||||
if num_patches is not None:
|
||||
assert isinstance(num_patches, int)
|
||||
|
||||
-repl = hf_processor.get_image_repl(feature_size, num_patches)
|
||||
+repl = hf_processor.get_image_repl(num_patches, num_features=feature_size)
|
||||
|
||||
-return PromptUpdateDetails.select_text(repl.full + "\n", IMG_PAD)
|
||||
+return PromptUpdateDetails.select_text(
|
||||
+repl.full + "\n", hf_processor.ctx_image_token
|
||||
+)
|
||||
|
||||
# See note in dummy data regarding why we have the extra newline
|
||||
return [
|
||||
|
||||
Reference in New Issue
Block a user