[VLM] Separate out profiling-related logic (#11746)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>

commit 996357e480 (parent 2a622d704a)
Author: Cyrus Leung
Date: 2025-01-06 16:02:21 +08:00
Committed by: GitHub

17 changed files with 1036 additions and 739 deletions

diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py

@@ -1,6 +1,6 @@
 from functools import cached_property
-from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple,
-                    TypedDict, Union)
+from typing import (Final, Iterable, List, Literal, Mapping, Optional,
+                    Protocol, Set, Tuple, TypedDict, Union)
 
 import numpy as np
 import torch
@@ -17,12 +17,14 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import MultiModalFieldConfig, NestedTensors
 from vllm.multimodal.parse import ImageSize
+from vllm.multimodal.profiling import BaseProfilingInfo
 from vllm.sequence import IntermediateTensors
 
 from .clip import CLIPVisionModel
 from .interfaces import SupportsMultiModal, SupportsPP
-from .llava import (LlavaMultiModalProcessor, LlavaMultiModalProjector,
-                    init_vision_tower_for_llava)
+from .llava import (BaseLlavaMultiModalProcessor, BaseLlavaProcessingMixin,
+                    BaseLlavaProfilingInfo, LlavaLikeConfig,
+                    LlavaMultiModalProjector, init_vision_tower_for_llava)
 from .siglip import SiglipVisionModel
 from .utils import (AutoWeightsLoader, embed_multimodal, flatten_bn,
                     init_vllm_registered_model, maybe_prefix)
@@ -60,36 +62,18 @@ LlavaNextImageInputs = Union[LlavaNextImagePixelInputs,
                              LlavaNextImageEmbeddingInputs]
 
 
-class LlavaNextMultiModalProcessor(LlavaMultiModalProcessor):
+class LlavaNextLikeConfig(LlavaLikeConfig, Protocol):
+    image_grid_pinpoints: Final[list[list[int]]]
 
-    def _get_hf_config(self) -> LlavaNextConfig:
+
+class LlavaNextProcessingMixin(BaseLlavaProcessingMixin):
+
+    def _get_hf_config(self) -> LlavaNextLikeConfig:
         return self.ctx.get_hf_config(LlavaNextConfig)
 
-    def _get_hf_processor(self) -> LlavaNextProcessor:
+    def _get_hf_processor(self):
         return self.ctx.get_hf_processor(LlavaNextProcessor)
 
-    def _get_mm_fields_config(
-        self,
-        hf_inputs: BatchFeature,
-        hf_processor_mm_kwargs: Mapping[str, object],
-    ) -> Mapping[str, MultiModalFieldConfig]:
-        return dict(
-            pixel_values=MultiModalFieldConfig.batched("image"),
-            image_sizes=MultiModalFieldConfig.batched("image"),
-            image_embeds=MultiModalFieldConfig.batched("image"),
-        )
-
-    def _get_image_token(self) -> str:
-        return self._get_hf_processor().image_token
-
-    def _get_max_image_tokens(self) -> int:
-        largest_feature_size, _ = self._get_pinpoint_with_most_features()
-        return largest_feature_size
-
-    def _get_dummy_image_size(self) -> ImageSize:
-        _, pinpoint = self._get_pinpoint_with_most_features()
-        return pinpoint
-
     # Based on: https://github.com/huggingface/text-generation-inference/blob/v2.2.0/server/text_generation_server/models/vlm_causal_lm.py#L106
     def _get_num_image_tokens(
         self,
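Note (not part of the diff): the new LlavaNextLikeConfig uses typing.Protocol, so any HF config object that exposes the right attributes satisfies it structurally, without inheriting from it. A minimal runnable sketch of the pattern, with hypothetical names:

from typing import Final, Protocol


class GridConfigLike(Protocol):
    # Any object exposing this attribute matches the protocol.
    image_grid_pinpoints: Final[list[list[int]]]


class FakeHfConfig:  # hypothetical stand-in for an HF config class
    def __init__(self) -> None:
        self.image_grid_pinpoints = [[336, 672], [672, 672]]


def largest_pinpoint(cfg: GridConfigLike) -> list[int]:
    # Type checkers accept FakeHfConfig here via structural typing.
    return max(cfg.image_grid_pinpoints, key=lambda hw: hw[0] * hw[1])


print(largest_pinpoint(FakeHfConfig()))  # [672, 672]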
@@ -98,7 +82,7 @@ class LlavaNextMultiModalProcessor(LlavaMultiModalProcessor):
         image_height: int,
     ) -> int:
         hf_config = self._get_hf_config()
-        vision_encoder_info = self._vision_encoder_info
+        vision_encoder_info = self._get_vision_encoder_info()
 
         base_feature_size = self._apply_feature_select_strategy(
             hf_config.vision_feature_select_strategy,
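Note: switching from the attribute self._vision_encoder_info to the accessor self._get_vision_encoder_info() means the value is obtained through a method that mixins and subclasses can override, instead of relying on state set up in __init__. A small sketch of the idea, with hypothetical names:

class VisionEncoderInfo:  # hypothetical stand-in
    def __init__(self, patch_size: int) -> None:
        self.patch_size = patch_size


class BaseProcessing:
    # An overridable accessor instead of an attribute assigned in __init__.
    def _get_vision_encoder_info(self) -> VisionEncoderInfo:
        return VisionEncoderInfo(patch_size=14)


class SiglipProcessing(BaseProcessing):
    def _get_vision_encoder_info(self) -> VisionEncoderInfo:
        return VisionEncoderInfo(patch_size=16)


print(SiglipProcessing()._get_vision_encoder_info().patch_size)  # 16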
@@ -140,7 +124,7 @@ class LlavaNextMultiModalProcessor(LlavaMultiModalProcessor):
         current_height = npatches * num_patch_height
         current_width = npatches * num_patch_width
 
-        # NOTE: HF resizes based on float32
+        # NOTE: Use float32 to remain consistent with HF output
         original_aspect_ratio = np.array(original_width / original_height,
                                          dtype=np.float32)
         current_aspect_ratio = np.array(current_width / current_height,
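Note: the reworded comment captures why the dtype matters; HF's LLaVA-NeXT preprocessing computes these ratios in float32, so vLLM does the same to keep token counts consistent when the comparison is close. A simplified, runnable illustration of the comparison (not the exact vLLM code):

import numpy as np

original_width, original_height = 800, 600
current_width, current_height = 1344, 672

# Compute both ratios in float32, mirroring HF preprocessing.
original_aspect_ratio = np.array(original_width / original_height,
                                 dtype=np.float32)
current_aspect_ratio = np.array(current_width / current_height,
                                dtype=np.float32)

if original_aspect_ratio > current_aspect_ratio:
    # Image is wider than the patch grid: height was padded during resize.
    print("unpad along height")
else:
    # Image is taller (or equal): width was padded during resize.
    print("unpad along width")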
@@ -164,11 +148,10 @@ class LlavaNextMultiModalProcessor(LlavaMultiModalProcessor):
 
         return (unpadded_features, newline_features)
 
-    def _get_pinpoint_with_most_features(self) -> tuple[int, ImageSize]:
-        """
-        Get the grid pinpoint with the most features and
-        the corresponding feature size.
-        """
+
+class LlavaNextProfilingInfo(LlavaNextProcessingMixin, BaseLlavaProfilingInfo):
+
+    def _get_image_size_with_most_features(self) -> ImageSize:
         hf_config = self._get_hf_config()
 
         largest_feature_size, largest_feature_pinpoint = 0, None
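Note: LlavaNextProfilingInfo inherits from both the processing mixin and the profiling base, so config-access helpers such as _get_hf_config are shared between the processor and the profiler rather than duplicated. A minimal sketch of that mixin layout, with hypothetical names:

class ProcessingMixin:  # hypothetical: shared config access
    def _get_hf_config(self) -> dict:
        return {"image_grid_pinpoints": [[336, 672], [672, 672]]}


class ProfilingBase:  # hypothetical: profiling interface
    def _get_image_size_with_most_features(self) -> list[int]:
        raise NotImplementedError


class MyProfilingInfo(ProcessingMixin, ProfilingBase):
    def _get_image_size_with_most_features(self) -> list[int]:
        # The mixin supplies _get_hf_config, so the profiler can pick
        # the grid pinpoint that yields the most image features.
        pinpoints = self._get_hf_config()["image_grid_pinpoints"]
        return max(pinpoints, key=lambda hw: hw[0] * hw[1])


print(MyProfilingInfo()._get_image_size_with_most_features())  # [672, 672]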
@@ -183,7 +166,25 @@ class LlavaNextMultiModalProcessor(LlavaMultiModalProcessor):
         if largest_feature_size == 0 or largest_feature_pinpoint is None:
             raise ValueError("Cannot have a largest feature size of 0!")
 
-        return largest_feature_size, largest_feature_pinpoint
+        return largest_feature_pinpoint
+
+
+class LlavaNextMultiModalProcessor(LlavaNextProcessingMixin,
+                                   BaseLlavaMultiModalProcessor):
+
+    def _get_profiling_info(self) -> BaseProfilingInfo:
+        return LlavaNextProfilingInfo(self.ctx)
+
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        return dict(
+            pixel_values=MultiModalFieldConfig.batched("image"),
+            image_sizes=MultiModalFieldConfig.batched("image"),
+            image_embeds=MultiModalFieldConfig.batched("image"),
+        )
 
 
 @MULTIMODAL_REGISTRY.register_processor(LlavaNextMultiModalProcessor)
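Note: the net effect of the commit is visible in this final hunk; the processor no longer implements profiling hooks itself but hands out a dedicated profiling object via _get_profiling_info. A compact sketch of that delegation, with hypothetical names (vLLM's actual base classes carry more machinery):

class ProfilingInfo:  # hypothetical minimal base
    def get_max_image_tokens(self) -> int:
        raise NotImplementedError


class MyProfilingInfo(ProfilingInfo):
    def get_max_image_tokens(self) -> int:
        return 2928  # e.g. derived from the largest grid pinpoint


class MyProcessor:
    # Profiling-related logic lives in its own object; the processor
    # only knows how to construct it.
    def _get_profiling_info(self) -> ProfilingInfo:
        return MyProfilingInfo()


print(MyProcessor()._get_profiling_info().get_max_image_tokens())  # 2928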