[V1] Override mm_counts for dummy data creation (#15703)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung
2025-03-30 18:20:42 +08:00
committed by GitHub
parent 7fd8c0f85c
commit 803d5c35f3
9 changed files with 114 additions and 93 deletions

View File

@@ -71,7 +71,8 @@ class LlavaNextVideoProcessingInfo(BaseProcessingInfo):
max_video_tokens = self.get_num_video_tokens(
image_width=target_width,
image_height=target_height,
num_frames=self.get_num_frames_with_most_features(seq_len),
num_frames=self.get_num_frames_with_most_features(
seq_len, mm_counts),
)
return {"video": max_video_tokens}
@@ -130,9 +131,12 @@ class LlavaNextVideoProcessingInfo(BaseProcessingInfo):
return num_frames
def get_num_frames_with_most_features(self, seq_len: int) -> int:
mm_config = self.ctx.get_mm_config()
max_videos = mm_config.get_limit_per_prompt("video")
def get_num_frames_with_most_features(
self,
seq_len: int,
mm_counts: Mapping[str, int],
) -> int:
max_videos = mm_counts.get("video", 0)
max_total_frames = self._get_max_video_frames(seq_len)
@@ -155,7 +159,7 @@ class LlavaNextVideoDummyInputsBuilder(
target_width, target_height = \
self.info.get_image_size_with_most_features()
target_num_frames = \
self.info.get_num_frames_with_most_features(seq_len)
self.info.get_num_frames_with_most_features(seq_len, mm_counts)
mm_data = {
"video":

View File

@@ -108,7 +108,7 @@ class LlavaOnevisionProcessingInfo(LlavaNextProcessingInfo):
) -> Mapping[str, int]:
return {
"image": self.get_max_image_tokens(),
"video": self.get_max_video_tokens(seq_len),
"video": self.get_max_video_tokens(seq_len, mm_counts),
}
# Based on: https://github.com/huggingface/text-generation-inference/blob/v3.0.1/server/text_generation_server/models/vlm_causal_lm.py#L86
@@ -202,10 +202,13 @@ class LlavaOnevisionProcessingInfo(LlavaNextProcessingInfo):
return num_frames
def get_num_frames_with_most_features(self, seq_len: int) -> int:
mm_config = self.ctx.get_mm_config()
max_images = mm_config.get_limit_per_prompt("image")
max_videos = mm_config.get_limit_per_prompt("video")
def get_num_frames_with_most_features(
self,
seq_len: int,
mm_counts: Mapping[str, int],
) -> int:
max_images = mm_counts.get("image", 0)
max_videos = mm_counts.get("video", 0)
max_image_tokens = self.get_max_image_tokens() * max_images
max_total_frames = self._get_max_video_frames(seq_len -
@@ -215,13 +218,18 @@ class LlavaOnevisionProcessingInfo(LlavaNextProcessingInfo):
return max(max_frames_per_video, 1)
def get_max_video_tokens(self, seq_len: int) -> int:
def get_max_video_tokens(
self,
seq_len: int,
mm_counts: Mapping[str, int],
) -> int:
target_width, target_height = self.get_image_size_with_most_features()
return self.get_num_video_tokens(
image_width=target_width,
image_height=target_height,
num_frames=self.get_num_frames_with_most_features(seq_len),
num_frames=self.get_num_frames_with_most_features(
seq_len, mm_counts),
)
@@ -243,7 +251,8 @@ class LlavaOnevisionDummyInputsBuilder(
target_width, target_height = \
self.info.get_image_size_with_most_features()
target_num_frames = \
self.info.get_num_frames_with_most_features(seq_len)
self.info.get_num_frames_with_most_features(seq_len,
mm_counts)
mm_data = {
"image":

View File

@@ -43,7 +43,8 @@ from vllm.multimodal.parse import (AudioItem, AudioProcessorItems,
from vllm.multimodal.processing import PromptReplacement, PromptUpdate
from vllm.multimodal.profiling import ProcessorInputs
from .minicpmv import (MiniCPMV2_6, MiniCPMVDummyInputsBuilder,
from .minicpmv import (_MAX_FRAMES_PER_VIDEO, MiniCPMV2_6,
MiniCPMVDummyInputsBuilder,
MiniCPMVMultiModalDataParser,
MiniCPMVMultiModalProcessor, MiniCPMVProcessingInfo,
_minicpmv_field_config)
@@ -203,8 +204,8 @@ class MiniCPMOProcessingInfo(MiniCPMVProcessingInfo):
return 30
def get_max_audio_tokens(self) -> int:
return self.get_max_audio_tokens_per_chunk(
) * self.get_max_audio_chunks_with_most_features()
num_chunks = self.get_max_audio_chunks_with_most_features()
return self.get_max_audio_tokens_per_chunk() * num_chunks
def get_audio_len_by_num_chunks(self, num_chunks: int) -> int:
sampling_rate = self.get_default_audio_sampling_rate()
@@ -212,21 +213,24 @@ class MiniCPMOProcessingInfo(MiniCPMVProcessingInfo):
num_tokens_per_chunk = self.get_max_audio_tokens_per_chunk() - 2
return int(num_chunks * sampling_rate / num_tokens_per_chunk) + 1
def get_num_frames_with_most_features(self, seq_len: int) -> int:
mm_config = self.ctx.get_mm_config()
max_images = mm_config.get_limit_per_prompt("image")
max_videos = mm_config.get_limit_per_prompt("video")
max_audios = mm_config.get_limit_per_prompt("audio")
def get_num_frames_with_most_features(
self,
seq_len: int,
mm_counts: Mapping[str, int],
) -> int:
max_images = mm_counts.get("image", 0)
max_videos = mm_counts.get("video", 0)
max_audios = mm_counts.get("audio", 0)
max_image_tokens = self.get_max_image_tokens() * max_images
max_audio_tokens = self.get_max_audio_tokens() * max_audios
max_total_frames = self.get_max_video_frames(seq_len -
max_image_tokens -
max_audio_tokens)
max_frames_per_video = min(max_total_frames // max(max_videos, 1),
_MAX_FRAMES_PER_VIDEO)
num_frames = max(max_total_frames // max(max_videos, 1), 1)
return num_frames
return max(max_frames_per_video, 1)
class MiniCPMODummyInputsBuilder(

View File

@@ -69,6 +69,9 @@ from .utils import (AutoWeightsLoader, flatten_bn, maybe_prefix,
merge_multimodal_embeddings)
from .vision import scatter_patch_features, select_patch_features
# For profile run
_MAX_FRAMES_PER_VIDEO = 16
class MiniCPMVImagePixelInputs(TypedDict):
type: Literal["pixel_values"]
@@ -369,7 +372,8 @@ class MiniCPMVProcessingInfo(BaseProcessingInfo):
) -> Mapping[str, int]:
mm_max_tokens = {"image": self.get_max_image_tokens()}
if self.get_model_version() == (2, 6):
mm_max_tokens["video"] = self.get_max_video_tokens(seq_len)
mm_max_tokens["video"] = self.get_max_video_tokens(
seq_len, mm_counts)
return mm_max_tokens
@@ -432,9 +436,14 @@ class MiniCPMVProcessingInfo(BaseProcessingInfo):
use_image_id=False,
)
def get_max_video_tokens(self, seq_len: int) -> int:
return self.get_max_video_frame_tokens(
) * self.get_num_frames_with_most_features(seq_len)
def get_max_video_tokens(
self,
seq_len: int,
mm_counts: Mapping[str, int],
) -> int:
num_frames = self.get_num_frames_with_most_features(seq_len, mm_counts)
num_video_tokens_total = self.get_max_video_frame_tokens() * num_frames
return num_video_tokens_total
def get_video_max_slice_num(self) -> int:
return 1
@@ -449,18 +458,21 @@ class MiniCPMVProcessingInfo(BaseProcessingInfo):
num_frames = max_tokens // num_frame_tokens
return num_frames
def get_num_frames_with_most_features(self, seq_len: int) -> int:
mm_config = self.ctx.get_mm_config()
max_images = mm_config.get_limit_per_prompt("image")
max_videos = mm_config.get_limit_per_prompt("video")
def get_num_frames_with_most_features(
self,
seq_len: int,
mm_counts: Mapping[str, int],
) -> int:
max_images = mm_counts.get("image", 0)
max_videos = mm_counts.get("video", 0)
max_image_tokens = self.get_max_image_tokens() * max_images
max_total_frames = self.get_max_video_frames(seq_len -
max_image_tokens)
max_frames_per_video = min(max_total_frames // max(max_videos, 1),
_MAX_FRAMES_PER_VIDEO)
num_frames = max(max_total_frames // max(max_videos, 1), 1)
return num_frames
return max(max_frames_per_video, 1)
_I = TypeVar("_I",
@@ -483,7 +495,7 @@ class MiniCPMVDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
video_width, video_height = \
self.info.get_video_frame_size_with_most_features()
num_video_frames = \
self.info.get_num_frames_with_most_features(seq_len)
self.info.get_num_frames_with_most_features(seq_len, mm_counts)
mm_data = {
"image":

View File

@@ -806,7 +806,7 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo):
max_pixels: Optional[int] = None,
size: Optional[dict[str, int]] = None,
**kwargs: object,
):
) -> Qwen2VLImageProcessor:
return cached_image_processor_from_config(
self.ctx.model_config,
**self._get_image_processor_kwargs(min_pixels=min_pixels,
@@ -825,7 +825,7 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo):
) -> Mapping[str, int]:
return {
"image": self.get_max_image_tokens(),
"video": self.get_max_video_tokens(seq_len),
"video": self.get_max_video_tokens(seq_len, mm_counts),
}
def _get_vision_info(
@@ -941,10 +941,13 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo):
return num_frames
def get_num_frames_with_most_features(self, seq_len: int) -> int:
mm_config = self.ctx.get_mm_config()
max_images = mm_config.get_limit_per_prompt("image")
max_videos = mm_config.get_limit_per_prompt("video")
def get_num_frames_with_most_features(
self,
seq_len: int,
mm_counts: Mapping[str, int],
) -> int:
max_images = mm_counts.get("image", 0)
max_videos = mm_counts.get("video", 0)
max_image_tokens = self.get_max_image_tokens() * max_images
max_total_frames = self._get_max_video_frames(seq_len -
@@ -954,13 +957,18 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo):
return max(max_frames_per_video, 1)
def get_max_video_tokens(self, seq_len: int) -> int:
def get_max_video_tokens(
self,
seq_len: int,
mm_counts: Mapping[str, int],
) -> int:
target_width, target_height = self.get_image_size_with_most_features()
return self.get_num_video_tokens(
image_width=target_width,
image_height=target_height,
num_frames=self.get_num_frames_with_most_features(seq_len),
num_frames=self.get_num_frames_with_most_features(
seq_len, mm_counts),
image_processor=None,
)
@@ -982,7 +990,7 @@ class Qwen2VLDummyInputsBuilder(BaseDummyInputsBuilder[Qwen2VLProcessingInfo]):
target_width, target_height = \
self.info.get_image_size_with_most_features()
target_num_frames = \
self.info.get_num_frames_with_most_features(seq_len)
self.info.get_num_frames_with_most_features(seq_len, mm_counts)
mm_data = {
"image":