diff --git a/vllm/model_executor/models/ernie45_vl.py b/vllm/model_executor/models/ernie45_vl.py
index d880fc434..97aace5a2 100644
--- a/vllm/model_executor/models/ernie45_vl.py
+++ b/vllm/model_executor/models/ernie45_vl.py
@@ -66,8 +66,6 @@ from .vision import get_vit_attn_backend
 
 logger = init_logger(__name__)
 
-_MAX_FRAMES_PER_VIDEO = 16
-
 # === Vision Transformer === #
 
 
@@ -839,6 +837,15 @@ class Ernie4_5_VLProcessingInfo(BaseProcessingInfo):
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": None, "video": None}
 
+    def get_mm_max_tokens_per_item(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> Mapping[str, int]:
+        max_image_tokens = self.get_max_image_tokens()
+        max_video_tokens = self.get_max_video_tokens(seq_len, mm_counts)
+        return {"image": max_image_tokens, "video": max_video_tokens}
+
     def _get_vision_info(
         self,
         *,
@@ -964,8 +971,7 @@ class Ernie4_5_VLProcessingInfo(BaseProcessingInfo):
         max_image_tokens = self.get_max_image_tokens() * max_images
         max_total_frames = self._get_max_video_frames(seq_len -
                                                       max_image_tokens)
-        max_frames_per_video = min(max_total_frames // max(max_videos, 1),
-                                   _MAX_FRAMES_PER_VIDEO)
+        max_frames_per_video = max_total_frames // max(max_videos, 1)
 
         return max(max_frames_per_video, 2)
 
diff --git a/vllm/model_executor/models/ernie45_vl_moe.py b/vllm/model_executor/models/ernie45_vl_moe.py
index 780974c3b..6034505fa 100644
--- a/vllm/model_executor/models/ernie45_vl_moe.py
+++ b/vllm/model_executor/models/ernie45_vl_moe.py
@@ -287,8 +287,13 @@ class Ernie4_5_VLMoeMoE(nn.Module):
         if self.has_shared_experts:
             shared_output = self.shared_experts(hidden_states)
 
-        if visual_token_mask is not None and visual_token_mask.any():
-            # assert visual_token_mask.shape[0] != hidden_states.shape[0]
+        if visual_token_mask is not None and visual_token_mask.all():
+            # only vision modal input
+            router_logits, _ = self.vision_experts_gate(hidden_states)
+            final_hidden_states = self.vision_experts(
+                hidden_states=hidden_states, router_logits=router_logits)
+        elif visual_token_mask is not None and visual_token_mask.any():
+            # text and vision modals input
             visual_token_mask = visual_token_mask.repeat(
                 1, self.hidden_size).bool()
             text_token_mask = ~visual_token_mask
@@ -310,7 +315,7 @@ class Ernie4_5_VLMoeMoE(nn.Module):
                 hidden_states=vision_hidden_states,
                 router_logits=vision_router_logits).flatten()
         else:
-            # text modal input processing directly
+            # only text modal input
             text_router_logits, _ = self.text_experts_gate(hidden_states)
             final_hidden_states = self.text_experts(