[Bugfix] Fix EVS implementation for Qwen3 VL (#33607)
Signed-off-by: 2ez4bz <133824995+2ez4bz@users.noreply.github.com>
This commit is contained in:
@@ -195,6 +195,8 @@ class Qwen2_5_VLVideoPixelInputs(TensorSchema):
|
||||
- second_per_grid_ts: The video time interval (in seconds) for each
|
||||
grid along the temporal dimension in the 3D position IDs. Returned
|
||||
when `videos` is not `None`.
|
||||
- timestamps: One list per video of timestamp values (in seconds), one
|
||||
per frame after merging. The length of each inner list equals the temporal dimension after merging. Defaults to `None`.
|
||||
"""
|
||||
|
||||
type: Literal["pixel_values_videos"]
|
||||
@@ -214,6 +216,8 @@ class Qwen2_5_VLVideoPixelInputs(TensorSchema):
|
||||
TensorShape("nv"),
|
||||
]
|
||||
|
||||
timestamps: list[list[float]] | None = None
|
||||
|
||||
|
||||
class Qwen2_5_VLVideoEmbeddingInputs(TensorSchema):
|
||||
"""
|
||||
@@ -232,6 +236,8 @@ class Qwen2_5_VLVideoEmbeddingInputs(TensorSchema):
|
||||
- second_per_grid_ts: The video time interval (in seconds) for each
|
||||
grid along the temporal dimension in the 3D position IDs. Returned
|
||||
when `videos` is not `None`.
|
||||
- timestamps: One list per video of timestamp values (in seconds), one
|
||||
per frame after merging. The length of each inner list equals the temporal dimension after merging. Defaults to `None`.
|
||||
"""
|
||||
|
||||
type: Literal["video_embeds"]
|
||||
@@ -250,6 +256,7 @@ class Qwen2_5_VLVideoEmbeddingInputs(TensorSchema):
|
||||
torch.Tensor | None,
|
||||
TensorShape("nv"),
|
||||
] = None
|
||||
timestamps: list[list[float]] | None = None
|
||||
|
||||
|
||||
Qwen2_5_VLVideoInputs: TypeAlias = (
|
||||
|
||||
@@ -755,6 +755,7 @@ def _create_qwen2vl_field_factory(
|
||||
"video", video_embed_grid_sizes
|
||||
),
|
||||
video_grid_thw=MultiModalFieldConfig.batched("video", keep_on_cpu=True),
|
||||
timestamps=MultiModalFieldConfig.batched("video", keep_on_cpu=True),
|
||||
)
|
||||
|
||||
return _qwen2vl_field_config
|
||||
|
||||
@@ -628,6 +628,9 @@ class Qwen3_5MoeForCausalLM(Qwen3_5ForCausalLMBase, QwenNextMixtureOfExperts):
|
||||
dummy_inputs=Qwen3VLDummyInputsBuilder,
|
||||
)
|
||||
class Qwen3_5ForConditionalGeneration(Qwen3VLForConditionalGeneration, IsHybrid):
|
||||
# Qwen3.5 does not support multimodal pruning (EVS).
|
||||
supports_multimodal_pruning = False
|
||||
|
||||
packed_modules_mapping = Qwen3VLForConditionalGeneration.packed_modules_mapping | {
|
||||
"in_proj_qkvz": ["in_proj_qkv", "in_proj_z"],
|
||||
"in_proj_ba": ["in_proj_b", "in_proj_a"],
|
||||
@@ -643,10 +646,8 @@ class Qwen3_5ForConditionalGeneration(Qwen3VLForConditionalGeneration, IsHybrid)
|
||||
self.config = config
|
||||
self.multimodal_config = multimodal_config
|
||||
self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data"
|
||||
self.video_pruning_rate = multimodal_config.video_pruning_rate
|
||||
self.is_multimodal_pruning_enabled = (
|
||||
multimodal_config.is_multimodal_pruning_enabled()
|
||||
)
|
||||
# Qwen3.5 does not support multimodal pruning (EVS).
|
||||
self.is_multimodal_pruning_enabled = False
|
||||
|
||||
with self._mark_tower_model(vllm_config, {"image", "video"}):
|
||||
self.visual = Qwen3_VisionTransformer(
|
||||
@@ -693,6 +694,12 @@ class Qwen3_5ForConditionalGeneration(Qwen3VLForConditionalGeneration, IsHybrid)
|
||||
|
||||
return inputs_embeds
|
||||
|
||||
# Fail-fast stub: accepts any positional/keyword arguments, ignores them,
# and unconditionally raises NotImplementedError.
# The enclosing class declares `supports_multimodal_pruning = False` and
# forces `self.is_multimodal_pruning_enabled = False` in `__init__`, so no
# EVS (multimodal pruning) code path is expected to reach this method.
# NOTE(review): presumably this shadows an implementation inherited from
# Qwen3VLForConditionalGeneration — confirm against the parent class.
def recompute_mrope_positions(self, *args, **kwargs):
|
||||
raise NotImplementedError(
|
||||
"Qwen3.5 does not support multimodal pruning (EVS). "
|
||||
"recompute_mrope_positions should never be called."
|
||||
)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
input_ids: torch.Tensor,
|
||||
@@ -851,10 +858,8 @@ class Qwen3_5MoeForConditionalGeneration(
|
||||
self.config = config
|
||||
self.multimodal_config = multimodal_config
|
||||
self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data"
|
||||
self.video_pruning_rate = multimodal_config.video_pruning_rate
|
||||
self.is_multimodal_pruning_enabled = (
|
||||
multimodal_config.is_multimodal_pruning_enabled()
|
||||
)
|
||||
# Qwen3.5 does not support multimodal pruning (EVS).
|
||||
self.is_multimodal_pruning_enabled = False
|
||||
|
||||
with self._mark_tower_model(vllm_config, {"image", "video"}):
|
||||
self.visual = Qwen3_VisionTransformer(
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -45,6 +45,7 @@ from vllm.model_executor.model_loader.weight_utils import (
|
||||
)
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.sequence import IntermediateTensors
|
||||
from vllm.tokenizers.registry import cached_tokenizer_from_config
|
||||
|
||||
from .interfaces import MixtureOfExperts
|
||||
from .qwen3_moe import (
|
||||
@@ -415,6 +416,7 @@ class Qwen3VLMoeForConditionalGeneration(
|
||||
multimodal_config = vllm_config.model_config.multimodal_config
|
||||
|
||||
self.config = config
|
||||
self._tokenizer = cached_tokenizer_from_config(vllm_config.model_config)
|
||||
self.multimodal_config = multimodal_config
|
||||
self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data"
|
||||
self.video_pruning_rate = multimodal_config.video_pruning_rate
|
||||
|
||||
Reference in New Issue
Block a user