diff --git a/vllm/model_executor/models/aimv2.py b/vllm/model_executor/models/aimv2.py
index 5b8ead4c7..d6716c8e5 100644
--- a/vllm/model_executor/models/aimv2.py
+++ b/vllm/model_executor/models/aimv2.py
@@ -127,7 +127,10 @@ class AIMv2Attention(nn.Module):
         self.num_heads_per_partition = divide(self.num_heads, self.tp_size)

         self.attn = MMEncoderAttention(
-            self.num_heads_per_partition, self.head_dim, self.scale
+            self.num_heads_per_partition,
+            self.head_dim,
+            self.scale,
+            prefix=prefix,
         )

     def forward(self, x: torch.Tensor) -> torch.Tensor:
diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py
index ac9ae49f0..ad8f7c1af 100644
--- a/vllm/model_executor/models/blip.py
+++ b/vllm/model_executor/models/blip.py
@@ -123,7 +123,10 @@ class BlipAttention(nn.Module):
         self.num_heads_per_partition = divide(self.num_heads, self.tp_size)

         self.attn = MMEncoderAttention(
-            self.num_heads_per_partition, self.head_dim, self.scale
+            self.num_heads_per_partition,
+            self.head_dim,
+            self.scale,
+            prefix=prefix,
         )

     def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py
index 0321851d1..b2886e85a 100644
--- a/vllm/model_executor/models/glm4_1v.py
+++ b/vllm/model_executor/models/glm4_1v.py
@@ -296,6 +296,7 @@ class Glm4vVisionAttention(nn.Module):
             num_heads=self.num_attention_heads_per_partition,
             head_size=self.hidden_size_per_attention_head,
             scale=self.hidden_size_per_attention_head**-0.5,
+            prefix=prefix,
         )
         self.apply_rotary_emb = ApplyRotaryEmb(enforce_enable=True)

diff --git a/vllm/model_executor/models/glm4v.py b/vllm/model_executor/models/glm4v.py
index ae64913c5..8bad386fa 100644
--- a/vllm/model_executor/models/glm4v.py
+++ b/vllm/model_executor/models/glm4v.py
@@ -136,7 +136,10 @@ class EVA2CLIPAttention(nn.Module):
         )

         self.attn = MMEncoderAttention(
-            self.num_heads_per_rank, self.head_dim, self.scale
+            self.num_heads_per_rank,
+            self.head_dim,
+            self.scale,
+            prefix=prefix,
         )
         self.output_dropout = torch.nn.Dropout(config.dropout_prob)

diff --git a/vllm/model_executor/models/idefics2_vision_model.py b/vllm/model_executor/models/idefics2_vision_model.py
index d5fc6e315..d6f93a9d4 100644
--- a/vllm/model_executor/models/idefics2_vision_model.py
+++ b/vllm/model_executor/models/idefics2_vision_model.py
@@ -163,7 +163,10 @@ class Idefics2VisionAttention(nn.Module):
         )
         # Use unified MMEncoderAttention with Flash Attention support
         self.attn = MMEncoderAttention(
-            self.num_heads_per_partition, self.head_dim, self.scale
+            self.num_heads_per_partition,
+            self.head_dim,
+            self.scale,
+            prefix=prefix,
         )

     def forward(
diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py
index 41ca5c297..8cacfe06e 100644
--- a/vllm/model_executor/models/intern_vit.py
+++ b/vllm/model_executor/models/intern_vit.py
@@ -212,7 +212,10 @@ class InternParallelAttention(nn.Module):
         )

         self.attn = MMEncoderAttention(
-            self.num_heads_per_partition, self.head_dim, self.scale
+            self.num_heads_per_partition,
+            self.head_dim,
+            self.scale,
+            prefix=prefix,
         )

     def _apply_qk_norm(self, q: torch.Tensor, k: torch.Tensor):
diff --git a/vllm/model_executor/models/interns1_vit.py b/vllm/model_executor/models/interns1_vit.py
index 195bb9681..421e0ffd4 100644
--- a/vllm/model_executor/models/interns1_vit.py
+++ b/vllm/model_executor/models/interns1_vit.py
@@ -170,6 +170,7 @@ class InternSdpaAttention(nn.Module):
         config: PretrainedConfig,
         *,
         num_dummy_heads: int = 0,
+        prefix: str = "",
     ) -> None:
         super().__init__()

@@ -215,7 +216,12 @@ class InternSdpaAttention(nn.Module):
             self.projection_layer = nn.Linear(self.dummy_dim, self.embed_dim)

         # Use unified MMEncoderAttention with automatic backend selection
-        self.attn = MMEncoderAttention(self.num_heads, self.head_dim, self.scale)
+        self.attn = MMEncoderAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scale,
+            prefix=prefix,
+        )

     def forward(self, x: torch.Tensor) -> torch.Tensor:
         """x shape: (B, N, C)"""
@@ -313,7 +319,11 @@ class InternS1VisionLayer(nn.Module):
         num_dummy_heads: int,
         prefix: str = "",
     ):
-        return InternSdpaAttention(config, num_dummy_heads=num_dummy_heads)
+        return InternSdpaAttention(
+            config,
+            num_dummy_heads=num_dummy_heads,
+            prefix=prefix,
+        )

     def forward(
         self,
diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py
index 54b58299b..52fdeddf4 100644
--- a/vllm/model_executor/models/mllama4.py
+++ b/vllm/model_executor/models/mllama4.py
@@ -254,7 +254,10 @@ class Llama4VisionAttention(nn.Module):
         self.scaling = self.head_dim**-0.5

         self.attn = MMEncoderAttention(
-            self.num_local_heads, self.head_dim, self.scaling
+            self.num_local_heads,
+            self.head_dim,
+            self.scaling,
+            prefix=prefix,
         )

         if use_data_parallel:
diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py
index 7ea06fd85..b1330d92d 100644
--- a/vllm/model_executor/models/molmo.py
+++ b/vllm/model_executor/models/molmo.py
@@ -231,7 +231,11 @@ class MultiHeadDotProductAttention(nn.Module):
         self.scale = self.head_dim**-0.5

         self.attn = MMEncoderAttention(
-            self.num_heads, self.head_dim, self.scale, num_kv_heads=self.num_kv_heads
+            self.num_heads,
+            self.head_dim,
+            self.scale,
+            num_kv_heads=self.num_kv_heads,
+            prefix=prefix,
         )

     def forward(
diff --git a/vllm/model_executor/models/molmo2.py b/vllm/model_executor/models/molmo2.py
index cc718d6d5..f9664f32e 100644
--- a/vllm/model_executor/models/molmo2.py
+++ b/vllm/model_executor/models/molmo2.py
@@ -611,6 +611,7 @@ class ImagePoolingAttention(nn.Module):
             self.head_dim,
             self.scale,
             num_kv_heads=self.num_kv_heads,
+            prefix=prefix,
         )

     def forward_sdpa(
diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py
index 0310c5415..c06beb97f 100644
--- a/vllm/model_executor/models/qwen2_5_vl.py
+++ b/vllm/model_executor/models/qwen2_5_vl.py
@@ -345,6 +345,7 @@ class Qwen2_5_VisionAttention(nn.Module):
             num_heads=self.num_attention_heads_per_partition,
             head_size=self.hidden_size_per_attention_head,
             scale=self.hidden_size_per_attention_head**-0.5,
+            prefix=prefix,
         )
         self.apply_rotary_emb = ApplyRotaryEmb(enforce_enable=True)

diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index c7c26c206..6169e72df 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -319,6 +319,7 @@ class Qwen2VisionAttention(nn.Module):
             num_heads=self.num_attention_heads_per_partition,
             head_size=self.hidden_size_per_attention_head,
             scale=self.hidden_size_per_attention_head**-0.5,
+            prefix=prefix,
         )
         self.apply_rotary_emb = ApplyRotaryEmb(enforce_enable=True)

diff --git a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py
index 4d797528f..96294158c 100755
--- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py
+++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py
@@ -194,6 +194,7 @@ class Qwen3OmniMoeAudioAttention(nn.Module):
             num_heads=self.num_local_heads,
             head_size=self.head_dim,
             scale=self.scaling,
+            prefix=prefix,
         )

     def forward(
diff --git a/vllm/model_executor/models/step3_vl.py b/vllm/model_executor/models/step3_vl.py
index 1cbb54c84..fe2bb1ac6 100644
--- a/vllm/model_executor/models/step3_vl.py
+++ b/vllm/model_executor/models/step3_vl.py
@@ -759,7 +759,12 @@ class Step3VisionAttention(nn.Module):
         )

         # Use unified MMEncoderAttention with automatic backend selection
-        self.attn = MMEncoderAttention(self.num_heads, self.head_dim, self.scale)
+        self.attn = MMEncoderAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scale,
+            prefix=prefix,
+        )

     def forward(
         self,
diff --git a/vllm/model_executor/models/step_vl.py b/vllm/model_executor/models/step_vl.py
index de7db5daa..31b266a7e 100644
--- a/vllm/model_executor/models/step_vl.py
+++ b/vllm/model_executor/models/step_vl.py
@@ -220,7 +220,12 @@ class PerceptionEncoderVisionAttention(nn.Module):
             prefix=f"{prefix}.out_proj",
             disable_tp=use_data_parallel,
         )
-        self.attn = MMEncoderAttention(self.num_heads, self.head_dim, self.scale)
+        self.attn = MMEncoderAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scale,
+            prefix=prefix,
+        )
         self.rope = PerceptionEncoderRope2D(
             dim=self.head_dim,
             max_grid_height=max_grid_height,
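
Every hunk above makes the same change: the vision/audio encoder attention modules now forward their `prefix` string into the `MMEncoderAttention` constructor instead of dropping it. A minimal sketch of that prefix-threading pattern follows; `EncoderBlock` and `Attention` are hypothetical stand-ins rather than the vLLM classes, and the only assumption carried over from the diff is that the constructor accepts a `prefix` keyword argument.

```python
# Illustrative sketch only: `EncoderBlock` and `Attention` are hypothetical
# stand-ins, not the real vLLM MMEncoderAttention API.
import torch.nn as nn


class Attention(nn.Module):
    """Minimal stand-in that just records the prefix it was constructed with."""

    def __init__(self, num_heads: int, head_dim: int, scale: float, *, prefix: str = "") -> None:
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = head_dim
        self.scale = scale
        # A per-layer prefix such as "vision_model.blocks.0.attn" lets a backend
        # look up layer-specific settings (e.g. quantization) by weight name.
        self.prefix = prefix


class EncoderBlock(nn.Module):
    """Parent module that threads its own prefix down to the attention child."""

    def __init__(self, num_heads: int, head_dim: int, *, prefix: str = "") -> None:
        super().__init__()
        self.attn = Attention(
            num_heads,
            head_dim,
            head_dim**-0.5,
            prefix=prefix,  # passed straight through, mirroring the diff above
        )


block = EncoderBlock(num_heads=8, head_dim=64, prefix="vision_model.blocks.0.attn")
print(block.attn.prefix)  # -> vision_model.blocks.0.attn
```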