diff --git a/vllm/model_executor/models/aimv2.py b/vllm/model_executor/models/aimv2.py index d6716c8e5..63cb9c96e 100644 --- a/vllm/model_executor/models/aimv2.py +++ b/vllm/model_executor/models/aimv2.py @@ -130,7 +130,7 @@ class AIMv2Attention(nn.Module): self.num_heads_per_partition, self.head_dim, self.scale, - prefix=prefix, + prefix=f"{prefix}.attn", ) def forward(self, x: torch.Tensor) -> torch.Tensor: diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py index ad8f7c1af..73b0b8af9 100644 --- a/vllm/model_executor/models/blip.py +++ b/vllm/model_executor/models/blip.py @@ -126,7 +126,7 @@ class BlipAttention(nn.Module): self.num_heads_per_partition, self.head_dim, self.scale, - prefix=prefix, + prefix=f"{prefix}.attn", ) def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index b2886e85a..5333042cb 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -296,7 +296,7 @@ class Glm4vVisionAttention(nn.Module): num_heads=self.num_attention_heads_per_partition, head_size=self.hidden_size_per_attention_head, scale=self.hidden_size_per_attention_head**-0.5, - prefix=prefix, + prefix=f"{prefix}.attn", ) self.apply_rotary_emb = ApplyRotaryEmb(enforce_enable=True) diff --git a/vllm/model_executor/models/glm4v.py b/vllm/model_executor/models/glm4v.py index 8bad386fa..56504029d 100644 --- a/vllm/model_executor/models/glm4v.py +++ b/vllm/model_executor/models/glm4v.py @@ -139,7 +139,7 @@ class EVA2CLIPAttention(nn.Module): self.num_heads_per_rank, self.head_dim, self.scale, - prefix=prefix, + prefix=f"{prefix}.attn", ) self.output_dropout = torch.nn.Dropout(config.dropout_prob) diff --git a/vllm/model_executor/models/glm_ocr.py b/vllm/model_executor/models/glm_ocr.py index 90c6baacb..d03743140 100644 --- a/vllm/model_executor/models/glm_ocr.py +++ b/vllm/model_executor/models/glm_ocr.py @@ -137,6 +137,7 @@ class GlmOcrVisionAttention(nn.Module): num_heads=self.num_attention_heads_per_partition, head_size=self.hidden_size_per_attention_head, scale=self.hidden_size_per_attention_head**-0.5, + prefix=f"{prefix}.attn", ) self.apply_rotary_emb = ApplyRotaryEmb(enforce_enable=True) diff --git a/vllm/model_executor/models/idefics2_vision_model.py b/vllm/model_executor/models/idefics2_vision_model.py index d6f93a9d4..b90afbe5a 100644 --- a/vllm/model_executor/models/idefics2_vision_model.py +++ b/vllm/model_executor/models/idefics2_vision_model.py @@ -166,7 +166,7 @@ class Idefics2VisionAttention(nn.Module): self.num_heads_per_partition, self.head_dim, self.scale, - prefix=prefix, + prefix=f"{prefix}.attn", ) def forward( diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py index 8cacfe06e..816147364 100644 --- a/vllm/model_executor/models/intern_vit.py +++ b/vllm/model_executor/models/intern_vit.py @@ -215,7 +215,7 @@ class InternParallelAttention(nn.Module): self.num_heads_per_partition, self.head_dim, self.scale, - prefix=prefix, + prefix=f"{prefix}.attn", ) def _apply_qk_norm(self, q: torch.Tensor, k: torch.Tensor): diff --git a/vllm/model_executor/models/interns1_vit.py b/vllm/model_executor/models/interns1_vit.py index 421e0ffd4..533f0681c 100644 --- a/vllm/model_executor/models/interns1_vit.py +++ b/vllm/model_executor/models/interns1_vit.py @@ -220,7 +220,7 @@ class InternSdpaAttention(nn.Module): self.num_heads, self.head_dim, self.scale, - prefix=prefix, + prefix=f"{prefix}.attn", ) def forward(self, x: torch.Tensor) -> torch.Tensor: diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index 52fdeddf4..58f63597a 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -257,7 +257,7 @@ class Llama4VisionAttention(nn.Module): self.num_local_heads, self.head_dim, self.scaling, - prefix=prefix, + prefix=f"{prefix}.attn", ) if use_data_parallel: diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index b1330d92d..1ee177656 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -235,7 +235,7 @@ class MultiHeadDotProductAttention(nn.Module): self.head_dim, self.scale, num_kv_heads=self.num_kv_heads, - prefix=prefix, + prefix=f"{prefix}.attn", ) def forward( diff --git a/vllm/model_executor/models/molmo2.py b/vllm/model_executor/models/molmo2.py index f9664f32e..9d996a93b 100644 --- a/vllm/model_executor/models/molmo2.py +++ b/vllm/model_executor/models/molmo2.py @@ -611,7 +611,7 @@ class ImagePoolingAttention(nn.Module): self.head_dim, self.scale, num_kv_heads=self.num_kv_heads, - prefix=prefix, + prefix=f"{prefix}.attn", ) def forward_sdpa( diff --git a/vllm/model_executor/models/openpangu_vl.py b/vllm/model_executor/models/openpangu_vl.py index 239ef81d3..d7df2cbb4 100644 --- a/vllm/model_executor/models/openpangu_vl.py +++ b/vllm/model_executor/models/openpangu_vl.py @@ -125,6 +125,7 @@ class OpenPanguVisionAttention(nn.Module): num_heads=self.num_attention_heads_per_partition, head_size=self.hidden_size_per_attention_head, scale=self.hidden_size_per_attention_head**-0.5, + prefix=f"{prefix}.attn", ) self.apply_rotary_emb = ApplyRotaryEmb(enforce_enable=True) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index c06beb97f..c2c52fa66 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -345,7 +345,7 @@ class Qwen2_5_VisionAttention(nn.Module): num_heads=self.num_attention_heads_per_partition, head_size=self.hidden_size_per_attention_head, scale=self.hidden_size_per_attention_head**-0.5, - prefix=prefix, + prefix=f"{prefix}.attn", ) self.apply_rotary_emb = ApplyRotaryEmb(enforce_enable=True) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 6169e72df..d911fb1dd 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -319,7 +319,7 @@ class Qwen2VisionAttention(nn.Module): num_heads=self.num_attention_heads_per_partition, head_size=self.hidden_size_per_attention_head, scale=self.hidden_size_per_attention_head**-0.5, - prefix=prefix, + prefix=f"{prefix}.attn", ) self.apply_rotary_emb = ApplyRotaryEmb(enforce_enable=True) diff --git a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py index 96294158c..9500ce2e2 100755 --- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py +++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py @@ -194,7 +194,7 @@ class Qwen3OmniMoeAudioAttention(nn.Module): num_heads=self.num_local_heads, head_size=self.head_dim, scale=self.scaling, - prefix=prefix, + prefix=f"{prefix}.attn", ) def forward( diff --git a/vllm/model_executor/models/step3_vl.py b/vllm/model_executor/models/step3_vl.py index fe2bb1ac6..f3993348b 100644 --- a/vllm/model_executor/models/step3_vl.py +++ b/vllm/model_executor/models/step3_vl.py @@ -763,7 +763,7 @@ class Step3VisionAttention(nn.Module): self.num_heads, self.head_dim, self.scale, - prefix=prefix, + prefix=f"{prefix}.attn", ) def forward( diff --git a/vllm/model_executor/models/step_vl.py b/vllm/model_executor/models/step_vl.py index 31b266a7e..4669771f4 100644 --- a/vllm/model_executor/models/step_vl.py +++ b/vllm/model_executor/models/step_vl.py @@ -224,7 +224,7 @@ class PerceptionEncoderVisionAttention(nn.Module): self.num_heads, self.head_dim, self.scale, - prefix=prefix, + prefix=f"{prefix}.attn", ) self.rope = PerceptionEncoderRope2D( dim=self.head_dim,