[Models]: Make Multimodal config implicit in ViT implementation (#31972)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
This commit is contained in:
@@ -4,7 +4,6 @@
 
 import torch
 
-from vllm.config import MultiModalConfig
 from vllm.logger import init_logger
 from vllm.model_executor.custom_op import CustomOp
 from vllm.model_executor.models.vision import get_vit_attn_backend
@@ -32,7 +31,6 @@ class MMEncoderAttention(CustomOp):
         scale: float | None = None,
         num_kv_heads: int | None = None,
         prefix: str = "",
-        multimodal_config: MultiModalConfig | None = None,
     ) -> None:
         """
         Args:
@@ -42,7 +40,6 @@ class MMEncoderAttention(CustomOp):
             num_kv_heads: number of kv heads.
             prefix: This has no effect, it is only here to make it easier to
                 swap between Attention and MultiHeadAttention
-            multimodal_config: configs for multi-modal.
         """
         super().__init__()
 
@@ -62,16 +59,10 @@ class MMEncoderAttention(CustomOp):
         # weight and activation dtype.
         dtype = torch.get_default_dtype()
 
-        # Try to get vision attention backend from multimodal_config.
-        attn_backend_override = None
-        if multimodal_config is not None:
-            attn_backend_override = multimodal_config.mm_encoder_attn_backend
-
         # Get device-specific vision attention backend.
         self.attn_backend = get_vit_attn_backend(
             head_size=head_size,
             dtype=dtype,
-            attn_backend_override=attn_backend_override,
         )
 
         self.is_flash_attn_backend = self.attn_backend in {
Reference in New Issue
Block a user