[Attention] Refactor CUDA attention backend selection logic (#24794)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
Signed-off-by: Matthew Bonanni <mbonanni001@gmail.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
This commit is contained in:
@@ -45,7 +45,7 @@ if TYPE_CHECKING:
|
||||
|
||||
import vllm.model_executor.layers.quantization as me_quant
|
||||
import vllm.model_executor.models as me_models
|
||||
- from vllm.attention.backends.registry import _Backend
|
||||
+ from vllm.attention.backends.registry import AttentionBackendEnum
|
||||
from vllm.config.load import LoadConfig
|
||||
from vllm.config.parallel import ParallelConfig
|
||||
from vllm.model_executor.layers.quantization import QuantizationMethods
|
||||
@@ -53,7 +53,7 @@ if TYPE_CHECKING:
|
||||
else:
|
||||
PretrainedConfig = Any
|
||||
|
||||
- _Backend = Any
|
||||
+ AttentionBackendEnum = Any
|
||||
me_quant = LazyLoader(
|
||||
"model_executor", globals(), "vllm.model_executor.layers.quantization"
|
||||
)
|
||||
@@ -302,7 +302,7 @@ class ModelConfig:
|
||||
mm_processor_cache_type: InitVar[MMCacheType | None] = None
|
||||
mm_shm_cache_max_object_size_mb: InitVar[int | None] = None
|
||||
mm_encoder_tp_mode: InitVar[MMEncoderTPMode | None] = None
|
||||
- mm_encoder_attn_backend: InitVar[_Backend | str | None] = None
|
||||
+ mm_encoder_attn_backend: InitVar[AttentionBackendEnum | str | None] = None
|
||||
interleave_mm_strings: InitVar[bool | None] = None
|
||||
skip_mm_profiling: InitVar[bool | None] = None
|
||||
video_pruning_rate: InitVar[float | None] = None
|
||||
@@ -420,7 +420,7 @@ class ModelConfig:
|
||||
mm_processor_cache_type: MMCacheType | None,
|
||||
mm_shm_cache_max_object_size_mb: int | None,
|
||||
mm_encoder_tp_mode: MMEncoderTPMode | None,
|
||||
- mm_encoder_attn_backend: _Backend | str | None,
|
||||
+ mm_encoder_attn_backend: AttentionBackendEnum | str | None,
|
||||
interleave_mm_strings: bool | None,
|
||||
skip_mm_profiling: bool | None,
|
||||
video_pruning_rate: float | None,
|
||||
|
||||
Reference in New Issue
Block a user