diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index c239cb5d0..b4d8c7b86 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -640,8 +640,9 @@ steps: # grade: Blocking source_file_dependencies: - csrc/attention/ - - vllm/attention - vllm/v1/attention + # TODO: remove this dependency (https://github.com/vllm-project/vllm/issues/32267) + - vllm/model_executor/layers/attention - tests/kernels/attention commands: - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index d1a536a07..6edcb2e7d 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -568,8 +568,9 @@ steps: mirror_hardwares: [amdexperimental] source_file_dependencies: - csrc/attention/ - - vllm/attention - vllm/v1/attention + # TODO: remove this dependency (https://github.com/vllm-project/vllm/issues/32267) + - vllm/model_executor/layers/attention - tests/kernels/attention commands: - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT diff --git a/.buildkite/test_areas/kernels.yaml b/.buildkite/test_areas/kernels.yaml index 8641a18b4..3f43b8d42 100644 --- a/.buildkite/test_areas/kernels.yaml +++ b/.buildkite/test_areas/kernels.yaml @@ -15,8 +15,9 @@ steps: timeout_in_minutes: 35 source_file_dependencies: - csrc/attention/ - - vllm/attention - vllm/v1/attention + # TODO: remove this dependency (https://github.com/vllm-project/vllm/issues/32267) + - vllm/model_executor/layers/attention - tests/kernels/attention commands: - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index c963be4cb..772c62973 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -2,8 +2,8 @@ # for more info about CODEOWNERS file # This lists cover the "core" components of vLLM that require careful review -/vllm/attention @LucasWilkinson /vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @njhill @22quinn +/vllm/model_executor/layers/attention @LucasWilkinson /vllm/model_executor/layers/fused_moe @mgoin @pavanimajety /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety /vllm/model_executor/layers/mamba @tdoublep diff --git a/docs/contributing/model/basic.md b/docs/contributing/model/basic.md index e2f560815..624f13bf7 100644 --- a/docs/contributing/model/basic.md +++ b/docs/contributing/model/basic.md @@ -29,7 +29,7 @@ The initialization code should look like this: ```python from torch import nn from vllm.config import VllmConfig - from vllm.attention.layer import Attention + from vllm.model_executor.layers.attention import Attention class MyAttention(nn.Module): def __init__(self, vllm_config: VllmConfig, prefix: str): diff --git a/docs/design/custom_op.md b/docs/design/custom_op.md index 3f4934b15..487522389 100644 --- a/docs/design/custom_op.md +++ b/docs/design/custom_op.md @@ -271,7 +271,7 @@ Taking `MMEncoderAttention` as an example: ??? 
code ```python - from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention + from vllm.model_executor.layers.attention import MMEncoderAttention from vllm.model_executor.custom_op import CustomOp diff --git a/tests/compile/test_fusion_attn.py b/tests/compile/test_fusion_attn.py index 961d6873f..50492a569 100644 --- a/tests/compile/test_fusion_attn.py +++ b/tests/compile/test_fusion_attn.py @@ -21,7 +21,6 @@ from tests.compile.fusion_test_utils import ( from tests.utils import flat_product from tests.v1.attention.utils import BatchSpec, create_common_attn_metadata from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant -from vllm.attention.layer import Attention from vllm.compilation.fusion_attn import ATTN_OP, AttnFusionPass from vllm.compilation.fx_utils import find_op_nodes from vllm.compilation.matcher_utils import QUANT_OPS @@ -40,6 +39,7 @@ from vllm.config import ( set_current_vllm_config, ) from vllm.forward_context import get_forward_context, set_forward_context +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.quantization.utils.quant_utils import ( QuantKey, kFp8StaticTensorSym, diff --git a/tests/compile/test_qk_norm_rope_fusion.py b/tests/compile/test_qk_norm_rope_fusion.py index 45a114679..19511b787 100644 --- a/tests/compile/test_qk_norm_rope_fusion.py +++ b/tests/compile/test_qk_norm_rope_fusion.py @@ -5,7 +5,6 @@ import pytest import torch from tests.compile.backend import TestBackend -from vllm.attention.layer import Attention from vllm.compilation.matcher_utils import FLASHINFER_ROTARY_OP, RMS_OP, ROTARY_OP from vllm.compilation.noop_elimination import NoOpEliminationPass from vllm.compilation.post_cleanup import PostCleanupPass @@ -21,6 +20,7 @@ from vllm.config import ( VllmConfig, set_current_vllm_config, ) +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding from vllm.platforms import current_platform diff --git a/tests/kernels/attention/test_attention.py b/tests/kernels/attention/test_attention.py index 94d494613..e3b612123 100644 --- a/tests/kernels/attention/test_attention.py +++ b/tests/kernels/attention/test_attention.py @@ -9,8 +9,7 @@ import torch from tests.kernels.allclose_default import get_default_atol, get_default_rtol from tests.kernels.utils import opcheck from vllm import _custom_ops as ops -from vllm.attention.layer import Attention -from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention +from vllm.model_executor.layers.attention import Attention, MMEncoderAttention from vllm.platforms import current_platform from vllm.utils.mem_utils import get_max_shared_memory_bytes from vllm.utils.torch_utils import set_random_seed diff --git a/tests/kernels/attention/test_mha_attn.py b/tests/kernels/attention/test_mha_attn.py index ecaea8867..25fb5c926 100644 --- a/tests/kernels/attention/test_mha_attn.py +++ b/tests/kernels/attention/test_mha_attn.py @@ -12,7 +12,7 @@ from unittest.mock import patch import pytest import torch -from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention +from vllm.model_executor.layers.attention import MMEncoderAttention from vllm.platforms import current_platform from vllm.platforms.cpu import CpuPlatform from vllm.platforms.cuda import CudaPlatform diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index 
badbd3e9a..458c7a2e5 100644 --- a/tests/v1/worker/test_gpu_model_runner.py +++ b/tests/v1/worker/test_gpu_model_runner.py @@ -5,7 +5,6 @@ import numpy as np import pytest import torch -from vllm.attention.layer import Attention from vllm.config import ( AttentionConfig, CacheConfig, @@ -19,6 +18,7 @@ from vllm.distributed.parallel_state import ( init_distributed_environment, initialize_model_parallel, ) +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.mamba.mamba_mixer2 import MambaMixer2 from vllm.platforms import current_platform from vllm.sampling_params import SamplingParams diff --git a/tests/v1/worker/test_utils.py b/tests/v1/worker/test_utils.py index d223ad6e0..76f9a8f90 100644 --- a/tests/v1/worker/test_utils.py +++ b/tests/v1/worker/test_utils.py @@ -7,7 +7,7 @@ from vllm.v1.worker.utils import bind_kv_cache def test_bind_kv_cache(default_vllm_config): - from vllm.attention.layer import Attention + from vllm.model_executor.layers.attention import Attention ctx = { "layers.0.self_attn": Attention(32, 128, 0.1, prefix="layers.0.self_attn"), @@ -35,7 +35,7 @@ def test_bind_kv_cache(default_vllm_config): def test_bind_kv_cache_non_attention(default_vllm_config): - from vllm.attention.layer import Attention + from vllm.model_executor.layers.attention import Attention # example from Jamba PP=2 ctx = { @@ -58,7 +58,7 @@ def test_bind_kv_cache_non_attention(default_vllm_config): def test_bind_kv_cache_draft_model(default_vllm_config): - from vllm.attention.layer import Attention + from vllm.model_executor.layers.attention import Attention layer_names = [ "model.layers.0.attn", diff --git a/tools/pre_commit/mypy.py b/tools/pre_commit/mypy.py index 90e65d059..3e4f92cdc 100755 --- a/tools/pre_commit/mypy.py +++ b/tools/pre_commit/mypy.py @@ -58,7 +58,6 @@ FILES = [ SEPARATE_GROUPS = [ "tests", # v0 related - "vllm/attention", "vllm/compilation", "vllm/lora", "vllm/model_executor", diff --git a/vllm/attention/__init__.py b/vllm/attention/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/vllm/attention/utils/__init__.py b/vllm/attention/utils/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/vllm/attention/utils/kv_sharing_utils.py b/vllm/attention/utils/kv_sharing_utils.py deleted file mode 100644 index 93af5bf7e..000000000 --- a/vllm/attention/utils/kv_sharing_utils.py +++ /dev/null @@ -1,33 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -def validate_kv_sharing_target( - current_layer_name, target_layer_name, static_forward_context -): - error_msg = ( - f"Specified KV sharing target layer for {current_layer_name} " - f"is not valid: target layer {target_layer_name} " - ) - - if current_layer_name == target_layer_name: - raise ValueError(error_msg + "cannot be the same as the current layer.") - - if target_layer_name not in static_forward_context: - from vllm.model_executor.models.utils import extract_layer_index - - # If target layer name is not in the static fwd context, it means either - # a) the target layer does not come BEFORE the current layer, or - # b) the target layer is not an Attention layer that exists in the model - current_layer_idx = extract_layer_index(current_layer_name) - target_layer_idx = extract_layer_index(target_layer_name) - if current_layer_idx <= target_layer_idx: - raise ValueError(error_msg + "must come before the current layer.") - else: - raise ValueError(error_msg + "is not a valid Attention 
layer in the model.") - - # Currently KV sharing is only supported between layers of the same type - target_layer_attn_type = static_forward_context[target_layer_name].attn_type - expected = static_forward_context[current_layer_name].attn_type - if target_layer_attn_type != expected: - raise ValueError( - error_msg + f"must be the same type as the current layer ({expected})." - ) diff --git a/vllm/compilation/fusion_attn.py b/vllm/compilation/fusion_attn.py index 618892ad3..0dc4b1489 100644 --- a/vllm/compilation/fusion_attn.py +++ b/vllm/compilation/fusion_attn.py @@ -11,9 +11,9 @@ from torch import fx from torch._higher_order_ops.auto_functionalize import auto_functionalized from torch._inductor.pattern_matcher import PatternMatcherPass -from vllm.attention.layer import Attention from vllm.config import VllmConfig, get_layers_from_vllm_config from vllm.logger import init_logger +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.quantization.utils.quant_utils import ( QuantKey, kNvfp4Dynamic, diff --git a/vllm/compilation/qk_norm_rope_fusion.py b/vllm/compilation/qk_norm_rope_fusion.py index bc95b7238..3ddd2b87f 100644 --- a/vllm/compilation/qk_norm_rope_fusion.py +++ b/vllm/compilation/qk_norm_rope_fusion.py @@ -10,9 +10,9 @@ from torch import fx from torch._higher_order_ops.auto_functionalize import auto_functionalized from torch._inductor.pattern_matcher import PatternMatcherPass -from vllm.attention.layer import Attention from vllm.config import VllmConfig, get_layers_from_vllm_config from vllm.logger import init_logger +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding from .fusion import empty_bf16, empty_fp32, empty_i64 diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py index 708a7f12d..73922a6fb 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py @@ -8,7 +8,6 @@ from typing import Any import torch -from vllm.attention.layer import Attention from vllm.config import VllmConfig, get_layers_from_vllm_config from vllm.distributed.kv_events import BlockRemoved, BlockStored, KVCacheEvent from vllm.distributed.kv_transfer.kv_connector.utils import yield_req_data @@ -25,6 +24,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.metrics import ( ) from vllm.forward_context import ForwardContext from vllm.logger import init_logger +from vllm.model_executor.layers.attention import Attention from vllm.v1.attention.backend import AttentionBackend, AttentionMetadata from vllm.v1.core.kv_cache_manager import KVCacheBlocks from vllm.v1.core.kv_cache_utils import BlockHash diff --git a/vllm/model_executor/layers/attention/__init__.py b/vllm/model_executor/layers/attention/__init__.py index e69de29bb..1be9f7742 100644 --- a/vllm/model_executor/layers/attention/__init__.py +++ b/vllm/model_executor/layers/attention/__init__.py @@ -0,0 +1,26 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from vllm.model_executor.layers.attention.attention import Attention +from vllm.model_executor.layers.attention.chunked_local_attention import ( + ChunkedLocalAttention, +) +from vllm.model_executor.layers.attention.cross_attention import CrossAttention +from vllm.model_executor.layers.attention.encoder_only_attention import ( 
+ EncoderOnlyAttention, +) +from vllm.model_executor.layers.attention.mla_attention import MLAAttention +from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention +from vllm.model_executor.layers.attention.static_sink_attention import ( + StaticSinkAttention, +) + +__all__ = [ + "Attention", + "ChunkedLocalAttention", + "CrossAttention", + "EncoderOnlyAttention", + "MLAAttention", + "MMEncoderAttention", + "StaticSinkAttention", +] diff --git a/vllm/attention/layer.py b/vllm/model_executor/layers/attention/attention.py similarity index 70% rename from vllm/attention/layer.py rename to vllm/model_executor/layers/attention/attention.py index 9a6945f7a..25917294a 100644 --- a/vllm/attention/layer.py +++ b/vllm/model_executor/layers/attention/attention.py @@ -1,23 +1,22 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Attention layer.""" -from typing import cast +from typing import TYPE_CHECKING import torch import torch.nn as nn import vllm.envs as envs -from vllm.attention.utils.kv_sharing_utils import validate_kv_sharing_target -from vllm.attention.utils.kv_transfer_utils import maybe_transfer_kv_layer from vllm.config import CacheConfig, get_current_vllm_config from vllm.config.vllm import VllmConfig from vllm.forward_context import ForwardContext, get_forward_context from vllm.logger import init_logger +from vllm.model_executor.layers.attention.kv_transfer_utils import ( + maybe_transfer_kv_layer, +) from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant from vllm.model_executor.layers.linear import ( - ColumnParallelLinear, UnquantizedLinearMethod, ) from vllm.model_executor.layers.quantization import QuantizationConfig @@ -33,20 +32,54 @@ from vllm.utils.torch_utils import ( from vllm.v1.attention.backend import ( AttentionBackend, AttentionType, - MLAAttentionImpl, ) from vllm.v1.attention.backends.registry import AttentionBackendEnum from vllm.v1.attention.selector import get_attn_backend from vllm.v1.kv_cache_interface import ( FullAttentionSpec, KVCacheSpec, - MLAAttentionSpec, SlidingWindowSpec, ) +if TYPE_CHECKING: + from vllm.model_executor.layers.attention import MLAAttention + logger = init_logger(__name__) +def validate_kv_sharing_target( + current_layer_name, target_layer_name, static_forward_context +): + error_msg = ( + f"Specified KV sharing target layer for {current_layer_name} " + f"is not valid: target layer {target_layer_name} " + ) + + if current_layer_name == target_layer_name: + raise ValueError(error_msg + "cannot be the same as the current layer.") + + if target_layer_name not in static_forward_context: + from vllm.model_executor.models.utils import extract_layer_index + + # If target layer name is not in the static fwd context, it means either + # a) the target layer does not come BEFORE the current layer, or + # b) the target layer is not an Attention layer that exists in the model + current_layer_idx = extract_layer_index(current_layer_name) + target_layer_idx = extract_layer_index(target_layer_name) + if current_layer_idx <= target_layer_idx: + raise ValueError(error_msg + "must come before the current layer.") + else: + raise ValueError(error_msg + "is not a valid Attention layer in the model.") + + # Currently KV sharing is only supported between layers of the same type + target_layer_attn_type = static_forward_context[target_layer_name].attn_type + expected = 
static_forward_context[current_layer_name].attn_type + if target_layer_attn_type != expected: + raise ValueError( + error_msg + f"must be the same type as the current layer ({expected})." + ) + + def should_load_quant_weights(quant_method: QuantizeMethodBase | None) -> bool: """Returns whether the quantization method should load quantized weights.""" return quant_method is not None and not isinstance( @@ -493,236 +526,6 @@ class Attention(nn.Module, AttentionLayerBase): ) -class MLAAttention(nn.Module, AttentionLayerBase): - """Multi-Head Latent Attention layer. - - This class takes query, and compressed key/value tensors as input. - The class does the following: - - 1. Store the input key and value tensors in the KV cache. - 2. Perform (multi-head/multi-query/grouped-query) attention. - 3. Return the output tensor. - """ - - def __init__( - self, - num_heads: int, - scale: float, - qk_nope_head_dim: int, - qk_rope_head_dim: int, - v_head_dim: int, - q_lora_rank: int | None, - kv_lora_rank: int, - kv_b_proj: ColumnParallelLinear, - cache_config: CacheConfig | None = None, - quant_config: QuantizationConfig | None = None, - prefix: str = "", - use_sparse: bool = False, - indexer: object | None = None, - **extra_impl_args, - ): - super().__init__() - self.num_heads = num_heads - self.scale = scale - self.qk_nope_head_dim = qk_nope_head_dim - self.qk_rope_head_dim = qk_rope_head_dim - self.v_head_dim = v_head_dim - self.q_lora_rank = q_lora_rank - self.kv_lora_rank = kv_lora_rank - self.head_size = kv_lora_rank + qk_rope_head_dim - self.layer_name = prefix - - if cache_config is not None: - kv_cache_dtype = cache_config.cache_dtype - block_size = cache_config.block_size - calculate_kv_scales = cache_config.calculate_kv_scales - else: - kv_cache_dtype = "auto" - block_size = 16 - calculate_kv_scales = False - self.quant_config = quant_config - - # Initialize KV cache quantization attributes - self.kv_cache_dtype = kv_cache_dtype - self.calculate_kv_scales = calculate_kv_scales - _init_kv_cache_quant(self, quant_config, prefix) - - dtype = torch.get_default_dtype() - self.attn_backend = get_attn_backend( - self.head_size, - dtype, - kv_cache_dtype, - block_size, - use_mla=True, - use_sparse=use_sparse, - ) - - if ( - cache_config is not None - and cache_config.enable_prefix_caching - and vllm_is_batch_invariant() - and ( - self.attn_backend.get_name() == "TRITON_MLA" - or self.attn_backend.get_name() == "FLASHINFER" - ) - ): - logger.warning_once( - "Disabling prefix caching for TRITON_MLA / FLASHINFER " - "with batch invariance, as it is not yet supported.", - scope="local", - ) - cache_config.enable_prefix_caching = False - - impl_cls = cast(type[MLAAttentionImpl], self.attn_backend.get_impl_cls()) - self.impl = impl_cls( - num_heads=self.num_heads, - head_size=self.head_size, - scale=self.scale, - num_kv_heads=1, - alibi_slopes=None, - sliding_window=None, - kv_cache_dtype=self.kv_cache_dtype, - logits_soft_cap=None, - attn_type=AttentionType.DECODER, - kv_sharing_target_layer_name=None, - # MLA Args - q_lora_rank=self.q_lora_rank, - kv_lora_rank=self.kv_lora_rank, - qk_nope_head_dim=self.qk_nope_head_dim, - qk_rope_head_dim=self.qk_rope_head_dim, - qk_head_dim=self.qk_nope_head_dim + self.qk_rope_head_dim, - v_head_dim=self.v_head_dim, - kv_b_proj=kv_b_proj, - indexer=indexer, - **extra_impl_args, - ) - - self.use_direct_call = not current_platform.opaque_attention_op() - - compilation_config = get_current_vllm_config().compilation_config - if prefix in 
compilation_config.static_forward_context: - raise ValueError(f"Duplicate layer name: {prefix}") - compilation_config.static_forward_context[prefix] = self - - self.kv_cache = [ - torch.tensor([]) - for _ in range( - get_current_vllm_config().parallel_config.pipeline_parallel_size - ) - ] - - self.use_sparse = use_sparse - - # Initialize q/k/v range constants. - self.q_range = torch.tensor(envs.Q_SCALE_CONSTANT, dtype=torch.float32) - self.k_range = torch.tensor(envs.K_SCALE_CONSTANT, dtype=torch.float32) - self.v_range = torch.tensor(envs.V_SCALE_CONSTANT, dtype=torch.float32) - - def forward( - self, - q: torch.Tensor, - kv_c_normed: torch.Tensor, - k_pe: torch.Tensor, - output_shape: torch.Size | None = None, - ) -> torch.Tensor: - if self.calculate_kv_scales: - torch.ops.vllm.maybe_calc_kv_scales(q, kv_c_normed, k_pe, self.layer_name) - - if self.use_direct_call: - forward_context: ForwardContext = get_forward_context() - attn_metadata = forward_context.attn_metadata - if isinstance(attn_metadata, dict): - attn_metadata = attn_metadata[self.layer_name] - self_kv_cache = self.kv_cache[forward_context.virtual_engine] - - if self.attn_backend.accept_output_buffer: - output = torch.empty(output_shape, dtype=q.dtype, device=q.device) - self.impl.forward( - self, - q, - kv_c_normed, - k_pe, - self_kv_cache, - attn_metadata, - output=output, - ) - return output - else: - return self.impl.forward( - self, q, kv_c_normed, k_pe, self_kv_cache, attn_metadata - ) - else: - if self.attn_backend.accept_output_buffer: - output = torch.empty(output_shape, dtype=q.dtype, device=q.device) - torch.ops.vllm.unified_mla_attention_with_output( - q, - kv_c_normed, - k_pe, - output, - self.layer_name, - ) - return output - else: - return torch.ops.vllm.unified_mla_attention( - q, - kv_c_normed, - k_pe, - self.layer_name, - ) - - def process_weights_after_loading(self, act_dtype: torch.dtype): - if hasattr(self.impl, "process_weights_after_loading"): - self.impl.process_weights_after_loading(act_dtype) - - # If we should not load quant weights, we initialize the scales to 1.0 - # as the default value. See [Note: Register q/k/v/prob scales in state dict] - # for more details. - quant_method = ( - self.quant_config.get_quant_method(self, prefix=self.layer_name) - if self.quant_config - else None - ) - if not should_load_quant_weights(quant_method): - set_default_quant_scales(self, register_buffer=False) - - def calc_kv_scales( - self, q: torch.Tensor, kv_c_normed: torch.Tensor, k_pe: torch.Tensor - ) -> None: - """Optional scale calculation for MLA inputs. - - Mirrors Attention.calc_kv_scales. 
Not all MLA backends require this - """ - # Use safe defaults if ranges are not present - q_range = getattr(self, "q_range", torch.tensor(1.0)) - k_range = getattr(self, "k_range", torch.tensor(1.0)) - v_range = getattr(self, "v_range", torch.tensor(1.0)) - - self._q_scale.copy_(torch.abs(q).max() / q_range) - # kv_c_normed is the compressed KV representation; use it for k/v - kv_abs_max = torch.abs(kv_c_normed).max() - self._k_scale.copy_(kv_abs_max / k_range) - self._v_scale.copy_(kv_abs_max / v_range) - self._q_scale_float = self._q_scale.item() - self._k_scale_float = self._k_scale.item() - self._v_scale_float = self._v_scale.item() - self.calculate_kv_scales = False - - def get_attn_backend(self) -> type[AttentionBackend]: - return self.attn_backend - - def get_kv_cache_spec(self, vllm_config: VllmConfig) -> KVCacheSpec: - kv_cache_dtype = kv_cache_dtype_str_to_dtype( - self.kv_cache_dtype, vllm_config.model_config - ) - return MLAAttentionSpec( - block_size=vllm_config.cache_config.block_size, - num_kv_heads=1, - head_size=self.head_size, - dtype=kv_cache_dtype, - cache_dtype_str=vllm_config.cache_config.cache_dtype, - ) - - def maybe_calc_kv_scales( query: torch.Tensor, key: torch.Tensor, @@ -759,7 +562,7 @@ direct_register_custom_op( def get_attention_context( layer_name: str, -) -> tuple[dict | object | None, Attention | MLAAttention, torch.Tensor]: +) -> tuple[dict | object | None, "Attention | MLAAttention", torch.Tensor]: """Extract attention context for a given layer. This helper function extracts the attention metadata, attention layer @@ -782,7 +585,7 @@ def get_attention_context( attn_metadata = forward_context.attn_metadata if isinstance(attn_metadata, dict): attn_metadata = attn_metadata[layer_name] - attn_layer: Attention | MLAAttention = forward_context.no_compile_layers[layer_name] + attn_layer = forward_context.no_compile_layers[layer_name] kv_cache = attn_layer.kv_cache[forward_context.virtual_engine] return attn_metadata, attn_layer, kv_cache @@ -914,79 +717,3 @@ direct_register_custom_op( mutates_args=["output", "output_block_scale"], fake_impl=unified_attention_with_output_fake, ) - - -@maybe_transfer_kv_layer -def unified_mla_attention( - q: torch.Tensor, - kv_c_normed: torch.Tensor, - k_pe: torch.Tensor, - layer_name: str, -) -> torch.Tensor: - attn_metadata, self, kv_cache = get_attention_context(layer_name) - output = self.impl.forward(self, q, kv_c_normed, k_pe, kv_cache, attn_metadata) - - return output - - -def unified_mla_attention_fake( - q: torch.Tensor, - kv_c_normed: torch.Tensor, - k_pe: torch.Tensor, - layer_name: str, -) -> torch.Tensor: - return torch.empty_like(q).contiguous() - - -direct_register_custom_op( - op_name="unified_mla_attention", - op_func=unified_mla_attention, - mutates_args=[], - fake_impl=unified_mla_attention_fake, - dispatch_key=current_platform.dispatch_key, -) - - -@maybe_transfer_kv_layer -def unified_mla_attention_with_output( - q: torch.Tensor, - kv_c_normed: torch.Tensor, - k_pe: torch.Tensor, - output: torch.Tensor, - layer_name: str, - output_scale: torch.Tensor | None = None, - output_block_scale: torch.Tensor | None = None, -) -> None: - attn_metadata, self, kv_cache = get_attention_context(layer_name) - self.impl.forward( - self, - q, - kv_c_normed, - k_pe, - kv_cache, - attn_metadata, - output=output, - output_scale=output_scale, - output_block_scale=output_block_scale, - ) - - -def unified_mla_attention_with_output_fake( - q: torch.Tensor, - kv_c_normed: torch.Tensor, - k_pe: torch.Tensor, - output: torch.Tensor, - 
layer_name: str, - output_scale: torch.Tensor | None = None, - output_block_scale: torch.Tensor | None = None, -) -> None: - return - - -direct_register_custom_op( - op_name="unified_mla_attention_with_output", - op_func=unified_mla_attention_with_output, - mutates_args=["output", "output_block_scale"], - fake_impl=unified_mla_attention_with_output_fake, - dispatch_key=current_platform.dispatch_key, -) diff --git a/vllm/model_executor/layers/attention/chunked_local_attention.py b/vllm/model_executor/layers/attention/chunked_local_attention.py index 0fae51443..e33733c0c 100644 --- a/vllm/model_executor/layers/attention/chunked_local_attention.py +++ b/vllm/model_executor/layers/attention/chunked_local_attention.py @@ -4,9 +4,9 @@ import functools import torch -from vllm.attention.layer import Attention from vllm.config import CacheConfig from vllm.config.vllm import VllmConfig +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.v1.attention.backend import ( AttentionBackend, diff --git a/vllm/model_executor/layers/attention/cross_attention.py b/vllm/model_executor/layers/attention/cross_attention.py index f47fa1148..6a829db26 100644 --- a/vllm/model_executor/layers/attention/cross_attention.py +++ b/vllm/model_executor/layers/attention/cross_attention.py @@ -6,9 +6,9 @@ from copy import copy import numpy as np import torch -from vllm.attention.layer import Attention from vllm.config import CacheConfig, VllmConfig from vllm.logger import init_logger +from vllm.model_executor.layers.attention import Attention from vllm.utils.math_utils import cdiv from vllm.v1.attention.backend import ( AttentionBackend, diff --git a/vllm/model_executor/layers/attention/encoder_only_attention.py b/vllm/model_executor/layers/attention/encoder_only_attention.py index 89a92ca1b..941911028 100644 --- a/vllm/model_executor/layers/attention/encoder_only_attention.py +++ b/vllm/model_executor/layers/attention/encoder_only_attention.py @@ -5,9 +5,9 @@ from copy import copy import torch -from vllm.attention.layer import Attention from vllm.config import CacheConfig from vllm.config.vllm import VllmConfig +from vllm.model_executor.layers.attention import Attention from vllm.v1.attention.backend import ( AttentionBackend, AttentionMetadata, diff --git a/vllm/attention/utils/kv_transfer_utils.py b/vllm/model_executor/layers/attention/kv_transfer_utils.py similarity index 95% rename from vllm/attention/utils/kv_transfer_utils.py rename to vllm/model_executor/layers/attention/kv_transfer_utils.py index 210be55fe..9ee6b4d0f 100644 --- a/vllm/attention/utils/kv_transfer_utils.py +++ b/vllm/model_executor/layers/attention/kv_transfer_utils.py @@ -19,7 +19,7 @@ def maybe_transfer_kv_layer(func: Callable) -> Callable: On exit: saves the KV layer to the connector. """ # Import at runtime to avoid circular dependency - from vllm.attention.layer import get_attention_context + from vllm.model_executor.layers.attention.attention import get_attention_context # Inspect the signature ONCE when the decorator is applied. 
sig = inspect.signature(func) diff --git a/vllm/model_executor/layers/attention/mla_attention.py b/vllm/model_executor/layers/attention/mla_attention.py old mode 100755 new mode 100644 index 9371a977f..112c3a5a9 --- a/vllm/model_executor/layers/attention/mla_attention.py +++ b/vllm/model_executor/layers/attention/mla_attention.py @@ -191,24 +191,38 @@ import functools from abc import abstractmethod from dataclasses import dataclass, field from enum import Enum -from typing import ClassVar, Generic, TypeVar +from typing import TYPE_CHECKING, ClassVar, Generic, TypeVar, cast + +if TYPE_CHECKING: + from flashinfer import BatchPrefillWithRaggedKVCacheWrapper import torch +import torch.nn as nn from tqdm import tqdm +import vllm.envs as envs from vllm import _custom_ops as ops -from vllm import envs from vllm._aiter_ops import rocm_aiter_ops -from vllm.config import ModelConfig, VllmConfig, get_current_vllm_config +from vllm.config import CacheConfig, ModelConfig, VllmConfig, get_current_vllm_config from vllm.distributed.parallel_state import get_dcp_group, is_global_first_rank +from vllm.forward_context import ForwardContext, get_forward_context from vllm.logger import init_logger from vllm.model_executor.custom_op import CustomOp -from vllm.model_executor.layers.batch_invariant import ( - vllm_is_batch_invariant, +from vllm.model_executor.layers.attention.attention import ( + _init_kv_cache_quant, + get_attention_context, + set_default_quant_scales, + should_load_quant_weights, ) +from vllm.model_executor.layers.attention.kv_transfer_utils import ( + maybe_transfer_kv_layer, +) +from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase +from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant from vllm.model_executor.layers.linear import ( ColumnParallelLinear, ) +from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8 from vllm.model_executor.layers.quantization.utils.quant_utils import ( GroupShape, @@ -217,11 +231,16 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import ( from vllm.platforms import current_platform from vllm.utils.flashinfer import has_nvidia_artifactory from vllm.utils.math_utils import cdiv, round_down +from vllm.utils.torch_utils import ( + direct_register_custom_op, + kv_cache_dtype_str_to_dtype, +) from vllm.v1.attention.backend import ( AttentionBackend, AttentionLayer, AttentionMetadata, AttentionMetadataBuilder, + AttentionType, CommonAttentionMetadata, MLAAttentionImpl, ) @@ -234,7 +253,320 @@ from vllm.v1.attention.backends.utils import ( ) from vllm.v1.attention.ops.common import cp_lse_ag_out_rs from vllm.v1.attention.ops.merge_attn_states import merge_attn_states -from vllm.v1.kv_cache_interface import AttentionSpec +from vllm.v1.attention.selector import get_attn_backend +from vllm.v1.kv_cache_interface import ( + AttentionSpec, + KVCacheSpec, + MLAAttentionSpec, +) + +logger = init_logger(__name__) + + +class MLAAttention(nn.Module, AttentionLayerBase): + """Multi-Head Latent Attention layer. + + This class takes query, and compressed key/value tensors as input. + The class does the following: + + 1. Store the input key and value tensors in the KV cache. + 2. Perform (multi-head/multi-query/grouped-query) attention. + 3. Return the output tensor. 
+ """ + + def __init__( + self, + num_heads: int, + scale: float, + qk_nope_head_dim: int, + qk_rope_head_dim: int, + v_head_dim: int, + q_lora_rank: int | None, + kv_lora_rank: int, + kv_b_proj: ColumnParallelLinear, + cache_config: CacheConfig | None = None, + quant_config: QuantizationConfig | None = None, + prefix: str = "", + use_sparse: bool = False, + indexer: object | None = None, + **extra_impl_args, + ): + super().__init__() + self.num_heads = num_heads + self.scale = scale + self.qk_nope_head_dim = qk_nope_head_dim + self.qk_rope_head_dim = qk_rope_head_dim + self.v_head_dim = v_head_dim + self.q_lora_rank = q_lora_rank + self.kv_lora_rank = kv_lora_rank + self.head_size = kv_lora_rank + qk_rope_head_dim + self.layer_name = prefix + + if cache_config is not None: + kv_cache_dtype = cache_config.cache_dtype + block_size = cache_config.block_size + calculate_kv_scales = cache_config.calculate_kv_scales + else: + kv_cache_dtype = "auto" + block_size = 16 + calculate_kv_scales = False + self.quant_config = quant_config + + # Initialize KV cache quantization attributes + self.kv_cache_dtype = kv_cache_dtype + self.calculate_kv_scales = calculate_kv_scales + _init_kv_cache_quant(self, quant_config, prefix) + + dtype = torch.get_default_dtype() + self.attn_backend = get_attn_backend( + self.head_size, + dtype, + kv_cache_dtype, + block_size, + use_mla=True, + use_sparse=use_sparse, + ) + + if ( + cache_config is not None + and cache_config.enable_prefix_caching + and vllm_is_batch_invariant() + and ( + self.attn_backend.get_name() == "TRITON_MLA" + or self.attn_backend.get_name() == "FLASHINFER" + ) + ): + logger.warning_once( + "Disabling prefix caching for TRITON_MLA / FLASHINFER " + "with batch invariance, as it is not yet supported.", + scope="local", + ) + cache_config.enable_prefix_caching = False + + impl_cls = cast(type[MLAAttentionImpl], self.attn_backend.get_impl_cls()) + self.impl = impl_cls( + num_heads=self.num_heads, + head_size=self.head_size, + scale=self.scale, + num_kv_heads=1, + alibi_slopes=None, + sliding_window=None, + kv_cache_dtype=self.kv_cache_dtype, + logits_soft_cap=None, + attn_type=AttentionType.DECODER, + kv_sharing_target_layer_name=None, + # MLA Args + q_lora_rank=self.q_lora_rank, + kv_lora_rank=self.kv_lora_rank, + qk_nope_head_dim=self.qk_nope_head_dim, + qk_rope_head_dim=self.qk_rope_head_dim, + qk_head_dim=self.qk_nope_head_dim + self.qk_rope_head_dim, + v_head_dim=self.v_head_dim, + kv_b_proj=kv_b_proj, + indexer=indexer, + **extra_impl_args, + ) + + self.use_direct_call = not current_platform.opaque_attention_op() + + compilation_config = get_current_vllm_config().compilation_config + if prefix in compilation_config.static_forward_context: + raise ValueError(f"Duplicate layer name: {prefix}") + compilation_config.static_forward_context[prefix] = self + + self.kv_cache = [ + torch.tensor([]) + for _ in range( + get_current_vllm_config().parallel_config.pipeline_parallel_size + ) + ] + + self.use_sparse = use_sparse + + # Initialize q/k/v range constants. 
+ self.q_range = torch.tensor(envs.Q_SCALE_CONSTANT, dtype=torch.float32) + self.k_range = torch.tensor(envs.K_SCALE_CONSTANT, dtype=torch.float32) + self.v_range = torch.tensor(envs.V_SCALE_CONSTANT, dtype=torch.float32) + + def forward( + self, + q: torch.Tensor, + kv_c_normed: torch.Tensor, + k_pe: torch.Tensor, + output_shape: torch.Size | None = None, + ) -> torch.Tensor: + if self.calculate_kv_scales: + torch.ops.vllm.maybe_calc_kv_scales(q, kv_c_normed, k_pe, self.layer_name) + + if self.use_direct_call: + forward_context: ForwardContext = get_forward_context() + attn_metadata = forward_context.attn_metadata + if isinstance(attn_metadata, dict): + attn_metadata = attn_metadata[self.layer_name] + self_kv_cache = self.kv_cache[forward_context.virtual_engine] + + if self.attn_backend.accept_output_buffer: + output = torch.empty(output_shape, dtype=q.dtype, device=q.device) + self.impl.forward( + self, + q, + kv_c_normed, + k_pe, + self_kv_cache, + attn_metadata, + output=output, + ) + return output + else: + return self.impl.forward( + self, q, kv_c_normed, k_pe, self_kv_cache, attn_metadata + ) + else: + if self.attn_backend.accept_output_buffer: + output = torch.empty(output_shape, dtype=q.dtype, device=q.device) + torch.ops.vllm.unified_mla_attention_with_output( + q, + kv_c_normed, + k_pe, + output, + self.layer_name, + ) + return output + else: + return torch.ops.vllm.unified_mla_attention( + q, + kv_c_normed, + k_pe, + self.layer_name, + ) + + def process_weights_after_loading(self, act_dtype: torch.dtype): + if hasattr(self.impl, "process_weights_after_loading"): + self.impl.process_weights_after_loading(act_dtype) + + # If we should not load quant weights, we initialize the scales to 1.0 + # as the default value. See [Note: Register q/k/v/prob scales in state dict] + # for more details. + quant_method = ( + self.quant_config.get_quant_method(self, prefix=self.layer_name) + if self.quant_config + else None + ) + if not should_load_quant_weights(quant_method): + set_default_quant_scales(self, register_buffer=False) + + def calc_kv_scales( + self, q: torch.Tensor, kv_c_normed: torch.Tensor, k_pe: torch.Tensor + ) -> None: + """Optional scale calculation for MLA inputs. + + Mirrors Attention.calc_kv_scales. 
Not all MLA backends require this + """ + # Use safe defaults if ranges are not present + q_range = getattr(self, "q_range", torch.tensor(1.0)) + k_range = getattr(self, "k_range", torch.tensor(1.0)) + v_range = getattr(self, "v_range", torch.tensor(1.0)) + + self._q_scale.copy_(torch.abs(q).max() / q_range) + # kv_c_normed is the compressed KV representation; use it for k/v + kv_abs_max = torch.abs(kv_c_normed).max() + self._k_scale.copy_(kv_abs_max / k_range) + self._v_scale.copy_(kv_abs_max / v_range) + self._q_scale_float = self._q_scale.item() + self._k_scale_float = self._k_scale.item() + self._v_scale_float = self._v_scale.item() + self.calculate_kv_scales = False + + def get_attn_backend(self) -> type[AttentionBackend]: + return self.attn_backend + + def get_kv_cache_spec(self, vllm_config: VllmConfig) -> KVCacheSpec: + kv_cache_dtype = kv_cache_dtype_str_to_dtype( + self.kv_cache_dtype, vllm_config.model_config + ) + return MLAAttentionSpec( + block_size=vllm_config.cache_config.block_size, + num_kv_heads=1, + head_size=self.head_size, + dtype=kv_cache_dtype, + cache_dtype_str=vllm_config.cache_config.cache_dtype, + ) + + +@maybe_transfer_kv_layer +def unified_mla_attention( + q: torch.Tensor, + kv_c_normed: torch.Tensor, + k_pe: torch.Tensor, + layer_name: str, +) -> torch.Tensor: + attn_metadata, self, kv_cache = get_attention_context(layer_name) + output = self.impl.forward(self, q, kv_c_normed, k_pe, kv_cache, attn_metadata) + + return output + + +def unified_mla_attention_fake( + q: torch.Tensor, + kv_c_normed: torch.Tensor, + k_pe: torch.Tensor, + layer_name: str, +) -> torch.Tensor: + return torch.empty_like(q).contiguous() + + +direct_register_custom_op( + op_name="unified_mla_attention", + op_func=unified_mla_attention, + mutates_args=[], + fake_impl=unified_mla_attention_fake, + dispatch_key=current_platform.dispatch_key, +) + + +@maybe_transfer_kv_layer +def unified_mla_attention_with_output( + q: torch.Tensor, + kv_c_normed: torch.Tensor, + k_pe: torch.Tensor, + output: torch.Tensor, + layer_name: str, + output_scale: torch.Tensor | None = None, + output_block_scale: torch.Tensor | None = None, +) -> None: + attn_metadata, self, kv_cache = get_attention_context(layer_name) + self.impl.forward( + self, + q, + kv_c_normed, + k_pe, + kv_cache, + attn_metadata, + output=output, + output_scale=output_scale, + output_block_scale=output_block_scale, + ) + + +def unified_mla_attention_with_output_fake( + q: torch.Tensor, + kv_c_normed: torch.Tensor, + k_pe: torch.Tensor, + output: torch.Tensor, + layer_name: str, + output_scale: torch.Tensor | None = None, + output_block_scale: torch.Tensor | None = None, +) -> None: + return + + +direct_register_custom_op( + op_name="unified_mla_attention_with_output", + op_func=unified_mla_attention_with_output, + mutates_args=["output", "output_block_scale"], + fake_impl=unified_mla_attention_with_output_fake, + dispatch_key=current_platform.dispatch_key, +) class QueryLenSupport(Enum): @@ -266,15 +598,12 @@ except ImportError: from flash_attn import flash_attn_varlen_func # type: ignore[no-redef] is_vllm_fa = False -try: - from flashinfer import BatchPrefillWithRaggedKVCacheWrapper - from flashinfer.prefill import cudnn_batch_prefill_with_kv_cache # noqa: F401 - flashinfer_available = True -except ImportError: - BatchPrefillWithRaggedKVCacheWrapper = object +@functools.cache +def flashinfer_available() -> bool: + import importlib.util - flashinfer_available = False + return importlib.util.find_spec("flashinfer") is not None def 
dynamic_per_batched_tensor_quant( @@ -398,8 +727,8 @@ class MLACommonPrefillMetadata: @dataclass class FlashInferPrefillMetadata(MLACommonPrefillMetadata): - prefill_main: BatchPrefillWithRaggedKVCacheWrapper | None = None - prefill_chunks: list[BatchPrefillWithRaggedKVCacheWrapper] = field( + prefill_main: "BatchPrefillWithRaggedKVCacheWrapper | None" = None + prefill_chunks: "list[BatchPrefillWithRaggedKVCacheWrapper]" = field( default_factory=list ) @@ -495,7 +824,7 @@ def use_flashinfer_prefill() -> bool: vllm_config = get_current_vllm_config() if not ( not vllm_config.attention_config.disable_flashinfer_prefill - and flashinfer_available + and flashinfer_available() and not vllm_config.attention_config.use_cudnn_prefill and current_platform.is_device_capability_family(100) ): @@ -509,7 +838,7 @@ def use_cudnn_prefill() -> bool: vllm_config = get_current_vllm_config() return ( - flashinfer_available + flashinfer_available() and vllm_config.attention_config.use_cudnn_prefill and current_platform.is_device_capability_family(100) and has_nvidia_artifactory() @@ -731,6 +1060,8 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): has_context = True if self._fi_prefill_main is None: + from flashinfer import BatchPrefillWithRaggedKVCacheWrapper + self._fi_prefill_main = BatchPrefillWithRaggedKVCacheWrapper( self._workspace_buffer, "NHD", backend="cutlass" ) @@ -739,6 +1070,8 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): num_chunks = chunked_context.cu_seq_lens.shape[0] # Allocate more prefill chunk wrappers if needed if len(self._fi_prefill_chunks) < num_chunks: + from flashinfer import BatchPrefillWithRaggedKVCacheWrapper + for _ in range(len(self._fi_prefill_chunks), num_chunks): self._fi_prefill_chunks.append( BatchPrefillWithRaggedKVCacheWrapper( @@ -1513,6 +1846,8 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]): ): assert isinstance(prefill, CudnnPrefillMetadata) assert prefill.query_seq_lens is not None + from flashinfer.prefill import cudnn_batch_prefill_with_kv_cache + output, lse = cudnn_batch_prefill_with_kv_cache( q=q, k_cache=k, @@ -1572,6 +1907,8 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]): assert prefill.chunked_context is not None assert prefill.chunked_context.seq_lens[chunk_idx] is not None assert prefill.query_seq_lens is not None + from flashinfer.prefill import cudnn_batch_prefill_with_kv_cache + return cudnn_batch_prefill_with_kv_cache( q=q, k_cache=k, diff --git a/vllm/model_executor/layers/attention/static_sink_attention.py b/vllm/model_executor/layers/attention/static_sink_attention.py index a869226ea..49d83823b 100644 --- a/vllm/model_executor/layers/attention/static_sink_attention.py +++ b/vllm/model_executor/layers/attention/static_sink_attention.py @@ -4,11 +4,11 @@ import functools import torch -from vllm.attention.layer import Attention from vllm.config import CacheConfig, VllmConfig from vllm.forward_context import ForwardContext, get_forward_context from vllm.logger import init_logger from vllm.model_executor.custom_op import CustomOp +from vllm.model_executor.layers.attention import Attention from vllm.utils.math_utils import cdiv from vllm.utils.torch_utils import direct_register_custom_op from vllm.v1.attention.backend import ( diff --git a/vllm/model_executor/layers/mla.py b/vllm/model_executor/layers/mla.py index 2549f1221..9f10ca57c 100644 --- a/vllm/model_executor/layers/mla.py +++ b/vllm/model_executor/layers/mla.py @@ -4,9 +4,9 @@ from dataclasses import dataclass import torch -from 
vllm.attention.layer import MLAAttention from vllm.config import CacheConfig from vllm.model_executor.custom_op import PluggableLayer +from vllm.model_executor.layers.attention import MLAAttention from vllm.model_executor.layers.quantization import QuantizationConfig diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 8b1d564e2..5745cb547 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -19,12 +19,12 @@ from compressed_tensors.quantization import ( from compressed_tensors.transform import TransformConfig import vllm.envs as envs -from vllm.attention.layer import Attention from vllm.distributed import ( get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, ) from vllm.logger import init_logger +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.linear import ( LinearBase, diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 60600e1e3..b2c2a0cff 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -11,9 +11,9 @@ import vllm.envs as envs import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm import _custom_ops as ops from vllm._aiter_ops import rocm_aiter_ops -from vllm.attention.layer import Attention from vllm.distributed import get_tensor_model_parallel_world_size from vllm.logger import init_logger +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.batch_invariant import ( vllm_is_batch_invariant, ) diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index e10144ed1..3b59f76d3 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -11,8 +11,8 @@ from torch.nn.parameter import Parameter import vllm.envs as envs import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant -from vllm.attention.layer import Attention from vllm.logger import init_logger +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, FusedMoEQuantConfig, diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index eca7f61dc..4ade9bc05 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -7,9 +7,9 @@ import torch from torch.nn.parameter import Parameter from vllm import envs -from vllm.attention.layer import Attention from vllm.config import get_current_vllm_config from vllm.logger import init_logger +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.fused_moe import ( FusedMoE, FusedMoEConfig, diff --git a/vllm/model_executor/layers/quantization/petit.py b/vllm/model_executor/layers/quantization/petit.py index 5ccc73166..e97fac80f 100644 --- a/vllm/model_executor/layers/quantization/petit.py +++ b/vllm/model_executor/layers/quantization/petit.py @@ -8,8 +8,8 @@ import regex as re import torch from torch.nn.parameter 
import Parameter -from vllm.attention.layer import Attention from vllm.logger import init_logger +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import ( LinearBase, LinearMethodBase, diff --git a/vllm/model_executor/layers/quantization/ptpc_fp8.py b/vllm/model_executor/layers/quantization/ptpc_fp8.py index b97eddaff..1f433e07e 100644 --- a/vllm/model_executor/layers/quantization/ptpc_fp8.py +++ b/vllm/model_executor/layers/quantization/ptpc_fp8.py @@ -7,8 +7,8 @@ import torch from torch.nn.parameter import Parameter from vllm import _custom_ops as ops -from vllm.attention.layer import Attention from vllm.logger import init_logger +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import LinearBase, UnquantizedLinearMethod from vllm.model_executor.layers.quantization import QuantizationMethods from vllm.model_executor.layers.quantization.base_config import QuantizeMethodBase diff --git a/vllm/model_executor/layers/quantization/quark/quark.py b/vllm/model_executor/layers/quantization/quark/quark.py index 39bcd56bc..8fd7b875f 100644 --- a/vllm/model_executor/layers/quantization/quark/quark.py +++ b/vllm/model_executor/layers/quantization/quark/quark.py @@ -6,8 +6,8 @@ from typing import TYPE_CHECKING, Any, Optional, cast import torch -from vllm.attention.layer import Attention from vllm.logger import init_logger +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.linear import ( LinearBase, diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py index 1d67cb835..f3c972926 100644 --- a/vllm/model_executor/model_loader/utils.py +++ b/vllm/model_executor/model_loader/utils.py @@ -11,9 +11,9 @@ import torch from torch import nn from typing_extensions import assert_never -from vllm.attention.layer import Attention, MLAAttention from vllm.config import ModelConfig, VllmConfig, set_current_vllm_config from vllm.logger import init_logger +from vllm.model_executor.layers.attention import Attention, MLAAttention from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase, diff --git a/vllm/model_executor/models/afmoe.py b/vllm/model_executor/models/afmoe.py index 00605fdc6..a47fe4b7b 100644 --- a/vllm/model_executor/models/afmoe.py +++ b/vllm/model_executor/models/afmoe.py @@ -9,7 +9,6 @@ from itertools import islice import torch from torch import nn -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config from vllm.distributed import ( @@ -18,6 +17,7 @@ from vllm.distributed import ( get_tensor_model_parallel_world_size, ) from vllm.logger import init_logger +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.fused_moe.shared_fused_moe import SharedFusedMoE from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( diff --git a/vllm/model_executor/models/aimv2.py b/vllm/model_executor/models/aimv2.py index b802bb0ee..5b8ead4c7 100644 --- a/vllm/model_executor/models/aimv2.py +++ b/vllm/model_executor/models/aimv2.py @@ -11,7 +11,7 @@ import torch.nn as nn from vllm.distributed import get_tensor_model_parallel_world_size from vllm.distributed.utils import divide from vllm.model_executor.layers.activation import 
SiluAndMul -from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention +from vllm.model_executor.layers.attention import MMEncoderAttention from vllm.model_executor.layers.conv import Conv2dLayer from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( diff --git a/vllm/model_executor/models/apertus.py b/vllm/model_executor/models/apertus.py index 3ae501610..921d0cd3b 100644 --- a/vllm/model_executor/models/apertus.py +++ b/vllm/model_executor/models/apertus.py @@ -32,12 +32,12 @@ import torch from torch import nn from transformers import ApertusConfig -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import XIELU -from vllm.model_executor.layers.attention.encoder_only_attention import ( +from vllm.model_executor.layers.attention import ( + Attention, EncoderOnlyAttention, ) from vllm.model_executor.layers.layernorm import RMSNorm diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py index cf93d2eb6..031b6534f 100644 --- a/vllm/model_executor/models/arctic.py +++ b/vllm/model_executor/models/arctic.py @@ -8,7 +8,6 @@ from itertools import islice import torch from torch import nn -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import ( @@ -19,6 +18,7 @@ from vllm.distributed import ( ) from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.fused_moe import fused_experts, fused_topk from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index 1e0f27ec7..bc1cd2ed8 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -29,7 +29,6 @@ import torch from torch import nn from transformers import PretrainedConfig -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import ( @@ -38,6 +37,7 @@ from vllm.distributed import ( get_tensor_model_parallel_world_size, ) from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, diff --git a/vllm/model_executor/models/bailing_moe.py b/vllm/model_executor/models/bailing_moe.py index a8ee14aa0..fc10f790e 100644 --- a/vllm/model_executor/models/bailing_moe.py +++ b/vllm/model_executor/models/bailing_moe.py @@ -32,7 +32,6 @@ import torch.nn.functional as F from torch import nn from transformers.configuration_utils import PretrainedConfig -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import ( @@ -41,6 +40,7 @@ from vllm.distributed import ( get_tensor_model_parallel_world_size, ) from vllm.model_executor.layers.activation import SiluAndMul +from 
vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.fused_moe import SharedFusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/bamba.py b/vllm/model_executor/models/bamba.py
index 77f49eb65..d220b22dd 100644
--- a/vllm/model_executor/models/bamba.py
+++ b/vllm/model_executor/models/bamba.py
@@ -9,12 +9,12 @@ import torch
 from torch import nn
 from transformers import BambaConfig
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, ModelConfig, VllmConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.distributed.parallel_state import get_pp_group
 from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,
diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py
index 532175e72..0cdf4f70e 100644
--- a/vllm/model_executor/models/bert.py
+++ b/vllm/model_executor/models/bert.py
@@ -11,7 +11,7 @@ from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, ModelConfig, PoolerConfig, VllmConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.attention.encoder_only_attention import (
+from vllm.model_executor.layers.attention import (
     EncoderOnlyAttention,
 )
 from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/bert_with_rope.py b/vllm/model_executor/models/bert_with_rope.py
index f200f791c..22bcdeb45 100644
--- a/vllm/model_executor/models/bert_with_rope.py
+++ b/vllm/model_executor/models/bert_with_rope.py
@@ -15,7 +15,7 @@ from vllm.distributed import (
     tensor_model_parallel_all_reduce,
 )
 from vllm.model_executor.layers.activation import get_act_and_mul_fn, get_act_fn
-from vllm.model_executor.layers.attention.encoder_only_attention import (
+from vllm.model_executor.layers.attention import (
     EncoderOnlyAttention,
 )
 from vllm.model_executor.layers.fused_moe import activation_without_mul, fused_topk
diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py
index 9279cccd5..ac9ae49f0 100644
--- a/vllm/model_executor/models/blip.py
+++ b/vllm/model_executor/models/blip.py
@@ -11,7 +11,7 @@ from transformers import Blip2VisionConfig, BlipVisionConfig
 from vllm.distributed import divide, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
+from vllm.model_executor.layers.attention import MMEncoderAttention
 from vllm.model_executor.layers.conv import Conv2dLayer
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py
index c6056329f..233028a90 100644
--- a/vllm/model_executor/models/bloom.py
+++ b/vllm/model_executor/models/bloom.py
@@ -27,7 +27,6 @@ import torch
 from torch import nn
 from transformers import BloomConfig
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import (
@@ -36,6 +35,7 @@ from vllm.distributed import (
     get_tensor_model_parallel_world_size,
 )
 from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
     QKVParallelLinear,
diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py
index c6c48a821..c4b885cc9 100644
--- a/vllm/model_executor/models/chameleon.py
+++ b/vllm/model_executor/models/chameleon.py
@@ -16,12 +16,12 @@ from transformers import (
     ChameleonVQVAEConfig,
 )
-from vllm.attention.layer import Attention
 from vllm.config import CacheConfig, VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.conv import Conv2dLayer
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py
index ea4f87d97..f48e5dc1d 100644
--- a/vllm/model_executor/models/chatglm.py
+++ b/vllm/model_executor/models/chatglm.py
@@ -12,11 +12,11 @@ import torch
 from torch import nn
 from torch.nn import LayerNorm
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,
diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py
index 481f5ae6d..a6a303348 100644
--- a/vllm/model_executor/models/clip.py
+++ b/vllm/model_executor/models/clip.py
@@ -14,12 +14,11 @@ from transformers import (
     CLIPVisionConfig,
 )
-from vllm.attention.layer import Attention
 from vllm.config import VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
 from vllm.distributed import divide, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
+from vllm.model_executor.layers.attention import Attention, MMEncoderAttention
 from vllm.model_executor.layers.conv import Conv2dLayer
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py
index 609512078..e73dfb1f0 100644
--- a/vllm/model_executor/models/commandr.py
+++ b/vllm/model_executor/models/commandr.py
@@ -30,11 +30,11 @@ import torch
 from torch import nn
 from transformers import Cohere2Config, CohereConfig
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,
     QKVParallelLinear,
diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py
index 8cef4b428..ca6e6a49a 100644
--- a/vllm/model_executor/models/dbrx.py
+++ b/vllm/model_executor/models/dbrx.py
@@ -8,13 +8,13 @@ import torch
 import torch.nn as nn
 from transformers import DbrxConfig
-from vllm.attention.layer import Attention
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import (
     get_pp_group,
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
 )
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.linear import (
     QKVParallelLinear,
diff --git a/vllm/model_executor/models/deepencoder.py b/vllm/model_executor/models/deepencoder.py
index 46ce87b34..651ced896 100644
--- a/vllm/model_executor/models/deepencoder.py
+++ b/vllm/model_executor/models/deepencoder.py
@@ -18,7 +18,7 @@ import torch.nn as nn
 import torch.nn.functional as F
 from transformers import CLIPVisionConfig
-from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
+from vllm.model_executor.layers.attention import MMEncoderAttention
 from vllm.model_executor.layers.conv import Conv2dLayer
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py
index 5649f8bd1..f8907ed86 100644
--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -33,7 +33,6 @@ from torch import nn
 from transformers import DeepseekV2Config, DeepseekV3Config
 from vllm._aiter_ops import rocm_aiter_ops
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, ParallelConfig, VllmConfig, get_current_vllm_config
 from vllm.distributed import (
@@ -45,6 +44,7 @@ from vllm.distributed import (
 )
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
 from vllm.model_executor.layers.fused_moe import SharedFusedMoE
 from vllm.model_executor.layers.layernorm import LayerNorm, RMSNorm
diff --git a/vllm/model_executor/models/dots1.py b/vllm/model_executor/models/dots1.py
index b69d87f1e..4e3931454 100644
--- a/vllm/model_executor/models/dots1.py
+++ b/vllm/model_executor/models/dots1.py
@@ -32,7 +32,6 @@ import torch
 from torch import nn
 from transformers import Dots1Config
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, ModelConfig, VllmConfig
 from vllm.distributed import (
@@ -41,6 +40,7 @@ from vllm.distributed import (
     tensor_model_parallel_all_reduce,
 )
 from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.fused_moe import SharedFusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/dots_ocr.py b/vllm/model_executor/models/dots_ocr.py
index fa5a5cc7f..d2f39553d 100644
--- a/vllm/model_executor/models/dots_ocr.py
+++ b/vllm/model_executor/models/dots_ocr.py
@@ -16,7 +16,7 @@ from vllm.distributed.parallel_state import (
     get_tensor_model_parallel_world_size,
 )
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.attention.mm_encoder_attention import (
+from vllm.model_executor.layers.attention import (
     MMEncoderAttention,
 )
 from vllm.model_executor.layers.conv import Conv2dLayer
diff --git a/vllm/model_executor/models/ernie45_moe.py b/vllm/model_executor/models/ernie45_moe.py
index be153dbcf..452c7624d 100644
--- a/vllm/model_executor/models/ernie45_moe.py
+++ b/vllm/model_executor/models/ernie45_moe.py
@@ -32,7 +32,6 @@ import torch
 from torch import nn
 from transformers import PretrainedConfig
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config
 from vllm.distributed import (
@@ -42,6 +41,7 @@ from vllm.distributed import (
 )
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.fused_moe import SharedFusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/ernie45_vl.py b/vllm/model_executor/models/ernie45_vl.py
index 85c447c9b..db724d027 100644
--- a/vllm/model_executor/models/ernie45_vl.py
+++ b/vllm/model_executor/models/ernie45_vl.py
@@ -42,7 +42,7 @@ from vllm.distributed import parallel_state
 from vllm.distributed import utils as dist_utils
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import QuickGELU
-from vllm.model_executor.layers.attention.mm_encoder_attention import (
+from vllm.model_executor.layers.attention import (
     MMEncoderAttention,
 )
 from vllm.model_executor.layers.layernorm import RMSNorm
diff --git a/vllm/model_executor/models/ernie45_vl_moe.py b/vllm/model_executor/models/ernie45_vl_moe.py
index 63da84d69..9d3cbbecf 100644
--- a/vllm/model_executor/models/ernie45_vl_moe.py
+++ b/vllm/model_executor/models/ernie45_vl_moe.py
@@ -31,12 +31,11 @@ import torch
 from torch import nn
 from transformers import PretrainedConfig
-from vllm.attention.layer import Attention
-
 # from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.logger import init_logger
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.fused_moe import SharedFusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py
index 6cafbfb57..b633fd285 100644
--- a/vllm/model_executor/models/exaone.py
+++ b/vllm/model_executor/models/exaone.py
@@ -32,11 +32,11 @@ import torch
 from torch import nn
 from transformers import PretrainedConfig
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,
diff --git a/vllm/model_executor/models/exaone4.py b/vllm/model_executor/models/exaone4.py
index 5b2ef9082..485b145b9 100644
--- a/vllm/model_executor/models/exaone4.py
+++ b/vllm/model_executor/models/exaone4.py
@@ -28,11 +28,11 @@ import torch
 from torch import nn
 from transformers import Exaone4Config
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,
diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py
index 77cb68c4c..dc636274a 100644
--- a/vllm/model_executor/models/falcon.py
+++ b/vllm/model_executor/models/falcon.py
@@ -30,7 +30,6 @@ from torch import nn
 from torch.nn import LayerNorm
 from transformers import FalconConfig as HF_FalconConfig
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import (
@@ -40,6 +39,7 @@ from vllm.distributed import (
     tensor_model_parallel_all_reduce,
 )
 from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
     QKVParallelLinear,
diff --git a/vllm/model_executor/models/falcon_h1.py b/vllm/model_executor/models/falcon_h1.py
index 582f1d244..3d4d253c3 100644
--- a/vllm/model_executor/models/falcon_h1.py
+++ b/vllm/model_executor/models/falcon_h1.py
@@ -9,12 +9,12 @@ import torch
 from torch import nn
 from transformers import FalconH1Config
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, ModelConfig, VllmConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.distributed.parallel_state import get_pp_group
 from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,
diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py
index 4d016f286..b3ae5f5ac 100644
--- a/vllm/model_executor/models/gemma.py
+++ b/vllm/model_executor/models/gemma.py
@@ -26,12 +26,12 @@ import torch
 from torch import nn
 from transformers import GemmaConfig
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import GeluAndMul
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.layernorm import GemmaRMSNorm
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,
diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py
index 6d946522f..303f04b64 100644
--- a/vllm/model_executor/models/gemma2.py
+++ b/vllm/model_executor/models/gemma2.py
@@ -23,12 +23,12 @@ import torch
 from torch import nn
 from transformers import Gemma2Config
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import GeluAndMul
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.layernorm import GemmaRMSNorm
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,
diff --git a/vllm/model_executor/models/gemma3.py b/vllm/model_executor/models/gemma3.py
index 502fe6b82..b2352a3c9 100644
--- a/vllm/model_executor/models/gemma3.py
+++ b/vllm/model_executor/models/gemma3.py
@@ -22,13 +22,13 @@ import torch
 from torch import nn
 from transformers import Gemma3TextConfig
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import GeluAndMul
-from vllm.model_executor.layers.attention.encoder_only_attention import (
+from vllm.model_executor.layers.attention import (
+    Attention,
     EncoderOnlyAttention,
 )
 from vllm.model_executor.layers.layernorm import GemmaRMSNorm
diff --git a/vllm/model_executor/models/gemma3n.py b/vllm/model_executor/models/gemma3n.py
index bdbb3c91e..770424ba0 100644
--- a/vllm/model_executor/models/gemma3n.py
+++ b/vllm/model_executor/models/gemma3n.py
@@ -21,7 +21,6 @@ import torch
 from torch import nn
 from transformers.models.gemma3n.configuration_gemma3n import Gemma3nTextConfig
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
@@ -32,6 +31,7 @@ from vllm.model_executor.layers.activation import (
     GeluAndMul,
     GeluAndMulSparse,
 )
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
diff --git a/vllm/model_executor/models/glm4.py b/vllm/model_executor/models/glm4.py
index 3e55df4ff..89447927d 100644
--- a/vllm/model_executor/models/glm4.py
+++ b/vllm/model_executor/models/glm4.py
@@ -29,10 +29,10 @@ import torch
 from torch import nn
 from transformers import Glm4Config
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import QKVParallelLinear, RowParallelLinear
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py
index a120b106f..a081641be 100644
--- a/vllm/model_executor/models/glm4_1v.py
+++ b/vllm/model_executor/models/glm4_1v.py
@@ -52,7 +52,7 @@ from vllm.config.multimodal import BaseDummyOptions, VideoDummyOptions
 from vllm.distributed import get_tensor_model_parallel_world_size, parallel_state
 from vllm.distributed import utils as dist_utils
 from vllm.logger import init_logger
-from vllm.model_executor.layers.attention.mm_encoder_attention import (
+from vllm.model_executor.layers.attention import (
     MMEncoderAttention,
 )
 from vllm.model_executor.layers.conv import Conv2dLayer, Conv3dLayer
diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py
index 4c60cd460..d0e6cb6ad 100644
--- a/vllm/model_executor/models/glm4_moe.py
+++ b/vllm/model_executor/models/glm4_moe.py
@@ -32,7 +32,6 @@ import torch
 from torch import nn
 from transformers.models.glm4_moe import Glm4MoeConfig
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config
 from vllm.distributed import (
@@ -42,6 +41,7 @@ from vllm.distributed import (
 )
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.fused_moe import SharedFusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/glm4v.py b/vllm/model_executor/models/glm4v.py
index 59ba0fccc..1ff346d0e 100644
--- a/vllm/model_executor/models/glm4v.py
+++ b/vllm/model_executor/models/glm4v.py
@@ -23,7 +23,7 @@ from vllm.config import VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul, get_act_fn
-from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
+from vllm.model_executor.layers.attention import MMEncoderAttention
 from vllm.model_executor.layers.conv import Conv2dLayer
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
diff --git a/vllm/model_executor/models/glmasr.py b/vllm/model_executor/models/glmasr.py
index a14aa47ed..bc70e0ad2 100644
--- a/vllm/model_executor/models/glmasr.py
+++ b/vllm/model_executor/models/glmasr.py
@@ -16,7 +16,7 @@ from vllm.config.multimodal import BaseDummyOptions
 from vllm.distributed.parallel_state import get_tensor_model_parallel_world_size
 from vllm.inputs.data import PromptType
 from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
+from vllm.model_executor.layers.attention import MMEncoderAttention
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
     QKVParallelLinear,
diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py
index 4026d69cd..41a4ca174 100644
--- a/vllm/model_executor/models/gpt2.py
+++ b/vllm/model_executor/models/gpt2.py
@@ -27,7 +27,6 @@ import torch
 from torch import nn
 from transformers import GPT2Config
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed.parallel_state import (
@@ -35,6 +34,7 @@ from vllm.distributed.parallel_state import (
     get_tensor_model_parallel_world_size,
 )
 from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
     QKVParallelLinear,
diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py
index 8af41d004..c6629c937 100644
--- a/vllm/model_executor/models/gpt_bigcode.py
+++ b/vllm/model_executor/models/gpt_bigcode.py
@@ -28,11 +28,11 @@ import torch
 from torch import nn
 from transformers import GPTBigCodeConfig
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
     QKVParallelLinear,
diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py
index 366f0fd90..c29103c6d 100644
--- a/vllm/model_executor/models/gpt_j.py
+++ b/vllm/model_executor/models/gpt_j.py
@@ -26,11 +26,11 @@ import torch
 from torch import nn
 from transformers import GPTJConfig
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
     QKVParallelLinear,
diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py
index 764a801db..8d44d12fc 100644
--- a/vllm/model_executor/models/gpt_neox.py
+++ b/vllm/model_executor/models/gpt_neox.py
@@ -26,11 +26,11 @@ import torch
 from torch import nn
 from transformers import GPTNeoXConfig
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
     QKVParallelLinear,
diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py
index acaf099ed..b273880ce 100644
--- a/vllm/model_executor/models/gpt_oss.py
+++ b/vllm/model_executor/models/gpt_oss.py
@@ -7,7 +7,6 @@ import torch.distributed as dist
 from torch import nn
 from transformers import GptOssConfig
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import (
@@ -19,6 +18,7 @@ from vllm.distributed import (
     get_tensor_model_parallel_world_size,
     tensor_model_parallel_all_gather,
 )
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.fused_moe.config import FusedMoEParallelConfig
 from vllm.model_executor.layers.layernorm import RMSNorm
diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py
index 53c3230b2..4b486ede4 100644
--- a/vllm/model_executor/models/granite.py
+++ b/vllm/model_executor/models/granite.py
@@ -31,11 +31,11 @@ import torch
 from torch import nn
 from transformers import GraniteConfig
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,
diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py
index 33446e744..171b2e0ec 100644
--- a/vllm/model_executor/models/granitemoe.py
+++ b/vllm/model_executor/models/granitemoe.py
@@ -31,7 +31,6 @@ from typing import Any
 import torch
 from torch import nn
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import (
@@ -39,6 +38,7 @@ from vllm.distributed import (
     get_tensor_model_parallel_world_size,
     tensor_model_parallel_all_gather,
 )
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/granitemoehybrid.py b/vllm/model_executor/models/granitemoehybrid.py
index b6a3c4872..500ef1a1d 100644
--- a/vllm/model_executor/models/granitemoehybrid.py
+++ b/vllm/model_executor/models/granitemoehybrid.py
@@ -9,11 +9,11 @@ import torch
 from torch import nn
 from transformers import GraniteMoeHybridConfig
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, ModelConfig, VllmConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.distributed.parallel_state import get_pp_group
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import QKVParallelLinear, RowParallelLinear
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
diff --git a/vllm/model_executor/models/grok1.py b/vllm/model_executor/models/grok1.py
index 49bdc0241..e2943b797 100644
--- a/vllm/model_executor/models/grok1.py
+++ b/vllm/model_executor/models/grok1.py
@@ -32,12 +32,12 @@ import torch
 import torch.nn.functional as F
 from torch import nn
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import GeluAndMul
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/hunyuan_v1.py b/vllm/model_executor/models/hunyuan_v1.py
index df507a234..d9362e1dd 100644
--- a/vllm/model_executor/models/hunyuan_v1.py
+++ b/vllm/model_executor/models/hunyuan_v1.py
@@ -33,7 +33,6 @@ import torch
 from torch import nn
 from transformers import PretrainedConfig
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config
 from vllm.distributed import (
@@ -43,6 +42,7 @@ from vllm.distributed import (
     tensor_model_parallel_all_reduce,
 )
 from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.fused_moe import SharedFusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/hunyuan_vision.py b/vllm/model_executor/models/hunyuan_vision.py
index 9214f47c7..5381b08b0 100644
--- a/vllm/model_executor/models/hunyuan_vision.py
+++ b/vllm/model_executor/models/hunyuan_vision.py
@@ -39,7 +39,7 @@ from vllm.distributed import parallel_state
 from vllm.distributed import utils as dist_utils
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
+from vllm.model_executor.layers.attention import MMEncoderAttention
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
diff --git a/vllm/model_executor/models/idefics2_vision_model.py b/vllm/model_executor/models/idefics2_vision_model.py
index c78ad6479..441aabd7e 100644
--- a/vllm/model_executor/models/idefics2_vision_model.py
+++ b/vllm/model_executor/models/idefics2_vision_model.py
@@ -29,7 +29,7 @@ from transformers.models.idefics2.configuration_idefics2 import (
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
+from vllm.model_executor.layers.attention import MMEncoderAttention
 from vllm.model_executor.layers.conv import Conv2dLayer
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py
index 3e705defc..fa5efc808 100644
--- a/vllm/model_executor/models/intern_vit.py
+++ b/vllm/model_executor/models/intern_vit.py
@@ -23,7 +23,7 @@ from vllm.distributed import (
     tensor_model_parallel_all_gather,
 )
 from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
+from vllm.model_executor.layers.attention import MMEncoderAttention
 from vllm.model_executor.layers.conv import Conv2dLayer
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py
index 60db4c4c6..c00b9a0ee 100644
--- a/vllm/model_executor/models/internlm2.py
+++ b/vllm/model_executor/models/internlm2.py
@@ -10,7 +10,6 @@ import torch
 from torch import nn
 from transformers import PretrainedConfig
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import (
@@ -21,6 +20,7 @@ from vllm.distributed import (
     tensor_model_parallel_all_gather,
 )
 from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,
diff --git a/vllm/model_executor/models/interns1_vit.py b/vllm/model_executor/models/interns1_vit.py
index 2b2866d67..195bb9681 100644
--- a/vllm/model_executor/models/interns1_vit.py
+++ b/vllm/model_executor/models/interns1_vit.py
@@ -15,7 +15,7 @@ from transformers import PretrainedConfig
 from transformers.utils import torch_int
 from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
+from vllm.model_executor.layers.attention import MMEncoderAttention
 from vllm.model_executor.layers.conv import Conv2dLayer
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelLinear
diff --git a/vllm/model_executor/models/iquest_loopcoder.py b/vllm/model_executor/models/iquest_loopcoder.py
index 9dd6a08d1..24c004ff4 100644
--- a/vllm/model_executor/models/iquest_loopcoder.py
+++ b/vllm/model_executor/models/iquest_loopcoder.py
@@ -24,10 +24,10 @@ import torch
 from torch import nn
 from transformers import PretrainedConfig
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
diff --git a/vllm/model_executor/models/isaac.py b/vllm/model_executor/models/isaac.py
index 704ade320..c0e4a1932 100644
--- a/vllm/model_executor/models/isaac.py
+++ b/vllm/model_executor/models/isaac.py
@@ -20,7 +20,7 @@ from vllm.config import VllmConfig
 from vllm.config.model import ModelConfig
 from vllm.distributed import parallel_state
 from vllm.distributed import utils as dist_utils
-from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
+from vllm.model_executor.layers.attention import MMEncoderAttention
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
     QKVParallelLinear,
diff --git a/vllm/model_executor/models/jais.py b/vllm/model_executor/models/jais.py
index 2173b7e4a..5685acd75 100644
--- a/vllm/model_executor/models/jais.py
+++ b/vllm/model_executor/models/jais.py
@@ -28,7 +28,6 @@ from itertools import islice
 import torch
 from torch import nn
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import (
@@ -36,6 +35,7 @@ from vllm.distributed import (
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
 )
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
     QKVParallelLinear,
diff --git a/vllm/model_executor/models/jais2.py b/vllm/model_executor/models/jais2.py
index 265a57db5..ea06ee1b1 100644
--- a/vllm/model_executor/models/jais2.py
+++ b/vllm/model_executor/models/jais2.py
@@ -31,7 +31,6 @@ import torch
 from torch import nn
 from transformers import Jais2Config
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import (
@@ -39,6 +38,7 @@ from vllm.distributed import (
     get_tensor_model_parallel_world_size,
 )
 from vllm.model_executor.layers.activation import ReLUSquaredActivation
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
     QKVParallelLinear,
diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py
index 27f14374c..980bcffb5 100644
--- a/vllm/model_executor/models/jamba.py
+++ b/vllm/model_executor/models/jamba.py
@@ -9,11 +9,11 @@ import torch
 from torch import nn
 from transformers import JambaConfig
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, ModelConfig, VllmConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.distributed.parallel_state import get_pp_group
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/keye.py b/vllm/model_executor/models/keye.py
index b32f2762e..d37b43102 100644
--- a/vllm/model_executor/models/keye.py
+++ b/vllm/model_executor/models/keye.py
@@ -20,7 +20,7 @@ from vllm.config import VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.logger import init_logger
-from vllm.model_executor.layers.attention.mm_encoder_attention import (
+from vllm.model_executor.layers.attention import (
     MMEncoderAttention,
 )
 from vllm.model_executor.layers.conv import Conv2dLayer
diff --git a/vllm/model_executor/models/lfm2.py b/vllm/model_executor/models/lfm2.py
index ba5e80ac7..fa611ad50 100644
--- a/vllm/model_executor/models/lfm2.py
+++ b/vllm/model_executor/models/lfm2.py
@@ -7,11 +7,11 @@ import torch
 import torch.nn as nn
 from transformers import Lfm2Config
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, ModelConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,
diff --git a/vllm/model_executor/models/lfm2_moe.py b/vllm/model_executor/models/lfm2_moe.py
index 6d786276a..293471bba 100644
--- a/vllm/model_executor/models/lfm2_moe.py
+++ b/vllm/model_executor/models/lfm2_moe.py
@@ -6,7 +6,6 @@ from itertools import islice
 import torch
 import torch.nn as nn
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, ModelConfig, VllmConfig, get_current_vllm_config
 from vllm.distributed import (
@@ -15,6 +14,7 @@ from vllm.distributed import (
     get_tensor_model_parallel_world_size,
 )
 from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/lfm2_siglip2.py b/vllm/model_executor/models/lfm2_siglip2.py
index 960f119a5..d58e2ad85 100644
--- a/vllm/model_executor/models/lfm2_siglip2.py
+++ b/vllm/model_executor/models/lfm2_siglip2.py
@@ -13,7 +13,7 @@ from transformers import Siglip2VisionConfig
 from vllm.compilation.decorators import support_torch_compile
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
+from vllm.model_executor.layers.attention import MMEncoderAttention
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
     QKVParallelLinear,
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index 29cbea187..16d3cf88a 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -31,12 +31,12 @@ import torch
 from torch import nn
 from transformers import LlamaConfig
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.attention.encoder_only_attention import (
+from vllm.model_executor.layers.attention import (
+    Attention,
     EncoderOnlyAttention,
 )
 from vllm.model_executor.layers.layernorm import RMSNorm
diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py
index 477775514..0cdb4989e 100644
--- a/vllm/model_executor/models/llama4.py
+++ b/vllm/model_executor/models/llama4.py
@@ -24,7 +24,6 @@ import torch
 from torch import nn
 from transformers import Llama4TextConfig
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import (
@@ -33,7 +32,8 @@ from vllm.distributed import (
     tensor_model_parallel_all_gather,
 )
 from vllm.logger import init_logger
-from vllm.model_executor.layers.attention.chunked_local_attention import (
+from vllm.model_executor.layers.attention import (
+    Attention,
     ChunkedLocalAttention,
 )
 from vllm.model_executor.layers.fused_moe import SharedFusedMoE
diff --git a/vllm/model_executor/models/mimo_v2_flash.py b/vllm/model_executor/models/mimo_v2_flash.py
index 9d8c26e46..f7640746a 100644
--- a/vllm/model_executor/models/mimo_v2_flash.py
+++ b/vllm/model_executor/models/mimo_v2_flash.py
@@ -6,7 +6,6 @@ from itertools import islice
 import torch
 from torch import nn
-from vllm.attention.layer import Attention
 from vllm.config import (
     CacheConfig,
     VllmConfig,
@@ -22,6 +21,7 @@ from vllm.distributed import (
 )
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py
index 023d08691..4217d119a 100644
--- a/vllm/model_executor/models/minicpm.py
+++ b/vllm/model_executor/models/minicpm.py
@@ -33,7 +33,6 @@ import torch
 from torch import nn
 from transformers import PretrainedConfig
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import (
@@ -43,6 +42,7 @@ from vllm.distributed import (
     tensor_model_parallel_all_reduce,
 )
 from vllm.model_executor.layers.activation import FatreluAndMul, SiluAndMul
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.fused_moe import fused_experts, fused_topk
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/minicpm3.py b/vllm/model_executor/models/minicpm3.py
index c7a54cea2..e61e9d061 100644
--- a/vllm/model_executor/models/minicpm3.py
+++ b/vllm/model_executor/models/minicpm3.py
@@ -29,9 +29,9 @@ import torch
 from torch import nn
 from transformers import PretrainedConfig
-from vllm.attention.layer import Attention
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
diff --git a/vllm/model_executor/models/minimax_m2.py b/vllm/model_executor/models/minimax_m2.py
index be5f0b921..7583be200 100644
--- a/vllm/model_executor/models/minimax_m2.py
+++ b/vllm/model_executor/models/minimax_m2.py
@@ -30,7 +30,6 @@ import torch
 from torch import nn
 from transformers import PretrainedConfig
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, ModelConfig, VllmConfig
 from vllm.distributed import (
@@ -38,6 +37,7 @@ from vllm.distributed import (
     get_tensor_model_parallel_world_size,
     tensor_model_parallel_all_reduce,
 )
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/minimax_text_01.py b/vllm/model_executor/models/minimax_text_01.py
index b91321aed..a7785bcfc 100644
--- a/vllm/model_executor/models/minimax_text_01.py
+++ b/vllm/model_executor/models/minimax_text_01.py
@@ -14,7 +14,6 @@ import torch
 from torch import nn
 from transformers import MiniMaxConfig
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, ModelConfig, VllmConfig
 from vllm.distributed.parallel_state import (
@@ -24,6 +23,7 @@ from vllm.distributed.parallel_state import (
 )
 from vllm.forward_context import get_forward_context
 from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py
index 8d1197800..376fd7a17 100644
--- a/vllm/model_executor/models/mixtral.py
+++ b/vllm/model_executor/models/mixtral.py
@@ -32,7 +32,6 @@ import torch
 from torch import nn
 from transformers import MixtralConfig
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config
 from vllm.distributed import (
@@ -40,6 +39,7 @@ from vllm.distributed import (
     get_pp_group,
     get_tensor_model_parallel_world_size,
 )
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py
index be34e436a..9de43f1e1 100644
--- a/vllm/model_executor/models/mllama4.py
+++ b/vllm/model_executor/models/mllama4.py
@@ -36,7 +36,7 @@ from vllm.config import VllmConfig, set_current_vllm_config
 from vllm.config.multimodal import BaseDummyOptions
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.forward_context import set_forward_context
-from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
+from vllm.model_executor.layers.attention import MMEncoderAttention
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
diff --git a/vllm/model_executor/models/modernbert.py b/vllm/model_executor/models/modernbert.py
index a8119b046..a29b1a9fb 100644
--- a/vllm/model_executor/models/modernbert.py
+++ b/vllm/model_executor/models/modernbert.py
@@ -10,7 +10,7 @@ from transformers.activations import ACT2FN
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import ModelConfig, VllmConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
-from vllm.model_executor.layers.attention.encoder_only_attention import (
+from vllm.model_executor.layers.attention import (
     EncoderOnlyAttention,
 )
 from vllm.model_executor.layers.linear import QKVParallelLinear, RowParallelLinear
diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py
index 7ba1382e3..7ea06fd85 100644
--- a/vllm/model_executor/models/molmo.py
+++ b/vllm/model_executor/models/molmo.py
@@ -17,7 +17,6 @@ from transformers import BatchFeature, PretrainedConfig, ProcessorMixin, TensorT
 from transformers.image_utils import ImageInput
 from transformers.tokenization_utils_base import TextInput
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
@@ -29,7 +28,7 @@ from vllm.distributed import (
     tensor_model_parallel_all_gather,
 )
 from vllm.model_executor.layers.activation import MulAndSilu, QuickGELU, SiluAndMul
-from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
+from vllm.model_executor.layers.attention import Attention, MMEncoderAttention
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
diff --git a/vllm/model_executor/models/molmo2.py b/vllm/model_executor/models/molmo2.py
index b0a75c18b..6ded8e08c 100644
--- a/vllm/model_executor/models/molmo2.py
+++ b/vllm/model_executor/models/molmo2.py
@@ -23,7 +23,6 @@ from transformers.image_utils import ImageInput
 from transformers.tokenization_utils_base import TextInput
 from transformers.video_utils import VideoInput, VideoMetadata
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.config.multimodal import BaseDummyOptions, VideoDummyOptions
@@ -36,7 +35,7 @@ from vllm.distributed import (
 )
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import MulAndSilu, SiluAndMul, get_act_fn
-from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
+from vllm.model_executor.layers.attention import Attention, MMEncoderAttention
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
diff --git a/vllm/model_executor/models/moonvit.py b/vllm/model_executor/models/moonvit.py
index 823a8c0e7..8c6998656 100644
--- a/vllm/model_executor/models/moonvit.py
+++ b/vllm/model_executor/models/moonvit.py
@@ -53,7 +53,7 @@ from transformers.activations import ACT2FN
 from transformers.modeling_utils import PreTrainedModel
 from vllm.distributed import divide, get_tensor_model_parallel_world_size
-from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
+from vllm.model_executor.layers.attention import MMEncoderAttention
 from vllm.model_executor.layers.conv import Conv2dLayer
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py
index 5d039f7b4..85933626c 100644
--- a/vllm/model_executor/models/mpt.py
+++ b/vllm/model_executor/models/mpt.py
@@ -10,7 +10,6 @@ import torch
 import torch.nn as nn
 from transformers import MptConfig
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import (
@@ -19,6 +18,7 @@ from vllm.distributed import (
     get_tensor_model_parallel_world_size,
 )
 from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
     QKVParallelLinear,
diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py
index c416cbb15..7689e9c60 100644
--- a/vllm/model_executor/models/nemotron.py
+++ b/vllm/model_executor/models/nemotron.py
@@ -30,11 +30,11 @@ from itertools import islice
 import torch
 from torch import nn
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
     QKVParallelLinear,
diff --git a/vllm/model_executor/models/nemotron_h.py b/vllm/model_executor/models/nemotron_h.py
index 999949fa1..e33bbe9fa 100644
--- a/vllm/model_executor/models/nemotron_h.py
+++ b/vllm/model_executor/models/nemotron_h.py
@@ -25,7 +25,6 @@ from itertools import islice
 import torch
 from torch import nn
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, ModelConfig, VllmConfig
 from vllm.config.parallel import ParallelConfig
@@ -33,6 +32,7 @@ from vllm.distributed import get_ep_group, get_tensor_model_parallel_world_size
 from vllm.distributed.communication_op import tensor_model_parallel_all_gather
 from vllm.distributed.parallel_state import get_pp_group
 from vllm.model_executor.layers.activation import ReLUSquaredActivation
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.fused_moe import FusedMoE, SharedFusedMoE
 from vllm.model_executor.layers.fused_moe.utils import activation_without_mul
 from vllm.model_executor.layers.layernorm import RMSNorm
diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py
index 3ed316dbe..4491a6a3e 100644
--- a/vllm/model_executor/models/olmo.py
+++ b/vllm/model_executor/models/olmo.py
@@ -31,11 +31,11 @@ import torch
 from torch import nn
 from transformers import OlmoConfig
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,
     QKVParallelLinear,
diff --git a/vllm/model_executor/models/olmo2.py b/vllm/model_executor/models/olmo2.py
index 5019cd787..1de5a12fd 100644
--- a/vllm/model_executor/models/olmo2.py
+++ b/vllm/model_executor/models/olmo2.py
@@ -32,7 +32,6 @@ import torch
 from torch import nn
 from transformers import Olmo2Config
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
@@ -40,6 +39,7 @@ from vllm.distributed.communication_op import tensor_model_parallel_all_gather
 from vllm.distributed.parallel_state import get_tensor_model_parallel_rank
 from vllm.distributed.utils import split_tensor_along_last_dim
 from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,
diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py
index d9695f8f2..f0afe0e99 100644
--- a/vllm/model_executor/models/olmoe.py
+++ b/vllm/model_executor/models/olmoe.py
@@ -21,7 +21,6 @@ from itertools import islice
 import torch
 from torch import nn
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import VllmConfig
 from vllm.distributed import (
@@ -32,6 +31,7 @@ from vllm.distributed import (
 )
 from vllm.distributed.utils import split_tensor_along_last_dim
 from vllm.logger import init_logger
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/openpangu.py b/vllm/model_executor/models/openpangu.py
index 982c4e1fc..5eba82e2c 100644
--- a/vllm/model_executor/models/openpangu.py
+++ b/vllm/model_executor/models/openpangu.py
@@ -29,7 +29,6 @@ import torch
 from torch import nn
 from transformers import PretrainedConfig
-from vllm.attention.layer import Attention, AttentionType
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, ParallelConfig, VllmConfig
 from vllm.distributed import (
@@ -41,7 +40,8 @@ from vllm.distributed import (
     tensor_model_parallel_all_gather,
 )
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.attention.static_sink_attention import (
+from vllm.model_executor.layers.attention import (
+    Attention,
     StaticSinkAttention,
 )
 from vllm.model_executor.layers.fused_moe import SharedFusedMoE
@@ -84,6 +84,7 @@ from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.config import set_default_rope_theta
+from vllm.v1.attention.backend import AttentionType
 from vllm.v1.attention.backends.flash_attn_diffkv import FlashAttentionDiffKVBackend
diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py
index 4c64b5771..81653b951 100644
--- a/vllm/model_executor/models/opt.py
+++ b/vllm/model_executor/models/opt.py
@@ -27,11 +27,11 @@ import torch
 from torch import nn
 from transformers import OPTConfig
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
     QKVParallelLinear,
diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py
index 7d5a36a97..3cacb9d61 100644
--- a/vllm/model_executor/models/orion.py
+++ b/vllm/model_executor/models/orion.py
@@ -15,11 +15,11 @@ import torch
 from torch import nn
 from transformers import PretrainedConfig
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,
     QKVParallelLinear,
diff --git a/vllm/model_executor/models/ouro.py b/vllm/model_executor/models/ouro.py
index a9476645a..56505ec7b 100644
--- a/vllm/model_executor/models/ouro.py
+++ b/vllm/model_executor/models/ouro.py
@@ -33,11 +33,11 @@ import torch
 from torch import nn
 from transformers import PretrainedConfig
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,
diff --git a/vllm/model_executor/models/paddleocr_vl.py b/vllm/model_executor/models/paddleocr_vl.py
index d8e66dc10..1728e8ef6 100644
--- a/vllm/model_executor/models/paddleocr_vl.py
+++ b/vllm/model_executor/models/paddleocr_vl.py
@@ -34,7 +34,7 @@ from vllm.config import VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
 from vllm.distributed import parallel_state
 from vllm.distributed import utils as dist_utils
-from vllm.model_executor.layers.attention.mm_encoder_attention import (
+from vllm.model_executor.layers.attention import (
     MMEncoderAttention,
 )
 from vllm.model_executor.layers.conv import Conv2dLayer
diff --git a/vllm/model_executor/models/persimmon.py b/vllm/model_executor/models/persimmon.py
index c7829476e..a03a78557 100644
--- a/vllm/model_executor/models/persimmon.py
+++ b/vllm/model_executor/models/persimmon.py
@@ -30,11 +30,11 @@ import torch
 from torch import nn
 from transformers import PersimmonConfig
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
     QKVParallelLinear,
diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py
index bf1e13614..75c42c0d3 100644
--- a/vllm/model_executor/models/phi.py
+++ b/vllm/model_executor/models/phi.py
@@ -45,11 +45,11 @@ import torch
 from torch import nn
 from transformers import PhiConfig
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
     QKVParallelLinear,
diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py
index 19c7cecda..0b55b7ec8 100644
--- a/vllm/model_executor/models/phimoe.py
+++ b/vllm/model_executor/models/phimoe.py
@@ -31,10 +31,10 @@ import torch
 from torch import nn
 from transformers.configuration_utils import PretrainedConfig
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.linear import (
     QKVParallelLinear,
diff --git a/vllm/model_executor/models/plamo2.py b/vllm/model_executor/models/plamo2.py
index 82833dddc..2bc89cc23 100644
--- a/vllm/model_executor/models/plamo2.py
+++ b/vllm/model_executor/models/plamo2.py
@@ -9,7 +9,6 @@ import torch
 from torch import nn
 from transformers import PretrainedConfig
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import VllmConfig, get_current_vllm_config
 from vllm.distributed import divide, get_tensor_model_parallel_world_size
@@ -17,6 +16,7 @@ from vllm.distributed.parallel_state import get_pp_group
 from vllm.forward_context import ForwardContext, get_forward_context
 from vllm.model_executor.custom_op import CustomOp
 from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
diff --git a/vllm/model_executor/models/plamo3.py b/vllm/model_executor/models/plamo3.py
index df1300ac1..4ba51898d 100644
--- a/vllm/model_executor/models/plamo3.py
+++ b/vllm/model_executor/models/plamo3.py
@@ -10,12 +10,12 @@ import torch
 from torch import nn
 from transformers import PretrainedConfig
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import VllmConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.distributed.parallel_state import get_pp_group
 from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,
diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py
index 906395260..b4526beac 100644
--- a/vllm/model_executor/models/qwen.py
+++ b/vllm/model_executor/models/qwen.py
@@ -16,11 +16,11 @@ import torch
 from torch import nn
 from transformers import PretrainedConfig
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,
diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py
index 99bddeec2..ccddc6e81 100644
--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -33,12 +33,12 @@ import torch
 from torch import nn
 from transformers import Qwen2Config
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.attention.encoder_only_attention import (
+from vllm.model_executor.layers.attention import (
+    Attention,
     EncoderOnlyAttention,
 )
 from vllm.model_executor.layers.layernorm import RMSNorm
diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py
index c0fb1f13b..0310c5415 100644
--- a/vllm/model_executor/models/qwen2_5_vl.py
+++ b/vllm/model_executor/models/qwen2_5_vl.py
@@ -49,7 +49,7 @@ from vllm.distributed import utils as dist_utils
 from vllm.forward_context import set_forward_context
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import get_act_and_mul_fn
-from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
+from vllm.model_executor.layers.attention import MMEncoderAttention
 from vllm.model_executor.layers.conv import Conv3dLayer
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py
index 04330a868..4b0c75616 100644
--- a/vllm/model_executor/models/qwen2_moe.py
+++ b/vllm/model_executor/models/qwen2_moe.py
@@ -34,12 +34,12 @@ import torch.nn.functional as F
 from torch import nn
 from transformers import Qwen2MoeConfig
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.fused_moe import SharedFusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index 847501c01..61ff54abd 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -49,7 +49,7 @@ from vllm.distributed import parallel_state, tensor_model_parallel_all_gather
 from vllm.distributed import utils as dist_utils
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import QuickGELU
-from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
+from vllm.model_executor.layers.attention import MMEncoderAttention
 from vllm.model_executor.layers.conv import Conv3dLayer
from vllm.model_executor.layers.linear import ( ColumnParallelLinear, diff --git a/vllm/model_executor/models/qwen3.py b/vllm/model_executor/models/qwen3.py index 5757739c1..06df05144 100644 --- a/vllm/model_executor/models/qwen3.py +++ b/vllm/model_executor/models/qwen3.py @@ -30,11 +30,11 @@ import torch from torch import nn from transformers import Qwen3Config -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.logger import init_logger +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import QKVParallelLinear, RowParallelLinear from vllm.model_executor.layers.logits_processor import LogitsProcessor diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py index 690d5368a..2f95f4141 100644 --- a/vllm/model_executor/models/qwen3_moe.py +++ b/vllm/model_executor/models/qwen3_moe.py @@ -32,7 +32,6 @@ import torch import torch.nn.functional as F from torch import nn -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config from vllm.distributed import ( @@ -43,6 +42,7 @@ from vllm.distributed import ( ) from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.fused_moe import SharedFusedMoE from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py index 3e89d1972..503b40702 100644 --- a/vllm/model_executor/models/qwen3_next.py +++ b/vllm/model_executor/models/qwen3_next.py @@ -10,7 +10,6 @@ from einops import rearrange from torch import nn from transformers.activations import ACT2FN -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import ( CacheConfig, @@ -29,6 +28,7 @@ from vllm.distributed import ( ) from vllm.forward_context import ForwardContext, get_forward_context from vllm.logger import init_logger +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.fla.ops import ( chunk_gated_delta_rule, fused_recurrent_gated_delta_rule, diff --git a/vllm/model_executor/models/seed_oss.py b/vllm/model_executor/models/seed_oss.py index e38cf9277..d90174911 100644 --- a/vllm/model_executor/models/seed_oss.py +++ b/vllm/model_executor/models/seed_oss.py @@ -30,12 +30,12 @@ import torch from torch import nn from transformers import PretrainedConfig as SeedOssConfig -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index 
9d4e76f1c..4e63521bc 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -19,10 +19,10 @@ from vllm.config import VllmConfig from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import divide, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention.encoder_only_attention import ( +from vllm.model_executor.layers.attention import ( EncoderOnlyAttention, + MMEncoderAttention, ) -from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention from vllm.model_executor.layers.conv import Conv2dLayer from vllm.model_executor.layers.linear import ( ColumnParallelLinear, diff --git a/vllm/model_executor/models/siglip2navit.py b/vllm/model_executor/models/siglip2navit.py index 0b81d1b00..ccda1d9c9 100644 --- a/vllm/model_executor/models/siglip2navit.py +++ b/vllm/model_executor/models/siglip2navit.py @@ -13,7 +13,7 @@ from transformers.configuration_utils import PretrainedConfig from vllm.distributed import divide, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention +from vllm.model_executor.layers.attention import MMEncoderAttention from vllm.model_executor.layers.conv import Conv2dLayer from vllm.model_executor.layers.linear import ( ColumnParallelLinear, diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py index 589727c6f..bff866d0d 100644 --- a/vllm/model_executor/models/solar.py +++ b/vllm/model_executor/models/solar.py @@ -30,11 +30,11 @@ import torch from torch import nn from transformers import PretrainedConfig -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index 211b57ddb..034c9c18f 100644 --- a/vllm/model_executor/models/stablelm.py +++ b/vllm/model_executor/models/stablelm.py @@ -29,10 +29,10 @@ import torch from torch import nn from transformers import StableLmConfig -from vllm.attention.layer import Attention from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, QKVParallelLinear, diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py index dd1e8e98f..5f08a59e2 100644 --- a/vllm/model_executor/models/starcoder2.py +++ b/vllm/model_executor/models/starcoder2.py @@ -28,11 +28,11 @@ import torch from torch import nn from transformers import Starcoder2Config -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from 
vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import ( ColumnParallelLinear, QKVParallelLinear, diff --git a/vllm/model_executor/models/step1.py b/vllm/model_executor/models/step1.py index 8e655c691..4173b9ebf 100644 --- a/vllm/model_executor/models/step1.py +++ b/vllm/model_executor/models/step1.py @@ -10,7 +10,6 @@ from collections.abc import Iterable import torch from torch import nn -from vllm.attention.layer import Attention, AttentionType from vllm.config import CacheConfig, VllmConfig from vllm.distributed import ( get_pp_group, @@ -18,6 +17,7 @@ from vllm.distributed import ( get_tensor_model_parallel_world_size, ) from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, @@ -41,6 +41,7 @@ from vllm.model_executor.models.utils import ( maybe_prefix, ) from vllm.sequence import IntermediateTensors +from vllm.v1.attention.backend import AttentionType STEP_PACKED_MODULES_MAPPING = { "qkv_proj": ["q_proj", "k_proj", "v_proj"], diff --git a/vllm/model_executor/models/step3_text.py b/vllm/model_executor/models/step3_text.py index 4855dffec..18b689166 100644 --- a/vllm/model_executor/models/step3_text.py +++ b/vllm/model_executor/models/step3_text.py @@ -9,7 +9,6 @@ from typing import Any import torch from torch import nn -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import ( @@ -19,6 +18,7 @@ from vllm.distributed import ( ) from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( diff --git a/vllm/model_executor/models/step3_vl.py b/vllm/model_executor/models/step3_vl.py index 8f41e8c5b..8b795ecea 100644 --- a/vllm/model_executor/models/step3_vl.py +++ b/vllm/model_executor/models/step3_vl.py @@ -19,7 +19,7 @@ from vllm.config import VllmConfig from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention +from vllm.model_executor.layers.attention import MMEncoderAttention from vllm.model_executor.layers.conv import Conv2dLayer from vllm.model_executor.layers.linear import ( ColumnParallelLinear, diff --git a/vllm/model_executor/models/transformers/base.py b/vllm/model_executor/models/transformers/base.py index 6532a4e0f..0c4d4c2a4 100644 --- a/vllm/model_executor/models/transformers/base.py +++ b/vllm/model_executor/models/transformers/base.py @@ -27,12 +27,12 @@ from torch import nn from transformers import AutoModel from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS -from vllm.attention.layer import Attention from vllm.config.utils import getattr_iter from vllm.distributed import get_pp_group, get_tp_group from vllm.distributed.utils import get_pp_indices from vllm.logger import init_logger -from vllm.model_executor.layers.attention.encoder_only_attention import ( +from 
vllm.model_executor.layers.attention import ( + Attention, EncoderOnlyAttention, ) from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py index ec3e5818e..b254a5308 100644 --- a/vllm/model_executor/models/whisper.py +++ b/vllm/model_executor/models/whisper.py @@ -17,7 +17,6 @@ from transformers import ( ) from transformers.models.whisper.modeling_whisper import sinusoids -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, ModelConfig, SpeechToTextConfig, VllmConfig from vllm.config.multimodal import BaseDummyOptions @@ -25,8 +24,11 @@ from vllm.distributed import get_tensor_model_parallel_world_size from vllm.inputs.data import PromptType from vllm.logger import init_logger from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention.cross_attention import CrossAttention -from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention +from vllm.model_executor.layers.attention import ( + Attention, + CrossAttention, + MMEncoderAttention, +) from vllm.model_executor.layers.linear import ( ColumnParallelLinear, QKVParallelLinear, diff --git a/vllm/model_executor/models/whisper_causal.py b/vllm/model_executor/models/whisper_causal.py index 8438b460f..4f2d4c07c 100644 --- a/vllm/model_executor/models/whisper_causal.py +++ b/vllm/model_executor/models/whisper_causal.py @@ -10,9 +10,9 @@ import torch import torch.nn.functional as F from torch import nn -from vllm.attention.layer import Attention from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( QKVParallelLinear, diff --git a/vllm/model_executor/models/zamba2.py b/vllm/model_executor/models/zamba2.py index dafad457a..b4d844ba6 100644 --- a/vllm/model_executor/models/zamba2.py +++ b/vllm/model_executor/models/zamba2.py @@ -16,11 +16,11 @@ import torch from torch import nn from transformers import Zamba2Config -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import GeluAndMul +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( ColumnParallelLinear, diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index e9596a42e..ef9c2676d 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -9,7 +9,7 @@ from typing import ClassVar import numpy as np import torch -from vllm.attention.layer import Attention +from vllm.model_executor.layers.attention import Attention from vllm.v1.attention.backend import ( AttentionBackend, AttentionImpl, diff --git a/vllm/v1/attention/backends/rocm_aiter_fa.py b/vllm/v1/attention/backends/rocm_aiter_fa.py index 3febbe57a..e122882c7 100644 --- a/vllm/v1/attention/backends/rocm_aiter_fa.py +++ b/vllm/v1/attention/backends/rocm_aiter_fa.py @@ -8,9 +8,9 @@ from typing import ClassVar import torch from 
vllm._aiter_ops import rocm_aiter_ops -from vllm.attention.layer import Attention from vllm.config import VllmConfig, get_layers_from_vllm_config from vllm.logger import init_logger +from vllm.model_executor.layers.attention import Attention from vllm.platforms import current_platform from vllm.utils.math_utils import cdiv from vllm.utils.platform_utils import get_cu_count diff --git a/vllm/v1/spec_decode/draft_model.py b/vllm/v1/spec_decode/draft_model.py index 5a54074dd..7d631aa89 100644 --- a/vllm/v1/spec_decode/draft_model.py +++ b/vllm/v1/spec_decode/draft_model.py @@ -4,10 +4,10 @@ from typing import Any import torch -from vllm.attention.layer import Attention from vllm.config import VllmConfig, get_layers_from_vllm_config from vllm.config.speculative import SpeculativeConfig from vllm.logger import init_logger +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.model_loader import get_model from vllm.triton_utils import tl, triton from vllm.v1.attention.backends.utils import ( diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 64f6263cc..adf64f749 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -21,7 +21,6 @@ import torch.nn as nn from tqdm import tqdm import vllm.envs as envs -from vllm.attention.layer import Attention, MLAAttention from vllm.compilation.counter import compilation_counter from vllm.compilation.cuda_graph import CUDAGraphStat, CUDAGraphWrapper from vllm.compilation.monitor import set_cudagraph_capturing_enabled @@ -50,6 +49,7 @@ from vllm.forward_context import ( ) from vllm.logger import init_logger from vllm.lora.layers import LoRAMapping, LoRAMappingType +from vllm.model_executor.layers.attention import Attention, MLAAttention from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase from vllm.model_executor.layers.fused_moe.routed_experts_capturer import ( RoutedExpertsCapturer, diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index ccfbc3c6b..8af17e270 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -7,9 +7,9 @@ from dataclasses import dataclass, field import torch from typing_extensions import deprecated -from vllm.attention.layer import Attention from vllm.config import CacheConfig, VllmConfig from vllm.logger import init_logger +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.models.interfaces import MultiModalEmbeddings from vllm.model_executor.models.utils import extract_layer_index from vllm.multimodal.registry import MultiModalRegistry