diff --git a/docs/contributing/model/basic.md b/docs/contributing/model/basic.md
index e828de0ad..a68d1f016 100644
--- a/docs/contributing/model/basic.md
+++ b/docs/contributing/model/basic.md
@@ -29,7 +29,7 @@ The initialization code should look like this:
 ```python
 from torch import nn
 from vllm.config import VllmConfig
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
 
 class MyAttention(nn.Module):
     def __init__(self, vllm_config: VllmConfig, prefix: str):
diff --git a/tests/compile/test_fusion_attn.py b/tests/compile/test_fusion_attn.py
index ea61c9495..dbe12dc5d 100644
--- a/tests/compile/test_fusion_attn.py
+++ b/tests/compile/test_fusion_attn.py
@@ -9,8 +9,9 @@ from tests.compile.backend import LazyInitPass, TestBackend
 from tests.utils import flat_product
 from tests.v1.attention.utils import BatchSpec, create_common_attn_metadata
 from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
-from vllm.attention import Attention, AttentionMetadata
+from vllm.attention.backends.abstract import AttentionMetadata
 from vllm.attention.backends.registry import AttentionBackendEnum
+from vllm.attention.layer import Attention
 from vllm.attention.selector import global_force_attn_backend_context_manager
 from vllm.compilation.fusion_attn import ATTN_OP, AttnFusionPass
 from vllm.compilation.fx_utils import find_op_nodes
diff --git a/tests/compile/test_qk_norm_rope_fusion.py b/tests/compile/test_qk_norm_rope_fusion.py
index 511e50f5f..5ebb95b6d 100644
--- a/tests/compile/test_qk_norm_rope_fusion.py
+++ b/tests/compile/test_qk_norm_rope_fusion.py
@@ -5,7 +5,8 @@ import pytest
 import torch
 
 from tests.compile.backend import TestBackend
-from vllm.attention import Attention, AttentionType
+from vllm.attention.backends.abstract import AttentionType
+from vllm.attention.layer import Attention
 from vllm.compilation.matcher_utils import FLASHINFER_ROTARY_OP, RMS_OP, ROTARY_OP
 from vllm.compilation.noop_elimination import NoOpEliminationPass
 from vllm.compilation.post_cleanup import PostCleanupPass
diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py
index 9307ef781..b8148ce06 100644
--- a/tests/kernels/utils.py
+++ b/tests/kernels/utils.py
@@ -14,7 +14,7 @@ import torch
 from torch._prims_common import TensorLikeType
 
 from tests.kernels.quant_utils import native_w8a8_block_matmul
-from vllm.attention import AttentionType
+from vllm.attention.backends.abstract import AttentionType
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.fused_moe.utils import moe_kernel_quantize_input
 from vllm.utils import (
diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py
index d0f1b703f..89669ee8b 100644
--- a/tests/v1/worker/test_gpu_model_runner.py
+++ b/tests/v1/worker/test_gpu_model_runner.py
@@ -5,8 +5,8 @@ import numpy as np
 import pytest
 import torch
 
-from vllm.attention import Attention
 from vllm.attention.backends.abstract import MultipleOf
+from vllm.attention.layer import Attention
 from vllm.config import (
     CacheConfig,
     ModelConfig,
diff --git a/tests/v1/worker/test_utils.py b/tests/v1/worker/test_utils.py
index f987b09e6..bcf5611e3 100644
--- a/tests/v1/worker/test_utils.py
+++ b/tests/v1/worker/test_utils.py
@@ -7,7 +7,7 @@ from vllm.v1.worker.utils import bind_kv_cache
 
 
 def test_bind_kv_cache():
-    from vllm.attention import Attention
+    from vllm.attention.layer import Attention
 
     ctx = {
         "layers.0.self_attn": Attention(32, 128, 0.1),
@@ -35,7 +35,7 @@
 
 
 def test_bind_kv_cache_non_attention():
-    from vllm.attention import Attention
+    from vllm.attention.layer import Attention
 
     # example from Jamba PP=2
     ctx = {
diff --git a/vllm/attention/__init__.py b/vllm/attention/__init__.py
index 8b4dc4013..e69de29bb 100644
--- a/vllm/attention/__init__.py
+++ b/vllm/attention/__init__.py
@@ -1,19 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from vllm.attention.backends.abstract import (
-    AttentionBackend,
-    AttentionMetadata,
-    AttentionType,
-)
-from vllm.attention.layer import Attention
-from vllm.attention.selector import get_attn_backend, get_mamba_attn_backend
-
-__all__ = [
-    "Attention",
-    "AttentionBackend",
-    "AttentionMetadata",
-    "AttentionType",
-    "get_attn_backend",
-    "get_mamba_attn_backend",
-]
diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py
index bd7e81b15..a321167b8 100644
--- a/vllm/attention/backends/abstract.py
+++ b/vllm/attention/backends/abstract.py
@@ -178,7 +178,7 @@ class AttentionBackend(ABC):
         By default, only supports decoder attention.
         Backends should override this to support other attention types.
         """
-        from vllm.attention import AttentionType
+        from vllm.attention.backends.abstract import AttentionType
 
         return attn_type == AttentionType.DECODER
 
diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index f1d57ac50..62ac38751 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -10,8 +10,11 @@ import torch.nn as nn
 import torch.nn.functional as F
 
 import vllm.envs as envs
-from vllm.attention import AttentionType
-from vllm.attention.backends.abstract import AttentionBackend, MLAAttentionImpl
+from vllm.attention.backends.abstract import (
+    AttentionBackend,
+    AttentionType,
+    MLAAttentionImpl,
+)
 from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.attention.selector import get_attn_backend
 from vllm.attention.utils.kv_sharing_utils import validate_kv_sharing_target
diff --git a/vllm/compilation/fusion_attn.py b/vllm/compilation/fusion_attn.py
index 4f44faece..6dcbbd85d 100644
--- a/vllm/compilation/fusion_attn.py
+++ b/vllm/compilation/fusion_attn.py
@@ -10,7 +10,7 @@ from torch import fx
 from torch._higher_order_ops.auto_functionalize import auto_functionalized
 from torch._inductor.pattern_matcher import PatternMatcherPass
 
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
 from vllm.config import VllmConfig, get_layers_from_vllm_config
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
diff --git a/vllm/compilation/qk_norm_rope_fusion.py b/vllm/compilation/qk_norm_rope_fusion.py
index e3c399e07..794cd8e3f 100644
--- a/vllm/compilation/qk_norm_rope_fusion.py
+++ b/vllm/compilation/qk_norm_rope_fusion.py
@@ -9,7 +9,7 @@ from torch import fx
 from torch._higher_order_ops.auto_functionalize import auto_functionalized
 from torch._inductor.pattern_matcher import PatternMatcherPass
 
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
 from vllm.config import VllmConfig, get_layers_from_vllm_config
 from vllm.logger import init_logger
 from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
index 493938d4a..ff51840b8 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
@@ -20,7 +20,7 @@ import torch
 import zmq
 
 from vllm import envs
-from vllm.attention import AttentionBackend
+from vllm.attention.backends.abstract import AttentionBackend
 from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.attention.selector import get_attn_backend
 from vllm.config import VllmConfig
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py
index 8cd09014c..0ad9d4ae1 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py
@@ -8,7 +8,8 @@ from typing import Any, ClassVar
 
 import torch
 
-from vllm.attention import Attention, AttentionBackend, AttentionMetadata
+from vllm.attention.backends.abstract import AttentionBackend, AttentionMetadata
+from vllm.attention.layer import Attention
 from vllm.config import VllmConfig, get_layers_from_vllm_config
 from vllm.distributed.kv_events import BlockRemoved, BlockStored, KVCacheEvent
 from vllm.distributed.kv_transfer.kv_connector.v1 import (
diff --git a/vllm/model_executor/layers/mamba/linear_attn.py b/vllm/model_executor/layers/mamba/linear_attn.py
index d85b3e61c..278713408 100644
--- a/vllm/model_executor/layers/mamba/linear_attn.py
+++ b/vllm/model_executor/layers/mamba/linear_attn.py
@@ -8,7 +8,7 @@ import torch.nn.functional as F
 from einops import rearrange
 from torch import nn
 
-from vllm.attention import AttentionMetadata
+from vllm.attention.backends.abstract import AttentionMetadata
 from vllm.config import CacheConfig, ModelConfig, get_current_vllm_config
 from vllm.distributed.communication_op import tensor_model_parallel_all_reduce
 from vllm.distributed.parallel_state import (
diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py
index 2021b68b8..eeb244415 100644
--- a/vllm/model_executor/model_loader/utils.py
+++ b/vllm/model_executor/model_loader/utils.py
@@ -11,8 +11,7 @@ import torch
 from torch import nn
 from typing_extensions import assert_never
 
-from vllm.attention import Attention
-from vllm.attention.layer import MLAAttention
+from vllm.attention.layer import Attention, MLAAttention
 from vllm.config import ModelConfig, VllmConfig, set_current_vllm_config
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.base_config import (
diff --git a/vllm/model_executor/models/afmoe.py b/vllm/model_executor/models/afmoe.py
index 4eb5665a7..85827d54c 100644
--- a/vllm/model_executor/models/afmoe.py
+++ b/vllm/model_executor/models/afmoe.py
@@ -9,7 +9,8 @@ from itertools import islice
 import torch
 from torch import nn
 
-from vllm.attention import Attention, AttentionType
+from vllm.attention.backends.abstract import AttentionType
+from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config
 from vllm.distributed import (
diff --git a/vllm/model_executor/models/apertus.py b/vllm/model_executor/models/apertus.py
index b75e91319..f38b09bf5 100644
--- a/vllm/model_executor/models/apertus.py
+++ b/vllm/model_executor/models/apertus.py
@@ -32,7 +32,8 @@ import torch
 from torch import nn
 from transformers import ApertusConfig
 
-from vllm.attention import Attention, AttentionType
+from vllm.attention.backends.abstract import AttentionType
+from vllm.attention.layer import Attention
 from vllm.attention.layers.encoder_only_attention import EncoderOnlyAttention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py
index b75a25476..266d29a8d 100644
--- a/vllm/model_executor/models/arctic.py
+++ b/vllm/model_executor/models/arctic.py
@@ -8,7 +8,7 @@ from itertools import islice
 import torch
 from torch import nn
 
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import (
diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py
index 024788918..beb22995a 100644
--- a/vllm/model_executor/models/baichuan.py
+++ b/vllm/model_executor/models/baichuan.py
@@ -29,7 +29,7 @@ import torch
 from torch import nn
 from transformers import PretrainedConfig
 
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import (
diff --git a/vllm/model_executor/models/bailing_moe.py b/vllm/model_executor/models/bailing_moe.py
index cc10e936a..f7a5d4e78 100644
--- a/vllm/model_executor/models/bailing_moe.py
+++ b/vllm/model_executor/models/bailing_moe.py
@@ -32,7 +32,7 @@ import torch
 import torch.nn.functional as F
 from torch import nn
 from transformers.configuration_utils import PretrainedConfig
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import (
diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py
index 00fba9342..507fbf1fd 100644
--- a/vllm/model_executor/models/bloom.py
+++ b/vllm/model_executor/models/bloom.py
@@ -27,7 +27,7 @@ import torch
 from torch import nn
 from transformers import BloomConfig
 
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import (
diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py
index b5a6d00dc..3aa01bb19 100644
--- a/vllm/model_executor/models/chameleon.py
+++ b/vllm/model_executor/models/chameleon.py
@@ -16,7 +16,7 @@ from transformers import (
     ChameleonVQVAEConfig,
 )
 
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
 from vllm.config import CacheConfig, VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py
index dbfcd62d0..3d485fdd0 100644
--- a/vllm/model_executor/models/chatglm.py
+++ b/vllm/model_executor/models/chatglm.py
@@ -12,7 +12,7 @@ import torch
 from torch import nn
 from torch.nn import LayerNorm
 
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py
index 5d611deb9..c2993b47d 100644
--- a/vllm/model_executor/models/clip.py
+++ b/vllm/model_executor/models/clip.py
@@ -14,8 +14,7 @@ from transformers import (
     CLIPVisionConfig,
 )
 
-from vllm.attention import Attention
-from vllm.attention.layer import MultiHeadAttention
+from vllm.attention.layer import Attention, MultiHeadAttention
 from vllm.config import VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
 from vllm.distributed import divide, get_tensor_model_parallel_world_size
diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py
index 5ed920927..f837502c4 100644
--- a/vllm/model_executor/models/commandr.py
+++ b/vllm/model_executor/models/commandr.py
@@ -30,7 +30,7 @@ import torch
 from torch import nn
 from transformers import Cohere2Config, CohereConfig
 
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py
index 2c7290190..946baffc8 100644
--- a/vllm/model_executor/models/dbrx.py
+++ b/vllm/model_executor/models/dbrx.py
@@ -8,7 +8,7 @@ import torch
 import torch.nn as nn
 from transformers import DbrxConfig
 
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import (
     get_pp_group,
diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py
index ad932559b..73cac2556 100644
--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -33,8 +33,8 @@ from torch import nn
 from transformers import DeepseekV2Config, DeepseekV3Config
 
 from vllm._aiter_ops import rocm_aiter_ops
-from vllm.attention import Attention
 from vllm.attention.backends.abstract import AttentionBackend
+from vllm.attention.layer import Attention
 from vllm.attention.ops.common import pack_seq_triton, unpack_seq_triton
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, ParallelConfig, VllmConfig, get_current_vllm_config
diff --git a/vllm/model_executor/models/dots1.py b/vllm/model_executor/models/dots1.py
index e65c27510..1c2abbe7b 100644
--- a/vllm/model_executor/models/dots1.py
+++ b/vllm/model_executor/models/dots1.py
@@ -32,7 +32,7 @@ import torch
 from torch import nn
 from transformers import Dots1Config
 
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, ModelConfig, VllmConfig
 from vllm.distributed import (
diff --git a/vllm/model_executor/models/ernie45_moe.py b/vllm/model_executor/models/ernie45_moe.py
index a7df3509e..278ba45e9 100644
--- a/vllm/model_executor/models/ernie45_moe.py
+++ b/vllm/model_executor/models/ernie45_moe.py
@@ -32,7 +32,7 @@ import torch
 from torch import nn
 from transformers import PretrainedConfig
 
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config
 from vllm.distributed import (
diff --git a/vllm/model_executor/models/ernie45_vl_moe.py b/vllm/model_executor/models/ernie45_vl_moe.py
index 50e033d77..72f9957fc 100644
--- a/vllm/model_executor/models/ernie45_vl_moe.py
+++ b/vllm/model_executor/models/ernie45_vl_moe.py
@@ -31,7 +31,7 @@ import torch
 from torch import nn
 from transformers import PretrainedConfig
 
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
 
 # from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py
index d13275488..99002baa8 100644
--- a/vllm/model_executor/models/exaone.py
+++ b/vllm/model_executor/models/exaone.py
@@ -32,7 +32,7 @@ import torch
 from torch import nn
 from transformers import PretrainedConfig
 
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
diff --git a/vllm/model_executor/models/exaone4.py b/vllm/model_executor/models/exaone4.py
index 70f3cce2b..9d2c67d6c 100644
--- a/vllm/model_executor/models/exaone4.py
+++ b/vllm/model_executor/models/exaone4.py
@@ -28,7 +28,7 @@ import torch
 from torch import nn
 from transformers import Exaone4Config
 
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py
index dc2d51f34..32d9e7b92 100644
--- a/vllm/model_executor/models/falcon.py
+++ b/vllm/model_executor/models/falcon.py
@@ -30,7 +30,7 @@ from torch import nn
 from torch.nn import LayerNorm
 from transformers import FalconConfig as HF_FalconConfig
 
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import (
diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py
index 00c7f59a0..dd5a74c8e 100644
--- a/vllm/model_executor/models/gemma.py
+++ b/vllm/model_executor/models/gemma.py
@@ -26,7 +26,7 @@ import torch
 from torch import nn
 from transformers import GemmaConfig
 
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py
index 9b6cfe693..cb36e0482 100644
--- a/vllm/model_executor/models/gemma2.py
+++ b/vllm/model_executor/models/gemma2.py
@@ -23,7 +23,7 @@ import torch
 from torch import nn
 from transformers import Gemma2Config
 
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
diff --git a/vllm/model_executor/models/gemma3.py b/vllm/model_executor/models/gemma3.py
index 4ad6fc89d..73176eba9 100644
--- a/vllm/model_executor/models/gemma3.py
+++ b/vllm/model_executor/models/gemma3.py
@@ -23,7 +23,8 @@ import torch.nn.functional as F
 from torch import nn
 from transformers import Gemma3TextConfig
 
-from vllm.attention import Attention, AttentionType
+from vllm.attention.backends.abstract import AttentionType
+from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
diff --git a/vllm/model_executor/models/gemma3n.py b/vllm/model_executor/models/gemma3n.py
index 8f1447ba3..f4427c9fd 100644
--- a/vllm/model_executor/models/gemma3n.py
+++ b/vllm/model_executor/models/gemma3n.py
@@ -21,7 +21,7 @@ import torch
 from torch import nn
 from transformers.models.gemma3n.configuration_gemma3n import Gemma3nTextConfig
 
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
diff --git a/vllm/model_executor/models/glm4.py b/vllm/model_executor/models/glm4.py
index f8ef3b038..002cdb721 100644
--- a/vllm/model_executor/models/glm4.py
+++ b/vllm/model_executor/models/glm4.py
@@ -29,7 +29,8 @@ import torch
 from torch import nn
 from transformers import Glm4Config
 
-from vllm.attention import Attention, AttentionType
+from vllm.attention.backends.abstract import AttentionType
+from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py
index 5aa51af54..c99f824e1 100644
--- a/vllm/model_executor/models/glm4_moe.py
+++ b/vllm/model_executor/models/glm4_moe.py
@@ -31,7 +31,7 @@ import torch
 from torch import nn
 from transformers.models.glm4_moe import Glm4MoeConfig
 
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config
 from vllm.distributed import (
diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py
index a5e8131c7..da5d48a94 100644
--- a/vllm/model_executor/models/gpt2.py
+++ b/vllm/model_executor/models/gpt2.py
@@ -27,7 +27,7 @@ import torch
 from torch import nn
 from transformers import GPT2Config
 
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed.parallel_state import (
diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py
index cdf038ba2..a405fd184 100644
--- a/vllm/model_executor/models/gpt_bigcode.py
+++ b/vllm/model_executor/models/gpt_bigcode.py
@@ -28,7 +28,7 @@ import torch
 from torch import nn
 from transformers import GPTBigCodeConfig
 
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py
index bd1bfea3c..f0a34c47d 100644
--- a/vllm/model_executor/models/gpt_j.py
+++ b/vllm/model_executor/models/gpt_j.py
@@ -26,7 +26,7 @@ import torch
 from torch import nn
 from transformers import GPTJConfig
 
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py
index 815c2fba4..b9959682c 100644
--- a/vllm/model_executor/models/gpt_neox.py
+++ b/vllm/model_executor/models/gpt_neox.py
@@ -26,7 +26,7 @@ import torch
 from torch import nn
 from transformers import GPTNeoXConfig
 
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py
index 1bc0ad387..9de3e2619 100644
--- a/vllm/model_executor/models/gpt_oss.py
+++ b/vllm/model_executor/models/gpt_oss.py
@@ -7,7 +7,8 @@ import torch.distributed as dist
 from torch import nn
 from transformers import GptOssConfig
 
-from vllm.attention import Attention, AttentionType
+from vllm.attention.backends.abstract import AttentionType
+from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import (
diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py
index cd7ce2fc8..eac9ef947 100644
--- a/vllm/model_executor/models/granite.py
+++ b/vllm/model_executor/models/granite.py
@@ -31,7 +31,7 @@ import torch
 from torch import nn
 from transformers import GraniteConfig
 
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py
index 8f4139d63..02c6c5862 100644
--- a/vllm/model_executor/models/granitemoe.py
+++ b/vllm/model_executor/models/granitemoe.py
@@ -31,7 +31,7 @@ from typing import Any
 import torch
 from torch import nn
 
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import (
diff --git a/vllm/model_executor/models/grok1.py b/vllm/model_executor/models/grok1.py
index cfca56492..6f62a1d11 100644
--- a/vllm/model_executor/models/grok1.py
+++ b/vllm/model_executor/models/grok1.py
@@ -31,7 +31,7 @@ import torch
 import torch.nn.functional as F
 from torch import nn
 
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
diff --git a/vllm/model_executor/models/hunyuan_v1.py b/vllm/model_executor/models/hunyuan_v1.py
index 53fb444ed..ccdfa3fe1 100644
--- a/vllm/model_executor/models/hunyuan_v1.py
+++ b/vllm/model_executor/models/hunyuan_v1.py
@@ -33,7 +33,8 @@ import torch
 from torch import nn
 from transformers import PretrainedConfig
 
-from vllm.attention import Attention, AttentionType
+from vllm.attention.backends.abstract import AttentionType
+from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config
 from vllm.distributed import (
diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py
index dc8f821bd..c79934e12 100644
--- a/vllm/model_executor/models/internlm2.py
+++ b/vllm/model_executor/models/internlm2.py
@@ -10,7 +10,7 @@ import torch
 from torch import nn
 from transformers import PretrainedConfig
 
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import (
diff --git a/vllm/model_executor/models/jais.py b/vllm/model_executor/models/jais.py
index 5549a1fc1..601228881 100644
--- a/vllm/model_executor/models/jais.py
+++ b/vllm/model_executor/models/jais.py
@@ -28,7 +28,7 @@ from itertools import islice
 import torch
 from torch import nn
 
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import (
diff --git a/vllm/model_executor/models/lfm2.py b/vllm/model_executor/models/lfm2.py
index 74bdde27e..69615f8b6 100644
--- a/vllm/model_executor/models/lfm2.py
+++ b/vllm/model_executor/models/lfm2.py
@@ -7,7 +7,7 @@ import torch
 import torch.nn as nn
 from transformers import Lfm2Config
 
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, ModelConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
diff --git a/vllm/model_executor/models/lfm2_moe.py b/vllm/model_executor/models/lfm2_moe.py
index c088a0821..aaeb2cc38 100644
--- a/vllm/model_executor/models/lfm2_moe.py
+++ b/vllm/model_executor/models/lfm2_moe.py
@@ -6,7 +6,7 @@ from itertools import islice
 import torch
 import torch.nn as nn
 
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, ModelConfig, VllmConfig, get_current_vllm_config
 from vllm.distributed import (
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index f6af2bb3b..6dfbde7a1 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -31,7 +31,8 @@ import torch
 from torch import nn
 from transformers import LlamaConfig
 
-from vllm.attention import Attention, AttentionType
+from vllm.attention.backends.abstract import AttentionType
+from vllm.attention.layer import Attention
 from vllm.attention.layers.encoder_only_attention import EncoderOnlyAttention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py
index e1bdfc340..423be45e8 100644
--- a/vllm/model_executor/models/llama4.py
+++ b/vllm/model_executor/models/llama4.py
@@ -24,7 +24,7 @@ import torch
 from torch import nn
 from transformers import Llama4TextConfig
 
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
 from vllm.attention.layers.chunked_local_attention import ChunkedLocalAttention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py
index 049238330..67911ba8c 100644
--- a/vllm/model_executor/models/minicpm.py
+++ b/vllm/model_executor/models/minicpm.py
@@ -33,7 +33,7 @@ import torch
 from torch import nn
 from transformers import PretrainedConfig
 
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import (
diff --git a/vllm/model_executor/models/minicpm3.py b/vllm/model_executor/models/minicpm3.py
index 2d775219f..0a2bcbd7f 100644
--- a/vllm/model_executor/models/minicpm3.py
+++ b/vllm/model_executor/models/minicpm3.py
@@ -29,7 +29,7 @@ import torch
 from torch import nn
 from transformers import PretrainedConfig
 
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.layernorm import RMSNorm
diff --git a/vllm/model_executor/models/minimax_m2.py b/vllm/model_executor/models/minimax_m2.py
index 4955c68c0..dd98e36ec 100644
--- a/vllm/model_executor/models/minimax_m2.py
+++ b/vllm/model_executor/models/minimax_m2.py
@@ -30,7 +30,7 @@ import torch
 from torch import nn
 from transformers import PretrainedConfig
 
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, ModelConfig, VllmConfig
 from vllm.distributed import (
diff --git a/vllm/model_executor/models/minimax_text_01.py b/vllm/model_executor/models/minimax_text_01.py
index 50f7396e2..390de78cc 100644
--- a/vllm/model_executor/models/minimax_text_01.py
+++ b/vllm/model_executor/models/minimax_text_01.py
@@ -14,7 +14,8 @@ import torch
 from torch import nn
 from transformers import MiniMaxConfig
 
-from vllm.attention import Attention, AttentionMetadata
+from vllm.attention.backends.abstract import AttentionMetadata
+from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, ModelConfig, VllmConfig
 from vllm.distributed.parallel_state import (
diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py
index 0a9c3f136..e21656dbd 100644
--- a/vllm/model_executor/models/mixtral.py
+++ b/vllm/model_executor/models/mixtral.py
@@ -32,7 +32,7 @@ import torch
 from torch import nn
 from transformers import MixtralConfig
 
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config
 from vllm.distributed import (
diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py
index dc06938d5..7b53299cc 100644
--- a/vllm/model_executor/models/molmo.py
+++ b/vllm/model_executor/models/molmo.py
@@ -17,8 +17,7 @@ from transformers import BatchFeature, PretrainedConfig, ProcessorMixin, TensorT
 from transformers.image_utils import ImageInput
 from transformers.tokenization_utils_base import TextInput
 
-from vllm.attention import Attention
-from vllm.attention.layer import MultiHeadAttention
+from vllm.attention.layer import Attention, MultiHeadAttention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py
index 106ad971a..1e285646b 100644
--- a/vllm/model_executor/models/mpt.py
+++ b/vllm/model_executor/models/mpt.py
@@ -10,7 +10,7 @@ import torch
 import torch.nn as nn
 from transformers import MptConfig
 
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import (
diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py
index c3337bd1e..93ad2064a 100644
--- a/vllm/model_executor/models/nemotron.py
+++ b/vllm/model_executor/models/nemotron.py
@@ -30,7 +30,7 @@ from itertools import islice
 import torch
 from torch import nn
 
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
diff --git a/vllm/model_executor/models/nemotron_nas.py b/vllm/model_executor/models/nemotron_nas.py
index 2eebe3805..34ea2945b 100644
--- a/vllm/model_executor/models/nemotron_nas.py
+++ b/vllm/model_executor/models/nemotron_nas.py
@@ -31,7 +31,7 @@ import torch
 from torch import nn
 from transformers import LlamaConfig
 
-from vllm.attention import AttentionType
+from vllm.attention.backends.abstract import AttentionType
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group
diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py
index bd8a8e317..3bbb4dd24 100644
--- a/vllm/model_executor/models/olmo.py
+++ b/vllm/model_executor/models/olmo.py
@@ -31,7 +31,7 @@ import torch
 from torch import nn
 from transformers import OlmoConfig
 
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
diff --git a/vllm/model_executor/models/olmo2.py b/vllm/model_executor/models/olmo2.py
index f0f6b2f6b..88e9c2d85 100644
--- a/vllm/model_executor/models/olmo2.py
+++ b/vllm/model_executor/models/olmo2.py
@@ -32,7 +32,7 @@ import torch
 from torch import nn
 from transformers import Olmo2Config
 
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py
index c39e338d7..1376583a9 100644
--- a/vllm/model_executor/models/olmoe.py
+++ b/vllm/model_executor/models/olmoe.py
@@ -21,7 +21,7 @@ from itertools import islice
 import torch
 from torch import nn
 
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import VllmConfig
 from vllm.distributed import (
diff --git a/vllm/model_executor/models/openpangu.py b/vllm/model_executor/models/openpangu.py
index 4124a181a..bddd9fa50 100644
--- a/vllm/model_executor/models/openpangu.py
+++ b/vllm/model_executor/models/openpangu.py
@@ -29,7 +29,8 @@ import torch
 from torch import nn
 from transformers import PretrainedConfig
 
-from vllm.attention import Attention, AttentionType
+from vllm.attention.backends.abstract import AttentionType
+from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, ParallelConfig, VllmConfig
 from vllm.distributed import (
diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py
index 5df700d1a..bba5291ea 100644
--- a/vllm/model_executor/models/opt.py
+++ b/vllm/model_executor/models/opt.py
@@ -27,7 +27,7 @@ import torch
 from torch import nn
 from transformers import OPTConfig
 
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py
index b30be93ca..544a44ed5 100644
--- a/vllm/model_executor/models/orion.py
+++ b/vllm/model_executor/models/orion.py
@@ -15,7 +15,7 @@ import torch
 from torch import nn
 from transformers import PretrainedConfig
 
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
diff --git a/vllm/model_executor/models/ouro.py b/vllm/model_executor/models/ouro.py
index 63d2fff6e..dcae92ed2 100644
--- a/vllm/model_executor/models/ouro.py
+++ b/vllm/model_executor/models/ouro.py
@@ -33,7 +33,8 @@ import torch
 from torch import nn
 from transformers import PretrainedConfig
 
-from vllm.attention import Attention, AttentionType
+from vllm.attention.backends.abstract import AttentionType
+from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
diff --git a/vllm/model_executor/models/persimmon.py b/vllm/model_executor/models/persimmon.py
index 98963d52e..795cd25f1 100644
--- a/vllm/model_executor/models/persimmon.py
+++ b/vllm/model_executor/models/persimmon.py
@@ -30,7 +30,7 @@ import torch
 from torch import nn
 from transformers import PersimmonConfig
 
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py
index da476f621..70016d9ed 100644
--- a/vllm/model_executor/models/phi.py
+++ b/vllm/model_executor/models/phi.py
@@ -45,7 +45,7 @@ import torch
 from torch import nn
 from transformers import PhiConfig
 
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py
index 8ffac95d9..a5a669139 100644
--- a/vllm/model_executor/models/phimoe.py
+++ b/vllm/model_executor/models/phimoe.py
@@ -31,7 +31,7 @@ import torch
 from torch import nn
 from transformers.configuration_utils import PretrainedConfig
 
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py
index c973e7917..12285cf9c 100644
--- a/vllm/model_executor/models/qwen.py
+++ b/vllm/model_executor/models/qwen.py
@@ -16,7 +16,7 @@ import torch
 from torch import nn
 from transformers import PretrainedConfig
 
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py
index 5831ce0b3..34c31d8de 100644
--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -33,7 +33,8 @@ import torch
 from torch import nn
 from transformers import Qwen2Config
 
-from vllm.attention import Attention, AttentionType
+from vllm.attention.backends.abstract import AttentionType
+from vllm.attention.layer import Attention
 from vllm.attention.layers.encoder_only_attention import EncoderOnlyAttention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py
index 6b97d0b2c..5a4287400 100644
--- a/vllm/model_executor/models/qwen2_moe.py
+++ b/vllm/model_executor/models/qwen2_moe.py
@@ -34,7 +34,7 @@ import torch.nn.functional as F
 from torch import nn
 from transformers import Qwen2MoeConfig
 
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
diff --git a/vllm/model_executor/models/qwen3.py b/vllm/model_executor/models/qwen3.py
index 93a629d81..7d2b3e5f9 100644
--- a/vllm/model_executor/models/qwen3.py
+++ b/vllm/model_executor/models/qwen3.py
@@ -30,7 +30,8 @@ import torch
 from torch import nn
 from transformers import Qwen3Config
 
-from vllm.attention import Attention, AttentionType
+from vllm.attention.backends.abstract import AttentionType
+from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py
index 8ee3dd99e..6f520706a 100644
--- a/vllm/model_executor/models/qwen3_moe.py
+++ b/vllm/model_executor/models/qwen3_moe.py
@@ -31,7 +31,7 @@ from typing import Any
 import torch
 from torch import nn
 
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config
 from vllm.distributed import (
diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py
index bfed64728..661a18215 100644
--- a/vllm/model_executor/models/qwen3_next.py
+++ b/vllm/model_executor/models/qwen3_next.py
@@ -10,7 +10,8 @@ from einops import rearrange
 from torch import nn
 from transformers.activations import ACT2FN
 
-from vllm.attention import Attention, AttentionMetadata
+from vllm.attention.backends.abstract import AttentionMetadata
+from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import (
     CacheConfig,
diff --git a/vllm/model_executor/models/seed_oss.py b/vllm/model_executor/models/seed_oss.py
index 4744d8e44..267c60157 100644
--- a/vllm/model_executor/models/seed_oss.py
+++ b/vllm/model_executor/models/seed_oss.py
@@ -30,7 +30,8 @@ import torch
 from torch import nn
 from transformers import PretrainedConfig as SeedOssConfig
 
-from vllm.attention import Attention, AttentionType
+from vllm.attention.backends.abstract import AttentionType
+from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py
index 7e9fc5103..c576154b1 100644
--- a/vllm/model_executor/models/solar.py
+++ b/vllm/model_executor/models/solar.py
@@ -30,7 +30,7 @@ import torch
 from torch import nn
 from transformers import PretrainedConfig
 
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py
index a738fcbb4..6cb98b7b7 100644
--- a/vllm/model_executor/models/stablelm.py
+++ b/vllm/model_executor/models/stablelm.py
@@ -29,7 +29,7 @@ import torch
 from torch import nn
 from transformers import StableLmConfig
 
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul
diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py
index 1118fca3c..46422f303 100644
--- a/vllm/model_executor/models/starcoder2.py
+++ b/vllm/model_executor/models/starcoder2.py
@@ -28,7 +28,7 @@ import torch
 from torch import nn
 from transformers import Starcoder2Config
 
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
diff --git a/vllm/model_executor/models/step3_text.py b/vllm/model_executor/models/step3_text.py
index 3c377a2c5..077cce84a 100644
--- a/vllm/model_executor/models/step3_text.py
+++ b/vllm/model_executor/models/step3_text.py
@@ -9,7 +9,7 @@ from typing import Any
 import torch
 from torch import nn
 
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, ModelConfig, VllmConfig
 from vllm.distributed import (
diff --git a/vllm/model_executor/models/transformers/base.py b/vllm/model_executor/models/transformers/base.py
index f4ba4758b..b33ce3542 100644
--- a/vllm/model_executor/models/transformers/base.py
+++ b/vllm/model_executor/models/transformers/base.py
@@ -27,7 +27,8 @@ from torch import nn
 from transformers import AutoModel
 from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
 
-from vllm.attention import Attention, AttentionType
+from vllm.attention.backends.abstract import AttentionType
+from vllm.attention.layer import Attention
 from vllm.attention.layers.encoder_only_attention import EncoderOnlyAttention
 from vllm.config.utils import getattr_iter
 from vllm.distributed import get_pp_group, get_tp_group
diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py
index 50587c627..c72b5e1c0 100644
--- a/vllm/model_executor/models/whisper.py
+++ b/vllm/model_executor/models/whisper.py
@@ -16,8 +16,8 @@ from transformers import (
 )
 from transformers.models.whisper.modeling_whisper import sinusoids
 
-from vllm.attention import Attention, AttentionType
-from vllm.attention.layer import MultiHeadAttention
+from vllm.attention.backends.abstract import AttentionType
+from vllm.attention.layer import Attention, MultiHeadAttention
 from vllm.attention.layers.cross_attention import CrossAttention
 from vllm.config import CacheConfig, ModelConfig, SpeechToTextConfig, VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index 75b6bc77e..e8e14387b 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -335,7 +335,7 @@ class CudaPlatformBase(Platform):
         use_sparse: bool,
         attn_type: str | None = None,
     ) -> str:
-        from vllm.attention import AttentionType
+        from vllm.attention.backends.abstract import AttentionType
 
         if attn_type is None:
             attn_type = AttentionType.DECODER
diff --git a/vllm/v1/attention/backends/cpu_attn.py b/vllm/v1/attention/backends/cpu_attn.py
index 590bf91b0..d0b1f8c1b 100644
--- a/vllm/v1/attention/backends/cpu_attn.py
+++ b/vllm/v1/attention/backends/cpu_attn.py
@@ -51,7 +51,7 @@ class CPUAttentionBackend(AttentionBackend):
     @classmethod
     def supports_attn_type(cls, attn_type: str) -> bool:
         """CPU attention supports decoder and encoder-only attention."""
-        from vllm.attention import AttentionType
+        from vllm.attention.backends.abstract import AttentionType
 
         return attn_type in (
             AttentionType.DECODER,
diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py
index a9a4af5ac..0fc57cfb1 100755
--- a/vllm/v1/attention/backends/flash_attn.py
+++ b/vllm/v1/attention/backends/flash_attn.py
@@ -84,7 +84,7 @@ class FlashAttentionBackend(AttentionBackend):
     @classmethod
     def supports_attn_type(cls, attn_type: str) -> bool:
         """FlashAttention supports all attention types."""
-        from vllm.attention import AttentionType
+        from vllm.attention.backends.abstract import AttentionType
 
         return attn_type in (
             AttentionType.DECODER,
diff --git a/vllm/v1/attention/backends/flex_attention.py b/vllm/v1/attention/backends/flex_attention.py
index 7768827d2..3869f1f41 100644
--- a/vllm/v1/attention/backends/flex_attention.py
+++ b/vllm/v1/attention/backends/flex_attention.py
@@ -87,7 +87,7 @@ class FlexAttentionBackend(AttentionBackend):
    @classmethod
    def supports_attn_type(cls, attn_type: str) -> bool:
        """FlexAttention supports both decoder and encoder-only attention."""
-        from vllm.attention import AttentionType
+        from vllm.attention.backends.abstract import AttentionType
 
        return attn_type in (AttentionType.DECODER, AttentionType.ENCODER_ONLY)
 
diff --git a/vllm/v1/kv_offload/cpu.py b/vllm/v1/kv_offload/cpu.py
index 86747299e..2f2e85c0f 100644
--- a/vllm/v1/kv_offload/cpu.py
+++ b/vllm/v1/kv_offload/cpu.py
@@ -4,7 +4,7 @@ from collections.abc import Iterator
 
 import torch
 
-from vllm.attention import AttentionBackend
+from vllm.attention.backends.abstract import AttentionBackend
 from vllm.config import VllmConfig
 from vllm.platforms import current_platform
 from vllm.v1.kv_offload.abstract import LoadStoreSpec, OffloadingManager
diff --git a/vllm/v1/kv_offload/spec.py b/vllm/v1/kv_offload/spec.py
index c1813a4ff..3afce5589 100644
--- a/vllm/v1/kv_offload/spec.py
+++ b/vllm/v1/kv_offload/spec.py
@@ -11,7 +11,7 @@ from vllm.v1.kv_offload.abstract import LoadStoreSpec, OffloadingManager
 from vllm.v1.kv_offload.worker.worker import OffloadingHandler
 
 if TYPE_CHECKING:
-    from vllm.attention import AttentionBackend
+    from vllm.attention.backends.abstract import AttentionBackend
     from vllm.config import VllmConfig
 
 logger = init_logger(__name__)
diff --git a/vllm/v1/kv_offload/worker/cpu_gpu.py b/vllm/v1/kv_offload/worker/cpu_gpu.py
index bb163f004..461458c1f 100644
--- a/vllm/v1/kv_offload/worker/cpu_gpu.py
+++ b/vllm/v1/kv_offload/worker/cpu_gpu.py
@@ -5,7 +5,7 @@ import numpy as np
 import torch
 
 from vllm import _custom_ops as ops
-from vllm.attention import AttentionBackend
+from vllm.attention.backends.abstract import AttentionBackend
 from vllm.logger import init_logger
 from vllm.utils.platform_utils import is_pin_memory_available
 from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index d3c61794f..581921a9b 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -19,12 +19,13 @@ import torch.nn as nn
 from tqdm import tqdm
 
 import vllm.envs as envs
-from vllm.attention import Attention, AttentionType
 from vllm.attention.backends.abstract import (
     AttentionBackend,
     AttentionMetadata,
+    AttentionType,
     MultipleOf,
 )
+from vllm.attention.layer import Attention
 from vllm.compilation.counter import compilation_counter
 from vllm.compilation.cuda_graph import CUDAGraphWrapper
 from vllm.compilation.monitor import set_cudagraph_capturing_enabled
diff --git a/vllm/v1/worker/kv_connector_model_runner_mixin.py b/vllm/v1/worker/kv_connector_model_runner_mixin.py
index ff047d8d0..b799f1be7 100644
--- a/vllm/v1/worker/kv_connector_model_runner_mixin.py
+++ b/vllm/v1/worker/kv_connector_model_runner_mixin.py
@@ -13,7 +13,7 @@ from typing import (
 
 import torch
 
-from vllm.attention import AttentionBackend
+from vllm.attention.backends.abstract import AttentionBackend
 from vllm.config import VllmConfig
 from vllm.config.cache import CacheDType
 from vllm.distributed.kv_transfer import (
diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py
index 72d4474b8..9c1fbfd24 100644
--- a/vllm/v1/worker/tpu_model_runner.py
+++ b/vllm/v1/worker/tpu_model_runner.py
@@ -17,9 +17,8 @@ import torch_xla.distributed.spmd as xs
 import torch_xla.runtime as xr
 
 import vllm.envs as envs
-from vllm.attention import Attention
 from vllm.attention.backends.abstract import AttentionType
-from vllm.attention.layer import MLAAttention
+from vllm.attention.layer import Attention, MLAAttention
 from vllm.attention.layers.chunked_local_attention import ChunkedLocalAttention
 from vllm.compilation.wrapper import TorchCompileWithNoGuardsWrapper
 from vllm.config import (