diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 7d322aeaf..ff5ce20d1 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -1105,8 +1105,8 @@ steps: - vllm/v1/attention/backends/flashinfer.py - vllm/v1/attention/backends/mla/cutlass_mla.py - vllm/v1/attention/backends/mla/flashinfer_mla.py + - vllm/v1/attention/selector.py - vllm/platforms/cuda.py - - vllm/attention/selector.py commands: - nvidia-smi - python3 examples/offline_inference/basic/chat.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index fceae9685..4e004e347 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -954,8 +954,8 @@ steps: - vllm/v1/attention/backends/flashinfer.py - vllm/v1/attention/backends/mla/cutlass_mla.py - vllm/v1/attention/backends/mla/flashinfer_mla.py + - vllm/v1/attention/selector.py - vllm/platforms/cuda.py - - vllm/attention/selector.py commands: - nvidia-smi - python3 examples/offline_inference/basic/chat.py diff --git a/.buildkite/test_areas/kernels.yaml b/.buildkite/test_areas/kernels.yaml index 7ca099516..cf4b646f3 100644 --- a/.buildkite/test_areas/kernels.yaml +++ b/.buildkite/test_areas/kernels.yaml @@ -90,8 +90,8 @@ steps: - vllm/v1/attention/backends/flashinfer.py - vllm/v1/attention/backends/mla/cutlass_mla.py - vllm/v1/attention/backends/mla/flashinfer_mla.py + - vllm/v1/attention/selector.py - vllm/platforms/cuda.py - - vllm/attention/selector.py commands: - nvidia-smi - python3 examples/offline_inference/basic/chat.py diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 8122c525f..c963be4cb 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -3,7 +3,6 @@ # This lists cover the "core" components of vLLM that require careful review /vllm/attention @LucasWilkinson -/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @njhill /vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @njhill @22quinn /vllm/model_executor/layers/fused_moe @mgoin @pavanimajety /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety @@ -27,6 +26,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson # vLLM V1 /vllm/v1/attention @LucasWilkinson +/vllm/v1/attention/backend.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @njhill /vllm/v1/attention/backends/mla @pavanimajety /vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety /vllm/v1/attention/backends/triton_attn.py @tdoublep @@ -117,15 +117,15 @@ mkdocs.yaml @hmellor /vllm/transformers_utils/tokenizers/mistral.py @patrickvonplaten # Kernels -/vllm/attention/ops/chunked_prefill_paged_decode.py @tdoublep -/vllm/attention/ops/triton_unified_attention.py @tdoublep +/vllm/v1/attention/ops/chunked_prefill_paged_decode.py @tdoublep +/vllm/v1/attention/ops/triton_unified_attention.py @tdoublep # ROCm related: specify owner with write access to notify AMD folks for careful code review /vllm/**/*rocm* @tjtanaa /docker/Dockerfile.rocm* @gshtras @tjtanaa /vllm/v1/attention/backends/rocm*.py @gshtras @tjtanaa /vllm/v1/attention/backends/mla/rocm*.py @gshtras @tjtanaa -/vllm/attention/ops/rocm*.py @gshtras @tjtanaa +/vllm/v1/attention/ops/rocm*.py @gshtras @tjtanaa /vllm/model_executor/layers/fused_moe/rocm*.py @gshtras @tjtanaa /csrc/rocm @gshtras @tjtanaa /requirements/*rocm* @tjtanaa diff --git a/.github/mergify.yml b/.github/mergify.yml index 61a03135b..a496dd302 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -222,10 +222,10 @@ 
pull_request_rules: - files~=^csrc/rocm/ - files~=^docker/Dockerfile.rocm - files~=^requirements/rocm.*\.txt - - files~=^vllm/attention/backends/rocm.*\.py - - files~=^vllm/attention/ops/rocm.*\.py - files~=^vllm/model_executor/layers/fused_moe/rocm.*\.py + - files~=^vllm/v1/attention/backends/rocm.*\.py - files~=^vllm/v1/attention/backends/mla/rocm.*\.py + - files~=^vllm/v1/attention/ops/rocm.*\.py - files~=^tests/kernels/.*_rocm.*\.py - files=vllm/platforms/rocm.py - title~=(?i)AMD diff --git a/benchmarks/kernels/benchmark_reshape_and_cache_flash.py b/benchmarks/kernels/benchmark_reshape_and_cache_flash.py index bca66f301..ef6be1f3c 100644 --- a/benchmarks/kernels/benchmark_reshape_and_cache_flash.py +++ b/benchmarks/kernels/benchmark_reshape_and_cache_flash.py @@ -7,9 +7,6 @@ import torch from tabulate import tabulate from vllm import _custom_ops as ops -from vllm.attention.ops.triton_reshape_and_cache_flash import ( - triton_reshape_and_cache_flash, -) from vllm.logger import init_logger from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.utils.torch_utils import ( @@ -17,6 +14,9 @@ from vllm.utils.torch_utils import ( create_kv_caches_with_random_flash, set_random_seed, ) +from vllm.v1.attention.ops.triton_reshape_and_cache_flash import ( + triton_reshape_and_cache_flash, +) logger = init_logger(__name__) diff --git a/docs/contributing/model/basic.md b/docs/contributing/model/basic.md index d37501b86..28f6f960a 100644 --- a/docs/contributing/model/basic.md +++ b/docs/contributing/model/basic.md @@ -142,7 +142,7 @@ We use "mamba-like" to refer to layers that posses a state that is updated in-pl For implementing new custom mamba-like layers, one should inherit from `MambaBase` and implement the methods `get_state_dtype`, `get_state_shape` to calculate the data types and state shapes at runtime, as well as `mamba_type` and `get_attn_backend`. It is also necessary to implement the "attention meta-data" class which handles the meta-data that is common across all layers. Please see [`LinearAttentionMetadata`](../../../vllm/v1/attention/backends/linear_attn.py) or [`ShortConvAttentionMetadata`](../../../vllm/v1/attention/backends/short_conv_attn.py) for examples of this. -It is also worth noting that we should update `MAMBA_TYPE_TO_BACKEND_MAP` and `MambaAttentionBackendEnum` in [`registry.py`](../../../vllm/attention/backends/registry.py) when adding a new mamba backend. +It is also worth noting that we should update `MAMBA_TYPE_TO_BACKEND_MAP` and `MambaAttentionBackendEnum` in [`registry.py`](../../../vllm/v1/attention/backends/registry.py) when adding a new mamba backend. Finally, if one wants to support torch compile and CUDA graphs, it necessary to wrap the call to the mamba-like layer inside a custom op and register it. Please see the calls to `direct_register_custom_op` in [vllm/model_executor/models/minimax_text_01.py](../../../vllm/model_executor/models/minimax_text_01.py) or [vllm/model_executor/layers/mamba/short_conv.py](../../../vllm/model_executor/layers/mamba/short_conv.py) for examples of this. The new custom op should then be added to the list `_attention_ops` in [vllm/config/compilation.py](../../../vllm/config/compilation.py) to ensure that piecewise CUDA graphs works as intended. diff --git a/docs/design/custom_op.md b/docs/design/custom_op.md index 6beb0ce0c..fd298a149 100644 --- a/docs/design/custom_op.md +++ b/docs/design/custom_op.md @@ -60,7 +60,7 @@ For example: **1. 
Attention:** ```python ---8<-- "vllm/attention/layers/mm_encoder_attention.py:mm_encoder_attn" +--8<-- "vllm/model_executor/layers/attention/mm_encoder_attention.py:mm_encoder_attn" --8<-- "vllm/model_executor/layers/mla.py:multi_head_latent_attention" ``` diff --git a/docs/design/plugin_system.md b/docs/design/plugin_system.md index 6a4b5fd6b..9cebaed51 100644 --- a/docs/design/plugin_system.md +++ b/docs/design/plugin_system.md @@ -124,7 +124,7 @@ Every plugin has three parts: Please look at the worker base class [WorkerBase][vllm.v1.worker.worker_base.WorkerBase] for more functions that can be implemented. -5. Implement the attention backend class `MyDummyAttention` in `my_dummy_attention.py`. The attention backend class should inherit from [AttentionBackend][vllm.attention.backends.abstract.AttentionBackend]. It's used to calculate attentions with your device. Take `vllm.v1.attention.backends` as examples, it contains many attention backend implementations. +5. Implement the attention backend class `MyDummyAttention` in `my_dummy_attention.py`. The attention backend class should inherit from [AttentionBackend][vllm.v1.attention.backend.AttentionBackend]. It's used to calculate attentions with your device. Take `vllm.v1.attention.backends` as examples, it contains many attention backend implementations. 6. Implement custom ops for high performance. Most ops can be ran by pytorch native implementation, while the performance may not be good. In this case, you can implement specific custom ops for your plugins. Currently, there are kinds of custom ops vLLM supports: @@ -153,5 +153,5 @@ The interface for the model/module may change during vLLM's development. If you !!! warning "Deprecations" - `use_v1` parameter in `Platform.get_attn_backend_cls` is deprecated. It has been removed in v0.13.0. - - `_Backend` in `vllm.attention` is deprecated. It has been removed in v0.13.0. Please use `vllm.attention.backends.registry.register_backend` to add new attention backend to `AttentionBackendEnum` instead. + - `_Backend` in `vllm.attention` is deprecated. It has been removed in v0.13.0. Please use `vllm.v1.attention.backends.registry.register_backend` to add new attention backend to `AttentionBackendEnum` instead. - `seed_everything` platform interface is deprecated. It will be removed in v0.15.0 or later. Please use `vllm.utils.torch_utils.set_random_seed` instead. 
diff --git a/examples/offline_inference/basic/embed.py b/examples/offline_inference/basic/embed.py index 17f727b33..90793fb61 100644 --- a/examples/offline_inference/basic/embed.py +++ b/examples/offline_inference/basic/embed.py @@ -4,10 +4,10 @@ from argparse import Namespace from vllm import LLM, EngineArgs -from vllm.attention.backends.registry import AttentionBackendEnum from vllm.config import AttentionConfig from vllm.platforms import current_platform from vllm.utils.argparse_utils import FlexibleArgumentParser +from vllm.v1.attention.backends.registry import AttentionBackendEnum def parse_args(): diff --git a/examples/offline_inference/basic/score.py b/examples/offline_inference/basic/score.py index b2dadffd2..abe827043 100644 --- a/examples/offline_inference/basic/score.py +++ b/examples/offline_inference/basic/score.py @@ -4,10 +4,10 @@ from argparse import Namespace from vllm import LLM, EngineArgs -from vllm.attention.backends.registry import AttentionBackendEnum from vllm.config import AttentionConfig from vllm.platforms import current_platform from vllm.utils.argparse_utils import FlexibleArgumentParser +from vllm.v1.attention.backends.registry import AttentionBackendEnum def parse_args(): diff --git a/tests/compile/fullgraph/test_full_cudagraph.py b/tests/compile/fullgraph/test_full_cudagraph.py index c1f6f95d5..c7c737371 100644 --- a/tests/compile/fullgraph/test_full_cudagraph.py +++ b/tests/compile/fullgraph/test_full_cudagraph.py @@ -9,10 +9,10 @@ import pytest from tests.utils import wait_for_gpu_memory_to_clear from tests.v1.attention.utils import full_cg_backend_configs as backend_configs from vllm import LLM, SamplingParams -from vllm.attention.backends.registry import AttentionBackendEnum from vllm.config import CompilationConfig from vllm.platforms import current_platform from vllm.utils.torch_utils import is_torch_equal_or_newer +from vllm.v1.attention.backends.registry import AttentionBackendEnum @contextlib.contextmanager diff --git a/tests/compile/fullgraph/test_full_graph.py b/tests/compile/fullgraph/test_full_graph.py index 7fe90c881..209a879bf 100644 --- a/tests/compile/fullgraph/test_full_graph.py +++ b/tests/compile/fullgraph/test_full_graph.py @@ -10,10 +10,10 @@ import torch from tests.quantization.utils import is_quant_method_supported from vllm import LLM, SamplingParams -from vllm.attention.backends.registry import AttentionBackendEnum from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode, PassConfig from vllm.platforms import current_platform from vllm.utils.torch_utils import is_torch_equal_or_newer +from vllm.v1.attention.backends.registry import AttentionBackendEnum from ...utils import create_new_process_for_each_test diff --git a/tests/compile/test_fusion_attn.py b/tests/compile/test_fusion_attn.py index 9e52de5c2..a1fd098ae 100644 --- a/tests/compile/test_fusion_attn.py +++ b/tests/compile/test_fusion_attn.py @@ -9,8 +9,6 @@ from tests.compile.backend import LazyInitPass, TestBackend from tests.utils import flat_product from tests.v1.attention.utils import BatchSpec, create_common_attn_metadata from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant -from vllm.attention.backends.abstract import AttentionMetadata -from vllm.attention.backends.registry import AttentionBackendEnum from vllm.attention.layer import Attention from vllm.compilation.fusion_attn import ATTN_OP, AttnFusionPass from vllm.compilation.fx_utils import find_op_nodes @@ -37,6 +35,8 @@ from vllm.model_executor.layers.quantization.utils.quant_utils 
import ( from vllm.model_executor.layers.quantization.utils.w8a8_utils import Fp8LinearOp from vllm.platforms import current_platform from vllm.utils.flashinfer import has_flashinfer +from vllm.v1.attention.backend import AttentionMetadata +from vllm.v1.attention.backends.registry import AttentionBackendEnum from vllm.v1.kv_cache_interface import AttentionSpec FP8_DTYPE = current_platform.fp8_dtype() diff --git a/tests/compile/test_qk_norm_rope_fusion.py b/tests/compile/test_qk_norm_rope_fusion.py index e0968ac79..45a114679 100644 --- a/tests/compile/test_qk_norm_rope_fusion.py +++ b/tests/compile/test_qk_norm_rope_fusion.py @@ -5,7 +5,6 @@ import pytest import torch from tests.compile.backend import TestBackend -from vllm.attention.backends.abstract import AttentionType from vllm.attention.layer import Attention from vllm.compilation.matcher_utils import FLASHINFER_ROTARY_OP, RMS_OP, ROTARY_OP from vllm.compilation.noop_elimination import NoOpEliminationPass @@ -25,6 +24,7 @@ from vllm.config import ( from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding from vllm.platforms import current_platform +from vllm.v1.attention.backend import AttentionType RSQRT_OP = torch.ops.aten.rsqrt.default INDEX_SELECT_OP = torch.ops.aten.index.Tensor diff --git a/tests/config/test_multimodal_config.py b/tests/config/test_multimodal_config.py index 3d02893e5..51bf93878 100644 --- a/tests/config/test_multimodal_config.py +++ b/tests/config/test_multimodal_config.py @@ -3,8 +3,8 @@ import pytest -from vllm.attention.backends.registry import AttentionBackendEnum from vllm.config.multimodal import MultiModalConfig +from vllm.v1.attention.backends.registry import AttentionBackendEnum def test_mm_encoder_attn_backend_str_conversion(): diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py index 5bb5fcea2..2acb38bc9 100644 --- a/tests/engine/test_arg_utils.py +++ b/tests/engine/test_arg_utils.py @@ -299,7 +299,7 @@ def test_compilation_config(): def test_attention_config(): - from vllm.attention.backends.registry import AttentionBackendEnum + from vllm.v1.attention.backends.registry import AttentionBackendEnum parser = EngineArgs.add_cli_args(FlexibleArgumentParser()) diff --git a/tests/kernels/attention/test_aiter_flash_attn.py b/tests/kernels/attention/test_aiter_flash_attn.py index 68ffb1ee3..cf24630c5 100644 --- a/tests/kernels/attention/test_aiter_flash_attn.py +++ b/tests/kernels/attention/test_aiter_flash_attn.py @@ -6,9 +6,9 @@ import pytest import torch import vllm.v1.attention.backends.rocm_aiter_fa # noqa: F401 -from vllm.attention.utils.fa_utils import is_flash_attn_varlen_func_available from vllm.platforms import current_platform from vllm.utils.torch_utils import set_random_seed +from vllm.v1.attention.backends.fa_utils import is_flash_attn_varlen_func_available NUM_HEADS = [(4, 4), (8, 2)] HEAD_SIZES = [128, 256] diff --git a/tests/kernels/attention/test_attention.py b/tests/kernels/attention/test_attention.py index 24b058ed2..94d494613 100644 --- a/tests/kernels/attention/test_attention.py +++ b/tests/kernels/attention/test_attention.py @@ -10,7 +10,7 @@ from tests.kernels.allclose_default import get_default_atol, get_default_rtol from tests.kernels.utils import opcheck from vllm import _custom_ops as ops from vllm.attention.layer import Attention -from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention +from vllm.model_executor.layers.attention.mm_encoder_attention import 
MMEncoderAttention from vllm.platforms import current_platform from vllm.utils.mem_utils import get_max_shared_memory_bytes from vllm.utils.torch_utils import set_random_seed @@ -30,7 +30,7 @@ NUM_PREFILL_SEQS = [3] # Arbitrary values for testing NUM_HEADS = [(40, 40), (64, 8)] # Arbitrary values for testing # This should be sync with get_supported_head_sizes() in -# vllm.attention.ops.paged_attn.PagedAttention +# vllm.v1.attention.ops.paged_attn.PagedAttention HEAD_SIZES = [32, 80, 128, 256] BLOCK_SIZES = [16, 32] diff --git a/tests/kernels/attention/test_attention_selector.py b/tests/kernels/attention/test_attention_selector.py index d62acc202..a63297c35 100644 --- a/tests/kernels/attention/test_attention_selector.py +++ b/tests/kernels/attention/test_attention_selector.py @@ -6,13 +6,13 @@ from unittest.mock import patch import pytest import torch -from vllm.attention.backends.registry import AttentionBackendEnum -from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend from vllm.config import AttentionConfig, VllmConfig, set_current_vllm_config from vllm.platforms import current_platform from vllm.platforms.cpu import CpuPlatform from vllm.platforms.cuda import CudaPlatform from vllm.platforms.rocm import RocmPlatform +from vllm.v1.attention.backends.registry import AttentionBackendEnum +from vllm.v1.attention.selector import _cached_get_attn_backend, get_attn_backend @pytest.fixture(autouse=True) @@ -182,7 +182,7 @@ def test_backend_selection( expected = name assert backend.get_name() == expected elif name == "FLASH_ATTN_MLA": - from vllm.attention.utils.fa_utils import ( + from vllm.v1.attention.backends.fa_utils import ( flash_attn_supports_mla, ) diff --git a/tests/kernels/attention/test_cache.py b/tests/kernels/attention/test_cache.py index 19892ce26..367a986ab 100644 --- a/tests/kernels/attention/test_cache.py +++ b/tests/kernels/attention/test_cache.py @@ -270,7 +270,7 @@ def test_reshape_and_cache_flash( v_scale, ) elif implementation == "triton": - from vllm.attention.ops.triton_reshape_and_cache_flash import ( + from vllm.v1.attention.ops.triton_reshape_and_cache_flash import ( triton_reshape_and_cache_flash, ) diff --git a/tests/kernels/attention/test_flashmla.py b/tests/kernels/attention/test_flashmla.py index 2151933a6..6b3d3485d 100644 --- a/tests/kernels/attention/test_flashmla.py +++ b/tests/kernels/attention/test_flashmla.py @@ -7,12 +7,12 @@ import random import pytest import torch -from vllm.attention.ops.flashmla import ( +from vllm.triton_utils import triton +from vllm.v1.attention.ops.flashmla import ( flash_mla_with_kvcache, get_mla_metadata, is_flashmla_dense_supported, ) -from vllm.triton_utils import triton def cal_diff( diff --git a/tests/kernels/attention/test_flashmla_sparse.py b/tests/kernels/attention/test_flashmla_sparse.py index 7ee6f4b07..c1147ae9e 100644 --- a/tests/kernels/attention/test_flashmla_sparse.py +++ b/tests/kernels/attention/test_flashmla_sparse.py @@ -5,7 +5,7 @@ import torch def test_sparse_flashmla_metadata_smoke(): - import vllm.attention.ops.flashmla as fm + import vllm.v1.attention.ops.flashmla as fm ok, reason = fm.is_flashmla_sparse_supported() if not ok: @@ -34,7 +34,7 @@ def test_sparse_flashmla_metadata_smoke(): def test_sparse_flashmla_decode_smoke(): - import vllm.attention.ops.flashmla as fm + import vllm.v1.attention.ops.flashmla as fm ok, reason = fm.is_flashmla_sparse_supported() if not ok: @@ -97,7 +97,7 @@ def test_sparse_flashmla_decode_smoke(): def test_sparse_flashmla_prefill_smoke(): - import 
vllm.attention.ops.flashmla as fm + import vllm.v1.attention.ops.flashmla as fm ok, reason = fm.is_flashmla_sparse_supported() if not ok: diff --git a/tests/kernels/attention/test_merge_attn_states.py b/tests/kernels/attention/test_merge_attn_states.py index c7662223e..a9f525cdc 100644 --- a/tests/kernels/attention/test_merge_attn_states.py +++ b/tests/kernels/attention/test_merge_attn_states.py @@ -5,10 +5,10 @@ import pytest import torch from vllm._custom_ops import merge_attn_states as merge_attn_states_cuda -from vllm.attention.ops.triton_merge_attn_states import ( +from vllm.platforms import current_platform +from vllm.v1.attention.ops.triton_merge_attn_states import ( merge_attn_states as merge_attn_states_triton, ) -from vllm.platforms import current_platform # Naive PyTorch Implements section 2.2 of https://www.arxiv.org/pdf/2501.01005 diff --git a/tests/kernels/attention/test_mha_attn.py b/tests/kernels/attention/test_mha_attn.py index 56912c145..ecaea8867 100644 --- a/tests/kernels/attention/test_mha_attn.py +++ b/tests/kernels/attention/test_mha_attn.py @@ -12,14 +12,14 @@ from unittest.mock import patch import pytest import torch -from vllm.attention.backends.registry import AttentionBackendEnum -from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention -from vllm.attention.selector import _cached_get_attn_backend +from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention from vllm.platforms import current_platform from vllm.platforms.cpu import CpuPlatform from vllm.platforms.cuda import CudaPlatform from vllm.platforms.rocm import RocmPlatform from vllm.utils.torch_utils import set_random_seed +from vllm.v1.attention.backends.registry import AttentionBackendEnum +from vllm.v1.attention.selector import _cached_get_attn_backend @pytest.fixture(autouse=True) diff --git a/tests/kernels/attention/test_pack_unpack_triton.py b/tests/kernels/attention/test_pack_unpack_triton.py index d2aa14738..158ae550e 100644 --- a/tests/kernels/attention/test_pack_unpack_triton.py +++ b/tests/kernels/attention/test_pack_unpack_triton.py @@ -4,7 +4,7 @@ import torch from torch.testing import assert_close -from vllm.attention.ops.common import pack_seq_triton, unpack_seq_triton +from vllm.v1.attention.ops.common import pack_seq_triton, unpack_seq_triton def test_pack_seq_basic_fp8(): diff --git a/tests/kernels/attention/test_prefix_prefill.py b/tests/kernels/attention/test_prefix_prefill.py index 45779636e..2dc4a3cd2 100644 --- a/tests/kernels/attention/test_prefix_prefill.py +++ b/tests/kernels/attention/test_prefix_prefill.py @@ -10,10 +10,12 @@ import pytest import torch import torch.nn.functional as F -from vllm.attention.ops.chunked_prefill_paged_decode import chunked_prefill_paged_decode -from vllm.attention.ops.prefix_prefill import context_attention_fwd from vllm.platforms import current_platform from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, set_random_seed +from vllm.v1.attention.ops.chunked_prefill_paged_decode import ( + chunked_prefill_paged_decode, +) +from vllm.v1.attention.ops.prefix_prefill import context_attention_fwd NUM_HEADS = [64] NUM_QUERIES_PER_KV = [1, 64] diff --git a/tests/kernels/attention/test_rocm_attention_selector.py b/tests/kernels/attention/test_rocm_attention_selector.py index f97d475eb..2a684ed70 100644 --- a/tests/kernels/attention/test_rocm_attention_selector.py +++ b/tests/kernels/attention/test_rocm_attention_selector.py @@ -4,10 +4,10 @@ import pytest import torch -from 
vllm.attention.backends.registry import AttentionBackendEnum -from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend from vllm.config import AttentionConfig, VllmConfig, set_current_vllm_config from vllm.platforms.rocm import RocmPlatform +from vllm.v1.attention.backends.registry import AttentionBackendEnum +from vllm.v1.attention.selector import _cached_get_attn_backend, get_attn_backend @pytest.fixture(autouse=True) @@ -19,7 +19,7 @@ def clear_cache(): @pytest.mark.skip(reason="Skipped for now. Should be revisited.") def test_selector(monkeypatch: pytest.MonkeyPatch): # Set the current platform to ROCm using monkeypatch - monkeypatch.setattr("vllm.attention.selector.current_platform", RocmPlatform()) + monkeypatch.setattr("vllm.v1.attention.selector.current_platform", RocmPlatform()) # Test standard ROCm attention attention_config = AttentionConfig(backend=AttentionBackendEnum.ROCM_ATTN) diff --git a/tests/kernels/attention/test_triton_decode_attention.py b/tests/kernels/attention/test_triton_decode_attention.py index 04085fe5f..f6b066a7b 100644 --- a/tests/kernels/attention/test_triton_decode_attention.py +++ b/tests/kernels/attention/test_triton_decode_attention.py @@ -4,8 +4,8 @@ import pytest import torch -from vllm.attention.ops.triton_decode_attention import decode_attention_fwd from vllm.utils.math_utils import cdiv +from vllm.v1.attention.ops.triton_decode_attention import decode_attention_fwd @pytest.mark.parametrize("B", [3, 5]) diff --git a/tests/kernels/attention/test_triton_prefill_attention.py b/tests/kernels/attention/test_triton_prefill_attention.py index 67c52cbfd..f4505d91f 100644 --- a/tests/kernels/attention/test_triton_prefill_attention.py +++ b/tests/kernels/attention/test_triton_prefill_attention.py @@ -5,7 +5,7 @@ import pytest import torch import torch.nn.functional as F -from vllm.attention.ops.triton_prefill_attention import context_attention_fwd +from vllm.v1.attention.ops.triton_prefill_attention import context_attention_fwd def ref_masked_attention( diff --git a/tests/kernels/attention/test_triton_unified_attention.py b/tests/kernels/attention/test_triton_unified_attention.py index 55e359348..a28982250 100644 --- a/tests/kernels/attention/test_triton_unified_attention.py +++ b/tests/kernels/attention/test_triton_unified_attention.py @@ -5,10 +5,10 @@ import pytest import torch -from vllm.attention.ops.triton_unified_attention import unified_attention from vllm.platforms import current_platform from vllm.utils.math_utils import next_power_of_2 from vllm.utils.torch_utils import set_random_seed +from vllm.v1.attention.ops.triton_unified_attention import unified_attention NUM_HEADS = [(4, 4), (8, 2)] HEAD_SIZES = [128, 256] diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py index 72c79370d..ccdacf40c 100644 --- a/tests/kernels/utils.py +++ b/tests/kernels/utils.py @@ -13,11 +13,11 @@ import torch from torch._prims_common import TensorLikeType from tests.kernels.quant_utils import native_w8a8_block_matmul -from vllm.attention.backends.abstract import AttentionType from vllm.model_executor.custom_op import CustomOp from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe.utils import moe_kernel_quantize_input from vllm.utils.torch_utils import make_tensor_with_pad +from vllm.v1.attention.backend import AttentionType # For now, disable "test_aot_dispatch_dynamic" since there are some # bugs related to this test in PyTorch 2.4. 
diff --git a/tests/models/multimodal/generation/test_vit_backend_functionality.py b/tests/models/multimodal/generation/test_vit_backend_functionality.py index 8cea6135b..8f141746e 100644 --- a/tests/models/multimodal/generation/test_vit_backend_functionality.py +++ b/tests/models/multimodal/generation/test_vit_backend_functionality.py @@ -14,10 +14,10 @@ import pytest from transformers import AutoProcessor from vllm import LLM, EngineArgs, SamplingParams -from vllm.attention.backends.registry import AttentionBackendEnum from vllm.multimodal.utils import encode_image_url from vllm.multimodal.video import sample_frames_from_video from vllm.platforms import current_platform +from vllm.v1.attention.backends.registry import AttentionBackendEnum from ....utils import create_new_process_for_each_test from ...utils import dummy_hf_overrides diff --git a/tests/models/quantization/test_fp8.py b/tests/models/quantization/test_fp8.py index f3b85ba0e..9be5fd330 100644 --- a/tests/models/quantization/test_fp8.py +++ b/tests/models/quantization/test_fp8.py @@ -9,7 +9,7 @@ Note: these tests will only pass on L4 GPU. import pytest from tests.quantization.utils import is_quant_method_supported -from vllm.attention.utils.fa_utils import flash_attn_supports_fp8 from vllm.platforms import current_platform +from vllm.v1.attention.backends.fa_utils import flash_attn_supports_fp8 from ..utils import check_logprobs_close diff --git a/tests/test_attention_backend_registry.py b/tests/test_attention_backend_registry.py index 7b90b949a..034749874 100644 --- a/tests/test_attention_backend_registry.py +++ b/tests/test_attention_backend_registry.py @@ -1,10 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from vllm.attention.backends.abstract import ( +from vllm.v1.attention.backend import ( AttentionBackend, AttentionImpl, ) -from vllm.attention.backends.registry import ( +from vllm.v1.attention.backends.registry import ( AttentionBackendEnum, MambaAttentionBackendEnum, register_backend, diff --git a/tests/v1/attention/test_attention_backends.py b/tests/v1/attention/test_attention_backends.py index 80714ac5a..2068c30c0 100644 --- a/tests/v1/attention/test_attention_backends.py +++ b/tests/v1/attention/test_attention_backends.py @@ -15,8 +15,6 @@ from tests.v1.attention.utils import ( create_vllm_config, try_get_attention_backend, ) -from vllm.attention.backends.abstract import AttentionType -from vllm.attention.backends.registry import AttentionBackendEnum from vllm.config import ModelConfig from vllm.platforms import current_platform from vllm.utils.math_utils import cdiv @@ -25,6 +23,8 @@ from vllm.utils.torch_utils import ( is_torch_equal_or_newer, set_random_seed, ) +from vllm.v1.attention.backend import AttentionType +from vllm.v1.attention.backends.registry import AttentionBackendEnum from vllm.v1.attention.backends.utils import ( CommonAttentionMetadata, set_kv_cache_layout, diff --git a/tests/v1/attention/test_mla_backends.py b/tests/v1/attention/test_mla_backends.py index 514bd0526..de80c556b 100644 --- a/tests/v1/attention/test_mla_backends.py +++ b/tests/v1/attention/test_mla_backends.py @@ -18,15 +18,15 @@ from tests.v1.attention.utils import ( create_vllm_config, try_get_attention_backend, ) from vllm import _custom_ops as ops -from vllm.attention.backends.registry import AttentionBackendEnum -from vllm.attention.ops.flashmla import is_flashmla_dense_supported -from vllm.attention.utils.fa_utils import flash_attn_supports_mla from vllm.config.vllm import 
set_current_vllm_config from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase from vllm.utils.math_utils import cdiv from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE +from vllm.v1.attention.backends.fa_utils import flash_attn_supports_mla from vllm.v1.attention.backends.mla.common import QueryLenSupport +from vllm.v1.attention.backends.registry import AttentionBackendEnum from vllm.v1.attention.backends.utils import CommonAttentionMetadata +from vllm.v1.attention.ops.flashmla import is_flashmla_dense_supported from vllm.v1.kv_cache_interface import FullAttentionSpec BACKENDS_TO_TEST = [ diff --git a/tests/v1/attention/test_rocm_attention_backends_selection.py b/tests/v1/attention/test_rocm_attention_backends_selection.py index 77faeb93d..a31c053ae 100644 --- a/tests/v1/attention/test_rocm_attention_backends_selection.py +++ b/tests/v1/attention/test_rocm_attention_backends_selection.py @@ -7,9 +7,9 @@ from unittest.mock import MagicMock, patch import pytest import torch -from vllm.attention.backends.registry import AttentionBackendEnum -from vllm.attention.selector import AttentionSelectorConfig from vllm.platforms import current_platform +from vllm.v1.attention.backends.registry import AttentionBackendEnum +from vllm.v1.attention.selector import AttentionSelectorConfig # ROCm-specific attention backend selection tests pytestmark = pytest.mark.skipif( diff --git a/tests/v1/attention/test_sparse_mla_backends.py b/tests/v1/attention/test_sparse_mla_backends.py index 2b63253b3..1c24cd82f 100644 --- a/tests/v1/attention/test_sparse_mla_backends.py +++ b/tests/v1/attention/test_sparse_mla_backends.py @@ -21,7 +21,6 @@ from tests.v1.attention.utils import ( create_vllm_config, ) from vllm import _custom_ops as ops -from vllm.attention.ops import flashmla from vllm.config import set_current_vllm_config from vllm.model_executor.layers.linear import ColumnParallelLinear from vllm.platforms import current_platform @@ -31,6 +30,7 @@ from vllm.v1.attention.backends.mla.flashmla_sparse import ( triton_convert_req_index_to_global_index, ) from vllm.v1.attention.backends.utils import split_prefill_chunks +from vllm.v1.attention.ops import flashmla SPARSE_BACKEND_BATCH_SPECS = { name: BATCH_SPECS[name] diff --git a/tests/v1/attention/utils.py b/tests/v1/attention/utils.py index 031436a03..71e74f4d5 100644 --- a/tests/v1/attention/utils.py +++ b/tests/v1/attention/utils.py @@ -7,8 +7,6 @@ from dataclasses import dataclass import pytest import torch -from vllm.attention.backends.abstract import AttentionImpl -from vllm.attention.backends.registry import AttentionBackendEnum from vllm.config import ( CacheConfig, CompilationConfig, @@ -20,6 +18,8 @@ from vllm.config import ( VllmConfig, ) from vllm.config.model import ModelDType +from vllm.v1.attention.backend import AttentionImpl +from vllm.v1.attention.backends.registry import AttentionBackendEnum from vllm.v1.attention.backends.utils import ( AttentionMetadataBuilder, CommonAttentionMetadata, diff --git a/tests/v1/determinism/utils.py b/tests/v1/determinism/utils.py index a8013ed22..485eb26c7 100644 --- a/tests/v1/determinism/utils.py +++ b/tests/v1/determinism/utils.py @@ -6,9 +6,9 @@ import random import pytest import torch -from vllm.attention.utils.fa_utils import flash_attn_supports_mla from vllm.platforms import current_platform from vllm.utils.flashinfer import has_flashinfer +from vllm.v1.attention.backends.fa_utils import flash_attn_supports_mla skip_unsupported = pytest.mark.skipif( not 
(current_platform.is_cuda() and current_platform.has_device_capability(80)), diff --git a/tests/v1/kv_connector/unit/test_backwards_compatibility.py b/tests/v1/kv_connector/unit/test_backwards_compatibility.py index 0d29ca5fc..da6a5aadb 100644 --- a/tests/v1/kv_connector/unit/test_backwards_compatibility.py +++ b/tests/v1/kv_connector/unit/test_backwards_compatibility.py @@ -14,12 +14,12 @@ from unittest.mock import patch import pytest -from vllm.attention.backends.abstract import AttentionMetadata from vllm.distributed.kv_transfer.kv_connector.factory import KVConnectorFactory from vllm.distributed.kv_transfer.kv_connector.v1 import ( KVConnectorBase_V1, KVConnectorRole, ) +from vllm.v1.attention.backend import AttentionMetadata from vllm.v1.core.sched.output import SchedulerOutput from .utils import create_scheduler, create_vllm_config diff --git a/tests/v1/spec_decode/test_eagle.py b/tests/v1/spec_decode/test_eagle.py index a5e326e82..3158ff0bd 100644 --- a/tests/v1/spec_decode/test_eagle.py +++ b/tests/v1/spec_decode/test_eagle.py @@ -13,7 +13,6 @@ from tests.v1.attention.utils import ( create_standard_kv_cache_spec, try_get_attention_backend, ) -from vllm.attention.backends.registry import AttentionBackendEnum from vllm.config import ( AttentionConfig, CacheConfig, @@ -27,6 +26,7 @@ from vllm.config import ( from vllm.config.load import LoadConfig from vllm.model_executor.models.llama import LlamaForCausalLM from vllm.platforms import current_platform +from vllm.v1.attention.backends.registry import AttentionBackendEnum from vllm.v1.spec_decode.eagle import EagleProposer from vllm.v1.spec_decode.metadata import SpecDecodeMetadata from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch diff --git a/tests/v1/spec_decode/test_mtp.py b/tests/v1/spec_decode/test_mtp.py index 3b8813ceb..b33dc58ff 100644 --- a/tests/v1/spec_decode/test_mtp.py +++ b/tests/v1/spec_decode/test_mtp.py @@ -12,7 +12,6 @@ from tests.v1.attention.utils import ( create_standard_kv_cache_spec, try_get_attention_backend, ) -from vllm.attention.backends.registry import AttentionBackendEnum from vllm.config import ( CacheConfig, DeviceConfig, @@ -25,6 +24,7 @@ from vllm.config import ( from vllm.config.load import LoadConfig from vllm.model_executor.models.llama import LlamaForCausalLM from vllm.platforms import current_platform +from vllm.v1.attention.backends.registry import AttentionBackendEnum from vllm.v1.spec_decode.eagle import EagleProposer mimo_7b_dir = "XiaomiMiMo/MiMo-7B-Base" diff --git a/tests/v1/spec_decode/test_tree_attention.py b/tests/v1/spec_decode/test_tree_attention.py index 0afeeb891..a0f140cca 100644 --- a/tests/v1/spec_decode/test_tree_attention.py +++ b/tests/v1/spec_decode/test_tree_attention.py @@ -11,9 +11,9 @@ from tests.v1.attention.utils import ( create_vllm_config, try_get_attention_backend, ) -from vllm.attention.backends.registry import AttentionBackendEnum -from vllm.attention.utils.fa_utils import is_flash_attn_varlen_func_available from vllm.config import ParallelConfig, SpeculativeConfig +from vllm.v1.attention.backends.fa_utils import is_flash_attn_varlen_func_available +from vllm.v1.attention.backends.registry import AttentionBackendEnum from vllm.v1.attention.backends.utils import CommonAttentionMetadata if not is_flash_attn_varlen_func_available(): diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index 5108729ae..badbd3e9a 100644 --- a/tests/v1/worker/test_gpu_model_runner.py +++ 
b/tests/v1/worker/test_gpu_model_runner.py @@ -5,8 +5,6 @@ import numpy as np import pytest import torch -from vllm.attention.backends.abstract import MultipleOf -from vllm.attention.backends.registry import AttentionBackendEnum from vllm.attention.layer import Attention from vllm.config import ( AttentionConfig, @@ -27,6 +25,8 @@ from vllm.sampling_params import SamplingParams from vllm.utils.mem_constants import GiB_bytes from vllm.utils.system_utils import update_environment_variables from vllm.utils.torch_utils import set_random_seed +from vllm.v1.attention.backend import MultipleOf +from vllm.v1.attention.backends.registry import AttentionBackendEnum from vllm.v1.core.kv_cache_utils import estimate_max_model_len, get_kv_cache_configs from vllm.v1.core.sched.output import CachedRequestData, NewRequestData, SchedulerOutput from vllm.v1.kv_cache_interface import ( diff --git a/tools/pre_commit/mypy.py b/tools/pre_commit/mypy.py index cb98a856c..4b7f85077 100755 --- a/tools/pre_commit/mypy.py +++ b/tools/pre_commit/mypy.py @@ -73,7 +73,9 @@ EXCLUDE = [ "vllm/model_executor/models", "vllm/model_executor/layers/fla/ops", # Ignore triton kernels in ops. - "vllm/attention/ops", + "vllm/v1/attention/ops", + # TODO(matt): remove. + "vllm/v1/attention/backends/fa_utils.py", ] diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index a09666b65..411d11e5a 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -8,13 +8,6 @@ import torch import torch.nn as nn import vllm.envs as envs -from vllm.attention.backends.abstract import ( - AttentionBackend, - AttentionType, - MLAAttentionImpl, -) -from vllm.attention.backends.registry import AttentionBackendEnum -from vllm.attention.selector import get_attn_backend from vllm.attention.utils.kv_sharing_utils import validate_kv_sharing_target from vllm.attention.utils.kv_transfer_utils import maybe_transfer_kv_layer from vllm.config import CacheConfig, get_current_vllm_config @@ -37,6 +30,13 @@ from vllm.utils.torch_utils import ( direct_register_custom_op, kv_cache_dtype_str_to_dtype, ) +from vllm.v1.attention.backend import ( + AttentionBackend, + AttentionType, + MLAAttentionImpl, +) +from vllm.v1.attention.backends.registry import AttentionBackendEnum +from vllm.v1.attention.selector import get_attn_backend from vllm.v1.kv_cache_interface import ( FullAttentionSpec, KVCacheSpec, diff --git a/vllm/attention/ops/__init__.py b/vllm/attention/ops/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/vllm/config/attention.py b/vllm/config/attention.py index dd62d8882..293045787 100644 --- a/vllm/config/attention.py +++ b/vllm/config/attention.py @@ -6,9 +6,9 @@ from typing import Any, Literal from pydantic import field_validator from pydantic.dataclasses import dataclass -from vllm.attention.backends.registry import AttentionBackendEnum from vllm.config.utils import config from vllm.logger import init_logger +from vllm.v1.attention.backends.registry import AttentionBackendEnum logger = init_logger(__name__) diff --git a/vllm/config/model.py b/vllm/config/model.py index c8b677695..bec1de554 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -12,7 +12,6 @@ from pydantic import ConfigDict, Field, field_validator, model_validator from pydantic.dataclasses import dataclass import vllm.envs as envs -from vllm.attention.backends.registry import AttentionBackendEnum from vllm.config.model_arch import ( ModelArchitectureConfig, ) @@ -50,6 +49,7 @@ from vllm.transformers_utils.model_arch_config_convertor 
import ( from vllm.transformers_utils.runai_utils import ObjectStorageModel, is_runai_obj_uri from vllm.transformers_utils.utils import maybe_model_redirect from vllm.utils.import_utils import LazyLoader +from vllm.v1.attention.backends.registry import AttentionBackendEnum if TYPE_CHECKING: from transformers import PretrainedConfig diff --git a/vllm/config/multimodal.py b/vllm/config/multimodal.py index 8a2936de9..ecb346af8 100644 --- a/vllm/config/multimodal.py +++ b/vllm/config/multimodal.py @@ -7,9 +7,9 @@ from typing import Any, Literal, TypeAlias from pydantic import ConfigDict, Field, field_validator, model_validator from pydantic.dataclasses import dataclass -from vllm.attention.backends.registry import AttentionBackendEnum from vllm.config.utils import config from vllm.utils.hashing import safe_hash +from vllm.v1.attention.backends.registry import AttentionBackendEnum @dataclass @@ -124,7 +124,7 @@ class MultiModalConfig: mm_encoder_attn_backend: AttentionBackendEnum | None = None """Optional override for the multi-modal encoder attention backend when using vision transformers. Accepts any value from - `vllm.attention.backends.registry.AttentionBackendEnum` (e.g. `FLASH_ATTN`).""" + `vllm.v1.attention.backends.registry.AttentionBackendEnum` (e.g. `FLASH_ATTN`).""" interleave_mm_strings: bool = False """Enable fully interleaved support for multimodal prompts, while using --chat-template-content-format=string.""" diff --git a/vllm/distributed/kv_transfer/kv_connector/utils.py b/vllm/distributed/kv_transfer/kv_connector/utils.py index 914ab91b1..1a09f2e6b 100644 --- a/vllm/distributed/kv_transfer/kv_connector/utils.py +++ b/vllm/distributed/kv_transfer/kv_connector/utils.py @@ -10,10 +10,10 @@ from typing import TYPE_CHECKING, Literal import torch -from vllm.attention.backends.abstract import AttentionBackend from vllm.config import get_current_vllm_config from vllm.distributed.kv_transfer.kv_connector.factory import KVConnectorFactory from vllm.logger import init_logger +from vllm.v1.attention.backend import AttentionBackend from vllm.v1.outputs import KVConnectorOutput, ModelRunnerOutput if TYPE_CHECKING: diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py index 0829336f0..fd997d67e 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py @@ -42,8 +42,8 @@ from typing import TYPE_CHECKING, Any, Literal, Optional import torch -from vllm.attention.backends.abstract import AttentionBackend, AttentionMetadata from vllm.logger import init_logger +from vllm.v1.attention.backend import AttentionBackend, AttentionMetadata from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.outputs import KVConnectorOutput diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/decode_bench_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/decode_bench_connector.py index e9b2bd392..525061fc0 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/decode_bench_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/decode_bench_connector.py @@ -36,7 +36,6 @@ from typing import TYPE_CHECKING, Any, Optional import torch -from vllm.attention.backends.abstract import AttentionMetadata from vllm.distributed.kv_transfer.kv_connector.v1 import ( KVConnectorBase_V1, KVConnectorRole, @@ -44,6 +43,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1 import ( from vllm.distributed.kv_transfer.kv_connector.v1.base import KVConnectorMetadata from 
vllm.logger import init_logger from vllm.utils.math_utils import cdiv +from vllm.v1.attention.backend import AttentionMetadata if TYPE_CHECKING: from vllm.config import VllmConfig diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/example_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/example_connector.py index 41243fc86..ca2647194 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/example_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/example_connector.py @@ -7,7 +7,6 @@ from typing import TYPE_CHECKING, Any, Optional import safetensors import torch -from vllm.attention.backends.abstract import AttentionMetadata from vllm.config import VllmConfig from vllm.distributed.kv_transfer.kv_connector.v1.base import ( KVConnectorBase_V1, @@ -16,6 +15,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.base import ( ) from vllm.logger import init_logger from vllm.utils.hashing import safe_hash +from vllm.v1.attention.backend import AttentionMetadata from vllm.v1.attention.backends.mla.common import MLACommonMetadata from vllm.v1.core.sched.output import SchedulerOutput diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py index 7869e08f1..ae2d7442d 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py @@ -5,7 +5,6 @@ from typing import TYPE_CHECKING, Any import torch -from vllm.attention.backends.abstract import AttentionMetadata from vllm.config import VllmConfig from vllm.distributed.kv_events import ( BlockStored, @@ -19,6 +18,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.base import ( KVConnectorRole, ) from vllm.logger import init_logger +from vllm.v1.attention.backend import AttentionMetadata from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.outputs import KVConnectorOutput diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py index beeffd7c6..8159832cc 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py @@ -36,7 +36,6 @@ except ImportError: PluginLauncher as RuntimePluginLauncher, ) -from vllm.attention.backends.abstract import AttentionMetadata from vllm.config import VllmConfig from vllm.distributed.kv_transfer.kv_connector.v1.base import ( KVConnectorBase_V1, @@ -54,6 +53,7 @@ from vllm.distributed.parallel_state import get_tensor_model_parallel_rank, get_ from vllm.sampling_params import SamplingParams from vllm.utils.math_utils import cdiv from vllm.utils.torch_utils import get_kv_cache_torch_dtype +from vllm.v1.attention.backend import AttentionMetadata from vllm.v1.core.sched.output import SchedulerOutput from vllm.version import __version__ as VLLM_VERSION diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py index 9ebd2b1a3..629170615 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py @@ -10,13 +10,13 @@ import zmq from lmcache.integration.vllm.utils import mla_enabled from lmcache.utils import init_logger as lmcache_init_logger -from 
vllm.attention.backends.abstract import AttentionMetadata from vllm.config import VllmConfig from vllm.distributed.kv_transfer.kv_connector.v1.base import ( KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole, ) +from vllm.v1.attention.backend import AttentionMetadata from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.outputs import KVConnectorOutput from vllm.v1.request import RequestStatus diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/mooncake_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/mooncake_connector.py index 91f0c6d48..2c6046172 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/mooncake_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/mooncake_connector.py @@ -16,8 +16,6 @@ import zmq import zmq.asyncio from vllm import envs -from vllm.attention.backends.abstract import AttentionMetadata -from vllm.attention.selector import get_attn_backend from vllm.config import VllmConfig from vllm.distributed.kv_transfer.kv_connector.utils import TpKVTopology from vllm.distributed.kv_transfer.kv_connector.v1.base import ( @@ -33,7 +31,9 @@ from vllm.distributed.parallel_state import ( from vllm.forward_context import ForwardContext from vllm.logger import init_logger from vllm.utils.network_utils import get_ip, make_zmq_path, make_zmq_socket +from vllm.v1.attention.backend import AttentionMetadata from vllm.v1.attention.backends.utils import get_kv_cache_layout +from vllm.v1.attention.selector import get_attn_backend from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.request import RequestStatus diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_connector.py index 4b6bd906d..abdbeb9e4 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_connector.py @@ -15,7 +15,6 @@ import numpy as np import torch import zmq -from vllm.attention.selector import get_attn_backend from vllm.config import VllmConfig from vllm.distributed.kv_transfer.kv_connector.v1.base import ( KVConnectorBase_V1, @@ -56,11 +55,12 @@ from vllm.utils.network_utils import ( make_zmq_path, make_zmq_socket, ) +from vllm.v1.attention.selector import get_attn_backend from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.request import RequestStatus if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionMetadata + from vllm.v1.attention.backend import AttentionMetadata from vllm.v1.core.kv_cache_manager import KVCacheBlocks from vllm.v1.kv_cache_interface import KVCacheConfig from vllm.v1.request import Request diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py index 3fa1cdc1e..412e2c571 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py @@ -7,7 +7,6 @@ from typing import TYPE_CHECKING, Any import torch -from vllm.attention.backends.abstract import AttentionBackend, AttentionMetadata from vllm.config import VllmConfig from vllm.config.kv_transfer import KVTransferConfig from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBaseType @@ -24,6 +23,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.metrics import ( PromMetricT, ) from vllm.logger import init_logger +from vllm.v1.attention.backend import AttentionBackend, 
AttentionMetadata from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.outputs import KVConnectorOutput diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index 8177a26a4..dc50ea678 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -20,8 +20,6 @@ import torch import zmq from vllm import envs -from vllm.attention.backends.abstract import AttentionMetadata -from vllm.attention.selector import get_attn_backend from vllm.config import VllmConfig from vllm.distributed.kv_transfer.kv_connector.utils import ( EngineId, @@ -50,7 +48,9 @@ from vllm.forward_context import ForwardContext from vllm.logger import init_logger from vllm.platforms import current_platform from vllm.utils.network_utils import make_zmq_path, make_zmq_socket +from vllm.v1.attention.backend import AttentionMetadata from vllm.v1.attention.backends.utils import get_kv_cache_layout +from vllm.v1.attention.selector import get_attn_backend from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.worker.block_table import BlockTable diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py index 7f03e0d88..67cf4b047 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py @@ -8,7 +8,6 @@ from typing import Any import torch -from vllm.attention.backends.abstract import AttentionBackend, AttentionMetadata from vllm.attention.layer import Attention from vllm.config import VllmConfig, get_layers_from_vllm_config from vllm.distributed.kv_events import BlockRemoved, BlockStored, KVCacheEvent @@ -20,6 +19,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1 import ( from vllm.distributed.kv_transfer.kv_connector.v1.base import KVConnectorMetadata from vllm.forward_context import ForwardContext from vllm.logger import init_logger +from vllm.v1.attention.backend import AttentionBackend, AttentionMetadata from vllm.v1.core.kv_cache_manager import KVCacheBlocks from vllm.v1.core.kv_cache_utils import BlockHash from vllm.v1.core.sched.output import SchedulerOutput diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py index 8f3a62d7b..09e3b0333 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py @@ -7,7 +7,6 @@ from typing import TYPE_CHECKING, Any, Optional import regex as re import torch -from vllm.attention.backends.abstract import AttentionMetadata from vllm.config import VllmConfig from vllm.distributed.kv_transfer.kv_connector.v1.base import ( KVConnectorBase_V1, @@ -19,6 +18,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.p2p.p2p_nccl_engine import ( ) from vllm.distributed.parallel_state import get_world_group from vllm.logger import init_logger +from vllm.v1.attention.backend import AttentionMetadata from vllm.v1.attention.backends.mla.common import MLACommonMetadata from vllm.v1.core.sched.output import SchedulerOutput diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 55f4c280a..94608b13d 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -32,7 +32,6 @@ from pydantic.fields 
import FieldInfo from typing_extensions import TypeIs import vllm.envs as envs -from vllm.attention.backends.registry import AttentionBackendEnum from vllm.config import ( AttentionConfig, CacheConfig, @@ -94,6 +93,7 @@ from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.utils.mem_constants import GiB_bytes from vllm.utils.network_utils import get_ip from vllm.utils.torch_utils import resolve_kv_cache_dtype_string +from vllm.v1.attention.backends.registry import AttentionBackendEnum from vllm.v1.sample.logits_processor import LogitsProcessor if TYPE_CHECKING: diff --git a/vllm/envs.py b/vllm/envs.py index a9f6123a7..c416da755 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -684,7 +684,7 @@ environment_variables: dict[str, Callable[[], Any]] = { None, lambda: list( __import__( - "vllm.attention.backends.registry", fromlist=["AttentionBackendEnum"] + "vllm.v1.attention.backends.registry", fromlist=["AttentionBackendEnum"] ).AttentionBackendEnum.__members__.keys() ), ), diff --git a/vllm/forward_context.py b/vllm/forward_context.py index 1ef8e5403..9ef0569e8 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -10,10 +10,10 @@ from typing import Any, NamedTuple import torch import vllm.envs as envs -from vllm.attention.backends.abstract import AttentionMetadata from vllm.config import CUDAGraphMode, ParallelConfig, VllmConfig from vllm.logger import init_logger from vllm.platforms import current_platform +from vllm.v1.attention.backend import AttentionMetadata from vllm.v1.worker.dp_utils import coordinate_batch_across_dp from vllm.v1.worker.ubatch_utils import UBatchSlices diff --git a/vllm/attention/backends/__init__.py b/vllm/model_executor/layers/attention/__init__.py similarity index 100% rename from vllm/attention/backends/__init__.py rename to vllm/model_executor/layers/attention/__init__.py diff --git a/vllm/attention/layers/chunked_local_attention.py b/vllm/model_executor/layers/attention/chunked_local_attention.py similarity index 97% rename from vllm/attention/layers/chunked_local_attention.py rename to vllm/model_executor/layers/attention/chunked_local_attention.py index 7e3794d40..a34506934 100644 --- a/vllm/attention/layers/chunked_local_attention.py +++ b/vllm/model_executor/layers/attention/chunked_local_attention.py @@ -4,12 +4,11 @@ import functools import torch -from vllm.attention.backends.abstract import AttentionBackend from vllm.attention.layer import Attention -from vllm.attention.selector import get_attn_backend from vllm.config import CacheConfig from vllm.config.vllm import VllmConfig from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.v1.attention.backend import AttentionBackend from vllm.v1.attention.backends.utils import ( AttentionCGSupport, AttentionMetadataBuilder, @@ -17,6 +16,7 @@ from vllm.v1.attention.backends.utils import ( make_local_attention_virtual_batches, subclass_attention_backend, ) +from vllm.v1.attention.selector import get_attn_backend from vllm.v1.kv_cache_interface import ( AttentionSpec, ChunkedLocalAttentionSpec, diff --git a/vllm/attention/layers/cross_attention.py b/vllm/model_executor/layers/attention/cross_attention.py similarity index 98% rename from vllm/attention/layers/cross_attention.py rename to vllm/model_executor/layers/attention/cross_attention.py index f58c9d541..9c3bc3403 100644 --- a/vllm/attention/layers/cross_attention.py +++ b/vllm/model_executor/layers/attention/cross_attention.py @@ -6,20 +6,20 @@ from copy import copy import numpy as np import torch 
-from vllm.attention.backends.abstract import (
+from vllm.attention.layer import Attention
+from vllm.config import CacheConfig, VllmConfig
+from vllm.logger import init_logger
+from vllm.utils.math_utils import cdiv
+from vllm.v1.attention.backend import (
    AttentionBackend,
    AttentionMetadata,
    AttentionType,
)
-from vllm.attention.layer import Attention
-from vllm.attention.selector import get_attn_backend
-from vllm.config import CacheConfig, VllmConfig
-from vllm.logger import init_logger
-from vllm.utils.math_utils import cdiv
from vllm.v1.attention.backends.utils import (
    CommonAttentionMetadata,
    subclass_attention_backend,
)
+from vllm.v1.attention.selector import get_attn_backend
from vllm.v1.kv_cache_interface import CrossAttentionSpec, KVCacheSpec
logger = init_logger(__name__)
diff --git a/vllm/attention/layers/encoder_only_attention.py b/vllm/model_executor/layers/attention/encoder_only_attention.py
similarity index 96%
rename from vllm/attention/layers/encoder_only_attention.py
rename to vllm/model_executor/layers/attention/encoder_only_attention.py
index 5e99c9901..c130fd095 100644
--- a/vllm/attention/layers/encoder_only_attention.py
+++ b/vllm/model_executor/layers/attention/encoder_only_attention.py
@@ -5,19 +5,19 @@ from copy import copy
import torch
-from vllm.attention.backends.abstract import (
+from vllm.attention.layer import Attention
+from vllm.config import CacheConfig
+from vllm.config.vllm import VllmConfig
+from vllm.v1.attention.backend import (
    AttentionBackend,
    AttentionMetadata,
    AttentionType,
)
-from vllm.attention.layer import Attention
-from vllm.attention.selector import get_attn_backend
-from vllm.config import CacheConfig
-from vllm.config.vllm import VllmConfig
from vllm.v1.attention.backends.utils import (
    CommonAttentionMetadata,
    subclass_attention_backend,
)
+from vllm.v1.attention.selector import get_attn_backend
from vllm.v1.kv_cache_interface import KVCacheSpec
diff --git a/vllm/attention/layers/mm_encoder_attention.py b/vllm/model_executor/layers/attention/mm_encoder_attention.py
similarity index 97%
rename from vllm/attention/layers/mm_encoder_attention.py
rename to vllm/model_executor/layers/attention/mm_encoder_attention.py
index 411bdfa75..099fe2391 100644
--- a/vllm/attention/layers/mm_encoder_attention.py
+++ b/vllm/model_executor/layers/attention/mm_encoder_attention.py
@@ -4,16 +4,16 @@ import torch
-from vllm.attention.backends.registry import AttentionBackendEnum
-from vllm.attention.ops.vit_attn_wrappers import (
-    vit_flash_attn_wrapper,
-    vit_torch_sdpa_wrapper,
-)
-from vllm.attention.utils.fa_utils import get_flash_attn_version
from vllm.config import MultiModalConfig
from vllm.logger import init_logger
from vllm.model_executor.custom_op import CustomOp
from vllm.model_executor.models.vision import get_vit_attn_backend
+from vllm.v1.attention.backends.fa_utils import get_flash_attn_version
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
+from vllm.v1.attention.ops.vit_attn_wrappers import (
+    vit_flash_attn_wrapper,
+    vit_torch_sdpa_wrapper,
+)
logger = init_logger(__name__)
diff --git a/vllm/attention/layers/static_sink_attention.py b/vllm/model_executor/layers/attention/static_sink_attention.py
similarity index 98%
rename from vllm/attention/layers/static_sink_attention.py
rename to vllm/model_executor/layers/attention/static_sink_attention.py
index 13be65d8b..918dff560 100644
--- a/vllm/attention/layers/static_sink_attention.py
+++ b/vllm/model_executor/layers/attention/static_sink_attention.py
@@ -4,26 +4,26 @@ import functools
import torch
-from vllm.attention.backends.abstract import (
-    AttentionBackend,
-    AttentionMetadata,
-    AttentionType,
-)
from vllm.attention.layer import Attention
-from vllm.attention.ops.triton_reshape_and_cache_flash import (
-    triton_reshape_and_cache_flash_diffkv,
-)
-from vllm.attention.selector import get_attn_backend
from vllm.config import CacheConfig, VllmConfig
from vllm.forward_context import ForwardContext, get_forward_context
from vllm.logger import init_logger
from vllm.model_executor.custom_op import CustomOp
from vllm.utils.math_utils import cdiv
from vllm.utils.torch_utils import direct_register_custom_op
+from vllm.v1.attention.backend import (
+    AttentionBackend,
+    AttentionMetadata,
+    AttentionType,
+)
from vllm.v1.attention.backends.utils import (
    CommonAttentionMetadata,
    subclass_attention_backend,
)
+from vllm.v1.attention.ops.triton_reshape_and_cache_flash import (
+    triton_reshape_and_cache_flash_diffkv,
+)
+from vllm.v1.attention.selector import get_attn_backend
from vllm.v1.kv_cache_interface import (
    AttentionSpec,
    KVCacheSpec,
diff --git a/vllm/model_executor/layers/attention_layer_base.py b/vllm/model_executor/layers/attention_layer_base.py
index 24809ccb0..97395b641 100644
--- a/vllm/model_executor/layers/attention_layer_base.py
+++ b/vllm/model_executor/layers/attention_layer_base.py
@@ -4,8 +4,8 @@ from abc import ABC, abstractmethod
-from vllm.attention.backends.abstract import AttentionBackend, AttentionImpl
from vllm.config import VllmConfig
+from vllm.v1.attention.backend import AttentionBackend, AttentionImpl
from vllm.v1.kv_cache_interface import KVCacheSpec
diff --git a/vllm/model_executor/layers/batch_invariant.py b/vllm/model_executor/layers/batch_invariant.py
index 105827088..d3cf9739f 100644
--- a/vllm/model_executor/layers/batch_invariant.py
+++ b/vllm/model_executor/layers/batch_invariant.py
@@ -6,11 +6,11 @@ from typing import Any
import torch
-from vllm.attention.backends.registry import AttentionBackendEnum
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.triton_utils import tl, triton
from vllm.utils.torch_utils import is_torch_equal_or_newer
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
logger = init_logger(__name__)
diff --git a/vllm/model_executor/layers/kda.py b/vllm/model_executor/layers/kda.py
index 27cc38845..fde9ad36b 100644
--- a/vllm/model_executor/layers/kda.py
+++ b/vllm/model_executor/layers/kda.py
@@ -5,7 +5,6 @@ import torch
from einops import rearrange
from torch import nn
-from vllm.attention.backends.abstract import AttentionMetadata
from vllm.config import CacheConfig, ModelConfig, get_current_vllm_config
from vllm.distributed import (
    divide,
@@ -17,6 +16,7 @@ from vllm.logger import init_logger
from vllm.model_executor.model_loader.weight_utils import sharded_weight_loader
from vllm.model_executor.utils import set_weight_attrs
from vllm.utils.torch_utils import direct_register_custom_op
+from vllm.v1.attention.backend import AttentionMetadata
from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadata
from .fla.ops.kda import (
diff --git a/vllm/model_executor/layers/mamba/abstract.py b/vllm/model_executor/layers/mamba/abstract.py
index 74f4383e9..4f45dd6ca 100644
--- a/vllm/model_executor/layers/mamba/abstract.py
+++ b/vllm/model_executor/layers/mamba/abstract.py
@@ -5,10 +5,10 @@ from collections.abc import Iterable
import torch
-from vllm.attention.backends.abstract import AttentionBackend
-from vllm.attention.selector import get_mamba_attn_backend
from vllm.config import VllmConfig
from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
+from vllm.v1.attention.backend import AttentionBackend
+from vllm.v1.attention.selector import get_mamba_attn_backend
from vllm.v1.kv_cache_interface import KVCacheSpec, MambaSpec
diff --git a/vllm/model_executor/layers/mamba/linear_attn.py b/vllm/model_executor/layers/mamba/linear_attn.py
index 8020efbe3..8b5f80f54 100644
--- a/vllm/model_executor/layers/mamba/linear_attn.py
+++ b/vllm/model_executor/layers/mamba/linear_attn.py
@@ -8,7 +8,6 @@ import torch.nn.functional as F
from einops import rearrange
from torch import nn
-from vllm.attention.backends.abstract import AttentionMetadata
from vllm.config import CacheConfig, ModelConfig, get_current_vllm_config
from vllm.distributed.communication_op import tensor_model_parallel_all_reduce
from vllm.distributed.parallel_state import (
@@ -29,6 +28,7 @@ from vllm.model_executor.layers.mamba.mamba_utils import (
)
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.utils.torch_utils import direct_register_custom_op
+from vllm.v1.attention.backend import AttentionMetadata
from vllm.v1.attention.backends.linear_attn import LinearAttentionMetadata
diff --git a/vllm/model_executor/layers/mamba/mamba_mixer2.py b/vllm/model_executor/layers/mamba/mamba_mixer2.py
index 875bc9019..74e4a34b4 100644
--- a/vllm/model_executor/layers/mamba/mamba_mixer2.py
+++ b/vllm/model_executor/layers/mamba/mamba_mixer2.py
@@ -5,7 +5,6 @@ import torch
from torch import nn
-from vllm.attention.backends.abstract import AttentionMetadata
from vllm.config import CacheConfig, ModelConfig, get_current_vllm_config
from vllm.distributed import (
    divide,
@@ -43,6 +42,7 @@ from vllm.model_executor.model_loader.weight_utils import (
)
from vllm.model_executor.utils import set_weight_attrs
from vllm.utils.torch_utils import direct_register_custom_op
+from vllm.v1.attention.backend import AttentionMetadata
from vllm.v1.attention.backends.mamba2_attn import Mamba2AttentionMetadata
# Added by the IBM Team, 2024
diff --git a/vllm/model_executor/layers/mamba/short_conv.py b/vllm/model_executor/layers/mamba/short_conv.py
index e6bfea3a2..14e00bce2 100644
--- a/vllm/model_executor/layers/mamba/short_conv.py
+++ b/vllm/model_executor/layers/mamba/short_conv.py
@@ -4,7 +4,6 @@ import torch
-from vllm.attention.backends.abstract import AttentionMetadata
from vllm.config import CacheConfig, ModelConfig, get_current_vllm_config
from vllm.distributed import get_tensor_model_parallel_world_size
from vllm.forward_context import ForwardContext, get_forward_context
@@ -24,6 +23,7 @@ from vllm.model_executor.layers.mamba.ops.causal_conv1d import (
    causal_conv1d_update,
)
from vllm.utils.torch_utils import direct_register_custom_op
+from vllm.v1.attention.backend import AttentionMetadata
from vllm.v1.attention.backends.short_conv_attn import ShortConvAttentionMetadata
diff --git a/vllm/model_executor/models/afmoe.py b/vllm/model_executor/models/afmoe.py
index f4248b67f..ef6f59e44 100644
--- a/vllm/model_executor/models/afmoe.py
+++ b/vllm/model_executor/models/afmoe.py
@@ -9,7 +9,6 @@ from itertools import islice
import torch
from torch import nn
-from vllm.attention.backends.abstract import AttentionType
from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config
@@ -50,6 +49,7 @@ from vllm.model_executor.models.utils import (
    maybe_prefix,
)
from vllm.sequence import IntermediateTensors
+from vllm.v1.attention.backend import AttentionType
logger = init_logger(__name__)
diff --git a/vllm/model_executor/models/aimv2.py b/vllm/model_executor/models/aimv2.py
index 96ca27ad0..b802bb0ee 100644
--- a/vllm/model_executor/models/aimv2.py
+++ b/vllm/model_executor/models/aimv2.py
@@ -8,10 +8,10 @@ from collections.abc import Iterable
import torch
import torch.nn as nn
-from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention
from vllm.distributed import get_tensor_model_parallel_world_size
from vllm.distributed.utils import divide
from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
from vllm.model_executor.layers.conv import Conv2dLayer
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/apertus.py b/vllm/model_executor/models/apertus.py
index e3f97a718..7d43735c0 100644
--- a/vllm/model_executor/models/apertus.py
+++ b/vllm/model_executor/models/apertus.py
@@ -32,13 +32,14 @@ import torch
from torch import nn
from transformers import ApertusConfig
-from vllm.attention.backends.abstract import AttentionType
from vllm.attention.layer import Attention
-from vllm.attention.layers.encoder_only_attention import EncoderOnlyAttention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
from vllm.model_executor.layers.activation import XIELU
+from vllm.model_executor.layers.attention.encoder_only_attention import (
+    EncoderOnlyAttention,
+)
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import (
    ColumnParallelLinear,
@@ -57,6 +58,7 @@ from vllm.model_executor.model_loader.weight_utils import (
    maybe_remap_kv_scale_name,
)
from vllm.sequence import IntermediateTensors
+from vllm.v1.attention.backend import AttentionType
from .interfaces import SupportsLoRA, SupportsPP
from .utils import (
diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py
index b52f6d2bf..cce01ea50 100644
--- a/vllm/model_executor/models/bert.py
+++ b/vllm/model_executor/models/bert.py
@@ -7,11 +7,13 @@ import torch
from torch import nn
from transformers import BertConfig
-from vllm.attention.layers.encoder_only_attention import EncoderOnlyAttention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, PoolerConfig, VllmConfig
from vllm.distributed import get_tensor_model_parallel_world_size
from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.attention.encoder_only_attention import (
+    EncoderOnlyAttention,
+)
from vllm.model_executor.layers.linear import (
    ColumnParallelLinear,
    QKVParallelLinear,
diff --git a/vllm/model_executor/models/bert_with_rope.py b/vllm/model_executor/models/bert_with_rope.py
index 14794fd6a..a5c43bbb3 100644
--- a/vllm/model_executor/models/bert_with_rope.py
+++ b/vllm/model_executor/models/bert_with_rope.py
@@ -6,7 +6,6 @@ import torch
from torch import nn
from transformers import PretrainedConfig
-from vllm.attention.layers.encoder_only_attention import EncoderOnlyAttention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import (
@@ -16,6 +15,9 @@ from vllm.distributed import (
    tensor_model_parallel_all_reduce,
)
from vllm.model_executor.layers.activation import get_act_and_mul_fn, get_act_fn
+from vllm.model_executor.layers.attention.encoder_only_attention import (
+    EncoderOnlyAttention,
+)
from vllm.model_executor.layers.fused_moe import activation_without_mul, fused_topk
from vllm.model_executor.layers.linear import (
    ColumnParallelLinear,
diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py
index 7387830b3..9279cccd5 100644
--- a/vllm/model_executor/models/blip.py
+++ b/vllm/model_executor/models/blip.py
@@ -9,9 +9,9 @@ import torch
import torch.nn as nn
from transformers import Blip2VisionConfig, BlipVisionConfig
-from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention
from vllm.distributed import divide, get_tensor_model_parallel_world_size
from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
from vllm.model_executor.layers.conv import Conv2dLayer
from vllm.model_executor.layers.linear import (
    ColumnParallelLinear,
diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py
index 1eae71f3a..d18904fdf 100644
--- a/vllm/model_executor/models/clip.py
+++ b/vllm/model_executor/models/clip.py
@@ -15,11 +15,11 @@ from transformers import (
)
from vllm.attention.layer import Attention
-from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention
from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions, MultiModalConfig
from vllm.distributed import divide, get_tensor_model_parallel_world_size
from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
from vllm.model_executor.layers.conv import Conv2dLayer
from vllm.model_executor.layers.linear import (
    ColumnParallelLinear,
diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py
index 9ef038d84..82e6df199 100644
--- a/vllm/model_executor/models/config.py
+++ b/vllm/model_executor/models/config.py
@@ -4,12 +4,12 @@ from copy import deepcopy
from math import lcm
from typing import TYPE_CHECKING
-from vllm.attention.backends.registry import AttentionBackendEnum
from vllm.logger import init_logger
from vllm.model_executor.models import ModelRegistry
from vllm.platforms import current_platform
from vllm.utils.math_utils import cdiv, round_up
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
from vllm.v1.kv_cache_interface import FullAttentionSpec, MambaSpec, MLAAttentionSpec
if TYPE_CHECKING:
diff --git a/vllm/model_executor/models/deepencoder.py b/vllm/model_executor/models/deepencoder.py
index 6b9d09e88..b3e5d920e 100644
--- a/vllm/model_executor/models/deepencoder.py
+++ b/vllm/model_executor/models/deepencoder.py
@@ -18,8 +18,8 @@ import torch.nn as nn
import torch.nn.functional as F
from transformers import CLIPVisionConfig
-from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention
from vllm.config import MultiModalConfig
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
from vllm.model_executor.layers.conv import Conv2dLayer
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py
index 7f1880e44..db0ccd695 100644
--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -33,9 +33,7 @@ from torch import nn
from transformers import DeepseekV2Config, DeepseekV3Config
from vllm._aiter_ops import rocm_aiter_ops
-from vllm.attention.backends.abstract import AttentionBackend
from vllm.attention.layer import Attention
-from vllm.attention.ops.common import pack_seq_triton, unpack_seq_triton
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, ParallelConfig, VllmConfig, get_current_vllm_config
from vllm.distributed import (
@@ -78,10 +76,12 @@ from vllm.platforms import current_platform
from vllm.sequence import IntermediateTensors
from vllm.utils.deep_gemm import fp8_mqa_logits, fp8_paged_mqa_logits
from vllm.utils.torch_utils import direct_register_custom_op
+from vllm.v1.attention.backend import AttentionBackend
from vllm.v1.attention.backends.mla.indexer import (
    DeepseekV32IndexerBackend,
    DeepseekV32IndexerMetadata,
)
+from vllm.v1.attention.ops.common import pack_seq_triton, unpack_seq_triton
from vllm.v1.kv_cache_interface import KVCacheSpec, MLAAttentionSpec
from vllm.v1.worker.workspace import current_workspace_manager
@@ -679,7 +679,9 @@ def sparse_attn_indexer(
)
fp8_mqa_logits_func = fp8_mqa_logits
if current_platform.is_rocm():
-        from vllm.attention.ops.rocm_aiter_mla_sparse import rocm_fp8_mqa_logits
+        from vllm.v1.attention.ops.rocm_aiter_mla_sparse import (
+            rocm_fp8_mqa_logits,
+        )
fp8_mqa_logits_func = rocm_fp8_mqa_logits
logits = fp8_mqa_logits_func(
@@ -729,7 +731,7 @@ def sparse_attn_indexer(
num_padded_tokens = batch_size * next_n
fp8_paged_mqa_logits_func = fp8_paged_mqa_logits
if current_platform.is_rocm():
-            from vllm.attention.ops.rocm_aiter_mla_sparse import (
+            from vllm.v1.attention.ops.rocm_aiter_mla_sparse import (
                rocm_fp8_paged_mqa_logits,
            )
diff --git a/vllm/model_executor/models/dots_ocr.py b/vllm/model_executor/models/dots_ocr.py
index c9e0dc8b9..ac9ad3b67 100644
--- a/vllm/model_executor/models/dots_ocr.py
+++ b/vllm/model_executor/models/dots_ocr.py
@@ -8,10 +8,6 @@ import torch.nn as nn
from torch.nn import LayerNorm
from transformers.models.qwen2_vl import Qwen2VLProcessor
-from vllm.attention.backends.registry import AttentionBackendEnum
-from vllm.attention.layers.mm_encoder_attention import (
-    MMEncoderAttention,
-)
from vllm.config import MultiModalConfig, VllmConfig
from vllm.config.multimodal import BaseDummyOptions
from vllm.distributed import utils as dist_utils
@@ -20,6 +16,9 @@ from vllm.distributed.parallel_state import (
    get_tensor_model_parallel_world_size,
)
from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention.mm_encoder_attention import (
+    MMEncoderAttention,
+)
from vllm.model_executor.layers.conv import Conv2dLayer
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import (
@@ -59,6 +58,7 @@ from vllm.multimodal.inputs import MultiModalDataDict
from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.configs.dotsocr import DotsOCRConfig, DotsVisionConfig
from vllm.utils.tensor_schema import TensorSchema, TensorShape
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
from .vision import run_dp_sharded_mrope_vision_model
diff --git a/vllm/model_executor/models/ernie45_vl.py b/vllm/model_executor/models/ernie45_vl.py
index d47955ea3..a382cb5b6 100644
--- a/vllm/model_executor/models/ernie45_vl.py
+++ b/vllm/model_executor/models/ernie45_vl.py
@@ -36,16 +36,15 @@ import torch.nn.functional as F
from einops import rearrange
from transformers import BatchFeature
-from vllm.attention.backends.registry import AttentionBackendEnum
-from vllm.attention.layers.mm_encoder_attention import (
-    MMEncoderAttention,
-)
from vllm.config import MultiModalConfig, VllmConfig
from vllm.config.multimodal import BaseDummyOptions, VideoDummyOptions
from vllm.distributed import parallel_state
from vllm.distributed import utils as dist_utils
from vllm.logger import init_logger
from vllm.model_executor.layers.activation import QuickGELU
+from vllm.model_executor.layers.attention.mm_encoder_attention import (
+    MMEncoderAttention,
+)
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import (
    ColumnParallelLinear,
@@ -74,6 +73,7 @@ from vllm.multimodal.processing import (
from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.sequence import IntermediateTensors
from vllm.utils.tensor_schema import TensorSchema, TensorShape
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
from .ernie45_vl_moe import Ernie4_5_VLMoeForCausalLM
from .interfaces import (
diff --git a/vllm/model_executor/models/gemma3.py b/vllm/model_executor/models/gemma3.py
index e6a201c66..c8a0ba8c9 100644
--- a/vllm/model_executor/models/gemma3.py
+++ b/vllm/model_executor/models/gemma3.py
@@ -22,13 +22,15 @@ import torch
from torch import nn
from transformers import Gemma3TextConfig
-from vllm.attention.backends.abstract import AttentionType
from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
from vllm.logger import init_logger
from vllm.model_executor.layers.activation import GeluAndMul
+from vllm.model_executor.layers.attention.encoder_only_attention import (
+    EncoderOnlyAttention,
+)
from vllm.model_executor.layers.layernorm import GemmaRMSNorm
from vllm.model_executor.layers.linear import (
    MergedColumnParallelLinear,
@@ -47,8 +49,8 @@ from vllm.model_executor.model_loader.weight_utils import (
    maybe_remap_kv_scale_name,
)
from vllm.sequence import IntermediateTensors
+from vllm.v1.attention.backend import AttentionType
-from ...attention.layers.encoder_only_attention import EncoderOnlyAttention
from .interfaces import SupportsLoRA, SupportsPP
from .utils import (
    AutoWeightsLoader,
diff --git a/vllm/model_executor/models/glm4.py b/vllm/model_executor/models/glm4.py
index 2cd11e66c..06da2a8b3 100644
--- a/vllm/model_executor/models/glm4.py
+++ b/vllm/model_executor/models/glm4.py
@@ -29,7 +29,6 @@ import torch
from torch import nn
from transformers import Glm4Config
-from vllm.attention.backends.abstract import AttentionType
from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
@@ -41,6 +40,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
from vllm.sequence import IntermediateTensors
+from vllm.v1.attention.backend import AttentionType
from .interfaces import SupportsLoRA, SupportsPP
from .llama import LlamaMLP as Glm4MLP
diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py
index 4c4347f5a..05257bd1e 100644
--- a/vllm/model_executor/models/glm4_1v.py
+++ b/vllm/model_executor/models/glm4_1v.py
@@ -46,15 +46,14 @@ from transformers.models.glm4v.image_processing_glm4v import (
from transformers.models.glm4v.video_processing_glm4v import Glm4vVideoProcessor
from transformers.video_utils import VideoMetadata
-from vllm.attention.backends.registry import AttentionBackendEnum
-from vllm.attention.layers.mm_encoder_attention import (
-    MMEncoderAttention,
-)
from vllm.config import MultiModalConfig, VllmConfig
from vllm.config.multimodal import BaseDummyOptions, VideoDummyOptions
from vllm.distributed import get_tensor_model_parallel_world_size, parallel_state
from vllm.distributed import utils as dist_utils
from vllm.logger import init_logger
+from vllm.model_executor.layers.attention.mm_encoder_attention import (
+    MMEncoderAttention,
+)
from vllm.model_executor.layers.conv import Conv2dLayer, Conv3dLayer
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import (
@@ -89,6 +88,7 @@ from vllm.multimodal.processing import (
from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.sequence import IntermediateTensors
from vllm.utils.tensor_schema import TensorSchema, TensorShape
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
from ..layers.activation import SiluAndMul
from .interfaces import (
diff --git a/vllm/model_executor/models/glm4v.py b/vllm/model_executor/models/glm4v.py
index 453a7812a..297237fd1 100644
--- a/vllm/model_executor/models/glm4v.py
+++ b/vllm/model_executor/models/glm4v.py
@@ -19,11 +19,11 @@ from transformers import BatchFeature, PreTrainedTokenizer, TensorType
from transformers.image_utils import ImageInput
from transformers.tokenization_utils_base import TextInput
-from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention
from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions
from vllm.distributed import get_tensor_model_parallel_world_size
from vllm.model_executor.layers.activation import SiluAndMul, get_act_fn
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
from vllm.model_executor.layers.conv import Conv2dLayer
from vllm.model_executor.layers.linear import (
    ColumnParallelLinear,
diff --git a/vllm/model_executor/models/glmasr.py b/vllm/model_executor/models/glmasr.py
index 27d408afd..a3b9a1221 100644
--- a/vllm/model_executor/models/glmasr.py
+++ b/vllm/model_executor/models/glmasr.py
@@ -11,12 +11,12 @@ from transformers import BatchFeature
from transformers.models.glmasr import GlmAsrConfig, GlmAsrProcessor
from transformers.models.whisper import WhisperFeatureExtractor
-from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention
from vllm.config import ModelConfig, SpeechToTextConfig, VllmConfig
from vllm.config.multimodal import BaseDummyOptions
from vllm.distributed.parallel_state import get_tensor_model_parallel_world_size
from vllm.inputs.data import PromptType
from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
from vllm.model_executor.layers.linear import (
    ColumnParallelLinear,
    QKVParallelLinear,
diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py
index 8a8df9f6e..69678188a 100755
--- a/vllm/model_executor/models/gpt_oss.py
+++ b/vllm/model_executor/models/gpt_oss.py
@@ -7,7 +7,6 @@ import torch.distributed as dist
from torch import nn
from transformers import GptOssConfig
-from vllm.attention.backends.abstract import AttentionType
from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
@@ -37,6 +36,7 @@ from vllm.model_executor.models.utils import sequence_parallel_chunk
from vllm.platforms import current_platform
from vllm.sequence import IntermediateTensors
from vllm.utils.math_utils import cdiv
+from vllm.v1.attention.backend import AttentionType
from .interfaces import SupportsEagle3, SupportsLoRA, SupportsPP
from .utils import (
diff --git a/vllm/model_executor/models/hunyuan_v1.py b/vllm/model_executor/models/hunyuan_v1.py
index adb71e93b..1cf6e824f 100644
--- a/vllm/model_executor/models/hunyuan_v1.py
+++ b/vllm/model_executor/models/hunyuan_v1.py
@@ -33,7 +33,6 @@ import torch
from torch import nn
from transformers import PretrainedConfig
-from vllm.attention.backends.abstract import AttentionType
from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config
@@ -65,6 +64,7 @@ from vllm.model_executor.model_loader.weight_utils import (
    maybe_remap_kv_scale_name,
)
from vllm.sequence import IntermediateTensors
+from vllm.v1.attention.backend import AttentionType
from .interfaces import MixtureOfExperts, SupportsLoRA, SupportsPP
from .utils import (
diff --git a/vllm/model_executor/models/hunyuan_vision.py b/vllm/model_executor/models/hunyuan_vision.py
index 6fc56094a..9afb86a89 100644
--- a/vllm/model_executor/models/hunyuan_vision.py
+++ b/vllm/model_executor/models/hunyuan_vision.py
@@ -33,14 +33,13 @@ import torch.nn as nn
import torch.nn.functional as F
from transformers import BatchFeature
-from vllm.attention.backends.registry import AttentionBackendEnum
-from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention
from vllm.config import MultiModalConfig, VllmConfig
from vllm.config.multimodal import BaseDummyOptions
from vllm.distributed import parallel_state
from vllm.distributed import utils as dist_utils
from vllm.logger import init_logger
from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import (
    ColumnParallelLinear,
@@ -81,6 +80,7 @@ from vllm.transformers_utils.configs.hunyuan_vl import (
from vllm.transformers_utils.processors.hunyuan_vl import HunYuanVLProcessor
from vllm.transformers_utils.processors.hunyuan_vl_image import smart_resize
from vllm.utils.tensor_schema import TensorSchema, TensorShape
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
from .interfaces import (
    MultiModalEmbeddings,
diff --git a/vllm/model_executor/models/idefics2_vision_model.py b/vllm/model_executor/models/idefics2_vision_model.py
index ee6ca5eac..c78ad6479 100644
--- a/vllm/model_executor/models/idefics2_vision_model.py
+++ b/vllm/model_executor/models/idefics2_vision_model.py
@@ -27,9 +27,9 @@ from transformers.models.idefics2.configuration_idefics2 import (
    Idefics2VisionConfig,
)
-from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention
from vllm.distributed import get_tensor_model_parallel_world_size
from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
from vllm.model_executor.layers.conv import Conv2dLayer
from vllm.model_executor.layers.linear import (
    ColumnParallelLinear,
diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py
index 5f7ba838a..3e3d60cea 100644
--- a/vllm/model_executor/models/intern_vit.py
+++ b/vllm/model_executor/models/intern_vit.py
@@ -15,7 +15,6 @@ import torch.nn as nn
import torch.nn.functional as F
from transformers import PretrainedConfig
-from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention
from vllm.distributed import (
    divide,
    get_tensor_model_parallel_rank,
@@ -24,6 +23,7 @@ from vllm.distributed import (
    tensor_model_parallel_all_gather,
)
from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
from vllm.model_executor.layers.conv import Conv2dLayer
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/interns1_vit.py b/vllm/model_executor/models/interns1_vit.py
index a16857d61..2b2866d67 100644
--- a/vllm/model_executor/models/interns1_vit.py
+++ b/vllm/model_executor/models/interns1_vit.py
@@ -14,8 +14,8 @@ import torch.nn as nn
from transformers import PretrainedConfig
from transformers.utils import torch_int
-from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention
from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
from vllm.model_executor.layers.conv import Conv2dLayer
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelLinear
diff --git a/vllm/model_executor/models/iquest_loopcoder.py b/vllm/model_executor/models/iquest_loopcoder.py
index 47704aac8..1901cc6e8 100644
--- a/vllm/model_executor/models/iquest_loopcoder.py
+++ b/vllm/model_executor/models/iquest_loopcoder.py
@@ -24,7 +24,6 @@ import torch
from torch import nn
from transformers import PretrainedConfig
-from vllm.attention.backends.abstract import AttentionType
from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
@@ -48,6 +47,7 @@ from vllm.model_executor.model_loader.weight_utils import (
)
from vllm.model_executor.models.llama import LlamaMLP
from vllm.sequence import IntermediateTensors
+from vllm.v1.attention.backend import AttentionType
from .utils import (
    AutoWeightsLoader,
diff --git a/vllm/model_executor/models/isaac.py b/vllm/model_executor/models/isaac.py
index e05df611f..ffcc24446 100644
--- a/vllm/model_executor/models/isaac.py
+++ b/vllm/model_executor/models/isaac.py
@@ -16,11 +16,11 @@ from transformers.image_processing_utils import BatchFeature
from transformers.tokenization_utils import TensorType
from typing_extensions import TypedDict, Unpack
-from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention
from vllm.config import MultiModalConfig, VllmConfig
from vllm.config.model import ModelConfig
from vllm.distributed import parallel_state
from vllm.distributed import utils as dist_utils
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
from vllm.model_executor.layers.linear import (
    ColumnParallelLinear,
    QKVParallelLinear,
diff --git a/vllm/model_executor/models/keye.py b/vllm/model_executor/models/keye.py
index 18a999ea0..8e6b66425 100644
--- a/vllm/model_executor/models/keye.py
+++ b/vllm/model_executor/models/keye.py
@@ -16,13 +16,13 @@ from transformers.feature_extraction_utils import BatchFeature
from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from transformers.utils import torch_int
-from vllm.attention.layers.mm_encoder_attention import (
-    MMEncoderAttention,
-)
from vllm.config import MultiModalConfig, VllmConfig
from vllm.config.multimodal import BaseDummyOptions
from vllm.distributed import get_tensor_model_parallel_world_size
from vllm.logger import init_logger
+from vllm.model_executor.layers.attention.mm_encoder_attention import (
+    MMEncoderAttention,
+)
from vllm.model_executor.layers.conv import Conv2dLayer
from vllm.model_executor.layers.linear import (
    ColumnParallelLinear,
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index 4332acc82..95b5f0f5b 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -31,13 +31,14 @@ import torch
from torch import nn
from transformers import LlamaConfig
-from vllm.attention.backends.abstract import AttentionType
from vllm.attention.layer import Attention
-from vllm.attention.layers.encoder_only_attention import EncoderOnlyAttention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention.encoder_only_attention import (
+    EncoderOnlyAttention,
+)
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import (
    MergedColumnParallelLinear,
@@ -56,6 +57,7 @@ from vllm.model_executor.model_loader.weight_utils import (
    maybe_remap_kv_scale_name,
)
from vllm.sequence import IntermediateTensors
+from vllm.v1.attention.backend import AttentionType
from .adapters import as_embedding_model, as_seq_cls_model
from .interfaces import (
diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py
index 9ed0741ac..dde6db7c2 100644
--- a/vllm/model_executor/models/llama4.py
+++ b/vllm/model_executor/models/llama4.py
@@ -25,7 +25,6 @@ from torch import nn
from transformers import Llama4TextConfig
from vllm.attention.layer import Attention
-from vllm.attention.layers.chunked_local_attention import ChunkedLocalAttention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import (
@@ -34,6 +33,9 @@ from vllm.distributed import (
    tensor_model_parallel_all_gather,
)
from vllm.logger import init_logger
+from vllm.model_executor.layers.attention.chunked_local_attention import (
+    ChunkedLocalAttention,
+)
from vllm.model_executor.layers.fused_moe import SharedFusedMoE
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/mimo_v2_flash.py b/vllm/model_executor/models/mimo_v2_flash.py
index 57aa4d91d..db85073b3 100644
--- a/vllm/model_executor/models/mimo_v2_flash.py
+++ b/vllm/model_executor/models/mimo_v2_flash.py
@@ -6,7 +6,6 @@ from itertools import islice
import torch
from torch import nn
-from vllm.attention.backends.abstract import AttentionType
from vllm.attention.layer import Attention
from vllm.config import (
    CacheConfig,
@@ -43,6 +42,7 @@ from vllm.model_executor.model_loader.weight_utils import (
)
from vllm.model_executor.models.utils import sequence_parallel_chunk
from vllm.sequence import IntermediateTensors
+from vllm.v1.attention.backend import AttentionType
from .interfaces import MixtureOfExperts, SupportsPP
from .utils import (
diff --git a/vllm/model_executor/models/minimax_text_01.py b/vllm/model_executor/models/minimax_text_01.py
index 18509882d..955a73ff1 100644
--- a/vllm/model_executor/models/minimax_text_01.py
+++ b/vllm/model_executor/models/minimax_text_01.py
@@ -14,7 +14,6 @@ import torch
from torch import nn
from transformers import MiniMaxConfig
-from vllm.attention.backends.abstract import AttentionMetadata
from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, ModelConfig, VllmConfig
@@ -48,6 +47,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.models.utils import maybe_prefix
from vllm.sequence import IntermediateTensors
+from vllm.v1.attention.backend import AttentionMetadata
from .interfaces import HasInnerState, IsHybrid
from .utils import PPMissingLayer, is_pp_missing_parameter, make_layers
diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py
index aeea4a140..fb66a03b8 100644
--- a/vllm/model_executor/models/mllama4.py
+++ b/vllm/model_executor/models/mllama4.py
@@ -31,10 +31,10 @@ from transformers.models.llama4.image_processing_llama4_fast import (
    get_best_fit,
)
-from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention
from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions
from vllm.distributed import get_tensor_model_parallel_world_size
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
from vllm.model_executor.layers.fused_moe import FusedMoE
from vllm.model_executor.layers.linear import (
    ColumnParallelLinear,
diff --git a/vllm/model_executor/models/modernbert.py b/vllm/model_executor/models/modernbert.py
index d72b4800c..773948039 100644
--- a/vllm/model_executor/models/modernbert.py
+++ b/vllm/model_executor/models/modernbert.py
@@ -7,10 +7,12 @@ from torch import nn
from transformers import ModernBertConfig
from transformers.activations import ACT2FN
-from vllm.attention.layers.encoder_only_attention import EncoderOnlyAttention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import VllmConfig
from vllm.distributed import get_tensor_model_parallel_world_size
+from vllm.model_executor.layers.attention.encoder_only_attention import (
+    EncoderOnlyAttention,
+)
from vllm.model_executor.layers.linear import QKVParallelLinear, RowParallelLinear
from vllm.model_executor.layers.pooler import DispatchPooler
from vllm.model_executor.layers.pooler.seqwise import (
diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py
index 5ccc5653e..bdfa6178b 100644
--- a/vllm/model_executor/models/molmo.py
+++ b/vllm/model_executor/models/molmo.py
@@ -18,7 +18,6 @@ from transformers.image_utils import ImageInput
from transformers.tokenization_utils_base import TextInput
from vllm.attention.layer import Attention
-from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
from vllm.config.multimodal import BaseDummyOptions
@@ -30,6 +29,7 @@ from vllm.distributed import (
    tensor_model_parallel_all_gather,
)
from vllm.model_executor.layers.activation import MulAndSilu, QuickGELU, SiluAndMul
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import (
    ColumnParallelLinear,
diff --git a/vllm/model_executor/models/moonvit.py b/vllm/model_executor/models/moonvit.py
index c785b9910..c675b2cd6 100644
--- a/vllm/model_executor/models/moonvit.py
+++ b/vllm/model_executor/models/moonvit.py
@@ -52,9 +52,9 @@ import torch.nn.functional as F
from transformers.activations import ACT2FN
from transformers.modeling_utils import PreTrainedModel
-from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention
from vllm.config import MultiModalConfig
from vllm.distributed import divide, get_tensor_model_parallel_world_size
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
from vllm.model_executor.layers.conv import Conv2dLayer
from vllm.model_executor.layers.linear import (
    ColumnParallelLinear,
diff --git a/vllm/model_executor/models/nemotron_nas.py b/vllm/model_executor/models/nemotron_nas.py
index 83ef5e7e1..da0688f71 100644
--- a/vllm/model_executor/models/nemotron_nas.py
+++ b/vllm/model_executor/models/nemotron_nas.py
@@ -31,7 +31,6 @@ import torch
from torch import nn
from transformers import LlamaConfig
-from vllm.attention.backends.abstract import AttentionType
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import get_pp_group
@@ -49,6 +48,7 @@ from vllm.model_executor.model_loader.weight_utils import (
)
from vllm.model_executor.models.llama import LlamaAttention, LlamaMLP
from vllm.sequence import IntermediateTensors
+from vllm.v1.attention.backend import AttentionType
from .interfaces import HasNoOps, SupportsLoRA, SupportsPP
from .utils import (
diff --git a/vllm/model_executor/models/nemotron_parse.py b/vllm/model_executor/models/nemotron_parse.py
index 1e7bb0e43..a88e52b55 100644
--- a/vllm/model_executor/models/nemotron_parse.py
+++ b/vllm/model_executor/models/nemotron_parse.py
@@ -26,7 +26,6 @@ from transformers import (
    TensorType,
)
-from vllm.attention.backends.abstract import AttentionType
from vllm.config import CacheConfig, VllmConfig
from vllm.config.lora import LoRAConfig
from vllm.config.multimodal import BaseDummyOptions
@@ -63,6 +62,7 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.transformers_utils.configs.radio import RadioConfig
from vllm.transformers_utils.tokenizer import AnyTokenizer
from vllm.utils.tensor_schema import TensorSchema, TensorShape
+from vllm.v1.attention.backend import AttentionType
logger = init_logger(__name__)
DEFAULT_FINAL_IMAGE_SIZE = (2048, 1648)
diff --git a/vllm/model_executor/models/openpangu.py b/vllm/model_executor/models/openpangu.py
index 44e3baee0..9f569bcc7 100644
--- a/vllm/model_executor/models/openpangu.py
+++ b/vllm/model_executor/models/openpangu.py
@@ -30,7 +30,6 @@ from torch import nn
from transformers import PretrainedConfig
from vllm.attention.layer import Attention, AttentionType
-from vllm.attention.layers.static_sink_attention import StaticSinkAttention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, ParallelConfig, VllmConfig
from vllm.distributed import (
@@ -42,6 +41,9 @@ from vllm.distributed import (
    tensor_model_parallel_all_gather,
)
from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention.static_sink_attention import (
+    StaticSinkAttention,
+)
from vllm.model_executor.layers.fused_moe import SharedFusedMoE
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/ouro.py b/vllm/model_executor/models/ouro.py
index 829148b4c..f51c0f095 100644
--- a/vllm/model_executor/models/ouro.py
+++ b/vllm/model_executor/models/ouro.py
@@ -33,7 +33,6 @@ import torch
from torch import nn
from transformers import PretrainedConfig
-from vllm.attention.backends.abstract import AttentionType
from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
@@ -57,6 +56,7 @@ from vllm.model_executor.model_loader.weight_utils import (
    maybe_remap_kv_scale_name,
)
from vllm.sequence import IntermediateTensors
+from vllm.v1.attention.backend import AttentionType
from .interfaces import SupportsLoRA
from .utils import (
diff --git a/vllm/model_executor/models/paddleocr_vl.py b/vllm/model_executor/models/paddleocr_vl.py
index 0e5537b86..530974f7f 100644
--- a/vllm/model_executor/models/paddleocr_vl.py
+++ b/vllm/model_executor/models/paddleocr_vl.py
@@ -30,14 +30,13 @@ from transformers.modeling_outputs import (
)
from transformers.utils import torch_int
-from vllm.attention.backends.registry import AttentionBackendEnum
-from vllm.attention.layers.mm_encoder_attention import (
-    MMEncoderAttention,
-)
from vllm.config import MultiModalConfig, VllmConfig
from vllm.config.multimodal import BaseDummyOptions
from vllm.distributed import parallel_state
from vllm.distributed import utils as dist_utils
+from vllm.model_executor.layers.attention.mm_encoder_attention import (
+    MMEncoderAttention,
+)
from vllm.model_executor.layers.conv import Conv2dLayer
from vllm.model_executor.layers.linear import (
    QKVParallelLinear,
@@ -72,6 +71,7 @@ from vllm.multimodal.processing import (
from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.sequence import IntermediateTensors
from vllm.utils.tensor_schema import TensorSchema, TensorShape
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
from .ernie45 import Ernie4_5ForCausalLM
from .interfaces import MultiModalEmbeddings, SupportsMRoPE, SupportsMultiModal
diff --git a/vllm/model_executor/models/plamo2.py b/vllm/model_executor/models/plamo2.py
index 9e052ce0b..225e131ec 100644
--- a/vllm/model_executor/models/plamo2.py
+++ b/vllm/model_executor/models/plamo2.py
@@ -9,7 +9,6 @@ import torch
from torch import nn
from transformers import PretrainedConfig
-from vllm.attention.backends.abstract import AttentionMetadata
from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import VllmConfig, get_current_vllm_config
@@ -66,6 +65,7 @@ from vllm.model_executor.models.utils import (
from vllm.model_executor.utils import set_weight_attrs
from vllm.sequence import IntermediateTensors
from vllm.utils.torch_utils import direct_register_custom_op
+from vllm.v1.attention.backend import AttentionMetadata
from vllm.v1.attention.backends.mamba2_attn import Mamba2AttentionMetadata
diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py
index a91aa2cdf..ab9eac1a9 100644
--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -33,13 +33,14 @@ import torch
from torch import nn
from transformers import Qwen2Config
-from vllm.attention.backends.abstract import AttentionType
from vllm.attention.layer import Attention
-from vllm.attention.layers.encoder_only_attention import EncoderOnlyAttention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention.encoder_only_attention import (
+    EncoderOnlyAttention,
+)
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import (
    MergedColumnParallelLinear,
@@ -59,6 +60,7 @@ from vllm.model_executor.model_loader.weight_utils import (
)
from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.config import is_interleaved, set_default_rope_theta
+from vllm.v1.attention.backend import AttentionType
from .interfaces import SupportsEagle3, SupportsLoRA, SupportsPP
from .utils import (
diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py
index 221e7bb06..6e9e46368 100644
--- a/vllm/model_executor/models/qwen2_5_vl.py
+++ b/vllm/model_executor/models/qwen2_5_vl.py
@@ -41,8 +41,6 @@ from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import (
    Qwen2_5_VLVisionConfig,
)
-from vllm.attention.backends.registry import AttentionBackendEnum
-from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import MultiModalConfig, VllmConfig
from vllm.distributed import parallel_state
@@ -50,6 +48,7 @@ from vllm.distributed import utils as dist_utils
from vllm.forward_context import set_forward_context
from vllm.logger import init_logger
from vllm.model_executor.layers.activation import get_act_and_mul_fn
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
from vllm.model_executor.layers.conv import Conv3dLayer
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import (
@@ -83,6 +82,7 @@ from vllm.multimodal.processing import PromptReplacement, PromptUpdate
from vllm.sequence import IntermediateTensors
from vllm.utils.platform_utils import is_pin_memory_available
from vllm.utils.tensor_schema import TensorSchema, TensorShape
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
from .interfaces import (
    MultiModalEmbeddings,
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index ee2b6c22b..3b0dce7fc 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -43,14 +43,13 @@ from transformers.models.qwen2_vl.configuration_qwen2_vl import (
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
from transformers.models.qwen2_vl.video_processing_qwen2_vl import Qwen2VLVideoProcessor
-from vllm.attention.backends.registry import AttentionBackendEnum
-from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention
from vllm.config import MultiModalConfig, VllmConfig
from vllm.config.multimodal import BaseDummyOptions
from vllm.distributed import parallel_state, tensor_model_parallel_all_gather
from vllm.distributed import utils as dist_utils
from vllm.logger import init_logger
from vllm.model_executor.layers.activation import QuickGELU
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
from vllm.model_executor.layers.conv import Conv3dLayer
from vllm.model_executor.layers.linear import (
    ColumnParallelLinear,
@@ -90,6 +89,7 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.sequence import IntermediateTensors
from vllm.tokenizers import TokenizerLike
from vllm.utils.tensor_schema import TensorSchema, TensorShape
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
from .interfaces import (
    MultiModalEmbeddings,
diff --git a/vllm/model_executor/models/qwen3.py b/vllm/model_executor/models/qwen3.py
index 0d0da52ed..707e0ccfd 100644
--- a/vllm/model_executor/models/qwen3.py
+++ b/vllm/model_executor/models/qwen3.py
@@ -30,7 +30,6 @@ import torch
from torch import nn
from transformers import Qwen3Config
-from vllm.attention.backends.abstract import AttentionType
from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
@@ -44,6 +43,7 @@ from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.config import set_default_rope_theta
+from vllm.v1.attention.backend import AttentionType
from .interfaces import SupportsEagle3, SupportsLoRA, SupportsPP
from .qwen2 import Qwen2MLP as Qwen3MLP
diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py
index 9fded8e6b..c3e45de70 100644
--- a/vllm/model_executor/models/qwen3_next.py
+++ b/vllm/model_executor/models/qwen3_next.py
@@ -10,7 +10,6 @@ from einops import rearrange
from torch import nn
from transformers.activations import ACT2FN
-from vllm.attention.backends.abstract import AttentionMetadata
from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import (
@@ -75,6 +74,7 @@ from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.configs import Qwen3NextConfig
from vllm.triton_utils import tl, triton
from vllm.utils.torch_utils import direct_register_custom_op
+from vllm.v1.attention.backend import AttentionMetadata
from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadata
from .interfaces import (
diff --git a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py
index de8027c43..d17ac6ce8 100755
--- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py
+++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py
@@ -46,7 +46,6 @@ from transformers.models.qwen3_omni_moe.processing_qwen3_omni_moe import (
)
from transformers.models.whisper import WhisperFeatureExtractor
-from vllm.attention.backends.registry import AttentionBackendEnum
from vllm.compilation.decorators import support_torch_compile
from vllm.config import MultiModalConfig, VllmConfig
from vllm.distributed import get_pp_group
@@ -75,6 +74,7 @@ from vllm.multimodal.processing import (
    PromptUpdateDetails,
)
from vllm.sequence import IntermediateTensors
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
from .interfaces import (
    MultiModalEmbeddings,
diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py
index f47062c10..279d28067 100644
--- a/vllm/model_executor/models/qwen3_vl.py
+++ b/vllm/model_executor/models/qwen3_vl.py
@@ -48,7 +48,6 @@ from transformers.models.qwen3_vl.video_processing_qwen3_vl import (
)
from transformers.video_utils import VideoMetadata
-from vllm.attention.backends.registry import AttentionBackendEnum
from vllm.compilation.decorators import support_torch_compile
from vllm.config import MultiModalConfig, VllmConfig
from vllm.config.multimodal import BaseDummyOptions, VideoDummyOptions
@@ -92,6 +91,7 @@ from vllm.multimodal.processing import (
from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.sequence import IntermediateTensors
from vllm.utils.collection_utils import is_list_of
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
from .interfaces import (
    MultiModalEmbeddings,
diff --git a/vllm/model_executor/models/seed_oss.py b/vllm/model_executor/models/seed_oss.py
index f25223c78..91a60bfd1 100644
--- a/vllm/model_executor/models/seed_oss.py
+++ b/vllm/model_executor/models/seed_oss.py
@@ -30,7 +30,6 @@ import torch
from torch import nn
from transformers import PretrainedConfig as SeedOssConfig
-from vllm.attention.backends.abstract import AttentionType
from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
@@ -56,6 +55,7 @@ from vllm.model_executor.model_loader.weight_utils import (
)
from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.config import set_default_rope_theta
+from vllm.v1.attention.backend import AttentionType
from .interfaces import SupportsLoRA, SupportsPP
from .utils import (
diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py
index e39ae4340..c047415d4 100644
--- a/vllm/model_executor/models/siglip.py
+++ b/vllm/model_executor/models/siglip.py
@@ -15,12 +15,14 @@ from transformers import (
    SiglipVisionConfig,
)
-from vllm.attention.layers.encoder_only_attention import EncoderOnlyAttention
-from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention
from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions, MultiModalConfig
from vllm.distributed import divide, get_tensor_model_parallel_world_size
from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.attention.encoder_only_attention import (
+    EncoderOnlyAttention,
+)
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
from vllm.model_executor.layers.conv import Conv2dLayer
from vllm.model_executor.layers.linear import (
    ColumnParallelLinear,
diff --git a/vllm/model_executor/models/siglip2.py b/vllm/model_executor/models/siglip2.py
index f7c91aa28..8fbc408ec 100644
--- a/vllm/model_executor/models/siglip2.py
+++ b/vllm/model_executor/models/siglip2.py
@@ -10,11 +10,11 @@ from torch import nn
from torch.nn import functional as F
from transformers import Siglip2VisionConfig
-from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import MultiModalConfig
from vllm.distributed import get_tensor_model_parallel_world_size
from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
from vllm.model_executor.layers.linear import (
    ColumnParallelLinear,
    QKVParallelLinear,
diff --git a/vllm/model_executor/models/siglip2navit.py b/vllm/model_executor/models/siglip2navit.py
index b560710d9..f4b79da5c 100644
--- a/vllm/model_executor/models/siglip2navit.py
+++ b/vllm/model_executor/models/siglip2navit.py
@@ -11,10 +11,10 @@ from torch.nn import functional as F
from transformers import Siglip2VisionConfig
from transformers.configuration_utils import PretrainedConfig
-from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention
from vllm.config import MultiModalConfig
from vllm.distributed import divide, get_tensor_model_parallel_world_size
from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
from vllm.model_executor.layers.conv import Conv2dLayer
from vllm.model_executor.layers.linear import (
    ColumnParallelLinear,
diff --git a/vllm/model_executor/models/step3_vl.py b/vllm/model_executor/models/step3_vl.py
index 3c965721b..771e5974a 100644
--- a/vllm/model_executor/models/step3_vl.py
+++ b/vllm/model_executor/models/step3_vl.py
@@ -15,11 +15,11 @@ from torchvision import transforms
from torchvision.transforms.functional import InterpolationMode
from transformers import BatchFeature, PretrainedConfig, TensorType
-from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention
from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions
from vllm.distributed import get_tensor_model_parallel_world_size
from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
from vllm.model_executor.layers.conv import Conv2dLayer
from vllm.model_executor.layers.linear import (
    ColumnParallelLinear,
diff --git a/vllm/model_executor/models/transformers/base.py b/vllm/model_executor/models/transformers/base.py
index 2e79ace46..d094bb289 100644
--- a/vllm/model_executor/models/transformers/base.py
+++ b/vllm/model_executor/models/transformers/base.py
@@ -27,13 +27,14 @@ from torch import nn
from transformers import AutoModel
from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
-from vllm.attention.backends.abstract import AttentionType
from vllm.attention.layer import Attention
-from vllm.attention.layers.encoder_only_attention import EncoderOnlyAttention
from vllm.config.utils import getattr_iter
from vllm.distributed import get_pp_group, get_tp_group
from vllm.distributed.utils import get_pp_indices
from vllm.logger import init_logger
+from vllm.model_executor.layers.attention.encoder_only_attention import (
+    EncoderOnlyAttention,
+)
from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
from vllm.model_executor.models.interfaces import (
    SupportsEagle,
@@ -59,6 +60,7 @@ from vllm.model_executor.models.utils import (
    maybe_prefix,
)
from vllm.sequence import IntermediateTensors
+from vllm.v1.attention.backend import AttentionType
if TYPE_CHECKING:
    from transformers import PreTrainedModel
diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py
index 024c50f12..2a4bec774 100644
--- a/vllm/model_executor/models/vision.py
+++ b/vllm/model_executor/models/vision.py
@@ -10,7 +10,6 @@ from typing import Final, Generic, Literal, Protocol, TypeAlias, TypeVar
import torch
from transformers import PretrainedConfig
-from vllm.attention.backends.registry import AttentionBackendEnum
from vllm.config import VllmConfig
from vllm.distributed import (
    get_tensor_model_parallel_rank,
@@ -19,6 +18,7 @@
from vllm.distributed import ( ) from vllm.logger import init_logger from vllm.platforms import current_platform +from vllm.v1.attention.backends.registry import AttentionBackendEnum logger = init_logger(__name__) diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py index ccd063d93..14d646f85 100644 --- a/vllm/model_executor/models/whisper.py +++ b/vllm/model_executor/models/whisper.py @@ -18,18 +18,15 @@ from transformers import ( ) from transformers.models.whisper.modeling_whisper import sinusoids -from vllm.attention.backends.abstract import ( - AttentionType, -) from vllm.attention.layer import Attention -from vllm.attention.layers.cross_attention import CrossAttention -from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention from vllm.config import CacheConfig, ModelConfig, SpeechToTextConfig, VllmConfig from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import get_tensor_model_parallel_world_size from vllm.inputs.data import PromptType from vllm.logger import init_logger from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.attention.cross_attention import CrossAttention +from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention from vllm.model_executor.layers.linear import ( ColumnParallelLinear, QKVParallelLinear, @@ -62,6 +59,9 @@ from vllm.transformers_utils.processor import cached_processor_from_config from vllm.utils.jsontree import json_map_leaves from vllm.utils.tensor_schema import TensorSchema, TensorShape from vllm.utils.torch_utils import set_default_torch_dtype +from vllm.v1.attention.backend import ( + AttentionType, +) from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsTranscription from .utils import ( diff --git a/vllm/model_executor/models/whisper_utils.py b/vllm/model_executor/models/whisper_utils.py index 077b4aff6..0bd0db061 100644 --- a/vllm/model_executor/models/whisper_utils.py +++ b/vllm/model_executor/models/whisper_utils.py @@ -9,20 +9,20 @@ import torch import torch.nn.functional as F from torch import nn -from vllm.attention.backends.abstract import ( +from vllm.attention.layer import Attention +from vllm.config import CacheConfig, VllmConfig +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.v1.attention.backend import ( AttentionBackend, AttentionMetadata, AttentionType, ) -from vllm.attention.layer import Attention -from vllm.attention.selector import get_attn_backend -from vllm.config import CacheConfig, VllmConfig -from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend from vllm.v1.attention.backends.utils import ( CommonAttentionMetadata, subclass_attention_backend_with_overrides, ) +from vllm.v1.attention.selector import get_attn_backend from vllm.v1.kv_cache_interface import AttentionSpec # From https://platform.openai.com/docs/guides/speech-to-text/supported-languages diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index c3adc0036..949e9f41e 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -15,16 +15,16 @@ import regex as re import torch from vllm import envs -from vllm.attention.backends.registry import AttentionBackendEnum from vllm.logger import init_logger +from vllm.v1.attention.backends.registry import AttentionBackendEnum from .interface import CpuArchEnum, Platform, PlatformEnum logger = init_logger(__name__) if 
TYPE_CHECKING: - from vllm.attention.selector import AttentionSelectorConfig from vllm.config import VllmConfig + from vllm.v1.attention.selector import AttentionSelectorConfig else: VllmConfig = None diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 2dc4ba5d7..47d634416 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -14,17 +14,17 @@ from typing_extensions import ParamSpec # import custom ops, trigger op registration import vllm._C # noqa -from vllm.attention.backends.registry import AttentionBackendEnum from vllm.logger import init_logger from vllm.utils.import_utils import import_pynvml from vllm.utils.torch_utils import cuda_device_count_stateless +from vllm.v1.attention.backends.registry import AttentionBackendEnum from .interface import DeviceCapability, Platform, PlatformEnum if TYPE_CHECKING: - from vllm.attention.selector import AttentionSelectorConfig from vllm.config import VllmConfig from vllm.config.cache import CacheDType + from vllm.v1.attention.selector import AttentionSelectorConfig else: VllmConfig = None CacheDType = None @@ -148,7 +148,7 @@ class CudaPlatformBase(Platform): @classmethod def check_and_update_config(cls, vllm_config: "VllmConfig") -> None: - from vllm.attention.backends.registry import AttentionBackendEnum + from vllm.v1.attention.backends.registry import AttentionBackendEnum parallel_config = vllm_config.parallel_config model_config = vllm_config.model_config @@ -200,7 +200,7 @@ class CudaPlatformBase(Platform): use_cutlass_mla = backend == AttentionBackendEnum.CUTLASS_MLA use_flashinfer_mla = backend == AttentionBackendEnum.FLASHINFER_MLA - from vllm.attention.ops.flashmla import is_flashmla_dense_supported + from vllm.v1.attention.ops.flashmla import is_flashmla_dense_supported if ( use_flashmla diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index 3bea498f1..f86abd712 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -13,18 +13,18 @@ import numpy as np import torch from typing_extensions import deprecated -from vllm.attention.backends.registry import AttentionBackendEnum from vllm.logger import init_logger +from vllm.v1.attention.backends.registry import AttentionBackendEnum if TYPE_CHECKING: from torch.distributed import PrefixStore, ProcessGroup - from vllm.attention.selector import AttentionSelectorConfig from vllm.config import VllmConfig from vllm.inputs import ProcessorInputs, PromptType from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams from vllm.utils.argparse_utils import FlexibleArgumentParser + from vllm.v1.attention.selector import AttentionSelectorConfig else: FlexibleArgumentParser = object diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index 278be5a71..3a55dd36d 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -8,15 +8,15 @@ from typing import TYPE_CHECKING, Optional import torch import vllm.envs as envs -from vllm.attention.backends.registry import AttentionBackendEnum from vllm.logger import init_logger from vllm.utils.torch_utils import cuda_device_count_stateless +from vllm.v1.attention.backends.registry import AttentionBackendEnum from .interface import DeviceCapability, Platform, PlatformEnum if TYPE_CHECKING: - from vllm.attention.selector import AttentionSelectorConfig from vllm.config import VllmConfig + from vllm.v1.attention.selector import AttentionSelectorConfig logger = init_logger(__name__) diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index 
60e4968ab..b2d7bf38d 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -7,14 +7,14 @@ from typing import TYPE_CHECKING, Optional import torch -from vllm.attention.backends.registry import AttentionBackendEnum from vllm.logger import init_logger +from vllm.v1.attention.backends.registry import AttentionBackendEnum from .interface import DeviceCapability, Platform, PlatformEnum if TYPE_CHECKING: - from vllm.attention.selector import AttentionSelectorConfig from vllm.config import VllmConfig + from vllm.v1.attention.selector import AttentionSelectorConfig else: VllmConfig = None diff --git a/vllm/attention/backends/abstract.py b/vllm/v1/attention/backend.py similarity index 100% rename from vllm/attention/backends/abstract.py rename to vllm/v1/attention/backend.py diff --git a/vllm/v1/attention/backends/cpu_attn.py b/vllm/v1/attention/backends/cpu_attn.py index abbee244a..3fc53278a 100644 --- a/vllm/v1/attention/backends/cpu_attn.py +++ b/vllm/v1/attention/backends/cpu_attn.py @@ -6,16 +6,16 @@ from typing import ClassVar import torch from vllm import _custom_ops as ops -from vllm.attention.backends.abstract import ( +from vllm.config import VllmConfig +from vllm.logger import init_logger +from vllm.platforms import CpuArchEnum, current_platform +from vllm.v1.attention.backend import ( AttentionBackend, AttentionImpl, AttentionLayer, AttentionType, is_quantized_kv_cache, ) -from vllm.config import VllmConfig -from vllm.logger import init_logger -from vllm.platforms import CpuArchEnum, current_platform from vllm.v1.attention.backends.utils import ( AttentionMetadataBuilder, CommonAttentionMetadata, diff --git a/vllm/attention/utils/fa_utils.py b/vllm/v1/attention/backends/fa_utils.py similarity index 100% rename from vllm/attention/utils/fa_utils.py rename to vllm/v1/attention/backends/fa_utils.py diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index 24390605a..aa51c1a43 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -9,24 +9,24 @@ from typing import ClassVar import numpy as np import torch -from vllm.attention.backends.abstract import ( +from vllm.attention.layer import Attention +from vllm.v1.attention.backend import ( AttentionBackend, AttentionImpl, AttentionType, MultipleOf, is_quantized_kv_cache, ) -from vllm.attention.layer import Attention -from vllm.attention.ops.common import cp_lse_ag_out_rs -from vllm.attention.ops.merge_attn_states import merge_attn_states -from vllm.attention.utils.fa_utils import ( +from vllm.v1.attention.backends.fa_utils import ( flash_attn_supports_fp8, get_flash_attn_version, is_flash_attn_varlen_func_available, ) +from vllm.v1.attention.ops.common import cp_lse_ag_out_rs +from vllm.v1.attention.ops.merge_attn_states import merge_attn_states if is_flash_attn_varlen_func_available(): - from vllm.attention.utils.fa_utils import ( + from vllm.v1.attention.backends.fa_utils import ( flash_attn_supports_sinks, flash_attn_varlen_func, get_scheduler_metadata, diff --git a/vllm/v1/attention/backends/flash_attn_diffkv.py b/vllm/v1/attention/backends/flash_attn_diffkv.py index ebbc4a02c..5305cc1b8 100644 --- a/vllm/v1/attention/backends/flash_attn_diffkv.py +++ b/vllm/v1/attention/backends/flash_attn_diffkv.py @@ -4,14 +4,14 @@ import torch -from vllm.attention.backends.abstract import AttentionType -from vllm.attention.ops.triton_reshape_and_cache_flash import ( +from vllm.v1.attention.backend import AttentionType +from 
vllm.v1.attention.backends.fa_utils import is_flash_attn_varlen_func_available +from vllm.v1.attention.ops.triton_reshape_and_cache_flash import ( triton_reshape_and_cache_flash_diffkv, ) -from vllm.attention.utils.fa_utils import is_flash_attn_varlen_func_available if is_flash_attn_varlen_func_available(): - from vllm.attention.utils.fa_utils import flash_attn_varlen_func + from vllm.v1.attention.backends.fa_utils import flash_attn_varlen_func from vllm.logger import init_logger from vllm.v1.attention.backends.utils import get_kv_cache_layout diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index 0bdf396d8..8dc2838d8 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -19,14 +19,6 @@ from flashinfer.utils import FP4Tensor from typing_extensions import override from vllm import envs -from vllm.attention.backends.abstract import ( - AttentionBackend, - AttentionImpl, - AttentionType, - MultipleOf, -) -from vllm.attention.ops.common import cp_lse_ag_out_rs -from vllm.attention.ops.merge_attn_states import merge_attn_states from vllm.config import CUDAGraphMode, VllmConfig, get_current_vllm_config from vllm.config.cache import CacheDType from vllm.distributed.parallel_state import get_dcp_group @@ -48,6 +40,12 @@ from vllm.utils.flashinfer import ( ) from vllm.utils.math_utils import cdiv from vllm.utils.platform_utils import is_pin_memory_available +from vllm.v1.attention.backend import ( + AttentionBackend, + AttentionImpl, + AttentionType, + MultipleOf, +) from vllm.v1.attention.backends.utils import ( AttentionCGSupport, AttentionMetadataBuilder, @@ -59,6 +57,8 @@ from vllm.v1.attention.backends.utils import ( infer_global_hyperparameters, split_decodes_and_prefills, ) +from vllm.v1.attention.ops.common import cp_lse_ag_out_rs +from vllm.v1.attention.ops.merge_attn_states import merge_attn_states from vllm.v1.kv_cache_interface import AttentionSpec from vllm.v1.utils import CpuGpuBuffer diff --git a/vllm/v1/attention/backends/flex_attention.py b/vllm/v1/attention/backends/flex_attention.py index ad99a6dad..994bbe3c9 100644 --- a/vllm/v1/attention/backends/flex_attention.py +++ b/vllm/v1/attention/backends/flex_attention.py @@ -20,12 +20,6 @@ from torch.nn.attention.flex_attention import ( or_masks, ) -from vllm.attention.backends.abstract import ( - AttentionBackend, - AttentionImpl, - AttentionType, - is_quantized_kv_cache, -) from vllm.config import VllmConfig from vllm.config.cache import CacheDType from vllm.logger import init_logger @@ -35,6 +29,12 @@ from vllm.model_executor.layers.batch_invariant import ( from vllm.platforms import current_platform from vllm.utils.math_utils import cdiv from vllm.utils.torch_utils import is_torch_equal_or_newer +from vllm.v1.attention.backend import ( + AttentionBackend, + AttentionImpl, + AttentionType, + is_quantized_kv_cache, +) from vllm.v1.attention.backends.utils import ( AttentionMetadataBuilder, CommonAttentionMetadata, diff --git a/vllm/v1/attention/backends/gdn_attn.py b/vllm/v1/attention/backends/gdn_attn.py index 96f0d20ac..1d58ac683 100644 --- a/vllm/v1/attention/backends/gdn_attn.py +++ b/vllm/v1/attention/backends/gdn_attn.py @@ -6,8 +6,8 @@ from dataclasses import dataclass import torch -from vllm.attention.backends.abstract import AttentionBackend from vllm.config import VllmConfig +from vllm.v1.attention.backend import AttentionBackend from vllm.v1.attention.backends.utils import ( PAD_SLOT_ID, AttentionCGSupport, diff --git 
a/vllm/v1/attention/backends/linear_attn.py b/vllm/v1/attention/backends/linear_attn.py index 004baa2d0..b1aad30ee 100644 --- a/vllm/v1/attention/backends/linear_attn.py +++ b/vllm/v1/attention/backends/linear_attn.py @@ -4,8 +4,8 @@ from dataclasses import dataclass import torch -from vllm.attention.backends.abstract import AttentionBackend from vllm.config import VllmConfig +from vllm.v1.attention.backend import AttentionBackend from vllm.v1.attention.backends.utils import ( AttentionCGSupport, AttentionMetadataBuilder, diff --git a/vllm/v1/attention/backends/mamba1_attn.py b/vllm/v1/attention/backends/mamba1_attn.py index 47dd44601..9d4a37576 100644 --- a/vllm/v1/attention/backends/mamba1_attn.py +++ b/vllm/v1/attention/backends/mamba1_attn.py @@ -3,7 +3,7 @@ from dataclasses import dataclass -from vllm.attention.backends.abstract import AttentionBackend +from vllm.v1.attention.backend import AttentionBackend from vllm.v1.attention.backends.mamba_attn import ( BaseMambaAttentionMetadata, BaseMambaAttentionMetadataBuilder, diff --git a/vllm/v1/attention/backends/mamba2_attn.py b/vllm/v1/attention/backends/mamba2_attn.py index 74925a86e..a5f661d5d 100644 --- a/vllm/v1/attention/backends/mamba2_attn.py +++ b/vllm/v1/attention/backends/mamba2_attn.py @@ -5,9 +5,9 @@ from dataclasses import dataclass, replace import torch -from vllm.attention.backends.abstract import AttentionBackend from vllm.config import VllmConfig from vllm.utils.math_utils import cdiv +from vllm.v1.attention.backend import AttentionBackend from vllm.v1.attention.backends.mamba_attn import ( BaseMambaAttentionMetadata, BaseMambaAttentionMetadataBuilder, diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index 2ee2740a5..a5bd949e9 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -199,15 +199,6 @@ from tqdm import tqdm from vllm import _custom_ops as ops from vllm import envs from vllm._aiter_ops import rocm_aiter_ops -from vllm.attention.backends.abstract import ( - AttentionBackend, - AttentionLayer, - AttentionMetadata, - MLAAttentionImpl, -) -from vllm.attention.ops.common import cp_lse_ag_out_rs -from vllm.attention.ops.merge_attn_states import merge_attn_states -from vllm.attention.utils.fa_utils import get_flash_attn_version from vllm.config import ModelConfig, VllmConfig, get_current_vllm_config from vllm.distributed.parallel_state import get_dcp_group, is_global_first_rank from vllm.logger import init_logger @@ -222,6 +213,13 @@ from vllm.model_executor.layers.linear import ( from vllm.platforms import current_platform from vllm.utils.flashinfer import has_nvidia_artifactory from vllm.utils.math_utils import cdiv, round_down +from vllm.v1.attention.backend import ( + AttentionBackend, + AttentionLayer, + AttentionMetadata, + MLAAttentionImpl, +) +from vllm.v1.attention.backends.fa_utils import get_flash_attn_version from vllm.v1.attention.backends.utils import ( AttentionMetadataBuilder, CommonAttentionMetadata, @@ -230,6 +228,8 @@ from vllm.v1.attention.backends.utils import ( infer_global_hyperparameters, split_decodes_and_prefills, ) +from vllm.v1.attention.ops.common import cp_lse_ag_out_rs +from vllm.v1.attention.ops.merge_attn_states import merge_attn_states from vllm.v1.kv_cache_interface import AttentionSpec diff --git a/vllm/v1/attention/backends/mla/cutlass_mla.py b/vllm/v1/attention/backends/mla/cutlass_mla.py index 5e3fbc0ab..8cb8fa1f5 100644 --- a/vllm/v1/attention/backends/mla/cutlass_mla.py +++ 
b/vllm/v1/attention/backends/mla/cutlass_mla.py @@ -7,15 +7,15 @@ from typing import ClassVar import torch import vllm._custom_ops as ops -from vllm.attention.backends.abstract import ( +from vllm.config.cache import CacheDType +from vllm.logger import init_logger +from vllm.platforms.interface import DeviceCapability +from vllm.v1.attention.backend import ( AttentionLayer, AttentionType, MultipleOf, is_quantized_kv_cache, ) -from vllm.config.cache import CacheDType -from vllm.logger import init_logger -from vllm.platforms.interface import DeviceCapability from vllm.v1.attention.backends.mla.common import ( MLACommonBackend, MLACommonImpl, diff --git a/vllm/v1/attention/backends/mla/flashattn_mla.py b/vllm/v1/attention/backends/mla/flashattn_mla.py index 915b51c25..2e0a19ac5 100644 --- a/vllm/v1/attention/backends/mla/flashattn_mla.py +++ b/vllm/v1/attention/backends/mla/flashattn_mla.py @@ -6,16 +6,6 @@ from typing import ClassVar import torch -from vllm.attention.backends.abstract import ( - AttentionLayer, - AttentionType, - MultipleOf, - is_quantized_kv_cache, -) -from vllm.attention.utils.fa_utils import ( - flash_attn_supports_mla, - get_flash_attn_version, -) from vllm.config import VllmConfig from vllm.config.cache import CacheDType from vllm.logger import init_logger @@ -23,6 +13,16 @@ from vllm.model_executor.layers.batch_invariant import ( vllm_is_batch_invariant, ) from vllm.platforms.interface import DeviceCapability +from vllm.v1.attention.backend import ( + AttentionLayer, + AttentionType, + MultipleOf, + is_quantized_kv_cache, +) +from vllm.v1.attention.backends.fa_utils import ( + flash_attn_supports_mla, + get_flash_attn_version, +) from vllm.v1.attention.backends.mla.common import ( MLACommonBackend, MLACommonDecodeMetadata, diff --git a/vllm/v1/attention/backends/mla/flashinfer_mla.py b/vllm/v1/attention/backends/mla/flashinfer_mla.py index f02a4bb1e..c0442b13f 100644 --- a/vllm/v1/attention/backends/mla/flashinfer_mla.py +++ b/vllm/v1/attention/backends/mla/flashinfer_mla.py @@ -6,14 +6,14 @@ from typing import ClassVar import torch from flashinfer.decode import trtllm_batch_decode_with_kv_cache_mla -from vllm.attention.backends.abstract import ( +from vllm.config.cache import CacheDType +from vllm.logger import init_logger +from vllm.platforms.interface import DeviceCapability +from vllm.v1.attention.backend import ( AttentionLayer, AttentionType, MultipleOf, ) -from vllm.config.cache import CacheDType -from vllm.logger import init_logger -from vllm.platforms.interface import DeviceCapability from vllm.v1.attention.backends.mla.common import ( MLACommonBackend, MLACommonImpl, diff --git a/vllm/v1/attention/backends/mla/flashmla.py b/vllm/v1/attention/backends/mla/flashmla.py index 913503ce4..24ef6dd4d 100644 --- a/vllm/v1/attention/backends/mla/flashmla.py +++ b/vllm/v1/attention/backends/mla/flashmla.py @@ -6,12 +6,6 @@ from typing import ClassVar import torch -from vllm.attention.backends.abstract import AttentionLayer, AttentionType, MultipleOf -from vllm.attention.ops.flashmla import ( - flash_mla_with_kvcache, - get_mla_metadata, - is_flashmla_dense_supported, -) from vllm.config import VllmConfig from vllm.config.cache import CacheDType from vllm.logger import init_logger @@ -19,6 +13,7 @@ from vllm.model_executor.layers.batch_invariant import ( vllm_is_batch_invariant, ) from vllm.platforms.interface import DeviceCapability +from vllm.v1.attention.backend import AttentionLayer, AttentionType, MultipleOf from vllm.v1.attention.backends.mla.common import ( 
MLACommonBackend, MLACommonDecodeMetadata, @@ -32,6 +27,11 @@ from vllm.v1.attention.backends.utils import ( reshape_attn_output_for_spec_decode, reshape_query_for_spec_decode, ) +from vllm.v1.attention.ops.flashmla import ( + flash_mla_with_kvcache, + get_mla_metadata, + is_flashmla_dense_supported, +) from vllm.v1.kv_cache_interface import AttentionSpec logger = init_logger(__name__) @@ -78,11 +78,11 @@ class FlashMLABackend(MLACommonBackend): device_capability: DeviceCapability, ) -> str | None: if use_sparse: - from vllm.attention.ops.flashmla import is_flashmla_sparse_supported + from vllm.v1.attention.ops.flashmla import is_flashmla_sparse_supported return is_flashmla_sparse_supported()[1] else: - from vllm.attention.ops.flashmla import is_flashmla_dense_supported + from vllm.v1.attention.ops.flashmla import is_flashmla_dense_supported return is_flashmla_dense_supported()[1] diff --git a/vllm/v1/attention/backends/mla/flashmla_sparse.py b/vllm/v1/attention/backends/mla/flashmla_sparse.py index dec92d2d4..282880adf 100644 --- a/vllm/v1/attention/backends/mla/flashmla_sparse.py +++ b/vllm/v1/attention/backends/mla/flashmla_sparse.py @@ -7,17 +7,6 @@ import numpy as np import torch from vllm import _custom_ops as ops -from vllm.attention.backends.abstract import ( - AttentionBackend, - AttentionLayer, - AttentionMetadata, - MultipleOf, -) -from vllm.attention.ops.flashmla import ( - flash_mla_sparse_prefill, - flash_mla_with_kvcache, - get_mla_metadata, -) from vllm.config import VllmConfig, get_current_vllm_config from vllm.config.cache import CacheDType from vllm.logger import init_logger @@ -25,6 +14,12 @@ from vllm.platforms import current_platform from vllm.platforms.interface import DeviceCapability from vllm.triton_utils import tl, triton from vllm.utils.math_utils import cdiv +from vllm.v1.attention.backend import ( + AttentionBackend, + AttentionLayer, + AttentionMetadata, + MultipleOf, +) from vllm.v1.attention.backends.mla.common import MLACommonBaseImpl, get_mla_dims from vllm.v1.attention.backends.utils import ( AttentionCGSupport, @@ -35,6 +30,11 @@ from vllm.v1.attention.backends.utils import ( split_decodes_and_prefills, split_prefill_chunks, ) +from vllm.v1.attention.ops.flashmla import ( + flash_mla_sparse_prefill, + flash_mla_with_kvcache, + get_mla_metadata, +) from vllm.v1.kv_cache_interface import AttentionSpec from vllm.v1.worker.workspace import current_workspace_manager diff --git a/vllm/v1/attention/backends/mla/indexer.py b/vllm/v1/attention/backends/mla/indexer.py index d0696f60a..351cbc8a6 100644 --- a/vllm/v1/attention/backends/mla/indexer.py +++ b/vllm/v1/attention/backends/mla/indexer.py @@ -5,14 +5,14 @@ from typing import ClassVar import torch -from vllm.attention.backends.abstract import ( - AttentionBackend, - MultipleOf, -) from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.platforms import current_platform from vllm.utils.deep_gemm import get_paged_mqa_logits_metadata, is_deep_gemm_supported +from vllm.v1.attention.backend import ( + AttentionBackend, + MultipleOf, +) from vllm.v1.attention.backends.utils import ( AttentionCGSupport, AttentionMetadataBuilder, diff --git a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py index f79d58ca1..d43516e55 100644 --- a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py +++ b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py @@ -7,8 +7,8 @@ from typing import ClassVar import torch from vllm._aiter_ops import rocm_aiter_ops -from 
vllm.attention.backends.abstract import AttentionLayer, MultipleOf from vllm.config import VllmConfig +from vllm.v1.attention.backend import AttentionLayer, MultipleOf from vllm.v1.attention.backends.mla.common import ( MLACommonBackend, MLACommonDecodeMetadata, diff --git a/vllm/v1/attention/backends/mla/rocm_aiter_mla_sparse.py b/vllm/v1/attention/backends/mla/rocm_aiter_mla_sparse.py index e68e80e86..7d05879d9 100644 --- a/vllm/v1/attention/backends/mla/rocm_aiter_mla_sparse.py +++ b/vllm/v1/attention/backends/mla/rocm_aiter_mla_sparse.py @@ -9,13 +9,13 @@ import torch from vllm import _custom_ops as ops from vllm._aiter_ops import rocm_aiter_ops -from vllm.attention.backends.abstract import ( +from vllm.config import VllmConfig +from vllm.logger import init_logger +from vllm.v1.attention.backend import ( AttentionBackend, AttentionLayer, AttentionMetadata, ) -from vllm.config import VllmConfig -from vllm.logger import init_logger from vllm.v1.attention.backends.mla.common import MLACommonBaseImpl, get_mla_dims from vllm.v1.attention.backends.mla.flashmla_sparse import ( triton_convert_req_index_to_global_index, diff --git a/vllm/v1/attention/backends/mla/triton_mla.py b/vllm/v1/attention/backends/mla/triton_mla.py index 54ad3acb9..32d3fa3b0 100644 --- a/vllm/v1/attention/backends/mla/triton_mla.py +++ b/vllm/v1/attention/backends/mla/triton_mla.py @@ -5,23 +5,23 @@ from typing import ClassVar import torch -from vllm.attention.backends.abstract import ( - AttentionLayer, - AttentionType, - is_quantized_kv_cache, -) -from vllm.attention.ops.triton_decode_attention import decode_attention_fwd from vllm.config.cache import CacheDType from vllm.logger import init_logger from vllm.model_executor.layers.batch_invariant import ( vllm_is_batch_invariant, ) from vllm.platforms.interface import DeviceCapability +from vllm.v1.attention.backend import ( + AttentionLayer, + AttentionType, + is_quantized_kv_cache, +) from vllm.v1.attention.backends.mla.common import ( MLACommonBackend, MLACommonImpl, MLACommonMetadata, ) +from vllm.v1.attention.ops.triton_decode_attention import decode_attention_fwd logger = init_logger(__name__) diff --git a/vllm/attention/backends/registry.py b/vllm/v1/attention/backends/registry.py similarity index 99% rename from vllm/attention/backends/registry.py rename to vllm/v1/attention/backends/registry.py index cc1d3bfb3..bd45702fa 100644 --- a/vllm/attention/backends/registry.py +++ b/vllm/v1/attention/backends/registry.py @@ -10,7 +10,7 @@ from vllm.logger import init_logger from vllm.utils.import_utils import resolve_obj_by_qualname if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionBackend + from vllm.v1.attention.backend import AttentionBackend logger = init_logger(__name__) diff --git a/vllm/v1/attention/backends/rocm_aiter_fa.py b/vllm/v1/attention/backends/rocm_aiter_fa.py index 501f197c5..da14a8484 100644 --- a/vllm/v1/attention/backends/rocm_aiter_fa.py +++ b/vllm/v1/attention/backends/rocm_aiter_fa.py @@ -7,25 +7,25 @@ from typing import ClassVar import torch -from vllm.attention.backends.abstract import ( - AttentionBackend, - AttentionImpl, - AttentionType, - MultipleOf, -) from vllm.attention.layer import Attention -from vllm.attention.ops.merge_attn_states import merge_attn_states from vllm.config import VllmConfig, get_layers_from_vllm_config from vllm.logger import init_logger from vllm.platforms import current_platform from vllm.utils.math_utils import cdiv from vllm.utils.platform_utils import get_cu_count +from 
vllm.v1.attention.backend import ( + AttentionBackend, + AttentionImpl, + AttentionType, + MultipleOf, +) from vllm.v1.attention.backends.utils import ( AttentionCGSupport, AttentionMetadataBuilder, CommonAttentionMetadata, split_decodes_prefills_and_extends, ) +from vllm.v1.attention.ops.merge_attn_states import merge_attn_states from vllm.v1.kv_cache_interface import AttentionSpec _PARTITION_SIZE_ROCM = 256 diff --git a/vllm/v1/attention/backends/rocm_aiter_unified_attn.py b/vllm/v1/attention/backends/rocm_aiter_unified_attn.py index 16fb52ab5..9589c3128 100644 --- a/vllm/v1/attention/backends/rocm_aiter_unified_attn.py +++ b/vllm/v1/attention/backends/rocm_aiter_unified_attn.py @@ -5,12 +5,12 @@ import torch from vllm import _custom_ops as ops -from vllm.attention.backends.abstract import AttentionType from vllm.logger import init_logger from vllm.model_executor.layers.quantization.utils.quant_utils import ( QuantKey, kFp8StaticTensorSym, ) +from vllm.v1.attention.backend import AttentionType from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata from vllm.v1.attention.backends.rocm_attn import ( RocmAttentionBackend, diff --git a/vllm/v1/attention/backends/rocm_attn.py b/vllm/v1/attention/backends/rocm_attn.py index 0b7a51434..9d00d8fa6 100644 --- a/vllm/v1/attention/backends/rocm_attn.py +++ b/vllm/v1/attention/backends/rocm_attn.py @@ -7,17 +7,6 @@ from typing import ClassVar import torch -from vllm.attention.backends.abstract import ( - AttentionBackend, - AttentionImpl, - AttentionType, - MultipleOf, -) -from vllm.attention.ops.chunked_prefill_paged_decode import chunked_prefill_paged_decode -from vllm.attention.ops.paged_attn import PagedAttention -from vllm.attention.ops.triton_reshape_and_cache_flash import ( - triton_reshape_and_cache_flash, -) from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.model_executor.layers.quantization.utils.quant_utils import ( @@ -25,12 +14,25 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import ( kFp8StaticTensorSym, ) from vllm.platforms import current_platform +from vllm.v1.attention.backend import ( + AttentionBackend, + AttentionImpl, + AttentionType, + MultipleOf, +) from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata from vllm.v1.attention.backends.utils import ( AttentionCGSupport, AttentionMetadataBuilder, CommonAttentionMetadata, ) +from vllm.v1.attention.ops.chunked_prefill_paged_decode import ( + chunked_prefill_paged_decode, +) +from vllm.v1.attention.ops.paged_attn import PagedAttention +from vllm.v1.attention.ops.triton_reshape_and_cache_flash import ( + triton_reshape_and_cache_flash, +) from vllm.v1.kv_cache_interface import AttentionSpec logger = init_logger(__name__) diff --git a/vllm/v1/attention/backends/short_conv_attn.py b/vllm/v1/attention/backends/short_conv_attn.py index e2fae37f5..dc6b425ce 100644 --- a/vllm/v1/attention/backends/short_conv_attn.py +++ b/vllm/v1/attention/backends/short_conv_attn.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass -from vllm.attention.backends.abstract import AttentionBackend +from vllm.v1.attention.backend import AttentionBackend from vllm.v1.attention.backends.mamba_attn import ( BaseMambaAttentionMetadata, BaseMambaAttentionMetadataBuilder, diff --git a/vllm/v1/attention/backends/tree_attn.py b/vllm/v1/attention/backends/tree_attn.py index 5e3c436f8..b6e58a25f 100644 --- a/vllm/v1/attention/backends/tree_attn.py +++ 
b/vllm/v1/attention/backends/tree_attn.py @@ -9,20 +9,20 @@ from typing import ClassVar, Optional import torch from vllm import _custom_ops as ops -from vllm.attention.backends.abstract import ( +from vllm.config import VllmConfig +from vllm.logger import init_logger +from vllm.v1.attention.backend import ( AttentionBackend, AttentionImpl, AttentionType, MultipleOf, ) -from vllm.attention.ops.triton_unified_attention import unified_attention -from vllm.config import VllmConfig -from vllm.logger import init_logger from vllm.v1.attention.backends.utils import ( AttentionMetadataBuilder, CommonAttentionMetadata, split_decodes_and_prefills, ) +from vllm.v1.attention.ops.triton_unified_attention import unified_attention from vllm.v1.kv_cache_interface import AttentionSpec logger = init_logger(__name__) diff --git a/vllm/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py index 9bf440a04..ed2f9564e 100644 --- a/vllm/v1/attention/backends/triton_attn.py +++ b/vllm/v1/attention/backends/triton_attn.py @@ -7,17 +7,6 @@ from typing import ClassVar import torch -from vllm.attention.backends.abstract import ( - AttentionBackend, - AttentionImpl, - AttentionType, - MultipleOf, -) -from vllm.attention.ops.triton_prefill_attention import context_attention_fwd -from vllm.attention.ops.triton_reshape_and_cache_flash import ( - triton_reshape_and_cache_flash, -) -from vllm.attention.ops.triton_unified_attention import unified_attention from vllm.config import CUDAGraphMode, VllmConfig from vllm.config.cache import CacheDType from vllm.logger import init_logger @@ -28,11 +17,22 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import ( from vllm.platforms import current_platform from vllm.platforms.interface import DeviceCapability from vllm.utils.math_utils import next_power_of_2 +from vllm.v1.attention.backend import ( + AttentionBackend, + AttentionImpl, + AttentionType, + MultipleOf, +) from vllm.v1.attention.backends.utils import ( AttentionCGSupport, AttentionMetadataBuilder, CommonAttentionMetadata, ) +from vllm.v1.attention.ops.triton_prefill_attention import context_attention_fwd +from vllm.v1.attention.ops.triton_reshape_and_cache_flash import ( + triton_reshape_and_cache_flash, +) +from vllm.v1.attention.ops.triton_unified_attention import unified_attention from vllm.v1.kv_cache_interface import AttentionSpec logger = init_logger(__name__) diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index cc33b3319..eecd81a69 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -29,16 +29,16 @@ if TYPE_CHECKING: from vllm.v1.worker.gpu_input_batch import InputBatch import vllm.envs as envs -from vllm.attention.backends.abstract import ( - AttentionBackend, - AttentionImpl, - AttentionMetadata, -) from vllm.distributed.kv_transfer.kv_connector.utils import ( get_kv_connector_cache_layout, ) from vllm.logger import init_logger from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase +from vllm.v1.attention.backend import ( + AttentionBackend, + AttentionImpl, + AttentionMetadata, +) from vllm.v1.kv_cache_interface import AttentionSpec from vllm.v1.worker.ubatch_utils import UBatchSlice diff --git a/vllm/attention/layers/__init__.py b/vllm/v1/attention/ops/__init__.py similarity index 100% rename from vllm/attention/layers/__init__.py rename to vllm/v1/attention/ops/__init__.py diff --git a/vllm/attention/ops/chunked_prefill_paged_decode.py 
b/vllm/v1/attention/ops/chunked_prefill_paged_decode.py similarity index 100% rename from vllm/attention/ops/chunked_prefill_paged_decode.py rename to vllm/v1/attention/ops/chunked_prefill_paged_decode.py diff --git a/vllm/attention/ops/common.py b/vllm/v1/attention/ops/common.py similarity index 100% rename from vllm/attention/ops/common.py rename to vllm/v1/attention/ops/common.py diff --git a/vllm/attention/ops/flashmla.py b/vllm/v1/attention/ops/flashmla.py similarity index 100% rename from vllm/attention/ops/flashmla.py rename to vllm/v1/attention/ops/flashmla.py diff --git a/vllm/attention/ops/merge_attn_states.py b/vllm/v1/attention/ops/merge_attn_states.py similarity index 94% rename from vllm/attention/ops/merge_attn_states.py rename to vllm/v1/attention/ops/merge_attn_states.py index f347fb3fb..673d2d947 100644 --- a/vllm/attention/ops/merge_attn_states.py +++ b/vllm/v1/attention/ops/merge_attn_states.py @@ -40,7 +40,7 @@ def merge_attn_states( output, prefix_output, prefix_lse, suffix_output, suffix_lse, output_lse ) else: - from vllm.attention.ops.triton_merge_attn_states import merge_attn_states + from vllm.v1.attention.ops.triton_merge_attn_states import merge_attn_states return merge_attn_states( output, prefix_output, prefix_lse, suffix_output, suffix_lse, output_lse diff --git a/vllm/attention/ops/paged_attn.py b/vllm/v1/attention/ops/paged_attn.py similarity index 100% rename from vllm/attention/ops/paged_attn.py rename to vllm/v1/attention/ops/paged_attn.py diff --git a/vllm/attention/ops/pallas_kv_cache_update.py b/vllm/v1/attention/ops/pallas_kv_cache_update.py similarity index 100% rename from vllm/attention/ops/pallas_kv_cache_update.py rename to vllm/v1/attention/ops/pallas_kv_cache_update.py diff --git a/vllm/attention/ops/prefix_prefill.py b/vllm/v1/attention/ops/prefix_prefill.py similarity index 100% rename from vllm/attention/ops/prefix_prefill.py rename to vllm/v1/attention/ops/prefix_prefill.py diff --git a/vllm/attention/ops/rocm_aiter_mla_sparse.py b/vllm/v1/attention/ops/rocm_aiter_mla_sparse.py similarity index 100% rename from vllm/attention/ops/rocm_aiter_mla_sparse.py rename to vllm/v1/attention/ops/rocm_aiter_mla_sparse.py diff --git a/vllm/attention/ops/triton_decode_attention.py b/vllm/v1/attention/ops/triton_decode_attention.py similarity index 100% rename from vllm/attention/ops/triton_decode_attention.py rename to vllm/v1/attention/ops/triton_decode_attention.py diff --git a/vllm/attention/ops/triton_merge_attn_states.py b/vllm/v1/attention/ops/triton_merge_attn_states.py similarity index 100% rename from vllm/attention/ops/triton_merge_attn_states.py rename to vllm/v1/attention/ops/triton_merge_attn_states.py diff --git a/vllm/attention/ops/triton_prefill_attention.py b/vllm/v1/attention/ops/triton_prefill_attention.py similarity index 100% rename from vllm/attention/ops/triton_prefill_attention.py rename to vllm/v1/attention/ops/triton_prefill_attention.py diff --git a/vllm/attention/ops/triton_reshape_and_cache_flash.py b/vllm/v1/attention/ops/triton_reshape_and_cache_flash.py similarity index 100% rename from vllm/attention/ops/triton_reshape_and_cache_flash.py rename to vllm/v1/attention/ops/triton_reshape_and_cache_flash.py diff --git a/vllm/attention/ops/triton_unified_attention.py b/vllm/v1/attention/ops/triton_unified_attention.py similarity index 100% rename from vllm/attention/ops/triton_unified_attention.py rename to vllm/v1/attention/ops/triton_unified_attention.py diff --git a/vllm/attention/ops/vit_attn_wrappers.py 
b/vllm/v1/attention/ops/vit_attn_wrappers.py similarity index 98% rename from vllm/attention/ops/vit_attn_wrappers.py rename to vllm/v1/attention/ops/vit_attn_wrappers.py index 80c4f1491..72c45571f 100644 --- a/vllm/attention/ops/vit_attn_wrappers.py +++ b/vllm/v1/attention/ops/vit_attn_wrappers.py @@ -35,7 +35,7 @@ def flash_attn_maxseqlen_wrapper( if is_rocm_aiter: from aiter import flash_attn_varlen_func else: - from vllm.attention.utils.fa_utils import flash_attn_varlen_func + from vllm.v1.attention.backends.fa_utils import flash_attn_varlen_func if not current_platform.is_rocm() and fa_version is not None: kwargs["fa_version"] = fa_version diff --git a/vllm/attention/selector.py b/vllm/v1/attention/selector.py similarity index 97% rename from vllm/attention/selector.py rename to vllm/v1/attention/selector.py index e66f698ad..e364c3235 100644 --- a/vllm/attention/selector.py +++ b/vllm/v1/attention/selector.py @@ -6,14 +6,14 @@ from typing import NamedTuple, cast, get_args import torch -from vllm.attention.backends.abstract import AttentionBackend, AttentionType -from vllm.attention.backends.registry import ( - MAMBA_TYPE_TO_BACKEND_MAP, - MambaAttentionBackendEnum, -) from vllm.config.cache import CacheDType from vllm.logger import init_logger from vllm.utils.import_utils import resolve_obj_by_qualname +from vllm.v1.attention.backend import AttentionBackend, AttentionType +from vllm.v1.attention.backends.registry import ( + MAMBA_TYPE_TO_BACKEND_MAP, + MambaAttentionBackendEnum, +) logger = init_logger(__name__) diff --git a/vllm/v1/kv_offload/cpu.py b/vllm/v1/kv_offload/cpu.py index e1cf7b14a..061cf2267 100644 --- a/vllm/v1/kv_offload/cpu.py +++ b/vllm/v1/kv_offload/cpu.py @@ -4,9 +4,9 @@ from collections.abc import Iterator import torch -from vllm.attention.backends.abstract import AttentionBackend from vllm.config import VllmConfig from vllm.platforms import current_platform +from vllm.v1.attention.backend import AttentionBackend from vllm.v1.kv_offload.abstract import LoadStoreSpec, OffloadingManager from vllm.v1.kv_offload.arc_manager import ARCOffloadingManager from vllm.v1.kv_offload.backends.cpu import CPUBackend diff --git a/vllm/v1/kv_offload/spec.py b/vllm/v1/kv_offload/spec.py index 2cdd5ba5f..549a0fdbf 100644 --- a/vllm/v1/kv_offload/spec.py +++ b/vllm/v1/kv_offload/spec.py @@ -6,8 +6,8 @@ from typing import TYPE_CHECKING import torch -from vllm.attention.backends.abstract import AttentionBackend from vllm.logger import init_logger +from vllm.v1.attention.backend import AttentionBackend from vllm.v1.kv_offload.abstract import LoadStoreSpec, OffloadingManager from vllm.v1.kv_offload.worker.worker import OffloadingHandler diff --git a/vllm/v1/kv_offload/worker/cpu_gpu.py b/vllm/v1/kv_offload/worker/cpu_gpu.py index 42ae4f141..dcaecb099 100644 --- a/vllm/v1/kv_offload/worker/cpu_gpu.py +++ b/vllm/v1/kv_offload/worker/cpu_gpu.py @@ -6,9 +6,9 @@ import numpy as np import torch from vllm import _custom_ops as ops -from vllm.attention.backends.abstract import AttentionBackend from vllm.logger import init_logger from vllm.utils.platform_utils import is_pin_memory_available +from vllm.v1.attention.backend import AttentionBackend from vllm.v1.kv_offload.mediums import BlockIDsLoadStoreSpec from vllm.v1.kv_offload.worker.worker import ( OffloadingHandler, diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index f6d198f63..cd4f55b79 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -8,7 +8,6 @@ import numpy as np import torch import 
torch.nn as nn -from vllm.attention.backends.registry import AttentionBackendEnum from vllm.config import ( CompilationMode, CUDAGraphMode, @@ -27,6 +26,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.platforms import current_platform from vllm.triton_utils import triton from vllm.utils.platform_utils import is_pin_memory_available +from vllm.v1.attention.backends.registry import AttentionBackendEnum from vllm.v1.attention.backends.tree_attn import ( TreeAttentionMetadata, TreeAttentionMetadataBuilder, diff --git a/vllm/v1/worker/gpu/attn_utils.py b/vllm/v1/worker/gpu/attn_utils.py index 6386f1a08..312f0ab93 100644 --- a/vllm/v1/worker/gpu/attn_utils.py +++ b/vllm/v1/worker/gpu/attn_utils.py @@ -6,9 +6,9 @@ from typing import Any, cast import numpy as np import torch -from vllm.attention.backends.abstract import AttentionBackend from vllm.config import VllmConfig, get_layers_from_vllm_config from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase +from vllm.v1.attention.backend import AttentionBackend from vllm.v1.attention.backends.utils import ( AttentionMetadataBuilder, CommonAttentionMetadata, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 5228167ed..40937caef 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -20,12 +20,6 @@ import torch.nn as nn from tqdm import tqdm import vllm.envs as envs -from vllm.attention.backends.abstract import ( - AttentionBackend, - AttentionMetadata, - AttentionType, - MultipleOf, -) from vllm.attention.layer import Attention, MLAAttention from vllm.compilation.counter import compilation_counter from vllm.compilation.cuda_graph import CUDAGraphStat, CUDAGraphWrapper @@ -101,6 +95,12 @@ from vllm.utils.torch_utils import ( kv_cache_dtype_str_to_dtype, supports_dynamo, ) +from vllm.v1.attention.backend import ( + AttentionBackend, + AttentionMetadata, + AttentionType, + MultipleOf, +) from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadataBuilder from vllm.v1.attention.backends.utils import ( AttentionCGSupport, diff --git a/vllm/v1/worker/kv_connector_model_runner_mixin.py b/vllm/v1/worker/kv_connector_model_runner_mixin.py index 7bb4ebe47..ca0868bef 100644 --- a/vllm/v1/worker/kv_connector_model_runner_mixin.py +++ b/vllm/v1/worker/kv_connector_model_runner_mixin.py @@ -11,7 +11,6 @@ from typing import TYPE_CHECKING import torch -from vllm.attention.backends.abstract import AttentionBackend from vllm.config import VllmConfig from vllm.config.cache import CacheDType from vllm.distributed.kv_transfer import ( @@ -22,6 +21,7 @@ from vllm.distributed.kv_transfer import ( from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBase from vllm.forward_context import get_forward_context, set_forward_context from vllm.logger import init_logger +from vllm.v1.attention.backend import AttentionBackend from vllm.v1.kv_cache_interface import AttentionSpec, KVCacheConfig from vllm.v1.outputs import ( EMPTY_MODEL_RUNNER_OUTPUT, diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index bfe90572e..85acc1679 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -7,7 +7,6 @@ from dataclasses import dataclass, field import torch from typing_extensions import deprecated -from vllm.attention.backends.abstract import AttentionBackend from vllm.attention.layer import Attention from vllm.config import CacheConfig, ModelConfig, SchedulerConfig, VllmConfig from vllm.logger import init_logger @@ -17,6 +16,7 @@ 
from vllm.multimodal.cache import processor_only_cache_from_config from vllm.multimodal.registry import MultiModalRegistry from vllm.platforms import current_platform from vllm.utils.mem_utils import MemorySnapshot, format_gib +from vllm.v1.attention.backend import AttentionBackend from vllm.v1.attention.backends.utils import AttentionMetadataBuilder from vllm.v1.core.encoder_cache_manager import compute_mm_encoder_budget from vllm.v1.kv_cache_interface import KVCacheGroupSpec, KVCacheSpec