diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index e0e22522b..e383c10dd 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -1473,7 +1473,7 @@ steps: - tests/v1/kv_connector/nixl_integration/ commands: - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt - - VLLM_ATTENTION_BACKEND=ROCM_ATTN bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh + - bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh --attention-backend ROCM_ATTN - label: DP EP NixlConnector PD accuracy tests (Distributed) # 15min mirror_hardwares: [amdexperimental, amdproduction] @@ -1487,7 +1487,7 @@ steps: - tests/v1/kv_connector/nixl_integration/ commands: - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt - - VLLM_ATTENTION_BACKEND=ROCM_ATTN DP_EP=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh + - DP_EP=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh --attention-backend ROCM_ATTN ##### multi gpus test ##### ##### A100 test ##### diff --git a/tests/v1/spec_decode/test_acceptance_length.py b/tests/v1/spec_decode/test_acceptance_length.py index 1a615878b..13393da30 100644 --- a/tests/v1/spec_decode/test_acceptance_length.py +++ b/tests/v1/spec_decode/test_acceptance_length.py @@ -207,7 +207,6 @@ def test_eagle3_acceptance_length( with monkeypatch.context() as m: m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") - m.setenv("VLLM_ATTENTION_BACKEND", attention_backend) with VllmRunner( model_name=model_config.verifier, @@ -216,6 +215,7 @@ def test_eagle3_acceptance_length( "model": model_config.drafter, "num_speculative_tokens": num_spec_tokens, }, + attention_config={"backend": attention_backend}, tensor_parallel_size=tp_size, gpu_memory_utilization=0.7, disable_log_stats=False, diff --git a/vllm/config/attention.py b/vllm/config/attention.py index 354ef056c..ee072fb1c 100644 --- a/vllm/config/attention.py +++ b/vllm/config/attention.py @@ -7,11 +7,8 @@ from pydantic import field_validator from pydantic.dataclasses import dataclass from vllm.config.utils import config -from vllm.logger import init_logger from vllm.v1.attention.backends.registry import AttentionBackendEnum -logger = init_logger(__name__) - @config @dataclass @@ -69,46 +66,3 @@ class AttentionConfig: if isinstance(value, str): return AttentionBackendEnum[value.upper()] return value - - def _set_from_env_if_set(self, field_name: str, env_var_name: str) -> None: - """Set field from env var if set, with deprecation warning.""" - from vllm import envs - - if envs.is_set(env_var_name): - value = getattr(envs, env_var_name) - if field_name == "backend": - value = self.validate_backend_before(value) - setattr(self, field_name, value) - logger.warning_once( - "Using %s environment variable is deprecated and will be removed in " - "v0.14.0 or v1.0.0, whichever is soonest. Please use " - "--attention-config.%s command line argument or " - "AttentionConfig(%s=...) config field instead.", - env_var_name, - field_name, - field_name, - ) - - def __post_init__(self) -> None: - self._set_from_env_if_set("backend", "VLLM_ATTENTION_BACKEND") - self._set_from_env_if_set("flash_attn_version", "VLLM_FLASH_ATTN_VERSION") - self._set_from_env_if_set( - "use_prefill_decode_attention", "VLLM_V1_USE_PREFILL_DECODE_ATTENTION" - ) - self._set_from_env_if_set( - "flash_attn_max_num_splits_for_cuda_graph", - "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH", - ) - self._set_from_env_if_set("use_cudnn_prefill", "VLLM_USE_CUDNN_PREFILL") - self._set_from_env_if_set( - "use_trtllm_ragged_deepseek_prefill", - "VLLM_USE_TRTLLM_RAGGED_DEEPSEEK_PREFILL", - ) - self._set_from_env_if_set("use_trtllm_attention", "VLLM_USE_TRTLLM_ATTENTION") - self._set_from_env_if_set( - "disable_flashinfer_prefill", "VLLM_DISABLE_FLASHINFER_PREFILL" - ) - self._set_from_env_if_set( - "disable_flashinfer_q_quantization", - "VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION", - ) diff --git a/vllm/envs.py b/vllm/envs.py index 3c8372283..0cc0b8627 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -20,8 +20,6 @@ if TYPE_CHECKING: VLLM_NCCL_SO_PATH: str | None = None LD_LIBRARY_PATH: str | None = None VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE: int = 256 - VLLM_V1_USE_PREFILL_DECODE_ATTENTION: bool = False - VLLM_FLASH_ATTN_VERSION: int | None = None LOCAL_RANK: int = 0 CUDA_VISIBLE_DEVICES: str | None = None VLLM_ENGINE_ITERATION_TIMEOUT_S: int = 60 @@ -36,7 +34,6 @@ if TYPE_CHECKING: VLLM_CONFIG_ROOT: str = os.path.expanduser("~/.config/vllm") VLLM_USAGE_STATS_SERVER: str = "https://stats.vllm.ai" VLLM_NO_USAGE_STATS: bool = False - VLLM_DISABLE_FLASHINFER_PREFILL: bool = False VLLM_DO_NOT_TRACK: bool = False VLLM_USAGE_SOURCE: str = "" VLLM_CONFIGURE_LOGGING: bool = True @@ -48,7 +45,6 @@ if TYPE_CHECKING: NO_COLOR: bool = False VLLM_LOG_STATS_INTERVAL: float = 10.0 VLLM_TRACE_FUNCTION: int = 0 - VLLM_ATTENTION_BACKEND: str | None = None VLLM_USE_FLASHINFER_SAMPLER: bool | None = None VLLM_PP_LAYER_PARTITION: str | None = None VLLM_CPU_KVCACHE_SPACE: int | None = 0 @@ -142,7 +138,6 @@ if TYPE_CHECKING: VLLM_SERVER_DEV_MODE: bool = False VLLM_V1_OUTPUT_PROC_CHUNK_SIZE: int = 128 VLLM_MLA_DISABLE: bool = False - VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH: int = 32 VLLM_RAY_PER_WORKER_GPUS: float = 1.0 VLLM_RAY_BUNDLE_INDICES: str = "" VLLM_CUDART_SO_PATH: str | None = None @@ -214,15 +209,11 @@ if TYPE_CHECKING: VLLM_MORIIO_POST_BATCH_SIZE: int = -1 VLLM_MORIIO_NUM_WORKERS: int = 1 VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT: int = 480 - VLLM_USE_CUDNN_PREFILL: bool = False - VLLM_USE_TRTLLM_RAGGED_DEEPSEEK_PREFILL: bool = False VLLM_ENABLE_CUDAGRAPH_GC: bool = False VLLM_LOOPBACK_IP: str = "" VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE: bool = True VLLM_ENABLE_RESPONSES_API_STORE: bool = False - VLLM_USE_TRTLLM_ATTENTION: str | None = None VLLM_NVFP4_GEMM_BACKEND: str | None = None - VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION: bool = False VLLM_HAS_FLASHINFER_CUBIN: bool = False VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8: bool = False VLLM_USE_FLASHINFER_MOE_MXFP4_BF16: bool = False @@ -592,17 +583,6 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE": lambda: int( os.environ.get("VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE", "256") ), - # Use separate prefill and decode kernels for V1 attention instead of - # the unified triton kernel. - "VLLM_V1_USE_PREFILL_DECODE_ATTENTION": lambda: ( - os.getenv("VLLM_V1_USE_PREFILL_DECODE_ATTENTION", "False").lower() - in ("true", "1") - ), - # Force vllm to use a specific flash-attention version (2 or 3), only valid - # when using the flash-attention backend. - "VLLM_FLASH_ATTN_VERSION": lambda: maybe_convert_int( - os.environ.get("VLLM_FLASH_ATTN_VERSION", None) - ), # Feature flag to enable/disable Inductor standalone compile. # In torch <= 2.7 we ignore this flag; in torch >= 2.9 this is # enabled by default. @@ -668,10 +648,6 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_USAGE_STATS_SERVER", "https://stats.vllm.ai" ), "VLLM_NO_USAGE_STATS": lambda: os.environ.get("VLLM_NO_USAGE_STATS", "0") == "1", - "VLLM_DISABLE_FLASHINFER_PREFILL": lambda: os.environ.get( - "VLLM_DISABLE_FLASHINFER_PREFILL", "0" - ) - == "1", "VLLM_DO_NOT_TRACK": lambda: ( os.environ.get("VLLM_DO_NOT_TRACK", None) or os.environ.get("DO_NOT_TRACK", None) @@ -707,25 +683,6 @@ environment_variables: dict[str, Callable[[], Any]] = { # If set to 1, vllm will trace function calls # Useful for debugging "VLLM_TRACE_FUNCTION": lambda: int(os.getenv("VLLM_TRACE_FUNCTION", "0")), - # Backend for attention computation - # Example options: - # - "TORCH_SDPA": use torch.nn.MultiheadAttention - # - "FLASH_ATTN": use FlashAttention - # - "FLASHINFER": use flashinfer - # - "FLASHMLA": use FlashMLA - # - "FLASH_ATTN_MLA": use FlashAttention for MLA - # - "FLASHINFER_MLA": use FlashInfer for MLA - # - "CUTLASS_MLA": use CUTLASS for MLA - # All possible options loaded dynamically from AttentionBackendEnum - "VLLM_ATTENTION_BACKEND": env_with_choices( - "VLLM_ATTENTION_BACKEND", - None, - lambda: list( - __import__( - "vllm.v1.attention.backends.registry", fromlist=["AttentionBackendEnum"] - ).AttentionBackendEnum.__members__.keys() - ), - ), # If set, vllm will use flashinfer sampler "VLLM_USE_FLASHINFER_SAMPLER": lambda: bool( int(os.environ["VLLM_USE_FLASHINFER_SAMPLER"]) @@ -1127,10 +1084,6 @@ environment_variables: dict[str, Callable[[], Any]] = { # If set, vLLM will disable the MLA attention optimizations. "VLLM_MLA_DISABLE": lambda: bool(int(os.getenv("VLLM_MLA_DISABLE", "0"))), # If set, vLLM will pick up the provided Flash Attention MLA - # max number splits for cuda graph decode - "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": lambda: int( - os.getenv("VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH", "32") - ), # Number of GPUs per worker in Ray, if it is set to be a fraction, # it allows ray to schedule multiple actors on a single GPU, # so that users can colocate other actors on the same GPUs as vLLM. @@ -1464,26 +1417,6 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT": lambda: int( os.getenv("VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT", "480") ), - # Controls whether or not to use cudnn prefill - "VLLM_USE_CUDNN_PREFILL": lambda: bool( - int(os.getenv("VLLM_USE_CUDNN_PREFILL", "0")) - ), - # Controls whether to use TRT-LLM ragged DeepSeek prefill - "VLLM_USE_TRTLLM_RAGGED_DEEPSEEK_PREFILL": lambda: bool( - int(os.getenv("VLLM_USE_TRTLLM_RAGGED_DEEPSEEK_PREFILL", "0")) - ), - # If set to 1/True, use the TRTLLM attention backend in flashinfer. - # If set to 0/False, use the default attention backend in flashinfer. - # If not set, auto-detect the attention backend in flashinfer. - "VLLM_USE_TRTLLM_ATTENTION": lambda: ( - None - if "VLLM_USE_TRTLLM_ATTENTION" not in os.environ - else os.environ["VLLM_USE_TRTLLM_ATTENTION"].lower() in ("1", "true") - ), - # If set to 1, when we use fp8 kv, we do not quantize Q to fp8 - "VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION": lambda: bool( - int(os.getenv("VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION", "0")) - ), # If set, it means we pre-downloaded cubin files and flashinfer will # read the cubin files directly. "VLLM_HAS_FLASHINFER_CUBIN": lambda: bool( diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index 782235af8..84a994de5 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -287,7 +287,10 @@ class RocmPlatform(Platform): return AttentionBackendEnum.ROCM_AITER_FA.get_path() # Priority 3: Check for ROCM_ATTN (prefill-decode split) - if envs.VLLM_V1_USE_PREFILL_DECODE_ATTENTION: + from vllm.config import get_current_vllm_config + + vllm_config = get_current_vllm_config() + if vllm_config.attention_config.use_prefill_decode_attention: logger.info("Using Rocm Attention backend.") return AttentionBackendEnum.ROCM_ATTN.get_path() diff --git a/vllm/usage/usage_lib.py b/vllm/usage/usage_lib.py index b0886bba8..1d51446b7 100644 --- a/vllm/usage/usage_lib.py +++ b/vllm/usage/usage_lib.py @@ -37,7 +37,6 @@ _GLOBAL_RUNTIME_DATA = dict[str, str | int | bool]() _USAGE_ENV_VARS_TO_COLLECT = [ "VLLM_USE_MODELSCOPE", - "VLLM_ATTENTION_BACKEND", "VLLM_USE_FLASHINFER_SAMPLER", "VLLM_PP_LAYER_PARTITION", "VLLM_USE_TRITON_AWQ",