Convert formatting to use ruff instead of yapf + isort (#26247)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
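For context: ruff takes over both jobs here, with `ruff format` replacing yapf for code formatting and ruff's isort-compatible import-sorting rules (the "I" rule group) replacing standalone isort, so the checks collapse into a single tool. The pyproject.toml snippet below is only a minimal sketch of that kind of setup; the repository's actual line length, rule selection, and excludes are not part of this diff, and the values shown are assumptions:

    [tool.ruff]
    line-length = 88                # assumed; the project's real setting is not shown in this diff

    [tool.ruff.lint]
    extend-select = ["I"]           # "I" rules provide isort-style import sorting

    [tool.ruff.format]
    # ruff's black-style formatter stands in for yapf; defaults are assumed here

With a config along these lines, `ruff check --fix` re-sorts imports and `ruff format` rewrites wrapped calls in the trailing-comma style seen in the added lines below, instead of yapf's paren-aligned continuation lines.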
@@ -9,9 +9,17 @@ import pytest
 import torch

 from vllm.attention.backends.registry import _Backend
-from vllm.config import (CacheConfig, CompilationConfig, DeviceConfig,
-                         LoadConfig, ModelConfig, ModelDType, ParallelConfig,
-                         SchedulerConfig, VllmConfig)
+from vllm.config import (
+    CacheConfig,
+    CompilationConfig,
+    DeviceConfig,
+    LoadConfig,
+    ModelConfig,
+    ModelDType,
+    ParallelConfig,
+    SchedulerConfig,
+    VllmConfig,
+)
 from vllm.platforms import current_platform
 from vllm.utils import resolve_obj_by_qualname
 from vllm.v1.attention.backends.utils import CommonAttentionMetadata
@@ -21,6 +29,7 @@ from vllm.v1.kv_cache_interface import FullAttentionSpec
 @dataclass
 class BatchSpec:
     """Specification for a batch configuration (workload shape only)."""
+
     seq_lens: list[int]
     query_lens: list[int]

@@ -38,26 +47,25 @@ class BatchSpec:


 def create_common_attn_metadata(
-        batch_spec: BatchSpec,
-        block_size: int,
-        device: torch.device,
-        max_block_idx: int = 1000,
-        arange_block_indices: bool = False) -> CommonAttentionMetadata:
+    batch_spec: BatchSpec,
+    block_size: int,
+    device: torch.device,
+    max_block_idx: int = 1000,
+    arange_block_indices: bool = False,
+) -> CommonAttentionMetadata:
     """Create CommonAttentionMetadata from a BatchSpec and ModelParams."""
     # Create query start locations
-    query_start_loc = torch.zeros(batch_spec.batch_size + 1,
-                                  dtype=torch.int32,
-                                  device=device)
-    query_start_loc[1:] = torch.tensor(batch_spec.query_lens,
-                                       dtype=torch.int32,
-                                       device=device).cumsum(0)
+    query_start_loc = torch.zeros(
+        batch_spec.batch_size + 1, dtype=torch.int32, device=device
+    )
+    query_start_loc[1:] = torch.tensor(
+        batch_spec.query_lens, dtype=torch.int32, device=device
+    ).cumsum(0)
     query_start_loc_cpu = query_start_loc.cpu()
     num_tokens = batch_spec.compute_num_tokens()

     # Create sequence lengths
-    seq_lens = torch.tensor(batch_spec.seq_lens,
-                            dtype=torch.int32,
-                            device=device)
+    seq_lens = torch.tensor(batch_spec.seq_lens, dtype=torch.int32, device=device)
     seq_lens_cpu = seq_lens.cpu()
     max_seq_len = int(seq_lens_cpu.max())

@@ -72,24 +80,23 @@ def create_common_attn_metadata(
     max_blocks = (max(batch_spec.seq_lens) + block_size - 1) // block_size
     if arange_block_indices:
         num_blocks = batch_spec.batch_size * max_blocks
-        block_table_tensor = torch.arange(num_blocks,
-                                          dtype=torch.int32,
-                                          device=device).view(
-                                              batch_spec.batch_size,
-                                              max_blocks)
-        slot_mapping = torch.arange(num_tokens,
-                                    dtype=torch.int64,
-                                    device=device).view(num_tokens)
+        block_table_tensor = torch.arange(
+            num_blocks, dtype=torch.int32, device=device
+        ).view(batch_spec.batch_size, max_blocks)
+        slot_mapping = torch.arange(num_tokens, dtype=torch.int64, device=device).view(
+            num_tokens
+        )
     else:
-        block_table_tensor = torch.randint(0,
-                                           max_block_idx,
-                                           (batch_spec.batch_size, max_blocks),
-                                           dtype=torch.int32,
-                                           device=device)
-        slot_mapping = torch.randint(0,
-                                     max_block_idx, (num_tokens, ),
-                                     dtype=torch.int64,
-                                     device=device)
+        block_table_tensor = torch.randint(
+            0,
+            max_block_idx,
+            (batch_spec.batch_size, max_blocks),
+            dtype=torch.int32,
+            device=device,
+        )
+        slot_mapping = torch.randint(
+            0, max_block_idx, (num_tokens,), dtype=torch.int64, device=device
+        )

     # Calculate max query length
     max_query_len = max(batch_spec.query_lens)
@@ -121,31 +128,21 @@ def get_attention_backend(backend_name: _Backend):
         Tuple of (backend_builder_class, backend_impl_class)
     """
     backend_map = {
-        _Backend.FLASH_ATTN:
-        ("vllm.v1.attention.backends.flash_attn.FlashAttentionBackend"
-         if current_platform.is_cuda() else
-         "vllm.v1.attention.backends.rocm_aiter_fa.AiterFlashAttentionBackend"
-         ),
-        _Backend.FLASHINFER:
-        "vllm.v1.attention.backends.flashinfer.FlashInferBackend",
-        _Backend.FLEX_ATTENTION:
-        "vllm.v1.attention.backends.flex_attention.FlexAttentionBackend",
-        _Backend.TRITON_ATTN:
-        "vllm.v1.attention.backends.triton_attn.TritonAttentionBackend",
-        _Backend.TREE_ATTN:
-        "vllm.v1.attention.backends.tree_attn.TreeAttentionBackend",
-        _Backend.XFORMERS:
-        "vllm.v1.attention.backends.xformers.XFormersAttentionBackend",
-        _Backend.CUTLASS_MLA:
-        "vllm.v1.attention.backends.mla.cutlass_mla.CutlassMLABackend",
-        _Backend.FLASHMLA:
-        "vllm.v1.attention.backends.mla.flashmla.FlashMLABackend",
-        _Backend.FLASH_ATTN_MLA:
-        "vllm.v1.attention.backends.mla.flashattn_mla.FlashAttnMLABackend",
-        _Backend.FLASHINFER_MLA:
-        "vllm.v1.attention.backends.mla.flashinfer_mla.FlashInferMLABackend",
-        _Backend.TRITON_MLA:
-        "vllm.v1.attention.backends.mla.triton_mla.TritonMLABackend",
+        _Backend.FLASH_ATTN: (
+            "vllm.v1.attention.backends.flash_attn.FlashAttentionBackend"
+            if current_platform.is_cuda()
+            else "vllm.v1.attention.backends.rocm_aiter_fa.AiterFlashAttentionBackend"
+        ),
+        _Backend.FLASHINFER: "vllm.v1.attention.backends.flashinfer.FlashInferBackend",
+        _Backend.FLEX_ATTENTION: "vllm.v1.attention.backends.flex_attention.FlexAttentionBackend",
+        _Backend.TRITON_ATTN: "vllm.v1.attention.backends.triton_attn.TritonAttentionBackend",
+        _Backend.TREE_ATTN: "vllm.v1.attention.backends.tree_attn.TreeAttentionBackend",
+        _Backend.XFORMERS: "vllm.v1.attention.backends.xformers.XFormersAttentionBackend",
+        _Backend.CUTLASS_MLA: "vllm.v1.attention.backends.mla.cutlass_mla.CutlassMLABackend",
+        _Backend.FLASHMLA: "vllm.v1.attention.backends.mla.flashmla.FlashMLABackend",
+        _Backend.FLASH_ATTN_MLA: "vllm.v1.attention.backends.mla.flashattn_mla.FlashAttnMLABackend",
+        _Backend.FLASHINFER_MLA: "vllm.v1.attention.backends.mla.flashinfer_mla.FlashInferMLABackend",
+        _Backend.TRITON_MLA: "vllm.v1.attention.backends.mla.triton_mla.TritonMLABackend",
     }

     if backend_name not in backend_map:
@@ -160,29 +157,31 @@ def get_attention_backend(backend_name: _Backend):
         pytest.skip(f"{backend_name} not available: {e}")


-def create_standard_kv_cache_spec(
-        vllm_config: VllmConfig) -> FullAttentionSpec:
+def create_standard_kv_cache_spec(vllm_config: VllmConfig) -> FullAttentionSpec:
     """Create a FullAttentionSpec from ModelParams only."""
     return FullAttentionSpec(
         block_size=vllm_config.cache_config.block_size,
         num_kv_heads=vllm_config.model_config.get_num_kv_heads(
-            vllm_config.parallel_config),
+            vllm_config.parallel_config
+        ),
         head_size=vllm_config.model_config.get_head_size(),
         dtype=vllm_config.model_config.dtype,
         sliding_window=vllm_config.model_config.get_sliding_window(),
     )


-def create_vllm_config(model_name: str = "meta-llama/Meta-Llama-3-8B",
-                       tensor_parallel_size: int = 1,
-                       max_model_len: int = 1024,
-                       dtype: Union[ModelDType, torch.dtype] = "auto",
-                       num_gpu_blocks: int = 1000,
-                       block_size: int = 16,
-                       max_num_seqs: int = 256,
-                       max_num_batched_tokens: int = 8192,
-                       enable_chunked_prefill: bool = True,
-                       add_mock_model_methods: bool = True) -> VllmConfig:
+def create_vllm_config(
+    model_name: str = "meta-llama/Meta-Llama-3-8B",
+    tensor_parallel_size: int = 1,
+    max_model_len: int = 1024,
+    dtype: Union[ModelDType, torch.dtype] = "auto",
+    num_gpu_blocks: int = 1000,
+    block_size: int = 16,
+    max_num_seqs: int = 256,
+    max_num_batched_tokens: int = 8192,
+    enable_chunked_prefill: bool = True,
+    add_mock_model_methods: bool = True,
+) -> VllmConfig:
     """Create a VllmConfig for testing with reasonable defaults."""

     model_config = ModelConfig(
@@ -205,7 +204,8 @@ def create_vllm_config(model_name: str = "meta-llama/Meta-Llama-3-8B",
     cache_config.num_cpu_blocks = 0

     parallel_config = ParallelConfig(
-        tensor_parallel_size=tensor_parallel_size, )
+        tensor_parallel_size=tensor_parallel_size,
+    )

     scheduler_config = SchedulerConfig(
         max_num_seqs=max_num_seqs,
@@ -223,15 +223,17 @@ def create_vllm_config(model_name: str = "meta-llama/Meta-Llama-3-8B",
         # but some backends expect to query the model for layer-specific
         # parameters
         import types
-        model_config.get_num_layers = types.MethodType(lambda self: 1,
-                                                       model_config)
+
+        model_config.get_num_layers = types.MethodType(lambda self: 1, model_config)
         model_config.get_sliding_window_for_layer = types.MethodType(
-            lambda self, i: None, model_config)
+            lambda self, i: None, model_config
+        )
         model_config.get_logits_soft_cap_for_layer = types.MethodType(
-            lambda self, i: 0.0, model_config)
+            lambda self, i: 0.0, model_config
+        )
         model_config.get_sm_scale_for_layer = types.MethodType(
-            lambda self, i: 1.0 / model_config.get_head_size()**0.5,
-            model_config)
+            lambda self, i: 1.0 / model_config.get_head_size() ** 0.5, model_config
+        )

     return VllmConfig(
         model_config=model_config,
@@ -244,12 +246,14 @@ def create_vllm_config(model_name: str = "meta-llama/Meta-Llama-3-8B",
     )


-def create_dummy_kv_cache(block_size: int,
-                          num_kv_heads: int,
-                          head_size: int,
-                          dtype: torch.dtype,
-                          device: torch.device,
-                          num_blocks: int = 100) -> torch.Tensor:
+def create_dummy_kv_cache(
+    block_size: int,
+    num_kv_heads: int,
+    head_size: int,
+    dtype: torch.dtype,
+    device: torch.device,
+    num_blocks: int = 100,
+) -> torch.Tensor:
     """Create a dummy KV cache tensor for testing."""
     kv_cache = torch.randn(
         num_blocks,
@@ -258,7 +262,8 @@ def create_dummy_kv_cache(block_size: int,
         num_kv_heads,
         head_size,
         dtype=dtype,
-        device=device)
+        device=device,
+    )
     return kv_cache


@@ -273,75 +278,80 @@ class BackendConfig:
 # Define all backend configurations of full cudagraph to be tested
 full_cg_backend_configs = {
     # FA3 on Hopper
-    "FA3":
-    BackendConfig(name="FA3",
-                  env_vars={
-                      "VLLM_ATTENTION_BACKEND": "FLASH_ATTN",
-                      "VLLM_FLASH_ATTN_VERSION": "3",
-                      "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16",
-                  },
-                  comp_config={
-                      "cudagraph_mode": "FULL",
-                  },
-                  specific_gpu_arch=(9, 0)),
+    "FA3": BackendConfig(
+        name="FA3",
+        env_vars={
+            "VLLM_ATTENTION_BACKEND": "FLASH_ATTN",
+            "VLLM_FLASH_ATTN_VERSION": "3",
+            "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16",
+        },
+        comp_config={
+            "cudagraph_mode": "FULL",
+        },
+        specific_gpu_arch=(9, 0),
+    ),
     # FlashMLA on Hopper
-    "FlashMLA":
-    BackendConfig(name="FlashMLA",
-                  env_vars={
-                      "VLLM_ATTENTION_BACKEND": "FLASHMLA",
-                  },
-                  comp_config={
-                      "cudagraph_mode": "FULL_AND_PIECEWISE",
-                  },
-                  specific_gpu_arch=(9, 0)),
+    "FlashMLA": BackendConfig(
+        name="FlashMLA",
+        env_vars={
+            "VLLM_ATTENTION_BACKEND": "FLASHMLA",
+        },
+        comp_config={
+            "cudagraph_mode": "FULL_AND_PIECEWISE",
+        },
+        specific_gpu_arch=(9, 0),
+    ),
     # Cutlass MLA on Blackwell
-    "CutlassMLA":
-    BackendConfig(
+    "CutlassMLA": BackendConfig(
         name="CutlassMLA",
         env_vars={
             "VLLM_USE_V1": "1",
             "VLLM_ATTENTION_BACKEND": "CUTLASS_MLA",
-            "FORCE_NUM_KV_SPLITS":
-            "1",  # TODO: remove this when hang issue is fixed
+            "FORCE_NUM_KV_SPLITS": "1",  # TODO: remove this when hang issue is fixed
         },
         comp_config={
             "cudagraph_mode": "FULL_AND_PIECEWISE",
         },
-        specific_gpu_arch=(10, 0)),
+        specific_gpu_arch=(10, 0),
+    ),
     # FlashAttention MLA on Hopper
-    "FlashAttentionMLA":
-    BackendConfig(name="FlashAttentionMLA",
-                  env_vars={
-                      "VLLM_ATTENTION_BACKEND": "FLASH_ATTN_MLA",
-                      "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16",
-                  },
-                  comp_config={
-                      "cudagraph_mode": "FULL_DECODE_ONLY",
-                  },
-                  specific_gpu_arch=(9, 0)),
+    "FlashAttentionMLA": BackendConfig(
+        name="FlashAttentionMLA",
+        env_vars={
+            "VLLM_ATTENTION_BACKEND": "FLASH_ATTN_MLA",
+            "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16",
+        },
+        comp_config={
+            "cudagraph_mode": "FULL_DECODE_ONLY",
+        },
+        specific_gpu_arch=(9, 0),
+    ),
     # FA2
-    "FA2":
-    BackendConfig(name="FA2",
-                  env_vars={
-                      "VLLM_ATTENTION_BACKEND": "FLASH_ATTN",
-                      "VLLM_FLASH_ATTN_VERSION": "2",
-                      "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16",
-                  },
-                  comp_config={
-                      "cudagraph_mode": "FULL_AND_PIECEWISE",
-                  }),
+    "FA2": BackendConfig(
+        name="FA2",
+        env_vars={
+            "VLLM_ATTENTION_BACKEND": "FLASH_ATTN",
+            "VLLM_FLASH_ATTN_VERSION": "2",
+            "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16",
+        },
+        comp_config={
+            "cudagraph_mode": "FULL_AND_PIECEWISE",
+        },
+    ),
     # Triton Attention
-    "TritonAttn":
-    BackendConfig(name="TritonAttn",
-                  env_vars={"VLLM_ATTENTION_BACKEND": "TRITON_ATTN"},
-                  comp_config={
-                      "cudagraph_mode": "FULL_AND_PIECEWISE",
-                  }),
+    "TritonAttn": BackendConfig(
+        name="TritonAttn",
+        env_vars={"VLLM_ATTENTION_BACKEND": "TRITON_ATTN"},
+        comp_config={
+            "cudagraph_mode": "FULL_AND_PIECEWISE",
+        },
+    ),
     # FlashInfer
-    "FlashInfer":
-    BackendConfig(name="FlashInfer",
-                  env_vars={"VLLM_ATTENTION_BACKEND": "FLASHINFER"},
-                  comp_config={
-                      "cudagraph_mode": "FULL_AND_PIECEWISE",
-                  }),
+    "FlashInfer": BackendConfig(
+        name="FlashInfer",
+        env_vars={"VLLM_ATTENTION_BACKEND": "FLASHINFER"},
+        comp_config={
+            "cudagraph_mode": "FULL_AND_PIECEWISE",
+        },
+    ),
 }