Update deprecated Python 3.8 typing (#13971)

Harry Mellor
2025-03-03 01:34:51 +00:00
committed by GitHub
parent bf33700ecd
commit cf069aa8aa
300 changed files with 2294 additions and 2347 deletions
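For context, the pattern applied across these files is the PEP 585 migration available from Python 3.9 onward: the deprecated typing.List, typing.Dict, typing.Tuple, and typing.Type aliases are replaced by the built-in generics list, dict, tuple, and type, while typing.Optional is kept because the X | None spelling would require Python 3.10. The sketch below is not taken from the diff; it uses hypothetical names purely to illustrate the before-and-after annotation style.

# Hypothetical example of the annotation style this commit moves to (PEP 585);
# the names below are illustrative only and do not appear in the diff.
from typing import Any, Optional  # Dict, List, Tuple, Type imports are no longer needed

# Before (Python 3.8 style):
#     def group_blocks(sizes: List[int],
#                      params: Optional[Dict[str, Any]] = None) -> Tuple[int, ...]: ...
# After (built-in generics, valid as runtime annotations on Python 3.9+):
def group_blocks(sizes: list[int],
                 params: Optional[dict[str, Any]] = None) -> tuple[int, ...]:
    """Return the sizes as a tuple; params only mirrors the signature shape."""
    _ = params  # unused in this sketch
    return tuple(sizes)

Note that on Python 3.8 these built-in generics are not subscriptable at runtime without `from __future__ import annotations`, so this style assumes a Python 3.9+ baseline (or postponed evaluation of annotations).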

View File

@@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
"""Attention layer with FlashAttention."""
from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type
+from typing import TYPE_CHECKING, Any, Optional
import numpy as np
import torch
@@ -30,7 +30,7 @@ class FlashAttentionBackend(AttentionBackend):
accept_output_buffer: bool = True
@staticmethod
-def get_supported_head_sizes() -> List[int]:
+def get_supported_head_sizes() -> list[int]:
return [32, 64, 96, 128, 160, 192, 224, 256]
@staticmethod
@@ -38,15 +38,15 @@ class FlashAttentionBackend(AttentionBackend):
return "FLASH_ATTN_VLLM_V1"
@staticmethod
-def get_impl_cls() -> Type["FlashAttentionImpl"]:
+def get_impl_cls() -> type["FlashAttentionImpl"]:
return FlashAttentionImpl
@staticmethod
-def get_metadata_cls() -> Type["AttentionMetadata"]:
+def get_metadata_cls() -> type["AttentionMetadata"]:
return FlashAttentionMetadata
@staticmethod
-def get_builder_cls() -> Type["FlashAttentionMetadataBuilder"]:
+def get_builder_cls() -> type["FlashAttentionMetadataBuilder"]:
return FlashAttentionMetadataBuilder
@staticmethod
@@ -55,7 +55,7 @@ class FlashAttentionBackend(AttentionBackend):
block_size: int,
num_kv_heads: int,
head_size: int,
-) -> Tuple[int, ...]:
+) -> tuple[int, ...]:
if block_size % 16 != 0:
raise ValueError("Block size must be a multiple of 16.")
return (2, num_blocks, block_size, num_kv_heads, head_size)
@@ -158,10 +158,10 @@ class FlashAttentionImpl(AttentionImpl):
head_size: int,
scale: float,
num_kv_heads: int,
-alibi_slopes: Optional[List[float]],
+alibi_slopes: Optional[list[float]],
sliding_window: Optional[int],
kv_cache_dtype: str,
-blocksparse_params: Optional[Dict[str, Any]] = None,
+blocksparse_params: Optional[dict[str, Any]] = None,
logits_soft_cap: Optional[float] = None,
attn_type: AttentionType = AttentionType.DECODER,
) -> None:
@@ -381,7 +381,7 @@ def cascade_attention(
max_kv_len: int,
softmax_scale: float,
alibi_slopes: Optional[torch.Tensor],
-sliding_window: Tuple[int, int],
+sliding_window: tuple[int, int],
logits_soft_cap: float,
block_table: torch.Tensor,
common_prefix_len: int,

View File

@@ -195,8 +195,7 @@ return curr_o @ W_O
import functools
from abc import abstractmethod
from dataclasses import dataclass
-from typing import (TYPE_CHECKING, Any, Dict, Generic, List, Optional, Tuple,
-Type, TypeVar)
+from typing import TYPE_CHECKING, Any, Generic, Optional, TypeVar
import torch
from compressed_tensors.quantization import QuantizationStrategy
@@ -250,11 +249,11 @@ class MLACommonBackend(AttentionBackend):
return "TRITON_MLA_VLLM_V1"
@staticmethod
-def get_metadata_cls() -> Type["AttentionMetadata"]:
+def get_metadata_cls() -> type["AttentionMetadata"]:
return MLACommonMetadata
@staticmethod
-def get_builder_cls() -> Type["MLACommonMetadataBuilder"]:
+def get_builder_cls() -> type["MLACommonMetadataBuilder"]:
return MLACommonMetadataBuilder
@staticmethod
@@ -263,11 +262,11 @@ class MLACommonBackend(AttentionBackend):
block_size: int,
num_kv_heads: int, # assumed to be 1 for MLA
head_size: int,
-) -> Tuple[int, ...]:
+) -> tuple[int, ...]:
return (num_blocks, block_size, head_size)
@staticmethod
-def get_supported_head_sizes() -> List[int]:
+def get_supported_head_sizes() -> list[int]:
return [576]
@staticmethod
@@ -317,8 +316,8 @@ class MLACommonMetadata:
has_context: bool = False
context_chunk_cu_seq_lens: Optional[torch.Tensor] = None
context_chunk_starts: Optional[torch.Tensor] = None
-context_chunk_seq_tot: Optional[List[int]] = None
-context_chunk_max_seq_lens: Optional[List[int]] = None
+context_chunk_seq_tot: Optional[list[int]] = None
+context_chunk_max_seq_lens: Optional[list[int]] = None
chunked_prefill_workspace: Optional[torch.Tensor] = None
def __post_init__(self):
@@ -538,10 +537,10 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]):
head_size: int,
scale: float,
num_kv_heads: int,
-alibi_slopes: Optional[List[float]],
+alibi_slopes: Optional[list[float]],
sliding_window: Optional[int],
kv_cache_dtype: str,
-blocksparse_params: Optional[Dict[str, Any]],
+blocksparse_params: Optional[dict[str, Any]],
logits_soft_cap: Optional[float],
attn_type: str,
# MLA Specific Arguments
@@ -634,7 +633,7 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]):
#
# returns input_group_shape, weight_group_shape
def get_scale_group_shapes_for_fp8(layer: LinearBase) -> \
-Tuple[Tuple[int, int], Tuple[int, int]]:
+tuple[tuple[int, int], tuple[int, int]]:
if isinstance(layer.quant_method, Fp8LinearMethod):
if layer.quant_method.block_quant:
weight_block_size = \

View File

@@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
from dataclasses import dataclass
-from typing import Any, Dict, List, Optional, Tuple, Type
+from typing import Any, Optional
import torch
@@ -25,21 +25,21 @@ class FlashMLABackend(MLACommonBackend):
return "FLASHMLA_VLLM_V1"
@staticmethod
-def get_metadata_cls() -> Type["FlashMLAMetadata"]:
+def get_metadata_cls() -> type["FlashMLAMetadata"]:
return FlashMLAMetadata
@staticmethod
-def get_builder_cls() -> Type["FlashMLAMetadataBuilder"]:
+def get_builder_cls() -> type["FlashMLAMetadataBuilder"]:
return FlashMLAMetadataBuilder
@staticmethod
-def get_impl_cls() -> Type["FlashMLAImpl"]:
+def get_impl_cls() -> type["FlashMLAImpl"]:
return FlashMLAImpl
@dataclass
class FlashMLAMetadata(MLACommonMetadata):
-decode_tile_scheduler_metadata: Optional[Tuple[torch.Tensor,
+decode_tile_scheduler_metadata: Optional[tuple[torch.Tensor,
torch.Tensor]] = None
decode_num_splits: Optional[torch.Tensor] = None
@@ -76,10 +76,10 @@ class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]):
head_size: int,
scale: float,
num_kv_heads: int,
-alibi_slopes: Optional[List[float]],
+alibi_slopes: Optional[list[float]],
sliding_window: Optional[int],
kv_cache_dtype: str,
-blocksparse_params: Optional[Dict[str, Any]],
+blocksparse_params: Optional[dict[str, Any]],
logits_soft_cap: Optional[float],
attn_type: str,
# MLA Specific Arguments

View File

@@ -1,6 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
-from typing import Any, Dict, List, Optional, Type
+from typing import Any, Optional
import torch
@@ -21,7 +21,7 @@ class TritonMLABackend(MLACommonBackend):
return "TRITON_MLA_VLLM_V1"
@staticmethod
-def get_impl_cls() -> Type["TritonMLAImpl"]:
+def get_impl_cls() -> type["TritonMLAImpl"]:
return TritonMLAImpl
@@ -33,10 +33,10 @@ class TritonMLAImpl(MLACommonImpl[MLACommonMetadata]):
head_size: int,
scale: float,
num_kv_heads: int,
-alibi_slopes: Optional[List[float]],
+alibi_slopes: Optional[list[float]],
sliding_window: Optional[int],
kv_cache_dtype: str,
-blocksparse_params: Optional[Dict[str, Any]],
+blocksparse_params: Optional[dict[str, Any]],
logits_soft_cap: Optional[float],
attn_type: str,
# MLA Specific Arguments

View File

@@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
from dataclasses import dataclass
-from typing import Any, Dict, List, Optional, Tuple, Type
+from typing import Any, Optional
import torch
# Required to register custom ops.
@@ -22,15 +22,15 @@ class PallasAttentionBackend(AttentionBackend):
return "PALLAS_VLLM_V1"
@staticmethod
-def get_impl_cls() -> Type["PallasAttentionBackendImpl"]:
+def get_impl_cls() -> type["PallasAttentionBackendImpl"]:
return PallasAttentionBackendImpl
@staticmethod
-def get_metadata_cls() -> Type["PallasMetadata"]:
+def get_metadata_cls() -> type["PallasMetadata"]:
return PallasMetadata
@staticmethod
-def get_state_cls() -> Type["CommonAttentionState"]:
+def get_state_cls() -> type["CommonAttentionState"]:
return CommonAttentionState
@staticmethod
@@ -39,7 +39,7 @@ class PallasAttentionBackend(AttentionBackend):
block_size: int,
num_kv_heads: int,
head_size: int,
-) -> Tuple[int, ...]:
+) -> tuple[int, ...]:
return (num_kv_heads, num_blocks, block_size, head_size)
@staticmethod
@@ -77,10 +77,10 @@ class PallasAttentionBackendImpl(AttentionImpl):
head_size: int,
scale: float,
num_kv_heads: int,
-alibi_slopes: Optional[List[float]],
+alibi_slopes: Optional[list[float]],
sliding_window: Optional[int],
kv_cache_dtype: str,
-blocksparse_params: Optional[Dict[str, Any]] = None,
+blocksparse_params: Optional[dict[str, Any]] = None,
logits_soft_cap: Optional[float] = None,
attn_type: str = AttentionType.DECODER,
) -> None:
@@ -120,7 +120,7 @@ class PallasAttentionBackendImpl(AttentionImpl):
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
-kv_cache: Tuple[torch.Tensor, torch.Tensor],
+kv_cache: tuple[torch.Tensor, torch.Tensor],
attn_metadata: PallasMetadata,
output: Optional[torch.Tensor] = None,
) -> torch.Tensor:

View File

@@ -1,6 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
"""Attention layer with PagedAttention on rocm"""
-from typing import Any, Dict, List, Optional, Tuple, Type
+from typing import Any, Optional
import torch
@@ -20,7 +20,7 @@ class ROCmAttentionBackend(AttentionBackend):
accept_output_buffer: bool = True
@staticmethod
-def get_supported_head_sizes() -> List[int]:
+def get_supported_head_sizes() -> list[int]:
return [32, 64, 96, 128, 160, 192, 224, 256]
@staticmethod
@@ -28,11 +28,11 @@ class ROCmAttentionBackend(AttentionBackend):
return "ROCM_ATTN_VLLM_V1"
@staticmethod
-def get_impl_cls() -> Type["ROCmAttentionImpl"]:
+def get_impl_cls() -> type["ROCmAttentionImpl"]:
return ROCmAttentionImpl
@staticmethod
-def get_metadata_cls() -> Type["AttentionMetadata"]:
+def get_metadata_cls() -> type["AttentionMetadata"]:
return FlashAttentionMetadata
@staticmethod
@@ -41,7 +41,7 @@ class ROCmAttentionBackend(AttentionBackend):
block_size: int,
num_kv_heads: int,
head_size: int,
-) -> Tuple[int, ...]:
+) -> tuple[int, ...]:
if block_size % 16 != 0:
raise ValueError("Block size must be a multiple of 16.")
return (2, num_blocks, block_size, num_kv_heads, head_size)
@@ -51,7 +51,7 @@ class ROCmAttentionBackend(AttentionBackend):
return False
@staticmethod
-def get_builder_cls() -> Type["FlashAttentionMetadataBuilder"]:
+def get_builder_cls() -> type["FlashAttentionMetadataBuilder"]:
return FlashAttentionMetadataBuilder
@@ -63,10 +63,10 @@ class ROCmAttentionImpl(AttentionImpl):
head_size: int,
scale: float,
num_kv_heads: int,
-alibi_slopes: Optional[List[float]],
+alibi_slopes: Optional[list[float]],
sliding_window: Optional[int],
kv_cache_dtype: str,
-blocksparse_params: Optional[Dict[str, Any]] = None,
+blocksparse_params: Optional[dict[str, Any]] = None,
logits_soft_cap: Optional[float] = None,
attn_type: AttentionType = AttentionType.DECODER,
) -> None: