Add warning for Attention backends that do not support irope yet (#16212)
This commit is contained in:
@@ -38,9 +38,12 @@ from vllm.attention.backends.utils import (PAD_SLOT_ID, compute_slot_mapping,
|
|||||||
from vllm.attention.layer import Attention
|
from vllm.attention.layer import Attention
|
||||||
from vllm.attention.ops.paged_attn import PagedAttention
|
from vllm.attention.ops.paged_attn import PagedAttention
|
||||||
from vllm.config import VllmConfig, get_current_vllm_config
|
from vllm.config import VllmConfig, get_current_vllm_config
|
||||||
|
from vllm.logger import init_logger
|
||||||
from vllm.utils import (async_tensor_h2d, get_kv_cache_torch_dtype,
|
from vllm.utils import (async_tensor_h2d, get_kv_cache_torch_dtype,
|
||||||
make_tensor_with_pad)
|
make_tensor_with_pad)
|
||||||
|
|
||||||
|
logger = init_logger(__name__)
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from vllm.worker.model_runner import (ModelInputForGPUBuilder,
|
from vllm.worker.model_runner import (ModelInputForGPUBuilder,
|
||||||
ModelInputForGPUWithSamplingMetadata)
|
ModelInputForGPUWithSamplingMetadata)
|
||||||
@@ -907,7 +910,12 @@ class FlashInferImpl(AttentionImpl):
|
|||||||
blocksparse_params: Optional[Dict[str, Any]] = None,
|
blocksparse_params: Optional[Dict[str, Any]] = None,
|
||||||
logits_soft_cap: Optional[float] = None,
|
logits_soft_cap: Optional[float] = None,
|
||||||
attn_type: str = AttentionType.DECODER,
|
attn_type: str = AttentionType.DECODER,
|
||||||
|
use_irope: bool = False,
|
||||||
) -> None:
|
) -> None:
|
||||||
|
if use_irope:
|
||||||
|
logger.warning_once(
|
||||||
|
"Using irope in FlashInfer is not supported yet, it will fall"
|
||||||
|
" back to global attention for long context.")
|
||||||
self.num_heads = num_heads
|
self.num_heads = num_heads
|
||||||
self.head_size = head_size
|
self.head_size = head_size
|
||||||
self.scale = float(scale)
|
self.scale = float(scale)
|
||||||
|
|||||||
@@ -108,8 +108,13 @@ class HPUAttentionImpl(AttentionImpl, torch.nn.Module):
|
|||||||
blocksparse_params: Optional[Dict[str, Any]] = None,
|
blocksparse_params: Optional[Dict[str, Any]] = None,
|
||||||
max_seq_len: int = 4096,
|
max_seq_len: int = 4096,
|
||||||
attn_type: str = AttentionType.DECODER,
|
attn_type: str = AttentionType.DECODER,
|
||||||
|
use_irope: bool = False,
|
||||||
) -> None:
|
) -> None:
|
||||||
super(AttentionImpl, self).__init__()
|
super(AttentionImpl, self).__init__()
|
||||||
|
if use_irope:
|
||||||
|
logger.warning_once(
|
||||||
|
"Using irope in HPU is not supported yet, it will fall back "
|
||||||
|
"to global attention for long context.")
|
||||||
self.kv_cache_dtype = kv_cache_dtype
|
self.kv_cache_dtype = kv_cache_dtype
|
||||||
self.num_heads = num_heads
|
self.num_heads = num_heads
|
||||||
self.head_size = head_size
|
self.head_size = head_size
|
||||||
|
|||||||
@@ -14,6 +14,9 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
|
|||||||
from vllm.attention.backends.utils import CommonAttentionState
|
from vllm.attention.backends.utils import CommonAttentionState
|
||||||
from vllm.attention.ops.paged_attn import (PagedAttention,
|
from vllm.attention.ops.paged_attn import (PagedAttention,
|
||||||
PagedAttentionMetadata)
|
PagedAttentionMetadata)
|
||||||
|
from vllm.logger import init_logger
|
||||||
|
|
||||||
|
logger = init_logger(__name__)
|
||||||
|
|
||||||
_PARTITION_SIZE = 512
|
_PARTITION_SIZE = 512
|
||||||
|
|
||||||
@@ -119,7 +122,12 @@ class IpexAttnBackendImpl(AttentionImpl[IpexAttnMetadata]):
|
|||||||
blocksparse_params: Optional[Dict[str, Any]] = None,
|
blocksparse_params: Optional[Dict[str, Any]] = None,
|
||||||
logits_soft_cap: Optional[float] = None,
|
logits_soft_cap: Optional[float] = None,
|
||||||
attn_type: str = AttentionType.DECODER,
|
attn_type: str = AttentionType.DECODER,
|
||||||
|
use_irope: bool = False,
|
||||||
) -> None:
|
) -> None:
|
||||||
|
if use_irope:
|
||||||
|
logger.warning_once(
|
||||||
|
"Using irope in Ipex is not supported yet, it will fall"
|
||||||
|
" back to global attention for long context.")
|
||||||
if blocksparse_params is not None:
|
if blocksparse_params is not None:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"IPEX backend does not support block-sparse attention.")
|
"IPEX backend does not support block-sparse attention.")
|
||||||
|
|||||||
@@ -11,6 +11,9 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
|
|||||||
AttentionMetadata, AttentionType,
|
AttentionMetadata, AttentionType,
|
||||||
is_quantized_kv_cache)
|
is_quantized_kv_cache)
|
||||||
from vllm.attention.backends.utils import CommonAttentionState
|
from vllm.attention.backends.utils import CommonAttentionState
|
||||||
|
from vllm.logger import init_logger
|
||||||
|
|
||||||
|
logger = init_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class PallasAttentionBackend(AttentionBackend):
|
class PallasAttentionBackend(AttentionBackend):
|
||||||
@@ -105,7 +108,12 @@ class PallasAttentionBackendImpl(AttentionImpl):
|
|||||||
blocksparse_params: Optional[Dict[str, Any]] = None,
|
blocksparse_params: Optional[Dict[str, Any]] = None,
|
||||||
logits_soft_cap: Optional[float] = None,
|
logits_soft_cap: Optional[float] = None,
|
||||||
attn_type: str = AttentionType.DECODER,
|
attn_type: str = AttentionType.DECODER,
|
||||||
|
use_irope: bool = False,
|
||||||
) -> None:
|
) -> None:
|
||||||
|
if use_irope:
|
||||||
|
logger.warning_once(
|
||||||
|
"Using irope in Pallas is not supported yet, it will fall back "
|
||||||
|
"to global attention for long context.")
|
||||||
self.num_heads = num_heads
|
self.num_heads = num_heads
|
||||||
self.head_size = head_size
|
self.head_size = head_size
|
||||||
self.scale = float(scale)
|
self.scale = float(scale)
|
||||||
|
|||||||
@@ -462,7 +462,12 @@ class ROCmFlashAttentionImpl(AttentionImpl):
|
|||||||
blocksparse_params: Optional[Dict[str, Any]] = None,
|
blocksparse_params: Optional[Dict[str, Any]] = None,
|
||||||
logits_soft_cap: Optional[float] = None,
|
logits_soft_cap: Optional[float] = None,
|
||||||
attn_type: str = AttentionType.DECODER,
|
attn_type: str = AttentionType.DECODER,
|
||||||
|
use_irope: bool = False,
|
||||||
) -> None:
|
) -> None:
|
||||||
|
if use_irope:
|
||||||
|
logger.warning_once(
|
||||||
|
"Using irope in ROCm Flash Attention is not supported yet, it "
|
||||||
|
"will fall back to global attention for long context.")
|
||||||
if blocksparse_params is not None:
|
if blocksparse_params is not None:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"ROCmFlashAttention does not support blocksparse attention.")
|
"ROCmFlashAttention does not support blocksparse attention.")
|
||||||
|
|||||||
@@ -404,6 +404,7 @@ class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]):
|
|||||||
blocksparse_params: Optional[Dict[str, Any]] = None,
|
blocksparse_params: Optional[Dict[str, Any]] = None,
|
||||||
logits_soft_cap: Optional[float] = None,
|
logits_soft_cap: Optional[float] = None,
|
||||||
attn_type: str = AttentionType.DECODER,
|
attn_type: str = AttentionType.DECODER,
|
||||||
|
use_irope: bool = False,
|
||||||
) -> None:
|
) -> None:
|
||||||
if blocksparse_params is not None:
|
if blocksparse_params is not None:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
@@ -411,6 +412,10 @@ class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]):
|
|||||||
if logits_soft_cap is not None:
|
if logits_soft_cap is not None:
|
||||||
logger.warning_once("Torch SDPA does not support logits soft cap. "
|
logger.warning_once("Torch SDPA does not support logits soft cap. "
|
||||||
"Outputs may be slightly off.")
|
"Outputs may be slightly off.")
|
||||||
|
if use_irope:
|
||||||
|
logger.warning_once(
|
||||||
|
"Using irope in Torch SDPA is not supported yet, it will fall"
|
||||||
|
" back to global attention for long context.")
|
||||||
self.num_heads = num_heads
|
self.num_heads = num_heads
|
||||||
self.head_size = head_size
|
self.head_size = head_size
|
||||||
self.scale = float(scale)
|
self.scale = float(scale)
|
||||||
|
|||||||
@@ -389,6 +389,7 @@ class XFormersImpl(AttentionImpl[XFormersMetadata]):
|
|||||||
blocksparse_params: Optional[Dict[str, Any]] = None,
|
blocksparse_params: Optional[Dict[str, Any]] = None,
|
||||||
logits_soft_cap: Optional[float] = None,
|
logits_soft_cap: Optional[float] = None,
|
||||||
attn_type: str = AttentionType.DECODER,
|
attn_type: str = AttentionType.DECODER,
|
||||||
|
use_irope: bool = False,
|
||||||
) -> None:
|
) -> None:
|
||||||
if blocksparse_params is not None:
|
if blocksparse_params is not None:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
@@ -396,6 +397,10 @@ class XFormersImpl(AttentionImpl[XFormersMetadata]):
|
|||||||
if logits_soft_cap is not None:
|
if logits_soft_cap is not None:
|
||||||
logger.warning_once("XFormers does not support logits soft cap. "
|
logger.warning_once("XFormers does not support logits soft cap. "
|
||||||
"Outputs may be slightly off.")
|
"Outputs may be slightly off.")
|
||||||
|
if use_irope:
|
||||||
|
logger.warning_once(
|
||||||
|
"Using irope in XFormers is not supported yet, it will fall"
|
||||||
|
" back to global attention for long context.")
|
||||||
self.num_heads = num_heads
|
self.num_heads = num_heads
|
||||||
self.head_size = head_size
|
self.head_size = head_size
|
||||||
self.scale = float(scale)
|
self.scale = float(scale)
|
||||||
|
|||||||
@@ -10,6 +10,9 @@ import torch_xla.experimental.custom_kernel # noqa: F401
|
|||||||
from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
|
from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
|
||||||
AttentionLayer, AttentionType)
|
AttentionLayer, AttentionType)
|
||||||
from vllm.attention.backends.utils import CommonAttentionState
|
from vllm.attention.backends.utils import CommonAttentionState
|
||||||
|
from vllm.logger import init_logger
|
||||||
|
|
||||||
|
logger = init_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class PallasAttentionBackend(AttentionBackend):
|
class PallasAttentionBackend(AttentionBackend):
|
||||||
@@ -80,7 +83,12 @@ class PallasAttentionBackendImpl(AttentionImpl):
|
|||||||
blocksparse_params: Optional[dict[str, Any]] = None,
|
blocksparse_params: Optional[dict[str, Any]] = None,
|
||||||
logits_soft_cap: Optional[float] = None,
|
logits_soft_cap: Optional[float] = None,
|
||||||
attn_type: str = AttentionType.DECODER,
|
attn_type: str = AttentionType.DECODER,
|
||||||
|
use_irope: bool = False,
|
||||||
) -> None:
|
) -> None:
|
||||||
|
if use_irope:
|
||||||
|
logger.warning_once(
|
||||||
|
"Using irope in Pallas is not supported yet, it will fall back "
|
||||||
|
"to global attention for long context.")
|
||||||
if blocksparse_params is not None:
|
if blocksparse_params is not None:
|
||||||
raise ValueError("Paged attention Pallas kernel does "
|
raise ValueError("Paged attention Pallas kernel does "
|
||||||
"not support block-sparse attention.")
|
"not support block-sparse attention.")
|
||||||
|
|||||||
Reference in New Issue
Block a user