[attention][DCP] use AttentionImpl.need_to_return_lse_for_decode (#24372)
Signed-off-by: youkaichao <youkaichao@gmail.com>
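Context for the check introduced below: decode context parallelism (DCP) shards the KV cache across ranks, so each rank computes attention over only its local shard. Those partial outputs can only be combined correctly if every decode kernel also returns its softmax log-sum-exp (LSE). The following Python sketch illustrates why; it is not part of this commit, and the helper name merge_attn_partials is hypothetical.

import torch


def merge_attn_partials(outs: list[torch.Tensor],
                        lses: list[torch.Tensor]) -> torch.Tensor:
    # Illustrative only (not vLLM code). Each shard i contributes a partial
    # attention output outs[i] of shape [tokens, heads, head_dim] and the
    # log-sum-exp lses[i] of its local scores, shape [tokens, heads].
    lse_all = torch.stack(lses, dim=0)           # [shards, tokens, heads]
    lse_total = torch.logsumexp(lse_all, dim=0)  # [tokens, heads]
    # Each shard's weight is exp(lse_i - lse_total): its share of the global
    # softmax normalizer. Without the per-shard LSE this rescaling is impossible.
    weights = torch.exp(lse_all - lse_total)
    out_all = torch.stack(outs, dim=0)           # [shards, tokens, heads, dim]
    return (weights.unsqueeze(-1) * out_all).sum(dim=0)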
@@ -56,7 +56,6 @@ from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler,
                          GiB_bytes, LazyLoader, cdiv, check_use_alibi,
                          get_dtype_size, is_pin_memory_available, round_up,
                          supports_dynamo)
-from vllm.v1.attention.backends.mla.flashmla import FlashMLABackend
 from vllm.v1.attention.backends.utils import (
     AttentionCGSupport, AttentionMetadataBuilder, CommonAttentionMetadata,
     create_fast_prefill_custom_backend,
@@ -3405,10 +3404,16 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 copy_kv_blocks)
 
         if self.dcp_world_size > 1:
-            assert self.attn_groups[0][0].backend is FlashMLABackend, (
-                "DCP only support flashmla now."
-                "For a mla backend want to enable DCP, it is mandatory that the"
-                "corresponding decode attn kernel return the softmax lse.")
+            layer_names = self.attn_groups[0][0].layer_names
+            layers = get_layers_from_vllm_config(self.vllm_config,
+                                                 AttentionLayerBase,
+                                                 layer_names)
+            for layer in layers.values():
+                assert layer.impl.need_to_return_lse_for_decode, (
+                    "DCP requires attention impls to return"
+                    " the softmax lse for decode, but the impl "
+                    f"{layer.impl.__class__.__name__} "
+                    "does not return the softmax lse for decode.")
 
     def may_add_encoder_only_layers_to_kv_cache_config(self) -> None:
         """