[attention][DCP] use AttentionImpl.need_to_return_lse_for_decode (#24372)
Signed-off-by: youkaichao <youkaichao@gmail.com>
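Context for the check introduced below: decode context parallelism (DCP) shards the KV cache across ranks, so each rank computes attention over only its local shard. Those partial outputs can only be combined correctly if every decode kernel also returns its softmax log-sum-exp (LSE). The following Python sketch illustrates why; it is not part of this commit, and the helper name merge_attn_partials is hypothetical.

import torch


def merge_attn_partials(outs: list[torch.Tensor],
                        lses: list[torch.Tensor]) -> torch.Tensor:
    # Illustrative only (not vLLM code). Each shard i contributes a partial
    # attention output outs[i] of shape [tokens, heads, head_dim] and the
    # log-sum-exp lses[i] of its local scores, shape [tokens, heads].
    lse_all = torch.stack(lses, dim=0)           # [shards, tokens, heads]
    lse_total = torch.logsumexp(lse_all, dim=0)  # [tokens, heads]
    # Each shard's weight is exp(lse_i - lse_total): its share of the global
    # softmax normalizer. Without the per-shard LSE this rescaling is impossible.
    weights = torch.exp(lse_all - lse_total)
    out_all = torch.stack(outs, dim=0)           # [shards, tokens, heads, dim]
    return (weights.unsqueeze(-1) * out_all).sum(dim=0)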
@@ -56,7 +56,6 @@ from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler,
                          GiB_bytes, LazyLoader, cdiv, check_use_alibi,
                          get_dtype_size, is_pin_memory_available, round_up,
                          supports_dynamo)
-from vllm.v1.attention.backends.mla.flashmla import FlashMLABackend
 from vllm.v1.attention.backends.utils import (
     AttentionCGSupport, AttentionMetadataBuilder, CommonAttentionMetadata,
     create_fast_prefill_custom_backend,
@@ -3405,10 +3404,16 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 copy_kv_blocks)
 
         if self.dcp_world_size > 1:
-            assert self.attn_groups[0][0].backend is FlashMLABackend, (
-                "DCP only support flashmla now."
-                "For a mla backend want to enable DCP, it is mandatory that the"
-                "corresponding decode attn kernel return the softmax lse.")
+            layer_names = self.attn_groups[0][0].layer_names
+            layers = get_layers_from_vllm_config(self.vllm_config,
+                                                 AttentionLayerBase,
+                                                 layer_names)
+            for layer in layers.values():
+                assert layer.impl.need_to_return_lse_for_decode, (
+                    "DCP requires attention impls to return"
+                    " the softmax lse for decode, but the impl "
+                    f"{layer.impl.__class__.__name__} "
+                    "does not return the softmax lse for decode.")
 
     def may_add_encoder_only_layers_to_kv_cache_config(self) -> None:
         """