Separate MLAAttention class from Attention (#25103)

Signed-off-by: Naveenraj Kamalakannan <therealnaveenkamal@gmail.com>
Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2025-10-08 20:11:11 -04:00
parent 2a03f93de9
commit e614ab7806
10 changed files with 502 additions and 163 deletions
--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -58,7 +58,7 @@ from vllm.model_executor.layers.linear import (
    RowParallelLinear,
 )
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
-from vllm.model_executor.layers.mla import MLAModules, MultiHeadLatentAttention
+from vllm.model_executor.layers.mla import MLAModules, MultiHeadLatentAttentionWrapper
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
    per_token_group_quant_fp8,
@@ -1038,7 +1038,7 @@ class DeepseekV2MLAAttention(nn.Module):
            topk_indices_buffer=topk_indices_buffer,
        )

-        self.mla_attn = MultiHeadLatentAttention(
+        self.mla_attn = MultiHeadLatentAttentionWrapper(
            self.hidden_size,
            self.num_local_heads,
            self.scaling,