[Core] Refactor Attention Take 2 (#3462)

2024-03-24 21:39:33 -07:00
parent b0dfa91dd7
commit 925f3332ca
47 changed files with 1268 additions and 1117 deletions
--- a/vllm/model_executor/models/falcon.py
+++ b/vllm/model_executor/models/falcon.py
@@ -19,16 +19,15 @@
 """PyTorch Falcon model."""

 import math
-from typing import List, Optional, Tuple, Union
+from typing import List, Optional, Union

 import torch
 from torch import nn
 from torch.nn import LayerNorm
 from transformers import FalconConfig as HF_FalconConfig

-from vllm.model_executor.input_metadata import InputMetadata
+from vllm.attention import Attention, AttentionMetadata
 from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                               LinearMethodBase,
                                               QKVParallelLinear,
@@ -48,7 +47,6 @@ from vllm.model_executor.weight_utils import (default_weight_loader,
 from vllm.sequence import SamplerOutput
 from vllm.transformers_utils.configs import RWConfig

-KVCache = Tuple[torch.Tensor, torch.Tensor]
 FalconConfig = Union[HF_FalconConfig, RWConfig]


@@ -177,8 +175,8 @@ class FalconAttention(nn.Module):
        self,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
-        kv_cache: KVCache,
-        input_metadata: InputMetadata,
+        kv_cache: torch.Tensor,
+        attn_metadata: AttentionMetadata,
    ) -> torch.Tensor:
        qkv, bias = self.query_key_value(hidden_states)
        if bias is not None:
@@ -186,8 +184,7 @@ class FalconAttention(nn.Module):
        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
        if self.use_rotary:
            q, k = self.rotary_emb(positions, q, k)
-        k_cache, v_cache = kv_cache
-        attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata)
+        attn_output = self.attn(q, k, v, kv_cache, attn_metadata)
        attn_output, bias = self.dense(attn_output)
        return attn_output, bias

@@ -263,8 +260,8 @@ class FalconDecoderLayer(nn.Module):
        self,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
-        kv_cache: KVCache,
-        input_metadata: InputMetadata,
+        kv_cache: torch.Tensor,
+        attn_metadata: AttentionMetadata,
    ) -> torch.Tensor:
        residual = hidden_states

@@ -279,7 +276,7 @@ class FalconDecoderLayer(nn.Module):
            positions=positions,
            hidden_states=attention_layernorm_out,
            kv_cache=kv_cache,
-            input_metadata=input_metadata,
+            attn_metadata=attn_metadata,
        )
        if self.reduce_row_parallel_results and attention_bias is not None:
            attention_output += attention_bias
@@ -343,8 +340,8 @@ class FalconModel(nn.Module):
        self,
        input_ids: torch.LongTensor,
        positions: torch.Tensor,
-        kv_caches: List[KVCache],
-        input_metadata: InputMetadata,
+        kv_caches: List[torch.Tensor],
+        attn_metadata: AttentionMetadata,
    ) -> torch.Tensor:
        hidden_states = self.word_embeddings(input_ids)
        for i in range(len(self.h)):
@@ -353,7 +350,7 @@ class FalconModel(nn.Module):
                positions,
                hidden_states,
                kv_caches[i],
-                input_metadata,
+                attn_metadata,
            )
        hidden_states = self.ln_f(hidden_states)
        return hidden_states
@@ -378,14 +375,14 @@ class FalconForCausalLM(nn.Module):
        self,
        input_ids: torch.LongTensor,
        positions: torch.Tensor,
-        kv_caches: List[KVCache],
-        input_metadata: InputMetadata,
+        kv_caches: List[torch.Tensor],
+        attn_metadata: AttentionMetadata,
    ) -> torch.Tensor:
        hidden_states = self.transformer(
            input_ids,
            positions,
            kv_caches,
-            input_metadata,
+            attn_metadata,
        )
        return hidden_states