Replace FlashAttention with xformers (#70)

Author: Woosuk Kwon
Date: 2023-05-05 02:01:08 -07:00
Committed by: GitHub
parent 189ae23133
commit c9d5b6d4a8
13 changed files with 89 additions and 133 deletions


@@ -6,7 +6,7 @@ from torch import nn
 from transformers import OPTConfig

 from cacheflow.models import InputMetadata
-from cacheflow.models.attention import OPTCacheFlowAttention
+from cacheflow.models.attention import GPTCacheFlowAttention
 from cacheflow.models.sample import Sampler
 from cacheflow.models.utils import (hf_model_weights_iterator,
                                     load_tensor_parallel_weights)
@@ -55,7 +55,7 @@ class OPTAttention(nn.Module):
         self.out_proj = RowParallelLinear(embed_dim, embed_dim, bias=bias,
                                           input_is_parallel=True,
                                           perform_initialization=False)
-        self.attn = OPTCacheFlowAttention(scale=self.scaling)
+        self.attn = GPTCacheFlowAttention(scale=self.scaling)

     def forward(
         self,
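The hunks above only show the rename from OPTCacheFlowAttention to GPTCacheFlowAttention; the actual switch from FlashAttention to xformers happens inside the attention module, which is not part of this excerpt. For context, here is a minimal sketch of invoking xformers' memory-efficient attention kernel, assuming the public xformers.ops.memory_efficient_attention API; the shapes, causal mask, and scale handling are illustrative assumptions, not the cacheflow implementation.

# A minimal sketch of multi-head attention via xformers' memory-efficient
# kernel, which this commit adopts in place of FlashAttention.
# Shapes, the causal mask, and the scale are illustrative assumptions,
# not the actual GPTCacheFlowAttention internals.
import torch
import xformers.ops as xops

batch_size, seq_len, num_heads, head_dim = 1, 16, 12, 64
scale = head_dim ** -0.5

# xformers expects inputs of shape [batch, seq_len, num_heads, head_dim].
query = torch.randn(batch_size, seq_len, num_heads, head_dim,
                    device="cuda", dtype=torch.float16)
key = torch.randn_like(query)
value = torch.randn_like(query)

out = xops.memory_efficient_attention(
    query, key, value,
    attn_bias=xops.LowerTriangularMask(),  # causal mask for decoder-only models
    scale=scale,
)
print(out.shape)  # torch.Size([1, 16, 12, 64])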