Use runtime profiling to replace manual memory analyzers (#81)

2023-05-19 11:35:44 -06:00
parent 825d8892b5
commit f756799b84
14 changed files with 211 additions and 478 deletions
--- a/cacheflow/model_executor/models/opt.py
+++ b/cacheflow/model_executor/models/opt.py
@@ -74,7 +74,8 @@ class OPTAttention(nn.Module):
        self.out_proj = RowParallelLinear(embed_dim, embed_dim, bias=bias,
                                          input_is_parallel=True,
                                          perform_initialization=False)
-        self.attn = GPTCacheFlowAttention(scale=self.scaling)
+        self.attn = GPTCacheFlowAttention(self.num_heads, self.head_dim,
+                                          scale=self.scaling)

    def forward(
        self,