[Kernel][Attention] Separate Attention.kv_scale into k_scale and v_scale (#6081)

2024-07-16 18:31:32 -04:00
parent 160e1d8c99
commit 978aed5300
33 changed files with 317 additions and 185 deletions
--- a/csrc/cpu/cache.cpp
+++ b/csrc/cpu/cache.cpp
@@ -107,8 +107,9 @@ void copy_blocks(std::vector<torch::Tensor> const& key_caches,
 void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
                       torch::Tensor& key_cache, torch::Tensor& value_cache,
                       torch::Tensor& slot_mapping,
-                       const std::string& kv_cache_dtype, double kv_scale) {
-  TORCH_CHECK(kv_scale == 1.0f);
+                       const std::string& kv_cache_dtype, double k_scale,
+                       double v_scale) {
+  TORCH_CHECK(k_scale == 1.0f && v_scale == 1.0f);

  int num_tokens = key.size(0);
  int num_heads = key.size(1);