Make key optional for rotary embedding (#17566)

Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
2025-05-07 00:11:46 -07:00
parent 324a3119b0
commit 98c89e16ff
10 changed files with 221 additions and 151 deletions
--- a/csrc/cpu/pos_encoding.cpp
+++ b/csrc/cpu/pos_encoding.cpp
@@ -9,7 +9,8 @@ void rotary_embedding_impl(
    scalar_t* __restrict__ query,           /// [batch_size, seq_len, num_heads,
                                   /// head_size] or [num_tokens, num_heads,
                                   /// head_size]
-    scalar_t* __restrict__ key,  // [batch_size, seq_len, num_kv_heads,
+    scalar_t* __restrict__ key,  // nullptr (optional) or
+                                 // [batch_size, seq_len, num_kv_heads,
                                 // head_size] or [num_tokens, num_kv_heads,
                                 // head_size]
    const scalar_t* __restrict__ cos_sin_cache,  // [max_position, 2, rot_dim //
@@ -85,10 +86,13 @@ void rotary_embedding_impl(
      compute_loop(token_head, cache_ptr, query);
    }

-    for (int i = 0; i < num_kv_heads; ++i) {
-      const int head_idx = i;
-      const int64_t token_head = token_idx * key_stride + head_idx * head_size;
-      compute_loop(token_head, cache_ptr, key);
+    if (key != nullptr) {
+      for (int i = 0; i < num_kv_heads; ++i) {
+        const int head_idx = i;
+        const int64_t token_head =
+            token_idx * key_stride + head_idx * head_size;
+        compute_loop(token_head, cache_ptr, key);
+      }
    }
  }
 }
@@ -100,7 +104,8 @@ void rotary_embedding_gptj_impl(
    scalar_t* __restrict__ query,           /// [batch_size, seq_len, num_heads,
                                   /// head_size] or [num_tokens, num_heads,
                                   /// head_size]
-    scalar_t* __restrict__ key,  // [batch_size, seq_len, num_kv_heads,
+    scalar_t* __restrict__ key,  // nullptr (optional) or
+                                 // [batch_size, seq_len, num_kv_heads,
                                 // head_size] or [num_tokens, num_kv_heads,
                                 // head_size]
    const scalar_t* __restrict__ cos_sin_cache,  // [max_position, 2, rot_dim //
@@ -138,6 +143,10 @@ void rotary_embedding_gptj_impl(
    }
  }

+  if (key == nullptr) {
+    return;
+  }
+
 #pragma omp parallel for collapse(2)
  for (int token_idx = 0; token_idx < num_tokens; ++token_idx) {
    for (int i = 0; i < num_kv_heads; ++i) {
@@ -168,13 +177,13 @@ void rotary_embedding_gptj_impl(
 };  // namespace

 void rotary_embedding(torch::Tensor& positions, torch::Tensor& query,
-                      torch::Tensor& key, int64_t head_size,
+                      std::optional<torch::Tensor> key, int64_t head_size,
                      torch::Tensor& cos_sin_cache, bool is_neox) {
  int num_tokens = positions.numel();
  int rot_dim = cos_sin_cache.size(1);
  int num_heads = query.size(-1) / head_size;
-  int num_kv_heads = key.size(-1) / head_size;
-  int64_t key_stride = key.stride(-2);
+  int num_kv_heads = key.has_value() ? key->size(-1) / head_size : num_heads;
+  int64_t key_stride = key.has_value() ? key->stride(-2) : 0;
  int64_t query_stride = query.stride(-2);

  VLLM_DISPATCH_FLOATING_TYPES(
@@ -183,15 +192,15 @@ void rotary_embedding(torch::Tensor& positions, torch::Tensor& query,
        if (is_neox) {
          rotary_embedding_impl(
              positions.data_ptr<int64_t>(), query.data_ptr<scalar_t>(),
-              key.data_ptr<scalar_t>(), cos_sin_cache.data_ptr<scalar_t>(),
-              rot_dim, query_stride, key_stride, num_heads, num_kv_heads,
-              head_size, num_tokens);
+              key.has_value() ? key->data_ptr<scalar_t>() : nullptr,
+              cos_sin_cache.data_ptr<scalar_t>(), rot_dim, query_stride,
+              key_stride, num_heads, num_kv_heads, head_size, num_tokens);
        } else {
          rotary_embedding_gptj_impl(
              positions.data_ptr<int64_t>(), query.data_ptr<scalar_t>(),
-              key.data_ptr<scalar_t>(), cos_sin_cache.data_ptr<scalar_t>(),
-              rot_dim, query_stride, key_stride, num_heads, num_kv_heads,
-              head_size, num_tokens);
+              key.has_value() ? key->data_ptr<scalar_t>() : nullptr,
+              cos_sin_cache.data_ptr<scalar_t>(), rot_dim, query_stride,
+              key_stride, num_heads, num_kv_heads, head_size, num_tokens);
        }

        CPU_KERNEL_GUARD_OUT(rotary_embedding_impl)