[Chore] Remove unused batched RoPE op & kernel (#24789)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-09-13 00:08:20 -07:00
parent 99bfef841f
commit 5febdc8750
8 changed files with 16 additions and 348 deletions
--- a/csrc/pos_encoding_kernels.cu
+++ b/csrc/pos_encoding_kernels.cu
@@ -99,35 +99,6 @@ __global__ void rotary_embedding_kernel(
      token_idx, query_stride, key_stride, head_stride);
 }

-template <typename scalar_t, bool IS_NEOX>
-__global__ void batched_rotary_embedding_kernel(
-    const int64_t* __restrict__ positions,  // [batch_size, seq_len] or
-                                            // [num_tokens]
-    scalar_t* __restrict__ query,           // [batch_size, seq_len, num_heads,
-                                   // head_size] or [num_tokens, num_heads,
-                                   // head_size]
-    scalar_t* __restrict__ key,  // nullptr or
-                                 // [batch_size, seq_len, num_kv_heads,
-                                 // head_size] or [num_tokens, num_kv_heads,
-                                 // head_size]
-    const scalar_t* __restrict__ cos_sin_cache,  // [max_position, 2, rot_dim //
-                                                 // 2]
-    const int64_t* __restrict__ cos_sin_cache_offsets,  // [batch_size, seq_len]
-    const int rot_dim, const int64_t query_stride, const int64_t key_stride,
-    const int64_t head_stride, const int num_heads, const int num_kv_heads,
-    const int head_size) {
-  // Each thread block is responsible for one token.
-  const int token_idx = blockIdx.x;
-  int64_t pos = positions[token_idx];
-  int64_t cos_sin_cache_offset = cos_sin_cache_offsets[token_idx];
-  const scalar_t* cache_ptr =
-      cos_sin_cache + (cos_sin_cache_offset + pos) * rot_dim;
-
-  apply_rotary_embedding<scalar_t, IS_NEOX>(
-      query, key, cache_ptr, head_size, num_heads, num_kv_heads, rot_dim,
-      token_idx, query_stride, key_stride, head_stride);
-}
-
 }  // namespace vllm

 void rotary_embedding(
@@ -211,96 +182,3 @@ void rotary_embedding(
    }
  });
 }
-
-/*
-Batched version of rotary embedding, pack multiple LoRAs together
-and process in batched manner.
-*/
-void batched_rotary_embedding(
-    torch::Tensor& positions,  // [batch_size, seq_len] or [num_tokens]
-    torch::Tensor& query,  // [batch_size, seq_len, num_heads * head_size] or
-                           // [num_tokens, num_heads * head_size] or
-                           // [batch_size, seq_len, num_heads, head_size] or
-                           // [num_tokens, num_heads, head_size]
-    std::optional<torch::Tensor>
-        key,  // null or
-              // [batch_size, seq_len, num_kv_heads * head_size] or
-              // [num_tokens, num_kv_heads * head_size] or
-              // [batch_size, seq_len, num_heads, head_size] or
-              // [num_tokens, num_heads, head_size]
-    int64_t head_size,
-    torch::Tensor& cos_sin_cache,  // [max_position, rot_dim]
-    bool is_neox, int64_t rot_dim,
-    torch::Tensor& cos_sin_cache_offsets  // [num_tokens] or [batch_size]
-) {
-  // num_tokens = batch_size * seq_len
-  int64_t num_tokens = cos_sin_cache_offsets.size(0);
-  TORCH_CHECK(
-      positions.size(0) == num_tokens || positions.numel() == num_tokens,
-      "positions must have the same num_tokens or batch_size as "
-      "cos_sin_cache_offsets");
-
-  int positions_ndim = positions.dim();
-  // Make sure num_tokens dim is consistent across positions, query, and key
-  TORCH_CHECK(
-      positions_ndim == 1 || positions_ndim == 2,
-      "positions must have shape [num_tokens] or [batch_size, seq_len]");
-  if (positions_ndim == 1) {
-    TORCH_CHECK(query.size(0) == positions.size(0) &&
-                    (!key.has_value() || key->size(0) == positions.size(0)),
-                "query, key and positions must have the same number of tokens");
-  }
-  if (positions_ndim == 2) {
-    TORCH_CHECK(
-        query.size(0) == positions.size(0) &&
-            (!key.has_value() || key->size(0) == positions.size(0)) &&
-            query.size(1) == positions.size(1) &&
-            (!key.has_value() || key->size(1) == positions.size(1)),
-        "query, key and positions must have the same batch_size and seq_len");
-  }
-
-  // Make sure head_size is valid for query and key
-  int query_hidden_size = query.numel() / num_tokens;
-  int key_hidden_size = key.has_value() ? key->numel() / num_tokens : 0;
-  TORCH_CHECK(query_hidden_size % head_size == 0);
-  TORCH_CHECK(key_hidden_size % head_size == 0);
-
-  // Make sure query and key have consistent number of heads
-  int num_heads = query_hidden_size / head_size;
-  int num_kv_heads = key.has_value() ? key_hidden_size / head_size : num_heads;
-  TORCH_CHECK(num_heads % num_kv_heads == 0);
-
-  int seq_dim_idx = positions_ndim - 1;
-  int64_t query_stride = query.stride(seq_dim_idx);
-  int64_t key_stride = key.has_value() ? key->stride(seq_dim_idx) : 0;
-  // Determine head stride: for [*, heads, head_size] use the stride of the
-  // heads dim (second-to-last); for flat [*, heads*head_size], heads blocks
-  // are contiguous of size head_size
-  int query_ndim = query.dim();
-  int64_t head_stride =
-      (query_ndim == positions_ndim + 2) ? query.stride(-2) : head_size;
-
-  dim3 grid(num_tokens);
-  dim3 block(std::min<int64_t>(num_heads * rot_dim / 2, 512));
-  const at::cuda::OptionalCUDAGuard device_guard(device_of(query));
-  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-  VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "rotary_embedding", [&] {
-    if (is_neox) {
-      vllm::batched_rotary_embedding_kernel<scalar_t, true>
-          <<<grid, block, 0, stream>>>(
-              positions.data_ptr<int64_t>(), query.data_ptr<scalar_t>(),
-              key.has_value() ? key->data_ptr<scalar_t>() : nullptr,
-              cos_sin_cache.data_ptr<scalar_t>(),
-              cos_sin_cache_offsets.data_ptr<int64_t>(), rot_dim, query_stride,
-              key_stride, head_stride, num_heads, num_kv_heads, head_size);
-    } else {
-      vllm::batched_rotary_embedding_kernel<scalar_t, false>
-          <<<grid, block, 0, stream>>>(
-              positions.data_ptr<int64_t>(), query.data_ptr<scalar_t>(),
-              key.has_value() ? key->data_ptr<scalar_t>() : nullptr,
-              cos_sin_cache.data_ptr<scalar_t>(),
-              cos_sin_cache_offsets.data_ptr<int64_t>(), rot_dim, query_stride,
-              key_stride, head_stride, num_heads, num_kv_heads, head_size);
-    }
-  });
-}