[Feature] Support Decode Context Parallel (DCP) for MLA (#23734)

Signed-off-by: hongchao <hongchao@msh.team> Signed-off-by: youkaichao <youkaichao@gmail.com> Co-authored-by: hongchao <hongchao@msh.team> Co-authored-by: youkaichao <youkaichao@gmail.com>
2025-09-06 13:24:05 +08:00
parent 3c529fc994
commit ac201a0eaf
27 changed files with 999 additions and 230 deletions
--- a/csrc/cache_kernels.cu
+++ b/csrc/cache_kernels.cu
@@ -396,51 +396,6 @@ __global__ void concat_and_cache_mla_kernel(
  copy(k_pe, kv_cache, k_pe_stride, block_stride, pe_dim, kv_lora_rank);
 }

-template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
-__global__ void cp_fused_concat_and_cache_mla_kernel(
-    const scalar_t* __restrict__ kv_c,  // [num_full_tokens, kv_lora_rank]
-    const scalar_t* __restrict__ k_pe,  // [num_full_tokens, pe_dim]
-    const int64_t* __restrict__ cp_local_token_select_indices,  // [num_tokens]
-    cache_t* __restrict__ kv_cache,  // [num_blocks, block_size, (kv_lora_rank
-                                     // + pe_dim)]
-    const int64_t* __restrict__ slot_mapping,  // [num_tokens]
-    const int block_stride,                    //
-    const int entry_stride,                    //
-    const int kv_c_stride,                     //
-    const int k_pe_stride,                     //
-    const int kv_lora_rank,                    //
-    const int pe_dim,                          //
-    const int block_size,                      //
-    const float* scale                         //
-) {
-  const int64_t token_idx = cp_local_token_select_indices[blockIdx.x];
-  const int64_t slot_idx = slot_mapping[blockIdx.x];
-  // NOTE: slot_idx can be -1 if the token is padded
-  if (slot_idx < 0) {
-    return;
-  }
-  const int64_t block_idx = slot_idx / block_size;
-  const int64_t block_offset = slot_idx % block_size;
-
-  auto copy = [&](const scalar_t* __restrict__ src, cache_t* __restrict__ dst,
-                  int src_stride, int dst_stride, int size, int offset) {
-    for (int i = threadIdx.x; i < size; i += blockDim.x) {
-      const int64_t src_idx = token_idx * src_stride + i;
-      const int64_t dst_idx =
-          block_idx * block_stride + block_offset * entry_stride + i + offset;
-      if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
-        dst[dst_idx] = src[src_idx];
-      } else {
-        dst[dst_idx] =
-            fp8::scaled_convert<cache_t, scalar_t, kv_dt>(src[src_idx], *scale);
-      }
-    }
-  };
-
-  copy(kv_c, kv_cache, kv_c_stride, block_stride, kv_lora_rank, 0);
-  copy(k_pe, kv_cache, k_pe_stride, block_stride, pe_dim, kv_lora_rank);
-}
-
 }  // namespace vllm

 // KV_T is the data type of key and value tensors.
@@ -554,20 +509,6 @@ void reshape_and_cache_flash(
          kv_c_stride, k_pe_stride, kv_lora_rank, pe_dim, block_size,   \
          reinterpret_cast<const float*>(scale.data_ptr()));

-// KV_T is the data type of key and value tensors.
-// CACHE_T is the stored data type of kv-cache.
-// KV_DTYPE is the real data type of kv-cache.
-#define CALL_CP_FUSED_CONCAT_AND_CACHE_MLA(KV_T, CACHE_T, KV_DTYPE)     \
-  vllm::cp_fused_concat_and_cache_mla_kernel<KV_T, CACHE_T, KV_DTYPE>   \
-      <<<grid, block, 0, stream>>>(                                     \
-          reinterpret_cast<KV_T*>(kv_c.data_ptr()),                     \
-          reinterpret_cast<KV_T*>(k_pe.data_ptr()),                     \
-          cp_local_token_select_indices.data_ptr<int64_t>(),            \
-          reinterpret_cast<CACHE_T*>(kv_cache.data_ptr()),              \
-          slot_mapping.data_ptr<int64_t>(), block_stride, entry_stride, \
-          kv_c_stride, k_pe_stride, kv_lora_rank, pe_dim, block_size,   \
-          reinterpret_cast<const float*>(scale.data_ptr()));
-
 void concat_and_cache_mla(
    torch::Tensor& kv_c,          // [num_tokens, kv_lora_rank]
    torch::Tensor& k_pe,          // [num_tokens, pe_dim]
@@ -606,50 +547,6 @@ void concat_and_cache_mla(
                             CALL_CONCAT_AND_CACHE_MLA);
 }

-// Note(hc): cp_fused_concat_and_cache_mla fuses the following three kernel
-// calls into one:
-// k_c_normed.index_select(0, cp_local_token_select_indices) + \
-// k_pe.squeeze(1).index_select(0, cp_local_token_select_indices) + \
-// concat_and_cache_mla.
-void cp_fused_concat_and_cache_mla(
-    torch::Tensor& kv_c,  // [num_total_tokens, kv_lora_rank]
-    torch::Tensor& k_pe,  // [num_total_tokens, pe_dim]
-    torch::Tensor& cp_local_token_select_indices,  // [num_tokens]
-    torch::Tensor& kv_cache,      // [num_blocks, block_size, (kv_lora_rank +
-                                  // pe_dim)]
-    torch::Tensor& slot_mapping,  // [num_tokens] or [num_actual_tokens]
-    const std::string& kv_cache_dtype, torch::Tensor& scale) {
-  // NOTE(woosuk): In vLLM V1, key.size(0) can be different from
-  // slot_mapping.size(0) because of padding for CUDA graphs.
-  // In vLLM V0, key.size(0) is always equal to slot_mapping.size(0) because
-  // both include padding.
-  // In vLLM V1, however, key.size(0) can be larger than slot_mapping.size(0)
-  // since key includes padding for CUDA graphs, while slot_mapping does not.
-  // In this case, slot_mapping.size(0) represents the actual number of tokens
-  // before padding.
-  // For compatibility with both cases, we use slot_mapping.size(0) as the
-  // number of tokens.
-  int num_tokens = slot_mapping.size(0);
-  int kv_lora_rank = kv_c.size(1);
-  int pe_dim = k_pe.size(1);
-  int block_size = kv_cache.size(1);
-
-  TORCH_CHECK(kv_cache.size(2) == kv_lora_rank + pe_dim);
-
-  int kv_c_stride = kv_c.stride(0);
-  int k_pe_stride = k_pe.stride(0);
-  int block_stride = kv_cache.stride(0);
-  int entry_stride = kv_cache.stride(1);
-
-  dim3 grid(num_tokens);
-  dim3 block(std::min(kv_lora_rank, 512));
-  const at::cuda::OptionalCUDAGuard device_guard(device_of(kv_c));
-  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-
-  DISPATCH_BY_KV_CACHE_DTYPE(kv_c.dtype(), kv_cache_dtype,
-                             CALL_CP_FUSED_CONCAT_AND_CACHE_MLA);
-}
-
 namespace vllm {

 template <typename Tout, typename Tin, Fp8KVCacheDataType kv_dt>