// csrc/cache.h

#pragma once

#include <torch/all.h>
#include <c10/util/Optional.h>

#include <map>
#include <vector>

// Block-swap entry point operating on two tensors, `src` and `dst`.
// Declaration only; the definition lives outside this header.
//
// NOTE(review): presumably `block_mapping` lists (source block, destination
// block) pairs and each listed block of `block_size_in_bytes` bytes is copied
// from `src` to `dst` -- the layout of `block_mapping` and the copy direction
// are not visible here; confirm against the definition.
void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
                 int64_t block_size_in_bytes,
                 const torch::Tensor& block_mapping);

// Batched counterpart of swap_blocks: takes three read-only tensors instead of
// a source/destination tensor pair plus a mapping.
// Declaration only; the definition lives outside this header.
//
// NOTE(review): presumably entry i describes one copy of `sizes[i]` bytes from
// the address in `src_ptrs[i]` to the address in `dst_ptrs[i]` -- element
// types and device placement are not visible here; confirm against the
// definition.
void swap_blocks_batch(const torch::Tensor& src_ptrs,
                       const torch::Tensor& dst_ptrs,
                       const torch::Tensor& sizes);

// Cache-write entry point taking per-token `key`/`value` tensors, the
// `key_cache`/`value_cache` tensors, a `slot_mapping`, a cache dtype name and
// separate K/V scale tensors.
// Declaration only; the definition lives outside this header.
//
// NOTE(review): presumably `key`/`value` are stored into the caches at the
// positions given by `slot_mapping`, with `kv_cache_dtype` selecting the
// stored element format and `k_scale`/`v_scale` used when that format is
// quantized -- tensor shapes and accepted dtype strings are not visible here;
// confirm against the definition.
void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
                       torch::Tensor& key_cache, torch::Tensor& value_cache,
                       torch::Tensor& slot_mapping,
                       const std::string& kv_cache_dtype,
                       torch::Tensor& k_scale, torch::Tensor& v_scale);

// Variant of reshape_and_cache with an identical parameter list (same names,
// types and order).
// Declaration only; the definition lives outside this header.
//
// NOTE(review): presumably this targets a different key/value cache memory
// layout than reshape_and_cache -- the difference is not visible in this
// header; confirm against the definition.
void reshape_and_cache_flash(torch::Tensor& key, torch::Tensor& value,
                             torch::Tensor& key_cache,
                             torch::Tensor& value_cache,
                             torch::Tensor& slot_mapping,
                             const std::string& kv_cache_dtype,
                             torch::Tensor& k_scale, torch::Tensor& v_scale);

// MLA cache-write entry point taking `kv_c` and `k_pe` (in that order), a
// single `kv_cache` tensor, a `slot_mapping`, a cache dtype name and one
// `scale` tensor.
// Declaration only; the definition lives outside this header.
//
// NOTE(review): presumably `kv_c` and `k_pe` are concatenated per token and
// written into `kv_cache` at the positions given by `slot_mapping`, with
// `scale` applied when `kv_cache_dtype` names a quantized format -- shapes
// and accepted dtype strings are not visible here; confirm against the
// definition.
void concat_and_cache_mla(torch::Tensor& kv_c, torch::Tensor& k_pe,
                          torch::Tensor& kv_cache, torch::Tensor& slot_mapping,
                          const std::string& kv_cache_dtype,
                          torch::Tensor& scale);

// RoPE-fused variant of the MLA cache write: in addition to the cache-write
// arguments it takes `positions`, `q_pe`, a `rope_cos_sin_cache` and the
// `rope_is_neox` flag.
// Declaration only; the definition lives outside this header.
//
// NOTE: k_pe and kv_c order is flipped compared to concat_and_cache_mla
// NOTE: the slot mapping also precedes the cache tensor here
// (`kv_cache_slot_mapping`, `kv_cache`), whereas concat_and_cache_mla takes
// (`kv_cache`, `slot_mapping`) -- take care when porting call sites.
//
// NOTE(review): presumably rotary embedding is applied to `q_pe`/`k_pe` using
// `positions` and `rope_cos_sin_cache` before the cache write, with
// `rope_is_neox` choosing the rotation style -- not visible in this header;
// confirm against the definition.
void concat_and_cache_mla_rope_fused(
    torch::Tensor& positions, torch::Tensor& q_pe, torch::Tensor& k_pe,
    torch::Tensor& kv_c, torch::Tensor& rope_cos_sin_cache, bool rope_is_neox,
    torch::Tensor& kv_cache_slot_mapping, torch::Tensor& kv_cache,
    const std::string& kv_cache_dtype, torch::Tensor& kv_cache_quant_scale);

// Just for unittest
// Conversion helper between `src_cache` and `dst_cache`. Unlike the other ops
// in this header, the scale is passed as a scalar `double` by value rather
// than as a tensor.
// Declaration only; the definition lives outside this header.
//
// NOTE(review): presumably `kv_cache_dtype` selects the FP8 format and the
// conversion direction is inferred from the tensors' dtypes -- not visible in
// this header; confirm against the definition.
void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
                 const double scale, const std::string& kv_cache_dtype);

// Gather entry point from a block-structured `src_cache` into a token-major
// `dst`; expected tensor layouts are given by the per-parameter comments.
// Declaration only; the definition lives outside this header.
//
// Every tensor, including `dst`, is taken by const reference. `seq_starts` is
// optional and defaults to std::nullopt.
//
// NOTE(review): presumably `dst` is the output (written through the tensor
// handle despite the const reference), entries are dequantized using `scale`
// when `kv_cache_dtype` names a quantized format, and `seq_starts` gives a
// per-sequence starting offset -- none of this is visible in this header;
// confirm against the definition.
void gather_and_maybe_dequant_cache(
    torch::Tensor const& src_cache,     // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
    torch::Tensor const& dst,           // [TOT_TOKENS, ENTRIES...]
    torch::Tensor const& block_table,   // [BATCH, BLOCK_INDICES]
    torch::Tensor const& cu_seq_lens,   // [BATCH+1]
    torch::Tensor const& token_to_seq,  // [MAX_TOKEN_ACROSS_CHUNKS]
    int64_t num_tokens, const std::string& kv_cache_dtype,
    torch::Tensor const& scale,
    std::optional<torch::Tensor> seq_starts = std::nullopt);

// Gather entry point from a block-structured `src_cache` into a token-major
// `dst`; expected tensor layouts are given by the per-parameter comments.
// Declaration only; the definition lives outside this header.
//
// Takes no dtype string and no scale tensor (see the TODO below).
// `seq_starts` is optional and defaults to std::nullopt.
//
// NOTE(review): presumably `dst` is the output despite the const reference
// and `seq_starts` gives a per-sequence starting offset -- not visible in
// this header; confirm against the definition.
//
// TODO(hc): cp_gather_cache needs to support scaled kvcache in the future.
void cp_gather_cache(
    torch::Tensor const& src_cache,    // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
    torch::Tensor const& dst,          // [TOT_TOKENS, ENTRIES...]
    torch::Tensor const& block_table,  // [BATCH, BLOCK_INDICES]
    torch::Tensor const& cu_seq_lens,  // [BATCH+1]
    int64_t batch_size, std::optional<torch::Tensor> seq_starts = std::nullopt);

// Gather and upconvert FP8 KV cache to BF16 workspace
// Declaration only; the definition lives outside this header.
//
// Per the parameter comments, the last dimension is 656 for `src_cache` and
// 576 for `dst`. Unlike the other gather ops in this header, this one takes
// per-sequence `seq_lens` ([BATCH]) rather than cumulative `cu_seq_lens`
// ([BATCH+1]).
//
// NOTE(review): presumably the 656 entries per token are a packed FP8 record
// (values plus scale metadata) that expands to 576 BF16 elements, and
// `workspace_starts` gives each sequence's first row in `dst` -- not visible
// in this header; confirm against the definition.
void cp_gather_and_upconvert_fp8_kv_cache(
    torch::Tensor const& src_cache,         // [NUM_BLOCKS, BLOCK_SIZE, 656]
    torch::Tensor const& dst,               // [TOT_TOKENS, 576]
    torch::Tensor const& block_table,       // [BATCH, BLOCK_INDICES]
    torch::Tensor const& seq_lens,          // [BATCH]
    torch::Tensor const& workspace_starts,  // [BATCH]
    int64_t batch_size);

// Indexer K quantization and cache function
// Declaration only; the definition lives outside this header.
//
// Expected tensor layouts are given by the per-parameter comments.
//
// NOTE(review): presumably `k` is quantized in groups of `quant_block_size`
// elements and stored into `kv_cache` at the positions given by
// `slot_mapping`, with `scale_fmt` naming the scale encoding -- the accepted
// `scale_fmt` values are not visible in this header; confirm against the
// definition.
void indexer_k_quant_and_cache(
    torch::Tensor& k,             // [num_tokens, head_dim]
    torch::Tensor& kv_cache,      // [num_blocks, block_size, cache_stride]
    torch::Tensor& slot_mapping,  // [num_tokens]
    int64_t quant_block_size,     // quantization block size
    const std::string& scale_fmt);

// Concatenate query nope and rope for MLA/DSA attention
// Declaration only; the definition lives outside this header.
//
// Per the parameter comments, `q_out`'s last dimension is the sum of the last
// dimensions of `ql_nope` and `q_pe`; the leading [num_tokens, num_heads]
// dimensions are shared by all three tensors.
//
// NOTE(review): presumably `q_out` is a caller-allocated output with
// `ql_nope` placed before `q_pe` along the last dimension -- not visible in
// this header; confirm against the definition.
void concat_mla_q(
    torch::Tensor& ql_nope,  // [num_tokens, num_heads, nope_dim]
    torch::Tensor& q_pe,     // [num_tokens, num_heads, rope_dim]
    torch::Tensor& q_out);   // [num_tokens, num_heads, nope_dim + rope_dim]

// Extract function to gather quantized K cache
// Declaration only; the definition lives outside this header.
//
// `kv_cache`, `block_table` and `cu_seq_lens` are taken by const reference;
// `dst_k` and `dst_scale` are taken by non-const reference. Expected tensor
// layouts are given by the per-parameter comments.
//
// NOTE(review): presumably `dst_k` and `dst_scale` are caller-allocated
// outputs receiving the quantized values and their scales, and the `* 4` in
// `dst_scale`'s last dimension is the byte width of one scale value -- not
// visible in this header; confirm against the definition.
void cp_gather_indexer_k_quant_cache(
    const torch::Tensor& kv_cache,  // [num_blocks, block_size, cache_stride]
    torch::Tensor& dst_k,           // [num_tokens, head_dim]
    torch::Tensor& dst_scale,  // [num_tokens, head_dim / quant_block_size * 4]
    const torch::Tensor& block_table,   // [batch_size, num_blocks]
    const torch::Tensor& cu_seq_lens);  // [batch_size + 1]
Avoid multiple redefinition (#1817) 2023-12-14 12:35:58 -05:00			`#pragma once`

[Kernel][Misc] Use TORCH_LIBRARY instead of PYBIND11_MODULE for custom ops (#5047) 2024-06-09 16:23:30 -04:00			`#include <torch/all.h>`
[Attention] Use sparse prefill kernel for fp8 kv-cache in DeepSeek-v3.2 (#27532) Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com> 2025-12-12 08:57:47 -05:00			`#include <c10/util/Optional.h>`
Implement cache ops 2023-02-16 07:47:03 +00:00
Support beam search & parallel generation (#7) 2023-03-10 09:58:21 -08:00			`#include <map>`
			`#include <vector>`

[CI/Build] Enforce style for C++ and CUDA code with `clang-format` (#4722) 2024-05-22 03:18:41 -04:00			`void swap_blocks(torch::Tensor& src, torch::Tensor& dst,`
OffloadingConnector: Support kernel_block_size != block_size (#30692) Signed-off-by: Or Ozeri <oro@il.ibm.com> 2026-01-22 14:30:04 +02:00			`int64_t block_size_in_bytes,`
[CI/Build] Enforce style for C++ and CUDA code with `clang-format` (#4722) 2024-05-22 03:18:41 -04:00			`const torch::Tensor& block_mapping);`
Implement cache ops 2023-02-16 07:47:03 +00:00
[Perf] Batch KV cache swap copies via cuMemcpyBatchAsync (#38460) Signed-off-by: Itay Etelis <itay.etelis@ibm.com> Co-authored-by: Itay Etelis <itay.etelis@ibm.com> Co-authored-by: Or Ozeri <oro@il.ibm.com> 2026-04-03 06:13:23 +03:00			`void swap_blocks_batch(const torch::Tensor& src_ptrs,`
			`const torch::Tensor& dst_ptrs,`
			`const torch::Tensor& sizes);`

[CI/Build] Enforce style for C++ and CUDA code with `clang-format` (#4722) 2024-05-22 03:18:41 -04:00			`void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,`
			`torch::Tensor& key_cache, torch::Tensor& value_cache,`
			`torch::Tensor& slot_mapping,`
[FP8][Kernel] Dynamic kv cache scaling factors computation (#11906) Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com> Co-authored-by: Micah Williamson <micah.williamson@amd.com> 2025-01-23 13:04:03 -05:00			`const std::string& kv_cache_dtype,`
			`torch::Tensor& k_scale, torch::Tensor& v_scale);`
Add reshape_and_cache op 2023-02-18 19:22:57 +00:00
[CI/Build] Enforce style for C++ and CUDA code with `clang-format` (#4722) 2024-05-22 03:18:41 -04:00			`void reshape_and_cache_flash(torch::Tensor& key, torch::Tensor& value,`
			`torch::Tensor& key_cache,`
			`torch::Tensor& value_cache,`
			`torch::Tensor& slot_mapping,`
Add fp8 support to `reshape_and_cache_flash` (#6667) 2024-07-24 11:36:52 -07:00			`const std::string& kv_cache_dtype,`
[FP8][Kernel] Dynamic kv cache scaling factors computation (#11906) Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com> Co-authored-by: Micah Williamson <micah.williamson@amd.com> 2025-01-23 13:04:03 -05:00			`torch::Tensor& k_scale, torch::Tensor& v_scale);`
[Kernel] Use flashinfer for decoding (#4353) Co-authored-by: LiuXiaoxuanPKU <llilyliupku@gmail.com> 2024-05-03 15:51:27 -07:00
[Attention] MLA decode optimizations (#12528) Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com> Signed-off-by: simon-mo <xmo@berkeley.edu> Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu> Co-authored-by: simon-mo <simon.mo@hey.com> Co-authored-by: Michael Goin <mgoin64@gmail.com> Co-authored-by: Zhuohan Li <zhuohan123@gmail.com> Co-authored-by: Tyler Michael Smith <tysmith@redhat.com> Co-authored-by: Alexander Matveev <59768536+alexm-neuralmagic@users.noreply.github.com> Co-authored-by: simon-mo <xmo@berkeley.edu> 2025-01-31 02:49:37 -05:00			`void concat_and_cache_mla(torch::Tensor& kv_c, torch::Tensor& k_pe,`
			`torch::Tensor& kv_cache, torch::Tensor& slot_mapping,`
			`const std::string& kv_cache_dtype,`
			`torch::Tensor& scale);`

Fuse RoPE and MLA KV-cache write (#25774) Signed-off-by: Patryk Saffer <patryk.saffer99@gmail.com> Signed-off-by: PatrykSaffer <patryk.saffer@mistral.ai> Co-authored-by: Patryk Saffer <patryk.saffer99@gmail.com> Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com> Co-authored-by: Michael Goin <mgoin64@gmail.com> 2026-01-10 04:18:37 +01:00			`// NOTE: k_pe and kv_c order is flipped compared to concat_and_cache_mla`
			`void concat_and_cache_mla_rope_fused(`
			`torch::Tensor& positions, torch::Tensor& q_pe, torch::Tensor& k_pe,`
			`torch::Tensor& kv_c, torch::Tensor& rope_cos_sin_cache, bool rope_is_neox,`
			`torch::Tensor& kv_cache_slot_mapping, torch::Tensor& kv_cache,`
			`const std::string& kv_cache_dtype, torch::Tensor& kv_cache_quant_scale);`

Support FP8-E5M2 KV Cache (#2279) Co-authored-by: zhaoyang <zhao.yang16@zte.com.cn> Co-authored-by: Zhuohan Li <zhuohan123@gmail.com> 2024-01-29 08:43:54 +08:00			`// Just for unittest`
[CI/Build] Enforce style for C++ and CUDA code with `clang-format` (#4722) 2024-05-22 03:18:41 -04:00			`void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,`
[Kernel][Misc] Use TORCH_LIBRARY instead of PYBIND11_MODULE for custom ops (#5047) 2024-06-09 16:23:30 -04:00			`const double scale, const std::string& kv_cache_dtype);`
[Attention] MLA with chunked prefill (#12639) Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com> Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com> Co-authored-by: Patrick Horn <patrick.horn@gmail.com> Co-authored-by: simon-mo <xmo@berkeley.edu> Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com> 2025-02-21 18:30:12 -05:00
[Kernel] Add FP8 support with FlashMLA backend (#22668) Signed-off-by: Matthew Bonanni <mbonanni001@gmail.com> 2025-08-21 22:26:32 -04:00			`void gather_and_maybe_dequant_cache(`
[Perf][Deepseek] optimize gather_and_maybe_dequant_cache kernel's perf for extremely long sequence (#28029) Signed-off-by: ganyi <ygan@amd.com> 2025-11-25 10:05:46 +08:00			`torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]`
			`torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...]`
			`torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES]`
			`torch::Tensor const& cu_seq_lens, // [BATCH+1]`
			`torch::Tensor const& token_to_seq, // [MAX_TOKEN_ACROSS_CHUNKS]`
			`int64_t num_tokens, const std::string& kv_cache_dtype,`
[Kernel] Add FP8 support with FlashMLA backend (#22668) Signed-off-by: Matthew Bonanni <mbonanni001@gmail.com> 2025-08-21 22:26:32 -04:00			`torch::Tensor const& scale,`
[Kernel] cuda kernels for upcoming decode context parallel feature (#23791) Co-authored-by: hongchao <hongchao@msh.team> 2025-08-28 15:29:11 +08:00			`std::optional<torch::Tensor> seq_starts = std::nullopt);`

			`// TODO(hc): cp_gather_cache need support scaled kvcahe in the future.`
			`void cp_gather_cache(`
			`torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]`
			`torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...]`
			`torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES]`
			`torch::Tensor const& cu_seq_lens, // [BATCH+1]`
			`int64_t batch_size, std::optional<torch::Tensor> seq_starts = std::nullopt);`
[New Model] DeepSeek-V3.2 (Rebased to Main) (#25896) Signed-off-by: Chen Zhang <zhangch99@outlook.com> Signed-off-by: youkaichao <youkaichao@gmail.com> Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com> Signed-off-by: mgoin <mgoin64@gmail.com> Signed-off-by: NickLucche <nlucches@redhat.com> Signed-off-by: Yongye Zhu <zyy1102000@gmail.com> Signed-off-by: Barry Kang <43644113+Barry-Delaney@users.noreply.github.com> Signed-off-by: Lucia Fang <fanglu@meta.com> Co-authored-by: Chen Zhang <zhangch99@outlook.com> Co-authored-by: youkaichao <youkaichao@gmail.com> Co-authored-by: Lucas Wilkinson <lwilkins@redhat.com> Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> Co-authored-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com> Co-authored-by: yewentao256 <zhyanwentao@126.com> Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Co-authored-by: mgoin <mgoin64@gmail.com> Co-authored-by: Lucia Fang <116399278+luccafong@users.noreply.github.com> Co-authored-by: Lucia Fang <fanglu@meta.com> Co-authored-by: NickLucche <nlucches@redhat.com> Co-authored-by: Siyuan Fu <siyuanf@nvidia.com> Co-authored-by: Matthew Bonanni <mbonanni@redhat.com> Co-authored-by: Xiaozhu Meng <mxz297@gmail.com> Co-authored-by: Barry Kang <43644113+Barry-Delaney@users.noreply.github.com> 2025-09-30 05:14:41 -04:00
[Attention] Use sparse prefill kernel for fp8 kv-cache in DeepSeek-v3.2 (#27532) Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com> 2025-12-12 08:57:47 -05:00			`// Gather and upconvert FP8 KV cache to BF16 workspace`
			`void cp_gather_and_upconvert_fp8_kv_cache(`
			`torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, 656]`
			`torch::Tensor const& dst, // [TOT_TOKENS, 576]`
			`torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES]`
			`torch::Tensor const& seq_lens, // [BATCH]`
			`torch::Tensor const& workspace_starts, // [BATCH]`
			`int64_t batch_size);`

[New Model] DeepSeek-V3.2 (Rebased to Main) (#25896) Signed-off-by: Chen Zhang <zhangch99@outlook.com> Signed-off-by: youkaichao <youkaichao@gmail.com> Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com> Signed-off-by: mgoin <mgoin64@gmail.com> Signed-off-by: NickLucche <nlucches@redhat.com> Signed-off-by: Yongye Zhu <zyy1102000@gmail.com> Signed-off-by: Barry Kang <43644113+Barry-Delaney@users.noreply.github.com> Signed-off-by: Lucia Fang <fanglu@meta.com> Co-authored-by: Chen Zhang <zhangch99@outlook.com> Co-authored-by: youkaichao <youkaichao@gmail.com> Co-authored-by: Lucas Wilkinson <lwilkins@redhat.com> Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> Co-authored-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com> Co-authored-by: yewentao256 <zhyanwentao@126.com> Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Co-authored-by: mgoin <mgoin64@gmail.com> Co-authored-by: Lucia Fang <116399278+luccafong@users.noreply.github.com> Co-authored-by: Lucia Fang <fanglu@meta.com> Co-authored-by: NickLucche <nlucches@redhat.com> Co-authored-by: Siyuan Fu <siyuanf@nvidia.com> Co-authored-by: Matthew Bonanni <mbonanni@redhat.com> Co-authored-by: Xiaozhu Meng <mxz297@gmail.com> Co-authored-by: Barry Kang <43644113+Barry-Delaney@users.noreply.github.com> 2025-09-30 05:14:41 -04:00			`// Indexer K quantization and cache function`
			`void indexer_k_quant_and_cache(`
			`torch::Tensor& k, // [num_tokens, head_dim]`
			`torch::Tensor& kv_cache, // [num_blocks, block_size, cache_stride]`
			`torch::Tensor& slot_mapping, // [num_tokens]`
			`int64_t quant_block_size, // quantization block size`
			`const std::string& scale_fmt);`
Add gather_indexer_k_quant_cache kernel (#25931) Signed-off-by: Barry Kang <43644113+Barry-Delaney@users.noreply.github.com> Signed-off-by: Simon Mo <simon.mo@hey.com> Signed-off-by: Chen Zhang <zhangch99@outlook.com> Co-authored-by: Simon Mo <simon.mo@hey.com> Co-authored-by: Yongye Zhu <zyy1102000@gmail.com> Co-authored-by: Chen Zhang <zhangch99@outlook.com> 2025-10-08 12:58:57 +08:00
[Attention][Perf][Kernel] Replace torch.cat with vectorized CUDA kernel MLA query concat - DeepSeek-V3.2 (#34917) Signed-off-by: LopezCastroRoberto <rocastro@redhat.com> Signed-off-by: Roberto L. Castro <38211239+LopezCastroRoberto@users.noreply.github.com> 2026-03-09 17:50:36 +01:00			`// Concatenate query nope and rope for MLA/DSA attention`
			`void concat_mla_q(`
			`torch::Tensor& ql_nope, // [num_tokens, num_heads, nope_dim]`
			`torch::Tensor& q_pe, // [num_tokens, num_heads, rope_dim]`
			`torch::Tensor& q_out); // [num_tokens, num_heads, nope_dim + rope_dim]`

Add gather_indexer_k_quant_cache kernel (#25931) Signed-off-by: Barry Kang <43644113+Barry-Delaney@users.noreply.github.com> Signed-off-by: Simon Mo <simon.mo@hey.com> Signed-off-by: Chen Zhang <zhangch99@outlook.com> Co-authored-by: Simon Mo <simon.mo@hey.com> Co-authored-by: Yongye Zhu <zyy1102000@gmail.com> Co-authored-by: Chen Zhang <zhangch99@outlook.com> 2025-10-08 12:58:57 +08:00			`// Extract function to gather quantized K cache`
			`void cp_gather_indexer_k_quant_cache(`
			`const torch::Tensor& kv_cache, // [num_blocks, block_size, cache_stride]`
			`torch::Tensor& dst_k, // [num_tokens, head_dim]`
			`torch::Tensor& dst_scale, // [num_tokens, head_dim / quant_block_size * 4]`
			`const torch::Tensor& block_table, // [batch_size, num_blocks]`
[Attention] Use sparse prefill kernel for fp8 kv-cache in DeepSeek-v3.2 (#27532) Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com> 2025-12-12 08:57:47 -05:00			`const torch::Tensor& cu_seq_lens); // [batch_size + 1]`