csrc/rocm/ops.h

#pragma once

#include <torch/all.h>

// ROCm skinny-GEMM op (declaration only; no definition is visible in this
// header).
//
//   in_a, in_b      Input tensors, taken by mutable (non-const) reference.
//                   NOTE(review): presumably the two matmul operands; which
//                   one is the weight, and the expected layouts/dtypes, are
//                   not visible here - confirm against the implementation.
//   rows_per_block  NOTE(review): the name suggests how many rows one GPU
//                   block handles; confirm the accepted values with the
//                   kernel.
//
// Returns a torch::Tensor by value (presumably the GEMM result - verify).
torch::Tensor LLMM1(
    at::Tensor& in_a,
    at::Tensor& in_b,
    const int64_t rows_per_block);

// ROCm skinny-GEMM op with an optional bias (declaration only; no definition
// is visible in this header).
//
//   in_a, in_b  Read-only input tensors.
//               NOTE(review): presumably the two matmul operands; shapes,
//               dtypes and which one is the weight are not visible here -
//               confirm against the implementation.
//   in_bias     Optional tensor; callers may pass std::nullopt.
//               NOTE(review): presumably added to the result when present.
//   CuCount     NOTE(review): the name suggests the number of GPU compute
//               units to size the launch for; confirm with the caller.
//
// Returns a torch::Tensor by value (presumably the GEMM result - verify).
torch::Tensor wvSplitK(
    const at::Tensor& in_a,
    const at::Tensor& in_b,
    const std::optional<at::Tensor>& in_bias,
    const int64_t CuCount);

// Variant of the split-K skinny-GEMM op; it takes the same parameter list as
// wvSplitK (declaration only; no definition is visible in this header).
// NOTE(review): the "rc" suffix presumably refers to the atomics
// reduce-counting optimisation - confirm against the implementation.
//
//   in_a, in_b  Read-only input tensors.
//               NOTE(review): presumably the two matmul operands; shapes and
//               dtypes are not visible here.
//   in_bias     Optional tensor; callers may pass std::nullopt.
//   CuCount     NOTE(review): the name suggests the number of GPU compute
//               units; confirm with the caller.
//
// Returns a torch::Tensor by value (presumably the GEMM result - verify).
torch::Tensor wvSplitKrc(
    const at::Tensor& in_a,
    const at::Tensor& in_b,
    const std::optional<at::Tensor>& in_bias,
    const int64_t CuCount);

// Skinny-GEMM op that takes per-operand scale tensors and an explicit
// destination tensor (declaration only; no definition is visible in this
// header).
// NOTE(review): presumably the quantized (fp8) counterpart of wvSplitK -
// confirm against the implementation.
//
//   in_a, in_b        Read-only input tensors (presumably the matmul
//                     operands; shapes/dtypes not visible here).
//   in_bias           Optional tensor; callers may pass std::nullopt.
//   out_c             Mutable tensor reference. The function returns void,
//                     so this is presumably where the result is written -
//                     verify.
//   scale_a, scale_b  Read-only tensors.
//                     NOTE(review): presumably the scales for in_a and in_b
//                     respectively; granularity is not visible here.
//   CuCount           NOTE(review): the name suggests the number of GPU
//                     compute units; confirm with the caller.
void wvSplitKQ(
    const at::Tensor& in_a,
    const at::Tensor& in_b,
    const std::optional<at::Tensor>& in_bias,
    at::Tensor& out_c,
    const at::Tensor& scale_a,
    const at::Tensor& scale_b,
    const int64_t CuCount);

// ROCm custom paged-attention op (declaration only; no definition is visible
// in this header). Returns void; every tensor that is not optional is taken
// by mutable reference, so the signature alone does not say which arguments
// are inputs and which are outputs.
//
// NOTE(review): the per-parameter notes below are inferred from the names
// only - confirm shapes, dtypes and semantics against the kernel.
//
//   out                     presumably the attention output buffer.
//   exp_sums, max_logits,
//   tmp_out                 presumably scratch buffers for a partitioned
//                           softmax reduction.
//   query                   presumably the query tensor.
//   key_cache, value_cache  presumably the paged K/V cache storage.
//   num_kv_heads            integer count (int64_t).
//   scale                   double; presumably the softmax scaling factor.
//   block_tables, seq_lens  presumably per-sequence block mapping and
//                           sequence lengths.
//   query_start_loc         optional tensor; callers may pass std::nullopt.
//   block_size              integer (int64_t); presumably the KV-cache block
//                           size.
//   max_seq_len             integer (int64_t).
//   alibi_slopes            optional tensor; callers may pass std::nullopt.
//   kv_cache_dtype          string selector; accepted values are not visible
//                           here.
//   k_scale, v_scale        tensors; presumably scales for the K/V cache.
//   fp8_out_scale           optional tensor; callers may pass std::nullopt.
//   mfma_type               string selector; accepted values are not visible
//                           here.
void paged_attention(
    torch::Tensor& out,
    torch::Tensor& exp_sums,
    torch::Tensor& max_logits,
    torch::Tensor& tmp_out,
    torch::Tensor& query,
    torch::Tensor& key_cache,
    torch::Tensor& value_cache,
    int64_t num_kv_heads,
    double scale,
    torch::Tensor& block_tables,
    torch::Tensor& seq_lens,
    const std::optional<torch::Tensor>& query_start_loc,
    int64_t block_size,
    int64_t max_seq_len,
    const std::optional<torch::Tensor>& alibi_slopes,
    const std::string& kv_cache_dtype,
    torch::Tensor& k_scale,
    torch::Tensor& v_scale,
    const std::optional<torch::Tensor>& fp8_out_scale,
    const std::string& mfma_type);
[Kernel][Hardware][Amd]Custom paged attention kernel for rocm (#8310) 2024-09-13 19:01:11 -05:00			`#pragma once`

			`#include <torch/all.h>`

[Performance][ROCm] Add skinny gemms for unquantized linear on ROCm (#15830) Signed-off-by: charlifu <charlifu@amd.com> Co-authored-by: Tyler Michael Smith <tysmith@redhat.com> 2025-04-21 22:46:22 -05:00			`torch::Tensor LLMM1(at::Tensor& in_a, at::Tensor& in_b,`
			`const int64_t rows_per_block);`

[ROCm] Add skinny gemm bias support for dtypes fp16,bf16,fp8 (#24988) Signed-off-by: Hashem Hashemi <hashem.hashemi@amd.com> Signed-off-by: Hashem Hashemi <159079214+amd-hhashemi@users.noreply.github.com> 2025-09-23 11:31:45 -07:00			`torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,`
[MISC] replace c10::optional with std::optional (#25602) Signed-off-by: Shiyan Deng <dsy842974287@meta.com> 2025-09-24 16:56:21 -07:00			`const std::optional<at::Tensor>& in_bias,`
[Performance][ROCm] Add skinny gemms for unquantized linear on ROCm (#15830) Signed-off-by: charlifu <charlifu@amd.com> Co-authored-by: Tyler Michael Smith <tysmith@redhat.com> 2025-04-21 22:46:22 -05:00			`const int64_t CuCount);`

Atomics Reduce Counting Optimization for SplitK Skinny GEMMs. (#29843) Signed-off-by: Hashem Hashemi <hashem.hashemi@amd.com> 2026-01-16 09:45:04 -08:00			`torch::Tensor wvSplitKrc(const at::Tensor& in_a, const at::Tensor& in_b,`
			`const std::optional<at::Tensor>& in_bias,`
			`const int64_t CuCount);`

[ROCm] Add skinny gemm bias support for dtypes fp16,bf16,fp8 (#24988) Signed-off-by: Hashem Hashemi <hashem.hashemi@amd.com> Signed-off-by: Hashem Hashemi <159079214+amd-hhashemi@users.noreply.github.com> 2025-09-23 11:31:45 -07:00			`void wvSplitKQ(const at::Tensor& in_a, const at::Tensor& in_b,`
[MISC] replace c10::optional with std::optional (#25602) Signed-off-by: Shiyan Deng <dsy842974287@meta.com> 2025-09-24 16:56:21 -07:00			`const std::optional<at::Tensor>& in_bias, at::Tensor& out_c,`
[ROCm] Add skinny gemm bias support for dtypes fp16,bf16,fp8 (#24988) Signed-off-by: Hashem Hashemi <hashem.hashemi@amd.com> Signed-off-by: Hashem Hashemi <159079214+amd-hhashemi@users.noreply.github.com> 2025-09-23 11:31:45 -07:00			`const at::Tensor& scale_a, const at::Tensor& scale_b,`
			`const int64_t CuCount);`
[Performance][ROCm] Add skinny gemms for unquantized linear on ROCm (#15830) Signed-off-by: charlifu <charlifu@amd.com> Co-authored-by: Tyler Michael Smith <tysmith@redhat.com> 2025-04-21 22:46:22 -05:00
[ROCm][FP8][Kernel] FP8 quantization fused into Custom Paged Attention (#17139) Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com> 2025-05-07 10:12:35 -04:00			`void paged_attention(`
			`torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits,`
			`torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,`
			`torch::Tensor& value_cache, int64_t num_kv_heads, double scale,`
[ROCm][Misc] Rename the context_len to seq_len in ROCm custom paged attention kernel (#22097) Signed-off-by: charlifu <charlifu@amd.com> 2025-08-09 01:15:06 -05:00			`torch::Tensor& block_tables, torch::Tensor& seq_lens,`
[ROCm][FP8][Kernel] FP8 quantization fused into Custom Paged Attention (#17139) Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com> 2025-05-07 10:12:35 -04:00			`const std::optional<torch::Tensor>& query_start_loc, int64_t block_size,`
[ROCm][Misc] Rename the context_len to seq_len in ROCm custom paged attention kernel (#22097) Signed-off-by: charlifu <charlifu@amd.com> 2025-08-09 01:15:06 -05:00			`int64_t max_seq_len, const std::optional<torch::Tensor>& alibi_slopes,`
[ROCm][FP8][Kernel] FP8 quantization fused into Custom Paged Attention (#17139) Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com> 2025-05-07 10:12:35 -04:00			`const std::string& kv_cache_dtype, torch::Tensor& k_scale,`
Fp8 paged attention update (#22222) Signed-off-by: Xiao Yu <xiao.yu@amd.com> Signed-off-by: xiao-llm <xiao.yu.dc@outlook.com> Co-authored-by: Xiao Yu <xiao.yu@metamaterial.com> Co-authored-by: Xiao Yu <xiao.yu@amd.com> Co-authored-by: Bowen Bao <bowenbao@amd.com> 2025-09-15 10:43:26 -04:00			`torch::Tensor& v_scale, const std::optional<torch::Tensor>& fp8_out_scale,`
			`const std::string& mfma_type);`