From 8a24842765ba9b45b0116d65b16c2d5b1fcb7e05 Mon Sep 17 00:00:00 2001 From: Amanzhol Salykov Date: Wed, 11 Mar 2026 20:00:08 +0100 Subject: [PATCH] [ROCm] add tuned moe_wna16_triton kernel configs for CDNA4 (#35093) Signed-off-by: salykova Signed-off-by: amd-asalykov --- ...=AMD_Instinct_MI350X,dtype=int4_w4a16.json | 192 ++++++++++++++++++ ...D_Instinct_MI350_OAM,dtype=int4_w4a16.json | 192 ++++++++++++++++++ ...=AMD_Instinct_MI355X,dtype=int4_w4a16.json | 192 ++++++++++++++++++ ...D_Instinct_MI355_OAM,dtype=int4_w4a16.json | 192 ++++++++++++++++++ 4 files changed, 768 insertions(+) create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=AMD_Instinct_MI350X,dtype=int4_w4a16.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=AMD_Instinct_MI350_OAM,dtype=int4_w4a16.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=AMD_Instinct_MI355X,dtype=int4_w4a16.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=AMD_Instinct_MI355_OAM,dtype=int4_w4a16.json diff --git a/vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=AMD_Instinct_MI350X,dtype=int4_w4a16.json b/vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=AMD_Instinct_MI350X,dtype=int4_w4a16.json new file mode 100644 index 000000000..98197bfb8 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=AMD_Instinct_MI350X,dtype=int4_w4a16.json @@ -0,0 +1,192 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 2, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 2, + "matrix_instr_nonkdim": 16 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 2, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 2, + "matrix_instr_nonkdim": 16 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 2, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 2, + "matrix_instr_nonkdim": 16 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 2, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 2, + "matrix_instr_nonkdim": 16 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 2, + "matrix_instr_nonkdim": 16 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 2, + "matrix_instr_nonkdim": 16 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 2, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 1, + "matrix_instr_nonkdim": 16 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 2, + "matrix_instr_nonkdim": 16 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 1, + "matrix_instr_nonkdim": 16 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 1, + "matrix_instr_nonkdim": 16 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 2, + "matrix_instr_nonkdim": 16 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 2, + "matrix_instr_nonkdim": 16 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 2, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 2, + "matrix_instr_nonkdim": 32 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 2, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 2, + "matrix_instr_nonkdim": 32 + }, + "1536": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 2, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 2, + "matrix_instr_nonkdim": 32 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 1, + "matrix_instr_nonkdim": 16 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 1, + "matrix_instr_nonkdim": 16 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 2, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 2, + "matrix_instr_nonkdim": 16 + }, + "8192": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 2, + "SPLIT_K": 1, + "num_warps": 8, + "num_stages": 2, + "matrix_instr_nonkdim": 32 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=AMD_Instinct_MI350_OAM,dtype=int4_w4a16.json b/vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=AMD_Instinct_MI350_OAM,dtype=int4_w4a16.json new file mode 100644 index 000000000..98197bfb8 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=AMD_Instinct_MI350_OAM,dtype=int4_w4a16.json @@ -0,0 +1,192 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 2, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 2, + "matrix_instr_nonkdim": 16 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 2, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 2, + "matrix_instr_nonkdim": 16 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 2, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 2, + "matrix_instr_nonkdim": 16 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 2, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 2, + "matrix_instr_nonkdim": 16 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 2, + "matrix_instr_nonkdim": 16 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 2, + "matrix_instr_nonkdim": 16 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 2, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 1, + "matrix_instr_nonkdim": 16 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 2, + "matrix_instr_nonkdim": 16 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 1, + "matrix_instr_nonkdim": 16 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 1, + "matrix_instr_nonkdim": 16 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 2, + "matrix_instr_nonkdim": 16 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 2, + "matrix_instr_nonkdim": 16 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 2, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 2, + "matrix_instr_nonkdim": 32 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 2, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 2, + "matrix_instr_nonkdim": 32 + }, + "1536": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 2, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 2, + "matrix_instr_nonkdim": 32 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 1, + "matrix_instr_nonkdim": 16 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 1, + "matrix_instr_nonkdim": 16 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 2, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 2, + "matrix_instr_nonkdim": 16 + }, + "8192": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 2, + "SPLIT_K": 1, + "num_warps": 8, + "num_stages": 2, + "matrix_instr_nonkdim": 32 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=AMD_Instinct_MI355X,dtype=int4_w4a16.json b/vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=AMD_Instinct_MI355X,dtype=int4_w4a16.json new file mode 100644 index 000000000..98197bfb8 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=AMD_Instinct_MI355X,dtype=int4_w4a16.json @@ -0,0 +1,192 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 2, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 2, + "matrix_instr_nonkdim": 16 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 2, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 2, + "matrix_instr_nonkdim": 16 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 2, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 2, + "matrix_instr_nonkdim": 16 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 2, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 2, + "matrix_instr_nonkdim": 16 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 2, + "matrix_instr_nonkdim": 16 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 2, + "matrix_instr_nonkdim": 16 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 2, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 1, + "matrix_instr_nonkdim": 16 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 2, + "matrix_instr_nonkdim": 16 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 1, + "matrix_instr_nonkdim": 16 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 1, + "matrix_instr_nonkdim": 16 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 2, + "matrix_instr_nonkdim": 16 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 2, + "matrix_instr_nonkdim": 16 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 2, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 2, + "matrix_instr_nonkdim": 32 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 2, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 2, + "matrix_instr_nonkdim": 32 + }, + "1536": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 2, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 2, + "matrix_instr_nonkdim": 32 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 1, + "matrix_instr_nonkdim": 16 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 1, + "matrix_instr_nonkdim": 16 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 2, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 2, + "matrix_instr_nonkdim": 16 + }, + "8192": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 2, + "SPLIT_K": 1, + "num_warps": 8, + "num_stages": 2, + "matrix_instr_nonkdim": 32 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=AMD_Instinct_MI355_OAM,dtype=int4_w4a16.json b/vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=AMD_Instinct_MI355_OAM,dtype=int4_w4a16.json new file mode 100644 index 000000000..98197bfb8 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=AMD_Instinct_MI355_OAM,dtype=int4_w4a16.json @@ -0,0 +1,192 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 2, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 2, + "matrix_instr_nonkdim": 16 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 2, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 2, + "matrix_instr_nonkdim": 16 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 2, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 2, + "matrix_instr_nonkdim": 16 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 2, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 2, + "matrix_instr_nonkdim": 16 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 2, + "matrix_instr_nonkdim": 16 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 2, + "matrix_instr_nonkdim": 16 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 2, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 1, + "matrix_instr_nonkdim": 16 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 2, + "matrix_instr_nonkdim": 16 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 1, + "matrix_instr_nonkdim": 16 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 1, + "matrix_instr_nonkdim": 16 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 2, + "matrix_instr_nonkdim": 16 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 2, + "matrix_instr_nonkdim": 16 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 2, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 2, + "matrix_instr_nonkdim": 32 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 2, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 2, + "matrix_instr_nonkdim": 32 + }, + "1536": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 2, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 2, + "matrix_instr_nonkdim": 32 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 1, + "matrix_instr_nonkdim": 16 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 1, + "matrix_instr_nonkdim": 16 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 2, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 2, + "matrix_instr_nonkdim": 16 + }, + "8192": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 2, + "SPLIT_K": 1, + "num_warps": 8, + "num_stages": 2, + "matrix_instr_nonkdim": 32 + } +} \ No newline at end of file