From a6d0299c75f2c0687334d50d302801ade083c784 Mon Sep 17 00:00:00 2001 From: Yanan Cao Date: Fri, 20 Feb 2026 08:36:51 -0800 Subject: [PATCH] [Kernel] [Helion] [6/N] Add num_tokens dimension to silu_mul autotuning and dispatching (#34185) Signed-off-by: Yanan Cao --- tests/kernels/helion/test_silu_mul_fp8.py | 86 +- vllm/kernels/helion/configs/silu_mul_fp8.json | 55298 +++++++++++++++- vllm/kernels/helion/ops/silu_mul_fp8.py | 89 +- 3 files changed, 55236 insertions(+), 237 deletions(-) diff --git a/tests/kernels/helion/test_silu_mul_fp8.py b/tests/kernels/helion/test_silu_mul_fp8.py index da6405d6c..887f20b9f 100644 --- a/tests/kernels/helion/test_silu_mul_fp8.py +++ b/tests/kernels/helion/test_silu_mul_fp8.py @@ -54,8 +54,8 @@ def reset_config_manager_singleton(): class TestSiluMulFp8ConfigPicker: def test_config_picker_exact_match(self): config_keys = [ - "intermediate_2048_batchsize_256", - "intermediate_4096_batchsize_256", + "intermediate_2048_numtokens_256", + "intermediate_4096_numtokens_256", ] input_tensor = torch.randn(32, 4096, dtype=torch.bfloat16, device="cuda") @@ -63,12 +63,12 @@ class TestSiluMulFp8ConfigPicker: args = (input_tensor, scale) selected_key = pick_silu_mul_fp8_config(args, config_keys) - assert selected_key == "intermediate_2048_batchsize_256" + assert selected_key == "intermediate_2048_numtokens_256" def test_config_picker_closest_match(self): config_keys = [ - "intermediate_2048_batchsize_256", - "intermediate_4096_batchsize_256", + "intermediate_2048_numtokens_256", + "intermediate_4096_numtokens_256", ] # Use 7000 (intermediate_size=3500) which is closer to 4096 than 2048 input_tensor = torch.randn(32, 7000, dtype=torch.bfloat16, device="cuda") @@ -76,10 +76,10 @@ class TestSiluMulFp8ConfigPicker: args = (input_tensor, scale) selected_key = pick_silu_mul_fp8_config(args, config_keys) - assert selected_key == "intermediate_4096_batchsize_256" + assert selected_key == "intermediate_4096_numtokens_256" def test_config_picker_fallback_to_default(self): - config_keys = ["default", "some_other_key"] + config_keys = ["default"] input_tensor = torch.randn(32, 4096, dtype=torch.bfloat16, device="cuda") scale = torch.tensor([0.5], dtype=torch.float32, device="cuda") @@ -101,9 +101,9 @@ class TestSiluMulFp8ConfigPicker: @pytest.mark.parametrize("intermediate_size", [2048, 4096, 5120]) def test_config_picker_different_sizes(self, intermediate_size): config_keys = [ - "intermediate_2048_batchsize_256", - "intermediate_4096_batchsize_256", - "intermediate_5120_batchsize_256", + "intermediate_2048_numtokens_256", + "intermediate_4096_numtokens_256", + "intermediate_5120_numtokens_256", ] input_tensor = torch.randn( @@ -113,9 +113,73 @@ class TestSiluMulFp8ConfigPicker: args = (input_tensor, scale) selected_key = pick_silu_mul_fp8_config(args, config_keys) - expected_key = f"intermediate_{intermediate_size}_batchsize_256" + expected_key = f"intermediate_{intermediate_size}_numtokens_256" assert selected_key == expected_key + def test_config_picker_numtokens_ceiling(self): + """Pick the smallest numtokens >= input num_tokens.""" + config_keys = [ + "intermediate_4096_numtokens_8", + "intermediate_4096_numtokens_32", + "intermediate_4096_numtokens_128", + "intermediate_4096_numtokens_256", + ] + # 20 tokens -> should pick numtokens_32 (smallest >= 20) + input_tensor = torch.randn(20, 8192, dtype=torch.bfloat16, device="cuda") + scale = torch.tensor([0.5], dtype=torch.float32, device="cuda") + + selected_key = pick_silu_mul_fp8_config((input_tensor, scale), config_keys) + assert selected_key == "intermediate_4096_numtokens_32" + + def test_config_picker_numtokens_exact(self): + """Exact num_tokens match is preferred over ceiling.""" + config_keys = [ + "intermediate_4096_numtokens_8", + "intermediate_4096_numtokens_32", + "intermediate_4096_numtokens_128", + ] + input_tensor = torch.randn(32, 8192, dtype=torch.bfloat16, device="cuda") + scale = torch.tensor([0.5], dtype=torch.float32, device="cuda") + + selected_key = pick_silu_mul_fp8_config((input_tensor, scale), config_keys) + assert selected_key == "intermediate_4096_numtokens_32" + + def test_config_picker_numtokens_fallback_to_largest(self): + """Fall back to the largest numtokens when input exceeds all.""" + config_keys = [ + "intermediate_4096_numtokens_8", + "intermediate_4096_numtokens_32", + "intermediate_4096_numtokens_128", + ] + # 512 tokens -> exceeds all available, should pick largest (128) + input_tensor = torch.randn(512, 8192, dtype=torch.bfloat16, device="cuda") + scale = torch.tensor([0.5], dtype=torch.float32, device="cuda") + + selected_key = pick_silu_mul_fp8_config((input_tensor, scale), config_keys) + assert selected_key == "intermediate_4096_numtokens_128" + + def test_config_picker_malformed_key_raises(self): + """Malformed config keys should raise ValueError.""" + config_keys = ["intermediate_4096_badformat_256"] + input_tensor = torch.randn(32, 8192, dtype=torch.bfloat16, device="cuda") + scale = torch.tensor([0.5], dtype=torch.float32, device="cuda") + + with pytest.raises(ValueError, match="Malformed config key"): + pick_silu_mul_fp8_config((input_tensor, scale), config_keys) + + def test_config_picker_default_ignored_when_valid_keys_exist(self): + """'default' is skipped in favor of a real match.""" + config_keys = [ + "default", + "intermediate_4096_numtokens_32", + "intermediate_4096_numtokens_128", + ] + input_tensor = torch.randn(64, 8192, dtype=torch.bfloat16, device="cuda") + scale = torch.tensor([0.5], dtype=torch.float32, device="cuda") + + selected_key = pick_silu_mul_fp8_config((input_tensor, scale), config_keys) + assert selected_key == "intermediate_4096_numtokens_128" + class TestSiluMulFp8Correctness: @pytest.mark.parametrize("batch_size", [1, 8, 32, 128]) diff --git a/vllm/kernels/helion/configs/silu_mul_fp8.json b/vllm/kernels/helion/configs/silu_mul_fp8.json index c26ca087d..0f0de04a1 100644 --- a/vllm/kernels/helion/configs/silu_mul_fp8.json +++ b/vllm/kernels/helion/configs/silu_mul_fp8.json @@ -1,9 +1,54 @@ { "nvidia_h200": { - "intermediate_2048_batchsize_256": { + "intermediate_2048_numtokens_256": { "block_sizes": [ 64, - 128 + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_256": { + "block_sizes": [ + 32, + 512 ], "loop_orders": [ [ @@ -20,6 +65,7 @@ "range_unroll_factors": [ 0 ], + "range_warp_specializes": [], "range_num_stages": [ 0 ], @@ -34,21 +80,65 @@ "", "" ], - "num_warps": 32, - "num_stages": 1, + "num_warps": 2, + "num_stages": 2, "indexing": [ "pointer", "tensor_descriptor", "pointer", "pointer" ], - "pid_type": "flat", - "range_warp_specializes": [] + "pid_type": "flat" }, - "intermediate_4096_batchsize_256": { + "default": { "block_sizes": [ - 16, - 64 + 1, + 512 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 8, + "num_stages": 2, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_256": { + "block_sizes": [ + 32, + 8 ], "loop_orders": [ [ @@ -57,7 +147,7 @@ ] ], "flatten_loops": [ - false + true ], "l2_groupings": [ 1 @@ -65,6 +155,97 @@ "range_unroll_factors": [ 0 ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_256": { + "block_sizes": [ + 16, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_256": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], "range_num_stages": [ 0 ], @@ -87,13 +268,192 @@ "pointer", "pointer" ], - "pid_type": "flat", - "range_warp_specializes": [] + "pid_type": "flat" }, - "default": { + "intermediate_7688_numtokens_256": { + "block_sizes": [ + 8, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_256": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_1": { "block_sizes": [ 1, - 512 + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_1": { + "block_sizes": [ + 1, + 1 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_1": { + "block_sizes": [ + 1, + 32 ], "loop_orders": [ [ @@ -104,12 +464,5098 @@ "flatten_loops": [ false ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_1": { + "block_sizes": [ + 1, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_1": { + "block_sizes": [ + 1, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_1": { + "block_sizes": [ + 1, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "first" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_2": { + "block_sizes": [ + 2, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_2": { + "block_sizes": [ + 1, + 4 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_2": { + "block_sizes": [ + 2, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_2": { + "block_sizes": [ + 1, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "" + ], + "num_warps": 8, + "num_stages": 2, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_2": { + "block_sizes": [ + 1, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 4, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_2": { + "block_sizes": [ + 1, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_4": { + "block_sizes": [ + 1, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_4": { + "block_sizes": [ + 1, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_4": { + "block_sizes": [ + 4, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_4": { + "block_sizes": [ + 1, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_4": { + "block_sizes": [ + 1, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_4": { + "block_sizes": [ + 4, + 16 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_8": { + "block_sizes": [ + 8, + 256 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_8": { + "block_sizes": [ + 8, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_8": { + "block_sizes": [ + 2, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_8": { + "block_sizes": [ + 4, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_8": { + "block_sizes": [ + 8, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "" + ], + "num_warps": 4, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_8": { + "block_sizes": [ + 8, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_16": { + "block_sizes": [ + 16, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_16": { + "block_sizes": [ + 16, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_16": { + "block_sizes": [ + 16, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_16": { + "block_sizes": [ + 4, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_16": { + "block_sizes": [ + 8, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "first" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_16": { + "block_sizes": [ + 16, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_24": { + "block_sizes": [ + 16, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "last" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_24": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_24": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_24": { + "block_sizes": [ + 16, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_24": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_24": { + "block_sizes": [ + 8, + 32 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_32": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_32": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_32": { + "block_sizes": [ + 32, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_32": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 2, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_32": { + "block_sizes": [ + 16, + 8 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_32": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_40": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_40": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 8, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_40": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_40": { + "block_sizes": [ + 64, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_40": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_40": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_48": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_48": { + "block_sizes": [ + 8, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_48": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_48": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_48": { + "block_sizes": [ + 16, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_48": { + "block_sizes": [ + 64, + 4 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_56": { + "block_sizes": [ + 2, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_56": { + "block_sizes": [ + 8, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_56": { + "block_sizes": [ + 32, + 4 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_56": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_56": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "last" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_56": { + "block_sizes": [ + 64, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_64": { + "block_sizes": [ + 16, + 128 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_64": { + "block_sizes": [ + 4, + 64 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_64": { + "block_sizes": [ + 2, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_64": { + "block_sizes": [ + 8, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_64": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_64": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_72": { + "block_sizes": [ + 4, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_72": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "first" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_72": { + "block_sizes": [ + 64, + 16 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_72": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_72": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_72": { + "block_sizes": [ + 128, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_80": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_80": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_80": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_80": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], "l2_groupings": [ 4 ], "range_unroll_factors": [ 0 ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_80": { + "block_sizes": [ + 64, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_80": { + "block_sizes": [ + 32, + 512 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_88": { + "block_sizes": [ + 32, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_88": { + "block_sizes": [ + 16, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_88": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_88": { + "block_sizes": [ + 128, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "first", + "last" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_88": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_88": { + "block_sizes": [ + 16, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "last", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_96": { + "block_sizes": [ + 128, + 4 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_96": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_96": { + "block_sizes": [ + 16, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 3, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_96": { + "block_sizes": [ + 64, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "last" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_96": { + "block_sizes": [ + 64, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "" + ], + "num_warps": 2, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_96": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "first" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_104": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_104": { + "block_sizes": [ + 64, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_104": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_104": { + "block_sizes": [ + 8, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_104": { + "block_sizes": [ + 128, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_104": { + "block_sizes": [ + 32, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_112": { + "block_sizes": [ + 32, + 1024 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_112": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_112": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_112": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_112": { + "block_sizes": [ + 16, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_112": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_120": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_120": { + "block_sizes": [ + 32, + 16 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_120": { + "block_sizes": [ + 32, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_120": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_120": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_120": { + "block_sizes": [ + 128, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_128": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_128": { + "block_sizes": [ + 128, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_128": { + "block_sizes": [ + 128, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 32, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_128": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_128": { + "block_sizes": [ + 128, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_128": { + "block_sizes": [ + 16, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 3, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_136": { + "block_sizes": [ + 128, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 3, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_136": { + "block_sizes": [ + 8, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], "range_num_stages": [ 0 ], @@ -125,11 +5571,8290 @@ "" ], "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_136": { + "block_sizes": [ + 32, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_136": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_136": { + "block_sizes": [ + 16, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 3, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_136": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_144": { + "block_sizes": [ + 8, + 16 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_144": { + "block_sizes": [ + 256, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_144": { + "block_sizes": [ + 128, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_144": { + "block_sizes": [ + 128, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_144": { + "block_sizes": [ + 32, + 4 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_144": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "first" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_152": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_152": { + "block_sizes": [ + 16, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_152": { + "block_sizes": [ + 64, + 4 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_152": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_152": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_152": { + "block_sizes": [ + 64, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_160": { + "block_sizes": [ + 32, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_160": { + "block_sizes": [ + 128, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_160": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_160": { + "block_sizes": [ + 64, + 4 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_160": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_160": { + "block_sizes": [ + 128, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_168": { + "block_sizes": [ + 128, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_168": { + "block_sizes": [ + 32, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_168": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_168": { + "block_sizes": [ + 64, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_168": { + "block_sizes": [ + 64, + 4 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_168": { + "block_sizes": [ + 32, + 512 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_176": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_176": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_176": { + "block_sizes": [ + 4, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "first" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_176": { + "block_sizes": [ + 8, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_176": { + "block_sizes": [ + 8, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_176": { + "block_sizes": [ + 8, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_184": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_184": { + "block_sizes": [ + 8, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_184": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_184": { + "block_sizes": [ + 8, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_184": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_184": { + "block_sizes": [ + 16, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "last" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_192": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_192": { + "block_sizes": [ + 8, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_192": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_192": { + "block_sizes": [ + 4, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_192": { + "block_sizes": [ + 32, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_192": { + "block_sizes": [ + 8, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_200": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_200": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_200": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_200": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_200": { + "block_sizes": [ + 8, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_200": { + "block_sizes": [ + 16, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "first" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_208": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_208": { + "block_sizes": [ + 64, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_208": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_208": { + "block_sizes": [ + 256, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, "num_stages": 2, "indexing": [ "tensor_descriptor", "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_208": { + "block_sizes": [ + 64, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "last", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_208": { + "block_sizes": [ + 16, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_216": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_216": { + "block_sizes": [ + 16, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_216": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_216": { + "block_sizes": [ + 16, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_216": { + "block_sizes": [ + 32, + 4 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "first" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_216": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "last" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_224": { + "block_sizes": [ + 32, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_224": { + "block_sizes": [ + 64, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_224": { + "block_sizes": [ + 64, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_224": { + "block_sizes": [ + 16, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_224": { + "block_sizes": [ + 256, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_224": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "first" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_232": { + "block_sizes": [ + 16, + 8 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_232": { + "block_sizes": [ + 64, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_232": { + "block_sizes": [ + 16, + 4 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_232": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_232": { + "block_sizes": [ + 16, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 2, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_232": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_240": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_240": { + "block_sizes": [ + 8, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_240": { + "block_sizes": [ + 16, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_240": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_240": { + "block_sizes": [ + 8, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_240": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_248": { + "block_sizes": [ + 16, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_248": { + "block_sizes": [ + 16, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_248": { + "block_sizes": [ + 256, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_248": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "first" + ], + "num_warps": 4, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_248": { + "block_sizes": [ + 64, + 4 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_248": { + "block_sizes": [ + 64, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_272": { + "block_sizes": [ + 128, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_272": { + "block_sizes": [ + 8, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_272": { + "block_sizes": [ + 128, + 32 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_272": { + "block_sizes": [ + 128, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_272": { + "block_sizes": [ + 16, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_272": { + "block_sizes": [ + 64, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_288": { + "block_sizes": [ + 4, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_288": { + "block_sizes": [ + 8, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_288": { + "block_sizes": [ + 64, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_288": { + "block_sizes": [ + 128, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_288": { + "block_sizes": [ + 256, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_288": { + "block_sizes": [ + 16, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_304": { + "block_sizes": [ + 8, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_304": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_304": { + "block_sizes": [ + 128, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_304": { + "block_sizes": [ + 8, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_304": { + "block_sizes": [ + 64, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_304": { + "block_sizes": [ + 64, + 4 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_320": { + "block_sizes": [ + 128, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_320": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_320": { + "block_sizes": [ + 512, + 4 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_320": { + "block_sizes": [ + 64, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_320": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_320": { + "block_sizes": [ + 128, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_336": { + "block_sizes": [ + 2, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_336": { + "block_sizes": [ + 8, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_336": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_336": { + "block_sizes": [ + 64, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 8, + "num_stages": 2, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_336": { + "block_sizes": [ + 8, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "" + ], + "num_warps": 2, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_336": { + "block_sizes": [ + 8, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_352": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_352": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_352": { + "block_sizes": [ + 16, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_352": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_352": { + "block_sizes": [ + 8, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_352": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_368": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_368": { + "block_sizes": [ + 8, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_368": { + "block_sizes": [ + 8, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_368": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_368": { + "block_sizes": [ + 32, + 4 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_368": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_384": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_384": { + "block_sizes": [ + 64, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "first" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_384": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_384": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_384": { + "block_sizes": [ + 8, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_384": { + "block_sizes": [ + 32, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_400": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "first" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_400": { + "block_sizes": [ + 8, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_400": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_400": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_400": { + "block_sizes": [ + 256, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 32, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_400": { + "block_sizes": [ + 8, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_416": { + "block_sizes": [ + 128, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_416": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "first" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_416": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "last" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_416": { + "block_sizes": [ + 128, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_416": { + "block_sizes": [ + 64, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_416": { + "block_sizes": [ + 32, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_432": { + "block_sizes": [ + 16, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_432": { + "block_sizes": [ + 32, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_432": { + "block_sizes": [ + 16, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 4, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_432": { + "block_sizes": [ + 16, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_432": { + "block_sizes": [ + 16, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_432": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 2, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_448": { + "block_sizes": [ + 4, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_448": { + "block_sizes": [ + 8, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_448": { + "block_sizes": [ + 4, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_448": { + "block_sizes": [ + 32, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_448": { + "block_sizes": [ + 16, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_448": { + "block_sizes": [ + 16, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_464": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_464": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_464": { + "block_sizes": [ + 16, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_464": { + "block_sizes": [ + 8, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_464": { + "block_sizes": [ + 128, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "first" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_464": { + "block_sizes": [ + 128, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_480": { + "block_sizes": [ + 4, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "" + ], + "num_warps": 2, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_480": { + "block_sizes": [ + 4, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "first" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_480": { + "block_sizes": [ + 8, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_480": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_480": { + "block_sizes": [ + 64, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "last" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_480": { + "block_sizes": [ + 16, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "" + ], + "num_warps": 8, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_496": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_496": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_496": { + "block_sizes": [ + 16, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_496": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_496": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_496": { + "block_sizes": [ + 256, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_512": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_512": { + "block_sizes": [ + 16, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_512": { + "block_sizes": [ + 128, + 512 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_512": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_512": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_512": { + "block_sizes": [ + 16, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", "pointer" ], "pid_type": "flat", @@ -137,55 +13862,10 @@ } }, "nvidia_h100_pcie": { - "intermediate_2048_batchsize_256": { + "intermediate_2048_numtokens_256": { "block_sizes": [ - 1, - 512 - ], - "loop_orders": [ - [ - 1, - 0 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 4 - ], - "range_unroll_factors": [ - 0 - ], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "first", - "" - ], - "num_warps": 8, - "num_stages": 2, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat", - "range_warp_specializes": [] - }, - "intermediate_4096_batchsize_256": { - "block_sizes": [ - 256, - 128 + 64, + 32 ], "loop_orders": [ [ @@ -200,32 +13880,77 @@ 1 ], "range_unroll_factors": [ - 2 + 0 ], + "range_warp_specializes": [], "range_num_stages": [ - 3 + 0 ], "range_multi_buffers": [ - false + null ], "range_flattens": [ - true + null ], "load_eviction_policies": [ - "last", - "last", + "", + "", "" ], - "num_warps": 32, - "num_stages": 3, + "num_warps": 8, + "num_stages": 1, "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor", + "pointer", + "pointer", + "pointer", "pointer" ], - "pid_type": "persistent_blocked", - "range_warp_specializes": [] + "pid_type": "flat" + }, + "intermediate_4096_numtokens_256": { + "block_sizes": [ + 32, + 512 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 2, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" }, "default": { "block_sizes": [ @@ -247,6 +13972,7 @@ "range_unroll_factors": [ 0 ], + "range_warp_specializes": [], "range_num_stages": [ 0 ], @@ -269,60 +13995,13739 @@ "tensor_descriptor", "pointer" ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_256": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_256": { + "block_sizes": [ + 16, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_256": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_7688_numtokens_256": { + "block_sizes": [ + 8, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_256": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_1": { + "block_sizes": [ + 1, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_1": { + "block_sizes": [ + 1, + 1 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_1": { + "block_sizes": [ + 1, + 32 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_1": { + "block_sizes": [ + 1, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_1": { + "block_sizes": [ + 1, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_1": { + "block_sizes": [ + 1, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "first" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_2": { + "block_sizes": [ + 2, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_2": { + "block_sizes": [ + 1, + 4 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_2": { + "block_sizes": [ + 2, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_2": { + "block_sizes": [ + 1, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "" + ], + "num_warps": 8, + "num_stages": 2, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_2": { + "block_sizes": [ + 1, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 4, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_2": { + "block_sizes": [ + 1, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_4": { + "block_sizes": [ + 1, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_4": { + "block_sizes": [ + 1, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_4": { + "block_sizes": [ + 4, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_4": { + "block_sizes": [ + 1, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_4": { + "block_sizes": [ + 1, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_4": { + "block_sizes": [ + 4, + 16 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_8": { + "block_sizes": [ + 8, + 256 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_8": { + "block_sizes": [ + 8, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_8": { + "block_sizes": [ + 2, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_8": { + "block_sizes": [ + 4, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_8": { + "block_sizes": [ + 8, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "" + ], + "num_warps": 4, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_8": { + "block_sizes": [ + 8, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_16": { + "block_sizes": [ + 16, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_16": { + "block_sizes": [ + 16, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_16": { + "block_sizes": [ + 16, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_16": { + "block_sizes": [ + 4, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_16": { + "block_sizes": [ + 8, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "first" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_16": { + "block_sizes": [ + 16, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_24": { + "block_sizes": [ + 16, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "last" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_24": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_24": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_24": { + "block_sizes": [ + 16, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_24": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_24": { + "block_sizes": [ + 8, + 32 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_32": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_32": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_32": { + "block_sizes": [ + 32, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_32": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 2, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_32": { + "block_sizes": [ + 16, + 8 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_32": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_40": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_40": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 8, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_40": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_40": { + "block_sizes": [ + 64, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_40": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_40": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_48": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_48": { + "block_sizes": [ + 8, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_48": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_48": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_48": { + "block_sizes": [ + 16, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_48": { + "block_sizes": [ + 64, + 4 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_56": { + "block_sizes": [ + 2, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_56": { + "block_sizes": [ + 8, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_56": { + "block_sizes": [ + 32, + 4 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_56": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_56": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "last" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_56": { + "block_sizes": [ + 64, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_64": { + "block_sizes": [ + 16, + 128 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_64": { + "block_sizes": [ + 4, + 64 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_64": { + "block_sizes": [ + 2, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_64": { + "block_sizes": [ + 8, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_64": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_64": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_72": { + "block_sizes": [ + 4, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_72": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "first" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_72": { + "block_sizes": [ + 64, + 16 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_72": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_72": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_72": { + "block_sizes": [ + 128, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_80": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_80": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_80": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_80": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_80": { + "block_sizes": [ + 64, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_80": { + "block_sizes": [ + 32, + 512 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_88": { + "block_sizes": [ + 32, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_88": { + "block_sizes": [ + 16, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_88": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_88": { + "block_sizes": [ + 128, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "first", + "last" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_88": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_88": { + "block_sizes": [ + 16, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "last", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_96": { + "block_sizes": [ + 128, + 4 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_96": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_96": { + "block_sizes": [ + 16, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 3, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_96": { + "block_sizes": [ + 64, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "last" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_96": { + "block_sizes": [ + 64, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "" + ], + "num_warps": 2, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_96": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "first" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_104": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_104": { + "block_sizes": [ + 64, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_104": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_104": { + "block_sizes": [ + 8, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_104": { + "block_sizes": [ + 128, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_104": { + "block_sizes": [ + 32, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_112": { + "block_sizes": [ + 32, + 1024 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_112": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_112": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_112": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_112": { + "block_sizes": [ + 16, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_112": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_120": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_120": { + "block_sizes": [ + 32, + 16 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_120": { + "block_sizes": [ + 32, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_120": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_120": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_120": { + "block_sizes": [ + 128, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_128": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_128": { + "block_sizes": [ + 128, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_128": { + "block_sizes": [ + 128, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 32, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_128": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_128": { + "block_sizes": [ + 128, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_128": { + "block_sizes": [ + 16, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 3, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_136": { + "block_sizes": [ + 128, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 3, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_136": { + "block_sizes": [ + 8, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_136": { + "block_sizes": [ + 32, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_136": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_136": { + "block_sizes": [ + 16, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 3, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_136": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_144": { + "block_sizes": [ + 8, + 16 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_144": { + "block_sizes": [ + 256, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_144": { + "block_sizes": [ + 128, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_144": { + "block_sizes": [ + 128, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_144": { + "block_sizes": [ + 32, + 4 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_144": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "first" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_152": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_152": { + "block_sizes": [ + 16, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_152": { + "block_sizes": [ + 64, + 4 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_152": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_152": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_152": { + "block_sizes": [ + 64, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_160": { + "block_sizes": [ + 32, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_160": { + "block_sizes": [ + 128, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_160": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_160": { + "block_sizes": [ + 64, + 4 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_160": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_160": { + "block_sizes": [ + 128, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_168": { + "block_sizes": [ + 128, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_168": { + "block_sizes": [ + 32, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_168": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_168": { + "block_sizes": [ + 64, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_168": { + "block_sizes": [ + 64, + 4 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_168": { + "block_sizes": [ + 32, + 512 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_176": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_176": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_176": { + "block_sizes": [ + 4, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "first" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_176": { + "block_sizes": [ + 8, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_176": { + "block_sizes": [ + 8, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_176": { + "block_sizes": [ + 8, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_184": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_184": { + "block_sizes": [ + 8, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_184": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_184": { + "block_sizes": [ + 8, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_184": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_184": { + "block_sizes": [ + 16, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "last" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_192": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_192": { + "block_sizes": [ + 8, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_192": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_192": { + "block_sizes": [ + 4, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_192": { + "block_sizes": [ + 32, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_192": { + "block_sizes": [ + 8, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_200": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_200": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_200": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_200": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_200": { + "block_sizes": [ + 8, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_200": { + "block_sizes": [ + 16, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "first" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_208": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_208": { + "block_sizes": [ + 64, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_208": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_208": { + "block_sizes": [ + 256, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_208": { + "block_sizes": [ + 64, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "last", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_208": { + "block_sizes": [ + 16, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_216": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_216": { + "block_sizes": [ + 16, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_216": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_216": { + "block_sizes": [ + 16, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_216": { + "block_sizes": [ + 32, + 4 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "first" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_216": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "last" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_224": { + "block_sizes": [ + 32, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_224": { + "block_sizes": [ + 64, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_224": { + "block_sizes": [ + 64, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_224": { + "block_sizes": [ + 16, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_224": { + "block_sizes": [ + 256, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_224": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "first" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_232": { + "block_sizes": [ + 16, + 8 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_232": { + "block_sizes": [ + 64, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_232": { + "block_sizes": [ + 16, + 4 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_232": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_232": { + "block_sizes": [ + 16, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 2, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_232": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_240": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_240": { + "block_sizes": [ + 8, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_240": { + "block_sizes": [ + 16, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_240": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_240": { + "block_sizes": [ + 8, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_240": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_248": { + "block_sizes": [ + 16, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_248": { + "block_sizes": [ + 16, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_248": { + "block_sizes": [ + 256, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_248": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "first" + ], + "num_warps": 4, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_248": { + "block_sizes": [ + 64, + 4 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_248": { + "block_sizes": [ + 64, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_272": { + "block_sizes": [ + 128, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_272": { + "block_sizes": [ + 8, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_272": { + "block_sizes": [ + 128, + 32 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_272": { + "block_sizes": [ + 128, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_272": { + "block_sizes": [ + 16, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_272": { + "block_sizes": [ + 64, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_288": { + "block_sizes": [ + 4, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_288": { + "block_sizes": [ + 8, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_288": { + "block_sizes": [ + 64, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_288": { + "block_sizes": [ + 128, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_288": { + "block_sizes": [ + 256, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_288": { + "block_sizes": [ + 16, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_304": { + "block_sizes": [ + 8, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_304": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_304": { + "block_sizes": [ + 128, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_304": { + "block_sizes": [ + 8, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_304": { + "block_sizes": [ + 64, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_304": { + "block_sizes": [ + 64, + 4 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_320": { + "block_sizes": [ + 128, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_320": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_320": { + "block_sizes": [ + 512, + 4 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_320": { + "block_sizes": [ + 64, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_320": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_320": { + "block_sizes": [ + 128, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_336": { + "block_sizes": [ + 2, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_336": { + "block_sizes": [ + 8, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_336": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_336": { + "block_sizes": [ + 64, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 8, + "num_stages": 2, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_336": { + "block_sizes": [ + 8, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "" + ], + "num_warps": 2, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_336": { + "block_sizes": [ + 8, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_352": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_352": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_352": { + "block_sizes": [ + 16, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_352": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_352": { + "block_sizes": [ + 8, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_352": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_368": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_368": { + "block_sizes": [ + 8, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_368": { + "block_sizes": [ + 8, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_368": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_368": { + "block_sizes": [ + 32, + 4 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_368": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_384": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_384": { + "block_sizes": [ + 64, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "first" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_384": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_384": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_384": { + "block_sizes": [ + 8, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_384": { + "block_sizes": [ + 32, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_400": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "first" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_400": { + "block_sizes": [ + 8, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_400": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_400": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_400": { + "block_sizes": [ + 256, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 32, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_400": { + "block_sizes": [ + 8, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_416": { + "block_sizes": [ + 128, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_416": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "first" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_416": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "last" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_416": { + "block_sizes": [ + 128, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_416": { + "block_sizes": [ + 64, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_416": { + "block_sizes": [ + 32, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_432": { + "block_sizes": [ + 16, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_432": { + "block_sizes": [ + 32, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_432": { + "block_sizes": [ + 16, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 4, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_432": { + "block_sizes": [ + 16, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_432": { + "block_sizes": [ + 16, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_432": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 2, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_448": { + "block_sizes": [ + 4, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_448": { + "block_sizes": [ + 8, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_448": { + "block_sizes": [ + 4, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_448": { + "block_sizes": [ + 32, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_448": { + "block_sizes": [ + 16, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_448": { + "block_sizes": [ + 16, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_464": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_464": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_464": { + "block_sizes": [ + 16, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_464": { + "block_sizes": [ + 8, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_464": { + "block_sizes": [ + 128, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "first" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_464": { + "block_sizes": [ + 128, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_480": { + "block_sizes": [ + 4, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "" + ], + "num_warps": 2, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_480": { + "block_sizes": [ + 4, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "first" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_480": { + "block_sizes": [ + 8, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_480": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_480": { + "block_sizes": [ + 64, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "last" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_480": { + "block_sizes": [ + 16, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "" + ], + "num_warps": 8, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_496": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_496": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_496": { + "block_sizes": [ + 16, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_496": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_496": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_496": { + "block_sizes": [ + 256, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_512": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_512": { + "block_sizes": [ + 16, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_512": { + "block_sizes": [ + 128, + 512 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_512": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_512": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_512": { + "block_sizes": [ + 16, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], "pid_type": "flat", "range_warp_specializes": [] } }, - "nvidia_h100_sxm5": { - "intermediate_2048_batchsize_256": { + "nvidia_h100_80gb_hbm3": { + "intermediate_2048_numtokens_256": { "block_sizes": [ - 1, - 512 - ], - "loop_orders": [ - [ - 1, - 0 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 4 - ], - "range_unroll_factors": [ - 0 - ], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "first", - "" - ], - "num_warps": 8, - "num_stages": 2, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat", - "range_warp_specializes": [] - }, - "intermediate_4096_batchsize_256": { - "block_sizes": [ - 256, - 128 + 64, + 32 ], "loop_orders": [ [ @@ -337,32 +27742,77 @@ 1 ], "range_unroll_factors": [ - 2 + 0 ], + "range_warp_specializes": [], "range_num_stages": [ - 3 + 0 ], "range_multi_buffers": [ - false + null ], "range_flattens": [ - true + null ], "load_eviction_policies": [ - "last", - "last", + "", + "", "" ], - "num_warps": 32, - "num_stages": 3, + "num_warps": 8, + "num_stages": 1, "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor", + "pointer", + "pointer", + "pointer", "pointer" ], - "pid_type": "persistent_blocked", - "range_warp_specializes": [] + "pid_type": "flat" + }, + "intermediate_4096_numtokens_256": { + "block_sizes": [ + 32, + 512 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 2, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" }, "default": { "block_sizes": [ @@ -384,6 +27834,7 @@ "range_unroll_factors": [ 0 ], + "range_warp_specializes": [], "range_num_stages": [ 0 ], @@ -406,60 +27857,13739 @@ "tensor_descriptor", "pointer" ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_256": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_256": { + "block_sizes": [ + 16, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_256": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_7688_numtokens_256": { + "block_sizes": [ + 8, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_256": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_1": { + "block_sizes": [ + 1, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_1": { + "block_sizes": [ + 1, + 1 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_1": { + "block_sizes": [ + 1, + 32 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_1": { + "block_sizes": [ + 1, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_1": { + "block_sizes": [ + 1, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_1": { + "block_sizes": [ + 1, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "first" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_2": { + "block_sizes": [ + 2, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_2": { + "block_sizes": [ + 1, + 4 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_2": { + "block_sizes": [ + 2, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_2": { + "block_sizes": [ + 1, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "" + ], + "num_warps": 8, + "num_stages": 2, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_2": { + "block_sizes": [ + 1, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 4, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_2": { + "block_sizes": [ + 1, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_4": { + "block_sizes": [ + 1, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_4": { + "block_sizes": [ + 1, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_4": { + "block_sizes": [ + 4, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_4": { + "block_sizes": [ + 1, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_4": { + "block_sizes": [ + 1, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_4": { + "block_sizes": [ + 4, + 16 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_8": { + "block_sizes": [ + 8, + 256 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_8": { + "block_sizes": [ + 8, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_8": { + "block_sizes": [ + 2, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_8": { + "block_sizes": [ + 4, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_8": { + "block_sizes": [ + 8, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "" + ], + "num_warps": 4, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_8": { + "block_sizes": [ + 8, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_16": { + "block_sizes": [ + 16, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_16": { + "block_sizes": [ + 16, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_16": { + "block_sizes": [ + 16, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_16": { + "block_sizes": [ + 4, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_16": { + "block_sizes": [ + 8, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "first" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_16": { + "block_sizes": [ + 16, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_24": { + "block_sizes": [ + 16, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "last" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_24": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_24": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_24": { + "block_sizes": [ + 16, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_24": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_24": { + "block_sizes": [ + 8, + 32 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_32": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_32": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_32": { + "block_sizes": [ + 32, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_32": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 2, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_32": { + "block_sizes": [ + 16, + 8 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_32": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_40": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_40": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 8, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_40": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_40": { + "block_sizes": [ + 64, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_40": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_40": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_48": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_48": { + "block_sizes": [ + 8, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_48": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_48": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_48": { + "block_sizes": [ + 16, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_48": { + "block_sizes": [ + 64, + 4 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_56": { + "block_sizes": [ + 2, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_56": { + "block_sizes": [ + 8, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_56": { + "block_sizes": [ + 32, + 4 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_56": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_56": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "last" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_56": { + "block_sizes": [ + 64, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_64": { + "block_sizes": [ + 16, + 128 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_64": { + "block_sizes": [ + 4, + 64 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_64": { + "block_sizes": [ + 2, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_64": { + "block_sizes": [ + 8, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_64": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_64": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_72": { + "block_sizes": [ + 4, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_72": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "first" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_72": { + "block_sizes": [ + 64, + 16 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_72": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_72": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_72": { + "block_sizes": [ + 128, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_80": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_80": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_80": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_80": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_80": { + "block_sizes": [ + 64, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_80": { + "block_sizes": [ + 32, + 512 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_88": { + "block_sizes": [ + 32, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_88": { + "block_sizes": [ + 16, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_88": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_88": { + "block_sizes": [ + 128, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "first", + "last" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_88": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_88": { + "block_sizes": [ + 16, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "last", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_96": { + "block_sizes": [ + 128, + 4 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_96": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_96": { + "block_sizes": [ + 16, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 3, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_96": { + "block_sizes": [ + 64, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "last" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_96": { + "block_sizes": [ + 64, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "" + ], + "num_warps": 2, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_96": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "first" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_104": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_104": { + "block_sizes": [ + 64, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_104": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_104": { + "block_sizes": [ + 8, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_104": { + "block_sizes": [ + 128, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_104": { + "block_sizes": [ + 32, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_112": { + "block_sizes": [ + 32, + 1024 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_112": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_112": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_112": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_112": { + "block_sizes": [ + 16, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_112": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_120": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_120": { + "block_sizes": [ + 32, + 16 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_120": { + "block_sizes": [ + 32, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_120": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_120": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_120": { + "block_sizes": [ + 128, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_128": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_128": { + "block_sizes": [ + 128, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_128": { + "block_sizes": [ + 128, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 32, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_128": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_128": { + "block_sizes": [ + 128, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_128": { + "block_sizes": [ + 16, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 3, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_136": { + "block_sizes": [ + 128, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 3, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_136": { + "block_sizes": [ + 8, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_136": { + "block_sizes": [ + 32, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_136": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_136": { + "block_sizes": [ + 16, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 3, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_136": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_144": { + "block_sizes": [ + 8, + 16 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_144": { + "block_sizes": [ + 256, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_144": { + "block_sizes": [ + 128, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_144": { + "block_sizes": [ + 128, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_144": { + "block_sizes": [ + 32, + 4 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_144": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "first" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_152": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_152": { + "block_sizes": [ + 16, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_152": { + "block_sizes": [ + 64, + 4 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_152": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_152": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_152": { + "block_sizes": [ + 64, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_160": { + "block_sizes": [ + 32, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_160": { + "block_sizes": [ + 128, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_160": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_160": { + "block_sizes": [ + 64, + 4 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_160": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_160": { + "block_sizes": [ + 128, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_168": { + "block_sizes": [ + 128, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_168": { + "block_sizes": [ + 32, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_168": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_168": { + "block_sizes": [ + 64, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_168": { + "block_sizes": [ + 64, + 4 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_168": { + "block_sizes": [ + 32, + 512 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_176": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_176": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_176": { + "block_sizes": [ + 4, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "first" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_176": { + "block_sizes": [ + 8, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_176": { + "block_sizes": [ + 8, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_176": { + "block_sizes": [ + 8, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_184": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_184": { + "block_sizes": [ + 8, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_184": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_184": { + "block_sizes": [ + 8, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_184": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_184": { + "block_sizes": [ + 16, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "last" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_192": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_192": { + "block_sizes": [ + 8, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_192": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_192": { + "block_sizes": [ + 4, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_192": { + "block_sizes": [ + 32, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_192": { + "block_sizes": [ + 8, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_200": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_200": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_200": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_200": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_200": { + "block_sizes": [ + 8, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_200": { + "block_sizes": [ + 16, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "first" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_208": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_208": { + "block_sizes": [ + 64, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_208": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_208": { + "block_sizes": [ + 256, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_208": { + "block_sizes": [ + 64, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "last", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_208": { + "block_sizes": [ + 16, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_216": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_216": { + "block_sizes": [ + 16, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_216": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_216": { + "block_sizes": [ + 16, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_216": { + "block_sizes": [ + 32, + 4 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "first" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_216": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "last" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_224": { + "block_sizes": [ + 32, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_224": { + "block_sizes": [ + 64, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_224": { + "block_sizes": [ + 64, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_224": { + "block_sizes": [ + 16, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_224": { + "block_sizes": [ + 256, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_224": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "first" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_232": { + "block_sizes": [ + 16, + 8 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_232": { + "block_sizes": [ + 64, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_232": { + "block_sizes": [ + 16, + 4 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_232": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_232": { + "block_sizes": [ + 16, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 2, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_232": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_240": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_240": { + "block_sizes": [ + 8, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_240": { + "block_sizes": [ + 16, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_240": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_240": { + "block_sizes": [ + 8, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_240": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_248": { + "block_sizes": [ + 16, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_248": { + "block_sizes": [ + 16, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_248": { + "block_sizes": [ + 256, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_248": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "first" + ], + "num_warps": 4, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_248": { + "block_sizes": [ + 64, + 4 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_248": { + "block_sizes": [ + 64, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_272": { + "block_sizes": [ + 128, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_272": { + "block_sizes": [ + 8, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_272": { + "block_sizes": [ + 128, + 32 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_272": { + "block_sizes": [ + 128, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_272": { + "block_sizes": [ + 16, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_272": { + "block_sizes": [ + 64, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_288": { + "block_sizes": [ + 4, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_288": { + "block_sizes": [ + 8, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_288": { + "block_sizes": [ + 64, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_288": { + "block_sizes": [ + 128, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_288": { + "block_sizes": [ + 256, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_288": { + "block_sizes": [ + 16, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_304": { + "block_sizes": [ + 8, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_304": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_304": { + "block_sizes": [ + 128, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_304": { + "block_sizes": [ + 8, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_304": { + "block_sizes": [ + 64, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_304": { + "block_sizes": [ + 64, + 4 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_320": { + "block_sizes": [ + 128, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_320": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_320": { + "block_sizes": [ + 512, + 4 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_320": { + "block_sizes": [ + 64, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_320": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_320": { + "block_sizes": [ + 128, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_336": { + "block_sizes": [ + 2, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_336": { + "block_sizes": [ + 8, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_336": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_336": { + "block_sizes": [ + 64, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 8, + "num_stages": 2, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_336": { + "block_sizes": [ + 8, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "" + ], + "num_warps": 2, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_336": { + "block_sizes": [ + 8, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_352": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_352": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_352": { + "block_sizes": [ + 16, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_352": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_352": { + "block_sizes": [ + 8, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_352": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_368": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_368": { + "block_sizes": [ + 8, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_368": { + "block_sizes": [ + 8, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_368": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_368": { + "block_sizes": [ + 32, + 4 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_368": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_384": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_384": { + "block_sizes": [ + 64, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "first" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_384": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_384": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_384": { + "block_sizes": [ + 8, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_384": { + "block_sizes": [ + 32, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_400": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "first" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_400": { + "block_sizes": [ + 8, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_400": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_400": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_400": { + "block_sizes": [ + 256, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 32, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_400": { + "block_sizes": [ + 8, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_416": { + "block_sizes": [ + 128, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_416": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "first" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_416": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "last" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_416": { + "block_sizes": [ + 128, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_416": { + "block_sizes": [ + 64, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_416": { + "block_sizes": [ + 32, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_432": { + "block_sizes": [ + 16, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_432": { + "block_sizes": [ + 32, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_432": { + "block_sizes": [ + 16, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 4, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_432": { + "block_sizes": [ + 16, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_432": { + "block_sizes": [ + 16, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_432": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 2, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_448": { + "block_sizes": [ + 4, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_448": { + "block_sizes": [ + 8, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_448": { + "block_sizes": [ + 4, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_448": { + "block_sizes": [ + 32, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_448": { + "block_sizes": [ + 16, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_448": { + "block_sizes": [ + 16, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_464": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_464": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_464": { + "block_sizes": [ + 16, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_464": { + "block_sizes": [ + 8, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_464": { + "block_sizes": [ + 128, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "first" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_464": { + "block_sizes": [ + 128, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_480": { + "block_sizes": [ + 4, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "" + ], + "num_warps": 2, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_480": { + "block_sizes": [ + 4, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "first" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_480": { + "block_sizes": [ + 8, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_480": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_480": { + "block_sizes": [ + 64, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "last" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_480": { + "block_sizes": [ + 16, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "" + ], + "num_warps": 8, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_496": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_496": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_496": { + "block_sizes": [ + 16, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_496": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_496": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_496": { + "block_sizes": [ + 256, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_512": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_512": { + "block_sizes": [ + 16, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_512": { + "block_sizes": [ + 128, + 512 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_512": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_512": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_512": { + "block_sizes": [ + 16, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], "pid_type": "flat", "range_warp_specializes": [] } }, "nvidia_h100": { - "intermediate_2048_batchsize_256": { + "intermediate_2048_numtokens_256": { "block_sizes": [ - 1, - 512 - ], - "loop_orders": [ - [ - 1, - 0 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 4 - ], - "range_unroll_factors": [ - 0 - ], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "first", - "" - ], - "num_warps": 8, - "num_stages": 2, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat", - "range_warp_specializes": [] - }, - "intermediate_4096_batchsize_256": { - "block_sizes": [ - 256, - 128 + 64, + 32 ], "loop_orders": [ [ @@ -474,32 +41604,77 @@ 1 ], "range_unroll_factors": [ - 2 + 0 ], + "range_warp_specializes": [], "range_num_stages": [ - 3 + 0 ], "range_multi_buffers": [ - false + null ], "range_flattens": [ - true + null ], "load_eviction_policies": [ - "last", - "last", + "", + "", "" ], - "num_warps": 32, - "num_stages": 3, + "num_warps": 8, + "num_stages": 1, "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor", + "pointer", + "pointer", + "pointer", "pointer" ], - "pid_type": "persistent_blocked", - "range_warp_specializes": [] + "pid_type": "flat" + }, + "intermediate_4096_numtokens_256": { + "block_sizes": [ + 32, + 512 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 2, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" }, "default": { "block_sizes": [ @@ -521,6 +41696,7 @@ "range_unroll_factors": [ 0 ], + "range_warp_specializes": [], "range_num_stages": [ 0 ], @@ -543,8 +41719,13732 @@ "tensor_descriptor", "pointer" ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_256": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_256": { + "block_sizes": [ + 16, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_256": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_7688_numtokens_256": { + "block_sizes": [ + 8, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_256": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_1": { + "block_sizes": [ + 1, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_1": { + "block_sizes": [ + 1, + 1 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_1": { + "block_sizes": [ + 1, + 32 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_1": { + "block_sizes": [ + 1, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_1": { + "block_sizes": [ + 1, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_1": { + "block_sizes": [ + 1, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "first" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_2": { + "block_sizes": [ + 2, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_2": { + "block_sizes": [ + 1, + 4 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_2": { + "block_sizes": [ + 2, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_2": { + "block_sizes": [ + 1, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "" + ], + "num_warps": 8, + "num_stages": 2, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_2": { + "block_sizes": [ + 1, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 4, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_2": { + "block_sizes": [ + 1, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_4": { + "block_sizes": [ + 1, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_4": { + "block_sizes": [ + 1, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_4": { + "block_sizes": [ + 4, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_4": { + "block_sizes": [ + 1, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_4": { + "block_sizes": [ + 1, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_4": { + "block_sizes": [ + 4, + 16 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_8": { + "block_sizes": [ + 8, + 256 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_8": { + "block_sizes": [ + 8, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_8": { + "block_sizes": [ + 2, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_8": { + "block_sizes": [ + 4, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_8": { + "block_sizes": [ + 8, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "" + ], + "num_warps": 4, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_8": { + "block_sizes": [ + 8, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_16": { + "block_sizes": [ + 16, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_16": { + "block_sizes": [ + 16, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_16": { + "block_sizes": [ + 16, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_16": { + "block_sizes": [ + 4, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_16": { + "block_sizes": [ + 8, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "first" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_16": { + "block_sizes": [ + 16, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_24": { + "block_sizes": [ + 16, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "last" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_24": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_24": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_24": { + "block_sizes": [ + 16, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_24": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_24": { + "block_sizes": [ + 8, + 32 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_32": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_32": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_32": { + "block_sizes": [ + 32, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_32": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 2, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_32": { + "block_sizes": [ + 16, + 8 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_32": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_40": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_40": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 8, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_40": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_40": { + "block_sizes": [ + 64, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_40": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_40": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_48": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_48": { + "block_sizes": [ + 8, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_48": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_48": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_48": { + "block_sizes": [ + 16, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_48": { + "block_sizes": [ + 64, + 4 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_56": { + "block_sizes": [ + 2, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_56": { + "block_sizes": [ + 8, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_56": { + "block_sizes": [ + 32, + 4 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_56": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_56": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "last" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_56": { + "block_sizes": [ + 64, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_64": { + "block_sizes": [ + 16, + 128 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_64": { + "block_sizes": [ + 4, + 64 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_64": { + "block_sizes": [ + 2, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_64": { + "block_sizes": [ + 8, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_64": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_64": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_72": { + "block_sizes": [ + 4, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_72": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "first" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_72": { + "block_sizes": [ + 64, + 16 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_72": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_72": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_72": { + "block_sizes": [ + 128, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_80": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_80": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_80": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_80": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_80": { + "block_sizes": [ + 64, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_80": { + "block_sizes": [ + 32, + 512 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_88": { + "block_sizes": [ + 32, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_88": { + "block_sizes": [ + 16, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_88": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_88": { + "block_sizes": [ + 128, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "first", + "last" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_88": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_88": { + "block_sizes": [ + 16, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "last", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_96": { + "block_sizes": [ + 128, + 4 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_96": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_96": { + "block_sizes": [ + 16, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 3, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_96": { + "block_sizes": [ + 64, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "last" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_96": { + "block_sizes": [ + 64, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "" + ], + "num_warps": 2, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_96": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "first" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_104": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_104": { + "block_sizes": [ + 64, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_104": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_104": { + "block_sizes": [ + 8, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_104": { + "block_sizes": [ + 128, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_104": { + "block_sizes": [ + 32, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_112": { + "block_sizes": [ + 32, + 1024 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_112": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_112": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_112": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_112": { + "block_sizes": [ + 16, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_112": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_120": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_120": { + "block_sizes": [ + 32, + 16 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_120": { + "block_sizes": [ + 32, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_120": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_120": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_120": { + "block_sizes": [ + 128, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_128": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_128": { + "block_sizes": [ + 128, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_128": { + "block_sizes": [ + 128, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 32, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_128": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_128": { + "block_sizes": [ + 128, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_128": { + "block_sizes": [ + 16, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 3, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_136": { + "block_sizes": [ + 128, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 3, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_136": { + "block_sizes": [ + 8, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_136": { + "block_sizes": [ + 32, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_136": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_136": { + "block_sizes": [ + 16, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 3, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_136": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_144": { + "block_sizes": [ + 8, + 16 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_144": { + "block_sizes": [ + 256, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_144": { + "block_sizes": [ + 128, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_144": { + "block_sizes": [ + 128, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_144": { + "block_sizes": [ + 32, + 4 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_144": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "first" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_152": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_152": { + "block_sizes": [ + 16, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_152": { + "block_sizes": [ + 64, + 4 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_152": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_152": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_152": { + "block_sizes": [ + 64, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_160": { + "block_sizes": [ + 32, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_160": { + "block_sizes": [ + 128, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_160": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_160": { + "block_sizes": [ + 64, + 4 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_160": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_160": { + "block_sizes": [ + 128, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_168": { + "block_sizes": [ + 128, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_168": { + "block_sizes": [ + 32, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_168": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_168": { + "block_sizes": [ + 64, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_168": { + "block_sizes": [ + 64, + 4 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_168": { + "block_sizes": [ + 32, + 512 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_176": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_176": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_176": { + "block_sizes": [ + 4, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "first" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_176": { + "block_sizes": [ + 8, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_176": { + "block_sizes": [ + 8, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_176": { + "block_sizes": [ + 8, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_184": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_184": { + "block_sizes": [ + 8, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_184": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_184": { + "block_sizes": [ + 8, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_184": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_184": { + "block_sizes": [ + 16, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "last" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_192": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_192": { + "block_sizes": [ + 8, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_192": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_192": { + "block_sizes": [ + 4, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_192": { + "block_sizes": [ + 32, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_192": { + "block_sizes": [ + 8, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_200": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_200": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_200": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_200": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_200": { + "block_sizes": [ + 8, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_200": { + "block_sizes": [ + 16, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "first" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_208": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_208": { + "block_sizes": [ + 64, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_208": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_208": { + "block_sizes": [ + 256, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_208": { + "block_sizes": [ + 64, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "last", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_208": { + "block_sizes": [ + 16, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_216": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_216": { + "block_sizes": [ + 16, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_216": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_216": { + "block_sizes": [ + 16, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_216": { + "block_sizes": [ + 32, + 4 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "first" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_216": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "last" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_224": { + "block_sizes": [ + 32, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_224": { + "block_sizes": [ + 64, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_224": { + "block_sizes": [ + 64, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_224": { + "block_sizes": [ + 16, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_224": { + "block_sizes": [ + 256, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_224": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "first" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_232": { + "block_sizes": [ + 16, + 8 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_232": { + "block_sizes": [ + 64, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_232": { + "block_sizes": [ + 16, + 4 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_232": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_232": { + "block_sizes": [ + 16, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 2, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_232": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_240": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_240": { + "block_sizes": [ + 8, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_240": { + "block_sizes": [ + 16, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_240": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_240": { + "block_sizes": [ + 8, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_240": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_248": { + "block_sizes": [ + 16, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_248": { + "block_sizes": [ + 16, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_248": { + "block_sizes": [ + 256, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_248": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "first" + ], + "num_warps": 4, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_248": { + "block_sizes": [ + 64, + 4 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_248": { + "block_sizes": [ + 64, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_272": { + "block_sizes": [ + 128, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_272": { + "block_sizes": [ + 8, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_272": { + "block_sizes": [ + 128, + 32 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_272": { + "block_sizes": [ + 128, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_272": { + "block_sizes": [ + 16, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_272": { + "block_sizes": [ + 64, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_288": { + "block_sizes": [ + 4, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_288": { + "block_sizes": [ + 8, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_288": { + "block_sizes": [ + 64, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_288": { + "block_sizes": [ + 128, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_288": { + "block_sizes": [ + 256, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_288": { + "block_sizes": [ + 16, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_304": { + "block_sizes": [ + 8, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_304": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_304": { + "block_sizes": [ + 128, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_304": { + "block_sizes": [ + 8, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_304": { + "block_sizes": [ + 64, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_304": { + "block_sizes": [ + 64, + 4 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_320": { + "block_sizes": [ + 128, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_320": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_320": { + "block_sizes": [ + 512, + 4 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_320": { + "block_sizes": [ + 64, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_320": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_320": { + "block_sizes": [ + 128, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_336": { + "block_sizes": [ + 2, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_336": { + "block_sizes": [ + 8, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_336": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_336": { + "block_sizes": [ + 64, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 8, + "num_stages": 2, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_336": { + "block_sizes": [ + 8, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "" + ], + "num_warps": 2, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_336": { + "block_sizes": [ + 8, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_352": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_352": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_352": { + "block_sizes": [ + 16, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_352": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_352": { + "block_sizes": [ + 8, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_352": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_368": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_368": { + "block_sizes": [ + 8, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_368": { + "block_sizes": [ + 8, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_368": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_368": { + "block_sizes": [ + 32, + 4 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_368": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_384": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_384": { + "block_sizes": [ + 64, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "first" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_384": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_384": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_384": { + "block_sizes": [ + 8, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_384": { + "block_sizes": [ + 32, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_400": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "first" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_400": { + "block_sizes": [ + 8, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_400": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_400": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_400": { + "block_sizes": [ + 256, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 32, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_400": { + "block_sizes": [ + 8, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_416": { + "block_sizes": [ + 128, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_416": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "first" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_416": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "last" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_416": { + "block_sizes": [ + 128, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_416": { + "block_sizes": [ + 64, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_416": { + "block_sizes": [ + 32, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_432": { + "block_sizes": [ + 16, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_432": { + "block_sizes": [ + 32, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_432": { + "block_sizes": [ + 16, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 4, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_432": { + "block_sizes": [ + 16, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_432": { + "block_sizes": [ + 16, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_432": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 2, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_448": { + "block_sizes": [ + 4, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_448": { + "block_sizes": [ + 8, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_448": { + "block_sizes": [ + 4, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_448": { + "block_sizes": [ + 32, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_448": { + "block_sizes": [ + 16, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_448": { + "block_sizes": [ + 16, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_464": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_464": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_464": { + "block_sizes": [ + 16, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_464": { + "block_sizes": [ + 8, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_464": { + "block_sizes": [ + 128, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "first" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_464": { + "block_sizes": [ + 128, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_480": { + "block_sizes": [ + 4, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "" + ], + "num_warps": 2, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_480": { + "block_sizes": [ + 4, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "first" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_480": { + "block_sizes": [ + 8, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_480": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_480": { + "block_sizes": [ + 64, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "last" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_480": { + "block_sizes": [ + 16, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "" + ], + "num_warps": 8, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_496": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_496": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_496": { + "block_sizes": [ + 16, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_496": { + "block_sizes": [ + 32, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_496": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_496": { + "block_sizes": [ + 256, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_512": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_512": { + "block_sizes": [ + 16, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_512": { + "block_sizes": [ + 128, + 512 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_512": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_512": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_512": { + "block_sizes": [ + 16, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], "pid_type": "flat", "range_warp_specializes": [] } } -} \ No newline at end of file +} diff --git a/vllm/kernels/helion/ops/silu_mul_fp8.py b/vllm/kernels/helion/ops/silu_mul_fp8.py index a45943b1a..954f5df3a 100644 --- a/vllm/kernels/helion/ops/silu_mul_fp8.py +++ b/vllm/kernels/helion/ops/silu_mul_fp8.py @@ -3,6 +3,7 @@ from typing import Any +import regex as re import torch from vllm.logger import init_logger @@ -53,44 +54,78 @@ def silu_mul_fp8(input: torch.Tensor, scale: torch.Tensor) -> torch.Tensor: return out.view(output_shape) +@silu_mul_fp8.register_input_generator # type: ignore[misc] +def generate_silu_mul_fp8_inputs() -> dict[str, tuple[Any, ...]]: + intermediate_sizes = [2048, 2880, 4096, 8192, 11008, 14336] + + # Use the same num_tokens values as vLLM's default cudagraph capture sizes. + # See vllm/config/vllm.py _set_cudagraph_sizes() for the canonical formula. + num_tokens_list = [1, 2, 4] + list(range(8, 256, 8)) + list(range(256, 513, 16)) + + inputs = {} + for num_tokens in num_tokens_list: + for intermediate_size in intermediate_sizes: + # Input tensor has shape (num_tokens, 2 * intermediate_size) + # because silu_mul splits it into two halves + input_tensor = torch.randn( + num_tokens, + 2 * intermediate_size, + device="cuda", + dtype=torch.bfloat16, + ) + scale = torch.tensor([1.0], device="cuda", dtype=torch.float32) + + config_key = f"intermediate_{intermediate_size}_numtokens_{num_tokens}" + inputs[config_key] = (input_tensor, scale) + + return inputs + + @silu_mul_fp8.register_config_picker # type: ignore[misc] def pick_silu_mul_fp8_config( args: tuple[Any, ...], config_keys: list[str] ) -> str | None: + """Pick the best pre-tuned config for the given input shape. + + Selection strategy: + 1. Find the closest intermediate_size among available configs + (exact match preferred). + 2. Among the num_tokens values tuned for that intermediate_size, pick + the smallest num_tokens >= the input's num_tokens. If the input is + larger than all available num_tokens, fall back to the largest. + + Config keys must be "default" or follow the format + "intermediate_{int}_numtokens_{int}". + """ if not config_keys: return None - input_tensor, scale = args + input_tensor, _scale = args intermediate_size = input_tensor.shape[-1] // 2 - - # TODO(gmagosfm): Rerun autotuning to capture config for - # other batch sizes. - target_key = f"intermediate_{intermediate_size}_batchsize_256" - if target_key in config_keys: - return target_key - - intermediate_sizes = [] + num_tokens = input_tensor.view(-1, input_tensor.shape[-1]).shape[0] + configs: dict[int, list[int]] = {} for key in config_keys: - if key.startswith("intermediate_") and "_batchsize_256" in key: - try: - size_str = key.split("_")[1] - size = int(size_str) - intermediate_sizes.append((abs(size - intermediate_size), key)) - except (ValueError, IndexError): - continue + if key == "default": + continue + match = re.fullmatch(r"intermediate_(\d+)_numtokens_(\d+)", key) + if not match: + raise ValueError( + f"Malformed config key '{key}', " + f"expected format 'intermediate_{{int}}_numtokens_{{int}}'" + ) + isize_str, ntokens_str = match.groups() + configs.setdefault(int(isize_str), []).append(int(ntokens_str)) - if intermediate_sizes: - _, best_key = min(intermediate_sizes) - logger.debug( - "No exact config for intermediate_size=%d, using closest match: %s", - intermediate_size, - best_key, - ) - return best_key - if "default" in config_keys: - return "default" + if not configs: + return "default" if "default" in config_keys else None - return None + best_isize = min(configs, key=lambda s: abs(s - intermediate_size)) + available_ntokens = sorted(configs[best_isize]) + best_ntokens = next( + (n for n in available_ntokens if n >= num_tokens), available_ntokens[-1] + ) + + return f"intermediate_{best_isize}_numtokens_{best_ntokens}" def silu_mul_fp8_baseline(input: torch.Tensor, scale: torch.Tensor) -> torch.Tensor: