diff --git a/tests/kernels/helion/test_config_manager.py b/tests/kernels/helion/test_config_manager.py index d95909c92..337696ee0 100644 --- a/tests/kernels/helion/test_config_manager.py +++ b/tests/kernels/helion/test_config_manager.py @@ -160,10 +160,11 @@ class TestConfigManager: """Test getting config file path for a kernel.""" manager = ConfigManager(base_dir="/tmp") - file_path = manager.get_config_file_path("silu_mul_fp8") + dir_path = manager.get_config_file_path("silu_mul_fp8") + assert dir_path == Path("/tmp/silu_mul_fp8") - expected_path = Path("/tmp/silu_mul_fp8.json") - assert file_path == expected_path + file_path = manager.get_config_file_path("silu_mul_fp8", "nvidia_h100") + assert file_path == Path("/tmp/silu_mul_fp8/nvidia_h100.json") def test_ensure_base_dir_exists(self): """Test ensuring base directory exists.""" @@ -189,19 +190,19 @@ class TestConfigManager: assert config_set.get_platforms() == [] def test_load_config_set_valid_file(self): - """Test loading config set from valid file.""" + """Test loading config set from per-platform files.""" with tempfile.TemporaryDirectory() as temp_dir: - # Use realistic config data kernel_config = { "block_sizes": [128, 64], "num_warps": 8, "num_stages": 6, "pid_type": "persistent_interleaved", } - config_data = {"h100": {"batch_32_hidden_4096": kernel_config}} - config_file = Path(temp_dir) / "test_kernel.json" - with open(config_file, "w") as f: - json.dump(config_data, f) + kernel_dir = Path(temp_dir) / "test_kernel" + kernel_dir.mkdir() + platform_file = kernel_dir / "h100.json" + with open(platform_file, "w") as f: + json.dump({"batch_32_hidden_4096": kernel_config}, f) manager = ConfigManager(base_dir=temp_dir) config_set = manager.load_config_set("test_kernel") @@ -210,7 +211,6 @@ class TestConfigManager: assert config_set.kernel_name == "test_kernel" assert config_set.get_platforms() == ["h100"] - # Verify the config was loaded correctly config = config_set.get_config("h100", "batch_32_hidden_4096") assert isinstance(config, helion.Config) assert config.block_sizes == [128, 64] @@ -219,7 +219,9 @@ class TestConfigManager: def test_load_config_set_invalid_json(self): """Test loading config set from file with invalid JSON.""" with tempfile.TemporaryDirectory() as temp_dir: - config_file = Path(temp_dir) / "test_kernel.json" + kernel_dir = Path(temp_dir) / "test_kernel" + kernel_dir.mkdir() + config_file = kernel_dir / "h100.json" with open(config_file, "w") as f: f.write("invalid json content {") @@ -231,9 +233,8 @@ class TestConfigManager: assert config_set.get_platforms() == [] def test_save_config_set(self): - """Test saving ConfigSet to file.""" + """Test saving ConfigSet to per-platform files.""" with tempfile.TemporaryDirectory() as temp_dir: - # Use realistic config data kernel_config = { "block_sizes": [256, 128], "num_warps": 16, @@ -246,31 +247,34 @@ class TestConfigManager: manager = ConfigManager(base_dir=temp_dir) saved_path = manager.save_config_set(config_set) - expected_path = Path(temp_dir) / "test_kernel.json" - assert saved_path == expected_path - assert saved_path.exists() + expected_dir = Path(temp_dir) / "test_kernel" + assert saved_path == expected_dir + assert saved_path.is_dir() - with open(saved_path) as f: + platform_file = expected_dir / "h100.json" + assert platform_file.exists() + with open(platform_file) as f: loaded_data = json.load(f) - assert loaded_data == data + assert loaded_data == data["h100"] def test_save_config_set_creates_directory(self): """Test that save_config_set creates parent directories if needed.""" with tempfile.TemporaryDirectory() as temp_dir: nested_dir = Path(temp_dir) / "nested" / "configs" - config_set = ConfigSet("test_kernel") + data = {"h100": {"default": {"num_warps": 4}}} + config_set = ConfigSet.from_dict("test_kernel", data) manager = ConfigManager(base_dir=nested_dir) saved_path = manager.save_config_set(config_set) assert nested_dir.exists() assert nested_dir.is_dir() - assert saved_path.exists() + assert saved_path.is_dir() + assert (saved_path / "h100.json").exists() def test_get_platform_configs(self): """Test getting all configs for a specific platform.""" with tempfile.TemporaryDirectory() as temp_dir: - # Use realistic config data config_1 = {"num_warps": 4, "num_stages": 3, "block_sizes": [64, 32]} config_2 = {"num_warps": 8, "num_stages": 5, "block_sizes": [128, 64]} default_config = { @@ -280,17 +284,19 @@ class TestConfigManager: } config_3 = {"num_warps": 2, "num_stages": 2, "block_sizes": [32, 16]} - config_data = { - "h100": { - "batch_32_hidden_4096": config_1, - "batch_64_hidden_2048": config_2, - "default": default_config, - }, - "a100": {"batch_16_hidden_1024": config_3}, - } - config_file = Path(temp_dir) / "test_kernel.json" - with open(config_file, "w") as f: - json.dump(config_data, f) + kernel_dir = Path(temp_dir) / "test_kernel" + kernel_dir.mkdir() + with open(kernel_dir / "h100.json", "w") as f: + json.dump( + { + "batch_32_hidden_4096": config_1, + "batch_64_hidden_2048": config_2, + "default": default_config, + }, + f, + ) + with open(kernel_dir / "a100.json", "w") as f: + json.dump({"batch_16_hidden_1024": config_3}, f) manager = ConfigManager(base_dir=temp_dir) @@ -302,7 +308,6 @@ class TestConfigManager: for config in h100_configs.values(): assert isinstance(config, helion.Config) - # Verify specific config details assert h100_configs["batch_32_hidden_4096"].num_warps == 4 assert h100_configs["default"].num_stages == 7 diff --git a/vllm/kernels/helion/config_manager.py b/vllm/kernels/helion/config_manager.py index 7a6836ac8..f34d93604 100644 --- a/vllm/kernels/helion/config_manager.py +++ b/vllm/kernels/helion/config_manager.py @@ -8,23 +8,15 @@ operations, including naming conventions, directory resolution, and file I/O. Config File Structure --------------------- -Each kernel has a single JSON config file: {kernel_name}.json +Each kernel has a directory: {kernel_name}/ +Inside, each GPU platform has its own JSON file: {kernel_name}/{platform}.json -The file uses a simplified 2-layer hierarchical structure: -{ - "h100": { # GPU platform - "default": { ... }, # Fallback configuration - "batch_32_hidden_4096": { ... }, - "batch_64_hidden_8192": { ... } - }, - "a100": { - "default": { ... }, - "batch_16_hidden_2048": { ... } - } -} - -Example file: silu_mul_fp8.json +For example: + silu_mul_fp8/ + nvidia_h100.json # { "default": {...}, "batch_32_hidden_4096": {...} } + nvidia_h200.json # { "batch_16_hidden_2048": {...} } +Each platform file maps config keys to Helion config objects. Config keys should be structured strings that encode the relevant parameters (e.g., "batch_32_hidden_4096", "seq_512_heads_16", "fp8_batch_64", etc.). @@ -212,8 +204,15 @@ class ConfigManager: cls._instance = None cls._instance_base_dir = None - def get_config_file_path(self, kernel_name: str) -> Path: - return self._base_dir / f"{kernel_name}.json" + def get_kernel_dir(self, kernel_name: str) -> Path: + return self._base_dir / kernel_name + + def get_config_file_path( + self, kernel_name: str, platform: str | None = None + ) -> Path: + if platform is not None: + return self.get_kernel_dir(kernel_name) / f"{platform}.json" + return self.get_kernel_dir(kernel_name) def ensure_base_dir_exists(self) -> Path: self._base_dir.mkdir(parents=True, exist_ok=True) @@ -230,39 +229,59 @@ class ConfigManager: f"Config directory '{self._base_dir}' is not writable: {e}" ) from e - def load_config_set(self, kernel_name: str) -> ConfigSet: - config_path = self.get_config_file_path(kernel_name) + def _load_platform_file(self, kernel_name: str, platform: str) -> dict[str, Any]: + config_path = self.get_config_file_path(kernel_name, platform) if not config_path.exists(): - return ConfigSet.from_dict(kernel_name, {}) - + return {} try: with open(config_path) as f: - data = json.load(f) - return ConfigSet.from_dict(kernel_name, data) + return json.load(f) except (json.JSONDecodeError, OSError) as e: logger.error("Failed to load config file %s: %s", config_path, e) + return {} + + def load_config_set(self, kernel_name: str) -> ConfigSet: + kernel_dir = self.get_kernel_dir(kernel_name) + if not kernel_dir.is_dir(): return ConfigSet.from_dict(kernel_name, {}) + data: dict[str, Any] = {} + for platform_file in sorted(kernel_dir.glob("*.json")): + platform = platform_file.stem + try: + with open(platform_file) as f: + platform_data = json.load(f) + data[platform] = platform_data + except (json.JSONDecodeError, OSError) as e: + logger.error("Failed to load config file %s: %s", platform_file, e) + + return ConfigSet.from_dict(kernel_name, data) + def get_platform_configs( self, kernel_name: str, platform: str ) -> dict[str, helion.Config]: - config_set = self.load_config_set(kernel_name) + platform_data = self._load_platform_file(kernel_name, platform) + if not platform_data: + return {} + config_set = ConfigSet.from_dict(kernel_name, {platform: platform_data}) config_keys = config_set.get_config_keys(platform) - return { config_key: config_set.get_config(platform, config_key) for config_key in config_keys } def save_config_set(self, config_set: ConfigSet) -> Path: - config_path = self.get_config_file_path(config_set.kernel_name) - config_path.parent.mkdir(parents=True, exist_ok=True) + kernel_dir = self.get_kernel_dir(config_set.kernel_name) + kernel_dir.mkdir(parents=True, exist_ok=True) - with open(config_path, "w") as f: - json.dump(config_set.to_dict(), f, indent=2) + full_data = config_set.to_dict() + for platform, platform_data in full_data.items(): + platform_path = kernel_dir / f"{platform}.json" + with open(platform_path, "w") as f: + json.dump(platform_data, f, indent=2) + logger.info("Saved config to: %s", platform_path) - logger.info("Saved config to: %s", config_path) - return config_path + return kernel_dir def save_configs( self, @@ -271,11 +290,18 @@ class ConfigManager: configs: dict[str, "helion.Config"], ) -> Path: """Save configs for a kernel/platform, merging with existing.""" - config_set = self.load_config_set(kernel_name) + platform_data = self._load_platform_file(kernel_name, platform) for config_key, config in configs.items(): - config_set.set_config(platform, config_key, config) - return self.save_config_set(config_set) + platform_data[config_key] = json.loads(config.to_json()) + + platform_path = self.get_config_file_path(kernel_name, platform) + platform_path.parent.mkdir(parents=True, exist_ok=True) + with open(platform_path, "w") as f: + json.dump(platform_data, f, indent=2) + + logger.info("Saved config to: %s", platform_path) + return platform_path def config_exists(self, kernel_name: str, platform: str, config_key: str) -> bool: - config_set = self.load_config_set(kernel_name) - return config_set.has_config(platform, config_key) + platform_data = self._load_platform_file(kernel_name, platform) + return config_key in platform_data diff --git a/vllm/kernels/helion/configs/silu_mul_fp8.json b/vllm/kernels/helion/configs/silu_mul_fp8.json deleted file mode 100644 index bdef5e0fc..000000000 --- a/vllm/kernels/helion/configs/silu_mul_fp8.json +++ /dev/null @@ -1,27734 +0,0 @@ -{ - "nvidia_h200": { - "intermediate_2048_numtokens_256": { - "block_sizes": [ - 64, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_256": { - "block_sizes": [ - 256, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "default": { - "block_sizes": [ - 1, - 512 - ], - "loop_orders": [ - [ - 1, - 0 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 4 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "first", - "" - ], - "num_warps": 8, - "num_stages": 2, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_256": { - "block_sizes": [ - 256, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_256": { - "block_sizes": [ - 8, - 4096 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_256": { - "block_sizes": [ - 64, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 2, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_7688_numtokens_256": { - "block_sizes": [ - 32, - 512 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_256": { - "block_sizes": [ - 32, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 2 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_1": { - "block_sizes": [ - 1, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 2, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_1": { - "block_sizes": [ - 1, - 128 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 4, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_1": { - "block_sizes": [ - 1, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 2, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_1": { - "block_sizes": [ - 1, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 2, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_1": { - "block_sizes": [ - 1, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 2, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_1": { - "block_sizes": [ - 1, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 2, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_2": { - "block_sizes": [ - 2, - 128 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 2, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_2": { - "block_sizes": [ - 2, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 4, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_2": { - "block_sizes": [ - 2, - 128 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 2, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_2": { - "block_sizes": [ - 2, - 128 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 2, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_2": { - "block_sizes": [ - 1, - 16384 - ], - "loop_orders": [ - [ - 1, - 0 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 16 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "first", - "last" - ], - "num_warps": 32, - "num_stages": 3, - "indexing": [ - "tensor_descriptor", - "pointer", - "pointer", - "tensor_descriptor" - ], - "pid_type": "xyz" - }, - "intermediate_14336_numtokens_2": { - "block_sizes": [ - 2, - 128 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 2, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_4": { - "block_sizes": [ - 4, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 2, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_4": { - "block_sizes": [ - 4, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 64 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "last", - "first" - ], - "num_warps": 8, - "num_stages": 7, - "indexing": [ - "pointer", - "tensor_descriptor", - "tensor_descriptor", - "pointer" - ], - "pid_type": "xyz" - }, - "intermediate_4096_numtokens_4": { - "block_sizes": [ - 4, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 2, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_4": { - "block_sizes": [ - 1, - 2048 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 8, - "indexing": [ - "pointer", - "tensor_descriptor", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_4": { - "block_sizes": [ - 4, - 2048 - ], - "loop_orders": [ - [ - 1, - 0 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 4 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "last", - "last" - ], - "num_warps": 8, - "num_stages": 6, - "indexing": [ - "pointer", - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "xyz" - }, - "intermediate_14336_numtokens_4": { - "block_sizes": [ - 4, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 64 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 2, - "num_stages": 2, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "xyz" - }, - "intermediate_2048_numtokens_8": { - "block_sizes": [ - 8, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 2, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_8": { - "block_sizes": [ - 4, - 2048 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 4 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "last", - "" - ], - "num_warps": 16, - "num_stages": 5, - "indexing": [ - "pointer", - "tensor_descriptor", - "tensor_descriptor", - "pointer" - ], - "pid_type": "xyz" - }, - "intermediate_4096_numtokens_8": { - "block_sizes": [ - 8, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 2, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_8": { - "block_sizes": [ - 2, - 1024 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 16 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "last", - "" - ], - "num_warps": 1, - "num_stages": 8, - "indexing": [ - "tensor_descriptor", - "pointer", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_8": { - "block_sizes": [ - 1, - 1024 - ], - "loop_orders": [ - [ - 1, - 0 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 4 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "", - "first" - ], - "num_warps": 2, - "num_stages": 5, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_8": { - "block_sizes": [ - 8, - 128 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 4, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_16": { - "block_sizes": [ - 8, - 512 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 64 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "last", - "first" - ], - "num_warps": 16, - "num_stages": 2, - "indexing": [ - "pointer", - "pointer", - "tensor_descriptor", - "pointer" - ], - "pid_type": "xyz" - }, - "intermediate_2880_numtokens_16": { - "block_sizes": [ - 2, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 4, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_16": { - "block_sizes": [ - 16, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 16 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "", - "last" - ], - "num_warps": 32, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_16": { - "block_sizes": [ - 16, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 4, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_16": { - "block_sizes": [ - 1, - 2048 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_16": { - "block_sizes": [ - 2, - 256 - ], - "loop_orders": [ - [ - 1, - 0 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 64 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "last", - "last" - ], - "num_warps": 16, - "num_stages": 7, - "indexing": [ - "tensor_descriptor", - "pointer", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_24": { - "block_sizes": [ - 1, - 1024 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 64 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "first" - ], - "num_warps": 4, - "num_stages": 8, - "indexing": [ - "tensor_descriptor", - "pointer", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_24": { - "block_sizes": [ - 4, - 1024 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 16 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "", - "first" - ], - "num_warps": 16, - "num_stages": 3, - "indexing": [ - "tensor_descriptor", - "pointer", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_24": { - "block_sizes": [ - 16, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 4, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_24": { - "block_sizes": [ - 1, - 2048 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 32 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "last", - "last" - ], - "num_warps": 32, - "num_stages": 5, - "indexing": [ - "pointer", - "pointer", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_24": { - "block_sizes": [ - 1, - 1024 - ], - "loop_orders": [ - [ - 1, - 0 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 64 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "", - "" - ], - "num_warps": 2, - "num_stages": 2, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_24": { - "block_sizes": [ - 8, - 512 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 4 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "first", - "first" - ], - "num_warps": 32, - "num_stages": 3, - "indexing": [ - "pointer", - "tensor_descriptor", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_32": { - "block_sizes": [ - 32, - 16 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 4, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_32": { - "block_sizes": [ - 4, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 4, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_32": { - "block_sizes": [ - 4, - 4096 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 16 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "last", - "" - ], - "num_warps": 32, - "num_stages": 5, - "indexing": [ - "pointer", - "tensor_descriptor", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_32": { - "block_sizes": [ - 4, - 1024 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 2 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "last", - "" - ], - "num_warps": 2, - "num_stages": 3, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_32": { - "block_sizes": [ - 2, - 2048 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_32": { - "block_sizes": [ - 1, - 512 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 64 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "", - "" - ], - "num_warps": 4, - "num_stages": 3, - "indexing": [ - "pointer", - "tensor_descriptor", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_40": { - "block_sizes": [ - 32, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 4, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_40": { - "block_sizes": [ - 1, - 4096 - ], - "loop_orders": [ - [ - 1, - 0 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 32 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "last", - "last" - ], - "num_warps": 8, - "num_stages": 4, - "indexing": [ - "tensor_descriptor", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_40": { - "block_sizes": [ - 32, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_40": { - "block_sizes": [ - 2, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 16 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "last", - "last" - ], - "num_warps": 2, - "num_stages": 2, - "indexing": [ - "pointer", - "pointer", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_40": { - "block_sizes": [ - 16, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_40": { - "block_sizes": [ - 4, - 1024 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 1 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "", - "last" - ], - "num_warps": 16, - "num_stages": 5, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "persistent_interleaved", - "num_sm_multiplier": 32, - "maxnreg": 32 - }, - "intermediate_2048_numtokens_48": { - "block_sizes": [ - 32, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 4, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_48": { - "block_sizes": [ - 16, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 4, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_48": { - "block_sizes": [ - 32, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_48": { - "block_sizes": [ - 1, - 4096 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 2 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "", - "last" - ], - "num_warps": 4, - "num_stages": 2, - "indexing": [ - "pointer", - "tensor_descriptor", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_48": { - "block_sizes": [ - 16, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_48": { - "block_sizes": [ - 32, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_56": { - "block_sizes": [ - 32, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 4, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_56": { - "block_sizes": [ - 1, - 2048 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_56": { - "block_sizes": [ - 32, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_56": { - "block_sizes": [ - 32, - 128 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_56": { - "block_sizes": [ - 1, - 8192 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_56": { - "block_sizes": [ - 2, - 4096 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 32 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "first", - "" - ], - "num_warps": 2, - "num_stages": 4, - "indexing": [ - "tensor_descriptor", - "pointer", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_64": { - "block_sizes": [ - 64, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_64": { - "block_sizes": [ - 1, - 2048 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_64": { - "block_sizes": [ - 64, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_64": { - "block_sizes": [ - 64, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_64": { - "block_sizes": [ - 1, - 8192 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_64": { - "block_sizes": [ - 16, - 512 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 16 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "first", - "" - ], - "num_warps": 4, - "num_stages": 1, - "indexing": [ - "pointer", - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_72": { - "block_sizes": [ - 64, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_72": { - "block_sizes": [ - 32, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_72": { - "block_sizes": [ - 1, - 1024 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 4 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "first" - ], - "num_warps": 16, - "num_stages": 2, - "indexing": [ - "tensor_descriptor", - "pointer", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_72": { - "block_sizes": [ - 64, - 128 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_72": { - "block_sizes": [ - 4, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 2 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "last", - "last" - ], - "num_warps": 32, - "num_stages": 5, - "indexing": [ - "pointer", - "pointer", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_72": { - "block_sizes": [ - 32, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 2 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "", - "last" - ], - "num_warps": 8, - "num_stages": 3, - "indexing": [ - "pointer", - "pointer", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_80": { - "block_sizes": [ - 64, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_80": { - "block_sizes": [ - 32, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_80": { - "block_sizes": [ - 64, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_80": { - "block_sizes": [ - 4, - 128 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 4 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "last", - "last" - ], - "num_warps": 16, - "num_stages": 2, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_80": { - "block_sizes": [ - 1, - 4096 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 32 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "last", - "" - ], - "num_warps": 16, - "num_stages": 2, - "indexing": [ - "pointer", - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_80": { - "block_sizes": [ - 2, - 2048 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 4 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "last", - "" - ], - "num_warps": 4, - "num_stages": 6, - "indexing": [ - "tensor_descriptor", - "pointer", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_88": { - "block_sizes": [ - 64, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_88": { - "block_sizes": [ - 8, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_88": { - "block_sizes": [ - 64, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_88": { - "block_sizes": [ - 64, - 128 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_88": { - "block_sizes": [ - 16, - 2048 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 2 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "first", - "" - ], - "num_warps": 32, - "num_stages": 2, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_88": { - "block_sizes": [ - 4, - 512 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 16 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 32, - "num_stages": 5, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_96": { - "block_sizes": [ - 64, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_96": { - "block_sizes": [ - 32, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_96": { - "block_sizes": [ - 1, - 4096 - ], - "loop_orders": [ - [ - 1, - 0 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 64 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "last", - "last" - ], - "num_warps": 4, - "num_stages": 6, - "indexing": [ - "pointer", - "pointer", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_96": { - "block_sizes": [ - 64, - 128 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_96": { - "block_sizes": [ - 1, - 2048 - ], - "loop_orders": [ - [ - 1, - 0 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 64 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "", - "" - ], - "num_warps": 32, - "num_stages": 2, - "indexing": [ - "tensor_descriptor", - "pointer", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_96": { - "block_sizes": [ - 4, - 4096 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 32 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "first", - "" - ], - "num_warps": 4, - "num_stages": 1, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_104": { - "block_sizes": [ - 64, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_104": { - "block_sizes": [ - 8, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_104": { - "block_sizes": [ - 64, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_104": { - "block_sizes": [ - 64, - 128 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_104": { - "block_sizes": [ - 2, - 8192 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "last", - "first" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "tensor_descriptor", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_104": { - "block_sizes": [ - 8, - 512 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 2 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "last", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "tensor_descriptor", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_112": { - "block_sizes": [ - 64, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_112": { - "block_sizes": [ - 2, - 2048 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_112": { - "block_sizes": [ - 64, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_112": { - "block_sizes": [ - 4, - 512 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 16 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "last", - "last" - ], - "num_warps": 16, - "num_stages": 3, - "indexing": [ - "pointer", - "pointer", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_112": { - "block_sizes": [ - 4, - 1024 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 2 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "last", - "last" - ], - "num_warps": 32, - "num_stages": 3, - "indexing": [ - "pointer", - "pointer", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_112": { - "block_sizes": [ - 64, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_120": { - "block_sizes": [ - 8, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 16 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "last", - "" - ], - "num_warps": 4, - "num_stages": 6, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_120": { - "block_sizes": [ - 2, - 2048 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_120": { - "block_sizes": [ - 64, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_120": { - "block_sizes": [ - 64, - 128 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_120": { - "block_sizes": [ - 1, - 1024 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 4 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "last", - "last" - ], - "num_warps": 1, - "num_stages": 6, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_120": { - "block_sizes": [ - 32, - 128 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 2 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "first", - "last" - ], - "num_warps": 16, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_128": { - "block_sizes": [ - 128, - 16 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_128": { - "block_sizes": [ - 2, - 2048 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_128": { - "block_sizes": [ - 128, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_128": { - "block_sizes": [ - 128, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_128": { - "block_sizes": [ - 2, - 1024 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 64 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "", - "last" - ], - "num_warps": 2, - "num_stages": 1, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_128": { - "block_sizes": [ - 4, - 4096 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_136": { - "block_sizes": [ - 128, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_136": { - "block_sizes": [ - 64, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_136": { - "block_sizes": [ - 128, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_136": { - "block_sizes": [ - 2, - 512 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 32 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "", - "" - ], - "num_warps": 32, - "num_stages": 7, - "indexing": [ - "tensor_descriptor", - "pointer", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_136": { - "block_sizes": [ - 4, - 8192 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 8 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "last", - "last" - ], - "num_warps": 8, - "num_stages": 4, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_136": { - "block_sizes": [ - 4, - 16384 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 8 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "last", - "last" - ], - "num_warps": 32, - "num_stages": 6, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_144": { - "block_sizes": [ - 1, - 1024 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 16 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "last", - "first" - ], - "num_warps": 16, - "num_stages": 7, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_144": { - "block_sizes": [ - 64, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_144": { - "block_sizes": [ - 128, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_144": { - "block_sizes": [ - 1, - 2048 - ], - "loop_orders": [ - [ - 1, - 0 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 4 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "last", - "first" - ], - "num_warps": 1, - "num_stages": 4, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_144": { - "block_sizes": [ - 256, - 16 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 2 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "first", - "first" - ], - "num_warps": 16, - "num_stages": 2, - "indexing": [ - "tensor_descriptor", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_144": { - "block_sizes": [ - 64, - 128 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 16 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "", - "last" - ], - "num_warps": 16, - "num_stages": 8, - "indexing": [ - "pointer", - "tensor_descriptor", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_152": { - "block_sizes": [ - 4, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 8 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "last", - "first" - ], - "num_warps": 8, - "num_stages": 7, - "indexing": [ - "tensor_descriptor", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_152": { - "block_sizes": [ - 64, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_152": { - "block_sizes": [ - 128, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_152": { - "block_sizes": [ - 64, - 16 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 64 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "", - "first" - ], - "num_warps": 1, - "num_stages": 2, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_152": { - "block_sizes": [ - 1, - 4096 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 32 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "", - "first" - ], - "num_warps": 4, - "num_stages": 6, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_152": { - "block_sizes": [ - 2, - 16384 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 64 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "", - "" - ], - "num_warps": 16, - "num_stages": 6, - "indexing": [ - "tensor_descriptor", - "pointer", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_160": { - "block_sizes": [ - 4, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 64 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "last", - "last" - ], - "num_warps": 1, - "num_stages": 3, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_160": { - "block_sizes": [ - 64, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_160": { - "block_sizes": [ - 128, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_160": { - "block_sizes": [ - 64, - 512 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 32, - "num_stages": 4, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_160": { - "block_sizes": [ - 128, - 128 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 4 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "first", - "last" - ], - "num_warps": 32, - "num_stages": 8, - "indexing": [ - "pointer", - "pointer", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_160": { - "block_sizes": [ - 1, - 1024 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 8 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "first" - ], - "num_warps": 8, - "num_stages": 8, - "indexing": [ - "tensor_descriptor", - "pointer", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_168": { - "block_sizes": [ - 4, - 2048 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 32 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "", - "last" - ], - "num_warps": 32, - "num_stages": 8, - "indexing": [ - "pointer", - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_168": { - "block_sizes": [ - 8, - 512 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_168": { - "block_sizes": [ - 128, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_168": { - "block_sizes": [ - 128, - 128 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_168": { - "block_sizes": [ - 64, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_168": { - "block_sizes": [ - 64, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 64 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "last", - "first" - ], - "num_warps": 2, - "num_stages": 6, - "indexing": [ - "tensor_descriptor", - "pointer", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_176": { - "block_sizes": [ - 128, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_176": { - "block_sizes": [ - 16, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_176": { - "block_sizes": [ - 128, - 4 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 2 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "first" - ], - "num_warps": 4, - "num_stages": 6, - "indexing": [ - "tensor_descriptor", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_176": { - "block_sizes": [ - 1, - 8192 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 16 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "", - "first" - ], - "num_warps": 16, - "num_stages": 5, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_176": { - "block_sizes": [ - 64, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_176": { - "block_sizes": [ - 128, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_184": { - "block_sizes": [ - 2, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 2 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 16, - "num_stages": 6, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_184": { - "block_sizes": [ - 64, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_184": { - "block_sizes": [ - 128, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_184": { - "block_sizes": [ - 64, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 32 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "last", - "last" - ], - "num_warps": 32, - "num_stages": 8, - "indexing": [ - "tensor_descriptor", - "pointer", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_184": { - "block_sizes": [ - 64, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_184": { - "block_sizes": [ - 64, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 64 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "", - "last" - ], - "num_warps": 8, - "num_stages": 3, - "indexing": [ - "tensor_descriptor", - "pointer", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_192": { - "block_sizes": [ - 128, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_192": { - "block_sizes": [ - 64, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_192": { - "block_sizes": [ - 8, - 128 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 8 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "first", - "first" - ], - "num_warps": 16, - "num_stages": 3, - "indexing": [ - "tensor_descriptor", - "pointer", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_192": { - "block_sizes": [ - 32, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "first", - "" - ], - "num_warps": 32, - "num_stages": 1, - "indexing": [ - "tensor_descriptor", - "pointer", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_192": { - "block_sizes": [ - 16, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 4 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "first" - ], - "num_warps": 32, - "num_stages": 7, - "indexing": [ - "tensor_descriptor", - "pointer", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_192": { - "block_sizes": [ - 128, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_200": { - "block_sizes": [ - 128, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_200": { - "block_sizes": [ - 8, - 512 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_200": { - "block_sizes": [ - 4, - 512 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 16 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "", - "first" - ], - "num_warps": 1, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_200": { - "block_sizes": [ - 128, - 128 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_200": { - "block_sizes": [ - 1, - 1024 - ], - "loop_orders": [ - [ - 1, - 0 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "first", - "first" - ], - "num_warps": 32, - "num_stages": 3, - "indexing": [ - "pointer", - "tensor_descriptor", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_200": { - "block_sizes": [ - 16, - 128 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 64 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "first" - ], - "num_warps": 32, - "num_stages": 6, - "indexing": [ - "pointer", - "pointer", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_208": { - "block_sizes": [ - 128, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_208": { - "block_sizes": [ - 256, - 16 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 8 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "", - "last" - ], - "num_warps": 32, - "num_stages": 8, - "indexing": [ - "pointer", - "pointer", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_208": { - "block_sizes": [ - 256, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 4 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "first", - "last" - ], - "num_warps": 4, - "num_stages": 2, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_208": { - "block_sizes": [ - 128, - 128 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_208": { - "block_sizes": [ - 32, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 64 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "first", - "first" - ], - "num_warps": 8, - "num_stages": 5, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_208": { - "block_sizes": [ - 128, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_216": { - "block_sizes": [ - 32, - 128 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 4 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "first", - "last" - ], - "num_warps": 8, - "num_stages": 4, - "indexing": [ - "pointer", - "tensor_descriptor", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_216": { - "block_sizes": [ - 4, - 1024 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 8 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "first", - "first" - ], - "num_warps": 8, - "num_stages": 7, - "indexing": [ - "tensor_descriptor", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_216": { - "block_sizes": [ - 128, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_216": { - "block_sizes": [ - 1, - 8192 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "last" - ], - "num_warps": 2, - "num_stages": 7, - "indexing": [ - "tensor_descriptor", - "pointer", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_216": { - "block_sizes": [ - 1, - 16384 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 32 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "first", - "last" - ], - "num_warps": 4, - "num_stages": 4, - "indexing": [ - "pointer", - "tensor_descriptor", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_216": { - "block_sizes": [ - 128, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_224": { - "block_sizes": [ - 32, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 32 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "first", - "first" - ], - "num_warps": 16, - "num_stages": 5, - "indexing": [ - "pointer", - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_224": { - "block_sizes": [ - 4, - 2048 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_224": { - "block_sizes": [ - 128, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_224": { - "block_sizes": [ - 1, - 4096 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 32 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "last", - "first" - ], - "num_warps": 32, - "num_stages": 1, - "indexing": [ - "tensor_descriptor", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_224": { - "block_sizes": [ - 32, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 32 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "", - "last" - ], - "num_warps": 2, - "num_stages": 3, - "indexing": [ - "pointer", - "pointer", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_224": { - "block_sizes": [ - 1, - 2048 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 32 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 8, - "indexing": [ - "pointer", - "tensor_descriptor", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_232": { - "block_sizes": [ - 1, - 1024 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 64 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "last" - ], - "num_warps": 16, - "num_stages": 1, - "indexing": [ - "tensor_descriptor", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_232": { - "block_sizes": [ - 256, - 8 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 4 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "first", - "last" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "tensor_descriptor", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_232": { - "block_sizes": [ - 128, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_232": { - "block_sizes": [ - 256, - 8 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 2 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "first" - ], - "num_warps": 16, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_232": { - "block_sizes": [ - 4, - 1024 - ], - "loop_orders": [ - [ - 1, - 0 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 32 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "last", - "first" - ], - "num_warps": 1, - "num_stages": 8, - "indexing": [ - "pointer", - "tensor_descriptor", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_232": { - "block_sizes": [ - 8, - 4096 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_240": { - "block_sizes": [ - 64, - 8 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 2 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "", - "last" - ], - "num_warps": 4, - "num_stages": 5, - "indexing": [ - "pointer", - "pointer", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_240": { - "block_sizes": [ - 4, - 2048 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_240": { - "block_sizes": [ - 4, - 2048 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 2 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "first", - "" - ], - "num_warps": 8, - "num_stages": 6, - "indexing": [ - "tensor_descriptor", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_240": { - "block_sizes": [ - 1, - 1024 - ], - "loop_orders": [ - [ - 1, - 0 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 2 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "first", - "last" - ], - "num_warps": 4, - "num_stages": 8, - "indexing": [ - "pointer", - "pointer", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_240": { - "block_sizes": [ - 4, - 2048 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "last", - "first" - ], - "num_warps": 32, - "num_stages": 7, - "indexing": [ - "pointer", - "tensor_descriptor", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_240": { - "block_sizes": [ - 1, - 8192 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 8 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "first", - "last" - ], - "num_warps": 32, - "num_stages": 2, - "indexing": [ - "pointer", - "pointer", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_248": { - "block_sizes": [ - 128, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_248": { - "block_sizes": [ - 4, - 2048 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_248": { - "block_sizes": [ - 128, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_248": { - "block_sizes": [ - 256, - 16 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 16 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 2, - "num_stages": 4, - "indexing": [ - "pointer", - "pointer", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_248": { - "block_sizes": [ - 4, - 8192 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_248": { - "block_sizes": [ - 8, - 4096 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_272": { - "block_sizes": [ - 256, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_272": { - "block_sizes": [ - 128, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_272": { - "block_sizes": [ - 1, - 4096 - ], - "loop_orders": [ - [ - 1, - 0 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 64 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "last", - "first" - ], - "num_warps": 32, - "num_stages": 1, - "indexing": [ - "tensor_descriptor", - "pointer", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_272": { - "block_sizes": [ - 8, - 512 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 16 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "", - "last" - ], - "num_warps": 2, - "num_stages": 6, - "indexing": [ - "pointer", - "tensor_descriptor", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_272": { - "block_sizes": [ - 8, - 1024 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 8 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "", - "last" - ], - "num_warps": 4, - "num_stages": 8, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_272": { - "block_sizes": [ - 512, - 16 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 32 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "last" - ], - "num_warps": 8, - "num_stages": 8, - "indexing": [ - "pointer", - "pointer", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_288": { - "block_sizes": [ - 64, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 4 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "last", - "last" - ], - "num_warps": 32, - "num_stages": 4, - "indexing": [ - "pointer", - "tensor_descriptor", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_288": { - "block_sizes": [ - 8, - 512 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 4 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "last" - ], - "num_warps": 2, - "num_stages": 1, - "indexing": [ - "tensor_descriptor", - "pointer", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_288": { - "block_sizes": [ - 512, - 4 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 32 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "first", - "first" - ], - "num_warps": 1, - "num_stages": 2, - "indexing": [ - "pointer", - "tensor_descriptor", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_288": { - "block_sizes": [ - 1, - 1024 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 64 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "first", - "" - ], - "num_warps": 2, - "num_stages": 3, - "indexing": [ - "pointer", - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_288": { - "block_sizes": [ - 1, - 8192 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 2 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "", - "last" - ], - "num_warps": 8, - "num_stages": 3, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_288": { - "block_sizes": [ - 1, - 2048 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 8 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "last" - ], - "num_warps": 1, - "num_stages": 5, - "indexing": [ - "pointer", - "tensor_descriptor", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_304": { - "block_sizes": [ - 256, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_304": { - "block_sizes": [ - 1, - 1024 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 2 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 2 - ], - "range_multi_buffers": [ - false - ], - "range_flattens": [ - true - ], - "load_eviction_policies": [ - "last", - "", - "last" - ], - "num_warps": 16, - "num_stages": 3, - "indexing": [ - "tensor_descriptor", - "pointer", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "persistent_blocked", - "num_sm_multiplier": 2, - "maxnreg": 64 - }, - "intermediate_4096_numtokens_304": { - "block_sizes": [ - 16, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 32 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "first", - "last" - ], - "num_warps": 16, - "num_stages": 2, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_304": { - "block_sizes": [ - 1, - 1024 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 2 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "last", - "last" - ], - "num_warps": 32, - "num_stages": 4, - "indexing": [ - "tensor_descriptor", - "pointer", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_304": { - "block_sizes": [ - 128, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_304": { - "block_sizes": [ - 4, - 512 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 64 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "", - "" - ], - "num_warps": 16, - "num_stages": 6, - "indexing": [ - "pointer", - "pointer", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_320": { - "block_sizes": [ - 1, - 512 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "last", - "" - ], - "num_warps": 2, - "num_stages": 3, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_320": { - "block_sizes": [ - 128, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_320": { - "block_sizes": [ - 1, - 4096 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 64 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "last", - "last" - ], - "num_warps": 8, - "num_stages": 7, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_320": { - "block_sizes": [ - 1, - 8192 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 4 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "last", - "" - ], - "num_warps": 16, - "num_stages": 6, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_320": { - "block_sizes": [ - 1, - 4096 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 16 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "first", - "" - ], - "num_warps": 2, - "num_stages": 3, - "indexing": [ - "pointer", - "tensor_descriptor", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_320": { - "block_sizes": [ - 8, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "last", - "first" - ], - "num_warps": 32, - "num_stages": 3, - "indexing": [ - "pointer", - "tensor_descriptor", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_336": { - "block_sizes": [ - 256, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_336": { - "block_sizes": [ - 16, - 512 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_336": { - "block_sizes": [ - 16, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 8 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "", - "first" - ], - "num_warps": 2, - "num_stages": 3, - "indexing": [ - "tensor_descriptor", - "pointer", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_336": { - "block_sizes": [ - 256, - 128 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_336": { - "block_sizes": [ - 4, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 64 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "", - "" - ], - "num_warps": 4, - "num_stages": 7, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_336": { - "block_sizes": [ - 256, - 8 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 4 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "last", - "last" - ], - "num_warps": 16, - "num_stages": 8, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_352": { - "block_sizes": [ - 512, - 1 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 8 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "first" - ], - "num_warps": 1, - "num_stages": 4, - "indexing": [ - "pointer", - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_352": { - "block_sizes": [ - 4, - 2048 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 16 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "first", - "" - ], - "num_warps": 32, - "num_stages": 7, - "indexing": [ - "tensor_descriptor", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_352": { - "block_sizes": [ - 512, - 4 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "last", - "last" - ], - "num_warps": 16, - "num_stages": 3, - "indexing": [ - "tensor_descriptor", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_352": { - "block_sizes": [ - 1, - 8192 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "last", - "first" - ], - "num_warps": 16, - "num_stages": 2, - "indexing": [ - "pointer", - "pointer", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_352": { - "block_sizes": [ - 16, - 128 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 4 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "last", - "" - ], - "num_warps": 16, - "num_stages": 2, - "indexing": [ - "pointer", - "pointer", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_352": { - "block_sizes": [ - 32, - 512 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_368": { - "block_sizes": [ - 4, - 2048 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 16 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "first", - "first" - ], - "num_warps": 8, - "num_stages": 4, - "indexing": [ - "tensor_descriptor", - "pointer", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_368": { - "block_sizes": [ - 128, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 16 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "first", - "" - ], - "num_warps": 4, - "num_stages": 4, - "indexing": [ - "tensor_descriptor", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_368": { - "block_sizes": [ - 64, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 4 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "last", - "last" - ], - "num_warps": 32, - "num_stages": 6, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_368": { - "block_sizes": [ - 2, - 2048 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 2 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "first", - "last" - ], - "num_warps": 1, - "num_stages": 4, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_368": { - "block_sizes": [ - 128, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_368": { - "block_sizes": [ - 32, - 512 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_384": { - "block_sizes": [ - 256, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_384": { - "block_sizes": [ - 512, - 2 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 8 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "first", - "last" - ], - "num_warps": 8, - "num_stages": 3, - "indexing": [ - "pointer", - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_384": { - "block_sizes": [ - 4, - 2048 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 2 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "last" - ], - "num_warps": 8, - "num_stages": 5, - "indexing": [ - "tensor_descriptor", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_384": { - "block_sizes": [ - 128, - 128 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 4 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "first", - "first" - ], - "num_warps": 4, - "num_stages": 2, - "indexing": [ - "pointer", - "pointer", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_384": { - "block_sizes": [ - 1, - 8192 - ], - "loop_orders": [ - [ - 1, - 0 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 64 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "last", - "first" - ], - "num_warps": 4, - "num_stages": 6, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_384": { - "block_sizes": [ - 128, - 16 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 8 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "", - "first" - ], - "num_warps": 32, - "num_stages": 3, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_400": { - "block_sizes": [ - 1, - 512 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 4, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_400": { - "block_sizes": [ - 16, - 512 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_400": { - "block_sizes": [ - 1, - 1024 - ], - "loop_orders": [ - [ - 1, - 0 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 16 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "last", - "first" - ], - "num_warps": 1, - "num_stages": 1, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_400": { - "block_sizes": [ - 1, - 4096 - ], - "loop_orders": [ - [ - 1, - 0 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 64 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "", - "last" - ], - "num_warps": 8, - "num_stages": 4, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_400": { - "block_sizes": [ - 2, - 512 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 64 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "", - "last" - ], - "num_warps": 4, - "num_stages": 3, - "indexing": [ - "pointer", - "pointer", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_400": { - "block_sizes": [ - 4, - 1024 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 8 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "last", - "first" - ], - "num_warps": 8, - "num_stages": 3, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_416": { - "block_sizes": [ - 256, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_416": { - "block_sizes": [ - 32, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_416": { - "block_sizes": [ - 512, - 8 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 32 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "last", - "" - ], - "num_warps": 8, - "num_stages": 7, - "indexing": [ - "pointer", - "pointer", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_416": { - "block_sizes": [ - 1, - 2048 - ], - "loop_orders": [ - [ - 1, - 0 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 32 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "", - "first" - ], - "num_warps": 8, - "num_stages": 8, - "indexing": [ - "pointer", - "tensor_descriptor", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_416": { - "block_sizes": [ - 256, - 8 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 64 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "", - "last" - ], - "num_warps": 4, - "num_stages": 7, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_416": { - "block_sizes": [ - 128, - 16 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 8 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "first", - "first" - ], - "num_warps": 16, - "num_stages": 3, - "indexing": [ - "tensor_descriptor", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_432": { - "block_sizes": [ - 256, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_432": { - "block_sizes": [ - 8, - 2048 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_432": { - "block_sizes": [ - 64, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 4 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "last", - "last" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_432": { - "block_sizes": [ - 256, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 16 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "first", - "" - ], - "num_warps": 32, - "num_stages": 5, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_432": { - "block_sizes": [ - 1, - 4096 - ], - "loop_orders": [ - [ - 1, - 0 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 64 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "first", - "first" - ], - "num_warps": 1, - "num_stages": 8, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_432": { - "block_sizes": [ - 512, - 4 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 2 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "first", - "last" - ], - "num_warps": 1, - "num_stages": 7, - "indexing": [ - "pointer", - "tensor_descriptor", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_448": { - "block_sizes": [ - 256, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_448": { - "block_sizes": [ - 1, - 4096 - ], - "loop_orders": [ - [ - 1, - 0 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 64 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "last", - "" - ], - "num_warps": 2, - "num_stages": 6, - "indexing": [ - "pointer", - "pointer", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_448": { - "block_sizes": [ - 8, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "first", - "last" - ], - "num_warps": 16, - "num_stages": 2, - "indexing": [ - "pointer", - "pointer", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_448": { - "block_sizes": [ - 128, - 8 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 16 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "first", - "last" - ], - "num_warps": 32, - "num_stages": 3, - "indexing": [ - "pointer", - "tensor_descriptor", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_448": { - "block_sizes": [ - 1, - 512 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 4, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_448": { - "block_sizes": [ - 64, - 512 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 16 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "last", - "last" - ], - "num_warps": 32, - "num_stages": 8, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_464": { - "block_sizes": [ - 256, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_464": { - "block_sizes": [ - 8, - 2048 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_464": { - "block_sizes": [ - 1, - 4096 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 4 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "last", - "last" - ], - "num_warps": 1, - "num_stages": 6, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_464": { - "block_sizes": [ - 256, - 128 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_464": { - "block_sizes": [ - 1, - 16384 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 64 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 32, - "num_stages": 6, - "indexing": [ - "tensor_descriptor", - "pointer", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_464": { - "block_sizes": [ - 64, - 512 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 2 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "first", - "" - ], - "num_warps": 32, - "num_stages": 7, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_480": { - "block_sizes": [ - 16, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "first", - "" - ], - "num_warps": 16, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_480": { - "block_sizes": [ - 128, - 16 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 4 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "", - "" - ], - "num_warps": 8, - "num_stages": 5, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_480": { - "block_sizes": [ - 64, - 128 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 8 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "", - "first" - ], - "num_warps": 2, - "num_stages": 1, - "indexing": [ - "pointer", - "tensor_descriptor", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_480": { - "block_sizes": [ - 1, - 1024 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 2 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "first", - "" - ], - "num_warps": 1, - "num_stages": 2, - "indexing": [ - "tensor_descriptor", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_480": { - "block_sizes": [ - 1, - 1024 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 4, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_480": { - "block_sizes": [ - 1, - 16384 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "last", - "first" - ], - "num_warps": 32, - "num_stages": 3, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_496": { - "block_sizes": [ - 1, - 2048 - ], - "loop_orders": [ - [ - 1, - 0 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 4 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "last" - ], - "num_warps": 16, - "num_stages": 7, - "indexing": [ - "pointer", - "tensor_descriptor", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_496": { - "block_sizes": [ - 8, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 4 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "last", - "last" - ], - "num_warps": 4, - "num_stages": 8, - "indexing": [ - "pointer", - "pointer", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_496": { - "block_sizes": [ - 256, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_496": { - "block_sizes": [ - 256, - 128 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_496": { - "block_sizes": [ - 1, - 2048 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 4 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "last", - "last" - ], - "num_warps": 8, - "num_stages": 4, - "indexing": [ - "tensor_descriptor", - "pointer", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_496": { - "block_sizes": [ - 4, - 2048 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 64 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "last", - "first" - ], - "num_warps": 4, - "num_stages": 4, - "indexing": [ - "pointer", - "tensor_descriptor", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_512": { - "block_sizes": [ - 512, - 16 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_512": { - "block_sizes": [ - 8, - 2048 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_512": { - "block_sizes": [ - 8, - 128 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 2 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "last", - "last" - ], - "num_warps": 16, - "num_stages": 2, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_512": { - "block_sizes": [ - 1, - 2048 - ], - "loop_orders": [ - [ - 1, - 0 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 64 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "last" - ], - "num_warps": 4, - "num_stages": 4, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_512": { - "block_sizes": [ - 1, - 4096 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "", - "first" - ], - "num_warps": 16, - "num_stages": 7, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_512": { - "block_sizes": [ - 128, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 32 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "first", - "" - ], - "num_warps": 2, - "num_stages": 7, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - } - }, - "nvidia_h100": { - "intermediate_2048_numtokens_256": { - "block_sizes": [ - 64, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_256": { - "block_sizes": [ - 256, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "default": { - "block_sizes": [ - 1, - 512 - ], - "loop_orders": [ - [ - 1, - 0 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 4 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "first", - "" - ], - "num_warps": 8, - "num_stages": 2, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_256": { - "block_sizes": [ - 256, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_256": { - "block_sizes": [ - 8, - 4096 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_256": { - "block_sizes": [ - 64, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 2, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_7688_numtokens_256": { - "block_sizes": [ - 32, - 512 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_256": { - "block_sizes": [ - 32, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 2 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_1": { - "block_sizes": [ - 1, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 2, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_1": { - "block_sizes": [ - 1, - 128 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 4, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_1": { - "block_sizes": [ - 1, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 2, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_1": { - "block_sizes": [ - 1, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 2, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_1": { - "block_sizes": [ - 1, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 2, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_1": { - "block_sizes": [ - 1, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 2, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_2": { - "block_sizes": [ - 2, - 128 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 2, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_2": { - "block_sizes": [ - 2, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 4, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_2": { - "block_sizes": [ - 2, - 128 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 2, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_2": { - "block_sizes": [ - 2, - 128 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 2, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_2": { - "block_sizes": [ - 1, - 16384 - ], - "loop_orders": [ - [ - 1, - 0 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 16 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "first", - "last" - ], - "num_warps": 32, - "num_stages": 3, - "indexing": [ - "tensor_descriptor", - "pointer", - "pointer", - "tensor_descriptor" - ], - "pid_type": "xyz" - }, - "intermediate_14336_numtokens_2": { - "block_sizes": [ - 2, - 128 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 2, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_4": { - "block_sizes": [ - 4, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 2, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_4": { - "block_sizes": [ - 4, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 64 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "last", - "first" - ], - "num_warps": 8, - "num_stages": 7, - "indexing": [ - "pointer", - "tensor_descriptor", - "tensor_descriptor", - "pointer" - ], - "pid_type": "xyz" - }, - "intermediate_4096_numtokens_4": { - "block_sizes": [ - 4, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 2, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_4": { - "block_sizes": [ - 1, - 2048 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 8, - "indexing": [ - "pointer", - "tensor_descriptor", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_4": { - "block_sizes": [ - 4, - 2048 - ], - "loop_orders": [ - [ - 1, - 0 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 4 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "last", - "last" - ], - "num_warps": 8, - "num_stages": 6, - "indexing": [ - "pointer", - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "xyz" - }, - "intermediate_14336_numtokens_4": { - "block_sizes": [ - 4, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 64 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 2, - "num_stages": 2, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "xyz" - }, - "intermediate_2048_numtokens_8": { - "block_sizes": [ - 8, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 2, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_8": { - "block_sizes": [ - 4, - 2048 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 4 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "last", - "" - ], - "num_warps": 16, - "num_stages": 5, - "indexing": [ - "pointer", - "tensor_descriptor", - "tensor_descriptor", - "pointer" - ], - "pid_type": "xyz" - }, - "intermediate_4096_numtokens_8": { - "block_sizes": [ - 8, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 2, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_8": { - "block_sizes": [ - 2, - 1024 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 16 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "last", - "" - ], - "num_warps": 1, - "num_stages": 8, - "indexing": [ - "tensor_descriptor", - "pointer", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_8": { - "block_sizes": [ - 1, - 1024 - ], - "loop_orders": [ - [ - 1, - 0 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 4 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "", - "first" - ], - "num_warps": 2, - "num_stages": 5, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_8": { - "block_sizes": [ - 8, - 128 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 4, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_16": { - "block_sizes": [ - 8, - 512 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 64 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "last", - "first" - ], - "num_warps": 16, - "num_stages": 2, - "indexing": [ - "pointer", - "pointer", - "tensor_descriptor", - "pointer" - ], - "pid_type": "xyz" - }, - "intermediate_2880_numtokens_16": { - "block_sizes": [ - 2, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 4, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_16": { - "block_sizes": [ - 16, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 16 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "", - "last" - ], - "num_warps": 32, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_16": { - "block_sizes": [ - 16, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 4, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_16": { - "block_sizes": [ - 1, - 2048 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_16": { - "block_sizes": [ - 2, - 256 - ], - "loop_orders": [ - [ - 1, - 0 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 64 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "last", - "last" - ], - "num_warps": 16, - "num_stages": 7, - "indexing": [ - "tensor_descriptor", - "pointer", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_24": { - "block_sizes": [ - 1, - 1024 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 64 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "first" - ], - "num_warps": 4, - "num_stages": 8, - "indexing": [ - "tensor_descriptor", - "pointer", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_24": { - "block_sizes": [ - 4, - 1024 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 16 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "", - "first" - ], - "num_warps": 16, - "num_stages": 3, - "indexing": [ - "tensor_descriptor", - "pointer", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_24": { - "block_sizes": [ - 16, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 4, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_24": { - "block_sizes": [ - 1, - 2048 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 32 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "last", - "last" - ], - "num_warps": 32, - "num_stages": 5, - "indexing": [ - "pointer", - "pointer", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_24": { - "block_sizes": [ - 1, - 1024 - ], - "loop_orders": [ - [ - 1, - 0 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 64 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "", - "" - ], - "num_warps": 2, - "num_stages": 2, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_24": { - "block_sizes": [ - 8, - 512 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 4 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "first", - "first" - ], - "num_warps": 32, - "num_stages": 3, - "indexing": [ - "pointer", - "tensor_descriptor", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_32": { - "block_sizes": [ - 32, - 16 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 4, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_32": { - "block_sizes": [ - 4, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 4, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_32": { - "block_sizes": [ - 4, - 4096 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 16 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "last", - "" - ], - "num_warps": 32, - "num_stages": 5, - "indexing": [ - "pointer", - "tensor_descriptor", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_32": { - "block_sizes": [ - 4, - 1024 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 2 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "last", - "" - ], - "num_warps": 2, - "num_stages": 3, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_32": { - "block_sizes": [ - 2, - 2048 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_32": { - "block_sizes": [ - 1, - 512 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 64 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "", - "" - ], - "num_warps": 4, - "num_stages": 3, - "indexing": [ - "pointer", - "tensor_descriptor", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_40": { - "block_sizes": [ - 32, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 4, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_40": { - "block_sizes": [ - 1, - 4096 - ], - "loop_orders": [ - [ - 1, - 0 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 32 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "last", - "last" - ], - "num_warps": 8, - "num_stages": 4, - "indexing": [ - "tensor_descriptor", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_40": { - "block_sizes": [ - 32, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_40": { - "block_sizes": [ - 2, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 16 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "last", - "last" - ], - "num_warps": 2, - "num_stages": 2, - "indexing": [ - "pointer", - "pointer", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_40": { - "block_sizes": [ - 16, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_40": { - "block_sizes": [ - 4, - 1024 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 1 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "", - "last" - ], - "num_warps": 16, - "num_stages": 5, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "persistent_interleaved", - "num_sm_multiplier": 32, - "maxnreg": 32 - }, - "intermediate_2048_numtokens_48": { - "block_sizes": [ - 32, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 4, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_48": { - "block_sizes": [ - 16, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 4, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_48": { - "block_sizes": [ - 32, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_48": { - "block_sizes": [ - 1, - 4096 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 2 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "", - "last" - ], - "num_warps": 4, - "num_stages": 2, - "indexing": [ - "pointer", - "tensor_descriptor", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_48": { - "block_sizes": [ - 16, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_48": { - "block_sizes": [ - 32, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_56": { - "block_sizes": [ - 32, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 4, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_56": { - "block_sizes": [ - 1, - 2048 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_56": { - "block_sizes": [ - 32, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_56": { - "block_sizes": [ - 32, - 128 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_56": { - "block_sizes": [ - 1, - 8192 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_56": { - "block_sizes": [ - 2, - 4096 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 32 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "first", - "" - ], - "num_warps": 2, - "num_stages": 4, - "indexing": [ - "tensor_descriptor", - "pointer", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_64": { - "block_sizes": [ - 64, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_64": { - "block_sizes": [ - 1, - 2048 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_64": { - "block_sizes": [ - 64, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_64": { - "block_sizes": [ - 64, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_64": { - "block_sizes": [ - 1, - 8192 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_64": { - "block_sizes": [ - 16, - 512 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 16 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "first", - "" - ], - "num_warps": 4, - "num_stages": 1, - "indexing": [ - "pointer", - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_72": { - "block_sizes": [ - 64, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_72": { - "block_sizes": [ - 32, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_72": { - "block_sizes": [ - 1, - 1024 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 4 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "first" - ], - "num_warps": 16, - "num_stages": 2, - "indexing": [ - "tensor_descriptor", - "pointer", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_72": { - "block_sizes": [ - 64, - 128 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_72": { - "block_sizes": [ - 4, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 2 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "last", - "last" - ], - "num_warps": 32, - "num_stages": 5, - "indexing": [ - "pointer", - "pointer", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_72": { - "block_sizes": [ - 32, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 2 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "", - "last" - ], - "num_warps": 8, - "num_stages": 3, - "indexing": [ - "pointer", - "pointer", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_80": { - "block_sizes": [ - 64, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_80": { - "block_sizes": [ - 32, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_80": { - "block_sizes": [ - 64, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_80": { - "block_sizes": [ - 4, - 128 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 4 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "last", - "last" - ], - "num_warps": 16, - "num_stages": 2, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_80": { - "block_sizes": [ - 1, - 4096 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 32 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "last", - "" - ], - "num_warps": 16, - "num_stages": 2, - "indexing": [ - "pointer", - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_80": { - "block_sizes": [ - 2, - 2048 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 4 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "last", - "" - ], - "num_warps": 4, - "num_stages": 6, - "indexing": [ - "tensor_descriptor", - "pointer", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_88": { - "block_sizes": [ - 64, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_88": { - "block_sizes": [ - 8, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_88": { - "block_sizes": [ - 64, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_88": { - "block_sizes": [ - 64, - 128 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_88": { - "block_sizes": [ - 16, - 2048 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 2 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "first", - "" - ], - "num_warps": 32, - "num_stages": 2, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_88": { - "block_sizes": [ - 4, - 512 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 16 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 32, - "num_stages": 5, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_96": { - "block_sizes": [ - 64, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_96": { - "block_sizes": [ - 32, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_96": { - "block_sizes": [ - 1, - 4096 - ], - "loop_orders": [ - [ - 1, - 0 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 64 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "last", - "last" - ], - "num_warps": 4, - "num_stages": 6, - "indexing": [ - "pointer", - "pointer", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_96": { - "block_sizes": [ - 64, - 128 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_96": { - "block_sizes": [ - 1, - 2048 - ], - "loop_orders": [ - [ - 1, - 0 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 64 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "", - "" - ], - "num_warps": 32, - "num_stages": 2, - "indexing": [ - "tensor_descriptor", - "pointer", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_96": { - "block_sizes": [ - 4, - 4096 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 32 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "first", - "" - ], - "num_warps": 4, - "num_stages": 1, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_104": { - "block_sizes": [ - 64, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_104": { - "block_sizes": [ - 8, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_104": { - "block_sizes": [ - 64, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_104": { - "block_sizes": [ - 64, - 128 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_104": { - "block_sizes": [ - 2, - 8192 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "last", - "first" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "tensor_descriptor", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_104": { - "block_sizes": [ - 8, - 512 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 2 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "last", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "tensor_descriptor", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_112": { - "block_sizes": [ - 64, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_112": { - "block_sizes": [ - 2, - 2048 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_112": { - "block_sizes": [ - 64, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_112": { - "block_sizes": [ - 4, - 512 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 16 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "last", - "last" - ], - "num_warps": 16, - "num_stages": 3, - "indexing": [ - "pointer", - "pointer", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_112": { - "block_sizes": [ - 4, - 1024 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 2 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "last", - "last" - ], - "num_warps": 32, - "num_stages": 3, - "indexing": [ - "pointer", - "pointer", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_112": { - "block_sizes": [ - 64, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_120": { - "block_sizes": [ - 8, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 16 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "last", - "" - ], - "num_warps": 4, - "num_stages": 6, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_120": { - "block_sizes": [ - 2, - 2048 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_120": { - "block_sizes": [ - 64, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_120": { - "block_sizes": [ - 64, - 128 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_120": { - "block_sizes": [ - 1, - 1024 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 4 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "last", - "last" - ], - "num_warps": 1, - "num_stages": 6, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_120": { - "block_sizes": [ - 32, - 128 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 2 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "first", - "last" - ], - "num_warps": 16, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_128": { - "block_sizes": [ - 128, - 16 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_128": { - "block_sizes": [ - 2, - 2048 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_128": { - "block_sizes": [ - 128, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_128": { - "block_sizes": [ - 128, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_128": { - "block_sizes": [ - 2, - 1024 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 64 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "", - "last" - ], - "num_warps": 2, - "num_stages": 1, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_128": { - "block_sizes": [ - 4, - 4096 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_136": { - "block_sizes": [ - 128, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_136": { - "block_sizes": [ - 64, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_136": { - "block_sizes": [ - 128, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_136": { - "block_sizes": [ - 2, - 512 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 32 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "", - "" - ], - "num_warps": 32, - "num_stages": 7, - "indexing": [ - "tensor_descriptor", - "pointer", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_136": { - "block_sizes": [ - 4, - 8192 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 8 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "last", - "last" - ], - "num_warps": 8, - "num_stages": 4, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_136": { - "block_sizes": [ - 4, - 16384 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 8 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "last", - "last" - ], - "num_warps": 32, - "num_stages": 6, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_144": { - "block_sizes": [ - 1, - 1024 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 16 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "last", - "first" - ], - "num_warps": 16, - "num_stages": 7, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_144": { - "block_sizes": [ - 64, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_144": { - "block_sizes": [ - 128, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_144": { - "block_sizes": [ - 1, - 2048 - ], - "loop_orders": [ - [ - 1, - 0 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 4 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "last", - "first" - ], - "num_warps": 1, - "num_stages": 4, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_144": { - "block_sizes": [ - 256, - 16 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 2 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "first", - "first" - ], - "num_warps": 16, - "num_stages": 2, - "indexing": [ - "tensor_descriptor", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_144": { - "block_sizes": [ - 64, - 128 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 16 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "", - "last" - ], - "num_warps": 16, - "num_stages": 8, - "indexing": [ - "pointer", - "tensor_descriptor", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_152": { - "block_sizes": [ - 4, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 8 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "last", - "first" - ], - "num_warps": 8, - "num_stages": 7, - "indexing": [ - "tensor_descriptor", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_152": { - "block_sizes": [ - 64, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_152": { - "block_sizes": [ - 128, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_152": { - "block_sizes": [ - 64, - 16 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 64 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "", - "first" - ], - "num_warps": 1, - "num_stages": 2, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_152": { - "block_sizes": [ - 1, - 4096 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 32 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "", - "first" - ], - "num_warps": 4, - "num_stages": 6, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_152": { - "block_sizes": [ - 2, - 16384 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 64 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "", - "" - ], - "num_warps": 16, - "num_stages": 6, - "indexing": [ - "tensor_descriptor", - "pointer", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_160": { - "block_sizes": [ - 4, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 64 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "last", - "last" - ], - "num_warps": 1, - "num_stages": 3, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_160": { - "block_sizes": [ - 64, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_160": { - "block_sizes": [ - 128, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_160": { - "block_sizes": [ - 64, - 512 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 32, - "num_stages": 4, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_160": { - "block_sizes": [ - 128, - 128 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 4 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "first", - "last" - ], - "num_warps": 32, - "num_stages": 8, - "indexing": [ - "pointer", - "pointer", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_160": { - "block_sizes": [ - 1, - 1024 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 8 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "first" - ], - "num_warps": 8, - "num_stages": 8, - "indexing": [ - "tensor_descriptor", - "pointer", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_168": { - "block_sizes": [ - 4, - 2048 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 32 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "", - "last" - ], - "num_warps": 32, - "num_stages": 8, - "indexing": [ - "pointer", - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_168": { - "block_sizes": [ - 8, - 512 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_168": { - "block_sizes": [ - 128, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_168": { - "block_sizes": [ - 128, - 128 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_168": { - "block_sizes": [ - 64, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_168": { - "block_sizes": [ - 64, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 64 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "last", - "first" - ], - "num_warps": 2, - "num_stages": 6, - "indexing": [ - "tensor_descriptor", - "pointer", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_176": { - "block_sizes": [ - 128, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_176": { - "block_sizes": [ - 16, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_176": { - "block_sizes": [ - 128, - 4 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 2 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "first" - ], - "num_warps": 4, - "num_stages": 6, - "indexing": [ - "tensor_descriptor", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_176": { - "block_sizes": [ - 1, - 8192 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 16 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "", - "first" - ], - "num_warps": 16, - "num_stages": 5, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_176": { - "block_sizes": [ - 64, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_176": { - "block_sizes": [ - 128, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_184": { - "block_sizes": [ - 2, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 2 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 16, - "num_stages": 6, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_184": { - "block_sizes": [ - 64, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_184": { - "block_sizes": [ - 128, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_184": { - "block_sizes": [ - 64, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 32 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "last", - "last" - ], - "num_warps": 32, - "num_stages": 8, - "indexing": [ - "tensor_descriptor", - "pointer", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_184": { - "block_sizes": [ - 64, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_184": { - "block_sizes": [ - 64, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 64 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "", - "last" - ], - "num_warps": 8, - "num_stages": 3, - "indexing": [ - "tensor_descriptor", - "pointer", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_192": { - "block_sizes": [ - 128, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_192": { - "block_sizes": [ - 64, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_192": { - "block_sizes": [ - 8, - 128 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 8 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "first", - "first" - ], - "num_warps": 16, - "num_stages": 3, - "indexing": [ - "tensor_descriptor", - "pointer", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_192": { - "block_sizes": [ - 32, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "first", - "" - ], - "num_warps": 32, - "num_stages": 1, - "indexing": [ - "tensor_descriptor", - "pointer", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_192": { - "block_sizes": [ - 16, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 4 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "first" - ], - "num_warps": 32, - "num_stages": 7, - "indexing": [ - "tensor_descriptor", - "pointer", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_192": { - "block_sizes": [ - 128, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_200": { - "block_sizes": [ - 128, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_200": { - "block_sizes": [ - 8, - 512 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_200": { - "block_sizes": [ - 4, - 512 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 16 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "", - "first" - ], - "num_warps": 1, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_200": { - "block_sizes": [ - 128, - 128 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_200": { - "block_sizes": [ - 1, - 1024 - ], - "loop_orders": [ - [ - 1, - 0 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "first", - "first" - ], - "num_warps": 32, - "num_stages": 3, - "indexing": [ - "pointer", - "tensor_descriptor", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_200": { - "block_sizes": [ - 16, - 128 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 64 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "first" - ], - "num_warps": 32, - "num_stages": 6, - "indexing": [ - "pointer", - "pointer", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_208": { - "block_sizes": [ - 128, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_208": { - "block_sizes": [ - 256, - 16 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 8 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "", - "last" - ], - "num_warps": 32, - "num_stages": 8, - "indexing": [ - "pointer", - "pointer", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_208": { - "block_sizes": [ - 256, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 4 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "first", - "last" - ], - "num_warps": 4, - "num_stages": 2, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_208": { - "block_sizes": [ - 128, - 128 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_208": { - "block_sizes": [ - 32, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 64 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "first", - "first" - ], - "num_warps": 8, - "num_stages": 5, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_208": { - "block_sizes": [ - 128, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_216": { - "block_sizes": [ - 32, - 128 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 4 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "first", - "last" - ], - "num_warps": 8, - "num_stages": 4, - "indexing": [ - "pointer", - "tensor_descriptor", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_216": { - "block_sizes": [ - 4, - 1024 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 8 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "first", - "first" - ], - "num_warps": 8, - "num_stages": 7, - "indexing": [ - "tensor_descriptor", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_216": { - "block_sizes": [ - 128, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_216": { - "block_sizes": [ - 1, - 8192 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "last" - ], - "num_warps": 2, - "num_stages": 7, - "indexing": [ - "tensor_descriptor", - "pointer", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_216": { - "block_sizes": [ - 1, - 16384 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 32 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "first", - "last" - ], - "num_warps": 4, - "num_stages": 4, - "indexing": [ - "pointer", - "tensor_descriptor", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_216": { - "block_sizes": [ - 128, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_224": { - "block_sizes": [ - 32, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 32 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "first", - "first" - ], - "num_warps": 16, - "num_stages": 5, - "indexing": [ - "pointer", - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_224": { - "block_sizes": [ - 4, - 2048 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_224": { - "block_sizes": [ - 128, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_224": { - "block_sizes": [ - 1, - 4096 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 32 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "last", - "first" - ], - "num_warps": 32, - "num_stages": 1, - "indexing": [ - "tensor_descriptor", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_224": { - "block_sizes": [ - 32, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 32 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "", - "last" - ], - "num_warps": 2, - "num_stages": 3, - "indexing": [ - "pointer", - "pointer", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_224": { - "block_sizes": [ - 1, - 2048 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 32 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 8, - "indexing": [ - "pointer", - "tensor_descriptor", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_232": { - "block_sizes": [ - 1, - 1024 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 64 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "last" - ], - "num_warps": 16, - "num_stages": 1, - "indexing": [ - "tensor_descriptor", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_232": { - "block_sizes": [ - 256, - 8 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 4 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "first", - "last" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "tensor_descriptor", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_232": { - "block_sizes": [ - 128, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_232": { - "block_sizes": [ - 256, - 8 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 2 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "first" - ], - "num_warps": 16, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_232": { - "block_sizes": [ - 4, - 1024 - ], - "loop_orders": [ - [ - 1, - 0 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 32 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "last", - "first" - ], - "num_warps": 1, - "num_stages": 8, - "indexing": [ - "pointer", - "tensor_descriptor", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_232": { - "block_sizes": [ - 8, - 4096 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_240": { - "block_sizes": [ - 64, - 8 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 2 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "", - "last" - ], - "num_warps": 4, - "num_stages": 5, - "indexing": [ - "pointer", - "pointer", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_240": { - "block_sizes": [ - 4, - 2048 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_240": { - "block_sizes": [ - 4, - 2048 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 2 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "first", - "" - ], - "num_warps": 8, - "num_stages": 6, - "indexing": [ - "tensor_descriptor", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_240": { - "block_sizes": [ - 1, - 1024 - ], - "loop_orders": [ - [ - 1, - 0 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 2 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "first", - "last" - ], - "num_warps": 4, - "num_stages": 8, - "indexing": [ - "pointer", - "pointer", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_240": { - "block_sizes": [ - 4, - 2048 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "last", - "first" - ], - "num_warps": 32, - "num_stages": 7, - "indexing": [ - "pointer", - "tensor_descriptor", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_240": { - "block_sizes": [ - 1, - 8192 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 8 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "first", - "last" - ], - "num_warps": 32, - "num_stages": 2, - "indexing": [ - "pointer", - "pointer", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_248": { - "block_sizes": [ - 128, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_248": { - "block_sizes": [ - 4, - 2048 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_248": { - "block_sizes": [ - 128, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_248": { - "block_sizes": [ - 256, - 16 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 16 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 2, - "num_stages": 4, - "indexing": [ - "pointer", - "pointer", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_248": { - "block_sizes": [ - 4, - 8192 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_248": { - "block_sizes": [ - 8, - 4096 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_272": { - "block_sizes": [ - 256, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_272": { - "block_sizes": [ - 128, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_272": { - "block_sizes": [ - 1, - 4096 - ], - "loop_orders": [ - [ - 1, - 0 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 64 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "last", - "first" - ], - "num_warps": 32, - "num_stages": 1, - "indexing": [ - "tensor_descriptor", - "pointer", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_272": { - "block_sizes": [ - 8, - 512 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 16 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "", - "last" - ], - "num_warps": 2, - "num_stages": 6, - "indexing": [ - "pointer", - "tensor_descriptor", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_272": { - "block_sizes": [ - 8, - 1024 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 8 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "", - "last" - ], - "num_warps": 4, - "num_stages": 8, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_272": { - "block_sizes": [ - 512, - 16 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 32 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "last" - ], - "num_warps": 8, - "num_stages": 8, - "indexing": [ - "pointer", - "pointer", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_288": { - "block_sizes": [ - 64, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 4 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "last", - "last" - ], - "num_warps": 32, - "num_stages": 4, - "indexing": [ - "pointer", - "tensor_descriptor", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_288": { - "block_sizes": [ - 8, - 512 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 4 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "last" - ], - "num_warps": 2, - "num_stages": 1, - "indexing": [ - "tensor_descriptor", - "pointer", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_288": { - "block_sizes": [ - 512, - 4 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 32 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "first", - "first" - ], - "num_warps": 1, - "num_stages": 2, - "indexing": [ - "pointer", - "tensor_descriptor", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_288": { - "block_sizes": [ - 1, - 1024 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 64 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "first", - "" - ], - "num_warps": 2, - "num_stages": 3, - "indexing": [ - "pointer", - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_288": { - "block_sizes": [ - 1, - 8192 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 2 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "", - "last" - ], - "num_warps": 8, - "num_stages": 3, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_288": { - "block_sizes": [ - 1, - 2048 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 8 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "last" - ], - "num_warps": 1, - "num_stages": 5, - "indexing": [ - "pointer", - "tensor_descriptor", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_304": { - "block_sizes": [ - 256, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_304": { - "block_sizes": [ - 1, - 1024 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 2 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 2 - ], - "range_multi_buffers": [ - false - ], - "range_flattens": [ - true - ], - "load_eviction_policies": [ - "last", - "", - "last" - ], - "num_warps": 16, - "num_stages": 3, - "indexing": [ - "tensor_descriptor", - "pointer", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "persistent_blocked", - "num_sm_multiplier": 2, - "maxnreg": 64 - }, - "intermediate_4096_numtokens_304": { - "block_sizes": [ - 16, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 32 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "first", - "last" - ], - "num_warps": 16, - "num_stages": 2, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_304": { - "block_sizes": [ - 1, - 1024 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 2 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "last", - "last" - ], - "num_warps": 32, - "num_stages": 4, - "indexing": [ - "tensor_descriptor", - "pointer", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_304": { - "block_sizes": [ - 128, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_304": { - "block_sizes": [ - 4, - 512 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 64 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "", - "" - ], - "num_warps": 16, - "num_stages": 6, - "indexing": [ - "pointer", - "pointer", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_320": { - "block_sizes": [ - 1, - 512 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "last", - "" - ], - "num_warps": 2, - "num_stages": 3, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_320": { - "block_sizes": [ - 128, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_320": { - "block_sizes": [ - 1, - 4096 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 64 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "last", - "last" - ], - "num_warps": 8, - "num_stages": 7, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_320": { - "block_sizes": [ - 1, - 8192 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 4 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "last", - "" - ], - "num_warps": 16, - "num_stages": 6, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_320": { - "block_sizes": [ - 1, - 4096 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 16 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "first", - "" - ], - "num_warps": 2, - "num_stages": 3, - "indexing": [ - "pointer", - "tensor_descriptor", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_320": { - "block_sizes": [ - 8, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "last", - "first" - ], - "num_warps": 32, - "num_stages": 3, - "indexing": [ - "pointer", - "tensor_descriptor", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_336": { - "block_sizes": [ - 256, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_336": { - "block_sizes": [ - 16, - 512 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_336": { - "block_sizes": [ - 16, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 8 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "", - "first" - ], - "num_warps": 2, - "num_stages": 3, - "indexing": [ - "tensor_descriptor", - "pointer", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_336": { - "block_sizes": [ - 256, - 128 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_336": { - "block_sizes": [ - 4, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 64 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "", - "" - ], - "num_warps": 4, - "num_stages": 7, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_336": { - "block_sizes": [ - 256, - 8 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 4 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "last", - "last" - ], - "num_warps": 16, - "num_stages": 8, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_352": { - "block_sizes": [ - 512, - 1 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 8 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "first" - ], - "num_warps": 1, - "num_stages": 4, - "indexing": [ - "pointer", - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_352": { - "block_sizes": [ - 4, - 2048 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 16 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "first", - "" - ], - "num_warps": 32, - "num_stages": 7, - "indexing": [ - "tensor_descriptor", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_352": { - "block_sizes": [ - 512, - 4 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "last", - "last" - ], - "num_warps": 16, - "num_stages": 3, - "indexing": [ - "tensor_descriptor", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_352": { - "block_sizes": [ - 1, - 8192 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "last", - "first" - ], - "num_warps": 16, - "num_stages": 2, - "indexing": [ - "pointer", - "pointer", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_352": { - "block_sizes": [ - 16, - 128 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 4 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "last", - "" - ], - "num_warps": 16, - "num_stages": 2, - "indexing": [ - "pointer", - "pointer", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_352": { - "block_sizes": [ - 32, - 512 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_368": { - "block_sizes": [ - 4, - 2048 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 16 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "first", - "first" - ], - "num_warps": 8, - "num_stages": 4, - "indexing": [ - "tensor_descriptor", - "pointer", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_368": { - "block_sizes": [ - 128, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 16 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "first", - "" - ], - "num_warps": 4, - "num_stages": 4, - "indexing": [ - "tensor_descriptor", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_368": { - "block_sizes": [ - 64, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 4 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "last", - "last" - ], - "num_warps": 32, - "num_stages": 6, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_368": { - "block_sizes": [ - 2, - 2048 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 2 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "first", - "last" - ], - "num_warps": 1, - "num_stages": 4, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_368": { - "block_sizes": [ - 128, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_368": { - "block_sizes": [ - 32, - 512 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_384": { - "block_sizes": [ - 256, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_384": { - "block_sizes": [ - 512, - 2 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 8 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "first", - "last" - ], - "num_warps": 8, - "num_stages": 3, - "indexing": [ - "pointer", - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_384": { - "block_sizes": [ - 4, - 2048 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 2 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "last" - ], - "num_warps": 8, - "num_stages": 5, - "indexing": [ - "tensor_descriptor", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_384": { - "block_sizes": [ - 128, - 128 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 4 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "first", - "first" - ], - "num_warps": 4, - "num_stages": 2, - "indexing": [ - "pointer", - "pointer", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_384": { - "block_sizes": [ - 1, - 8192 - ], - "loop_orders": [ - [ - 1, - 0 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 64 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "last", - "first" - ], - "num_warps": 4, - "num_stages": 6, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_384": { - "block_sizes": [ - 128, - 16 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 8 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "", - "first" - ], - "num_warps": 32, - "num_stages": 3, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_400": { - "block_sizes": [ - 1, - 512 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 4, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_400": { - "block_sizes": [ - 16, - 512 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_400": { - "block_sizes": [ - 1, - 1024 - ], - "loop_orders": [ - [ - 1, - 0 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 16 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "last", - "first" - ], - "num_warps": 1, - "num_stages": 1, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_400": { - "block_sizes": [ - 1, - 4096 - ], - "loop_orders": [ - [ - 1, - 0 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 64 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "", - "last" - ], - "num_warps": 8, - "num_stages": 4, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_400": { - "block_sizes": [ - 2, - 512 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 64 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "", - "last" - ], - "num_warps": 4, - "num_stages": 3, - "indexing": [ - "pointer", - "pointer", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_400": { - "block_sizes": [ - 4, - 1024 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 8 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "last", - "first" - ], - "num_warps": 8, - "num_stages": 3, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_416": { - "block_sizes": [ - 256, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_416": { - "block_sizes": [ - 32, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_416": { - "block_sizes": [ - 512, - 8 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 32 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "last", - "" - ], - "num_warps": 8, - "num_stages": 7, - "indexing": [ - "pointer", - "pointer", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_416": { - "block_sizes": [ - 1, - 2048 - ], - "loop_orders": [ - [ - 1, - 0 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 32 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "", - "first" - ], - "num_warps": 8, - "num_stages": 8, - "indexing": [ - "pointer", - "tensor_descriptor", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_416": { - "block_sizes": [ - 256, - 8 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 64 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "", - "last" - ], - "num_warps": 4, - "num_stages": 7, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_416": { - "block_sizes": [ - 128, - 16 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 8 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "first", - "first" - ], - "num_warps": 16, - "num_stages": 3, - "indexing": [ - "tensor_descriptor", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_432": { - "block_sizes": [ - 256, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_432": { - "block_sizes": [ - 8, - 2048 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_432": { - "block_sizes": [ - 64, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 4 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "last", - "last" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_432": { - "block_sizes": [ - 256, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 16 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "first", - "" - ], - "num_warps": 32, - "num_stages": 5, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_432": { - "block_sizes": [ - 1, - 4096 - ], - "loop_orders": [ - [ - 1, - 0 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 64 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "first", - "first" - ], - "num_warps": 1, - "num_stages": 8, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_432": { - "block_sizes": [ - 512, - 4 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 2 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "first", - "last" - ], - "num_warps": 1, - "num_stages": 7, - "indexing": [ - "pointer", - "tensor_descriptor", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_448": { - "block_sizes": [ - 256, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_448": { - "block_sizes": [ - 1, - 4096 - ], - "loop_orders": [ - [ - 1, - 0 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 64 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "last", - "" - ], - "num_warps": 2, - "num_stages": 6, - "indexing": [ - "pointer", - "pointer", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_448": { - "block_sizes": [ - 8, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "first", - "last" - ], - "num_warps": 16, - "num_stages": 2, - "indexing": [ - "pointer", - "pointer", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_448": { - "block_sizes": [ - 128, - 8 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 16 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "first", - "last" - ], - "num_warps": 32, - "num_stages": 3, - "indexing": [ - "pointer", - "tensor_descriptor", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_448": { - "block_sizes": [ - 1, - 512 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 4, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_448": { - "block_sizes": [ - 64, - 512 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 16 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "last", - "last" - ], - "num_warps": 32, - "num_stages": 8, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_464": { - "block_sizes": [ - 256, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_464": { - "block_sizes": [ - 8, - 2048 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_464": { - "block_sizes": [ - 1, - 4096 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 4 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "last", - "last" - ], - "num_warps": 1, - "num_stages": 6, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_464": { - "block_sizes": [ - 256, - 128 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_464": { - "block_sizes": [ - 1, - 16384 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 64 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 32, - "num_stages": 6, - "indexing": [ - "tensor_descriptor", - "pointer", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_464": { - "block_sizes": [ - 64, - 512 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 2 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "first", - "" - ], - "num_warps": 32, - "num_stages": 7, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_480": { - "block_sizes": [ - 16, - 32 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "first", - "" - ], - "num_warps": 16, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_480": { - "block_sizes": [ - 128, - 16 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 4 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "", - "" - ], - "num_warps": 8, - "num_stages": 5, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_480": { - "block_sizes": [ - 64, - 128 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 8 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "", - "first" - ], - "num_warps": 2, - "num_stages": 1, - "indexing": [ - "pointer", - "tensor_descriptor", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_480": { - "block_sizes": [ - 1, - 1024 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 2 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "first", - "" - ], - "num_warps": 1, - "num_stages": 2, - "indexing": [ - "tensor_descriptor", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_480": { - "block_sizes": [ - 1, - 1024 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 4, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_480": { - "block_sizes": [ - 1, - 16384 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "last", - "first" - ], - "num_warps": 32, - "num_stages": 3, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_496": { - "block_sizes": [ - 1, - 2048 - ], - "loop_orders": [ - [ - 1, - 0 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 4 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "last" - ], - "num_warps": 16, - "num_stages": 7, - "indexing": [ - "pointer", - "tensor_descriptor", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_496": { - "block_sizes": [ - 8, - 256 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 4 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "last", - "last" - ], - "num_warps": 4, - "num_stages": 8, - "indexing": [ - "pointer", - "pointer", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_496": { - "block_sizes": [ - 256, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_496": { - "block_sizes": [ - 256, - 128 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_496": { - "block_sizes": [ - 1, - 2048 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 4 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "last", - "last" - ], - "num_warps": 8, - "num_stages": 4, - "indexing": [ - "tensor_descriptor", - "pointer", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_496": { - "block_sizes": [ - 4, - 2048 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 64 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "last", - "first" - ], - "num_warps": 4, - "num_stages": 4, - "indexing": [ - "pointer", - "tensor_descriptor", - "tensor_descriptor", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2048_numtokens_512": { - "block_sizes": [ - 512, - 16 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_2880_numtokens_512": { - "block_sizes": [ - 8, - 2048 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "" - ], - "num_warps": 8, - "num_stages": 1, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_4096_numtokens_512": { - "block_sizes": [ - 8, - 128 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 2 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "last", - "last", - "last" - ], - "num_warps": 16, - "num_stages": 2, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_8192_numtokens_512": { - "block_sizes": [ - 1, - 2048 - ], - "loop_orders": [ - [ - 1, - 0 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 64 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "", - "last" - ], - "num_warps": 4, - "num_stages": 4, - "indexing": [ - "pointer", - "pointer", - "pointer", - "pointer" - ], - "pid_type": "flat" - }, - "intermediate_11008_numtokens_512": { - "block_sizes": [ - 1, - 4096 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - false - ], - "l2_groupings": [ - 1 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "first", - "", - "first" - ], - "num_warps": 16, - "num_stages": 7, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "pointer", - "tensor_descriptor" - ], - "pid_type": "flat" - }, - "intermediate_14336_numtokens_512": { - "block_sizes": [ - 128, - 64 - ], - "loop_orders": [ - [ - 0, - 1 - ] - ], - "flatten_loops": [ - true - ], - "l2_groupings": [ - 32 - ], - "range_unroll_factors": [ - 0 - ], - "range_warp_specializes": [], - "range_num_stages": [ - 0 - ], - "range_multi_buffers": [ - null - ], - "range_flattens": [ - null - ], - "load_eviction_policies": [ - "", - "first", - "" - ], - "num_warps": 2, - "num_stages": 7, - "indexing": [ - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor", - "tensor_descriptor" - ], - "pid_type": "flat" - } - } -} diff --git a/vllm/kernels/helion/configs/silu_mul_fp8/nvidia_h100.json b/vllm/kernels/helion/configs/silu_mul_fp8/nvidia_h100.json new file mode 100644 index 000000000..c314eb2da --- /dev/null +++ b/vllm/kernels/helion/configs/silu_mul_fp8/nvidia_h100.json @@ -0,0 +1,13866 @@ +{ + "intermediate_2048_numtokens_256": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_256": { + "block_sizes": [ + 256, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "default": { + "block_sizes": [ + 1, + 512 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 8, + "num_stages": 2, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_256": { + "block_sizes": [ + 256, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_256": { + "block_sizes": [ + 8, + 4096 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_256": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_7688_numtokens_256": { + "block_sizes": [ + 32, + 512 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_256": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_1": { + "block_sizes": [ + 1, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_1": { + "block_sizes": [ + 1, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_1": { + "block_sizes": [ + 1, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_1": { + "block_sizes": [ + 1, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_1": { + "block_sizes": [ + 1, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_1": { + "block_sizes": [ + 1, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_2": { + "block_sizes": [ + 2, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_2": { + "block_sizes": [ + 2, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_2": { + "block_sizes": [ + 2, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_2": { + "block_sizes": [ + 2, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_2": { + "block_sizes": [ + 1, + 16384 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 16 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "first", + "last" + ], + "num_warps": 32, + "num_stages": 3, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "xyz" + }, + "intermediate_14336_numtokens_2": { + "block_sizes": [ + 2, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_4": { + "block_sizes": [ + 4, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_4": { + "block_sizes": [ + 4, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 64 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "first" + ], + "num_warps": 8, + "num_stages": 7, + "indexing": [ + "pointer", + "tensor_descriptor", + "tensor_descriptor", + "pointer" + ], + "pid_type": "xyz" + }, + "intermediate_4096_numtokens_4": { + "block_sizes": [ + 4, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_4": { + "block_sizes": [ + 1, + 2048 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 8, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_4": { + "block_sizes": [ + 4, + 2048 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "last", + "last" + ], + "num_warps": 8, + "num_stages": 6, + "indexing": [ + "pointer", + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "xyz" + }, + "intermediate_14336_numtokens_4": { + "block_sizes": [ + 4, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 64 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "xyz" + }, + "intermediate_2048_numtokens_8": { + "block_sizes": [ + 8, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_8": { + "block_sizes": [ + 4, + 2048 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "last", + "" + ], + "num_warps": 16, + "num_stages": 5, + "indexing": [ + "pointer", + "tensor_descriptor", + "tensor_descriptor", + "pointer" + ], + "pid_type": "xyz" + }, + "intermediate_4096_numtokens_8": { + "block_sizes": [ + 8, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_8": { + "block_sizes": [ + 2, + 1024 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 16 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "last", + "" + ], + "num_warps": 1, + "num_stages": 8, + "indexing": [ + "tensor_descriptor", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_8": { + "block_sizes": [ + 1, + 1024 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "first" + ], + "num_warps": 2, + "num_stages": 5, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_8": { + "block_sizes": [ + 8, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_16": { + "block_sizes": [ + 8, + 512 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 64 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "last", + "first" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "xyz" + }, + "intermediate_2880_numtokens_16": { + "block_sizes": [ + 2, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_16": { + "block_sizes": [ + 16, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 16 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "last" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_16": { + "block_sizes": [ + 16, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_16": { + "block_sizes": [ + 1, + 2048 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_16": { + "block_sizes": [ + 2, + 256 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 64 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "last", + "last" + ], + "num_warps": 16, + "num_stages": 7, + "indexing": [ + "tensor_descriptor", + "pointer", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_24": { + "block_sizes": [ + 1, + 1024 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 64 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "first" + ], + "num_warps": 4, + "num_stages": 8, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_24": { + "block_sizes": [ + 4, + 1024 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 16 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "first" + ], + "num_warps": 16, + "num_stages": 3, + "indexing": [ + "tensor_descriptor", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_24": { + "block_sizes": [ + 16, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_24": { + "block_sizes": [ + 1, + 2048 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 32 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "last" + ], + "num_warps": 32, + "num_stages": 5, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_24": { + "block_sizes": [ + 1, + 1024 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 64 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "" + ], + "num_warps": 2, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_24": { + "block_sizes": [ + 8, + 512 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "first", + "first" + ], + "num_warps": 32, + "num_stages": 3, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_32": { + "block_sizes": [ + 32, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_32": { + "block_sizes": [ + 4, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_32": { + "block_sizes": [ + 4, + 4096 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 16 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "last", + "" + ], + "num_warps": 32, + "num_stages": 5, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_32": { + "block_sizes": [ + 4, + 1024 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 2, + "num_stages": 3, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_32": { + "block_sizes": [ + 2, + 2048 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_32": { + "block_sizes": [ + 1, + 512 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 64 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "" + ], + "num_warps": 4, + "num_stages": 3, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_40": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_40": { + "block_sizes": [ + 1, + 4096 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 32 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "last" + ], + "num_warps": 8, + "num_stages": 4, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_40": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_40": { + "block_sizes": [ + 2, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 16 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "last" + ], + "num_warps": 2, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_40": { + "block_sizes": [ + 16, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_40": { + "block_sizes": [ + 4, + 1024 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 1 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "last" + ], + "num_warps": 16, + "num_stages": 5, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "persistent_interleaved", + "num_sm_multiplier": 32, + "maxnreg": 32 + }, + "intermediate_2048_numtokens_48": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_48": { + "block_sizes": [ + 16, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_48": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_48": { + "block_sizes": [ + 1, + 4096 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "last" + ], + "num_warps": 4, + "num_stages": 2, + "indexing": [ + "pointer", + "tensor_descriptor", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_48": { + "block_sizes": [ + 16, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_48": { + "block_sizes": [ + 32, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_56": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_56": { + "block_sizes": [ + 1, + 2048 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_56": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_56": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_56": { + "block_sizes": [ + 1, + 8192 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_56": { + "block_sizes": [ + 2, + 4096 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 32 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "first", + "" + ], + "num_warps": 2, + "num_stages": 4, + "indexing": [ + "tensor_descriptor", + "pointer", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_64": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_64": { + "block_sizes": [ + 1, + 2048 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_64": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_64": { + "block_sizes": [ + 64, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_64": { + "block_sizes": [ + 1, + 8192 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_64": { + "block_sizes": [ + 16, + 512 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 16 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_72": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_72": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_72": { + "block_sizes": [ + 1, + 1024 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "first" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_72": { + "block_sizes": [ + 64, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_72": { + "block_sizes": [ + 4, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "last", + "last" + ], + "num_warps": 32, + "num_stages": 5, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_72": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "last" + ], + "num_warps": 8, + "num_stages": 3, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_80": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_80": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_80": { + "block_sizes": [ + 64, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_80": { + "block_sizes": [ + 4, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "last", + "last" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_80": { + "block_sizes": [ + 1, + 4096 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 32 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "last", + "" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "pointer", + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_80": { + "block_sizes": [ + 2, + 2048 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "last", + "" + ], + "num_warps": 4, + "num_stages": 6, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_88": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_88": { + "block_sizes": [ + 8, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_88": { + "block_sizes": [ + 64, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_88": { + "block_sizes": [ + 64, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_88": { + "block_sizes": [ + 16, + 2048 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "first", + "" + ], + "num_warps": 32, + "num_stages": 2, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_88": { + "block_sizes": [ + 4, + 512 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 16 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 5, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_96": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_96": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_96": { + "block_sizes": [ + 1, + 4096 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 64 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "last", + "last" + ], + "num_warps": 4, + "num_stages": 6, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_96": { + "block_sizes": [ + 64, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_96": { + "block_sizes": [ + 1, + 2048 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 64 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "" + ], + "num_warps": 32, + "num_stages": 2, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_96": { + "block_sizes": [ + 4, + 4096 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 32 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "first", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_104": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_104": { + "block_sizes": [ + 8, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_104": { + "block_sizes": [ + 64, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_104": { + "block_sizes": [ + 64, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_104": { + "block_sizes": [ + 2, + 8192 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "first" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_104": { + "block_sizes": [ + 8, + 512 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "last", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_112": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_112": { + "block_sizes": [ + 2, + 2048 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_112": { + "block_sizes": [ + 64, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_112": { + "block_sizes": [ + 4, + 512 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 16 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "last", + "last" + ], + "num_warps": 16, + "num_stages": 3, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_112": { + "block_sizes": [ + 4, + 1024 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "last" + ], + "num_warps": 32, + "num_stages": 3, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_112": { + "block_sizes": [ + 64, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_120": { + "block_sizes": [ + 8, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 16 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "last", + "" + ], + "num_warps": 4, + "num_stages": 6, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_120": { + "block_sizes": [ + 2, + 2048 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_120": { + "block_sizes": [ + 64, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_120": { + "block_sizes": [ + 64, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_120": { + "block_sizes": [ + 1, + 1024 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "last", + "last" + ], + "num_warps": 1, + "num_stages": 6, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_120": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "first", + "last" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_128": { + "block_sizes": [ + 128, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_128": { + "block_sizes": [ + 2, + 2048 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_128": { + "block_sizes": [ + 128, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_128": { + "block_sizes": [ + 128, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_128": { + "block_sizes": [ + 2, + 1024 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 64 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "last" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_128": { + "block_sizes": [ + 4, + 4096 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_136": { + "block_sizes": [ + 128, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_136": { + "block_sizes": [ + 64, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_136": { + "block_sizes": [ + 128, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_136": { + "block_sizes": [ + 2, + 512 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 32 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "" + ], + "num_warps": 32, + "num_stages": 7, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_136": { + "block_sizes": [ + 4, + 8192 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 8 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "last" + ], + "num_warps": 8, + "num_stages": 4, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_136": { + "block_sizes": [ + 4, + 16384 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 8 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "last" + ], + "num_warps": 32, + "num_stages": 6, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_144": { + "block_sizes": [ + 1, + 1024 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 16 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "last", + "first" + ], + "num_warps": 16, + "num_stages": 7, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_144": { + "block_sizes": [ + 64, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_144": { + "block_sizes": [ + 128, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_144": { + "block_sizes": [ + 1, + 2048 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "first" + ], + "num_warps": 1, + "num_stages": 4, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_144": { + "block_sizes": [ + 256, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "first" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_144": { + "block_sizes": [ + 64, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 16 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "last" + ], + "num_warps": 16, + "num_stages": 8, + "indexing": [ + "pointer", + "tensor_descriptor", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_152": { + "block_sizes": [ + 4, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 8 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "last", + "first" + ], + "num_warps": 8, + "num_stages": 7, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_152": { + "block_sizes": [ + 64, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_152": { + "block_sizes": [ + 128, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_152": { + "block_sizes": [ + 64, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 64 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "first" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_152": { + "block_sizes": [ + 1, + 4096 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 32 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "first" + ], + "num_warps": 4, + "num_stages": 6, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_152": { + "block_sizes": [ + 2, + 16384 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 64 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "" + ], + "num_warps": 16, + "num_stages": 6, + "indexing": [ + "tensor_descriptor", + "pointer", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_160": { + "block_sizes": [ + 4, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 64 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "last" + ], + "num_warps": 1, + "num_stages": 3, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_160": { + "block_sizes": [ + 64, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_160": { + "block_sizes": [ + 128, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_160": { + "block_sizes": [ + 64, + 512 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 4, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_160": { + "block_sizes": [ + 128, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "first", + "last" + ], + "num_warps": 32, + "num_stages": 8, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_160": { + "block_sizes": [ + 1, + 1024 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 8 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "first" + ], + "num_warps": 8, + "num_stages": 8, + "indexing": [ + "tensor_descriptor", + "pointer", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_168": { + "block_sizes": [ + 4, + 2048 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 32 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "last" + ], + "num_warps": 32, + "num_stages": 8, + "indexing": [ + "pointer", + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_168": { + "block_sizes": [ + 8, + 512 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_168": { + "block_sizes": [ + 128, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_168": { + "block_sizes": [ + 128, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_168": { + "block_sizes": [ + 64, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_168": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 64 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "last", + "first" + ], + "num_warps": 2, + "num_stages": 6, + "indexing": [ + "tensor_descriptor", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_176": { + "block_sizes": [ + 128, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_176": { + "block_sizes": [ + 16, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_176": { + "block_sizes": [ + 128, + 4 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "first" + ], + "num_warps": 4, + "num_stages": 6, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_176": { + "block_sizes": [ + 1, + 8192 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 16 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "first" + ], + "num_warps": 16, + "num_stages": 5, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_176": { + "block_sizes": [ + 64, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_176": { + "block_sizes": [ + 128, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_184": { + "block_sizes": [ + 2, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 6, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_184": { + "block_sizes": [ + 64, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_184": { + "block_sizes": [ + 128, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_184": { + "block_sizes": [ + 64, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 32 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "last", + "last" + ], + "num_warps": 32, + "num_stages": 8, + "indexing": [ + "tensor_descriptor", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_184": { + "block_sizes": [ + 64, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_184": { + "block_sizes": [ + 64, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 64 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "last" + ], + "num_warps": 8, + "num_stages": 3, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_192": { + "block_sizes": [ + 128, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_192": { + "block_sizes": [ + 64, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_192": { + "block_sizes": [ + 8, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 8 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "first", + "first" + ], + "num_warps": 16, + "num_stages": 3, + "indexing": [ + "tensor_descriptor", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_192": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_192": { + "block_sizes": [ + 16, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "first" + ], + "num_warps": 32, + "num_stages": 7, + "indexing": [ + "tensor_descriptor", + "pointer", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_192": { + "block_sizes": [ + 128, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_200": { + "block_sizes": [ + 128, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_200": { + "block_sizes": [ + 8, + 512 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_200": { + "block_sizes": [ + 4, + 512 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 16 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "first" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_200": { + "block_sizes": [ + 128, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_200": { + "block_sizes": [ + 1, + 1024 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "first", + "first" + ], + "num_warps": 32, + "num_stages": 3, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_200": { + "block_sizes": [ + 16, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 64 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "first" + ], + "num_warps": 32, + "num_stages": 6, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_208": { + "block_sizes": [ + 128, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_208": { + "block_sizes": [ + 256, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 8 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "last" + ], + "num_warps": 32, + "num_stages": 8, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_208": { + "block_sizes": [ + 256, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "first", + "last" + ], + "num_warps": 4, + "num_stages": 2, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_208": { + "block_sizes": [ + 128, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_208": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 64 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "first", + "first" + ], + "num_warps": 8, + "num_stages": 5, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_208": { + "block_sizes": [ + 128, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_216": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "last" + ], + "num_warps": 8, + "num_stages": 4, + "indexing": [ + "pointer", + "tensor_descriptor", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_216": { + "block_sizes": [ + 4, + 1024 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 8 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "first", + "first" + ], + "num_warps": 8, + "num_stages": 7, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_216": { + "block_sizes": [ + 128, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_216": { + "block_sizes": [ + 1, + 8192 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 2, + "num_stages": 7, + "indexing": [ + "tensor_descriptor", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_216": { + "block_sizes": [ + 1, + 16384 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 32 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "last" + ], + "num_warps": 4, + "num_stages": 4, + "indexing": [ + "pointer", + "tensor_descriptor", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_216": { + "block_sizes": [ + 128, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_224": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 32 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "first", + "first" + ], + "num_warps": 16, + "num_stages": 5, + "indexing": [ + "pointer", + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_224": { + "block_sizes": [ + 4, + 2048 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_224": { + "block_sizes": [ + 128, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_224": { + "block_sizes": [ + 1, + 4096 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 32 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "first" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_224": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 32 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "last" + ], + "num_warps": 2, + "num_stages": 3, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_224": { + "block_sizes": [ + 1, + 2048 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 32 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 8, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_232": { + "block_sizes": [ + 1, + 1024 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 64 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_232": { + "block_sizes": [ + 256, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "first", + "last" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_232": { + "block_sizes": [ + 128, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_232": { + "block_sizes": [ + 256, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "first" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_232": { + "block_sizes": [ + 4, + 1024 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 32 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "last", + "first" + ], + "num_warps": 1, + "num_stages": 8, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_232": { + "block_sizes": [ + 8, + 4096 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_240": { + "block_sizes": [ + 64, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "last" + ], + "num_warps": 4, + "num_stages": 5, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_240": { + "block_sizes": [ + 4, + 2048 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_240": { + "block_sizes": [ + 4, + 2048 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 8, + "num_stages": 6, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_240": { + "block_sizes": [ + 1, + 1024 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "first", + "last" + ], + "num_warps": 4, + "num_stages": 8, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_240": { + "block_sizes": [ + 4, + 2048 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "last", + "first" + ], + "num_warps": 32, + "num_stages": 7, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_240": { + "block_sizes": [ + 1, + 8192 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 8 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "last" + ], + "num_warps": 32, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_248": { + "block_sizes": [ + 128, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_248": { + "block_sizes": [ + 4, + 2048 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_248": { + "block_sizes": [ + 128, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_248": { + "block_sizes": [ + 256, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 16 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 4, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_248": { + "block_sizes": [ + 4, + 8192 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_248": { + "block_sizes": [ + 8, + 4096 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_272": { + "block_sizes": [ + 256, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_272": { + "block_sizes": [ + 128, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_272": { + "block_sizes": [ + 1, + 4096 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 64 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "first" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_272": { + "block_sizes": [ + 8, + 512 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 16 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "last" + ], + "num_warps": 2, + "num_stages": 6, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_272": { + "block_sizes": [ + 8, + 1024 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 8 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "last" + ], + "num_warps": 4, + "num_stages": 8, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_272": { + "block_sizes": [ + 512, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 32 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 8, + "num_stages": 8, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_288": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "last", + "last" + ], + "num_warps": 32, + "num_stages": 4, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_288": { + "block_sizes": [ + 8, + 512 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_288": { + "block_sizes": [ + 512, + 4 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 32 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "first" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_288": { + "block_sizes": [ + 1, + 1024 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 64 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "first", + "" + ], + "num_warps": 2, + "num_stages": 3, + "indexing": [ + "pointer", + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_288": { + "block_sizes": [ + 1, + 8192 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "last" + ], + "num_warps": 8, + "num_stages": 3, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_288": { + "block_sizes": [ + 1, + 2048 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 8 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 1, + "num_stages": 5, + "indexing": [ + "pointer", + "tensor_descriptor", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_304": { + "block_sizes": [ + 256, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_304": { + "block_sizes": [ + 1, + 1024 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 2 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 2 + ], + "range_multi_buffers": [ + false + ], + "range_flattens": [ + true + ], + "load_eviction_policies": [ + "last", + "", + "last" + ], + "num_warps": 16, + "num_stages": 3, + "indexing": [ + "tensor_descriptor", + "pointer", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "persistent_blocked", + "num_sm_multiplier": 2, + "maxnreg": 64 + }, + "intermediate_4096_numtokens_304": { + "block_sizes": [ + 16, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 32 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "last" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_304": { + "block_sizes": [ + 1, + 1024 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "last" + ], + "num_warps": 32, + "num_stages": 4, + "indexing": [ + "tensor_descriptor", + "pointer", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_304": { + "block_sizes": [ + 128, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_304": { + "block_sizes": [ + 4, + 512 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 64 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "" + ], + "num_warps": 16, + "num_stages": 6, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_320": { + "block_sizes": [ + 1, + 512 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 2, + "num_stages": 3, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_320": { + "block_sizes": [ + 128, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_320": { + "block_sizes": [ + 1, + 4096 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 64 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "last", + "last" + ], + "num_warps": 8, + "num_stages": 7, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_320": { + "block_sizes": [ + 1, + 8192 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "last", + "" + ], + "num_warps": 16, + "num_stages": 6, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_320": { + "block_sizes": [ + 1, + 4096 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 16 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "first", + "" + ], + "num_warps": 2, + "num_stages": 3, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_320": { + "block_sizes": [ + 8, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "first" + ], + "num_warps": 32, + "num_stages": 3, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_336": { + "block_sizes": [ + 256, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_336": { + "block_sizes": [ + 16, + 512 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_336": { + "block_sizes": [ + 16, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 8 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "first" + ], + "num_warps": 2, + "num_stages": 3, + "indexing": [ + "tensor_descriptor", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_336": { + "block_sizes": [ + 256, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_336": { + "block_sizes": [ + 4, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 64 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "" + ], + "num_warps": 4, + "num_stages": 7, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_336": { + "block_sizes": [ + 256, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "last", + "last" + ], + "num_warps": 16, + "num_stages": 8, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_352": { + "block_sizes": [ + 512, + 1 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 8 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "first" + ], + "num_warps": 1, + "num_stages": 4, + "indexing": [ + "pointer", + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_352": { + "block_sizes": [ + 4, + 2048 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 16 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 32, + "num_stages": 7, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_352": { + "block_sizes": [ + 512, + 4 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "last", + "last" + ], + "num_warps": 16, + "num_stages": 3, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_352": { + "block_sizes": [ + 1, + 8192 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "first" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_352": { + "block_sizes": [ + 16, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_352": { + "block_sizes": [ + 32, + 512 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_368": { + "block_sizes": [ + 4, + 2048 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 16 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "first", + "first" + ], + "num_warps": 8, + "num_stages": 4, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_368": { + "block_sizes": [ + 128, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 16 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "first", + "" + ], + "num_warps": 4, + "num_stages": 4, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_368": { + "block_sizes": [ + 64, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "last", + "last" + ], + "num_warps": 32, + "num_stages": 6, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_368": { + "block_sizes": [ + 2, + 2048 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "last" + ], + "num_warps": 1, + "num_stages": 4, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_368": { + "block_sizes": [ + 128, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_368": { + "block_sizes": [ + 32, + 512 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_384": { + "block_sizes": [ + 256, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_384": { + "block_sizes": [ + 512, + 2 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 8 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "first", + "last" + ], + "num_warps": 8, + "num_stages": 3, + "indexing": [ + "pointer", + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_384": { + "block_sizes": [ + 4, + 2048 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 8, + "num_stages": 5, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_384": { + "block_sizes": [ + 128, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "first" + ], + "num_warps": 4, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_384": { + "block_sizes": [ + 1, + 8192 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 64 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "last", + "first" + ], + "num_warps": 4, + "num_stages": 6, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_384": { + "block_sizes": [ + 128, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 8 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "first" + ], + "num_warps": 32, + "num_stages": 3, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_400": { + "block_sizes": [ + 1, + 512 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_400": { + "block_sizes": [ + 16, + 512 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_400": { + "block_sizes": [ + 1, + 1024 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 16 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "first" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_400": { + "block_sizes": [ + 1, + 4096 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 64 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "last" + ], + "num_warps": 8, + "num_stages": 4, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_400": { + "block_sizes": [ + 2, + 512 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 64 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "last" + ], + "num_warps": 4, + "num_stages": 3, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_400": { + "block_sizes": [ + 4, + 1024 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 8 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "last", + "first" + ], + "num_warps": 8, + "num_stages": 3, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_416": { + "block_sizes": [ + 256, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_416": { + "block_sizes": [ + 32, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_416": { + "block_sizes": [ + 512, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 32 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 8, + "num_stages": 7, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_416": { + "block_sizes": [ + 1, + 2048 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 32 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "first" + ], + "num_warps": 8, + "num_stages": 8, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_416": { + "block_sizes": [ + 256, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 64 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "last" + ], + "num_warps": 4, + "num_stages": 7, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_416": { + "block_sizes": [ + 128, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 8 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "first", + "first" + ], + "num_warps": 16, + "num_stages": 3, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_432": { + "block_sizes": [ + 256, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_432": { + "block_sizes": [ + 8, + 2048 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_432": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "last", + "last" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_432": { + "block_sizes": [ + 256, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 16 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 32, + "num_stages": 5, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_432": { + "block_sizes": [ + 1, + 4096 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 64 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "first", + "first" + ], + "num_warps": 1, + "num_stages": 8, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_432": { + "block_sizes": [ + 512, + 4 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "last" + ], + "num_warps": 1, + "num_stages": 7, + "indexing": [ + "pointer", + "tensor_descriptor", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_448": { + "block_sizes": [ + 256, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_448": { + "block_sizes": [ + 1, + 4096 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 64 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 2, + "num_stages": 6, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_448": { + "block_sizes": [ + 8, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "last" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_448": { + "block_sizes": [ + 128, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 16 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "first", + "last" + ], + "num_warps": 32, + "num_stages": 3, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_448": { + "block_sizes": [ + 1, + 512 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_448": { + "block_sizes": [ + 64, + 512 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 16 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "last", + "last" + ], + "num_warps": 32, + "num_stages": 8, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_464": { + "block_sizes": [ + 256, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_464": { + "block_sizes": [ + 8, + 2048 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_464": { + "block_sizes": [ + 1, + 4096 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "last", + "last" + ], + "num_warps": 1, + "num_stages": 6, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_464": { + "block_sizes": [ + 256, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_464": { + "block_sizes": [ + 1, + 16384 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 64 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 6, + "indexing": [ + "tensor_descriptor", + "pointer", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_464": { + "block_sizes": [ + 64, + 512 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 32, + "num_stages": 7, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_480": { + "block_sizes": [ + 16, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "first", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_480": { + "block_sizes": [ + 128, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "" + ], + "num_warps": 8, + "num_stages": 5, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_480": { + "block_sizes": [ + 64, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 8 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "first" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_480": { + "block_sizes": [ + 1, + 1024 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "first", + "" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_480": { + "block_sizes": [ + 1, + 1024 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_480": { + "block_sizes": [ + 1, + 16384 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "first" + ], + "num_warps": 32, + "num_stages": 3, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_496": { + "block_sizes": [ + 1, + 2048 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 16, + "num_stages": 7, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_496": { + "block_sizes": [ + 8, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "last", + "last" + ], + "num_warps": 4, + "num_stages": 8, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_496": { + "block_sizes": [ + 256, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_496": { + "block_sizes": [ + 256, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_496": { + "block_sizes": [ + 1, + 2048 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "last", + "last" + ], + "num_warps": 8, + "num_stages": 4, + "indexing": [ + "tensor_descriptor", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_496": { + "block_sizes": [ + 4, + 2048 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 64 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "last", + "first" + ], + "num_warps": 4, + "num_stages": 4, + "indexing": [ + "pointer", + "tensor_descriptor", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_512": { + "block_sizes": [ + 512, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_512": { + "block_sizes": [ + 8, + 2048 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_512": { + "block_sizes": [ + 8, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "last", + "last" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_512": { + "block_sizes": [ + 1, + 2048 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 64 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 4, + "num_stages": 4, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_512": { + "block_sizes": [ + 1, + 4096 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "first" + ], + "num_warps": 16, + "num_stages": 7, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_512": { + "block_sizes": [ + 128, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 32 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 2, + "num_stages": 7, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + } +} \ No newline at end of file diff --git a/vllm/kernels/helion/configs/silu_mul_fp8/nvidia_h200.json b/vllm/kernels/helion/configs/silu_mul_fp8/nvidia_h200.json new file mode 100644 index 000000000..c314eb2da --- /dev/null +++ b/vllm/kernels/helion/configs/silu_mul_fp8/nvidia_h200.json @@ -0,0 +1,13866 @@ +{ + "intermediate_2048_numtokens_256": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_256": { + "block_sizes": [ + 256, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "default": { + "block_sizes": [ + 1, + 512 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 8, + "num_stages": 2, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_256": { + "block_sizes": [ + 256, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_256": { + "block_sizes": [ + 8, + 4096 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_256": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_7688_numtokens_256": { + "block_sizes": [ + 32, + 512 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_256": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_1": { + "block_sizes": [ + 1, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_1": { + "block_sizes": [ + 1, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_1": { + "block_sizes": [ + 1, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_1": { + "block_sizes": [ + 1, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_1": { + "block_sizes": [ + 1, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_1": { + "block_sizes": [ + 1, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_2": { + "block_sizes": [ + 2, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_2": { + "block_sizes": [ + 2, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_2": { + "block_sizes": [ + 2, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_2": { + "block_sizes": [ + 2, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_2": { + "block_sizes": [ + 1, + 16384 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 16 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "first", + "last" + ], + "num_warps": 32, + "num_stages": 3, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "xyz" + }, + "intermediate_14336_numtokens_2": { + "block_sizes": [ + 2, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_4": { + "block_sizes": [ + 4, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_4": { + "block_sizes": [ + 4, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 64 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "first" + ], + "num_warps": 8, + "num_stages": 7, + "indexing": [ + "pointer", + "tensor_descriptor", + "tensor_descriptor", + "pointer" + ], + "pid_type": "xyz" + }, + "intermediate_4096_numtokens_4": { + "block_sizes": [ + 4, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_4": { + "block_sizes": [ + 1, + 2048 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 8, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_4": { + "block_sizes": [ + 4, + 2048 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "last", + "last" + ], + "num_warps": 8, + "num_stages": 6, + "indexing": [ + "pointer", + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "xyz" + }, + "intermediate_14336_numtokens_4": { + "block_sizes": [ + 4, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 64 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "xyz" + }, + "intermediate_2048_numtokens_8": { + "block_sizes": [ + 8, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_8": { + "block_sizes": [ + 4, + 2048 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "last", + "" + ], + "num_warps": 16, + "num_stages": 5, + "indexing": [ + "pointer", + "tensor_descriptor", + "tensor_descriptor", + "pointer" + ], + "pid_type": "xyz" + }, + "intermediate_4096_numtokens_8": { + "block_sizes": [ + 8, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_8": { + "block_sizes": [ + 2, + 1024 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 16 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "last", + "" + ], + "num_warps": 1, + "num_stages": 8, + "indexing": [ + "tensor_descriptor", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_8": { + "block_sizes": [ + 1, + 1024 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "first" + ], + "num_warps": 2, + "num_stages": 5, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_8": { + "block_sizes": [ + 8, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_16": { + "block_sizes": [ + 8, + 512 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 64 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "last", + "first" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "xyz" + }, + "intermediate_2880_numtokens_16": { + "block_sizes": [ + 2, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_16": { + "block_sizes": [ + 16, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 16 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "last" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_16": { + "block_sizes": [ + 16, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_16": { + "block_sizes": [ + 1, + 2048 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_16": { + "block_sizes": [ + 2, + 256 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 64 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "last", + "last" + ], + "num_warps": 16, + "num_stages": 7, + "indexing": [ + "tensor_descriptor", + "pointer", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_24": { + "block_sizes": [ + 1, + 1024 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 64 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "first" + ], + "num_warps": 4, + "num_stages": 8, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_24": { + "block_sizes": [ + 4, + 1024 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 16 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "first" + ], + "num_warps": 16, + "num_stages": 3, + "indexing": [ + "tensor_descriptor", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_24": { + "block_sizes": [ + 16, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_24": { + "block_sizes": [ + 1, + 2048 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 32 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "last" + ], + "num_warps": 32, + "num_stages": 5, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_24": { + "block_sizes": [ + 1, + 1024 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 64 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "" + ], + "num_warps": 2, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_24": { + "block_sizes": [ + 8, + 512 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "first", + "first" + ], + "num_warps": 32, + "num_stages": 3, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_32": { + "block_sizes": [ + 32, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_32": { + "block_sizes": [ + 4, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_32": { + "block_sizes": [ + 4, + 4096 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 16 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "last", + "" + ], + "num_warps": 32, + "num_stages": 5, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_32": { + "block_sizes": [ + 4, + 1024 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 2, + "num_stages": 3, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_32": { + "block_sizes": [ + 2, + 2048 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_32": { + "block_sizes": [ + 1, + 512 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 64 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "" + ], + "num_warps": 4, + "num_stages": 3, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_40": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_40": { + "block_sizes": [ + 1, + 4096 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 32 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "last" + ], + "num_warps": 8, + "num_stages": 4, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_40": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_40": { + "block_sizes": [ + 2, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 16 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "last" + ], + "num_warps": 2, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_40": { + "block_sizes": [ + 16, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_40": { + "block_sizes": [ + 4, + 1024 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 1 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "last" + ], + "num_warps": 16, + "num_stages": 5, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "persistent_interleaved", + "num_sm_multiplier": 32, + "maxnreg": 32 + }, + "intermediate_2048_numtokens_48": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_48": { + "block_sizes": [ + 16, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_48": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_48": { + "block_sizes": [ + 1, + 4096 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "last" + ], + "num_warps": 4, + "num_stages": 2, + "indexing": [ + "pointer", + "tensor_descriptor", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_48": { + "block_sizes": [ + 16, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_48": { + "block_sizes": [ + 32, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_56": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_56": { + "block_sizes": [ + 1, + 2048 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_56": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_56": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_56": { + "block_sizes": [ + 1, + 8192 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_56": { + "block_sizes": [ + 2, + 4096 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 32 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "first", + "" + ], + "num_warps": 2, + "num_stages": 4, + "indexing": [ + "tensor_descriptor", + "pointer", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_64": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_64": { + "block_sizes": [ + 1, + 2048 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_64": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_64": { + "block_sizes": [ + 64, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_64": { + "block_sizes": [ + 1, + 8192 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_64": { + "block_sizes": [ + 16, + 512 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 16 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_72": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_72": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_72": { + "block_sizes": [ + 1, + 1024 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "first" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_72": { + "block_sizes": [ + 64, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_72": { + "block_sizes": [ + 4, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "last", + "last" + ], + "num_warps": 32, + "num_stages": 5, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_72": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "last" + ], + "num_warps": 8, + "num_stages": 3, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_80": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_80": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_80": { + "block_sizes": [ + 64, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_80": { + "block_sizes": [ + 4, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "last", + "last" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_80": { + "block_sizes": [ + 1, + 4096 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 32 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "last", + "" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "pointer", + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_80": { + "block_sizes": [ + 2, + 2048 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "last", + "" + ], + "num_warps": 4, + "num_stages": 6, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_88": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_88": { + "block_sizes": [ + 8, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_88": { + "block_sizes": [ + 64, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_88": { + "block_sizes": [ + 64, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_88": { + "block_sizes": [ + 16, + 2048 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "first", + "" + ], + "num_warps": 32, + "num_stages": 2, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_88": { + "block_sizes": [ + 4, + 512 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 16 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 5, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_96": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_96": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_96": { + "block_sizes": [ + 1, + 4096 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 64 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "last", + "last" + ], + "num_warps": 4, + "num_stages": 6, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_96": { + "block_sizes": [ + 64, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_96": { + "block_sizes": [ + 1, + 2048 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 64 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "" + ], + "num_warps": 32, + "num_stages": 2, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_96": { + "block_sizes": [ + 4, + 4096 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 32 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "first", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_104": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_104": { + "block_sizes": [ + 8, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_104": { + "block_sizes": [ + 64, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_104": { + "block_sizes": [ + 64, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_104": { + "block_sizes": [ + 2, + 8192 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "first" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_104": { + "block_sizes": [ + 8, + 512 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "last", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_112": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_112": { + "block_sizes": [ + 2, + 2048 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_112": { + "block_sizes": [ + 64, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_112": { + "block_sizes": [ + 4, + 512 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 16 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "last", + "last" + ], + "num_warps": 16, + "num_stages": 3, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_112": { + "block_sizes": [ + 4, + 1024 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "last" + ], + "num_warps": 32, + "num_stages": 3, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_112": { + "block_sizes": [ + 64, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_120": { + "block_sizes": [ + 8, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 16 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "last", + "" + ], + "num_warps": 4, + "num_stages": 6, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_120": { + "block_sizes": [ + 2, + 2048 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_120": { + "block_sizes": [ + 64, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_120": { + "block_sizes": [ + 64, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_120": { + "block_sizes": [ + 1, + 1024 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "last", + "last" + ], + "num_warps": 1, + "num_stages": 6, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_120": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "first", + "last" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_128": { + "block_sizes": [ + 128, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_128": { + "block_sizes": [ + 2, + 2048 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_128": { + "block_sizes": [ + 128, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_128": { + "block_sizes": [ + 128, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_128": { + "block_sizes": [ + 2, + 1024 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 64 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "last" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_128": { + "block_sizes": [ + 4, + 4096 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_136": { + "block_sizes": [ + 128, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_136": { + "block_sizes": [ + 64, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_136": { + "block_sizes": [ + 128, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_136": { + "block_sizes": [ + 2, + 512 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 32 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "" + ], + "num_warps": 32, + "num_stages": 7, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_136": { + "block_sizes": [ + 4, + 8192 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 8 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "last" + ], + "num_warps": 8, + "num_stages": 4, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_136": { + "block_sizes": [ + 4, + 16384 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 8 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "last" + ], + "num_warps": 32, + "num_stages": 6, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_144": { + "block_sizes": [ + 1, + 1024 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 16 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "last", + "first" + ], + "num_warps": 16, + "num_stages": 7, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_144": { + "block_sizes": [ + 64, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_144": { + "block_sizes": [ + 128, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_144": { + "block_sizes": [ + 1, + 2048 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "first" + ], + "num_warps": 1, + "num_stages": 4, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_144": { + "block_sizes": [ + 256, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "first" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_144": { + "block_sizes": [ + 64, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 16 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "last" + ], + "num_warps": 16, + "num_stages": 8, + "indexing": [ + "pointer", + "tensor_descriptor", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_152": { + "block_sizes": [ + 4, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 8 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "last", + "first" + ], + "num_warps": 8, + "num_stages": 7, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_152": { + "block_sizes": [ + 64, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_152": { + "block_sizes": [ + 128, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_152": { + "block_sizes": [ + 64, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 64 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "first" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_152": { + "block_sizes": [ + 1, + 4096 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 32 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "first" + ], + "num_warps": 4, + "num_stages": 6, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_152": { + "block_sizes": [ + 2, + 16384 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 64 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "" + ], + "num_warps": 16, + "num_stages": 6, + "indexing": [ + "tensor_descriptor", + "pointer", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_160": { + "block_sizes": [ + 4, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 64 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "last" + ], + "num_warps": 1, + "num_stages": 3, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_160": { + "block_sizes": [ + 64, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_160": { + "block_sizes": [ + 128, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_160": { + "block_sizes": [ + 64, + 512 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 4, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_160": { + "block_sizes": [ + 128, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "first", + "last" + ], + "num_warps": 32, + "num_stages": 8, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_160": { + "block_sizes": [ + 1, + 1024 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 8 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "first" + ], + "num_warps": 8, + "num_stages": 8, + "indexing": [ + "tensor_descriptor", + "pointer", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_168": { + "block_sizes": [ + 4, + 2048 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 32 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "last" + ], + "num_warps": 32, + "num_stages": 8, + "indexing": [ + "pointer", + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_168": { + "block_sizes": [ + 8, + 512 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_168": { + "block_sizes": [ + 128, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_168": { + "block_sizes": [ + 128, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_168": { + "block_sizes": [ + 64, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_168": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 64 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "last", + "first" + ], + "num_warps": 2, + "num_stages": 6, + "indexing": [ + "tensor_descriptor", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_176": { + "block_sizes": [ + 128, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_176": { + "block_sizes": [ + 16, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_176": { + "block_sizes": [ + 128, + 4 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "first" + ], + "num_warps": 4, + "num_stages": 6, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_176": { + "block_sizes": [ + 1, + 8192 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 16 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "first" + ], + "num_warps": 16, + "num_stages": 5, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_176": { + "block_sizes": [ + 64, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_176": { + "block_sizes": [ + 128, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_184": { + "block_sizes": [ + 2, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 16, + "num_stages": 6, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_184": { + "block_sizes": [ + 64, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_184": { + "block_sizes": [ + 128, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_184": { + "block_sizes": [ + 64, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 32 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "last", + "last" + ], + "num_warps": 32, + "num_stages": 8, + "indexing": [ + "tensor_descriptor", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_184": { + "block_sizes": [ + 64, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_184": { + "block_sizes": [ + 64, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 64 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "last" + ], + "num_warps": 8, + "num_stages": 3, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_192": { + "block_sizes": [ + 128, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_192": { + "block_sizes": [ + 64, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_192": { + "block_sizes": [ + 8, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 8 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "first", + "first" + ], + "num_warps": 16, + "num_stages": 3, + "indexing": [ + "tensor_descriptor", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_192": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_192": { + "block_sizes": [ + 16, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "first" + ], + "num_warps": 32, + "num_stages": 7, + "indexing": [ + "tensor_descriptor", + "pointer", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_192": { + "block_sizes": [ + 128, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_200": { + "block_sizes": [ + 128, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_200": { + "block_sizes": [ + 8, + 512 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_200": { + "block_sizes": [ + 4, + 512 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 16 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "first" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_200": { + "block_sizes": [ + 128, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_200": { + "block_sizes": [ + 1, + 1024 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "first", + "first" + ], + "num_warps": 32, + "num_stages": 3, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_200": { + "block_sizes": [ + 16, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 64 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "first" + ], + "num_warps": 32, + "num_stages": 6, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_208": { + "block_sizes": [ + 128, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_208": { + "block_sizes": [ + 256, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 8 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "last" + ], + "num_warps": 32, + "num_stages": 8, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_208": { + "block_sizes": [ + 256, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "first", + "last" + ], + "num_warps": 4, + "num_stages": 2, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_208": { + "block_sizes": [ + 128, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_208": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 64 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "first", + "first" + ], + "num_warps": 8, + "num_stages": 5, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_208": { + "block_sizes": [ + 128, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_216": { + "block_sizes": [ + 32, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "last" + ], + "num_warps": 8, + "num_stages": 4, + "indexing": [ + "pointer", + "tensor_descriptor", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_216": { + "block_sizes": [ + 4, + 1024 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 8 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "first", + "first" + ], + "num_warps": 8, + "num_stages": 7, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_216": { + "block_sizes": [ + 128, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_216": { + "block_sizes": [ + 1, + 8192 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 2, + "num_stages": 7, + "indexing": [ + "tensor_descriptor", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_216": { + "block_sizes": [ + 1, + 16384 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 32 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "last" + ], + "num_warps": 4, + "num_stages": 4, + "indexing": [ + "pointer", + "tensor_descriptor", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_216": { + "block_sizes": [ + 128, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_224": { + "block_sizes": [ + 32, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 32 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "first", + "first" + ], + "num_warps": 16, + "num_stages": 5, + "indexing": [ + "pointer", + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_224": { + "block_sizes": [ + 4, + 2048 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_224": { + "block_sizes": [ + 128, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_224": { + "block_sizes": [ + 1, + 4096 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 32 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "first" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_224": { + "block_sizes": [ + 32, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 32 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "last" + ], + "num_warps": 2, + "num_stages": 3, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_224": { + "block_sizes": [ + 1, + 2048 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 32 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 8, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_232": { + "block_sizes": [ + 1, + 1024 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 64 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_232": { + "block_sizes": [ + 256, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "first", + "last" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_232": { + "block_sizes": [ + 128, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_232": { + "block_sizes": [ + 256, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "first" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_232": { + "block_sizes": [ + 4, + 1024 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 32 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "last", + "first" + ], + "num_warps": 1, + "num_stages": 8, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_232": { + "block_sizes": [ + 8, + 4096 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_240": { + "block_sizes": [ + 64, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "last" + ], + "num_warps": 4, + "num_stages": 5, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_240": { + "block_sizes": [ + 4, + 2048 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_240": { + "block_sizes": [ + 4, + 2048 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 8, + "num_stages": 6, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_240": { + "block_sizes": [ + 1, + 1024 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "first", + "last" + ], + "num_warps": 4, + "num_stages": 8, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_240": { + "block_sizes": [ + 4, + 2048 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "last", + "first" + ], + "num_warps": 32, + "num_stages": 7, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_240": { + "block_sizes": [ + 1, + 8192 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 8 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "last" + ], + "num_warps": 32, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_248": { + "block_sizes": [ + 128, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_248": { + "block_sizes": [ + 4, + 2048 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_248": { + "block_sizes": [ + 128, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_248": { + "block_sizes": [ + 256, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 16 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 4, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_248": { + "block_sizes": [ + 4, + 8192 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_248": { + "block_sizes": [ + 8, + 4096 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_272": { + "block_sizes": [ + 256, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_272": { + "block_sizes": [ + 128, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_272": { + "block_sizes": [ + 1, + 4096 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 64 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "first" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_272": { + "block_sizes": [ + 8, + 512 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 16 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "last" + ], + "num_warps": 2, + "num_stages": 6, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_272": { + "block_sizes": [ + 8, + 1024 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 8 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "last" + ], + "num_warps": 4, + "num_stages": 8, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_272": { + "block_sizes": [ + 512, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 32 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 8, + "num_stages": 8, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_288": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "last", + "last" + ], + "num_warps": 32, + "num_stages": 4, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_288": { + "block_sizes": [ + 8, + 512 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "pointer", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_288": { + "block_sizes": [ + 512, + 4 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 32 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "first" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_288": { + "block_sizes": [ + 1, + 1024 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 64 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "first", + "" + ], + "num_warps": 2, + "num_stages": 3, + "indexing": [ + "pointer", + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_288": { + "block_sizes": [ + 1, + 8192 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "last" + ], + "num_warps": 8, + "num_stages": 3, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_288": { + "block_sizes": [ + 1, + 2048 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 8 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 1, + "num_stages": 5, + "indexing": [ + "pointer", + "tensor_descriptor", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_304": { + "block_sizes": [ + 256, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_304": { + "block_sizes": [ + 1, + 1024 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 2 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 2 + ], + "range_multi_buffers": [ + false + ], + "range_flattens": [ + true + ], + "load_eviction_policies": [ + "last", + "", + "last" + ], + "num_warps": 16, + "num_stages": 3, + "indexing": [ + "tensor_descriptor", + "pointer", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "persistent_blocked", + "num_sm_multiplier": 2, + "maxnreg": 64 + }, + "intermediate_4096_numtokens_304": { + "block_sizes": [ + 16, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 32 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "last" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_304": { + "block_sizes": [ + 1, + 1024 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "last" + ], + "num_warps": 32, + "num_stages": 4, + "indexing": [ + "tensor_descriptor", + "pointer", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_304": { + "block_sizes": [ + 128, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_304": { + "block_sizes": [ + 4, + 512 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 64 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "" + ], + "num_warps": 16, + "num_stages": 6, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_320": { + "block_sizes": [ + 1, + 512 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 2, + "num_stages": 3, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_320": { + "block_sizes": [ + 128, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_320": { + "block_sizes": [ + 1, + 4096 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 64 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "last", + "last" + ], + "num_warps": 8, + "num_stages": 7, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_320": { + "block_sizes": [ + 1, + 8192 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "last", + "" + ], + "num_warps": 16, + "num_stages": 6, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_320": { + "block_sizes": [ + 1, + 4096 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 16 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "first", + "" + ], + "num_warps": 2, + "num_stages": 3, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_320": { + "block_sizes": [ + 8, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "first" + ], + "num_warps": 32, + "num_stages": 3, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_336": { + "block_sizes": [ + 256, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_336": { + "block_sizes": [ + 16, + 512 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_336": { + "block_sizes": [ + 16, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 8 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "first" + ], + "num_warps": 2, + "num_stages": 3, + "indexing": [ + "tensor_descriptor", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_336": { + "block_sizes": [ + 256, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_336": { + "block_sizes": [ + 4, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 64 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "" + ], + "num_warps": 4, + "num_stages": 7, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_336": { + "block_sizes": [ + 256, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "last", + "last" + ], + "num_warps": 16, + "num_stages": 8, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_352": { + "block_sizes": [ + 512, + 1 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 8 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "first" + ], + "num_warps": 1, + "num_stages": 4, + "indexing": [ + "pointer", + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_352": { + "block_sizes": [ + 4, + 2048 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 16 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 32, + "num_stages": 7, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_352": { + "block_sizes": [ + 512, + 4 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "last", + "last" + ], + "num_warps": 16, + "num_stages": 3, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_352": { + "block_sizes": [ + 1, + 8192 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "first" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_352": { + "block_sizes": [ + 16, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_352": { + "block_sizes": [ + 32, + 512 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_368": { + "block_sizes": [ + 4, + 2048 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 16 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "first", + "first" + ], + "num_warps": 8, + "num_stages": 4, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_368": { + "block_sizes": [ + 128, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 16 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "first", + "" + ], + "num_warps": 4, + "num_stages": 4, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_368": { + "block_sizes": [ + 64, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "last", + "last" + ], + "num_warps": 32, + "num_stages": 6, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_368": { + "block_sizes": [ + 2, + 2048 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "last" + ], + "num_warps": 1, + "num_stages": 4, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_368": { + "block_sizes": [ + 128, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_368": { + "block_sizes": [ + 32, + 512 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_384": { + "block_sizes": [ + 256, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_384": { + "block_sizes": [ + 512, + 2 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 8 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "first", + "last" + ], + "num_warps": 8, + "num_stages": 3, + "indexing": [ + "pointer", + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_384": { + "block_sizes": [ + 4, + 2048 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 8, + "num_stages": 5, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_384": { + "block_sizes": [ + 128, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "first" + ], + "num_warps": 4, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_384": { + "block_sizes": [ + 1, + 8192 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 64 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "last", + "first" + ], + "num_warps": 4, + "num_stages": 6, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_384": { + "block_sizes": [ + 128, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 8 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "first" + ], + "num_warps": 32, + "num_stages": 3, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_400": { + "block_sizes": [ + 1, + 512 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_400": { + "block_sizes": [ + 16, + 512 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_400": { + "block_sizes": [ + 1, + 1024 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 16 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "first" + ], + "num_warps": 1, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_400": { + "block_sizes": [ + 1, + 4096 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 64 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "last" + ], + "num_warps": 8, + "num_stages": 4, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_400": { + "block_sizes": [ + 2, + 512 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 64 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "last" + ], + "num_warps": 4, + "num_stages": 3, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_400": { + "block_sizes": [ + 4, + 1024 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 8 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "last", + "first" + ], + "num_warps": 8, + "num_stages": 3, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_416": { + "block_sizes": [ + 256, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_416": { + "block_sizes": [ + 32, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_416": { + "block_sizes": [ + 512, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 32 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 8, + "num_stages": 7, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_416": { + "block_sizes": [ + 1, + 2048 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 32 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "first" + ], + "num_warps": 8, + "num_stages": 8, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_416": { + "block_sizes": [ + 256, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 64 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "last" + ], + "num_warps": 4, + "num_stages": 7, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_416": { + "block_sizes": [ + 128, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 8 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "first", + "first" + ], + "num_warps": 16, + "num_stages": 3, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_432": { + "block_sizes": [ + 256, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_432": { + "block_sizes": [ + 8, + 2048 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_432": { + "block_sizes": [ + 64, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "last", + "last" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_432": { + "block_sizes": [ + 256, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 16 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 32, + "num_stages": 5, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_432": { + "block_sizes": [ + 1, + 4096 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 64 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "first", + "first" + ], + "num_warps": 1, + "num_stages": 8, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_432": { + "block_sizes": [ + 512, + 4 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "last" + ], + "num_warps": 1, + "num_stages": 7, + "indexing": [ + "pointer", + "tensor_descriptor", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_448": { + "block_sizes": [ + 256, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_448": { + "block_sizes": [ + 1, + 4096 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 64 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "" + ], + "num_warps": 2, + "num_stages": 6, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_448": { + "block_sizes": [ + 8, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "last" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_448": { + "block_sizes": [ + 128, + 8 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 16 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "first", + "last" + ], + "num_warps": 32, + "num_stages": 3, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_448": { + "block_sizes": [ + 1, + 512 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_448": { + "block_sizes": [ + 64, + 512 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 16 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "last", + "last" + ], + "num_warps": 32, + "num_stages": 8, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_464": { + "block_sizes": [ + 256, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_464": { + "block_sizes": [ + 8, + 2048 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_464": { + "block_sizes": [ + 1, + 4096 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "last", + "last" + ], + "num_warps": 1, + "num_stages": 6, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_464": { + "block_sizes": [ + 256, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_464": { + "block_sizes": [ + 1, + 16384 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 64 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 6, + "indexing": [ + "tensor_descriptor", + "pointer", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_464": { + "block_sizes": [ + 64, + 512 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 32, + "num_stages": 7, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_480": { + "block_sizes": [ + 16, + 32 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "first", + "" + ], + "num_warps": 16, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_480": { + "block_sizes": [ + 128, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "", + "" + ], + "num_warps": 8, + "num_stages": 5, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_480": { + "block_sizes": [ + 64, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 8 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "first" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_480": { + "block_sizes": [ + 1, + 1024 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "first", + "" + ], + "num_warps": 1, + "num_stages": 2, + "indexing": [ + "tensor_descriptor", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_480": { + "block_sizes": [ + 1, + 1024 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 4, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_480": { + "block_sizes": [ + 1, + 16384 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "last", + "first" + ], + "num_warps": 32, + "num_stages": 3, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_496": { + "block_sizes": [ + 1, + 2048 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 16, + "num_stages": 7, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_496": { + "block_sizes": [ + 8, + 256 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "last", + "last" + ], + "num_warps": 4, + "num_stages": 8, + "indexing": [ + "pointer", + "pointer", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_496": { + "block_sizes": [ + 256, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_496": { + "block_sizes": [ + 256, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_496": { + "block_sizes": [ + 1, + 2048 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "last", + "last" + ], + "num_warps": 8, + "num_stages": 4, + "indexing": [ + "tensor_descriptor", + "pointer", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_496": { + "block_sizes": [ + 4, + 2048 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 64 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "last", + "first" + ], + "num_warps": 4, + "num_stages": 4, + "indexing": [ + "pointer", + "tensor_descriptor", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2048_numtokens_512": { + "block_sizes": [ + 512, + 16 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_2880_numtokens_512": { + "block_sizes": [ + 8, + 2048 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 8, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_4096_numtokens_512": { + "block_sizes": [ + 8, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "last", + "last", + "last" + ], + "num_warps": 16, + "num_stages": 2, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_8192_numtokens_512": { + "block_sizes": [ + 1, + 2048 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 64 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "last" + ], + "num_warps": 4, + "num_stages": 4, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat" + }, + "intermediate_11008_numtokens_512": { + "block_sizes": [ + 1, + 4096 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "first", + "", + "first" + ], + "num_warps": 16, + "num_stages": 7, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "pointer", + "tensor_descriptor" + ], + "pid_type": "flat" + }, + "intermediate_14336_numtokens_512": { + "block_sizes": [ + 128, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 32 + ], + "range_unroll_factors": [ + 0 + ], + "range_warp_specializes": [], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 2, + "num_stages": 7, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor" + ], + "pid_type": "flat" + } +} \ No newline at end of file