[Kernel] (2/N) Machete - Integrate into CompressedTensorsWNA16 and GPTQMarlin (#7701)

Co-authored-by: mgoin <michael@neuralmagic.com> Co-authored-by: Divakar Verma <137818590+divakar-amd@users.noreply.github.com> Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
2024-09-23 13:46:26 -04:00
parent ee5f34b1c2
commit 86e9c8df29
27 changed files with 1005 additions and 246 deletions
--- a/csrc/quantization/machete/generate.py
+++ b/csrc/quantization/machete/generate.py
@@ -157,7 +157,7 @@ TmaMI = MixedInputKernelScheduleType.TmaWarpSpecializedCooperativeMixedInput
 TmaCoop = EpilogueScheduleType.TmaWarpSpecializedCooperative


-@dataclass
+@dataclass(frozen=True)
 class ScheduleConfig:
    tile_shape_mn: Tuple[int, int]
    cluster_shape_mnk: Tuple[int, int, int]
@@ -328,56 +328,137 @@ def generate():
    # about how this works
    SCRIPT_DIR = os.path.dirname(__file__)

-    schedules = [
-        ScheduleConfig(
-            tile_shape_mn=tile_shape_mn,
-            cluster_shape_mnk=cluster_shape_mnk,
-            kernel_schedule=kernel_schedule,
-            epilogue_schedule=epilogue_schedule,
-            tile_scheduler=tile_scheduler,
-        ) for tile_shape_mn, cluster_shape_mnk in (
-            ((128, 16), (1, 1, 1)),
-            ((128, 32), (1, 1, 1)),
-            ((128, 64), (1, 1, 1)),
-            ((128, 128), (1, 1, 1)),
-        ) for kernel_schedule in (TmaMI, ) for epilogue_schedule in (TmaCoop, )
-        for tile_scheduler in (TileSchedulerType.StreamK, )
-    ]
+    schedule_common_params = {
+        "kernel_schedule": TmaMI,
+        "epilogue_schedule": TmaCoop,
+        "tile_scheduler": TileSchedulerType.StreamK,
+    }

    # For now we use the same heuristic for all types
+    # Heuristic is currently tuned for H100s
    default_heuristic = [
-        ("M > 64",
-         ScheduleConfig(
-             tile_shape_mn=(128, 128),
-             cluster_shape_mnk=(1, 1, 1),
-             kernel_schedule=TmaMI,
-             epilogue_schedule=TmaCoop,
-             tile_scheduler=TileSchedulerType.StreamK,
-         )),
-        ("M > 32",
-         ScheduleConfig(
-             tile_shape_mn=(128, 64),
-             cluster_shape_mnk=(1, 1, 1),
-             kernel_schedule=TmaMI,
-             epilogue_schedule=TmaCoop,
-             tile_scheduler=TileSchedulerType.StreamK,
-         )),
-        ("M > 16",
-         ScheduleConfig(
-             tile_shape_mn=(128, 32),
-             cluster_shape_mnk=(1, 1, 1),
-             kernel_schedule=TmaMI,
-             epilogue_schedule=TmaCoop,
-             tile_scheduler=TileSchedulerType.StreamK,
-         )),
-        (None,
-         ScheduleConfig(tile_shape_mn=(128, 16),
-                        cluster_shape_mnk=(1, 1, 1),
-                        kernel_schedule=TmaMI,
-                        epilogue_schedule=TmaCoop,
-                        tile_scheduler=TileSchedulerType.StreamK))
+        #### M = 257+
+        (
+            "M > 256 && K <= 16384 && N <= 4096",
+            ScheduleConfig(
+                tile_shape_mn=(128, 128),
+                cluster_shape_mnk=(2, 1, 1),
+                **schedule_common_params  # type: ignore
+            )),
+        (
+            "M > 256",
+            ScheduleConfig(
+                tile_shape_mn=(128, 256),
+                cluster_shape_mnk=(2, 1, 1),
+                **schedule_common_params  # type: ignore
+            )),
+        #### M = 129-256
+        (
+            "M > 128 && K <= 4096 && N <= 4096",
+            ScheduleConfig(
+                tile_shape_mn=(128, 64),
+                cluster_shape_mnk=(2, 1, 1),
+                **schedule_common_params  # type: ignore
+            )),
+        (
+            "M > 128 && K <= 8192 && N <= 8192",
+            ScheduleConfig(
+                tile_shape_mn=(128, 128),
+                cluster_shape_mnk=(2, 1, 1),
+                **schedule_common_params  # type: ignore
+            )),
+        (
+            "M > 128",
+            ScheduleConfig(
+                tile_shape_mn=(128, 256),
+                cluster_shape_mnk=(2, 1, 1),
+                **schedule_common_params  # type: ignore
+            )),
+        #### M = 65-128
+        (
+            "M > 64 && K <= 4096 && N <= 4096",
+            ScheduleConfig(
+                tile_shape_mn=(128, 32),
+                cluster_shape_mnk=(2, 1, 1),
+                **schedule_common_params  # type: ignore
+            )),
+        (
+            "M > 64 && K <= 4096 && N <= 8192",
+            ScheduleConfig(
+                tile_shape_mn=(128, 64),
+                cluster_shape_mnk=(2, 1, 1),
+                **schedule_common_params  # type: ignore
+            )),
+        (
+            "M > 64 && K >= 8192 && N >= 12288",
+            ScheduleConfig(
+                tile_shape_mn=(256, 128),
+                cluster_shape_mnk=(2, 1, 1),
+                **schedule_common_params  # type: ignore
+            )),
+        (
+            "M > 64",
+            ScheduleConfig(
+                tile_shape_mn=(128, 128),
+                cluster_shape_mnk=(2, 1, 1),
+                **schedule_common_params  # type: ignore
+            )),
+        #### M = 33-64
+        (
+            "M > 32 && K <= 6144 && N <= 6144",
+            ScheduleConfig(
+                tile_shape_mn=(128, 16),
+                cluster_shape_mnk=(1, 1, 1),
+                **schedule_common_params  # type: ignore
+            )),
+        (
+            "M > 32 && K >= 16384 && N >= 12288",
+            ScheduleConfig(
+                tile_shape_mn=(256, 64),
+                cluster_shape_mnk=(2, 1, 1),
+                **schedule_common_params  # type: ignore
+            )),
+        (
+            "M > 32",
+            ScheduleConfig(
+                tile_shape_mn=(128, 64),
+                cluster_shape_mnk=(2, 1, 1),
+                **schedule_common_params  # type: ignore
+            )),
+        #### M = 17-32
+        (
+            "M > 16 && K <= 12288 && N <= 8192",
+            ScheduleConfig(
+                tile_shape_mn=(128, 32),
+                cluster_shape_mnk=(2, 1, 1),
+                **schedule_common_params  # type: ignore
+            )),
+        (
+            "M > 16",
+            ScheduleConfig(
+                tile_shape_mn=(256, 32),
+                cluster_shape_mnk=(2, 1, 1),
+                **schedule_common_params  # type: ignore
+            )),
+        #### M = 1-16
+        (
+            "N >= 26624",
+            ScheduleConfig(
+                tile_shape_mn=(256, 16),
+                cluster_shape_mnk=(1, 1, 1),
+                **schedule_common_params  # type: ignore
+            )),
+        (
+            None,
+            ScheduleConfig(
+                tile_shape_mn=(128, 16),
+                cluster_shape_mnk=(1, 1, 1),
+                **schedule_common_params  # type: ignore
+            )),
    ]

+    # Dedupe in first-seen order (a set's order would vary between runs)
+    schedules = list(dict.fromkeys(x[1] for x in default_heuristic))
    impl_configs = []

    GPTQ_kernel_type_configs = list(
--- a/csrc/quantization/machete/machete_mm_kernel.cuh
+++ b/csrc/quantization/machete/machete_mm_kernel.cuh
@@ -152,7 +152,8 @@ struct MacheteKernelTemplate {

    int M = size<0>(layout_A), N = size<1>(layout_D), K = size<1>(layout_A);

-    int const group_size = maybe_group_size.value_or(K);
+    int const group_size =  // -1 or unset => single group spanning K
+        maybe_group_size == -1 ? K : maybe_group_size.value_or(K);
    int const scale_k = (K + group_size - 1) / group_size;

    TORCH_CHECK(size<0>(layout_A) == M && size<1>(layout_A) == K);
--- a/csrc/quantization/machete/machete_mm_launcher.cuh
+++ b/csrc/quantization/machete/machete_mm_launcher.cuh
@@ -71,7 +71,7 @@ torch::Tensor run_impl(PyTorchArguments args) {
  auto arguments = MacheteKernel::create_arguments(
      stream, A_ptr, layout_A, B_ptr, D_ptr, layout_D, C_ptr, layout_C, S_ptr,
      layout_S, Z_ptr, layout_Z, args.alpha.value_or(1), args.beta.value_or(0),
-      args.group_size.value_or(K));
+      args.group_size);
  TORCH_CHECK(MacheteKernel::can_implement(arguments),
              "Machete kernel cannot be run with these arguments");

--- a/csrc/quantization/machete/machete_prepack_launcher.cuh
+++ b/csrc/quantization/machete/machete_prepack_launcher.cuh
@@ -53,7 +53,7 @@ torch::Tensor prepack_impl(torch::Tensor const B) {
  // clang-format on

  // Allocate output
-  torch::Tensor D = torch::empty_like(B);
+  torch::Tensor D = torch::empty_like(B, {}, at::MemoryFormat::Contiguous);

  prepack_B<PrepackedLayoutB>(stream, B_ptr, layout_Bt,
                              static_cast<ElementB*>(D.mutable_data_ptr()));