[Kernel/Quant] Remove the original marlin format and qqq (#23204)

Signed-off-by: mgoin <mgoin64@gmail.com>
2025-08-20 15:13:36 -04:00
parent ebe56a0064
commit 0cdbf5e61c
26 changed files with 92 additions and 3698 deletions
--- a/csrc/quantization/machete/generate.py
+++ b/csrc/quantization/machete/generate.py
@@ -571,78 +571,79 @@ def generate():
                     itertools.repeat(default_heuristic))
    ]

-    # Stored as "condition": ((tile_shape_mn), (cluster_shape_mnk))
-    # TODO (LucasWilkinson): Further tuning required
-    qqq_tile_heuristic_config = {
-        #### M = 257+
-        # ((128, 256), (2, 1, 1)) Broken for QQQ types
-        # TODO (LucasWilkinson): Investigate further
-        # "M > 256 && K <= 16384 && N <= 4096": ((128, 128), (2, 1, 1)),
-        # "M > 256": ((128, 256), (2, 1, 1)),
-        "M > 256": ((128, 128), (2, 1, 1)),
-        #### M = 129-256
-        "M > 128 && K <= 4096 && N <= 4096": ((128, 64), (2, 1, 1)),
-        "M > 128 && K <= 8192 && N <= 8192": ((128, 128), (2, 1, 1)),
-        # ((128, 256), (2, 1, 1)) Broken for QQQ types
-        # TODO (LucasWilkinson): Investigate further
-        # "M > 128": ((128, 256), (2, 1, 1)),
-        "M > 128": ((128, 128), (2, 1, 1)),
-        #### M = 65-128
-        "M > 64 && K <= 4069 && N <= 4069": ((128, 32), (2, 1, 1)),
-        "M > 64 && K <= 4069 && N <= 8192": ((128, 64), (2, 1, 1)),
-        "M > 64 && K >= 8192 && N >= 12288": ((256, 128), (2, 1, 1)),
-        "M > 64": ((128, 128), (2, 1, 1)),
-        #### M = 33-64
-        "M > 32 && K <= 6144 && N <= 6144": ((128, 16), (1, 1, 1)),
-        # Broken for QQQ types
-        # TODO (LucasWilkinson): Investigate further
-        #"M > 32 && K >= 16384 && N >= 12288": ((256, 64), (2, 1, 1)),
-        "M > 32": ((128, 64), (2, 1, 1)),
-        #### M = 17-32
-        "M > 16 && K <= 12288 && N <= 8192": ((128, 32), (2, 1, 1)),
-        "M > 16": ((256, 32), (2, 1, 1)),
-        #### M = 1-16
-        "N >= 26624": ((256, 16), (1, 1, 1)),
-        None: ((128, 16), (1, 1, 1)),
-    }
+    # TODO: Re-enable the W4A8 configs commented out below when ready
+    # # Stored as "condition": ((tile_shape_mn), (cluster_shape_mnk))
+    # # TODO (LucasWilkinson): Further tuning required
+    # qqq_tile_heuristic_config = {
+    #     #### M = 257+
+    #     # ((128, 256), (2, 1, 1)) Broken for QQQ types
+    #     # TODO (LucasWilkinson): Investigate further
+    #     # "M > 256 && K <= 16384 && N <= 4096": ((128, 128), (2, 1, 1)),
+    #     # "M > 256": ((128, 256), (2, 1, 1)),
+    #     "M > 256": ((128, 128), (2, 1, 1)),
+    #     #### M = 129-256
+    #     "M > 128 && K <= 4096 && N <= 4096": ((128, 64), (2, 1, 1)),
+    #     "M > 128 && K <= 8192 && N <= 8192": ((128, 128), (2, 1, 1)),
+    #     # ((128, 256), (2, 1, 1)) Broken for QQQ types
+    #     # TODO (LucasWilkinson): Investigate further
+    #     # "M > 128": ((128, 256), (2, 1, 1)),
+    #     "M > 128": ((128, 128), (2, 1, 1)),
+    #     #### M = 65-128
+    #     "M > 64 && K <= 4069 && N <= 4069": ((128, 32), (2, 1, 1)),
+    #     "M > 64 && K <= 4069 && N <= 8192": ((128, 64), (2, 1, 1)),
+    #     "M > 64 && K >= 8192 && N >= 12288": ((256, 128), (2, 1, 1)),
+    #     "M > 64": ((128, 128), (2, 1, 1)),
+    #     #### M = 33-64
+    #     "M > 32 && K <= 6144 && N <= 6144": ((128, 16), (1, 1, 1)),
+    #     # Broken for QQQ types
+    #     # TODO (LucasWilkinson): Investigate further
+    #     #"M > 32 && K >= 16384 && N >= 12288": ((256, 64), (2, 1, 1)),
+    #     "M > 32": ((128, 64), (2, 1, 1)),
+    #     #### M = 17-32
+    #     "M > 16 && K <= 12288 && N <= 8192": ((128, 32), (2, 1, 1)),
+    #     "M > 16": ((256, 32), (2, 1, 1)),
+    #     #### M = 1-16
+    #     "N >= 26624": ((256, 16), (1, 1, 1)),
+    #     None: ((128, 16), (1, 1, 1)),
+    # }

-    # For now we use the same heuristic for all types
-    # Heuristic is currently tuned for H100s
-    qqq_heuristic = [
-        (cond, ScheduleConfig(*tile_config,
-                              **sch_common_params))  # type: ignore
-        for cond, tile_config in qqq_tile_heuristic_config.items()
-    ]
+    # # For now we use the same heuristic for all types
+    # # Heuristic is currently tuned for H100s
+    # qqq_heuristic = [
+    #     (cond, ScheduleConfig(*tile_config,
+    #                           **sch_common_params))  # type: ignore
+    #     for cond, tile_config in qqq_tile_heuristic_config.items()
+    # ]

-    QQQ_kernel_types = [
-        *(TypeConfig(
-            a=DataType.s8,
-            b=VLLMDataType.u4b8,
-            b_group_scale=b_group_scale,
-            b_group_zeropoint=DataType.void,
-            b_channel_scale=DataType.f32,
-            a_token_scale=DataType.f32,
-            out=DataType.f16,
-            accumulator=DataType.s32,
-        ) for b_group_scale in (DataType.f16, DataType.void)),
-        *(TypeConfig(
-            a=DataType.e4m3,
-            b=VLLMDataType.u4b8,
-            b_group_scale=b_group_scale,
-            b_group_zeropoint=DataType.void,
-            b_channel_scale=DataType.f32,
-            a_token_scale=DataType.f32,
-            out=DataType.f16,
-            accumulator=DataType.f32,
-        ) for b_group_scale in (DataType.f16, DataType.void)),
-    ]
+    # QQQ_kernel_types = [
+    #     *(TypeConfig(
+    #         a=DataType.s8,
+    #         b=VLLMDataType.u4b8,
+    #         b_group_scale=b_group_scale,
+    #         b_group_zeropoint=DataType.void,
+    #         b_channel_scale=DataType.f32,
+    #         a_token_scale=DataType.f32,
+    #         out=DataType.f16,
+    #         accumulator=DataType.s32,
+    #     ) for b_group_scale in (DataType.f16, DataType.void)),
+    #     *(TypeConfig(
+    #         a=DataType.e4m3,
+    #         b=VLLMDataType.u4b8,
+    #         b_group_scale=b_group_scale,
+    #         b_group_zeropoint=DataType.void,
+    #         b_channel_scale=DataType.f32,
+    #         a_token_scale=DataType.f32,
+    #         out=DataType.f16,
+    #         accumulator=DataType.f32,
+    #     ) for b_group_scale in (DataType.f16, DataType.void)),
+    # ]

-    impl_configs += [
-        ImplConfig(x[0], x[1], x[2])
-        for x in zip(QQQ_kernel_types,
-                     itertools.repeat(get_unique_schedules(qqq_heuristic)),
-                     itertools.repeat(qqq_heuristic))
-    ]
+    # impl_configs += [
+    #     ImplConfig(x[0], x[1], x[2])
+    #     for x in zip(QQQ_kernel_types,
+    #                  itertools.repeat(get_unique_schedules(qqq_heuristic)),
+    #                  itertools.repeat(qqq_heuristic))
+    # ]

    output_dir = os.path.join(SCRIPT_DIR, "generated")