[Quantization][Deprecation] Remove Marlin 24 (#32688)

Signed-off-by: Robert Shaw <robshaw@redhat.com> Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Robert Shaw <robshaw@redhat.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-01-28 07:54:59 -08:00
parent 8e5e40daf4
commit af9b69f977
20 changed files with 159 additions and 3161 deletions
--- a/benchmarks/kernels/benchmark_marlin.py
+++ b/benchmarks/kernels/benchmark_marlin.py
@@ -6,12 +6,6 @@ import torch.utils.benchmark as benchmark
 from benchmark_shapes import WEIGHT_SHAPES

 from vllm import _custom_ops as ops
-from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
-    GPTQ_MARLIN_24_MAX_PARALLEL,
-    GPTQ_MARLIN_24_MIN_THREAD_N,
-    GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES,
-    GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES,
-)
 from vllm.model_executor.layers.quantization.utils.allspark_utils import (
    ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD,
    ALLSPARK_SUPPORTED_QUANT_TYPES,
@@ -34,9 +28,6 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
    awq_marlin_quantize,
    marlin_quantize,
 )
-from vllm.model_executor.layers.quantization.utils.marlin_utils_test_24 import (
-    marlin_24_quantize,
-)
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
    gptq_pack,
    gptq_quantize_weights,
@@ -78,14 +69,7 @@ def bench_run(
    if size_k % group_size != 0:
        return

-    marlin_24_supported = (
-        quant_type in GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES
-        and group_size in GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES
-    )
-    repack_supported = (
-        quant_type in GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES
-        and group_size in MARLIN_SUPPORTED_GROUP_SIZES
-    )
+    repack_supported = group_size in MARLIN_SUPPORTED_GROUP_SIZES
    allspark_supported = (
        quant_type in ALLSPARK_SUPPORTED_QUANT_TYPES
        and group_size == -1
@@ -126,14 +110,6 @@ def bench_run(
            marlin_sort_indices,
        )

-    def gen_marlin_24_params():
-        marlin_24_w_ref = marlin_24_q_w_comp = marlin_24_meta = marlin_24_s = None
-        if marlin_24_supported:
-            (marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s) = (
-                marlin_24_quantize(b, quant_type, group_size)
-            )
-        return (marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s)
-
    def gen_repack_params():
        q_w_gptq = None
        repack_sort_indices = None
@@ -188,9 +164,6 @@ def bench_run(
        marlin_g_idx,
        marlin_sort_indices,
    ) = gen_marlin_params()
-    marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s = (
-        gen_marlin_24_params()
-    )
    q_w_gptq, repack_sort_indices = gen_repack_params()
    qw_reorder, s_reorder, zp_reorder, sm_count, sm_version, CUBLAS_M_THRESHOLD = (
        gen_allspark_params()
@@ -200,9 +173,6 @@ def bench_run(
    marlin_workspace = MarlinWorkspace(
        size_n, GPTQ_MARLIN_MIN_THREAD_N, GPTQ_MARLIN_MAX_PARALLEL
    )
-    marlin_24_workspace = MarlinWorkspace(
-        size_n, GPTQ_MARLIN_24_MIN_THREAD_N, GPTQ_MARLIN_24_MAX_PARALLEL
-    )

    globals = {
        # Gen params
@@ -222,12 +192,6 @@ def bench_run(
        "marlin_sort_indices": marlin_sort_indices,
        "marlin_workspace": marlin_workspace,
        "is_k_full": is_k_full,
-        # Marlin_24 params
-        "marlin_24_w_ref": marlin_24_w_ref,
-        "marlin_24_q_w_comp": marlin_24_q_w_comp,
-        "marlin_24_meta": marlin_24_meta,
-        "marlin_24_s": marlin_24_s,
-        "marlin_24_workspace": marlin_24_workspace,
        # GPTQ params
        "q_w_gptq": q_w_gptq,
        "repack_sort_indices": repack_sort_indices,
@@ -240,7 +204,6 @@ def bench_run(
        "CUBLAS_M_THRESHOLD": CUBLAS_M_THRESHOLD,
        # Kernels
        "marlin_gemm": ops.marlin_gemm,
-        "gptq_marlin_24_gemm": ops.gptq_marlin_24_gemm,
        "gptq_marlin_repack": ops.gptq_marlin_repack,
        "allspark_w8a16_gemm": ops.allspark_w8a16_gemm,
    }
@@ -281,17 +244,6 @@ def bench_run(
        ).blocked_autorange(min_run_time=min_run_time)
    )

-    if marlin_24_supported:
-        results.append(
-            benchmark.Timer(
-                stmt="output = gptq_marlin_24_gemm(a, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s, marlin_24_workspace.scratch, quant_type, size_m, size_n, size_k)",  # noqa: E501
-                globals=globals,
-                label=label,
-                sub_label=sub_label,
-                description="gptq_marlin_24_gemm",
-            ).blocked_autorange(min_run_time=min_run_time)
-        )
-
    if repack_supported:
        results.append(
            benchmark.Timer(