[Refactor] Rename gptq_marlin to marlin to match MoE (#32952)
Signed-off-by: mgoin <mgoin64@gmail.com>
@@ -591,8 +591,8 @@ if hasattr(torch.ops._C, "gptq_marlin_24_gemm"):
     ) -> torch.Tensor:
         return torch.empty((size_m, size_n), device=a.device, dtype=a.dtype)
 
-    @register_fake("_C::gptq_marlin_gemm")
-    def _gptq_marlin_gemm_fake(
+    @register_fake("_C::marlin_gemm")
+    def _marlin_gemm_fake(
         a: torch.Tensor,
         c: torch.Tensor | None,
         b_q_weight: torch.Tensor,
@@ -1312,7 +1312,7 @@ def marlin_int4_fp8_preprocess(
     return torch.ops._C.marlin_int4_fp8_preprocess(qweight, qzeros_or_none, inplace)
 
 
-def gptq_marlin_gemm(
+def marlin_gemm(
     a: torch.Tensor,
     c: torch.Tensor | None,
     b_q_weight: torch.Tensor,
@@ -1333,7 +1333,7 @@ def gptq_marlin_gemm(
     use_fp32_reduce: bool = False,
     is_zp_float: bool = False,
 ) -> torch.Tensor:
-    return torch.ops._C.gptq_marlin_gemm(
+    return torch.ops._C.marlin_gemm(
         a,
         c,
         b_q_weight,

@@ -563,7 +563,7 @@ def apply_gptq_marlin_linear(
 
     reshaped_x, a_scales = marlin_quant_input(reshaped_x, input_dtype)
 
-    output = ops.gptq_marlin_gemm(
+    output = ops.marlin_gemm(
         reshaped_x,
         None,
         weight,
@@ -628,7 +628,7 @@ def apply_awq_marlin_linear(
     )
     reshaped_x, a_scales = marlin_quant_input(reshaped_x, input_dtype)
 
-    output = ops.gptq_marlin_gemm(
+    output = ops.marlin_gemm(
         reshaped_x,
         None,
         weight,

@@ -121,7 +121,7 @@ def apply_fp4_marlin_linear(
 
     inputs, a_scales = marlin_quant_input(inputs, torch.float8_e4m3fn)
 
-    output = ops.gptq_marlin_gemm(
+    output = ops.marlin_gemm(
         a=inputs,
         c=None,
         b_q_weight=weight,

@@ -66,7 +66,7 @@ def apply_fp8_marlin_linear(
     # inputs, a_scales = marlin_quant_input(inputs, torch.float8_e4m3fn)
     raise RuntimeError("Marlin W8A8 is not supported.")
 
-    output = ops.gptq_marlin_gemm(
+    output = ops.marlin_gemm(
         a=inputs,
         c=None,
         b_q_weight=weight,
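For downstream callers, the rename is purely mechanical: every ops.gptq_marlin_gemm(...) call site becomes ops.marlin_gemm(...) with identical arguments, as the hunks above show. A minimal compatibility sketch for out-of-tree code that still imports the old name (not part of this commit; it assumes vllm._custom_ops exposes marlin_gemm after the rename):

    from vllm import _custom_ops as ops

    # Hypothetical shim: keep the old name alive until callers migrate.
    # The renamed wrapper keeps the same signature, so a plain alias suffices.
    gptq_marlin_gemm = ops.marlin_gemm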