[Performance] Move apply_w8a8_block_fp8_linear to an op class (#24666)

Signed-off-by: ElizaWszola <ewszola@redhat.com> Signed-off-by: ElizaWszola <elizaw.9289@gmail.com> Signed-off-by: Luka Govedič <lgovedic@redhat.com> Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com> Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com> Co-authored-by: Michael Goin <mgoin64@gmail.com> Co-authored-by: Luka Govedič <lgovedic@redhat.com>
2025-09-23 21:03:10 +02:00
parent 8c1c81a3de
commit 63400259d0
14 changed files with 341 additions and 201 deletions
--- a/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py
+++ b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py
@@ -10,7 +10,7 @@ from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
    get_col_major_tma_aligned_tensor,
    per_token_group_quant_fp8,
-    w8a8_block_fp8_matmul,
+    w8a8_triton_block_scaled_mm,
 )
 from vllm.triton_utils import triton
 from vllm.utils.deep_gemm import calc_diff, fp8_gemm_nt, per_block_cast_to_fp8
@@ -59,7 +59,7 @@ def benchmark_shape(m: int,

    # === vLLM Triton Implementation ===
    def vllm_triton_gemm():
-        return w8a8_block_fp8_matmul(A_vllm,
+        return w8a8_triton_block_scaled_mm(A_vllm,
                                     B_vllm,
                                     A_scale_vllm,
                                     B_scale_vllm,