Revert "[Performance] Move apply_w8a8_block_fp8_linear to an op class… (#25607)

Signed-off-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
Tyler Michael Smith
2025-09-25 04:05:21 -04:00
committed by GitHub
parent af4ee63e0e
commit 1260180c67
14 changed files with 205 additions and 346 deletions


@@ -9,7 +9,7 @@ import torch
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
     per_token_group_quant_fp8,
-    w8a8_triton_block_scaled_mm,
+    w8a8_block_fp8_matmul,
 )
 from vllm.triton_utils import triton
 from vllm.utils.deep_gemm import (
@@ -63,7 +63,7 @@ def benchmark_shape(m: int,
     # === vLLM Triton Implementation ===
     def vllm_triton_gemm():
-        return w8a8_triton_block_scaled_mm(A_vllm,
+        return w8a8_block_fp8_matmul(A_vllm,
                                      B_vllm,
                                      A_scale_vllm,
                                      B_scale_vllm,
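
For context, a minimal, self-contained sketch of how the restored w8a8_block_fp8_matmul path is typically exercised, in the spirit of the benchmark touched above. It is not taken from the commit: the GEMM shape, the [128, 128] block size, and the randomly initialized weight and scale tensors are illustrative assumptions, and the argument names mirror the benchmark's variables.

import torch

from vllm.model_executor.layers.quantization.utils.fp8_utils import (
    per_token_group_quant_fp8,
    w8a8_block_fp8_matmul,
)

# Assumed example shape and DeepSeek-style 128x128 weight blocks (not from the diff).
m, k, n = 64, 7168, 4096
block_size = [128, 128]  # [block_n, block_k]

# Quantize activations to FP8 per token group of block_k columns.
A = torch.randn(m, k, device="cuda", dtype=torch.bfloat16)
A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1])

# Placeholder FP8 weight (N x K) with one scale per (block_n x block_k) tile.
B_vllm = torch.randn(n, k, device="cuda").to(torch.float8_e4m3fn)
B_scale_vllm = torch.rand(n // block_size[0], k // block_size[1], device="cuda")

# Blockwise-scaled FP8 GEMM via the Triton kernel name restored by this revert.
out = w8a8_block_fp8_matmul(A_vllm, B_vllm, A_scale_vllm, B_scale_vllm,
                            block_size, output_dtype=torch.bfloat16)
assert out.shape == (m, n)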