[Perf] Create TMA-aligned input scale tensor for DeepGemm on Hopper (#32619)

Signed-off-by: Xin Yang <xyangx@amazon.com>
This commit is contained in:
Xin Yang
2026-01-22 12:47:04 -08:00
committed by GitHub
parent f744810184
commit d08b356ee0
7 changed files with 75 additions and 17 deletions

View File

@@ -14,7 +14,6 @@ from vllm.triton_utils import triton
from vllm.utils.deep_gemm import (
calc_diff,
fp8_gemm_nt,
get_col_major_tma_aligned_tensor,
per_block_cast_to_fp8,
)
@@ -48,8 +47,9 @@ def benchmark_shape(
block_size = [128, 128]
# Pre-quantize A for all implementations
A_deepgemm, A_scale_deepgemm = per_token_group_quant_fp8(A, block_size[1])
A_scale_deepgemm = get_col_major_tma_aligned_tensor(A_scale_deepgemm)
A_deepgemm, A_scale_deepgemm = per_token_group_quant_fp8(
A, block_size[1], column_major_scales=True, tma_aligned_scales=True
)
C_deepgemm = torch.empty((m, n), device="cuda", dtype=torch.bfloat16)
A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1])
A_vllm_cutlass, A_scale_vllm_cutlass = per_token_group_quant_fp8(