[Perf] Create TMA-aligned input scale tensor for DeepGemm on Hopper (#32619)

Signed-off-by: Xin Yang <xyangx@amazon.com>
This commit is contained in:
Xin Yang
2026-01-22 12:47:04 -08:00
committed by GitHub
parent f744810184
commit d08b356ee0
7 changed files with 75 additions and 17 deletions

View File

@@ -8,13 +8,16 @@ import torch
 from vllm.model_executor.layers.quantization.utils import fp8_utils, int8_utils
-@pytest.mark.parametrize("shape", [(32, 128), (64, 256), (16, 512)])
+@pytest.mark.parametrize(
+"shape", [(31, 128), (32, 128), (63, 256), (64, 256), (16, 512)]
+)
 @pytest.mark.parametrize("column_major", [False, True])
+@pytest.mark.parametrize("tma_aligned", [False, True])
 @pytest.mark.parametrize("scale_ue8m0", [False, True])
 @pytest.mark.parametrize("group_size", [64, 128])
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
 def test_per_token_group_quant_fp8(
-shape, column_major: bool, scale_ue8m0: bool, group_size: int
+shape, column_major: bool, tma_aligned: bool, scale_ue8m0: bool, group_size: int
 ):
 device = "cuda"
@@ -28,6 +31,7 @@ def test_per_token_group_quant_fp8(
 x,
 group_size,
 column_major_scales=column_major,
+tma_aligned_scales=tma_aligned,
 use_ue8m0=scale_ue8m0,
 )