[Perf] Create TMA-aligned input scale tensor for DeepGemm on Hopper (#32619)

Signed-off-by: Xin Yang <xyangx@amazon.com>
2026-01-22 12:47:04 -08:00
parent f744810184
commit d08b356ee0
7 changed files with 75 additions and 17 deletions
--- a/tests/kernels/quantization/test_per_token_group_quant.py
+++ b/tests/kernels/quantization/test_per_token_group_quant.py
@@ -8,13 +8,16 @@ import torch
 from vllm.model_executor.layers.quantization.utils import fp8_utils, int8_utils


-@pytest.mark.parametrize("shape", [(32, 128), (64, 256), (16, 512)])
+@pytest.mark.parametrize(
+    "shape", [(31, 128), (32, 128), (63, 256), (64, 256), (16, 512)]
+)
 @pytest.mark.parametrize("column_major", [False, True])
+@pytest.mark.parametrize("tma_aligned", [False, True])
 @pytest.mark.parametrize("scale_ue8m0", [False, True])
 @pytest.mark.parametrize("group_size", [64, 128])
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
 def test_per_token_group_quant_fp8(
-    shape, column_major: bool, scale_ue8m0: bool, group_size: int
+    shape, column_major: bool, tma_aligned: bool, scale_ue8m0: bool, group_size: int
 ):
     device = "cuda"

@@ -28,6 +31,7 @@ def test_per_token_group_quant_fp8(
         x,
         group_size,
         column_major_scales=column_major,
+        tma_aligned_scales=tma_aligned,
         use_ue8m0=scale_ue8m0,
     )