[Perf][Kernel] Optimize FP4 quantization kernels (SM100F) (#32520)

Signed-off-by: LopezCastroRoberto <rocastro@redhat.com>
2026-01-25 02:45:27 +01:00
parent 1ebdff412a
commit fcb9df99bd
18 changed files with 508 additions and 151 deletions
--- a/benchmarks/kernels/bench_nvfp4_quant.py
+++ b/benchmarks/kernels/bench_nvfp4_quant.py
@@ -20,8 +20,12 @@ FLOAT4_E2M1_MAX = scalar_types.float4_e2m1f.max()
 FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max

 PROVIDER_CFGS = {
-    "vllm": dict(backend="vllm", enabled=True),
-    "flashinfer": dict(backend="flashinfer", enabled=True),
+    "vllm": dict(backend="vllm", is_sf_swizzled_layout=False, enabled=True),
+    "vllm-swizzle": dict(backend="vllm", is_sf_swizzled_layout=True, enabled=True),
+    "flashinfer": dict(backend="flashinfer", is_sf_swizzled_layout=False, enabled=True),
+    "flashinfer-swizzle": dict(
+        backend="flashinfer", is_sf_swizzled_layout=True, enabled=True
+    ),
 }

 _enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]
@@ -36,7 +40,7 @@ def compute_global_scale(tensor: torch.Tensor) -> torch.Tensor:
@triton.testing.perf_report(
    triton.testing.Benchmark(
        x_names=["batch_size"],
-        x_vals=[1, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096],
+        x_vals=[1, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192],
        x_log=False,
        line_arg="provider",
        line_vals=_enabled,
@@ -63,19 +67,36 @@ def benchmark(batch_size, provider, N, K):

    if cfg["backend"] == "vllm":
        # vLLM's FP4 quantization
-        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
-            lambda: ops.scaled_fp4_quant(a, a_global_scale),
-            quantiles=quantiles,
-        )
+        if cfg["is_sf_swizzled_layout"]:
+            ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
+                lambda: ops.scaled_fp4_quant(
+                    a, a_global_scale, is_sf_swizzled_layout=True
+                ),
+                quantiles=quantiles,
+            )
+        else:
+            ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
+                lambda: ops.scaled_fp4_quant(
+                    a, a_global_scale, is_sf_swizzled_layout=False
+                ),
+                quantiles=quantiles,
+            )
    elif cfg["backend"] == "flashinfer":
        # FlashInfer's FP4 quantization
-        # Use is_sf_swizzled_layout=True to match vLLM's output format
-        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
-            lambda: flashinfer_fp4_quantize(
-                a, a_global_scale, is_sf_swizzled_layout=True
-            ),
-            quantiles=quantiles,
-        )
+        if cfg["is_sf_swizzled_layout"]:
+            ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
+                lambda: flashinfer_fp4_quantize(
+                    a, a_global_scale, is_sf_swizzled_layout=True
+                ),
+                quantiles=quantiles,
+            )
+        else:
+            ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
+                lambda: flashinfer_fp4_quantize(
+                    a, a_global_scale, is_sf_swizzled_layout=False
+                ),
+                quantiles=quantiles,
+            )

    # Convert ms to us for better readability at small batch sizes
    to_us = lambda t_ms: t_ms * 1000
@@ -92,7 +113,9 @@ def prepare_shapes(args):
    return out


-def _test_accuracy_once(M: int, K: int, dtype: torch.dtype, device: str):
+def _test_accuracy_once(
+    M: int, K: int, dtype: torch.dtype, device: str, is_sf_swizzled_layout: bool
+):
    """Test accuracy between vLLM and FlashInfer FP4 quantization."""
    # Create input tensor
    a = torch.randn((M, K), device=device, dtype=dtype)
@@ -101,11 +124,13 @@ def _test_accuracy_once(M: int, K: int, dtype: torch.dtype, device: str):
    a_global_scale = compute_global_scale(a)

    # vLLM quantization
-    vllm_fp4, vllm_scale = ops.scaled_fp4_quant(a, a_global_scale)
+    vllm_fp4, vllm_scale = ops.scaled_fp4_quant(
+        a, a_global_scale, is_sf_swizzled_layout=is_sf_swizzled_layout
+    )

    # FlashInfer quantization (with swizzled layout to match vLLM's output)
    flashinfer_fp4, flashinfer_scale = flashinfer_fp4_quantize(
-        a, a_global_scale, is_sf_swizzled_layout=True
+        a, a_global_scale, is_sf_swizzled_layout=is_sf_swizzled_layout
    )
    flashinfer_scale = flashinfer_scale.view(torch.float8_e4m3fn)

@@ -114,7 +139,14 @@ def _test_accuracy_once(M: int, K: int, dtype: torch.dtype, device: str):
        vllm_fp4,
        flashinfer_fp4,
    )
-    print(f"M={M}, K={K}, dtype={dtype}: PASSED")
+    # Compare scales
+    torch.testing.assert_close(
+        vllm_scale,
+        flashinfer_scale,
+    )
+    print(
+        f"M={M}, K={K}, dtype={dtype}, is_sf_swizzled_layout={is_sf_swizzled_layout}: PASSED"  # noqa: E501
+    )


 def test_accuracy():
@@ -130,9 +162,10 @@ def test_accuracy():
    Ms = [1, 1024]
    Ks = [4096]

-    for M in Ms:
-        for K in Ks:
-            _test_accuracy_once(M, K, dtype, device)
+    for is_sf_swizzled_layout in [True, False]:
+        for M in Ms:
+            for K in Ks:
+                _test_accuracy_once(M, K, dtype, device, is_sf_swizzled_layout)

    print("\nAll accuracy tests passed!")

@@ -145,7 +178,7 @@ if __name__ == "__main__":
        "--models",
        nargs="+",
        type=str,
-        default=["meta-llama/Llama-3.1-8B-Instruct"],
+        default=["meta-llama/Llama-3.3-70B-Instruct"],
        choices=list(WEIGHT_SHAPES.keys()),
    )
    parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1])