Convert formatting to use ruff instead of yapf + isort (#26247)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-05 15:06:22 +01:00
parent 17edd8a807
commit d6953beb91
1508 changed files with 115244 additions and 94146 deletions
--- a/tests/kernels/quantization/test_block_fp8.py
+++ b/tests/kernels/quantization/test_block_fp8.py
@@ -7,20 +7,26 @@ import itertools
 import pytest
 import torch

-from tests.kernels.quant_utils import (native_per_token_group_quant_fp8,
-                                       native_w8a8_block_matmul)
+from tests.kernels.quant_utils import (
+    native_per_token_group_quant_fp8,
+    native_w8a8_block_matmul,
+)
 from vllm.config import VllmConfig
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
-    cutlass_scaled_mm, per_token_group_quant_fp8, w8a8_triton_block_scaled_mm)
+    cutlass_scaled_mm,
+    per_token_group_quant_fp8,
+    w8a8_triton_block_scaled_mm,
+)
 from vllm.platforms import current_platform
 from vllm.utils import has_deep_gemm
-from vllm.utils.deep_gemm import (fp8_gemm_nt,
-                                  get_col_major_tma_aligned_tensor,
-                                  per_block_cast_to_fp8)
+from vllm.utils.deep_gemm import (
+    fp8_gemm_nt,
+    get_col_major_tma_aligned_tensor,
+    per_block_cast_to_fp8,
+)

 if current_platform.get_device_capability() < (9, 0):
-    pytest.skip("FP8 Triton requires CUDA 9.0 or higher",
-                allow_module_level=True)
+    pytest.skip("FP8 Triton requires CUDA 9.0 or higher", allow_module_level=True)

 vllm_config = VllmConfig()
 vllm_config.scheduler_config.max_num_seqs = 128
@@ -51,7 +57,8 @@ def setup_cuda():

 @pytest.mark.parametrize(
     "num_tokens,d,dtype,group_size,seed",
-    itertools.product(NUM_TOKENS, D, DTYPES, GROUP_SIZE, SEEDS))
+    itertools.product(NUM_TOKENS, D, DTYPES, GROUP_SIZE, SEEDS),
+)
 @torch.inference_mode()
 def test_per_token_group_quant_fp8(num_tokens, d, dtype, group_size, seed):
     torch.manual_seed(seed)
@@ -60,15 +67,14 @@ def test_per_token_group_quant_fp8(num_tokens, d, dtype, group_size, seed):
     ref_out, ref_scale = native_per_token_group_quant_fp8(x, group_size)
     out, scale = per_token_group_quant_fp8(x, group_size)

-    assert torch.allclose(out.to(torch.float32),
-                          ref_out.to(torch.float32),
-                          rtol=0.15)
+    assert torch.allclose(out.to(torch.float32), ref_out.to(torch.float32), rtol=0.15)
     assert torch.allclose(scale, ref_scale)


 @pytest.mark.parametrize(
     "M,N,K,block_size,out_dtype,seed",
-    itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES, SEEDS))
+    itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES, SEEDS),
+)
 @torch.inference_mode()
 def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed):
     torch.manual_seed(seed)
@@ -89,14 +95,12 @@ def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed):
     As = torch.rand(M, k_tiles, dtype=torch.float32) * factor_for_scale
     Bs = torch.rand(n_tiles, k_tiles, dtype=torch.float32) * factor_for_scale

-    ref_out = native_w8a8_block_matmul(A_fp8, B_fp8, As, Bs, block_size,
-                                       out_dtype)
-    out = w8a8_triton_block_scaled_mm(A_fp8, B_fp8, As, Bs, block_size,
-                                      out_dtype)
+    ref_out = native_w8a8_block_matmul(A_fp8, B_fp8, As, Bs, block_size, out_dtype)
+    out = w8a8_triton_block_scaled_mm(A_fp8, B_fp8, As, Bs, block_size, out_dtype)

-    rel_diff = (torch.mean(
-        torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))) /
-                torch.mean(torch.abs(ref_out.to(torch.float32))))
+    rel_diff = torch.mean(
+        torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))
+    ) / torch.mean(torch.abs(ref_out.to(torch.float32)))
     assert rel_diff < 0.001


@@ -127,32 +131,32 @@ def test_w8a8_block_fp8_cutlass_matmul():

     Bs = torch.rand(n_tiles, k_tiles, dtype=torch.float32) * factor_for_scale
     # Hopper requires row-major format for scales
-    Bs_cutlass = Bs.T.contiguous() if current_platform.is_device_capability(
-        90) else Bs
+    Bs_cutlass = Bs.T.contiguous() if current_platform.is_device_capability(90) else Bs

-    A_fp8, As = per_token_group_quant_fp8(A_fp32,
-                                          block_size[1],
-                                          column_major_scales=False)
+    A_fp8, As = per_token_group_quant_fp8(
+        A_fp32, block_size[1], column_major_scales=False
+    )
     # CUTLASS uses column-major format for scales
     A_fp8_cutlass, As_cutlass = per_token_group_quant_fp8(
-        A_fp32, block_size[1], column_major_scales=True)
+        A_fp32, block_size[1], column_major_scales=True
+    )

-    ref_out = native_w8a8_block_matmul(A_fp8, B_fp8, As, Bs, block_size,
-                                       out_dtype)
-    out = cutlass_scaled_mm(A_fp8_cutlass, B_fp8, As_cutlass, Bs_cutlass,
-                            block_size, out_dtype)
+    ref_out = native_w8a8_block_matmul(A_fp8, B_fp8, As, Bs, block_size, out_dtype)
+    out = cutlass_scaled_mm(
+        A_fp8_cutlass, B_fp8, As_cutlass, Bs_cutlass, block_size, out_dtype
+    )

-    rel_diff = (torch.mean(
-        torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))) /
-                torch.mean(torch.abs(ref_out.to(torch.float32))))
+    rel_diff = torch.mean(
+        torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))
+    ) / torch.mean(torch.abs(ref_out.to(torch.float32)))
     assert rel_diff < 0.001


 @pytest.mark.parametrize(
     "M,N,K,block_size,out_dtype,seed",
-    itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES, SEEDS))
-@pytest.mark.skipif(not has_deep_gemm(),
-                    reason="DeepGemm kernels not available.")
+    itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES, SEEDS),
+)
+@pytest.mark.skipif(not has_deep_gemm(), reason="DeepGemm kernels not available.")
 @torch.inference_mode()
 def test_w8a8_block_fp8_deep_gemm_matmul(M, N, K, block_size, out_dtype, seed):
     # only aligned sizes
@@ -172,20 +176,20 @@ def test_w8a8_block_fp8_deep_gemm_matmul(M, N, K, block_size, out_dtype, seed):
     As = As_fp8.to(torch.float32)
     Bs = Bs_fp8.to(torch.float32)

-    ref_out = native_w8a8_block_matmul(A_fp8, B_fp8, As, Bs, block_size,
-                                       out_dtype)
+    ref_out = native_w8a8_block_matmul(A_fp8, B_fp8, As, Bs, block_size, out_dtype)

     # Transpose earlier so that the testing will not trigger transposing kernels
     As_fp8 = get_col_major_tma_aligned_tensor(As_fp8)

-    out = torch.zeros((M, N), device='cuda', dtype=out_dtype)
+    out = torch.zeros((M, N), device="cuda", dtype=out_dtype)

-    assert As_fp8.shape == (M, (K + 127) //
-                            128), f"{As_fp8.shape} != {(M, (K + 127) // 128)}"
+    assert As_fp8.shape == (M, (K + 127) // 128), (
+        f"{As_fp8.shape} != {(M, (K + 127) // 128)}"
+    )

     fp8_gemm_nt((A_fp8, As_fp8), (B_fp8, Bs_fp8), out)

-    rel_diff = (torch.mean(
-        torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))) /
-                torch.mean(torch.abs(ref_out.to(torch.float32))))
+    rel_diff = torch.mean(
+        torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))
+    ) / torch.mean(torch.abs(ref_out.to(torch.float32)))
     assert rel_diff < 0.001