Convert formatting to use ruff instead of yapf + isort (#26247)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-05 15:06:22 +01:00
parent 17edd8a807
commit d6953beb91
1508 changed files with 115244 additions and 94146 deletions
--- a/tests/kernels/quantization/test_cutlass_2of4_sparse.py
+++ b/tests/kernels/quantization/test_cutlass_2of4_sparse.py
@@ -11,12 +11,11 @@ import torch
 from tests.kernels.utils import baseline_scaled_mm, to_fp8, to_int8
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
-    sparse_cutlass_supported)
+    sparse_cutlass_supported,
+)
 from vllm.platforms import current_platform

-CUDA_DEVICES = [
-    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
-]
+CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)]

 capability = current_platform.get_device_capability()
 capability = capability[0] * 10 + capability[1]
@@ -40,9 +39,7 @@ def prune_to_2_4(tensor):

    # Create binary mask
    mask = torch.zeros_like(reshaped)
-    mask.scatter_(dim=1,
-                  index=indices,
-                  src=torch.ones_like(indices, dtype=mask.dtype))
+    mask.scatter_(dim=1, index=indices, src=torch.ones_like(indices, dtype=mask.dtype))

    # Apply mask and reshape back
    pruned = reshaped * mask
@@ -55,32 +52,31 @@ def prune_to_2_4(tensor):

 # This function checks that applying an identity matrix multiplication
 # to the compressed weights yields the original uncompressed weights.
-def check_compress_decompress_invariance(dtype: torch.dtype, b: torch.Tensor,
-                                         b_compressed: torch.Tensor,
-                                         b_metadata: torch.Tensor):
-
+def check_compress_decompress_invariance(
+    dtype: torch.dtype,
+    b: torch.Tensor,
+    b_compressed: torch.Tensor,
+    b_metadata: torch.Tensor,
+):
    # For float16 and bfloat16, cutlass_scaled_sparse_mm's output must be the
    # same dtype as its inputs. This line addresses that constraint while
    # arbitrarily using bfloat16 for the int8/fp8 cases.
    out_dtype = torch.float16 if dtype is torch.float16 else torch.bfloat16

-    eye = torch.eye(b.shape[0], device='cuda', dtype=dtype)
-    eye_scale = torch.ones(1, device='cuda', dtype=torch.float32)
-    b_decomp = ops.cutlass_scaled_sparse_mm(eye,
-                                            b_compressed,
-                                            b_metadata,
-                                            eye_scale,
-                                            eye_scale,
-                                            out_dtype=out_dtype)
+    eye = torch.eye(b.shape[0], device="cuda", dtype=dtype)
+    eye_scale = torch.ones(1, device="cuda", dtype=torch.float32)
+    b_decomp = ops.cutlass_scaled_sparse_mm(
+        eye, b_compressed, b_metadata, eye_scale, eye_scale, out_dtype=out_dtype
+    )

    torch.testing.assert_close(b.to(dtype=out_dtype), b_decomp)


 def make_rand_sparse_tensors(
-        dtype: torch.dtype, m: int, n: int, k: int
+    dtype: torch.dtype, m: int, n: int, k: int
 ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
-    a = torch.randn((m, k), device='cuda')
-    b = torch.randn((n, k), device='cuda').t()
+    a = torch.randn((m, k), device="cuda")
+    b = torch.randn((n, k), device="cuda").t()

    if dtype == torch.int8:
        # ensure A and B aren't all zeros after rounding
@@ -107,32 +103,25 @@ def make_rand_sparse_tensors(
    return b_compressed, e, a, b


-@pytest.mark.skipif(not sparse_cutlass_supported(),
-                    reason="Sparse CUTLASS is not supported on this GPU type.")
+@pytest.mark.skipif(
+    not sparse_cutlass_supported(),
+    reason="Sparse CUTLASS is not supported on this GPU type.",
+)
 # Test working with a subset of A and B for sparse matmul
 def test_cutlass_sparse_subset():
-
    big_m = 1024
    m, n, k = 512, 512, 512

    # Create tensors
-    b_comp, e, whole_a, b = make_rand_sparse_tensors(torch.float8_e4m3fn,
-                                                     big_m, n, k)
+    b_comp, e, whole_a, b = make_rand_sparse_tensors(torch.float8_e4m3fn, big_m, n, k)
    a = whole_a[0:m, 0:k]
    scale_a = torch.randn((1, 1), device="cuda", dtype=torch.float32) / 10
    scale_b = torch.randn((1, 1), device="cuda", dtype=torch.float32) / 10

-    out = ops.cutlass_scaled_sparse_mm(a,
-                                       b_comp,
-                                       e,
-                                       scale_a,
-                                       scale_b,
-                                       out_dtype=torch.bfloat16)
-    baseline = baseline_scaled_mm(a,
-                                  b,
-                                  scale_a,
-                                  scale_b,
-                                  out_dtype=torch.bfloat16)
+    out = ops.cutlass_scaled_sparse_mm(
+        a, b_comp, e, scale_a, scale_b, out_dtype=torch.bfloat16
+    )
+    baseline = baseline_scaled_mm(a, b, scale_a, scale_b, out_dtype=torch.bfloat16)

    torch.testing.assert_close(out, baseline, rtol=1e-1, atol=1e0)

@@ -161,105 +150,87 @@ MNK_FACTORS = [


 # Test working with a subset of A and B for sparse matmul
-@pytest.mark.skipif(not sparse_cutlass_supported(),
-                    reason="Sparse CUTLASS is not supported on this GPU type.")
+@pytest.mark.skipif(
+    not sparse_cutlass_supported(),
+    reason="Sparse CUTLASS is not supported on this GPU type.",
+)
@pytest.mark.parametrize("m, n, k", MNK_FACTORS)
@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16])
@pytest.mark.parametrize("use_bias", [True, False])
-def test_cutlass_sparse_gemm(m: int, k: int, n: int, dtype: type[torch.dtype],
-                             use_bias: bool):
-
+def test_cutlass_sparse_gemm(
+    m: int, k: int, n: int, dtype: type[torch.dtype], use_bias: bool
+):
    # Create tensors
    b_comp, e, a, b = make_rand_sparse_tensors(dtype, m, n, k)
    scale_a = torch.ones((1, 1), device="cuda", dtype=torch.float32)
    scale_b = torch.ones((1, 1), device="cuda", dtype=torch.float32)

-    bias = torch.rand((n, ), device="cuda", dtype=dtype) if use_bias else None
+    bias = torch.rand((n,), device="cuda", dtype=dtype) if use_bias else None

-    out = ops.cutlass_scaled_sparse_mm(a,
-                                       b_comp,
-                                       e,
-                                       scale_a,
-                                       scale_b,
-                                       out_dtype=dtype,
-                                       bias=bias)
+    out = ops.cutlass_scaled_sparse_mm(
+        a, b_comp, e, scale_a, scale_b, out_dtype=dtype, bias=bias
+    )

-    baseline = baseline_scaled_mm(a,
-                                  b,
-                                  scale_a,
-                                  scale_b,
-                                  out_dtype=dtype,
-                                  bias=bias)
+    baseline = baseline_scaled_mm(a, b, scale_a, scale_b, out_dtype=dtype, bias=bias)

    torch.testing.assert_close(out, baseline, rtol=1e-2, atol=3e-1)


-@pytest.mark.skipif(not sparse_cutlass_supported(),
-                    reason="Sparse CUTLASS is not supported on this GPU type.")
+@pytest.mark.skipif(
+    not sparse_cutlass_supported(),
+    reason="Sparse CUTLASS is not supported on this GPU type.",
+)
@pytest.mark.parametrize("m, k, n", MNK_FACTORS)
-@pytest.mark.skipif(not current_platform.has_device_capability(89),
-                    reason="FP8 is not supported on this GPU type.")
+@pytest.mark.skipif(
+    not current_platform.has_device_capability(89),
+    reason="FP8 is not supported on this GPU type.",
+)
@pytest.mark.parametrize("use_bias", [True, False])
 def test_cutlass_sparse_fp8_gemm(m: int, n: int, k: int, use_bias: bool):
-
    # Create tensors
    b_comp, e, a, b = make_rand_sparse_tensors(torch.float8_e4m3fn, m, n, k)
-    scale_a = (torch.randn((1, 1), device="cuda", dtype=torch.float32))
-    scale_b = (torch.randn((1, 1), device="cuda", dtype=torch.float32))
+    scale_a = torch.randn((1, 1), device="cuda", dtype=torch.float32)
+    scale_b = torch.randn((1, 1), device="cuda", dtype=torch.float32)
    out_dtype = torch.bfloat16

-    bias = torch.rand(
-        (n, ), device="cuda", dtype=out_dtype) * 10 if use_bias else None
+    bias = torch.rand((n,), device="cuda", dtype=out_dtype) * 10 if use_bias else None

-    out = ops.cutlass_scaled_sparse_mm(a,
-                                       b_comp,
-                                       e,
-                                       scale_a,
-                                       scale_b,
-                                       out_dtype=out_dtype,
-                                       bias=bias)
+    out = ops.cutlass_scaled_sparse_mm(
+        a, b_comp, e, scale_a, scale_b, out_dtype=out_dtype, bias=bias
+    )

-    baseline = baseline_scaled_mm(a,
-                                  b,
-                                  scale_a,
-                                  scale_b,
-                                  out_dtype=out_dtype,
-                                  bias=bias)
+    baseline = baseline_scaled_mm(
+        a, b, scale_a, scale_b, out_dtype=out_dtype, bias=bias
+    )

    torch.testing.assert_close(out, baseline, rtol=1e-2, atol=3e-1)


-@pytest.mark.skipif(not sparse_cutlass_supported(),
-                    reason="Sparse CUTLASS is not supported on this GPU type.")
+@pytest.mark.skipif(
+    not sparse_cutlass_supported(),
+    reason="Sparse CUTLASS is not supported on this GPU type.",
+)
@pytest.mark.parametrize("m,k,n", MNK_FACTORS)
@pytest.mark.parametrize("per_act_token", [True, False])
@pytest.mark.parametrize("per_out_ch", [True, False])
@pytest.mark.parametrize("use_bias", [True, False])
-def test_cutlass_sparse_int8_gemm(m: int, n: int, k: int, per_act_token: bool,
-                                  per_out_ch: bool, use_bias: bool):
-
+def test_cutlass_sparse_int8_gemm(
+    m: int, n: int, k: int, per_act_token: bool, per_out_ch: bool, use_bias: bool
+):
    # Create tensors
    b_comp, e, a, b = make_rand_sparse_tensors(torch.int8, m, n, k)
-    scale_a = (torch.randn((1, 1), device="cuda", dtype=torch.float32))
-    scale_b = (torch.randn((1, 1), device="cuda", dtype=torch.float32))
+    scale_a = torch.randn((1, 1), device="cuda", dtype=torch.float32)
+    scale_b = torch.randn((1, 1), device="cuda", dtype=torch.float32)
    out_dtype = torch.bfloat16

-    bias = torch.rand(
-        (n, ), device="cuda", dtype=out_dtype) * 10 if use_bias else None
+    bias = torch.rand((n,), device="cuda", dtype=out_dtype) * 10 if use_bias else None

-    out = ops.cutlass_scaled_sparse_mm(a,
-                                       b_comp,
-                                       e,
-                                       scale_a,
-                                       scale_b,
-                                       out_dtype=out_dtype,
-                                       bias=bias)
+    out = ops.cutlass_scaled_sparse_mm(
+        a, b_comp, e, scale_a, scale_b, out_dtype=out_dtype, bias=bias
+    )

-    baseline = baseline_scaled_mm(a,
-                                  b,
-                                  scale_a,
-                                  scale_b,
-                                  out_dtype=out_dtype,
-                                  bias=bias)
+    baseline = baseline_scaled_mm(
+        a, b, scale_a, scale_b, out_dtype=out_dtype, bias=bias
+    )

    torch.testing.assert_close(out, baseline, rtol=1e0, atol=2e0)