Convert formatting to use ruff instead of yapf + isort (#26247)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Commit: d6953beb91
Parent: 17edd8a807
Author: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: 2025-10-05 15:06:22 +01:00
Committed by: GitHub
1508 changed files with 115244 additions and 94146 deletions
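
The hunks below are representative of the mechanical change: ruff format (which follows Black's style) joins yapf's paren-aligned continuations onto one line when they fit the line-length limit, and honors the "magic trailing comma" by keeping a call or import exploded one item per line. A minimal sketch of that rule, with hypothetical names (not code from this commit):

def f(a, b):
    return a + b

alpha, beta = 1, 2

# No trailing comma: ruff format joins the call onto one line when it
# fits within the configured line length.
result = f(alpha, beta)

# A trailing comma after the last argument is "magic": ruff format
# keeps the one-item-per-line form, which is why some imports below
# stay multi-line while others collapse.
result = f(
    alpha,
    beta,
)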


@@ -17,20 +17,21 @@ if not has_triton_kernels():
 import triton_kernels.swiglu
 from triton_kernels.matmul_ogs import FlexCtx, PrecisionConfig
 from triton_kernels.numerics import InFlexData
-from triton_kernels.numerics_details.mxfp import (downcast_to_mxfp,
-                                                  upcast_from_mxfp)
+from triton_kernels.numerics_details.mxfp import downcast_to_mxfp, upcast_from_mxfp
 from triton_kernels.tensor import FP4, convert_layout, wrap_torch_tensor
 from triton_kernels.tensor_details import layout
 from triton_kernels.testing import assert_close
 
 from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
 from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
-    BatchedPrepareAndFinalize)
+    BatchedPrepareAndFinalize,
+)
 from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
 from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import (
-    BatchedOAITritonExperts, triton_kernel_moe_forward)
-from vllm.model_executor.layers.fused_moe.modular_kernel import (
-    FusedMoEModularKernel)
+    BatchedOAITritonExperts,
+    triton_kernel_moe_forward,
+)
+from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularKernel
 from vllm.model_executor.layers.utils import shuffle_weight
 from vllm.utils import round_up
@@ -46,13 +47,11 @@ def deshuffle(w: torch.Tensor):
 def init_compute_data(M, K, N, E, a_dtype: str, w_dtype: str, num_warps: int):
     randbits = [torch.randperm(E) for _ in range(M)]
     x_list = [
-        (-1)**i *
-        ((16384 +
-          ((i * 512) % 4096) + bits).to(torch.int16).view(torch.bfloat16))
+        (-1) ** i
+        * ((16384 + ((i * 512) % 4096) + bits).to(torch.int16).view(torch.bfloat16))
         for i, bits in enumerate(randbits)
     ]
-    exp_data = torch.stack(x_list).to(
-        device="cuda")  # simulating gate_output (M, E)
+    exp_data = torch.stack(x_list).to(device="cuda")  # simulating gate_output (M, E)
 
     # create input tensor
     x = torch.randn((M, K), dtype=torch.bfloat16, device="cuda")
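
The x_list construction above manufactures pseudo-random bfloat16 gate logits by assembling int16 bit patterns and reinterpreting them. A minimal sketch of the bit-reinterpretation idiom (illustrative values, runs on CPU):

import torch

# 16384 == 0x4000: as a bfloat16 bit pattern this is sign=0,
# exponent=128, mantissa=0, i.e. exactly 2.0.
base = torch.tensor([16384], dtype=torch.int16)
print(base.view(torch.bfloat16))  # tensor([2.], dtype=torch.bfloat16)

# Offsets in [0, 4096) only touch the mantissa and low exponent bits,
# so the resulting floats are spread out but stay finite (the exponent
# field never reaches the all-ones NaN/Inf encoding).
print(torch.tensor([16384 + 512], dtype=torch.int16).view(torch.bfloat16))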
@@ -120,20 +119,21 @@ def init_compute_data(M, K, N, E, a_dtype: str, w_dtype: str, num_warps: int):
         value=0,
     )
 
-    w1_bias_tri = F.pad(w1_bias_tri, (0, w1_right_pad, 0, 0),
-                        mode="constant",
-                        value=0)
-    w2_bias_tri = F.pad(w2_bias_tri, (0, w2_right_pad, 0, 0),
-                        mode="constant",
-                        value=0)
+    w1_bias_tri = F.pad(
+        w1_bias_tri, (0, w1_right_pad, 0, 0), mode="constant", value=0
+    )
+    w2_bias_tri = F.pad(
+        w2_bias_tri, (0, w2_right_pad, 0, 0), mode="constant", value=0
+    )
 
     x_tri = F.pad(x_tri, (0, x_pad, 0, 0), mode="constant", value=0)
 
-    w_layout, w_layout_opts = layout.make_default_matmul_mxfp4_w_layout(
-        mx_axis=1)
+    w_layout, w_layout_opts = layout.make_default_matmul_mxfp4_w_layout(mx_axis=1)
     w_scale_layout, w_scale_layout_opts = (
         layout.make_default_matmul_mxfp4_w_scale_layout(
-            mx_axis=1, num_warps=num_warps))
+            mx_axis=1, num_warps=num_warps
+        )
+    )
 
     w1_tri, w1_scale_tri = downcast_to_mxfp(w1_tri, torch.uint8, axis=1)
     w1 = upcast_from_mxfp(w1_tri, w1_scale_tri, torch.bfloat16, axis=1)
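
Note that F.pad's tuple argument pads the last dimension first: (left, right) for the final dim, then the next-to-last pair, and so on. A quick sanity check of the convention used above:

import torch
import torch.nn.functional as F

x = torch.ones(2, 3)
# (0, 2, 0, 0): pad the last dim by 2 on the right and leave the
# second-to-last dim alone, the same shape trick as the bias padding.
y = F.pad(x, (0, 2, 0, 0), mode="constant", value=0)
print(y.shape)  # torch.Size([2, 5])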
@@ -141,29 +141,33 @@ def init_compute_data(M, K, N, E, a_dtype: str, w_dtype: str, num_warps: int):
     w2_tri, w2_scale_tri = downcast_to_mxfp(w2_tri, torch.uint8, axis=1)
     w2 = upcast_from_mxfp(w2_tri, w2_scale_tri, torch.bfloat16, axis=1)
 
-    w1_tri = convert_layout(wrap_torch_tensor(w1_tri, FP4), w_layout,
-                            **w_layout_opts)
+    w1_tri = convert_layout(
+        wrap_torch_tensor(w1_tri, FP4), w_layout, **w_layout_opts
+    )
     w1_scale_tri = convert_layout(
         wrap_torch_tensor(w1_scale_tri),
         w_scale_layout,
         **w_scale_layout_opts,
     )
 
-    w2_tri = convert_layout(wrap_torch_tensor(w2_tri, FP4), w_layout,
-                            **w_layout_opts)
+    w2_tri = convert_layout(
+        wrap_torch_tensor(w2_tri, FP4), w_layout, **w_layout_opts
+    )
     w2_scale_tri = convert_layout(
         wrap_torch_tensor(w2_scale_tri),
         w_scale_layout,
         **w_scale_layout_opts,
     )
 
-    pc1 = PrecisionConfig(weight_scale=w1_scale_tri,
-                          flex_ctx=FlexCtx(rhs_data=InFlexData()))
-    pc2 = PrecisionConfig(weight_scale=w2_scale_tri,
-                          flex_ctx=FlexCtx(rhs_data=InFlexData()))
+    pc1 = PrecisionConfig(
+        weight_scale=w1_scale_tri, flex_ctx=FlexCtx(rhs_data=InFlexData())
+    )
+    pc2 = PrecisionConfig(
+        weight_scale=w2_scale_tri, flex_ctx=FlexCtx(rhs_data=InFlexData())
+    )
 
     # truncate so the rest can run properly
-    w1 = w1[..., :K, :2 * N]
+    w1 = w1[..., :K, : 2 * N]
     w2 = w2[..., :N, :K]
 
     w1 = deshuffle(w1)
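
The quantize-then-dequantize dance above exists so the bf16 reference path operates on the same lossy MXFP4 values as the Triton path; the later comparison then measures kernel error rather than quantization error. A sketch of the round trip, using only the call shapes already shown in this hunk (assumes triton_kernels and a CUDA device; tensor sizes are illustrative):

import torch
from triton_kernels.numerics_details.mxfp import downcast_to_mxfp, upcast_from_mxfp

w = torch.randn(8, 64, 64, dtype=torch.bfloat16, device="cuda")

# Quantize along axis=1 to packed MXFP4 payloads plus scale factors...
w_q, w_scale = downcast_to_mxfp(w, torch.uint8, axis=1)

# ...then dequantize, so the reference sees exactly what the kernel sees.
w_ref = upcast_from_mxfp(w_q, w_scale, torch.bfloat16, axis=1)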
@@ -261,7 +265,8 @@ class Case:
 @pytest.mark.parametrize(
     ", ".join(f.name for f in fields(Case)),
     [
-        tuple(getattr(case, f.name) for f in fields(Case)) for case in [
+        tuple(getattr(case, f.name) for f in fields(Case))
+        for case in [
             # Case(a_dtype="bf16", w_dtype="bf16"),
             # Case(a_dtype="fp8_e4m3", w_dtype="fp8_e5m2"),
             Case(a_dtype="bf16", w_dtype="mx4")
@@ -321,10 +326,7 @@ def test_equiv(num_token, a_dtype, w_dtype, tp):
         gating_output=exp_data,
         topk=topk,
     )
-    assert_close(ref=out_ref,
-                 tri=out_triton_monolithic,
-                 maxtol=0.025,
-                 rmstol=0.005)
+    assert_close(ref=out_ref, tri=out_triton_monolithic, maxtol=0.025, rmstol=0.005)
 
 
 def batched_moe(
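
assert_close here is triton_kernels.testing's helper, not torch.testing.assert_close; judging only from the keyword names, maxtol appears to bound the worst-case elementwise error and rmstol the root-mean-square error. A rough stand-in under that assumption (hypothetical helper, not the library's implementation):

import torch


def rough_assert_close(
    ref: torch.Tensor, tri: torch.Tensor, maxtol: float, rmstol: float
) -> None:
    # Assumed contract: bound the max and RMS relative error vs. ref.
    err = (ref.float() - tri.float()).abs() / ref.abs().clamp_min(1e-6).float()
    assert err.max().item() <= maxtol
    assert err.pow(2).mean().sqrt().item() <= rmstol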
@@ -376,7 +378,8 @@ def batched_moe(
 @pytest.mark.parametrize(
     ", ".join(f.name for f in fields(Case)),
     [
-        tuple(getattr(case, f.name) for f in fields(Case)) for case in [
+        tuple(getattr(case, f.name) for f in fields(Case))
+        for case in [
             # Case(a_dtype="bf16", w_dtype="bf16"),
             # Case(a_dtype="fp8_e4m3", w_dtype="fp8_e5m2"),
             Case(a_dtype="bf16", w_dtype="mx4")