Convert formatting to use ruff instead of yapf + isort (#26247)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-05 15:06:22 +01:00
parent 17edd8a807
commit d6953beb91
1508 changed files with 115244 additions and 94146 deletions
--- a/tests/kernels/core/test_activation.py
+++ b/tests/kernels/core/test_activation.py
@@ -8,19 +8,23 @@ import torch

 from tests.kernels.allclose_default import get_default_atol, get_default_rtol
 from tests.kernels.utils import opcheck
-from vllm.model_executor.layers.activation import (FastGELU, FatreluAndMul,
-                                                   GeluAndMul, MulAndSilu,
-                                                   NewGELU, QuickGELU,
-                                                   SiluAndMul, SwigluOAIAndMul)
+from vllm.model_executor.layers.activation import (
+    FastGELU,
+    FatreluAndMul,
+    GeluAndMul,
+    MulAndSilu,
+    NewGELU,
+    QuickGELU,
+    SiluAndMul,
+    SwigluOAIAndMul,
+)
 from vllm.platforms import current_platform

 DTYPES = [torch.half, torch.bfloat16, torch.float]
 NUM_TOKENS = [7, 83, 2048]  # Arbitrary values for testing
 D = [512, 13824]  # Arbitrary values for testing
 SEEDS = [0]
-CUDA_DEVICES = [
-    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
-]
+CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)]


@pytest.mark.parametrize(
@@ -73,24 +77,19 @@ def test_act_and_mul(
    out = layer(x)
    ref_out = layer.forward_native(x)
    if activation == "swigluoai_and_mul":
-
        rtol = {
-            #For fp16, change the relative tolerance from 1e-3 to 2e-3
-            torch.float16:
-            2e-3,
-            torch.bfloat16:
-            2e-2,
-            torch.float:
-            1.3e-6
+            # For fp16, change the relative tolerance from 1e-3 to 2e-3
+            torch.float16: 2e-3,
+            torch.bfloat16: 2e-2,
+            torch.float: 1.3e-6,
        }

        def _get_rtol(output) -> float:
            return rtol[output.dtype]

-        torch.testing.assert_close(out,
-                                   ref_out,
-                                   atol=get_default_atol(out),
-                                   rtol=_get_rtol(out))
+        torch.testing.assert_close(
+            out, ref_out, atol=get_default_atol(out), rtol=_get_rtol(out)
+        )
    else:
        # The SiluAndMul, MulAndSilu, GELU and FatReLU implementations are
        # equivalent to the native PyTorch implementations, so we can do exact
@@ -98,7 +97,7 @@ def test_act_and_mul(
        torch.testing.assert_close(out, ref_out, atol=0.0, rtol=0.0)

    d = x.shape[-1] // 2
-    output_shape = (x.shape[:-1] + (d, ))
+    output_shape = x.shape[:-1] + (d,)
    out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
    if activation == "fatrelu":
        opcheck(fn, (out, x, threshold))
@@ -108,9 +107,14 @@ def test_act_and_mul(
        opcheck(fn, (out, x))


-@pytest.mark.parametrize("activation", [(FastGELU, torch.ops._C.gelu_fast),
-                                        (NewGELU, torch.ops._C.gelu_new),
-                                        (QuickGELU, torch.ops._C.gelu_quick)])
+@pytest.mark.parametrize(
+    "activation",
+    [
+        (FastGELU, torch.ops._C.gelu_fast),
+        (NewGELU, torch.ops._C.gelu_new),
+        (QuickGELU, torch.ops._C.gelu_quick),
+    ],
+)
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("d", D)
@pytest.mark.parametrize("dtype", DTYPES)
@@ -132,10 +136,9 @@ def test_activation(
    fn = activation[1]
    out = layer(x)
    ref_out = layer.forward_native(x)
-    torch.testing.assert_close(out,
-                               ref_out,
-                               atol=get_default_atol(out),
-                               rtol=get_default_rtol(out))
+    torch.testing.assert_close(
+        out, ref_out, atol=get_default_atol(out), rtol=get_default_rtol(out)
+    )

    out = torch.empty_like(x)
    opcheck(fn, (out, x))
--- a/tests/kernels/core/test_fused_quant_layernorm.py
+++ b/tests/kernels/core/test_fused_quant_layernorm.py
@@ -24,9 +24,7 @@ NUM_TOKENS_HIDDEN_SIZES = [
 ADD_RESIDUAL = [False, True]
 SCALE_UBS = [True, False]
 SEEDS = [0]
-CUDA_DEVICES = [
-    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
-]
+CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)]

 EPS = 1e-6

@@ -34,13 +32,12 @@ EPS = 1e-6


 def as_float32_tensor(x: Union[float, torch.tensor]) -> torch.tensor:
-    return torch.as_tensor(x, dtype=torch.float32, device='cuda')
+    return torch.as_tensor(x, dtype=torch.float32, device="cuda")


-def ref_rms_norm(rms_norm_layer: RMSNorm,
-                 x: torch.Tensor,
-                 residual: Optional[torch.Tensor]) \
-        -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+def ref_rms_norm(
+    rms_norm_layer: RMSNorm, x: torch.Tensor, residual: Optional[torch.Tensor]
+) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
    if residual is not None:
        residual = residual.clone()
        out, residual = rms_norm_layer.forward_native(x, residual)
@@ -50,12 +47,13 @@ def ref_rms_norm(rms_norm_layer: RMSNorm,
    return out, residual


-def ref_dynamic_per_token_quant(rms_norm_layer: RMSNorm,
-                                x: torch.Tensor,
-                                quant_dtype: torch.dtype,
-                                residual: Optional[torch.Tensor],
-                                scale_ub: Optional[torch.Tensor]) \
-        -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
+def ref_dynamic_per_token_quant(
+    rms_norm_layer: RMSNorm,
+    x: torch.Tensor,
+    quant_dtype: torch.dtype,
+    residual: Optional[torch.Tensor],
+    scale_ub: Optional[torch.Tensor],
+) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
    if scale_ub is not None:
        assert quant_dtype == torch.float8_e4m3fn

@@ -64,9 +62,9 @@ def ref_dynamic_per_token_quant(rms_norm_layer: RMSNorm,

    # Quant
    if quant_dtype == torch.float8_e4m3fn:
-        torch_out, scales = ops.scaled_fp8_quant(torch_out,
-                                                 scale_ub=scale_ub,
-                                                 use_per_token_if_dynamic=True)
+        torch_out, scales = ops.scaled_fp8_quant(
+            torch_out, scale_ub=scale_ub, use_per_token_if_dynamic=True
+        )
    else:
        assert quant_dtype == torch.int8
        torch_out, scales = ops.scaled_int8_quant(torch_out)
@@ -74,38 +72,41 @@ def ref_dynamic_per_token_quant(rms_norm_layer: RMSNorm,
    return torch_out, scales, residual


-def ref_impl(rms_norm_layer: RMSNorm,
-             x: torch.Tensor,
-             quant_dtype: torch.dtype,
-             residual: Optional[torch.Tensor],
-             scale_ub: Optional[torch.Tensor]) \
-        -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
-    return ref_dynamic_per_token_quant(rms_norm_layer, x, quant_dtype,
-                                       residual, scale_ub)
+def ref_impl(
+    rms_norm_layer: RMSNorm,
+    x: torch.Tensor,
+    quant_dtype: torch.dtype,
+    residual: Optional[torch.Tensor],
+    scale_ub: Optional[torch.Tensor],
+) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
+    return ref_dynamic_per_token_quant(
+        rms_norm_layer, x, quant_dtype, residual, scale_ub
+    )


-def ops_dynamic_per_token_quant(weight: torch.Tensor,
-                                x: torch.Tensor,
-                                quant_dtype: torch.dtype,
-                                residual: Optional[torch.Tensor],
-                                scale_ub: Optional[torch.Tensor]) \
-        -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
+def ops_dynamic_per_token_quant(
+    weight: torch.Tensor,
+    x: torch.Tensor,
+    quant_dtype: torch.dtype,
+    residual: Optional[torch.Tensor],
+    scale_ub: Optional[torch.Tensor],
+) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
    if residual is not None:
        residual = residual.clone()
-    out, scales = ops.rms_norm_dynamic_per_token_quant(x, weight, EPS,
-                                                       quant_dtype, scale_ub,
-                                                       residual)
+    out, scales = ops.rms_norm_dynamic_per_token_quant(
+        x, weight, EPS, quant_dtype, scale_ub, residual
+    )
    return out, scales, residual


-def ops_impl(weight: torch.Tensor,
-             x: torch.Tensor,
-             quant_dtype: torch.dtype,
-             residual: Optional[torch.Tensor],
-             scale_ub: Optional[torch.Tensor]) \
-        -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
-    return ops_dynamic_per_token_quant(weight, x, quant_dtype, residual,
-                                       scale_ub)
+def ops_impl(
+    weight: torch.Tensor,
+    x: torch.Tensor,
+    quant_dtype: torch.dtype,
+    residual: Optional[torch.Tensor],
+    scale_ub: Optional[torch.Tensor],
+) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
+    return ops_dynamic_per_token_quant(weight, x, quant_dtype, residual, scale_ub)


@pytest.mark.parametrize("num_tokens, hidden_size", NUM_TOKENS_HIDDEN_SIZES)
@@ -146,12 +147,14 @@ def test_rms_norm(
    residual = torch.randn_like(x) * scale if add_residual else None
    if scale_ub is not None:
        rms_x, _ = ref_rms_norm(layer, x, residual)
-        scale_ub = torch.mean(rms_x).to(dtype=torch.float32, device='cuda')
+        scale_ub = torch.mean(rms_x).to(dtype=torch.float32, device="cuda")

-    ref_out, ref_scales, ref_residual = \
-        ref_impl(layer, x, quant_dtype, residual, scale_ub)
-    ops_out, ops_scales, ops_residual = \
-        ops_impl(layer.weight, x, quant_dtype, residual, scale_ub)
+    ref_out, ref_scales, ref_residual = ref_impl(
+        layer, x, quant_dtype, residual, scale_ub
+    )
+    ops_out, ops_scales, ops_residual = ops_impl(
+        layer.weight, x, quant_dtype, residual, scale_ub
+    )

    assert ref_out.dtype == quant_dtype
    assert ops_out.dtype == quant_dtype
@@ -160,15 +163,18 @@ def test_rms_norm(
        # big atol to account for round-off errors.
        assert torch.allclose(ref_out, ops_out, atol=1)
    else:
-        assert torch.allclose(ref_out.to(dtype=torch.float32),
-                              ops_out.to(dtype=torch.float32))
+        assert torch.allclose(
+            ref_out.to(dtype=torch.float32), ops_out.to(dtype=torch.float32)
+        )
    if add_residual:
        assert torch.allclose(ref_residual, ops_residual)

    output = torch.empty_like(x, dtype=quant_dtype)
-    scales = torch.empty((x.numel() // x.shape[-1], 1),
-                         device=x.device,
-                         dtype=torch.float32)
+    scales = torch.empty(
+        (x.numel() // x.shape[-1], 1), device=x.device, dtype=torch.float32
+    )

-    opcheck(torch.ops._C.rms_norm_dynamic_per_token_quant,
-            (output, x, layer.weight, scales, 1e-5, scale_ub, residual))
+    opcheck(
+        torch.ops._C.rms_norm_dynamic_per_token_quant,
+        (output, x, layer.weight, scales, 1e-5, scale_ub, residual),
+    )
--- a/tests/kernels/core/test_layernorm.py
+++ b/tests/kernels/core/test_layernorm.py
@@ -11,13 +11,22 @@ from vllm.platforms import current_platform

 DTYPES = [torch.half, torch.bfloat16, torch.float]
 NUM_TOKENS = [7, 83, 4096]  # Arbitrary values for testing
-HIDDEN_SIZES = [8, 768, 769, 770, 771, 5120, 5124, 5125, 5126, 8192,
-                8199]  # Arbitrary values for testing
+HIDDEN_SIZES = [
+    8,
+    768,
+    769,
+    770,
+    771,
+    5120,
+    5124,
+    5125,
+    5126,
+    8192,
+    8199,
+]  # Arbitrary values for testing
 ADD_RESIDUAL = [False, True]
 SEEDS = [0]
-CUDA_DEVICES = [
-    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
-]
+CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)]


@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@@ -63,11 +72,14 @@ def test_rms_norm(
        torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2)

    if residual is not None:
-        opcheck(torch.ops._C.fused_add_rms_norm,
-                (x, residual, layer.weight.data, layer.variance_epsilon))
+        opcheck(
+            torch.ops._C.fused_add_rms_norm,
+            (x, residual, layer.weight.data, layer.variance_epsilon),
+        )
    else:
-        opcheck(torch.ops._C.rms_norm,
-                (out, x, layer.weight.data, layer.variance_epsilon))
+        opcheck(
+            torch.ops._C.rms_norm, (out, x, layer.weight.data, layer.variance_epsilon)
+        )


@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@@ -98,7 +110,8 @@ def test_poly_norm(

    opcheck(
        torch.ops._C.poly_norm,
-        (out, x, layer.weight.data, layer.bias.data, layer.variance_epsilon))
+        (out, x, layer.weight.data, layer.bias.data, layer.variance_epsilon),
+    )


@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@@ -144,7 +157,8 @@ def test_fused_rms_norm_quant(

    if add_residual:
        torch.ops._C.fused_add_rms_norm_static_fp8_quant(
-            out_quant_fused, x, residual_fused, weight, quant_scale_t, 1e-6)
+            out_quant_fused, x, residual_fused, weight, quant_scale_t, 1e-6
+        )

        # Unfused kernel is in-place so it goes second
        # Also use a separate clone of x to avoid modifying the input
@@ -152,29 +166,32 @@ def test_fused_rms_norm_quant(
        x_unfused = x_unfused_base[..., :hidden_size]
        assert x_unfused.is_contiguous() != strided_input
        torch.ops._C.fused_add_rms_norm(x_unfused, residual, weight, 1e-6)
-        torch.ops._C.static_scaled_fp8_quant(out_quant, x_unfused.contiguous(),
-                                             quant_scale_t)
+        torch.ops._C.static_scaled_fp8_quant(
+            out_quant, x_unfused.contiguous(), quant_scale_t
+        )

        torch.cuda.synchronize()
-        torch.testing.assert_close(residual_fused,
-                                   residual,
-                                   atol=1e-2,
-                                   rtol=1e-2)
+        torch.testing.assert_close(residual_fused, residual, atol=1e-2, rtol=1e-2)
        opcheck(
            torch.ops._C.fused_add_rms_norm_static_fp8_quant,
-            (out_quant_fused, x, residual_fused, weight, quant_scale_t, 1e-6))
+            (out_quant_fused, x, residual_fused, weight, quant_scale_t, 1e-6),
+        )
    else:
-        torch.ops._C.rms_norm_static_fp8_quant(out_quant_fused, x, weight,
-                                               quant_scale_t, 1e-6)
+        torch.ops._C.rms_norm_static_fp8_quant(
+            out_quant_fused, x, weight, quant_scale_t, 1e-6
+        )

        torch.ops._C.rms_norm(out_norm, x, weight, 1e-6)
-        torch.ops._C.static_scaled_fp8_quant(out_quant, out_norm,
-                                             quant_scale_t)
+        torch.ops._C.static_scaled_fp8_quant(out_quant, out_norm, quant_scale_t)

-        opcheck(torch.ops._C.rms_norm_static_fp8_quant,
-                (out_quant_fused, x, weight, quant_scale_t, 1e-6))
+        opcheck(
+            torch.ops._C.rms_norm_static_fp8_quant,
+            (out_quant_fused, x, weight, quant_scale_t, 1e-6),
+        )

-    torch.testing.assert_close(out_quant.to(dtype=torch.float32),
-                               out_quant_fused.to(dtype=torch.float32),
-                               atol=1e-3,
-                               rtol=1e-3)
+    torch.testing.assert_close(
+        out_quant.to(dtype=torch.float32),
+        out_quant_fused.to(dtype=torch.float32),
+        atol=1e-3,
+        rtol=1e-3,
+    )
--- a/tests/kernels/core/test_mrope.py
+++ b/tests/kernels/core/test_mrope.py
@@ -14,25 +14,25 @@ from vllm.platforms import current_platform
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


-def generate_test_data(num_tokens: int, num_q_heads: int, num_kv_heads: int,
-                       head_size: int, max_position_embeddings: int,
-                       dtype: torch.dtype, device: torch.device):
+def generate_test_data(
+    num_tokens: int,
+    num_q_heads: int,
+    num_kv_heads: int,
+    head_size: int,
+    max_position_embeddings: int,
+    dtype: torch.dtype,
+    device: torch.device,
+):
    """Generate test data for given configuration."""
    current_platform.seed_everything(42)
    # Create 2D positions (3, num_tokens) for multimodal case
-    positions = torch.randint(0,
-                              max_position_embeddings // 4, (3, num_tokens),
-                              device=device)
+    positions = torch.randint(
+        0, max_position_embeddings // 4, (3, num_tokens), device=device
+    )

    # Create query and key tensors
-    query = torch.randn(num_tokens,
-                        num_q_heads * head_size,
-                        dtype=dtype,
-                        device=device)
-    key = torch.randn(num_tokens,
-                      num_kv_heads * head_size,
-                      dtype=dtype,
-                      device=device)
+    query = torch.randn(num_tokens, num_q_heads * head_size, dtype=dtype, device=device)
+    key = torch.randn(num_tokens, num_kv_heads * head_size, dtype=dtype, device=device)

    return positions, query, key

@@ -59,7 +59,8 @@ MODELS_TO_TEST = [
                Version(TRANSFORMERS_BASE_VERSION) < Version("4.57.0"),
                reason="Qwen3-VL only available after Transformers v4.57",
            )
-        ]),
+        ],
+    ),
    MRoPETestInfo(
        model_name="Qwen/Qwen3-VL-30B-A3B-Instruct",
        marks=[
@@ -67,24 +68,33 @@ MODELS_TO_TEST = [
                Version(TRANSFORMERS_BASE_VERSION) < Version("4.57.0"),
                reason="Qwen3-VL only available after Transformers v4.57",
            )
-        ]),
+        ],
+    ),
 ]

 num_tokens_list = [11, 8192]


-@pytest.mark.skipif(not current_platform.is_cuda_alike(),
-                    reason="Skipping CUDA/ROCm only tests.")
-@pytest.mark.parametrize("model_info, model_name", [
-    pytest.param(test_config, test_config.model_name, marks=test_config.marks)
-    for test_config in MODELS_TO_TEST
-])
+@pytest.mark.skipif(
+    not current_platform.is_cuda_alike(), reason="Skipping CUDA/ROCm only tests."
+)
+@pytest.mark.parametrize(
+    "model_info, model_name",
+    [
+        pytest.param(test_config, test_config.model_name, marks=test_config.marks)
+        for test_config in MODELS_TO_TEST
+    ],
+)
@pytest.mark.parametrize("tp_size", [1, 2])
@pytest.mark.parametrize("dtype", [torch.bfloat16])
@pytest.mark.parametrize("num_tokens", num_tokens_list)
-def test_mrope(model_name: str, model_info: MRoPETestInfo, tp_size: int,
-               dtype: torch.dtype, num_tokens: int):
-
+def test_mrope(
+    model_name: str,
+    model_info: MRoPETestInfo,
+    tp_size: int,
+    dtype: torch.dtype,
+    num_tokens: int,
+):
    atol = model_info.atol
    rtol = model_info.rtol

@@ -96,8 +106,11 @@ def test_mrope(model_name: str, model_info: MRoPETestInfo, tp_size: int,
    total_num_heads = config.num_attention_heads
    num_heads = total_num_heads // tp_size
    num_kv_heads = max(1, total_num_kv_heads // tp_size)
-    head_dim = (config.head_dim if hasattr(config, "head_dim") else
-                config.hidden_size // total_num_heads)
+    head_dim = (
+        config.head_dim
+        if hasattr(config, "head_dim")
+        else config.hidden_size // total_num_heads
+    )
    is_neox_style = True

    rope_theta = config.rope_theta
@@ -117,9 +130,9 @@ def test_mrope(model_name: str, model_info: MRoPETestInfo, tp_size: int,

    # create q k v input tensors
    # create rotary pos emb input tensors
-    positions, query, key = generate_test_data(num_tokens, num_heads,
-                                               num_kv_heads, head_dim,
-                                               max_position, dtype, device)
+    positions, query, key = generate_test_data(
+        num_tokens, num_heads, num_kv_heads, head_dim, max_position, dtype, device
+    )

    query_native, key_native = mrope_helper_class.forward_native(
        positions,
@@ -137,19 +150,26 @@ def test_mrope(model_name: str, model_info: MRoPETestInfo, tp_size: int,
    torch.testing.assert_close(key_native, key_cuda, atol=atol, rtol=rtol)


-@pytest.mark.skipif(not current_platform.is_cuda_alike(),
-                    reason="Skipping CUDA/ROCm only tests.")
-@pytest.mark.parametrize("model_info, model_name", [
-    pytest.param(test_config, test_config.model_name, marks=test_config.marks)
-    for test_config in MODELS_TO_TEST
-])
+@pytest.mark.skipif(
+    not current_platform.is_cuda_alike(), reason="Skipping CUDA/ROCm only tests."
+)
+@pytest.mark.parametrize(
+    "model_info, model_name",
+    [
+        pytest.param(test_config, test_config.model_name, marks=test_config.marks)
+        for test_config in MODELS_TO_TEST
+    ],
+)
@pytest.mark.parametrize("tp_size", [1, 2])
@pytest.mark.parametrize("dtype", [torch.bfloat16])
@pytest.mark.parametrize("num_tokens", num_tokens_list)
-def test_mrope_torch_compile_tracing(model_name: str,
-                                     model_info: MRoPETestInfo, tp_size: int,
-                                     dtype: torch.dtype, num_tokens: int):
-
+def test_mrope_torch_compile_tracing(
+    model_name: str,
+    model_info: MRoPETestInfo,
+    tp_size: int,
+    dtype: torch.dtype,
+    num_tokens: int,
+):
    atol = model_info.atol
    rtol = model_info.rtol

@@ -161,8 +181,11 @@ def test_mrope_torch_compile_tracing(model_name: str,
    total_num_heads = config.num_attention_heads
    num_heads = total_num_heads // tp_size
    num_kv_heads = max(1, total_num_kv_heads // tp_size)
-    head_dim = (config.head_dim if hasattr(config, "head_dim") else
-                config.hidden_size // total_num_heads)
+    head_dim = (
+        config.head_dim
+        if hasattr(config, "head_dim")
+        else config.hidden_size // total_num_heads
+    )
    is_neox_style = True
    rope_theta = config.rope_theta
    max_position = config.max_position_embeddings
@@ -180,16 +203,16 @@ def test_mrope_torch_compile_tracing(model_name: str,
    ).to(device=device)

    # Generate test data
-    positions, query, key = generate_test_data(num_tokens, num_heads,
-                                               num_kv_heads, head_dim,
-                                               max_position, dtype, device)
+    positions, query, key = generate_test_data(
+        num_tokens, num_heads, num_kv_heads, head_dim, max_position, dtype, device
+    )

    # Create a wrapper that makes the in-place function appear functional
    def functional_forward_cuda(pos, q, k):
        """Wrapper that converts in-place operation to functional style

        CUDA Graph does not support in-place operations.
-        This wrapper creates working copies of the 
+        This wrapper creates working copies of the
        input tensors and modifies them.
        """
        q_work = q.clone()  # Create working copies
@@ -206,11 +229,13 @@ def test_mrope_torch_compile_tracing(model_name: str,
    )

    try:
-        compiled_forward_cuda = torch.compile(functional_forward_cuda,
-                                              fullgraph=True,
-                                              backend="inductor",
-                                              mode="reduce-overhead",
-                                              dynamic=False)
+        compiled_forward_cuda = torch.compile(
+            functional_forward_cuda,
+            fullgraph=True,
+            backend="inductor",
+            mode="reduce-overhead",
+            dynamic=False,
+        )

        # Run compiled version
        query_compiled_cuda, key_compiled_cuda = compiled_forward_cuda(
@@ -225,25 +250,16 @@ def test_mrope_torch_compile_tracing(model_name: str,
        mrope_helper_class.forward_cuda(positions, query_cuda, key_cuda)

        # Verify results
-        torch.testing.assert_close(query_compiled_cuda,
-                                   query_cuda,
-                                   atol=atol,
-                                   rtol=rtol)
-        torch.testing.assert_close(key_compiled_cuda,
-                                   key_cuda,
-                                   atol=atol,
-                                   rtol=rtol)
-        torch.testing.assert_close(query_compiled_cuda,
-                                   query_native,
-                                   atol=atol,
-                                   rtol=rtol)
-        torch.testing.assert_close(key_compiled_cuda,
-                                   key_native,
-                                   atol=atol,
-                                   rtol=rtol)
+        torch.testing.assert_close(
+            query_compiled_cuda, query_cuda, atol=atol, rtol=rtol
+        )
+        torch.testing.assert_close(key_compiled_cuda, key_cuda, atol=atol, rtol=rtol)
+        torch.testing.assert_close(
+            query_compiled_cuda, query_native, atol=atol, rtol=rtol
+        )
+        torch.testing.assert_close(key_compiled_cuda, key_native, atol=atol, rtol=rtol)

        print("✓ forward_cuda successfully traced with torch.compile inductor")

    except Exception as e:
-        pytest.fail(
-            f"forward_cuda failed to trace with torch.compile inductor: {e}")
+        pytest.fail(f"forward_cuda failed to trace with torch.compile inductor: {e}")
--- a/tests/kernels/core/test_permute_cols.py
+++ b/tests/kernels/core/test_permute_cols.py
@@ -8,11 +8,11 @@ from tests.kernels.utils import opcheck
 from vllm._custom_ops import permute_cols


-@pytest.mark.parametrize('shape', [(1, 512), (544, 4096), (67, 8192)])
-@pytest.mark.parametrize('dtype', [torch.bfloat16, torch.float16])
+@pytest.mark.parametrize("shape", [(1, 512), (544, 4096), (67, 8192)])
+@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16])
 def test_permute_cols(shape, dtype):
    x = torch.randn(shape, dtype=dtype).cuda()
    perm = torch.randperm(x.shape[1]).to(torch.int).cuda()
    opcheck(torch.ops._C.permute_cols, (x, perm))
    y = permute_cols(x, perm)
-    torch.testing.assert_close(y, x[:, perm])
+    torch.testing.assert_close(y, x[:, perm])
--- a/tests/kernels/core/test_pos_encoding.py
+++ b/tests/kernels/core/test_pos_encoding.py
@@ -19,30 +19,33 @@ NUM_HEADS = [17]  # Arbitrary values for testing
 BATCH_SIZES = [5]  # Arbitrary values for testing
 SEQ_LENS = [11, 8192]  # Arbitrary values for testing
 SEEDS = [0]
-CUDA_DEVICES = [
-    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
-]
+CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)]
 USE_KEY = [True, False]


-def _get_flat_tensor_shape(batch_size: int, seq_len: int, num_heads: int,
-                           head_size: int) -> tuple[int, ...]:
+def _get_flat_tensor_shape(
+    batch_size: int, seq_len: int, num_heads: int, head_size: int
+) -> tuple[int, ...]:
    return (batch_size, seq_len, num_heads * head_size)


 # For testing sliced tensors
-def _get_padded_tensor_shape(batch_size: int, seq_len: int, num_heads: int,
-                             head_size: int) -> tuple[int, ...]:
+def _get_padded_tensor_shape(
+    batch_size: int, seq_len: int, num_heads: int, head_size: int
+) -> tuple[int, ...]:
    return (batch_size, seq_len, num_heads, head_size + 64)


-def _get_batch_tensor_shape(batch_size: int, seq_len: int, num_heads: int,
-                            head_size: int) -> tuple[int, ...]:
+def _get_batch_tensor_shape(
+    batch_size: int, seq_len: int, num_heads: int, head_size: int
+) -> tuple[int, ...]:
    return (batch_size, seq_len, num_heads, head_size)


 TENSORS_SHAPES_FN = [
-    _get_batch_tensor_shape, _get_flat_tensor_shape, _get_padded_tensor_shape
+    _get_batch_tensor_shape,
+    _get_flat_tensor_shape,
+    _get_padded_tensor_shape,
 ]


@@ -97,41 +100,63 @@ def test_rotary_embedding(
    ref_query, ref_key = rope.forward_native(positions, query, key)
    out_query, out_key = rope.forward(positions, query, key)
    # Compare the results.
-    torch.testing.assert_close(out_query,
-                               ref_query,
-                               atol=get_default_atol(out_query),
-                               rtol=get_default_rtol(out_query))
+    torch.testing.assert_close(
+        out_query,
+        ref_query,
+        atol=get_default_atol(out_query),
+        rtol=get_default_rtol(out_query),
+    )
    if use_key:
-        torch.testing.assert_close(out_key,
-                                   ref_key,
-                                   atol=get_default_atol(out_key),
-                                   rtol=get_default_rtol(out_key))
+        torch.testing.assert_close(
+            out_key,
+            ref_key,
+            atol=get_default_atol(out_key),
+            rtol=get_default_rtol(out_key),
+        )
    else:
-        assert ref_key is None and out_key is None, \
-            "expected returned key to be None"
+        assert ref_key is None and out_key is None, "expected returned key to be None"


@torch.inference_mode()
 def test_rope_module_cache():
    MAX_POSITIONS = [123, 1234]
    BASES = [10000, 1000000]
-    ROPE_SCALINGS = (None, {
-        "rope_type": "linear",
-        "factor": (1, )
-    }, {
-        "rope_type": "dynamic",
-        "factor": 1
-    })
-    settings = (HEAD_SIZES, ROTARY_DIMS, MAX_POSITIONS, BASES, IS_NEOX_STYLE,
-                ROPE_SCALINGS, DTYPES)
+    ROPE_SCALINGS = (
+        None,
+        {"rope_type": "linear", "factor": (1,)},
+        {"rope_type": "dynamic", "factor": 1},
+    )
+    settings = (
+        HEAD_SIZES,
+        ROTARY_DIMS,
+        MAX_POSITIONS,
+        BASES,
+        IS_NEOX_STYLE,
+        ROPE_SCALINGS,
+        DTYPES,
+    )
    rope_setting_id_map: dict[str, int] = {}
    for setting in product(*settings):
-        head_size, rotary_dim, max_position, base, \
-            is_neox_stype, rope_scaling, dtype = setting
+        (
+            head_size,
+            rotary_dim,
+            max_position,
+            base,
+            is_neox_stype,
+            rope_scaling,
+            dtype,
+        ) = setting
        if rotary_dim is None:
            rotary_dim = head_size
-        rope = get_rope(head_size, rotary_dim, max_position, base,
-                        is_neox_stype, rope_scaling, dtype)
+        rope = get_rope(
+            head_size,
+            rotary_dim,
+            max_position,
+            base,
+            is_neox_stype,
+            rope_scaling,
+            dtype,
+        )
        # different settings cannot share the same rope module
        assert id(rope) not in rope_setting_id_map.values()
        assert all(x.dtype == dtype for x in rope.buffers())
@@ -139,11 +164,25 @@ def test_rope_module_cache():
        rope_setting_id_map[str(setting)] = id(rope)

    for setting in product(*settings):
-        head_size, rotary_dim, max_position, base, \
-            is_neox_stype, rope_scaling, dtype = setting
+        (
+            head_size,
+            rotary_dim,
+            max_position,
+            base,
+            is_neox_stype,
+            rope_scaling,
+            dtype,
+        ) = setting
        if rotary_dim is None:
            rotary_dim = head_size
-        rope = get_rope(head_size, rotary_dim, max_position, base,
-                        is_neox_stype, rope_scaling, dtype)
+        rope = get_rope(
+            head_size,
+            rotary_dim,
+            max_position,
+            base,
+            is_neox_stype,
+            rope_scaling,
+            dtype,
+        )
        # check if cache take effect
        assert id(rope) == rope_setting_id_map[str(setting)]
--- a/tests/kernels/core/test_rotary_embedding.py
+++ b/tests/kernels/core/test_rotary_embedding.py
@@ -13,17 +13,20 @@ from tests.kernels.utils import opcheck
 from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding


-def rotary_embedding_opcheck(rot,
-                             positions: torch.Tensor,
-                             query: torch.Tensor,
-                             key: Optional[torch.Tensor] = None):
+def rotary_embedding_opcheck(
+    rot,
+    positions: torch.Tensor,
+    query: torch.Tensor,
+    key: Optional[torch.Tensor] = None,
+):
    cos_sin_cache = rot.cos_sin_cache.to(query.device, dtype=query.dtype)

    # ops.rotary_embedding() is a in-place operation
    # that updates the query and key tensors.
-    opcheck(torch.ops._C.rotary_embedding,
-            (positions, query, key, rot.head_size, cos_sin_cache,
-             rot.is_neox_style))
+    opcheck(
+        torch.ops._C.rotary_embedding,
+        (positions, query, key, rot.head_size, cos_sin_cache, rot.is_neox_style),
+    )


@pytest.mark.parametrize("device", ["cuda"])
@@ -34,26 +37,30 @@ def rotary_embedding_opcheck(rot,
@pytest.mark.parametrize("seq_len", [11, 1024])
@pytest.mark.parametrize("use_key", [True, False])
@pytest.mark.parametrize("head_stride_is_contiguous", [True, False])
-def test_rotary_embedding_opcheck(dist_init, device, max_position,
-                                  is_neox_style, rotary_dim, head_size,
-                                  seq_len, use_key, head_stride_is_contiguous):
+def test_rotary_embedding_opcheck(
+    dist_init,
+    device,
+    max_position,
+    is_neox_style,
+    rotary_dim,
+    head_size,
+    seq_len,
+    use_key,
+    head_stride_is_contiguous,
+):
    batch_size = 1
    base = 10000
    num_heads = 7
-    rot = RotaryEmbedding(head_size, rotary_dim, max_position, base,
-                          is_neox_style, torch.float32)
+    rot = RotaryEmbedding(
+        head_size, rotary_dim, max_position, base, is_neox_style, torch.float32
+    )

-    positions = torch.randint(0,
-                              max_position, (batch_size, seq_len),
-                              device=device)
+    positions = torch.randint(0, max_position, (batch_size, seq_len), device=device)
    head_stride = head_size + (64 if head_stride_is_contiguous else 0)

-    query = torch.randn(batch_size,
-                        seq_len,
-                        num_heads,
-                        head_stride,
-                        dtype=torch.float32,
-                        device=device)
+    query = torch.randn(
+        batch_size, seq_len, num_heads, head_stride, dtype=torch.float32, device=device
+    )
    key = torch.randn_like(query) if use_key else None
    query = query[..., :head_size]
    key = key[..., :head_size] if use_key else None
@@ -64,5 +71,8 @@ def test_rotary_embedding_opcheck(dist_init, device, max_position,
    # [..., num_heads * head_dim] shape/layout
    if head_stride_is_contiguous:
        rotary_embedding_opcheck(
-            rot, positions, query.flatten(start_dim=-2),
-            key.flatten(start_dim=-2) if use_key else None)
+            rot,
+            positions,
+            query.flatten(start_dim=-2),
+            key.flatten(start_dim=-2) if use_key else None,
+        )
--- a/tests/kernels/core/test_uva.py
+++ b/tests/kernels/core/test_uva.py
@@ -5,20 +5,14 @@ import torch

 from vllm.utils import get_cuda_view_from_cpu_tensor, is_uva_available

-CUDA_DEVICES = [
-    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
-]
+CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)]


@pytest.mark.skipif(not is_uva_available(), reason="UVA is not available.")
@pytest.mark.parametrize("device", CUDA_DEVICES)
 def test_cpu_write(device):
    torch.set_default_device(device)
-    cpu_tensor = torch.zeros(10,
-                             10,
-                             device="cpu",
-                             pin_memory=True,
-                             dtype=torch.int32)
+    cpu_tensor = torch.zeros(10, 10, device="cpu", pin_memory=True, dtype=torch.int32)
    cuda_view = get_cuda_view_from_cpu_tensor(cpu_tensor)
    assert cuda_view.device.type == "cuda"

@@ -40,11 +34,7 @@ def test_cpu_write(device):
@pytest.mark.parametrize("device", CUDA_DEVICES)
 def test_gpu_write(device):
    torch.set_default_device(device)
-    cpu_tensor = torch.zeros(10,
-                             10,
-                             device="cpu",
-                             pin_memory=True,
-                             dtype=torch.int32)
+    cpu_tensor = torch.zeros(10, 10, device="cpu", pin_memory=True, dtype=torch.int32)
    cuda_view = get_cuda_view_from_cpu_tensor(cpu_tensor)
    assert cuda_view.device.type == "cuda"

@@ -59,4 +49,4 @@ def test_gpu_write(device):

    assert cpu_tensor[0, 0] == 2
    assert cpu_tensor[2, 3] == 4
-    assert cpu_tensor[4, 5] == -2
+    assert cpu_tensor[4, 5] == -2