Convert formatting to use ruff instead of yapf + isort (#26247)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
@@ -10,7 +10,9 @@ from einops import rearrange
 from vllm.attention.backends.utils import PAD_SLOT_ID
 from vllm.model_executor.layers.mamba.ops.causal_conv1d import (
-    causal_conv1d_fn, causal_conv1d_update)
+    causal_conv1d_fn,
+    causal_conv1d_update,
+)
 from vllm.platforms import current_platform


@@ -39,18 +41,15 @@ def causal_conv1d_ref(
     seqlen = x.shape[-1]
     dim, width = weight.shape
     if initial_states is None:
-        out = F.conv1d(x,
-                       weight.unsqueeze(1),
-                       bias,
-                       padding=width - 1,
-                       groups=dim)
+        out = F.conv1d(x, weight.unsqueeze(1), bias, padding=width - 1, groups=dim)
     else:
         x = torch.cat([initial_states, x], dim=-1)
         out = F.conv1d(x, weight.unsqueeze(1), bias, padding=0, groups=dim)
     out = out[..., :seqlen]
     if return_final_states:
         final_states = F.pad(x, (width - 1 - x.shape[-1], 0)).to(
-            dtype_in)  # (batch, dim, width - 1)
+            dtype_in
+        )  # (batch, dim, width - 1)
         if final_states_out is not None:
             final_states_out.copy_(final_states)
         else:
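Aside (not part of the commit): the hunk above only reformats `causal_conv1d_ref`, whose core trick is that a causal depthwise convolution is just `F.conv1d` with `groups=dim` and a left pad of `width - 1`, truncated back to `seqlen`. A minimal self-contained sketch with toy shapes of my own choosing, verifying that position t never sees the future:

import torch
import torch.nn.functional as F

batch, dim, seqlen, width = 2, 4, 8, 3
x = torch.randn(batch, dim, seqlen)
weight = torch.randn(dim, width)  # one length-`width` filter per channel

# padding=width - 1 pads both sides; keeping the first seqlen outputs keeps
# exactly the left-aligned (causal) part of the convolution.
out = F.conv1d(x, weight.unsqueeze(1), padding=width - 1, groups=dim)[..., :seqlen]

# Causality check: zeroing everything after time t must not change out[..., :t + 1].
t = 4
x_masked = x.clone()
x_masked[..., t + 1:] = 0
out_masked = F.conv1d(x_masked, weight.unsqueeze(1), padding=width - 1,
                      groups=dim)[..., :seqlen]
assert torch.allclose(out[..., :t + 1], out_masked[..., :t + 1])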
@@ -59,12 +58,9 @@ def causal_conv1d_ref(
     return (out, None) if not return_final_states else (out, final_states_out)


-def causal_conv1d_update_ref(x,
-                             conv_state,
-                             weight,
-                             bias=None,
-                             activation=None,
-                             cache_seqlens=None):
+def causal_conv1d_update_ref(
+    x, conv_state, weight, bias=None, activation=None, cache_seqlens=None
+):
     """
     x: (batch, dim) or (batch, dim, seqlen)
     conv_state: (batch, dim, state_len), where state_len >= width - 1
@@ -91,24 +87,25 @@ def causal_conv1d_update_ref(x,
     assert weight.shape == (dim, width)
     if cache_seqlens is None:
         x_new = torch.cat([conv_state, x], dim=-1).to(
-            weight.dtype)  # (batch, dim, state_len + seqlen)
+            weight.dtype
+        )  # (batch, dim, state_len + seqlen)
         conv_state.copy_(x_new[:, :, -state_len:])
     else:
         width_idx = torch.arange(
-            -(width - 1), 0, dtype=torch.long,
-            device=x.device).unsqueeze(0) + cache_seqlens.unsqueeze(1)
-        width_idx = torch.remainder(width_idx, state_len).unsqueeze(1).expand(
-            -1, dim, -1)
-        x_new = torch.cat([conv_state.gather(2, width_idx), x],
-                          dim=-1).to(weight.dtype)
-        copy_idx = torch.arange(
-            seqlen, dtype=torch.long,
-            device=x.device).unsqueeze(0) + cache_seqlens.unsqueeze(1)
-        copy_idx = torch.remainder(copy_idx,
-                                   state_len).unsqueeze(1).expand(-1, dim, -1)
+            -(width - 1), 0, dtype=torch.long, device=x.device
+        ).unsqueeze(0) + cache_seqlens.unsqueeze(1)
+        width_idx = (
+            torch.remainder(width_idx, state_len).unsqueeze(1).expand(-1, dim, -1)
+        )
+        x_new = torch.cat([conv_state.gather(2, width_idx), x], dim=-1).to(weight.dtype)
+        copy_idx = torch.arange(seqlen, dtype=torch.long, device=x.device).unsqueeze(
+            0
+        ) + cache_seqlens.unsqueeze(1)
+        copy_idx = torch.remainder(copy_idx, state_len).unsqueeze(1).expand(-1, dim, -1)
         conv_state.scatter_(2, copy_idx, x)
-    out = F.conv1d(x_new, weight.unsqueeze(1), bias, padding=0,
-                   groups=dim)[:, :, -seqlen:]
+    out = F.conv1d(x_new, weight.unsqueeze(1), bias, padding=0, groups=dim)[
+        :, :, -seqlen:
+    ]
     if unsqueeze:
         out = out.squeeze(-1)
     return (out if activation is None else F.silu(out)).to(dtype=dtype_in)
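Aside (illustration only, not from the commit): the `cache_seqlens` branch reformatted above treats `conv_state` as a ring buffer of length `state_len`; `torch.remainder` maps negative and overflowing positions onto valid slots for `gather` (read the last `width - 1` cached steps) and `scatter_` (write the new tokens back). A minimal sketch of just that indexing, with toy sizes of my own choosing:

import torch

batch, dim, state_len, width, seqlen = 2, 3, 8, 4, 2
conv_state = torch.randn(batch, dim, state_len)
x = torch.randn(batch, dim, seqlen)
cache_seqlens = torch.tensor([5, 7])  # tokens already written per sequence

# Read positions: the width-1 slots just before each sequence's write head,
# wrapped modulo state_len so they always land inside the buffer.
width_idx = torch.arange(-(width - 1), 0).unsqueeze(0) + cache_seqlens.unsqueeze(1)
width_idx = torch.remainder(width_idx, state_len).unsqueeze(1).expand(-1, dim, -1)
prev = conv_state.gather(2, width_idx)  # (batch, dim, width - 1)

# Write positions: the next seqlen slots after the head, also wrapped.
copy_idx = torch.arange(seqlen).unsqueeze(0) + cache_seqlens.unsqueeze(1)
copy_idx = torch.remainder(copy_idx, state_len).unsqueeze(1).expand(-1, dim, -1)
conv_state.scatter_(2, copy_idx, x)  # ring-buffer update in place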
@@ -117,15 +114,17 @@ def causal_conv1d_update_ref(x,
 @pytest.mark.parametrize("itype", [torch.bfloat16, torch.float])
 @pytest.mark.parametrize("silu_activation", [True])
 @pytest.mark.parametrize("has_bias", [True])
-def causal_conv1d_opcheck_fn(x: torch.Tensor,
-                             weight: torch.Tensor,
-                             bias: Optional[torch.Tensor] = None,
-                             cu_seq_len: Optional[torch.Tensor] = None,
-                             cache_indices: Optional[torch.Tensor] = None,
-                             has_initial_state: Optional[torch.Tensor] = None,
-                             conv_states: Optional[torch.Tensor] = None,
-                             activation: Optional[str] = "silu",
-                             pad_slot_id: int = PAD_SLOT_ID):
+def causal_conv1d_opcheck_fn(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    bias: Optional[torch.Tensor] = None,
+    cu_seq_len: Optional[torch.Tensor] = None,
+    cache_indices: Optional[torch.Tensor] = None,
+    has_initial_state: Optional[torch.Tensor] = None,
+    conv_states: Optional[torch.Tensor] = None,
+    activation: Optional[str] = "silu",
+    pad_slot_id: int = PAD_SLOT_ID,
+):
     """
     x: (batch, dim, seqlen)
     weight: (dim, width)
@@ -150,8 +149,7 @@ def causal_conv1d_opcheck_fn(x: torch.Tensor,
 @pytest.mark.parametrize("seqlen", [1])
 @pytest.mark.parametrize("width", [4])
 @pytest.mark.parametrize("dim", [2048, 2048 + 16, 4096])
-def test_causal_conv1d_update(dim, width, seqlen, has_bias, silu_activation,
-                              itype):
+def test_causal_conv1d_update(dim, width, seqlen, has_bias, silu_activation, itype):
     device = "cuda"
     rtol, atol = (3e-4, 1e-3) if itype == torch.float32 else (3e-3, 5e-3)
     if itype == torch.bfloat16:
@@ -167,23 +165,16 @@ def test_causal_conv1d_update(dim, width, seqlen, has_bias, silu_activation,
     bias = torch.randn(dim, device=device, dtype=itype) if has_bias else None
     conv_state_ref = conv_state.detach().clone()
     activation = None if not silu_activation else "silu"
-    out = causal_conv1d_update(x,
-                               conv_state,
-                               weight,
-                               bias,
-                               activation=activation)
-    out_ref = causal_conv1d_update_ref(x_ref,
-                                       conv_state_ref,
-                                       weight,
-                                       bias,
-                                       activation=activation)
+    out = causal_conv1d_update(x, conv_state, weight, bias, activation=activation)
+    out_ref = causal_conv1d_update_ref(
+        x_ref, conv_state_ref, weight, bias, activation=activation
+    )

     assert torch.equal(conv_state, conv_state_ref)
     assert torch.allclose(out, out_ref, rtol=rtol, atol=atol)


-@pytest.mark.parametrize("itype",
-                         [torch.float32, torch.float16, torch.bfloat16])
+@pytest.mark.parametrize("itype", [torch.float32, torch.float16, torch.bfloat16])
 @pytest.mark.parametrize("silu_activation", [False, True])
 @pytest.mark.parametrize("has_bias", [False, True])
 @pytest.mark.parametrize("seqlen", [1, 3])
@@ -192,9 +183,9 @@ def test_causal_conv1d_update(dim, width, seqlen, has_bias, silu_activation,
 # tests correctness in case subset of the sequences are padded
 @pytest.mark.parametrize("with_padding", [True, False])
 @pytest.mark.parametrize("batch_size", [3])
-def test_causal_conv1d_update_with_batch_gather(batch_size, with_padding, dim,
-                                                width, seqlen, has_bias,
-                                                silu_activation, itype):
+def test_causal_conv1d_update_with_batch_gather(
+    batch_size, with_padding, dim, width, seqlen, has_bias, silu_activation, itype
+):
     device = "cuda"
     rtol, atol = (3e-4, 1e-3) if itype == torch.float32 else (3e-3, 5e-3)
     if itype == torch.bfloat16:
@@ -209,31 +200,30 @@ def test_causal_conv1d_update_with_batch_gather(batch_size, with_padding, dim,
     total_entries = 10 * batch_size

     # x will be (batch, dim, seqlen) with contiguous along dim-axis
-    x = torch.randn(padded_batch_size, seqlen, dim, device=device,
-                    dtype=itype).transpose(1, 2)
+    x = torch.randn(
+        padded_batch_size, seqlen, dim, device=device, dtype=itype
+    ).transpose(1, 2)

     x_ref = x.clone()

     conv_state_indices = torch.randperm(total_entries)[:batch_size].to(
-        dtype=torch.int32, device=device)
-    unused_states_bool = torch.ones(total_entries,
-                                    dtype=torch.bool,
-                                    device=device)
+        dtype=torch.int32, device=device
+    )
+    unused_states_bool = torch.ones(total_entries, dtype=torch.bool, device=device)
     unused_states_bool[conv_state_indices] = False
-    padded_state_indices = torch.concat([
-        conv_state_indices,
-        torch.as_tensor(
-            [PAD_SLOT_ID] * padding, dtype=torch.int32, device=device)
-    ],
-                                        dim=0)
+    padded_state_indices = torch.concat(
+        [
+            conv_state_indices,
+            torch.as_tensor([PAD_SLOT_ID] * padding, dtype=torch.int32, device=device),
+        ],
+        dim=0,
+    )

     # conv_state will be (cache_lines, dim, state_len)
     # with contiguous along dim-axis
-    conv_state = torch.randn(total_entries,
-                             width - 1,
-                             dim,
-                             device=device,
-                             dtype=itype).transpose(1, 2)
+    conv_state = torch.randn(
+        total_entries, width - 1, dim, device=device, dtype=itype
+    ).transpose(1, 2)

     conv_state_for_padding_test = conv_state.clone()

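Aside (my reading of the test's contract, not from the commit): `padded_state_indices` mixes real cache-line indices with `PAD_SLOT_ID` sentinels, and the asserts further down check that padded rows leave unused cache lines untouched. A toy scalar analogue of that skip-on-sentinel behavior; the value `-1` matches vLLM's `PAD_SLOT_ID` but treat it as an assumption here:

import torch

PAD_SLOT_ID = -1  # assumed sentinel for illustration; vLLM defines its own
cache = torch.zeros(6)
indices = torch.tensor([4, 2, PAD_SLOT_ID], dtype=torch.int32)
updates = torch.tensor([1.0, 2.0, 3.0])

for row, slot in enumerate(indices.tolist()):
    if slot == PAD_SLOT_ID:  # padded row: no state is read or written
        continue
    cache[slot] = updates[row]

# Only the two real slots were written; everything else stayed untouched.
assert cache[4] == 1.0 and cache[2] == 2.0 and cache.sum() == 3.0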
@@ -242,22 +232,23 @@ def test_causal_conv1d_update_with_batch_gather(batch_size, with_padding, dim,
     conv_state_ref = conv_state[conv_state_indices, :].detach().clone()
     activation = None if not silu_activation else "silu"

-    out = causal_conv1d_update(x,
-                               conv_state,
-                               weight,
-                               bias,
-                               activation=activation,
-                               conv_state_indices=padded_state_indices,
-                               pad_slot_id=PAD_SLOT_ID)
-    out_ref = causal_conv1d_update_ref(x_ref[:batch_size],
-                                       conv_state_ref,
-                                       weight,
-                                       bias,
-                                       activation=activation)
+    out = causal_conv1d_update(
+        x,
+        conv_state,
+        weight,
+        bias,
+        activation=activation,
+        conv_state_indices=padded_state_indices,
+        pad_slot_id=PAD_SLOT_ID,
+    )
+    out_ref = causal_conv1d_update_ref(
+        x_ref[:batch_size], conv_state_ref, weight, bias, activation=activation
+    )

     assert torch.equal(conv_state[conv_state_indices, :], conv_state_ref)
-    assert torch.equal(conv_state[unused_states_bool],
-                       conv_state_for_padding_test[unused_states_bool])
+    assert torch.equal(
+        conv_state[unused_states_bool], conv_state_for_padding_test[unused_states_bool]
+    )
     assert torch.allclose(out[:batch_size], out_ref, rtol=rtol, atol=atol)


@@ -265,12 +256,13 @@ def test_causal_conv1d_update_with_batch_gather(batch_size, with_padding, dim,
 @pytest.mark.parametrize("silu_activation", [True])
 @pytest.mark.parametrize("has_bias", [True])
 @pytest.mark.parametrize("width", [4])
-@pytest.mark.parametrize('seqlen', [8, 30, 249, 2049, 4096])
-@pytest.mark.parametrize('dim', [64, 4096])
-@pytest.mark.parametrize('with_padding', [True, False])
-@pytest.mark.parametrize('batch', [4, 10])
-def test_causal_conv1d_varlen(batch, with_padding, dim, seqlen, width,
-                              has_bias, silu_activation, itype):
+@pytest.mark.parametrize("seqlen", [8, 30, 249, 2049, 4096])
+@pytest.mark.parametrize("dim", [64, 4096])
+@pytest.mark.parametrize("with_padding", [True, False])
+@pytest.mark.parametrize("batch", [4, 10])
+def test_causal_conv1d_varlen(
+    batch, with_padding, dim, seqlen, width, has_bias, silu_activation, itype
+):
     device = "cuda"
     torch.cuda.empty_cache()
     rtol, atol = (3e-4, 1e-3) if itype == torch.float32 else (3e-3, 5e-3)
@@ -288,19 +280,19 @@ def test_causal_conv1d_varlen(batch, with_padding, dim, seqlen, width,

         seqlens.append(
             torch.diff(
-                torch.cat(
-                    [torch.tensor([-1]), eos_pos,
-                     torch.tensor([seqlen - 1])])).tolist())
+                torch.cat([torch.tensor([-1]), eos_pos, torch.tensor([seqlen - 1])])
+            ).tolist()
+        )
     assert sum(seqlens[-1]) == seqlen
     assert all(s > 0 for s in seqlens[-1])

     total_entries = batch_size * 10
     cumsum = torch.cumsum(torch.tensor(seqlens[0]), dim=0).to(torch.int32)
-    cumsum = torch.concat([torch.tensor([0], dtype=torch.int32), cumsum],
-                          dim=0)
+    cumsum = torch.concat([torch.tensor([0], dtype=torch.int32), cumsum], dim=0)
     x = rearrange(
         torch.randn(1, seqlen, 4096 + dim + 64, device=device, dtype=itype),
-        "b s d -> b d s")[:, 4096:4096 + dim, :]
+        "b s d -> b d s",
+    )[:, 4096 : 4096 + dim, :]

     weight = torch.randn(dim, width, device=device, dtype=itype)

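Aside (toy numbers, not from the commit): the `cumsum` built above is the `query_start_loc` layout for varlen batching, i.e. sequence boundaries as prefix sums with a leading 0. A small sketch of how those offsets split a packed tensor back into its sequences:

import torch

seqlens = [3, 5, 2]
cumsum = torch.cumsum(torch.tensor(seqlens), dim=0).to(torch.int32)
cumsum = torch.concat([torch.tensor([0], dtype=torch.int32), cumsum], dim=0)
# cumsum is now tensor([0, 3, 8, 10], dtype=torch.int32)

x = torch.randn(4, sum(seqlens))  # (dim, total_tokens): sequences packed in time
bounds = cumsum.tolist()
splits = [x[:, bounds[i]:bounds[i + 1]] for i in range(len(seqlens))]
assert [s.shape[-1] for s in splits] == seqlens  # sequence i spans cumsum[i]:cumsum[i+1]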
@@ -309,34 +301,34 @@ def test_causal_conv1d_varlen(batch, with_padding, dim, seqlen, width,
     weight_ref = weight.clone()
     bias_ref = bias.clone() if bias is not None else None
     activation = None if not silu_activation else "silu"
-    final_states = torch.randn(total_entries,
-                               width - 1,
-                               dim,
-                               device=x.device,
-                               dtype=x.dtype).transpose(1, 2)
+    final_states = torch.randn(
+        total_entries, width - 1, dim, device=x.device, dtype=x.dtype
+    ).transpose(1, 2)
     final_states_ref = final_states.clone()
-    has_initial_states = torch.randint(0,
-                                       2, (cumsum.shape[0] - 1, ),
-                                       dtype=torch.bool,
-                                       device=x.device)
-    state_indices = torch.randperm(total_entries,
-                                   dtype=torch.int32,
-                                   device=x.device)[:batch_size]
-    padded_state_indices = torch.concat([
-        state_indices,
-        torch.as_tensor(
-            [PAD_SLOT_ID] * padding, dtype=torch.int32, device=device),
-    ],
-                                        dim=-1)
-    out = causal_conv1d_fn(x.squeeze(0),
-                           weight,
-                           bias=bias,
-                           conv_states=final_states,
-                           query_start_loc=cumsum.cuda(),
-                           cache_indices=padded_state_indices,
-                           has_initial_state=has_initial_states,
-                           activation=activation,
-                           pad_slot_id=PAD_SLOT_ID)
+    has_initial_states = torch.randint(
+        0, 2, (cumsum.shape[0] - 1,), dtype=torch.bool, device=x.device
+    )
+    state_indices = torch.randperm(total_entries, dtype=torch.int32, device=x.device)[
+        :batch_size
+    ]
+    padded_state_indices = torch.concat(
+        [
+            state_indices,
+            torch.as_tensor([PAD_SLOT_ID] * padding, dtype=torch.int32, device=device),
+        ],
+        dim=-1,
+    )
+    out = causal_conv1d_fn(
+        x.squeeze(0),
+        weight,
+        bias=bias,
+        conv_states=final_states,
+        query_start_loc=cumsum.cuda(),
+        cache_indices=padded_state_indices,
+        has_initial_state=has_initial_states,
+        activation=activation,
+        pad_slot_id=PAD_SLOT_ID,
+    )

     out_ref = []
     out_ref_b = []
@@ -353,16 +345,20 @@ def test_causal_conv1d_varlen(batch, with_padding, dim, seqlen, width,
                 bias_ref,
                 activation=activation,
                 return_final_states=True,
-                final_states_out=final_states_ref[
-                    padded_state_indices[i]].unsqueeze(0),
-                initial_states=final_states_ref[padded_state_indices[i]].
-                unsqueeze(0) if has_initial_states[i] else None))
+                final_states_out=final_states_ref[padded_state_indices[i]].unsqueeze(0),
+                initial_states=final_states_ref[padded_state_indices[i]].unsqueeze(0)
+                if has_initial_states[i]
+                else None,
+            )
+        )
     out_ref.append(torch.cat([t[0] for t in out_ref_b], dim=2))
     out_ref_tensor = torch.cat(out_ref, dim=0)

-    assert torch.allclose(final_states[state_indices],
-                          final_states_ref[state_indices],
-                          rtol=rtol,
-                          atol=atol)
-    unpadded_out = out[:, :out_ref_tensor.shape[-1]]
+    assert torch.allclose(
+        final_states[state_indices],
+        final_states_ref[state_indices],
+        rtol=rtol,
+        atol=atol,
+    )
+    unpadded_out = out[:, : out_ref_tensor.shape[-1]]
     assert torch.allclose(unpadded_out, out_ref_tensor, rtol=rtol, atol=atol)