Convert formatting to use ruff instead of yapf + isort (#26247)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-05 15:06:22 +01:00
parent 17edd8a807
commit d6953beb91
1508 changed files with 115244 additions and 94146 deletions
--- a/tests/kernels/mamba/test_causal_conv1d.py
+++ b/tests/kernels/mamba/test_causal_conv1d.py
@@ -10,7 +10,9 @@ from einops import rearrange

 from vllm.attention.backends.utils import PAD_SLOT_ID
 from vllm.model_executor.layers.mamba.ops.causal_conv1d import (
-    causal_conv1d_fn, causal_conv1d_update)
+    causal_conv1d_fn,
+    causal_conv1d_update,
+)
 from vllm.platforms import current_platform


@@ -39,18 +41,15 @@ def causal_conv1d_ref(
    seqlen = x.shape[-1]
    dim, width = weight.shape
    if initial_states is None:
-        out = F.conv1d(x,
-                       weight.unsqueeze(1),
-                       bias,
-                       padding=width - 1,
-                       groups=dim)
+        out = F.conv1d(x, weight.unsqueeze(1), bias, padding=width - 1, groups=dim)
    else:
        x = torch.cat([initial_states, x], dim=-1)
        out = F.conv1d(x, weight.unsqueeze(1), bias, padding=0, groups=dim)
    out = out[..., :seqlen]
    if return_final_states:
        final_states = F.pad(x, (width - 1 - x.shape[-1], 0)).to(
-            dtype_in)  # (batch, dim, width - 1)
+            dtype_in
+        )  # (batch, dim, width - 1)
        if final_states_out is not None:
            final_states_out.copy_(final_states)
        else:
@@ -59,12 +58,9 @@ def causal_conv1d_ref(
    return (out, None) if not return_final_states else (out, final_states_out)


-def causal_conv1d_update_ref(x,
-                             conv_state,
-                             weight,
-                             bias=None,
-                             activation=None,
-                             cache_seqlens=None):
+def causal_conv1d_update_ref(
+    x, conv_state, weight, bias=None, activation=None, cache_seqlens=None
+):
    """
    x: (batch, dim) or (batch, dim, seqlen)
    conv_state: (batch, dim, state_len), where state_len >= width - 1
@@ -91,24 +87,25 @@ def causal_conv1d_update_ref(x,
    assert weight.shape == (dim, width)
    if cache_seqlens is None:
        x_new = torch.cat([conv_state, x], dim=-1).to(
-            weight.dtype)  # (batch, dim, state_len + seqlen)
+            weight.dtype
+        )  # (batch, dim, state_len + seqlen)
        conv_state.copy_(x_new[:, :, -state_len:])
    else:
        width_idx = torch.arange(
-            -(width - 1), 0, dtype=torch.long,
-            device=x.device).unsqueeze(0) + cache_seqlens.unsqueeze(1)
-        width_idx = torch.remainder(width_idx, state_len).unsqueeze(1).expand(
-            -1, dim, -1)
-        x_new = torch.cat([conv_state.gather(2, width_idx), x],
-                          dim=-1).to(weight.dtype)
-        copy_idx = torch.arange(
-            seqlen, dtype=torch.long,
-            device=x.device).unsqueeze(0) + cache_seqlens.unsqueeze(1)
-        copy_idx = torch.remainder(copy_idx,
-                                   state_len).unsqueeze(1).expand(-1, dim, -1)
+            -(width - 1), 0, dtype=torch.long, device=x.device
+        ).unsqueeze(0) + cache_seqlens.unsqueeze(1)
+        width_idx = (
+            torch.remainder(width_idx, state_len).unsqueeze(1).expand(-1, dim, -1)
+        )
+        x_new = torch.cat([conv_state.gather(2, width_idx), x], dim=-1).to(weight.dtype)
+        copy_idx = torch.arange(seqlen, dtype=torch.long, device=x.device).unsqueeze(
+            0
+        ) + cache_seqlens.unsqueeze(1)
+        copy_idx = torch.remainder(copy_idx, state_len).unsqueeze(1).expand(-1, dim, -1)
        conv_state.scatter_(2, copy_idx, x)
-    out = F.conv1d(x_new, weight.unsqueeze(1), bias, padding=0,
-                   groups=dim)[:, :, -seqlen:]
+    out = F.conv1d(x_new, weight.unsqueeze(1), bias, padding=0, groups=dim)[
+        :, :, -seqlen:
+    ]
    if unsqueeze:
        out = out.squeeze(-1)
    return (out if activation is None else F.silu(out)).to(dtype=dtype_in)
@@ -117,15 +114,17 @@ def causal_conv1d_update_ref(x,
@pytest.mark.parametrize("itype", [torch.bfloat16, torch.float])
@pytest.mark.parametrize("silu_activation", [True])
@pytest.mark.parametrize("has_bias", [True])
-def causal_conv1d_opcheck_fn(x: torch.Tensor,
-                             weight: torch.Tensor,
-                             bias: Optional[torch.Tensor] = None,
-                             cu_seq_len: Optional[torch.Tensor] = None,
-                             cache_indices: Optional[torch.Tensor] = None,
-                             has_initial_state: Optional[torch.Tensor] = None,
-                             conv_states: Optional[torch.Tensor] = None,
-                             activation: Optional[str] = "silu",
-                             pad_slot_id: int = PAD_SLOT_ID):
+def causal_conv1d_opcheck_fn(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    bias: Optional[torch.Tensor] = None,
+    cu_seq_len: Optional[torch.Tensor] = None,
+    cache_indices: Optional[torch.Tensor] = None,
+    has_initial_state: Optional[torch.Tensor] = None,
+    conv_states: Optional[torch.Tensor] = None,
+    activation: Optional[str] = "silu",
+    pad_slot_id: int = PAD_SLOT_ID,
+):
    """
    x: (batch, dim, seqlen)
    weight: (dim, width)
@@ -150,8 +149,7 @@ def causal_conv1d_opcheck_fn(x: torch.Tensor,
@pytest.mark.parametrize("seqlen", [1])
@pytest.mark.parametrize("width", [4])
@pytest.mark.parametrize("dim", [2048, 2048 + 16, 4096])
-def test_causal_conv1d_update(dim, width, seqlen, has_bias, silu_activation,
-                              itype):
+def test_causal_conv1d_update(dim, width, seqlen, has_bias, silu_activation, itype):
    device = "cuda"
    rtol, atol = (3e-4, 1e-3) if itype == torch.float32 else (3e-3, 5e-3)
    if itype == torch.bfloat16:
@@ -167,23 +165,16 @@ def test_causal_conv1d_update(dim, width, seqlen, has_bias, silu_activation,
    bias = torch.randn(dim, device=device, dtype=itype) if has_bias else None
    conv_state_ref = conv_state.detach().clone()
    activation = None if not silu_activation else "silu"
-    out = causal_conv1d_update(x,
-                               conv_state,
-                               weight,
-                               bias,
-                               activation=activation)
-    out_ref = causal_conv1d_update_ref(x_ref,
-                                       conv_state_ref,
-                                       weight,
-                                       bias,
-                                       activation=activation)
+    out = causal_conv1d_update(x, conv_state, weight, bias, activation=activation)
+    out_ref = causal_conv1d_update_ref(
+        x_ref, conv_state_ref, weight, bias, activation=activation
+    )

    assert torch.equal(conv_state, conv_state_ref)
    assert torch.allclose(out, out_ref, rtol=rtol, atol=atol)


-@pytest.mark.parametrize("itype",
-                         [torch.float32, torch.float16, torch.bfloat16])
+@pytest.mark.parametrize("itype", [torch.float32, torch.float16, torch.bfloat16])
@pytest.mark.parametrize("silu_activation", [False, True])
@pytest.mark.parametrize("has_bias", [False, True])
@pytest.mark.parametrize("seqlen", [1, 3])
@@ -192,9 +183,9 @@ def test_causal_conv1d_update(dim, width, seqlen, has_bias, silu_activation,
 # tests correctness in case subset of the sequences are padded
@pytest.mark.parametrize("with_padding", [True, False])
@pytest.mark.parametrize("batch_size", [3])
-def test_causal_conv1d_update_with_batch_gather(batch_size, with_padding, dim,
-                                                width, seqlen, has_bias,
-                                                silu_activation, itype):
+def test_causal_conv1d_update_with_batch_gather(
+    batch_size, with_padding, dim, width, seqlen, has_bias, silu_activation, itype
+):
    device = "cuda"
    rtol, atol = (3e-4, 1e-3) if itype == torch.float32 else (3e-3, 5e-3)
    if itype == torch.bfloat16:
@@ -209,31 +200,30 @@ def test_causal_conv1d_update_with_batch_gather(batch_size, with_padding, dim,
    total_entries = 10 * batch_size

    # x will be (batch, dim, seqlen) with contiguous along dim-axis
-    x = torch.randn(padded_batch_size, seqlen, dim, device=device,
-                    dtype=itype).transpose(1, 2)
+    x = torch.randn(
+        padded_batch_size, seqlen, dim, device=device, dtype=itype
+    ).transpose(1, 2)

    x_ref = x.clone()

    conv_state_indices = torch.randperm(total_entries)[:batch_size].to(
-        dtype=torch.int32, device=device)
-    unused_states_bool = torch.ones(total_entries,
-                                    dtype=torch.bool,
-                                    device=device)
+        dtype=torch.int32, device=device
+    )
+    unused_states_bool = torch.ones(total_entries, dtype=torch.bool, device=device)
    unused_states_bool[conv_state_indices] = False
-    padded_state_indices = torch.concat([
-        conv_state_indices,
-        torch.as_tensor(
-            [PAD_SLOT_ID] * padding, dtype=torch.int32, device=device)
-    ],
-                                        dim=0)
+    padded_state_indices = torch.concat(
+        [
+            conv_state_indices,
+            torch.as_tensor([PAD_SLOT_ID] * padding, dtype=torch.int32, device=device),
+        ],
+        dim=0,
+    )

    # conv_state will be (cache_lines, dim, state_len)
    # with contiguous along dim-axis
-    conv_state = torch.randn(total_entries,
-                             width - 1,
-                             dim,
-                             device=device,
-                             dtype=itype).transpose(1, 2)
+    conv_state = torch.randn(
+        total_entries, width - 1, dim, device=device, dtype=itype
+    ).transpose(1, 2)

    conv_state_for_padding_test = conv_state.clone()

@@ -242,22 +232,23 @@ def test_causal_conv1d_update_with_batch_gather(batch_size, with_padding, dim,
    conv_state_ref = conv_state[conv_state_indices, :].detach().clone()
    activation = None if not silu_activation else "silu"

-    out = causal_conv1d_update(x,
-                               conv_state,
-                               weight,
-                               bias,
-                               activation=activation,
-                               conv_state_indices=padded_state_indices,
-                               pad_slot_id=PAD_SLOT_ID)
-    out_ref = causal_conv1d_update_ref(x_ref[:batch_size],
-                                       conv_state_ref,
-                                       weight,
-                                       bias,
-                                       activation=activation)
+    out = causal_conv1d_update(
+        x,
+        conv_state,
+        weight,
+        bias,
+        activation=activation,
+        conv_state_indices=padded_state_indices,
+        pad_slot_id=PAD_SLOT_ID,
+    )
+    out_ref = causal_conv1d_update_ref(
+        x_ref[:batch_size], conv_state_ref, weight, bias, activation=activation
+    )

    assert torch.equal(conv_state[conv_state_indices, :], conv_state_ref)
-    assert torch.equal(conv_state[unused_states_bool],
-                       conv_state_for_padding_test[unused_states_bool])
+    assert torch.equal(
+        conv_state[unused_states_bool], conv_state_for_padding_test[unused_states_bool]
+    )
    assert torch.allclose(out[:batch_size], out_ref, rtol=rtol, atol=atol)


@@ -265,12 +256,13 @@ def test_causal_conv1d_update_with_batch_gather(batch_size, with_padding, dim,
@pytest.mark.parametrize("silu_activation", [True])
@pytest.mark.parametrize("has_bias", [True])
@pytest.mark.parametrize("width", [4])
-@pytest.mark.parametrize('seqlen', [8, 30, 249, 2049, 4096])
-@pytest.mark.parametrize('dim', [64, 4096])
-@pytest.mark.parametrize('with_padding', [True, False])
-@pytest.mark.parametrize('batch', [4, 10])
-def test_causal_conv1d_varlen(batch, with_padding, dim, seqlen, width,
-                              has_bias, silu_activation, itype):
+@pytest.mark.parametrize("seqlen", [8, 30, 249, 2049, 4096])
+@pytest.mark.parametrize("dim", [64, 4096])
+@pytest.mark.parametrize("with_padding", [True, False])
+@pytest.mark.parametrize("batch", [4, 10])
+def test_causal_conv1d_varlen(
+    batch, with_padding, dim, seqlen, width, has_bias, silu_activation, itype
+):
    device = "cuda"
    torch.cuda.empty_cache()
    rtol, atol = (3e-4, 1e-3) if itype == torch.float32 else (3e-3, 5e-3)
@@ -288,19 +280,19 @@ def test_causal_conv1d_varlen(batch, with_padding, dim, seqlen, width,

    seqlens.append(
        torch.diff(
-            torch.cat(
-                [torch.tensor([-1]), eos_pos,
-                 torch.tensor([seqlen - 1])])).tolist())
+            torch.cat([torch.tensor([-1]), eos_pos, torch.tensor([seqlen - 1])])
+        ).tolist()
+    )
    assert sum(seqlens[-1]) == seqlen
    assert all(s > 0 for s in seqlens[-1])

    total_entries = batch_size * 10
    cumsum = torch.cumsum(torch.tensor(seqlens[0]), dim=0).to(torch.int32)
-    cumsum = torch.concat([torch.tensor([0], dtype=torch.int32), cumsum],
-                          dim=0)
+    cumsum = torch.concat([torch.tensor([0], dtype=torch.int32), cumsum], dim=0)
    x = rearrange(
        torch.randn(1, seqlen, 4096 + dim + 64, device=device, dtype=itype),
-        "b s d -> b d s")[:, 4096:4096 + dim, :]
+        "b s d -> b d s",
+    )[:, 4096 : 4096 + dim, :]

    weight = torch.randn(dim, width, device=device, dtype=itype)

@@ -309,34 +301,34 @@ def test_causal_conv1d_varlen(batch, with_padding, dim, seqlen, width,
    weight_ref = weight.clone()
    bias_ref = bias.clone() if bias is not None else None
    activation = None if not silu_activation else "silu"
-    final_states = torch.randn(total_entries,
-                               width - 1,
-                               dim,
-                               device=x.device,
-                               dtype=x.dtype).transpose(1, 2)
+    final_states = torch.randn(
+        total_entries, width - 1, dim, device=x.device, dtype=x.dtype
+    ).transpose(1, 2)
    final_states_ref = final_states.clone()
-    has_initial_states = torch.randint(0,
-                                       2, (cumsum.shape[0] - 1, ),
-                                       dtype=torch.bool,
-                                       device=x.device)
-    state_indices = torch.randperm(total_entries,
-                                   dtype=torch.int32,
-                                   device=x.device)[:batch_size]
-    padded_state_indices = torch.concat([
-        state_indices,
-        torch.as_tensor(
-            [PAD_SLOT_ID] * padding, dtype=torch.int32, device=device),
-    ],
-                                        dim=-1)
-    out = causal_conv1d_fn(x.squeeze(0),
-                           weight,
-                           bias=bias,
-                           conv_states=final_states,
-                           query_start_loc=cumsum.cuda(),
-                           cache_indices=padded_state_indices,
-                           has_initial_state=has_initial_states,
-                           activation=activation,
-                           pad_slot_id=PAD_SLOT_ID)
+    has_initial_states = torch.randint(
+        0, 2, (cumsum.shape[0] - 1,), dtype=torch.bool, device=x.device
+    )
+    state_indices = torch.randperm(total_entries, dtype=torch.int32, device=x.device)[
+        :batch_size
+    ]
+    padded_state_indices = torch.concat(
+        [
+            state_indices,
+            torch.as_tensor([PAD_SLOT_ID] * padding, dtype=torch.int32, device=device),
+        ],
+        dim=-1,
+    )
+    out = causal_conv1d_fn(
+        x.squeeze(0),
+        weight,
+        bias=bias,
+        conv_states=final_states,
+        query_start_loc=cumsum.cuda(),
+        cache_indices=padded_state_indices,
+        has_initial_state=has_initial_states,
+        activation=activation,
+        pad_slot_id=PAD_SLOT_ID,
+    )

    out_ref = []
    out_ref_b = []
@@ -353,16 +345,20 @@ def test_causal_conv1d_varlen(batch, with_padding, dim, seqlen, width,
                bias_ref,
                activation=activation,
                return_final_states=True,
-                final_states_out=final_states_ref[
-                    padded_state_indices[i]].unsqueeze(0),
-                initial_states=final_states_ref[padded_state_indices[i]].
-                unsqueeze(0) if has_initial_states[i] else None))
+                final_states_out=final_states_ref[padded_state_indices[i]].unsqueeze(0),
+                initial_states=final_states_ref[padded_state_indices[i]].unsqueeze(0)
+                if has_initial_states[i]
+                else None,
+            )
+        )
    out_ref.append(torch.cat([t[0] for t in out_ref_b], dim=2))
    out_ref_tensor = torch.cat(out_ref, dim=0)

-    assert torch.allclose(final_states[state_indices],
-                          final_states_ref[state_indices],
-                          rtol=rtol,
-                          atol=atol)
-    unpadded_out = out[:, :out_ref_tensor.shape[-1]]
+    assert torch.allclose(
+        final_states[state_indices],
+        final_states_ref[state_indices],
+        rtol=rtol,
+        atol=atol,
+    )
+    unpadded_out = out[:, : out_ref_tensor.shape[-1]]
    assert torch.allclose(unpadded_out, out_ref_tensor, rtol=rtol, atol=atol)
--- a/tests/kernels/mamba/test_mamba_mixer2.py
+++ b/tests/kernels/mamba/test_mamba_mixer2.py
@@ -7,8 +7,10 @@ import pytest
 import torch

 from tests.utils import multi_gpu_test
-from vllm.distributed.parallel_state import (init_distributed_environment,
-                                             initialize_model_parallel)
+from vllm.distributed.parallel_state import (
+    init_distributed_environment,
+    initialize_model_parallel,
+)
 from vllm.model_executor.layers.mamba.mamba_mixer2 import Mixer2RMSNormGated
 from vllm.platforms import current_platform
 from vllm.utils import update_environment_variables
@@ -24,14 +26,15 @@ from vllm.utils import update_environment_variables
        (64, 2),
        (64, 4),  # hidden_size be divisible by num_gpus
        (100, 5),  # and n_groups must divide hidden_size
-    ])
+    ],
+)
@pytest.mark.parametrize("dtype", [torch.float16])
 def test_mixer2_gated_norm_multi_gpu(
    batch_size: int,
    seq_len: int,
    hidden_size_n_groups: tuple[int, int],
    dtype: torch.dtype,
-    device: str = 'cuda',
+    device: str = "cuda",
 ):
    hidden_size, n_groups = hidden_size_n_groups
    num_processes = 2
@@ -39,17 +42,19 @@ def test_mixer2_gated_norm_multi_gpu(
    def run_torch_spawn(fn, nprocs):
        # need to use torch.mp.spawn otherwise will have problems with
        # torch.distributed and cuda
-        torch.multiprocessing.spawn(fn,
-                                    args=(
-                                        num_processes,
-                                        batch_size,
-                                        seq_len,
-                                        hidden_size,
-                                        n_groups,
-                                        dtype,
-                                        device,
-                                    ),
-                                    nprocs=nprocs)
+        torch.multiprocessing.spawn(
+            fn,
+            args=(
+                num_processes,
+                batch_size,
+                seq_len,
+                hidden_size,
+                n_groups,
+                dtype,
+                device,
+            ),
+            nprocs=nprocs,
+        )

    run_torch_spawn(mixer2_gated_norm_tensor_parallel, 2)

@@ -71,20 +76,22 @@ def mixer2_gated_norm_tensor_parallel(
    torch.set_default_device(device)
    torch.set_default_dtype(dtype)

-    update_environment_variables({
-        'RANK': str(local_rank),
-        'LOCAL_RANK': str(local_rank),
-        'WORLD_SIZE': str(world_size),
-        'MASTER_ADDR': 'localhost',
-        'MASTER_PORT': '12345',
-    })
+    update_environment_variables(
+        {
+            "RANK": str(local_rank),
+            "LOCAL_RANK": str(local_rank),
+            "WORLD_SIZE": str(world_size),
+            "MASTER_ADDR": "localhost",
+            "MASTER_PORT": "12345",
+        }
+    )

    # initialize distributed
    init_distributed_environment()
    initialize_model_parallel(tensor_model_parallel_size=world_size)

    # create random weights and inputs
-    weight = torch.rand((hidden_size, ), dtype=dtype, device=device)
+    weight = torch.rand((hidden_size,), dtype=dtype, device=device)
    hidden_states = torch.randn(batch_size, seq_len, hidden_size)
    gate_states = torch.randn(batch_size, seq_len, hidden_size)

@@ -97,14 +104,18 @@ def mixer2_gated_norm_tensor_parallel(

    # create gated-norm without TP to compute reference
    # - utilize mock patching to disable TP when
-    with (unittest.mock.patch(
+    with (
+        unittest.mock.patch(
            "vllm.model_executor.layers.mamba.mamba_mixer2."
            "get_tensor_model_parallel_world_size",
-            return_value=1),
-          unittest.mock.patch(
-              "vllm.model_executor.layers.mamba.mamba_mixer2."
-              "get_tensor_model_parallel_rank",
-              return_value=0)):
+            return_value=1,
+        ),
+        unittest.mock.patch(
+            "vllm.model_executor.layers.mamba.mamba_mixer2."
+            "get_tensor_model_parallel_rank",
+            return_value=0,
+        ),
+    ):
        mixer_single_gpu = Mixer2RMSNormGated(
            full_hidden_size=hidden_size,
            full_n_groups=n_groups,
@@ -115,12 +126,13 @@ def mixer2_gated_norm_tensor_parallel(
    # generate and compare
    N = hidden_size // world_size
    output = mixer(
-        hidden_states[..., local_rank * N:(local_rank + 1) * N],
-        gate_states[..., local_rank * N:(local_rank + 1) * N],
+        hidden_states[..., local_rank * N : (local_rank + 1) * N],
+        gate_states[..., local_rank * N : (local_rank + 1) * N],
    )
    ref_output = mixer_single_gpu(hidden_states, gate_states)
-    torch.testing.assert_close(output,
-                               ref_output[...,
-                                          local_rank * N:(local_rank + 1) * N],
-                               atol=5e-3,
-                               rtol=1e-3)
+    torch.testing.assert_close(
+        output,
+        ref_output[..., local_rank * N : (local_rank + 1) * N],
+        atol=5e-3,
+        rtol=1e-3,
+    )
--- a/tests/kernels/mamba/test_mamba_ssm.py
+++ b/tests/kernels/mamba/test_mamba_ssm.py
@@ -10,20 +10,15 @@ from tests.kernels.utils import opcheck
 from vllm import _custom_ops as ops  # noqa: F401
 from vllm.attention.backends.utils import PAD_SLOT_ID
 from vllm.model_executor.layers.mamba.ops.mamba_ssm import (
-    selective_scan_fn, selective_state_update)
+    selective_scan_fn,
+    selective_state_update,
+)
 from vllm.platforms import current_platform


-def selective_state_update_ref(state,
-                               x,
-                               dt,
-                               A,
-                               B,
-                               C,
-                               D=None,
-                               z=None,
-                               dt_bias=None,
-                               dt_softplus=False):
+def selective_state_update_ref(
+    state, x, dt, A, B, C, D=None, z=None, dt_bias=None, dt_softplus=False
+):
    """
    Argument:
        state: (batch, dim, dstate) or (batch, nheads, dim, dstate)
@@ -73,16 +68,17 @@ def selective_state_update_ref(state,
        assert dt_bias.shape == (nheads, dim)
        dt = dt + dt_bias
    dt = F.softplus(dt) if dt_softplus else dt
-    dA = torch.exp(rearrange(dt, "b h d -> b h d 1") *
-                   A)  # (batch, nheads, dim, dstate)
-    B = repeat(B, "b g n -> b (g h) n",
-               h=nheads // ngroups)  # (batch, nheads, dstate)
-    C = repeat(C, "b g n -> b (g h) n",
-               h=nheads // ngroups)  # (batch, nheads, dstate)
+    dA = torch.exp(
+        rearrange(dt, "b h d -> b h d 1") * A
+    )  # (batch, nheads, dim, dstate)
+    B = repeat(B, "b g n -> b (g h) n", h=nheads // ngroups)  # (batch, nheads, dstate)
+    C = repeat(C, "b g n -> b (g h) n", h=nheads // ngroups)  # (batch, nheads, dstate)
    dB = rearrange(dt, "b h d -> b h d 1") * rearrange(
-        B, "b h n -> b h 1 n")  # (batch, nheads, dim, dstate)
-    state.copy_(state * dA +
-                dB * rearrange(x, "b h d -> b h d 1"))  # (batch, dim, dstate
+        B, "b h n -> b h 1 n"
+    )  # (batch, nheads, dim, dstate)
+    state.copy_(
+        state * dA + dB * rearrange(x, "b h d -> b h d 1")
+    )  # (batch, dim, dstate
    out = torch.einsum("bhdn,bhn->bhd", state.to(C.dtype), C)
    if D is not None:
        out += (x * D).to(out.dtype)
@@ -92,18 +88,20 @@ def selective_state_update_ref(state,
    return out


-def selective_scan_ref(u,
-                       delta,
-                       A,
-                       B,
-                       C,
-                       D=None,
-                       z=None,
-                       delta_bias=None,
-                       delta_softplus=False,
-                       return_last_state=False,
-                       prev_state=None,
-                       final_state_out=None):
+def selective_scan_ref(
+    u,
+    delta,
+    A,
+    B,
+    C,
+    D=None,
+    z=None,
+    delta_bias=None,
+    delta_softplus=False,
+    return_last_state=False,
+    prev_state=None,
+    final_state_out=None,
+):
    """
    u: r(B D L)
    delta: r(B D L)
@@ -132,26 +130,26 @@ def selective_scan_ref(u,
    C = C.float()
    x = A.new_zeros((batch, dim, dstate)) if prev_state is None else prev_state
    ys = []
-    deltaA = torch.exp(torch.einsum('bdl,dn->bdln', delta, A))
+    deltaA = torch.exp(torch.einsum("bdl,dn->bdln", delta, A))
    if not is_variable_B:
-        deltaB_u = torch.einsum('bdl,dn,bdl->bdln', delta, B, u)
+        deltaB_u = torch.einsum("bdl,dn,bdl->bdln", delta, B, u)
    else:
        if B.dim() == 3:
-            deltaB_u = torch.einsum('bdl,bnl,bdl->bdln', delta, B, u)
+            deltaB_u = torch.einsum("bdl,bnl,bdl->bdln", delta, B, u)
        else:
            B = repeat(B, "B G N L -> B (G H) N L", H=dim // B.shape[1])
-            deltaB_u = torch.einsum('bdl,bdnl,bdl->bdln', delta, B, u)
+            deltaB_u = torch.einsum("bdl,bdnl,bdl->bdln", delta, B, u)
    if is_variable_C and C.dim() == 4:
        C = repeat(C, "B G N L -> B (G H) N L", H=dim // C.shape[1])
    for i in range(u.shape[2]):
        x = deltaA[:, :, i] * x + deltaB_u[:, :, i]
        if not is_variable_C:
-            y = torch.einsum('bdn,dn->bd', x, C)
+            y = torch.einsum("bdn,dn->bd", x, C)
        else:
            if C.dim() == 3:
-                y = torch.einsum('bdn,bn->bd', x, C[:, :, i])
+                y = torch.einsum("bdn,bn->bd", x, C[:, :, i])
            else:
-                y = torch.einsum('bdn,bdn->bd', x, C[:, :, :, i])
+                y = torch.einsum("bdn,bdn->bd", x, C[:, :, :, i])
        if i == u.shape[2] - 1:
            if final_state_out is None:
                final_state_out = x
@@ -166,20 +164,22 @@ def selective_scan_ref(u,
    return out if not return_last_state else (out, final_state_out)


-def selective_scan_opcheck_fn(u,
-                              delta,
-                              A,
-                              B,
-                              C,
-                              D=None,
-                              z=None,
-                              delta_bias=None,
-                              delta_softplus=False,
-                              cu_seq_len=None,
-                              cache_indices=None,
-                              has_initial_state=None,
-                              ssm_states=None,
-                              pad_slot_id=PAD_SLOT_ID):
+def selective_scan_opcheck_fn(
+    u,
+    delta,
+    A,
+    B,
+    C,
+    D=None,
+    z=None,
+    delta_bias=None,
+    delta_softplus=False,
+    cu_seq_len=None,
+    cache_indices=None,
+    has_initial_state=None,
+    ssm_states=None,
+    pad_slot_id=PAD_SLOT_ID,
+):
    """if return_last_state is True, returns (out, last_state)
    last_state has shape (batch, dim, dstate).
    """
@@ -206,30 +206,55 @@ def selective_scan_opcheck_fn(u,

    # Disable test_autograd_registration for now as it seems to trigger
    # a bogus error.
-    opcheck(torch.ops._C.selective_scan_fwd,
-            (u, delta, A, B, C, D, z, delta_bias, delta_softplus, cu_seq_len,
-             cache_indices, has_initial_state, ssm_states, pad_slot_id),
-            test_utils=["test_schema", "test_faketensor"])
+    opcheck(
+        torch.ops._C.selective_scan_fwd,
+        (
+            u,
+            delta,
+            A,
+            B,
+            C,
+            D,
+            z,
+            delta_bias,
+            delta_softplus,
+            cu_seq_len,
+            cache_indices,
+            has_initial_state,
+            ssm_states,
+            pad_slot_id,
+        ),
+        test_utils=["test_schema", "test_faketensor"],
+    )


-@pytest.mark.parametrize('wtype', [torch.float32])
-@pytest.mark.parametrize('itype',
-                         [torch.float32, torch.float16, torch.bfloat16])
-@pytest.mark.parametrize('seqlen', [128, 256, 512, 1024, 2048, 4096])
-@pytest.mark.parametrize('has_delta_bias', [True])
-@pytest.mark.parametrize('delta_softplus', [True])
-@pytest.mark.parametrize('has_z', [True])
-@pytest.mark.parametrize('has_D', [True])
+@pytest.mark.parametrize("wtype", [torch.float32])
+@pytest.mark.parametrize("itype", [torch.float32, torch.float16, torch.bfloat16])
+@pytest.mark.parametrize("seqlen", [128, 256, 512, 1024, 2048, 4096])
+@pytest.mark.parametrize("has_delta_bias", [True])
+@pytest.mark.parametrize("delta_softplus", [True])
+@pytest.mark.parametrize("has_z", [True])
+@pytest.mark.parametrize("has_D", [True])
@pytest.mark.parametrize("varBC_groups", [1, 2])
@pytest.mark.parametrize("is_variable_C", [True])
@pytest.mark.parametrize("is_variable_B", [True])
@pytest.mark.parametrize("scan_chunks", [1, 2, 3])
-def test_selective_scan(is_variable_B, is_variable_C, varBC_groups, has_D,
-                        has_z, has_delta_bias, delta_softplus, seqlen, itype,
-                        wtype, scan_chunks):
+def test_selective_scan(
+    is_variable_B,
+    is_variable_C,
+    varBC_groups,
+    has_D,
+    has_z,
+    has_delta_bias,
+    delta_softplus,
+    seqlen,
+    itype,
+    wtype,
+    scan_chunks,
+):
    if varBC_groups > 1 and (not is_variable_B or not is_variable_C):
        pytest.skip()  # This config is not applicable
-    device = 'cuda'
+    device = "cuda"
    rtol, atol = (6e-4, 2e-3) if itype == torch.float32 else (3e-3, 5e-3)
    if itype == torch.bfloat16:
        rtol, atol = 3e-2, 5e-2
@@ -242,7 +267,7 @@ def test_selective_scan(is_variable_B, is_variable_C, varBC_groups, has_D,
    batch_size = 1
    dim = 4
    dstate = 8
-    A = (-0.5 * torch.rand(dim, dstate, device=device, dtype=wtype))
+    A = -0.5 * torch.rand(dim, dstate, device=device, dtype=wtype)
    A_ref = A.clone()
    if not is_variable_B:
        B_shape = [dim, dstate]
@@ -250,9 +275,7 @@ def test_selective_scan(is_variable_B, is_variable_C, varBC_groups, has_D,
        B_shape = [batch_size, dstate, seqlen]
    else:
        B_shape = [batch_size, varBC_groups, dstate, seqlen]
-    B = torch.randn(B_shape,
-                    device=device,
-                    dtype=wtype if not is_variable_B else itype)
+    B = torch.randn(B_shape, device=device, dtype=wtype if not is_variable_B else itype)
    B_ref = B.clone()
    if not is_variable_C:
        C_shape = [dim, dstate]
@@ -260,27 +283,27 @@ def test_selective_scan(is_variable_B, is_variable_C, varBC_groups, has_D,
        C_shape = [batch_size, dstate, seqlen]
    else:
        C_shape = [batch_size, varBC_groups, dstate, seqlen]
-    C = torch.randn(C_shape,
-                    device=device,
-                    dtype=wtype if not is_variable_C else itype)
+    C = torch.randn(C_shape, device=device, dtype=wtype if not is_variable_C else itype)
    C_ref = C.clone()
    D = torch.randn(dim, device=device, dtype=torch.float32) if has_D else None
    D_ref = D.clone()
-    z = torch.randn(batch_size, dim, seqlen, device=device,
-                    dtype=itype) if has_z else None
+    z = (
+        torch.randn(batch_size, dim, seqlen, device=device, dtype=itype)
+        if has_z
+        else None
+    )
    z_ref = z.clone() if has_z else None
-    delta_bias = (0.5 * torch.rand(dim, device=device, dtype=torch.float32)
-                  ) if has_delta_bias else None
+    delta_bias = (
+        (0.5 * torch.rand(dim, device=device, dtype=torch.float32))
+        if has_delta_bias
+        else None
+    )
    u = torch.randn(batch_size, dim, seqlen, device=device, dtype=itype)
    u_ref = u.clone()
-    delta = (0.5 *
-             torch.rand(batch_size, dim, seqlen, device=device, dtype=itype))
+    delta = 0.5 * torch.rand(batch_size, dim, seqlen, device=device, dtype=itype)
    delta_ref = delta.clone()
    state_shape = (batch_size, u.shape[1], int(A.shape[1]))
-    state = torch.randn(state_shape,
-                        device=u.device,
-                        dtype=itype,
-                        requires_grad=False)
+    state = torch.randn(state_shape, device=u.device, dtype=itype, requires_grad=False)
    state_ref = state.clone()
    out = None
    out_ref = None
@@ -312,9 +335,10 @@ def test_selective_scan(is_variable_B, is_variable_C, varBC_groups, has_D,
            z=_z,
            delta_bias=delta_bias,
            delta_softplus=delta_softplus,
-            has_initial_state=torch.ones(batch_size,
-                                         device=u.device,
-                                         dtype=torch.bool) if c > 0 else None)
+            has_initial_state=torch.ones(batch_size, device=u.device, dtype=torch.bool)
+            if c > 0
+            else None,
+        )
        outs.append(out)
    if len(outs) > 1:
        out = torch.cat(outs, dim=-1)
@@ -329,27 +353,29 @@ def test_selective_scan(is_variable_B, is_variable_C, varBC_groups, has_D,
        z=z_ref,
        delta_bias=delta_bias,
        delta_softplus=delta_softplus,
-        return_last_state=True)
+        return_last_state=True,
+    )

    assert out is not None and out_ref is not None
    assert torch.allclose(out, out_ref, rtol=rtol, atol=atol)
    assert state is not None and state_ref is not None
    assert torch.allclose(state, state_ref.to(itype), rtol=rtol, atol=atol)

-    selective_scan_opcheck_fn(u,
-                              delta,
-                              A,
-                              B,
-                              C,
-                              D,
-                              z,
-                              delta_bias=delta_bias,
-                              delta_softplus=delta_softplus,
-                              ssm_states=state)
+    selective_scan_opcheck_fn(
+        u,
+        delta,
+        A,
+        B,
+        C,
+        D,
+        z,
+        delta_bias=delta_bias,
+        delta_softplus=delta_softplus,
+        ssm_states=state,
+    )


-@pytest.mark.parametrize("itype",
-                         [torch.float32, torch.float16, torch.bfloat16])
+@pytest.mark.parametrize("itype", [torch.float32, torch.float16, torch.bfloat16])
@pytest.mark.parametrize("has_z", [False, True])
@pytest.mark.parametrize("dstate", [16, 32, 64])
@pytest.mark.parametrize("dim", [2048, 2048 + 16, 4096])
@@ -374,52 +400,47 @@ def test_selective_state_update(dim, dstate, has_z, itype):
    D = torch.randn(dim, device=device)
    z = torch.randn_like(x) if has_z else None
    state_ref = state.detach().clone()
-    selective_state_update(state,
-                           x,
-                           dt,
-                           A,
-                           B,
-                           C,
-                           D=D,
-                           z=z,
-                           dt_bias=dt_bias,
-                           dt_softplus=True,
-                           out=out)
-    out_ref = selective_state_update_ref(state_ref,
-                                         x,
-                                         dt,
-                                         A,
-                                         B,
-                                         C,
-                                         D=D,
-                                         z=z,
-                                         dt_bias=dt_bias,
-                                         dt_softplus=True)
+    selective_state_update(
+        state, x, dt, A, B, C, D=D, z=z, dt_bias=dt_bias, dt_softplus=True, out=out
+    )
+    out_ref = selective_state_update_ref(
+        state_ref, x, dt, A, B, C, D=D, z=z, dt_bias=dt_bias, dt_softplus=True
+    )

    assert torch.allclose(state, state_ref, rtol=rtol, atol=atol)
    assert torch.allclose(out, out_ref, rtol=rtol, atol=atol)


-@pytest.mark.parametrize('wtype', [torch.float32])
-@pytest.mark.parametrize('itype', [torch.float32])
-@pytest.mark.parametrize('seqlen', [1, 128, 129, 256, 512, 1024, 2048, 4096])
+@pytest.mark.parametrize("wtype", [torch.float32])
+@pytest.mark.parametrize("itype", [torch.float32])
+@pytest.mark.parametrize("seqlen", [1, 128, 129, 256, 512, 1024, 2048, 4096])
@pytest.mark.parametrize("return_last_state", [True])
-@pytest.mark.parametrize('has_delta_bias', [True])
-@pytest.mark.parametrize('delta_softplus', [True])
-@pytest.mark.parametrize('has_z', [True])
-@pytest.mark.parametrize('has_D', [True])
+@pytest.mark.parametrize("has_delta_bias", [True])
+@pytest.mark.parametrize("delta_softplus", [True])
+@pytest.mark.parametrize("has_z", [True])
+@pytest.mark.parametrize("has_D", [True])
@pytest.mark.parametrize("varBC_groups", [1, 2])
@pytest.mark.parametrize("is_variable_C", [True])
@pytest.mark.parametrize("is_variable_B", [True])
 # tests correctness in case subset of the sequences are padded
@pytest.mark.parametrize("with_padding", [False, True])
-def test_selective_scan_varlen(with_padding, is_variable_B, is_variable_C,
-                               varBC_groups, has_D, has_z, has_delta_bias,
-                               delta_softplus, return_last_state, seqlen,
-                               itype, wtype):
+def test_selective_scan_varlen(
+    with_padding,
+    is_variable_B,
+    is_variable_C,
+    varBC_groups,
+    has_D,
+    has_z,
+    has_delta_bias,
+    delta_softplus,
+    return_last_state,
+    seqlen,
+    itype,
+    wtype,
+):
    if varBC_groups > 1 and (not is_variable_B or not is_variable_C):
        pytest.skip()  # This config is not applicable
-    device = 'cuda'
+    device = "cuda"
    rtol, atol = (6e-4, 2e-3) if itype == torch.float32 else (3e-3, 5e-3)
    if itype == torch.bfloat16:
        rtol, atol = 3e-2, 5e-2
@@ -443,72 +464,79 @@ def test_selective_scan_varlen(with_padding, is_variable_B, is_variable_C,
    eos_pos = torch.randperm(seqlen - 1)[:nsplits].sort().values
    seqlens.append(
        torch.diff(
-            torch.cat(
-                [torch.tensor([-1]), eos_pos,
-                 torch.tensor([seqlen - 1])])).tolist())
+            torch.cat([torch.tensor([-1]), eos_pos, torch.tensor([seqlen - 1])])
+        ).tolist()
+    )

    assert sum(seqlens[-1]) == seqlen
    assert all(s > 0 for s in seqlens[-1])

    total_entries = batch_size * 10
    cumsum = torch.cumsum(torch.tensor(seqlens[0]), dim=0).to(torch.int32)
-    cumsum = torch.concat([torch.tensor([0], dtype=torch.int32), cumsum],
-                          dim=0).cuda()
+    cumsum = torch.concat([torch.tensor([0], dtype=torch.int32), cumsum], dim=0).cuda()

    dim = 4
    dstate = 8
-    A = (-0.5 * torch.rand(dim, dstate, device=device, dtype=wtype))
+    A = -0.5 * torch.rand(dim, dstate, device=device, dtype=wtype)
    A_ref = A.clone()
    B_shape = [varBC_groups, dstate, seqlen]
-    B = torch.randn(B_shape,
-                    device=device,
-                    dtype=wtype if not is_variable_B else itype)
+    B = torch.randn(B_shape, device=device, dtype=wtype if not is_variable_B else itype)
    B_ref = B.clone()
    C_shape = [varBC_groups, dstate, seqlen]
-    C = torch.randn(C_shape,
-                    device=device,
-                    dtype=wtype if not is_variable_C else itype)
+    C = torch.randn(C_shape, device=device, dtype=wtype if not is_variable_C else itype)
    C_ref = C.clone()
    D = torch.randn(dim, device=device, dtype=torch.float32) if has_D else None
    D_ref = D.clone()
    z = torch.randn(dim, seqlen, device=device, dtype=itype)
    z_ref = z.clone()
-    delta_bias = (0.5 * torch.rand(dim, device=device, dtype=torch.float32)
-                  ) if has_delta_bias else None
+    delta_bias = (
+        (0.5 * torch.rand(dim, device=device, dtype=torch.float32))
+        if has_delta_bias
+        else None
+    )
    u = torch.randn(dim, seqlen, device=device, dtype=itype)
    u_ref = u.clone()
-    delta = (0.5 * torch.rand(dim, seqlen, device=device, dtype=itype))
+    delta = 0.5 * torch.rand(dim, seqlen, device=device, dtype=itype)
    delta_ref = delta.clone()
    out = None
    out_ref = None

    prev_state_shape = (total_entries, u.shape[0], int(A.shape[1]))
-    prev_state = torch.randn(prev_state_shape,
-                             device=u.device,
-                             dtype=itype,
-                             requires_grad=False)
+    prev_state = torch.randn(
+        prev_state_shape, device=u.device, dtype=itype, requires_grad=False
+    )
    prev_state_ref = prev_state.clone()
-    state_indices = torch.randperm(total_entries,
-                                   dtype=torch.int32,
-                                   device=u.device)[:batch_size]
-    unused_states_bool = torch.ones(total_entries,
-                                    dtype=torch.bool,
-                                    device=device)
+    state_indices = torch.randperm(total_entries, dtype=torch.int32, device=u.device)[
+        :batch_size
+    ]
+    unused_states_bool = torch.ones(total_entries, dtype=torch.bool, device=device)
    unused_states_bool[state_indices] = False
-    padded_state_indices = torch.concat([
-        state_indices,
-        torch.as_tensor(
-            [PAD_SLOT_ID] * padding, dtype=torch.int32, device=device),
-    ],
-                                        dim=-1)
+    padded_state_indices = torch.concat(
+        [
+            state_indices,
+            torch.as_tensor([PAD_SLOT_ID] * padding, dtype=torch.int32, device=device),
+        ],
+        dim=-1,
+    )

-    has_initial_state = torch.randint(0,
-                                      2, (cumsum.shape[0] - 1, ),
-                                      dtype=torch.bool,
-                                      device=u.device)
-    out = selective_scan_fn(u, prev_state, delta, A, B, C, D, z, delta_bias,
-                            delta_softplus, cumsum, padded_state_indices,
-                            has_initial_state)
+    has_initial_state = torch.randint(
+        0, 2, (cumsum.shape[0] - 1,), dtype=torch.bool, device=u.device
+    )
+    out = selective_scan_fn(
+        u,
+        prev_state,
+        delta,
+        A,
+        B,
+        C,
+        D,
+        z,
+        delta_bias,
+        delta_softplus,
+        cumsum,
+        padded_state_indices,
+        has_initial_state,
+    )
    outs_ref = []
    splits = [
        torch.split(var, seqlens[0], dim=-1)
@@ -530,33 +558,46 @@ def test_selective_scan_varlen(with_padding, is_variable_B, is_variable_C,
            delta_softplus=delta_softplus,
            return_last_state=return_last_state,
            prev_state=prev_state_ref[padded_state_indices[i]].unsqueeze(0)
-            if has_initial_state[i] else None,
-            final_state_out=prev_state_ref[padded_state_indices[i]].unsqueeze(
-                0))
+            if has_initial_state[i]
+            else None,
+            final_state_out=prev_state_ref[padded_state_indices[i]].unsqueeze(0),
+        )
        outs_ref.append(out_ref_s)
    out_ref = torch.cat(outs_ref, dim=-1)[0]

-    unpadded_out = out[:, :out_ref[0].shape[-1]]
+    unpadded_out = out[:, : out_ref[0].shape[-1]]
    print("Output diff max", (unpadded_out - out_ref).max())
    print("Output diff mean", (unpadded_out - out_ref).mean())
    print("Output state diff max", (prev_state - prev_state_ref).max())
    print("Output state diff mean", (prev_state - prev_state_ref).mean())
    assert torch.allclose(prev_state, prev_state_ref, rtol=rtol, atol=atol)
    assert torch.allclose(unpadded_out, out_ref, rtol=rtol, atol=atol)
-    selective_scan_opcheck_fn(u, delta, A, B, C, D, z, delta_bias,
-                              delta_softplus, cumsum, padded_state_indices,
-                              has_initial_state, prev_state)
+    selective_scan_opcheck_fn(
+        u,
+        delta,
+        A,
+        B,
+        C,
+        D,
+        z,
+        delta_bias,
+        delta_softplus,
+        cumsum,
+        padded_state_indices,
+        has_initial_state,
+        prev_state,
+    )


-@pytest.mark.parametrize("itype",
-                         [torch.float32, torch.float16, torch.bfloat16])
+@pytest.mark.parametrize("itype", [torch.float32, torch.float16, torch.bfloat16])
@pytest.mark.parametrize("has_z", [True])
@pytest.mark.parametrize("dstate", [16, 32, 64])
@pytest.mark.parametrize("dim", [2048, 2048 + 16, 4096])
 # tests correctness in case subset of the sequences are padded
@pytest.mark.parametrize("with_padding", [True, False])
-def test_selective_state_update_with_batch_indices(with_padding, dim, dstate,
-                                                   has_z, itype):
+def test_selective_state_update_with_batch_indices(
+    with_padding, dim, dstate, has_z, itype
+):
    device = "cuda"
    rtol, atol = (3e-4, 1e-3) if itype == torch.float32 else (5e-3, 1e-2)
    if itype == torch.bfloat16:
@@ -571,17 +612,17 @@ def test_selective_state_update_with_batch_indices(with_padding, dim, dstate,
    total_entries = 10 * batch_size
    state = torch.randn(total_entries, dim, dstate, dtype=itype, device=device)
    state_indices = torch.randperm(total_entries)[:batch_size].to(
-        dtype=torch.int32, device=device)
-    unused_states_bool = torch.ones(total_entries,
-                                    dtype=torch.bool,
-                                    device=device)
+        dtype=torch.int32, device=device
+    )
+    unused_states_bool = torch.ones(total_entries, dtype=torch.bool, device=device)
    unused_states_bool[state_indices] = False
-    padded_state_indices = torch.concat([
-        state_indices,
-        torch.as_tensor(
-            [PAD_SLOT_ID] * padding, dtype=torch.int32, device=device)
-    ],
-                                        dim=0)
+    padded_state_indices = torch.concat(
+        [
+            state_indices,
+            torch.as_tensor([PAD_SLOT_ID] * padding, dtype=torch.int32, device=device),
+        ],
+        dim=0,
+    )
    x = torch.randn(padded_batch_size, dim, device=device, dtype=itype)
    out = torch.empty_like(x)
    dt = torch.randn(padded_batch_size, dim, device=device, dtype=itype)
@@ -593,61 +634,60 @@ def test_selective_state_update_with_batch_indices(with_padding, dim, dstate,
    z = torch.randn_like(x) if has_z else None
    state_ref = state[state_indices, :].clone()
    state_before = state.clone()
-    selective_state_update(state,
-                           x,
-                           dt,
-                           A,
-                           B,
-                           C,
-                           D=D,
-                           z=z,
-                           dt_bias=dt_bias,
-                           dt_softplus=True,
-                           state_batch_indices=padded_state_indices,
-                           pad_slot_id=PAD_SLOT_ID,
-                           out=out)
-    out_ref = selective_state_update_ref(state_ref,
-                                         x[:batch_size],
-                                         dt[:batch_size],
-                                         A,
-                                         B[:batch_size],
-                                         C[:batch_size],
-                                         D=D,
-                                         z=z[:batch_size],
-                                         dt_bias=dt_bias,
-                                         dt_softplus=True)
+    selective_state_update(
+        state,
+        x,
+        dt,
+        A,
+        B,
+        C,
+        D=D,
+        z=z,
+        dt_bias=dt_bias,
+        dt_softplus=True,
+        state_batch_indices=padded_state_indices,
+        pad_slot_id=PAD_SLOT_ID,
+        out=out,
+    )
+    out_ref = selective_state_update_ref(
+        state_ref,
+        x[:batch_size],
+        dt[:batch_size],
+        A,
+        B[:batch_size],
+        C[:batch_size],
+        D=D,
+        z=z[:batch_size],
+        dt_bias=dt_bias,
+        dt_softplus=True,
+    )

    print("Output diff max", (out[:batch_size] - out_ref).max())
    print("Output diff mean", (out[:batch_size] - out_ref).mean())
    print("Output state diff max", (state[state_indices, :] - state_ref).max())
-    print("Output state diff mean",
-          (state[state_indices, :] - state_ref).mean())
+    print("Output state diff mean", (state[state_indices, :] - state_ref).mean())
    # test padded entries stay the same
    if with_padding:
-        assert torch.equal(state_before[unused_states_bool],
-                           state[unused_states_bool])
-        assert torch.equal(x[batch_size + 1:], x[batch_size + 1:])
-        assert torch.equal(dt[batch_size + 1:], dt[batch_size + 1:])
-        assert torch.equal(B[batch_size + 1:], B[batch_size + 1:])
-        assert torch.equal(C[batch_size + 1:], C[batch_size + 1:])
+        assert torch.equal(state_before[unused_states_bool], state[unused_states_bool])
+        assert torch.equal(x[batch_size + 1 :], x[batch_size + 1 :])
+        assert torch.equal(dt[batch_size + 1 :], dt[batch_size + 1 :])
+        assert torch.equal(B[batch_size + 1 :], B[batch_size + 1 :])
+        assert torch.equal(C[batch_size + 1 :], C[batch_size + 1 :])

    # test "real" entries
-    assert torch.allclose(state[state_indices, :],
-                          state_ref,
-                          rtol=rtol,
-                          atol=atol)
+    assert torch.allclose(state[state_indices, :], state_ref, rtol=rtol, atol=atol)
    assert torch.allclose(out[:batch_size], out_ref, rtol=rtol, atol=atol)


-@pytest.mark.parametrize("itype",
-                         [torch.float32, torch.float16, torch.bfloat16])
+@pytest.mark.parametrize("itype", [torch.float32, torch.float16, torch.bfloat16])
@pytest.mark.parametrize("has_z", [False, True])
@pytest.mark.parametrize("tie_hdim", [False, True])
@pytest.mark.parametrize("ngroups", [1, 2, 4])
@pytest.mark.parametrize("dstate", [16, 32, 64])
@pytest.mark.parametrize("dim", [2048, 4096])
 def test_selective_state_update_with_heads_with_batch_indices(
-        dim, dstate, ngroups, has_z, tie_hdim, itype):
+    dim, dstate, ngroups, has_z, tie_hdim, itype
+):
    device = "cuda"
    rtol, atol = (3e-4, 1e-3) if itype == torch.float32 else (5e-3, 3e-2)
    if itype == torch.bfloat16:
@@ -659,71 +699,55 @@ def test_selective_state_update_with_heads_with_batch_indices(
    nheads = dim // headdim

    total_entries = 10 * batch_size
-    state = torch.randn(total_entries,
-                        nheads,
-                        headdim,
-                        dstate,
-                        dtype=itype,
-                        device=device)
+    state = torch.randn(
+        total_entries, nheads, headdim, dstate, dtype=itype, device=device
+    )
    state_indices = torch.randperm(total_entries)[:batch_size].to(
-        dtype=torch.int32, device=device)
+        dtype=torch.int32, device=device
+    )

    x = torch.randn(batch_size, nheads, headdim, device=device, dtype=itype)
    out = torch.empty_like(x)
    if not tie_hdim:
-        dt = torch.randn(batch_size,
-                         nheads,
-                         headdim,
-                         device=device,
-                         dtype=itype)
+        dt = torch.randn(batch_size, nheads, headdim, device=device, dtype=itype)
        dt_bias = torch.rand(nheads, headdim, device=device) - 4.0
        A = -torch.rand(nheads, headdim, dstate, device=device) - 1.0
        D = torch.randn(nheads, headdim, device=device)
    else:
-        dt = repeat(torch.randn(batch_size, nheads, device=device,
-                                dtype=itype),
-                    "b h -> b h p",
-                    p=headdim)
-        dt_bias = repeat(torch.rand(nheads, device=device) - 4.0,
-                         "h -> h p",
-                         p=headdim)
-        A = repeat(-torch.rand(nheads, device=device) - 1.0,
-                   "h -> h p n",
-                   p=headdim,
-                   n=dstate)
+        dt = repeat(
+            torch.randn(batch_size, nheads, device=device, dtype=itype),
+            "b h -> b h p",
+            p=headdim,
+        )
+        dt_bias = repeat(torch.rand(nheads, device=device) - 4.0, "h -> h p", p=headdim)
+        A = repeat(
+            -torch.rand(nheads, device=device) - 1.0, "h -> h p n", p=headdim, n=dstate
+        )
        D = repeat(torch.randn(nheads, device=device), "h -> h p", p=headdim)
    B = torch.randn(batch_size, ngroups, dstate, device=device)
    C = torch.randn(batch_size, ngroups, dstate, device=device)
    z = torch.randn_like(x) if has_z else None
    state_ref = state[state_indices, :].detach().clone()
-    selective_state_update(state,
-                           x,
-                           dt,
-                           A,
-                           B,
-                           C,
-                           D=D,
-                           z=z,
-                           dt_bias=dt_bias,
-                           dt_softplus=True,
-                           state_batch_indices=state_indices,
-                           pad_slot_id=PAD_SLOT_ID,
-                           out=out)
-    out_ref = selective_state_update_ref(state_ref,
-                                         x,
-                                         dt,
-                                         A,
-                                         B,
-                                         C,
-                                         D=D,
-                                         z=z,
-                                         dt_bias=dt_bias,
-                                         dt_softplus=True)
+    selective_state_update(
+        state,
+        x,
+        dt,
+        A,
+        B,
+        C,
+        D=D,
+        z=z,
+        dt_bias=dt_bias,
+        dt_softplus=True,
+        state_batch_indices=state_indices,
+        pad_slot_id=PAD_SLOT_ID,
+        out=out,
+    )
+    out_ref = selective_state_update_ref(
+        state_ref, x, dt, A, B, C, D=D, z=z, dt_bias=dt_bias, dt_softplus=True
+    )

    print(f"Output max diff: {(out - out_ref).abs().max().item()}")
    print(f"Output mean diff: {(out - out_ref).abs().mean().item()}")
-    assert torch.allclose(state[state_indices, :],
-                          state_ref,
-                          rtol=rtol,
-                          atol=atol)
+    assert torch.allclose(state[state_indices, :], state_ref, rtol=rtol, atol=atol)
    assert torch.allclose(out, out_ref, rtol=rtol, atol=atol)
--- a/tests/kernels/mamba/test_mamba_ssm_ssd.py
+++ b/tests/kernels/mamba/test_mamba_ssm_ssd.py
@@ -7,10 +7,10 @@ import torch.nn.functional as F
 from einops import rearrange, repeat

 from vllm.model_executor.layers.mamba.ops.ssd_combined import (
-    mamba_chunk_scan_combined_varlen)
+    mamba_chunk_scan_combined_varlen,
+)
 from vllm.platforms import current_platform
-from vllm.v1.attention.backends.mamba2_attn import (
-    compute_varlen_chunk_metadata)
+from vllm.v1.attention.backends.mamba2_attn import compute_varlen_chunk_metadata

 # Added by the IBM Team, 2024

@@ -22,12 +22,10 @@ def segsum(x):
    """Calculates segment sum."""
    T = x.size(-1)
    x = repeat(x, "... d -> ... d e", e=T)
-    mask = torch.tril(torch.ones(T, T, device=x.device, dtype=bool),
-                      diagonal=-1)
+    mask = torch.tril(torch.ones(T, T, device=x.device, dtype=bool), diagonal=-1)
    x = x.masked_fill(~mask, 0)
    x_segsum = torch.cumsum(x, dim=-2)
-    mask = torch.tril(torch.ones(T, T, device=x.device, dtype=bool),
-                      diagonal=0)
+    mask = torch.tril(torch.ones(T, T, device=x.device, dtype=bool), diagonal=0)
    x_segsum = x_segsum.masked_fill(~mask, -torch.inf)
    return x_segsum

@@ -46,8 +44,9 @@ def ssd_minimal_discrete(X, A, B, C, block_len, initial_states=None):
    assert X.shape[1] % block_len == 0

    # Rearrange into blocks/chunks
-    X, A, B, C = (rearrange(x, "b (c l) ... -> b c l ...", l=block_len)
-                  for x in (X, A, B, C))
+    X, A, B, C = (
+        rearrange(x, "b (c l) ... -> b c l ...", l=block_len) for x in (X, A, B, C)
+    )

    A = rearrange(A, "b c l h -> b h c l")
    A_cumsum = torch.cumsum(A, dim=-1)
@@ -74,7 +73,7 @@ def ssd_minimal_discrete(X, A, B, C, block_len, initial_states=None):
    # 4. Compute state -> output conversion per chunk
    # (left term of low-rank factorization of off-diagonal blocks; C terms)
    state_decay_out = torch.exp(A_cumsum)
-    Y_off = torch.einsum('bclhn,bchpn,bhcl->bclhp', C, states, state_decay_out)
+    Y_off = torch.einsum("bclhn,bchpn,bhcl->bclhp", C, states, state_decay_out)

    # Add output of intra-chunk and inter-chunk terms
    # (diagonal and off-diagonal blocks)
@@ -82,42 +81,31 @@ def ssd_minimal_discrete(X, A, B, C, block_len, initial_states=None):
    return Y, final_state


-def generate_random_inputs(batch_size,
-                           seqlen,
-                           n_heads,
-                           d_head,
-                           itype,
-                           device='cuda'):
-
+def generate_random_inputs(batch_size, seqlen, n_heads, d_head, itype, device="cuda"):
    current_platform.seed_everything(0)
-    A = (-torch.exp(torch.rand(n_heads, dtype=itype, device=device)))
+    A = -torch.exp(torch.rand(n_heads, dtype=itype, device=device))
    dt = F.softplus(
-        torch.randn(batch_size, seqlen, n_heads, dtype=itype, device=device) -
-        4)
-    X = torch.randn((batch_size, seqlen, n_heads, d_head),
-                    dtype=itype,
-                    device=device)
-    B = torch.randn((batch_size, seqlen, n_heads, d_head),
-                    dtype=itype,
-                    device=device)
-    C = torch.randn((batch_size, seqlen, n_heads, d_head),
-                    dtype=itype,
-                    device=device)
+        torch.randn(batch_size, seqlen, n_heads, dtype=itype, device=device) - 4
+    )
+    X = torch.randn((batch_size, seqlen, n_heads, d_head), dtype=itype, device=device)
+    B = torch.randn((batch_size, seqlen, n_heads, d_head), dtype=itype, device=device)
+    C = torch.randn((batch_size, seqlen, n_heads, d_head), dtype=itype, device=device)

    return A, dt, X, B, C


-def generate_continuous_batched_examples(example_lens_by_batch,
-                                         num_examples,
-                                         full_length,
-                                         last_taken,
-                                         exhausted,
-                                         n_heads,
-                                         d_head,
-                                         itype,
-                                         device='cuda',
-                                         return_naive_ref=True):
-
+def generate_continuous_batched_examples(
+    example_lens_by_batch,
+    num_examples,
+    full_length,
+    last_taken,
+    exhausted,
+    n_heads,
+    d_head,
+    itype,
+    device="cuda",
+    return_naive_ref=True,
+):
    # this function generates a random examples of certain length
    # and then cut according to "example_lens_by_batch" and feed
    # them in continuous batches to the kernels.
@@ -126,23 +114,20 @@ def generate_continuous_batched_examples(example_lens_by_batch,
    # reference output.

    # generate the full-length example
-    A, dt, X, B, C = generate_random_inputs(num_examples, full_length, n_heads,
-                                            d_head, itype)
+    A, dt, X, B, C = generate_random_inputs(
+        num_examples, full_length, n_heads, d_head, itype
+    )

    if return_naive_ref:
-        Y_min, final_state_min = ssd_minimal_discrete(X * dt.unsqueeze(-1),
-                                                      A * dt,
-                                                      B,
-                                                      C,
-                                                      block_len=full_length //
-                                                      4)
+        Y_min, final_state_min = ssd_minimal_discrete(
+            X * dt.unsqueeze(-1), A * dt, B, C, block_len=full_length // 4
+        )

    # internal function that outputs a cont batch of examples
    # given a tuple of lengths for each example in the batch
    # e.g., example_lens=(8, 4) means take 8 samples from first eg,
    #       4 examples from second eg, etc
    def get_continuous_batch(example_lens: tuple[int, ...]):
-
        indices = []
        for i, x in enumerate(example_lens):
            c = last_taken.get(i, 0)
@@ -150,8 +135,10 @@ def generate_continuous_batched_examples(example_lens_by_batch,
            last_taken[i] = (c + x) % full_length
            exhausted[i] = last_taken[i] == 0

-        return (torch.concat([x[i, s:e] for i, (s, e) in enumerate(indices)
-                              ]).unsqueeze(0) for x in (dt, X, B, C))
+        return (
+            torch.concat([x[i, s:e] for i, (s, e) in enumerate(indices)]).unsqueeze(0)
+            for x in (dt, X, B, C)
+        )

    # internal function that maps "n" to the appropriate right boundary
    # value when forming continuous batches from examples of length given
@@ -163,19 +150,20 @@ def generate_continuous_batched_examples(example_lens_by_batch,

    IND_E = None
    for spec in example_lens_by_batch:
-
        # get the (maybe partial) example seen in this cont batch
        dt2, X2, B2, C2 = get_continuous_batch(spec)

        # get the metadata
-        cu_seqlens = torch.tensor((0, ) + spec, device=device).cumsum(dim=0)
-        seq_idx = torch.zeros(cu_seqlens[-1],
-                              dtype=torch.int32,
-                              device=cu_seqlens.device)
-        for i, (srt, end) in enumerate(zip(
+        cu_seqlens = torch.tensor((0,) + spec, device=device).cumsum(dim=0)
+        seq_idx = torch.zeros(
+            cu_seqlens[-1], dtype=torch.int32, device=cu_seqlens.device
+        )
+        for i, (srt, end) in enumerate(
+            zip(
                cu_seqlens,
                cu_seqlens[1:],
-        )):
+            )
+        ):
            seq_idx[srt:end] = i

        # for cont batch
@@ -190,19 +178,21 @@ def generate_continuous_batched_examples(example_lens_by_batch,
        X2 = X2.squeeze(0)
        B2 = B2.squeeze(0)
        C2 = C2.squeeze(0)
-        yield ([Y_min[s, IND_S[s]:IND_E[s]]
-                for s in range(num_examples)] if return_naive_ref else None,
-               cu_seqlens, seq_idx, (A, dt2, X2, B2, C2))
+        yield (
+            [Y_min[s, IND_S[s] : IND_E[s]] for s in range(num_examples)]
+            if return_naive_ref
+            else None,
+            cu_seqlens,
+            seq_idx,
+            (A, dt2, X2, B2, C2),
+        )


-@pytest.mark.parametrize("itype",
-                         [torch.float32, torch.float16, torch.bfloat16])
+@pytest.mark.parametrize("itype", [torch.float32, torch.float16, torch.bfloat16])
@pytest.mark.parametrize("n_heads", [3, 4, 11, 16, 32])
@pytest.mark.parametrize("d_head", [5, 8, 19, 32, 128])
@pytest.mark.parametrize("seq_len_chunk_size", [(112, 16), (128, 32)])
-def test_mamba_chunk_scan_single_example(d_head, n_heads, seq_len_chunk_size,
-                                         itype):
-
+def test_mamba_chunk_scan_single_example(d_head, n_heads, seq_len_chunk_size, itype):
    # this tests the kernels on a single example (bs=1)

    # TODO: the bfloat16 case requires higher thresholds. To be investigated
@@ -219,15 +209,16 @@ def test_mamba_chunk_scan_single_example(d_head, n_heads, seq_len_chunk_size,
    #   it is not an operational limitation.
    seqlen, chunk_size = seq_len_chunk_size

-    A, dt, X, B, C = generate_random_inputs(batch_size, seqlen, n_heads,
-                                            d_head, itype)
+    A, dt, X, B, C = generate_random_inputs(batch_size, seqlen, n_heads, d_head, itype)

-    Y_min, final_state_min = ssd_minimal_discrete(X * dt.unsqueeze(-1), A * dt,
-                                                  B, C, chunk_size)
+    Y_min, final_state_min = ssd_minimal_discrete(
+        X * dt.unsqueeze(-1), A * dt, B, C, chunk_size
+    )

    cu_seqlens = torch.tensor((0, seqlen), device="cuda").cumsum(dim=0)
    cu_chunk_seqlens, last_chunk_indices, seq_idx_chunks = (
-        compute_varlen_chunk_metadata(cu_seqlens, chunk_size))
+        compute_varlen_chunk_metadata(cu_seqlens, chunk_size)
+    )
    # varlen has implicit batch=1
    X = X.squeeze(0)
    dt = dt.squeeze(0)
@@ -255,10 +246,12 @@ def test_mamba_chunk_scan_single_example(d_head, n_heads, seq_len_chunk_size,

    # just test the last head
    # NOTE, in the kernel we always cast states to fp32
-    torch.testing.assert_close(final_state[:, -1].to(torch.float32),
-                               final_state_min[:, -1].to(torch.float32),
-                               atol=atol,
-                               rtol=rtol)
+    torch.testing.assert_close(
+        final_state[:, -1].to(torch.float32),
+        final_state_min[:, -1].to(torch.float32),
+        atol=atol,
+        rtol=rtol,
+    )


@pytest.mark.parametrize("itype", [torch.float32, torch.float16])
@@ -267,32 +260,40 @@ def test_mamba_chunk_scan_single_example(d_head, n_heads, seq_len_chunk_size,
@pytest.mark.parametrize(
    "seq_len_chunk_size_cases",
    [
-
        # small-ish chunk_size (8)
        (64, 8, 2, [(64, 32), (64, 32)]),
        (64, 8, 2, [(32, 32), (32, 32), (32, 32)]),
        (64, 8, 2, [(8, 8), (8, 8), (8, 8)]),  # chunk size boundary
-        (64, 8, 2, [(4, 4), (4, 4), (4, 4),
-                    (4, 4)]),  # chunk_size larger than cont batches
-        (64, 8, 5, [
-            (64, 32, 16, 8, 8),
-            (8, 16, 32, 16, 8),
-            (8, 8, 16, 32, 16),
-        ]),  # mode examples with varied lengths
-
+        (
+            64,
+            8,
+            2,
+            [(4, 4), (4, 4), (4, 4), (4, 4)],
+        ),  # chunk_size larger than cont batches
+        (
+            64,
+            8,
+            5,
+            [
+                (64, 32, 16, 8, 8),
+                (8, 16, 32, 16, 8),
+                (8, 8, 16, 32, 16),
+            ],
+        ),  # more examples with varied lengths
        # large-ish chunk_size (256)
-        (64, 256, 1, [(5, ), (1, ), (1, ),
-                      (1, )]),  # irregular sizes with small sequences
-        (64, 256, 2, [(5, 30), (1, 2), (1, 2),
-                      (1, 2)]),  # irregular sizes with small sequences
-
+        (64, 256, 1, [(5,), (1,), (1,), (1,)]),  # irregular sizes with small sequences
+        (
+            64,
+            256,
+            2,
+            [(5, 30), (1, 2), (1, 2), (1, 2)],
+        ),  # irregular sizes with small sequences
        # we also need to test some large seqlen
        # to catch errors with init states decay
        (768, 128, 2, [(138, 225), (138, 225)]),
-    ])
-def test_mamba_chunk_scan_cont_batch(d_head, n_heads, seq_len_chunk_size_cases,
-                                     itype):
-
+    ],
+)
+def test_mamba_chunk_scan_cont_batch(d_head, n_heads, seq_len_chunk_size_cases, itype):
    # this test with multiple examples in a continuous batch
    # (i.e. chunked prefill)

@@ -311,12 +312,17 @@ def test_mamba_chunk_scan_cont_batch(d_head, n_heads, seq_len_chunk_size_cases,

    states = None
    for Y_min, cu_seqlens, _token_seq_idx, (
-            A, dt, X, B, C) in generate_continuous_batched_examples(
-                cases, num_examples, seqlen, last_taken, exhausted, n_heads,
-                d_head, itype):
-
+        A,
+        dt,
+        X,
+        B,
+        C,
+    ) in generate_continuous_batched_examples(
+        cases, num_examples, seqlen, last_taken, exhausted, n_heads, d_head, itype
+    ):
        cu_chunk_seqlens, last_chunk_indices, seq_idx_chunks = (
-            compute_varlen_chunk_metadata(cu_seqlens, chunk_size))
+            compute_varlen_chunk_metadata(cu_seqlens, chunk_size)
+        )

        Y = torch.empty_like(X)
        new_states = mamba_chunk_scan_combined_varlen(
@@ -337,9 +343,8 @@ def test_mamba_chunk_scan_cont_batch(d_head, n_heads, seq_len_chunk_size_cases,

        # just test the last in sequence
        for i in range(num_examples):
-
            # just test one dim and dstate
-            Y_eg = Y[cu_seqlens[i]:cu_seqlens[i + 1], 0, 0]
+            Y_eg = Y[cu_seqlens[i] : cu_seqlens[i + 1], 0, 0]
            Y_min_eg = Y_min[i][:, 0, 0]
            torch.testing.assert_close(Y_eg, Y_min_eg, atol=atol, rtol=rtol)

@@ -347,18 +352,20 @@ def test_mamba_chunk_scan_cont_batch(d_head, n_heads, seq_len_chunk_size_cases,
        states = new_states
        for i, clear in exhausted.items():
            if clear:
-                states[i].fill_(0.)
+                states[i].fill_(0.0)
                exhausted[i] = False


@pytest.mark.parametrize("chunk_size", [8, 256])
-@pytest.mark.parametrize("seqlens", [
-    (16, 2, 8, 13),
-    (270, 88, 212, 203),
-    (16, 20),
-])
+@pytest.mark.parametrize(
+    "seqlens",
+    [
+        (16, 2, 8, 13),
+        (270, 88, 212, 203),
+        (16, 20),
+    ],
+)
 def test_mamba_chunk_scan_cont_batch_prefill_chunking(chunk_size, seqlens):
-
    # This test verifies the correctness of the chunked prefill implementation
    # in the mamba2 ssd kernels, by comparing concatenation (in the sequence
    # dimension) of chunked results with the full sequence result.
@@ -387,21 +394,25 @@ def test_mamba_chunk_scan_cont_batch_prefill_chunking(chunk_size, seqlens):
    last_taken: dict = {}  # map: eg -> pointer to last taken sample
    exhausted: dict = {}  # map: eg -> boolean indicating example is exhausted
    _, cu_seqlens, seq_idx, (A, dt, X, B, C) = next(
-        generate_continuous_batched_examples([seqlens],
-                                             num_sequences,
-                                             max_seqlen,
-                                             last_taken,
-                                             exhausted,
-                                             n_heads,
-                                             d_head,
-                                             itype,
-                                             return_naive_ref=False))
+        generate_continuous_batched_examples(
+            [seqlens],
+            num_sequences,
+            max_seqlen,
+            last_taken,
+            exhausted,
+            n_heads,
+            d_head,
+            itype,
+            return_naive_ref=False,
+        )
+    )
    seqlens = torch.tensor(seqlens, dtype=torch.int32, device=X.device)
    device = X.device

    ## full seqlen computation
    cu_chunk_seqlens, last_chunk_indices, seq_idx_chunks = (
-        compute_varlen_chunk_metadata(cu_seqlens, chunk_size))
+        compute_varlen_chunk_metadata(cu_seqlens, chunk_size)
+    )
    Y_ref = torch.empty_like(X)
    state_ref = mamba_chunk_scan_combined_varlen(
        X,
@@ -422,11 +433,9 @@ def test_mamba_chunk_scan_cont_batch_prefill_chunking(chunk_size, seqlens):
    ## chunked seqlen computation
    # first chunk
    chunked_seqlens = seqlens // 2
-    chunked_cu_seqlens = torch.cat([
-        torch.tensor([0], device=device),
-        torch.cumsum(chunked_seqlens, dim=0)
-    ],
-                                   dim=0)
+    chunked_cu_seqlens = torch.cat(
+        [torch.tensor([0], device=device), torch.cumsum(chunked_seqlens, dim=0)], dim=0
+    )
    chunked_input_seq_len = chunked_cu_seqlens[-1]
    X_chunked = torch.zeros_like(X)[:chunked_input_seq_len, ...]
    dt_chunked = torch.zeros_like(dt)[:chunked_input_seq_len, ...]
@@ -443,7 +452,8 @@ def test_mamba_chunk_scan_cont_batch_prefill_chunking(chunk_size, seqlens):
        # fmt: on

    cu_chunk_seqlens, last_chunk_indices, seq_idx_chunks = (
-        compute_varlen_chunk_metadata(chunked_cu_seqlens, chunk_size))
+        compute_varlen_chunk_metadata(chunked_cu_seqlens, chunk_size)
+    )
    Y_partial = torch.empty_like(X_chunked)
    partial_state = mamba_chunk_scan_combined_varlen(
        X_chunked,
@@ -463,11 +473,13 @@ def test_mamba_chunk_scan_cont_batch_prefill_chunking(chunk_size, seqlens):

    # remaining chunk
    remaining_chunked_seqlens = seqlens - chunked_seqlens
-    remaining_chunked_cu_seqlens = torch.cat([
-        torch.tensor([0], device=device),
-        torch.cumsum(remaining_chunked_seqlens, dim=0)
-    ],
-                                             dim=0)
+    remaining_chunked_cu_seqlens = torch.cat(
+        [
+            torch.tensor([0], device=device),
+            torch.cumsum(remaining_chunked_seqlens, dim=0),
+        ],
+        dim=0,
+    )
    remaining_chunked_input_seq_len = remaining_chunked_cu_seqlens[-1]
    # fmt: off
    remaining_X_chunked = torch.zeros_like(X)[:remaining_chunked_input_seq_len, ...]  # noqa: E501
@@ -497,8 +509,8 @@ def test_mamba_chunk_scan_cont_batch_prefill_chunking(chunk_size, seqlens):
    assert concat_batch_f(C_chunked, remaining_C_chunked).equal(C)

    cu_chunk_seqlens, last_chunk_indices, seq_idx_chunks = (
-        compute_varlen_chunk_metadata(remaining_chunked_cu_seqlens,
-                                      chunk_size))
+        compute_varlen_chunk_metadata(remaining_chunked_cu_seqlens, chunk_size)
+    )

    Y_chunked = torch.empty_like(remaining_X_chunked)
    state_chunked = mamba_chunk_scan_combined_varlen(
@@ -520,20 +532,22 @@ def test_mamba_chunk_scan_cont_batch_prefill_chunking(chunk_size, seqlens):

    # kernel chunked is same as kernel overall
    for i in range(num_sequences):
-        Y_seq = Y[cu_seqlens[i]:cu_seqlens[i + 1], ...]
-        Y_ref_seq = Y_ref[cu_seqlens[i]:cu_seqlens[i + 1], ...]
+        Y_seq = Y[cu_seqlens[i] : cu_seqlens[i + 1], ...]
+        Y_ref_seq = Y_ref[cu_seqlens[i] : cu_seqlens[i + 1], ...]
        torch.testing.assert_close(
-            Y_seq[:chunked_seqlens[i], ...],
-            Y_ref_seq[:chunked_seqlens[i], ...],
+            Y_seq[: chunked_seqlens[i], ...],
+            Y_ref_seq[: chunked_seqlens[i], ...],
            atol=atol,
            rtol=rtol,
-            msg=lambda x: f"seq{i} output part1 " + x)  # noqa: B023
+            msg=lambda x: f"seq{i} output part1 " + x,
+        )  # noqa: B023
        torch.testing.assert_close(
-            Y_seq[chunked_seqlens[i]:, ...],
-            Y_ref_seq[chunked_seqlens[i]:, ...],
+            Y_seq[chunked_seqlens[i] :, ...],
+            Y_ref_seq[chunked_seqlens[i] :, ...],
            atol=atol,
            rtol=rtol,
-            msg=lambda x: f"seq{i} output part2 " + x)  # noqa: B023
+            msg=lambda x: f"seq{i} output part2 " + x,
+        )  # noqa: B023

        state_seq = state_chunked[i]
        state_seq_ref = state_ref[i]
@@ -542,4 +556,5 @@ def test_mamba_chunk_scan_cont_batch_prefill_chunking(chunk_size, seqlens):
            state_seq_ref,
            atol=atol,
            rtol=rtol,
-            msg=lambda x: f"seq{i} state " + x)  # noqa: B023
+            msg=lambda x: f"seq{i} state " + x,
+        )  # noqa: B023