Convert formatting to use ruff instead of yapf + isort (#26247)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
Harry Mellor
2025-10-05 15:06:22 +01:00
committed by GitHub
parent 17edd8a807
commit d6953beb91
1508 changed files with 115244 additions and 94146 deletions

View File

@@ -51,8 +51,9 @@ AITER_MODEL_LIST = [
pytest.param(
"google/gemma-1.1-2b-it", # gemma
marks=[
pytest.mark.core_model, pytest.mark.cpu_model,
pytest.mark.slow_test
pytest.mark.core_model,
pytest.mark.cpu_model,
pytest.mark.slow_test,
],
),
pytest.param(
@@ -65,8 +66,7 @@ AITER_MODEL_LIST = [
pytest.param(
"openbmb/MiniCPM3-4B",
# fused_moe not supported on CPU
marks=[pytest.mark.core_model,
large_gpu_mark(min_gb=32)],
marks=[pytest.mark.core_model, large_gpu_mark(min_gb=32)],
),
pytest.param(
"facebook/opt-125m", # opt
@@ -82,8 +82,9 @@ AITER_MODEL_LIST = [
pytest.param(
"Qwen/Qwen2.5-0.5B-Instruct", # qwen2
marks=[
pytest.mark.core_model, pytest.mark.cpu_model,
pytest.mark.slow_test
pytest.mark.core_model,
pytest.mark.cpu_model,
pytest.mark.slow_test,
],
),
pytest.param(
@@ -100,16 +101,25 @@ AITER_MODEL_LIST = [
marks=[pytest.mark.cpu_model],
),
pytest.param("swiss-ai/Apertus-8B-2509"), # apertus
])
],
)
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize(
"use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False])
"use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]
)
@pytest.mark.parametrize("use_prompt_embeds", [True, False])
def test_models(hf_runner, vllm_runner, example_prompts, model: str,
max_tokens: int, num_logprobs: int, use_rocm_aiter: bool,
use_prompt_embeds: bool, monkeypatch) -> None:
def test_models(
hf_runner,
vllm_runner,
example_prompts,
model: str,
max_tokens: int,
num_logprobs: int,
use_rocm_aiter: bool,
use_prompt_embeds: bool,
monkeypatch,
) -> None:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version(on_fail="skip")
@@ -125,34 +135,37 @@ def test_models(hf_runner, vllm_runner, example_prompts, model: str,
with hf_runner(model) as hf_model:
hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
prompt_embeds: Optional[list[torch.Tensor]] = ([] if use_prompt_embeds
else None)
prompt_embeds: Optional[list[torch.Tensor]] = [] if use_prompt_embeds else None
prompt_token_ids = []
for prompt in example_prompts:
token_ids = hf_model.tokenizer(prompt,
return_tensors="pt").input_ids.to(
hf_model.model.device)
token_ids = hf_model.tokenizer(prompt, return_tensors="pt").input_ids.to(
hf_model.model.device
)
prompt_token_ids.append(token_ids)
if prompt_embeds is not None:
prompt_embeds.append(hf_model.model.get_input_embeddings()(
token_ids).squeeze(0))
prompt_embeds.append(
hf_model.model.get_input_embeddings()(token_ids).squeeze(0)
)
with vllm_runner(
model,
tokenizer_name=model_info.tokenizer or model,
tokenizer_mode=model_info.tokenizer_mode,
trust_remote_code=model_info.trust_remote_code,
max_num_seqs=2,
enable_prompt_embeds=use_prompt_embeds,
model,
tokenizer_name=model_info.tokenizer or model,
tokenizer_mode=model_info.tokenizer_mode,
trust_remote_code=model_info.trust_remote_code,
max_num_seqs=2,
enable_prompt_embeds=use_prompt_embeds,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
if prompt_embeds is not None:
vllm_outputs_from_embeds = vllm_model.generate_greedy_logprobs(
prompt_embeds, max_tokens, num_logprobs)
prompt_embeds, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=hf_outputs,

View File

@@ -11,17 +11,17 @@ def test_dummy_loader(vllm_runner, monkeypatch, model: str) -> None:
with monkeypatch.context() as m:
m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
with vllm_runner(
model,
load_format="dummy",
model,
load_format="dummy",
) as llm:
if model == "google/gemma-3-4b-it":
normalizers = llm.llm.collective_rpc(
lambda self: self.model_runner.model.language_model.model.
normalizer.cpu().item())
lambda self: self.model_runner.model.language_model.model.normalizer.cpu().item()
)
config = llm.llm.llm_engine.model_config.hf_config.text_config
else:
normalizers = llm.llm.collective_rpc(
lambda self: self.model_runner.model.model.normalizer.cpu(
).item())
lambda self: self.model_runner.model.model.normalizer.cpu().item()
)
config = llm.llm.llm_engine.model_config.hf_config
assert np.allclose(normalizers, config.hidden_size**0.5, rtol=2e-3)

View File

@@ -26,11 +26,13 @@ def test_models(
) -> None:
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
with vllm_runner(model, dtype=dtype) as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,

View File

@@ -24,7 +24,7 @@ SSM_MODELS = [
"tiiuae/falcon-mamba-tiny-dev",
# mamba2-codestral in transformers is broken pending:
# https://github.com/huggingface/transformers/pull/40861
#"yujiepan/mamba2-codestral-v0.1-tiny-random",
# "yujiepan/mamba2-codestral-v0.1-tiny-random",
]
HYBRID_MODELS = [
@@ -65,7 +65,6 @@ def test_models(
max_tokens: int,
num_logprobs: int,
) -> None:
try:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
@@ -75,11 +74,13 @@ def test_models(
with hf_runner(model) as hf_model:
hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=hf_outputs,
@@ -109,13 +110,14 @@ def test_batching(
for_loop_outputs = []
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
for prompt in example_prompts:
single_output, = vllm_model.generate_greedy_logprobs([prompt],
max_tokens,
num_logprobs)
(single_output,) = vllm_model.generate_greedy_logprobs(
[prompt], max_tokens, num_logprobs
)
for_loop_outputs.append(single_output)
batched_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=for_loop_outputs,
@@ -134,8 +136,8 @@ def test_chunked_prefill_with_parallel_sampling(
max_tokens: int,
) -> None:
"""
Tests chunked prefill in conjunction with n > 1.
Tests chunked prefill in conjunction with n > 1.
In this case, prefill is populated with decoding tokens and
we test that it doesn't fail.
@@ -143,16 +145,13 @@ def test_chunked_prefill_with_parallel_sampling(
decoding steps inside a chunked prefill forward pass
(where we have both prefill and decode together)
"""
sampling_params = SamplingParams(n=3,
temperature=1,
seed=0,
max_tokens=max_tokens)
sampling_params = SamplingParams(n=3, temperature=1, seed=0, max_tokens=max_tokens)
with vllm_runner(
model,
enable_chunked_prefill=True,
# forces prefill chunks with decoding
max_num_batched_tokens=MAX_NUM_SEQS * 3,
max_num_seqs=MAX_NUM_SEQS,
model,
enable_chunked_prefill=True,
# forces prefill chunks with decoding
max_num_batched_tokens=MAX_NUM_SEQS * 3,
max_num_seqs=MAX_NUM_SEQS,
) as vllm_model:
vllm_model.generate(example_prompts, sampling_params)
@@ -170,10 +169,8 @@ def test_mamba_cache_cg_padding(
batch size. If it's not, a torch RuntimeError will be raised because
tensor dimensions aren't compatible.
"""
vllm_config = EngineArgs(model=model,
trust_remote_code=True).create_engine_config()
while len(example_prompts) == vllm_config.pad_for_cudagraph(
len(example_prompts)):
vllm_config = EngineArgs(model=model, trust_remote_code=True).create_engine_config()
while len(example_prompts) == vllm_config.pad_for_cudagraph(len(example_prompts)):
example_prompts.append(example_prompts[0])
try:
@@ -183,7 +180,8 @@ def test_mamba_cache_cg_padding(
pytest.fail(
"Couldn't run batch size which is not equal to a Cuda Graph "
"captured batch size. "
"Could be related to mamba cache not padded correctly")
"Could be related to mamba cache not padded correctly"
)
@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
@@ -205,8 +203,10 @@ def test_fail_upon_inc_requests_and_finished_requests_lt_available_blocks(
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
vllm_model.generate_greedy([example_prompts[0]] * 100, 10)
except ValueError:
pytest.fail("Hybrid inner state wasn't cleaned up properly between"
"steps finished requests registered unnecessarily ")
pytest.fail(
"Hybrid inner state wasn't cleaned up properly between"
"steps finished requests registered unnecessarily "
)
@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
@@ -215,10 +215,10 @@ def test_state_cleanup(
example_prompts,
model: str,
) -> None:
"""
"""
This test is for verifying that the Hybrid state is cleaned up between
steps.
If it's not cleaned, an error would be expected.
"""
try:
@@ -226,8 +226,10 @@ def test_state_cleanup(
for _ in range(10):
vllm_model.generate_greedy([example_prompts[0]] * 100, 1)
except ValueError:
pytest.fail("Hybrid inner state wasn't cleaned up between states, "
"could be related to finished_requests_ids")
pytest.fail(
"Hybrid inner state wasn't cleaned up between states, "
"could be related to finished_requests_ids"
)
@multi_gpu_test(num_gpus=2)
@@ -241,15 +243,19 @@ def test_distributed_correctness(
max_tokens: int,
num_logprobs: int,
) -> None:
with vllm_runner(model, tensor_parallel_size=1,
max_num_seqs=MAX_NUM_SEQS) as vllm_model:
with vllm_runner(
model, tensor_parallel_size=1, max_num_seqs=MAX_NUM_SEQS
) as vllm_model:
vllm_outputs_tp_1 = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
with vllm_runner(model, tensor_parallel_size=2,
max_num_seqs=MAX_NUM_SEQS) as vllm_model:
with vllm_runner(
model, tensor_parallel_size=2, max_num_seqs=MAX_NUM_SEQS
) as vllm_model:
vllm_outputs_tp_2 = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=vllm_outputs_tp_1,
@@ -271,7 +277,6 @@ def test_full_cuda_graph(
max_tokens: int,
num_logprobs: int,
) -> None:
try:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
@@ -281,11 +286,13 @@ def test_full_cuda_graph(
with hf_runner(model) as hf_model:
hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=hf_outputs,
@@ -298,8 +305,9 @@ def test_full_cuda_graph(
@pytest.mark.parametrize("model", FP32_STATE_MODELS)
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("cache_dtype_param",
["mamba_ssm_cache_dtype", "mamba_cache_dtype"])
@pytest.mark.parametrize(
"cache_dtype_param", ["mamba_ssm_cache_dtype", "mamba_cache_dtype"]
)
def test_fp32_cache_state(
hf_runner,
vllm_runner,
@@ -310,7 +318,6 @@ def test_fp32_cache_state(
num_logprobs: int,
cache_dtype_param: str,
) -> None:
try:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
@@ -320,13 +327,15 @@ def test_fp32_cache_state(
with hf_runner(model) as hf_model:
hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
with vllm_runner(model,
max_num_seqs=MAX_NUM_SEQS,
**{cache_dtype_param: "float32"}) as vllm_model:
with vllm_runner(
model, max_num_seqs=MAX_NUM_SEQS, **{cache_dtype_param: "float32"}
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=hf_outputs,
@@ -339,21 +348,23 @@ def test_fp32_cache_state(
# Helper functions for the APC tests
def _get_vllm_runner_params(model, max_model_len, tensor_parallel_size=1):
return {
'model_name': model,
'enable_prefix_caching': False,
'max_model_len': max_model_len,
'tensor_parallel_size': tensor_parallel_size,
'gpu_memory_utilization': 0.4
"model_name": model,
"enable_prefix_caching": False,
"max_model_len": max_model_len,
"tensor_parallel_size": tensor_parallel_size,
"gpu_memory_utilization": 0.4,
}
def _get_vLLM_output(vllm_runner,
kwargs,
prompts,
max_tokens,
num_logprobs,
num_repetitions=1,
vllm_model=None):
def _get_vLLM_output(
vllm_runner,
kwargs,
prompts,
max_tokens,
num_logprobs,
num_repetitions=1,
vllm_model=None,
):
outs = []
if vllm_model is None:
vllm_model = vllm_runner(**kwargs)
@@ -362,7 +373,8 @@ def _get_vLLM_output(vllm_runner,
vllm_output = vllm_model.generate_greedy(prompts, max_tokens)
else:
vllm_output = vllm_model.generate_greedy_logprobs(
prompts, max_tokens, num_logprobs)
prompts, max_tokens, num_logprobs
)
outs.append(vllm_output)
return outs, vllm_model
@@ -387,7 +399,6 @@ def test_apc_single_prompt(
num_logprobs: int,
tensor_parallel_size: int,
) -> None:
try:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
@@ -395,29 +406,33 @@ def test_apc_single_prompt(
except ValueError:
pass
compare_operator: Callable = check_logprobs_close \
if num_logprobs > 0 else check_outputs_equal # type: ignore
compare_operator: Callable = (
check_logprobs_close if num_logprobs > 0 else check_outputs_equal # type: ignore
)
MULTIPLE = 300
# Sample prompts.
generated_prompts = [MULTIPLE * example_prompts[0]]
max_model_len = max(
len(prompt) + max_tokens for prompt in generated_prompts)
max_model_len = max(len(prompt) + max_tokens for prompt in generated_prompts)
vllm_runner_kwargs = _get_vllm_runner_params(
model, max_model_len, tensor_parallel_size=tensor_parallel_size)
vllm_runner_kwargs['mamba_ssm_cache_dtype'] = "float32"
vllm_outputs_no_cache, _ = _get_vLLM_output(vllm_runner,
vllm_runner_kwargs,
generated_prompts, max_tokens,
num_logprobs)
model, max_model_len, tensor_parallel_size=tensor_parallel_size
)
vllm_runner_kwargs["mamba_ssm_cache_dtype"] = "float32"
vllm_outputs_no_cache, _ = _get_vLLM_output(
vllm_runner, vllm_runner_kwargs, generated_prompts, max_tokens, num_logprobs
)
vllm_runner_kwargs['enable_prefix_caching'] = True
vllm_outputs_cache_rep, _ = _get_vLLM_output(vllm_runner,
vllm_runner_kwargs,
generated_prompts, max_tokens,
num_logprobs, n_repetitions)
vllm_runner_kwargs["enable_prefix_caching"] = True
vllm_outputs_cache_rep, _ = _get_vLLM_output(
vllm_runner,
vllm_runner_kwargs,
generated_prompts,
max_tokens,
num_logprobs,
n_repetitions,
)
for r_idx, vllm_outputs_cache_itn in enumerate(vllm_outputs_cache_rep):
# In the first repetition, the caches are filled
@@ -450,7 +465,6 @@ def test_apc_single_prompt_block_align_alignment(
num_logprobs: int,
tensor_parallel_size: int,
) -> None:
try:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
@@ -458,30 +472,29 @@ def test_apc_single_prompt_block_align_alignment(
except ValueError:
pass
compare_operator: Callable = check_logprobs_close \
if num_logprobs > 0 else check_outputs_equal # type: ignore
compare_operator: Callable = (
check_logprobs_close if num_logprobs > 0 else check_outputs_equal # type: ignore
)
MULTIPLE = 300
# Sample prompts. This custom prompt is used, as it causes the most issues
generated_prompts = ["The president of the United States is " * MULTIPLE]
max_model_len = max(
len(prompt) + max_tokens for prompt in generated_prompts)
max_model_len = max(len(prompt) + max_tokens for prompt in generated_prompts)
vllm_runner_kwargs = _get_vllm_runner_params(
model, max_model_len, tensor_parallel_size=tensor_parallel_size)
vllm_runner_kwargs['mamba_ssm_cache_dtype'] = "float32"
model, max_model_len, tensor_parallel_size=tensor_parallel_size
)
vllm_runner_kwargs["mamba_ssm_cache_dtype"] = "float32"
vllm_outputs_no_cache, _ = _get_vLLM_output(vllm_runner,
vllm_runner_kwargs,
generated_prompts, max_tokens,
num_logprobs)
vllm_outputs_no_cache, _ = _get_vLLM_output(
vllm_runner, vllm_runner_kwargs, generated_prompts, max_tokens, num_logprobs
)
vllm_runner_kwargs['enable_prefix_caching'] = True
vllm_runner_kwargs["enable_prefix_caching"] = True
with vllm_runner(**vllm_runner_kwargs) as vllm_model:
# Retrieve the default mamba state block size
mamba_block_size = vllm_model.llm.llm_engine.cache_config. \
mamba_block_size
mamba_block_size = vllm_model.llm.llm_engine.cache_config.mamba_block_size
# In case the hybrid model does not have the
# "mamba_block_size" assume a fixed constant
@@ -489,18 +502,18 @@ def test_apc_single_prompt_block_align_alignment(
mamba_block_size = 512
mamba_block_size_multiplier = 10
for offsets in [
-3, 3, mamba_block_size // 4 + 3, mamba_block_size // 2 - 3
]:
vllm_runner_kwargs[
'max_num_batched_tokens'] = mamba_block_size_multiplier * \
mamba_block_size - offsets
vllm_outputs_cache_rep, _ = _get_vLLM_output(vllm_runner,
vllm_runner_kwargs,
generated_prompts,
max_tokens, num_logprobs,
n_repetitions)
for offsets in [-3, 3, mamba_block_size // 4 + 3, mamba_block_size // 2 - 3]:
vllm_runner_kwargs["max_num_batched_tokens"] = (
mamba_block_size_multiplier * mamba_block_size - offsets
)
vllm_outputs_cache_rep, _ = _get_vLLM_output(
vllm_runner,
vllm_runner_kwargs,
generated_prompts,
max_tokens,
num_logprobs,
n_repetitions,
)
# Check alignment of the output logits when using APC
for r_idx, vllm_outputs_cache_itn in enumerate(vllm_outputs_cache_rep):
@@ -534,7 +547,6 @@ def test_apc_multiple_prompts_all_cached_outputs(
num_logprobs: int,
tensor_parallel_size: int,
) -> None:
try:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
@@ -542,30 +554,34 @@ def test_apc_multiple_prompts_all_cached_outputs(
except ValueError:
pass
compare_operator: Callable = check_logprobs_close \
if num_logprobs > 0 else check_outputs_equal # type: ignore
compare_operator: Callable = (
check_logprobs_close if num_logprobs > 0 else check_outputs_equal # type: ignore
)
MULTIPLE = 300
# Sample prompts.
generated_prompts = [MULTIPLE * prompt for prompt in example_prompts]
max_model_len = max(
len(prompt) + max_tokens for prompt in generated_prompts)
max_model_len = max(len(prompt) + max_tokens for prompt in generated_prompts)
vllm_runner_kwargs = _get_vllm_runner_params(
model, max_model_len, tensor_parallel_size=tensor_parallel_size)
vllm_runner_kwargs['mamba_ssm_cache_dtype'] = "float32"
model, max_model_len, tensor_parallel_size=tensor_parallel_size
)
vllm_runner_kwargs["mamba_ssm_cache_dtype"] = "float32"
vllm_outputs_no_cache, _ = _get_vLLM_output(vllm_runner,
vllm_runner_kwargs,
generated_prompts, max_tokens,
num_logprobs)
vllm_outputs_no_cache, _ = _get_vLLM_output(
vllm_runner, vllm_runner_kwargs, generated_prompts, max_tokens, num_logprobs
)
vllm_runner_kwargs['enable_prefix_caching'] = True
vllm_outputs_cache_rep, _ = _get_vLLM_output(vllm_runner,
vllm_runner_kwargs,
generated_prompts, max_tokens,
num_logprobs, n_repetitions)
vllm_runner_kwargs["enable_prefix_caching"] = True
vllm_outputs_cache_rep, _ = _get_vLLM_output(
vllm_runner,
vllm_runner_kwargs,
generated_prompts,
max_tokens,
num_logprobs,
n_repetitions,
)
for r_idx, vllm_outputs_cache_itn in enumerate(vllm_outputs_cache_rep):
# In the first repetition, the caches are filled
@@ -598,7 +614,6 @@ def test_apc_multiple_prompts_block_align_alignment(
num_logprobs: int,
tensor_parallel_size: int,
) -> None:
try:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
@@ -606,34 +621,31 @@ def test_apc_multiple_prompts_block_align_alignment(
except ValueError:
pass
compare_operator: Callable = check_logprobs_close \
if num_logprobs > 0 else check_outputs_equal # type: ignore
compare_operator: Callable = (
check_logprobs_close if num_logprobs > 0 else check_outputs_equal # type: ignore
)
MULTIPLE = 300
# Sample prompts. This custom prompt is used, as it causes the most issues
prompt_text = "The president of the United States is "
prompt_offsets = [0, 3, 7, 13, 17, 22, 25, 31]
generated_prompts = [
prompt_text[offset:] * MULTIPLE for offset in prompt_offsets
]
generated_prompts = [prompt_text[offset:] * MULTIPLE for offset in prompt_offsets]
max_model_len = max(
len(prompt) + max_tokens for prompt in generated_prompts)
vllm_runner_kwargs = _get_vllm_runner_params(model, max_model_len,
tensor_parallel_size)
vllm_runner_kwargs['mamba_ssm_cache_dtype'] = "float32"
max_model_len = max(len(prompt) + max_tokens for prompt in generated_prompts)
vllm_runner_kwargs = _get_vllm_runner_params(
model, max_model_len, tensor_parallel_size
)
vllm_runner_kwargs["mamba_ssm_cache_dtype"] = "float32"
vllm_outputs_no_cache, _ = _get_vLLM_output(vllm_runner,
vllm_runner_kwargs,
generated_prompts, max_tokens,
num_logprobs)
vllm_outputs_no_cache, _ = _get_vLLM_output(
vllm_runner, vllm_runner_kwargs, generated_prompts, max_tokens, num_logprobs
)
vllm_runner_kwargs['enable_prefix_caching'] = True
vllm_runner_kwargs["enable_prefix_caching"] = True
with vllm_runner(**vllm_runner_kwargs) as vllm_model:
# Retrieve the default mamba state block size
mamba_block_size = vllm_model.llm.llm_engine.cache_config. \
mamba_block_size
mamba_block_size = vllm_model.llm.llm_engine.cache_config.mamba_block_size
# In case the hybrid model does not have the
# "mamba_block_size" assume a fixed constant
@@ -641,18 +653,18 @@ def test_apc_multiple_prompts_block_align_alignment(
mamba_block_size = 512
mamba_block_size_multiplier = 10
for offsets in [
-3, 3, mamba_block_size // 4 + 3, mamba_block_size // 2 - 3
]:
vllm_runner_kwargs[
'max_num_batched_tokens'] = mamba_block_size_multiplier * \
mamba_block_size - offsets
vllm_outputs_cache_rep, _ = _get_vLLM_output(vllm_runner,
vllm_runner_kwargs,
generated_prompts,
max_tokens, num_logprobs,
n_repetitions)
for offsets in [-3, 3, mamba_block_size // 4 + 3, mamba_block_size // 2 - 3]:
vllm_runner_kwargs["max_num_batched_tokens"] = (
mamba_block_size_multiplier * mamba_block_size - offsets
)
vllm_outputs_cache_rep, _ = _get_vLLM_output(
vllm_runner,
vllm_runner_kwargs,
generated_prompts,
max_tokens,
num_logprobs,
n_repetitions,
)
# Check alignment of the output logits when using APC
for r_idx, vllm_outputs_cache_itn in enumerate(vllm_outputs_cache_rep):
@@ -686,7 +698,6 @@ def test_apc_multiple_prompts_partial_cached_outputs(
num_logprobs: int,
tensor_parallel_size: int,
) -> None:
try:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
@@ -694,30 +705,30 @@ def test_apc_multiple_prompts_partial_cached_outputs(
except ValueError:
pass
compare_operator: Callable = check_logprobs_close \
if num_logprobs > 0 else check_outputs_equal # type: ignore
compare_operator: Callable = (
check_logprobs_close if num_logprobs > 0 else check_outputs_equal # type: ignore
)
MULTIPLE = 300
# Sample prompts.
generated_prompts = [MULTIPLE * prompt for prompt in example_prompts]
max_model_len = max(
len(prompt) + max_tokens for prompt in generated_prompts)
max_model_len = max(len(prompt) + max_tokens for prompt in generated_prompts)
vllm_runner_kwargs = _get_vllm_runner_params(
model, max_model_len, tensor_parallel_size=tensor_parallel_size)
vllm_runner_kwargs['mamba_ssm_cache_dtype'] = "float32"
model, max_model_len, tensor_parallel_size=tensor_parallel_size
)
vllm_runner_kwargs["mamba_ssm_cache_dtype"] = "float32"
vllm_outputs_no_cache, _ = _get_vLLM_output(vllm_runner,
vllm_runner_kwargs,
generated_prompts, max_tokens,
num_logprobs)
vllm_outputs_no_cache, _ = _get_vLLM_output(
vllm_runner, vllm_runner_kwargs, generated_prompts, max_tokens, num_logprobs
)
# Cache only part of all the prompts
vllm_runner_kwargs['enable_prefix_caching'] = True
vllm_runner_kwargs["enable_prefix_caching"] = True
vllm_outputs_partial_cache, vllm_model = _get_vLLM_output(
vllm_runner, vllm_runner_kwargs, generated_prompts[:3], max_tokens,
num_logprobs)
vllm_runner, vllm_runner_kwargs, generated_prompts[:3], max_tokens, num_logprobs
)
compare_operator(
outputs_0_lst=vllm_outputs_no_cache[0][:3],
@@ -726,13 +737,15 @@ def test_apc_multiple_prompts_partial_cached_outputs(
name_1="vllm_partial_cache",
)
vllm_outputs_cache_rep, _ = _get_vLLM_output(vllm_runner,
vllm_runner_kwargs,
generated_prompts,
max_tokens,
num_logprobs,
n_repetitions,
vllm_model=vllm_model)
vllm_outputs_cache_rep, _ = _get_vLLM_output(
vllm_runner,
vllm_runner_kwargs,
generated_prompts,
max_tokens,
num_logprobs,
n_repetitions,
vllm_model=vllm_model,
)
for r_idx, vllm_outputs_cache_itn in enumerate(vllm_outputs_cache_rep):
# In the first repetition, the caches are filled

View File

@@ -6,7 +6,9 @@ import json
import pytest
from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import (
MistralToolCall, MistralToolParser)
MistralToolCall,
MistralToolParser,
)
from vllm.sampling_params import SamplingParams
from vllm.transformers_utils.tokenizer import MistralTokenizer
@@ -33,136 +35,114 @@ SYMBOLIC_LANG_PROMPTS = [
]
# for function calling
TOOLS = [{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"city": {
"type":
"string",
"description":
"The city to find the weather for, e.g. 'San Francisco'"
TOOLS = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"city": {
"type": "string",
"description": "The city to find the weather for, e.g. 'San Francisco'",
},
"state": {
"type": "string",
"description": "the two-letter abbreviation for the state that the city is"
" in, e.g. 'CA' which would mean 'California'",
},
"unit": {
"type": "string",
"description": "The unit to fetch the temperature in",
"enum": ["celsius", "fahrenheit"],
},
},
"state": {
"type":
"string",
"description":
"the two-letter abbreviation for the state that the city is"
" in, e.g. 'CA' which would mean 'California'"
},
"unit": {
"type": "string",
"description": "The unit to fetch the temperature in",
"enum": ["celsius", "fahrenheit"]
}
"required": ["city", "state", "unit"],
},
"required": ["city", "state", "unit"]
}
},
},
}, {
"type": "function",
"function": {
"name": "rewrite",
"description": "Rewrites text",
"parameters": {
"type": "object",
"required": [],
"properties": {
"text": {
"type": "string",
"description": "The input text to rewrite."
}
}
}
}
}]
{
"type": "function",
"function": {
"name": "rewrite",
"description": "Rewrites text",
"parameters": {
"type": "object",
"required": [],
"properties": {
"text": {
"type": "string",
"description": "The input text to rewrite.",
}
},
},
},
},
]
MSGS = [
{"role": "system", "content": "You are an assistant."},
{
"role": "system",
"content": "You are an assistant."
},
{
"role":
"user",
"content":
"Could you please rewrite the below article? \n\n My English needs improvving, maybe I make errors." # noqa
},
{
"role":
"assistant",
"content":
"",
"tool_calls": [{
"id": "bbc5b7ede",
"type": "function",
"function": {
"name":
"rewrite",
"arguments":
'{\"text\":\"My English needs improvving, maybe I make errors.\"}' # noqa
}
}]
},
{
"role": "tool",
"content":
"{\"action\":\"rewrite\",\"outcome\":\"My English needs improving, maybe I make errors.\"}", # noqa
"tool_call_id": "bbc5b7ede",
"name": "rewrite"
"role": "user",
"content": "Could you please rewrite the below article? \n\n My English needs improvving, maybe I make errors.", # noqa
},
{
"role": "assistant",
"content": "---\n\nMy English needs improving, maybe I make errors"
"content": "",
"tool_calls": [
{
"id": "bbc5b7ede",
"type": "function",
"function": {
"name": "rewrite",
"arguments": '{"text":"My English needs improvving, maybe I make errors."}', # noqa
},
}
],
},
{
"role":
"user",
"content": ("Can you tell me what the temperate"
" will be in Dallas, in fahrenheit?")
}
"role": "tool",
"content": '{"action":"rewrite","outcome":"My English needs improving, maybe I make errors."}', # noqa
"tool_call_id": "bbc5b7ede",
"name": "rewrite",
},
{
"role": "assistant",
"content": "---\n\nMy English needs improving, maybe I make errors",
},
{
"role": "user",
"content": (
"Can you tell me what the temperate will be in Dallas, in fahrenheit?"
),
},
]
SAMPLE_JSON_SCHEMA = {
"type": "object",
"properties": {
"name": {
"type": "string"
},
"age": {
"type": "integer"
},
"name": {"type": "string"},
"age": {"type": "integer"},
"skills": {
"type": "array",
"items": {
"type": "string",
"maxLength": 10
},
"minItems": 3
"items": {"type": "string", "maxLength": 10},
"minItems": 3,
},
"work_history": {
"type": "array",
"items": {
"type": "object",
"properties": {
"company": {
"type": "string"
},
"duration": {
"type": "number"
},
"position": {
"type": "string"
}
"company": {"type": "string"},
"duration": {"type": "number"},
"position": {"type": "string"},
},
"required": ["company", "position"]
}
}
"required": ["company", "position"],
},
},
},
"required": ["name", "age", "skills", "work_history"]
"required": ["name", "age", "skills", "work_history"],
}
@@ -170,17 +150,25 @@ SAMPLE_JSON_SCHEMA = {
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(hf_runner, vllm_runner, example_prompts, model: str,
dtype: str, max_tokens: int, num_logprobs: int) -> None:
def test_models(
hf_runner,
vllm_runner,
example_prompts,
model: str,
dtype: str,
max_tokens: int,
num_logprobs: int,
) -> None:
# TODO(sang): Sliding window should be tested separately.
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
with vllm_runner(model, dtype=dtype,
tokenizer_mode="mistral") as vllm_model:
with vllm_runner(model, dtype=dtype, tokenizer_mode="mistral") as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=hf_outputs,
@@ -194,27 +182,35 @@ def test_models(hf_runner, vllm_runner, example_prompts, model: str,
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
def test_mistral_format(vllm_runner, example_prompts, model: str, dtype: str,
max_tokens: int, num_logprobs: int) -> None:
def test_mistral_format(
vllm_runner,
example_prompts,
model: str,
dtype: str,
max_tokens: int,
num_logprobs: int,
) -> None:
with vllm_runner(
model,
dtype=dtype,
tokenizer_mode="mistral",
load_format="mistral",
config_format="mistral",
model,
dtype=dtype,
tokenizer_mode="mistral",
load_format="mistral",
config_format="mistral",
) as mistral_format_model:
mistral_format_outputs = mistral_format_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
with vllm_runner(
model,
dtype=dtype,
tokenizer_mode="auto",
load_format="safetensors",
config_format="hf",
model,
dtype=dtype,
tokenizer_mode="auto",
load_format="safetensors",
config_format="hf",
) as hf_format_model:
hf_format_outputs = hf_format_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=hf_format_outputs,
@@ -226,34 +222,35 @@ def test_mistral_format(vllm_runner, example_prompts, model: str, dtype: str,
@pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
def test_mistral_symbolic_languages(vllm_runner, model: str, dtype: str) -> None:
    """Chat with prompts in symbolic-script languages and verify the decoded
    output contains no U+FFFD replacement character (i.e. no mojibake from the
    mistral tokenizer round-trip)."""
    with vllm_runner(
        model,
        dtype=dtype,
        max_model_len=8192,
        tokenizer_mode="mistral",
        config_format="mistral",
        load_format="mistral",
    ) as vllm_model:
        for prompt in SYMBOLIC_LANG_PROMPTS:
            msg = {"role": "user", "content": prompt}
            outputs = vllm_model.llm.chat([msg], sampling_params=SAMPLING_PARAMS)
            # "\ufffd" is the Unicode replacement character; its presence
            # would indicate the tokenizer produced invalid UTF-8.
            assert "\ufffd" not in outputs[0].outputs[0].text.strip()
@pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
def test_mistral_function_calling(vllm_runner, model: str, dtype: str) -> None:
with vllm_runner(model,
dtype=dtype,
tokenizer_mode="mistral",
config_format="mistral",
load_format="mistral") as vllm_model:
with vllm_runner(
model,
dtype=dtype,
tokenizer_mode="mistral",
config_format="mistral",
load_format="mistral",
) as vllm_model:
msgs = copy.deepcopy(MSGS)
outputs = vllm_model.llm.chat(msgs,
tools=TOOLS,
sampling_params=SAMPLING_PARAMS)
outputs = vllm_model.llm.chat(
msgs, tools=TOOLS, sampling_params=SAMPLING_PARAMS
)
tokenizer = vllm_model.llm.get_tokenizer()
tool_parser = MistralToolParser(tokenizer)
@@ -265,10 +262,11 @@ def test_mistral_function_calling(vllm_runner, model: str, dtype: str) -> None:
assert parsed_message.tools_called
assert MistralToolCall.is_valid_id(parsed_message.tool_calls[0].id)
assert parsed_message.tool_calls[
0].function.name == "get_current_weather"
assert parsed_message.tool_calls[
0].function.arguments == '{"city": "Dallas", "state": "TX", "unit": "fahrenheit"}' # noqa
assert parsed_message.tool_calls[0].function.name == "get_current_weather"
assert (
parsed_message.tool_calls[0].function.arguments
== '{"city": "Dallas", "state": "TX", "unit": "fahrenheit"}'
) # noqa
assert parsed_message.content is None
@@ -297,17 +295,10 @@ def test_mistral_function_call_nested_json():
"city": "Dallas",
"state": "TX",
"unit": "fahrenheit",
"sub_dict": {
"foo": "bar",
"inner": {
"x": 1,
"y": 2
}
},
"sub_dict": {"foo": "bar", "inner": {"x": 1, "y": 2}},
}
model_output = (
f"{parser.bot_token}get_current_weather{json.dumps(args_dict)}")
model_output = f"{parser.bot_token}get_current_weather{json.dumps(args_dict)}"
parsed = parser.extract_tool_calls(model_output, None)

View File

@@ -15,62 +15,56 @@ MODELS = [
def test_phimoe_routing_function():
    """Unit-test PhiMoE's custom expert-routing function against hand-computed
    top-k weights/ids for two small gating configurations."""
    from vllm.model_executor.models.phimoe import phimoe_routing_function

    # Each case: a (4, 2) hidden-state tensor plus per-expert gating logits;
    # topk=2 experts are selected, without renormalization.
    test_case = {
        0: {
            "hidden_states": torch.tensor(
                [1, 2, 3, 4, 5, 6, 7, 8], dtype=torch.float32, requires_grad=False
            ).view(4, 2),
            "gating_output": torch.tensor(
                [0.1, 0.2, 0.3, 0.4], dtype=torch.float32, requires_grad=False
            ),
            "topk": 2,
            "renormalize": False,
        },
        1: {
            "hidden_states": torch.tensor(
                [1, 2, 3, 4, 5, 6, 7, 8], dtype=torch.float32, requires_grad=False
            ).view(4, 2),
            "gating_output": torch.tensor(
                [0.4, 0.2, 0.3, 0.4], dtype=torch.float32, requires_grad=False
            ),
            "topk": 2,
            "renormalize": False,
        },
    }
    # Expected routing results for each case above.
    ground_truth = {
        0: {
            "topk_weights": torch.tensor(
                [1.0, 1.0], dtype=torch.float32, requires_grad=False
            ),
            "topk_ids": torch.tensor([3, 2], dtype=torch.long, requires_grad=False),
        },
        1: {
            "topk_weights": torch.tensor(
                [0.5, 1.0], dtype=torch.float32, requires_grad=False
            ),
            "topk_ids": torch.tensor([0, 3], dtype=torch.long, requires_grad=False),
        },
    }

    for test_id in test_case:
        topk_weights, topk_ids = phimoe_routing_function(**test_case[test_id])
        # allclose for the float weights, exact equality for the integer ids.
        assert torch.allclose(topk_weights, ground_truth[test_id]["topk_weights"])
        assert torch.equal(topk_ids, ground_truth[test_id]["topk_ids"])
@pytest.mark.skipif(condition=current_platform.is_cpu(),
reason="This test takes a lot time to run on CPU, "
"and vllm CI's disk space is not enough for this model.")
@pytest.mark.skipif(
condition=current_platform.is_cpu(),
reason="This test takes a lot time to run on CPU, "
"and vllm CI's disk space is not enough for this model.",
)
@large_gpu_test(min_gb=80)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@@ -87,11 +81,13 @@ def test_models(
) -> None:
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
with vllm_runner(model, dtype=dtype) as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,