Convert formatting to use ruff instead of yapf + isort (#26247)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-05 15:06:22 +01:00
parent 17edd8a807
commit d6953beb91
1508 changed files with 115244 additions and 94146 deletions
--- a/tests/quantization/test_compressed_tensors.py
+++ b/tests/quantization/test_compressed_tensors.py
@@ -13,18 +13,25 @@ from compressed_tensors.quantization import QuantizationType

 from tests.models.utils import check_logprobs_close
 from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (  # noqa: E501
-    CompressedTensors24, CompressedTensorsLinearMethod,
-    CompressedTensorsW4A4Fp4, CompressedTensorsW4A8Fp8,
-    CompressedTensorsW4A16Fp4, CompressedTensorsW4A16Sparse24,
-    CompressedTensorsW8A8Fp8, CompressedTensorsW8A8Int8,
-    CompressedTensorsW8A16Fp8, CompressedTensorsWNA16)
+    CompressedTensors24,
+    CompressedTensorsLinearMethod,
+    CompressedTensorsW4A4Fp4,
+    CompressedTensorsW4A8Fp8,
+    CompressedTensorsW4A16Fp4,
+    CompressedTensorsW4A16Sparse24,
+    CompressedTensorsW8A8Fp8,
+    CompressedTensorsW8A8Int8,
+    CompressedTensorsW8A16Fp8,
+    CompressedTensorsWNA16,
+)
 from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
-from vllm.model_executor.layers.quantization.utils.fp8_utils import (
-    W8A8BlockFp8LinearOp)
+from vllm.model_executor.layers.quantization.utils.fp8_utils import W8A8BlockFp8LinearOp
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    cutlass_fp4_supported)
+    cutlass_fp4_supported,
+)
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
-    sparse_cutlass_supported)
+    sparse_cutlass_supported,
+)
 from vllm.platforms import current_platform

 # AITER only supports per-channel-per-channel INT8 gemm
@@ -32,7 +39,7 @@ from vllm.platforms import current_platform
 # It does not support mix precision MM and mix quantization scheme.
 ROCM_AITER_SUPPORTED_INT8_MODEL = [
    "neuralmagic/Llama-3.2-1B-quantized.w8a8",
-    "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2"
+    "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2",
 ]

 # TritonScaledMMLinearKernel only supports symmetric quantization.
@@ -80,8 +87,10 @@ def enable_pickle(monkeypatch):
 def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
    model_path, strategy, quant_type, shape_0, is_symmetric = model_args

-    if current_platform.is_rocm(
-    ) and model_path not in ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL:
+    if (
+        current_platform.is_rocm()
+        and model_path not in ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL
+    ):
        pytest.skip(f"Skip model {model_path} as it is not support on ROCm.")

    with vllm_runner(model_path, enforce_eager=True) as llm:
@@ -106,14 +115,10 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
            assert zp_valid(gate_up_proj.input_zero_point)
            assert zp_valid(down_proj.input_zero_point)

-            assert isinstance(qkv_proj.quant_method,
-                              CompressedTensorsLinearMethod)
-            assert isinstance(o_proj.quant_method,
-                              CompressedTensorsLinearMethod)
-            assert isinstance(gate_up_proj.quant_method,
-                              CompressedTensorsLinearMethod)
-            assert isinstance(down_proj.quant_method,
-                              CompressedTensorsLinearMethod)
+            assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
+            assert isinstance(o_proj.quant_method, CompressedTensorsLinearMethod)
+            assert isinstance(gate_up_proj.quant_method, CompressedTensorsLinearMethod)
+            assert isinstance(down_proj.quant_method, CompressedTensorsLinearMethod)
            assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8Int8)

            assert qkv_proj.scheme.strategy == strategy
@@ -151,7 +156,8 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [10])
@pytest.mark.parametrize(
-    "use_aiter", [True, False] if current_platform.is_rocm() else [False])
+    "use_aiter", [True, False] if current_platform.is_rocm() else [False]
+)
 def test_compressed_tensors_w8a8_logprobs(
    hf_runner,
    vllm_runner,
@@ -162,15 +168,15 @@ def test_compressed_tensors_w8a8_logprobs(
    use_aiter,
    monkeypatch,
 ):
-
-    if current_platform.is_rocm(
-    ) and model_path not in ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL:
+    if (
+        current_platform.is_rocm()
+        and model_path not in ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL
+    ):
        pytest.skip(f"Skip model {model_path} as it is not support on ROCm.")

    if use_aiter:
        if model_path not in ROCM_AITER_SUPPORTED_INT8_MODEL:
-            pytest.skip(
-                f"Skip model {model_path} as it is not support by aiter.")
+            pytest.skip(f"Skip model {model_path} as it is not support by aiter.")
        # this will enable VLLM_ROCM_USE_AITER_LINEAR
        monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")

@@ -178,18 +184,20 @@ def test_compressed_tensors_w8a8_logprobs(

    # skip language translation prompt for the static per tensor models
    if model_path in (
-            "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym",
-            "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym",
+        "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym",
+        "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym",
    ):
        example_prompts = example_prompts[0:-1]

    with hf_runner(model_path, dtype=dtype) as hf_model:
        hf_outputs = hf_model.generate_greedy_logprobs_limit(
-            example_prompts, max_tokens, num_logprobs)
+            example_prompts, max_tokens, num_logprobs
+        )

    with vllm_runner(model_path, dtype=dtype) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy_logprobs(
-            example_prompts, max_tokens, num_logprobs)
+            example_prompts, max_tokens, num_logprobs
+        )

    check_logprobs_close(
        outputs_0_lst=hf_outputs,
@@ -225,7 +233,8 @@ def test_compressed_tensors_no_enforce_eager(vllm_runner):
    ],
 )
@pytest.mark.parametrize(
-    "use_aiter", [True, False] if current_platform.is_rocm() else [False])
+    "use_aiter", [True, False] if current_platform.is_rocm() else [False]
+)
 def test_compressed_tensors_w8a8_dynamic_per_token(
    vllm_runner,
    model_args,
@@ -234,14 +243,15 @@ def test_compressed_tensors_w8a8_dynamic_per_token(
 ):
    model_path, strategy = model_args

-    if current_platform.is_rocm(
-    ) and model_path not in ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL:
+    if (
+        current_platform.is_rocm()
+        and model_path not in ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL
+    ):
        pytest.skip(f"Skip model {model_path} as it is not support on ROCm.")

    if use_aiter:
        if model_path not in ROCM_AITER_SUPPORTED_INT8_MODEL:
-            pytest.skip(
-                f"Skip model {model_path} as it is not support by aiter.")
+            pytest.skip(f"Skip model {model_path} as it is not support by aiter.")
        # this will enable VLLM_ROCM_USE_AITER_LINEAR
        monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")

@@ -252,8 +262,7 @@ def test_compressed_tensors_w8a8_dynamic_per_token(

            qkv_proj = layer.self_attn.qkv_proj

-            assert isinstance(qkv_proj.quant_method,
-                              CompressedTensorsLinearMethod)
+            assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
            assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8Int8)
            assert not qkv_proj.scheme.is_static_input_scheme
            assert qkv_proj.scheme.strategy == strategy
@@ -267,21 +276,60 @@ def test_compressed_tensors_w8a8_dynamic_per_token(

@pytest.mark.parametrize(
    "wNa16_args",
-    [("nm-testing/tinyllama-oneshot-w4a16-channel-v2", "channel", None, 8,
-      True, False),
-     ("nm-testing/tinyllama-oneshot-w4a16-group128-v2", "group", 128, 8, True,
-      False),
-     ("nm-testing/tinyllama-oneshot-w8a16-per-channel", "channel", None, 4,
-      True, False),
-     ("nm-testing/TinyLlama-1.1B-Chat-v1.0-awq-group128-asym256", "group", 128,
-      8, False, False),
-     ("nm-testing/TinyLlama-1.1B-Chat-v1.0-W4A16-G128-Asym-Updated-Channel",
-      "channel", None, 8, False, False),
-     ("nm-testing/TinyLlama-1.1B-Chat-v1.0-W4A16-G128-Asym-Updated-ActOrder",
-      "group", 128, 8, False, True)],
+    [
+        (
+            "nm-testing/tinyllama-oneshot-w4a16-channel-v2",
+            "channel",
+            None,
+            8,
+            True,
+            False,
+        ),
+        (
+            "nm-testing/tinyllama-oneshot-w4a16-group128-v2",
+            "group",
+            128,
+            8,
+            True,
+            False,
+        ),
+        (
+            "nm-testing/tinyllama-oneshot-w8a16-per-channel",
+            "channel",
+            None,
+            4,
+            True,
+            False,
+        ),
+        (
+            "nm-testing/TinyLlama-1.1B-Chat-v1.0-awq-group128-asym256",
+            "group",
+            128,
+            8,
+            False,
+            False,
+        ),
+        (
+            "nm-testing/TinyLlama-1.1B-Chat-v1.0-W4A16-G128-Asym-Updated-Channel",
+            "channel",
+            None,
+            8,
+            False,
+            False,
+        ),
+        (
+            "nm-testing/TinyLlama-1.1B-Chat-v1.0-W4A16-G128-Asym-Updated-ActOrder",
+            "group",
+            128,
+            8,
+            False,
+            True,
+        ),
+    ],
+)
+@pytest.mark.skipif(
+    not current_platform.is_cuda(), reason="The tests are skipped on non-CUDA platform."
 )
-@pytest.mark.skipif(not current_platform.is_cuda(),
-                    reason="The tests are skipped on non-CUDA platform.")
 def test_compressed_tensors_wNa16(vllm_runner, wNa16_args):
    model, strategy, group, pack_factor, symmetric, has_g_idx = wNa16_args
    with vllm_runner(model) as llm:
@@ -290,13 +338,11 @@ def test_compressed_tensors_wNa16(vllm_runner, wNa16_args):
            layer = model.model.layers[0]

            qkv_proj = layer.self_attn.qkv_proj
-            assert isinstance(qkv_proj.quant_method,
-                              CompressedTensorsLinearMethod)
+            assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
            assert isinstance(qkv_proj.scheme, CompressedTensorsWNA16)

            assert qkv_proj.scheme.strategy == strategy
-            assert qkv_proj.scheme.group_size == (-1
-                                                  if group is None else group)
+            assert qkv_proj.scheme.group_size == (-1 if group is None else group)

            assert qkv_proj.scheme.pack_factor == pack_factor
            assert qkv_proj.scheme.symmetric == symmetric
@@ -308,8 +354,9 @@ def test_compressed_tensors_wNa16(vllm_runner, wNa16_args):
        assert output


-@pytest.mark.skipif(not current_platform.is_cuda(),
-                    reason="This test is skipped on non-CUDA platform.")
+@pytest.mark.skipif(
+    not current_platform.is_cuda(), reason="This test is skipped on non-CUDA platform."
+)
 def test_compressed_tensors_w4a16_marlin24(vllm_runner):
    model_path = "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t"
    with vllm_runner(model_path) as llm:
@@ -319,8 +366,7 @@ def test_compressed_tensors_w4a16_marlin24(vllm_runner):

            qkv_proj = layer.self_attn.qkv_proj

-            assert isinstance(qkv_proj.quant_method,
-                              CompressedTensorsLinearMethod)
+            assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
            assert isinstance(qkv_proj.scheme, CompressedTensorsW4A16Sparse24)
            assert qkv_proj.weight_packed.dtype is torch.int32

@@ -339,8 +385,7 @@ def test_compressed_tensors_fp8(vllm_runner):

            qkv_proj = layer.self_attn.qkv_proj

-            assert isinstance(qkv_proj.quant_method,
-                              CompressedTensorsLinearMethod)
+            assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
            assert isinstance(
                qkv_proj.scheme,
                (CompressedTensorsW8A8Fp8, CompressedTensorsW8A16Fp8),
@@ -362,9 +407,11 @@ def test_compressed_tensors_fp8(vllm_runner):

@pytest.mark.skipif(
    not current_platform.is_kv_cache_dtype_supported("fp8", None),
-    reason="FP8 KV cache is not supported on this device.")
-@pytest.mark.skipif(not current_platform.is_cuda(),
-                    reason="This test is skipped on non-CUDA platform.")
+    reason="FP8 KV cache is not supported on this device.",
+)
+@pytest.mark.skipif(
+    not current_platform.is_cuda(), reason="This test is skipped on non-CUDA platform."
+)
 def test_compressed_tensors_kv_cache(vllm_runner):
    model_path = "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme"
    with vllm_runner(model_path, kv_cache_dtype="fp8") as llm:
@@ -376,10 +423,7 @@ def test_compressed_tensors_kv_cache(vllm_runner):
    not sparse_cutlass_supported(),
    reason="Sparse FP8 is not yet supported on this GPU type.",
 )
-def _test_2of4_quant_models(qkv_proj,
-                            weight_strategy,
-                            input_strategy,
-                            format="dense"):
+def _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy, format="dense"):
    assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
    assert isinstance(qkv_proj.scheme, CompressedTensors24)

@@ -393,8 +437,7 @@ def _test_2of4_quant_models(qkv_proj,


@pytest.mark.skipif(
-    not current_platform.is_cuda()
-    or not current_platform.has_device_capability(90),
+    not current_platform.is_cuda() or not current_platform.has_device_capability(90),
    reason="Sparse FP8 is not yet supported on this GPU type.",
 )
@pytest.mark.parametrize(
@@ -441,8 +484,7 @@ def test_compressed_tensors_2of4_quant_fp8(vllm_runner, args_2of4):


@pytest.mark.skipif(
-    not current_platform.is_cuda()
-    or not current_platform.has_device_capability(90),
+    not current_platform.is_cuda() or not current_platform.has_device_capability(90),
    reason="Sparse FP8 is not yet supported on this GPU type.",
 )
@pytest.mark.parametrize(
@@ -603,17 +645,14 @@ def test_compressed_tensors_2of4_sparse(vllm_runner, args_2of4):
            layer = model.model.layers[0]

            qkv_proj = layer.self_attn.qkv_proj
-            assert isinstance(qkv_proj.quant_method,
-                              CompressedTensorsLinearMethod)
+            assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
            assert isinstance(qkv_proj.scheme, CompressedTensors24)

            assert qkv_proj.scheme.weight_quant is None
            assert qkv_proj.scheme.input_quant is None
            assert not qkv_proj.scheme.quantized
            assert qkv_proj.quant_method.quantization_config.sparsity_scheme_map
-            sparsity_map = (
-                qkv_proj.quant_method.quantization_config.sparsity_scheme_map
-            )  # noqa: E501
+            sparsity_map = qkv_proj.quant_method.quantization_config.sparsity_scheme_map  # noqa: E501
            assert sparsity_map.get("Linear").format == "dense"
            assert sparsity_map.get("Linear").sparsity_structure == "2:4"

@@ -629,7 +668,8 @@ def test_compressed_tensors_2of4_sparse(vllm_runner, args_2of4):
    reason="Cutlass is not yet supported on this GPU type.",
 )
@pytest.mark.parametrize(
-    "args_2of4", [("nm-testing/llama2.c-stories42M-pruned2.4-compressed")])
+    "args_2of4", [("nm-testing/llama2.c-stories42M-pruned2.4-compressed")]
+)
 def test_compressed_tensors_2of4_sparse_compressed(vllm_runner, args_2of4):
    model = args_2of4
    with vllm_runner(model) as llm:
@@ -638,17 +678,14 @@ def test_compressed_tensors_2of4_sparse_compressed(vllm_runner, args_2of4):
            layer = model.model.layers[0]

            qkv_proj = layer.self_attn.qkv_proj
-            assert isinstance(qkv_proj.quant_method,
-                              CompressedTensorsLinearMethod)
+            assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
            assert isinstance(qkv_proj.scheme, CompressedTensors24)

            assert qkv_proj.scheme.weight_quant is None
            assert qkv_proj.scheme.input_quant is None
            assert not qkv_proj.scheme.quantized
            assert qkv_proj.quant_method.quantization_config.sparsity_scheme_map
-            sparsity_map = (
-                qkv_proj.quant_method.quantization_config.sparsity_scheme_map
-            )  # noqa: E501
+            sparsity_map = qkv_proj.quant_method.quantization_config.sparsity_scheme_map  # noqa: E501
            assert sparsity_map.get("Linear").format == "sparse-24-bitmask"
            assert sparsity_map.get("Linear").sparsity_structure == "2:4"

@@ -661,9 +698,11 @@ def test_compressed_tensors_2of4_sparse_compressed(vllm_runner, args_2of4):

@pytest.mark.parametrize(
    "args",
-    [("nm-testing/TinyLlama-1.1B-Chat-v1.0-NVFP4A16",
-      CompressedTensorsW4A16Fp4),
-     ("nm-testing/TinyLlama-1.1B-Chat-v1.0-NVFP4", CompressedTensorsW4A4Fp4)])
+    [
+        ("nm-testing/TinyLlama-1.1B-Chat-v1.0-NVFP4A16", CompressedTensorsW4A16Fp4),
+        ("nm-testing/TinyLlama-1.1B-Chat-v1.0-NVFP4", CompressedTensorsW4A4Fp4),
+    ],
+)
 def test_compressed_tensors_nvfp4(vllm_runner, args):
    model, scheme = args
    with vllm_runner(model, enforce_eager=True) as llm:
@@ -672,11 +711,12 @@ def test_compressed_tensors_nvfp4(vllm_runner, args):
            layer = model.model.layers[0]

            qkv_proj = layer.self_attn.qkv_proj
-            assert isinstance(qkv_proj.quant_method,
-                              CompressedTensorsLinearMethod)
-            if isinstance(qkv_proj.scheme, scheme) or isinstance(
-                    qkv_proj.scheme,
-                    CompressedTensorsW4A16Fp4) and not cutlass_fp4_supported():
+            assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
+            if (
+                isinstance(qkv_proj.scheme, scheme)
+                or isinstance(qkv_proj.scheme, CompressedTensorsW4A16Fp4)
+                and not cutlass_fp4_supported()
+            ):
                assert True
            else:
                raise AssertionError("FP4 Scheme Mismatch")
@@ -690,13 +730,13 @@ def test_compressed_tensors_nvfp4(vllm_runner, args):


@pytest.mark.skipif(
-    not current_platform.is_cuda()
-    or not current_platform.has_device_capability(90),
+    not current_platform.is_cuda() or not current_platform.has_device_capability(90),
    reason="W4A8 FP8 is not yet supported on this GPU type.",
 )
-@pytest.mark.parametrize("args", [
-    ("czhu-cohere/TinyLlama-1.1B-Chat-v1.0-W4A8-e2e", CompressedTensorsW4A8Fp8)
-])
+@pytest.mark.parametrize(
+    "args",
+    [("czhu-cohere/TinyLlama-1.1B-Chat-v1.0-W4A8-e2e", CompressedTensorsW4A8Fp8)],
+)
 def test_compressed_tensors_w4a8_fp8(vllm_runner, args):
    model, scheme = args
    with vllm_runner(model, enforce_eager=True) as llm:
@@ -710,8 +750,7 @@ def test_compressed_tensors_w4a8_fp8(vllm_runner, args):
            down_proj = layer.mlp.down_proj

            for proj in (qkv_proj, o_proj, gate_up_proj, down_proj):
-                assert isinstance(proj.quant_method,
-                                  CompressedTensorsLinearMethod)
+                assert isinstance(proj.quant_method, CompressedTensorsLinearMethod)
                assert isinstance(proj.scheme, scheme)

                assert proj.weight_packed.dtype is torch.int32
@@ -725,22 +764,27 @@ def test_compressed_tensors_w4a8_fp8(vllm_runner, args):
        assert output


-@pytest.mark.skipif(not current_platform.is_cuda(),
-                    reason="This test is skipped on non-CUDA platform.")
-@pytest.mark.parametrize("model,prompt,exp_perplexity", [
-    (
-        "nm-testing/Llama-3.2-1B-Instruct-spinquantR1R2R4-w4a16",
-        "Flat is better than nested.\nSparse is better than dense.",
-        150.0,
-    ),
-    (
-        "nm-testing/Llama-3.2-1B-Instruct-quip-w4a16",
-        "Flat is better than nested.\nSparse is better than dense.",
-        150.0,
-    ),
-])
-def test_compressed_tensors_transforms_perplexity(vllm_runner, model, prompt,
-                                                  exp_perplexity):
+@pytest.mark.skipif(
+    not current_platform.is_cuda(), reason="This test is skipped on non-CUDA platform."
+)
+@pytest.mark.parametrize(
+    "model,prompt,exp_perplexity",
+    [
+        (
+            "nm-testing/Llama-3.2-1B-Instruct-spinquantR1R2R4-w4a16",
+            "Flat is better than nested.\nSparse is better than dense.",
+            150.0,
+        ),
+        (
+            "nm-testing/Llama-3.2-1B-Instruct-quip-w4a16",
+            "Flat is better than nested.\nSparse is better than dense.",
+            150.0,
+        ),
+    ],
+)
+def test_compressed_tensors_transforms_perplexity(
+    vllm_runner, model, prompt, exp_perplexity
+):
    with vllm_runner(model, enforce_eager=True) as llm:
        perplexity = llm.generate_prompt_perplexity([prompt])[0]
        print(perplexity)
@@ -750,26 +794,24 @@ def test_compressed_tensors_transforms_perplexity(vllm_runner, model, prompt,
 def test_compressed_tensors_fp8_block_enabled(vllm_runner):
    model_path = "RedHatAI/Qwen3-0.6B-FP8-BLOCK"
    with vllm_runner(model_path) as llm:
-
        fp8_dtype = current_platform.fp8_dtype()

        def check_model(model):
            layer = model.model.layers[0]

            qkv_proj = layer.self_attn.qkv_proj
-            assert isinstance(qkv_proj.quant_method,
-                              CompressedTensorsLinearMethod)
+            assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
            assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8Fp8)
-            assert isinstance(qkv_proj.scheme.w8a8_block_fp8_linear,
-                              W8A8BlockFp8LinearOp)
+            assert isinstance(
+                qkv_proj.scheme.w8a8_block_fp8_linear, W8A8BlockFp8LinearOp
+            )

            assert qkv_proj.weight.dtype is fp8_dtype
            assert qkv_proj.weight_scale.dtype is torch.float32
            assert len(qkv_proj.weight.shape) == 2
            assert len(qkv_proj.weight_scale.shape) == 2

-            input_quant_op = \
-                qkv_proj.scheme.w8a8_block_fp8_linear.input_quant_op
+            input_quant_op = qkv_proj.scheme.w8a8_block_fp8_linear.input_quant_op
            assert isinstance(input_quant_op, QuantFP8)
            assert input_quant_op._forward_method == input_quant_op.forward_cuda