Convert formatting to use ruff instead of yapf + isort (#26247)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
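For reviewers reproducing this locally, such a migration is typically driven with the standard ruff CLI (a rough sketch; the exact configuration and invocation used for this PR are not shown here):

    ruff check --fix .   # lint autofixes, including import sorting via the "I" rules (replaces isort)
    ruff format .        # reformat the codebase (replaces yapf)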
@@ -14,8 +14,7 @@ from tests.quantization.utils import is_quant_method_supported
 from vllm import LLM, SamplingParams
 from vllm.attention.backends.registry import _Backend
 from vllm.attention.selector import global_force_attn_backend_context_manager
-from vllm.config import (CompilationConfig, CompilationLevel, CUDAGraphMode,
-                         PassConfig)
+from vllm.config import CompilationConfig, CompilationLevel, CUDAGraphMode, PassConfig
 from vllm.platforms import current_platform
 from vllm.utils import is_torch_equal_or_newer

@@ -25,43 +24,54 @@ from ..utils import create_new_process_for_each_test
 def models_list(*, all: bool = True, keywords: Optional[list[str]] = None):
     TEST_MODELS: list[tuple[str, dict[str, Any]]] = [
         ("facebook/opt-125m", {}),
-        ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", {
-            "dtype": torch.float16,
-        }),
-        ("neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic", {
-            "dtype": torch.float16,
-        }),
+        (
+            "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
+            {
+                "dtype": torch.float16,
+            },
+        ),
+        (
+            "neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic",
+            {
+                "dtype": torch.float16,
+            },
+        ),
         ("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", {}),
         ("meta-llama/Llama-3.2-1B-Instruct", {}),
     ]

     if all:

         # TODO: figure out why this fails.
         if False and is_quant_method_supported("gguf"):  # noqa: SIM223
-            TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", {
-                "quantization": "gguf"
-            }))
+            TEST_MODELS.append(
+                ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", {"quantization": "gguf"})
+            )

         if is_quant_method_supported("gptq"):
-            TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", {
-                "quantization": "gptq"
-            }))
+            TEST_MODELS.append(
+                ("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", {"quantization": "gptq"})
+            )

         if is_quant_method_supported("gptq_marlin"):
-            TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", {
-                "quantization": "gptq_marlin"
-            }))
+            TEST_MODELS.append(
+                (
+                    "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ",
+                    {"quantization": "gptq_marlin"},
+                )
+            )

         if is_quant_method_supported("gptq_marlin_24"):
-            TEST_MODELS.append(("alexm-nm/tinyllama-24-marlin24-4bit-g128", {
-                "quantization": "gptq_marlin_24"
-            }))
+            TEST_MODELS.append(
+                (
+                    "alexm-nm/tinyllama-24-marlin24-4bit-g128",
+                    {"quantization": "gptq_marlin_24"},
+                )
+            )

         if not current_platform.is_rocm() and is_quant_method_supported("awq"):
-            TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", {
-                "quantization": "AWQ"
-            }))
+            TEST_MODELS.append(
+                ("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", {"quantization": "AWQ"})
+            )

         if keywords is None:
             return TEST_MODELS
@@ -95,22 +105,34 @@ def test_full_graph(
     "compilation_config, model_info",
     [
         # additional compile sizes, only some of the models
-        (CompilationConfig(level=CompilationLevel.PIECEWISE,
-                           compile_sizes=[1, 2]), model)
+        (
+            CompilationConfig(level=CompilationLevel.PIECEWISE, compile_sizes=[1, 2]),
+            model,
+        )
         for model in models_list(all=False)
-    ] + [
+    ]
+    + [
         # RMSNorm + quant fusion, only 8-bit quant models
-        (CompilationConfig(level=CompilationLevel.PIECEWISE,
-                           custom_ops=["+rms_norm"],
-                           pass_config=PassConfig(enable_fusion=True,
-                                                  enable_noop=True)), model)
+        (
+            CompilationConfig(
+                level=CompilationLevel.PIECEWISE,
+                custom_ops=["+rms_norm"],
+                pass_config=PassConfig(enable_fusion=True, enable_noop=True),
+            ),
+            model,
+        )
         for model in models_list(keywords=["FP8-dynamic", "quantized.w8a8"])
-    ] + [
+    ]
+    + [
         # Test depyf integration works
-        (CompilationConfig(level=CompilationLevel.PIECEWISE,
-                           debug_dump_path=tempfile.gettempdir()),
-         ("facebook/opt-125m", {})),
-    ] + [
+        (
+            CompilationConfig(
+                level=CompilationLevel.PIECEWISE, debug_dump_path=tempfile.gettempdir()
+            ),
+            ("facebook/opt-125m", {}),
+        ),
+    ]
+    + [
         # graph inductor partition
         (
             CompilationConfig(
@@ -119,20 +141,24 @@ def test_full_graph(
                 # torch._C.Tag.cudagraph_unsafe to specify splitting ops
                 use_inductor_graph_partition=True,
                 cudagraph_mode=CUDAGraphMode.PIECEWISE,
-                compile_sizes=[1, 2]),
-            model) for model in models_list(all=False)
+                compile_sizes=[1, 2],
+            ),
+            model,
+        )
+        for model in models_list(all=False)
         if is_torch_equal_or_newer("2.9.0.dev")
-    ])
+    ],
+)
 # only test some of the models
 @create_new_process_for_each_test()
 def test_custom_compile_config(
     compilation_config: CompilationConfig,
     model_info: tuple[str, dict[str, Any]],
 ):
-    if (compilation_config.use_inductor_graph_partition
-            and not is_torch_equal_or_newer("2.9.0.dev")):
-        pytest.skip("inductor graph partition is only available "
-                    "in PyTorch 2.9+")
+    if compilation_config.use_inductor_graph_partition and not is_torch_equal_or_newer(
+        "2.9.0.dev"
+    ):
+        pytest.skip("inductor graph partition is only available in PyTorch 2.9+")

     model, model_kwargs = model_info
     print(f"MODEL={model}")
@@ -156,8 +182,7 @@ def test_fp8_kv_scale_compile(optimization_level: int):

 def test_inductor_graph_partition_attn_fusion(caplog_vllm):
     if not is_torch_equal_or_newer("2.9.0.dev"):
-        pytest.skip("inductor graph partition is only available "
-                    "in PyTorch 2.9+")
+        pytest.skip("inductor graph partition is only available in PyTorch 2.9+")

     model = "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8"
     compilation_config = CompilationConfig(
@@ -171,14 +196,16 @@ def test_inductor_graph_partition_attn_fusion(caplog_vllm):
         "kv_cache_dtype": "fp8",
         "max_model_len": 1024,
     }
-    with caplog_vllm.at_level(
-            logging.DEBUG), global_force_attn_backend_context_manager(
-                _Backend.FLASHINFER):
+    with (
+        caplog_vllm.at_level(logging.DEBUG),
+        global_force_attn_backend_context_manager(_Backend.FLASHINFER),
+    ):
         run_model(compilation_config, model, model_kwargs)

     try:
-        assert ("Fused quantization onto 48 attention nodes"
-                in caplog_vllm.text), caplog_vllm.text
+        assert "Fused quantization onto 48 attention nodes" in caplog_vllm.text, (
+            caplog_vllm.text
+        )
     except AssertionError:
         # Note: this message is only triggered when the compilation goes
         # through the custom pass. Due to multiple layers of cache on
@@ -189,8 +216,11 @@ def test_inductor_graph_partition_attn_fusion(caplog_vllm):
         assert "Fused quantization" not in caplog_vllm.text


-def run_model(compile_config: Union[int, CompilationConfig], model: str,
-              model_kwargs: dict[str, Any]):
+def run_model(
+    compile_config: Union[int, CompilationConfig],
+    model: str,
+    model_kwargs: dict[str, Any],
+):
     prompts = [
         "Hello, my name is",
         "The president of the United States is",