Convert formatting to use ruff instead of yapf + isort (#26247)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-05 15:06:22 +01:00
parent 17edd8a807
commit d6953beb91
1508 changed files with 115244 additions and 94146 deletions
--- a/tests/quantization/test_torchao.py
+++ b/tests/quantization/test_torchao.py
@@ -13,12 +13,13 @@ TORCHAO_AVAILABLE = importlib.util.find_spec("torchao") is not None

@pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available")
 def test_pre_quantized_model(vllm_runner):
-    with vllm_runner("drisspg/fp8-opt-125m",
-                     quantization="torchao",
-                     dtype="bfloat16",
-                     enforce_eager=True) as llm:
-        output = llm.generate_greedy(["The capital of France is"],
-                                     max_tokens=32)
+    with vllm_runner(
+        "drisspg/fp8-opt-125m",
+        quantization="torchao",
+        dtype="bfloat16",
+        enforce_eager=True,
+    ) as llm:
+        output = llm.generate_greedy(["The capital of France is"], max_tokens=32)
    assert output


@@ -28,17 +29,18 @@ def test_pre_quantized_model(vllm_runner):
    [
        "cuda:0",
        # {"": "cuda"},
-    ])
-def test_opt_125m_int8wo_model_loading_with_params(vllm_runner,
-                                                   pt_load_map_location):
+    ],
+)
+def test_opt_125m_int8wo_model_loading_with_params(vllm_runner, pt_load_map_location):
    torch._dynamo.reset()
    model_name = "jerryzh168/opt-125m-int8wo-partial-quant"
-    with vllm_runner(model_name=model_name,
-                     quantization="torchao",
-                     dtype="bfloat16",
-                     pt_load_map_location=pt_load_map_location) as llm:
-        output = llm.generate_greedy(["The capital of France is"],
-                                     max_tokens=32)
+    with vllm_runner(
+        model_name=model_name,
+        quantization="torchao",
+        dtype="bfloat16",
+        pt_load_map_location=pt_load_map_location,
+    ) as llm:
+        output = llm.generate_greedy(["The capital of France is"], max_tokens=32)

        assert output

@@ -47,12 +49,13 @@ def test_opt_125m_int8wo_model_loading_with_params(vllm_runner,
 def test_opt_125m_int4wo_model_per_module_quant(vllm_runner):
    torch._dynamo.reset()
    model_name = "jerryzh168/opt-125m-int4wo-per-module"
-    with vllm_runner(model_name=model_name,
-                     quantization="torchao",
-                     dtype="bfloat16",
-                     pt_load_map_location="cuda:0") as llm:
-        output = llm.generate_greedy(["The capital of France is"],
-                                     max_tokens=32)
+    with vllm_runner(
+        model_name=model_name,
+        quantization="torchao",
+        dtype="bfloat16",
+        pt_load_map_location="cuda:0",
+    ) as llm:
+        output = llm.generate_greedy(["The capital of France is"], max_tokens=32)

        assert output

@@ -61,12 +64,13 @@ def test_opt_125m_int4wo_model_per_module_quant(vllm_runner):
 def test_qwenvl_int8wo_model_loading_with_params(vllm_runner):
    torch._dynamo.reset()
    model_name = "mobicham/Qwen2.5-VL-3B-Instruct_int8wo_ao"
-    with vllm_runner(model_name=model_name,
-                     quantization="torchao",
-                     dtype="bfloat16",
-                     pt_load_map_location="cuda:0") as llm:
-        output = llm.generate_greedy(["The capital of France is"],
-                                     max_tokens=32)
+    with vllm_runner(
+        model_name=model_name,
+        quantization="torchao",
+        dtype="bfloat16",
+        pt_load_map_location="cuda:0",
+    ) as llm:
+        output = llm.generate_greedy(["The capital of France is"], max_tokens=32)

        assert output

@@ -75,17 +79,18 @@ def test_qwenvl_int8wo_model_loading_with_params(vllm_runner):
@pytest.mark.skip(
    reason="since torchao nightly is only compatible with torch nightly"
    "currently https://github.com/pytorch/ao/issues/2919, we'll have to skip "
-    "torchao tests that requires newer versions (0.14.0.dev+) for now")
+    "torchao tests that requires newer versions (0.14.0.dev+) for now"
+)
 def test_opt_125m_awq_int4wo_model_loading_with_params(vllm_runner):
    torch._dynamo.reset()
-    model_name = ("torchao-testing/opt-125m-AWQConfig-Int4WeightOnlyConfig-v2"
-                  "-0.14.0.dev")
-    with vllm_runner(model_name=model_name,
-                     quantization="torchao",
-                     dtype="bfloat16",
-                     pt_load_map_location="cuda:0") as llm:
-        output = llm.generate_greedy(["The capital of France is"],
-                                     max_tokens=32)
+    model_name = "torchao-testing/opt-125m-AWQConfig-Int4WeightOnlyConfig-v2-0.14.0.dev"
+    with vllm_runner(
+        model_name=model_name,
+        quantization="torchao",
+        dtype="bfloat16",
+        pt_load_map_location="cuda:0",
+    ) as llm:
+        output = llm.generate_greedy(["The capital of France is"], max_tokens=32)

        assert output

@@ -101,22 +106,24 @@ def test_on_the_fly_quant_config_dict_json(vllm_runner):
    import json

    from torchao.core.config import config_to_dict
-    from torchao.quantization import (
-        Float8DynamicActivationFloat8WeightConfig, PerRow)
+    from torchao.quantization import Float8DynamicActivationFloat8WeightConfig, PerRow

    torchao_quant_config = Float8DynamicActivationFloat8WeightConfig(
-        granularity=PerRow())
+        granularity=PerRow()
+    )
    hf_overrides = {
-        "quantization_config_dict_json":
-        json.dumps(config_to_dict(torchao_quant_config))
+        "quantization_config_dict_json": json.dumps(
+            config_to_dict(torchao_quant_config)
+        )
    }
-    with vllm_runner(model_name=model_name,
-                     dtype="bfloat16",
-                     pt_load_map_location="cuda:0",
-                     quantization="torchao",
-                     hf_overrides=hf_overrides) as llm:
-        output = llm.generate_greedy(["The capital of France is"],
-                                     max_tokens=32)
+    with vllm_runner(
+        model_name=model_name,
+        dtype="bfloat16",
+        pt_load_map_location="cuda:0",
+        quantization="torchao",
+        hf_overrides=hf_overrides,
+    ) as llm:
+        output = llm.generate_greedy(["The capital of France is"], max_tokens=32)

        assert output

@@ -132,8 +139,7 @@ def test_on_the_fly_quant_config_file(vllm_runner):
    from tempfile import NamedTemporaryFile

    from torchao.core.config import config_to_dict
-    from torchao.quantization import (
-        Float8DynamicActivationFloat8WeightConfig, PerRow)
+    from torchao.quantization import Float8DynamicActivationFloat8WeightConfig, PerRow

    config = Float8DynamicActivationFloat8WeightConfig(granularity=PerRow())

@@ -144,13 +150,14 @@ def test_on_the_fly_quant_config_file(vllm_runner):
        config_file_name = str(f.name)

        hf_overrides = {"quantization_config_file": config_file_name}
-        with vllm_runner(model_name=model_name,
-                         dtype="bfloat16",
-                         pt_load_map_location="cuda:0",
-                         quantization="torchao",
-                         hf_overrides=hf_overrides) as llm:
-            output = llm.generate_greedy(["The capital of France is"],
-                                         max_tokens=32)
+        with vllm_runner(
+            model_name=model_name,
+            dtype="bfloat16",
+            pt_load_map_location="cuda:0",
+            quantization="torchao",
+            hf_overrides=hf_overrides,
+        ) as llm:
+            output = llm.generate_greedy(["The capital of France is"], max_tokens=32)

            assert output

@@ -160,17 +167,18 @@ def test_reload_weights():
    import json

    from torchao.core.config import config_to_dict
-    from torchao.quantization import (
-        Float8DynamicActivationFloat8WeightConfig, PerRow)
+    from torchao.quantization import Float8DynamicActivationFloat8WeightConfig, PerRow

    from vllm import LLM, SamplingParams

    torchao_quant_config = Float8DynamicActivationFloat8WeightConfig(
-        granularity=PerRow())
+        granularity=PerRow()
+    )

    hf_overrides = {
-        "quantization_config_dict_json":
-        json.dumps(config_to_dict(torchao_quant_config))
+        "quantization_config_dict_json": json.dumps(
+            config_to_dict(torchao_quant_config)
+        )
    }

    llm = LLM(
@@ -182,12 +190,9 @@ def test_reload_weights():
        hf_overrides=hf_overrides,
    )
    # Update load format from `dummy` to `auto`
-    llm.collective_rpc("update_config",
-                       args=({
-                           "load_config": {
-                               "load_format": "auto"
-                           }
-                       }, ))
+    llm.collective_rpc(
+        "update_config", args=({"load_config": {"load_format": "auto"}},)
+    )
    # Now reload real weights inplace
    llm.collective_rpc("reload_weights")
    prompts = [