Convert formatting to use ruff instead of yapf + isort (#26247)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Author: Harry Mellor
Date: 2025-10-05 15:06:22 +01:00
Committed by: GitHub
Parent: 17edd8a807
Commit: d6953beb91
1508 changed files with 115244 additions and 94146 deletions

tests/quantization/test_fp8.py
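The hunks below are purely mechanical: yapf (80 columns) wraps long calls by aligning continuation lines under the opening parenthesis, while ruff format (88 columns, Black-style) keeps a call on one line when it fits and otherwise puts the closing parenthesis on its own line; a magic trailing comma forces the one-argument-per-line form. A minimal sketch of the difference, using a hypothetical `describe` helper that is not vLLM code:

# Hypothetical illustration of the yapf -> ruff format style change.
def describe(model_name: str, quantization: str, kv_cache_dtype: str) -> str:
    return f"{model_name}/{quantization}/{kv_cache_dtype}"


# yapf (removed by this commit): continuation lines are aligned under the
# opening parenthesis once the call exceeds 80 columns.
old_style = describe("facebook/opt-125m", quantization="fp8",
                     kv_cache_dtype="auto")

# ruff format (adopted by this commit): the closing parenthesis gets its own
# line, and the trailing comma keeps one argument per line.
new_style = describe(
    "facebook/opt-125m",
    quantization="fp8",
    kv_cache_dtype="auto",
)

assert old_style == new_style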

@@ -4,13 +4,16 @@
 Run `pytest tests/quantization/test_fp8.py --forked`.
 """
 import pytest
 import torch
 from tests.quantization.utils import is_quant_method_supported
 from vllm import _custom_ops as ops
-from vllm.model_executor.layers.quantization.fp8 import (Fp8KVCacheMethod,
-                                                         Fp8LinearMethod)
+from vllm.model_executor.layers.quantization.fp8 import (
+    Fp8KVCacheMethod,
+    Fp8LinearMethod,
+)
 from vllm.platforms import current_platform
 MODELS = [
@@ -20,15 +23,18 @@ MODELS = [
 ]
-@pytest.mark.skipif(not is_quant_method_supported("fp8"),
-                    reason="FP8 is not supported on this GPU type.")
+@pytest.mark.skipif(
+    not is_quant_method_supported("fp8"),
+    reason="FP8 is not supported on this GPU type.",
+)
 @pytest.mark.parametrize("model_id", MODELS)
 @pytest.mark.parametrize("force_marlin", [False, True])
 @pytest.mark.parametrize(
-    "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False])
-def test_model_load_and_run(vllm_runner, model_id: str, force_marlin: bool,
-                            use_rocm_aiter: bool, monkeypatch) -> None:
+    "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]
+)
+def test_model_load_and_run(
+    vllm_runner, model_id: str, force_marlin: bool, use_rocm_aiter: bool, monkeypatch
+) -> None:
     if use_rocm_aiter:
         monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
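An aside on the test pattern in this hunk (unchanged by the commit except for formatting): the `use_rocm_aiter` value list is computed at collection time, so only ROCm machines ever see the `True` case, and the selected value is forwarded to vLLM through the `VLLM_ROCM_USE_AITER` environment variable via `monkeypatch`. A stripped-down sketch of the same pattern, with `is_rocm()` as a hypothetical stand-in for `vllm.platforms.current_platform.is_rocm()`:

import os

import pytest


def is_rocm() -> bool:
    # Hypothetical stand-in for current_platform.is_rocm(); assume non-ROCm here.
    return False


@pytest.mark.parametrize("use_rocm_aiter", [True, False] if is_rocm() else [False])
def test_aiter_env_toggle(use_rocm_aiter: bool, monkeypatch) -> None:
    if use_rocm_aiter:
        # monkeypatch restores the environment after the test finishes.
        monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
    else:
        monkeypatch.delenv("VLLM_ROCM_USE_AITER", raising=False)
    expected = "1" if use_rocm_aiter else None
    assert os.environ.get("VLLM_ROCM_USE_AITER") == expected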
@@ -50,13 +56,17 @@ KV_CACHE_MODELS = [
 ]
-@pytest.mark.skipif(not is_quant_method_supported("fp8"),
-                    reason="FP8 is not supported on this GPU type.")
+@pytest.mark.skipif(
+    not is_quant_method_supported("fp8"),
+    reason="FP8 is not supported on this GPU type.",
+)
 @pytest.mark.parametrize("model_id", KV_CACHE_MODELS)
 @pytest.mark.parametrize(
-    "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False])
-def test_kv_cache_model_load_and_run(vllm_runner, model_id: str,
-                                     use_rocm_aiter: bool, monkeypatch):
+    "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]
+)
+def test_kv_cache_model_load_and_run(
+    vllm_runner, model_id: str, use_rocm_aiter: bool, monkeypatch
+):
     if use_rocm_aiter:
         monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
@@ -93,14 +103,22 @@ def test_kv_cache_model_load_and_run(vllm_runner, model_id: str,
         print(outputs[0][1])
-@pytest.mark.skipif(not is_quant_method_supported("fp8"),
-                    reason="FP8 is not supported on this GPU type.")
+@pytest.mark.skipif(
+    not is_quant_method_supported("fp8"),
+    reason="FP8 is not supported on this GPU type.",
+)
 @pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"])
 @pytest.mark.parametrize("force_marlin", [False, True])
 @pytest.mark.parametrize(
-    "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False])
-def test_load_fp16_model(vllm_runner, kv_cache_dtype: str, force_marlin: bool,
-                         use_rocm_aiter: bool, monkeypatch) -> None:
+    "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]
+)
+def test_load_fp16_model(
+    vllm_runner,
+    kv_cache_dtype: str,
+    force_marlin: bool,
+    use_rocm_aiter: bool,
+    monkeypatch,
+) -> None:
     if use_rocm_aiter:
         monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
@@ -110,9 +128,9 @@ def test_load_fp16_model(vllm_runner, kv_cache_dtype: str, force_marlin: bool,
     if force_marlin:
         monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1")
-    with vllm_runner("facebook/opt-125m",
-                     quantization="fp8",
-                     kv_cache_dtype=kv_cache_dtype) as llm:
+    with vllm_runner(
+        "facebook/opt-125m", quantization="fp8", kv_cache_dtype=kv_cache_dtype
+    ) as llm:
         def check_model(model):
             fc1 = model.model.decoder.layers[0].fc1
@@ -139,26 +157,29 @@ def test_load_fp16_model(vllm_runner, kv_cache_dtype: str, force_marlin: bool,
                     pytest.skip(
                         "Skip `test_load_fp16_model`. "
                         "It only runs on ROCm platform with FP8 compute."
-                        " e.g. MI300X and above.")
+                        " e.g. MI300X and above."
+                    )
             else:  # unsupported platform
-                pytest.skip("Skip `test_load_fp16_model`. "
-                            "It only runs on CUDA and ROCm platform.")
+                pytest.skip(
+                    "Skip `test_load_fp16_model`. "
+                    "It only runs on CUDA and ROCm platform."
+                )
         llm.apply_model(check_model)
-@pytest.mark.skipif(not is_quant_method_supported("fp8"),
-                    reason="FP8 is not supported on this GPU type.")
+@pytest.mark.skipif(
+    not is_quant_method_supported("fp8"),
+    reason="FP8 is not supported on this GPU type.",
+)
 @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
 def test_scaled_fp8_quant(dtype) -> None:
     def quantize_ref(tensor, inv_scale):
         # The reference implementation that fully aligns to
         # the kernel being tested.
         finfo = torch.finfo(torch.float8_e4m3fn)
         scale = inv_scale.reciprocal()
-        qweight = (tensor.to(torch.float32) * scale).clamp(min=finfo.min,
-                                                           max=finfo.max)
+        qweight = (tensor.to(torch.float32) * scale).clamp(min=finfo.min, max=finfo.max)
         qweight = qweight.to(torch.float8_e4m3fn)
         return qweight
@@ -177,26 +198,23 @@ def test_scaled_fp8_quant(dtype) -> None:
     # Reference dynamic quantization
     y = quantize_ref(x, inv_scale)
-    torch.testing.assert_close(ref_y,
-                               per_tensor_dequantize(y, inv_scale, dtype))
+    torch.testing.assert_close(ref_y, per_tensor_dequantize(y, inv_scale, dtype))
     # Static quantization
     y, _ = ops.scaled_fp8_quant(x, inv_scale)
-    torch.testing.assert_close(ref_y,
-                               per_tensor_dequantize(y, inv_scale, dtype))
+    torch.testing.assert_close(ref_y, per_tensor_dequantize(y, inv_scale, dtype))
     # Padding
     y, _ = ops.scaled_fp8_quant(x, inv_scale, num_token_padding=17)
     assert y.shape[0] == 17
     torch.testing.assert_close(
         ref_y,
-        per_tensor_dequantize(torch.narrow(y, 0, 0, x.shape[0]), inv_scale,
-                              dtype))
+        per_tensor_dequantize(torch.narrow(y, 0, 0, x.shape[0]), inv_scale, dtype),
+    )
     # non-contiguous input with padding
     m, n, padded_stride = 975, 512, 576
-    padded_tensor = (torch.randn(size=(m, padded_stride), device="cuda") *
-                     13).to(dtype)
+    padded_tensor = (torch.randn(size=(m, padded_stride), device="cuda") * 13).to(dtype)
     x_nc = padded_tensor[:, :n]  # shape (m, n) with stride (padded_stride, 1)
     assert not x_nc.is_contiguous()
@@ -209,19 +227,21 @@ def test_scaled_fp8_quant(dtype) -> None:
     # reference dynamic quantization
     y_nc = quantize_ref(x_nc, inv_scale_nc)
     torch.testing.assert_close(
-        ref_y_nc, per_tensor_dequantize(y_nc, inv_scale_nc, dtype))
+        ref_y_nc, per_tensor_dequantize(y_nc, inv_scale_nc, dtype)
+    )
     # static quantization
     y_nc, _ = ops.scaled_fp8_quant(x_nc, inv_scale_nc)
     torch.testing.assert_close(
-        ref_y_nc, per_tensor_dequantize(y_nc, inv_scale_nc, dtype))
+        ref_y_nc, per_tensor_dequantize(y_nc, inv_scale_nc, dtype)
+    )
     # padding after non-contiguous input quantization
-    y_nc_pad, _ = ops.scaled_fp8_quant(x_nc,
-                                       inv_scale_nc,
-                                       num_token_padding=m + 10)
+    y_nc_pad, _ = ops.scaled_fp8_quant(x_nc, inv_scale_nc, num_token_padding=m + 10)
     assert y_nc_pad.shape[0] == m + 10
     torch.testing.assert_close(
         ref_y_nc,
-        per_tensor_dequantize(torch.narrow(y_nc_pad, 0, 0, x_nc.shape[0]),
-                              inv_scale_nc, dtype))
+        per_tensor_dequantize(
+            torch.narrow(y_nc_pad, 0, 0, x_nc.shape[0]), inv_scale_nc, dtype
+        ),
+    )
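For readers following the last two hunks: `quantize_ref` captures the per-tensor FP8 recipe that `ops.scaled_fp8_quant` is checked against (multiply by the reciprocal of `inv_scale`, clamp to the e4m3 range, cast to `torch.float8_e4m3fn`), and `per_tensor_dequantize` undoes it by widening back to float and multiplying by `inv_scale`. A self-contained sketch of that round trip, assuming PyTorch 2.1+ for the FP8 dtype and running on CPU; `dequantize_ref` is a hypothetical stand-in for the test's `per_tensor_dequantize` helper:

import torch


def quantize_ref(tensor: torch.Tensor, inv_scale: torch.Tensor) -> torch.Tensor:
    # Scale into the representable e4m3 range, clamp, then cast down to FP8.
    finfo = torch.finfo(torch.float8_e4m3fn)
    scale = inv_scale.reciprocal()
    q = (tensor.to(torch.float32) * scale).clamp(min=finfo.min, max=finfo.max)
    return q.to(torch.float8_e4m3fn)


def dequantize_ref(
    q: torch.Tensor, inv_scale: torch.Tensor, dtype: torch.dtype
) -> torch.Tensor:
    # Widen back to float32 and undo the scaling.
    return (q.to(torch.float32) * inv_scale).to(dtype)


x = torch.randn(16, 16, dtype=torch.float16) * 13
# Per-tensor scale chosen so the largest |x| maps to the e4m3 maximum (448).
inv_scale = x.abs().max().to(torch.float32) / torch.finfo(torch.float8_e4m3fn).max
y = quantize_ref(x, inv_scale)
x_roundtrip = dequantize_ref(y, inv_scale, torch.float16)
# e4m3 keeps only 3 mantissa bits, so compare with a loose tolerance.
torch.testing.assert_close(x, x_roundtrip, rtol=0.1, atol=0.1)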