Convert formatting to use ruff instead of yapf + isort (#26247)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
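This commit is a mechanical reformat: ruff takes over both jobs of the old pipeline. As a rough sketch, the conversion boils down to two commands (the exact rule selection and pre-commit wiring live in vLLM's repo config and may differ):

    ruff check --select I --fix .   # sort imports (the role isort played)
    ruff format .                   # reformat code (the role yapf played)

The diff below shows the resulting style shift: paren-aligned continuation lines become 4-space hanging indents, and multi-line calls gain trailing commas.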
@@ -8,18 +8,30 @@ import torch
 import vllm.envs as envs
 from vllm.compilation.collective_fusion import AsyncTPPass
-from vllm.config import (CompilationConfig, DeviceConfig, ModelConfig,
-                         PassConfig, VllmConfig)
-from vllm.distributed import (tensor_model_parallel_all_gather,
-                              tensor_model_parallel_reduce_scatter)
-from vllm.distributed.parallel_state import (init_distributed_environment,
-                                             initialize_model_parallel)
+from vllm.config import (
+    CompilationConfig,
+    DeviceConfig,
+    ModelConfig,
+    PassConfig,
+    VllmConfig,
+)
+from vllm.distributed import (
+    tensor_model_parallel_all_gather,
+    tensor_model_parallel_reduce_scatter,
+)
+from vllm.distributed.parallel_state import (
+    init_distributed_environment,
+    initialize_model_parallel,
+)
 from vllm.platforms import current_platform
 from vllm.utils import update_environment_variables
 
 from ..models.registry import HF_EXAMPLE_MODELS
-from ..utils import (compare_two_settings, create_new_process_for_each_test,
-                     multi_gpu_test)
+from ..utils import (
+    compare_two_settings,
+    create_new_process_for_each_test,
+    multi_gpu_test,
+)
 from .backend import TestBackend
 
 FP8_DTYPE = current_platform.fp8_dtype()
@@ -33,21 +45,20 @@ prompts = [
 
 
 class TestMMRSModel(torch.nn.Module):
-
     def __init__(self, hidden_size=16, dtype=torch.float16):
         super().__init__()
         self.hidden_size = hidden_size
         self.dtype = dtype
-        self.gate_proj = torch.nn.Parameter(torch.empty(
-            (self.hidden_size * 2, hidden_size)),
-                                            requires_grad=False)
+        self.gate_proj = torch.nn.Parameter(
+            torch.empty((self.hidden_size * 2, hidden_size)), requires_grad=False
+        )
         # Initialize weights
         torch.nn.init.normal_(self.gate_proj, std=0.02)
 
     def forward(self, hidden_states):
         """
         Forward pass implementing the mm + reduce scatter in the FX graph
-
+
         """
         # Reshape input
         view = hidden_states.reshape(-1, self.hidden_size)
@@ -66,14 +77,13 @@ class TestMMRSModel(torch.nn.Module):
 
 
 class TestAGMMModel(torch.nn.Module):
-
     def __init__(self, hidden_size=16, dtype=torch.float16):
         super().__init__()
         self.hidden_size = hidden_size
         self.dtype = dtype
-        self.weight = torch.nn.Parameter(torch.empty(
-            (hidden_size, hidden_size)),
-                                         requires_grad=False)
+        self.weight = torch.nn.Parameter(
+            torch.empty((hidden_size, hidden_size)), requires_grad=False
+        )
         # Initialize weights
         torch.nn.init.normal_(self.weight, std=0.02)
@@ -96,32 +106,35 @@ class TestAGMMModel(torch.nn.Module):
 
 
 class _BaseScaledMMModel(torch.nn.Module):
-
     def __init__(self, hidden_size=16, dtype=torch.float16):
         super().__init__()
         self.hidden_size = hidden_size
         self.dtype = dtype
-        self.weight = torch.empty([hidden_size, hidden_size], dtype=FP8_DTYPE)\
-            .contiguous().transpose(0, 1)
+        self.weight = (
+            torch.empty([hidden_size, hidden_size], dtype=FP8_DTYPE)
+            .contiguous()
+            .transpose(0, 1)
+        )
 
         # Initialize scale_b for _scaled_mm.
         self.scale_b = torch.ones(1, self.hidden_size, dtype=torch.float32)
 
 
 class TestScaledMMRSModel(_BaseScaledMMModel):
-
     def forward(self, input: torch.Tensor):
         """
         Forward pass implementing the scaled_mm + reduce scatter in the FX graph
-
-
+
+
         """
         fp8_input = input.to(FP8_DTYPE)
         scale_a = torch.ones(input.shape[0], 1, dtype=torch.float32)
-        scaled_mm = torch._scaled_mm(fp8_input,
-                                     self.weight,
-                                     scale_a=scale_a,
-                                     scale_b=self.scale_b,
-                                     out_dtype=self.dtype)
+        scaled_mm = torch._scaled_mm(
+            fp8_input,
+            self.weight,
+            scale_a=scale_a,
+            scale_b=self.scale_b,
+            out_dtype=self.dtype,
+        )
         reduce_scatter = tensor_model_parallel_reduce_scatter(scaled_mm, dim=0)
         return reduce_scatter
@@ -133,7 +146,6 @@ class TestScaledMMRSModel(_BaseScaledMMModel):
 
 
 class TestAGScaledMMModel(_BaseScaledMMModel):
-
     def forward(self, input: torch.Tensor):
         """
         Forward pass implementing the all gather + scaled_mm in the FX graph
@@ -143,11 +155,13 @@ class TestAGScaledMMModel(_BaseScaledMMModel):
         all_gather = tensor_model_parallel_all_gather(fp8_input, dim=0)
 
         scale_a = torch.ones(all_gather.shape[0], 1, dtype=torch.float32)
-        scaled_mm = torch._scaled_mm(all_gather,
-                                     self.weight,
-                                     scale_a=scale_a,
-                                     scale_b=self.scale_b,
-                                     out_dtype=self.dtype)
+        scaled_mm = torch._scaled_mm(
+            all_gather,
+            self.weight,
+            scale_a=scale_a,
+            scale_b=self.scale_b,
+            out_dtype=self.dtype,
+        )
         return scaled_mm
 
     def ops_in_model_before(self):
@@ -158,20 +172,22 @@ class TestAGScaledMMModel(_BaseScaledMMModel):
 
 
 class TestCutlassScaledMMRSModel(_BaseScaledMMModel):
-
     def forward(self, input: torch.Tensor):
         """
         Forward pass implementing the cutlass_scaled_mm + reduce scatter
         in the FX graph
-
-
+
+
         """
         fp8_input = input.to(FP8_DTYPE)
         scale_a = torch.ones(input.shape[0], 1, dtype=torch.float32)
-        mm_out = torch.empty((fp8_input.shape[0], self.weight.shape[1]),
-                             dtype=self.dtype,
-                             device=input.device)
-        torch.ops._C.cutlass_scaled_mm(mm_out, fp8_input, self.weight, scale_a,
-                                       self.scale_b, None)
+        mm_out = torch.empty(
+            (fp8_input.shape[0], self.weight.shape[1]),
+            dtype=self.dtype,
+            device=input.device,
+        )
+        torch.ops._C.cutlass_scaled_mm(
+            mm_out, fp8_input, self.weight, scale_a, self.scale_b, None
+        )
         reduce_scatter = tensor_model_parallel_reduce_scatter(mm_out, dim=0)
         return reduce_scatter
@@ -183,10 +199,9 @@ class TestCutlassScaledMMRSModel(_BaseScaledMMModel):
 
 
 class TestAGCutlassScaledMMModel(_BaseScaledMMModel):
-
     def forward(self, input: torch.Tensor):
         """
-        Forward pass implementing the all gather + cutlass_scaled_mm
+        Forward pass implementing the all gather + cutlass_scaled_mm
         in the FX graph
         """
         # Reshape input
@@ -195,11 +210,14 @@ class TestAGCutlassScaledMMModel(_BaseScaledMMModel):
 
         scale_a = torch.ones(all_gather.shape[0], 1, dtype=torch.float32)
 
-        mm_out = torch.empty((all_gather.shape[0], self.weight.shape[1]),
-                             dtype=self.dtype,
-                             device=all_gather.device)
-        torch.ops._C.cutlass_scaled_mm(mm_out, all_gather, self.weight,
-                                       scale_a, self.scale_b, None)
+        mm_out = torch.empty(
+            (all_gather.shape[0], self.weight.shape[1]),
+            dtype=self.dtype,
+            device=all_gather.device,
+        )
+        torch.ops._C.cutlass_scaled_mm(
+            mm_out, all_gather, self.weight, scale_a, self.scale_b, None
+        )
         return mm_out
 
     def ops_in_model_before(self):
@@ -210,23 +228,37 @@ class TestAGCutlassScaledMMModel(_BaseScaledMMModel):
 
 
 @multi_gpu_test(num_gpus=2)
-@pytest.mark.parametrize("test_model", [
-    TestMMRSModel, TestAGMMModel, TestScaledMMRSModel, TestAGScaledMMModel,
-    TestCutlassScaledMMRSModel, TestAGCutlassScaledMMModel
-])
+@pytest.mark.parametrize(
+    "test_model",
+    [
+        TestMMRSModel,
+        TestAGMMModel,
+        TestScaledMMRSModel,
+        TestAGScaledMMModel,
+        TestCutlassScaledMMRSModel,
+        TestAGCutlassScaledMMModel,
+    ],
+)
 @pytest.mark.parametrize("batch_size", [8])
 @pytest.mark.parametrize("seq_len", [16])
 @pytest.mark.parametrize("hidden_size", [16])
 @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
-@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"],
-                    reason="Only test on CUDA")
-def test_async_tp_pass_replace(test_model: str, batch_size: int, seq_len: int,
-                               hidden_size: int, dtype: torch.dtype):
-    if test_model in (TestScaledMMRSModel, TestAGScaledMMModel,
-                      TestCutlassScaledMMRSModel,
-                      TestAGCutlassScaledMMModel) and dtype == torch.float16:
+@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"], reason="Only test on CUDA")
+def test_async_tp_pass_replace(
+    test_model: str, batch_size: int, seq_len: int, hidden_size: int, dtype: torch.dtype
+):
+    if (
+        test_model
+        in (
+            TestScaledMMRSModel,
+            TestAGScaledMMModel,
+            TestCutlassScaledMMRSModel,
+            TestAGCutlassScaledMMModel,
+        )
+        and dtype == torch.float16
+    ):
         pytest.skip(
-            "Only bf16 high precision output types are supported for " \
+            "Only bf16 high precision output types are supported for "
             "per-token (row-wise) scaling"
         )
 
@@ -235,19 +267,24 @@ def test_async_tp_pass_replace(test_model: str, batch_size: int, seq_len: int,
     def run_torch_spawn(fn, nprocs):
         # need to use torch.mp.spawn otherwise will have problems with
         # torch.distributed and cuda
-        torch.multiprocessing.spawn(fn,
-                                    args=(num_processes, test_model,
-                                          batch_size, seq_len, hidden_size,
-                                          dtype),
-                                    nprocs=nprocs)
+        torch.multiprocessing.spawn(
+            fn,
+            args=(num_processes, test_model, batch_size, seq_len, hidden_size, dtype),
+            nprocs=nprocs,
+        )
 
     run_torch_spawn(async_tp_pass_on_test_model, num_processes)
 
 
-def async_tp_pass_on_test_model(local_rank: int, world_size: int,
-                                test_model_cls: torch.nn.Module,
-                                batch_size: int, seq_len: int,
-                                hidden_size: int, dtype: torch.dtype):
+def async_tp_pass_on_test_model(
+    local_rank: int,
+    world_size: int,
+    test_model_cls: torch.nn.Module,
+    batch_size: int,
+    seq_len: int,
+    hidden_size: int,
+    dtype: torch.dtype,
+):
     current_platform.seed_everything(0)
 
     device = torch.device(f"cuda:{local_rank}")
@@ -255,13 +292,15 @@ def async_tp_pass_on_test_model(local_rank: int, world_size: int,
     torch.set_default_device(device)
     torch.set_default_dtype(dtype)
 
-    update_environment_variables({
-        'RANK': str(local_rank),
-        'LOCAL_RANK': str(local_rank),
-        'WORLD_SIZE': str(world_size),
-        'MASTER_ADDR': 'localhost',
-        'MASTER_PORT': '12345',
-    })
+    update_environment_variables(
+        {
+            "RANK": str(local_rank),
+            "LOCAL_RANK": str(local_rank),
+            "WORLD_SIZE": str(world_size),
+            "MASTER_ADDR": "localhost",
+            "MASTER_PORT": "12345",
+        }
+    )
 
     # initialize distributed
     init_distributed_environment()
@@ -269,27 +308,28 @@ def async_tp_pass_on_test_model(local_rank: int, world_size: int,
 
     # configure vllm config for SequenceParallelismPass
     vllm_config = VllmConfig()
-    vllm_config.compilation_config = CompilationConfig(pass_config=PassConfig(
-        enable_async_tp=True, ), )
+    vllm_config.compilation_config = CompilationConfig(
+        pass_config=PassConfig(
+            enable_async_tp=True,
+        ),
+    )
     vllm_config.device_config = DeviceConfig(device=torch.device("cuda"))
 
     # this is a fake model name to construct the model config
     # in the vllm_config, it's not really used.
     model_name = "nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-e2e"
-    vllm_config.model_config = ModelConfig(model=model_name,
-                                           trust_remote_code=True,
-                                           dtype=dtype,
-                                           seed=42)
+    vllm_config.model_config = ModelConfig(
+        model=model_name, trust_remote_code=True, dtype=dtype, seed=42
+    )
 
     async_tp_pass = AsyncTPPass(vllm_config)
     backend = TestBackend(async_tp_pass)
 
-    model = test_model_cls(hidden_size,
-                           dtype)  # Pass dtype to model constructor
+    model = test_model_cls(hidden_size, dtype)  # Pass dtype to model constructor
 
-    hidden_states = torch.randn((batch_size * seq_len, hidden_size),
-                                dtype=dtype,
-                                requires_grad=False)
+    hidden_states = torch.randn(
+        (batch_size * seq_len, hidden_size), dtype=dtype, requires_grad=False
+    )
 
     compiled_model = torch.compile(model, backend=backend)
     compiled_model(hidden_states)
@@ -306,10 +346,10 @@ def async_tp_pass_on_test_model(local_rank: int, world_size: int,
 
 
 @create_new_process_for_each_test()
-@pytest.mark.parametrize("model_id", [
-    "meta-llama/Llama-3.2-1B-Instruct",
-    "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8"
-])
+@pytest.mark.parametrize(
+    "model_id",
+    ["meta-llama/Llama-3.2-1B-Instruct", "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8"],
+)
 @pytest.mark.parametrize("tp_size", [2])
 @pytest.mark.parametrize("async_tp_enabled", [True])
 @pytest.mark.parametrize("distributed_backend", ["mp"])
@@ -342,12 +382,10 @@ def test_async_tp_pass_correctness(
         common_args.append("--enforce-eager")
 
     compilation_config = {
-        'level': 3,
-        'compile_sizes': [2, 4, 8],
-        'splitting_ops': [],
-        'pass_config': {
-            'enable_async_tp': async_tp_enabled
-        },
+        "level": 3,
+        "compile_sizes": [2, 4, 8],
+        "splitting_ops": [],
+        "pass_config": {"enable_async_tp": async_tp_enabled},
     }
 
     async_tp_env = tp_env = {
@@ -372,9 +410,6 @@ def test_async_tp_pass_correctness(
         "mp",
     ]
 
-    compare_two_settings(model_id,
-                         async_tp_args,
-                         tp_args,
-                         async_tp_env,
-                         tp_env,
-                         method="generate")
+    compare_two_settings(
+        model_id, async_tp_args, tp_args, async_tp_env, tp_env, method="generate"
+    )