Convert formatting to use ruff instead of yapf + isort (#26247)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
Harry Mellor
2025-10-05 15:06:22 +01:00
committed by GitHub
parent 17edd8a807
commit d6953beb91
1508 changed files with 115244 additions and 94146 deletions

View File

@@ -13,13 +13,19 @@ import pytest
import ray
import torch
from vllm.distributed import (broadcast_tensor_dict, get_pp_group,
tensor_model_parallel_all_gather,
tensor_model_parallel_all_reduce,
tensor_model_parallel_reduce_scatter)
from vllm.distributed import (
broadcast_tensor_dict,
get_pp_group,
tensor_model_parallel_all_gather,
tensor_model_parallel_all_reduce,
tensor_model_parallel_reduce_scatter,
)
from ..utils import (init_test_distributed_environment, multi_gpu_test,
multi_process_parallel)
from ..utils import (
init_test_distributed_environment,
multi_gpu_test,
multi_process_parallel,
)
@ray.remote(num_gpus=1, max_calls=1)
@@ -37,12 +43,11 @@ def all_reduce_test_worker(
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
init_test_distributed_environment(tp_size, pp_size, rank,
distributed_init_port)
init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
num_elements = 8
all_tensors = [
torch.arange(num_elements, dtype=torch.float32, device="cuda") *
(r + 1) for r in range(tp_size)
torch.arange(num_elements, dtype=torch.float32, device="cuda") * (r + 1)
for r in range(tp_size)
]
expected = torch.sum(torch.stack(all_tensors, dim=0), dim=0)
t = all_tensors[rank % tp_size]
@@ -51,28 +56,31 @@ def all_reduce_test_worker(
@ray.remote(num_gpus=1, max_calls=1)
def reduce_scatter_test_worker(monkeypatch: pytest.MonkeyPatch, tp_size: int,
pp_size: int, rank: int,
distributed_init_port: str):
def reduce_scatter_test_worker(
monkeypatch: pytest.MonkeyPatch,
tp_size: int,
pp_size: int,
rank: int,
distributed_init_port: str,
):
# it is important to delete the CUDA_VISIBLE_DEVICES environment variable
# so that each worker can see all the GPUs
# they will be able to set the device to the correct GPU
monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
init_test_distributed_environment(tp_size, pp_size, rank,
distributed_init_port)
init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
num_elements = 8
all_tensors = [
torch.arange(num_elements, dtype=torch.float32, device="cuda") *
(r + 1) for r in range(tp_size)
torch.arange(num_elements, dtype=torch.float32, device="cuda") * (r + 1)
for r in range(tp_size)
]
index = rank % tp_size
partition_size = num_elements // tp_size
all_reduce = torch.sum(torch.stack(all_tensors, dim=0), dim=0)
expected = all_reduce[index * partition_size:(index + 1) * partition_size]
expected = all_reduce[index * partition_size : (index + 1) * partition_size]
t = all_tensors[index]
t = tensor_model_parallel_reduce_scatter(t, 0)
torch.testing.assert_close(t, expected)
@@ -92,8 +100,7 @@ def all_gather_test_worker(
monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
init_test_distributed_environment(tp_size, pp_size, rank,
distributed_init_port)
init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
num_dimensions = 3
tensor_size = list(range(2, num_dimensions + 2))
total_size = 1
@@ -101,8 +108,10 @@ def all_gather_test_worker(
total_size *= s
for all_gather_dimension in range(num_dimensions):
all_tensors = [
torch.arange(total_size, dtype=torch.float32,
device="cuda").reshape(tensor_size) * (r + 1)
torch.arange(total_size, dtype=torch.float32, device="cuda").reshape(
tensor_size
)
* (r + 1)
for r in range(tp_size)
]
expected = torch.cat(all_tensors, dim=all_gather_dimension)
@@ -125,8 +134,7 @@ def broadcast_tensor_dict_test_worker(
monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
init_test_distributed_environment(tp_size, pp_size, rank,
distributed_init_port)
init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
test_dict = {
# device tensor
"a": torch.arange(8, dtype=torch.float32, device="cuda"),
@@ -134,10 +142,7 @@ def broadcast_tensor_dict_test_worker(
"b": torch.arange(16, dtype=torch.int8, device="cpu"),
"c": "test",
"d": [1, 2, 3],
"e": {
"a": 1,
"b": 2
},
"e": {"a": 1, "b": 2},
# empty tensor
"f": torch.tensor([], dtype=torch.float32, device="cuda"),
}
@@ -166,8 +171,7 @@ def send_recv_tensor_dict_test_worker(
monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
init_test_distributed_environment(tp_size, pp_size, rank,
distributed_init_port)
init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
test_dict = {
# device tensor
@@ -176,10 +180,7 @@ def send_recv_tensor_dict_test_worker(
"b": torch.arange(16, dtype=torch.int8, device="cpu"),
"c": "test",
"d": [1, 2, 3],
"e": {
"a": 1,
"b": 2
},
"e": {"a": 1, "b": 2},
# empty tensor
"f": torch.tensor([], dtype=torch.float32, device="cuda"),
}
@@ -211,8 +212,7 @@ def send_recv_test_worker(
monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
init_test_distributed_environment(tp_size, pp_size, rank,
distributed_init_port)
init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
size = 64
test_tensor = torch.arange(64, dtype=torch.float32, device="cuda")
@@ -229,10 +229,10 @@ def send_recv_test_worker(
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("tp_size", [2])
@pytest.mark.parametrize("test_target", [
all_reduce_test_worker, all_gather_test_worker,
broadcast_tensor_dict_test_worker
])
@pytest.mark.parametrize(
"test_target",
[all_reduce_test_worker, all_gather_test_worker, broadcast_tensor_dict_test_worker],
)
def test_multi_process_tensor_parallel(
monkeypatch: pytest.MonkeyPatch,
tp_size: int,
@@ -244,7 +244,8 @@ def test_multi_process_tensor_parallel(
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("pp_size", [2])
@pytest.mark.parametrize(
"test_target", [send_recv_test_worker, send_recv_tensor_dict_test_worker])
"test_target", [send_recv_test_worker, send_recv_tensor_dict_test_worker]
)
def test_multi_process_pipeline_parallel(
monkeypatch: pytest.MonkeyPatch,
pp_size: int,
@@ -256,11 +257,16 @@ def test_multi_process_pipeline_parallel(
@multi_gpu_test(num_gpus=4)
@pytest.mark.parametrize("tp_size", [2])
@pytest.mark.parametrize("pp_size", [2])
@pytest.mark.parametrize("test_target", [
send_recv_test_worker, send_recv_tensor_dict_test_worker,
all_reduce_test_worker, all_gather_test_worker,
broadcast_tensor_dict_test_worker
])
@pytest.mark.parametrize(
"test_target",
[
send_recv_test_worker,
send_recv_tensor_dict_test_worker,
all_reduce_test_worker,
all_gather_test_worker,
broadcast_tensor_dict_test_worker,
],
)
def test_multi_process_tensor_parallel_pipeline_parallel(
tp_size: int,
pp_size: int,