Convert formatting to use ruff instead of yapf + isort (#26247)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Authored by Harry Mellor on 2025-10-05 15:06:22 +01:00; committed by GitHub
parent 17edd8a807
commit d6953beb91
1508 changed files with 115244 additions and 94146 deletions
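Most of the touched files change only formatting. The bulk of the diff below is mechanical: double quotes instead of single quotes, call arguments and import lists wrapped one item per line with a trailing comma when they exceed the line length, parenthesized multi-line with statements, and (1,) instead of (1, ). As a minimal before/after sketch (plain argparse code mirroring the parser fixture in this file, not vLLM's own modules), the conversion looks like this:

import argparse

# Before (yapf + isort era): single quotes, hanging-indent continuation
# lines, and a space inside one-element tuples.
parser = argparse.ArgumentParser()
parser.add_argument('--image-input-type',
                    choices=['pixel_values', 'image_features'])
shape = (1, )

# After (ruff format): double quotes, arguments wrapped onto their own
# indented line(s) when the call does not fit, and no space in (1,).
parser = argparse.ArgumentParser()
parser.add_argument(
    "--image-input-type", choices=["pixel_values", "image_features"]
)
shape = (1,)

Import sorting, previously isort's job (including the "# isort: off" / "# isort: on" markers kept below), is now covered by ruff's isort-compatible "I" rules; one way to reproduce the formatting locally would be running ruff format followed by ruff check --select I --fix, assuming the repository's pyproject.toml / pre-commit configuration (not shown in this excerpt) enables them.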


@@ -21,24 +21,41 @@ from transformers import AutoTokenizer
from vllm_test_utils.monitor import monitor
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
from vllm.transformers_utils.detokenizer_utils import (
convert_ids_list_to_tokens)
from vllm.transformers_utils.detokenizer_utils import convert_ids_list_to_tokens
# isort: off
from vllm.utils import (
CacheInfo, FlexibleArgumentParser, LRUCache, MemorySnapshot,
PlaceholderModule, bind_kv_cache, common_broadcastable_dtype,
current_stream, deprecate_kwargs, get_open_port, get_tcp_uri,
is_lossless_cast, join_host_port, make_zmq_path, make_zmq_socket,
memory_profiling, merge_async_iterators, sha256, split_host_port,
split_zmq_path, supports_kw, swap_dict_values, unique_filepath)
CacheInfo,
FlexibleArgumentParser,
LRUCache,
MemorySnapshot,
PlaceholderModule,
bind_kv_cache,
common_broadcastable_dtype,
current_stream,
deprecate_kwargs,
get_open_port,
get_tcp_uri,
is_lossless_cast,
join_host_port,
make_zmq_path,
make_zmq_socket,
memory_profiling,
merge_async_iterators,
sha256,
split_host_port,
split_zmq_path,
supports_kw,
swap_dict_values,
unique_filepath,
)
# isort: on
from ..utils import create_new_process_for_each_test, error_on_warning
@pytest.mark.asyncio
async def test_merge_async_iterators():
async def mock_async_iterator(idx: int):
try:
while True:
@@ -72,7 +89,6 @@ async def test_merge_async_iterators():
def test_deprecate_kwargs_always():
@deprecate_kwargs("old_arg", is_deprecated=True)
def dummy(*, old_arg: object = None, new_arg: object = None):
pass
@@ -85,7 +101,6 @@ def test_deprecate_kwargs_always():
def test_deprecate_kwargs_never():
@deprecate_kwargs("old_arg", is_deprecated=False)
def dummy(*, old_arg: object = None, new_arg: object = None):
pass
@@ -120,7 +135,6 @@ def test_deprecate_kwargs_dynamic():
def test_deprecate_kwargs_additional_message():
@deprecate_kwargs("old_arg", is_deprecated=True, additional_message="abcd")
def dummy(*, old_arg: object = None, new_arg: object = None):
pass
@@ -145,99 +159,107 @@ def test_get_open_port(monkeypatch: pytest.MonkeyPatch):
@pytest.fixture
def parser():
parser = FlexibleArgumentParser()
parser.add_argument('--image-input-type',
choices=['pixel_values', 'image_features'])
parser.add_argument('--model-name')
parser.add_argument('--batch-size', type=int)
parser.add_argument('--enable-feature', action='store_true')
parser.add_argument('--hf-overrides', type=json.loads)
parser.add_argument('-O', '--compilation-config', type=json.loads)
parser.add_argument(
"--image-input-type", choices=["pixel_values", "image_features"]
)
parser.add_argument("--model-name")
parser.add_argument("--batch-size", type=int)
parser.add_argument("--enable-feature", action="store_true")
parser.add_argument("--hf-overrides", type=json.loads)
parser.add_argument("-O", "--compilation-config", type=json.loads)
return parser
@pytest.fixture
def parser_with_config():
parser = FlexibleArgumentParser()
parser.add_argument('serve')
parser.add_argument('model_tag', nargs='?')
parser.add_argument('--model', type=str)
parser.add_argument('--served-model-name', type=str)
parser.add_argument('--config', type=str)
parser.add_argument('--port', type=int)
parser.add_argument('--tensor-parallel-size', type=int)
parser.add_argument('--trust-remote-code', action='store_true')
parser.add_argument("serve")
parser.add_argument("model_tag", nargs="?")
parser.add_argument("--model", type=str)
parser.add_argument("--served-model-name", type=str)
parser.add_argument("--config", type=str)
parser.add_argument("--port", type=int)
parser.add_argument("--tensor-parallel-size", type=int)
parser.add_argument("--trust-remote-code", action="store_true")
return parser
def test_underscore_to_dash(parser):
args = parser.parse_args(['--image_input_type', 'pixel_values'])
assert args.image_input_type == 'pixel_values'
args = parser.parse_args(["--image_input_type", "pixel_values"])
assert args.image_input_type == "pixel_values"
def test_mixed_usage(parser):
args = parser.parse_args([
'--image_input_type', 'image_features', '--model-name',
'facebook/opt-125m'
])
assert args.image_input_type == 'image_features'
assert args.model_name == 'facebook/opt-125m'
args = parser.parse_args(
["--image_input_type", "image_features", "--model-name", "facebook/opt-125m"]
)
assert args.image_input_type == "image_features"
assert args.model_name == "facebook/opt-125m"
def test_with_equals_sign(parser):
args = parser.parse_args(
['--image_input_type=pixel_values', '--model-name=facebook/opt-125m'])
assert args.image_input_type == 'pixel_values'
assert args.model_name == 'facebook/opt-125m'
["--image_input_type=pixel_values", "--model-name=facebook/opt-125m"]
)
assert args.image_input_type == "pixel_values"
assert args.model_name == "facebook/opt-125m"
def test_with_int_value(parser):
args = parser.parse_args(['--batch_size', '32'])
args = parser.parse_args(["--batch_size", "32"])
assert args.batch_size == 32
args = parser.parse_args(['--batch-size', '32'])
args = parser.parse_args(["--batch-size", "32"])
assert args.batch_size == 32
def test_with_bool_flag(parser):
args = parser.parse_args(['--enable_feature'])
args = parser.parse_args(["--enable_feature"])
assert args.enable_feature is True
args = parser.parse_args(['--enable-feature'])
args = parser.parse_args(["--enable-feature"])
assert args.enable_feature is True
def test_invalid_choice(parser):
with pytest.raises(SystemExit):
parser.parse_args(['--image_input_type', 'invalid_choice'])
parser.parse_args(["--image_input_type", "invalid_choice"])
def test_missing_required_argument(parser):
parser.add_argument('--required-arg', required=True)
parser.add_argument("--required-arg", required=True)
with pytest.raises(SystemExit):
parser.parse_args([])
def test_cli_override_to_config(parser_with_config, cli_config_file):
args = parser_with_config.parse_args([
'serve', 'mymodel', '--config', cli_config_file,
'--tensor-parallel-size', '3'
])
args = parser_with_config.parse_args(
["serve", "mymodel", "--config", cli_config_file, "--tensor-parallel-size", "3"]
)
assert args.tensor_parallel_size == 3
args = parser_with_config.parse_args([
'serve', 'mymodel', '--tensor-parallel-size', '3', '--config',
cli_config_file
])
args = parser_with_config.parse_args(
["serve", "mymodel", "--tensor-parallel-size", "3", "--config", cli_config_file]
)
assert args.tensor_parallel_size == 3
assert args.port == 12312
args = parser_with_config.parse_args([
'serve', 'mymodel', '--tensor-parallel-size', '3', '--config',
cli_config_file, '--port', '666'
])
args = parser_with_config.parse_args(
[
"serve",
"mymodel",
"--tensor-parallel-size",
"3",
"--config",
cli_config_file,
"--port",
"666",
]
)
assert args.tensor_parallel_size == 3
assert args.port == 666
def test_config_args(parser_with_config, cli_config_file):
args = parser_with_config.parse_args(
['serve', 'mymodel', '--config', cli_config_file])
["serve", "mymodel", "--config", cli_config_file]
)
assert args.tensor_parallel_size == 2
assert args.trust_remote_code
@@ -245,22 +267,31 @@ def test_config_args(parser_with_config, cli_config_file):
def test_config_file(parser_with_config):
with pytest.raises(FileNotFoundError):
parser_with_config.parse_args(
['serve', 'mymodel', '--config', 'test_config.yml'])
["serve", "mymodel", "--config", "test_config.yml"]
)
with pytest.raises(ValueError):
parser_with_config.parse_args(
['serve', 'mymodel', '--config', './data/test_config.json'])
["serve", "mymodel", "--config", "./data/test_config.json"]
)
with pytest.raises(ValueError):
parser_with_config.parse_args([
'serve', 'mymodel', '--tensor-parallel-size', '3', '--config',
'--batch-size', '32'
])
parser_with_config.parse_args(
[
"serve",
"mymodel",
"--tensor-parallel-size",
"3",
"--config",
"--batch-size",
"32",
]
)
def test_no_model_tag(parser_with_config, cli_config_file):
with pytest.raises(ValueError):
parser_with_config.parse_args(['serve', '--config', cli_config_file])
parser_with_config.parse_args(["serve", "--config", cli_config_file])
def test_dict_args(parser):
@@ -323,7 +354,7 @@ def test_dict_args(parser):
},
"key14": {
"key15": "-minus.and.dot",
}
},
}
assert parsed_args.compilation_config == {
"level": 1,
@@ -375,24 +406,29 @@ def test_duplicate_dict_args(caplog_vllm, parser):
(lambda foo, **kwargs: None, "something_else", False, True, True),
(lambda foo, **kwargs: None, "kwargs", True, True, False),
(lambda foo, **kwargs: None, "foo", True, True, False),
])
],
)
# yapf: disable
def test_supports_kw(callable,kw_name,requires_kw_only,
allow_var_kwargs,is_supported):
assert supports_kw(
def test_supports_kw(
callable, kw_name, requires_kw_only, allow_var_kwargs, is_supported
):
assert (
supports_kw(
callable=callable,
kw_name=kw_name,
requires_kw_only=requires_kw_only,
allow_var_kwargs=allow_var_kwargs
) == is_supported
allow_var_kwargs=allow_var_kwargs,
)
== is_supported
)
@create_new_process_for_each_test()
def test_memory_profiling():
# Fake out some model loading + inference memory usage to test profiling
# Memory used by other processes will show up as cuda usage outside of torch
from vllm.distributed.device_communicators.cuda_wrapper import (
CudaRTLibrary)
from vllm.distributed.device_communicators.cuda_wrapper import CudaRTLibrary
lib = CudaRTLibrary()
# 512 MiB allocation outside of this instance
handle1 = lib.cudaMalloc(512 * 1024 * 1024)
@@ -401,9 +437,9 @@ def test_memory_profiling():
# load weights
weights = torch.randn(128, 1024, 1024, device='cuda', dtype=torch.float32)
weights = torch.randn(128, 1024, 1024, device="cuda", dtype=torch.float32)
weights_memory = 128 * 1024 * 1024 * 4 # 512 MiB
def measure_current_non_torch():
free, total = torch.cuda.mem_get_info()
@@ -412,11 +448,14 @@ def test_memory_profiling():
current_non_torch = current_used - current_torch
return current_non_torch
with memory_profiling(baseline_snapshot=baseline_snapshot,
weights_memory=weights_memory) as result, \
monitor(measure_current_non_torch) as monitored_values:
with (
memory_profiling(
baseline_snapshot=baseline_snapshot, weights_memory=weights_memory
) as result,
monitor(measure_current_non_torch) as monitored_values,
):
# make a memory spike, 1 GiB
spike = torch.randn(256, 1024, 1024, device='cuda', dtype=torch.float32)
spike = torch.randn(256, 1024, 1024, device="cuda", dtype=torch.float32)
del spike
# Add some extra non-torch memory 256 MiB (simulate NCCL)
@@ -431,7 +470,7 @@ def test_memory_profiling():
# 5% tolerance is caused by cuda runtime.
# we cannot control cuda runtime in the granularity of bytes,
# which causes a small error (<10 MiB in practice)
non_torch_ratio = result.non_torch_increase / (256 * 1024 * 1024) # noqa
assert abs(non_torch_ratio - 1) <= 0.05
assert result.torch_peak_increase == 1024 * 1024 * 1024
del weights
@@ -443,87 +482,84 @@ def test_bind_kv_cache():
from vllm.attention import Attention
ctx = {
'layers.0.self_attn': Attention(32, 128, 0.1),
'layers.1.self_attn': Attention(32, 128, 0.1),
'layers.2.self_attn': Attention(32, 128, 0.1),
'layers.3.self_attn': Attention(32, 128, 0.1),
"layers.0.self_attn": Attention(32, 128, 0.1),
"layers.1.self_attn": Attention(32, 128, 0.1),
"layers.2.self_attn": Attention(32, 128, 0.1),
"layers.3.self_attn": Attention(32, 128, 0.1),
}
kv_cache = [
torch.zeros((1, )),
torch.zeros((1, )),
torch.zeros((1, )),
torch.zeros((1, )),
torch.zeros((1,)),
torch.zeros((1,)),
torch.zeros((1,)),
torch.zeros((1,)),
]
bind_kv_cache(ctx, [kv_cache])
assert ctx['layers.0.self_attn'].kv_cache[0] is kv_cache[0]
assert ctx['layers.1.self_attn'].kv_cache[0] is kv_cache[1]
assert ctx['layers.2.self_attn'].kv_cache[0] is kv_cache[2]
assert ctx['layers.3.self_attn'].kv_cache[0] is kv_cache[3]
assert ctx["layers.0.self_attn"].kv_cache[0] is kv_cache[0]
assert ctx["layers.1.self_attn"].kv_cache[0] is kv_cache[1]
assert ctx["layers.2.self_attn"].kv_cache[0] is kv_cache[2]
assert ctx["layers.3.self_attn"].kv_cache[0] is kv_cache[3]
def test_bind_kv_cache_kv_sharing():
from vllm.attention import Attention
ctx = {
'layers.0.self_attn': Attention(32, 128, 0.1),
'layers.1.self_attn': Attention(32, 128, 0.1),
'layers.2.self_attn': Attention(32, 128, 0.1),
'layers.3.self_attn': Attention(32, 128, 0.1),
"layers.0.self_attn": Attention(32, 128, 0.1),
"layers.1.self_attn": Attention(32, 128, 0.1),
"layers.2.self_attn": Attention(32, 128, 0.1),
"layers.3.self_attn": Attention(32, 128, 0.1),
}
kv_cache = [
torch.zeros((1, )),
torch.zeros((1, )),
torch.zeros((1, )),
torch.zeros((1, )),
torch.zeros((1,)),
torch.zeros((1,)),
torch.zeros((1,)),
torch.zeros((1,)),
]
shared_kv_cache_layers = {
'layers.2.self_attn': 'layers.1.self_attn',
'layers.3.self_attn': 'layers.0.self_attn'
"layers.2.self_attn": "layers.1.self_attn",
"layers.3.self_attn": "layers.0.self_attn",
}
bind_kv_cache(ctx, [kv_cache], shared_kv_cache_layers)
assert ctx['layers.0.self_attn'].kv_cache[0] is kv_cache[0]
assert ctx['layers.1.self_attn'].kv_cache[0] is kv_cache[1]
assert ctx['layers.2.self_attn'].kv_cache[0] is kv_cache[1]
assert ctx['layers.3.self_attn'].kv_cache[0] is kv_cache[0]
assert ctx["layers.0.self_attn"].kv_cache[0] is kv_cache[0]
assert ctx["layers.1.self_attn"].kv_cache[0] is kv_cache[1]
assert ctx["layers.2.self_attn"].kv_cache[0] is kv_cache[1]
assert ctx["layers.3.self_attn"].kv_cache[0] is kv_cache[0]
def test_bind_kv_cache_non_attention():
from vllm.attention import Attention
# example from Jamba PP=2
ctx = {
'model.layers.20.attn': Attention(32, 128, 0.1),
'model.layers.28.attn': Attention(32, 128, 0.1),
"model.layers.20.attn": Attention(32, 128, 0.1),
"model.layers.28.attn": Attention(32, 128, 0.1),
}
kv_cache = [
torch.zeros((1, )),
torch.zeros((1, )),
torch.zeros((1,)),
torch.zeros((1,)),
]
bind_kv_cache(ctx, [kv_cache])
assert ctx['model.layers.20.attn'].kv_cache[0] is kv_cache[0]
assert ctx['model.layers.28.attn'].kv_cache[0] is kv_cache[1]
assert ctx["model.layers.20.attn"].kv_cache[0] is kv_cache[0]
assert ctx["model.layers.28.attn"].kv_cache[0] is kv_cache[1]
def test_bind_kv_cache_pp():
with patch("vllm.utils.cuda_device_count_stateless", lambda: 2):
# this test runs with 1 GPU, but we simulate 2 GPUs
cfg = VllmConfig(
parallel_config=ParallelConfig(pipeline_parallel_size=2))
cfg = VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=2))
with set_current_vllm_config(cfg):
from vllm.attention import Attention
ctx = {
'layers.0.self_attn': Attention(32, 128, 0.1),
"layers.0.self_attn": Attention(32, 128, 0.1),
}
kv_cache = [
[torch.zeros((1, ))],
[torch.zeros((1, ))]
]
kv_cache = [[torch.zeros((1,))], [torch.zeros((1,))]]
bind_kv_cache(ctx, kv_cache)
assert ctx['layers.0.self_attn'].kv_cache[0] is kv_cache[0][0]
assert ctx['layers.0.self_attn'].kv_cache[1] is kv_cache[1][0]
assert ctx["layers.0.self_attn"].kv_cache[0] is kv_cache[0][0]
assert ctx["layers.0.self_attn"].kv_cache[1] is kv_cache[1][0]
class TestLRUCache(LRUCache):
def _on_remove(self, key, value):
if not hasattr(self, "_remove_counter"):
self._remove_counter = 0