Convert formatting to use ruff instead of yapf + isort (#26247)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
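For context, a migration like this typically swaps the repo's yapf and isort configuration for ruff's linter and formatter. The snippet below is a minimal pyproject.toml sketch, not the exact configuration from this commit; the table names and keys are ruff's documented options, but the selected rule set is illustrative only:

    [tool.ruff]
    # black/ruff default line length; yapf was previously set to 80
    line-length = 88

    [tool.ruff.lint]
    # "I" enables ruff's isort-compatible import sorting,
    # replacing the standalone isort tool (and its on/off comments)
    select = ["E", "F", "I"]

    [tool.ruff.format]
    # ruff's formatter is black-compatible: double quotes and
    # magic trailing commas, as seen throughout this diff
    quote-style = "double"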
@@ -21,24 +21,41 @@ from transformers import AutoTokenizer
 from vllm_test_utils.monitor import monitor

 from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
-from vllm.transformers_utils.detokenizer_utils import (
-    convert_ids_list_to_tokens)
+from vllm.transformers_utils.detokenizer_utils import convert_ids_list_to_tokens

-# isort: off
 from vllm.utils import (
-    CacheInfo, FlexibleArgumentParser, LRUCache, MemorySnapshot,
-    PlaceholderModule, bind_kv_cache, common_broadcastable_dtype,
-    current_stream, deprecate_kwargs, get_open_port, get_tcp_uri,
-    is_lossless_cast, join_host_port, make_zmq_path, make_zmq_socket,
-    memory_profiling, merge_async_iterators, sha256, split_host_port,
-    split_zmq_path, supports_kw, swap_dict_values, unique_filepath)
+    CacheInfo,
+    FlexibleArgumentParser,
+    LRUCache,
+    MemorySnapshot,
+    PlaceholderModule,
+    bind_kv_cache,
+    common_broadcastable_dtype,
+    current_stream,
+    deprecate_kwargs,
+    get_open_port,
+    get_tcp_uri,
+    is_lossless_cast,
+    join_host_port,
+    make_zmq_path,
+    make_zmq_socket,
+    memory_profiling,
+    merge_async_iterators,
+    sha256,
+    split_host_port,
+    split_zmq_path,
+    supports_kw,
+    swap_dict_values,
+    unique_filepath,
+)

-# isort: on
 from ..utils import create_new_process_for_each_test, error_on_warning


 @pytest.mark.asyncio
 async def test_merge_async_iterators():
-
     async def mock_async_iterator(idx: int):
         try:
             while True:
@@ -72,7 +89,6 @@ async def test_merge_async_iterators():


 def test_deprecate_kwargs_always():
-
     @deprecate_kwargs("old_arg", is_deprecated=True)
     def dummy(*, old_arg: object = None, new_arg: object = None):
         pass
@@ -85,7 +101,6 @@ def test_deprecate_kwargs_always():


 def test_deprecate_kwargs_never():
-
     @deprecate_kwargs("old_arg", is_deprecated=False)
     def dummy(*, old_arg: object = None, new_arg: object = None):
         pass
@@ -120,7 +135,6 @@ def test_deprecate_kwargs_dynamic():


 def test_deprecate_kwargs_additional_message():
-
     @deprecate_kwargs("old_arg", is_deprecated=True, additional_message="abcd")
     def dummy(*, old_arg: object = None, new_arg: object = None):
         pass
@@ -145,99 +159,107 @@ def test_get_open_port(monkeypatch: pytest.MonkeyPatch):
 @pytest.fixture
 def parser():
     parser = FlexibleArgumentParser()
-    parser.add_argument('--image-input-type',
-                        choices=['pixel_values', 'image_features'])
-    parser.add_argument('--model-name')
-    parser.add_argument('--batch-size', type=int)
-    parser.add_argument('--enable-feature', action='store_true')
-    parser.add_argument('--hf-overrides', type=json.loads)
-    parser.add_argument('-O', '--compilation-config', type=json.loads)
+    parser.add_argument(
+        "--image-input-type", choices=["pixel_values", "image_features"]
+    )
+    parser.add_argument("--model-name")
+    parser.add_argument("--batch-size", type=int)
+    parser.add_argument("--enable-feature", action="store_true")
+    parser.add_argument("--hf-overrides", type=json.loads)
+    parser.add_argument("-O", "--compilation-config", type=json.loads)
     return parser


 @pytest.fixture
 def parser_with_config():
     parser = FlexibleArgumentParser()
-    parser.add_argument('serve')
-    parser.add_argument('model_tag', nargs='?')
-    parser.add_argument('--model', type=str)
-    parser.add_argument('--served-model-name', type=str)
-    parser.add_argument('--config', type=str)
-    parser.add_argument('--port', type=int)
-    parser.add_argument('--tensor-parallel-size', type=int)
-    parser.add_argument('--trust-remote-code', action='store_true')
+    parser.add_argument("serve")
+    parser.add_argument("model_tag", nargs="?")
+    parser.add_argument("--model", type=str)
+    parser.add_argument("--served-model-name", type=str)
+    parser.add_argument("--config", type=str)
+    parser.add_argument("--port", type=int)
+    parser.add_argument("--tensor-parallel-size", type=int)
+    parser.add_argument("--trust-remote-code", action="store_true")
     return parser


 def test_underscore_to_dash(parser):
-    args = parser.parse_args(['--image_input_type', 'pixel_values'])
-    assert args.image_input_type == 'pixel_values'
+    args = parser.parse_args(["--image_input_type", "pixel_values"])
+    assert args.image_input_type == "pixel_values"


 def test_mixed_usage(parser):
-    args = parser.parse_args([
-        '--image_input_type', 'image_features', '--model-name',
-        'facebook/opt-125m'
-    ])
-    assert args.image_input_type == 'image_features'
-    assert args.model_name == 'facebook/opt-125m'
+    args = parser.parse_args(
+        ["--image_input_type", "image_features", "--model-name", "facebook/opt-125m"]
+    )
+    assert args.image_input_type == "image_features"
+    assert args.model_name == "facebook/opt-125m"


 def test_with_equals_sign(parser):
     args = parser.parse_args(
-        ['--image_input_type=pixel_values', '--model-name=facebook/opt-125m'])
-    assert args.image_input_type == 'pixel_values'
-    assert args.model_name == 'facebook/opt-125m'
+        ["--image_input_type=pixel_values", "--model-name=facebook/opt-125m"]
+    )
+    assert args.image_input_type == "pixel_values"
+    assert args.model_name == "facebook/opt-125m"


 def test_with_int_value(parser):
-    args = parser.parse_args(['--batch_size', '32'])
+    args = parser.parse_args(["--batch_size", "32"])
     assert args.batch_size == 32
-    args = parser.parse_args(['--batch-size', '32'])
+    args = parser.parse_args(["--batch-size", "32"])
     assert args.batch_size == 32


 def test_with_bool_flag(parser):
-    args = parser.parse_args(['--enable_feature'])
+    args = parser.parse_args(["--enable_feature"])
     assert args.enable_feature is True
-    args = parser.parse_args(['--enable-feature'])
+    args = parser.parse_args(["--enable-feature"])
     assert args.enable_feature is True


 def test_invalid_choice(parser):
     with pytest.raises(SystemExit):
-        parser.parse_args(['--image_input_type', 'invalid_choice'])
+        parser.parse_args(["--image_input_type", "invalid_choice"])


 def test_missing_required_argument(parser):
-    parser.add_argument('--required-arg', required=True)
+    parser.add_argument("--required-arg", required=True)
     with pytest.raises(SystemExit):
         parser.parse_args([])


 def test_cli_override_to_config(parser_with_config, cli_config_file):
-    args = parser_with_config.parse_args([
-        'serve', 'mymodel', '--config', cli_config_file,
-        '--tensor-parallel-size', '3'
-    ])
+    args = parser_with_config.parse_args(
+        ["serve", "mymodel", "--config", cli_config_file, "--tensor-parallel-size", "3"]
+    )
     assert args.tensor_parallel_size == 3
-    args = parser_with_config.parse_args([
-        'serve', 'mymodel', '--tensor-parallel-size', '3', '--config',
-        cli_config_file
-    ])
+    args = parser_with_config.parse_args(
+        ["serve", "mymodel", "--tensor-parallel-size", "3", "--config", cli_config_file]
+    )
     assert args.tensor_parallel_size == 3
     assert args.port == 12312
-    args = parser_with_config.parse_args([
-        'serve', 'mymodel', '--tensor-parallel-size', '3', '--config',
-        cli_config_file, '--port', '666'
-    ])
+    args = parser_with_config.parse_args(
+        [
+            "serve",
+            "mymodel",
+            "--tensor-parallel-size",
+            "3",
+            "--config",
+            cli_config_file,
+            "--port",
+            "666",
+        ]
+    )
     assert args.tensor_parallel_size == 3
     assert args.port == 666


 def test_config_args(parser_with_config, cli_config_file):
     args = parser_with_config.parse_args(
-        ['serve', 'mymodel', '--config', cli_config_file])
+        ["serve", "mymodel", "--config", cli_config_file]
+    )
     assert args.tensor_parallel_size == 2
     assert args.trust_remote_code
@@ -245,22 +267,31 @@ def test_config_args(parser_with_config, cli_config_file):
 def test_config_file(parser_with_config):
     with pytest.raises(FileNotFoundError):
         parser_with_config.parse_args(
-            ['serve', 'mymodel', '--config', 'test_config.yml'])
+            ["serve", "mymodel", "--config", "test_config.yml"]
+        )

     with pytest.raises(ValueError):
         parser_with_config.parse_args(
-            ['serve', 'mymodel', '--config', './data/test_config.json'])
+            ["serve", "mymodel", "--config", "./data/test_config.json"]
+        )

     with pytest.raises(ValueError):
-        parser_with_config.parse_args([
-            'serve', 'mymodel', '--tensor-parallel-size', '3', '--config',
-            '--batch-size', '32'
-        ])
+        parser_with_config.parse_args(
+            [
+                "serve",
+                "mymodel",
+                "--tensor-parallel-size",
+                "3",
+                "--config",
+                "--batch-size",
+                "32",
+            ]
+        )


 def test_no_model_tag(parser_with_config, cli_config_file):
     with pytest.raises(ValueError):
-        parser_with_config.parse_args(['serve', '--config', cli_config_file])
+        parser_with_config.parse_args(["serve", "--config", cli_config_file])


 def test_dict_args(parser):
@@ -323,7 +354,7 @@ def test_dict_args(parser):
             },
             "key14": {
                 "key15": "-minus.and.dot",
-            }
+            },
         },
     }
     assert parsed_args.compilation_config == {
         "level": 1,
@@ -375,24 +406,29 @@ def test_duplicate_dict_args(caplog_vllm, parser):
         (lambda foo, **kwargs: None, "something_else", False, True, True),
         (lambda foo, **kwargs: None, "kwargs", True, True, False),
         (lambda foo, **kwargs: None, "foo", True, True, False),
-    ])
+    ],
+)
-# yapf: disable
-def test_supports_kw(callable,kw_name,requires_kw_only,
-                     allow_var_kwargs,is_supported):
-    assert supports_kw(
+def test_supports_kw(
+    callable, kw_name, requires_kw_only, allow_var_kwargs, is_supported
+):
+    assert (
+        supports_kw(
             callable=callable,
             kw_name=kw_name,
             requires_kw_only=requires_kw_only,
-        allow_var_kwargs=allow_var_kwargs
-    ) == is_supported
+            allow_var_kwargs=allow_var_kwargs,
+        )
+        == is_supported
+    )


 @create_new_process_for_each_test()
 def test_memory_profiling():
     # Fake out some model loading + inference memory usage to test profiling
     # Memory used by other processes will show up as cuda usage outside of torch
-    from vllm.distributed.device_communicators.cuda_wrapper import (
-        CudaRTLibrary)
+    from vllm.distributed.device_communicators.cuda_wrapper import CudaRTLibrary

     lib = CudaRTLibrary()
     # 512 MiB allocation outside of this instance
     handle1 = lib.cudaMalloc(512 * 1024 * 1024)
@@ -401,9 +437,9 @@ def test_memory_profiling():

     # load weights

-    weights = torch.randn(128, 1024, 1024, device='cuda', dtype=torch.float32)
+    weights = torch.randn(128, 1024, 1024, device="cuda", dtype=torch.float32)

-    weights_memory = 128 * 1024 * 1024 * 4 # 512 MiB
+    weights_memory = 128 * 1024 * 1024 * 4  # 512 MiB

     def measure_current_non_torch():
         free, total = torch.cuda.mem_get_info()
@@ -412,11 +448,14 @@ def test_memory_profiling():
         current_non_torch = current_used - current_torch
         return current_non_torch

-    with memory_profiling(baseline_snapshot=baseline_snapshot,
-                          weights_memory=weights_memory) as result, \
-          monitor(measure_current_non_torch) as monitored_values:
+    with (
+        memory_profiling(
+            baseline_snapshot=baseline_snapshot, weights_memory=weights_memory
+        ) as result,
+        monitor(measure_current_non_torch) as monitored_values,
+    ):
         # make a memory spike, 1 GiB
-        spike = torch.randn(256, 1024, 1024, device='cuda', dtype=torch.float32)
+        spike = torch.randn(256, 1024, 1024, device="cuda", dtype=torch.float32)
         del spike

         # Add some extra non-torch memory 256 MiB (simulate NCCL)
@@ -431,7 +470,7 @@ def test_memory_profiling():
     # 5% tolerance is caused by cuda runtime.
     # we cannot control cuda runtime in the granularity of bytes,
     # which causes a small error (<10 MiB in practice)
-    non_torch_ratio = result.non_torch_increase / (256 * 1024 * 1024) # noqa
+    non_torch_ratio = result.non_torch_increase / (256 * 1024 * 1024)  # noqa
     assert abs(non_torch_ratio - 1) <= 0.05
     assert result.torch_peak_increase == 1024 * 1024 * 1024
     del weights
@@ -443,87 +482,84 @@ def test_bind_kv_cache():
     from vllm.attention import Attention

     ctx = {
-        'layers.0.self_attn': Attention(32, 128, 0.1),
-        'layers.1.self_attn': Attention(32, 128, 0.1),
-        'layers.2.self_attn': Attention(32, 128, 0.1),
-        'layers.3.self_attn': Attention(32, 128, 0.1),
+        "layers.0.self_attn": Attention(32, 128, 0.1),
+        "layers.1.self_attn": Attention(32, 128, 0.1),
+        "layers.2.self_attn": Attention(32, 128, 0.1),
+        "layers.3.self_attn": Attention(32, 128, 0.1),
     }
     kv_cache = [
-        torch.zeros((1, )),
-        torch.zeros((1, )),
-        torch.zeros((1, )),
-        torch.zeros((1, )),
+        torch.zeros((1,)),
+        torch.zeros((1,)),
+        torch.zeros((1,)),
+        torch.zeros((1,)),
     ]
     bind_kv_cache(ctx, [kv_cache])
-    assert ctx['layers.0.self_attn'].kv_cache[0] is kv_cache[0]
-    assert ctx['layers.1.self_attn'].kv_cache[0] is kv_cache[1]
-    assert ctx['layers.2.self_attn'].kv_cache[0] is kv_cache[2]
-    assert ctx['layers.3.self_attn'].kv_cache[0] is kv_cache[3]
+    assert ctx["layers.0.self_attn"].kv_cache[0] is kv_cache[0]
+    assert ctx["layers.1.self_attn"].kv_cache[0] is kv_cache[1]
+    assert ctx["layers.2.self_attn"].kv_cache[0] is kv_cache[2]
+    assert ctx["layers.3.self_attn"].kv_cache[0] is kv_cache[3]


 def test_bind_kv_cache_kv_sharing():
     from vllm.attention import Attention

     ctx = {
-        'layers.0.self_attn': Attention(32, 128, 0.1),
-        'layers.1.self_attn': Attention(32, 128, 0.1),
-        'layers.2.self_attn': Attention(32, 128, 0.1),
-        'layers.3.self_attn': Attention(32, 128, 0.1),
+        "layers.0.self_attn": Attention(32, 128, 0.1),
+        "layers.1.self_attn": Attention(32, 128, 0.1),
+        "layers.2.self_attn": Attention(32, 128, 0.1),
+        "layers.3.self_attn": Attention(32, 128, 0.1),
     }
     kv_cache = [
-        torch.zeros((1, )),
-        torch.zeros((1, )),
-        torch.zeros((1, )),
-        torch.zeros((1, )),
+        torch.zeros((1,)),
+        torch.zeros((1,)),
+        torch.zeros((1,)),
+        torch.zeros((1,)),
     ]
     shared_kv_cache_layers = {
-        'layers.2.self_attn': 'layers.1.self_attn',
-        'layers.3.self_attn': 'layers.0.self_attn'
+        "layers.2.self_attn": "layers.1.self_attn",
+        "layers.3.self_attn": "layers.0.self_attn",
     }
     bind_kv_cache(ctx, [kv_cache], shared_kv_cache_layers)
-    assert ctx['layers.0.self_attn'].kv_cache[0] is kv_cache[0]
-    assert ctx['layers.1.self_attn'].kv_cache[0] is kv_cache[1]
-    assert ctx['layers.2.self_attn'].kv_cache[0] is kv_cache[1]
-    assert ctx['layers.3.self_attn'].kv_cache[0] is kv_cache[0]
+    assert ctx["layers.0.self_attn"].kv_cache[0] is kv_cache[0]
+    assert ctx["layers.1.self_attn"].kv_cache[0] is kv_cache[1]
+    assert ctx["layers.2.self_attn"].kv_cache[0] is kv_cache[1]
+    assert ctx["layers.3.self_attn"].kv_cache[0] is kv_cache[0]


 def test_bind_kv_cache_non_attention():
     from vllm.attention import Attention

     # example from Jamba PP=2
     ctx = {
-        'model.layers.20.attn': Attention(32, 128, 0.1),
-        'model.layers.28.attn': Attention(32, 128, 0.1),
+        "model.layers.20.attn": Attention(32, 128, 0.1),
+        "model.layers.28.attn": Attention(32, 128, 0.1),
     }
     kv_cache = [
-        torch.zeros((1, )),
-        torch.zeros((1, )),
+        torch.zeros((1,)),
+        torch.zeros((1,)),
     ]
     bind_kv_cache(ctx, [kv_cache])
-    assert ctx['model.layers.20.attn'].kv_cache[0] is kv_cache[0]
-    assert ctx['model.layers.28.attn'].kv_cache[0] is kv_cache[1]
+    assert ctx["model.layers.20.attn"].kv_cache[0] is kv_cache[0]
+    assert ctx["model.layers.28.attn"].kv_cache[0] is kv_cache[1]


 def test_bind_kv_cache_pp():
     with patch("vllm.utils.cuda_device_count_stateless", lambda: 2):
         # this test runs with 1 GPU, but we simulate 2 GPUs
-        cfg = VllmConfig(
-            parallel_config=ParallelConfig(pipeline_parallel_size=2))
+        cfg = VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=2))
     with set_current_vllm_config(cfg):
         from vllm.attention import Attention

         ctx = {
-            'layers.0.self_attn': Attention(32, 128, 0.1),
+            "layers.0.self_attn": Attention(32, 128, 0.1),
         }
-        kv_cache = [
-            [torch.zeros((1, ))],
-            [torch.zeros((1, ))]
-        ]
+        kv_cache = [[torch.zeros((1,))], [torch.zeros((1,))]]
         bind_kv_cache(ctx, kv_cache)
-        assert ctx['layers.0.self_attn'].kv_cache[0] is kv_cache[0][0]
-        assert ctx['layers.0.self_attn'].kv_cache[1] is kv_cache[1][0]
+        assert ctx["layers.0.self_attn"].kv_cache[0] is kv_cache[0][0]
+        assert ctx["layers.0.self_attn"].kv_cache[1] is kv_cache[1][0]


 class TestLRUCache(LRUCache):
-
     def _on_remove(self, key, value):
         if not hasattr(self, "_remove_counter"):
             self._remove_counter = 0