[Misc] Clean up utils (#27552)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Cyrus Leung
2025-10-27 17:05:40 +08:00
committed by GitHub
parent 9932ed6a83
commit 7c2bdb83dc
12 changed files with 45 additions and 262 deletions


@@ -4,23 +4,15 @@
import json
import os
import tempfile
from pathlib import Path
from unittest.mock import patch
import pytest
import torch
import yaml
from transformers import AutoTokenizer
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
from vllm.transformers_utils.detokenizer_utils import convert_ids_list_to_tokens
-from vllm.utils import (
-    FlexibleArgumentParser,
-    bind_kv_cache,
-)
-from ..utils import create_new_process_for_each_test, flat_product
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+from ..utils import flat_product
# Tests for FlexibleArgumentParser
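The hunk above replaces the monolithic "from vllm.utils import ..." block with the new vllm.utils.argparse_utils submodule path. A minimal usage sketch of the relocated parser, assuming only that FlexibleArgumentParser keeps its argparse.ArgumentParser-compatible interface; the --model flag below is illustrative and not part of this commit:

# Minimal sketch: FlexibleArgumentParser from its new location, used like a
# plain argparse.ArgumentParser (the --model flag is just an example).
from vllm.utils.argparse_utils import FlexibleArgumentParser

parser = FlexibleArgumentParser(description="new import path demo")
parser.add_argument("--model", type=str, default=None)
args = parser.parse_args(["--model=facebook/opt-125m"])
print(args.model)  # facebook/opt-125m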
@@ -256,87 +248,6 @@ def test_duplicate_dict_args(caplog_vllm, parser):
    assert "-O.mode" in caplog_vllm.text


-def test_bind_kv_cache():
-    from vllm.attention import Attention
-
-    ctx = {
-        "layers.0.self_attn": Attention(32, 128, 0.1),
-        "layers.1.self_attn": Attention(32, 128, 0.1),
-        "layers.2.self_attn": Attention(32, 128, 0.1),
-        "layers.3.self_attn": Attention(32, 128, 0.1),
-    }
-    kv_cache = [
-        torch.zeros((1,)),
-        torch.zeros((1,)),
-        torch.zeros((1,)),
-        torch.zeros((1,)),
-    ]
-    bind_kv_cache(ctx, [kv_cache])
-    assert ctx["layers.0.self_attn"].kv_cache[0] is kv_cache[0]
-    assert ctx["layers.1.self_attn"].kv_cache[0] is kv_cache[1]
-    assert ctx["layers.2.self_attn"].kv_cache[0] is kv_cache[2]
-    assert ctx["layers.3.self_attn"].kv_cache[0] is kv_cache[3]


-def test_bind_kv_cache_kv_sharing():
-    from vllm.attention import Attention
-
-    ctx = {
-        "layers.0.self_attn": Attention(32, 128, 0.1),
-        "layers.1.self_attn": Attention(32, 128, 0.1),
-        "layers.2.self_attn": Attention(32, 128, 0.1),
-        "layers.3.self_attn": Attention(32, 128, 0.1),
-    }
-    kv_cache = [
-        torch.zeros((1,)),
-        torch.zeros((1,)),
-        torch.zeros((1,)),
-        torch.zeros((1,)),
-    ]
-    shared_kv_cache_layers = {
-        "layers.2.self_attn": "layers.1.self_attn",
-        "layers.3.self_attn": "layers.0.self_attn",
-    }
-    bind_kv_cache(ctx, [kv_cache], shared_kv_cache_layers)
-    assert ctx["layers.0.self_attn"].kv_cache[0] is kv_cache[0]
-    assert ctx["layers.1.self_attn"].kv_cache[0] is kv_cache[1]
-    assert ctx["layers.2.self_attn"].kv_cache[0] is kv_cache[1]
-    assert ctx["layers.3.self_attn"].kv_cache[0] is kv_cache[0]


-def test_bind_kv_cache_non_attention():
-    from vllm.attention import Attention
-
-    # example from Jamba PP=2
-    ctx = {
-        "model.layers.20.attn": Attention(32, 128, 0.1),
-        "model.layers.28.attn": Attention(32, 128, 0.1),
-    }
-    kv_cache = [
-        torch.zeros((1,)),
-        torch.zeros((1,)),
-    ]
-    bind_kv_cache(ctx, [kv_cache])
-    assert ctx["model.layers.20.attn"].kv_cache[0] is kv_cache[0]
-    assert ctx["model.layers.28.attn"].kv_cache[0] is kv_cache[1]


-def test_bind_kv_cache_pp():
-    with patch("vllm.utils.torch_utils.cuda_device_count_stateless", lambda: 2):
-        # this test runs with 1 GPU, but we simulate 2 GPUs
-        cfg = VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=2))
-    with set_current_vllm_config(cfg):
-        from vllm.attention import Attention
-
-        ctx = {
-            "layers.0.self_attn": Attention(32, 128, 0.1),
-        }
-        kv_cache = [[torch.zeros((1,))], [torch.zeros((1,))]]
-        bind_kv_cache(ctx, kv_cache)
-        assert ctx["layers.0.self_attn"].kv_cache[0] is kv_cache[0][0]
-        assert ctx["layers.0.self_attn"].kv_cache[1] is kv_cache[1][0]


def test_model_specification(
    parser_with_config, cli_config_file, cli_config_file_with_model
):
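For reference, bind_kv_cache (exercised by the tests removed above) attaches each virtual engine's KV-cache tensors to the Attention layers found in the forward context, ordering layers by the numeric index embedded in their name and honoring KV-sharing remaps. Below is a simplified sketch of that contract, not vLLM's actual implementation; it works on any objects exposing a kv_cache attribute:

# Simplified sketch of the behavior the deleted tests check; this is NOT
# vLLM's bind_kv_cache, only an illustration of the expected bindings.
import re
from types import SimpleNamespace

import torch


def bind_kv_cache_sketch(ctx, kv_caches, shared_kv_cache_layers=None):
    shared_kv_cache_layers = shared_kv_cache_layers or {}
    # Layers that own a KV cache, ordered by the layer index in their name.
    owned = sorted(
        (name for name in ctx if name not in shared_kv_cache_layers),
        key=lambda name: int(re.search(r"\.(\d+)\.", name).group(1)),
    )
    slot = {name: i for i, name in enumerate(owned)}
    for name, layer in ctx.items():
        target = shared_kv_cache_layers.get(name, name)
        # One cache entry per virtual engine (pipeline-parallel rank).
        layer.kv_cache = [kv_cache[slot[target]] for kv_cache in kv_caches]


# Mirrors the KV-sharing case from the deleted hunk above.
ctx = {f"layers.{i}.self_attn": SimpleNamespace(kv_cache=None) for i in range(4)}
kv_cache = [torch.zeros((1,)) for _ in range(4)]
shared = {
    "layers.2.self_attn": "layers.1.self_attn",
    "layers.3.self_attn": "layers.0.self_attn",
}
bind_kv_cache_sketch(ctx, [kv_cache], shared)
assert ctx["layers.2.self_attn"].kv_cache[0] is kv_cache[1]
assert ctx["layers.3.self_attn"].kv_cache[0] is kv_cache[0]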


@@ -14,7 +14,7 @@ from vllm.utils.serial_utils import (
@pytest.mark.parametrize("endianness", ENDIANNESS)
@pytest.mark.parametrize("embed_dtype", EMBED_DTYPE_TO_TORCH_DTYPE.keys())
-@torch.inference_mode
+@torch.inference_mode()
def test_encode_and_decode(embed_dtype: str, endianness: str):
    for i in range(10):
        tensor = torch.rand(2, 3, 5, 7, 11, 13, device="cpu", dtype=torch.float32)
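The one-line change in this hunk switches the decorator from the bare torch.inference_mode class to the instantiated torch.inference_mode() form used in the PyTorch documentation. A small standalone sketch of what that decorator does; the encode function below is a made-up example, not part of the test file:

# Minimal sketch of the @torch.inference_mode() decorator form now used above.
import torch


@torch.inference_mode()
def encode(x: torch.Tensor) -> torch.Tensor:
    # Runs without autograd tracking; the result is an inference tensor.
    return x * 2


out = encode(torch.rand(3, requires_grad=True))
print(out.requires_grad)        # False
print(torch.is_inference(out))  # True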