[Misc] Clean up utils (#27552)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
@@ -4,23 +4,15 @@
 import json
 import os
 import tempfile
 from pathlib import Path
 from unittest.mock import patch

 import pytest
 import torch
 import yaml
 from transformers import AutoTokenizer

 from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
 from vllm.transformers_utils.detokenizer_utils import convert_ids_list_to_tokens

-from vllm.utils import (
-    FlexibleArgumentParser,
-    bind_kv_cache,
-)
-from ..utils import create_new_process_for_each_test, flat_product
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+from ..utils import flat_product


 # Tests for FlexibleArgumentParser
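Note: FlexibleArgumentParser is now imported from its dedicated vllm.utils.argparse_utils module instead of the vllm.utils package root, and the imports that only served the removed bind_kv_cache tests (bind_kv_cache, create_new_process_for_each_test) are dropped. A minimal sketch of the relocated import in use; the --max-model-len flag is illustrative only, and the dash/underscore interchange is recalled behavior of vLLM's parser, not something this diff shows:

from vllm.utils.argparse_utils import FlexibleArgumentParser

# FlexibleArgumentParser subclasses argparse.ArgumentParser, so the usual
# add_argument/parse_args API applies.
parser = FlexibleArgumentParser(description="demo")
parser.add_argument("--max-model-len", type=int, default=None)  # hypothetical flag

# vLLM's parser treats dashes and underscores in flag names interchangeably,
# so the underscore spelling parses to the same destination.
args = parser.parse_args(["--max_model_len", "4096"])
print(args.max_model_len)  # 4096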
@@ -256,87 +248,6 @@ def test_duplicate_dict_args(caplog_vllm, parser):
     assert "-O.mode" in caplog_vllm.text


-def test_bind_kv_cache():
-    from vllm.attention import Attention
-
-    ctx = {
-        "layers.0.self_attn": Attention(32, 128, 0.1),
-        "layers.1.self_attn": Attention(32, 128, 0.1),
-        "layers.2.self_attn": Attention(32, 128, 0.1),
-        "layers.3.self_attn": Attention(32, 128, 0.1),
-    }
-    kv_cache = [
-        torch.zeros((1,)),
-        torch.zeros((1,)),
-        torch.zeros((1,)),
-        torch.zeros((1,)),
-    ]
-    bind_kv_cache(ctx, [kv_cache])
-    assert ctx["layers.0.self_attn"].kv_cache[0] is kv_cache[0]
-    assert ctx["layers.1.self_attn"].kv_cache[0] is kv_cache[1]
-    assert ctx["layers.2.self_attn"].kv_cache[0] is kv_cache[2]
-    assert ctx["layers.3.self_attn"].kv_cache[0] is kv_cache[3]
-
-
-def test_bind_kv_cache_kv_sharing():
-    from vllm.attention import Attention
-
-    ctx = {
-        "layers.0.self_attn": Attention(32, 128, 0.1),
-        "layers.1.self_attn": Attention(32, 128, 0.1),
-        "layers.2.self_attn": Attention(32, 128, 0.1),
-        "layers.3.self_attn": Attention(32, 128, 0.1),
-    }
-    kv_cache = [
-        torch.zeros((1,)),
-        torch.zeros((1,)),
-        torch.zeros((1,)),
-        torch.zeros((1,)),
-    ]
-    shared_kv_cache_layers = {
-        "layers.2.self_attn": "layers.1.self_attn",
-        "layers.3.self_attn": "layers.0.self_attn",
-    }
-    bind_kv_cache(ctx, [kv_cache], shared_kv_cache_layers)
-    assert ctx["layers.0.self_attn"].kv_cache[0] is kv_cache[0]
-    assert ctx["layers.1.self_attn"].kv_cache[0] is kv_cache[1]
-    assert ctx["layers.2.self_attn"].kv_cache[0] is kv_cache[1]
-    assert ctx["layers.3.self_attn"].kv_cache[0] is kv_cache[0]
-
-
-def test_bind_kv_cache_non_attention():
-    from vllm.attention import Attention
-
-    # example from Jamba PP=2
-    ctx = {
-        "model.layers.20.attn": Attention(32, 128, 0.1),
-        "model.layers.28.attn": Attention(32, 128, 0.1),
-    }
-    kv_cache = [
-        torch.zeros((1,)),
-        torch.zeros((1,)),
-    ]
-    bind_kv_cache(ctx, [kv_cache])
-    assert ctx["model.layers.20.attn"].kv_cache[0] is kv_cache[0]
-    assert ctx["model.layers.28.attn"].kv_cache[0] is kv_cache[1]
-
-
-def test_bind_kv_cache_pp():
-    with patch("vllm.utils.torch_utils.cuda_device_count_stateless", lambda: 2):
-        # this test runs with 1 GPU, but we simulate 2 GPUs
-        cfg = VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=2))
-    with set_current_vllm_config(cfg):
-        from vllm.attention import Attention
-
-        ctx = {
-            "layers.0.self_attn": Attention(32, 128, 0.1),
-        }
-        kv_cache = [[torch.zeros((1,))], [torch.zeros((1,))]]
-        bind_kv_cache(ctx, kv_cache)
-        assert ctx["layers.0.self_attn"].kv_cache[0] is kv_cache[0][0]
-        assert ctx["layers.0.self_attn"].kv_cache[1] is kv_cache[1][0]
-
-
 def test_model_specification(
     parser_with_config, cli_config_file, cli_config_file_with_model
 ):
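The four bind_kv_cache tests above are deleted from this file as part of the cleanup. For reference, the contract they pinned down: bind_kv_cache walks the per-layer Attention modules in ctx (keyed by qualified layer name), hands each module the kv-cache entry matching the numeric layer index in its name, keeps one entry per pipeline-parallel virtual engine, and lets a shared_kv_cache_layers map alias one layer's cache to another's. Below is a self-contained sketch of those semantics; bind_kv_cache_sketch is a hypothetical stand-in (not vLLM's implementation, which among other things filters on the module type), and SimpleNamespace stands in for vllm.attention.Attention:

import re
from types import SimpleNamespace

import torch


def bind_kv_cache_sketch(ctx, kv_caches, shared_kv_cache_layers=None):
    # Hypothetical re-implementation of the behavior the removed tests assert.
    shared_kv_cache_layers = shared_kv_cache_layers or {}
    # Layers that own a cache slot, ordered by the index embedded in the name.
    owners = sorted(
        (name for name in ctx if name not in shared_kv_cache_layers),
        key=lambda name: int(re.search(r"\.(\d+)\.", name).group(1)),
    )
    slot = {name: i for i, name in enumerate(owners)}
    for name, module in ctx.items():
        target = shared_kv_cache_layers.get(name, name)  # follow aliases
        # One kv_cache entry per pipeline-parallel virtual engine.
        module.kv_cache = [vc[slot[target]] for vc in kv_caches]


ctx = {f"layers.{i}.self_attn": SimpleNamespace() for i in range(4)}
kv_cache = [torch.zeros((1,)) for _ in range(4)]
bind_kv_cache_sketch(ctx, [kv_cache], {"layers.3.self_attn": "layers.0.self_attn"})
assert ctx["layers.1.self_attn"].kv_cache[0] is kv_cache[1]
assert ctx["layers.3.self_attn"].kv_cache[0] is kv_cache[0]  # shared with layer 0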
@@ -14,7 +14,7 @@ from vllm.utils.serial_utils import (

 @pytest.mark.parametrize("endianness", ENDIANNESS)
 @pytest.mark.parametrize("embed_dtype", EMBED_DTYPE_TO_TORCH_DTYPE.keys())
-@torch.inference_mode
+@torch.inference_mode()
 def test_encode_and_decode(embed_dtype: str, endianness: str):
     for i in range(10):
         tensor = torch.rand(2, 3, 5, 7, 11, 13, device="cpu", dtype=torch.float32)
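The one-line change in this hunk replaces the bare @torch.inference_mode decorator with an explicit @torch.inference_mode() call. Recent PyTorch accepts both spellings (bare-decorator support arrived around PyTorch 2.1, if memory serves), but constructing the context manager explicitly is unambiguous and also works on older releases. A quick self-contained illustration of what the decorator provides:

import torch


@torch.inference_mode()  # explicit instance; the portable spelling
def double(x: torch.Tensor) -> torch.Tensor:
    return x * 2


out = double(torch.ones(3, requires_grad=True))
print(out.requires_grad)        # False: autograd does not track inside
print(torch.is_inference(out))  # True: tensor was created in inference mode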