[Misc] Move utils to avoid conflicts with stdlib, and move tests (#27169)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-19 20:20:55 +08:00
parent 7a6c8c3fa1
commit d31f7844f8
52 changed files with 246 additions and 237 deletions
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -60,7 +60,7 @@ from vllm.multimodal.utils import fetch_image
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import BeamSearchParams
 from vllm.transformers_utils.utils import maybe_model_redirect
-from vllm.utils.collections import is_list_of
+from vllm.utils.collection_utils import is_list_of
 from vllm.utils.torch_utils import set_default_torch_num_threads
 logger = init_logger(__name__)
--- a/tests/lora/test_add_lora.py
+++ b/tests/lora/test_add_lora.py
@@ -12,7 +12,7 @@ from vllm.entrypoints.openai.api_server import (
 from vllm.inputs import TextPrompt
 from vllm.lora.request import LoRARequest
 from vllm.sampling_params import SamplingParams
-from vllm.utils.asyncio import merge_async_iterators
+from vllm.utils.async_utils import merge_async_iterators
 MODEL_PATH = "zai-org/chatglm3-6b"
 LORA_RANK = 64
--- a/tests/models/multimodal/generation/test_common.py
+++ b/tests/models/multimodal/generation/test_common.py
@@ -17,7 +17,7 @@ from transformers import (
 )
 from vllm.platforms import current_platform
-from vllm.utils.functools import identity
+from vllm.utils.func_utils import identity
 from ....conftest import (
    IMAGE_ASSETS,
--- a/tests/models/multimodal/generation/vlm_utils/model_utils.py
+++ b/tests/models/multimodal/generation/vlm_utils/model_utils.py
@@ -25,7 +25,7 @@ from transformers import (
 from transformers.video_utils import VideoMetadata
 from vllm.logprobs import SampleLogprobs
-from vllm.utils.collections import is_list_of
+from vllm.utils.collection_utils import is_list_of
 from .....conftest import HfRunner, ImageAsset, ImageTestAssets
 from .types import RunnerOutput
--- a/tests/models/multimodal/processing/test_tensor_schema.py
+++ b/tests/models/multimodal/processing/test_tensor_schema.py
@@ -34,7 +34,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, BatchedTensorInputs
 from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext
 from vllm.multimodal.utils import group_mm_kwargs_by_modality
 from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
-from vllm.utils.collections import is_list_of
+from vllm.utils.collection_utils import is_list_of
 from vllm.utils.torch_utils import set_default_torch_dtype
 from ...registry import _MULTIMODAL_EXAMPLE_MODELS, HF_EXAMPLE_MODELS
--- a/tests/utils_/test_async_utils.py
+++ b/tests/utils_/test_async_utils.py
@@ -5,7 +5,7 @@ from collections.abc import AsyncIterator
 import pytest
-from vllm.utils.asyncio import merge_async_iterators
+from vllm.utils.async_utils import merge_async_iterators
 async def _mock_async_iterator(idx: int):
--- a/tests/utils_/test_collection_utils.py
+++ b/tests/utils_/test_collection_utils.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
-from vllm.utils.collections import swap_dict_values
+from vllm.utils.collection_utils import swap_dict_values
@pytest.mark.parametrize(
--- a/tests/utils_/test_func_utils.py
+++ b/tests/utils_/test_func_utils.py
@@ -4,7 +4,7 @@
 import pytest
-from vllm.utils.functools import deprecate_kwargs, supports_kw
+from vllm.utils.func_utils import deprecate_kwargs, supports_kw
 from ..utils import error_on_warning
--- a/tests/utils_/test_hashing.py
+++ b/tests/utils_/test_hashing.py
@@ -0,0 +1,25 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import hashlib
 import pickle
 import pytest
 from vllm.utils.hashing import sha256
@pytest.mark.parametrize("input", [(), ("abc",), (None,), (None, bool, [1, 2, 3])])
 def test_sha256(input: tuple):
    digest = sha256(input)
    assert digest is not None
    assert isinstance(digest, bytes)
    assert digest != b""
    input_bytes = pickle.dumps(input, protocol=pickle.HIGHEST_PROTOCOL)
    assert digest == hashlib.sha256(input_bytes).digest()
    # hashing again, returns the same value
    assert digest == sha256(input)
    # hashing different input, returns different value
    assert digest != sha256(input + (1,))
--- a/tests/utils_/test_mem_utils.py
+++ b/tests/utils_/test_mem_utils.py
@@ -0,0 +1,63 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import torch
 from vllm_test_utils.monitor import monitor
 from vllm.utils.mem_utils import MemorySnapshot, memory_profiling
 from ..utils import create_new_process_for_each_test
@create_new_process_for_each_test()
 def test_memory_profiling():
    # Fake out some model loading + inference memory usage to test profiling
    # Memory used by other processes will show up as cuda usage outside of torch
    from vllm.distributed.device_communicators.cuda_wrapper import CudaRTLibrary
    lib = CudaRTLibrary()
    # 512 MiB allocation outside of this instance
    handle1 = lib.cudaMalloc(512 * 1024 * 1024)
    baseline_snapshot = MemorySnapshot()
    # load weights
    weights = torch.randn(128, 1024, 1024, device="cuda", dtype=torch.float32)
    weights_memory = 128 * 1024 * 1024 * 4  # 512 MiB
    def measure_current_non_torch():
        free, total = torch.cuda.mem_get_info()
        current_used = total - free
        current_torch = torch.cuda.memory_reserved()
        current_non_torch = current_used - current_torch
        return current_non_torch
    with (
        memory_profiling(
            baseline_snapshot=baseline_snapshot, weights_memory=weights_memory
        ) as result,
        monitor(measure_current_non_torch) as monitored_values,
    ):
        # make a memory spike, 1 GiB
        spike = torch.randn(256, 1024, 1024, device="cuda", dtype=torch.float32)
        del spike
        # Add some extra non-torch memory 256 MiB (simulate NCCL)
        handle2 = lib.cudaMalloc(256 * 1024 * 1024)
    # this is an analytic value, it is exact,
    # we only have 256 MiB non-torch memory increase
    measured_diff = monitored_values.values[-1] - monitored_values.values[0]
    assert measured_diff == 256 * 1024 * 1024
    # Check that the memory usage is within 5% of the expected values
    # 5% tolerance is caused by cuda runtime.
    # we cannot control cuda runtime in the granularity of bytes,
    # which causes a small error (<10 MiB in practice)
    non_torch_ratio = result.non_torch_increase / (256 * 1024 * 1024)  # noqa
    assert abs(non_torch_ratio - 1) <= 0.05
    assert result.torch_peak_increase == 1024 * 1024 * 1024
    del weights
    lib.cudaFree(handle1)
    lib.cudaFree(handle2)
--- a/tests/utils_/test_torch_utils.py
+++ b/tests/utils_/test_torch_utils.py
@@ -0,0 +1,104 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 import torch
 from vllm.utils.torch_utils import (
    common_broadcastable_dtype,
    current_stream,
    is_lossless_cast,
 )
@pytest.mark.parametrize(
    ("src_dtype", "tgt_dtype", "expected_result"),
    [
        # Different precision_levels
        (torch.bool, torch.int8, True),
        (torch.bool, torch.float16, True),
        (torch.bool, torch.complex32, True),
        (torch.int64, torch.bool, False),
        (torch.int64, torch.float16, True),
        (torch.int64, torch.complex32, True),
        (torch.float64, torch.bool, False),
        (torch.float64, torch.int8, False),
        (torch.float64, torch.complex32, True),
        (torch.complex128, torch.bool, False),
        (torch.complex128, torch.int8, False),
        (torch.complex128, torch.float16, False),
        # precision_level=0
        (torch.bool, torch.bool, True),
        # precision_level=1
        (torch.int8, torch.int16, True),
        (torch.int16, torch.int8, False),
        (torch.uint8, torch.int8, False),
        (torch.int8, torch.uint8, False),
        # precision_level=2
        (torch.float16, torch.float32, True),
        (torch.float32, torch.float16, False),
        (torch.bfloat16, torch.float32, True),
        (torch.float32, torch.bfloat16, False),
        # precision_level=3
        (torch.complex32, torch.complex64, True),
        (torch.complex64, torch.complex32, False),
    ],
 )
 def test_is_lossless_cast(src_dtype, tgt_dtype, expected_result):
    assert is_lossless_cast(src_dtype, tgt_dtype) == expected_result
@pytest.mark.parametrize(
    ("dtypes", "expected_result"),
    [
        ([torch.bool], torch.bool),
        ([torch.bool, torch.int8], torch.int8),
        ([torch.bool, torch.int8, torch.float16], torch.float16),
        ([torch.bool, torch.int8, torch.float16, torch.complex32], torch.complex32),  # noqa: E501
    ],
 )
 def test_common_broadcastable_dtype(dtypes, expected_result):
    assert common_broadcastable_dtype(dtypes) == expected_result
 def test_current_stream_multithread():
    import threading
    if not torch.cuda.is_available():
        pytest.skip("CUDA not available")
    main_default_stream = torch.cuda.current_stream()
    child_stream = torch.cuda.Stream()
    thread_stream_ready = threading.Event()
    thread_can_exit = threading.Event()
    def child_thread_func():
        with torch.cuda.stream(child_stream):
            thread_stream_ready.set()
            thread_can_exit.wait(timeout=10)
    child_thread = threading.Thread(target=child_thread_func)
    child_thread.start()
    try:
        assert thread_stream_ready.wait(timeout=5), (
            "Child thread failed to enter stream context in time"
        )
        main_current_stream = current_stream()
        assert main_current_stream != child_stream, (
            "Main thread's current_stream was contaminated by child thread"
        )
        assert main_current_stream == main_default_stream, (
            "Main thread's current_stream is not the default stream"
        )
        # Notify child thread it can exit
        thread_can_exit.set()
    finally:
        # Ensure child thread exits properly
        child_thread.join(timeout=5)
        if child_thread.is_alive():
            pytest.fail("Child thread failed to exit properly")
--- a/tests/utils_/test_utils.py
+++ b/tests/utils_/test_utils.py
@@ -2,10 +2,8 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # ruff: noqa
 import hashlib
 import json
 import os
 import pickle
 import tempfile
 from pathlib import Path
 from unittest.mock import patch
@@ -14,7 +12,6 @@ import pytest
 import torch
 import yaml
 from transformers import AutoTokenizer
 from vllm_test_utils.monitor import monitor
 from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
 from vllm.transformers_utils.detokenizer_utils import convert_ids_list_to_tokens
@@ -24,13 +21,6 @@ from vllm.utils import (
    bind_kv_cache,
    unique_filepath,
 )
 from vllm.utils.hashing import sha256
 from vllm.utils.torch_utils import (
    common_broadcastable_dtype,
    current_stream,
    is_lossless_cast,
 )
 from vllm.utils.mem_utils import MemorySnapshot, memory_profiling
 from ..utils import create_new_process_for_each_test, flat_product
@@ -267,61 +257,6 @@ def test_duplicate_dict_args(caplog_vllm, parser):
    assert "-O.mode" in caplog_vllm.text
@create_new_process_for_each_test()
 def test_memory_profiling():
    # Fake out some model loading + inference memory usage to test profiling
    # Memory used by other processes will show up as cuda usage outside of torch
    from vllm.distributed.device_communicators.cuda_wrapper import CudaRTLibrary
    lib = CudaRTLibrary()
    # 512 MiB allocation outside of this instance
    handle1 = lib.cudaMalloc(512 * 1024 * 1024)
    baseline_snapshot = MemorySnapshot()
    # load weights
    weights = torch.randn(128, 1024, 1024, device="cuda", dtype=torch.float32)
    weights_memory = 128 * 1024 * 1024 * 4  # 512 MiB
    def measure_current_non_torch():
        free, total = torch.cuda.mem_get_info()
        current_used = total - free
        current_torch = torch.cuda.memory_reserved()
        current_non_torch = current_used - current_torch
        return current_non_torch
    with (
        memory_profiling(
            baseline_snapshot=baseline_snapshot, weights_memory=weights_memory
        ) as result,
        monitor(measure_current_non_torch) as monitored_values,
    ):
        # make a memory spike, 1 GiB
        spike = torch.randn(256, 1024, 1024, device="cuda", dtype=torch.float32)
        del spike
        # Add some extra non-torch memory 256 MiB (simulate NCCL)
        handle2 = lib.cudaMalloc(256 * 1024 * 1024)
    # this is an analytic value, it is exact,
    # we only have 256 MiB non-torch memory increase
    measured_diff = monitored_values.values[-1] - monitored_values.values[0]
    assert measured_diff == 256 * 1024 * 1024
    # Check that the memory usage is within 5% of the expected values
    # 5% tolerance is caused by cuda runtime.
    # we cannot control cuda runtime in the granularity of bytes,
    # which causes a small error (<10 MiB in practice)
    non_torch_ratio = result.non_torch_increase / (256 * 1024 * 1024)  # noqa
    assert abs(non_torch_ratio - 1) <= 0.05
    assert result.torch_peak_increase == 1024 * 1024 * 1024
    del weights
    lib.cudaFree(handle1)
    lib.cudaFree(handle2)
 def test_bind_kv_cache():
    from vllm.attention import Attention
@@ -403,56 +338,6 @@ def test_bind_kv_cache_pp():
        assert ctx["layers.0.self_attn"].kv_cache[1] is kv_cache[1][0]
@pytest.mark.parametrize(
    ("src_dtype", "tgt_dtype", "expected_result"),
    [
        # Different precision_levels
        (torch.bool, torch.int8, True),
        (torch.bool, torch.float16, True),
        (torch.bool, torch.complex32, True),
        (torch.int64, torch.bool, False),
        (torch.int64, torch.float16, True),
        (torch.int64, torch.complex32, True),
        (torch.float64, torch.bool, False),
        (torch.float64, torch.int8, False),
        (torch.float64, torch.complex32, True),
        (torch.complex128, torch.bool, False),
        (torch.complex128, torch.int8, False),
        (torch.complex128, torch.float16, False),
        # precision_level=0
        (torch.bool, torch.bool, True),
        # precision_level=1
        (torch.int8, torch.int16, True),
        (torch.int16, torch.int8, False),
        (torch.uint8, torch.int8, False),
        (torch.int8, torch.uint8, False),
        # precision_level=2
        (torch.float16, torch.float32, True),
        (torch.float32, torch.float16, False),
        (torch.bfloat16, torch.float32, True),
        (torch.float32, torch.bfloat16, False),
        # precision_level=3
        (torch.complex32, torch.complex64, True),
        (torch.complex64, torch.complex32, False),
    ],
 )
 def test_is_lossless_cast(src_dtype, tgt_dtype, expected_result):
    assert is_lossless_cast(src_dtype, tgt_dtype) == expected_result
@pytest.mark.parametrize(
    ("dtypes", "expected_result"),
    [
        ([torch.bool], torch.bool),
        ([torch.bool, torch.int8], torch.int8),
        ([torch.bool, torch.int8, torch.float16], torch.float16),
        ([torch.bool, torch.int8, torch.float16, torch.complex32], torch.complex32),  # noqa: E501
    ],
 )
 def test_common_broadcastable_dtype(dtypes, expected_result):
    assert common_broadcastable_dtype(dtypes) == expected_result
 def test_model_specification(
    parser_with_config, cli_config_file, cli_config_file_with_model
 ):
@@ -535,23 +420,6 @@ def test_model_specification(
    assert args.port == 12312
@pytest.mark.parametrize("input", [(), ("abc",), (None,), (None, bool, [1, 2, 3])])
 def test_sha256(input: tuple):
    digest = sha256(input)
    assert digest is not None
    assert isinstance(digest, bytes)
    assert digest != b""
    input_bytes = pickle.dumps(input, protocol=pickle.HIGHEST_PROTOCOL)
    assert digest == hashlib.sha256(input_bytes).digest()
    # hashing again, returns the same value
    assert digest == sha256(input)
    # hashing different input, returns different value
    assert digest != sha256(input + (1,))
 def test_convert_ids_list_to_tokens():
    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")
    token_ids = tokenizer.encode("Hello, world!")
@@ -561,50 +429,6 @@ def test_convert_ids_list_to_tokens():
    assert tokens == ["Hello", ",", " world", "!"]
 def test_current_stream_multithread():
    import threading
    if not torch.cuda.is_available():
        pytest.skip("CUDA not available")
    main_default_stream = torch.cuda.current_stream()
    child_stream = torch.cuda.Stream()
    thread_stream_ready = threading.Event()
    thread_can_exit = threading.Event()
    def child_thread_func():
        with torch.cuda.stream(child_stream):
            thread_stream_ready.set()
            thread_can_exit.wait(timeout=10)
    child_thread = threading.Thread(target=child_thread_func)
    child_thread.start()
    try:
        assert thread_stream_ready.wait(timeout=5), (
            "Child thread failed to enter stream context in time"
        )
        main_current_stream = current_stream()
        assert main_current_stream != child_stream, (
            "Main thread's current_stream was contaminated by child thread"
        )
        assert main_current_stream == main_default_stream, (
            "Main thread's current_stream is not the default stream"
        )
        # Notify child thread it can exit
        thread_can_exit.set()
    finally:
        # Ensure child thread exits properly
        child_thread.join(timeout=5)
        if child_thread.is_alive():
            pytest.fail("Child thread failed to exit properly")
 def test_load_config_file(tmp_path):
    # Define the configuration data
    config_data = {
--- a/tools/pre_commit/check_pickle_imports.py
+++ b/tools/pre_commit/check_pickle_imports.py
@@ -23,13 +23,14 @@ ALLOWED_FILES = {
    "vllm/transformers_utils/config.py",
    "vllm/model_executor/models/registry.py",
    "vllm/compilation/caching.py",
    "tests/utils_/test_utils.py",
    "tests/tokenization/test_cached_tokenizer.py",
    "vllm/distributed/utils.py",
    "vllm/distributed/parallel_state.py",
    "vllm/distributed/device_communicators/all_reduce_utils.py",
    "vllm/distributed/device_communicators/shm_broadcast.py",
    "vllm/distributed/device_communicators/shm_object_storage.py",
    "vllm/utils/hashing.py",
    "tests/utils_/test_hashing.py",
    "tests/tokenization/test_cached_tokenizer.py",
    "benchmarks/kernels/graph_machete_bench.py",
    "benchmarks/kernels/benchmark_lora.py",
    "benchmarks/kernels/benchmark_machete.py",
@@ -40,10 +41,8 @@ ALLOWED_FILES = {
    "vllm/executor/mp_distributed_executor.py",
    "vllm/executor/ray_distributed_executor.py",
    "vllm/entrypoints/llm.py",
    "tests/utils.py",
    # pickle and cloudpickle
    "vllm/utils/__init__.py",
-    "vllm/utils/hashing.py",
+    "tests/utils.py",
 }
 PICKLE_RE = re.compile(
--- a/vllm/benchmarks/throughput.py
+++ b/vllm/benchmarks/throughput.py
@@ -34,7 +34,7 @@ from vllm.inputs import TextPrompt, TokensPrompt
 from vllm.lora.request import LoRARequest
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import BeamSearchParams
-from vllm.utils.asyncio import merge_async_iterators
+from vllm.utils.async_utils import merge_async_iterators
 def run_vllm(
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -51,7 +51,7 @@ from vllm.transformers_utils.chat_templates import get_chat_template_fallback_pa
 from vllm.transformers_utils.processor import cached_get_processor
 from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
 from vllm.utils import random_uuid
-from vllm.utils.functools import supports_kw
+from vllm.utils.func_utils import supports_kw
 logger = init_logger(__name__)
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -76,7 +76,7 @@ from vllm.transformers_utils.tokenizer import (
 )
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils import Counter, Device
-from vllm.utils.collections import as_iter, is_list_of
+from vllm.utils.collection_utils import as_iter, is_list_of
 from vllm.v1.engine import EngineCoreRequest
 from vllm.v1.engine.llm_engine import LLMEngine
 from vllm.v1.sample.logits_processor import LogitsProcessor
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -70,7 +70,7 @@ from vllm.transformers_utils.tokenizers import (
    truncate_tool_call_ids,
    validate_request_params,
 )
-from vllm.utils.collections import as_list
+from vllm.utils.collection_utils import as_list
 logger = init_logger(__name__)
--- a/vllm/entrypoints/openai/serving_completion.py
+++ b/vllm/entrypoints/openai/serving_completion.py
@@ -34,8 +34,8 @@ from vllm.logprobs import Logprob
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import BeamSearchParams, SamplingParams
 from vllm.transformers_utils.tokenizer import AnyTokenizer
-from vllm.utils.asyncio import merge_async_iterators
+from vllm.utils.async_utils import merge_async_iterators
-from vllm.utils.collections import as_list
+from vllm.utils.collection_utils import as_list
 logger = init_logger(__name__)
--- a/vllm/entrypoints/openai/serving_embedding.py
+++ b/vllm/entrypoints/openai/serving_embedding.py
@@ -39,8 +39,8 @@ from vllm.outputs import (
    RequestOutput,
 )
 from vllm.pooling_params import PoolingParams
-from vllm.utils.asyncio import merge_async_iterators
+from vllm.utils.async_utils import merge_async_iterators
-from vllm.utils.collections import chunk_list
+from vllm.utils.collection_utils import chunk_list
 logger = init_logger(__name__)
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -91,13 +91,13 @@ from vllm.tracing import (
 )
 from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
 from vllm.utils import random_uuid
-from vllm.utils.asyncio import (
+from vllm.utils.async_utils import (
    AsyncMicrobatchTokenizer,
    collect_from_async_generator,
    make_async,
    merge_async_iterators,
 )
-from vllm.utils.collections import is_list_of
+from vllm.utils.collection_utils import is_list_of
 from vllm.v1.engine import EngineCoreRequest
 logger = init_logger(__name__)
--- a/vllm/entrypoints/openai/serving_pooling.py
+++ b/vllm/entrypoints/openai/serving_pooling.py
@@ -36,7 +36,7 @@ from vllm.entrypoints.utils import _validate_truncation_size
 from vllm.logger import init_logger
 from vllm.outputs import PoolingOutput, PoolingRequestOutput
 from vllm.tasks import SupportedTask
-from vllm.utils.asyncio import merge_async_iterators
+from vllm.utils.async_utils import merge_async_iterators
 logger = init_logger(__name__)
--- a/vllm/entrypoints/openai/serving_score.py
+++ b/vllm/entrypoints/openai/serving_score.py
@@ -37,7 +37,7 @@ from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.outputs import PoolingRequestOutput, ScoringRequestOutput
 from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
-from vllm.utils.asyncio import make_async, merge_async_iterators
+from vllm.utils.async_utils import make_async, merge_async_iterators
 logger = init_logger(__name__)
--- a/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py
@@ -12,7 +12,7 @@ from vllm.entrypoints.openai.protocol import (
 )
 from vllm.logger import init_logger
 from vllm.transformers_utils.tokenizer import AnyTokenizer
-from vllm.utils.collections import is_list_of
+from vllm.utils.collection_utils import is_list_of
 from vllm.utils.import_utils import import_from_path
 logger = init_logger(__name__)
--- a/vllm/entrypoints/renderer.py
+++ b/vllm/entrypoints/renderer.py
@@ -17,7 +17,7 @@ from vllm.inputs.data import TextPrompt as EngineTextPrompt
 from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
 from vllm.inputs.parse import get_prompt_components, parse_raw_prompts
 from vllm.transformers_utils.tokenizer import AnyTokenizer
-from vllm.utils.asyncio import AsyncMicrobatchTokenizer
+from vllm.utils.async_utils import AsyncMicrobatchTokenizer
@dataclass(frozen=True)
--- a/vllm/executor/executor_base.py
+++ b/vllm/executor/executor_base.py
@@ -17,7 +17,7 @@ from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.sequence import ExecuteModelRequest
 from vllm.tasks import SupportedTask
-from vllm.utils.asyncio import make_async
+from vllm.utils.async_utils import make_async
 from vllm.v1.outputs import SamplerOutput
 from vllm.v1.worker.worker_base import WorkerBase
--- a/vllm/executor/ray_distributed_executor.py
+++ b/vllm/executor/ray_distributed_executor.py
@@ -19,7 +19,7 @@ from vllm.logger import init_logger
 from vllm.platforms import current_platform
 from vllm.ray.ray_env import get_env_vars_to_copy
 from vllm.sequence import ExecuteModelRequest
-from vllm.utils.asyncio import make_async
+from vllm.utils.async_utils import make_async
 from vllm.utils.network_utils import (
    get_distributed_init_method,
    get_ip,
--- a/vllm/inputs/parse.py
+++ b/vllm/inputs/parse.py
@@ -5,7 +5,7 @@ from typing import TYPE_CHECKING, Literal, NamedTuple, TypeAlias, TypedDict, cas
 from typing_extensions import TypeIs
-from vllm.utils.collections import is_list_of
+from vllm.utils.collection_utils import is_list_of
 from .data import (
    EmbedsPrompt,
--- a/vllm/model_executor/layers/activation.py
+++ b/vllm/model_executor/layers/activation.py
@@ -17,7 +17,7 @@ from vllm.logger import init_logger
 from vllm.model_executor.custom_op import CustomOp
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform
-from vllm.utils.collections import LazyDict
+from vllm.utils.collection_utils import LazyDict
 logger = init_logger(__name__)
--- a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py
+++ b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py
@@ -31,7 +31,7 @@ from vllm.utils.deep_gemm import (
    get_mk_alignment_for_contiguous_layout,
    m_grouped_fp8_gemm_nt_contiguous,
 )
-from vllm.utils.functools import run_once
+from vllm.utils.func_utils import run_once
 logger = init_logger(__name__)
--- a/vllm/model_executor/layers/quantization/gptq.py
+++ b/vllm/model_executor/layers/quantization/gptq.py
@@ -28,7 +28,7 @@ from vllm.model_executor.parameter import (
    RowvLLMParameter,
 )
 from vllm.transformers_utils.config import get_safetensors_params_metadata
-from vllm.utils.collections import is_list_of
+from vllm.utils.collection_utils import is_list_of
 if TYPE_CHECKING:
    from vllm.model_executor.layers.quantization import QuantizationMethods
--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -57,7 +57,7 @@ from vllm.model_executor.parameter import (
 from vllm.platforms import current_platform
 from vllm.scalar_type import scalar_types
 from vllm.transformers_utils.config import get_safetensors_params_metadata
-from vllm.utils.collections import is_list_of
+from vllm.utils.collection_utils import is_list_of
 logger = init_logger(__name__)
--- a/vllm/model_executor/models/deepseek_vl2.py
+++ b/vllm/model_executor/models/deepseek_vl2.py
@@ -48,7 +48,7 @@ from vllm.transformers_utils.configs.deepseek_vl2 import (
 )
 from vllm.transformers_utils.processors.deepseek_vl2 import DeepseekVLV2Processor
 from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
-from vllm.utils.collections import is_list_of
+from vllm.utils.collection_utils import is_list_of
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 from vllm.utils.torch_utils import set_default_torch_dtype
--- a/vllm/model_executor/models/interfaces.py
+++ b/vllm/model_executor/models/interfaces.py
@@ -24,7 +24,7 @@ from vllm.inputs import TokensPrompt
 from vllm.inputs.data import PromptType
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.utils.functools import supports_kw
+from vllm.utils.func_utils import supports_kw
 from .interfaces_base import VllmModel, is_pooling_model
--- a/vllm/model_executor/models/interfaces_base.py
+++ b/vllm/model_executor/models/interfaces_base.py
@@ -15,7 +15,7 @@ import torch.nn as nn
 from typing_extensions import TypeIs, TypeVar
 from vllm.logger import init_logger
-from vllm.utils.functools import supports_kw
+from vllm.utils.func_utils import supports_kw
 if TYPE_CHECKING:
    from vllm.config import VllmConfig
--- a/vllm/model_executor/models/llava_next_video.py
+++ b/vllm/model_executor/models/llava_next_video.py
@@ -33,7 +33,7 @@ from vllm.multimodal.processing import (
 )
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
-from vllm.utils.collections import is_list_of
+from vllm.utils.collection_utils import is_list_of
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
--- a/vllm/model_executor/models/minicpmv.py
+++ b/vllm/model_executor/models/minicpmv.py
@@ -85,7 +85,7 @@ from vllm.multimodal.processing import (
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors
-from vllm.utils.collections import flatten_2d_lists
+from vllm.utils.collection_utils import flatten_2d_lists
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 from vllm.utils.torch_utils import set_default_torch_dtype
--- a/vllm/model_executor/models/qwen3_vl.py
+++ b/vllm/model_executor/models/qwen3_vl.py
@@ -79,7 +79,7 @@ from vllm.multimodal.processing import (
 )
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
-from vllm.utils.collections import is_list_of
+from vllm.utils.collection_utils import is_list_of
 from .interfaces import (
    MultiModalEmbeddings,
--- a/vllm/multimodal/inputs.py
+++ b/vllm/multimodal/inputs.py
@@ -22,7 +22,7 @@ from typing import (
 import numpy as np
 from typing_extensions import NotRequired, TypeVar, deprecated
-from vllm.utils.collections import full_groupby, is_list_of
+from vllm.utils.collection_utils import full_groupby, is_list_of
 from vllm.utils.import_utils import LazyLoader
 from vllm.utils.jsontree import json_map_leaves
--- a/vllm/multimodal/parse.py
+++ b/vllm/multimodal/parse.py
@@ -19,7 +19,7 @@ import numpy as np
 import torch
 from typing_extensions import assert_never
-from vllm.utils.collections import is_list_of
+from vllm.utils.collection_utils import is_list_of
 from vllm.utils.import_utils import LazyLoader
 from .audio import AudioResampler
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -25,8 +25,8 @@ from typing_extensions import TypeVar, assert_never
 from vllm.logger import init_logger
 from vllm.transformers_utils.processor import cached_processor_from_config
 from vllm.transformers_utils.tokenizer import AnyTokenizer, decode_tokens, encode_tokens
-from vllm.utils.collections import flatten_2d_lists, full_groupby
+from vllm.utils.collection_utils import flatten_2d_lists, full_groupby
-from vllm.utils.functools import get_allowed_kwarg_only_overrides
+from vllm.utils.func_utils import get_allowed_kwarg_only_overrides
 from vllm.utils.jsontree import JSONTree, json_map_leaves
 from .hasher import MultiModalHasher
@@ -486,7 +486,7 @@ _M = TypeVar("_M", bound=_HasModalityAttr | _HasModalityProp)
 def full_groupby_modality(values: Iterable[_M]) -> ItemsView[str, list[_M]]:
    """
    Convenience function to apply
-    [`full_groupby`][vllm.utils.collections.full_groupby]
+    [`full_groupby`][vllm.utils.collection_utils.full_groupby]
    based on modality.
    """
    return full_groupby(values, key=lambda x: x.modality)
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -9,7 +9,7 @@ import torch.nn as nn
 from vllm.config.multimodal import BaseDummyOptions
 from vllm.logger import init_logger
 from vllm.transformers_utils.tokenizer import AnyTokenizer, cached_tokenizer_from_config
-from vllm.utils.collections import ClassRegistry
+from vllm.utils.collection_utils import ClassRegistry
 from .cache import BaseMultiModalProcessorCache
 from .processing import (
--- a/vllm/reasoning/abs_reasoning_parsers.py
+++ b/vllm/reasoning/abs_reasoning_parsers.py
@@ -9,7 +9,7 @@ from typing import TYPE_CHECKING, Any
 from vllm.entrypoints.tool_server import ToolServer
 from vllm.logger import init_logger
-from vllm.utils.collections import is_list_of
+from vllm.utils.collection_utils import is_list_of
 from vllm.utils.import_utils import import_from_path
 if TYPE_CHECKING:
--- a/vllm/tracing.py
+++ b/vllm/tracing.py
@@ -5,7 +5,7 @@ import os
 from collections.abc import Mapping
 from vllm.logger import init_logger
-from vllm.utils.functools import run_once
+from vllm.utils.func_utils import run_once
 TRACE_HEADERS = ["traceparent", "tracestate"]
--- a/vllm/transformers_utils/processor.py
+++ b/vllm/transformers_utils/processor.py
@@ -16,7 +16,7 @@ from transformers.processing_utils import ProcessorMixin
 from transformers.video_processing_utils import BaseVideoProcessor
 from typing_extensions import TypeVar
-from vllm.utils.functools import get_allowed_kwarg_only_overrides
+from vllm.utils.func_utils import get_allowed_kwarg_only_overrides
 if TYPE_CHECKING:
    from vllm.config import ModelConfig
--- a/vllm/utils/__init__.py
+++ b/vllm/utils/__init__.py
@@ -1122,9 +1122,6 @@ def warn_for_unimplemented_methods(cls: type[T]) -> type[T]:
    return cls
 ## moved to vllm.utils.profiling (imported at module top)
 # Only relevant for models using ALiBi (e.g, MPT)
 def check_use_alibi(model_config: ModelConfig) -> bool:
    cfg = model_config.hf_text_config
@@ -1150,9 +1147,6 @@ def check_use_alibi(model_config: ModelConfig) -> bool:
    )
 ## moved to vllm.utils.hashing
@cache
 def _has_module(module_name: str) -> bool:
    """Return True if *module_name* can be found in the current environment.
--- a/vllm/utils/async_utils.py
+++ b/vllm/utils/async_utils.py
--- a/vllm/utils/collection_utils.py
+++ b/vllm/utils/collection_utils.py
--- a/vllm/utils/func_utils.py
+++ b/vllm/utils/func_utils.py
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -30,9 +30,9 @@ from vllm.transformers_utils.config import maybe_register_config_serialize_by_va
 from vllm.transformers_utils.tokenizer import AnyTokenizer, init_tokenizer_from_configs
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils import Device, cdiv
-from vllm.utils.asyncio import cancel_task_threadsafe
+from vllm.utils.async_utils import cancel_task_threadsafe
-from vllm.utils.collections import as_list
+from vllm.utils.collection_utils import as_list
-from vllm.utils.functools import deprecate_kwargs
+from vllm.utils.func_utils import deprecate_kwargs
 from vllm.v1.engine import EngineCoreRequest
 from vllm.v1.engine.core_client import EngineCoreClient
 from vllm.v1.engine.exceptions import EngineDeadError, EngineGenerateError
--- a/vllm/v1/engine/core_client.py
+++ b/vllm/v1/engine/core_client.py
@@ -23,7 +23,7 @@ from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.tasks import SupportedTask
-from vllm.utils.asyncio import in_loop
+from vllm.utils.async_utils import in_loop
 from vllm.utils.network_utils import (
    close_sockets,
    get_open_port,
--- a/vllm/v1/worker/gpu_input_batch.py
+++ b/vllm/v1/worker/gpu_input_batch.py
@@ -13,7 +13,7 @@ from vllm.multimodal.inputs import MultiModalFeatureSpec
 from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import SamplingParams, SamplingType
 from vllm.utils import length_from_prompt_token_ids_or_embeds
-from vllm.utils.collections import swap_dict_values
+from vllm.utils.collection_utils import swap_dict_values
 from vllm.v1.outputs import LogprobsTensors
 from vllm.v1.pool.metadata import PoolingMetadata
 from vllm.v1.sample.logits_processor import (
--- a/vllm/v1/worker/tpu_input_batch.py
+++ b/vllm/v1/worker/tpu_input_batch.py
@@ -10,7 +10,7 @@ import torch
 from vllm.lora.request import LoRARequest
 from vllm.sampling_params import SamplingType
 from vllm.utils import length_from_prompt_token_ids_or_embeds
-from vllm.utils.collections import swap_dict_values
+from vllm.utils.collection_utils import swap_dict_values
 from vllm.v1.outputs import LogprobsTensors
 from vllm.v1.worker.block_table import MultiGroupBlockTable
 from vllm.v1.worker.gpu_input_batch import CachedRequestState