[Misc] Move utils to avoid conflicts with stdlib, and move tests (#27169)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
@@ -60,7 +60,7 @@ from vllm.multimodal.utils import fetch_image
|
|||||||
from vllm.outputs import RequestOutput
|
from vllm.outputs import RequestOutput
|
||||||
from vllm.sampling_params import BeamSearchParams
|
from vllm.sampling_params import BeamSearchParams
|
||||||
from vllm.transformers_utils.utils import maybe_model_redirect
|
from vllm.transformers_utils.utils import maybe_model_redirect
|
||||||
from vllm.utils.collections import is_list_of
|
from vllm.utils.collection_utils import is_list_of
|
||||||
from vllm.utils.torch_utils import set_default_torch_num_threads
|
from vllm.utils.torch_utils import set_default_torch_num_threads
|
||||||
|
|
||||||
logger = init_logger(__name__)
|
logger = init_logger(__name__)
|
||||||
|
|||||||
@@ -12,7 +12,7 @@ from vllm.entrypoints.openai.api_server import (
|
|||||||
from vllm.inputs import TextPrompt
|
from vllm.inputs import TextPrompt
|
||||||
from vllm.lora.request import LoRARequest
|
from vllm.lora.request import LoRARequest
|
||||||
from vllm.sampling_params import SamplingParams
|
from vllm.sampling_params import SamplingParams
|
||||||
from vllm.utils.asyncio import merge_async_iterators
|
from vllm.utils.async_utils import merge_async_iterators
|
||||||
|
|
||||||
MODEL_PATH = "zai-org/chatglm3-6b"
|
MODEL_PATH = "zai-org/chatglm3-6b"
|
||||||
LORA_RANK = 64
|
LORA_RANK = 64
|
||||||
|
|||||||
@@ -17,7 +17,7 @@ from transformers import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
from vllm.utils.functools import identity
|
from vllm.utils.func_utils import identity
|
||||||
|
|
||||||
from ....conftest import (
|
from ....conftest import (
|
||||||
IMAGE_ASSETS,
|
IMAGE_ASSETS,
|
||||||
|
|||||||
@@ -25,7 +25,7 @@ from transformers import (
|
|||||||
from transformers.video_utils import VideoMetadata
|
from transformers.video_utils import VideoMetadata
|
||||||
|
|
||||||
from vllm.logprobs import SampleLogprobs
|
from vllm.logprobs import SampleLogprobs
|
||||||
from vllm.utils.collections import is_list_of
|
from vllm.utils.collection_utils import is_list_of
|
||||||
|
|
||||||
from .....conftest import HfRunner, ImageAsset, ImageTestAssets
|
from .....conftest import HfRunner, ImageAsset, ImageTestAssets
|
||||||
from .types import RunnerOutput
|
from .types import RunnerOutput
|
||||||
|
|||||||
@@ -34,7 +34,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, BatchedTensorInputs
|
|||||||
from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext
|
from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext
|
||||||
from vllm.multimodal.utils import group_mm_kwargs_by_modality
|
from vllm.multimodal.utils import group_mm_kwargs_by_modality
|
||||||
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
|
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
|
||||||
from vllm.utils.collections import is_list_of
|
from vllm.utils.collection_utils import is_list_of
|
||||||
from vllm.utils.torch_utils import set_default_torch_dtype
|
from vllm.utils.torch_utils import set_default_torch_dtype
|
||||||
|
|
||||||
from ...registry import _MULTIMODAL_EXAMPLE_MODELS, HF_EXAMPLE_MODELS
|
from ...registry import _MULTIMODAL_EXAMPLE_MODELS, HF_EXAMPLE_MODELS
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ from collections.abc import AsyncIterator
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from vllm.utils.asyncio import merge_async_iterators
|
from vllm.utils.async_utils import merge_async_iterators
|
||||||
|
|
||||||
|
|
||||||
async def _mock_async_iterator(idx: int):
|
async def _mock_async_iterator(idx: int):
|
||||||
|
|||||||
@@ -2,7 +2,7 @@
|
|||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from vllm.utils.collections import swap_dict_values
|
from vllm.utils.collection_utils import swap_dict_values
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
@@ -4,7 +4,7 @@
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from vllm.utils.functools import deprecate_kwargs, supports_kw
|
from vllm.utils.func_utils import deprecate_kwargs, supports_kw
|
||||||
|
|
||||||
from ..utils import error_on_warning
|
from ..utils import error_on_warning
|
||||||
|
|
||||||
|
|||||||
25
tests/utils_/test_hashing.py
Normal file
25
tests/utils_/test_hashing.py
Normal file
@@ -0,0 +1,25 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
import hashlib
|
||||||
|
import pickle
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from vllm.utils.hashing import sha256
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("input", [(), ("abc",), (None,), (None, bool, [1, 2, 3])])
|
||||||
|
def test_sha256(input: tuple):
|
||||||
|
digest = sha256(input)
|
||||||
|
assert digest is not None
|
||||||
|
assert isinstance(digest, bytes)
|
||||||
|
assert digest != b""
|
||||||
|
|
||||||
|
input_bytes = pickle.dumps(input, protocol=pickle.HIGHEST_PROTOCOL)
|
||||||
|
assert digest == hashlib.sha256(input_bytes).digest()
|
||||||
|
|
||||||
|
# hashing again, returns the same value
|
||||||
|
assert digest == sha256(input)
|
||||||
|
|
||||||
|
# hashing different input, returns different value
|
||||||
|
assert digest != sha256(input + (1,))
|
||||||
63
tests/utils_/test_mem_utils.py
Normal file
63
tests/utils_/test_mem_utils.py
Normal file
@@ -0,0 +1,63 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
import torch
|
||||||
|
from vllm_test_utils.monitor import monitor
|
||||||
|
|
||||||
|
from vllm.utils.mem_utils import MemorySnapshot, memory_profiling
|
||||||
|
|
||||||
|
from ..utils import create_new_process_for_each_test
|
||||||
|
|
||||||
|
|
||||||
|
@create_new_process_for_each_test()
|
||||||
|
def test_memory_profiling():
|
||||||
|
# Fake out some model loading + inference memory usage to test profiling
|
||||||
|
# Memory used by other processes will show up as cuda usage outside of torch
|
||||||
|
from vllm.distributed.device_communicators.cuda_wrapper import CudaRTLibrary
|
||||||
|
|
||||||
|
lib = CudaRTLibrary()
|
||||||
|
# 512 MiB allocation outside of this instance
|
||||||
|
handle1 = lib.cudaMalloc(512 * 1024 * 1024)
|
||||||
|
|
||||||
|
baseline_snapshot = MemorySnapshot()
|
||||||
|
|
||||||
|
# load weights
|
||||||
|
|
||||||
|
weights = torch.randn(128, 1024, 1024, device="cuda", dtype=torch.float32)
|
||||||
|
|
||||||
|
weights_memory = 128 * 1024 * 1024 * 4 # 512 MiB
|
||||||
|
|
||||||
|
def measure_current_non_torch():
|
||||||
|
free, total = torch.cuda.mem_get_info()
|
||||||
|
current_used = total - free
|
||||||
|
current_torch = torch.cuda.memory_reserved()
|
||||||
|
current_non_torch = current_used - current_torch
|
||||||
|
return current_non_torch
|
||||||
|
|
||||||
|
with (
|
||||||
|
memory_profiling(
|
||||||
|
baseline_snapshot=baseline_snapshot, weights_memory=weights_memory
|
||||||
|
) as result,
|
||||||
|
monitor(measure_current_non_torch) as monitored_values,
|
||||||
|
):
|
||||||
|
# make a memory spike, 1 GiB
|
||||||
|
spike = torch.randn(256, 1024, 1024, device="cuda", dtype=torch.float32)
|
||||||
|
del spike
|
||||||
|
|
||||||
|
# Add some extra non-torch memory 256 MiB (simulate NCCL)
|
||||||
|
handle2 = lib.cudaMalloc(256 * 1024 * 1024)
|
||||||
|
|
||||||
|
# this is an analytic value, it is exact,
|
||||||
|
# we only have 256 MiB non-torch memory increase
|
||||||
|
measured_diff = monitored_values.values[-1] - monitored_values.values[0]
|
||||||
|
assert measured_diff == 256 * 1024 * 1024
|
||||||
|
|
||||||
|
# Check that the memory usage is within 5% of the expected values
|
||||||
|
# 5% tolerance is caused by cuda runtime.
|
||||||
|
# we cannot control cuda runtime in the granularity of bytes,
|
||||||
|
# which causes a small error (<10 MiB in practice)
|
||||||
|
non_torch_ratio = result.non_torch_increase / (256 * 1024 * 1024) # noqa
|
||||||
|
assert abs(non_torch_ratio - 1) <= 0.05
|
||||||
|
assert result.torch_peak_increase == 1024 * 1024 * 1024
|
||||||
|
del weights
|
||||||
|
lib.cudaFree(handle1)
|
||||||
|
lib.cudaFree(handle2)
|
||||||
104
tests/utils_/test_torch_utils.py
Normal file
104
tests/utils_/test_torch_utils.py
Normal file
@@ -0,0 +1,104 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
import pytest
|
||||||
|
import torch
|
||||||
|
|
||||||
|
from vllm.utils.torch_utils import (
|
||||||
|
common_broadcastable_dtype,
|
||||||
|
current_stream,
|
||||||
|
is_lossless_cast,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
("src_dtype", "tgt_dtype", "expected_result"),
|
||||||
|
[
|
||||||
|
# Different precision_levels
|
||||||
|
(torch.bool, torch.int8, True),
|
||||||
|
(torch.bool, torch.float16, True),
|
||||||
|
(torch.bool, torch.complex32, True),
|
||||||
|
(torch.int64, torch.bool, False),
|
||||||
|
(torch.int64, torch.float16, True),
|
||||||
|
(torch.int64, torch.complex32, True),
|
||||||
|
(torch.float64, torch.bool, False),
|
||||||
|
(torch.float64, torch.int8, False),
|
||||||
|
(torch.float64, torch.complex32, True),
|
||||||
|
(torch.complex128, torch.bool, False),
|
||||||
|
(torch.complex128, torch.int8, False),
|
||||||
|
(torch.complex128, torch.float16, False),
|
||||||
|
# precision_level=0
|
||||||
|
(torch.bool, torch.bool, True),
|
||||||
|
# precision_level=1
|
||||||
|
(torch.int8, torch.int16, True),
|
||||||
|
(torch.int16, torch.int8, False),
|
||||||
|
(torch.uint8, torch.int8, False),
|
||||||
|
(torch.int8, torch.uint8, False),
|
||||||
|
# precision_level=2
|
||||||
|
(torch.float16, torch.float32, True),
|
||||||
|
(torch.float32, torch.float16, False),
|
||||||
|
(torch.bfloat16, torch.float32, True),
|
||||||
|
(torch.float32, torch.bfloat16, False),
|
||||||
|
# precision_level=3
|
||||||
|
(torch.complex32, torch.complex64, True),
|
||||||
|
(torch.complex64, torch.complex32, False),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_is_lossless_cast(src_dtype, tgt_dtype, expected_result):
|
||||||
|
assert is_lossless_cast(src_dtype, tgt_dtype) == expected_result
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
("dtypes", "expected_result"),
|
||||||
|
[
|
||||||
|
([torch.bool], torch.bool),
|
||||||
|
([torch.bool, torch.int8], torch.int8),
|
||||||
|
([torch.bool, torch.int8, torch.float16], torch.float16),
|
||||||
|
([torch.bool, torch.int8, torch.float16, torch.complex32], torch.complex32), # noqa: E501
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_common_broadcastable_dtype(dtypes, expected_result):
|
||||||
|
assert common_broadcastable_dtype(dtypes) == expected_result
|
||||||
|
|
||||||
|
|
||||||
|
def test_current_stream_multithread():
|
||||||
|
import threading
|
||||||
|
|
||||||
|
if not torch.cuda.is_available():
|
||||||
|
pytest.skip("CUDA not available")
|
||||||
|
|
||||||
|
main_default_stream = torch.cuda.current_stream()
|
||||||
|
child_stream = torch.cuda.Stream()
|
||||||
|
|
||||||
|
thread_stream_ready = threading.Event()
|
||||||
|
thread_can_exit = threading.Event()
|
||||||
|
|
||||||
|
def child_thread_func():
|
||||||
|
with torch.cuda.stream(child_stream):
|
||||||
|
thread_stream_ready.set()
|
||||||
|
thread_can_exit.wait(timeout=10)
|
||||||
|
|
||||||
|
child_thread = threading.Thread(target=child_thread_func)
|
||||||
|
child_thread.start()
|
||||||
|
|
||||||
|
try:
|
||||||
|
assert thread_stream_ready.wait(timeout=5), (
|
||||||
|
"Child thread failed to enter stream context in time"
|
||||||
|
)
|
||||||
|
|
||||||
|
main_current_stream = current_stream()
|
||||||
|
|
||||||
|
assert main_current_stream != child_stream, (
|
||||||
|
"Main thread's current_stream was contaminated by child thread"
|
||||||
|
)
|
||||||
|
assert main_current_stream == main_default_stream, (
|
||||||
|
"Main thread's current_stream is not the default stream"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Notify child thread it can exit
|
||||||
|
thread_can_exit.set()
|
||||||
|
|
||||||
|
finally:
|
||||||
|
# Ensure child thread exits properly
|
||||||
|
child_thread.join(timeout=5)
|
||||||
|
if child_thread.is_alive():
|
||||||
|
pytest.fail("Child thread failed to exit properly")
|
||||||
@@ -2,10 +2,8 @@
|
|||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
# ruff: noqa
|
# ruff: noqa
|
||||||
|
|
||||||
import hashlib
|
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import pickle
|
|
||||||
import tempfile
|
import tempfile
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from unittest.mock import patch
|
from unittest.mock import patch
|
||||||
@@ -14,7 +12,6 @@ import pytest
|
|||||||
import torch
|
import torch
|
||||||
import yaml
|
import yaml
|
||||||
from transformers import AutoTokenizer
|
from transformers import AutoTokenizer
|
||||||
from vllm_test_utils.monitor import monitor
|
|
||||||
|
|
||||||
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
|
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
|
||||||
from vllm.transformers_utils.detokenizer_utils import convert_ids_list_to_tokens
|
from vllm.transformers_utils.detokenizer_utils import convert_ids_list_to_tokens
|
||||||
@@ -24,13 +21,6 @@ from vllm.utils import (
|
|||||||
bind_kv_cache,
|
bind_kv_cache,
|
||||||
unique_filepath,
|
unique_filepath,
|
||||||
)
|
)
|
||||||
from vllm.utils.hashing import sha256
|
|
||||||
from vllm.utils.torch_utils import (
|
|
||||||
common_broadcastable_dtype,
|
|
||||||
current_stream,
|
|
||||||
is_lossless_cast,
|
|
||||||
)
|
|
||||||
from vllm.utils.mem_utils import MemorySnapshot, memory_profiling
|
|
||||||
from ..utils import create_new_process_for_each_test, flat_product
|
from ..utils import create_new_process_for_each_test, flat_product
|
||||||
|
|
||||||
|
|
||||||
@@ -267,61 +257,6 @@ def test_duplicate_dict_args(caplog_vllm, parser):
|
|||||||
assert "-O.mode" in caplog_vllm.text
|
assert "-O.mode" in caplog_vllm.text
|
||||||
|
|
||||||
|
|
||||||
@create_new_process_for_each_test()
|
|
||||||
def test_memory_profiling():
|
|
||||||
# Fake out some model loading + inference memory usage to test profiling
|
|
||||||
# Memory used by other processes will show up as cuda usage outside of torch
|
|
||||||
from vllm.distributed.device_communicators.cuda_wrapper import CudaRTLibrary
|
|
||||||
|
|
||||||
lib = CudaRTLibrary()
|
|
||||||
# 512 MiB allocation outside of this instance
|
|
||||||
handle1 = lib.cudaMalloc(512 * 1024 * 1024)
|
|
||||||
|
|
||||||
baseline_snapshot = MemorySnapshot()
|
|
||||||
|
|
||||||
# load weights
|
|
||||||
|
|
||||||
weights = torch.randn(128, 1024, 1024, device="cuda", dtype=torch.float32)
|
|
||||||
|
|
||||||
weights_memory = 128 * 1024 * 1024 * 4 # 512 MiB
|
|
||||||
|
|
||||||
def measure_current_non_torch():
|
|
||||||
free, total = torch.cuda.mem_get_info()
|
|
||||||
current_used = total - free
|
|
||||||
current_torch = torch.cuda.memory_reserved()
|
|
||||||
current_non_torch = current_used - current_torch
|
|
||||||
return current_non_torch
|
|
||||||
|
|
||||||
with (
|
|
||||||
memory_profiling(
|
|
||||||
baseline_snapshot=baseline_snapshot, weights_memory=weights_memory
|
|
||||||
) as result,
|
|
||||||
monitor(measure_current_non_torch) as monitored_values,
|
|
||||||
):
|
|
||||||
# make a memory spike, 1 GiB
|
|
||||||
spike = torch.randn(256, 1024, 1024, device="cuda", dtype=torch.float32)
|
|
||||||
del spike
|
|
||||||
|
|
||||||
# Add some extra non-torch memory 256 MiB (simulate NCCL)
|
|
||||||
handle2 = lib.cudaMalloc(256 * 1024 * 1024)
|
|
||||||
|
|
||||||
# this is an analytic value, it is exact,
|
|
||||||
# we only have 256 MiB non-torch memory increase
|
|
||||||
measured_diff = monitored_values.values[-1] - monitored_values.values[0]
|
|
||||||
assert measured_diff == 256 * 1024 * 1024
|
|
||||||
|
|
||||||
# Check that the memory usage is within 5% of the expected values
|
|
||||||
# 5% tolerance is caused by cuda runtime.
|
|
||||||
# we cannot control cuda runtime in the granularity of bytes,
|
|
||||||
# which causes a small error (<10 MiB in practice)
|
|
||||||
non_torch_ratio = result.non_torch_increase / (256 * 1024 * 1024) # noqa
|
|
||||||
assert abs(non_torch_ratio - 1) <= 0.05
|
|
||||||
assert result.torch_peak_increase == 1024 * 1024 * 1024
|
|
||||||
del weights
|
|
||||||
lib.cudaFree(handle1)
|
|
||||||
lib.cudaFree(handle2)
|
|
||||||
|
|
||||||
|
|
||||||
def test_bind_kv_cache():
|
def test_bind_kv_cache():
|
||||||
from vllm.attention import Attention
|
from vllm.attention import Attention
|
||||||
|
|
||||||
@@ -403,56 +338,6 @@ def test_bind_kv_cache_pp():
|
|||||||
assert ctx["layers.0.self_attn"].kv_cache[1] is kv_cache[1][0]
|
assert ctx["layers.0.self_attn"].kv_cache[1] is kv_cache[1][0]
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
|
||||||
("src_dtype", "tgt_dtype", "expected_result"),
|
|
||||||
[
|
|
||||||
# Different precision_levels
|
|
||||||
(torch.bool, torch.int8, True),
|
|
||||||
(torch.bool, torch.float16, True),
|
|
||||||
(torch.bool, torch.complex32, True),
|
|
||||||
(torch.int64, torch.bool, False),
|
|
||||||
(torch.int64, torch.float16, True),
|
|
||||||
(torch.int64, torch.complex32, True),
|
|
||||||
(torch.float64, torch.bool, False),
|
|
||||||
(torch.float64, torch.int8, False),
|
|
||||||
(torch.float64, torch.complex32, True),
|
|
||||||
(torch.complex128, torch.bool, False),
|
|
||||||
(torch.complex128, torch.int8, False),
|
|
||||||
(torch.complex128, torch.float16, False),
|
|
||||||
# precision_level=0
|
|
||||||
(torch.bool, torch.bool, True),
|
|
||||||
# precision_level=1
|
|
||||||
(torch.int8, torch.int16, True),
|
|
||||||
(torch.int16, torch.int8, False),
|
|
||||||
(torch.uint8, torch.int8, False),
|
|
||||||
(torch.int8, torch.uint8, False),
|
|
||||||
# precision_level=2
|
|
||||||
(torch.float16, torch.float32, True),
|
|
||||||
(torch.float32, torch.float16, False),
|
|
||||||
(torch.bfloat16, torch.float32, True),
|
|
||||||
(torch.float32, torch.bfloat16, False),
|
|
||||||
# precision_level=3
|
|
||||||
(torch.complex32, torch.complex64, True),
|
|
||||||
(torch.complex64, torch.complex32, False),
|
|
||||||
],
|
|
||||||
)
|
|
||||||
def test_is_lossless_cast(src_dtype, tgt_dtype, expected_result):
|
|
||||||
assert is_lossless_cast(src_dtype, tgt_dtype) == expected_result
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
|
||||||
("dtypes", "expected_result"),
|
|
||||||
[
|
|
||||||
([torch.bool], torch.bool),
|
|
||||||
([torch.bool, torch.int8], torch.int8),
|
|
||||||
([torch.bool, torch.int8, torch.float16], torch.float16),
|
|
||||||
([torch.bool, torch.int8, torch.float16, torch.complex32], torch.complex32), # noqa: E501
|
|
||||||
],
|
|
||||||
)
|
|
||||||
def test_common_broadcastable_dtype(dtypes, expected_result):
|
|
||||||
assert common_broadcastable_dtype(dtypes) == expected_result
|
|
||||||
|
|
||||||
|
|
||||||
def test_model_specification(
|
def test_model_specification(
|
||||||
parser_with_config, cli_config_file, cli_config_file_with_model
|
parser_with_config, cli_config_file, cli_config_file_with_model
|
||||||
):
|
):
|
||||||
@@ -535,23 +420,6 @@ def test_model_specification(
|
|||||||
assert args.port == 12312
|
assert args.port == 12312
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("input", [(), ("abc",), (None,), (None, bool, [1, 2, 3])])
|
|
||||||
def test_sha256(input: tuple):
|
|
||||||
digest = sha256(input)
|
|
||||||
assert digest is not None
|
|
||||||
assert isinstance(digest, bytes)
|
|
||||||
assert digest != b""
|
|
||||||
|
|
||||||
input_bytes = pickle.dumps(input, protocol=pickle.HIGHEST_PROTOCOL)
|
|
||||||
assert digest == hashlib.sha256(input_bytes).digest()
|
|
||||||
|
|
||||||
# hashing again, returns the same value
|
|
||||||
assert digest == sha256(input)
|
|
||||||
|
|
||||||
# hashing different input, returns different value
|
|
||||||
assert digest != sha256(input + (1,))
|
|
||||||
|
|
||||||
|
|
||||||
def test_convert_ids_list_to_tokens():
|
def test_convert_ids_list_to_tokens():
|
||||||
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")
|
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")
|
||||||
token_ids = tokenizer.encode("Hello, world!")
|
token_ids = tokenizer.encode("Hello, world!")
|
||||||
@@ -561,50 +429,6 @@ def test_convert_ids_list_to_tokens():
|
|||||||
assert tokens == ["Hello", ",", " world", "!"]
|
assert tokens == ["Hello", ",", " world", "!"]
|
||||||
|
|
||||||
|
|
||||||
def test_current_stream_multithread():
|
|
||||||
import threading
|
|
||||||
|
|
||||||
if not torch.cuda.is_available():
|
|
||||||
pytest.skip("CUDA not available")
|
|
||||||
|
|
||||||
main_default_stream = torch.cuda.current_stream()
|
|
||||||
child_stream = torch.cuda.Stream()
|
|
||||||
|
|
||||||
thread_stream_ready = threading.Event()
|
|
||||||
thread_can_exit = threading.Event()
|
|
||||||
|
|
||||||
def child_thread_func():
|
|
||||||
with torch.cuda.stream(child_stream):
|
|
||||||
thread_stream_ready.set()
|
|
||||||
thread_can_exit.wait(timeout=10)
|
|
||||||
|
|
||||||
child_thread = threading.Thread(target=child_thread_func)
|
|
||||||
child_thread.start()
|
|
||||||
|
|
||||||
try:
|
|
||||||
assert thread_stream_ready.wait(timeout=5), (
|
|
||||||
"Child thread failed to enter stream context in time"
|
|
||||||
)
|
|
||||||
|
|
||||||
main_current_stream = current_stream()
|
|
||||||
|
|
||||||
assert main_current_stream != child_stream, (
|
|
||||||
"Main thread's current_stream was contaminated by child thread"
|
|
||||||
)
|
|
||||||
assert main_current_stream == main_default_stream, (
|
|
||||||
"Main thread's current_stream is not the default stream"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Notify child thread it can exit
|
|
||||||
thread_can_exit.set()
|
|
||||||
|
|
||||||
finally:
|
|
||||||
# Ensure child thread exits properly
|
|
||||||
child_thread.join(timeout=5)
|
|
||||||
if child_thread.is_alive():
|
|
||||||
pytest.fail("Child thread failed to exit properly")
|
|
||||||
|
|
||||||
|
|
||||||
def test_load_config_file(tmp_path):
|
def test_load_config_file(tmp_path):
|
||||||
# Define the configuration data
|
# Define the configuration data
|
||||||
config_data = {
|
config_data = {
|
||||||
|
|||||||
@@ -23,13 +23,14 @@ ALLOWED_FILES = {
|
|||||||
"vllm/transformers_utils/config.py",
|
"vllm/transformers_utils/config.py",
|
||||||
"vllm/model_executor/models/registry.py",
|
"vllm/model_executor/models/registry.py",
|
||||||
"vllm/compilation/caching.py",
|
"vllm/compilation/caching.py",
|
||||||
"tests/utils_/test_utils.py",
|
|
||||||
"tests/tokenization/test_cached_tokenizer.py",
|
|
||||||
"vllm/distributed/utils.py",
|
"vllm/distributed/utils.py",
|
||||||
"vllm/distributed/parallel_state.py",
|
"vllm/distributed/parallel_state.py",
|
||||||
"vllm/distributed/device_communicators/all_reduce_utils.py",
|
"vllm/distributed/device_communicators/all_reduce_utils.py",
|
||||||
"vllm/distributed/device_communicators/shm_broadcast.py",
|
"vllm/distributed/device_communicators/shm_broadcast.py",
|
||||||
"vllm/distributed/device_communicators/shm_object_storage.py",
|
"vllm/distributed/device_communicators/shm_object_storage.py",
|
||||||
|
"vllm/utils/hashing.py",
|
||||||
|
"tests/utils_/test_hashing.py",
|
||||||
|
"tests/tokenization/test_cached_tokenizer.py",
|
||||||
"benchmarks/kernels/graph_machete_bench.py",
|
"benchmarks/kernels/graph_machete_bench.py",
|
||||||
"benchmarks/kernels/benchmark_lora.py",
|
"benchmarks/kernels/benchmark_lora.py",
|
||||||
"benchmarks/kernels/benchmark_machete.py",
|
"benchmarks/kernels/benchmark_machete.py",
|
||||||
@@ -40,10 +41,8 @@ ALLOWED_FILES = {
|
|||||||
"vllm/executor/mp_distributed_executor.py",
|
"vllm/executor/mp_distributed_executor.py",
|
||||||
"vllm/executor/ray_distributed_executor.py",
|
"vllm/executor/ray_distributed_executor.py",
|
||||||
"vllm/entrypoints/llm.py",
|
"vllm/entrypoints/llm.py",
|
||||||
"tests/utils.py",
|
|
||||||
# pickle and cloudpickle
|
|
||||||
"vllm/utils/__init__.py",
|
"vllm/utils/__init__.py",
|
||||||
"vllm/utils/hashing.py",
|
"tests/utils.py",
|
||||||
}
|
}
|
||||||
|
|
||||||
PICKLE_RE = re.compile(
|
PICKLE_RE = re.compile(
|
||||||
|
|||||||
@@ -34,7 +34,7 @@ from vllm.inputs import TextPrompt, TokensPrompt
|
|||||||
from vllm.lora.request import LoRARequest
|
from vllm.lora.request import LoRARequest
|
||||||
from vllm.outputs import RequestOutput
|
from vllm.outputs import RequestOutput
|
||||||
from vllm.sampling_params import BeamSearchParams
|
from vllm.sampling_params import BeamSearchParams
|
||||||
from vllm.utils.asyncio import merge_async_iterators
|
from vllm.utils.async_utils import merge_async_iterators
|
||||||
|
|
||||||
|
|
||||||
def run_vllm(
|
def run_vllm(
|
||||||
|
|||||||
@@ -51,7 +51,7 @@ from vllm.transformers_utils.chat_templates import get_chat_template_fallback_pa
|
|||||||
from vllm.transformers_utils.processor import cached_get_processor
|
from vllm.transformers_utils.processor import cached_get_processor
|
||||||
from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
|
from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
|
||||||
from vllm.utils import random_uuid
|
from vllm.utils import random_uuid
|
||||||
from vllm.utils.functools import supports_kw
|
from vllm.utils.func_utils import supports_kw
|
||||||
|
|
||||||
logger = init_logger(__name__)
|
logger = init_logger(__name__)
|
||||||
|
|
||||||
|
|||||||
@@ -76,7 +76,7 @@ from vllm.transformers_utils.tokenizer import (
|
|||||||
)
|
)
|
||||||
from vllm.usage.usage_lib import UsageContext
|
from vllm.usage.usage_lib import UsageContext
|
||||||
from vllm.utils import Counter, Device
|
from vllm.utils import Counter, Device
|
||||||
from vllm.utils.collections import as_iter, is_list_of
|
from vllm.utils.collection_utils import as_iter, is_list_of
|
||||||
from vllm.v1.engine import EngineCoreRequest
|
from vllm.v1.engine import EngineCoreRequest
|
||||||
from vllm.v1.engine.llm_engine import LLMEngine
|
from vllm.v1.engine.llm_engine import LLMEngine
|
||||||
from vllm.v1.sample.logits_processor import LogitsProcessor
|
from vllm.v1.sample.logits_processor import LogitsProcessor
|
||||||
|
|||||||
@@ -70,7 +70,7 @@ from vllm.transformers_utils.tokenizers import (
|
|||||||
truncate_tool_call_ids,
|
truncate_tool_call_ids,
|
||||||
validate_request_params,
|
validate_request_params,
|
||||||
)
|
)
|
||||||
from vllm.utils.collections import as_list
|
from vllm.utils.collection_utils import as_list
|
||||||
|
|
||||||
logger = init_logger(__name__)
|
logger = init_logger(__name__)
|
||||||
|
|
||||||
|
|||||||
@@ -34,8 +34,8 @@ from vllm.logprobs import Logprob
|
|||||||
from vllm.outputs import RequestOutput
|
from vllm.outputs import RequestOutput
|
||||||
from vllm.sampling_params import BeamSearchParams, SamplingParams
|
from vllm.sampling_params import BeamSearchParams, SamplingParams
|
||||||
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
||||||
from vllm.utils.asyncio import merge_async_iterators
|
from vllm.utils.async_utils import merge_async_iterators
|
||||||
from vllm.utils.collections import as_list
|
from vllm.utils.collection_utils import as_list
|
||||||
|
|
||||||
logger = init_logger(__name__)
|
logger = init_logger(__name__)
|
||||||
|
|
||||||
|
|||||||
@@ -39,8 +39,8 @@ from vllm.outputs import (
|
|||||||
RequestOutput,
|
RequestOutput,
|
||||||
)
|
)
|
||||||
from vllm.pooling_params import PoolingParams
|
from vllm.pooling_params import PoolingParams
|
||||||
from vllm.utils.asyncio import merge_async_iterators
|
from vllm.utils.async_utils import merge_async_iterators
|
||||||
from vllm.utils.collections import chunk_list
|
from vllm.utils.collection_utils import chunk_list
|
||||||
|
|
||||||
logger = init_logger(__name__)
|
logger = init_logger(__name__)
|
||||||
|
|
||||||
|
|||||||
@@ -91,13 +91,13 @@ from vllm.tracing import (
|
|||||||
)
|
)
|
||||||
from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
|
from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
|
||||||
from vllm.utils import random_uuid
|
from vllm.utils import random_uuid
|
||||||
from vllm.utils.asyncio import (
|
from vllm.utils.async_utils import (
|
||||||
AsyncMicrobatchTokenizer,
|
AsyncMicrobatchTokenizer,
|
||||||
collect_from_async_generator,
|
collect_from_async_generator,
|
||||||
make_async,
|
make_async,
|
||||||
merge_async_iterators,
|
merge_async_iterators,
|
||||||
)
|
)
|
||||||
from vllm.utils.collections import is_list_of
|
from vllm.utils.collection_utils import is_list_of
|
||||||
from vllm.v1.engine import EngineCoreRequest
|
from vllm.v1.engine import EngineCoreRequest
|
||||||
|
|
||||||
logger = init_logger(__name__)
|
logger = init_logger(__name__)
|
||||||
|
|||||||
@@ -36,7 +36,7 @@ from vllm.entrypoints.utils import _validate_truncation_size
|
|||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.outputs import PoolingOutput, PoolingRequestOutput
|
from vllm.outputs import PoolingOutput, PoolingRequestOutput
|
||||||
from vllm.tasks import SupportedTask
|
from vllm.tasks import SupportedTask
|
||||||
from vllm.utils.asyncio import merge_async_iterators
|
from vllm.utils.async_utils import merge_async_iterators
|
||||||
|
|
||||||
logger = init_logger(__name__)
|
logger = init_logger(__name__)
|
||||||
|
|
||||||
|
|||||||
@@ -37,7 +37,7 @@ from vllm.logger import init_logger
|
|||||||
from vllm.lora.request import LoRARequest
|
from vllm.lora.request import LoRARequest
|
||||||
from vllm.outputs import PoolingRequestOutput, ScoringRequestOutput
|
from vllm.outputs import PoolingRequestOutput, ScoringRequestOutput
|
||||||
from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
|
from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
|
||||||
from vllm.utils.asyncio import make_async, merge_async_iterators
|
from vllm.utils.async_utils import make_async, merge_async_iterators
|
||||||
|
|
||||||
logger = init_logger(__name__)
|
logger = init_logger(__name__)
|
||||||
|
|
||||||
|
|||||||
@@ -12,7 +12,7 @@ from vllm.entrypoints.openai.protocol import (
|
|||||||
)
|
)
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
||||||
from vllm.utils.collections import is_list_of
|
from vllm.utils.collection_utils import is_list_of
|
||||||
from vllm.utils.import_utils import import_from_path
|
from vllm.utils.import_utils import import_from_path
|
||||||
|
|
||||||
logger = init_logger(__name__)
|
logger = init_logger(__name__)
|
||||||
|
|||||||
@@ -17,7 +17,7 @@ from vllm.inputs.data import TextPrompt as EngineTextPrompt
|
|||||||
from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
|
from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
|
||||||
from vllm.inputs.parse import get_prompt_components, parse_raw_prompts
|
from vllm.inputs.parse import get_prompt_components, parse_raw_prompts
|
||||||
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
||||||
from vllm.utils.asyncio import AsyncMicrobatchTokenizer
|
from vllm.utils.async_utils import AsyncMicrobatchTokenizer
|
||||||
|
|
||||||
|
|
||||||
@dataclass(frozen=True)
|
@dataclass(frozen=True)
|
||||||
|
|||||||
@@ -17,7 +17,7 @@ from vllm.logger import init_logger
|
|||||||
from vllm.lora.request import LoRARequest
|
from vllm.lora.request import LoRARequest
|
||||||
from vllm.sequence import ExecuteModelRequest
|
from vllm.sequence import ExecuteModelRequest
|
||||||
from vllm.tasks import SupportedTask
|
from vllm.tasks import SupportedTask
|
||||||
from vllm.utils.asyncio import make_async
|
from vllm.utils.async_utils import make_async
|
||||||
from vllm.v1.outputs import SamplerOutput
|
from vllm.v1.outputs import SamplerOutput
|
||||||
from vllm.v1.worker.worker_base import WorkerBase
|
from vllm.v1.worker.worker_base import WorkerBase
|
||||||
|
|
||||||
|
|||||||
@@ -19,7 +19,7 @@ from vllm.logger import init_logger
|
|||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
from vllm.ray.ray_env import get_env_vars_to_copy
|
from vllm.ray.ray_env import get_env_vars_to_copy
|
||||||
from vllm.sequence import ExecuteModelRequest
|
from vllm.sequence import ExecuteModelRequest
|
||||||
from vllm.utils.asyncio import make_async
|
from vllm.utils.async_utils import make_async
|
||||||
from vllm.utils.network_utils import (
|
from vllm.utils.network_utils import (
|
||||||
get_distributed_init_method,
|
get_distributed_init_method,
|
||||||
get_ip,
|
get_ip,
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ from typing import TYPE_CHECKING, Literal, NamedTuple, TypeAlias, TypedDict, cas
|
|||||||
|
|
||||||
from typing_extensions import TypeIs
|
from typing_extensions import TypeIs
|
||||||
|
|
||||||
from vllm.utils.collections import is_list_of
|
from vllm.utils.collection_utils import is_list_of
|
||||||
|
|
||||||
from .data import (
|
from .data import (
|
||||||
EmbedsPrompt,
|
EmbedsPrompt,
|
||||||
|
|||||||
@@ -17,7 +17,7 @@ from vllm.logger import init_logger
|
|||||||
from vllm.model_executor.custom_op import CustomOp
|
from vllm.model_executor.custom_op import CustomOp
|
||||||
from vllm.model_executor.utils import set_weight_attrs
|
from vllm.model_executor.utils import set_weight_attrs
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
from vllm.utils.collections import LazyDict
|
from vllm.utils.collection_utils import LazyDict
|
||||||
|
|
||||||
logger = init_logger(__name__)
|
logger = init_logger(__name__)
|
||||||
|
|
||||||
|
|||||||
@@ -31,7 +31,7 @@ from vllm.utils.deep_gemm import (
|
|||||||
get_mk_alignment_for_contiguous_layout,
|
get_mk_alignment_for_contiguous_layout,
|
||||||
m_grouped_fp8_gemm_nt_contiguous,
|
m_grouped_fp8_gemm_nt_contiguous,
|
||||||
)
|
)
|
||||||
from vllm.utils.functools import run_once
|
from vllm.utils.func_utils import run_once
|
||||||
|
|
||||||
logger = init_logger(__name__)
|
logger = init_logger(__name__)
|
||||||
|
|
||||||
|
|||||||
@@ -28,7 +28,7 @@ from vllm.model_executor.parameter import (
|
|||||||
RowvLLMParameter,
|
RowvLLMParameter,
|
||||||
)
|
)
|
||||||
from vllm.transformers_utils.config import get_safetensors_params_metadata
|
from vllm.transformers_utils.config import get_safetensors_params_metadata
|
||||||
from vllm.utils.collections import is_list_of
|
from vllm.utils.collection_utils import is_list_of
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from vllm.model_executor.layers.quantization import QuantizationMethods
|
from vllm.model_executor.layers.quantization import QuantizationMethods
|
||||||
|
|||||||
@@ -57,7 +57,7 @@ from vllm.model_executor.parameter import (
|
|||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
from vllm.scalar_type import scalar_types
|
from vllm.scalar_type import scalar_types
|
||||||
from vllm.transformers_utils.config import get_safetensors_params_metadata
|
from vllm.transformers_utils.config import get_safetensors_params_metadata
|
||||||
from vllm.utils.collections import is_list_of
|
from vllm.utils.collection_utils import is_list_of
|
||||||
|
|
||||||
logger = init_logger(__name__)
|
logger = init_logger(__name__)
|
||||||
|
|
||||||
|
|||||||
@@ -48,7 +48,7 @@ from vllm.transformers_utils.configs.deepseek_vl2 import (
|
|||||||
)
|
)
|
||||||
from vllm.transformers_utils.processors.deepseek_vl2 import DeepseekVLV2Processor
|
from vllm.transformers_utils.processors.deepseek_vl2 import DeepseekVLV2Processor
|
||||||
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
|
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
|
||||||
from vllm.utils.collections import is_list_of
|
from vllm.utils.collection_utils import is_list_of
|
||||||
from vllm.utils.tensor_schema import TensorSchema, TensorShape
|
from vllm.utils.tensor_schema import TensorSchema, TensorShape
|
||||||
from vllm.utils.torch_utils import set_default_torch_dtype
|
from vllm.utils.torch_utils import set_default_torch_dtype
|
||||||
|
|
||||||
|
|||||||
@@ -24,7 +24,7 @@ from vllm.inputs import TokensPrompt
|
|||||||
from vllm.inputs.data import PromptType
|
from vllm.inputs.data import PromptType
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.model_executor.layers.quantization import QuantizationConfig
|
from vllm.model_executor.layers.quantization import QuantizationConfig
|
||||||
from vllm.utils.functools import supports_kw
|
from vllm.utils.func_utils import supports_kw
|
||||||
|
|
||||||
from .interfaces_base import VllmModel, is_pooling_model
|
from .interfaces_base import VllmModel, is_pooling_model
|
||||||
|
|
||||||
|
|||||||
@@ -15,7 +15,7 @@ import torch.nn as nn
|
|||||||
from typing_extensions import TypeIs, TypeVar
|
from typing_extensions import TypeIs, TypeVar
|
||||||
|
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.utils.functools import supports_kw
|
from vllm.utils.func_utils import supports_kw
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from vllm.config import VllmConfig
|
from vllm.config import VllmConfig
|
||||||
|
|||||||
@@ -33,7 +33,7 @@ from vllm.multimodal.processing import (
|
|||||||
)
|
)
|
||||||
from vllm.multimodal.profiling import BaseDummyInputsBuilder
|
from vllm.multimodal.profiling import BaseDummyInputsBuilder
|
||||||
from vllm.sequence import IntermediateTensors
|
from vllm.sequence import IntermediateTensors
|
||||||
from vllm.utils.collections import is_list_of
|
from vllm.utils.collection_utils import is_list_of
|
||||||
from vllm.utils.tensor_schema import TensorSchema, TensorShape
|
from vllm.utils.tensor_schema import TensorSchema, TensorShape
|
||||||
|
|
||||||
from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
|
from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
|
||||||
|
|||||||
@@ -85,7 +85,7 @@ from vllm.multimodal.processing import (
|
|||||||
from vllm.multimodal.profiling import BaseDummyInputsBuilder
|
from vllm.multimodal.profiling import BaseDummyInputsBuilder
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
from vllm.sequence import IntermediateTensors
|
from vllm.sequence import IntermediateTensors
|
||||||
from vllm.utils.collections import flatten_2d_lists
|
from vllm.utils.collection_utils import flatten_2d_lists
|
||||||
from vllm.utils.tensor_schema import TensorSchema, TensorShape
|
from vllm.utils.tensor_schema import TensorSchema, TensorShape
|
||||||
from vllm.utils.torch_utils import set_default_torch_dtype
|
from vllm.utils.torch_utils import set_default_torch_dtype
|
||||||
|
|
||||||
|
|||||||
@@ -79,7 +79,7 @@ from vllm.multimodal.processing import (
|
|||||||
)
|
)
|
||||||
from vllm.multimodal.profiling import BaseDummyInputsBuilder
|
from vllm.multimodal.profiling import BaseDummyInputsBuilder
|
||||||
from vllm.sequence import IntermediateTensors
|
from vllm.sequence import IntermediateTensors
|
||||||
from vllm.utils.collections import is_list_of
|
from vllm.utils.collection_utils import is_list_of
|
||||||
|
|
||||||
from .interfaces import (
|
from .interfaces import (
|
||||||
MultiModalEmbeddings,
|
MultiModalEmbeddings,
|
||||||
|
|||||||
@@ -22,7 +22,7 @@ from typing import (
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
from typing_extensions import NotRequired, TypeVar, deprecated
|
from typing_extensions import NotRequired, TypeVar, deprecated
|
||||||
|
|
||||||
from vllm.utils.collections import full_groupby, is_list_of
|
from vllm.utils.collection_utils import full_groupby, is_list_of
|
||||||
from vllm.utils.import_utils import LazyLoader
|
from vllm.utils.import_utils import LazyLoader
|
||||||
from vllm.utils.jsontree import json_map_leaves
|
from vllm.utils.jsontree import json_map_leaves
|
||||||
|
|
||||||
|
|||||||
@@ -19,7 +19,7 @@ import numpy as np
|
|||||||
import torch
|
import torch
|
||||||
from typing_extensions import assert_never
|
from typing_extensions import assert_never
|
||||||
|
|
||||||
from vllm.utils.collections import is_list_of
|
from vllm.utils.collection_utils import is_list_of
|
||||||
from vllm.utils.import_utils import LazyLoader
|
from vllm.utils.import_utils import LazyLoader
|
||||||
|
|
||||||
from .audio import AudioResampler
|
from .audio import AudioResampler
|
||||||
|
|||||||
@@ -25,8 +25,8 @@ from typing_extensions import TypeVar, assert_never
|
|||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.transformers_utils.processor import cached_processor_from_config
|
from vllm.transformers_utils.processor import cached_processor_from_config
|
||||||
from vllm.transformers_utils.tokenizer import AnyTokenizer, decode_tokens, encode_tokens
|
from vllm.transformers_utils.tokenizer import AnyTokenizer, decode_tokens, encode_tokens
|
||||||
from vllm.utils.collections import flatten_2d_lists, full_groupby
|
from vllm.utils.collection_utils import flatten_2d_lists, full_groupby
|
||||||
from vllm.utils.functools import get_allowed_kwarg_only_overrides
|
from vllm.utils.func_utils import get_allowed_kwarg_only_overrides
|
||||||
from vllm.utils.jsontree import JSONTree, json_map_leaves
|
from vllm.utils.jsontree import JSONTree, json_map_leaves
|
||||||
|
|
||||||
from .hasher import MultiModalHasher
|
from .hasher import MultiModalHasher
|
||||||
@@ -486,7 +486,7 @@ _M = TypeVar("_M", bound=_HasModalityAttr | _HasModalityProp)
|
|||||||
def full_groupby_modality(values: Iterable[_M]) -> ItemsView[str, list[_M]]:
|
def full_groupby_modality(values: Iterable[_M]) -> ItemsView[str, list[_M]]:
|
||||||
"""
|
"""
|
||||||
Convenience function to apply
|
Convenience function to apply
|
||||||
[`full_groupby`][vllm.utils.collections.full_groupby]
|
[`full_groupby`][vllm.utils.collection_utils.full_groupby]
|
||||||
based on modality.
|
based on modality.
|
||||||
"""
|
"""
|
||||||
return full_groupby(values, key=lambda x: x.modality)
|
return full_groupby(values, key=lambda x: x.modality)
|
||||||
|
|||||||
@@ -9,7 +9,7 @@ import torch.nn as nn
|
|||||||
from vllm.config.multimodal import BaseDummyOptions
|
from vllm.config.multimodal import BaseDummyOptions
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.transformers_utils.tokenizer import AnyTokenizer, cached_tokenizer_from_config
|
from vllm.transformers_utils.tokenizer import AnyTokenizer, cached_tokenizer_from_config
|
||||||
from vllm.utils.collections import ClassRegistry
|
from vllm.utils.collection_utils import ClassRegistry
|
||||||
|
|
||||||
from .cache import BaseMultiModalProcessorCache
|
from .cache import BaseMultiModalProcessorCache
|
||||||
from .processing import (
|
from .processing import (
|
||||||
|
|||||||
@@ -9,7 +9,7 @@ from typing import TYPE_CHECKING, Any
|
|||||||
|
|
||||||
from vllm.entrypoints.tool_server import ToolServer
|
from vllm.entrypoints.tool_server import ToolServer
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.utils.collections import is_list_of
|
from vllm.utils.collection_utils import is_list_of
|
||||||
from vllm.utils.import_utils import import_from_path
|
from vllm.utils.import_utils import import_from_path
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ import os
|
|||||||
from collections.abc import Mapping
|
from collections.abc import Mapping
|
||||||
|
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.utils.functools import run_once
|
from vllm.utils.func_utils import run_once
|
||||||
|
|
||||||
TRACE_HEADERS = ["traceparent", "tracestate"]
|
TRACE_HEADERS = ["traceparent", "tracestate"]
|
||||||
|
|
||||||
|
|||||||
@@ -16,7 +16,7 @@ from transformers.processing_utils import ProcessorMixin
|
|||||||
from transformers.video_processing_utils import BaseVideoProcessor
|
from transformers.video_processing_utils import BaseVideoProcessor
|
||||||
from typing_extensions import TypeVar
|
from typing_extensions import TypeVar
|
||||||
|
|
||||||
from vllm.utils.functools import get_allowed_kwarg_only_overrides
|
from vllm.utils.func_utils import get_allowed_kwarg_only_overrides
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from vllm.config import ModelConfig
|
from vllm.config import ModelConfig
|
||||||
|
|||||||
@@ -1122,9 +1122,6 @@ def warn_for_unimplemented_methods(cls: type[T]) -> type[T]:
|
|||||||
return cls
|
return cls
|
||||||
|
|
||||||
|
|
||||||
## moved to vllm.utils.profiling (imported at module top)
|
|
||||||
|
|
||||||
|
|
||||||
# Only relevant for models using ALiBi (e.g, MPT)
|
# Only relevant for models using ALiBi (e.g, MPT)
|
||||||
def check_use_alibi(model_config: ModelConfig) -> bool:
|
def check_use_alibi(model_config: ModelConfig) -> bool:
|
||||||
cfg = model_config.hf_text_config
|
cfg = model_config.hf_text_config
|
||||||
@@ -1150,9 +1147,6 @@ def check_use_alibi(model_config: ModelConfig) -> bool:
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
## moved to vllm.utils.hashing
|
|
||||||
|
|
||||||
|
|
||||||
@cache
|
@cache
|
||||||
def _has_module(module_name: str) -> bool:
|
def _has_module(module_name: str) -> bool:
|
||||||
"""Return True if *module_name* can be found in the current environment.
|
"""Return True if *module_name* can be found in the current environment.
|
||||||
|
|||||||
@@ -30,9 +30,9 @@ from vllm.transformers_utils.config import maybe_register_config_serialize_by_va
|
|||||||
from vllm.transformers_utils.tokenizer import AnyTokenizer, init_tokenizer_from_configs
|
from vllm.transformers_utils.tokenizer import AnyTokenizer, init_tokenizer_from_configs
|
||||||
from vllm.usage.usage_lib import UsageContext
|
from vllm.usage.usage_lib import UsageContext
|
||||||
from vllm.utils import Device, cdiv
|
from vllm.utils import Device, cdiv
|
||||||
from vllm.utils.asyncio import cancel_task_threadsafe
|
from vllm.utils.async_utils import cancel_task_threadsafe
|
||||||
from vllm.utils.collections import as_list
|
from vllm.utils.collection_utils import as_list
|
||||||
from vllm.utils.functools import deprecate_kwargs
|
from vllm.utils.func_utils import deprecate_kwargs
|
||||||
from vllm.v1.engine import EngineCoreRequest
|
from vllm.v1.engine import EngineCoreRequest
|
||||||
from vllm.v1.engine.core_client import EngineCoreClient
|
from vllm.v1.engine.core_client import EngineCoreClient
|
||||||
from vllm.v1.engine.exceptions import EngineDeadError, EngineGenerateError
|
from vllm.v1.engine.exceptions import EngineDeadError, EngineGenerateError
|
||||||
|
|||||||
@@ -23,7 +23,7 @@ from vllm.config import VllmConfig
|
|||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.lora.request import LoRARequest
|
from vllm.lora.request import LoRARequest
|
||||||
from vllm.tasks import SupportedTask
|
from vllm.tasks import SupportedTask
|
||||||
from vllm.utils.asyncio import in_loop
|
from vllm.utils.async_utils import in_loop
|
||||||
from vllm.utils.network_utils import (
|
from vllm.utils.network_utils import (
|
||||||
close_sockets,
|
close_sockets,
|
||||||
get_open_port,
|
get_open_port,
|
||||||
|
|||||||
@@ -13,7 +13,7 @@ from vllm.multimodal.inputs import MultiModalFeatureSpec
|
|||||||
from vllm.pooling_params import PoolingParams
|
from vllm.pooling_params import PoolingParams
|
||||||
from vllm.sampling_params import SamplingParams, SamplingType
|
from vllm.sampling_params import SamplingParams, SamplingType
|
||||||
from vllm.utils import length_from_prompt_token_ids_or_embeds
|
from vllm.utils import length_from_prompt_token_ids_or_embeds
|
||||||
from vllm.utils.collections import swap_dict_values
|
from vllm.utils.collection_utils import swap_dict_values
|
||||||
from vllm.v1.outputs import LogprobsTensors
|
from vllm.v1.outputs import LogprobsTensors
|
||||||
from vllm.v1.pool.metadata import PoolingMetadata
|
from vllm.v1.pool.metadata import PoolingMetadata
|
||||||
from vllm.v1.sample.logits_processor import (
|
from vllm.v1.sample.logits_processor import (
|
||||||
|
|||||||
@@ -10,7 +10,7 @@ import torch
|
|||||||
from vllm.lora.request import LoRARequest
|
from vllm.lora.request import LoRARequest
|
||||||
from vllm.sampling_params import SamplingType
|
from vllm.sampling_params import SamplingType
|
||||||
from vllm.utils import length_from_prompt_token_ids_or_embeds
|
from vllm.utils import length_from_prompt_token_ids_or_embeds
|
||||||
from vllm.utils.collections import swap_dict_values
|
from vllm.utils.collection_utils import swap_dict_values
|
||||||
from vllm.v1.outputs import LogprobsTensors
|
from vllm.v1.outputs import LogprobsTensors
|
||||||
from vllm.v1.worker.block_table import MultiGroupBlockTable
|
from vllm.v1.worker.block_table import MultiGroupBlockTable
|
||||||
from vllm.v1.worker.gpu_input_batch import CachedRequestState
|
from vllm.v1.worker.gpu_input_batch import CachedRequestState
|
||||||
|
|||||||
Reference in New Issue
Block a user