diff --git a/benchmarks/benchmark_block_pool.py b/benchmarks/benchmark_block_pool.py index 5434f8b6a..20cd26bdd 100644 --- a/benchmarks/benchmark_block_pool.py +++ b/benchmarks/benchmark_block_pool.py @@ -5,7 +5,7 @@ import gc from benchmark_utils import TimeCollector from tabulate import tabulate -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.v1.core.block_pool import BlockPool diff --git a/benchmarks/benchmark_long_document_qa_throughput.py b/benchmarks/benchmark_long_document_qa_throughput.py index 6e0f3b51c..f64fd09ba 100644 --- a/benchmarks/benchmark_long_document_qa_throughput.py +++ b/benchmarks/benchmark_long_document_qa_throughput.py @@ -46,7 +46,7 @@ import time from vllm import LLM, SamplingParams from vllm.engine.arg_utils import EngineArgs -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser def test_long_document_qa(llm=None, sampling_params=None, prompts=None): diff --git a/benchmarks/benchmark_ngram_proposer.py b/benchmarks/benchmark_ngram_proposer.py index 626b150ee..dedb564ff 100644 --- a/benchmarks/benchmark_ngram_proposer.py +++ b/benchmarks/benchmark_ngram_proposer.py @@ -19,7 +19,7 @@ from vllm.config import ( VllmConfig, ) from vllm.platforms import current_platform -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.v1.spec_decode.ngram_proposer import NgramProposer from vllm.v1.worker.gpu_input_batch import InputBatch from vllm.v1.worker.gpu_model_runner import GPUModelRunner diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py index d7dc0e991..146c268a6 100644 --- a/benchmarks/benchmark_prefix_caching.py +++ b/benchmarks/benchmark_prefix_caching.py @@ -37,7 +37,7 @@ from transformers import PreTrainedTokenizerBase from vllm import LLM, SamplingParams from vllm.engine.arg_utils import EngineArgs -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser try: from vllm.transformers_utils.tokenizer import get_tokenizer diff --git a/benchmarks/benchmark_prioritization.py b/benchmarks/benchmark_prioritization.py index 769f52dba..a35db0063 100644 --- a/benchmarks/benchmark_prioritization.py +++ b/benchmarks/benchmark_prioritization.py @@ -11,7 +11,7 @@ import time from transformers import AutoTokenizer, PreTrainedTokenizerBase from vllm.engine.arg_utils import EngineArgs -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser # Select a equi-probable random priority diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py index 539ab2ed0..55001cf37 100644 --- a/benchmarks/benchmark_serving_structured_output.py +++ b/benchmarks/benchmark_serving_structured_output.py @@ -51,7 +51,7 @@ except ImportError: from backend_request_func import get_tokenizer try: - from vllm.utils import FlexibleArgumentParser + from vllm.utils.argparse_utils import FlexibleArgumentParser except ImportError: from argparse import ArgumentParser as FlexibleArgumentParser diff --git a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py index 22fc2678f..67fccdf4f 100644 --- a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py +++ b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py @@ -15,7 +15,7 @@ from utils import make_rand_sparse_tensors from weight_shapes 
import WEIGHT_SHAPES from vllm import _custom_ops as ops -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser DEFAULT_MODELS = list(WEIGHT_SHAPES.keys()) DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512] diff --git a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py index 2deebf3dd..f7325ddd2 100644 --- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py +++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py @@ -18,7 +18,8 @@ from vllm import _custom_ops as ops from vllm.model_executor.layers.quantization.utils.fp8_utils import ( w8a8_triton_block_scaled_mm, ) -from vllm.utils import FlexibleArgumentParser, cdiv +from vllm.utils.argparse_utils import FlexibleArgumentParser +from vllm.utils.math_utils import cdiv DEFAULT_MODELS = list(WEIGHT_SHAPES.keys()) DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512] diff --git a/benchmarks/kernels/bench_per_token_quant_fp8.py b/benchmarks/kernels/bench_per_token_quant_fp8.py index d33b84fc3..7792cfd03 100644 --- a/benchmarks/kernels/bench_per_token_quant_fp8.py +++ b/benchmarks/kernels/bench_per_token_quant_fp8.py @@ -10,7 +10,7 @@ import torch from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8 from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape from vllm.triton_utils import triton -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE diff --git a/benchmarks/kernels/benchmark_activation.py b/benchmarks/kernels/benchmark_activation.py index 7662655b5..66268b71b 100644 --- a/benchmarks/kernels/benchmark_activation.py +++ b/benchmarks/kernels/benchmark_activation.py @@ -10,7 +10,7 @@ import vllm.model_executor.layers.activation # noqa F401 from vllm.model_executor.custom_op import CustomOp from vllm.platforms import current_platform from vllm.triton_utils import triton -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE batch_size_range = [1, 16, 32, 64, 128] diff --git a/benchmarks/kernels/benchmark_bitblas.py b/benchmarks/kernels/benchmark_bitblas.py index 66b44c27d..6bcb17983 100644 --- a/benchmarks/kernels/benchmark_bitblas.py +++ b/benchmarks/kernels/benchmark_bitblas.py @@ -28,7 +28,7 @@ except ImportError as e: from bitblas import Matmul, MatmulConfig, auto_detect_nvidia_target -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser parser = FlexibleArgumentParser( description="Benchmark BitBLAS int4 on a specific target." 
diff --git a/benchmarks/kernels/benchmark_cutlass_fp4_moe.py b/benchmarks/kernels/benchmark_cutlass_fp4_moe.py index 726a2a371..7982cbb14 100644 --- a/benchmarks/kernels/benchmark_cutlass_fp4_moe.py +++ b/benchmarks/kernels/benchmark_cutlass_fp4_moe.py @@ -20,7 +20,7 @@ from vllm.model_executor.layers.fused_moe.config import ( from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4 from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk from vllm.scalar_type import scalar_types -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser WEIGHT_SHAPES_MOE = { "nvidia/DeepSeek-R1-FP4": [ diff --git a/benchmarks/kernels/benchmark_cutlass_moe_fp8.py b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py index b419b2fa0..027f67ad4 100644 --- a/benchmarks/kernels/benchmark_cutlass_moe_fp8.py +++ b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py @@ -14,7 +14,7 @@ from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_confi from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8 from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk from vllm.platforms import current_platform -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser # Weight shapes for different models: [num_experts, topk, hidden_size, # intermediate_size] diff --git a/benchmarks/kernels/benchmark_device_communicators.py b/benchmarks/kernels/benchmark_device_communicators.py index df06a940e..b414efa6e 100644 --- a/benchmarks/kernels/benchmark_device_communicators.py +++ b/benchmarks/kernels/benchmark_device_communicators.py @@ -39,7 +39,7 @@ from vllm.distributed.device_communicators.pynccl_allocator import ( ) from vllm.distributed.device_communicators.symm_mem import SymmMemCommunicator from vllm.logger import init_logger -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser logger = init_logger(__name__) diff --git a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py index 14330ae6f..d525bd5fa 100644 --- a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py +++ b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py @@ -13,7 +13,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import ( fused_experts, fused_topk, ) -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser DEFAULT_MODELS = [ "nm-testing/Mixtral-8x7B-Instruct-v0.1", diff --git a/benchmarks/kernels/benchmark_layernorm.py b/benchmarks/kernels/benchmark_layernorm.py index bcfa64c3f..6fa5c2486 100644 --- a/benchmarks/kernels/benchmark_layernorm.py +++ b/benchmarks/kernels/benchmark_layernorm.py @@ -7,7 +7,7 @@ import torch from vllm.model_executor.layers.layernorm import RMSNorm from vllm.platforms import current_platform -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE diff --git a/benchmarks/kernels/benchmark_lora.py b/benchmarks/kernels/benchmark_lora.py index 39338f338..bf1512268 100644 --- a/benchmarks/kernels/benchmark_lora.py +++ b/benchmarks/kernels/benchmark_lora.py @@ -25,7 +25,7 @@ if HAS_TRITON: from vllm.lora.ops.triton_ops import LoRAKernelMeta, lora_expand, lora_shrink from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT -from 
vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser DEFAULT_MODELS = list(WEIGHT_SHAPES.keys()) DEFAULT_TP_SIZES = [1] diff --git a/benchmarks/kernels/benchmark_machete.py b/benchmarks/kernels/benchmark_machete.py index e1d5239f5..8787724d7 100644 --- a/benchmarks/kernels/benchmark_machete.py +++ b/benchmarks/kernels/benchmark_machete.py @@ -33,7 +33,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import ( quantize_weights, ) from vllm.scalar_type import ScalarType, scalar_types -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser DEFAULT_MODELS = ["meta-llama/Llama-3-8b", "meta-llama/Llama-2-70b-hf"] DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512, 1024] diff --git a/benchmarks/kernels/benchmark_marlin.py b/benchmarks/kernels/benchmark_marlin.py index 34cc45e94..12ca9214b 100644 --- a/benchmarks/kernels/benchmark_marlin.py +++ b/benchmarks/kernels/benchmark_marlin.py @@ -44,7 +44,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import ( sort_weights, ) from vllm.scalar_type import ScalarType, scalar_types -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser DEFAULT_MODELS = ["meta-llama/Llama-2-7b-hf/TP1"] DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192] diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index 9298d3b58..bc6cf83bc 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -22,7 +22,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import * from vllm.platforms import current_platform from vllm.transformers_utils.config import get_config from vllm.triton_utils import triton -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser FP8_DTYPE = current_platform.fp8_dtype() diff --git a/benchmarks/kernels/benchmark_moe_permute_unpermute.py b/benchmarks/kernels/benchmark_moe_permute_unpermute.py index 459eafa6d..efa5a7386 100644 --- a/benchmarks/kernels/benchmark_moe_permute_unpermute.py +++ b/benchmarks/kernels/benchmark_moe_permute_unpermute.py @@ -17,7 +17,7 @@ from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import ( ) from vllm.model_executor.layers.fused_moe.utils import _fp8_quantize from vllm.platforms import current_platform -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser FP8_DTYPE = current_platform.fp8_dtype() diff --git a/benchmarks/kernels/benchmark_mrope.py b/benchmarks/kernels/benchmark_mrope.py index b91473617..cb848d2bf 100644 --- a/benchmarks/kernels/benchmark_mrope.py +++ b/benchmarks/kernels/benchmark_mrope.py @@ -39,7 +39,7 @@ import torch from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.platforms import current_platform from vllm.transformers_utils.config import get_config -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser device = torch.device("cuda" if torch.cuda.is_available() else "cpu") diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index 1b1e71ade..46ab2a5fe 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -9,7 +9,7 @@ import torch from vllm import _custom_ops as ops from vllm.logger import init_logger from 
vllm.platforms import current_platform -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.utils.torch_utils import ( STR_DTYPE_TO_TORCH_DTYPE, create_kv_caches_with_random, diff --git a/benchmarks/kernels/benchmark_quant.py b/benchmarks/kernels/benchmark_quant.py index 61427a77b..3c2ac9128 100644 --- a/benchmarks/kernels/benchmark_quant.py +++ b/benchmarks/kernels/benchmark_quant.py @@ -7,7 +7,7 @@ import torch from vllm import _custom_ops as ops from vllm.platforms import current_platform -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE diff --git a/benchmarks/kernels/benchmark_reshape_and_cache.py b/benchmarks/kernels/benchmark_reshape_and_cache.py index e0ff09d4b..0d3aef0c6 100644 --- a/benchmarks/kernels/benchmark_reshape_and_cache.py +++ b/benchmarks/kernels/benchmark_reshape_and_cache.py @@ -9,7 +9,7 @@ from tabulate import tabulate from vllm import _custom_ops as ops from vllm.logger import init_logger from vllm.platforms import current_platform -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.utils.torch_utils import ( STR_DTYPE_TO_TORCH_DTYPE, create_kv_caches_with_random, diff --git a/benchmarks/kernels/benchmark_reshape_and_cache_flash.py b/benchmarks/kernels/benchmark_reshape_and_cache_flash.py index 29f1b2ccd..12f17ea57 100644 --- a/benchmarks/kernels/benchmark_reshape_and_cache_flash.py +++ b/benchmarks/kernels/benchmark_reshape_and_cache_flash.py @@ -12,7 +12,7 @@ from vllm.attention.ops.triton_reshape_and_cache_flash import ( ) from vllm.logger import init_logger from vllm.platforms import current_platform -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.utils.torch_utils import ( STR_DTYPE_TO_TORCH_DTYPE, create_kv_caches_with_random_flash, diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py index 24869c91a..29ef6409b 100644 --- a/benchmarks/kernels/benchmark_rope.py +++ b/benchmarks/kernels/benchmark_rope.py @@ -8,7 +8,7 @@ import torch from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding, get_rope from vllm.platforms import current_platform -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser def benchmark_rope_kernels_multi_lora( diff --git a/benchmarks/kernels/benchmark_trtllm_decode_attention.py b/benchmarks/kernels/benchmark_trtllm_decode_attention.py index f7cdc2579..29ce18234 100644 --- a/benchmarks/kernels/benchmark_trtllm_decode_attention.py +++ b/benchmarks/kernels/benchmark_trtllm_decode_attention.py @@ -8,7 +8,7 @@ from datetime import datetime import flashinfer import torch -from vllm.utils import round_up +from vllm.utils.math_utils import round_up FLOAT32_BYTES = torch.finfo(torch.float).bits // 8 FP8_DTYPE = torch.float8_e4m3fn diff --git a/benchmarks/kernels/benchmark_trtllm_prefill_attention.py b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py index 799335447..2a25d0374 100644 --- a/benchmarks/kernels/benchmark_trtllm_prefill_attention.py +++ b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py @@ -8,7 +8,7 @@ from datetime import datetime import flashinfer import torch -from vllm.utils import round_up +from vllm.utils.math_utils import round_up FLOAT32_BYTES = torch.finfo(torch.float).bits // 8 FP8_DTYPE = 
torch.float8_e4m3fn diff --git a/benchmarks/kernels/benchmark_w8a8_block_fp8.py b/benchmarks/kernels/benchmark_w8a8_block_fp8.py index 602fad181..ab54f8198 100644 --- a/benchmarks/kernels/benchmark_w8a8_block_fp8.py +++ b/benchmarks/kernels/benchmark_w8a8_block_fp8.py @@ -18,7 +18,7 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import ( ) from vllm.platforms import current_platform from vllm.triton_utils import triton -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser mp.set_start_method("spawn", force=True) diff --git a/benchmarks/kernels/graph_machete_bench.py b/benchmarks/kernels/graph_machete_bench.py index 9a4da0ef5..6964a3d3e 100644 --- a/benchmarks/kernels/graph_machete_bench.py +++ b/benchmarks/kernels/graph_machete_bench.py @@ -11,7 +11,7 @@ import regex as re import seaborn as sns from torch.utils.benchmark import Measurement as TMeasurement -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser if __name__ == "__main__": parser = FlexibleArgumentParser( diff --git a/benchmarks/overheads/benchmark_hashing.py b/benchmarks/overheads/benchmark_hashing.py index 0957a9c65..178599952 100644 --- a/benchmarks/overheads/benchmark_hashing.py +++ b/benchmarks/overheads/benchmark_hashing.py @@ -5,7 +5,7 @@ import cProfile import pstats from vllm import LLM, SamplingParams -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser # A very long prompt, total number of tokens is about 15k. LONG_PROMPT = ["You are an expert in large language models, aren't you?"] * 1000 diff --git a/examples/offline_inference/audio_language.py b/examples/offline_inference/audio_language.py index c4eed2037..53d69bbdb 100644 --- a/examples/offline_inference/audio_language.py +++ b/examples/offline_inference/audio_language.py @@ -18,7 +18,7 @@ from transformers import AutoTokenizer from vllm import LLM, EngineArgs, SamplingParams from vllm.assets.audio import AudioAsset from vllm.lora.request import LoRARequest -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser audio_assets = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")] question_per_audio_count = { diff --git a/examples/offline_inference/basic/chat.py b/examples/offline_inference/basic/chat.py index 9e7036fea..c42b00730 100644 --- a/examples/offline_inference/basic/chat.py +++ b/examples/offline_inference/basic/chat.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm import LLM, EngineArgs -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser def create_parser(): diff --git a/examples/offline_inference/basic/classify.py b/examples/offline_inference/basic/classify.py index dc3bc399c..b72ddde1f 100644 --- a/examples/offline_inference/basic/classify.py +++ b/examples/offline_inference/basic/classify.py @@ -4,7 +4,7 @@ from argparse import Namespace from vllm import LLM, EngineArgs -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser def parse_args(): diff --git a/examples/offline_inference/basic/embed.py b/examples/offline_inference/basic/embed.py index 158836728..eeb7137ff 100644 --- a/examples/offline_inference/basic/embed.py +++ b/examples/offline_inference/basic/embed.py @@ -4,7 +4,7 @@ from argparse import Namespace from vllm import LLM, EngineArgs -from 
vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser def parse_args(): diff --git a/examples/offline_inference/basic/generate.py b/examples/offline_inference/basic/generate.py index 6a41ef4d8..9650dcfe9 100644 --- a/examples/offline_inference/basic/generate.py +++ b/examples/offline_inference/basic/generate.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm import LLM, EngineArgs -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser def create_parser(): diff --git a/examples/offline_inference/basic/reward.py b/examples/offline_inference/basic/reward.py index aa173cf96..e95085686 100644 --- a/examples/offline_inference/basic/reward.py +++ b/examples/offline_inference/basic/reward.py @@ -4,7 +4,7 @@ from argparse import Namespace from vllm import LLM, EngineArgs -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser def parse_args(): diff --git a/examples/offline_inference/basic/score.py b/examples/offline_inference/basic/score.py index c9ca7a8bf..cbca50eb5 100644 --- a/examples/offline_inference/basic/score.py +++ b/examples/offline_inference/basic/score.py @@ -4,7 +4,7 @@ from argparse import Namespace from vllm import LLM, EngineArgs -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser def parse_args(): diff --git a/examples/offline_inference/encoder_decoder_multimodal.py b/examples/offline_inference/encoder_decoder_multimodal.py index 4a1b0c406..c1d6c6db5 100644 --- a/examples/offline_inference/encoder_decoder_multimodal.py +++ b/examples/offline_inference/encoder_decoder_multimodal.py @@ -13,7 +13,7 @@ from typing import NamedTuple from vllm import LLM, EngineArgs, PromptType, SamplingParams from vllm.assets.audio import AudioAsset -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser class ModelRequestData(NamedTuple): diff --git a/examples/offline_inference/llm_engine_example.py b/examples/offline_inference/llm_engine_example.py index d7f2a1633..d9215255a 100644 --- a/examples/offline_inference/llm_engine_example.py +++ b/examples/offline_inference/llm_engine_example.py @@ -8,7 +8,7 @@ for processing prompts with various sampling parameters. 
import argparse from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser def create_test_prompts() -> list[tuple[str, SamplingParams]]: diff --git a/examples/offline_inference/load_sharded_state.py b/examples/offline_inference/load_sharded_state.py index cc78c0cbb..52c2363c8 100644 --- a/examples/offline_inference/load_sharded_state.py +++ b/examples/offline_inference/load_sharded_state.py @@ -25,7 +25,7 @@ python load_sharded_state.py \ import dataclasses from vllm import LLM, EngineArgs, SamplingParams -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser def parse_args(): diff --git a/examples/offline_inference/pooling/embed_jina_embeddings_v3.py b/examples/offline_inference/pooling/embed_jina_embeddings_v3.py index 33a63deee..b117b0bd5 100644 --- a/examples/offline_inference/pooling/embed_jina_embeddings_v3.py +++ b/examples/offline_inference/pooling/embed_jina_embeddings_v3.py @@ -4,7 +4,7 @@ from argparse import Namespace from vllm import LLM, EngineArgs -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser def parse_args(): diff --git a/examples/offline_inference/pooling/embed_matryoshka_fy.py b/examples/offline_inference/pooling/embed_matryoshka_fy.py index 6871bcfcc..6544df852 100644 --- a/examples/offline_inference/pooling/embed_matryoshka_fy.py +++ b/examples/offline_inference/pooling/embed_matryoshka_fy.py @@ -4,7 +4,7 @@ from argparse import Namespace from vllm import LLM, EngineArgs, PoolingParams -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser def parse_args(): diff --git a/examples/offline_inference/pooling/multi_vector_retrieval.py b/examples/offline_inference/pooling/multi_vector_retrieval.py index 8b8892117..fa7d1c3ba 100644 --- a/examples/offline_inference/pooling/multi_vector_retrieval.py +++ b/examples/offline_inference/pooling/multi_vector_retrieval.py @@ -4,7 +4,7 @@ from argparse import Namespace from vllm import LLM, EngineArgs -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser def parse_args(): diff --git a/examples/offline_inference/pooling/ner.py b/examples/offline_inference/pooling/ner.py index f18742fac..b2dffdd6c 100644 --- a/examples/offline_inference/pooling/ner.py +++ b/examples/offline_inference/pooling/ner.py @@ -5,7 +5,7 @@ from argparse import Namespace from vllm import LLM, EngineArgs -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser def parse_args(): diff --git a/examples/offline_inference/profiling_tpu/profiling.py b/examples/offline_inference/profiling_tpu/profiling.py index dfcbd8c8d..3b127e4fd 100644 --- a/examples/offline_inference/profiling_tpu/profiling.py +++ b/examples/offline_inference/profiling_tpu/profiling.py @@ -13,7 +13,7 @@ from tqdm import tqdm from vllm import LLM, SamplingParams from vllm.engine.arg_utils import EngineArgs from vllm.inputs import PromptType -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser DURATION_MS = int(os.getenv("VLLM_TPU_PROFILE_DURATION_MS", 3000)) DELAY_MS = int(os.getenv("VLLM_TPU_PROFILE_DELAY_MS", 0)) diff --git a/examples/offline_inference/qwen2_5_omni/only_thinker.py b/examples/offline_inference/qwen2_5_omni/only_thinker.py index 
62effd5c8..6fbe1303f 100644 --- a/examples/offline_inference/qwen2_5_omni/only_thinker.py +++ b/examples/offline_inference/qwen2_5_omni/only_thinker.py @@ -13,7 +13,7 @@ from vllm.assets.audio import AudioAsset from vllm.assets.image import ImageAsset from vllm.assets.video import VideoAsset from vllm.multimodal.image import convert_image_mode -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser class QueryResult(NamedTuple): diff --git a/examples/offline_inference/save_sharded_state.py b/examples/offline_inference/save_sharded_state.py index 41d7a3492..e25f46b12 100644 --- a/examples/offline_inference/save_sharded_state.py +++ b/examples/offline_inference/save_sharded_state.py @@ -30,7 +30,7 @@ from pathlib import Path from vllm import LLM, EngineArgs from vllm.model_executor.model_loader import ShardedStateLoader -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser def parse_args(): diff --git a/examples/offline_inference/spec_decode.py b/examples/offline_inference/spec_decode.py index af65b6d38..f5f6e28b5 100644 --- a/examples/offline_inference/spec_decode.py +++ b/examples/offline_inference/spec_decode.py @@ -9,7 +9,7 @@ from vllm.inputs import TokensPrompt from vllm.v1.metrics.reader import Counter, Vector try: - from vllm.utils import FlexibleArgumentParser + from vllm.utils.argparse_utils import FlexibleArgumentParser except ImportError: from argparse import ArgumentParser as FlexibleArgumentParser diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index 7668b1091..c1ea95f8d 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -22,7 +22,7 @@ from vllm.assets.image import ImageAsset from vllm.assets.video import VideoAsset from vllm.lora.request import LoRARequest from vllm.multimodal.image import convert_image_mode -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser class ModelRequestData(NamedTuple): diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py index b9115121a..5cb47c150 100644 --- a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -18,7 +18,7 @@ from transformers import AutoProcessor, AutoTokenizer from vllm import LLM, EngineArgs, SamplingParams from vllm.lora.request import LoRARequest from vllm.multimodal.utils import fetch_image -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser QUESTION = "What is the content of each image?" 
IMAGE_URLS = [ diff --git a/examples/offline_inference/vision_language_pooling.py b/examples/offline_inference/vision_language_pooling.py index cf4695c25..63d85d5d9 100644 --- a/examples/offline_inference/vision_language_pooling.py +++ b/examples/offline_inference/vision_language_pooling.py @@ -18,7 +18,7 @@ from PIL.Image import Image from vllm import LLM, EngineArgs from vllm.entrypoints.score_utils import ScoreMultiModalParam from vllm.multimodal.utils import fetch_image -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser ROOT_DIR = Path(__file__).parent.parent.parent EXAMPLES_DIR = ROOT_DIR / "examples" diff --git a/examples/online_serving/openai_chat_completion_client_for_multimodal.py b/examples/online_serving/openai_chat_completion_client_for_multimodal.py index 5d515fbfb..9fa600ff4 100644 --- a/examples/online_serving/openai_chat_completion_client_for_multimodal.py +++ b/examples/online_serving/openai_chat_completion_client_for_multimodal.py @@ -26,7 +26,7 @@ import requests from openai import OpenAI from utils import get_first_model -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser # Modify OpenAI's API key and API base to use vLLM's API server. openai_api_key = "EMPTY" diff --git a/examples/others/tensorize_vllm_model.py b/examples/others/tensorize_vllm_model.py index 2601c9eff..3644a03b3 100644 --- a/examples/others/tensorize_vllm_model.py +++ b/examples/others/tensorize_vllm_model.py @@ -16,7 +16,7 @@ from vllm.model_executor.model_loader.tensorizer import ( tensorize_vllm_model, tensorizer_kwargs_arg, ) -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser logger = logging.getLogger() diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py index bcee0eb3d..472b1487e 100644 --- a/tests/engine/test_arg_utils.py +++ b/tests/engine/test_arg_utils.py @@ -22,7 +22,7 @@ from vllm.engine.arg_utils import ( optional_type, parse_type, ) -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser @pytest.mark.parametrize( diff --git a/tests/entrypoints/openai/test_cli_args.py b/tests/entrypoints/openai/test_cli_args.py index 0b9d171aa..b5d71c20b 100644 --- a/tests/entrypoints/openai/test_cli_args.py +++ b/tests/entrypoints/openai/test_cli_args.py @@ -7,7 +7,7 @@ import pytest from vllm.entrypoints.openai.cli_args import make_arg_parser, validate_parsed_serve_args from vllm.entrypoints.openai.serving_models import LoRAModulePath -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser from ...utils import VLLM_PATH diff --git a/tests/kernels/attention/test_deepgemm_attention.py b/tests/kernels/attention/test_deepgemm_attention.py index 74a5d8117..e2ae3b833 100644 --- a/tests/kernels/attention/test_deepgemm_attention.py +++ b/tests/kernels/attention/test_deepgemm_attention.py @@ -6,7 +6,6 @@ import pytest import torch from vllm.platforms import current_platform -from vllm.utils import cdiv from vllm.utils.deep_gemm import ( _ceil_to_ue8m0, calc_diff, @@ -16,6 +15,7 @@ from vllm.utils.deep_gemm import ( get_paged_mqa_logits_metadata, ) from vllm.utils.import_utils import has_deep_gemm +from vllm.utils.math_utils import cdiv def kv_cache_cast_to_fp8(x: torch.Tensor) -> torch.Tensor: diff --git a/tests/kernels/attention/test_flashinfer_trtllm_attention.py 
b/tests/kernels/attention/test_flashinfer_trtllm_attention.py index 00f06da5a..79981009c 100644 --- a/tests/kernels/attention/test_flashinfer_trtllm_attention.py +++ b/tests/kernels/attention/test_flashinfer_trtllm_attention.py @@ -10,7 +10,7 @@ from tests.kernels.quantization.nvfp4_utils import ( get_nvfp4_global_scale, ) from vllm.platforms import current_platform -from vllm.utils import round_up +from vllm.utils.math_utils import round_up if not current_platform.is_device_capability(100): pytest.skip( diff --git a/tests/kernels/attention/test_mla_decode_cpu.py b/tests/kernels/attention/test_mla_decode_cpu.py index 44f3e42e8..e1a7e50c2 100644 --- a/tests/kernels/attention/test_mla_decode_cpu.py +++ b/tests/kernels/attention/test_mla_decode_cpu.py @@ -7,7 +7,7 @@ from torch import Tensor import vllm._custom_ops as ops from vllm.platforms import current_platform -from vllm.utils import cdiv +from vllm.utils.math_utils import cdiv def ref_mla( diff --git a/tests/kernels/attention/test_triton_decode_attention.py b/tests/kernels/attention/test_triton_decode_attention.py index 01ba0951b..04085fe5f 100644 --- a/tests/kernels/attention/test_triton_decode_attention.py +++ b/tests/kernels/attention/test_triton_decode_attention.py @@ -5,7 +5,7 @@ import pytest import torch from vllm.attention.ops.triton_decode_attention import decode_attention_fwd -from vllm.utils import cdiv +from vllm.utils.math_utils import cdiv @pytest.mark.parametrize("B", [3, 5]) diff --git a/tests/kernels/moe/test_cutlass_grouped_gemm.py b/tests/kernels/moe/test_cutlass_grouped_gemm.py index 4c60241bd..1c10cb3b2 100644 --- a/tests/kernels/moe/test_cutlass_grouped_gemm.py +++ b/tests/kernels/moe/test_cutlass_grouped_gemm.py @@ -13,8 +13,8 @@ from tests.kernels.moe.utils import per_token_cast_to_fp8 from tests.kernels.utils import baseline_scaled_mm from vllm import _custom_ops as ops from vllm.platforms import current_platform -from vllm.utils import cdiv from vllm.utils.deep_gemm import per_block_cast_to_fp8 +from vllm.utils.math_utils import cdiv @pytest.mark.parametrize( diff --git a/tests/kernels/moe/test_gpt_oss_triton_kernels.py b/tests/kernels/moe/test_gpt_oss_triton_kernels.py index d4a79a7ef..dfd317bcf 100644 --- a/tests/kernels/moe/test_gpt_oss_triton_kernels.py +++ b/tests/kernels/moe/test_gpt_oss_triton_kernels.py @@ -27,7 +27,7 @@ from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import ( triton_kernel_moe_forward, ) from vllm.model_executor.layers.utils import shuffle_weight -from vllm.utils import round_up +from vllm.utils.math_utils import round_up def deshuffle(w: torch.Tensor): diff --git a/tests/kernels/moe/test_moe_align_block_size.py b/tests/kernels/moe/test_moe_align_block_size.py index bde0478d9..8975f00bd 100644 --- a/tests/kernels/moe/test_moe_align_block_size.py +++ b/tests/kernels/moe/test_moe_align_block_size.py @@ -13,7 +13,7 @@ from vllm.model_executor.layers.fused_moe.moe_align_block_size import ( moe_align_block_size, ) from vllm.platforms import current_platform -from vllm.utils import round_up +from vllm.utils.math_utils import round_up NUM_TOKENS = [1, 3, 256, 2256, 4096] NUM_EXPERTS = [32, 160, 256, 257] diff --git a/tests/kernels/moe/test_pplx_cutlass_moe.py b/tests/kernels/moe/test_pplx_cutlass_moe.py index ac7f3fc5e..a2de64974 100644 --- a/tests/kernels/moe/test_pplx_cutlass_moe.py +++ b/tests/kernels/moe/test_pplx_cutlass_moe.py @@ -13,7 +13,7 @@ from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassBatchedExper from 
vllm.model_executor.layers.fused_moe.fused_moe import fused_topk from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularKernel from vllm.platforms import current_platform -from vllm.utils import cdiv +from vllm.utils.math_utils import cdiv from ...utils import multi_gpu_test from .parallel_utils import ProcessGroupInfo, parallel_launch diff --git a/tests/kernels/moe/test_pplx_moe.py b/tests/kernels/moe/test_pplx_moe.py index e665c636f..0f0ed3326 100644 --- a/tests/kernels/moe/test_pplx_moe.py +++ b/tests/kernels/moe/test_pplx_moe.py @@ -45,7 +45,7 @@ from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( TopKWeightAndReduceDelegate, ) from vllm.platforms import current_platform -from vllm.utils import round_up +from vllm.utils.math_utils import round_up from ...utils import multi_gpu_test from .parallel_utils import ProcessGroupInfo, parallel_launch diff --git a/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py b/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py index 8b3bebb39..92e78ec23 100644 --- a/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py +++ b/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py @@ -8,7 +8,7 @@ from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import ( persistent_masked_m_silu_mul_quant, ) from vllm.platforms import current_platform -from vllm.utils import cdiv +from vllm.utils.math_utils import cdiv fp8_dtype = torch.float8_e4m3fn diff --git a/tests/kernels/moe/utils.py b/tests/kernels/moe/utils.py index 65ce4073a..c7e6c4240 100644 --- a/tests/kernels/moe/utils.py +++ b/tests/kernels/moe/utils.py @@ -16,8 +16,8 @@ from vllm.model_executor.layers.fused_moe.fused_batched_moe import ( ) from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularKernel from vllm.model_executor.layers.fused_moe.utils import moe_kernel_quantize_input -from vllm.utils import round_up from vllm.utils.deep_gemm import per_block_cast_to_fp8 +from vllm.utils.math_utils import round_up def triton_moe( diff --git a/tests/kernels/quant_utils.py b/tests/kernels/quant_utils.py index 34ce91585..830d43569 100644 --- a/tests/kernels/quant_utils.py +++ b/tests/kernels/quant_utils.py @@ -6,7 +6,7 @@ import torch from vllm.model_executor.layers.quantization.utils.quant_utils import group_broadcast from vllm.platforms import current_platform -from vllm.utils import round_up +from vllm.utils.math_utils import round_up # Using the default value (240.0) from pytorch will cause accuracy # issue on dynamic quantization models. Here use 224.0 for rocm. 
diff --git a/tests/kernels/quantization/test_cutlass_scaled_mm.py b/tests/kernels/quantization/test_cutlass_scaled_mm.py index 835c067e2..de595b0a3 100644 --- a/tests/kernels/quantization/test_cutlass_scaled_mm.py +++ b/tests/kernels/quantization/test_cutlass_scaled_mm.py @@ -13,7 +13,7 @@ import torch from tests.kernels.utils import baseline_scaled_mm, opcheck, to_fp8, to_int8 from vllm import _custom_ops as ops from vllm.platforms import current_platform -from vllm.utils import cdiv +from vllm.utils.math_utils import cdiv MNK_FACTORS = [ (1, 256, 128), diff --git a/tests/v1/attention/test_attention_backends.py b/tests/v1/attention/test_attention_backends.py index 12f7fc66d..351cff246 100644 --- a/tests/v1/attention/test_attention_backends.py +++ b/tests/v1/attention/test_attention_backends.py @@ -18,7 +18,7 @@ from tests.v1.attention.utils import ( from vllm.attention.backends.registry import _Backend from vllm.config import ModelConfig from vllm.platforms import current_platform -from vllm.utils import cdiv +from vllm.utils.math_utils import cdiv from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, is_torch_equal_or_newer from vllm.v1.attention.backends.utils import ( CommonAttentionMetadata, diff --git a/tests/v1/attention/test_mla_backends.py b/tests/v1/attention/test_mla_backends.py index 81fd6433b..1a256a6e1 100644 --- a/tests/v1/attention/test_mla_backends.py +++ b/tests/v1/attention/test_mla_backends.py @@ -22,7 +22,7 @@ from vllm import _custom_ops as ops from vllm.attention.backends.registry import _Backend from vllm.attention.ops.flashmla import is_flashmla_dense_supported from vllm.config.vllm import set_current_vllm_config -from vllm.utils import cdiv +from vllm.utils.math_utils import cdiv from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE from vllm.v1.attention.backends.utils import CommonAttentionMetadata from vllm.v1.kv_cache_interface import FullAttentionSpec diff --git a/tests/v1/attention/test_sparse_mla_backends.py b/tests/v1/attention/test_sparse_mla_backends.py index 25de65a56..02324d2ac 100644 --- a/tests/v1/attention/test_sparse_mla_backends.py +++ b/tests/v1/attention/test_sparse_mla_backends.py @@ -23,7 +23,7 @@ from tests.v1.attention.utils import ( from vllm import _custom_ops as ops from vllm.attention.ops import flashmla from vllm.model_executor.layers.linear import ColumnParallelLinear -from vllm.utils import cdiv +from vllm.utils.math_utils import cdiv from vllm.v1.attention.backends.mla.flashmla_sparse import FlashMLASparseBackend from vllm.v1.attention.backends.mla.indexer import split_prefill_chunks diff --git a/tests/v1/engine/test_engine_args.py b/tests/v1/engine/test_engine_args.py index 943402e42..cf632f146 100644 --- a/tests/v1/engine/test_engine_args.py +++ b/tests/v1/engine/test_engine_args.py @@ -8,7 +8,7 @@ import pytest from vllm.config import VllmConfig from vllm.engine.arg_utils import EngineArgs from vllm.usage.usage_lib import UsageContext -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser def test_prefix_caching_from_cli(): diff --git a/vllm/attention/layers/cross_attention.py b/vllm/attention/layers/cross_attention.py index a40a66308..4b89c28f0 100644 --- a/vllm/attention/layers/cross_attention.py +++ b/vllm/attention/layers/cross_attention.py @@ -16,7 +16,7 @@ from vllm.attention.layer import Attention from vllm.attention.selector import get_attn_backend from vllm.config import CacheConfig, VllmConfig from vllm.logger import init_logger -from vllm.utils import 
cdiv +from vllm.utils.math_utils import cdiv from vllm.v1.attention.backends.utils import ( CommonAttentionMetadata, subclass_attention_backend, diff --git a/vllm/attention/ops/pallas_kv_cache_update.py b/vllm/attention/ops/pallas_kv_cache_update.py index d0d836cc6..51214b022 100644 --- a/vllm/attention/ops/pallas_kv_cache_update.py +++ b/vllm/attention/ops/pallas_kv_cache_update.py @@ -7,7 +7,7 @@ import jax from jax.experimental import pallas as pl from jax.experimental.pallas import tpu as pltpu -from vllm.utils import cdiv +from vllm.utils.math_utils import cdiv def _kv_cache_update_kernel( diff --git a/vllm/attention/ops/rocm_aiter_paged_attn.py b/vllm/attention/ops/rocm_aiter_paged_attn.py index 5c1ce68dd..bcd1e2cd5 100644 --- a/vllm/attention/ops/rocm_aiter_paged_attn.py +++ b/vllm/attention/ops/rocm_aiter_paged_attn.py @@ -6,7 +6,7 @@ import torch from vllm.attention.ops.paged_attn import PagedAttention from vllm.platforms import current_platform -from vllm.utils import cdiv +from vllm.utils.math_utils import cdiv FP8_DTYPE = current_platform.fp8_dtype() diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index eb8cd64c3..55e24bd5d 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -58,7 +58,7 @@ except ImportError: librosa = PlaceholderModule("librosa") try: - from vllm.utils import FlexibleArgumentParser + from vllm.utils.argparse_utils import FlexibleArgumentParser except ImportError: from argparse import ArgumentParser as FlexibleArgumentParser diff --git a/vllm/compilation/fusion_attn.py b/vllm/compilation/fusion_attn.py index aaf19e6d4..4f44faece 100644 --- a/vllm/compilation/fusion_attn.py +++ b/vllm/compilation/fusion_attn.py @@ -19,7 +19,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import ( kStaticTensorScale, ) from vllm.platforms import current_platform -from vllm.utils import round_up +from vllm.utils.math_utils import round_up from .fusion import QUANT_OPS, empty_bf16, empty_fp32, empty_i32 from .fx_utils import is_func diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index c0ea84b6e..617c464cf 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -82,7 +82,8 @@ from vllm.transformers_utils.config import ( maybe_override_with_speculators, ) from vllm.transformers_utils.utils import check_gguf_file -from vllm.utils import FlexibleArgumentParser, is_in_ray_actor +from vllm.utils import is_in_ray_actor +from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.utils.mem_constants import GiB_bytes from vllm.utils.network_utils import get_ip from vllm.v1.sample.logits_processor import LogitsProcessor diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py index 53dab90f4..184cc47ce 100644 --- a/vllm/entrypoints/api_server.py +++ b/vllm/entrypoints/api_server.py @@ -26,7 +26,8 @@ from vllm.entrypoints.utils import with_cancellation from vllm.logger import init_logger from vllm.sampling_params import SamplingParams from vllm.usage.usage_lib import UsageContext -from vllm.utils import FlexibleArgumentParser, random_uuid, set_ulimit +from vllm.utils import random_uuid, set_ulimit +from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.version import __version__ as VLLM_VERSION logger = init_logger("vllm.entrypoints.api_server") diff --git a/vllm/entrypoints/cli/benchmark/main.py b/vllm/entrypoints/cli/benchmark/main.py index 7a1d24776..2ff98577c 100644 --- a/vllm/entrypoints/cli/benchmark/main.py +++ 
b/vllm/entrypoints/cli/benchmark/main.py @@ -9,7 +9,7 @@ from vllm.entrypoints.cli.types import CLISubcommand from vllm.entrypoints.utils import VLLM_SUBCMD_PARSER_EPILOG if typing.TYPE_CHECKING: - from vllm.utils import FlexibleArgumentParser + from vllm.utils.argparse_utils import FlexibleArgumentParser else: FlexibleArgumentParser = argparse.ArgumentParser diff --git a/vllm/entrypoints/cli/collect_env.py b/vllm/entrypoints/cli/collect_env.py index e47dce0a4..ad943a63d 100644 --- a/vllm/entrypoints/cli/collect_env.py +++ b/vllm/entrypoints/cli/collect_env.py @@ -8,7 +8,7 @@ from vllm.collect_env import main as collect_env_main from vllm.entrypoints.cli.types import CLISubcommand if typing.TYPE_CHECKING: - from vllm.utils import FlexibleArgumentParser + from vllm.utils.argparse_utils import FlexibleArgumentParser else: FlexibleArgumentParser = argparse.ArgumentParser diff --git a/vllm/entrypoints/cli/main.py b/vllm/entrypoints/cli/main.py index 213a46603..a3e73eb7a 100644 --- a/vllm/entrypoints/cli/main.py +++ b/vllm/entrypoints/cli/main.py @@ -20,7 +20,7 @@ def main(): import vllm.entrypoints.cli.run_batch import vllm.entrypoints.cli.serve from vllm.entrypoints.utils import VLLM_SUBCMD_PARSER_EPILOG, cli_env_setup - from vllm.utils import FlexibleArgumentParser + from vllm.utils.argparse_utils import FlexibleArgumentParser CMD_MODULES = [ vllm.entrypoints.cli.openai, diff --git a/vllm/entrypoints/cli/openai.py b/vllm/entrypoints/cli/openai.py index a27c6fe66..99a8759c8 100644 --- a/vllm/entrypoints/cli/openai.py +++ b/vllm/entrypoints/cli/openai.py @@ -13,7 +13,7 @@ from openai.types.chat import ChatCompletionMessageParam from vllm.entrypoints.cli.types import CLISubcommand if TYPE_CHECKING: - from vllm.utils import FlexibleArgumentParser + from vllm.utils.argparse_utils import FlexibleArgumentParser else: FlexibleArgumentParser = argparse.ArgumentParser diff --git a/vllm/entrypoints/cli/run_batch.py b/vllm/entrypoints/cli/run_batch.py index 4b18ceb52..64d1bec1f 100644 --- a/vllm/entrypoints/cli/run_batch.py +++ b/vllm/entrypoints/cli/run_batch.py @@ -11,7 +11,7 @@ from vllm.entrypoints.utils import VLLM_SUBCMD_PARSER_EPILOG from vllm.logger import init_logger if typing.TYPE_CHECKING: - from vllm.utils import FlexibleArgumentParser + from vllm.utils.argparse_utils import FlexibleArgumentParser else: FlexibleArgumentParser = argparse.ArgumentParser diff --git a/vllm/entrypoints/cli/types.py b/vllm/entrypoints/cli/types.py index f4eeb5b3c..f22b844b4 100644 --- a/vllm/entrypoints/cli/types.py +++ b/vllm/entrypoints/cli/types.py @@ -5,7 +5,7 @@ import argparse import typing if typing.TYPE_CHECKING: - from vllm.utils import FlexibleArgumentParser + from vllm.utils.argparse_utils import FlexibleArgumentParser else: FlexibleArgumentParser = argparse.ArgumentParser diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index 99d6cbaa8..1a775d3d6 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -29,7 +29,7 @@ from vllm.entrypoints.constants import ( from vllm.entrypoints.openai.serving_models import LoRAModulePath from vllm.entrypoints.openai.tool_parsers import ToolParserManager from vllm.logger import init_logger -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser logger = init_logger(__name__) diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index da036e30b..4caccf88f 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ 
b/vllm/entrypoints/openai/run_batch.py @@ -32,7 +32,8 @@ from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingM from vllm.entrypoints.openai.serving_score import ServingScores from vllm.logger import init_logger from vllm.reasoning import ReasoningParserManager -from vllm.utils import FlexibleArgumentParser, random_uuid +from vllm.utils import random_uuid +from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.version import __version__ as VLLM_VERSION logger = init_logger(__name__) diff --git a/vllm/entrypoints/utils.py b/vllm/entrypoints/utils.py index ec5fb3b56..088bb679f 100644 --- a/vllm/entrypoints/utils.py +++ b/vllm/entrypoints/utils.py @@ -31,7 +31,7 @@ from vllm.entrypoints.openai.serving_models import LoRAModulePath from vllm.logger import init_logger from vllm.platforms import current_platform from vllm.transformers_utils.tokenizers import MistralTokenizer -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser logger = init_logger(__name__) diff --git a/vllm/model_executor/layers/fla/ops/layernorm_guard.py b/vllm/model_executor/layers/fla/ops/layernorm_guard.py index 307d0859c..89352d12b 100644 --- a/vllm/model_executor/layers/fla/ops/layernorm_guard.py +++ b/vllm/model_executor/layers/fla/ops/layernorm_guard.py @@ -21,7 +21,7 @@ import torch.nn.functional as F from einops import rearrange from vllm.triton_utils import tl, triton -from vllm.utils import cdiv, next_power_of_2 +from vllm.utils.math_utils import cdiv, next_power_of_2 from .utils import input_guard diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py index 200212dfb..5403d4e62 100644 --- a/vllm/model_executor/layers/fused_moe/config.py +++ b/vllm/model_executor/layers/fused_moe/config.py @@ -14,9 +14,9 @@ from vllm.model_executor.layers.quantization.utils.ocp_mx_utils import ( OCP_MX_Scheme, ) from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape -from vllm.utils import cdiv from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe from vllm.utils.import_utils import has_triton_kernels +from vllm.utils.math_utils import cdiv logger = init_logger(__name__) diff --git a/vllm/model_executor/layers/fused_moe/deep_gemm_utils.py b/vllm/model_executor/layers/fused_moe/deep_gemm_utils.py index 85294f6ae..6cca95412 100644 --- a/vllm/model_executor/layers/fused_moe/deep_gemm_utils.py +++ b/vllm/model_executor/layers/fused_moe/deep_gemm_utils.py @@ -10,8 +10,8 @@ import torch import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm.model_executor.layers.fused_moe.utils import count_expert_num_tokens from vllm.triton_utils import tl, triton -from vllm.utils import round_up from vllm.utils.deep_gemm import get_mk_alignment_for_contiguous_layout +from vllm.utils.math_utils import round_up def expert_num_tokens_round_up_and_sum( diff --git a/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py index a5c5c115f..13866a5c5 100644 --- a/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py @@ -12,7 +12,7 @@ from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( TopKWeightAndReduceDelegate, ) from vllm.model_executor.layers.fused_moe.utils import moe_kernel_quantize_input -from vllm.utils import round_up +from vllm.utils.math_utils import round_up 
 from vllm.v1.worker.ubatching import (
     dbo_current_ubatch_id,
     dbo_enabled,
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index 71393f4f6..c144aa23e 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -55,9 +55,9 @@ from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform
 from vllm.platforms.interface import CpuArchEnum
-from vllm.utils import cdiv, round_up
 from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe
 from vllm.utils.import_utils import has_deep_ep, has_pplx
+from vllm.utils.math_utils import cdiv, round_up
 from vllm.utils.torch_utils import current_stream, direct_register_custom_op
 from vllm.v1.worker.ubatching import dbo_current_ubatch_id
diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py
index 8514b6355..3b5916f8c 100644
--- a/vllm/model_executor/layers/fused_moe/modular_kernel.py
+++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py
@@ -16,7 +16,7 @@ from vllm.model_executor.layers.fused_moe.utils import (
     count_expert_num_tokens,
     disable_inplace,
 )
-from vllm.utils import cdiv
+from vllm.utils.math_utils import cdiv
 from vllm.v1.worker.ubatching import (
     dbo_current_ubatch_id,
     dbo_enabled,
diff --git a/vllm/model_executor/layers/fused_moe/moe_align_block_size.py b/vllm/model_executor/layers/fused_moe/moe_align_block_size.py
index f4d8a86c0..7f6155997 100644
--- a/vllm/model_executor/layers/fused_moe/moe_align_block_size.py
+++ b/vllm/model_executor/layers/fused_moe/moe_align_block_size.py
@@ -5,7 +5,7 @@ import torch
 from vllm import _custom_ops as ops
 from vllm.triton_utils import triton
-from vllm.utils import round_up
+from vllm.utils.math_utils import round_up
 def moe_align_block_size(
diff --git a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
index 0e77fa54c..2766a2c22 100644
--- a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
+++ b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
@@ -15,7 +15,7 @@ from vllm.model_executor.layers.fused_moe.utils import (
     _validate_scale_shape,
     moe_kernel_quantize_input,
 )
-from vllm.utils import cdiv, round_up
+from vllm.utils.math_utils import cdiv, round_up
 logger = init_logger(__name__)
diff --git a/vllm/model_executor/layers/fused_moe/utils.py b/vllm/model_executor/layers/fused_moe/utils.py
index 0627ea50d..1f946d67a 100644
--- a/vllm/model_executor/layers/fused_moe/utils.py
+++ b/vllm/model_executor/layers/fused_moe/utils.py
@@ -23,8 +23,8 @@ from vllm.model_executor.layers.quantization.utils.mxfp8_utils import (
     mxfp8_e4m3_quantize,
 )
 from vllm.triton_utils import tl, triton
-from vllm.utils import cdiv
 from vllm.utils.flashinfer import flashinfer_fp4_quantize
+from vllm.utils.math_utils import cdiv
 from vllm.utils.torch_utils import is_torch_equal_or_newer
diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py
index 2890a2c6d..06b4f9271 100644
--- a/vllm/model_executor/model_loader/tensorizer.py
+++ b/vllm/model_executor/model_loader/tensorizer.py
@@ -26,7 +26,7 @@ from vllm.config import ModelConfig, ParallelConfig, VllmConfig, set_current_vll
 from vllm.logger import init_logger
 from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
 from vllm.platforms import current_platform
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.utils.import_utils import PlaceholderModule
 if TYPE_CHECKING:
diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py
index d4367be1c..d2f9f1b0b 100644
--- a/vllm/model_executor/models/config.py
+++ b/vllm/model_executor/models/config.py
@@ -6,7 +6,7 @@ from typing import TYPE_CHECKING
 import vllm.envs as envs
 from vllm.logger import init_logger
 from vllm.model_executor.models import ModelRegistry
-from vllm.utils import cdiv, round_up
+from vllm.utils.math_utils import cdiv, round_up
 from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
 from vllm.v1.kv_cache_interface import FullAttentionSpec, MambaSpec
diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py
index 846c8e766..44f6824b5 100644
--- a/vllm/model_executor/models/gpt_oss.py
+++ b/vllm/model_executor/models/gpt_oss.py
@@ -32,7 +32,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.utils import sequence_parallel_chunk
 from vllm.sequence import IntermediateTensors
-from vllm.utils import cdiv
+from vllm.utils.math_utils import cdiv
 from .interfaces import SupportsEagle3, SupportsLoRA, SupportsPP
 from .utils import (
diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py
index e86fc23c7..069078850 100644
--- a/vllm/model_executor/models/utils.py
+++ b/vllm/model_executor/models/utils.py
@@ -22,7 +22,7 @@ from vllm.logger import init_logger
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.multimodal import NestedTensors
 from vllm.sequence import IntermediateTensors
-from vllm.utils import cdiv
+from vllm.utils.math_utils import cdiv
 from vllm.utils.platform_utils import (
     is_pin_memory_available,
     is_uva_available,
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index 098e9058f..1fb3aba9b 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -22,7 +22,7 @@ if TYPE_CHECKING:
     from vllm.inputs import ProcessorInputs, PromptType
     from vllm.pooling_params import PoolingParams
     from vllm.sampling_params import SamplingParams
-    from vllm.utils import FlexibleArgumentParser
+    from vllm.utils.argparse_utils import FlexibleArgumentParser
 else:
     FlexibleArgumentParser = object
diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py
index 38da04102..9cedea346 100644
--- a/vllm/utils/__init__.py
+++ b/vllm/utils/__init__.py
@@ -6,42 +6,56 @@ import datetime
 import enum
 import getpass
 import inspect
-import json
 import multiprocessing
 import os
 import signal
 import sys
 import tempfile
-import textwrap
 import threading
 import traceback
 import uuid
 import warnings
 import weakref
-from argparse import (
-    Action,
-    ArgumentDefaultsHelpFormatter,
-    ArgumentParser,
-    ArgumentTypeError,
-    RawDescriptionHelpFormatter,
-    _ArgumentGroup,
-)
-from collections import defaultdict
 from collections.abc import Callable
-from functools import partial, wraps
+from functools import cache, partial, wraps
 from typing import TYPE_CHECKING, Any, TypeVar
 import cloudpickle
 import psutil
-import regex as re
 import torch
-import yaml
 import vllm.envs as envs
 from vllm.logger import enable_trace_function_call, init_logger
 from vllm.ray.lazy_utils import is_in_ray_actor
+
+# Import utilities from specialized modules for backward compatibility
+from vllm.utils.argparse_utils import (
+    FlexibleArgumentParser,
+    SortedHelpFormatter,
+    StoreBoolean,
+)
+from vllm.utils.math_utils import (
+    cdiv,
+    next_power_of_2,
+    prev_power_of_2,
+    round_down,
+    round_up,
+)
 from vllm.utils.platform_utils import cuda_is_initialized, xpu_is_initialized
+__all__ = [
+    # Argparse utilities
+    "FlexibleArgumentParser",
+    "SortedHelpFormatter",
+    "StoreBoolean",
+    # Math utilities
+    "cdiv",
+    "next_power_of_2",
+    "prev_power_of_2",
+    "round_down",
+    "round_up",
+]
+
 _DEPRECATED_MAPPINGS = {
     "cprofile": "profiling",
     "cprofile_context": "profiling",
@@ -139,31 +153,31 @@ def random_uuid() -> str:
     return str(uuid.uuid4().hex)
-def cdiv(a: int, b: int) -> int:
-    """Ceiling division."""
-    return -(a // -b)
+def update_environment_variables(envs: dict[str, str]):
+    for k, v in envs.items():
+        if k in os.environ and os.environ[k] != v:
+            logger.warning(
+                "Overwriting environment variable %s from '%s' to '%s'",
+                k,
+                os.environ[k],
+                v,
+            )
+        os.environ[k] = v
-def next_power_of_2(n) -> int:
-    """The next power of 2 (inclusive)"""
-    if n < 1:
-        return 1
-    return 1 << (n - 1).bit_length()
+@cache
+def is_pin_memory_available() -> bool:
+    from vllm.platforms import current_platform
+
+    return current_platform.is_pin_memory_available()
-def prev_power_of_2(n: int) -> int:
-    """The previous power of 2 (inclusive)"""
-    if n <= 0:
-        return 0
-    return 1 << (n.bit_length() - 1)
-
-
-def round_up(x: int, y: int) -> int:
-    return ((x + y - 1) // y) * y
-
-
-def round_down(x: int, y: int) -> int:
-    return (x // y) * y
+@cache
+def is_uva_available() -> bool:
+    """Check if Unified Virtual Addressing (UVA) is available."""
+    # UVA requires pinned memory.
+    # TODO: Add more requirements for UVA if needed.
+    return is_pin_memory_available()
 # TODO: This function can be removed if transformer_modules classes are
@@ -214,488 +228,6 @@ def weak_bind(
     return weak_bound
-class StoreBoolean(Action):
-    def __call__(self, parser, namespace, values, option_string=None):
-        if values.lower() == "true":
-            setattr(namespace, self.dest, True)
-        elif values.lower() == "false":
-            setattr(namespace, self.dest, False)
-        else:
-            raise ValueError(
-                f"Invalid boolean value: {values}. Expected 'true' or 'false'."
-            )
-
-
-class SortedHelpFormatter(ArgumentDefaultsHelpFormatter, RawDescriptionHelpFormatter):
-    """SortedHelpFormatter that sorts arguments by their option strings."""
-
-    def _split_lines(self, text, width):
-        """
-        1. Sentences split across lines have their single newlines removed.
-        2. Paragraphs and explicit newlines are split into separate lines.
-        3. Each line is wrapped to the specified width (width of terminal).
-        """
-        # The patterns also include whitespace after the newline
-        single_newline = re.compile(r"(? to the front, e,g:
-            # [Before]
-            # vllm serve -tp 2 --model --enforce-eager --port 8001
-            # [After]
-            # vllm serve -tp 2 --enforce-eager --port 8001
-            args = [
-                "serve",
-                model_tag,
-                *args[1:model_idx],
-                *args[rest_start_idx:],
-            ]
-            print("args", args)
-        except StopIteration:
-            pass
-
-        if "--config" in args:
-            args = self._pull_args_from_config(args)
-
-        def repl(match: re.Match) -> str:
-            """Replaces underscores with dashes in the matched string."""
-            return match.group(0).replace("_", "-")
-
-        # Everything between the first -- and the first .
-        pattern = re.compile(r"(?<=--)[^\.]*")
-
-        # Convert underscores to dashes and vice versa in argument names
-        processed_args = list[str]()
-        for i, arg in enumerate(args):
-            if arg.startswith("--help="):
-                FlexibleArgumentParser._search_keyword = arg.split("=", 1)[-1].lower()
-                processed_args.append("--help")
-            elif arg.startswith("--"):
-                if "=" in arg:
-                    key, value = arg.split("=", 1)
-                    key = pattern.sub(repl, key, count=1)
-                    processed_args.append(f"{key}={value}")
-                else:
-                    key = pattern.sub(repl, arg, count=1)
-                    processed_args.append(key)
-            elif arg.startswith("-O") and arg != "-O" and arg[2] != ".":
-                # allow -O flag to be used without space, e.g. -O3 or -Odecode
-                # -O.<...> handled later
-                # also handle -O= here
-                mode = arg[3:] if arg[2] == "=" else arg[2:]
-                processed_args.append(f"-O.mode={mode}")
-            elif (
-                arg == "-O"
-                and i + 1 < len(args)
-                and args[i + 1] in {"0", "1", "2", "3"}
-            ):
-                # Convert -O to -O.mode
-                processed_args.append("-O.mode")
-            else:
-                processed_args.append(arg)
-
-        def create_nested_dict(keys: list[str], value: str) -> dict[str, Any]:
-            """Creates a nested dictionary from a list of keys and a value.
-
-            For example, `keys = ["a", "b", "c"]` and `value = 1` will create:
-            `{"a": {"b": {"c": 1}}}`
-            """
-            nested_dict: Any = value
-            for key in reversed(keys):
-                nested_dict = {key: nested_dict}
-            return nested_dict
-
-        def recursive_dict_update(
-            original: dict[str, Any],
-            update: dict[str, Any],
-        ) -> set[str]:
-            """Recursively updates a dictionary with another dictionary.
-            Returns a set of duplicate keys that were overwritten.
-            """
-            duplicates = set[str]()
-            for k, v in update.items():
-                if isinstance(v, dict) and isinstance(original.get(k), dict):
-                    nested_duplicates = recursive_dict_update(original[k], v)
-                    duplicates |= {f"{k}.{d}" for d in nested_duplicates}
-                elif isinstance(v, list) and isinstance(original.get(k), list):
-                    original[k] += v
-                else:
-                    if k in original:
-                        duplicates.add(k)
-                    original[k] = v
-            return duplicates
-
-        delete = set[int]()
-        dict_args = defaultdict[str, dict[str, Any]](dict)
-        duplicates = set[str]()
-        for i, processed_arg in enumerate(processed_args):
-            if i in delete:  # skip if value from previous arg
-                continue
-
-            if processed_arg.startswith("-") and "." in processed_arg:
-                if "=" in processed_arg:
-                    processed_arg, value_str = processed_arg.split("=", 1)
-                    if "." not in processed_arg:
-                        # False positive, '.' was only in the value
-                        continue
-                else:
-                    value_str = processed_args[i + 1]
-                    delete.add(i + 1)
-
-                if processed_arg.endswith("+"):
-                    processed_arg = processed_arg[:-1]
-                    value_str = json.dumps(list(value_str.split(",")))
-
-                key, *keys = processed_arg.split(".")
-                try:
-                    value = json.loads(value_str)
-                except json.decoder.JSONDecodeError:
-                    value = value_str
-
-                # Merge all values with the same key into a single dict
-                arg_dict = create_nested_dict(keys, value)
-                arg_duplicates = recursive_dict_update(dict_args[key], arg_dict)
-                duplicates |= {f"{key}.{d}" for d in arg_duplicates}
-                delete.add(i)
-        # Filter out the dict args we set to None
-        processed_args = [a for i, a in enumerate(processed_args) if i not in delete]
-        if duplicates:
-            logger.warning("Found duplicate keys %s", ", ".join(duplicates))
-
-        # Add the dict args back as if they were originally passed as JSON
-        for dict_arg, dict_value in dict_args.items():
-            processed_args.append(dict_arg)
-            processed_args.append(json.dumps(dict_value))
-
-        return super().parse_args(processed_args, namespace)
-
-    def check_port(self, value):
-        try:
-            value = int(value)
-        except ValueError:
-            msg = "Port must be an integer"
-            raise ArgumentTypeError(msg) from None
-
-        if not (1024 <= value <= 65535):
-            raise ArgumentTypeError("Port must be between 1024 and 65535")
-
-        return value
-
-    def _pull_args_from_config(self, args: list[str]) -> list[str]:
-        """Method to pull arguments specified in the config file
-        into the command-line args variable.
-
-        The arguments in config file will be inserted between
-        the argument list.
-
-        example:
-        ```yaml
-        port: 12323
-        tensor-parallel-size: 4
-        ```
-        ```python
-        $: vllm {serve,chat,complete} "facebook/opt-12B" \
-            --config config.yaml -tp 2
-        $: args = [
-            "serve,chat,complete",
-            "facebook/opt-12B",
-            '--config', 'config.yaml',
-            '-tp', '2'
-        ]
-        $: args = [
-            "serve,chat,complete",
-            "facebook/opt-12B",
-            '--port', '12323',
-            '--tensor-parallel-size', '4',
-            '-tp', '2'
-        ]
-        ```
-
-        Please note how the config args are inserted after the sub command.
-        this way the order of priorities is maintained when these are args
-        parsed by super().
-        """
-        assert args.count("--config") <= 1, "More than one config file specified!"
-
-        index = args.index("--config")
-        if index == len(args) - 1:
-            raise ValueError(
-                "No config file specified! \
-                Please check your command-line arguments."
-            )
-
-        file_path = args[index + 1]
-
-        config_args = self.load_config_file(file_path)
-
-        # 0th index might be the sub command {serve,chat,complete,...}
-        # optionally followed by model_tag (only for serve)
-        # followed by config args
-        # followed by rest of cli args.
-        # maintaining this order will enforce the precedence
-        # of cli > config > defaults
-        if args[0].startswith("-"):
-            # No sub command (e.g., api_server entry point)
-            args = config_args + args[0:index] + args[index + 2 :]
-        elif args[0] == "serve":
-            model_in_cli = len(args) > 1 and not args[1].startswith("-")
-            model_in_config = any(arg == "--model" for arg in config_args)
-
-            if not model_in_cli and not model_in_config:
-                raise ValueError(
-                    "No model specified! Please specify model either "
-                    "as a positional argument or in a config file."
-                )
-
-            if model_in_cli:
-                # Model specified as positional arg, keep CLI version
-                args = (
-                    [args[0]]
-                    + [args[1]]
-                    + config_args
-                    + args[2:index]
-                    + args[index + 2 :]
-                )
-            else:
-                # No model in CLI, use config if available
-                args = [args[0]] + config_args + args[1:index] + args[index + 2 :]
-        else:
-            args = [args[0]] + config_args + args[1:index] + args[index + 2 :]
-
-        return args
-
-    def load_config_file(self, file_path: str) -> list[str]:
-        """Loads a yaml file and returns the key value pairs as a
-        flattened list with argparse like pattern
-        ```yaml
-        port: 12323
-        tensor-parallel-size: 4
-        ```
-        returns:
-            processed_args: list[str] = [
-                '--port': '12323',
-                '--tensor-parallel-size': '4'
-            ]
-        """
-        extension: str = file_path.split(".")[-1]
-        if extension not in ("yaml", "yml"):
-            raise ValueError(
-                "Config file must be of a yaml/yml type.\
-                %s supplied",
-                extension,
-            )
-
-        # only expecting a flat dictionary of atomic types
-        processed_args: list[str] = []
-
-        config: dict[str, int | str] = {}
-        try:
-            with open(file_path) as config_file:
-                config = yaml.safe_load(config_file)
-        except Exception as ex:
-            logger.error(
-                "Unable to read the config file at %s. \
-                Make sure path is correct",
-                file_path,
-            )
-            raise ex
-
-        store_boolean_arguments = [
-            action.dest for action in self._actions if isinstance(action, StoreBoolean)
-        ]
-
-        for key, value in config.items():
-            if isinstance(value, bool) and key not in store_boolean_arguments:
-                if value:
-                    processed_args.append("--" + key)
-            elif isinstance(value, list):
-                if value:
-                    processed_args.append("--" + key)
-                    for item in value:
-                        processed_args.append(str(item))
-            else:
-                processed_args.append("--" + key)
-                processed_args.append(str(value))
-
-        return processed_args
-
-
 class AtomicCounter:
     """An atomic, thread-safe counter"""
diff --git a/vllm/utils/argparse_utils.py b/vllm/utils/argparse_utils.py
new file mode 100644
index 000000000..0007c72f1
--- /dev/null
+++ b/vllm/utils/argparse_utils.py
@@ -0,0 +1,507 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Argument parsing utilities for vLLM."""
+
+import json
+import sys
+import textwrap
+from argparse import (
+    Action,
+    ArgumentDefaultsHelpFormatter,
+    ArgumentParser,
+    ArgumentTypeError,
+    RawDescriptionHelpFormatter,
+    _ArgumentGroup,
+)
+from collections import defaultdict
+from typing import TYPE_CHECKING, Any
+
+import regex as re
+import yaml
+
+from vllm.logger import init_logger
+
+if TYPE_CHECKING:
+    from argparse import Namespace
+else:
+    Namespace = object
+
+logger = init_logger(__name__)
+
+
+class StoreBoolean(Action):
+    def __call__(self, parser, namespace, values, option_string=None):
+        if values.lower() == "true":
+            setattr(namespace, self.dest, True)
+        elif values.lower() == "false":
+            setattr(namespace, self.dest, False)
+        else:
+            raise ValueError(
+                f"Invalid boolean value: {values}. Expected 'true' or 'false'."
+            )
+
+
+class SortedHelpFormatter(ArgumentDefaultsHelpFormatter, RawDescriptionHelpFormatter):
+    """SortedHelpFormatter that sorts arguments by their option strings."""
+
+    def _split_lines(self, text, width):
+        """
+        1. Sentences split across lines have their single newlines removed.
+        2. Paragraphs and explicit newlines are split into separate lines.
+        3. Each line is wrapped to the specified width (width of terminal).
+        """
+        # The patterns also include whitespace after the newline
+        single_newline = re.compile(r"(? to the front, e.g.:
+            # [Before]
+            # vllm serve -tp 2 --model --enforce-eager --port 8001
+            # [After]
+            # vllm serve -tp 2 --enforce-eager --port 8001
+            args = [
+                "serve",
+                model_tag,
+                *args[1:model_idx],
+                *args[rest_start_idx:],
+            ]
+        except StopIteration:
+            pass
+
+        if "--config" in args:
+            args = self._pull_args_from_config(args)
+
+        def repl(match: re.Match) -> str:
+            """Replaces underscores with dashes in the matched string."""
+            return match.group(0).replace("_", "-")
+
+        # Everything between the first -- and the first .
+        pattern = re.compile(r"(?<=--)[^\.]*")
+
+        # Convert underscores to dashes and vice versa in argument names
+        processed_args = list[str]()
+        for i, arg in enumerate(args):
+            if arg.startswith("--help="):
+                FlexibleArgumentParser._search_keyword = arg.split("=", 1)[-1].lower()
+                processed_args.append("--help")
+            elif arg.startswith("--"):
+                if "=" in arg:
+                    key, value = arg.split("=", 1)
+                    key = pattern.sub(repl, key, count=1)
+                    processed_args.append(f"{key}={value}")
+                else:
+                    key = pattern.sub(repl, arg, count=1)
+                    processed_args.append(key)
+            elif arg.startswith("-O") and arg != "-O" and arg[2] != ".":
+                # allow -O flag to be used without space, e.g. -O3 or -Odecode
+                # -O.<...> handled later
+                # also handle -O= here
+                mode = arg[3:] if arg[2] == "=" else arg[2:]
+                processed_args.append(f"-O.mode={mode}")
+            elif (
+                arg == "-O"
+                and i + 1 < len(args)
+                and args[i + 1] in {"0", "1", "2", "3"}
+            ):
+                # Convert -O to -O.mode
+                processed_args.append("-O.mode")
+            else:
+                processed_args.append(arg)
+
+        def create_nested_dict(keys: list[str], value: str) -> dict[str, Any]:
+            """Creates a nested dictionary from a list of keys and a value.
+
+            For example, `keys = ["a", "b", "c"]` and `value = 1` will create:
+            `{"a": {"b": {"c": 1}}}`
+            """
+            nested_dict: Any = value
+            for key in reversed(keys):
+                nested_dict = {key: nested_dict}
+            return nested_dict
+
+        def recursive_dict_update(
+            original: dict[str, Any],
+            update: dict[str, Any],
+        ) -> set[str]:
+            """Recursively updates a dictionary with another dictionary.
+            Returns a set of duplicate keys that were overwritten.
+            """
+            duplicates = set[str]()
+            for k, v in update.items():
+                if isinstance(v, dict) and isinstance(original.get(k), dict):
+                    nested_duplicates = recursive_dict_update(original[k], v)
+                    duplicates |= {f"{k}.{d}" for d in nested_duplicates}
+                elif isinstance(v, list) and isinstance(original.get(k), list):
+                    original[k] += v
+                else:
+                    if k in original:
+                        duplicates.add(k)
+                    original[k] = v
+            return duplicates
+
+        delete = set[int]()
+        dict_args = defaultdict[str, dict[str, Any]](dict)
+        duplicates = set[str]()
+        for i, processed_arg in enumerate(processed_args):
+            if i in delete:  # skip if value from previous arg
+                continue
+
+            if processed_arg.startswith("-") and "." in processed_arg:
+                if "=" in processed_arg:
+                    processed_arg, value_str = processed_arg.split("=", 1)
+                    if "." not in processed_arg:
+                        # False positive, '.' was only in the value
+                        continue
+                else:
+                    value_str = processed_args[i + 1]
+                    delete.add(i + 1)
+
+                if processed_arg.endswith("+"):
+                    processed_arg = processed_arg[:-1]
+                    value_str = json.dumps(list(value_str.split(",")))
+
+                key, *keys = processed_arg.split(".")
+                try:
+                    value = json.loads(value_str)
+                except json.decoder.JSONDecodeError:
+                    value = value_str
+
+                # Merge all values with the same key into a single dict
+                arg_dict = create_nested_dict(keys, value)
+                arg_duplicates = recursive_dict_update(dict_args[key], arg_dict)
+                duplicates |= {f"{key}.{d}" for d in arg_duplicates}
+                delete.add(i)
+        # Filter out the dict args we set to None
+        processed_args = [a for i, a in enumerate(processed_args) if i not in delete]
+        if duplicates:
+            logger.warning("Found duplicate keys %s", ", ".join(duplicates))
+
+        # Add the dict args back as if they were originally passed as JSON
+        for dict_arg, dict_value in dict_args.items():
+            processed_args.append(dict_arg)
+            processed_args.append(json.dumps(dict_value))
+
+        return super().parse_args(processed_args, namespace)
+
+    def check_port(self, value):
+        try:
+            value = int(value)
+        except ValueError:
+            msg = "Port must be an integer"
+            raise ArgumentTypeError(msg) from None
+
+        if not (1024 <= value <= 65535):
+            raise ArgumentTypeError("Port must be between 1024 and 65535")
+
+        return value
+
+    def _pull_args_from_config(self, args: list[str]) -> list[str]:
+        """Method to pull arguments specified in the config file
+        into the command-line args variable.
+
+        Arguments from the config file are spliced into the argument list.
+
+        example:
+        ```yaml
+        port: 12323
+        tensor-parallel-size: 4
+        ```
+        ```python
+        $: vllm {serve,chat,complete} "facebook/opt-12B" \
+            --config config.yaml -tp 2
+        $: args = [
+            "serve,chat,complete",
+            "facebook/opt-12B",
+            '--config', 'config.yaml',
+            '-tp', '2'
+        ]
+        $: args = [
+            "serve,chat,complete",
+            "facebook/opt-12B",
+            '--port', '12323',
+            '--tensor-parallel-size', '4',
+            '-tp', '2'
+        ]
+        ```
+
+        Please note how the config args are inserted after the sub command.
+        This way the order of priorities is maintained when these args are
+        parsed by super().
+        """
+        assert args.count("--config") <= 1, "More than one config file specified!"
+
+        index = args.index("--config")
+        if index == len(args) - 1:
+            raise ValueError(
+                "No config file specified! \
+                Please check your command-line arguments."
+            )
+
+        file_path = args[index + 1]
+
+        config_args = self.load_config_file(file_path)
+
+        # 0th index might be the sub command {serve,chat,complete,...}
+        # optionally followed by model_tag (only for serve)
+        # followed by config args
+        # followed by rest of cli args.
+        # maintaining this order will enforce the precedence
+        # of cli > config > defaults
+        if args[0].startswith("-"):
+            # No sub command (e.g., api_server entry point)
+            args = config_args + args[0:index] + args[index + 2 :]
+        elif args[0] == "serve":
+            model_in_cli = len(args) > 1 and not args[1].startswith("-")
+            model_in_config = any(arg == "--model" for arg in config_args)
+
+            if not model_in_cli and not model_in_config:
+                raise ValueError(
+                    "No model specified! Please specify model either "
+                    "as a positional argument or in a config file."
+                )
+
+            if model_in_cli:
+                # Model specified as positional arg, keep CLI version
+                args = (
+                    [args[0]]
+                    + [args[1]]
+                    + config_args
+                    + args[2:index]
+                    + args[index + 2 :]
+                )
+            else:
+                # No model in CLI, use config if available
+                args = [args[0]] + config_args + args[1:index] + args[index + 2 :]
+        else:
+            args = [args[0]] + config_args + args[1:index] + args[index + 2 :]
+
+        return args
+
+    def load_config_file(self, file_path: str) -> list[str]:
+        """Loads a yaml file and returns the key value pairs as a
+        flattened list with an argparse-like pattern
+        ```yaml
+        port: 12323
+        tensor-parallel-size: 4
+        ```
+        returns:
+            processed_args: list[str] = [
+                '--port', '12323',
+                '--tensor-parallel-size', '4'
+            ]
+        """
+        extension: str = file_path.split(".")[-1]
+        if extension not in ("yaml", "yml"):
+            raise ValueError(
+                f"Config file must be of a yaml/yml type. {extension} supplied"
+            )
+
+        # only expecting a flat dictionary of atomic types
+        processed_args: list[str] = []
+
+        config: dict[str, int | str] = {}
+        try:
+            with open(file_path) as config_file:
+                config = yaml.safe_load(config_file)
+        except Exception as ex:
+            logger.error(
+                "Unable to read the config file at %s. Check path correctness",
+                file_path,
+            )
+            raise ex
+
+        store_boolean_arguments = [
+            action.dest for action in self._actions if isinstance(action, StoreBoolean)
+        ]
+
+        for key, value in config.items():
+            if isinstance(value, bool) and key not in store_boolean_arguments:
+                if value:
+                    processed_args.append("--" + key)
+            elif isinstance(value, list):
+                if value:
+                    processed_args.append("--" + key)
+                    for item in value:
+                        processed_args.append(str(item))
+            else:
+                processed_args.append("--" + key)
+                processed_args.append(str(value))
+
+        return processed_args
diff --git a/vllm/utils/deep_gemm.py b/vllm/utils/deep_gemm.py
index 2e8cd302b..a928cce09 100644
--- a/vllm/utils/deep_gemm.py
+++ b/vllm/utils/deep_gemm.py
@@ -16,8 +16,8 @@ import torch
 import vllm.envs as envs
 from vllm.logger import logger
 from vllm.platforms import current_platform
-from vllm.utils import cdiv
 from vllm.utils.import_utils import has_deep_gemm
+from vllm.utils.math_utils import cdiv
 @functools.cache
diff --git a/vllm/utils/math_utils.py b/vllm/utils/math_utils.py
new file mode 100644
index 000000000..bdfa5fd2c
--- /dev/null
+++ b/vllm/utils/math_utils.py
@@ -0,0 +1,32 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Math utility functions for vLLM."""
+
+
+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+def next_power_of_2(n: int) -> int:
+    """The next power of 2 (inclusive)"""
+    if n < 1:
+        return 1
+    return 1 << (n - 1).bit_length()
+
+
+def prev_power_of_2(n: int) -> int:
+    """The previous power of 2 (inclusive)"""
+    if n <= 0:
+        return 0
+    return 1 << (n.bit_length() - 1)
+
+
+def round_up(x: int, y: int) -> int:
+    """Round up x to the nearest multiple of y."""
+    return ((x + y - 1) // y) * y
+
+
+def round_down(x: int, y: int) -> int:
+    """Round down x to the nearest multiple of y."""
+    return (x // y) * y
diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py
index 720fbd2c1..1eac94940 100755
--- a/vllm/v1/attention/backends/flash_attn.py
+++ b/vllm/v1/attention/backends/flash_attn.py
@@ -37,7 +37,7 @@ from vllm.logger import init_logger
 from vllm.model_executor.layers.batch_invariant import (
     vllm_is_batch_invariant,
 )
-from vllm.utils import cdiv
+from vllm.utils.math_utils import cdiv
 from vllm.v1.attention.backends.utils import (
     AttentionCGSupport,
     AttentionMetadataBuilder,
diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py
index 029293d2f..e71d4ca46 100755
--- a/vllm/v1/attention/backends/flashinfer.py
+++ b/vllm/v1/attention/backends/flashinfer.py
@@ -34,12 +34,12 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
 )
 from vllm.platforms import current_platform
 from vllm.triton_utils import tl, triton
-from vllm.utils import cdiv
 from vllm.utils.flashinfer import (
     can_use_trtllm_attention,
     flashinfer_disable_q_quantization,
     use_trtllm_attention,
 )
+from vllm.utils.math_utils import cdiv
 from vllm.utils.platform_utils import is_pin_memory_available
 from vllm.v1.attention.backends.utils import (
     AttentionCGSupport,
diff --git a/vllm/v1/attention/backends/flex_attention.py b/vllm/v1/attention/backends/flex_attention.py
index ffea14ec6..e12cc581d 100644
--- a/vllm/v1/attention/backends/flex_attention.py
+++ b/vllm/v1/attention/backends/flex_attention.py
@@ -28,7 +28,7 @@ from vllm.logger import init_logger
 from vllm.model_executor.layers.batch_invariant import (
     vllm_is_batch_invariant,
 )
-from vllm.utils import cdiv
+from vllm.utils.math_utils import cdiv
 from vllm.utils.torch_utils import is_torch_equal_or_newer
 from vllm.v1.attention.backends.utils import (
     AttentionMetadataBuilder,
diff --git a/vllm/v1/attention/backends/mamba2_attn.py b/vllm/v1/attention/backends/mamba2_attn.py
index 7ca8501a8..f9d2426ea 100644
--- a/vllm/v1/attention/backends/mamba2_attn.py
+++ b/vllm/v1/attention/backends/mamba2_attn.py
@@ -7,7 +7,7 @@ import torch
 from vllm.attention.backends.abstract import AttentionBackend
 from vllm.config import VllmConfig
-from vllm.utils import cdiv
+from vllm.utils.math_utils import cdiv
 from vllm.v1.attention.backends.mamba_attn import BaseMambaAttentionMetadataBuilder
 from vllm.v1.attention.backends.utils import (
     PAD_SLOT_ID,
diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py
index b920fd929..0ec157300 100755
--- a/vllm/v1/attention/backends/mla/common.py
+++ b/vllm/v1/attention/backends/mla/common.py
@@ -220,8 +220,8 @@ from vllm.model_executor.layers.linear import (
     UnquantizedLinearMethod,
 )
 from vllm.platforms import current_platform
-from vllm.utils import cdiv, round_down
 from vllm.utils.flashinfer import has_nvidia_artifactory
+from vllm.utils.math_utils import cdiv, round_down
 from vllm.v1.attention.backends.utils import (
     AttentionMetadataBuilder,
     CommonAttentionMetadata,
diff --git a/vllm/v1/attention/backends/mla/flashmla_sparse.py b/vllm/v1/attention/backends/mla/flashmla_sparse.py
index 141436e66..bf8e4d5a6 100644
--- a/vllm/v1/attention/backends/mla/flashmla_sparse.py
+++ b/vllm/v1/attention/backends/mla/flashmla_sparse.py
@@ -22,7 +22,7 @@ from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
 from vllm.triton_utils import tl, triton
-from vllm.utils import cdiv
+from vllm.utils.math_utils import cdiv
 from vllm.v1.attention.backends.mla.common import MLACommonBaseImpl
 from vllm.v1.attention.backends.utils import (
     AttentionCGSupport,
diff --git a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py
index d935c0224..962cad927 100644
--- a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py
+++ b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py
@@ -10,7 +10,7 @@ import vllm.envs as envs
 from vllm.attention.backends.abstract import AttentionLayer
 from vllm.attention.ops.rocm_aiter_mla import aiter_mla_decode_fwd
 from vllm.config import VllmConfig
-from vllm.utils import cdiv
+from vllm.utils.math_utils import cdiv
 from vllm.v1.attention.backends.mla.common import (
     MLACommonBackend,
     MLACommonDecodeMetadata,
diff --git a/vllm/v1/attention/backends/pallas.py b/vllm/v1/attention/backends/pallas.py
index 28085cb14..40a551787 100644
--- a/vllm/v1/attention/backends/pallas.py
+++ b/vllm/v1/attention/backends/pallas.py
@@ -13,7 +13,7 @@ from vllm.attention.backends.abstract import (
 )
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
-from vllm.utils import cdiv, next_power_of_2
+from vllm.utils.math_utils import cdiv, next_power_of_2
 logger = init_logger(__name__)
diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py
index cb5855548..a0d354df0 100644
--- a/vllm/v1/attention/backends/utils.py
+++ b/vllm/v1/attention/backends/utils.py
@@ -21,7 +21,7 @@ import torch
 from typing_extensions import runtime_checkable
 from vllm.config import VllmConfig, get_layers_from_vllm_config
-from vllm.utils import cdiv
+from vllm.utils.math_utils import cdiv
 if TYPE_CHECKING:
     from vllm.attention.backends.abstract import AttentionImpl
diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py
index 584904dae..6e026215d 100644
--- a/vllm/v1/core/kv_cache_utils.py
+++ b/vllm/v1/core/kv_cache_utils.py
@@ -12,8 +12,8 @@ from typing import Any, NewType, TypeAlias
 from vllm import envs
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
-from vllm.utils import cdiv
 from vllm.utils.hashing import sha256_cbor
+from vllm.utils.math_utils import cdiv
 from vllm.utils.mem_constants import GiB_bytes
 from vllm.v1.kv_cache_interface import (
     ChunkedLocalAttentionSpec,
diff --git a/vllm/v1/core/single_type_kv_cache_manager.py b/vllm/v1/core/single_type_kv_cache_manager.py
index 586034182..6699fb981 100644
--- a/vllm/v1/core/single_type_kv_cache_manager.py
+++ b/vllm/v1/core/single_type_kv_cache_manager.py
@@ -5,7 +5,7 @@ from abc import ABC, abstractmethod
 from collections import defaultdict
 from collections.abc import Sequence
-from vllm.utils import cdiv
+from vllm.utils.math_utils import cdiv
 from vllm.v1.core.block_pool import BlockPool
 from vllm.v1.core.kv_cache_utils import BlockHash, KVCacheBlock
 from vllm.v1.kv_cache_interface import (
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index 62faf590b..fd0a9b395 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -29,10 +29,11 @@ from vllm.tracing import init_tracer
 from vllm.transformers_utils.config import maybe_register_config_serialize_by_value
 from vllm.transformers_utils.tokenizer import AnyTokenizer, init_tokenizer_from_configs
 from vllm.usage.usage_lib import UsageContext
-from vllm.utils import Device, cdiv
+from vllm.utils import Device
 from vllm.utils.async_utils import cancel_task_threadsafe
 from vllm.utils.collection_utils import as_list
 from vllm.utils.func_utils import deprecate_kwargs
+from vllm.utils.math_utils import cdiv
 from vllm.v1.engine import EngineCoreRequest
 from vllm.v1.engine.core_client import EngineCoreClient
 from vllm.v1.engine.exceptions import EngineDeadError, EngineGenerateError
diff --git a/vllm/v1/kv_cache_interface.py b/vllm/v1/kv_cache_interface.py
index 392519f8f..0f564fdb3 100644
--- a/vllm/v1/kv_cache_interface.py
+++ b/vllm/v1/kv_cache_interface.py
@@ -10,7 +10,7 @@ from typing_extensions import Self
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
-from vllm.utils import cdiv
+from vllm.utils.math_utils import cdiv
 from vllm.utils.torch_utils import get_dtype_size
 logger = init_logger(__name__)
diff --git a/vllm/v1/worker/block_table.py b/vllm/v1/worker/block_table.py
index 9bf06d516..e041015e5 100644
--- a/vllm/v1/worker/block_table.py
+++ b/vllm/v1/worker/block_table.py
@@ -6,7 +6,7 @@ import torch
 from vllm.distributed import get_dcp_group
 from vllm.logger import init_logger
-from vllm.utils import cdiv
+from vllm.utils.math_utils import cdiv
 from vllm.v1.utils import CpuGpuBuffer
 logger = init_logger(__name__)
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 31429fe69..6759fe630 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -70,12 +70,11 @@ from vllm.sampling_params import SamplingType
 from vllm.sequence import IntermediateTensors
 from vllm.tasks import GenerationTask, PoolingTask, SupportedTask
 from vllm.utils import (
-    cdiv,
     check_use_alibi,
     length_from_prompt_token_ids_or_embeds,
-    round_up,
 )
 from vllm.utils.jsontree import json_map_leaves
+from vllm.utils.math_utils import cdiv, round_up
 from vllm.utils.mem_constants import GiB_bytes
 from vllm.utils.mem_utils import DeviceMemoryProfiler
 from vllm.utils.platform_utils import is_pin_memory_available
diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py
index 18b857a64..ce769e857 100644
--- a/vllm/v1/worker/tpu_model_runner.py
+++ b/vllm/v1/worker/tpu_model_runner.py
@@ -53,7 +53,8 @@ from vllm.multimodal.inputs import (
 from vllm.multimodal.utils import group_mm_kwargs_by_modality
 from vllm.sequence import IntermediateTensors
 from vllm.tasks import GenerationTask, PoolingTask, SupportedTask
-from vllm.utils import LayerBlockType, cdiv, prev_power_of_2
+from vllm.utils import LayerBlockType
+from vllm.utils.math_utils import cdiv, prev_power_of_2
 from vllm.utils.platform_utils import is_pin_memory_available
 from vllm.v1.attention.backends.pallas import (
     TPU_STR_DTYPE_TO_TORCH_DTYPE,
diff --git a/vllm/v1/worker/tpu_worker.py b/vllm/v1/worker/tpu_worker.py
index fae1f8e37..f1885f9b3 100644
--- a/vllm/v1/worker/tpu_worker.py
+++ b/vllm/v1/worker/tpu_worker.py
@@ -25,7 +25,7 @@ from vllm.model_executor import set_random_seed
 from vllm.platforms import current_platform
 from vllm.platforms.tpu import USE_TPU_INFERENCE
 from vllm.tasks import SupportedTask
-from vllm.utils import cdiv
+from vllm.utils.math_utils import cdiv
 from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
 from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.kv_cache_interface import AttentionSpec, KVCacheConfig, KVCacheSpec
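
The `__all__` re-export block added to `vllm/utils/__init__.py` above exists purely for backward compatibility, so the legacy flat path and the new specialized modules resolve to the same objects. A minimal sanity check (a sketch; assumes a vLLM checkout that includes this refactor):

```python
# Both import paths should yield the very same function object,
# since vllm.utils merely re-exports from vllm.utils.math_utils.
from vllm.utils import cdiv as legacy_cdiv
from vllm.utils.math_utils import cdiv as new_cdiv

assert legacy_cdiv is new_cdiv  # a re-export, not a copy
```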
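The semantics of the helpers now housed in `vllm/utils/math_utils.py` follow directly from the definitions added above; a few doctest-style checks against the same checkout:

```python
from vllm.utils.math_utils import (
    cdiv,
    next_power_of_2,
    prev_power_of_2,
    round_down,
    round_up,
)

assert cdiv(10, 3) == 4           # ceiling division: -(10 // -3)
assert cdiv(9, 3) == 3            # exact division is unaffected
assert round_up(10, 4) == 12      # smallest multiple of 4 >= 10
assert round_down(10, 4) == 8     # largest multiple of 4 <= 10
assert next_power_of_2(17) == 32  # "inclusive": next_power_of_2(16) == 16
assert prev_power_of_2(17) == 16  # "inclusive": prev_power_of_2(16) == 16
assert next_power_of_2(0) == 1    # n < 1 clamps to 1
assert prev_power_of_2(0) == 0    # n <= 0 clamps to 0
```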
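`FlexibleArgumentParser.parse_args`, moved verbatim into `argparse_utils.py`, folds dotted flags into a single JSON value per top-level option via `create_nested_dict` and `recursive_dict_update`, normalizing underscores to dashes along the way. A sketch of the observable behavior; the `--demo-config` flag is illustrative, not a real vLLM option:

```python
import json

from vllm.utils.argparse_utils import FlexibleArgumentParser

parser = FlexibleArgumentParser()
# A standalone flag that accepts a JSON object, mimicking how vLLM
# declares its dict-valued engine options.
parser.add_argument("--demo-config", type=json.loads, default={})

# --demo_config.level=3 --demo-config.debug=true is rewritten to
# --demo-config '{"level": 3, "debug": true}' before argparse sees it;
# the underscore variant is normalized to dashes as well.
ns = parser.parse_args(["--demo_config.level=3", "--demo-config.debug=true"])
assert ns.demo_config == {"level": 3, "debug": True}
```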
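Similarly, `load_config_file` flattens a YAML mapping into argparse-style tokens, and `_pull_args_from_config` splices them in right after the sub-command so explicit CLI flags still take precedence. A small sketch of the flattening step alone (the temporary file and keys are illustrative):

```python
import tempfile

from vllm.utils.argparse_utils import FlexibleArgumentParser

with tempfile.NamedTemporaryFile("w", suffix=".yaml", delete=False) as f:
    f.write("port: 12323\ntensor-parallel-size: 4\ntrust-remote-code: true\n")
    config_path = f.name

tokens = FlexibleArgumentParser().load_config_file(config_path)
# Scalars become "--key value" pairs; a bare boolean true becomes a lone
# flag unless the option was registered with the StoreBoolean action.
assert tokens == [
    "--port", "12323",
    "--tensor-parallel-size", "4",
    "--trust-remote-code",
]
```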