diff --git a/tools/pre_commit/mypy.py b/tools/pre_commit/mypy.py
index 12f6aa327..aa158b4a6 100755
--- a/tools/pre_commit/mypy.py
+++ b/tools/pre_commit/mypy.py
@@ -36,7 +36,6 @@ SEPARATE_GROUPS = [
 # TODO(woosuk): Include the code from Megatron and HuggingFace.
 EXCLUDE = [
-    "vllm/engine/arg_utils.py",
     "vllm/model_executor/parallel_utils",
     "vllm/model_executor/models",
     "vllm/model_executor/layers/fla/ops",
@@ -49,9 +48,6 @@ EXCLUDE = [
     "vllm/profiler",
     "vllm/reasoning",
     "vllm/tool_parser",
-    "vllm/v1/cudagraph_dispatcher.py",
-    "vllm/outputs.py",
-    "vllm/logger.py",
 ]
diff --git a/vllm/config/cache.py b/vllm/config/cache.py
index 149b0b9b7..daceaa6c2 100644
--- a/vllm/config/cache.py
+++ b/vllm/config/cache.py
@@ -39,7 +39,7 @@ KVOffloadingBackend = Literal["native", "lmcache"]
 class CacheConfig:
     """Configuration for the KV cache."""
 
-    block_size: SkipValidation[BlockSize] = None  # type: ignore
+    block_size: SkipValidation[BlockSize] = None  # type: ignore[assignment]
     """Size of a contiguous cache block in number of tokens. On CUDA devices,
     only block sizes up to 32 are supported.
diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py
index 131db50f1..cc2cfa97b 100644
--- a/vllm/config/parallel.py
+++ b/vllm/config/parallel.py
@@ -182,7 +182,7 @@ class ParallelConfig:
     threshold, microbatching will be used. Otherwise, the request will be
     processed in a single batch."""
 
-    disable_nccl_for_dp_synchronization: bool = Field(default=None)
+    disable_nccl_for_dp_synchronization: bool | None = Field(default=None)
     """Forces the dp synchronization logic in vllm/v1/worker/dp_utils.py
     to use Gloo instead of NCCL for its all reduce.
diff --git a/vllm/config/scheduler.py b/vllm/config/scheduler.py
index 5e44eb84f..fb162bd50 100644
--- a/vllm/config/scheduler.py
+++ b/vllm/config/scheduler.py
@@ -115,7 +115,7 @@ class SchedulerConfig:
     # scheduler class or path. "vllm.v1.core.sched.scheduler.Scheduler"
     # (default) or "mod.custom_class".
-    scheduler_cls: str | type[object] = Field(default=None)
+    scheduler_cls: str | type[object] | None = Field(default=None)
     """The scheduler class to use. "vllm.v1.core.sched.scheduler.Scheduler" is
     the default scheduler. Can be a class directly or the path to a class of
     form "mod.custom_class"."""
@@ -128,7 +128,7 @@ class SchedulerConfig:
     and starting configuration.
     """
 
-    async_scheduling: bool = Field(default=None)
+    async_scheduling: bool | None = Field(default=None)
     """If set to False, disable async scheduling. Async scheduling helps to
     avoid gaps in GPU utilization, leading to better latency and throughput.
     """
diff --git a/vllm/config/utils.py b/vllm/config/utils.py
index dff9b2c5a..d17637338 100644
--- a/vllm/config/utils.py
+++ b/vllm/config/utils.py
@@ -10,7 +10,7 @@ import json
 import pathlib
 import textwrap
 from collections.abc import Callable, Mapping, Sequence, Set
-from dataclasses import MISSING, Field, field, fields, is_dataclass
+from dataclasses import MISSING, field, fields, is_dataclass
 from itertools import pairwise
 from typing import TYPE_CHECKING, Any, Protocol, TypeVar, cast
@@ -66,7 +66,7 @@ def config(
     return decorator(cls)
 
 
-def get_field(cls: ConfigType, name: str) -> Field:
+def get_field(cls: ConfigType, name: str) -> Any:
     """Get the default factory field of a dataclass by name.
     Used for getting default factory fields in `EngineArgs`."""
     if not is_dataclass(cls):
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index feb9d1bc8..8ea96de49 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -67,6 +67,7 @@ from vllm.config.cache import (
     PrefixCachingHashAlgo,
 )
 from vllm.config.device import Device
+from vllm.config.lora import MaxLoRARanks
 from vllm.config.model import (
     ConvertOption,
     HfOverrides,
@@ -77,7 +78,12 @@ from vllm.config.model import (
 )
 from vllm.config.multimodal import MMCacheType, MMEncoderTPMode
 from vllm.config.observability import DetailedTraceModules
-from vllm.config.parallel import DistributedExecutorBackend, ExpertPlacementStrategy
+from vllm.config.parallel import (
+    All2AllBackend,
+    DataParallelBackend,
+    DistributedExecutorBackend,
+    ExpertPlacementStrategy,
+)
 from vllm.config.scheduler import SchedulerPolicy
 from vllm.config.utils import get_field
 from vllm.config.vllm import OptimizationLevel
@@ -257,7 +263,7 @@ def _compute_kwargs(cls: ConfigType) -> dict[str, dict[str, Any]]:
             # VllmConfig's Fields have default_factory set to config classes.
             # These could emit logs on init, which would be confusing.
             with suppress_logging():
-                default = default.default_factory()
+                default = default.default_factory()  # type: ignore[call-arg]
         elif field.default_factory is not MISSING:
             default = field.default_factory()
@@ -373,7 +379,7 @@ class EngineArgs:
     dtype: ModelDType = ModelConfig.dtype
     kv_cache_dtype: CacheDType = CacheConfig.cache_dtype
     seed: int = ModelConfig.seed
-    max_model_len: int | None = ModelConfig.max_model_len
+    max_model_len: int = ModelConfig.max_model_len
     cudagraph_capture_sizes: list[int] | None = (
         CompilationConfig.cudagraph_capture_sizes
     )
@@ -405,9 +411,9 @@ class EngineArgs:
     data_parallel_rpc_port: int | None = None
     data_parallel_hybrid_lb: bool = False
     data_parallel_external_lb: bool = False
-    data_parallel_backend: str = ParallelConfig.data_parallel_backend
+    data_parallel_backend: DataParallelBackend = ParallelConfig.data_parallel_backend
     enable_expert_parallel: bool = ParallelConfig.enable_expert_parallel
-    all2all_backend: str = ParallelConfig.all2all_backend
+    all2all_backend: All2AllBackend = ParallelConfig.all2all_backend
     enable_dbo: bool = ParallelConfig.enable_dbo
     ubatch_size: int = ParallelConfig.ubatch_size
     dbo_decode_token_threshold: int = ParallelConfig.dbo_decode_token_threshold
@@ -425,7 +431,7 @@ class EngineArgs:
     max_parallel_loading_workers: int | None = (
         ParallelConfig.max_parallel_loading_workers
     )
-    block_size: BlockSize | None = CacheConfig.block_size
+    block_size: BlockSize = CacheConfig.block_size
     enable_prefix_caching: bool | None = None
     prefix_caching_hash_algo: PrefixCachingHashAlgo = (
         CacheConfig.prefix_caching_hash_algo
     )
@@ -451,7 +457,7 @@ class EngineArgs:
     hf_token: bool | str | None = ModelConfig.hf_token
     hf_overrides: HfOverrides = get_field(ModelConfig, "hf_overrides")
     tokenizer_revision: str | None = ModelConfig.tokenizer_revision
-    quantization: QuantizationMethods | None = ModelConfig.quantization
+    quantization: QuantizationMethods | str | None = ModelConfig.quantization
     allow_deprecated_quantization: bool = ModelConfig.allow_deprecated_quantization
     enforce_eager: bool = ModelConfig.enforce_eager
     disable_custom_all_reduce: bool = ParallelConfig.disable_custom_all_reduce
@@ -479,11 +485,11 @@ class EngineArgs:
     )
     io_processor_plugin: str | None = None
     skip_mm_profiling: bool = MultiModalConfig.skip_mm_profiling
-    video_pruning_rate: float = MultiModalConfig.video_pruning_rate
+    video_pruning_rate: float | None = MultiModalConfig.video_pruning_rate
     # LoRA fields
     enable_lora: bool = False
     max_loras: int = LoRAConfig.max_loras
-    max_lora_rank: int = LoRAConfig.max_lora_rank
+    max_lora_rank: MaxLoRARanks = LoRAConfig.max_lora_rank
     default_mm_loras: dict[str, str] | None = LoRAConfig.default_mm_loras
     fully_sharded_loras: bool = LoRAConfig.fully_sharded_loras
     max_cpu_loras: int | None = LoRAConfig.max_cpu_loras
@@ -557,7 +563,7 @@ class EngineArgs:
         ModelConfig, "override_generation_config"
     )
     model_impl: str = ModelConfig.model_impl
-    override_attention_dtype: str = ModelConfig.override_attention_dtype
+    override_attention_dtype: str | None = ModelConfig.override_attention_dtype
     attention_backend: AttentionBackendEnum | None = AttentionConfig.backend
     calculate_kv_scales: bool = CacheConfig.calculate_kv_scales
@@ -569,7 +575,7 @@ class EngineArgs:
     additional_config: dict[str, Any] = get_field(VllmConfig, "additional_config")
     use_tqdm_on_load: bool = LoadConfig.use_tqdm_on_load
-    pt_load_map_location: str = LoadConfig.pt_load_map_location
+    pt_load_map_location: str | dict[str, str] = LoadConfig.pt_load_map_location
 
     logits_processors: list[str | type[LogitsProcessor]] | None = (
         ModelConfig.logits_processors
     )
@@ -1280,7 +1286,7 @@ class EngineArgs:
             hf_config_path=self.hf_config_path,
             runner=self.runner,
             convert=self.convert,
-            tokenizer=self.tokenizer,
+            tokenizer=self.tokenizer,  # type: ignore[arg-type]
             tokenizer_mode=self.tokenizer_mode,
             trust_remote_code=self.trust_remote_code,
             allowed_local_media_path=self.allowed_local_media_path,
@@ -1445,12 +1451,16 @@ class EngineArgs:
             self.kv_cache_dtype, model_config
         )
 
+        assert self.enable_prefix_caching is not None, (
+            "enable_prefix_caching must be set by this point"
+        )
+
         cache_config = CacheConfig(
             block_size=self.block_size,
             gpu_memory_utilization=self.gpu_memory_utilization,
             kv_cache_memory_bytes=self.kv_cache_memory_bytes,
             swap_space=self.swap_space,
-            cache_dtype=resolved_cache_dtype,
+            cache_dtype=resolved_cache_dtype,  # type: ignore[arg-type]
             is_attention_free=model_config.is_attention_free,
             num_gpu_blocks_override=self.num_gpu_blocks_override,
             sliding_window=sliding_window,
@@ -1676,6 +1686,16 @@ class EngineArgs:
             target_parallel_config=parallel_config,
         )
 
+        assert self.max_num_batched_tokens is not None, (
+            "max_num_batched_tokens must be set by this point"
+        )
+        assert self.max_num_seqs is not None, "max_num_seqs must be set by this point"
+        assert self.enable_chunked_prefill is not None, (
+            "enable_chunked_prefill must be set by this point"
+        )
+        assert model_config.max_model_len is not None, (
+            "max_model_len must be set by this point"
+        )
         scheduler_config = SchedulerConfig(
             runner_type=model_config.runner_type,
             max_num_batched_tokens=self.max_num_batched_tokens,
@@ -2043,6 +2063,9 @@ class EngineArgs:
         )
 
         if orig_max_num_batched_tokens is None:
+            assert model_config.max_model_len is not None, (
+                "max_model_len must be set by this point"
+            )
             if not self.enable_chunked_prefill:
                 # If max_model_len is too short, use the default for higher throughput.
                 self.max_num_batched_tokens = max(
diff --git a/vllm/logger.py b/vllm/logger.py
index 2ec20003b..e8aecead3 100644
--- a/vllm/logger.py
+++ b/vllm/logger.py
@@ -38,7 +38,7 @@ def _use_color() -> bool:
     return False
 
 
-DEFAULT_LOGGING_CONFIG = {
+DEFAULT_LOGGING_CONFIG: dict[str, dict[str, Any] | Any] = {
     "formatters": {
         "vllm": {
             "class": "vllm.logging_utils.NewLineFormatter",
@@ -157,7 +157,7 @@ _METHODS_TO_PATCH = {
 def _configure_vllm_root_logger() -> None:
-    logging_config = dict[str, dict[str, Any] | Any]()
+    logging_config: dict[str, dict[str, Any] | Any] = {}
 
     if not envs.VLLM_CONFIGURE_LOGGING and envs.VLLM_LOGGING_CONFIG_PATH:
         raise RuntimeError(
@@ -225,7 +225,8 @@ def suppress_logging(level: int = logging.INFO) -> Generator[None, Any, None]:
         logging.disable(current_level)
 
 
-def current_formatter_type(lgr: Logger) -> Literal["color", "newline", None]:
+def current_formatter_type(logger: Logger) -> Literal["color", "newline", None]:
+    lgr: Logger | None = logger
     while lgr is not None:
         if lgr.handlers and len(lgr.handlers) == 1 and lgr.handlers[0].name == "vllm":
             formatter = lgr.handlers[0].formatter
diff --git a/vllm/outputs.py b/vllm/outputs.py
index 5bd460aad..48f8e9dc0 100644
--- a/vllm/outputs.py
+++ b/vllm/outputs.py
@@ -162,7 +162,7 @@ class RequestOutput:
                 completion.token_ids.extend(next_completion.token_ids)
                 if next_completion.logprobs:
                     assert completion.logprobs is not None
-                    completion.logprobs.extend(next_completion.logprobs)
+                    completion.logprobs.extend(next_completion.logprobs)  # type: ignore[arg-type]
                 completion.cumulative_logprob = (
                     next_completion.cumulative_logprob
                 )
diff --git a/vllm/v1/cudagraph_dispatcher.py b/vllm/v1/cudagraph_dispatcher.py
index 6817c571b..26ca82b8f 100644
--- a/vllm/v1/cudagraph_dispatcher.py
+++ b/vllm/v1/cudagraph_dispatcher.py
@@ -71,6 +71,9 @@ class CudagraphDispatcher:
         """Pre-compute the mapping from batch size to padded graph size."""
         max_size = self.compilation_config.max_cudagraph_capture_size
         capture_sizes = self.compilation_config.cudagraph_capture_sizes
+        assert capture_sizes is not None, (
+            "Cudagraph capture sizes must be set when cudagraphs are enabled."
+        )
         self._bs_to_padded_graph_size: list[int] = [0] * (max_size + 1)
         for end, start in zip(
             capture_sizes + [max_size + 1],
@@ -89,6 +92,7 @@ class CudagraphDispatcher:
             and self.cudagraph_mode != CUDAGraphMode.NONE
         ):
             for size in self.compilation_config.compile_sizes:
+                size = int(size)
                 if size <= self.compilation_config.max_cudagraph_capture_size:
                     padded = self._bs_to_padded_graph_size[size]
                     if padded != size:
@@ -178,6 +182,9 @@ class CudagraphDispatcher:
         # guarantee all keys would be used. For example, if we allow lazy
         # capturing in future PR, some keys may never be triggered.
        if cudagraph_mode.mixed_mode() != CUDAGraphMode.NONE:
+            assert self.compilation_config.cudagraph_capture_sizes is not None, (
+                "Cudagraph capture sizes must be set when mixed mode is enabled."
+            )
             for bs, num_active_loras in product(
                 self.compilation_config.cudagraph_capture_sizes, lora_cases
             ):
@@ -200,6 +207,9 @@ class CudagraphDispatcher:
                 uniform_decode_query_len
                 * self.vllm_config.scheduler_config.max_num_seqs
             )
+            assert self.compilation_config.cudagraph_capture_sizes is not None, (
+                "Cudagraph capture sizes must be set when full mode is enabled."
+            )
             cudagraph_capture_sizes_for_decode = [
                 x
                 for x in self.compilation_config.cudagraph_capture_sizes
@@ -262,6 +272,9 @@ class CudagraphDispatcher:
         else:
             # When not specializing, graphs are captured only with max_loras + 1,
             # so we must use max_loras + 1 for dispatch to find a matching graph.
+            assert self.vllm_config.lora_config is not None, (
+                "LoRA config must be set when has_lora is True."
+            )
             effective_num_active_loras = self.vllm_config.lora_config.max_loras + 1
 
         batch_desc = self._create_padded_batch_descriptor(