[CI] Enable mypy coverage for individual excluded files (#34292)

Signed-off-by: Lucas Kabela <lucaskabela@meta.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-02-16 07:34:29 -08:00
parent 6930becd45
commit a3205beffb
9 changed files with 60 additions and 27 deletions
--- a/tools/pre_commit/mypy.py
+++ b/tools/pre_commit/mypy.py
@@ -36,7 +36,6 @@ SEPARATE_GROUPS = [

 # TODO(woosuk): Include the code from Megatron and HuggingFace.
 EXCLUDE = [
-    "vllm/engine/arg_utils.py",
    "vllm/model_executor/parallel_utils",
    "vllm/model_executor/models",
    "vllm/model_executor/layers/fla/ops",
@@ -49,9 +48,6 @@ EXCLUDE = [
    "vllm/profiler",
    "vllm/reasoning",
    "vllm/tool_parser",
-    "vllm/v1/cudagraph_dispatcher.py",
-    "vllm/outputs.py",
-    "vllm/logger.py",
 ]


--- a/vllm/config/cache.py
+++ b/vllm/config/cache.py
@@ -39,7 +39,7 @@ KVOffloadingBackend = Literal["native", "lmcache"]
 class CacheConfig:
    """Configuration for the KV cache."""

-    block_size: SkipValidation[BlockSize] = None  # type: ignore
+    block_size: SkipValidation[BlockSize] = None  # type: ignore[assignment]
    """Size of a contiguous cache block in number of tokens. On CUDA devices,
    only block sizes up to 32 are supported.

--- a/vllm/config/parallel.py
+++ b/vllm/config/parallel.py
@@ -182,7 +182,7 @@ class ParallelConfig:
    threshold, microbatching will be used. Otherwise, the request will be
    processed in a single batch."""

-    disable_nccl_for_dp_synchronization: bool = Field(default=None)
+    disable_nccl_for_dp_synchronization: bool | None = Field(default=None)
    """Forces the dp synchronization logic in vllm/v1/worker/dp_utils.py 
    to use Gloo instead of NCCL for its all reduce.

--- a/vllm/config/scheduler.py
+++ b/vllm/config/scheduler.py
@@ -115,7 +115,7 @@ class SchedulerConfig:

    # scheduler class or path. "vllm.v1.core.sched.scheduler.Scheduler"
    # (default) or "mod.custom_class".
-    scheduler_cls: str | type[object] = Field(default=None)
+    scheduler_cls: str | type[object] | None = Field(default=None)
    """The scheduler class to use. "vllm.v1.core.sched.scheduler.Scheduler" is
    the default scheduler. Can be a class directly or the path to a class of
    form "mod.custom_class"."""
@@ -128,7 +128,7 @@ class SchedulerConfig:
    and starting configuration.
    """

-    async_scheduling: bool = Field(default=None)
+    async_scheduling: bool | None = Field(default=None)
    """If set to False, disable async scheduling. Async scheduling helps to
    avoid gaps in GPU utilization, leading to better latency and throughput.
    """
--- a/vllm/config/utils.py
+++ b/vllm/config/utils.py
@@ -10,7 +10,7 @@ import json
 import pathlib
 import textwrap
 from collections.abc import Callable, Mapping, Sequence, Set
-from dataclasses import MISSING, Field, field, fields, is_dataclass
+from dataclasses import MISSING, field, fields, is_dataclass
 from itertools import pairwise
 from typing import TYPE_CHECKING, Any, Protocol, TypeVar, cast

@@ -66,7 +66,7 @@ def config(
    return decorator(cls)


-def get_field(cls: ConfigType, name: str) -> Field:
+def get_field(cls: ConfigType, name: str) -> Any:
    """Get the default factory field of a dataclass by name. Used for getting
    default factory fields in `EngineArgs`."""
    if not is_dataclass(cls):
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -67,6 +67,7 @@ from vllm.config.cache import (
    PrefixCachingHashAlgo,
 )
 from vllm.config.device import Device
+from vllm.config.lora import MaxLoRARanks
 from vllm.config.model import (
    ConvertOption,
    HfOverrides,
@@ -77,7 +78,12 @@ from vllm.config.model import (
 )
 from vllm.config.multimodal import MMCacheType, MMEncoderTPMode
 from vllm.config.observability import DetailedTraceModules
-from vllm.config.parallel import DistributedExecutorBackend, ExpertPlacementStrategy
+from vllm.config.parallel import (
+    All2AllBackend,
+    DataParallelBackend,
+    DistributedExecutorBackend,
+    ExpertPlacementStrategy,
+)
 from vllm.config.scheduler import SchedulerPolicy
 from vllm.config.utils import get_field
 from vllm.config.vllm import OptimizationLevel
@@ -257,7 +263,7 @@ def _compute_kwargs(cls: ConfigType) -> dict[str, dict[str, Any]]:
                    # VllmConfig's Fields have default_factory set to config classes.
                    # These could emit logs on init, which would be confusing.
                    with suppress_logging():
-                        default = default.default_factory()
+                        default = default.default_factory()  # type: ignore[call-arg]
        elif field.default_factory is not MISSING:
            default = field.default_factory()

@@ -373,7 +379,7 @@ class EngineArgs:
    dtype: ModelDType = ModelConfig.dtype
    kv_cache_dtype: CacheDType = CacheConfig.cache_dtype
    seed: int = ModelConfig.seed
-    max_model_len: int | None = ModelConfig.max_model_len
+    max_model_len: int = ModelConfig.max_model_len
    cudagraph_capture_sizes: list[int] | None = (
        CompilationConfig.cudagraph_capture_sizes
    )
@@ -405,9 +411,9 @@ class EngineArgs:
    data_parallel_rpc_port: int | None = None
    data_parallel_hybrid_lb: bool = False
    data_parallel_external_lb: bool = False
-    data_parallel_backend: str = ParallelConfig.data_parallel_backend
+    data_parallel_backend: DataParallelBackend = ParallelConfig.data_parallel_backend
    enable_expert_parallel: bool = ParallelConfig.enable_expert_parallel
-    all2all_backend: str = ParallelConfig.all2all_backend
+    all2all_backend: All2AllBackend = ParallelConfig.all2all_backend
    enable_dbo: bool = ParallelConfig.enable_dbo
    ubatch_size: int = ParallelConfig.ubatch_size
    dbo_decode_token_threshold: int = ParallelConfig.dbo_decode_token_threshold
@@ -425,7 +431,7 @@ class EngineArgs:
    max_parallel_loading_workers: int | None = (
        ParallelConfig.max_parallel_loading_workers
    )
-    block_size: BlockSize | None = CacheConfig.block_size
+    block_size: BlockSize = CacheConfig.block_size
    enable_prefix_caching: bool | None = None
    prefix_caching_hash_algo: PrefixCachingHashAlgo = (
        CacheConfig.prefix_caching_hash_algo
@@ -451,7 +457,7 @@ class EngineArgs:
    hf_token: bool | str | None = ModelConfig.hf_token
    hf_overrides: HfOverrides = get_field(ModelConfig, "hf_overrides")
    tokenizer_revision: str | None = ModelConfig.tokenizer_revision
-    quantization: QuantizationMethods | None = ModelConfig.quantization
+    quantization: QuantizationMethods | str | None = ModelConfig.quantization
    allow_deprecated_quantization: bool = ModelConfig.allow_deprecated_quantization
    enforce_eager: bool = ModelConfig.enforce_eager
    disable_custom_all_reduce: bool = ParallelConfig.disable_custom_all_reduce
@@ -479,11 +485,11 @@ class EngineArgs:
    )
    io_processor_plugin: str | None = None
    skip_mm_profiling: bool = MultiModalConfig.skip_mm_profiling
-    video_pruning_rate: float = MultiModalConfig.video_pruning_rate
+    video_pruning_rate: float | None = MultiModalConfig.video_pruning_rate
    # LoRA fields
    enable_lora: bool = False
    max_loras: int = LoRAConfig.max_loras
-    max_lora_rank: int = LoRAConfig.max_lora_rank
+    max_lora_rank: MaxLoRARanks = LoRAConfig.max_lora_rank
    default_mm_loras: dict[str, str] | None = LoRAConfig.default_mm_loras
    fully_sharded_loras: bool = LoRAConfig.fully_sharded_loras
    max_cpu_loras: int | None = LoRAConfig.max_cpu_loras
@@ -557,7 +563,7 @@ class EngineArgs:
        ModelConfig, "override_generation_config"
    )
    model_impl: str = ModelConfig.model_impl
-    override_attention_dtype: str = ModelConfig.override_attention_dtype
+    override_attention_dtype: str | None = ModelConfig.override_attention_dtype
    attention_backend: AttentionBackendEnum | None = AttentionConfig.backend

    calculate_kv_scales: bool = CacheConfig.calculate_kv_scales
@@ -569,7 +575,7 @@ class EngineArgs:
    additional_config: dict[str, Any] = get_field(VllmConfig, "additional_config")

    use_tqdm_on_load: bool = LoadConfig.use_tqdm_on_load
-    pt_load_map_location: str = LoadConfig.pt_load_map_location
+    pt_load_map_location: str | dict[str, str] = LoadConfig.pt_load_map_location

    logits_processors: list[str | type[LogitsProcessor]] | None = (
        ModelConfig.logits_processors
@@ -1280,7 +1286,7 @@ class EngineArgs:
            hf_config_path=self.hf_config_path,
            runner=self.runner,
            convert=self.convert,
-            tokenizer=self.tokenizer,
+            tokenizer=self.tokenizer,  # type: ignore[arg-type]
            tokenizer_mode=self.tokenizer_mode,
            trust_remote_code=self.trust_remote_code,
            allowed_local_media_path=self.allowed_local_media_path,
@@ -1445,12 +1451,16 @@ class EngineArgs:
            self.kv_cache_dtype, model_config
        )

+        assert self.enable_prefix_caching is not None, (
+            "enable_prefix_caching must be set by this point"
+        )
+
        cache_config = CacheConfig(
            block_size=self.block_size,
            gpu_memory_utilization=self.gpu_memory_utilization,
            kv_cache_memory_bytes=self.kv_cache_memory_bytes,
            swap_space=self.swap_space,
-            cache_dtype=resolved_cache_dtype,
+            cache_dtype=resolved_cache_dtype,  # type: ignore[arg-type]
            is_attention_free=model_config.is_attention_free,
            num_gpu_blocks_override=self.num_gpu_blocks_override,
            sliding_window=sliding_window,
@@ -1676,6 +1686,16 @@ class EngineArgs:
            target_parallel_config=parallel_config,
        )

+        assert self.max_num_batched_tokens is not None, (
+            "max_num_batched_tokens must be set by this point"
+        )
+        assert self.max_num_seqs is not None, "max_num_seqs must be set by this point"
+        assert self.enable_chunked_prefill is not None, (
+            "enable_chunked_prefill must be set by this point"
+        )
+        assert model_config.max_model_len is not None, (
+            "max_model_len must be set by this point"
+        )
        scheduler_config = SchedulerConfig(
            runner_type=model_config.runner_type,
            max_num_batched_tokens=self.max_num_batched_tokens,
@@ -2043,6 +2063,9 @@ class EngineArgs:
            )

        if orig_max_num_batched_tokens is None:
+            assert model_config.max_model_len is not None, (
+                "max_model_len must be set by this point"
+            )
            if not self.enable_chunked_prefill:
                # If max_model_len is too short, use the default for higher throughput.
                self.max_num_batched_tokens = max(
--- a/vllm/logger.py
+++ b/vllm/logger.py
@@ -38,7 +38,7 @@ def _use_color() -> bool:
    return False


-DEFAULT_LOGGING_CONFIG = {
+DEFAULT_LOGGING_CONFIG: dict[str, dict[str, Any] | Any] = {
    "formatters": {
        "vllm": {
            "class": "vllm.logging_utils.NewLineFormatter",
@@ -157,7 +157,7 @@ _METHODS_TO_PATCH = {


 def _configure_vllm_root_logger() -> None:
-    logging_config = dict[str, dict[str, Any] | Any]()
+    logging_config: dict[str, dict[str, Any] | Any] = {}

    if not envs.VLLM_CONFIGURE_LOGGING and envs.VLLM_LOGGING_CONFIG_PATH:
        raise RuntimeError(
@@ -225,7 +225,8 @@ def suppress_logging(level: int = logging.INFO) -> Generator[None, Any, None]:
    logging.disable(current_level)


-def current_formatter_type(lgr: Logger) -> Literal["color", "newline", None]:
+def current_formatter_type(logger: Logger) -> Literal["color", "newline", None]:
+    lgr: Logger | None = logger
    while lgr is not None:
        if lgr.handlers and len(lgr.handlers) == 1 and lgr.handlers[0].name == "vllm":
            formatter = lgr.handlers[0].formatter
--- a/vllm/outputs.py
+++ b/vllm/outputs.py
@@ -162,7 +162,7 @@ class RequestOutput:
                        completion.token_ids.extend(next_completion.token_ids)
                        if next_completion.logprobs:
                            assert completion.logprobs is not None
-                            completion.logprobs.extend(next_completion.logprobs)
+                            completion.logprobs.extend(next_completion.logprobs)  # type: ignore[arg-type]
                        completion.cumulative_logprob = (
                            next_completion.cumulative_logprob
                        )
--- a/vllm/v1/cudagraph_dispatcher.py
+++ b/vllm/v1/cudagraph_dispatcher.py
@@ -71,6 +71,9 @@ class CudagraphDispatcher:
        """Pre-compute the mapping from batch size to padded graph size."""
        max_size = self.compilation_config.max_cudagraph_capture_size
        capture_sizes = self.compilation_config.cudagraph_capture_sizes
+        assert capture_sizes is not None, (
+            "Cudagraph capture sizes must be set when cudagraphs are enabled."
+        )
        self._bs_to_padded_graph_size: list[int] = [0] * (max_size + 1)
        for end, start in zip(
            capture_sizes + [max_size + 1],
@@ -89,6 +92,7 @@ class CudagraphDispatcher:
            and self.cudagraph_mode != CUDAGraphMode.NONE
        ):
            for size in self.compilation_config.compile_sizes:
+                size = int(size)
                if size <= self.compilation_config.max_cudagraph_capture_size:
                    padded = self._bs_to_padded_graph_size[size]
                    if padded != size:
@@ -178,6 +182,9 @@ class CudagraphDispatcher:
        # guarantee all keys would be used. For example, if we allow lazy
        # capturing in future PR, some keys may never be triggered.
        if cudagraph_mode.mixed_mode() != CUDAGraphMode.NONE:
+            assert self.compilation_config.cudagraph_capture_sizes is not None, (
+                "Cudagraph capture sizes must be set when mixed mode is enabled."
+            )
            for bs, num_active_loras in product(
                self.compilation_config.cudagraph_capture_sizes, lora_cases
            ):
@@ -200,6 +207,9 @@ class CudagraphDispatcher:
                uniform_decode_query_len
                * self.vllm_config.scheduler_config.max_num_seqs
            )
+            assert self.compilation_config.cudagraph_capture_sizes is not None, (
+                "Cudagraph capture sizes must be set when full mode is enabled."
+            )
            cudagraph_capture_sizes_for_decode = [
                x
                for x in self.compilation_config.cudagraph_capture_sizes
@@ -262,6 +272,9 @@ class CudagraphDispatcher:
            else:
                # When not specializing, graphs are captured only with max_loras + 1,
                # so we must use max_loras + 1 for dispatch to find a matching graph.
+                assert self.vllm_config.lora_config is not None, (
+                    "LoRA config must be set when has_lora is True."
+                )
                effective_num_active_loras = self.vllm_config.lora_config.max_loras + 1

        batch_desc = self._create_padded_batch_descriptor(