[CI] Enable mypy coverage for individual excluded files (#34292)

Signed-off-by: Lucas Kabela <lucaskabela@meta.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
Lucas Kabela
2026-02-16 07:34:29 -08:00
committed by GitHub
parent 6930becd45
commit a3205beffb
9 changed files with 60 additions and 27 deletions

View File

@@ -36,7 +36,6 @@ SEPARATE_GROUPS = [
# TODO(woosuk): Include the code from Megatron and HuggingFace.
EXCLUDE = [
"vllm/engine/arg_utils.py",
"vllm/model_executor/parallel_utils",
"vllm/model_executor/models",
"vllm/model_executor/layers/fla/ops",
@@ -49,9 +48,6 @@ EXCLUDE = [
"vllm/profiler",
"vllm/reasoning",
"vllm/tool_parser",
"vllm/v1/cudagraph_dispatcher.py",
"vllm/outputs.py",
"vllm/logger.py",
]

View File

@@ -39,7 +39,7 @@ KVOffloadingBackend = Literal["native", "lmcache"]
class CacheConfig:
"""Configuration for the KV cache."""
block_size: SkipValidation[BlockSize] = None # type: ignore
block_size: SkipValidation[BlockSize] = None # type: ignore[assignment]
"""Size of a contiguous cache block in number of tokens. On CUDA devices,
only block sizes up to 32 are supported.

View File

@@ -182,7 +182,7 @@ class ParallelConfig:
threshold, microbatching will be used. Otherwise, the request will be
processed in a single batch."""
disable_nccl_for_dp_synchronization: bool = Field(default=None)
disable_nccl_for_dp_synchronization: bool | None = Field(default=None)
"""Forces the dp synchronization logic in vllm/v1/worker/dp_utils.py
to use Gloo instead of NCCL for its all reduce.

View File

@@ -115,7 +115,7 @@ class SchedulerConfig:
# scheduler class or path. "vllm.v1.core.sched.scheduler.Scheduler"
# (default) or "mod.custom_class".
scheduler_cls: str | type[object] = Field(default=None)
scheduler_cls: str | type[object] | None = Field(default=None)
"""The scheduler class to use. "vllm.v1.core.sched.scheduler.Scheduler" is
the default scheduler. Can be a class directly or the path to a class of
form "mod.custom_class"."""
@@ -128,7 +128,7 @@ class SchedulerConfig:
and starting configuration.
"""
async_scheduling: bool = Field(default=None)
async_scheduling: bool | None = Field(default=None)
"""If set to False, disable async scheduling. Async scheduling helps to
avoid gaps in GPU utilization, leading to better latency and throughput.
"""

View File

@@ -10,7 +10,7 @@ import json
import pathlib
import textwrap
from collections.abc import Callable, Mapping, Sequence, Set
from dataclasses import MISSING, Field, field, fields, is_dataclass
from dataclasses import MISSING, field, fields, is_dataclass
from itertools import pairwise
from typing import TYPE_CHECKING, Any, Protocol, TypeVar, cast
@@ -66,7 +66,7 @@ def config(
return decorator(cls)
def get_field(cls: ConfigType, name: str) -> Field:
def get_field(cls: ConfigType, name: str) -> Any:
"""Get the default factory field of a dataclass by name. Used for getting
default factory fields in `EngineArgs`."""
if not is_dataclass(cls):

View File

@@ -67,6 +67,7 @@ from vllm.config.cache import (
PrefixCachingHashAlgo,
)
from vllm.config.device import Device
from vllm.config.lora import MaxLoRARanks
from vllm.config.model import (
ConvertOption,
HfOverrides,
@@ -77,7 +78,12 @@ from vllm.config.model import (
)
from vllm.config.multimodal import MMCacheType, MMEncoderTPMode
from vllm.config.observability import DetailedTraceModules
from vllm.config.parallel import DistributedExecutorBackend, ExpertPlacementStrategy
from vllm.config.parallel import (
All2AllBackend,
DataParallelBackend,
DistributedExecutorBackend,
ExpertPlacementStrategy,
)
from vllm.config.scheduler import SchedulerPolicy
from vllm.config.utils import get_field
from vllm.config.vllm import OptimizationLevel
@@ -257,7 +263,7 @@ def _compute_kwargs(cls: ConfigType) -> dict[str, dict[str, Any]]:
# VllmConfig's Fields have default_factory set to config classes.
# These could emit logs on init, which would be confusing.
with suppress_logging():
default = default.default_factory()
default = default.default_factory() # type: ignore[call-arg]
elif field.default_factory is not MISSING:
default = field.default_factory()
@@ -373,7 +379,7 @@ class EngineArgs:
dtype: ModelDType = ModelConfig.dtype
kv_cache_dtype: CacheDType = CacheConfig.cache_dtype
seed: int = ModelConfig.seed
max_model_len: int | None = ModelConfig.max_model_len
max_model_len: int = ModelConfig.max_model_len
cudagraph_capture_sizes: list[int] | None = (
CompilationConfig.cudagraph_capture_sizes
)
@@ -405,9 +411,9 @@ class EngineArgs:
data_parallel_rpc_port: int | None = None
data_parallel_hybrid_lb: bool = False
data_parallel_external_lb: bool = False
data_parallel_backend: str = ParallelConfig.data_parallel_backend
data_parallel_backend: DataParallelBackend = ParallelConfig.data_parallel_backend
enable_expert_parallel: bool = ParallelConfig.enable_expert_parallel
all2all_backend: str = ParallelConfig.all2all_backend
all2all_backend: All2AllBackend = ParallelConfig.all2all_backend
enable_dbo: bool = ParallelConfig.enable_dbo
ubatch_size: int = ParallelConfig.ubatch_size
dbo_decode_token_threshold: int = ParallelConfig.dbo_decode_token_threshold
@@ -425,7 +431,7 @@ class EngineArgs:
max_parallel_loading_workers: int | None = (
ParallelConfig.max_parallel_loading_workers
)
block_size: BlockSize | None = CacheConfig.block_size
block_size: BlockSize = CacheConfig.block_size
enable_prefix_caching: bool | None = None
prefix_caching_hash_algo: PrefixCachingHashAlgo = (
CacheConfig.prefix_caching_hash_algo
@@ -451,7 +457,7 @@ class EngineArgs:
hf_token: bool | str | None = ModelConfig.hf_token
hf_overrides: HfOverrides = get_field(ModelConfig, "hf_overrides")
tokenizer_revision: str | None = ModelConfig.tokenizer_revision
quantization: QuantizationMethods | None = ModelConfig.quantization
quantization: QuantizationMethods | str | None = ModelConfig.quantization
allow_deprecated_quantization: bool = ModelConfig.allow_deprecated_quantization
enforce_eager: bool = ModelConfig.enforce_eager
disable_custom_all_reduce: bool = ParallelConfig.disable_custom_all_reduce
@@ -479,11 +485,11 @@ class EngineArgs:
)
io_processor_plugin: str | None = None
skip_mm_profiling: bool = MultiModalConfig.skip_mm_profiling
video_pruning_rate: float = MultiModalConfig.video_pruning_rate
video_pruning_rate: float | None = MultiModalConfig.video_pruning_rate
# LoRA fields
enable_lora: bool = False
max_loras: int = LoRAConfig.max_loras
max_lora_rank: int = LoRAConfig.max_lora_rank
max_lora_rank: MaxLoRARanks = LoRAConfig.max_lora_rank
default_mm_loras: dict[str, str] | None = LoRAConfig.default_mm_loras
fully_sharded_loras: bool = LoRAConfig.fully_sharded_loras
max_cpu_loras: int | None = LoRAConfig.max_cpu_loras
@@ -557,7 +563,7 @@ class EngineArgs:
ModelConfig, "override_generation_config"
)
model_impl: str = ModelConfig.model_impl
override_attention_dtype: str = ModelConfig.override_attention_dtype
override_attention_dtype: str | None = ModelConfig.override_attention_dtype
attention_backend: AttentionBackendEnum | None = AttentionConfig.backend
calculate_kv_scales: bool = CacheConfig.calculate_kv_scales
@@ -569,7 +575,7 @@ class EngineArgs:
additional_config: dict[str, Any] = get_field(VllmConfig, "additional_config")
use_tqdm_on_load: bool = LoadConfig.use_tqdm_on_load
pt_load_map_location: str = LoadConfig.pt_load_map_location
pt_load_map_location: str | dict[str, str] = LoadConfig.pt_load_map_location
logits_processors: list[str | type[LogitsProcessor]] | None = (
ModelConfig.logits_processors
@@ -1280,7 +1286,7 @@ class EngineArgs:
hf_config_path=self.hf_config_path,
runner=self.runner,
convert=self.convert,
tokenizer=self.tokenizer,
tokenizer=self.tokenizer, # type: ignore[arg-type]
tokenizer_mode=self.tokenizer_mode,
trust_remote_code=self.trust_remote_code,
allowed_local_media_path=self.allowed_local_media_path,
@@ -1445,12 +1451,16 @@ class EngineArgs:
self.kv_cache_dtype, model_config
)
assert self.enable_prefix_caching is not None, (
"enable_prefix_caching must be set by this point"
)
cache_config = CacheConfig(
block_size=self.block_size,
gpu_memory_utilization=self.gpu_memory_utilization,
kv_cache_memory_bytes=self.kv_cache_memory_bytes,
swap_space=self.swap_space,
cache_dtype=resolved_cache_dtype,
cache_dtype=resolved_cache_dtype, # type: ignore[arg-type]
is_attention_free=model_config.is_attention_free,
num_gpu_blocks_override=self.num_gpu_blocks_override,
sliding_window=sliding_window,
@@ -1676,6 +1686,16 @@ class EngineArgs:
target_parallel_config=parallel_config,
)
assert self.max_num_batched_tokens is not None, (
"max_num_batched_tokens must be set by this point"
)
assert self.max_num_seqs is not None, "max_num_seqs must be set by this point"
assert self.enable_chunked_prefill is not None, (
"enable_chunked_prefill must be set by this point"
)
assert model_config.max_model_len is not None, (
"max_model_len must be set by this point"
)
scheduler_config = SchedulerConfig(
runner_type=model_config.runner_type,
max_num_batched_tokens=self.max_num_batched_tokens,
@@ -2043,6 +2063,9 @@ class EngineArgs:
)
if orig_max_num_batched_tokens is None:
assert model_config.max_model_len is not None, (
"max_model_len must be set by this point"
)
if not self.enable_chunked_prefill:
# If max_model_len is too short, use the default for higher throughput.
self.max_num_batched_tokens = max(

View File

@@ -38,7 +38,7 @@ def _use_color() -> bool:
return False
DEFAULT_LOGGING_CONFIG = {
DEFAULT_LOGGING_CONFIG: dict[str, dict[str, Any] | Any] = {
"formatters": {
"vllm": {
"class": "vllm.logging_utils.NewLineFormatter",
@@ -157,7 +157,7 @@ _METHODS_TO_PATCH = {
def _configure_vllm_root_logger() -> None:
logging_config = dict[str, dict[str, Any] | Any]()
logging_config: dict[str, dict[str, Any] | Any] = {}
if not envs.VLLM_CONFIGURE_LOGGING and envs.VLLM_LOGGING_CONFIG_PATH:
raise RuntimeError(
@@ -225,7 +225,8 @@ def suppress_logging(level: int = logging.INFO) -> Generator[None, Any, None]:
logging.disable(current_level)
def current_formatter_type(lgr: Logger) -> Literal["color", "newline", None]:
def current_formatter_type(logger: Logger) -> Literal["color", "newline", None]:
lgr: Logger | None = logger
while lgr is not None:
if lgr.handlers and len(lgr.handlers) == 1 and lgr.handlers[0].name == "vllm":
formatter = lgr.handlers[0].formatter

View File

@@ -162,7 +162,7 @@ class RequestOutput:
completion.token_ids.extend(next_completion.token_ids)
if next_completion.logprobs:
assert completion.logprobs is not None
completion.logprobs.extend(next_completion.logprobs)
completion.logprobs.extend(next_completion.logprobs) # type: ignore[arg-type]
completion.cumulative_logprob = (
next_completion.cumulative_logprob
)

View File

@@ -71,6 +71,9 @@ class CudagraphDispatcher:
"""Pre-compute the mapping from batch size to padded graph size."""
max_size = self.compilation_config.max_cudagraph_capture_size
capture_sizes = self.compilation_config.cudagraph_capture_sizes
assert capture_sizes is not None, (
"Cudagraph capture sizes must be set when cudagraphs are enabled."
)
self._bs_to_padded_graph_size: list[int] = [0] * (max_size + 1)
for end, start in zip(
capture_sizes + [max_size + 1],
@@ -89,6 +92,7 @@ class CudagraphDispatcher:
and self.cudagraph_mode != CUDAGraphMode.NONE
):
for size in self.compilation_config.compile_sizes:
size = int(size)
if size <= self.compilation_config.max_cudagraph_capture_size:
padded = self._bs_to_padded_graph_size[size]
if padded != size:
@@ -178,6 +182,9 @@ class CudagraphDispatcher:
# guarantee all keys would be used. For example, if we allow lazy
# capturing in a future PR, some keys may never be triggered.
if cudagraph_mode.mixed_mode() != CUDAGraphMode.NONE:
assert self.compilation_config.cudagraph_capture_sizes is not None, (
"Cudagraph capture sizes must be set when mixed mode is enabled."
)
for bs, num_active_loras in product(
self.compilation_config.cudagraph_capture_sizes, lora_cases
):
@@ -200,6 +207,9 @@ class CudagraphDispatcher:
uniform_decode_query_len
* self.vllm_config.scheduler_config.max_num_seqs
)
assert self.compilation_config.cudagraph_capture_sizes is not None, (
"Cudagraph capture sizes must be set when full mode is enabled."
)
cudagraph_capture_sizes_for_decode = [
x
for x in self.compilation_config.cudagraph_capture_sizes
@@ -262,6 +272,9 @@ class CudagraphDispatcher:
else:
# When not specializing, graphs are captured only with max_loras + 1,
# so we must use max_loras + 1 for dispatch to find a matching graph.
assert self.vllm_config.lora_config is not None, (
"LoRA config must be set when has_lora is True."
)
effective_num_active_loras = self.vllm_config.lora_config.max_loras + 1
batch_desc = self._create_padded_batch_descriptor(