[CI] Enable mypy coverage for individual excluded files (#34292)
Signed-off-by: Lucas Kabela <lucaskabela@meta.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
@@ -36,7 +36,6 @@ SEPARATE_GROUPS = [
|
||||
|
||||
# TODO(woosuk): Include the code from Megatron and HuggingFace.
|
||||
EXCLUDE = [
|
||||
"vllm/engine/arg_utils.py",
|
||||
"vllm/model_executor/parallel_utils",
|
||||
"vllm/model_executor/models",
|
||||
"vllm/model_executor/layers/fla/ops",
|
||||
@@ -49,9 +48,6 @@ EXCLUDE = [
|
||||
"vllm/profiler",
|
||||
"vllm/reasoning",
|
||||
"vllm/tool_parser",
|
||||
"vllm/v1/cudagraph_dispatcher.py",
|
||||
"vllm/outputs.py",
|
||||
"vllm/logger.py",
|
||||
]
|
||||
|
||||
|
||||
|
||||
@@ -39,7 +39,7 @@ KVOffloadingBackend = Literal["native", "lmcache"]
|
||||
class CacheConfig:
|
||||
"""Configuration for the KV cache."""
|
||||
|
||||
block_size: SkipValidation[BlockSize] = None # type: ignore
|
||||
block_size: SkipValidation[BlockSize] = None # type: ignore[assignment]
|
||||
"""Size of a contiguous cache block in number of tokens. On CUDA devices,
|
||||
only block sizes up to 32 are supported.
|
||||
|
||||
|
||||
@@ -182,7 +182,7 @@ class ParallelConfig:
|
||||
threshold, microbatching will be used. Otherwise, the request will be
|
||||
processed in a single batch."""
|
||||
|
||||
disable_nccl_for_dp_synchronization: bool = Field(default=None)
|
||||
disable_nccl_for_dp_synchronization: bool | None = Field(default=None)
|
||||
"""Forces the dp synchronization logic in vllm/v1/worker/dp_utils.py
|
||||
to use Gloo instead of NCCL for its all reduce.
|
||||
|
||||
|
||||
@@ -115,7 +115,7 @@ class SchedulerConfig:
|
||||
|
||||
# scheduler class or path. "vllm.v1.core.sched.scheduler.Scheduler"
|
||||
# (default) or "mod.custom_class".
|
||||
scheduler_cls: str | type[object] = Field(default=None)
|
||||
scheduler_cls: str | type[object] | None = Field(default=None)
|
||||
"""The scheduler class to use. "vllm.v1.core.sched.scheduler.Scheduler" is
|
||||
the default scheduler. Can be a class directly or the path to a class of
|
||||
form "mod.custom_class"."""
|
||||
@@ -128,7 +128,7 @@ class SchedulerConfig:
|
||||
and starting configuration.
|
||||
"""
|
||||
|
||||
async_scheduling: bool = Field(default=None)
|
||||
async_scheduling: bool | None = Field(default=None)
|
||||
"""If set to False, disable async scheduling. Async scheduling helps to
|
||||
avoid gaps in GPU utilization, leading to better latency and throughput.
|
||||
"""
|
||||
|
||||
@@ -10,7 +10,7 @@ import json
|
||||
import pathlib
|
||||
import textwrap
|
||||
from collections.abc import Callable, Mapping, Sequence, Set
|
||||
from dataclasses import MISSING, Field, field, fields, is_dataclass
|
||||
from dataclasses import MISSING, field, fields, is_dataclass
|
||||
from itertools import pairwise
|
||||
from typing import TYPE_CHECKING, Any, Protocol, TypeVar, cast
|
||||
|
||||
@@ -66,7 +66,7 @@ def config(
|
||||
return decorator(cls)
|
||||
|
||||
|
||||
def get_field(cls: ConfigType, name: str) -> Field:
|
||||
def get_field(cls: ConfigType, name: str) -> Any:
|
||||
"""Get the default factory field of a dataclass by name. Used for getting
|
||||
default factory fields in `EngineArgs`."""
|
||||
if not is_dataclass(cls):
|
||||
|
||||
@@ -67,6 +67,7 @@ from vllm.config.cache import (
|
||||
PrefixCachingHashAlgo,
|
||||
)
|
||||
from vllm.config.device import Device
|
||||
from vllm.config.lora import MaxLoRARanks
|
||||
from vllm.config.model import (
|
||||
ConvertOption,
|
||||
HfOverrides,
|
||||
@@ -77,7 +78,12 @@ from vllm.config.model import (
|
||||
)
|
||||
from vllm.config.multimodal import MMCacheType, MMEncoderTPMode
|
||||
from vllm.config.observability import DetailedTraceModules
|
||||
from vllm.config.parallel import DistributedExecutorBackend, ExpertPlacementStrategy
|
||||
from vllm.config.parallel import (
|
||||
All2AllBackend,
|
||||
DataParallelBackend,
|
||||
DistributedExecutorBackend,
|
||||
ExpertPlacementStrategy,
|
||||
)
|
||||
from vllm.config.scheduler import SchedulerPolicy
|
||||
from vllm.config.utils import get_field
|
||||
from vllm.config.vllm import OptimizationLevel
|
||||
@@ -257,7 +263,7 @@ def _compute_kwargs(cls: ConfigType) -> dict[str, dict[str, Any]]:
|
||||
# VllmConfig's Fields have default_factory set to config classes.
|
||||
# These could emit logs on init, which would be confusing.
|
||||
with suppress_logging():
|
||||
default = default.default_factory()
|
||||
default = default.default_factory() # type: ignore[call-arg]
|
||||
elif field.default_factory is not MISSING:
|
||||
default = field.default_factory()
|
||||
|
||||
@@ -373,7 +379,7 @@ class EngineArgs:
|
||||
dtype: ModelDType = ModelConfig.dtype
|
||||
kv_cache_dtype: CacheDType = CacheConfig.cache_dtype
|
||||
seed: int = ModelConfig.seed
|
||||
max_model_len: int | None = ModelConfig.max_model_len
|
||||
max_model_len: int = ModelConfig.max_model_len
|
||||
cudagraph_capture_sizes: list[int] | None = (
|
||||
CompilationConfig.cudagraph_capture_sizes
|
||||
)
|
||||
@@ -405,9 +411,9 @@ class EngineArgs:
|
||||
data_parallel_rpc_port: int | None = None
|
||||
data_parallel_hybrid_lb: bool = False
|
||||
data_parallel_external_lb: bool = False
|
||||
data_parallel_backend: str = ParallelConfig.data_parallel_backend
|
||||
data_parallel_backend: DataParallelBackend = ParallelConfig.data_parallel_backend
|
||||
enable_expert_parallel: bool = ParallelConfig.enable_expert_parallel
|
||||
all2all_backend: str = ParallelConfig.all2all_backend
|
||||
all2all_backend: All2AllBackend = ParallelConfig.all2all_backend
|
||||
enable_dbo: bool = ParallelConfig.enable_dbo
|
||||
ubatch_size: int = ParallelConfig.ubatch_size
|
||||
dbo_decode_token_threshold: int = ParallelConfig.dbo_decode_token_threshold
|
||||
@@ -425,7 +431,7 @@ class EngineArgs:
|
||||
max_parallel_loading_workers: int | None = (
|
||||
ParallelConfig.max_parallel_loading_workers
|
||||
)
|
||||
block_size: BlockSize | None = CacheConfig.block_size
|
||||
block_size: BlockSize = CacheConfig.block_size
|
||||
enable_prefix_caching: bool | None = None
|
||||
prefix_caching_hash_algo: PrefixCachingHashAlgo = (
|
||||
CacheConfig.prefix_caching_hash_algo
|
||||
@@ -451,7 +457,7 @@ class EngineArgs:
|
||||
hf_token: bool | str | None = ModelConfig.hf_token
|
||||
hf_overrides: HfOverrides = get_field(ModelConfig, "hf_overrides")
|
||||
tokenizer_revision: str | None = ModelConfig.tokenizer_revision
|
||||
quantization: QuantizationMethods | None = ModelConfig.quantization
|
||||
quantization: QuantizationMethods | str | None = ModelConfig.quantization
|
||||
allow_deprecated_quantization: bool = ModelConfig.allow_deprecated_quantization
|
||||
enforce_eager: bool = ModelConfig.enforce_eager
|
||||
disable_custom_all_reduce: bool = ParallelConfig.disable_custom_all_reduce
|
||||
@@ -479,11 +485,11 @@ class EngineArgs:
|
||||
)
|
||||
io_processor_plugin: str | None = None
|
||||
skip_mm_profiling: bool = MultiModalConfig.skip_mm_profiling
|
||||
video_pruning_rate: float = MultiModalConfig.video_pruning_rate
|
||||
video_pruning_rate: float | None = MultiModalConfig.video_pruning_rate
|
||||
# LoRA fields
|
||||
enable_lora: bool = False
|
||||
max_loras: int = LoRAConfig.max_loras
|
||||
max_lora_rank: int = LoRAConfig.max_lora_rank
|
||||
max_lora_rank: MaxLoRARanks = LoRAConfig.max_lora_rank
|
||||
default_mm_loras: dict[str, str] | None = LoRAConfig.default_mm_loras
|
||||
fully_sharded_loras: bool = LoRAConfig.fully_sharded_loras
|
||||
max_cpu_loras: int | None = LoRAConfig.max_cpu_loras
|
||||
@@ -557,7 +563,7 @@ class EngineArgs:
|
||||
ModelConfig, "override_generation_config"
|
||||
)
|
||||
model_impl: str = ModelConfig.model_impl
|
||||
override_attention_dtype: str = ModelConfig.override_attention_dtype
|
||||
override_attention_dtype: str | None = ModelConfig.override_attention_dtype
|
||||
attention_backend: AttentionBackendEnum | None = AttentionConfig.backend
|
||||
|
||||
calculate_kv_scales: bool = CacheConfig.calculate_kv_scales
|
||||
@@ -569,7 +575,7 @@ class EngineArgs:
|
||||
additional_config: dict[str, Any] = get_field(VllmConfig, "additional_config")
|
||||
|
||||
use_tqdm_on_load: bool = LoadConfig.use_tqdm_on_load
|
||||
pt_load_map_location: str = LoadConfig.pt_load_map_location
|
||||
pt_load_map_location: str | dict[str, str] = LoadConfig.pt_load_map_location
|
||||
|
||||
logits_processors: list[str | type[LogitsProcessor]] | None = (
|
||||
ModelConfig.logits_processors
|
||||
@@ -1280,7 +1286,7 @@ class EngineArgs:
|
||||
hf_config_path=self.hf_config_path,
|
||||
runner=self.runner,
|
||||
convert=self.convert,
|
||||
tokenizer=self.tokenizer,
|
||||
tokenizer=self.tokenizer, # type: ignore[arg-type]
|
||||
tokenizer_mode=self.tokenizer_mode,
|
||||
trust_remote_code=self.trust_remote_code,
|
||||
allowed_local_media_path=self.allowed_local_media_path,
|
||||
@@ -1445,12 +1451,16 @@ class EngineArgs:
|
||||
self.kv_cache_dtype, model_config
|
||||
)
|
||||
|
||||
assert self.enable_prefix_caching is not None, (
|
||||
"enable_prefix_caching must be set by this point"
|
||||
)
|
||||
|
||||
cache_config = CacheConfig(
|
||||
block_size=self.block_size,
|
||||
gpu_memory_utilization=self.gpu_memory_utilization,
|
||||
kv_cache_memory_bytes=self.kv_cache_memory_bytes,
|
||||
swap_space=self.swap_space,
|
||||
cache_dtype=resolved_cache_dtype,
|
||||
cache_dtype=resolved_cache_dtype, # type: ignore[arg-type]
|
||||
is_attention_free=model_config.is_attention_free,
|
||||
num_gpu_blocks_override=self.num_gpu_blocks_override,
|
||||
sliding_window=sliding_window,
|
||||
@@ -1676,6 +1686,16 @@ class EngineArgs:
|
||||
target_parallel_config=parallel_config,
|
||||
)
|
||||
|
||||
assert self.max_num_batched_tokens is not None, (
|
||||
"max_num_batched_tokens must be set by this point"
|
||||
)
|
||||
assert self.max_num_seqs is not None, "max_num_seqs must be set by this point"
|
||||
assert self.enable_chunked_prefill is not None, (
|
||||
"enable_chunked_prefill must be set by this point"
|
||||
)
|
||||
assert model_config.max_model_len is not None, (
|
||||
"max_model_len must be set by this point"
|
||||
)
|
||||
scheduler_config = SchedulerConfig(
|
||||
runner_type=model_config.runner_type,
|
||||
max_num_batched_tokens=self.max_num_batched_tokens,
|
||||
@@ -2043,6 +2063,9 @@ class EngineArgs:
|
||||
)
|
||||
|
||||
if orig_max_num_batched_tokens is None:
|
||||
assert model_config.max_model_len is not None, (
|
||||
"max_model_len must be set by this point"
|
||||
)
|
||||
if not self.enable_chunked_prefill:
|
||||
# If max_model_len is too short, use the default for higher throughput.
|
||||
self.max_num_batched_tokens = max(
|
||||
|
||||
@@ -38,7 +38,7 @@ def _use_color() -> bool:
|
||||
return False
|
||||
|
||||
|
||||
DEFAULT_LOGGING_CONFIG = {
|
||||
DEFAULT_LOGGING_CONFIG: dict[str, dict[str, Any] | Any] = {
|
||||
"formatters": {
|
||||
"vllm": {
|
||||
"class": "vllm.logging_utils.NewLineFormatter",
|
||||
@@ -157,7 +157,7 @@ _METHODS_TO_PATCH = {
|
||||
|
||||
|
||||
def _configure_vllm_root_logger() -> None:
|
||||
logging_config = dict[str, dict[str, Any] | Any]()
|
||||
logging_config: dict[str, dict[str, Any] | Any] = {}
|
||||
|
||||
if not envs.VLLM_CONFIGURE_LOGGING and envs.VLLM_LOGGING_CONFIG_PATH:
|
||||
raise RuntimeError(
|
||||
@@ -225,7 +225,8 @@ def suppress_logging(level: int = logging.INFO) -> Generator[None, Any, None]:
|
||||
logging.disable(current_level)
|
||||
|
||||
|
||||
def current_formatter_type(lgr: Logger) -> Literal["color", "newline", None]:
|
||||
def current_formatter_type(logger: Logger) -> Literal["color", "newline", None]:
|
||||
lgr: Logger | None = logger
|
||||
while lgr is not None:
|
||||
if lgr.handlers and len(lgr.handlers) == 1 and lgr.handlers[0].name == "vllm":
|
||||
formatter = lgr.handlers[0].formatter
|
||||
|
||||
@@ -162,7 +162,7 @@ class RequestOutput:
|
||||
completion.token_ids.extend(next_completion.token_ids)
|
||||
if next_completion.logprobs:
|
||||
assert completion.logprobs is not None
|
||||
completion.logprobs.extend(next_completion.logprobs)
|
||||
completion.logprobs.extend(next_completion.logprobs) # type: ignore[arg-type]
|
||||
completion.cumulative_logprob = (
|
||||
next_completion.cumulative_logprob
|
||||
)
|
||||
|
||||
@@ -71,6 +71,9 @@ class CudagraphDispatcher:
|
||||
"""Pre-compute the mapping from batch size to padded graph size."""
|
||||
max_size = self.compilation_config.max_cudagraph_capture_size
|
||||
capture_sizes = self.compilation_config.cudagraph_capture_sizes
|
||||
assert capture_sizes is not None, (
|
||||
"Cudagraph capture sizes must be set when cudagraphs are enabled."
|
||||
)
|
||||
self._bs_to_padded_graph_size: list[int] = [0] * (max_size + 1)
|
||||
for end, start in zip(
|
||||
capture_sizes + [max_size + 1],
|
||||
@@ -89,6 +92,7 @@ class CudagraphDispatcher:
|
||||
and self.cudagraph_mode != CUDAGraphMode.NONE
|
||||
):
|
||||
for size in self.compilation_config.compile_sizes:
|
||||
size = int(size)
|
||||
if size <= self.compilation_config.max_cudagraph_capture_size:
|
||||
padded = self._bs_to_padded_graph_size[size]
|
||||
if padded != size:
|
||||
@@ -178,6 +182,9 @@ class CudagraphDispatcher:
|
||||
# guarantee all keys would be used. For example, if we allow lazy
|
||||
# capturing in future PR, some keys may never be triggered.
|
||||
if cudagraph_mode.mixed_mode() != CUDAGraphMode.NONE:
|
||||
assert self.compilation_config.cudagraph_capture_sizes is not None, (
|
||||
"Cudagraph capture sizes must be set when mixed mode is enabled."
|
||||
)
|
||||
for bs, num_active_loras in product(
|
||||
self.compilation_config.cudagraph_capture_sizes, lora_cases
|
||||
):
|
||||
@@ -200,6 +207,9 @@ class CudagraphDispatcher:
|
||||
uniform_decode_query_len
|
||||
* self.vllm_config.scheduler_config.max_num_seqs
|
||||
)
|
||||
assert self.compilation_config.cudagraph_capture_sizes is not None, (
|
||||
"Cudagraph capture sizes must be set when full mode is enabled."
|
||||
)
|
||||
cudagraph_capture_sizes_for_decode = [
|
||||
x
|
||||
for x in self.compilation_config.cudagraph_capture_sizes
|
||||
@@ -262,6 +272,9 @@ class CudagraphDispatcher:
|
||||
else:
|
||||
# When not specializing, graphs are captured only with max_loras + 1,
|
||||
# so we must use max_loras + 1 for dispatch to find a matching graph.
|
||||
assert self.vllm_config.lora_config is not None, (
|
||||
"LoRA config must be set when has_lora is True."
|
||||
)
|
||||
effective_num_active_loras = self.vllm_config.lora_config.max_loras + 1
|
||||
|
||||
batch_desc = self._create_padded_batch_descriptor(
|
||||
|
||||
Reference in New Issue
Block a user