diff --git a/tools/pre_commit/mypy.py b/tools/pre_commit/mypy.py index cc275ae08..1ba1f8156 100755 --- a/tools/pre_commit/mypy.py +++ b/tools/pre_commit/mypy.py @@ -29,7 +29,7 @@ SEPARATE_GROUPS = [ "tests", # v0 related "vllm/lora", - "vllm/model_executor", + "vllm/model_executor/layers", ] # TODO(woosuk): Include the code from Megatron and HuggingFace. diff --git a/vllm/model_executor/layers/sparse_attn_indexer.py b/vllm/model_executor/layers/sparse_attn_indexer.py index c34800247..a4ee5cc1f 100644 --- a/vllm/model_executor/layers/sparse_attn_indexer.py +++ b/vllm/model_executor/layers/sparse_attn_indexer.py @@ -96,6 +96,7 @@ def sparse_attn_indexer( topk_indices_buffer[: hidden_states.shape[0]] = -1 if has_prefill: prefill_metadata = attn_metadata.prefill + assert prefill_metadata is not None # Get the full shared workspace buffers once (will allocate on first use) workspace_manager = current_workspace_manager() @@ -170,6 +171,8 @@ def sparse_attn_indexer( if has_decode: decode_metadata = attn_metadata.decode + assert decode_metadata is not None + # kv_cache shape [ # kv_cache size requirement [num_block, block_size, n_head, head_dim], # we only have [num_block, block_size, head_dim], kv_cache = kv_cache.unsqueeze(-2) diff --git a/vllm/model_executor/model_loader/gguf_loader.py b/vllm/model_executor/model_loader/gguf_loader.py index 25fa3ba03..75d0b3425 100644 --- a/vllm/model_executor/model_loader/gguf_loader.py +++ b/vllm/model_executor/model_loader/gguf_loader.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os from collections.abc import Generator +from typing import TYPE_CHECKING, cast import gguf import regex as re @@ -27,6 +28,9 @@ from vllm.model_executor.model_loader.weight_utils import ( from vllm.transformers_utils.gguf_utils import detect_gguf_multimodal from vllm.utils.torch_utils import set_default_torch_dtype +if TYPE_CHECKING: + from vllm.model_executor.layers.quantization.gguf import GGUFConfig + logger = init_logger(__name__) @@ -350,10 +354,9 @@ class GGUFModelLoader(BaseModelLoader): for name, weight_type in weight_type_map.items() if weight_type in ("F32", "F16", "BF16") and name.endswith(".weight") ] - logger.debug( - "GGUF unquantized modules: %s", - unquant_names, - ) + logger.debug("GGUF unquantized modules: %s", unquant_names) + if TYPE_CHECKING: + vllm_config.quant_config = cast(GGUFConfig, vllm_config.quant_config) vllm_config.quant_config.unquantized_modules.extend(unquant_names) target_device = torch.device(device_config.device) diff --git a/vllm/model_executor/model_loader/runai_streamer_loader.py b/vllm/model_executor/model_loader/runai_streamer_loader.py index 782514210..47c3c99b1 100644 --- a/vllm/model_executor/model_loader/runai_streamer_loader.py +++ b/vllm/model_executor/model_loader/runai_streamer_loader.py @@ -27,28 +27,16 @@ class RunaiModelStreamerLoader(BaseModelLoader): def __init__(self, load_config: LoadConfig): super().__init__(load_config) - self._is_distributed = False + self._is_distributed: bool = False if load_config.model_loader_extra_config: extra_config = load_config.model_loader_extra_config - if "distributed" in extra_config and isinstance( - extra_config.get("distributed"), bool - ): - self._is_distributed = extra_config.get("distributed") - - if "concurrency" in extra_config and isinstance( - extra_config.get("concurrency"), int - ): - os.environ["RUNAI_STREAMER_CONCURRENCY"] = str( - extra_config.get("concurrency") - ) - - if "memory_limit" in extra_config and isinstance( - extra_config.get("memory_limit"), int - ): - os.environ["RUNAI_STREAMER_MEMORY_LIMIT"] = str( - extra_config.get("memory_limit") - ) + if isinstance(distributed := extra_config.get("distributed"), bool): + self._is_distributed = distributed + if isinstance(concurrency := extra_config.get("concurrency"), int): + os.environ["RUNAI_STREAMER_CONCURRENCY"] = str(concurrency) + if isinstance(memory_limit := extra_config.get("memory_limit"), int): + os.environ["RUNAI_STREAMER_MEMORY_LIMIT"] = str(memory_limit) runai_streamer_s3_endpoint = os.getenv("RUNAI_STREAMER_S3_ENDPOINT") aws_endpoint_url = os.getenv("AWS_ENDPOINT_URL") @@ -93,7 +81,7 @@ class RunaiModelStreamerLoader(BaseModelLoader): return hf_weights_files def _get_weights_iterator( - self, model_or_path: str, revision: str + self, model_or_path: str, revision: str | None ) -> Generator[tuple[str, torch.Tensor], None, None]: """Get an iterator for the model weights based on the load format.""" hf_weights_files = self._prepare_weights(model_or_path, revision) diff --git a/vllm/model_executor/model_loader/sharded_state_loader.py b/vllm/model_executor/model_loader/sharded_state_loader.py index e27cedd99..a87731e8b 100644 --- a/vllm/model_executor/model_loader/sharded_state_loader.py +++ b/vllm/model_executor/model_loader/sharded_state_loader.py @@ -6,6 +6,7 @@ import glob import os import time from collections.abc import Generator +from copy import copy from typing import Any import torch @@ -42,7 +43,7 @@ class ShardedStateLoader(BaseModelLoader): extra_config = ( {} if load_config.model_loader_extra_config is None - else load_config.model_loader_extra_config.copy() + else copy(load_config.model_loader_extra_config) ) self.pattern = extra_config.pop("pattern", self.DEFAULT_PATTERN) if extra_config: diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py index 1ff1a448a..3e6ed248f 100644 --- a/vllm/model_executor/model_loader/tensorizer.py +++ b/vllm/model_executor/model_loader/tensorizer.py @@ -674,7 +674,8 @@ def serialize_vllm_model( key = f.read() encryption_params = EncryptionParams(key=key) - output_file = tensorizer_args.tensorizer_uri + if (output_file := tensorizer_args.tensorizer_uri) is None: + raise ValueError("tensorizer_uri must be specified for serialization.") if tensorizer_config._is_sharded: from vllm.distributed import get_tensor_model_parallel_rank diff --git a/vllm/model_executor/model_loader/tensorizer_loader.py b/vllm/model_executor/model_loader/tensorizer_loader.py index a3e3c9fd0..c5bff1312 100644 --- a/vllm/model_executor/model_loader/tensorizer_loader.py +++ b/vllm/model_executor/model_loader/tensorizer_loader.py @@ -121,6 +121,7 @@ class TensorizerLoader(BaseModelLoader): if parallel_config.tensor_parallel_size > 1: from vllm.distributed import get_tensor_model_parallel_rank + assert self.tensorizer_config.tensorizer_uri is not None self.tensorizer_config.tensorizer_uri = ( self.tensorizer_config.tensorizer_uri % get_tensor_model_parallel_rank() ) diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py index dc525c454..8f370717d 100644 --- a/vllm/model_executor/model_loader/utils.py +++ b/vllm/model_executor/model_loader/utils.py @@ -6,6 +6,7 @@ import inspect import warnings from contextlib import contextmanager from dataclasses import dataclass, field +from typing import Any import torch from torch import nn @@ -71,7 +72,7 @@ def initialize_model( model_class, ) # try to be compatible with old-style model class - kwargs = {} + kwargs: dict[str, Any] = {} if "prefix" in all_params: kwargs["prefix"] = prefix if "config" in all_params: diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index dd4bf636e..4f840cba6 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -257,6 +257,8 @@ def convert_bin_to_safetensor_file( def get_quant_config( model_config: ModelConfig, load_config: LoadConfig ) -> QuantizationConfig: + if model_config.quantization is None: + raise ValueError("Model quantization method is not specified in the config.") quant_cls = get_quantization_config(model_config.quantization) # GGUF doesn't have config file @@ -307,6 +309,11 @@ def get_quant_config( # if hf_quant_config is None, we will try to get config from # hf_overrides hf_overrides = model_config.hf_overrides + if not isinstance(hf_overrides, dict): + raise ValueError( + "hf_overrides must be a dict for get_quant_config " + "to get the quantization config from it." + ) quantization_config_file = hf_overrides.get("quantization_config_file", None) if quantization_config_file is not None: if hasattr(quant_cls, "from_config_file"): @@ -1087,7 +1094,7 @@ def multi_thread_pt_weights_iterator( def get_gguf_extra_tensor_names( - gguf_file: str, gguf_to_hf_name_map: dict[str, str] + gguf_file: str | Path, gguf_to_hf_name_map: dict[str, str] ) -> list[str]: reader = gguf.GGUFReader(gguf_file) expected_gguf_keys = set(gguf_to_hf_name_map.keys()) @@ -1097,7 +1104,7 @@ def get_gguf_extra_tensor_names( def get_gguf_weight_type_map( - gguf_file: str, gguf_to_hf_name_map: dict[str, str] + gguf_file: str | Path, gguf_to_hf_name_map: dict[str, str] ) -> dict[str, str]: """ Return GGUF mapped weight's name and its quant type @@ -1111,7 +1118,7 @@ def get_gguf_weight_type_map( def gguf_quant_weights_iterator( - gguf_file: str, gguf_to_hf_name_map: dict[str, str] + gguf_file: str | Path, gguf_to_hf_name_map: dict[str, str] ) -> Generator[tuple[str, torch.Tensor], None, None]: """ Iterate over the quant weights in the model gguf files and convert diff --git a/vllm/model_executor/parameter.py b/vllm/model_executor/parameter.py index 8f7a69482..410e27749 100644 --- a/vllm/model_executor/parameter.py +++ b/vllm/model_executor/parameter.py @@ -154,8 +154,8 @@ class _ColumnvLLMParameter(BasevLLMParameter): self.data.copy_(loaded_weight) def load_merged_column_weight(self, loaded_weight: torch.Tensor, **kwargs): - shard_offset = kwargs.get("shard_offset") - shard_size = kwargs.get("shard_size") + shard_offset: int = kwargs["shard_offset"] + shard_size: int = kwargs["shard_size"] # TODO: move these to PackedColumnParameter and PackedvLLMParameter if ( @@ -176,10 +176,10 @@ class _ColumnvLLMParameter(BasevLLMParameter): param_data.copy_(loaded_weight) def load_qkv_weight(self, loaded_weight: torch.Tensor, **kwargs): - shard_offset = kwargs.get("shard_offset") - shard_size = kwargs.get("shard_size") - shard_id = kwargs.get("shard_id") - num_heads = kwargs.get("num_heads") + shard_offset: int = kwargs["shard_offset"] + shard_size: int = kwargs["shard_size"] + shard_id: str = kwargs["shard_id"] + num_heads: int = kwargs["num_heads"] # TODO: move these to PackedColumnParameter and PackedvLLMParameter if ( @@ -191,10 +191,10 @@ class _ColumnvLLMParameter(BasevLLMParameter): ) param_data = self.data - shard_id = self.tp_rank if shard_id == "q" else self.tp_rank // num_heads + shard_id_int = self.tp_rank if shard_id == "q" else self.tp_rank // num_heads param_data = param_data.narrow(self.output_dim, shard_offset, shard_size) loaded_weight = loaded_weight.narrow( - self.output_dim, shard_id * shard_size, shard_size + self.output_dim, shard_id_int * shard_size, shard_size ) assert param_data.shape == loaded_weight.shape