From 7409204d71414318e8f64ba243a7f87fc3dc05cd Mon Sep 17 00:00:00 2001 From: biondizzle Date: Mon, 18 May 2026 22:33:51 +0000 Subject: [PATCH] Use nightly's deepseek_v4.py + attention as base, add only NVFP4 mapper The upstream deepseek_v4.py has imports that don't exist in the nightly Docker image (norm_gate_linear, breakable_cudagraph, etc.). Use the nightly's own files as the base and add only the minimal NVFP4 changes: - Add _make_deepseek_v4_nvfp4_weights_mapper() for checkpoint key mapping - Select NVFP4 mapper when quant_config is modelopt_fp4 - cos_sin_cache float32 fix in attention - Remove utils.py patch (not needed) --- vllm/patches/deepseek_v4.py | 220 +++++++++----------- vllm/patches/deepseek_v4_attention.py | 3 +- vllm/patches/utils.py | 289 -------------------------- 3 files changed, 98 insertions(+), 414 deletions(-) delete mode 100644 vllm/patches/utils.py diff --git a/vllm/patches/deepseek_v4.py b/vllm/patches/deepseek_v4.py index a9ab511a..31eb7f44 100644 --- a/vllm/patches/deepseek_v4.py +++ b/vllm/patches/deepseek_v4.py @@ -23,14 +23,11 @@ from vllm.model_executor.layers.deepseek_v4_attention import ( DeepseekV4MLAModules, DeepseekV4MultiHeadLatentAttentionWrapper, ) -from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.fused_moe import FusedMoE, GateLinear from vllm.model_executor.layers.fused_moe.layer import UnquantizedFusedMoEMethod from vllm.model_executor.layers.fused_moe.router.fused_topk_bias_router import ( fused_topk_bias, ) -from vllm.model_executor.layers.fused_moe.router.norm_gate_linear import ( - NormGateLinear, -) from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( ColumnParallelLinear, @@ -38,12 +35,6 @@ from vllm.model_executor.layers.linear import ( RowParallelLinear, ) from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.mhc import ( - HCHeadOp, - MHCFusedPostPreOp, - MHCPostOp, - MHCPreOp, -) from vllm.model_executor.layers.quantization import ( QuantizationConfig, QuantizationMethods, @@ -758,23 +749,23 @@ class DeepseekV4MoE(nn.Module): "deep_gemm_mega_moe for this checkpoint." ) - # Fused RMSNorm + gate: owns both ffn_norm and the gate matmul. - self.norm_gate = NormGateLinear( - hidden_size=config.hidden_size, - num_experts=config.n_routed_experts, - rms_eps=config.rms_norm_eps, - prefix=f"{prefix}.norm_gate", + self.gate = GateLinear( + config.hidden_size, + config.n_routed_experts, + out_dtype=torch.float32, + bias=False, + prefix=f"{prefix}.gate", ) - # Routing-side tensors live on ``norm_gate`` directly (not on the - # inner gate); they are initialized to None in NormGatedLinear and - # populated below depending on the MoE variant. + self.gate.e_score_correction_bias = None + self.gate.tid2eid = None is_hash_moe = extract_layer_index(prefix) < config.num_hash_layers self.hash_indices_dtype = torch.int64 if self.use_mega_moe else torch.int32 + if is_hash_moe: # hash MoE doesn't use e_score_correction_bias # Use randint instead of empty to avoid garbage values causing # invalid memory access in dummy mode (--load-format="dummy") - self.norm_gate.tid2eid = nn.Parameter( + self.gate.tid2eid = nn.Parameter( torch.randint( 0, config.n_routed_experts, @@ -784,7 +775,7 @@ class DeepseekV4MoE(nn.Module): requires_grad=False, ) elif getattr(config, "topk_method", None) == "noaux_tc": - self.norm_gate.e_score_correction_bias = nn.Parameter( + self.gate.e_score_correction_bias = nn.Parameter( torch.empty(config.n_routed_experts, dtype=torch.float32), requires_grad=False, ) @@ -847,9 +838,10 @@ class DeepseekV4MoE(nn.Module): self.n_local_experts = config.n_routed_experts // self.tp_size self.experts_start_idx = self.tp_rank * self.n_local_experts self.experts_end_idx = self.experts_start_idx + self.n_local_experts - # We don't pass `gate` into FusedMoE + self.experts = FusedMoE( shared_experts=self.shared_experts, + gate=self.gate, num_experts=config.n_routed_experts, top_k=config.num_experts_per_tok, hidden_size=config.hidden_size, @@ -859,8 +851,8 @@ class DeepseekV4MoE(nn.Module): prefix=f"{prefix}.experts", scoring_func=self.scoring_func, routed_scaling_factor=self.routed_scaling_factor, - e_score_correction_bias=self.norm_gate.e_score_correction_bias, - hash_indices_table=self.norm_gate.tid2eid, + e_score_correction_bias=self.gate.e_score_correction_bias, + hash_indices_table=self.gate.tid2eid, swiglu_limit=self.swiglu_limit, router_logits_dtype=torch.float32, ) @@ -868,40 +860,40 @@ class DeepseekV4MoE(nn.Module): def forward( self, hidden_states: torch.Tensor, input_ids: torch.Tensor | None = None ) -> torch.Tensor: - if self.norm_gate.tid2eid is not None and input_ids is None: + if self.gate.tid2eid is not None and input_ids is None: raise ValueError("DeepSeek V4 hash MoE routing requires input_ids.") if not self.use_mega_moe: return self._forward_fused_moe(hidden_states, input_ids) org_shape = hidden_states.shape - normed_x, router_logits = self.norm_gate(hidden_states) + router_logits, _ = self.gate(hidden_states) topk_weights, topk_ids = fused_topk_bias( - hidden_states=normed_x, + hidden_states=hidden_states, gating_output=router_logits, scoring_func=self.scoring_func, - e_score_correction_bias=self.norm_gate.e_score_correction_bias.data - if self.norm_gate.e_score_correction_bias is not None + e_score_correction_bias=self.gate.e_score_correction_bias.data + if self.gate.e_score_correction_bias is not None else None, topk=self.n_activated_experts, renormalize=self.renormalize, indices_type=self.hash_indices_dtype, input_tokens=input_ids, - hash_indices_table=self.norm_gate.tid2eid, + hash_indices_table=self.gate.tid2eid, routed_scaling_factor=self.routed_scaling_factor, ) activation_clamp = ( float(self.swiglu_limit) if self.swiglu_limit is not None else None ) final_hidden_states = self.experts( - normed_x, + hidden_states, topk_weights, topk_ids, activation_clamp=activation_clamp, ) if self.shared_experts is not None: - shared_output = self.shared_experts(normed_x) + shared_output = self.shared_experts(hidden_states) final_hidden_states += shared_output return final_hidden_states.view(org_shape) @@ -909,14 +901,21 @@ class DeepseekV4MoE(nn.Module): def _forward_fused_moe( self, hidden_states: torch.Tensor, input_ids: torch.Tensor | None = None ) -> torch.Tensor: - assert not self.experts.is_internal_router org_shape = hidden_states.shape - normed_x, router_logits = self.norm_gate(hidden_states) - final_hidden_states = self.experts( - hidden_states=normed_x, - router_logits=router_logits, - input_ids=input_ids, - ) + if self.experts.is_internal_router: + # In this case, the gate/router runs inside the FusedMoE class + final_hidden_states = self.experts( + hidden_states=hidden_states, + router_logits=hidden_states, + input_ids=input_ids, + ) + else: + router_logits, _ = self.gate(hidden_states) + final_hidden_states = self.experts( + hidden_states=hidden_states, + router_logits=router_logits, + input_ids=input_ids, + ) return final_hidden_states.view(org_shape) @@ -1120,8 +1119,7 @@ class DeepseekV4DecoderLayer(nn.Module): self.ffn = DeepseekV4MoE(vllm_config, prefix=f"{prefix}.ffn") self.attn_norm = RMSNorm(self.hidden_size, self.rms_norm_eps) - # ``ffn_norm`` is owned by ``self.ffn.norm_gate`` (fused with the - # router gate matmul); see ``NormGatedLinear``. + self.ffn_norm = RMSNorm(self.hidden_size, self.rms_norm_eps) self.hc_mult = config.hc_mult self.hc_sinkhorn_iters = config.hc_sinkhorn_iters self.hc_eps = config.hc_eps @@ -1170,9 +1168,6 @@ class DeepseekV4DecoderLayer(nn.Module): ), requires_grad=False, ) - self.mhc_pre = MHCPreOp() - self.mhc_post = MHCPostOp() - self.mhc_fused_post_pre = MHCFusedPostPreOp() def hc_pre( self, @@ -1181,7 +1176,7 @@ class DeepseekV4DecoderLayer(nn.Module): hc_scale: torch.Tensor, hc_base: torch.Tensor, ): - post_mix, res_mix, layer_input = self.mhc_pre( + post_mix, res_mix, layer_input = torch.ops.vllm.mhc_pre( residual=x, fn=hc_fn, hc_scale=hc_scale, @@ -1201,17 +1196,17 @@ class DeepseekV4DecoderLayer(nn.Module): post: torch.Tensor, comb: torch.Tensor, ): - return self.mhc_post(x, residual, post, comb) + return torch.ops.vllm.mhc_post(x, residual, post, comb) - def _forward_cuda( + def forward( self, x: torch.Tensor, positions: torch.Tensor, input_ids: torch.Tensor | None, - post_mix: torch.Tensor | None = None, - res_mix: torch.Tensor | None = None, - residual: torch.Tensor | None = None, - ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + post_mix: torch.Tensor | None, + res_mix: torch.Tensor | None, + residual: torch.Tensor | None, + ) -> torch.Tensor: if residual is None: # Run standalone hc_pre on first layer residual = x @@ -1219,7 +1214,7 @@ class DeepseekV4DecoderLayer(nn.Module): x, self.hc_attn_fn, self.hc_attn_scale, self.hc_attn_base ) else: - residual, post_mix, res_mix, x = self.mhc_fused_post_pre( + residual, post_mix, res_mix, x = torch.ops.vllm.mhc_fused_post_pre( x, residual, post_mix, @@ -1237,7 +1232,7 @@ class DeepseekV4DecoderLayer(nn.Module): x = self.attn_norm(x) x = self.attn(positions, x, None) - residual, post_mix, res_mix, x = self.mhc_fused_post_pre( + residual, post_mix, res_mix, x = torch.ops.vllm.mhc_fused_post_pre( x, residual, post_mix, @@ -1251,65 +1246,29 @@ class DeepseekV4DecoderLayer(nn.Module): self.hc_post_alpha, self.hc_sinkhorn_iters, ) - # ffn_norm is now folded into self.ffn.norm_gate; ffn() takes - # the pre-norm activation directly. + + x = self.ffn_norm(x) x = self.ffn(x, input_ids) return x, residual, post_mix, res_mix - def _forward_rocm( - self, - x: torch.Tensor, - positions: torch.Tensor, - input_ids: torch.Tensor | None, - post_mix: torch.Tensor | None = None, - res_mix: torch.Tensor | None = None, - residual: torch.Tensor | None = None, - ) -> tuple[ - torch.Tensor, torch.Tensor | None, torch.Tensor | None, torch.Tensor | None - ]: - residual = x - x, post, comb = self.hc_pre( - x, self.hc_attn_fn, self.hc_attn_scale, self.hc_attn_base - ) - x = self.attn_norm(x) - x = self.attn(positions, x, None) - x = self.hc_post(x, residual, post, comb) - - residual = x - x, post, comb = self.hc_pre( - x, self.hc_ffn_fn, self.hc_ffn_scale, self.hc_ffn_base - ) - # ffn_norm is now folded into self.ffn.norm_gate; ffn() takes - # the pre-norm activation directly. - x = self.ffn(x, input_ids) - x = self.hc_post(x, residual, post, comb) - return x, None, None, None - - def forward( - self, - x: torch.Tensor, - positions: torch.Tensor, - input_ids: torch.Tensor | None, - post_mix: torch.Tensor | None = None, - res_mix: torch.Tensor | None = None, - residual: torch.Tensor | None = None, - ) -> tuple[ - torch.Tensor, torch.Tensor | None, torch.Tensor | None, torch.Tensor | None - ]: - if current_platform.is_rocm(): - return self._forward_rocm( - x, positions, input_ids, post_mix, res_mix, residual - ) - - return self._forward_cuda(x, positions, input_ids, post_mix, res_mix, residual) - @support_torch_compile class DeepseekV4Model(nn.Module): - def __init__(self, *, vllm_config: Vllm_config, prefix: str = ""): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config + + # Select weight mapper based on quantization method. + # NVFP4 (modelopt_fp4) checkpoints use different key naming + # than the default MXFP4 format. + quant_config = vllm_config.quant_config + if quant_config is not None and getattr(quant_config, "get_name", lambda: None)() == "modelopt_fp4": + self.hf_to_vllm_mapper = _make_deepseek_v4_nvfp4_weights_mapper() + elif getattr(config, "expert_dtype", "fp4") != "fp4": + self.hf_to_vllm_mapper = _make_deepseek_v4_weights_mapper("fp8") + else: + self.hf_to_vllm_mapper = _make_deepseek_v4_weights_mapper("fp4") quant_config = vllm_config.quant_config self.config = config self.use_mega_moe = ( @@ -1392,7 +1351,7 @@ class DeepseekV4Model(nn.Module): torch.empty(1, dtype=torch.float32), requires_grad=False, ) - self.hc_head_op = HCHeadOp() + # Pre-hc_head residual stream buffer for the MTP draft. Stable # address (outside the cudagraph pool) so the copy_ in forward() # refreshes it correctly across captured shapes. @@ -1462,7 +1421,7 @@ class DeepseekV4Model(nn.Module): res_mix, residual, ) - if layer is not None and current_platform.is_cuda(): + else: hidden_states = layer.hc_post(hidden_states, residual, post_mix, res_mix) if not get_pp_group().is_last_rank: @@ -1472,7 +1431,7 @@ class DeepseekV4Model(nn.Module): num_tokens = hidden_states.shape[0] self._mtp_hidden_buffer[:num_tokens].copy_(hidden_states.flatten(1)) - hidden_states = self.hc_head_op( + hidden_states = hc_head( hidden_states, self.hc_head_fn, self.hc_head_scale, @@ -1601,6 +1560,36 @@ class DeepseekV4Model(nn.Module): layer.ffn.finalize_mega_moe_weights() +@torch.compile(backend=current_platform.simple_compile_backend) +def hc_head( + hidden_states: torch.Tensor, + hc_fn: torch.Tensor, + hc_scale: torch.Tensor, + hc_base: torch.Tensor, + rms_norm_eps: float, + hc_eps: float, +) -> torch.Tensor: + hc_mult, hidden_size = hidden_states.shape[-2:] + outer_shape = hidden_states.shape[:-2] + hs_flat = hidden_states.view(-1, hc_mult, hidden_size) + num_tokens = hs_flat.shape[0] + out = torch.empty( + num_tokens, hidden_size, dtype=torch.bfloat16, device=hidden_states.device + ) + torch.ops.vllm.hc_head_fused_kernel( + hs_flat, + hc_fn, + hc_scale, + hc_base, + out, + hidden_size, + rms_norm_eps, + hc_eps, + hc_mult, + ) + return out.view(*outer_shape, hidden_size) + + def _make_deepseek_v4_weights_mapper(expert_dtype: str) -> WeightsMapper: if expert_dtype == "fp4": # MXFP4 experts use Mxfp4MoEMethod, which registers scales as @@ -1630,13 +1619,7 @@ def _make_deepseek_v4_weights_mapper(expert_dtype: str) -> WeightsMapper: orig_to_new_suffix={ "head.weight": "lm_head.weight", "embed.weight": "embed_tokens.weight", - # Pre-MoE norm + gate are now owned by ``DeepseekV4MoE.norm_gate`` - # (see NormGatedLinear). - ".ffn_norm.weight": ".ffn.norm_gate.norm.weight", - ".ffn.gate.weight": ".ffn.norm_gate.gate.weight", - ".ffn.gate.bias": ".ffn.norm_gate.e_score_correction_bias", - # Hash MoE table also moved off the inner gate. - ".ffn.gate.tid2eid": ".ffn.norm_gate.tid2eid", + ".ffn.gate.bias": ".ffn.gate.e_score_correction_bias", }, orig_to_new_substr={ ".attn.compressor.": ".attn.mla_attn.compressor.", @@ -1655,21 +1638,15 @@ def _make_deepseek_v4_nvfp4_weights_mapper() -> WeightsMapper: - Scales already have .weight_scale / .weight_scale_2 / .input_scale suffixes - Shared expert uses down_proj (not w2) - Self-attention uses .self_attn. prefix (same as checkpoint, renamed to .attn.) - - Hadamard coding uses .attn_hc. and .ffn_hc. prefixes This is the mapper that should be used when quantization is modelopt_fp4. """ - # Expert weight renames: gate_proj→w1, up_proj→w3, down_proj→w2 - # Must match BEFORE the general suffix renames expert_rename_regex = { re.compile(r"(\.experts\.\d+\.)gate_proj\."): r"\1w1.", re.compile(r"(\.experts\.\d+\.)up_proj\."): r"\1w3.", re.compile(r"(\.experts\.\d+\.)down_proj\."): r"\1w2.", } - # Suffix renames for non-expert keys - # NVFP4 checkpoints already use .weight_scale (not .scale), so no scale→weight_scale mapping needed - # But .self_attn. → .attn. and .mlp. → .ffn. renames are needed suffix_renames = { "head.weight": "lm_head.weight", "embed.weight": "embed_tokens.weight", @@ -1679,7 +1656,6 @@ def _make_deepseek_v4_nvfp4_weights_mapper() -> WeightsMapper: ".ffn.gate.tid2eid": ".ffn.norm_gate.tid2eid", } - # Substr renames substr_renames = { ".attn.compressor.": ".attn.mla_attn.compressor.", ".mlp.shared_experts.gate_proj.": ".ffn.shared_experts.w1.", @@ -1687,8 +1663,6 @@ def _make_deepseek_v4_nvfp4_weights_mapper() -> WeightsMapper: ".mlp.shared_experts.down_proj.": ".ffn.shared_experts.down_proj.", ".mlp.": ".ffn.", ".self_attn.": ".attn.", - ".attn_hc.": ".attn.hc_op.", - ".ffn_hc.": ".ffn.hc_op.", } return WeightsMapper( @@ -1696,8 +1670,6 @@ def _make_deepseek_v4_nvfp4_weights_mapper() -> WeightsMapper: "layers.": "model.layers.", "embed.": "model.embed.", "norm.": "model.norm.", - "hc_head": "model.hc_head", - "mtp.": "model.mtp.", }, orig_to_new_regex=expert_rename_regex, orig_to_new_suffix=suffix_renames, diff --git a/vllm/patches/deepseek_v4_attention.py b/vllm/patches/deepseek_v4_attention.py index acb77d72..bfe8741b 100644 --- a/vllm/patches/deepseek_v4_attention.py +++ b/vllm/patches/deepseek_v4_attention.py @@ -46,7 +46,7 @@ from vllm.logger import init_logger from vllm.model_executor.custom_op import PluggableLayer from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase from vllm.model_executor.layers.deepseek_compressor import DeepseekCompressor -from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.layernorm import LayerNorm, RMSNorm from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization.input_quant_fp8 import ( QuantFP8, @@ -1109,6 +1109,7 @@ class DeepseekV4Indexer(nn.Module): quant_config=None, prefix=f"{prefix}.weights_proj", ) + self.k_norm = LayerNorm(self.head_dim, eps=1e-6) self.softmax_scale = self.head_dim**-0.5 self.scale_fmt = "ue8m0" diff --git a/vllm/patches/utils.py b/vllm/patches/utils.py deleted file mode 100644 index 63b0d8c7..00000000 --- a/vllm/patches/utils.py +++ /dev/null @@ -1,289 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Utilities for selecting and loading models.""" - -import inspect -import warnings -from contextlib import contextmanager -from dataclasses import dataclass, field -from typing import Any - -import torch -from torch import nn -from typing_extensions import assert_never - -import vllm.envs as envs -from vllm.config import ModelConfig, VllmConfig, set_current_vllm_config -from vllm.logger import init_logger -from vllm.model_executor.layers.attention import ( - Attention, - MLAAttention, - MMEncoderAttention, -) -from vllm.model_executor.layers.quantization.base_config import ( - QuantizationConfig, - QuantizeMethodBase, -) -from vllm.model_executor.model_loader.reload import ( - record_metadata_for_reloading, - set_torchao_reload_attrs, -) -from vllm.model_executor.models.interfaces import SupportsQuant -from vllm.tracing import instrument -from vllm.utils.platform_utils import is_pin_memory_available -from vllm.utils.torch_utils import get_accelerator_view_from_cpu_tensor - -logger = init_logger(__name__) - - -@instrument(span_name="Initialize model") -def initialize_model( - vllm_config: VllmConfig, - *, - prefix: str = "", - model_class: type[nn.Module] | None = None, - model_config: ModelConfig | None = None, -) -> nn.Module: - """Initialize a model with the given configurations.""" - if model_config is None: - model_config = vllm_config.model_config - if model_class is None: - model_class, _ = get_model_architecture(model_config) - - if vllm_config.quant_config is not None: - configure_quant_config(vllm_config.quant_config, model_class) - - signatures = inspect.signature(model_class.__init__) - all_params = [param.name for param in signatures.parameters.values()] - if "vllm_config" in all_params and "prefix" in all_params: - # new-style model class - with set_current_vllm_config(vllm_config, check_compile=True, prefix=prefix): - model = model_class(vllm_config=vllm_config, prefix=prefix) - record_metadata_for_reloading(model) - return model - - msg = ( - "vLLM model class should accept `vllm_config` and `prefix` as " - "input arguments. Possibly you have an old-style model class" - " registered from out of tree and it is used for new vLLM version. " - "Check https://docs.vllm.ai/en/latest/design/arch_overview.html " - "for the design and update the model class accordingly." - ) - warnings.warn(msg, DeprecationWarning, stacklevel=2) - - logger.warning( - "Trying to guess the arguments for old-style model class %s", - model_class, - ) - # try to be compatible with old-style model class - kwargs: dict[str, Any] = {} - if "prefix" in all_params: - kwargs["prefix"] = prefix - if "config" in all_params: - kwargs["config"] = model_config.hf_config - if "cache_config" in all_params: - kwargs["cache_config"] = vllm_config.cache_config - if "quant_config" in all_params: - kwargs["quant_config"] = vllm_config.quant_config - if "lora_config" in all_params: - kwargs["lora_config"] = vllm_config.lora_config - if "scheduler_config" in all_params: - kwargs["scheduler_config"] = vllm_config.scheduler_config - with set_current_vllm_config(vllm_config, check_compile=True, prefix=prefix): - model = model_class(**kwargs) - record_metadata_for_reloading(model) - - return model - - -def process_weights_after_loading( - model: nn.Module, model_config: ModelConfig, target_device: torch.device -) -> None: - for _, module in model.named_modules(): - quant_method = getattr(module, "quant_method", None) - if isinstance(quant_method, QuantizeMethodBase): - # When quant methods need to process weights after loading - # (for repacking, quantizing, etc), they expect parameters - # to be on the global target device. This scope is for the - # case where cpu offloading is used, where we will move the - # parameters onto device for processing and back off after. - with device_loading_context(module, target_device): - quant_method.process_weights_after_loading(module) - - # Initialize post-load attention weights for Attention, MLA, and MM encoder. - # NOTE: Happens after other modules so we can easily decompress weights. - for _, module in model.named_modules(): - if isinstance( - module, (Attention, MLAAttention, MMEncoderAttention) - ) and hasattr(module, "process_weights_after_loading"): - # TODO(lucas): see if there is a way to unify the signatures - # of process_weights_after_loading - with device_loading_context(module, target_device): - module.process_weights_after_loading(model_config.dtype) - - if model_config.quantization == "torchao": - set_torchao_reload_attrs(model, model_config) - - -@contextmanager -def device_loading_context(module: torch.nn.Module, target_device: torch.device): - if target_device.type == "cpu": - # If target is CPU, no need to move anything - yield module - return - - original_device_states: dict[str, torch.device] = {} - uva_offloaded_parameters: list[str] = [] - - # Store original device states and move parameters to GPU if they're on CPU - for name, p in module.named_parameters(): - if p.device.type == "cpu": - original_device_states[name] = p.device - p.data = p.data.to(target_device) - if getattr(p, "_vllm_is_uva_offloaded", False): - uva_offloaded_parameters.append(name) - # Parameters already on target device are not touched - - try: - yield module - - finally: - use_pin_memory = ( - is_pin_memory_available() - and not envs.VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY - ) - # Restore parameters to their original devices, ignoring new parameters - for name, p in module.named_parameters(): - if name in original_device_states: - original_device: torch.device = original_device_states[name] - p.data = p.data.to(original_device) - - # parameter is UVA offloaded, but was replaced with a new device tensor - # re-offload it to CPU using UVA - if name in uva_offloaded_parameters and not getattr( - p, "_vllm_is_uva_offloaded", False - ): - cpu_data = p.data.to(device="cpu") - if use_pin_memory: - cpu_data = cpu_data.pin_memory() - p.data = get_accelerator_view_from_cpu_tensor(cpu_data) - p._vllm_is_uva_offloaded = True - - -_MODEL_ARCH_BY_HASH = dict[int, tuple[type[nn.Module], str]]() -"""Caches the outputs of `_get_model_architecture`.""" - - -def _get_model_architecture(model_config: ModelConfig) -> tuple[type[nn.Module], str]: - from vllm.model_executor.models.adapters import as_embedding_model, as_seq_cls_model - - architectures = getattr(model_config.hf_config, "architectures", None) or [] - - model_cls, arch = model_config.registry.resolve_model_cls( - architectures, - model_config=model_config, - ) - - if arch == model_config._get_transformers_backend_cls(): - assert model_config.model_impl != "vllm" - if model_config.model_impl == "auto": - logger.warning_once( - "%s has no vLLM implementation, falling back to Transformers " - "implementation. Some features may not be supported and " - "performance may not be optimal.", - arch, - ) - - convert_type = model_config.convert_type - if convert_type == "none": - pass - elif convert_type == "embed": - logger.debug_once("Converting to embedding model.") - model_cls = as_embedding_model(model_cls) - elif convert_type == "classify": - logger.debug_once("Converting to sequence classification model.") - model_cls = as_seq_cls_model(model_cls) - else: - assert_never(convert_type) - - return model_cls, arch - - -def get_model_architecture(model_config: ModelConfig) -> tuple[type[nn.Module], str]: - key = hash( - ( - model_config.model, - model_config.convert_type, - model_config.runner_type, - model_config.trust_remote_code, - model_config.model_impl, - tuple(getattr(model_config.hf_config, "architectures", None) or []), - ) - ) - if key in _MODEL_ARCH_BY_HASH: - return _MODEL_ARCH_BY_HASH[key] - - model_arch = _get_model_architecture(model_config) - _MODEL_ARCH_BY_HASH[key] = model_arch - return model_arch - - -def get_model_cls(model_config: ModelConfig) -> type[nn.Module]: - return get_model_architecture(model_config)[0] - - -def get_architecture_class_name(model_config: ModelConfig) -> str: - return get_model_architecture(model_config)[1] - - -@dataclass -class ParamMapping: - """ - A class to handle parameter mapping for model weight loading. - It creates a bidirectional mapping between packed parameters and their - constituent parts. - """ - - packed_mapping: dict[str, list[str]] - inverse_packed_mapping: dict[str, tuple[str, int]] = field(default_factory=dict) - - def __post_init__(self): - for packed_name, sub_params in self.packed_mapping.items(): - # Skip self-contained cases (e.g., {"W_pack": ["W_pack"]}) - if len(sub_params) == 1 and sub_params[0] == packed_name: - continue - for index, param_name in enumerate(sub_params): - self.inverse_packed_mapping[param_name] = ( - packed_name, - index, - ) - - def get_sub_modules(self, module_name: str) -> tuple[str, list[str]] | None: - for key, value in self.packed_mapping.items(): - if module_name.endswith(key): - return key, value - return None - - -def configure_quant_config( - quant_config: QuantizationConfig, model_class: type[nn.Module] -): - """ - Pass packed_modules_mapping by reference to quant_config so that - quant_config can properly match fused modules - - Note that model attributes are passed by reference to quant_config, - enabling them to be updated by model_class.__new__ (ex. chatglm, qwen) - - Once the `SupportsQuant` mixin has been added to all models, this - function can be removed - """ - if not issubclass(model_class, SupportsQuant): - hf_to_vllm_mapper = getattr(model_class, "hf_to_vllm_mapper", None) - packed_mapping = getattr(model_class, "packed_modules_mapping", None) - - # pass mappings by reference to quant_config - if hf_to_vllm_mapper is not None: - quant_config.apply_vllm_mapper(hf_to_vllm_mapper) - if packed_mapping is not None: - quant_config.packed_modules_mapping = packed_mapping