Move online quantization to model.load_weights (#26327)

Signed-off-by: Jerry Zhang <jerryzh168@gmail.com>
This commit is contained in:
Jerry Zhang
2025-11-18 16:52:41 -08:00
committed by GitHub
parent 1395461f5f
commit da94c7c0eb
6 changed files with 309 additions and 108 deletions

View File

@@ -88,6 +88,14 @@ def initialize_model(
def process_weights_after_loading(
model: nn.Module, model_config: ModelConfig, target_device: torch.device
) -> None:
if getattr(model, "process_weights_after_loading_already_called", False):
        # `process_weights_after_loading` may be invoked more than once;
        # skip all calls after the first to avoid redundant processing
logger.debug_once(
"process_weights_after_loading already called for model %s", model
)
return
# to avoid circular dependency
from vllm.model_executor.model_loader.online_quantization import (
maybe_save_metadata_and_attributes_for_weight_reloading,