Move online quantization to model.load_weights (#26327)

Signed-off-by: Jerry Zhang <jerryzh168@gmail.com>
This commit is contained in:
Jerry Zhang
2025-11-18 16:52:41 -08:00
committed by GitHub
parent 1395461f5f
commit da94c7c0eb
6 changed files with 309 additions and 108 deletions

View File

@@ -88,6 +88,14 @@ def initialize_model(
def process_weights_after_loading(
model: nn.Module, model_config: ModelConfig, target_device: torch.device
) -> None:
if getattr(model, "process_weights_after_loading_already_called", False):
        # `process_weights_after_loading` may be invoked more than once;
        # skip all calls after the first to avoid redundant processing
logger.debug_once(
"process_weights_after_loading already called for model %s", model
)
return
# to avoid circular dependency
from vllm.model_executor.model_loader.online_quantization import (
maybe_save_metadata_and_attributes_for_weight_reloading,