diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 8c92aab26..e4ddefc81 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -58,7 +58,7 @@ from vllm.model_executor.layers.rotary_embedding import ( MRotaryEmbedding, XDRotaryEmbedding, ) -from vllm.model_executor.model_loader import TensorizerLoader, get_model_loader +from vllm.model_executor.model_loader import get_model_loader from vllm.model_executor.model_loader.reload import ( finalize_layerwise_reload, initialize_layerwise_reload, @@ -194,7 +194,6 @@ from .utils import ( ) if TYPE_CHECKING: - from vllm.model_executor.model_loader.tensorizer import TensorizerConfig from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput from vllm.v1.spec_decode.ngram_proposer import NgramProposer @@ -4510,16 +4509,6 @@ class GPUModelRunner( weights_not_loaded, ) - def save_tensorized_model( - self, - tensorizer_config: "TensorizerConfig", - ) -> None: - TensorizerLoader.save_model( - self.get_model(), - tensorizer_config=tensorizer_config, - model_config=self.model_config, - ) - def _get_prompt_logprobs_dict( self, hidden_states: torch.Tensor, diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 62f0433ef..c0654abd5 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -57,6 +57,7 @@ from vllm.v1.worker.utils import is_residual_scattered_for_sp from vllm.v1.worker.worker_base import WorkerBase from vllm.v1.worker.workspace import init_workspace_manager +from ...model_executor.model_loader import TensorizerLoader from .gpu.warmup import warmup_kernels from .utils import request_memory @@ -836,12 +837,11 @@ class Worker(WorkerBase): max_size=max_size, ) - def save_tensorized_model( - self, - tensorizer_config: "TensorizerConfig", - ) -> None: - self.model_runner.save_tensorized_model( + def save_tensorized_model(self, tensorizer_config: "TensorizerConfig") -> None: + TensorizerLoader.save_model( + self.get_model(), tensorizer_config=tensorizer_config, + model_config=self.model_config, ) def init_weight_transfer_engine(self, init_info: dict) -> None: