[Core] Move save_tensorized_model logic to Worker (#35825)
Signed-off-by: Nick Hill <nickhill123@gmail.com>
This commit is contained in:
@@ -58,7 +58,7 @@ from vllm.model_executor.layers.rotary_embedding import (
|
|||||||
MRotaryEmbedding,
|
MRotaryEmbedding,
|
||||||
XDRotaryEmbedding,
|
XDRotaryEmbedding,
|
||||||
)
|
)
|
||||||
from vllm.model_executor.model_loader import TensorizerLoader, get_model_loader
|
from vllm.model_executor.model_loader import get_model_loader
|
||||||
from vllm.model_executor.model_loader.reload import (
|
from vllm.model_executor.model_loader.reload import (
|
||||||
finalize_layerwise_reload,
|
finalize_layerwise_reload,
|
||||||
initialize_layerwise_reload,
|
initialize_layerwise_reload,
|
||||||
@@ -194,7 +194,6 @@ from .utils import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
|
|
||||||
from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
|
from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
|
||||||
from vllm.v1.spec_decode.ngram_proposer import NgramProposer
|
from vllm.v1.spec_decode.ngram_proposer import NgramProposer
|
||||||
|
|
||||||
@@ -4510,16 +4509,6 @@ class GPUModelRunner(
|
|||||||
weights_not_loaded,
|
weights_not_loaded,
|
||||||
)
|
)
|
||||||
|
|
||||||
def save_tensorized_model(
|
|
||||||
self,
|
|
||||||
tensorizer_config: "TensorizerConfig",
|
|
||||||
) -> None:
|
|
||||||
TensorizerLoader.save_model(
|
|
||||||
self.get_model(),
|
|
||||||
tensorizer_config=tensorizer_config,
|
|
||||||
model_config=self.model_config,
|
|
||||||
)
|
|
||||||
|
|
||||||
def _get_prompt_logprobs_dict(
|
def _get_prompt_logprobs_dict(
|
||||||
self,
|
self,
|
||||||
hidden_states: torch.Tensor,
|
hidden_states: torch.Tensor,
|
||||||
|
|||||||
@@ -57,6 +57,7 @@ from vllm.v1.worker.utils import is_residual_scattered_for_sp
|
|||||||
from vllm.v1.worker.worker_base import WorkerBase
|
from vllm.v1.worker.worker_base import WorkerBase
|
||||||
from vllm.v1.worker.workspace import init_workspace_manager
|
from vllm.v1.worker.workspace import init_workspace_manager
|
||||||
|
|
||||||
|
from ...model_executor.model_loader import TensorizerLoader
|
||||||
from .gpu.warmup import warmup_kernels
|
from .gpu.warmup import warmup_kernels
|
||||||
from .utils import request_memory
|
from .utils import request_memory
|
||||||
|
|
||||||
@@ -836,12 +837,11 @@ class Worker(WorkerBase):
|
|||||||
max_size=max_size,
|
max_size=max_size,
|
||||||
)
|
)
|
||||||
|
|
||||||
def save_tensorized_model(
|
def save_tensorized_model(self, tensorizer_config: "TensorizerConfig") -> None:
|
||||||
self,
|
TensorizerLoader.save_model(
|
||||||
tensorizer_config: "TensorizerConfig",
|
self.get_model(),
|
||||||
) -> None:
|
|
||||||
self.model_runner.save_tensorized_model(
|
|
||||||
tensorizer_config=tensorizer_config,
|
tensorizer_config=tensorizer_config,
|
||||||
|
model_config=self.model_config,
|
||||||
)
|
)
|
||||||
|
|
||||||
def init_weight_transfer_engine(self, init_info: dict) -> None:
|
def init_weight_transfer_engine(self, init_info: dict) -> None:
|
||||||
|
|||||||
Reference in New Issue
Block a user