[2/N] executor pass the complete config to worker/modelrunner (#9938)

Signed-off-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
Author: youkaichao
Date: 2024-11-02 07:35:05 -07:00
Committed by: GitHub
Parent: 1d4cfe2be1
Commit: e893795443

44 changed files with 249 additions and 579 deletions
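For context on the rename below, a minimal sketch of the aggregation this series introduces: the per-subsystem configs get bundled into one VllmConfig object, so the executor can hand workers and model runners the complete configuration through a single argument. The stand-in classes and defaults here are assumptions for illustration, not the actual vllm.config source.

# A minimal sketch (assumption, not the actual vllm.config source):
# VllmConfig aggregates the per-subsystem configs into one object.
from dataclasses import dataclass
from typing import Optional


@dataclass
class ModelConfig:      # stand-in for the real vllm.config.ModelConfig
    model: str = "facebook/opt-125m"


@dataclass
class CacheConfig:      # stand-in for the real vllm.config.CacheConfig
    block_size: int = 16


@dataclass
class ParallelConfig:   # stand-in for the real vllm.config.ParallelConfig
    tensor_parallel_size: int = 1


@dataclass
class VllmConfig:
    """One object carrying the complete engine configuration."""
    model_config: ModelConfig
    cache_config: CacheConfig
    parallel_config: ParallelConfig
    # ...scheduler/device/load/LoRA/speculative/etc. configs in the real class
    lora_config: Optional[object] = None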

vllm/engine/arg_utils.py

@@ -9,10 +9,11 @@ import torch
 
 import vllm.envs as envs
 from vllm.config import (CacheConfig, ConfigFormat, DecodingConfig,
-                         DeviceConfig, EngineConfig, LoadConfig, LoadFormat,
-                         LoRAConfig, ModelConfig, ObservabilityConfig,
-                         ParallelConfig, PromptAdapterConfig, SchedulerConfig,
-                         SpeculativeConfig, TaskOption, TokenizerPoolConfig)
+                         DeviceConfig, LoadConfig, LoadFormat, LoRAConfig,
+                         ModelConfig, ObservabilityConfig, ParallelConfig,
+                         PromptAdapterConfig, SchedulerConfig,
+                         SpeculativeConfig, TaskOption, TokenizerPoolConfig,
+                         VllmConfig)
 from vllm.executor.executor_base import ExecutorBase
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
@@ -955,7 +956,7 @@ class EngineArgs:
             ignore_patterns=self.ignore_patterns,
         )
 
-    def create_engine_config(self) -> EngineConfig:
+    def create_engine_config(self) -> VllmConfig:
         # gguf file needs a specific model loader and doesn't use hf_repo
         if check_gguf_file(self.model):
             self.quantization = self.load_format = "gguf"
@@ -1167,7 +1168,7 @@ class EngineArgs:
             or "all" in detailed_trace_modules,
         )
 
-        return EngineConfig(
+        return VllmConfig(
             model_config=model_config,
             cache_config=cache_config,
             parallel_config=parallel_config,
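Continuing the sketch above, this is roughly what the consumer side the commit title points at could look like; the Worker class and its signature here are hypothetical, not the actual vLLM worker API. Instead of threading many separate config objects through executor -> worker -> model runner, each layer takes the single VllmConfig and reads its own slice.

# Hedged sketch of the consumer side (Worker is hypothetical, not the
# actual vLLM worker signature). It picks up VllmConfig and the stand-in
# config classes from the sketch above.
class Worker:
    def __init__(self, vllm_config: "VllmConfig", local_rank: int = 0):
        self.vllm_config = vllm_config
        # each subsystem reads its own slice of the complete config
        self.model_config = vllm_config.model_config
        self.cache_config = vllm_config.cache_config
        self.parallel_config = vllm_config.parallel_config
        self.local_rank = local_rank


# Usage: the executor builds the complete config once and forwards it.
config = VllmConfig(
    model_config=ModelConfig(),
    cache_config=CacheConfig(),
    parallel_config=ParallelConfig(),
)
worker = Worker(config)
assert worker.model_config is config.model_config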