[Frontend] [Core] feat: Add model loading using tensorizer (#3476)

This commit is contained in:
Sanger Steel
2024-04-13 20:13:01 -04:00
committed by GitHub
parent 989ae2538d
commit 711a000255
20 changed files with 1351 additions and 51 deletions

View File

@@ -6,7 +6,7 @@ from transformers import PreTrainedTokenizer
import vllm
from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
ParallelConfig, SchedulerConfig, SpeculativeConfig,
-VisionLanguageConfig)
+TensorizerConfig, VisionLanguageConfig)
from vllm.core.scheduler import Scheduler, SchedulerOutputs
from vllm.engine.arg_utils import EngineArgs
from vllm.engine.metrics import StatLogger, Stats
@@ -74,6 +74,7 @@ class LLMEngine:
lora_config: Optional[LoRAConfig],
vision_language_config: Optional[VisionLanguageConfig],
speculative_config: Optional[SpeculativeConfig],
+tensorizer_config: Optional[TensorizerConfig],
executor_class: Type[ExecutorBase],
log_stats: bool,
usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
@@ -110,6 +111,7 @@ class LLMEngine:
self.scheduler_config = scheduler_config
self.device_config = device_config
self.speculative_config = speculative_config
+self.tensorizer_config = tensorizer_config
self.log_stats = log_stats
self._init_tokenizer()
@@ -125,6 +127,7 @@ class LLMEngine:
lora_config=lora_config,
vision_language_config=vision_language_config,
speculative_config=speculative_config,
+tensorizer_config=tensorizer_config,
)
self._initialize_kv_caches()
@@ -264,6 +267,9 @@ class LLMEngine:
def _verify_args(self) -> None:
self.model_config.verify_with_parallel_config(self.parallel_config)
self.cache_config.verify_with_parallel_config(self.parallel_config)
+if self.tensorizer_config:
+self.tensorizer_config.verify_with_parallel_config(
+self.parallel_config)
if self.lora_config:
self.lora_config.verify_with_model_config(self.model_config)
self.lora_config.verify_with_scheduler_config(