[Model]: Add transformers backend support (#11330)
# Adds support for `transformers` as a backend Following https://github.com/huggingface/transformers/pull/35235, a bunch of models should already be supported, we are ramping up support for more models. Thanks @Isotr0py for the TP support, and @hmellor for his help as well! This includes: - `trust_remote_code=True` support: any model on the hub, if it implements attention the correct way can be natively supported!! - tensor parallel support --------- Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Signed-off-by: Isotr0py <2037008807@qq.com> Co-authored-by: Isotr0py <41363108+Isotr0py@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Isotr0py <2037008807@qq.com> Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com> Co-authored-by: Michael Goin <mgoin64@gmail.com> Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
This commit is contained in:
@@ -13,10 +13,10 @@ import vllm.envs as envs
 from vllm.config import (CacheConfig, CompilationConfig, ConfigFormat,
                          DecodingConfig, DeviceConfig, HfOverrides,
                          KVTransferConfig, LoadConfig, LoadFormat, LoRAConfig,
-                         ModelConfig, ObservabilityConfig, ParallelConfig,
-                         PoolerConfig, PromptAdapterConfig, SchedulerConfig,
-                         SpeculativeConfig, TaskOption, TokenizerPoolConfig,
-                         VllmConfig)
+                         ModelConfig, ModelImpl, ObservabilityConfig,
+                         ParallelConfig, PoolerConfig, PromptAdapterConfig,
+                         SchedulerConfig, SpeculativeConfig, TaskOption,
+                         TokenizerPoolConfig, VllmConfig)
 from vllm.executor.executor_base import ExecutorBase
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
@@ -199,6 +199,7 @@ class EngineArgs:
     generation_config: Optional[str] = None
     override_generation_config: Optional[Dict[str, Any]] = None
     enable_sleep_mode: bool = False
+    model_impl: str = "auto"

     calculate_kv_scales: Optional[bool] = None
@@ -378,6 +379,18 @@ class EngineArgs:
             'qualified names that can be passed with the `logits_processors` '
             'extra completion argument. Defaults to None, which allows no '
             'processors.')
+        parser.add_argument(
+            '--model-impl',
+            type=str,
+            default=EngineArgs.model_impl,
+            choices=[f.value for f in ModelImpl],
+            help='Which implementation of the model to use.\n\n'
+            '* "auto" will try to use the vLLM implementation if it exists '
+            'and fall back to the Transformers implementation if no vLLM '
+            'implementation is available.\n'
+            '* "vllm" will use the vLLM model implementation.\n'
+            '* "transformers" will use the Transformers model '
+            'implementation.\n')
         # Parallel arguments
         parser.add_argument(
             '--distributed-executor-backend',
@@ -1017,6 +1030,7 @@ class EngineArgs:
             generation_config=self.generation_config,
             override_generation_config=self.override_generation_config,
             enable_sleep_mode=self.enable_sleep_mode,
+            model_impl=self.model_impl,
         )

     def create_load_config(self) -> LoadConfig:
Reference in New Issue
Block a user