[Model]: Add transformers backend support (#11330)

# Adds support for `transformers` as a backend

Following https://github.com/huggingface/transformers/pull/35235, a good
number of models should already be supported, and we are ramping up
support for more.

Thanks to @Isotr0py for the tensor-parallel (TP) support, and to @hmellor for his help as well!
This includes:
- `trust_remote_code=True` support: any model on the Hub can be supported natively, as long as it implements attention the correct way! (See the usage sketch below.)
- tensor parallel support
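
A minimal usage sketch of picking the backend (hedged: it assumes the new `model_impl` option is exposed through the `LLM` entrypoint like other model options, and the model name is only a placeholder):

```python
from vllm import LLM

# Force the Transformers implementation instead of a native vLLM one;
# "auto" (the default) prefers vLLM and falls back to Transformers.
llm = LLM(model="facebook/opt-125m", model_impl="transformers")

outputs = llm.generate("Hello, my name is")
print(outputs[0].outputs[0].text)
```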

---------

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Isotr0py <41363108+Isotr0py@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Author: Arthur
Date: 2025-02-03 14:30:38 +01:00
Committed by: GitHub
Parent: 1298a400e8
Commit: a1a2aaadb9
11 changed files with 528 additions and 9 deletions

`vllm/config.py`

```diff
@@ -83,6 +83,12 @@ class SupportsHash(Protocol):
         ...
 
 
+class ModelImpl(str, enum.Enum):
+    AUTO = "auto"
+    VLLM = "vllm"
+    TRANSFORMERS = "transformers"
+
+
 class ModelConfig:
     """Configuration for the model.
```
```diff
@@ -167,6 +173,12 @@ class ModelConfig:
             `logits_processors` extra completion argument. Defaults to None,
             which allows no processors.
         generation_config: Configuration parameter file for generation.
+        model_impl: Which implementation of the model to use:
+            "auto" will try to use the vLLM implementation if it exists and
+                fall back to the Transformers implementation if no vLLM
+                implementation is available.
+            "vllm" will use the vLLM model implementation.
+            "transformers" will use the Transformers model implementation.
         override_generation_config: Override the generation config with the
             given config.
     """
```
```diff
@@ -230,6 +242,7 @@
         generation_config: Optional[str] = None,
         enable_sleep_mode: bool = False,
         override_generation_config: Optional[Dict[str, Any]] = None,
+        model_impl: Union[str, ModelImpl] = ModelImpl.AUTO,
     ) -> None:
         self.model = model
         self.tokenizer = tokenizer
@@ -241,6 +254,7 @@
         self.code_revision = code_revision
         self.rope_scaling = rope_scaling
         self.rope_theta = rope_theta
+        self.model_impl = model_impl
         if hf_overrides is None:
             hf_overrides = {}
```
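
Finally, a sketch of how the new knob would reach `ModelConfig` from the engine-args layer (hedged: it assumes `EngineArgs` gained a matching `model_impl` field in this PR, forwarded to the constructor shown above):

```python
from vllm.engine.arg_utils import EngineArgs

# Assumed field: `model_impl` on EngineArgs, forwarded to ModelConfig,
# which accepts Union[str, ModelImpl] per the diff above.
engine_args = EngineArgs(
    model="facebook/opt-125m",  # example model only
    model_impl="transformers",  # or "auto" (default) / "vllm"
)
```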