diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index bfb0e91fd..6e178bb69 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -57,7 +57,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson /tests/v1/kv_connector @ApostaC /tests/v1/offloading @ApostaC -# Transformers backend +# Transformers modeling backend /vllm/model_executor/models/transformers @hmellor /tests/models/test_transformers.py @hmellor diff --git a/docs/contributing/model/README.md b/docs/contributing/model/README.md index d8c40c519..13f3edb7e 100644 --- a/docs/contributing/model/README.md +++ b/docs/contributing/model/README.md @@ -1,7 +1,7 @@ # Summary !!! important - Many decoder language models can now be automatically loaded using the [Transformers backend](../../models/supported_models.md#transformers) without having to implement them in vLLM. See if `vllm serve <model>` works first! + Many decoder language models can now be automatically loaded using the [Transformers modeling backend](../../models/supported_models.md#transformers) without having to implement them in vLLM. See if `vllm serve <model>` works first! vLLM models are specialized [PyTorch](https://pytorch.org/) models that take advantage of various [features](../../features/README.md#compatibility-matrix) to optimize their performance. diff --git a/docs/deployment/frameworks/hf_inference_endpoints.md b/docs/deployment/frameworks/hf_inference_endpoints.md index d39bb9a89..05df0dacd 100644 --- a/docs/deployment/frameworks/hf_inference_endpoints.md +++ b/docs/deployment/frameworks/hf_inference_endpoints.md @@ -156,7 +156,7 @@ In this guide, we demonstrate manual deployment using the [`rednote-hilab/dots.o ## Advanced Deployment Details -With the [transformers backend integration](https://blog.vllm.ai/2025/04/11/transformers-backend.html), vLLM now offers Day 0 support for any model compatible with `transformers`. This means you can deploy such models immediately, leveraging vLLM’s optimized inference without additional backend modifications. 
+With the [Transformers modeling backend integration](https://blog.vllm.ai/2025/04/11/transformers-backend.html), vLLM now offers Day 0 support for any model compatible with `transformers`. This means you can deploy such models immediately, leveraging vLLM’s optimized inference without additional backend modifications. Hugging Face Inference Endpoints provides a fully managed environment for serving models via vLLM. You can deploy models without configuring servers, installing dependencies, or managing clusters. Endpoints also support deployment across multiple cloud providers (AWS, Azure, GCP) without the need for separate accounts. @@ -167,4 +167,4 @@ The platform integrates seamlessly with the Hugging Face Hub, allowing you to de - Explore the [Inference Endpoints](https://endpoints.huggingface.co/catalog) model catalog - Read the Inference Endpoints [documentation](https://huggingface.co/docs/inference-endpoints/en/index) - Learn about [Inference Endpoints engines](https://huggingface.co/docs/inference-endpoints/en/engines/vllm) -- Understand the [transformers backend integration](https://blog.vllm.ai/2025/04/11/transformers-backend.html) +- Understand the [Transformers modeling backend integration](https://blog.vllm.ai/2025/04/11/transformers-backend.html) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index c1eb207ef..0439e9cf2 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -15,9 +15,9 @@ These models are what we list in [supported text models](#list-of-text-only-lang ### Transformers -vLLM also supports model implementations that are available in Transformers. You should expect the performance of a Transformers model implementation used in vLLM to be within <5% of the performance of a dedicated vLLM model implementation. We call this feature the "Transformers backend". +vLLM also supports model implementations that are available in Transformers. 
You should expect the performance of a Transformers model implementation used in vLLM to be within <5% of the performance of a dedicated vLLM model implementation. We call this feature the "Transformers modeling backend". -Currently, the Transformers backend works for the following: +Currently, the Transformers modeling backend works for the following: - Modalities: embedding models, language models and vision-language models* - Architectures: encoder-only, decoder-only, mixture-of-experts @@ -25,7 +25,7 @@ Currently, the Transformers backend works for the following: _*Vision-language models currently accept only image inputs. Support for video inputs will be added in a future release._ -If the Transformers model implementation follows all the steps in [writing a custom model](#writing-custom-models) then, when used with the Transformers backend, it will be compatible with the following features of vLLM: +If the Transformers model implementation follows all the steps in [writing a custom model](#writing-custom-models) then, when used with the Transformers modeling backend, it will be compatible with the following features of vLLM: - All the features listed in the [compatibility matrix](../features/README.md#feature-x-feature) - Any combination of the following vLLM parallelisation schemes: @@ -44,7 +44,7 @@ llm.apply_model(lambda model: print(type(model))) If the printed type starts with `Transformers...` then it's using the Transformers model implementation! -If a model has a vLLM implementation but you would prefer to use the Transformers implementation via the Transformers backend, set `model_impl="transformers"` for [offline inference](../serving/offline_inference.md) or `--model-impl transformers` for the [online serving](../serving/openai_compatible_server.md). 
+If a model has a vLLM implementation but you would prefer to use the Transformers implementation via the Transformers modeling backend, set `model_impl="transformers"` for [offline inference](../serving/offline_inference.md) or `--model-impl transformers` for the [online serving](../serving/openai_compatible_server.md). !!! note For vision-language models, if you are loading with `dtype="auto"`, vLLM loads the whole model with config's `dtype` if it exists. In contrast the native Transformers will respect the `dtype` attribute of each backbone in the model. That might cause a slight difference in performance. @@ -53,12 +53,12 @@ If a model has a vLLM implementation but you would prefer to use the Transformer If a model is neither supported natively by vLLM nor Transformers, it can still be used in vLLM! -For a model to be compatible with the Transformers backend for vLLM it must: +For a model to be compatible with the Transformers modeling backend for vLLM it must: - be a Transformers compatible custom model (see [Transformers - Customizing models](https://huggingface.co/docs/transformers/en/custom_models)): - The model directory must have the correct structure (e.g. `config.json` is present). - `config.json` must contain `auto_map.AutoModel`. -- be a Transformers backend for vLLM compatible model (see [Writing custom models](#writing-custom-models)): +- be a Transformers modeling backend for vLLM compatible model (see [Writing custom models](#writing-custom-models)): - Customisation should be done in the base model (e.g. in `MyModel`, not `MyModelForCausalLM`). If the compatible model is: @@ -66,13 +66,13 @@ If the compatible model is: - on the Hugging Face Model Hub, simply set `trust_remote_code=True` for [offline-inference](../serving/offline_inference.md) or `--trust-remote-code` for the [openai-compatible-server](../serving/openai_compatible_server.md). 
- in a local directory, simply pass directory path to `model=<MODEL_DIR>` for [offline-inference](../serving/offline_inference.md) or `vllm serve <MODEL_DIR>` for the [openai-compatible-server](../serving/openai_compatible_server.md). -This means that, with the Transformers backend for vLLM, new models can be used before they are officially supported in Transformers or vLLM! +This means that, with the Transformers modeling backend for vLLM, new models can be used before they are officially supported in Transformers or vLLM! #### Writing custom models -This section details the necessary modifications to make to a Transformers compatible custom model that make it compatible with the Transformers backend for vLLM. (We assume that a Transformers compatible custom model has already been created, see [Transformers - Customizing models](https://huggingface.co/docs/transformers/en/custom_models)). +This section details the necessary modifications to make to a Transformers compatible custom model that make it compatible with the Transformers modeling backend for vLLM. (We assume that a Transformers compatible custom model has already been created, see [Transformers - Customizing models](https://huggingface.co/docs/transformers/en/custom_models)). -To make your model compatible with the Transformers backend, it needs: +To make your model compatible with the Transformers modeling backend, it needs: 1. `kwargs` passed down through all modules from `MyModel` to `MyAttention`. - If your model is encoder-only: @@ -134,7 +134,7 @@ Here is what happens in the background when this model is loaded: 1. The config is loaded. 2. `MyModel` Python class is loaded from the `auto_map` in config, and we check that the model `is_backend_compatible()`. -3. `MyModel` is loaded into one of the Transformers backend classes in [vllm/model_executor/models/transformers](../../vllm/model_executor/models/transformers) which sets `self.config._attn_implementation = "vllm"` so that vLLM's attention layer is used. +3. 
`MyModel` is loaded into one of the Transformers modeling backend classes in [vllm/model_executor/models/transformers](../../vllm/model_executor/models/transformers) which sets `self.config._attn_implementation = "vllm"` so that vLLM's attention layer is used. That's it! @@ -182,7 +182,7 @@ To determine whether a given model is natively supported, you can check the `con If the `"architectures"` field contains a model architecture listed below, then it should be natively supported. Models do not _need_ to be natively supported to be used in vLLM. -The [Transformers backend](#transformers) enables you to run models directly using their Transformers implementation (or even remote code on the Hugging Face Model Hub!). +The [Transformers modeling backend](#transformers) enables you to run models directly using their Transformers implementation (or even remote code on the Hugging Face Model Hub!). !!! tip The easiest way to check if your model is really supported at runtime is to run the program below: @@ -451,7 +451,7 @@ th { | `Zamba2ForCausalLM` | Zamba2 | `Zyphra/Zamba2-7B-instruct`, `Zyphra/Zamba2-2.7B-instruct`, `Zyphra/Zamba2-1.2B-instruct`, etc. | | | | `LongcatFlashForCausalLM` | LongCat-Flash | `meituan-longcat/LongCat-Flash-Chat`, `meituan-longcat/LongCat-Flash-Chat-FP8` | ✅︎ | ✅︎ | -Some models are supported only via the [Transformers backend](#transformers). The purpose of the table below is to acknowledge models which we officially support in this way. The logs will say that the Transformers backend is being used, and you will see no warning that this is fallback behaviour. This means that, if you have issues with any of the models listed below, please [make an issue](https://github.com/vllm-project/vllm/issues/new/choose) and we'll do our best to fix it! +Some models are supported only via the [Transformers modeling backend](#transformers). The purpose of the table below is to acknowledge models which we officially support in this way. 
The logs will say that the Transformers modeling backend is being used, and you will see no warning that this is fallback behaviour. This means that, if you have issues with any of the models listed below, please [make an issue](https://github.com/vllm-project/vllm/issues/new/choose) and we'll do our best to fix it! | Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | |--------------|--------|-------------------|----------------------|---------------------------| @@ -720,7 +720,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen | `TarsierForConditionalGeneration` | Tarsier | T + IE+ | `omni-search/Tarsier-7b`, `omni-search/Tarsier-34b` | | ✅︎ | | `Tarsier2ForConditionalGeneration`^ | Tarsier2 | T + IE+ + VE+ | `omni-research/Tarsier2-Recap-7b`, `omni-research/Tarsier2-7b-0115` | | ✅︎ | -Some models are supported only via the [Transformers backend](#transformers). The purpose of the table below is to acknowledge models which we officially support in this way. The logs will say that the Transformers backend is being used, and you will see no warning that this is fallback behaviour. This means that, if you have issues with any of the models listed below, please [make an issue](https://github.com/vllm-project/vllm/issues/new/choose) and we'll do our best to fix it! +Some models are supported only via the [Transformers modeling backend](#transformers). The purpose of the table below is to acknowledge models which we officially support in this way. The logs will say that the Transformers modeling backend is being used, and you will see no warning that this is fallback behaviour. This means that, if you have issues with any of the models listed below, please [make an issue](https://github.com/vllm-project/vllm/issues/new/choose) and we'll do our best to fix it! 
| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | |--------------|--------|--------|-------------------|-----------------------------|-----------------------------------------| diff --git a/tests/models/test_transformers.py b/tests/models/test_transformers.py index a18f5b607..ae5befd2c 100644 --- a/tests/models/test_transformers.py +++ b/tests/models/test_transformers.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Test the functionality of the Transformers backend.""" +"""Test the functionality of the Transformers modeling backend.""" from typing import Any @@ -85,7 +85,7 @@ def test_models( required = Version("5.0.0.dev") if model == "allenai/OLMoE-1B-7B-0924" and installed < required: pytest.skip( - "MoE models with the Transformers backend require " + "MoE models with the Transformers modeling backend require " f"transformers>={required}, but got {installed}" ) diff --git a/vllm/config/model.py b/vllm/config/model.py index 8ec66b6b3..b3a28af6d 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -732,7 +732,7 @@ class ModelConfig: return self def _get_transformers_backend_cls(self) -> str: - """Determine which Transformers backend class will be used if + """Determine which Transformers modeling backend class will be used if `model_impl` is set to `transformers` or `auto`.""" cls = "Transformers" # If 'hf_config != hf_text_config' it's a nested config, i.e. multimodal @@ -746,8 +746,8 @@ class ModelConfig: # User specified value take precedence if self.runner != "auto": runner = self.runner - # Only consider Transformers backend pooling classes if we're wrapping an - # architecture that defaults to pooling. Otherwise, we return the LM class + # Only consider Transformers modeling backend pooling classes if we're wrapping + # an architecture that defaults to pooling. 
Otherwise, we return the LM class # and use adapters. if runner == "pooling" and task in {"embed", "classify"}: if task == "embed": @@ -759,7 +759,7 @@ class ModelConfig: return cls def using_transformers_backend(self) -> bool: - """Check if the model is using the Transformers backend class.""" + """Check if the model is using the Transformers modeling backend class.""" used_cls = self._model_info.architecture transformers_backend_cls = self._get_transformers_backend_cls() return used_cls == transformers_backend_cls diff --git a/vllm/lora/layers/base_linear.py b/vllm/lora/layers/base_linear.py index d619a0edc..3db4165e2 100644 --- a/vllm/lora/layers/base_linear.py +++ b/vllm/lora/layers/base_linear.py @@ -121,7 +121,7 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA): def apply(self, x: torch.Tensor, bias: torch.Tensor | None = None) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - # In transformers backend, x and output have extra batch dimension like + # In Transformers modeling backend, x and output have extra batch dimension like # (1, seq_len, hidden_dim), while punica expects (seq_len, hidden_dim), # therefore we need to flatten the batch dimensions. if x.ndim == 3 and output.ndim == 3: diff --git a/vllm/model_executor/models/adapters.py b/vllm/model_executor/models/adapters.py index f742090df..a9cc49451 100644 --- a/vllm/model_executor/models/adapters.py +++ b/vllm/model_executor/models/adapters.py @@ -429,7 +429,7 @@ def load_weights_using_from_2_way_softmax( if text_config.tie_word_embeddings: # embed_tokens is the assumed name for input embeddings. If the model does not # have this attribute, we fallback to get_input_embeddings(), which is used by - # the Transformers backend. + # the Transformers modeling backend. 
embed_tokens = ( model.model.embed_tokens if hasattr(model.model, "embed_tokens") @@ -487,7 +487,7 @@ def load_weights_no_post_processing(model, weights: Iterable[tuple[str, torch.Te if text_config.tie_word_embeddings: # embed_tokens is the assumed name for input embeddings. If the model does not # have this attribute, we fallback to get_input_embeddings(), which is used by - # the Transformers backend. + # the Transformers modeling backend. embed_tokens = ( model.model.embed_tokens if hasattr(model.model, "embed_tokens") diff --git a/vllm/model_executor/models/transformers/__init__.py b/vllm/model_executor/models/transformers/__init__.py index 365b5eb08..93cd8ff50 100644 --- a/vllm/model_executor/models/transformers/__init__.py +++ b/vllm/model_executor/models/transformers/__init__.py @@ -120,8 +120,8 @@ def __getattr__(name: str): """Handle imports of non-existent classes with a helpful error message.""" if name not in globals(): raise AttributeError( - "The Transformers backend does not currently have a class to handle " - f"the requested model type: {name}. Please open an issue at " + "The Transformers modeling backend does not currently have a class to " + f"handle the requested model type: {name}. Please open an issue at " "https://github.com/vllm-project/vllm/issues/new" ) return globals()[name] diff --git a/vllm/model_executor/models/transformers/base.py b/vllm/model_executor/models/transformers/base.py index 63096e57f..f4ba4758b 100644 --- a/vllm/model_executor/models/transformers/base.py +++ b/vllm/model_executor/models/transformers/base.py @@ -14,7 +14,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-"""Transformers backend base class.""" +"""Transformers modeling backend base class.""" from collections.abc import Iterable from typing import TYPE_CHECKING @@ -118,7 +118,7 @@ class Base(nn.Module, VllmModel, SupportsQuant, SupportsLoRA, SupportsPP): def __init__(self, *, vllm_config: "VllmConfig", prefix: str = ""): super().__init__() - logger.info("Using Transformers backend.") + logger.info("Using Transformers modeling backend.") self.config = vllm_config.model_config.hf_config self.text_config = self.config.get_text_config() @@ -147,7 +147,8 @@ class Base(nn.Module, VllmModel, SupportsQuant, SupportsLoRA, SupportsPP): # Check for unsupported quantization methods. if quant_method_name == "mxfp4": raise NotImplementedError( - "Transformers backend does not support MXFP4 quantization yet." + "Transformers modeling backend does " + "not support MXFP4 quantization yet." ) # Skip loading extra bias for GPTQ models. if "gptq" in quant_method_name: @@ -458,6 +459,6 @@ class Base(nn.Module, VllmModel, SupportsQuant, SupportsLoRA, SupportsPP): required = Version(min_version) if installed < required: raise ImportError( - f"Transformers backend requires transformers>={required} " + f"Transformers modeling backend requires transformers>={required} " f"for {feature}, but got {installed}" ) diff --git a/vllm/model_executor/models/transformers/causal.py b/vllm/model_executor/models/transformers/causal.py index 42fd11117..b2865ed0c 100644 --- a/vllm/model_executor/models/transformers/causal.py +++ b/vllm/model_executor/models/transformers/causal.py @@ -14,7 +14,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-"""Transformers backend mixin for causal language models.""" +"""Transformers modeling backend mixin for causal language models.""" from typing import TYPE_CHECKING diff --git a/vllm/model_executor/models/transformers/legacy.py b/vllm/model_executor/models/transformers/legacy.py index a453870a2..aca630be5 100644 --- a/vllm/model_executor/models/transformers/legacy.py +++ b/vllm/model_executor/models/transformers/legacy.py @@ -14,7 +14,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Transformers backend mixin for legacy models.""" +"""Transformers modeling backend mixin for legacy models.""" from typing import TYPE_CHECKING diff --git a/vllm/model_executor/models/transformers/moe.py b/vllm/model_executor/models/transformers/moe.py index 8e39eb0b9..4973014c3 100644 --- a/vllm/model_executor/models/transformers/moe.py +++ b/vllm/model_executor/models/transformers/moe.py @@ -14,7 +14,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-"""Transformers backend mixin for Mixture of Experts (MoE) models.""" +"""Transformers modeling backend mixin for Mixture of Experts (MoE) models.""" from typing import TYPE_CHECKING, Any @@ -39,7 +39,7 @@ if TYPE_CHECKING: @CustomOp.register("transformers_fused_moe") class TransformersFusedMoE(FusedMoE): - """Custom FusedMoE for the Transformers backend.""" + """Custom FusedMoE for the Transformers modeling backend.""" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) diff --git a/vllm/model_executor/models/transformers/multimodal.py b/vllm/model_executor/models/transformers/multimodal.py index 9b0463f41..ccf605371 100644 --- a/vllm/model_executor/models/transformers/multimodal.py +++ b/vllm/model_executor/models/transformers/multimodal.py @@ -14,7 +14,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Transformers backend mixin for multi-modal models.""" +"""Transformers modeling backend mixin for multi-modal models.""" from collections.abc import Mapping from typing import TYPE_CHECKING @@ -310,9 +310,9 @@ class MultiModalMixin(SupportsMultiModal, SupportsMRoPE): return model_output def get_language_model(self) -> torch.nn.Module: - """Transformers backend multimodal classes do not contain a separate vLLM - language model class. Therefore, in order to return a language model vLLM class, - we use a wrapper to give `self` the same interface as a text model.""" + """Transformers modeling backend multimodal classes do not contain a separate + vLLM language model class. 
Therefore, in order to return a language model vLLM + class, we use a wrapper to give `self` the same interface as a text model.""" # Exclude self and object bases = self.__class__.mro()[1:-1] @@ -385,7 +385,9 @@ class MultiModalMixin(SupportsMultiModal, SupportsMRoPE): for k, v in kwargs.items() if k not in {"image_grid_thw", "video_grid_thw"} ): - raise NotImplementedError("Transformers backend only supports images.") + raise NotImplementedError( + "Transformers modeling backend only supports images." + ) image_grid_thw = kwargs.get("image_grid_thw", []) video_grid_thw = kwargs.get("video_grid_thw", []) diff --git a/vllm/model_executor/models/transformers/pooling.py b/vllm/model_executor/models/transformers/pooling.py index 8117bbac0..4c2a74bcc 100644 --- a/vllm/model_executor/models/transformers/pooling.py +++ b/vllm/model_executor/models/transformers/pooling.py @@ -14,7 +14,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Transformers backend mixins for pooling models.""" +"""Transformers modeling backend mixins for pooling models.""" from typing import TYPE_CHECKING diff --git a/vllm/model_executor/models/transformers/utils.py b/vllm/model_executor/models/transformers/utils.py index 267a6e06e..517eb54d5 100644 --- a/vllm/model_executor/models/transformers/utils.py +++ b/vllm/model_executor/models/transformers/utils.py @@ -14,7 +14,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Transformers backend utilities.""" +"""Transformers modeling backend utilities.""" from contextlib import contextmanager from pathlib import Path