[Model]: Add transformers backend support (#11330)
# Adds support for `transformers` as a backend

Following https://github.com/huggingface/transformers/pull/35235, a bunch of models should already be supported, and we are ramping up support for more. Thanks @Isotr0py for the TP support, and @hmellor for his help as well!

This includes:
- `trust_remote_code=True` support: any model on the hub that implements attention the correct way can be natively supported!
- tensor parallel support

---------

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Isotr0py <41363108+Isotr0py@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
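For context, the backend is selected via the `model_impl` engine argument exercised by the new test below. A minimal usage sketch, not part of this commit (it assumes a working vLLM install and access to the model weights; prompt and sampling settings are illustrative only):

```python
from vllm import LLM, SamplingParams

# "transformers" forces the Transformers backend; "auto" (the default)
# prefers vLLM's native implementation where one exists.
llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct",
          model_impl="transformers")

outputs = llm.generate(["Hello, my name is"],
                       SamplingParams(max_tokens=32, temperature=0))
print(outputs[0].outputs[0].text)
```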
tests/models/test_transformers.py | 75 (new file)
@@ -0,0 +1,75 @@
"""Test the functionality of the Transformers backend.

Run `pytest tests/models/test_transformers.py`.
"""
from contextlib import nullcontext
from typing import Type

import pytest

from ..conftest import HfRunner, VllmRunner
from ..utils import multi_gpu_test
from .utils import check_logprobs_close


def check_implementation(
    hf_runner: Type[HfRunner],
    vllm_runner: Type[VllmRunner],
    example_prompts: list[str],
    model: str,
    **kwargs,
):
    max_tokens = 32
    num_logprobs = 5

    with vllm_runner(model, **kwargs) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy_logprobs(
            example_prompts, max_tokens, num_logprobs)

    with hf_runner(model) as hf_model:
        hf_outputs = hf_model.generate_greedy_logprobs_limit(
            example_prompts, max_tokens, num_logprobs)

    check_logprobs_close(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )


@pytest.mark.parametrize(
    "model,model_impl",
    [
        ("meta-llama/Llama-3.2-1B-Instruct", "transformers"),
        ("openai-community/gpt2", "transformers"),
        ("ArthurZ/Ilama-3.2-1B", "auto"),  # CUSTOM CODE
        ("meta-llama/Llama-3.2-1B-Instruct", "auto"),
    ])  # trust_remote_code=True by default
def test_models(hf_runner, vllm_runner, example_prompts, model,
                model_impl) -> None:

    maybe_raises = nullcontext()
    if model == "openai-community/gpt2" and model_impl == "transformers":
        # Model is not backend compatible
        maybe_raises = pytest.raises(
            ValueError,
            match="The Transformers implementation.*not compatible with vLLM")

    with maybe_raises:
        check_implementation(hf_runner,
                             vllm_runner,
                             example_prompts,
                             model,
                             model_impl=model_impl)


@multi_gpu_test(num_gpus=2)
def test_distributed(
    hf_runner,
    vllm_runner,
    example_prompts,
):
    kwargs = {"model_impl": "transformers", "tensor_parallel_size": 2}
    check_implementation(hf_runner, vllm_runner, example_prompts,
                         "meta-llama/Llama-3.2-1B-Instruct", **kwargs)
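The `test_distributed` case above pairs `model_impl="transformers"` with `tensor_parallel_size=2`. A sketch of the equivalent end-user call, again not part of the commit (it assumes two visible GPUs):

```python
from vllm import LLM

# Same kwargs as test_distributed: Transformers backend, sharded over 2 GPUs.
llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct",
          model_impl="transformers",
          tensor_parallel_size=2)
```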