Enable hybrid attention models for Transformers backend (#18494)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-05-23 04:12:08 +02:00
parent c6b636f9fb
commit 4b0da7b60e
4 changed files with 106 additions and 30 deletions
--- a/tests/models/test_transformers.py
+++ b/tests/models/test_transformers.py
@@ -1,37 +1,50 @@
 # SPDX-License-Identifier: Apache-2.0
 """Test the functionality of the Transformers backend."""
+from typing import Any, Optional, Union
+
 import pytest

 from vllm.platforms import current_platform

 from ..conftest import HfRunner, VllmRunner
+from ..core.block.e2e.test_correctness_sliding_window import prep_prompts
 from ..utils import multi_gpu_test
 from .utils import check_logprobs_close


 def check_implementation(
-    hf_runner: type[HfRunner],
-    vllm_runner: type[VllmRunner],
+    runner_ref: type[Union[HfRunner, VllmRunner]],
+    runner_test: type[VllmRunner],
    example_prompts: list[str],
    model: str,
+    kwargs_ref: Optional[dict[str, Any]] = None,
+    kwargs_test: Optional[dict[str, Any]] = None,
    **kwargs,
 ):
+    if kwargs_ref is None:
+        kwargs_ref = {}
+    if kwargs_test is None:
+        kwargs_test = {}
+
    max_tokens = 32
    num_logprobs = 5

-    with vllm_runner(model, **kwargs) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy_logprobs(
-            example_prompts, max_tokens, num_logprobs)
+    args = (example_prompts, max_tokens, num_logprobs)

-    with hf_runner(model) as hf_model:
-        hf_outputs = hf_model.generate_greedy_logprobs_limit(
-            example_prompts, max_tokens, num_logprobs)
+    with runner_test(model, **kwargs_test, **kwargs) as model_test:
+        outputs_test = model_test.generate_greedy_logprobs(*args)
+
+    with runner_ref(model, **kwargs_ref) as model_ref:
+        if isinstance(model_ref, VllmRunner):
+            outputs_ref = model_ref.generate_greedy_logprobs(*args)
+        else:
+            outputs_ref = model_ref.generate_greedy_logprobs_limit(*args)

    check_logprobs_close(
-        outputs_0_lst=hf_outputs,
-        outputs_1_lst=vllm_outputs,
-        name_0="hf",
-        name_1="vllm",
+        outputs_0_lst=outputs_ref,
+        outputs_1_lst=outputs_test,
+        name_0="ref",
+        name_1="test",
    )


@@ -58,6 +71,18 @@ def test_models(
                         model_impl=model_impl)


+def test_hybrid_attention(vllm_runner: type[VllmRunner]) -> None:
+    prompts, _, _ = prep_prompts(4, (800, 801))
+    kwargs_ref = {"max_model_len": 8192, "enforce_eager": True}
+    kwargs_test = {"model_impl": "transformers", **kwargs_ref}
+    check_implementation(vllm_runner,
+                         vllm_runner,
+                         prompts,
+                         model="hmellor/tiny-random-Gemma2ForCausalLM",
+                         kwargs_ref=kwargs_ref,
+                         kwargs_test=kwargs_test)
+
+
@multi_gpu_test(num_gpus=2)
 def test_distributed(
    hf_runner: type[HfRunner],
@@ -65,8 +90,11 @@ def test_distributed(
    example_prompts,
 ):
    kwargs = {"model_impl": "transformers", "tensor_parallel_size": 2}
-    check_implementation(hf_runner, vllm_runner, example_prompts,
-                         "meta-llama/Llama-3.2-1B-Instruct", **kwargs)
+    check_implementation(hf_runner,
+                         vllm_runner,
+                         example_prompts,
+                         "meta-llama/Llama-3.2-1B-Instruct",
+                         kwargs_test=kwargs)


@pytest.mark.skipif(