Convert formatting to use ruff instead of yapf + isort (#26247)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Test the functionality of the Transformers backend."""
|
||||
|
||||
from typing import Any, Optional, Union
|
||||
|
||||
import pytest
|
||||
@@ -60,14 +61,16 @@ def check_implementation(
|
||||
|
||||
@pytest.mark.skipif(
|
||||
current_platform.is_rocm(),
|
||||
reason="Llama-3.2-1B-Instruct, Ilama-3.2-1B produce memory access fault.")
|
||||
reason="Llama-3.2-1B-Instruct, Ilama-3.2-1B produce memory access fault.",
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"model,model_impl",
|
||||
[
|
||||
("meta-llama/Llama-3.2-1B-Instruct", "transformers"),
|
||||
("hmellor/Ilama-3.2-1B", "auto"), # CUSTOM CODE
|
||||
("allenai/OLMoE-1B-7B-0924", "transformers"), # MoE
|
||||
]) # trust_remote_code=True by default
|
||||
],
|
||||
) # trust_remote_code=True by default
|
||||
def test_models(
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
@@ -77,29 +80,32 @@ def test_models(
|
||||
) -> None:
|
||||
import transformers
|
||||
from packaging.version import Version
|
||||
|
||||
installed = Version(transformers.__version__)
|
||||
required = Version("4.57.0.dev0")
|
||||
if model == "allenai/OLMoE-1B-7B-0924" and installed < required:
|
||||
pytest.skip("MoE models with the Transformers backend require "
|
||||
f"transformers>={required}, but got {installed}")
|
||||
pytest.skip(
|
||||
"MoE models with the Transformers backend require "
|
||||
f"transformers>={required}, but got {installed}"
|
||||
)
|
||||
|
||||
check_implementation(hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model,
|
||||
model_impl=model_impl)
|
||||
check_implementation(
|
||||
hf_runner, vllm_runner, example_prompts, model, model_impl=model_impl
|
||||
)
|
||||
|
||||
|
||||
def test_hybrid_attention(vllm_runner: type[VllmRunner]) -> None:
|
||||
prompts, _, _ = prep_prompts(4, (800, 801))
|
||||
kwargs_ref = {"max_model_len": 8192, "enforce_eager": True}
|
||||
kwargs_test = {"model_impl": "transformers", **kwargs_ref}
|
||||
check_implementation(vllm_runner,
|
||||
vllm_runner,
|
||||
prompts,
|
||||
model="hmellor/tiny-random-Gemma2ForCausalLM",
|
||||
kwargs_ref=kwargs_ref,
|
||||
kwargs_test=kwargs_test)
|
||||
check_implementation(
|
||||
vllm_runner,
|
||||
vllm_runner,
|
||||
prompts,
|
||||
model="hmellor/tiny-random-Gemma2ForCausalLM",
|
||||
kwargs_ref=kwargs_ref,
|
||||
kwargs_test=kwargs_test,
|
||||
)
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
@@ -109,23 +115,28 @@ def test_distributed(
|
||||
example_prompts,
|
||||
):
|
||||
kwargs = {"model_impl": "transformers", "tensor_parallel_size": 2}
|
||||
check_implementation(hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
"meta-llama/Llama-3.2-1B-Instruct",
|
||||
kwargs_test=kwargs)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model, quantization_kwargs", [
|
||||
("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", {}),
|
||||
("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", {}),
|
||||
(
|
||||
check_implementation(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
"meta-llama/Llama-3.2-1B-Instruct",
|
||||
{
|
||||
"quantization": "bitsandbytes",
|
||||
},
|
||||
),
|
||||
])
|
||||
kwargs_test=kwargs,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"model, quantization_kwargs",
|
||||
[
|
||||
("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", {}),
|
||||
("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", {}),
|
||||
(
|
||||
"meta-llama/Llama-3.2-1B-Instruct",
|
||||
{
|
||||
"quantization": "bitsandbytes",
|
||||
},
|
||||
),
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("max_tokens", [32])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
def test_quantization(
|
||||
@@ -136,27 +147,34 @@ def test_quantization(
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
if (current_platform.is_rocm()
|
||||
and quantization_kwargs.get("quantization", "") == "bitsandbytes"):
|
||||
pytest.skip(
|
||||
"bitsandbytes quantization is currently not supported in rocm.")
|
||||
if (
|
||||
current_platform.is_rocm()
|
||||
and quantization_kwargs.get("quantization", "") == "bitsandbytes"
|
||||
):
|
||||
pytest.skip("bitsandbytes quantization is currently not supported in rocm.")
|
||||
|
||||
with vllm_runner(
|
||||
model, model_impl="auto", enforce_eager=True,
|
||||
**quantization_kwargs) as vllm_model: # type: ignore[arg-type]
|
||||
model,
|
||||
model_impl="auto",
|
||||
enforce_eager=True,
|
||||
**quantization_kwargs, # type: ignore[arg-type]
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens=max_tokens, num_logprobs=num_logprobs)
|
||||
example_prompts, max_tokens=max_tokens, num_logprobs=num_logprobs
|
||||
)
|
||||
|
||||
with vllm_runner(
|
||||
model,
|
||||
model_impl="transformers",
|
||||
enforce_eager=True,
|
||||
**quantization_kwargs) as vllm_model: # type: ignore[arg-type]
|
||||
model,
|
||||
model_impl="transformers",
|
||||
enforce_eager=True,
|
||||
**quantization_kwargs, # type: ignore[arg-type]
|
||||
) as vllm_model:
|
||||
model_config = vllm_model.llm.llm_engine.model_config
|
||||
assert model_config.using_transformers_backend()
|
||||
|
||||
transformers_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens=max_tokens, num_logprobs=num_logprobs)
|
||||
example_prompts, max_tokens=max_tokens, num_logprobs=num_logprobs
|
||||
)
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=transformers_outputs,
|
||||
@@ -172,22 +190,24 @@ def test_quantization(
|
||||
# Layers live in `layers`
|
||||
"Qwen/Qwen3-Embedding-0.6B",
|
||||
# Layers live in `model.layers`
|
||||
"meta-llama/Llama-3.2-1B-Instruct"
|
||||
"meta-llama/Llama-3.2-1B-Instruct",
|
||||
],
|
||||
)
|
||||
def test_embed_loading(vllm_runner, model):
|
||||
with vllm_runner(model,
|
||||
max_model_len=1024,
|
||||
enforce_eager=True,
|
||||
runner="pooling",
|
||||
model_impl="transformers") as model_test:
|
||||
with vllm_runner(
|
||||
model,
|
||||
max_model_len=1024,
|
||||
enforce_eager=True,
|
||||
runner="pooling",
|
||||
model_impl="transformers",
|
||||
) as model_test:
|
||||
model_config = model_test.llm.llm_engine.model_config
|
||||
assert model_config.using_transformers_backend()
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"arch",
|
||||
["TransformersEmbeddingModel", "TransformersForSequenceClassification"])
|
||||
"arch", ["TransformersEmbeddingModel", "TransformersForSequenceClassification"]
|
||||
)
|
||||
def test_pooling(hf_runner, vllm_runner, example_prompts, arch):
|
||||
model = get_model(arch)
|
||||
|
||||
@@ -202,6 +222,7 @@ def test_pooling(hf_runner, vllm_runner, example_prompts, arch):
|
||||
hf_kwargs["is_sentence_transformer"] = True
|
||||
elif arch == "TransformersForSequenceClassification":
|
||||
from transformers import AutoModelForSequenceClassification
|
||||
|
||||
hf_kwargs["auto_cls"] = AutoModelForSequenceClassification
|
||||
|
||||
# The example_prompts has ending "\n", for example:
|
||||
@@ -212,8 +233,10 @@ def test_pooling(hf_runner, vllm_runner, example_prompts, arch):
|
||||
# So we need to strip the input texts to avoid test failing.
|
||||
example_prompts = [str(s).strip() for s in example_prompts]
|
||||
|
||||
with (vllm_runner(model, **vllm_kwargs) as
|
||||
vllm_model, hf_runner(model, **hf_kwargs) as hf_model):
|
||||
with (
|
||||
vllm_runner(model, **vllm_kwargs) as vllm_model,
|
||||
hf_runner(model, **hf_kwargs) as hf_model,
|
||||
):
|
||||
model_config = vllm_model.llm.llm_engine.model_config
|
||||
assert model_config.using_transformers_backend()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user