[Bugfix] Fix standard models tests (#17217)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-26 17:26:41 +08:00
parent 8c1c926d00
commit 909fdaf152
4 changed files with 72 additions and 63 deletions
--- a/tests/models/decoder_only/language/test_models.py
+++ b/tests/models/decoder_only/language/test_models.py
@@ -9,6 +9,7 @@ import torch

 from vllm.platforms import current_platform

+from ....utils import large_gpu_mark
 from ...registry import HF_EXAMPLE_MODELS
 from ...utils import check_logprobs_close

@@ -26,7 +27,7 @@ REQUIRES_V0 = ["microsoft/phi-2", "stabilityai/stablelm-3b-4e1t"]
 AITER_MODEL_LIST = [
    "meta-llama/Llama-3.2-1B-Instruct",
    "openbmb/MiniCPM3-4B",
-    "Qwen/Qwen-7B",
+    "Qwen/Qwen-7B-Chat",
    "Qwen/Qwen2.5-0.5B-Instruct",
    "ehristoforu/Falcon3-MoE-2x7B-Insruct",
 ]
@@ -34,63 +35,70 @@ AITER_MODEL_LIST = [

 # @maybe_test_rocm_aiter
@pytest.mark.parametrize(
-    "model_arch",
+    "model",
    [
        pytest.param(
-            "BloomForCausalLM",  # testing alibi slopes
+            "bigscience/bloom-560m",  # bloom - testing alibi slopes
            marks=[pytest.mark.core_model, pytest.mark.cpu_model],
        ),
        pytest.param(
-            "GPT2LMHeadModel",  # gpt2
+            "openai-community/gpt2",  # gpt2
            marks=[pytest.mark.core_model, pytest.mark.cpu_model],
        ),
-        pytest.param("GPTJForCausalLM"),
-        pytest.param("GPTBigCodeForCausalLM"),
-        pytest.param("GPTNeoXForCausalLM"),
+        pytest.param("Milos/slovak-gpt-j-405M"),  # gptj
+        pytest.param("bigcode/tiny_starcoder_py"),  # gpt_bigcode
+        pytest.param("EleutherAI/pythia-70m"),  # gpt_neox
        pytest.param(
-            "GemmaForCausalLM",  # gemma
-            marks=[pytest.mark.core_model, pytest.mark.cpu_model],
-        ),
-        pytest.param("GlmForCausalLM"),
-        pytest.param(
-            "LlamaForCausalLM",
+            "google/gemma-1.1-2b-it",  # gemma
            marks=[pytest.mark.core_model, pytest.mark.cpu_model],
        ),
        pytest.param(
-            "MiniCPM3ForCausalLM",
+            "THUDM/chatglm3-6b",  # chatglm (text-only)
+        ),
+        pytest.param(
+            "meta-llama/Llama-3.2-1B-Instruct",  # llama
+            marks=[pytest.mark.core_model, pytest.mark.cpu_model],
+        ),
+        pytest.param(
+            "openbmb/MiniCPM3-4B",
            # fused_moe not supported on CPU
-            marks=[pytest.mark.core_model],
+            marks=[pytest.mark.core_model,
+                   large_gpu_mark(min_gb=32)],
        ),
        pytest.param(
-            "OPTForCausalLM",
+            "facebook/opt-125m",  # opt
            marks=[pytest.mark.core_model, pytest.mark.cpu_model],
        ),
        pytest.param(
-            "PhiForCausalLM",
+            "microsoft/phi-2",  # phi
            marks=[pytest.mark.core_model],
        ),
-        pytest.param("QWenLMHeadModel", ),
        pytest.param(
-            "Qwen2ForCausalLM",
+            "Qwen/Qwen-7B-Chat",  # qwen (text-only)
+        ),
+        pytest.param(
+            "Qwen/Qwen2.5-0.5B-Instruct",  # qwen2
            marks=[pytest.mark.core_model],
        ),
-        pytest.param("StableLmForCausalLM"),
-        pytest.param("Starcoder2ForCausalLM"),
+        pytest.param("stabilityai/stablelm-3b-4e1t"),  # stablelm
+        pytest.param("bigcode/starcoder2-3b"),  # starcoder2
        pytest.param(
-            "MixtralForCausalLM",
-            marks=[pytest.mark.cpu_model],
+            "ehristoforu/Falcon3-MoE-2x7B-Insruct",  # mixtral
+            marks=[pytest.mark.cpu_model,
+                   large_gpu_mark(min_gb=48)],
        )
    ])
-@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize(
    "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False])
-def test_models(hf_runner, vllm_runner, example_prompts, model_arch: str,
-                dtype: str, max_tokens: int, num_logprobs: int,
-                use_rocm_aiter: bool, monkeypatch) -> None:
+def test_models(hf_runner, vllm_runner, example_prompts, model: str,
+                max_tokens: int, num_logprobs: int, use_rocm_aiter: bool,
+                monkeypatch) -> None:

-    model = HF_EXAMPLE_MODELS.get_hf_info(model_arch).default
+    model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
+    model_info.check_available_online(on_fail="skip")
+    model_info.check_transformers_version(on_fail="skip")

    if model in REQUIRES_V0:
        monkeypatch.setenv("VLLM_USE_V1", "0")
@@ -104,15 +112,17 @@ def test_models(hf_runner, vllm_runner, example_prompts, model_arch: str,
        # in parts of the operators
        pytest.skip(f"Skipping '{model}' model test with AITER kernel.")

-    with hf_runner(model, dtype=dtype) as hf_model:
-        if model.startswith("THUDM/chatglm3"):
-            hf_model.model.get_output_embeddings = lambda: \
-                hf_model.model.transformer.output_layer
-
+    with hf_runner(model) as hf_model:
        hf_outputs = hf_model.generate_greedy_logprobs_limit(
            example_prompts, max_tokens, num_logprobs)

-    with vllm_runner(model, dtype=dtype) as vllm_model:
+    with vllm_runner(
+            model,
+            tokenizer_name=model_info.tokenizer or model,
+            tokenizer_mode=model_info.tokenizer_mode,
+            trust_remote_code=model_info.trust_remote_code,
+            max_num_seqs=2,
+    ) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy_logprobs(
            example_prompts, max_tokens, num_logprobs)