[CI/Build] Split up models tests (#10069)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
@@ -38,6 +38,7 @@ ground_truth_generations = [
 ]


+@pytest.mark.quant_model
 @pytest.mark.skipif(not is_quant_method_supported("aqlm"),
                     reason="AQLM is not supported on this GPU type.")
 @pytest.mark.parametrize("model", ["ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf"])
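The `quant_model` and `core_model` marks added throughout this commit are custom pytest marks. A minimal sketch of how such marks can be registered so that `pytest --strict-markers` accepts them, and then used to select test lanes; the hook is standard pytest, but the descriptions and commands here are illustrative assumptions, not taken from the repo (vLLM may register them in its pytest config instead):

    # conftest.py -- register the custom marks (descriptions are assumed)
    def pytest_configure(config):
        config.addinivalue_line(
            "markers", "core_model: representative model test, run on every commit")
        config.addinivalue_line(
            "markers", "quant_model: quantized-model test, run in a separate CI lane")

    # Selection then happens on the command line, e.g.:
    #   pytest -m core_model tests/models          # fast per-commit lane
    #   pytest -m quant_model tests/models         # quantization lane
    #   pytest -m "not core_model" tests/models    # everything else (e.g. nightly)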
@@ -15,6 +15,7 @@ from ...utils import check_logprobs_close
 os.environ["TOKENIZERS_PARALLELISM"] = "true"


+@pytest.mark.quant_model
 @pytest.mark.skipif(not is_quant_method_supported("fp8"),
                     reason="fp8 is not supported on this GPU type.")
 @pytest.mark.parametrize(
@@ -17,26 +17,21 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true"

 MAX_MODEL_LEN = 1024

-# FIXME: Move this to confest
-MODELS = [
-    ("meta-llama/Llama-3.2-1B-Instruct",
-     hf_hub_download("bartowski/Llama-3.2-1B-Instruct-GGUF",
-                     filename="Llama-3.2-1B-Instruct-Q4_K_M.gguf")),
-    ("meta-llama/Llama-3.2-1B-Instruct",
-     hf_hub_download("bartowski/Llama-3.2-1B-Instruct-GGUF",
-                     filename="Llama-3.2-1B-Instruct-IQ4_XS.gguf")),
-    ("Qwen/Qwen2-1.5B-Instruct",
-     hf_hub_download("Qwen/Qwen2-1.5B-Instruct-GGUF",
-                     filename="qwen2-1_5b-instruct-q4_k_m.gguf")),
-    ("Qwen/Qwen2-1.5B-Instruct",
-     hf_hub_download("legraphista/Qwen2-1.5B-Instruct-IMat-GGUF",
-                     filename="Qwen2-1.5B-Instruct.IQ4_XS.gguf")),
-]


 @pytest.mark.skipif(not is_quant_method_supported("gguf"),
                     reason="gguf is not supported on this GPU type.")
-@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize(("original_model", "gguf_id", "gguf_path"), [
+    ("meta-llama/Llama-3.2-1B-Instruct",
+     "bartowski/Llama-3.2-1B-Instruct-GGUF",
+     "Llama-3.2-1B-Instruct-Q4_K_M.gguf"),
+    ("meta-llama/Llama-3.2-1B-Instruct",
+     "bartowski/Llama-3.2-1B-Instruct-GGUF",
+     "Llama-3.2-1B-Instruct-IQ4_XS.gguf"),
+    ("Qwen/Qwen2-1.5B-Instruct", "Qwen/Qwen2-1.5B-Instruct-GGUF",
+     "qwen2-1_5b-instruct-q4_k_m.gguf"),
+    ("Qwen/Qwen2-1.5B-Instruct", "legraphista/Qwen2-1.5B-Instruct-IMat-GGUF",
+     "Qwen2-1.5B-Instruct.IQ4_XS.gguf"),
+])
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [32])
 @pytest.mark.parametrize("num_logprobs", [5])
@@ -45,7 +40,9 @@ def test_models(
     num_gpus_available,
     vllm_runner,
     example_prompts,
-    model,
+    original_model,
+    gguf_id,
+    gguf_path,
     dtype: str,
     max_tokens: int,
     num_logprobs: int,
@@ -54,7 +51,7 @@ def test_models(
     if num_gpus_available < tp_size:
         pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")

-    original_model, gguf_model = model
+    gguf_model = hf_hub_download(gguf_id, filename=gguf_path)

     tokenizer = AutoTokenizer.from_pretrained(original_model)
     messages = [[{
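The two GGUF hunks above replace module-level `hf_hub_download` calls with plain string parameters and defer the download to the test body. A minimal sketch of why that matters; `resolve_gguf` is a hypothetical helper name, not code from the repo:

    from huggingface_hub import hf_hub_download

    # Before: downloads ran at module import time, i.e. during test collection,
    # even when the GGUF tests were deselected. After: the checkpoint is fetched
    # only when the test actually executes.
    def resolve_gguf(gguf_id: str, gguf_path: str) -> str:
        # Fetches one file from the Hub (or reuses the local cache) and
        # returns its path on disk.
        return hf_hub_download(gguf_id, filename=gguf_path)

    # e.g. resolve_gguf("bartowski/Llama-3.2-1B-Instruct-GGUF",
    #                   "Llama-3.2-1B-Instruct-Q4_K_M.gguf")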
@@ -33,6 +33,7 @@ MODELS = [
 ]


+@pytest.mark.quant_model
 @pytest.mark.flaky(reruns=3)
 @pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
                     reason="gptq_marlin is not supported on this GPU type.")
@@ -38,6 +38,7 @@ model_pairs = [
 ]


+@pytest.mark.quant_model
 @pytest.mark.flaky(reruns=2)
 @pytest.mark.skipif(not is_quant_method_supported("gptq_marlin_24"),
                     reason="Marlin24 is not supported on this GPU type.")
@@ -7,7 +7,9 @@ import pytest
 from ...utils import check_logprobs_close

 MODELS = [
+    # TODO(sang): Sliding window should be tested separately.
     "ibm/PowerLM-3b",
+    "ibm/PowerMoE-3b",
 ]
@@ -24,7 +26,6 @@ def test_models(
     max_tokens: int,
     num_logprobs: int,
 ) -> None:
-    # TODO(sang): Sliding window should be tested separately.
     with hf_runner(model, dtype=dtype) as hf_model:
         hf_outputs = hf_model.generate_greedy_logprobs_limit(
             example_prompts, max_tokens, num_logprobs)
@@ -1,39 +0,0 @@
-"""Compare the outputs of HF and vLLM for Granite models using greedy sampling.
-
-Run `pytest tests/models/test_granite.py`.
-"""
-import pytest
-
-from ...utils import check_logprobs_close
-
-MODELS = [
-    "ibm/PowerMoE-3b",
-]
-
-
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["bfloat16"])
-@pytest.mark.parametrize("max_tokens", [64])
-@pytest.mark.parametrize("num_logprobs", [5])
-def test_models(
-    hf_runner,
-    vllm_runner,
-    example_prompts,
-    model: str,
-    dtype: str,
-    max_tokens: int,
-    num_logprobs: int,
-) -> None:
-    with hf_runner(model, dtype=dtype) as hf_model:
-        hf_outputs = hf_model.generate_greedy_logprobs_limit(
-            example_prompts, max_tokens, num_logprobs)
-
-    with vllm_runner(model, dtype=dtype) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy_logprobs(
-            example_prompts, max_tokens, num_logprobs)
-    check_logprobs_close(
-        outputs_0_lst=hf_outputs,
-        outputs_1_lst=vllm_outputs,
-        name_0="hf",
-        name_1="vllm",
-    )
@@ -39,6 +39,7 @@ EXPECTED_STRS_MAP = {
 @pytest.mark.skip(
     reason=
     "Prevent unstable test based on golden strings from breaking the build.")
+@pytest.mark.quant_model
 @pytest.mark.skipif(not is_quant_method_supported("fp8"),
                     reason="fp8 is not supported on this GPU type.")
 @pytest.mark.parametrize("model_name", MODELS)
@@ -1,8 +1,5 @@
 """Compare the outputs of HF and vLLM when using greedy sampling.

-This test only tests small models. Big models such as 7B should be tested from
-test_big_models.py because it could use a larger instance to run tests.
-
 Run `pytest tests/models/test_models.py`.
 """
 import pytest
@@ -35,6 +32,7 @@ if not current_platform.is_cpu():
     target_dtype = "half"


+@pytest.mark.core_model
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", [target_dtype])
 @pytest.mark.parametrize("max_tokens", [32])
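Taken together, the new marks let CI drive the same test tree through different filters. A sketch of that idea using `pytest.main`; the lane names and the `tests/models` path are assumptions for illustration, not the project's actual CI configuration:

    import sys

    import pytest

    # Equivalent to `pytest -m <mark> tests/models` on the command line.
    LANES = {"core": "core_model", "quant": "quant_model"}

    if __name__ == "__main__":
        lane = sys.argv[1] if len(sys.argv) > 1 else "core"
        sys.exit(pytest.main(["-m", LANES[lane], "tests/models"]))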