[CI/Build] Split up models tests (#10069)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Author: Cyrus Leung
Date: 2024-11-10 03:39:14 +08:00
Committed by: GitHub
Parent: b09895a618
Commit: 51c2e1fcef

21 changed files with 115 additions and 129 deletions


@@ -17,26 +17,21 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true"
 
 MAX_MODEL_LEN = 1024
 
-# FIXME: Move this to confest
-MODELS = [
-    ("meta-llama/Llama-3.2-1B-Instruct",
-     hf_hub_download("bartowski/Llama-3.2-1B-Instruct-GGUF",
-                     filename="Llama-3.2-1B-Instruct-Q4_K_M.gguf")),
-    ("meta-llama/Llama-3.2-1B-Instruct",
-     hf_hub_download("bartowski/Llama-3.2-1B-Instruct-GGUF",
-                     filename="Llama-3.2-1B-Instruct-IQ4_XS.gguf")),
-    ("Qwen/Qwen2-1.5B-Instruct",
-     hf_hub_download("Qwen/Qwen2-1.5B-Instruct-GGUF",
-                     filename="qwen2-1_5b-instruct-q4_k_m.gguf")),
-    ("Qwen/Qwen2-1.5B-Instruct",
-     hf_hub_download("legraphista/Qwen2-1.5B-Instruct-IMat-GGUF",
-                     filename="Qwen2-1.5B-Instruct.IQ4_XS.gguf")),
-]
-
 
 @pytest.mark.skipif(not is_quant_method_supported("gguf"),
                     reason="gguf is not supported on this GPU type.")
-@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize(("original_model", "gguf_id", "gguf_path"), [
+    ("meta-llama/Llama-3.2-1B-Instruct",
+     "bartowski/Llama-3.2-1B-Instruct-GGUF",
+     "Llama-3.2-1B-Instruct-Q4_K_M.gguf"),
+    ("meta-llama/Llama-3.2-1B-Instruct",
+     "bartowski/Llama-3.2-1B-Instruct-GGUF",
+     "Llama-3.2-1B-Instruct-IQ4_XS.gguf"),
+    ("Qwen/Qwen2-1.5B-Instruct", "Qwen/Qwen2-1.5B-Instruct-GGUF",
+     "qwen2-1_5b-instruct-q4_k_m.gguf"),
+    ("Qwen/Qwen2-1.5B-Instruct", "legraphista/Qwen2-1.5B-Instruct-IMat-GGUF",
+     "Qwen2-1.5B-Instruct.IQ4_XS.gguf"),
+])
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [32])
 @pytest.mark.parametrize("num_logprobs", [5])
@@ -45,7 +40,9 @@ def test_models(
     num_gpus_available,
     vllm_runner,
     example_prompts,
-    model,
+    original_model,
+    gguf_id,
+    gguf_path,
     dtype: str,
     max_tokens: int,
     num_logprobs: int,
@@ -54,7 +51,7 @@ def test_models(
     if num_gpus_available < tp_size:
         pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
 
-    original_model, gguf_model = model
+    gguf_model = hf_hub_download(gguf_id, filename=gguf_path)
 
     tokenizer = AutoTokenizer.from_pretrained(original_model)
     messages = [[{
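The excerpt is truncated at the start of the chat-message construction (`messages = [[{`). For context, here is a hedged sketch of how a test like this typically turns raw prompts into chat-templated strings; the continuation is cut off above, so the model ID, prompt, and variable shapes below are illustrative rather than the commit's exact code:

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-1.5B-Instruct")
    example_prompts = ["Explain GGUF quantization in one sentence."]
    # Wrap each raw prompt as a single-turn chat conversation...
    messages = [[{"role": "user", "content": prompt}]
                for prompt in example_prompts]
    # ...then render each conversation with the model's chat template so the
    # instruct model sees the prompt format it was trained on.
    prompts = [
        tokenizer.apply_chat_template(msgs, tokenize=False,
                                      add_generation_prompt=True)
        for msgs in messages
    ]
    print(prompts[0])

Rendering the prompts through the original model's tokenizer keeps the comparison fair: both the HF baseline and the GGUF-quantized model under vllm_runner receive identically formatted inputs.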