Consolidate Llama model usage in tests (#13094)

2025-02-14 06:18:03 +00:00
parent 40932d7a05
commit f2b20fe491
22 changed files with 45 additions and 53 deletions
--- a/tests/basic_correctness/test_chunked_prefill.py
+++ b/tests/basic_correctness/test_chunked_prefill.py
@@ -20,7 +20,7 @@ from ..utils import multi_gpu_test

 MODELS = [
    "facebook/opt-125m",
-    "meta-llama/Llama-3.2-1B",
+    "meta-llama/Llama-3.2-1B-Instruct",
 ]


@@ -92,7 +92,7 @@ def test_models_distributed(
 ) -> None:
    override_backend_env_variable(monkeypatch, attention_backend)

-    if (model == "meta-llama/Llama-2-7b-hf"
+    if (model == "meta-llama/Llama-3.2-1B-Instruct"
            and distributed_executor_backend == "ray"):
        # test ray adag
        os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
@@ -221,7 +221,7 @@ def test_with_prefix_caching(
    Checks exact match decode with and without prefix caching
    with chunked prefill enabled.
    """
-    model = "meta-llama/Llama-2-7b-chat-hf"
+    model = "meta-llama/Llama-3.2-1B-Instruct"
    # The common prompt had 142 tokens with the Llama-2 tokenizer (TODO: re-verify the count for the Llama-3.2 tokenizer used here).
    common_prompt = "You are a helpful AI assistant " * 20
    unique_prompts = [