Consolidate Llama model usage in tests (#13094)

This commit is contained in:
Harry Mellor
2025-02-14 06:18:03 +00:00
committed by GitHub
parent 40932d7a05
commit f2b20fe491
22 changed files with 45 additions and 53 deletions

View File

@@ -20,7 +20,7 @@ from ..utils import multi_gpu_test
MODELS = [
"facebook/opt-125m",
"meta-llama/Llama-3.2-1B",
"meta-llama/Llama-3.2-1B-Instruct",
]
@@ -92,7 +92,7 @@ def test_models_distributed(
) -> None:
override_backend_env_variable(monkeypatch, attention_backend)
if (model == "meta-llama/Llama-2-7b-hf"
if (model == "meta-llama/Llama-3.2-1B-Instruct"
and distributed_executor_backend == "ray"):
# test ray adag
os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
@@ -221,7 +221,7 @@ def test_with_prefix_caching(
Checks exact match decode with and without prefix caching
with chunked prefill enabled.
"""
model = "meta-llama/Llama-2-7b-chat-hf"
model = "meta-llama/Llama-3.2-1B-Instruct"
# The common prompt has 142 tokens with Llama-2 tokenizer.
common_prompt = "You are a helpful AI assistant " * 20
unique_prompts = [