Consolidate Llama model usage in tests (#13094)
This commit is contained in:
@@ -17,7 +17,7 @@ from ..utils import multi_gpu_test
|
||||
|
||||
MODELS = [
|
||||
"google/gemma-2-2b-it",
|
||||
"meta-llama/Llama-3.2-1B",
|
||||
"meta-llama/Llama-3.2-1B-Instruct",
|
||||
]
|
||||
|
||||
TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")
|
||||
@@ -96,12 +96,12 @@ def test_models(
|
||||
"test_suite", [
|
||||
("facebook/opt-125m", "ray", "", "L4"),
|
||||
("facebook/opt-125m", "mp", "", "L4"),
|
||||
("meta-llama/Llama-2-7b-hf", "ray", "", "L4"),
|
||||
("meta-llama/Llama-2-7b-hf", "mp", "", "L4"),
|
||||
("meta-llama/Llama-3.2-1B-Instruct", "ray", "", "L4"),
|
||||
("meta-llama/Llama-3.2-1B-Instruct", "mp", "", "L4"),
|
||||
("facebook/opt-125m", "ray", "", "A100"),
|
||||
("facebook/opt-125m", "mp", "", "A100"),
|
||||
("facebook/opt-125m", "mp", "FLASHINFER", "A100"),
|
||||
("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
|
||||
("meta-llama/Llama-3.2-1B-Instruct", "ray", "FLASHINFER", "A100"),
|
||||
])
|
||||
def test_models_distributed(
|
||||
hf_runner,
|
||||
@@ -116,7 +116,7 @@ def test_models_distributed(
|
||||
if test_suite != TARGET_TEST_SUITE:
|
||||
pytest.skip(f"Skip test for {test_suite}")
|
||||
|
||||
if model == "meta-llama/Llama-2-7b-hf" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa
|
||||
if model == "meta-llama/Llama-3.2-1B-Instruct" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa
|
||||
# test ray adag
|
||||
os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
|
||||
os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
|
||||
|
||||
@@ -20,7 +20,7 @@ from ..utils import multi_gpu_test
|
||||
|
||||
MODELS = [
|
||||
"facebook/opt-125m",
|
||||
"meta-llama/Llama-3.2-1B",
|
||||
"meta-llama/Llama-3.2-1B-Instruct",
|
||||
]
|
||||
|
||||
|
||||
@@ -92,7 +92,7 @@ def test_models_distributed(
|
||||
) -> None:
|
||||
override_backend_env_variable(monkeypatch, attention_backend)
|
||||
|
||||
if (model == "meta-llama/Llama-2-7b-hf"
|
||||
if (model == "meta-llama/Llama-3.2-1B-Instruct"
|
||||
and distributed_executor_backend == "ray"):
|
||||
# test ray adag
|
||||
os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
|
||||
@@ -221,7 +221,7 @@ def test_with_prefix_caching(
|
||||
Checks exact match decode with and without prefix caching
|
||||
with chunked prefill enabled.
|
||||
"""
|
||||
model = "meta-llama/Llama-2-7b-chat-hf"
|
||||
model = "meta-llama/Llama-3.2-1B-Instruct"
|
||||
# The common prompt has 142 tokens with Llama-2 tokenizer.
|
||||
common_prompt = "You are a helpful AI assistant " * 20
|
||||
unique_prompts = [
|
||||
|
||||
@@ -4,5 +4,5 @@ from ..utils import compare_two_settings
|
||||
|
||||
|
||||
def test_cpu_offload():
|
||||
compare_two_settings("meta-llama/Llama-3.2-1B", [],
|
||||
compare_two_settings("meta-llama/Llama-3.2-1B-Instruct", [],
|
||||
["--cpu-offload-gb", "1"])
|
||||
|
||||
@@ -118,7 +118,7 @@ def test_cumem_with_cudagraph():
|
||||
@pytest.mark.parametrize(
|
||||
"model",
|
||||
[
|
||||
"meta-llama/Llama-3.2-1B", # sleep mode with safetensors
|
||||
"meta-llama/Llama-3.2-1B-Instruct", # sleep mode with safetensors
|
||||
"facebook/opt-125m" # sleep mode with pytorch checkpoint
|
||||
])
|
||||
def test_end_to_end(model):
|
||||
|
||||
Reference in New Issue
Block a user