Consolidate Llama model usage in tests (#13094)

2025-02-14 06:18:03 +00:00
parent 40932d7a05
commit f2b20fe491
22 changed files with 45 additions and 53 deletions
--- a/tests/compile/utils.py
+++ b/tests/compile/utils.py
@@ -6,7 +6,6 @@ import torch

 from tests.quantization.utils import is_quant_method_supported
 from vllm import LLM, SamplingParams
-from vllm.config import CompilationLevel
 from vllm.platforms import current_platform

 TEST_MODELS = [
@@ -15,14 +14,14 @@ TEST_MODELS = [
        "dtype": torch.float16,
        "quantization": "compressed-tensors"
    }),
-    ("neuralmagic/Meta-Llama-3-8B-Instruct-FP8", {
+    ("neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic", {
        "dtype": torch.float16,
-        "quantization": "fp8"
-    }),
-    ("nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples", {
        "quantization": "compressed-tensors"
    }),
-    ("meta-llama/Meta-Llama-3-8B", {}),
+    ("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", {
+        "quantization": "compressed-tensors"
+    }),
+    ("meta-llama/Llama-3.2-1B-Instruct", {}),
 ]

 if is_quant_method_supported("aqlm"):
@@ -69,11 +68,6 @@ def check_full_graph_support(model,
    # make sure these models can be captured in full graph mode
    os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "1"

-    # The base meta llama uses too much memory.
-    if (model == "meta-llama/Meta-Llama-3-8B"
-            and optimization_level >= CompilationLevel.PIECEWISE):
-        return
-
    print(f"MODEL={model}")

    prompts = [