Consolidate Llama model usage in tests (#13094)
This commit is contained in:
@@ -6,7 +6,6 @@ import torch
|
||||
|
||||
from tests.quantization.utils import is_quant_method_supported
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.config import CompilationLevel
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
TEST_MODELS = [
|
||||
@@ -15,14 +14,14 @@ TEST_MODELS = [
|
||||
"dtype": torch.float16,
|
||||
"quantization": "compressed-tensors"
|
||||
}),
|
||||
("neuralmagic/Meta-Llama-3-8B-Instruct-FP8", {
|
||||
("neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic", {
|
||||
"dtype": torch.float16,
|
||||
"quantization": "fp8"
|
||||
}),
|
||||
("nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples", {
|
||||
"quantization": "compressed-tensors"
|
||||
}),
|
||||
("meta-llama/Meta-Llama-3-8B", {}),
|
||||
("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", {
|
||||
"quantization": "compressed-tensors"
|
||||
}),
|
||||
("meta-llama/Llama-3.2-1B-Instruct", {}),
|
||||
]
|
||||
|
||||
if is_quant_method_supported("aqlm"):
|
||||
@@ -69,11 +68,6 @@ def check_full_graph_support(model,
|
||||
# make sure these models can be captured in full graph mode
|
||||
os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "1"
|
||||
|
||||
# The base meta llama uses too much memory.
|
||||
if (model == "meta-llama/Meta-Llama-3-8B"
|
||||
and optimization_level >= CompilationLevel.PIECEWISE):
|
||||
return
|
||||
|
||||
print(f"MODEL={model}")
|
||||
|
||||
prompts = [
|
||||
|
||||
Reference in New Issue
Block a user