Consolidate Llama model usage in tests (#13094)

2025-02-14 06:18:03 +00:00
parent 40932d7a05
commit f2b20fe491
22 changed files with 45 additions and 53 deletions
--- a/tests/models/decoder_only/language/test_fp8.py
+++ b/tests/models/decoder_only/language/test_fp8.py
@@ -26,12 +26,12 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true"
        # Test FP8 checkpoint w. fp8_e4m3 kv-cache scaling factors.
        ("fp8_e4m3", "meta-llama/Llama-3.2-1B-Instruct",
         "nm-testing/Llama-3.2-1B-Instruct-FP8-KV"),
-        # Test FP16 checkpoint w. fp8_e5m2 kv-cache.
+        # Test BF16 checkpoint w. fp8_e5m2 kv-cache.
        ("fp8_e5m2", "meta-llama/Llama-3.2-1B-Instruct",
         "meta-llama/Llama-3.2-1B-Instruct"),
-        # Test FP16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json.
-        ("fp8_e4m3", "meta-llama/Llama-2-7b-chat-hf",
-         "meta-llama/Llama-2-7b-chat-hf")
+        # Test BF16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json.
+        ("fp8_e4m3", "meta-llama/Llama-3.2-1B-Instruct",
+         "meta-llama/Llama-3.2-1B-Instruct")
    ])
 # Due to low-precision numerical divergence, we only test logprob of 4 tokens
@pytest.mark.parametrize("max_tokens", [4])