Consolidate Llama model usage in tests (#13094)

This commit is contained in:
Harry Mellor
2025-02-14 06:18:03 +00:00
committed by GitHub
parent 40932d7a05
commit f2b20fe491
22 changed files with 45 additions and 53 deletions

View File

@@ -26,12 +26,12 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true"
 # Test FP8 checkpoint w. fp8_e4m3 kv-cache scaling factors.
 ("fp8_e4m3", "meta-llama/Llama-3.2-1B-Instruct",
 "nm-testing/Llama-3.2-1B-Instruct-FP8-KV"),
-# Test FP16 checkpoint w. fp8_e5m2 kv-cache.
+# Test BF16 checkpoint w. fp8_e5m2 kv-cache.
 ("fp8_e5m2", "meta-llama/Llama-3.2-1B-Instruct",
 "meta-llama/Llama-3.2-1B-Instruct"),
-# Test FP16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json.
-("fp8_e4m3", "meta-llama/Llama-2-7b-chat-hf",
-"meta-llama/Llama-2-7b-chat-hf")
+# Test BF16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json.
+("fp8_e4m3", "meta-llama/Llama-3.2-1B-Instruct",
+"meta-llama/Llama-3.2-1B-Instruct")
 ])
 # Due to low-precision numerical divergence, we only test logprob of 4 tokens
 @pytest.mark.parametrize("max_tokens", [4])