support bitsandbytes quantization with more models (#9148)
This commit is contained in:
@@ -9,22 +9,22 @@ import pytest
|
||||
import torch
|
||||
|
||||
from tests.quantization.utils import is_quant_method_supported
|
||||
|
||||
from ..utils import fork_new_process_for_each_test
|
||||
from tests.utils import fork_new_process_for_each_test
|
||||
|
||||
models_4bit_to_test = [
|
||||
('huggyllama/llama-7b', 'quantize model inflight'),
|
||||
("facebook/opt-125m", "quantize opt model inflight"),
|
||||
]
|
||||
|
||||
models_pre_qaunt_4bit_to_test = [
|
||||
('lllyasviel/omost-llama-3-8b-4bits',
|
||||
'read pre-quantized 4-bit NF4 model'),
|
||||
('PrunaAI/Einstein-v6.1-Llama3-8B-bnb-4bit-smashed',
|
||||
'read pre-quantized 4-bit FP4 model'),
|
||||
('poedator/opt-125m-bnb-4bit', 'read pre-quantized 4-bit NF4 opt model'),
|
||||
]
|
||||
|
||||
models_pre_quant_8bit_to_test = [
|
||||
('meta-llama/Llama-Guard-3-8B-INT8', 'read pre-quantized 8-bit model'),
|
||||
('meta-llama/Llama-Guard-3-8B-INT8',
|
||||
'read pre-quantized llama 8-bit model'),
|
||||
("yec019/fbopt-350m-8bit", "read pre-quantized 8-bit opt model"),
|
||||
]
|
||||
|
||||
|
||||
@@ -133,6 +133,7 @@ def validate_generated_texts(hf_runner,
|
||||
hf_str = hf_log["generated_text"]
|
||||
vllm_str = vllm_log["generated_text"]
|
||||
prompt = hf_log["prompt"]
|
||||
|
||||
assert hf_str == vllm_str, (f"Model: {model_name}"
|
||||
f"Mismatch between HF and vLLM outputs:\n"
|
||||
f"Prompt: {prompt}\n"
|
||||
|
||||
Reference in New Issue
Block a user