support bitsandbytes quantization with more models (#9148)

commit 2f4117c38e
parent 9ba0bd6aa6
Author: chenqianfzh (committed via GitHub)
Date:   2024-10-08 18:52:19 -07:00

10 changed files with 165 additions and 28 deletions
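For context, a minimal sketch (not part of this commit) of how bitsandbytes quantization is typically driven in vLLM around this version; the model choice, prompt, and sampling settings are illustrative assumptions:

```python
from vllm import LLM, SamplingParams

# In-flight quantization: load an unquantized checkpoint and quantize it
# to 4-bit at load time (model name is illustrative).
llm = LLM(model="facebook/opt-125m",
          quantization="bitsandbytes",
          load_format="bitsandbytes")

# Pre-quantized bnb checkpoints (like the 4-bit/8-bit models listed in the
# test below) are loaded the same way.
out = llm.generate(["The capital of France is"],
                   SamplingParams(temperature=0.0, max_tokens=8))
print(out[0].outputs[0].text)
```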


@@ -9,22 +9,22 @@ import pytest
 import torch
 
 from tests.quantization.utils import is_quant_method_supported
-from ..utils import fork_new_process_for_each_test
+from tests.utils import fork_new_process_for_each_test
 
 models_4bit_to_test = [
     ('huggyllama/llama-7b', 'quantize model inflight'),
+    ("facebook/opt-125m", "quantize opt model inflight"),
 ]
 
 models_pre_qaunt_4bit_to_test = [
     ('lllyasviel/omost-llama-3-8b-4bits',
      'read pre-quantized 4-bit NF4 model'),
     ('PrunaAI/Einstein-v6.1-Llama3-8B-bnb-4bit-smashed',
      'read pre-quantized 4-bit FP4 model'),
+    ('poedator/opt-125m-bnb-4bit', 'read pre-quantized 4-bit NF4 opt model'),
 ]
 
 models_pre_quant_8bit_to_test = [
-    ('meta-llama/Llama-Guard-3-8B-INT8', 'read pre-quantized 8-bit model'),
+    ('meta-llama/Llama-Guard-3-8B-INT8',
+     'read pre-quantized llama 8-bit model'),
+    ("yec019/fbopt-350m-8bit", "read pre-quantized 8-bit opt model"),
 ]
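Each tuple pairs a HuggingFace model ID with a human-readable description. As a hedged sketch (the consuming test bodies sit below this hunk; the decorator stacking and fixture names are assumptions drawn from the imports above and common vLLM test conventions), the lists are typically consumed like this:

```python
import pytest

# Sketch only: the real test body is outside this hunk.
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
                    reason="bitsandbytes is not supported on this GPU type.")
@pytest.mark.parametrize("model_name, description", models_4bit_to_test)
@fork_new_process_for_each_test
def test_load_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
                             model_name, description) -> None:
    # Quantize in-flight with HF's load_in_4bit and compare against vLLM.
    hf_model_kwargs = {"load_in_4bit": True}
    validate_generated_texts(hf_runner, vllm_runner, example_prompts[:1],
                             model_name, hf_model_kwargs)
```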
@@ -133,6 +133,7 @@ def validate_generated_texts(hf_runner,
         hf_str = hf_log["generated_text"]
         vllm_str = vllm_log["generated_text"]
         prompt = hf_log["prompt"]
-        assert hf_str == vllm_str, (f"Mismatch between HF and vLLM outputs:\n"
+        assert hf_str == vllm_str, (f"Model: {model_name}"
+                                    f"Mismatch between HF and vLLM outputs:\n"
                                     f"Prompt: {prompt}\n"