support bitsandbytes quantization with more models (#9148)

commit 2f4117c38e
parent 9ba0bd6aa6
Author: chenqianfzh (committed via GitHub)
Date:   2024-10-08 18:52:19 -07:00

10 changed files with 165 additions and 28 deletions
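For context, a minimal sketch (not part of this commit) of how bitsandbytes quantization is typically driven in vLLM around this version; the model choice, prompt, and sampling settings are illustrative assumptions:

```python
from vllm import LLM, SamplingParams

# In-flight quantization: load an unquantized checkpoint and quantize it
# to 4-bit at load time (model name is illustrative).
llm = LLM(model="facebook/opt-125m",
          quantization="bitsandbytes",
          load_format="bitsandbytes")

# Pre-quantized bnb checkpoints (like the 4-bit/8-bit models listed in the
# test below) are loaded the same way.
out = llm.generate(["The capital of France is"],
                   SamplingParams(temperature=0.0, max_tokens=8))
print(out[0].outputs[0].text)
```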


@@ -9,22 +9,22 @@ import pytest
 import torch
 
 from tests.quantization.utils import is_quant_method_supported
-from ..utils import fork_new_process_for_each_test
+from tests.utils import fork_new_process_for_each_test
 
 models_4bit_to_test = [
     ('huggyllama/llama-7b', 'quantize model inflight'),
+    ("facebook/opt-125m", "quantize opt model inflight"),
 ]
 
 models_pre_qaunt_4bit_to_test = [
     ('lllyasviel/omost-llama-3-8b-4bits',
      'read pre-quantized 4-bit NF4 model'),
     ('PrunaAI/Einstein-v6.1-Llama3-8B-bnb-4bit-smashed',
      'read pre-quantized 4-bit FP4 model'),
+    ('poedator/opt-125m-bnb-4bit', 'read pre-quantized 4-bit NF4 opt model'),
 ]
 
 models_pre_quant_8bit_to_test = [
-    ('meta-llama/Llama-Guard-3-8B-INT8', 'read pre-quantized 8-bit model'),
+    ('meta-llama/Llama-Guard-3-8B-INT8',
+     'read pre-quantized llama 8-bit model'),
+    ("yec019/fbopt-350m-8bit", "read pre-quantized 8-bit opt model"),
 ]
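Each tuple pairs a HuggingFace model ID with a human-readable description. As a hedged sketch (the consuming test bodies sit below this hunk; the decorator stacking and fixture names are assumptions drawn from the imports above and common vLLM test conventions), the lists are typically consumed like this:

```python
import pytest

# Sketch only: the real test body is outside this hunk.
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
                    reason="bitsandbytes is not supported on this GPU type.")
@pytest.mark.parametrize("model_name, description", models_4bit_to_test)
@fork_new_process_for_each_test
def test_load_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
                             model_name, description) -> None:
    # Quantize in-flight with HF's load_in_4bit and compare against vLLM.
    hf_model_kwargs = {"load_in_4bit": True}
    validate_generated_texts(hf_runner, vllm_runner, example_prompts[:1],
                             model_name, hf_model_kwargs)
```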
@@ -133,6 +133,7 @@ def validate_generated_texts(hf_runner,
         hf_str = hf_log["generated_text"]
         vllm_str = vllm_log["generated_text"]
         prompt = hf_log["prompt"]
-        assert hf_str == vllm_str, (f"Mismatch between HF and vLLM outputs:\n"
+        assert hf_str == vllm_str, (f"Model: {model_name}"
+                                    f"Mismatch between HF and vLLM outputs:\n"
                                     f"Prompt: {prompt}\n"