[Quantization][V1] BitsAndBytes support V1 (#15611)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-03-28 10:12:47 +08:00
parent bd45912b99
commit 726efc6a32
7 changed files with 52 additions and 24 deletions
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -1259,6 +1259,8 @@ class BitsAndBytesModelLoader(BaseModelLoader):
                                         pack_ratio)

                offsets = np.concatenate(([0], np.cumsum(num_elements)))
+                # Convert to a CPU tensor to make torch infer_schema happy
+                offsets = torch.tensor(offsets).cpu()
                set_weight_attrs(param, {"bnb_shard_offsets": offsets})

                if load_8bit: