support bitsandbytes quantization with more models (#9148)

This commit is contained in:
chenqianfzh
2024-10-08 18:52:19 -07:00
committed by GitHub
parent 9ba0bd6aa6
commit 2f4117c38e
10 changed files with 165 additions and 28 deletions

View File

@@ -449,6 +449,19 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
"lm_head": "output_embeddings"
}
embedding_padding_modules = ["lm_head"]
# BitsandBytes specific attributes
default_bitsandbytes_target_modules = [
".gate_proj.",
".down_proj.",
".up_proj.",
".q_proj.",
".k_proj.",
".v_proj.",
".o_proj.",
]
# in TP, these weights are partitioned along the column dimension (dim=-1)
column_parallel_weights_modules = [".down_proj.", ".o_proj."]
bitsandbytes_stacked_params_mapping = {
# shard_name, weight_name, index
"q_proj": ("qkv_proj", 0),