[Model] Support GGUF models newly added in transformers 4.46.0 (#9685)

Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Author: Isotr0py
Date: 2025-01-13 08:13:44 +08:00
Committed by: GitHub
Commit: d14e98d924 (parent 9597a095f2)
7 changed files with 162 additions and 87 deletions
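
For context, this is the usage the change enables: GGUF-quantized checkpoints of the newly supported architectures (StableLM in the file below) can be served directly through vLLM's LLM entrypoint. A minimal sketch, assuming a local StableLM GGUF file; the .gguf path and tokenizer repo are placeholders, not values from this commit:

# Sketch: loading a GGUF-quantized StableLM model with vLLM.
# The .gguf path and tokenizer repo are hypothetical placeholders.
from vllm import LLM, SamplingParams

llm = LLM(
    model="./stablelm-2-zephyr-1_6b.Q4_K_M.gguf",    # local GGUF file
    tokenizer="stabilityai/stablelm-2-zephyr-1_6b",  # HF repo for the tokenizer
)
outputs = llm.generate(["Hello, my name is"],
                       SamplingParams(max_tokens=32))
print(outputs[0].outputs[0].text)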

@@ -22,7 +22,7 @@ from typing import Iterable, List, Optional, Set, Tuple, Union
 import torch
 from torch import nn
-from transformers import PretrainedConfig
+from transformers import StableLmConfig
 
 from vllm.attention import Attention, AttentionMetadata
 from vllm.config import CacheConfig, VllmConfig
@@ -50,8 +50,9 @@ from .utils import (is_pp_missing_parameter,
 class StablelmMLP(nn.Module):
 
     def __init__(self,
-                 config: PretrainedConfig,
-                 quant_config: Optional[QuantizationConfig] = None) -> None:
+                 config: StableLmConfig,
+                 quant_config: Optional[QuantizationConfig] = None,
+                 prefix: str = "") -> None:
         super().__init__()
         self.config = config
         self.hidden_size = config.hidden_size
@@ -59,10 +60,13 @@ class StablelmMLP(nn.Module):
         self.gate_up_proj = MergedColumnParallelLinear(
             config.hidden_size, [config.intermediate_size] * 2,
             bias=False,
-            quant_config=quant_config)
+            quant_config=quant_config,
+            prefix=f"{prefix}.gate_up_proj")
         self.down_proj = RowParallelLinear(config.intermediate_size,
                                            config.hidden_size,
-                                           bias=False)
+                                           bias=False,
+                                           quant_config=quant_config,
+                                           prefix=f"{prefix}.down_proj")
         self.act_fn = SiluAndMul()
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
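
The prefix threaded through these constructors becomes each submodule's fully qualified weight name, which vLLM's quantized layers (including the GGUF path) use to match checkpoint tensors to modules. A minimal illustration of how the prefixes above compose (plain Python, not vLLM code):

# Illustration: how the prefix arguments compose into a weight name.
prefix = "model.layers.0"               # set per layer by make_layers
mlp_prefix = f"{prefix}.mlp"            # added in StablelmDecoderLayer
gate_up = f"{mlp_prefix}.gate_up_proj"  # added in StablelmMLP above
assert gate_up == "model.layers.0.mlp.gate_up_proj"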
@@ -75,7 +79,7 @@ class StablelmMLP(nn.Module):
 class StablelmAttention(nn.Module):
 
     def __init__(self,
-                 config: PretrainedConfig,
+                 config: StableLmConfig,
                  cache_config: Optional[CacheConfig] = None,
                  quant_config: Optional[QuantizationConfig] = None,
                  prefix: str = "") -> None:
@@ -116,11 +120,13 @@ class StablelmAttention(nn.Module):
             self.total_num_heads,
             self.total_num_key_value_heads,
             self.qkv_bias,
-            quant_config=quant_config)
+            quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj")
         self.o_proj = RowParallelLinear(self.total_num_heads * self.head_dim,
                                         self.hidden_size,
                                         bias=False,
-                                        quant_config=quant_config)
+                                        quant_config=quant_config,
+                                        prefix=f"{prefix}.o_proj")
         self.rotary_emb = get_rope(
             self.head_dim,
             rotary_dim=self.rotary_ndims,
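
For reference, StableLM applies rotary embeddings to only a fraction of each head: rotary_ndims is derived from the HF config's partial_rotary_factor. A toy calculation with hypothetical values:

# Toy illustration of partial rotary dimensions (values are hypothetical).
head_dim = 64
partial_rotary_factor = 0.25                          # from StableLmConfig
rotary_ndims = int(head_dim * partial_rotary_factor)  # only these dims get RoPE
assert rotary_ndims == 16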
@@ -154,7 +160,7 @@ class StablelmDecoderLayer(nn.Module):
     def __init__(
         self,
-        config: PretrainedConfig,
+        config: StableLmConfig,
         cache_config: Optional[CacheConfig] = None,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
@@ -164,7 +170,7 @@ class StablelmDecoderLayer(nn.Module):
                                            cache_config,
                                            quant_config,
                                            prefix=f"{prefix}.self_attn")
-        self.mlp = StablelmMLP(config, quant_config)
+        self.mlp = StablelmMLP(config, quant_config, prefix=f"{prefix}.mlp")
         norm_eps = getattr(config, "norm_eps",
                            getattr(config, "layer_norm_eps", 1e-05))
         self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=norm_eps)
@@ -210,6 +216,8 @@ class StableLMEpochModel(nn.Module):
         self.embed_tokens = VocabParallelEmbedding(
             config.vocab_size,
             config.hidden_size,
+            quant_config=quant_config,
+            prefix=f"{prefix}.embed_tokens",
         )
         self.start_layer, self.end_layer, self.layers = make_layers(
             config.num_hidden_layers,
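
Passing quant_config into VocabParallelEmbedding matters for GGUF because the token-embedding table itself is typically stored quantized (e.g. Q4_K) and must be dequantized at lookup time rather than loaded as a plain fp16 tensor. One way to see which tensors are quantized, a sketch using the gguf-py package (the path is a placeholder):

# Sketch: listing tensor quantization types in a GGUF file with gguf-py.
from gguf import GGUFReader

reader = GGUFReader("./model.Q4_K_M.gguf")  # placeholder path
for tensor in reader.tensors:
    # token_embd.weight typically reports a quantized type such as Q4_K.
    print(tensor.name, tensor.tensor_type)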
@@ -270,7 +278,8 @@ class StablelmForCausalLM(nn.Module, SupportsPP):
                                         prefix=maybe_prefix(prefix, "model"))
         self.lm_head = ParallelLMHead(config.vocab_size,
                                       config.hidden_size,
-                                      quant_config=quant_config)
+                                      quant_config=quant_config,
+                                      prefix=f"{prefix}.lm_head")
         if self.config.tie_word_embeddings:
             self.lm_head.weight = self.model.embed_tokens.weight
         self.logits_processor = LogitsProcessor(config.vocab_size)
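
When tie_word_embeddings is set, the output projection reuses the input-embedding weight instead of loading a second matrix, as the last hunk shows. The same idea in plain PyTorch (a sketch, not vLLM's implementation):

# Sketch of weight tying in plain PyTorch (not vLLM's actual classes).
import torch.nn as nn

embed = nn.Embedding(50304, 2048)        # token embedding: (vocab, hidden)
lm_head = nn.Linear(2048, 50304, bias=False)
lm_head.weight = embed.weight            # one shared parameter for both ends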