Access partial_rotary_factor from rope_parameters (#29966)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
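With this change, the per-model partial_rotary_factor handling is dropped and the factor is read out of the config's rope_parameters dict instead. The helper below is an illustrative, hypothetical sketch of that consumer-side logic (the function name and defaults are assumptions, not the actual get_rope implementation); it only shows how a rotary dimension falls out once the factor is carried inside rope_parameters:

    def effective_rotary_dim(head_dim: int, rope_parameters: dict | None) -> int:
        # Hypothetical helper: pull partial_rotary_factor out of the
        # rope_parameters dict, defaulting to 1.0 (apply rotary embeddings to
        # the full head dimension) when the dict or the key is absent.
        params = rope_parameters or {}
        factor = params.get("partial_rotary_factor", 1.0)
        return int(head_dim * factor)

    # GLM-style configs set the factor to 0.5, so only half of each head gets
    # rotary position embeddings; the default leaves the head fully rotated.
    assert effective_rotary_dim(128, {"partial_rotary_factor": 0.5}) == 64
    assert effective_rotary_dim(128, None) == 128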
@@ -148,8 +148,6 @@ class ApertusAttention(nn.Module):
         if head_dim is None:
             head_dim = self.hidden_size // self.total_num_heads
         self.head_dim = head_dim
-        # Phi models introduced a partial_rotary_factor parameter in the config
-        self.partial_rotary_factor = getattr(config, "partial_rotary_factor", 1)
         self.q_size = self.num_heads * self.head_dim
         self.kv_size = self.num_kv_heads * self.head_dim
         self.scaling = self.head_dim**-0.5
@@ -228,11 +226,10 @@ class ApertusAttention(nn.Module):
 
         self.rotary_emb = get_rope(
             self.head_dim,
-            rotary_dim=int(self.partial_rotary_factor * self.head_dim),
+            rotary_dim=self.head_dim,
             max_position=self.max_position_embeddings,
             rope_parameters=config.rope_parameters,
             is_neox_style=is_neox_style,
-            partial_rotary_factor=self.partial_rotary_factor,
         )
@@ -127,8 +127,6 @@ class BailingAttention(nn.Module):
             prefix=f"{prefix}.dense",
         )
 
-        self.partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0)
-
         self.rotary_dim = getattr(config, "rotary_dim", self.head_dim)
 
         self.rotary_emb = get_rope(
@@ -137,7 +135,6 @@ class BailingAttention(nn.Module):
             max_position=config.max_position_embeddings,
             rope_parameters=config.rope_parameters,
             is_neox_style=True,
-            partial_rotary_factor=self.partial_rotary_factor,
         )
 
         self.attn = Attention(
@@ -178,9 +178,7 @@ class BambaAttentionDecoderLayer(nn.Module):
         self.scaling = self.head_dim**-0.5
         self.max_position_embeddings = max_position_embeddings
 
-        if hasattr(config, "partial_rotary_factor"):
-            rotary_dim = int(self.head_dim * config.partial_rotary_factor)
-        elif hasattr(config, "attn_rotary_emb"):
+        if hasattr(config, "attn_rotary_emb"):
             rotary_dim = config.attn_rotary_emb  # for backward compatibility
         else:
             rotary_dim = self.head_dim  # default
@@ -8,7 +8,6 @@ import vllm.envs as envs
 from vllm.logger import init_logger
 from vllm.model_executor.models import ModelRegistry
 from vllm.platforms import current_platform
-from vllm.transformers_utils.config import set_default_rope_theta
 from vllm.utils.math_utils import cdiv, round_up
 from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
 from vllm.v1.kv_cache_interface import FullAttentionSpec, MambaSpec, MLAAttentionSpec
@@ -78,8 +77,6 @@ class JinaRobertaModelConfig(VerifyAndUpdateConfig):
         if not model_config.enforce_eager:
             max_position = round_up(max_position, 8)
 
-        set_default_rope_theta(config, default_theta=config.rotary_emb_base)
-
         config.rotary_kwargs = {
             "head_size": head_dim,
             "rotary_dim": getattr(config, "rotary_emb_dim", head_dim),
@@ -119,8 +116,6 @@ class NomicBertModelConfig(VerifyAndUpdateConfig):
         rotary_emb_dim = int(head_dim * config.rotary_emb_fraction)
         max_trained_positions = getattr(config, "max_trained_positions", 2048)
 
-        set_default_rope_theta(config, default_theta=config.rotary_emb_base)
-
         config.rotary_kwargs = {
             "head_size": head_dim,
             "rotary_dim": rotary_emb_dim,
@@ -242,9 +242,7 @@ class FalconH1AttentionDecoderLayer(nn.Module):
         self.scaling = self.head_dim**-0.5
         self.max_position_embeddings = max_position_embeddings
 
-        if hasattr(config, "partial_rotary_factor"):
-            rotary_dim = self.head_dim * config.partial_rotary_factor
-        elif hasattr(config, "attn_rotary_emb"):
+        if hasattr(config, "attn_rotary_emb"):
             rotary_dim = config.attn_rotary_emb  # for backward compatibility
         else:
             rotary_dim = self.head_dim  # default
@@ -10,7 +10,8 @@ from .utils import PPMissingLayer
 
 class GlmForCausalLM(LlamaForCausalLM):
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
-        vllm_config.model_config.hf_config.partial_rotary_factor = 0.5
+        hf_config = vllm_config.model_config.hf_config
+        hf_config.rope_parameters["partial_rotary_factor"] = 0.5
         super().__init__(vllm_config=vllm_config, prefix=prefix)
         # Hack Llama model to fit HF format GLM implementation
         # Attention difference between GLM and Llama:
@@ -78,7 +78,7 @@ class Glm4Attention(nn.Module):
             # Number of KV heads is less than TP size, so we replicate
             # the KV heads across multiple tensor parallel GPUs.
             assert tp_size % self.total_num_kv_heads == 0
-        partial_rotary_factor = getattr(config, "partial_rotary_factor", 0.5)
+        config.rope_parameters.setdefault("partial_rotary_factor", 0.5)
         self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
         self.head_dim = head_dim or hidden_size // self.total_num_heads
         self.rotary_dim = self.head_dim
@@ -106,7 +106,6 @@ class Glm4Attention(nn.Module):
             rotary_dim=self.rotary_dim,
             max_position=max_position,
             rope_parameters=config.rope_parameters,
-            partial_rotary_factor=partial_rotary_factor,
             is_neox_style=False,
         )
         self.attn = Attention(
@@ -282,13 +282,12 @@ class Glm4MoeAttention(nn.Module):
             prefix=f"{prefix}.o_proj",
         )
 
-        partial_rotary_factor = getattr(config, "partial_rotary_factor", 0.5)
+        config.rope_parameters.setdefault("partial_rotary_factor", 0.5)
         self.rotary_emb = get_rope(
             self.head_dim,
             rotary_dim=self.head_dim,
             max_position=max_position_embeddings,
             rope_parameters=config.rope_parameters,
-            partial_rotary_factor=partial_rotary_factor,
         )
         self.attn = Attention(
             self.num_heads,
@@ -89,16 +89,14 @@ class GPTNeoXAttention(nn.Module):
             quant_config=quant_config,
             prefix=f"{prefix}.dense",
         )
-        scaling = self.head_size**-0.5
-        rotary_dim = int(self.head_size * config.rotary_pct)
-        assert rotary_dim % 2 == 0
         max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
         self.rotary_emb = get_rope(
             self.head_size,
-            rotary_dim=rotary_dim,
+            rotary_dim=self.head_size,
             max_position=max_position_embeddings,
             rope_parameters=config.rope_parameters,
         )
+        scaling = self.head_size**-0.5
         self.attn = Attention(
             self.num_heads,
             self.head_size,
@@ -149,8 +149,6 @@ class LlamaAttention(nn.Module):
         if head_dim is None:
             head_dim = self.hidden_size // self.total_num_heads
         self.head_dim = head_dim
-        # Phi models introduced a partial_rotary_factor parameter in the config
-        self.partial_rotary_factor = getattr(config, "partial_rotary_factor", 1)
         self.q_size = self.num_heads * self.head_dim
         self.kv_size = self.num_kv_heads * self.head_dim
         self.scaling = self.head_dim**-0.5
@@ -265,7 +263,6 @@ class LlamaAttention(nn.Module):
             max_position=self.max_position_embeddings,
             rope_parameters=getattr(config, "rope_parameters", None),
             is_neox_style=is_neox_style,
-            partial_rotary_factor=self.partial_rotary_factor,
         )
@@ -178,7 +178,6 @@ class NemotronAttention(nn.Module):
         self.q_size = self.num_heads * self.head_dim
         self.kv_size = self.num_kv_heads * self.head_dim
         self.scaling = self.head_dim**-0.5
-        self.partial_rotary_factor = config.partial_rotary_factor
         self.max_position_embeddings = max_position_embeddings
 
         self.qkv_proj = QKVParallelLinear(
@@ -203,7 +202,6 @@ class NemotronAttention(nn.Module):
             rotary_dim=self.head_dim,
             max_position=max_position_embeddings,
             rope_parameters=config.rope_parameters,
-            partial_rotary_factor=self.partial_rotary_factor,
         )
         self.attn = Attention(
             self.num_heads,
@@ -122,7 +122,6 @@ class DeciLMAttention(LlamaAttention):
             max_position=self.max_position_embeddings,
             rope_parameters=config.rope_parameters,
             is_neox_style=is_neox_style,
-            partial_rotary_factor=self.partial_rotary_factor,
         )
@@ -106,7 +106,6 @@ class PersimmonAttention(nn.Module):
         self.num_heads = self.total_num_heads // tensor_parallel_world_size
         self.head_dim = self.hidden_size // self.total_num_heads
         self.max_position_embeddings = config.max_position_embeddings
-        self.partial_rotary_factor = config.partial_rotary_factor
         self.is_causal = True
 
         assert (self.head_dim * self.total_num_heads) == self.hidden_size
@@ -138,7 +137,6 @@ class PersimmonAttention(nn.Module):
             rotary_dim=self.head_dim,
             max_position=self.max_position_embeddings,
             rope_parameters=config.rope_parameters,
-            partial_rotary_factor=self.partial_rotary_factor,
         )
         self.scaling = self.head_dim**-0.5
         self.attn = Attention(
@@ -109,10 +109,7 @@ class PhiAttention(nn.Module):
         )
 
         scaling = self.head_size**-0.5
-        rotary_dim = int(
-            config.partial_rotary_factor
-            * (config.hidden_size // config.num_attention_heads)
-        )
+        rotary_dim = config.hidden_size // config.num_attention_heads
         assert rotary_dim % 2 == 0
 
         max_position_embeddings = getattr(config, "max_position_embeddings", 2048)
@@ -750,7 +750,6 @@ class Qwen3NextAttention(nn.Module):
             rotary_dim=self.head_dim,
             max_position=config.max_position_embeddings,
             rope_parameters=config.rope_parameters,
-            partial_rotary_factor=config.partial_rotary_factor,
             dual_chunk_attention_config=self.dual_chunk_attention_config,
         )
@@ -119,9 +119,6 @@ class StablelmAttention(nn.Module):
         self.num_key_value_heads = max(1, self.total_num_key_value_heads // tp_size)
         self.head_dim = self.hidden_size // self.total_num_heads
         self.max_position_embeddings = config.max_position_embeddings
-        self.partial_rotary_factor = getattr(
-            config, "rope_pct", getattr(config, "partial_rotary_factor", 1)
-        )
         self.scaling = self.head_dim**-0.5
         self.q_size = self.num_heads * self.head_dim
         self.kv_size = self.num_key_value_heads * self.head_dim
@@ -154,7 +151,6 @@ class StablelmAttention(nn.Module):
             rotary_dim=self.head_dim,
             max_position=self.config.max_position_embeddings,
             rope_parameters=self.config.rope_parameters,
-            partial_rotary_factor=self.partial_rotary_factor,
         )
         self.attn = Attention(
             self.num_heads,
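For models that previously hard-coded the factor on the HF config (see the GlmForCausalLM and Glm4Attention hunks above), the migration amounts to writing into the rope_parameters dict instead. A minimal, self-contained sketch of that pattern, using a stand-in config object rather than a real transformers config class:

    from dataclasses import dataclass, field

    @dataclass
    class FakeConfig:
        # Stand-in for an HF text config; the real configs expose
        # rope_parameters as a plain dict next to the other rope settings.
        rope_parameters: dict = field(default_factory=dict)

    config = FakeConfig()

    # Old style (removed in this commit): a loose attribute on the config.
    #   config.partial_rotary_factor = 0.5
    # New style: the factor lives in rope_parameters; setdefault keeps any
    # value that the checkpoint's config already provided.
    config.rope_parameters.setdefault("partial_rotary_factor", 0.5)
    assert config.rope_parameters["partial_rotary_factor"] == 0.5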