Access partial_rotary_factor from rope_parameters (#29966)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
Harry Mellor
2025-12-04 18:42:49 +00:00
committed by GitHub
parent ece2825a29
commit e10c84e06a
21 changed files with 43 additions and 62 deletions

View File

@@ -148,8 +148,6 @@ class ApertusAttention(nn.Module):
if head_dim is None:
head_dim = self.hidden_size // self.total_num_heads
self.head_dim = head_dim
# Phi models introduced a partial_rotary_factor parameter in the config
self.partial_rotary_factor = getattr(config, "partial_rotary_factor", 1)
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
@@ -228,11 +226,10 @@ class ApertusAttention(nn.Module):
self.rotary_emb = get_rope(
self.head_dim,
rotary_dim=int(self.partial_rotary_factor * self.head_dim),
rotary_dim=self.head_dim,
max_position=self.max_position_embeddings,
rope_parameters=config.rope_parameters,
is_neox_style=is_neox_style,
partial_rotary_factor=self.partial_rotary_factor,
)

View File

@@ -127,8 +127,6 @@ class BailingAttention(nn.Module):
prefix=f"{prefix}.dense",
)
self.partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0)
self.rotary_dim = getattr(config, "rotary_dim", self.head_dim)
self.rotary_emb = get_rope(
@@ -137,7 +135,6 @@ class BailingAttention(nn.Module):
max_position=config.max_position_embeddings,
rope_parameters=config.rope_parameters,
is_neox_style=True,
partial_rotary_factor=self.partial_rotary_factor,
)
self.attn = Attention(

View File

@@ -178,9 +178,7 @@ class BambaAttentionDecoderLayer(nn.Module):
self.scaling = self.head_dim**-0.5
self.max_position_embeddings = max_position_embeddings
if hasattr(config, "partial_rotary_factor"):
rotary_dim = int(self.head_dim * config.partial_rotary_factor)
elif hasattr(config, "attn_rotary_emb"):
if hasattr(config, "attn_rotary_emb"):
rotary_dim = config.attn_rotary_emb # for backward compatibility
else:
rotary_dim = self.head_dim # default

View File

@@ -8,7 +8,6 @@ import vllm.envs as envs
from vllm.logger import init_logger
from vllm.model_executor.models import ModelRegistry
from vllm.platforms import current_platform
from vllm.transformers_utils.config import set_default_rope_theta
from vllm.utils.math_utils import cdiv, round_up
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
from vllm.v1.kv_cache_interface import FullAttentionSpec, MambaSpec, MLAAttentionSpec
@@ -78,8 +77,6 @@ class JinaRobertaModelConfig(VerifyAndUpdateConfig):
if not model_config.enforce_eager:
max_position = round_up(max_position, 8)
set_default_rope_theta(config, default_theta=config.rotary_emb_base)
config.rotary_kwargs = {
"head_size": head_dim,
"rotary_dim": getattr(config, "rotary_emb_dim", head_dim),
@@ -119,8 +116,6 @@ class NomicBertModelConfig(VerifyAndUpdateConfig):
rotary_emb_dim = int(head_dim * config.rotary_emb_fraction)
max_trained_positions = getattr(config, "max_trained_positions", 2048)
set_default_rope_theta(config, default_theta=config.rotary_emb_base)
config.rotary_kwargs = {
"head_size": head_dim,
"rotary_dim": rotary_emb_dim,

View File

@@ -242,9 +242,7 @@ class FalconH1AttentionDecoderLayer(nn.Module):
self.scaling = self.head_dim**-0.5
self.max_position_embeddings = max_position_embeddings
if hasattr(config, "partial_rotary_factor"):
rotary_dim = self.head_dim * config.partial_rotary_factor
elif hasattr(config, "attn_rotary_emb"):
if hasattr(config, "attn_rotary_emb"):
rotary_dim = config.attn_rotary_emb # for backward compatibility
else:
rotary_dim = self.head_dim # default

View File

@@ -10,7 +10,8 @@ from .utils import PPMissingLayer
class GlmForCausalLM(LlamaForCausalLM):
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
vllm_config.model_config.hf_config.partial_rotary_factor = 0.5
hf_config = vllm_config.model_config.hf_config
hf_config.rope_parameters["partial_rotary_factor"] = 0.5
super().__init__(vllm_config=vllm_config, prefix=prefix)
# Hack Llama model to fit HF format GLM implementation
# Attention difference between GLM and Llama:

View File

@@ -78,7 +78,7 @@ class Glm4Attention(nn.Module):
# Number of KV heads is less than TP size, so we replicate
# the KV heads across multiple tensor parallel GPUs.
assert tp_size % self.total_num_kv_heads == 0
partial_rotary_factor = getattr(config, "partial_rotary_factor", 0.5)
config.rope_parameters.setdefault("partial_rotary_factor", 0.5)
self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
self.head_dim = head_dim or hidden_size // self.total_num_heads
self.rotary_dim = self.head_dim
@@ -106,7 +106,6 @@ class Glm4Attention(nn.Module):
rotary_dim=self.rotary_dim,
max_position=max_position,
rope_parameters=config.rope_parameters,
partial_rotary_factor=partial_rotary_factor,
is_neox_style=False,
)
self.attn = Attention(

View File

@@ -282,13 +282,12 @@ class Glm4MoeAttention(nn.Module):
prefix=f"{prefix}.o_proj",
)
partial_rotary_factor = getattr(config, "partial_rotary_factor", 0.5)
config.rope_parameters.setdefault("partial_rotary_factor", 0.5)
self.rotary_emb = get_rope(
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
rope_parameters=config.rope_parameters,
partial_rotary_factor=partial_rotary_factor,
)
self.attn = Attention(
self.num_heads,

View File

@@ -89,16 +89,14 @@ class GPTNeoXAttention(nn.Module):
quant_config=quant_config,
prefix=f"{prefix}.dense",
)
scaling = self.head_size**-0.5
rotary_dim = int(self.head_size * config.rotary_pct)
assert rotary_dim % 2 == 0
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
self.rotary_emb = get_rope(
self.head_size,
rotary_dim=rotary_dim,
rotary_dim=self.head_size,
max_position=max_position_embeddings,
rope_parameters=config.rope_parameters,
)
scaling = self.head_size**-0.5
self.attn = Attention(
self.num_heads,
self.head_size,

View File

@@ -149,8 +149,6 @@ class LlamaAttention(nn.Module):
if head_dim is None:
head_dim = self.hidden_size // self.total_num_heads
self.head_dim = head_dim
# Phi models introduced a partial_rotary_factor parameter in the config
self.partial_rotary_factor = getattr(config, "partial_rotary_factor", 1)
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
@@ -265,7 +263,6 @@ class LlamaAttention(nn.Module):
max_position=self.max_position_embeddings,
rope_parameters=getattr(config, "rope_parameters", None),
is_neox_style=is_neox_style,
partial_rotary_factor=self.partial_rotary_factor,
)

View File

@@ -178,7 +178,6 @@ class NemotronAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.partial_rotary_factor = config.partial_rotary_factor
self.max_position_embeddings = max_position_embeddings
self.qkv_proj = QKVParallelLinear(
@@ -203,7 +202,6 @@ class NemotronAttention(nn.Module):
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
rope_parameters=config.rope_parameters,
partial_rotary_factor=self.partial_rotary_factor,
)
self.attn = Attention(
self.num_heads,

View File

@@ -122,7 +122,6 @@ class DeciLMAttention(LlamaAttention):
max_position=self.max_position_embeddings,
rope_parameters=config.rope_parameters,
is_neox_style=is_neox_style,
partial_rotary_factor=self.partial_rotary_factor,
)

View File

@@ -106,7 +106,6 @@ class PersimmonAttention(nn.Module):
self.num_heads = self.total_num_heads // tensor_parallel_world_size
self.head_dim = self.hidden_size // self.total_num_heads
self.max_position_embeddings = config.max_position_embeddings
self.partial_rotary_factor = config.partial_rotary_factor
self.is_causal = True
assert (self.head_dim * self.total_num_heads) == self.hidden_size
@@ -138,7 +137,6 @@ class PersimmonAttention(nn.Module):
rotary_dim=self.head_dim,
max_position=self.max_position_embeddings,
rope_parameters=config.rope_parameters,
partial_rotary_factor=self.partial_rotary_factor,
)
self.scaling = self.head_dim**-0.5
self.attn = Attention(

View File

@@ -109,10 +109,7 @@ class PhiAttention(nn.Module):
)
scaling = self.head_size**-0.5
rotary_dim = int(
config.partial_rotary_factor
* (config.hidden_size // config.num_attention_heads)
)
rotary_dim = config.hidden_size // config.num_attention_heads
assert rotary_dim % 2 == 0
max_position_embeddings = getattr(config, "max_position_embeddings", 2048)

View File

@@ -750,7 +750,6 @@ class Qwen3NextAttention(nn.Module):
rotary_dim=self.head_dim,
max_position=config.max_position_embeddings,
rope_parameters=config.rope_parameters,
partial_rotary_factor=config.partial_rotary_factor,
dual_chunk_attention_config=self.dual_chunk_attention_config,
)

View File

@@ -119,9 +119,6 @@ class StablelmAttention(nn.Module):
self.num_key_value_heads = max(1, self.total_num_key_value_heads // tp_size)
self.head_dim = self.hidden_size // self.total_num_heads
self.max_position_embeddings = config.max_position_embeddings
self.partial_rotary_factor = getattr(
config, "rope_pct", getattr(config, "partial_rotary_factor", 1)
)
self.scaling = self.head_dim**-0.5
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_key_value_heads * self.head_dim
@@ -154,7 +151,6 @@ class StablelmAttention(nn.Module):
rotary_dim=self.head_dim,
max_position=self.config.max_position_embeddings,
rope_parameters=self.config.rope_parameters,
partial_rotary_factor=self.partial_rotary_factor,
)
self.attn = Attention(
self.num_heads,