Update rope_scaling to rope_parameters in preparation for Transformers v5 (#28542)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
@@ -27,7 +27,6 @@
|
||||
import typing
|
||||
from collections.abc import Callable, Iterable
|
||||
from itertools import islice
|
||||
from typing import Any
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
@@ -111,8 +110,6 @@ class DeepseekAttention(nn.Module):
|
||||
config: DeepseekV2Config | DeepseekV3Config,
|
||||
hidden_size: int,
|
||||
num_heads: int,
|
||||
rope_theta: float = 10000,
|
||||
rope_scaling: dict[str, Any] | None = None,
|
||||
max_position_embeddings: int = 8192,
|
||||
cache_config: CacheConfig | None = None,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
@@ -139,7 +136,6 @@ class DeepseekAttention(nn.Module):
|
||||
self.q_size = self.num_heads * self.head_dim
|
||||
self.kv_size = self.num_kv_heads * self.head_dim
|
||||
self.scaling = self.head_dim**-0.5
|
||||
self.rope_theta = rope_theta
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
|
||||
self.qkv_proj = QKVParallelLinear(
|
||||
@@ -162,8 +158,7 @@ class DeepseekAttention(nn.Module):
|
||||
self.head_dim,
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=max_position_embeddings,
|
||||
base=rope_theta,
|
||||
rope_scaling=rope_scaling,
|
||||
rope_parameters=config.rope_parameters,
|
||||
)
|
||||
self.attn = Attention(
|
||||
self.num_heads,
|
||||
@@ -409,8 +404,6 @@ class DeepseekV2Attention(nn.Module):
|
||||
v_head_dim: int,
|
||||
q_lora_rank: int,
|
||||
kv_lora_rank: int,
|
||||
rope_theta: float = 10000,
|
||||
rope_scaling: dict[str, Any] | None = None,
|
||||
max_position_embeddings: int = 8192,
|
||||
cache_config: CacheConfig | None = None,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
@@ -430,7 +423,6 @@ class DeepseekV2Attention(nn.Module):
|
||||
assert num_heads % tp_size == 0
|
||||
self.num_local_heads = num_heads // tp_size
|
||||
self.scaling = self.qk_head_dim**-0.5
|
||||
self.rope_theta = rope_theta
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
assert topk_indices_buffer is None, (
|
||||
"topk_indices_buffer is not \
|
||||
@@ -485,21 +477,20 @@ class DeepseekV2Attention(nn.Module):
|
||||
quant_config=quant_config,
|
||||
prefix=f"{prefix}.o_proj",
|
||||
)
|
||||
if rope_scaling:
|
||||
rope_scaling["rope_type"] = "deepseek_yarn"
|
||||
if config.rope_parameters["rope_type"] != "default":
|
||||
config.rope_parameters["rope_type"] = "deepseek_yarn"
|
||||
|
||||
self.rotary_emb = get_rope(
|
||||
qk_rope_head_dim,
|
||||
rotary_dim=qk_rope_head_dim,
|
||||
max_position=max_position_embeddings,
|
||||
base=rope_theta,
|
||||
rope_scaling=rope_scaling,
|
||||
rope_parameters=config.rope_parameters,
|
||||
is_neox_style=False,
|
||||
)
|
||||
|
||||
if rope_scaling:
|
||||
mscale_all_dim = rope_scaling.get("mscale_all_dim", False)
|
||||
scaling_factor = rope_scaling["factor"]
|
||||
if config.rope_parameters["rope_type"] != "default":
|
||||
mscale_all_dim = config.rope_parameters.get("mscale_all_dim", False)
|
||||
scaling_factor = config.rope_parameters["factor"]
|
||||
mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim))
|
||||
self.scaling = self.scaling * mscale * mscale
|
||||
|
||||
@@ -903,8 +894,6 @@ class DeepseekV2MLAAttention(nn.Module):
|
||||
v_head_dim: int,
|
||||
q_lora_rank: int | None,
|
||||
kv_lora_rank: int,
|
||||
rope_theta: float = 10000,
|
||||
rope_scaling: dict[str, Any] | None = None,
|
||||
max_position_embeddings: int = 8192,
|
||||
cache_config: CacheConfig | None = None,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
@@ -927,7 +916,6 @@ class DeepseekV2MLAAttention(nn.Module):
|
||||
self.num_local_heads = num_heads // tp_size
|
||||
|
||||
self.scaling = self.qk_head_dim**-0.5
|
||||
self.rope_theta = rope_theta
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
|
||||
if self.q_lora_rank is not None:
|
||||
@@ -981,19 +969,18 @@ class DeepseekV2MLAAttention(nn.Module):
|
||||
prefix=f"{prefix}.o_proj",
|
||||
)
|
||||
|
||||
if rope_scaling:
|
||||
rope_scaling["rope_type"] = "deepseek_yarn"
|
||||
if config.rope_parameters["rope_type"] != "default":
|
||||
config.rope_parameters["rope_type"] = "deepseek_yarn"
|
||||
self.rotary_emb = get_rope(
|
||||
qk_rope_head_dim,
|
||||
rotary_dim=qk_rope_head_dim,
|
||||
max_position=max_position_embeddings,
|
||||
base=rope_theta,
|
||||
rope_scaling=rope_scaling,
|
||||
rope_parameters=config.rope_parameters,
|
||||
is_neox_style=False,
|
||||
)
|
||||
if rope_scaling:
|
||||
mscale_all_dim = rope_scaling.get("mscale_all_dim", False)
|
||||
scaling_factor = rope_scaling["factor"]
|
||||
if config.rope_parameters["rope_type"] != "default":
|
||||
mscale_all_dim = config.rope_parameters.get("mscale_all_dim", False)
|
||||
scaling_factor = config.rope_parameters["factor"]
|
||||
mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim))
|
||||
self.scaling = self.scaling * mscale * mscale
|
||||
|
||||
@@ -1073,8 +1060,6 @@ class DeepseekV2DecoderLayer(nn.Module):
|
||||
parallel_config = vllm_config.parallel_config
|
||||
|
||||
self.hidden_size = config.hidden_size
|
||||
rope_theta = getattr(config, "rope_theta", 10000)
|
||||
rope_scaling = getattr(config, "rope_scaling", None)
|
||||
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
|
||||
moe_layer_freq = getattr(config, "moe_layer_freq", 1)
|
||||
# DecoderLayers are created with `make_layers` which passes the prefix
|
||||
@@ -1107,8 +1092,6 @@ class DeepseekV2DecoderLayer(nn.Module):
|
||||
v_head_dim=v_head_dim,
|
||||
q_lora_rank=config.q_lora_rank if hasattr(config, "q_lora_rank") else None,
|
||||
kv_lora_rank=kv_lora_rank,
|
||||
rope_theta=rope_theta,
|
||||
rope_scaling=rope_scaling,
|
||||
max_position_embeddings=max_position_embeddings,
|
||||
cache_config=cache_config,
|
||||
quant_config=quant_config,
|
||||
|
||||
Reference in New Issue
Block a user