Fix models which use layer_type_validation for Transformers v5 (#37398)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
@@ -2,7 +2,7 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
|
||||
-from transformers.configuration_utils import PretrainedConfig, layer_type_validation
|
||||
+from transformers.configuration_utils import PretrainedConfig
|
||||
|
||||
|
||||
class OlmoHybridConfig(PretrainedConfig):
|
||||
@@ -228,7 +228,15 @@ class OlmoHybridConfig(PretrainedConfig):
|
||||
if "full_attention" not in layer_types:
|
||||
layer_types[-1] = "full_attention"
|
||||
|
||||
-layer_type_validation(layer_types, num_hidden_layers)
|
||||
+if hasattr(self, "validate_layer_type"):
|
||||
+# Transformers v5
|
||||
+self.layer_types = layer_types
|
||||
+self.validate_layer_type()
|
||||
+else:
|
||||
+# Transformers v4
|
||||
+from transformers.configuration_utils import layer_type_validation
|
||||
+
||||
+layer_type_validation(layer_types, num_hidden_layers)
|
||||
if "linear_attention" not in layer_types:
|
||||
raise ValueError(
|
||||
"OLMoHybrid expects at least one 'linear_attention' layer."
|
||||
|
||||
@@ -16,7 +16,7 @@
|
||||
# limitations under the License.
|
||||
"""Qwen3.5 model configuration"""
|
||||
|
||||
-from transformers.configuration_utils import PretrainedConfig, layer_type_validation
|
||||
+from transformers.configuration_utils import PretrainedConfig
|
||||
|
||||
|
||||
class Qwen3_5TextConfig(PretrainedConfig):
|
||||
@@ -68,10 +68,6 @@ class Qwen3_5TextConfig(PretrainedConfig):
|
||||
eos_token_id=None,
|
||||
**kwargs,
|
||||
):
|
||||
-kwargs["ignore_keys_at_rope_validation"] = [
|
||||
-"mrope_section",
|
||||
-"mrope_interleaved",
|
||||
-]
|
||||
self.vocab_size = vocab_size
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.hidden_size = hidden_size
|
||||
@@ -98,7 +94,18 @@ class Qwen3_5TextConfig(PretrainedConfig):
|
||||
else "full_attention"
|
||||
for i in range(self.num_hidden_layers)
|
||||
]
|
||||
-layer_type_validation(self.layer_types, self.num_hidden_layers)
|
||||
+if hasattr(self, "validate_layer_type"):
|
||||
+# Transformers v5
|
||||
+kwargs["ignore_keys_at_rope_validation"] = {
|
||||
+"mrope_section",
|
||||
+"mrope_interleaved",
|
||||
+}
|
||||
+self.validate_layer_type()
|
||||
+else:
|
||||
+# Transformers v4
|
||||
+from transformers.configuration_utils import layer_type_validation
|
||||
+
||||
+layer_type_validation(self.layer_types, self.num_hidden_layers)
|
||||
|
||||
# linear attention part
|
||||
self.linear_conv_kernel_dim = linear_conv_kernel_dim
|
||||
|
||||
@@ -16,7 +16,7 @@
|
||||
# limitations under the License.
|
||||
"""Qwen3.5-MoE model configuration"""
|
||||
|
||||
-from transformers.configuration_utils import PretrainedConfig, layer_type_validation
|
||||
+from transformers.configuration_utils import PretrainedConfig
|
||||
|
||||
|
||||
class Qwen3_5MoeTextConfig(PretrainedConfig):
|
||||
@@ -75,10 +75,6 @@ class Qwen3_5MoeTextConfig(PretrainedConfig):
|
||||
eos_token_id=None,
|
||||
**kwargs,
|
||||
):
|
||||
-kwargs["ignore_keys_at_rope_validation"] = [
|
||||
-"mrope_section",
|
||||
-"mrope_interleaved",
|
||||
-]
|
||||
self.vocab_size = vocab_size
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.hidden_size = hidden_size
|
||||
@@ -104,7 +100,18 @@ class Qwen3_5MoeTextConfig(PretrainedConfig):
|
||||
else "full_attention"
|
||||
for i in range(self.num_hidden_layers)
|
||||
]
|
||||
-layer_type_validation(self.layer_types, self.num_hidden_layers)
|
||||
+if hasattr(self, "validate_layer_type"):
|
||||
+# Transformers v5
|
||||
+kwargs["ignore_keys_at_rope_validation"] = {
|
||||
+"mrope_section",
|
||||
+"mrope_interleaved",
|
||||
+}
|
||||
+self.validate_layer_type()
|
||||
+else:
|
||||
+# Transformers v4
|
||||
+from transformers.configuration_utils import layer_type_validation
|
||||
+
||||
+layer_type_validation(self.layer_types, self.num_hidden_layers)
|
||||
|
||||
# linear attention part
|
||||
self.linear_conv_kernel_dim = linear_conv_kernel_dim
|
||||
|
||||
@@ -16,7 +16,7 @@
|
||||
# limitations under the License.
|
||||
"""Qwen3-Next model configuration"""
|
||||
|
||||
-from transformers.configuration_utils import PretrainedConfig, layer_type_validation
|
||||
+from transformers.configuration_utils import PretrainedConfig
|
||||
from transformers.utils import logging
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
@@ -253,7 +253,14 @@ class Qwen3NextConfig(PretrainedConfig):
|
||||
"linear_attention" if bool((i + 1) % 4) else "full_attention"
|
||||
for i in range(self.num_hidden_layers)
|
||||
]
|
||||
-layer_type_validation(self.layer_types)
|
||||
+if hasattr(self, "validate_layer_type"):
|
||||
+# Transformers v5
|
||||
+self.validate_layer_type()
|
||||
+else:
|
||||
+# Transformers v4
|
||||
+from transformers.configuration_utils import layer_type_validation
|
||||
+
||||
+layer_type_validation(self.layer_types)
|
||||
|
||||
# linear attention part
|
||||
self.linear_conv_kernel_dim = linear_conv_kernel_dim
|
||||
|
||||
Reference in New Issue
Block a user