diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py
index 773926bff..c35cdb121 100644
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -686,6 +686,7 @@ def get_model_params(config):
         "DeepseekV2ForCausalLM",
         "DeepseekV3ForCausalLM",
         "DeepseekV32ForCausalLM",
+        "GlmMoeDsaForCausalLM",
         "Glm4MoeForCausalLM",
         "Glm4MoeLiteForCausalLM",
         "NemotronHForCausalLM",
diff --git a/tests/models/registry.py b/tests/models/registry.py
index 8ae94d080..f688985ce 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -275,6 +275,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
         "zai-org/GLM-4.7-Flash",
         min_transformers_version="5.0.0",
     ),
+    "GlmMoeDsaForCausalLM": _HfExamplesInfo(
+        "zai-org/GLM-5", min_transformers_version="5.0.1", is_available_online=False
+    ),
     "GPT2LMHeadModel": _HfExamplesInfo("openai-community/gpt2", {"alias": "gpt2"}),
     "GPTBigCodeForCausalLM": _HfExamplesInfo(
         "bigcode/starcoder",
diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py
index 0e5272d50..4ee86416a 100644
--- a/tests/models/test_initialization.py
+++ b/tests/models/test_initialization.py
@@ -97,7 +97,7 @@ def can_initialize(
             "pickle error when loading `transformers.models.auto.CONFIG_MAPPING`"
         )
 
-    if model_arch == "DeepseekV32ForCausalLM":
+    if model_arch in ["DeepseekV32ForCausalLM", "GlmMoeDsaForCausalLM"]:
         from vllm.platforms import current_platform
 
         capability = current_platform.get_device_capability()
diff --git a/vllm/config/speculative.py b/vllm/config/speculative.py
index 5a2fe8eeb..8a54dbb6d 100644
--- a/vllm/config/speculative.py
+++ b/vllm/config/speculative.py
@@ -181,7 +181,7 @@ class SpeculativeConfig:
     @staticmethod
     def hf_config_override(hf_config: PretrainedConfig) -> PretrainedConfig:
         initial_architecture = hf_config.architectures[0]
-        if hf_config.model_type in ("deepseek_v3", "deepseek_v32"):
+        if hf_config.model_type in ("deepseek_v3", "deepseek_v32", "glm_moe_dsa"):
             hf_config.model_type = "deepseek_mtp"
         if hf_config.model_type == "deepseek_mtp":
             n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py
index 464518a3d..ab4f498b9 100644
--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -836,7 +836,7 @@ class DeepseekV2MLAAttention(nn.Module):
                 qk_rope_head_dim,
                 max_position=max_position_embeddings,
                 rope_parameters=config.rope_parameters,
-                is_neox_style=True,
+                is_neox_style=not getattr(config, "indexer_rope_interleave", False),
             )
             self.indexer = Indexer(
                 vllm_config,
@@ -1499,6 +1499,10 @@ class DeepseekV3ForCausalLM(DeepseekV2ForCausalLM):
     pass
 
 
+class GlmMoeDsaForCausalLM(DeepseekV2ForCausalLM):
+    pass
+
+
 # Compatibility with
 # https://huggingface.co/deepseek-ai/DeepSeek-V3-Base/blob/main/configuration_deepseek.py
 def get_spec_layer_idx_from_weight_name(
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index c310f6f17..6e68b24ba 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -114,6 +114,7 @@ _TEXT_GENERATION_MODELS = {
     "Glm4ForCausalLM": ("glm4", "Glm4ForCausalLM"),
     "Glm4MoeForCausalLM": ("glm4_moe", "Glm4MoeForCausalLM"),
     "Glm4MoeLiteForCausalLM": ("glm4_moe_lite", "Glm4MoeLiteForCausalLM"),
+    "GlmMoeDsaForCausalLM": ("deepseek_v2", "GlmMoeDsaForCausalLM"),
     "GptOssForCausalLM": ("gpt_oss", "GptOssForCausalLM"),
     "GPT2LMHeadModel": ("gpt2", "GPT2LMHeadModel"),
     "GPTBigCodeForCausalLM": ("gpt_bigcode", "GPTBigCodeForCausalLM"),
diff --git a/vllm/transformers_utils/model_arch_config_convertor.py b/vllm/transformers_utils/model_arch_config_convertor.py
index bd6b7376e..f82186639 100644
--- a/vllm/transformers_utils/model_arch_config_convertor.py
+++ b/vllm/transformers_utils/model_arch_config_convertor.py
@@ -237,6 +237,7 @@ class ModelArchConfigConvertorBase:
         "deepseek_v3",
         "deepseek_v32",
         "deepseek_mtp",
+        "glm_moe_dsa",
         "glm4_moe_lite",
         "glm4_moe_lite_mtp",
         "kimi_k2",