diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 1c9576425..4e5abea8e 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -471,7 +471,7 @@ th { | `StableLMEpochForCausalLM` | StableLM Epoch | `stabilityai/stablelm-zephyr-3b`, etc. | | ✅︎ | | `Starcoder2ForCausalLM` | Starcoder2 | `bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc. | | ✅︎ | | `Step1ForCausalLM` | Step-Audio | `stepfun-ai/Step-Audio-EditX`, etc. | ✅︎ | ✅︎ | -| `Step3p5ForCausalLM` | Step-3.5-flash | `stepfun-ai/step-3.5-flash`, etc. | | ✅︎ | +| `Step3p5ForCausalLM` | Step-3.5-flash | `stepfun-ai/Step-3.5-Flash`, etc. | | ✅︎ | | `TeleChatForCausalLM` | TeleChat | `chuhac/TeleChat2-35B`, etc. | ✅︎ | ✅︎ | | `TeleChat2ForCausalLM` | TeleChat2 | `Tele-AI/TeleChat2-3B`, `Tele-AI/TeleChat2-7B`, `Tele-AI/TeleChat2-35B`, etc. | ✅︎ | ✅︎ | | `TeleFLMForCausalLM` | TeleFLM | `CofeAI/FLM-2-52B-Instruct-2407`, `CofeAI/Tele-FLM`, etc. | ✅︎ | ✅︎ | diff --git a/tests/models/registry.py b/tests/models/registry.py index 69da8c7af..8ae94d080 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -481,16 +481,21 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { "ByteDance-Seed/Seed-OSS-36B-Instruct", trust_remote_code=True, ), - "Step1ForCausalLM": _HfExamplesInfo( - "stepfun-ai/Step-Audio-EditX", trust_remote_code=True - ), - "Step3p5ForCausalLM": _HfExamplesInfo( - "stepfun-ai/step-3.5-flash", is_available_online=False - ), "SmolLM3ForCausalLM": _HfExamplesInfo("HuggingFaceTB/SmolLM3-3B"), "StableLMEpochForCausalLM": _HfExamplesInfo("stabilityai/stablelm-zephyr-3b"), "StableLmForCausalLM": _HfExamplesInfo("stabilityai/stablelm-3b-4e1t"), "Starcoder2ForCausalLM": _HfExamplesInfo("bigcode/starcoder2-3b"), + "Step1ForCausalLM": _HfExamplesInfo( + "stepfun-ai/Step-Audio-EditX", trust_remote_code=True + ), + "Step3p5ForCausalLM": _HfExamplesInfo( + "stepfun-ai/Step-3.5-Flash", + use_original_num_layers=True, + # Initialize at least one MoE layer + hf_overrides={ + "num_hidden_layers": 4, + }, + ), "Step3TextForCausalLM": _HfExamplesInfo("stepfun-ai/step3", trust_remote_code=True), "SolarForCausalLM": _HfExamplesInfo( "upstage/solar-pro-preview-instruct", trust_remote_code=True @@ -1129,8 +1134,12 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = { ), "Step3p5MTP": _HfExamplesInfo( "stepfun-ai/Step-3.5-Flash", - trust_remote_code=True, speculative_model="stepfun-ai/Step-3.5-Flash", + use_original_num_layers=True, + # Initialize at least one MoE layer + hf_overrides={ + "num_hidden_layers": 4, + }, is_available_online=False, ), } diff --git a/vllm/model_executor/models/step3p5.py b/vllm/model_executor/models/step3p5.py index 8019dbdbe..195cfcedd 100644 --- a/vllm/model_executor/models/step3p5.py +++ b/vllm/model_executor/models/step3p5.py @@ -36,7 +36,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( - DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding, ) @@ -770,37 +769,17 @@ class Step3p5ForCausalLM(nn.Module, SupportsPP, MixtureOfExperts): ): super().__init__() config = vllm_config.model_config.hf_config - lora_config = vllm_config.lora_config - self.config = config - self.vllm_config = vllm_config - self.model = Step3p5Model( vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model") ) - - self.moe_layers: list[FusedMoEBlock] = [] - for layer in self.model.layers: - if isinstance(layer, PPMissingLayer): - continue - assert isinstance(layer, Step3p5DecoderLayer) - if hasattr(layer, "moe") and isinstance(layer.moe, FusedMoEBlock): - self.moe_layers.append(layer.moe) - if get_pp_group().is_last_rank: - self.unpadded_vocab_size = config.vocab_size - if lora_config: - self.unpadded_vocab_size += lora_config.lora_extra_vocab_size 
self.lm_head = ParallelLMHead( - self.unpadded_vocab_size, + config.vocab_size, config.hidden_size, - org_num_embeddings=config.vocab_size, - padding_size=DEFAULT_VOCAB_PADDING_SIZE - if not lora_config - else lora_config.lora_vocab_padding_size, - ) - self.logits_processor = LogitsProcessor( - self.unpadded_vocab_size, config.vocab_size + quant_config=vllm_config.quant_config, + prefix=maybe_prefix(prefix, "lm_head"), ) + self.logits_processor = LogitsProcessor(config.vocab_size) else: self.lm_head = PPMissingLayer() @@ -809,6 +788,14 @@ class Step3p5ForCausalLM(nn.Module, SupportsPP, MixtureOfExperts): ) # Set MoE hyperparameters + self.moe_layers: list[FusedMoEBlock] = [] + for layer in self.model.layers: + if isinstance(layer, PPMissingLayer): + continue + assert isinstance(layer, Step3p5DecoderLayer) + if hasattr(layer, "moe") and isinstance(layer.moe, FusedMoEBlock): + self.moe_layers.append(layer.moe) + self.expert_weights = [] assert len(self.moe_layers) > 0, "No MoE layers found in the model." example_layer = self.moe_layers[0]