[Model][Speculative Decoding] Expand DeepSeek MTP code to support k > n_predict (#13626)

Signed-off-by: Benjamin Chislett <benjamin.chislett@centml.ai>
2025-02-27 18:28:08 -05:00
parent 2e94b9cfbb
commit 9804145cac
6 changed files with 49 additions and 22 deletions
--- a/vllm/model_executor/models/deepseek_mtp.py
+++ b/vllm/model_executor/models/deepseek_mtp.py
@@ -87,7 +87,7 @@ class DeepSeekMultiTokenPredictorLayer(nn.Module):
                                                 hidden_states=hidden_states,
                                                 residual=None)
        hidden_states = residual + hidden_states
-        return self.shared_head(hidden_states)
+        return hidden_states


 class DeepSeekMultiTokenPredictor(nn.Module):
@@ -121,12 +121,13 @@ class DeepSeekMultiTokenPredictor(nn.Module):
        inputs_embeds: Optional[torch.Tensor] = None,
        spec_step_idx: int = 0,
    ) -> torch.Tensor:
-        return self.layers[str(self.mtp_start_layer_idx + spec_step_idx)](
+        current_step_idx = (spec_step_idx % self.num_mtp_layers)
+        return self.layers[str(self.mtp_start_layer_idx + current_step_idx)](
            input_ids,
            positions,
            previous_hidden_states,
            inputs_embeds,
-            spec_step_idx,
+            current_step_idx,
        )

    def compute_logits(
@@ -135,9 +136,12 @@ class DeepSeekMultiTokenPredictor(nn.Module):
        sampling_metadata: SamplingMetadata,
        spec_step_idx: int = 0,
    ) -> torch.Tensor:
-        mtp_layer = self.layers[str(self.mtp_start_layer_idx + spec_step_idx)]
+        current_step_idx = (spec_step_idx % self.num_mtp_layers)
+        mtp_layer = self.layers[str(self.mtp_start_layer_idx +
+                                    current_step_idx)]
        logits = self.logits_processor(mtp_layer.shared_head.head,
-                                       hidden_states, sampling_metadata)
+                                       mtp_layer.shared_head(hidden_states),
+                                       sampling_metadata)
        return logits