diff --git a/vllm/model_executor/models/deepseek_mtp.py b/vllm/model_executor/models/deepseek_mtp.py index b8b73bd24..8fb2bfb16 100644 --- a/vllm/model_executor/models/deepseek_mtp.py +++ b/vllm/model_executor/models/deepseek_mtp.py @@ -32,7 +32,6 @@ from .deepseek_v2 import ( DeepseekV2MoE, get_spec_layer_idx_from_weight_name, ) -from .interfaces import SupportsPP from .utils import maybe_prefix logger = init_logger(__name__) @@ -181,7 +180,7 @@ class DeepSeekMultiTokenPredictor(nn.Module): @support_torch_compile -class DeepSeekMTP(nn.Module, SupportsPP, DeepseekV2MixtureOfExperts): +class DeepSeekMTP(nn.Module, DeepseekV2MixtureOfExperts): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() self.config = vllm_config.model_config.hf_config diff --git a/vllm/model_executor/models/ernie_mtp.py b/vllm/model_executor/models/ernie_mtp.py index 1b9abc357..05c4277b1 100644 --- a/vllm/model_executor/models/ernie_mtp.py +++ b/vllm/model_executor/models/ernie_mtp.py @@ -39,7 +39,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.sequence import IntermediateTensors -from .interfaces import SupportsPP from .llama import LlamaDecoderLayer from .utils import is_pp_missing_parameter, maybe_prefix @@ -143,7 +142,7 @@ class ErnieMultiTokenPredictor(nn.Module): return logits -class ErnieMTP(nn.Module, SupportsPP): +class ErnieMTP(nn.Module): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() diff --git a/vllm/model_executor/models/glm4_moe_mtp.py b/vllm/model_executor/models/glm4_moe_mtp.py index 08e3a7837..ce674b389 100644 --- a/vllm/model_executor/models/glm4_moe_mtp.py +++ b/vllm/model_executor/models/glm4_moe_mtp.py @@ -47,7 +47,6 @@ from .glm4_moe import ( Glm4MoeDecoderLayer, get_spec_layer_idx_from_weight_name, ) -from .interfaces import SupportsPP from .utils import maybe_prefix @@ -184,7 +183,7 @@ class Glm4MoeMultiTokenPredictor(nn.Module): return logits -class Glm4MoeMTP(nn.Module, SupportsPP, Glm4MixtureOfExperts): +class Glm4MoeMTP(nn.Module, Glm4MixtureOfExperts): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() self.config = vllm_config.model_config.hf_config diff --git a/vllm/model_executor/models/longcat_flash_mtp.py b/vllm/model_executor/models/longcat_flash_mtp.py index e554d1e2d..f96d3cf28 100644 --- a/vllm/model_executor/models/longcat_flash_mtp.py +++ b/vllm/model_executor/models/longcat_flash_mtp.py @@ -24,7 +24,6 @@ from vllm.model_executor.models.longcat_flash import FlashConfig from vllm.sequence import IntermediateTensors from .deepseek_v2 import DeepseekV2DecoderLayer -from .interfaces import SupportsPP from .utils import maybe_prefix @@ -124,7 +123,7 @@ class LongCatMultiTokenPredictor(nn.Module): ) -class LongCatFlashMTP(nn.Module, SupportsPP): +class LongCatFlashMTP(nn.Module): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() # LongCat MTP without MoE layers diff --git a/vllm/model_executor/models/openpangu_mtp.py b/vllm/model_executor/models/openpangu_mtp.py index e2cea29c2..273351051 100644 --- a/vllm/model_executor/models/openpangu_mtp.py +++ b/vllm/model_executor/models/openpangu_mtp.py @@ -43,7 +43,6 @@ from vllm.model_executor.models.deepseek_mtp import ( from vllm.model_executor.models.utils import maybe_prefix from vllm.sequence import IntermediateTensors -from .interfaces import SupportsPP from .openpangu import OpenPanguDecoderLayer @@ -92,7 +91,7 @@ class OpenPanguMultiTokenPredictor(DeepSeekMultiTokenPredictor): @support_torch_compile -class OpenPanguMTP(nn.Module, SupportsPP): +class OpenPanguMTP(nn.Module): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() self.config = vllm_config.model_config.hf_config diff --git a/vllm/model_executor/models/qwen3_next_mtp.py b/vllm/model_executor/models/qwen3_next_mtp.py index c07ed5932..565fd7d8f 100644 --- a/vllm/model_executor/models/qwen3_next_mtp.py +++ b/vllm/model_executor/models/qwen3_next_mtp.py @@ -27,7 +27,6 @@ from vllm.model_executor.models.qwen3_next import ( from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs import Qwen3NextConfig -from .interfaces import SupportsPP from .utils import ( AutoWeightsLoader, is_pp_missing_parameter, @@ -221,7 +220,7 @@ class Qwen3NextMultiTokenPredictor(nn.Module): @support_torch_compile -class Qwen3NextMTP(nn.Module, SupportsPP, QwenNextMixtureOfExperts): +class Qwen3NextMTP(nn.Module, QwenNextMixtureOfExperts): packed_modules_mapping = { "qkv_proj": [ "q_proj", @@ -253,9 +252,6 @@ class Qwen3NextMTP(nn.Module, SupportsPP, QwenNextMixtureOfExperts): prefix=maybe_prefix(prefix, "lm_head"), ) self.logits_processor = LogitsProcessor(config.vocab_size) - self.make_empty_intermediate_tensors = ( - self.model.make_empty_intermediate_tensors - ) self.set_moe_parameters() def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor: