[Refactor] Remove unused dead code (#38842)

commit 4ae218c122 (parent f40d9879f2)
Author: Wentao Ye
Date:   2026-04-06 11:52:05 -04:00
Committed-by: GitHub

Signed-off-by: yewentao256 <zhyanwentao@126.com>

3 changed files with 0 additions and 74 deletions


@@ -17,8 +17,6 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from .utils import maybe_prefix
SQRT2 = 2**0.5
class MLPSpeculatorLayerNorm(nn.Module):
"""
@@ -171,57 +169,6 @@ class MLPSpeculator(nn.Module):
config.vocab_size, config.vocab_size, 1.0
)
# NOTE(woosuk): This method is commented out because it is old code
# using V0. We should either port it to V1 or remove it.
# def generate_proposals(
# self,
# input_ids: torch.Tensor,
# previous_hidden_states: torch.Tensor,
# num_predict_tokens: int,
# sampling_metadata: SamplingMetadata,
# ) -> list[SamplerOutput]:
# if num_predict_tokens > self.max_speculative_tokens:
# raise ValueError(f"Max speculative tokens for model is "
# f"{self.max_speculative_tokens}, but "
# f"{num_predict_tokens} were requested")
# # b x 1 x d
# previous_hidden_states = previous_hidden_states.unsqueeze(1)
# if self.scale_input:
# previous_hidden_states = self.ln0(previous_hidden_states) / SQRT2
# # b x 1
# last_tokens = input_ids.unsqueeze(1)
# next_tokens = []
# for head_index in range(num_predict_tokens):
# # Project and predict
# z = self.emb[head_index](last_tokens) # b k d
# states = self.proj[head_index](previous_hidden_states)
# # Weighted add of state_weight*state and emb_weight*z
# # Let subsequent LN take care of denominator
# # state_weight is close to 1, so shouldn't be any precision issues
# states.add_(z, alpha=self.emb_weight / self.state_weight)
# states = self.activation(self.ln[head_index](states)) # b k d
# previous_hidden_states = states
# # TODO: not yet supporting top_k_tokens_per_head
# states = states.flatten(0, 1)
# logits = self.logits_processor(self.head[head_index], states,
# sampling_metadata)
# output = self.sampler(logits, sampling_metadata)
# last_tokens = output.sampled_token_ids
# next_tokens.append(output)
# return next_tokens
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
params_dict = dict(self.named_parameters())
loaded_params: set[str] = set()
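For context on what was dropped: the core of the removed `generate_proposals` loop combined the projected hidden state with the token embedding via a single in-place weighted add, letting the following LayerNorm absorb the overall scale. A minimal standalone sketch of that step, with hypothetical sizes and weights rather than vLLM's actual module, looks like this:

```python
import torch
import torch.nn as nn

# Hypothetical sizes and mixing weights, for illustration only.
batch, hidden, vocab = 4, 64, 1000
emb_weight, state_weight = 0.5, 0.9

emb = nn.Embedding(vocab, hidden)   # stand-in for self.emb[head_index]
proj = nn.Linear(hidden, hidden)    # stand-in for self.proj[head_index]
ln = nn.LayerNorm(hidden)           # stand-in for self.ln[head_index]

last_tokens = torch.randint(0, vocab, (batch, 1))  # b x 1
prev_hidden = torch.randn(batch, 1, hidden)        # b x 1 x d

z = emb(last_tokens)          # b x 1 x d
states = proj(prev_hidden)    # b x 1 x d
# One fused weighted add: states += (emb_weight / state_weight) * z.
# Only the ratio matters here; the LayerNorm below normalizes the scale.
states.add_(z, alpha=emb_weight / state_weight)
states = torch.nn.functional.gelu(ln(states))      # next head's input
```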


@@ -151,16 +151,3 @@ def flash_mla_with_kvcache_fp8(
descale_k,
)
return out, softmax_lse
#
# TODO: Add fake functions
#
# @register_fake("_flashmla_C::get_mla_metadata")
# def _get_mla_metadata_fake(....) -> Tuple[torch.Tensor, torch.Tensor]:
# return ....
#
# @register_fake("_flashmla_C::fwd_kvcache_mla")
# def _fwd_kvcache_mla_fake(....) -> Tuple[torch.Tensor, torch.Tensor]:
# return ....
#
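The deleted TODO pointed at fake (meta) kernels that were never written. For reference, registering a fake implementation for a custom op generally follows the pattern below — sketched with a hypothetical op, since the real `_flashmla_C` signatures are not shown in this diff. A fake kernel only propagates shapes and dtypes so `torch.compile` can trace the op without running it:

```python
import torch

# Hypothetical custom op; the real _flashmla_C ops differ.
@torch.library.custom_op("mylib::toy_mla_fwd", mutates_args=())
def toy_mla_fwd(q: torch.Tensor, kv: torch.Tensor) -> torch.Tensor:
    # Eager reference implementation (a real kernel would live in C++/CUDA).
    return q @ kv.transpose(-1, -2)

@torch.library.register_fake("mylib::toy_mla_fwd")
def _toy_mla_fwd_fake(q: torch.Tensor, kv: torch.Tensor) -> torch.Tensor:
    # No computation: just allocate an output with the right shape/dtype.
    return q.new_empty(*q.shape[:-1], kv.shape[-2])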


@@ -1,8 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm.v1.executor.ray_executor import (
RayDistributedExecutor as _RayDistributedExecutor,
)
# For backwards compatibility.
RayDistributedExecutor = _RayDistributedExecutor
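The deleted file was a pure re-export shim kept for backwards compatibility. When a removal like this needs a deprecation period instead of an outright delete, a module-level `__getattr__` (PEP 562) can keep the old import path alive while warning callers — a sketch with hypothetical module names, not vLLM's actual layout:

```python
# old_pkg/ray_shim.py -- hypothetical path, for illustration only.
import warnings

def __getattr__(name: str):
    if name == "RayDistributedExecutor":
        warnings.warn(
            "Importing RayDistributedExecutor from this module is "
            "deprecated; import it from its new home instead.",
            DeprecationWarning,
            stacklevel=2,
        )
        from new_pkg.ray_executor import RayDistributedExecutor  # hypothetical
        return RayDistributedExecutor
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
```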