diff --git a/vllm/model_executor/models/mlp_speculator.py b/vllm/model_executor/models/mlp_speculator.py
index 48604d8e5..612baba8e 100644
--- a/vllm/model_executor/models/mlp_speculator.py
+++ b/vllm/model_executor/models/mlp_speculator.py
@@ -17,8 +17,6 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from .utils import maybe_prefix
 
-SQRT2 = 2**0.5
-
 
 class MLPSpeculatorLayerNorm(nn.Module):
     """
@@ -171,57 +169,6 @@ class MLPSpeculator(nn.Module):
             config.vocab_size, config.vocab_size, 1.0
         )
 
-    # NOTE(woosuk): This method is commented out because it is old code
-    # using V0. We should either port it to V1 or remove it.
-
-    # def generate_proposals(
-    #     self,
-    #     input_ids: torch.Tensor,
-    #     previous_hidden_states: torch.Tensor,
-    #     num_predict_tokens: int,
-    #     sampling_metadata: SamplingMetadata,
-    # ) -> list[SamplerOutput]:
-    #     if num_predict_tokens > self.max_speculative_tokens:
-    #         raise ValueError(f"Max speculative tokens for model is "
-    #                          f"{self.max_speculative_tokens}, but "
-    #                          f"{num_predict_tokens} were requested")
-
-    #     # b x 1 x d
-    #     previous_hidden_states = previous_hidden_states.unsqueeze(1)
-
-    #     if self.scale_input:
-    #         previous_hidden_states = self.ln0(previous_hidden_states) / SQRT2
-
-    #     # b x 1
-    #     last_tokens = input_ids.unsqueeze(1)
-
-    #     next_tokens = []
-
-    #     for head_index in range(num_predict_tokens):
-
-    #         # Project and predict
-    #         z = self.emb[head_index](last_tokens)  # b k d
-    #         states = self.proj[head_index](previous_hidden_states)
-
-    #         # Weighted add of state_weight*state and emb_weight*z
-    #         # Let subsequent LN take care of denominator
-    #         # state_weight is close to 1, so shouldn't be any precision issues
-    #         states.add_(z, alpha=self.emb_weight / self.state_weight)
-
-    #         states = self.activation(self.ln[head_index](states))  # b k d
-    #         previous_hidden_states = states
-    #         # TODO: not yet supporting top_k_tokens_per_head
-    #         states = states.flatten(0, 1)
-
-    #         logits = self.logits_processor(self.head[head_index], states,
-    #                                        sampling_metadata)
-
-    #         output = self.sampler(logits, sampling_metadata)
-    #         last_tokens = output.sampled_token_ids
-    #         next_tokens.append(output)
-
-    #     return next_tokens
-
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         params_dict = dict(self.named_parameters())
         loaded_params: set[str] = set()
diff --git a/vllm/v1/attention/ops/flashmla.py b/vllm/v1/attention/ops/flashmla.py
index aa667570a..df04f5bf2 100644
--- a/vllm/v1/attention/ops/flashmla.py
+++ b/vllm/v1/attention/ops/flashmla.py
@@ -151,16 +151,3 @@ def flash_mla_with_kvcache_fp8(
         descale_k,
     )
     return out, softmax_lse
-
-
-#
-# TODO: Add fake functions
-#
-# @register_fake("_flashmla_C::get_mla_metadata")
-# def _get_mla_metadata_fake(....) -> Tuple[torch.Tensor, torch.Tensor]:
-#     return ....
-#
-# @register_fake("_flashmla_C::fwd_kvcache_mla")
-# def _fwd_kvcache_mla_fake(....) -> Tuple[torch.Tensor, torch.Tensor]:
-#     return ....
-#
diff --git a/vllm/v1/executor/ray_distributed_executor.py b/vllm/v1/executor/ray_distributed_executor.py
deleted file mode 100644
index 9a56c093a..000000000
--- a/vllm/v1/executor/ray_distributed_executor.py
+++ /dev/null
@@ -1,8 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from vllm.v1.executor.ray_executor import (
-    RayDistributedExecutor as _RayDistributedExecutor,
-)
-
-# For backwards compatibility.
-RayDistributedExecutor = _RayDistributedExecutor