[V0 Deprecation] Remove V0 Sequence class & Sampler (#25332)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai>
@@ -1,13 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """A layer that compute logits from hidden_stats."""
-import inspect
-from concurrent.futures import ThreadPoolExecutor
 from typing import Optional
 
 import torch
 
-import vllm.envs as envs
 from vllm.distributed import (tensor_model_parallel_all_gather,
                               tensor_model_parallel_gather)
 from vllm.model_executor.custom_op import CustomOp
@@ -16,11 +13,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.platforms import current_platform
 
-_logits_processor_threadpool: Optional[ThreadPoolExecutor] = None
-if envs.VLLM_LOGITS_PROCESSOR_THREADS is not None:
-    _logits_processor_threadpool = ThreadPoolExecutor(
-        envs.VLLM_LOGITS_PROCESSOR_THREADS)
 
 
 @CustomOp.register("logits_processor")
 class LogitsProcessor(CustomOp):
@@ -60,15 +52,10 @@ class LogitsProcessor(CustomOp):
         hidden_states: torch.Tensor,
-        sampling_metadata: Optional[SamplingMetadata] = None,
         embedding_bias: Optional[torch.Tensor] = None,
-        prune_hidden_states: bool = True,
     ) -> Optional[torch.Tensor]:
         if self.logits_as_input:
             logits = hidden_states
         else:
-            if sampling_metadata is not None and prune_hidden_states:
-                hidden_states = _prune_hidden_states(hidden_states,
-                                                     sampling_metadata)
 
             # Get the logits for the next tokens.
             logits = self._get_logits(hidden_states, lm_head, embedding_bias)
         if logits is not None:
@@ -79,12 +66,6 @@ class LogitsProcessor(CustomOp):
 
             if self.scale != 1.0:
                 logits *= self.scale
-
-            # Apply logits processors (if any).
-            if sampling_metadata is not None and \
-                    sampling_metadata.seq_groups is not None:
-                logits = _apply_logits_processors(logits, sampling_metadata)
-
         return logits
 
     def _gather_logits(self, logits: torch.Tensor) -> torch.Tensor:
@@ -125,75 +106,3 @@ class LogitsProcessor(CustomOp):
         s += f", org_vocab_size={self.org_vocab_size}"
         s += f", scale={self.scale}, logits_as_input={self.logits_as_input}"
         return s
-
-
-def _prune_hidden_states(
-    hidden_states: torch.Tensor,
-    sampling_metadata: SamplingMetadata,
-) -> torch.Tensor:
-    # NOTE(kzawora): The if guard is needed for Gaudi - in some scenarios
-    # (warmup, profile_run) we might not have selected_token_indices,
-    # so we skip pruning.
-    if sampling_metadata.selected_token_indices is not None:
-        return hidden_states.index_select(
-            0, sampling_metadata.selected_token_indices)
-    else:
-        return hidden_states
-
-
-def _apply_logits_processors(
-    logits: torch.Tensor,
-    sampling_metadata: SamplingMetadata,
-) -> torch.Tensor:
-    found_logits_processors = False
-    logits_processed = 0
-    logits_row_ids_and_logits_row_futures = []
-    for seq_group in sampling_metadata.seq_groups:
-        seq_ids = seq_group.seq_ids
-        sampling_params = seq_group.sampling_params
-        logits_processors = sampling_params.logits_processors
-        if logits_processors:
-            found_logits_processors = True
-
-            for seq_id, logits_row_idx in zip(seq_ids,
-                                              seq_group.sample_indices):
-                logits_row = logits[logits_row_idx]
-                past_tokens_ids = seq_group.seq_data[seq_id].output_token_ids
-                prompt_tokens_ids = seq_group.seq_data[seq_id].prompt_token_ids
-
-                if _logits_processor_threadpool is not None:
-                    logits_row_ids_and_logits_row_futures.append(
-                        (logits_row_idx,
-                         _logits_processor_threadpool.submit(
-                             _apply_logits_processors_single_seq, logits_row,
-                             logits_processors, past_tokens_ids,
-                             prompt_tokens_ids)))
-                else:
-                    logits[logits_row_idx] = \
-                        _apply_logits_processors_single_seq(
-                            logits_row, logits_processors, past_tokens_ids,
-                            prompt_tokens_ids)
-
-        logits_processed += len(seq_group.sample_indices) + len(
-            seq_group.prompt_logprob_indices)
-
-    for logits_row_idx, future in logits_row_ids_and_logits_row_futures:
-        logits[logits_row_idx] = future.result()
-
-    if found_logits_processors:
-        # verifies that no rows in logits were missed unexpectedly
-        assert logits_processed == logits.shape[0]
-    return logits
-
-
-def _apply_logits_processors_single_seq(logits_row, logits_processors,
-                                        past_tokens_ids,
-                                        prompt_tokens_ids) -> torch.Tensor:
-    for logits_processor in logits_processors:
-        parameters = inspect.signature(logits_processor).parameters
-        if len(parameters) == 3:
-            logits_row = logits_processor(prompt_tokens_ids, past_tokens_ids,
-                                          logits_row)
-        else:
-            logits_row = logits_processor(past_tokens_ids, logits_row)
-    return logits_row
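Note: the removed _prune_hidden_states helper kept only the hidden-state rows whose logits would actually be sampled, via torch.Tensor.index_select along dimension 0. A minimal standalone sketch of that operation (toy shapes, not vLLM code):

import torch

hidden_states = torch.arange(12.0).reshape(4, 3)  # 4 token positions, hidden size 3
selected_token_indices = torch.tensor([1, 3])     # stand-in for the positions to keep
pruned = hidden_states.index_select(0, selected_token_indices)  # keeps rows 1 and 3
assert pruned.shape == (2, 3)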
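Note: when VLLM_LOGITS_PROCESSOR_THREADS was set, the removed _apply_logits_processors submitted each logits row to a module-level ThreadPoolExecutor and later wrote each future.result() back into the tensor. A simplified sketch of that fan-out/fan-in pattern (illustrative only; process_row is a hypothetical stand-in for the per-sequence work):

from concurrent.futures import ThreadPoolExecutor

import torch

pool = ThreadPoolExecutor(max_workers=4)  # stand-in for the env-gated pool

def process_row(row: torch.Tensor) -> torch.Tensor:
    row[0] = float("-inf")  # placeholder per-row edit
    return row

logits = torch.randn(8, 128)
futures = [(i, pool.submit(process_row, logits[i]))   # fan-out: one task per row
           for i in range(logits.shape[0])]
for i, future in futures:
    logits[i] = future.result()  # fan-in: write processed rows back in place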
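Note: the removed _apply_logits_processors_single_seq dispatched on callable arity via inspect.signature: a three-parameter processor received (prompt_token_ids, past_token_ids, logits_row), otherwise it received (past_token_ids, logits_row). A hedged sketch of two user-side processors matching those signatures (ban_token_42 and penalize_prompt_tokens are hypothetical names, not vLLM APIs):

import torch

def ban_token_42(past_token_ids, logits_row: torch.Tensor) -> torch.Tensor:
    # Two-argument form: sees only the tokens generated so far.
    logits_row[42] = float("-inf")  # make token ID 42 unsampleable
    return logits_row

def penalize_prompt_tokens(prompt_token_ids, past_token_ids,
                           logits_row: torch.Tensor) -> torch.Tensor:
    # Three-argument form: additionally sees the prompt token IDs.
    for token_id in prompt_token_ids:
        logits_row[token_id] -= 1.0  # mild penalty for tokens in the prompt
    return logits_row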