[V0 deprecation] Remove more V0 references (#29088)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung
2025-11-21 19:56:59 +08:00
committed by GitHub
parent b34129bf8e
commit aab0102a26
15 changed files with 31 additions and 75 deletions

View File

@@ -339,7 +339,6 @@ class LLM:
log_non_default_args(engine_args)
        # Create the V1 Engine (V0 has been removed)
self.llm_engine = LLMEngine.from_engine_args(
engine_args=engine_args, usage_context=UsageContext.LLM_CLASS
)

View File

@@ -377,7 +377,7 @@ class ResponsesRequest(OpenAIBaseModel):
"environments. The salt should be random, protected from "
"access by 3rd parties, and long enough to be "
"unpredictable (e.g., 43 characters base64-encoded, corresponding "
"to 256 bit). Not supported by vLLM engine V0."
"to 256 bit)."
),
)
@@ -763,7 +763,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
"environments. The salt should be random, protected from "
"access by 3rd parties, and long enough to be "
"unpredictable (e.g., 43 characters base64-encoded, corresponding "
"to 256 bit). Not supported by vLLM engine V0."
"to 256 bit)."
),
)
kv_transfer_params: dict[str, Any] | None = Field(
@@ -1249,7 +1249,7 @@ class CompletionRequest(OpenAIBaseModel):
"environments. The salt should be random, protected from "
"access by 3rd parties, and long enough to be "
"unpredictable (e.g., 43 characters base64-encoded, corresponding "
"to 256 bit). Not supported by vLLM engine V0."
"to 256 bit)."
),
)

View File

@@ -590,7 +590,6 @@ class MambaMixer2(MambaBase, CustomOp):
hidden_states, _B, _C = self.split_hidden_states_B_C_fn(hidden_states_B_C)
return hidden_states
        # NOTE: V1 puts decode before prefill (the removed V0 engine put prefill first)
num_prefills = attn_metadata.num_prefills # request count
num_decodes = attn_metadata.num_decode_tokens # token count (=request)
num_prefill_tokens = attn_metadata.num_prefill_tokens # token count

View File

@@ -586,13 +586,11 @@ class IsHybrid(Protocol):
def get_mamba_state_shape_from_config(
cls,
vllm_config: VllmConfig,
use_v1: bool = True,
) -> tuple[tuple[int, int], tuple[int, int, int]]:
"""Calculate shapes for Mamba's convolutional and state caches.
Args:
vllm_config: vLLM config
use_v1: Get shapes for V1 (or V0)
Returns:
Tuple containing:

View File

@@ -290,7 +290,6 @@ class Plamo2MambaMixer(MambaBase, CustomOp):
has_decode = num_decodes > 0
num_actual_tokens = num_prefill_tokens + num_decodes
        # NOTE: V1 puts decode before prefill (the removed V0 engine put prefill first)
# Separate prefill and decode by splitting varlen input
# Split along token dimension
hidden_states_d, hidden_states_p = torch.split(