[V0 deprecation] Remove more V0 references (#29088)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung
2025-11-21 19:56:59 +08:00
committed by GitHub
parent b34129bf8e
commit aab0102a26
15 changed files with 31 additions and 75 deletions

View File

@@ -339,7 +339,6 @@ class LLM:
log_non_default_args(engine_args)
        # Create the V1 Engine (V0 has been removed)
self.llm_engine = LLMEngine.from_engine_args(
engine_args=engine_args, usage_context=UsageContext.LLM_CLASS
)

View File

@@ -377,7 +377,7 @@ class ResponsesRequest(OpenAIBaseModel):
"environments. The salt should be random, protected from "
"access by 3rd parties, and long enough to be "
"unpredictable (e.g., 43 characters base64-encoded, corresponding "
"to 256 bit). Not supported by vLLM engine V0."
"to 256 bit)."
),
)
@@ -763,7 +763,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
"environments. The salt should be random, protected from "
"access by 3rd parties, and long enough to be "
"unpredictable (e.g., 43 characters base64-encoded, corresponding "
"to 256 bit). Not supported by vLLM engine V0."
"to 256 bit)."
),
)
kv_transfer_params: dict[str, Any] | None = Field(
@@ -1249,7 +1249,7 @@ class CompletionRequest(OpenAIBaseModel):
"environments. The salt should be random, protected from "
"access by 3rd parties, and long enough to be "
"unpredictable (e.g., 43 characters base64-encoded, corresponding "
"to 256 bit). Not supported by vLLM engine V0."
"to 256 bit)."
),
)

View File

@@ -590,7 +590,6 @@ class MambaMixer2(MambaBase, CustomOp):
hidden_states, _B, _C = self.split_hidden_states_B_C_fn(hidden_states_B_C)
return hidden_states
        # NOTE: V1 puts decode before prefill (the removed V0 engine put prefill first)
num_prefills = attn_metadata.num_prefills # request count
num_decodes = attn_metadata.num_decode_tokens # token count (=request)
num_prefill_tokens = attn_metadata.num_prefill_tokens # token count

View File

@@ -586,13 +586,11 @@ class IsHybrid(Protocol):
def get_mamba_state_shape_from_config(
cls,
vllm_config: VllmConfig,
use_v1: bool = True,
) -> tuple[tuple[int, int], tuple[int, int, int]]:
"""Calculate shapes for Mamba's convolutional and state caches.
Args:
vllm_config: vLLM config
use_v1: Get shapes for V1 (or V0)
Returns:
Tuple containing:

View File

@@ -290,7 +290,6 @@ class Plamo2MambaMixer(MambaBase, CustomOp):
has_decode = num_decodes > 0
num_actual_tokens = num_prefill_tokens + num_decodes
        # NOTE: V1 puts decode before prefill (the removed V0 engine put prefill first)
# Separate prefill and decode by splitting varlen input
# Split along token dimension
hidden_states_d, hidden_states_p = torch.split(