[V0 deprecation] Remove more V0 references (#29088)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
@@ -339,7 +339,6 @@ class LLM:
|
||||
|
||||
log_non_default_args(engine_args)
|
||||
|
||||
- # Create the Engine (autoselects V0 vs V1)
|
||||
self.llm_engine = LLMEngine.from_engine_args(
|
||||
engine_args=engine_args, usage_context=UsageContext.LLM_CLASS
|
||||
)
|
||||
|
||||
@@ -377,7 +377,7 @@ class ResponsesRequest(OpenAIBaseModel):
|
||||
"environments. The salt should be random, protected from "
|
||||
"access by 3rd parties, and long enough to be "
|
||||
"unpredictable (e.g., 43 characters base64-encoded, corresponding "
|
||||
- "to 256 bit). Not supported by vLLM engine V0."
|
||||
+ "to 256 bit)."
|
||||
),
|
||||
)
|
||||
|
||||
@@ -763,7 +763,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
|
||||
"environments. The salt should be random, protected from "
|
||||
"access by 3rd parties, and long enough to be "
|
||||
"unpredictable (e.g., 43 characters base64-encoded, corresponding "
|
||||
- "to 256 bit). Not supported by vLLM engine V0."
|
||||
+ "to 256 bit)."
|
||||
),
|
||||
)
|
||||
kv_transfer_params: dict[str, Any] | None = Field(
|
||||
@@ -1249,7 +1249,7 @@ class CompletionRequest(OpenAIBaseModel):
|
||||
"environments. The salt should be random, protected from "
|
||||
"access by 3rd parties, and long enough to be "
|
||||
"unpredictable (e.g., 43 characters base64-encoded, corresponding "
|
||||
- "to 256 bit). Not supported by vLLM engine V0."
|
||||
+ "to 256 bit)."
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@@ -590,7 +590,6 @@ class MambaMixer2(MambaBase, CustomOp):
|
||||
hidden_states, _B, _C = self.split_hidden_states_B_C_fn(hidden_states_B_C)
|
||||
return hidden_states
|
||||
|
||||
- # NOTE: V0 put prefill before decode, v1 puts decode before prefill
|
||||
num_prefills = attn_metadata.num_prefills # request count
|
||||
num_decodes = attn_metadata.num_decode_tokens # token count (=request)
|
||||
num_prefill_tokens = attn_metadata.num_prefill_tokens # token count
|
||||
|
||||
@@ -586,13 +586,11 @@ class IsHybrid(Protocol):
|
||||
def get_mamba_state_shape_from_config(
|
||||
cls,
|
||||
vllm_config: VllmConfig,
|
||||
- use_v1: bool = True,
|
||||
) -> tuple[tuple[int, int], tuple[int, int, int]]:
|
||||
"""Calculate shapes for Mamba's convolutional and state caches.
|
||||
|
||||
Args:
|
||||
vllm_config: vLLM config
|
||||
- use_v1: Get shapes for V1 (or V0)
|
||||
|
||||
Returns:
|
||||
Tuple containing:
|
||||
|
||||
@@ -290,7 +290,6 @@ class Plamo2MambaMixer(MambaBase, CustomOp):
|
||||
has_decode = num_decodes > 0
|
||||
num_actual_tokens = num_prefill_tokens + num_decodes
|
||||
|
||||
- # NOTE: V0 put prefill before decode, v1 puts decode before prefill
|
||||
# Separate prefill and decode by splitting varlen input
|
||||
# Split along token dimension
|
||||
hidden_states_d, hidden_states_p = torch.split(
|
||||
|
||||
Reference in New Issue
Block a user