# SPDX-License-Identifier: Apache-2.0
|
|
|
|
import time
|
|
from dataclasses import dataclass, field
|
|
from typing import TYPE_CHECKING, List
|
|
|
|
if TYPE_CHECKING:
|
|
from vllm.outputs import RequestOutput
|
|
from vllm.v1.engine import EngineCoreOutput, FinishReason
|
|
|
|
|
|
@dataclass
class PrefixCacheStats:
    """Prefix-cache hit statistics accumulated over one update interval."""

    # True if reset_prefix_cache was invoked during this interval.
    reset: bool = False
    # How many requests are covered by this update.
    requests: int = 0
    # How many cache lookups those requests performed. A "query" here is
    # one block looked up in the cache (block granularity, not tokens).
    queries: int = 0
    # How many of those block lookups were cache hits.
    hits: int = 0
|
|
|
|
|
|
@dataclass
class SchedulerStats:
    """Point-in-time stats snapshot taken from the scheduler."""

    # Number of requests currently running.
    num_running_reqs: int = 0
    # Number of requests currently waiting to be scheduled.
    num_waiting_reqs: int = 0

    # Fraction of the GPU KV cache in use.
    gpu_cache_usage: float = 0.0

    # Prefix-cache hit counters for this snapshot; a fresh, empty
    # PrefixCacheStats is created per instance via default_factory.
    prefix_cache_stats: PrefixCacheStats = field(
        default_factory=PrefixCacheStats)
|
|
|
|
|
|
@dataclass
class RequestStateStats:
    """Per-request counters that persist across delta updates."""

    # Generation tokens produced so far for this request.
    num_generation_tokens: int = 0
    # Wall-clock timestamp (time.time()) of the most recent token.
    last_token_time: float = 0.0
|
|
|
|
|
|
@dataclass
class FinishedRequestStats:
    """Final stats recorded for a request once it has finished."""

    # Why the request finished (required; no default).
    finish_reason: "FinishReason"
    # Length of the request's prompt, in tokens.
    num_prompt_tokens: int = 0
    # Total generation tokens produced over the request's lifetime.
    num_generation_tokens: int = 0
|
|
|
|
|
|
class IterationStats:
    """Stats associated with a single set of EngineCoreOutputs."""

    def __init__(self, log_stats: bool):
        # When False, update_from_output is a no-op and all counters
        # stay at their initial values.
        self.log_stats = log_stats
        self.num_generation_tokens = 0
        self.num_prompt_tokens = 0
        self.finished_requests: List[FinishedRequestStats] = []
        # Per-iteration latency samples: time-to-first-token for requests
        # finishing prefill, inter-token latency for decoding requests.
        self.time_to_first_tokens_iter: List[float] = []
        self.time_per_output_tokens_iter: List[float] = []

    def update_from_output(self, output: "EngineCoreOutput",
                           is_prefilling: bool, prompt_len: int,
                           request_state_stats: RequestStateStats):
        """Fold one EngineCoreOutput into the iteration counters and
        advance the request's persistent state."""
        if not self.log_stats:
            return

        now = time.time()
        new_tokens = len(output.new_token_ids)
        token_latency = now - request_state_stats.last_token_time

        self.num_generation_tokens += new_tokens
        if not is_prefilling:
            # Decoding: the elapsed time is an inter-token latency sample.
            self.time_per_output_tokens_iter.append(token_latency)
        elif new_tokens > 0:
            # TODO(andy): we used to assert new_tokens > 0 here, relying
            # on the invariant that EngineCore does not stream outputs for
            # partially completed prefills (scheduler.update_from_output
            # makes EngineCoreOutput iff num_computed_tokens == num_tokens).
            # With prompt logprobs enabled the partially completed prompt
            # is currently streamed out, so we guard instead; a follow-up
            # PR should restore the assertion / invariant.
            self.num_prompt_tokens += prompt_len
            self.time_to_first_tokens_iter.append(token_latency)

        request_state_stats.num_generation_tokens += new_tokens
        request_state_stats.last_token_time = now

    def update_from_finished_request(self, finish_reason: "FinishReason",
                                     request_output: "RequestOutput",
                                     request_state_stats: RequestStateStats):
        """Record the final stats for a request that just finished."""
        finished = FinishedRequestStats(
            finish_reason,
            num_prompt_tokens=len(request_output.prompt_token_ids),
            num_generation_tokens=request_state_stats.num_generation_tokens)
        self.finished_requests.append(finished)