vllm/vllm/v1/metrics/stats.py

# SPDX-License-Identifier: Apache-2.0

import time
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, List

if TYPE_CHECKING:
    from vllm.outputs import RequestOutput
    from vllm.v1.engine import EngineCoreOutput, FinishReason


@dataclass
class PrefixCacheStats:
    """Stores prefix cache hit statistics."""
    # Whether reset_prefix_cache was invoked.
    reset: bool = False
    # The number of requests in this update.
    requests: int = 0
    # The number of queries in these requests. Note that "queries" here
    # means the number of blocks that were queried from the cache.
    queries: int = 0
    # The number of hits in these requests.
    hits: int = 0
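
# A minimal sketch of how a consumer might turn these counters into a hit
# rate (illustrative only; the metrics loggers that actually read
# PrefixCacheStats are defined elsewhere under vllm/v1/metrics):
#
#     hit_rate = stats.hits / stats.queries if stats.queries else 0.0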


@dataclass
class SchedulerStats:
    """Stats associated with the scheduler."""
    num_running_reqs: int = 0
    num_waiting_reqs: int = 0
    gpu_cache_usage: float = 0.0
    prefix_cache_stats: PrefixCacheStats = field(
        default_factory=PrefixCacheStats)


@dataclass
class RequestStateStats:
    """Stats that need to be tracked across delta updates."""
    num_generation_tokens: int = 0
    # Wall-clock time of the most recent token for this request; used by
    # IterationStats.update_from_output to compute per-token latencies.
    last_token_time: float = 0.0


@dataclass
class FinishedRequestStats:
    """Stats associated with a finished request."""
    finish_reason: "FinishReason"
    num_prompt_tokens: int = 0
    num_generation_tokens: int = 0


class IterationStats:
    """Stats associated with a single set of EngineCoreOutputs."""

    def __init__(self, log_stats: bool):
        self.log_stats = log_stats
        self.num_generation_tokens = 0
        self.num_prompt_tokens = 0
        self.finished_requests: List[FinishedRequestStats] = []
        self.time_to_first_tokens_iter: List[float] = []
        self.time_per_output_tokens_iter: List[float] = []

    def update_from_output(self, output: "EngineCoreOutput",
                           is_prefilling: bool, prompt_len: int,
                           request_state_stats: RequestStateStats):
        if not self.log_stats:
            return

        num_new_generation_tokens = len(output.new_token_ids)
        now = time.time()
        last_token_latency = now - request_state_stats.last_token_time

        self.num_generation_tokens += num_new_generation_tokens
        if is_prefilling:
            # TODO(andy): we used to assert that num_new_generation_tokens
            # > 0 with an invariant that EngineCore does not stream outputs
            # for partially completed prefills (scheduler.update_from_output
            # makes EngineCoreOutput iff num_computed_tokens == num_tokens).
            # When prompt logprobs are enabled, we currently stream out the
            # partially completed prompt.
            # This will be reverted in a follow-up PR and we should re-enable
            # this assertion / invariant.
            if num_new_generation_tokens > 0:
                # The first new token marks the end of prefill: record the
                # prompt length and a time-to-first-token sample.
                self.num_prompt_tokens += prompt_len
                self.time_to_first_tokens_iter.append(last_token_latency)
        else:
            # Decode step: record an inter-token latency sample.
            self.time_per_output_tokens_iter.append(last_token_latency)

        request_state_stats.num_generation_tokens += num_new_generation_tokens
        request_state_stats.last_token_time = now

    def update_from_finished_request(self, finish_reason: "FinishReason",
                                     request_output: "RequestOutput",
                                     request_state_stats: RequestStateStats):
        self.finished_requests.append(
            FinishedRequestStats(finish_reason,
                                 len(request_output.prompt_token_ids),
                                 request_state_stats.num_generation_tokens))
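

# ---------------------------------------------------------------------------
# Illustrative usage sketch. It shows how a caller might drive IterationStats
# across one prefill step and one decode step. `_StubOutput` is a hypothetical
# stand-in that provides only the single attribute (`new_token_ids`) that
# update_from_output reads from a real EngineCoreOutput.
# ---------------------------------------------------------------------------
if __name__ == "__main__":

    @dataclass
    class _StubOutput:
        new_token_ids: List[int]

    iter_stats = IterationStats(log_stats=True)
    req_stats = RequestStateStats(last_token_time=time.time())

    # Prefill completes and yields the first token: records the prompt
    # length and a time-to-first-token sample.
    iter_stats.update_from_output(_StubOutput(new_token_ids=[101]),
                                  is_prefilling=True,
                                  prompt_len=8,
                                  request_state_stats=req_stats)

    # A later decode step: records an inter-token latency sample.
    iter_stats.update_from_output(_StubOutput(new_token_ids=[102]),
                                  is_prefilling=False,
                                  prompt_len=8,
                                  request_state_stats=req_stats)

    print(iter_stats.num_prompt_tokens, iter_stats.num_generation_tokens)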