[V1][Metrics] Add several request timing histograms (#12644)

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
This commit is contained in:
Mark McLoughlin
2025-02-11 15:14:00 +00:00
committed by GitHub
parent 110f59a33e
commit 75e6e14516
16 changed files with 335 additions and 85 deletions

View File

@@ -53,10 +53,12 @@ class AsyncLLM(EngineClient):
self.log_requests = log_requests
self.log_stats = log_stats
self.stat_loggers: List[StatLoggerBase] = [
LoggingStatLogger(),
PrometheusStatLogger(vllm_config.model_config),
]
self.stat_loggers: List[StatLoggerBase] = []
if self.log_stats:
self.stat_loggers.extend([
LoggingStatLogger(),
PrometheusStatLogger(vllm_config.model_config),
])
# Tokenizer (+ ensure liveness if running in another process).
self.tokenizer = init_tokenizer_from_configs(
@@ -85,6 +87,7 @@ class AsyncLLM(EngineClient):
asyncio_mode=True,
vllm_config=vllm_config,
executor_class=executor_class,
log_stats=self.log_stats,
)
self.output_handler: Optional[asyncio.Task] = None
@@ -246,6 +249,8 @@ class AsyncLLM(EngineClient):
# 1) Pull EngineCoreOutputs from the EngineCore.
outputs = await self.engine_core.get_output_async()
iteration_stats = IterationStats() if self.log_stats else None
# Split outputs into chunks of at most
# VLLM_V1_OUTPUT_PROC_CHUNK_SIZE, so that we don't block the
# event loop for too long.
@@ -257,14 +262,12 @@ class AsyncLLM(EngineClient):
outputs.outputs,
cdiv(num_outputs, VLLM_V1_OUTPUT_PROC_CHUNK_SIZE))
iteration_stats = None
for i, outputs_slice in enumerate(slices):
# 2) Process EngineCoreOutputs.
processed_outputs = self.output_processor.process_outputs(
outputs_slice, iteration_stats)
outputs_slice, outputs.timestamp, iteration_stats)
# NOTE: RequestOutputs are pushed to their queues.
assert not processed_outputs.request_outputs
iteration_stats = processed_outputs.iteration_stats
# Allow other asyncio tasks to run between chunks
if i + 1 < len(slices):
@@ -277,7 +280,6 @@ class AsyncLLM(EngineClient):
# 4) Logging.
# TODO(rob): make into a coroutine and launch it in
# background thread once Prometheus overhead is non-trivial.
assert iteration_stats is not None
self._log_stats(
scheduler_stats=outputs.scheduler_stats,
iteration_stats=iteration_stats,
@@ -299,12 +301,14 @@ class AsyncLLM(EngineClient):
def _log_stats(
self,
scheduler_stats: SchedulerStats,
iteration_stats: IterationStats,
scheduler_stats: Optional[SchedulerStats],
iteration_stats: Optional[IterationStats],
):
if not self.log_stats:
return
assert scheduler_stats is not None
assert iteration_stats is not None
for logger in self.stat_loggers:
logger.log(scheduler_stats=scheduler_stats,
iteration_stats=iteration_stats)