[Feature] Add iteration level logging and enhance nvtx marker (#31193)

Signed-off-by: Max Hu <maxhu@nvidia.com>
Signed-off-by: Max Hu <hyoung2991@gmail.com>
Co-authored-by: Max Hu <maxhu@nvidia.com>
This commit is contained in:
Max Hu
2026-01-08 19:13:39 -05:00
committed by GitHub
parent 11cec296dd
commit 6ebe34d6fa
6 changed files with 137 additions and 9 deletions

View File

@@ -75,6 +75,12 @@ class ObservabilityConfig:
enable_mfu_metrics: bool = False
"""Enable Model FLOPs Utilization (MFU) metrics."""
enable_logging_iteration_details: bool = False
"""Enable detailed logging of iteration details.
If set, vllm EngineCore will log iteration details
This includes number of context/generation requests and tokens
and the elapsed cpu time for the iteration."""
@cached_property
def collect_model_forward_time(self) -> bool:
"""Whether to collect model forward time for the request."""

View File

@@ -525,6 +525,9 @@ class EngineArgs:
ObservabilityConfig.enable_layerwise_nvtx_tracing
)
enable_mfu_metrics: bool = ObservabilityConfig.enable_mfu_metrics
enable_logging_iteration_details: bool = (
ObservabilityConfig.enable_logging_iteration_details
)
enable_mm_processor_stats: bool = ObservabilityConfig.enable_mm_processor_stats
scheduling_policy: SchedulerPolicy = SchedulerConfig.policy
scheduler_cls: str | type[object] | None = SchedulerConfig.scheduler_cls
@@ -1059,6 +1062,10 @@ class EngineArgs:
"--enable-mfu-metrics",
**observability_kwargs["enable_mfu_metrics"],
)
observability_group.add_argument(
"--enable-logging-iteration-details",
**observability_kwargs["enable_logging_iteration_details"],
)
# Scheduler arguments
scheduler_kwargs = get_kwargs(SchedulerConfig)
@@ -1713,6 +1720,7 @@ class EngineArgs:
enable_layerwise_nvtx_tracing=self.enable_layerwise_nvtx_tracing,
enable_mfu_metrics=self.enable_mfu_metrics,
enable_mm_processor_stats=self.enable_mm_processor_stats,
enable_logging_iteration_details=self.enable_logging_iteration_details,
)
# Compilation config overrides

View File

@@ -2,6 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import dataclass
from functools import cached_property
from typing import TYPE_CHECKING
from vllm._bc_linter import bc_linter_include
@@ -151,6 +152,20 @@ class CachedRequestData:
def num_reqs(self) -> int:
return len(self.req_ids)
@cached_property
def _req_id_to_num_output_tokens(self) -> dict[str, int]:
"""Cache mapping of req_id to num_output_tokens for O(1) lookup.
This cached property is safe because CachedRequestData instances
are created fresh each scheduling iteration and not mutated during
computation of iteration details.
"""
return dict(zip(self.req_ids, self.num_output_tokens))
def is_context_phase(self, req_id: str) -> bool:
num_output_tokens = self._req_id_to_num_output_tokens.get(req_id)
return num_output_tokens is not None and num_output_tokens == 0
@classmethod
def make_empty(cls) -> "CachedRequestData":
return cls(

View File

@@ -65,6 +65,7 @@ from vllm.v1.outputs import ModelRunnerOutput
from vllm.v1.request import Request, RequestStatus
from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder
from vllm.v1.structured_output import StructuredOutputManager
from vllm.v1.utils import compute_iteration_details
from vllm.version import __version__ as VLLM_VERSION
logger = init_logger(__name__)
@@ -208,7 +209,6 @@ class EngineCore:
self.async_scheduling = vllm_config.scheduler_config.async_scheduling
self.aborts_queue = queue.Queue[list[str]]()
# Mark the startup heap as static so that it's ignored by GC.
# Reduces pause times of oldest generation collections.
freeze_gc_heap()
@@ -337,6 +337,36 @@ class EngineCore:
)
raise err
@contextmanager
def log_iteration_details(self, scheduler_output: SchedulerOutput):
if not self.vllm_config.observability_config.enable_logging_iteration_details:
yield
return
self._iteration_index = getattr(self, "_iteration_index", 0)
iteration_details = compute_iteration_details(scheduler_output)
before = time.monotonic()
yield
logger.info(
"".join(
[
"Iteration(",
str(self._iteration_index),
"): ",
str(iteration_details.num_ctx_requests),
" context requests, ",
str(iteration_details.num_ctx_tokens),
" context tokens, ",
str(iteration_details.num_generation_requests),
" generation requests, ",
str(iteration_details.num_generation_tokens),
" generation tokens, iteration elapsed time: ",
format((time.monotonic() - before) * 1000, ".2f"),
" ms",
]
)
)
self._iteration_index += 1
def step(self) -> tuple[dict[int, EngineCoreOutputs], bool]:
"""Schedule, execute, and make output.
@@ -351,7 +381,10 @@ class EngineCore:
scheduler_output = self.scheduler.schedule()
future = self.model_executor.execute_model(scheduler_output, non_block=True)
grammar_output = self.scheduler.get_grammar_bitmask(scheduler_output)
with self.log_error_detail(scheduler_output):
with (
self.log_error_detail(scheduler_output),
self.log_iteration_details(scheduler_output),
):
model_output = future.result()
if model_output is None:
model_output = self.model_executor.sample_tokens(grammar_output)
@@ -447,7 +480,10 @@ class EngineCore:
# Block until the next result is available.
future, scheduler_output, exec_model_fut = batch_queue.pop()
with self.log_error_detail(scheduler_output):
with (
self.log_error_detail(scheduler_output),
self.log_iteration_details(scheduler_output),
):
model_output = future.result()
if model_output is None:
# None from sample_tokens() implies that the original execute_model()

View File

@@ -7,6 +7,7 @@ import time
import weakref
from collections.abc import Callable, Sequence
from contextlib import AbstractContextManager
from dataclasses import dataclass
from multiprocessing import connection
from multiprocessing.process import BaseProcess
from typing import (
@@ -27,6 +28,7 @@ from vllm.logger import init_logger
from vllm.usage.usage_lib import UsageContext, is_usage_stats_enabled, usage_message
from vllm.utils.network_utils import get_open_port, get_open_zmq_ipc_path, get_tcp_uri
from vllm.utils.system_utils import kill_process_tree
from vllm.v1.core.sched.output import SchedulerOutput
if TYPE_CHECKING:
import numpy as np
@@ -412,3 +414,53 @@ def tensor_data(tensor: torch.Tensor) -> memoryview:
A memoryview of the tensor data as uint8.
"""
return tensor.flatten().contiguous().view(torch.uint8).numpy().data
@dataclass
class IterationDetails:
num_ctx_requests: int
num_ctx_tokens: int
num_generation_requests: int
num_generation_tokens: int
def __repr__(self) -> str:
return f"IterationDetails(num_ctx_requests={self.num_ctx_requests},\
num_ctx_tokens={self.num_ctx_tokens}, \
num_generation_requests={self.num_generation_requests}, \
num_generation_tokens={self.num_generation_tokens})"
def compute_iteration_details(scheduler_output: SchedulerOutput) -> IterationDetails:
"""
Compute the number of context/generation requests and tokens
for the current iteration's scheduler output. A requests is regarded
as a context request if its output tokens are still 0, an extended chunk
of chunked prefill falls into this category.
Args:
scheduler_output: The scheduler output for the current iteration.
Returns:
An IterationDetails object containing the number of
context/generation requests and tokens.
"""
num_context_requests = 0
num_context_tokens = 0
num_generation_requests = 0
num_generation_tokens = 0
new_req_ids = {new_req.req_id for new_req in scheduler_output.scheduled_new_reqs}
for req_id, num_tokens in scheduler_output.num_scheduled_tokens.items():
if scheduler_output.scheduled_cached_reqs.is_context_phase(req_id) or (
req_id in new_req_ids
):
num_context_requests += 1
num_context_tokens += num_tokens
else:
num_generation_requests += 1
num_generation_tokens += num_tokens
return IterationDetails(
num_context_requests,
num_context_tokens,
num_generation_requests,
num_generation_tokens,
)

View File

@@ -50,7 +50,7 @@ from vllm.v1.outputs import (
DraftTokenIds,
ModelRunnerOutput,
)
from vllm.v1.utils import report_usage_stats
from vllm.v1.utils import compute_iteration_details, report_usage_stats
from vllm.v1.worker.utils import is_residual_scattered_for_sp
from vllm.v1.worker.worker_base import WorkerBase
from vllm.v1.worker.workspace import init_workspace_manager
@@ -547,18 +547,29 @@ class Worker(WorkerBase):
def annotate_profile(self, scheduler_output):
# add trace annotation so that we can easily distinguish
# new/cached request numbers in each iteration
# context/generation request numbers in each iteration.
# A context request is a request that has not yet generated any tokens
if not self.profiler:
return nullcontext()
self.profiler.step()
num_new = len(scheduler_output.scheduled_new_reqs)
num_cached = len(scheduler_output.scheduled_cached_reqs.req_ids)
iteration_details = compute_iteration_details(scheduler_output)
return self.profiler.annotate_context_manager(
f"execute_new_{num_new}_cached_{num_cached}"
annotation = "".join(
[
"execute_context_",
str(iteration_details.num_ctx_requests),
"(",
str(iteration_details.num_ctx_tokens),
")_generation_",
str(iteration_details.num_generation_requests),
"(",
str(iteration_details.num_generation_tokens),
")",
]
)
return self.profiler.annotate_context_manager(annotation)
@torch.inference_mode()
def sample_tokens(