[Feature] Add iteration-level logging and enhance NVTX marker (#31193)

Signed-off-by: Max Hu <maxhu@nvidia.com>
Signed-off-by: Max Hu <hyoung2991@gmail.com>
Co-authored-by: Max Hu <maxhu@nvidia.com>
Author: Max Hu
Date: 2026-01-08 19:13:39 -05:00
Committed by: GitHub
Parent: 11cec296dd
Commit: 6ebe34d6fa
6 changed files with 137 additions and 9 deletions


@@ -65,6 +65,7 @@ from vllm.v1.outputs import ModelRunnerOutput
from vllm.v1.request import Request, RequestStatus
from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder
from vllm.v1.structured_output import StructuredOutputManager
from vllm.v1.utils import compute_iteration_details
from vllm.version import __version__ as VLLM_VERSION
logger = init_logger(__name__)
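
The new import pulls in `compute_iteration_details` from `vllm.v1.utils`. Judging from the attributes read in the logging call further down, it condenses a `SchedulerOutput` into per-iteration counts, roughly along these lines (a hypothetical sketch inferred from the field names; the real helper may differ):

```python
from dataclasses import dataclass

@dataclass
class IterationDetails:
    # Field semantics below are assumptions inferred from how
    # log_iteration_details() uses them, not the actual vllm.v1.utils code.
    num_ctx_requests: int         # requests still in the prefill (context) phase
    num_ctx_tokens: int           # prompt tokens scheduled this iteration
    num_generation_requests: int  # requests in the decode (generation) phase
    num_generation_tokens: int    # new tokens scheduled for generation
```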
@@ -208,7 +209,6 @@ class EngineCore:
        self.async_scheduling = vllm_config.scheduler_config.async_scheduling
        self.aborts_queue = queue.Queue[list[str]]()

        # Mark the startup heap as static so that it's ignored by GC.
        # Reduces pause times of oldest generation collections.
        freeze_gc_heap()
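
For context, `freeze_gc_heap()` likely wraps CPython's `gc.freeze()` (an assumption based on the comment above, not confirmed by this diff):

```python
import gc

# Assumption: freeze_gc_heap() boils down to this call. gc.freeze() moves all
# currently tracked objects into a permanent generation that the collector
# never scans, so full collections skip the startup heap entirely.
gc.freeze()
```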
@@ -337,6 +337,36 @@ class EngineCore:
            )
            raise err

    @contextmanager
    def log_iteration_details(self, scheduler_output: SchedulerOutput):
        if not self.vllm_config.observability_config.enable_logging_iteration_details:
            yield
            return

        self._iteration_index = getattr(self, "_iteration_index", 0)
        iteration_details = compute_iteration_details(scheduler_output)
        before = time.monotonic()
        yield
        logger.info(
            "".join(
                [
                    "Iteration(",
                    str(self._iteration_index),
                    "): ",
                    str(iteration_details.num_ctx_requests),
                    " context requests, ",
                    str(iteration_details.num_ctx_tokens),
                    " context tokens, ",
                    str(iteration_details.num_generation_requests),
                    " generation requests, ",
                    str(iteration_details.num_generation_tokens),
                    " generation tokens, iteration elapsed time: ",
                    format((time.monotonic() - before) * 1000, ".2f"),
                    " ms",
                ]
            )
        )
        self._iteration_index += 1

    def step(self) -> tuple[dict[int, EngineCoreOutputs], bool]:
        """Schedule, execute, and make output.
@@ -351,7 +381,10 @@
        scheduler_output = self.scheduler.schedule()
        future = self.model_executor.execute_model(scheduler_output, non_block=True)
        grammar_output = self.scheduler.get_grammar_bitmask(scheduler_output)

        with self.log_error_detail(scheduler_output):
        with (
            self.log_error_detail(scheduler_output),
            self.log_iteration_details(scheduler_output),
        ):
            model_output = future.result()
            if model_output is None:
                model_output = self.model_executor.sample_tokens(grammar_output)
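
The replacement stacks the existing error-detail context with the new iteration logger using a parenthesized multi-item `with`, a Python 3.10+ form. For reference, the two spellings are equivalent (a generic sketch using `nullcontext` as a stand-in for the two context managers in the diff):

```python
from contextlib import nullcontext

# Parenthesized multi-item `with` (Python 3.10+), as used in the diff:
with (
    nullcontext() as a,
    nullcontext() as b,
):
    pass

# Equivalent nested form for older interpreters:
with nullcontext() as a:
    with nullcontext() as b:
        pass
```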
@@ -447,7 +480,10 @@
                # Block until the next result is available.
                future, scheduler_output, exec_model_fut = batch_queue.pop()

                with self.log_error_detail(scheduler_output):
                with (
                    self.log_error_detail(scheduler_output),
                    self.log_iteration_details(scheduler_output),
                ):
                    model_output = future.result()
                    if model_output is None:
                        # None from sample_tokens() implies that the original execute_model()
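
The feature is gated on `observability_config.enable_logging_iteration_details`, as read at the top of `log_iteration_details()`. A hedged sketch of flipping it programmatically, assuming the flag is a plain boolean field on `ObservabilityConfig` mirroring how `EngineCore` reads it (the exact config/CLI surface may differ):

```python
from vllm.config import ObservabilityConfig

# Assumption: the flag added by this PR is settable at construction time,
# the same way EngineCore reads it above.
observability_config = ObservabilityConfig(
    enable_logging_iteration_details=True,
)
```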