[Feature] Add iteration level logging and enhance nvtx marker (#31193)

Signed-off-by: Max Hu <maxhu@nvidia.com>
Signed-off-by: Max Hu <hyoung2991@gmail.com>
Co-authored-by: Max Hu <maxhu@nvidia.com>
This commit is contained in:
Max Hu
2026-01-08 19:13:39 -05:00
committed by GitHub
parent 11cec296dd
commit 6ebe34d6fa
6 changed files with 137 additions and 9 deletions

View File

@@ -7,6 +7,7 @@ import time
import weakref
from collections.abc import Callable, Sequence
from contextlib import AbstractContextManager
from dataclasses import dataclass
from multiprocessing import connection
from multiprocessing.process import BaseProcess
from typing import (
@@ -27,6 +28,7 @@ from vllm.logger import init_logger
from vllm.usage.usage_lib import UsageContext, is_usage_stats_enabled, usage_message
from vllm.utils.network_utils import get_open_port, get_open_zmq_ipc_path, get_tcp_uri
from vllm.utils.system_utils import kill_process_tree
from vllm.v1.core.sched.output import SchedulerOutput
if TYPE_CHECKING:
import numpy as np
@@ -412,3 +414,53 @@ def tensor_data(tensor: torch.Tensor) -> memoryview:
A memoryview of the tensor data as uint8.
"""
return tensor.flatten().contiguous().view(torch.uint8).numpy().data
@dataclass
class IterationDetails:
    """Per-iteration breakdown of scheduled work into context vs. generation.

    Counts are computed from a single iteration's scheduler output (see
    ``compute_iteration_details``). The default dataclass-generated
    ``__repr__`` already renders ``IterationDetails(num_ctx_requests=...,
    num_ctx_tokens=..., ...)``, so no custom ``__repr__`` is defined; the
    previous hand-written one used backslash line-continuations that leaked
    source indentation whitespace into the repr string.
    """

    # Requests/tokens scheduled in the context (prefill) phase.
    num_ctx_requests: int
    num_ctx_tokens: int
    # Requests/tokens scheduled in the generation (decode) phase.
    num_generation_requests: int
    num_generation_tokens: int
def compute_iteration_details(scheduler_output: SchedulerOutput) -> IterationDetails:
    """Classify this iteration's scheduled requests/tokens by phase.

    A request is treated as a context request when its output-token count
    is still zero; an extended chunk of chunked prefill also falls into
    this category.

    Args:
        scheduler_output: The scheduler output for the current iteration.

    Returns:
        An IterationDetails holding the context/generation request and
        token counts for the iteration.
    """
    fresh_ids = {req.req_id for req in scheduler_output.scheduled_new_reqs}
    cached_reqs = scheduler_output.scheduled_cached_reqs

    ctx_reqs = ctx_toks = gen_reqs = gen_toks = 0
    for rid, tok_count in scheduler_output.num_scheduled_tokens.items():
        # Newly scheduled requests and cached requests still in their
        # context phase both count as context work.
        if cached_reqs.is_context_phase(rid) or rid in fresh_ids:
            ctx_reqs += 1
            ctx_toks += tok_count
        else:
            gen_reqs += 1
            gen_toks += tok_count

    return IterationDetails(ctx_reqs, ctx_toks, gen_reqs, gen_toks)