diff --git a/docs/design/metrics.md b/docs/design/metrics.md
index b24ff64b6..be917c0dc 100644
--- a/docs/design/metrics.md
+++ b/docs/design/metrics.md
@@ -244,6 +244,7 @@ statistics relating to that iteration:
   prefill in this iteration. However, we calculate this interval relative to
   when the request was first received by the frontend (`arrival_time`) in
   order to account for input processing time.
+  Currently `arrival_time` starts when tokenization begins.
 
 For any requests that were completed in a given iteration, we also record:
 
@@ -587,7 +588,7 @@ see:
 - [Benchmarking LLM Workloads for Performance Evaluation and Autoscaling in Kubernetes](https://docs.google.com/document/d/1k4Q4X14hW4vftElIuYGDu5KDe2LtV1XammoG-Xi3bbQ)
 - [Inference Perf](https://github.com/kubernetes-sigs/wg-serving/tree/main/proposals/013-inference-perf)
-  and .
-
+
 This is a non-trivial topic. Consider this comment from Rob:
 
 > I think this metric should focus on trying to estimate what the max
diff --git a/vllm/entrypoints/openai/responses/serving.py b/vllm/entrypoints/openai/responses/serving.py
index 53c28693a..71d1945ae 100644
--- a/vllm/entrypoints/openai/responses/serving.py
+++ b/vllm/entrypoints/openai/responses/serving.py
@@ -710,9 +710,11 @@ class OpenAIServingResponses(OpenAIServing):
                 "Only 'auto' tool_choice is supported in response API with Harmony"
             )
 
+        arrival_time = time.time()
         messages = self._construct_input_messages_with_harmony(request, prev_response)
         prompt_token_ids = render_for_completion(messages)
         engine_prompt = token_inputs(prompt_token_ids)
+        engine_prompt["arrival_time"] = arrival_time
 
         # Add cache_salt if provided in the request
         if request.cache_salt is not None:
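
For context (not part of the patch): a minimal, standalone sketch of why stamping `arrival_time` before tokenization matters for TTFT, which is the behavior the doc note and the `serving.py` change describe. The `Request`, `fake_tokenize`, and `fake_generate_first_token` names below are hypothetical stand-ins for illustration only, not vLLM APIs.

```python
# Illustrative sketch only: TTFT measured from a pre-tokenization
# arrival_time includes input processing time; TTFT measured from the
# engine's receipt of the tokenized prompt silently excludes it.
import time
from dataclasses import dataclass


@dataclass
class Request:
    prompt: str
    arrival_time: float  # stamped by the frontend, before any input processing


def fake_tokenize(prompt: str) -> list[int]:
    time.sleep(0.05)  # stand-in for tokenization / input processing cost
    return [ord(c) for c in prompt]


def fake_generate_first_token(token_ids: list[int]) -> int:
    time.sleep(0.10)  # stand-in for queueing + prefill in the engine
    return token_ids[0] if token_ids else 0


def handle(prompt: str) -> None:
    # Frontend: record arrival before tokenization, mirroring the
    # `arrival_time = time.time()` added ahead of render_for_completion().
    req = Request(prompt=prompt, arrival_time=time.time())

    token_ids = fake_tokenize(req.prompt)
    engine_receipt_time = time.time()

    fake_generate_first_token(token_ids)
    first_token_time = time.time()

    print(f"TTFT from arrival_time:   {first_token_time - req.arrival_time:.3f}s")
    print(f"TTFT from engine receipt: {first_token_time - engine_receipt_time:.3f}s")


if __name__ == "__main__":
    handle("hello world")
```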