diff --git a/docs/design/metrics.md b/docs/design/metrics.md
index b24ff64b6..be917c0dc 100644
--- a/docs/design/metrics.md
+++ b/docs/design/metrics.md
@@ -244,6 +244,7 @@ statistics relating to that iteration:
   prefill in this iteration. However, we calculate this interval relative to
   when the request was first received by the frontend (`arrival_time`) in
   order to account for input processing time.
+  Currently `arrival_time` starts when tokenization begins.
 
 For any requests that were completed in a given iteration, we also record:
 
@@ -587,7 +588,7 @@ see:
 - [Benchmarking LLM Workloads for Performance Evaluation and Autoscaling in Kubernetes](https://docs.google.com/document/d/1k4Q4X14hW4vftElIuYGDu5KDe2LtV1XammoG-Xi3bbQ)
 - [Inference Perf](https://github.com/kubernetes-sigs/wg-serving/tree/main/proposals/013-inference-perf)
-  and .
-
+
 This is a non-trivial topic. Consider this comment from Rob:
 
 > I think this metric should focus on trying to estimate what the max
diff --git a/vllm/entrypoints/openai/responses/serving.py b/vllm/entrypoints/openai/responses/serving.py
index 53c28693a..71d1945ae 100644
--- a/vllm/entrypoints/openai/responses/serving.py
+++ b/vllm/entrypoints/openai/responses/serving.py
@@ -710,9 +710,11 @@ class OpenAIServingResponses(OpenAIServing):
                 "Only 'auto' tool_choice is supported in response API with Harmony"
             )
 
+        arrival_time = time.time()
         messages = self._construct_input_messages_with_harmony(request, prev_response)
         prompt_token_ids = render_for_completion(messages)
         engine_prompt = token_inputs(prompt_token_ids)
+        engine_prompt["arrival_time"] = arrival_time
 
         # Add cache_salt if provided in the request
         if request.cache_salt is not None:
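
For context (not part of the patch): a minimal, standalone sketch of why stamping `arrival_time` before tokenization matters for TTFT, which is the behavior the doc note and the `serving.py` change describe. The `Request`, `fake_tokenize`, and `fake_generate_first_token` names below are hypothetical stand-ins for illustration only, not vLLM APIs.

```python
# Illustrative sketch only: TTFT measured from a pre-tokenization
# arrival_time includes input processing time; TTFT measured from the
# engine's receipt of the tokenized prompt silently excludes it.
import time
from dataclasses import dataclass


@dataclass
class Request:
    prompt: str
    arrival_time: float  # stamped by the frontend, before any input processing


def fake_tokenize(prompt: str) -> list[int]:
    time.sleep(0.05)  # stand-in for tokenization / input processing cost
    return [ord(c) for c in prompt]


def fake_generate_first_token(token_ids: list[int]) -> int:
    time.sleep(0.10)  # stand-in for queueing + prefill in the engine
    return token_ids[0] if token_ids else 0


def handle(prompt: str) -> None:
    # Frontend: record arrival before tokenization, mirroring the
    # `arrival_time = time.time()` added ahead of render_for_completion().
    req = Request(prompt=prompt, arrival_time=time.time())

    token_ids = fake_tokenize(req.prompt)
    engine_receipt_time = time.time()

    fake_generate_first_token(token_ids)
    first_token_time = time.time()

    print(f"TTFT from arrival_time:   {first_token_time - req.arrival_time:.3f}s")
    print(f"TTFT from engine receipt: {first_token_time - engine_receipt_time:.3f}s")


if __name__ == "__main__":
    handle("hello world")
```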