Use monotonic time where appropriate (#1249)

This commit is contained in:
Antoni Baum
2023-10-02 19:22:05 -07:00
committed by GitHub
parent 66d18a7fb0
commit acbed3ef40
7 changed files with 18 additions and 17 deletions

View File

@@ -121,7 +121,7 @@ class Scheduler:
blocks_to_copy: Dict[int, List[int]] = {}
# Fix the current time.
now = time.time()
now = time.monotonic()
# Join waiting sequences if possible.
if not self.swapped:

View File

@@ -417,7 +417,8 @@ class AsyncLLMEngine:
request.
"""
# Preprocess the request.
arrival_time = time.time()
# NOTE: arrival_time is monotonic time, not wall-clock time; do not use it for logging.
arrival_time = time.monotonic()
try:
stream = await self.add_request(request_id,

View File

@@ -256,10 +256,10 @@ class LLMEngine:
prompt_token_ids: The token IDs of the prompt. If None, we
use the tokenizer to convert the prompts to token IDs.
arrival_time: The arrival time of the request. If None, we use
the current time.
the current monotonic time.
"""
if arrival_time is None:
arrival_time = time.time()
arrival_time = time.monotonic()
if prompt_token_ids is None:
assert prompt is not None
prompt_token_ids = self.tokenizer.encode(prompt)
@@ -568,7 +568,7 @@ class LLMEngine:
prompt_run: bool,
num_batched_tokens: int,
) -> None:
now = time.time()
now = time.monotonic()
# Log the number of batched input tokens.
if prompt_run:
self.num_prompt_tokens.append((now, num_batched_tokens))

View File

@@ -210,7 +210,7 @@ async def create_chat_completion(request: ChatCompletionRequest,
model_name = request.model
request_id = f"cmpl-{random_uuid()}"
created_time = int(time.time())
created_time = int(time.monotonic())
try:
sampling_params = SamplingParams(
n=request.n,
@@ -411,7 +411,7 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
if error_check_ret is not None:
return error_check_ret
created_time = int(time.time())
created_time = int(time.monotonic())
try:
sampling_params = SamplingParams(
n=request.n,