[Core] [Frontend] Priority scheduling for embeddings and in the OpenAI-API (#8965)

Sebastian Schoennenbeck
2024-10-01 11:58:06 +02:00
committed by GitHub
parent 1fe0a4264a
commit 35bd215168
8 changed files with 53 additions and 5 deletions


@@ -1043,6 +1043,7 @@ class AsyncLLMEngine:
         request_id: str,
         lora_request: Optional[LoRARequest] = None,
         trace_headers: Optional[Mapping[str, str]] = None,
+        priority: int = 0,
     ) -> AsyncGenerator[EmbeddingRequestOutput, None]:
         """Generate outputs for a request from an embedding model.
@@ -1057,6 +1058,8 @@ class AsyncLLMEngine:
             request_id: The unique id of the request.
             lora_request: LoRA request to use for generation, if any.
             trace_headers: OpenTelemetry trace headers.
+            priority: The priority of the request.
+                Only applicable with priority scheduling.
         Yields:
             The output `EmbeddingRequestOutput` objects from the LLMEngine
@@ -1109,6 +1112,7 @@ class AsyncLLMEngine:
             pooling_params,
             lora_request=lora_request,
             trace_headers=trace_headers,
+            priority=priority,
         ):
             yield LLMEngine.validate_output(output, EmbeddingRequestOutput)
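The diff threads a `priority` argument through the embedding path so that, when priority scheduling is enabled, embedding requests compete for scheduling slots according to their priority rather than pure arrival order. As an illustration of the scheduling idea only (not vLLM's actual scheduler implementation), here is a minimal priority queue sketch where a lower priority value is served first and ties fall back to FIFO arrival order; the class and request names are hypothetical:

```python
import heapq
import itertools


class PriorityRequestQueue:
    """Toy request queue: lower priority value is popped first;
    ties break by arrival order (FIFO). Illustrative sketch only."""

    def __init__(self) -> None:
        self._heap: list[tuple[int, int, str]] = []
        self._counter = itertools.count()  # monotonic arrival tiebreaker

    def add(self, request_id: str, priority: int = 0) -> None:
        # heapq is a min-heap, so (priority, arrival, id) tuples give us
        # "smallest priority first, then oldest first" ordering.
        heapq.heappush(self._heap, (priority, next(self._counter), request_id))

    def pop(self) -> str:
        _, _, request_id = heapq.heappop(self._heap)
        return request_id


q = PriorityRequestQueue()
q.add("embed-1")               # default priority 0
q.add("embed-2", priority=5)   # lower urgency
q.add("embed-3", priority=-1)  # most urgent
order = [q.pop(), q.pop(), q.pop()]
# order == ["embed-3", "embed-1", "embed-2"]
```

With the default `priority: int = 0` added in this commit, callers that ignore the parameter keep their existing behavior, while callers under a priority scheduling policy can reorder work by passing an explicit value.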